Diffstat (limited to 'src/broadcom/compiler')
27 files changed, 5299 insertions, 2647 deletions
diff --git a/src/broadcom/compiler/meson.build b/src/broadcom/compiler/meson.build index 95156140ad9..d5aafb3879e 100644 --- a/src/broadcom/compiler/meson.build +++ b/src/broadcom/compiler/meson.build @@ -32,23 +32,22 @@ libbroadcom_compiler_files = files( 'vir_to_qpu.c', 'qpu_schedule.c', 'qpu_validate.c', - 'v3d33_tex.c', - 'v3d40_tex.c', - 'v3d33_vpm_setup.c', + 'v3d_tex.c', 'v3d_compiler.h', 'v3d_nir_lower_io.c', 'v3d_nir_lower_image_load_store.c', 'v3d_nir_lower_line_smooth.c', + 'v3d_nir_lower_load_store_bitsize.c', 'v3d_nir_lower_logic_ops.c', - 'v3d_nir_lower_robust_buffer_access.c', 'v3d_nir_lower_scratch.c', 'v3d_nir_lower_txf_ms.c', + 'v3d_packing.c', ) libbroadcom_compiler = static_library( - ['broadcom_compiler', v3d_xml_pack], - libbroadcom_compiler_files, - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom], + 'broadcom_compiler', + [libbroadcom_compiler_files, v3d_xml_pack], + include_directories : [inc_include, inc_src, inc_gallium, inc_gallium_aux, inc_broadcom], c_args : [no_override_init_args], gnu_symbol_visibility : 'hidden', dependencies : [dep_libdrm, dep_valgrind, idep_nir_headers], diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c index d0a89f1a7d4..acc62a092f2 100644 --- a/src/broadcom/compiler/nir_to_vir.c +++ b/src/broadcom/compiler/nir_to_vir.c @@ -38,7 +38,7 @@ #define __gen_address_type uint32_t #define __gen_address_offset(reloc) (*reloc) #define __gen_emit_reloc(cl, reloc) -#include "cle/v3d_packet_v41_pack.h" +#include "cle/v3d_packet_v42_pack.h" #define GENERAL_TMU_LOOKUP_PER_QUAD (0 << 7) #define GENERAL_TMU_LOOKUP_PER_PIXEL (1 << 7) @@ -164,7 +164,7 @@ vir_emit_thrsw(struct v3d_compile *c) c->last_thrsw->qpu.sig.thrsw = true; c->last_thrsw_at_top_level = !c->in_control_flow; - /* We need to lock the scoreboard before any tlb acess happens. If this + /* We need to lock the scoreboard before any tlb access happens. If this * thread switch comes after we have emitted a tlb load, then it means * that we can't lock on the last thread switch any more. */ @@ -187,6 +187,28 @@ v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src) } static uint32_t +v3d_general_tmu_op_for_atomic(nir_intrinsic_instr *instr) +{ + nir_atomic_op atomic_op = nir_intrinsic_atomic_op(instr); + switch (atomic_op) { + case nir_atomic_op_iadd: + return instr->intrinsic == nir_intrinsic_ssbo_atomic ? 
+ v3d_get_op_for_atomic_add(instr, 2) : + v3d_get_op_for_atomic_add(instr, 1); + case nir_atomic_op_imin: return V3D_TMU_OP_WRITE_SMIN; + case nir_atomic_op_umin: return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR; + case nir_atomic_op_imax: return V3D_TMU_OP_WRITE_SMAX; + case nir_atomic_op_umax: return V3D_TMU_OP_WRITE_UMAX; + case nir_atomic_op_iand: return V3D_TMU_OP_WRITE_AND_READ_INC; + case nir_atomic_op_ior: return V3D_TMU_OP_WRITE_OR_READ_DEC; + case nir_atomic_op_ixor: return V3D_TMU_OP_WRITE_XOR_READ_NOT; + case nir_atomic_op_xchg: return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH; + case nir_atomic_op_cmpxchg: return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH; + default: unreachable("unknown atomic op"); + } +} + +static uint32_t v3d_general_tmu_op(nir_intrinsic_instr *instr) { switch (instr->intrinsic) { @@ -195,41 +217,21 @@ v3d_general_tmu_op(nir_intrinsic_instr *instr) case nir_intrinsic_load_uniform: case nir_intrinsic_load_shared: case nir_intrinsic_load_scratch: + case nir_intrinsic_load_global_2x32: case nir_intrinsic_store_ssbo: case nir_intrinsic_store_shared: case nir_intrinsic_store_scratch: + case nir_intrinsic_store_global_2x32: return V3D_TMU_OP_REGULAR; - case nir_intrinsic_ssbo_atomic_add: - return v3d_get_op_for_atomic_add(instr, 2); - case nir_intrinsic_shared_atomic_add: - return v3d_get_op_for_atomic_add(instr, 1); - case nir_intrinsic_ssbo_atomic_imin: - case nir_intrinsic_shared_atomic_imin: - return V3D_TMU_OP_WRITE_SMIN; - case nir_intrinsic_ssbo_atomic_umin: - case nir_intrinsic_shared_atomic_umin: - return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR; - case nir_intrinsic_ssbo_atomic_imax: - case nir_intrinsic_shared_atomic_imax: - return V3D_TMU_OP_WRITE_SMAX; - case nir_intrinsic_ssbo_atomic_umax: - case nir_intrinsic_shared_atomic_umax: - return V3D_TMU_OP_WRITE_UMAX; - case nir_intrinsic_ssbo_atomic_and: - case nir_intrinsic_shared_atomic_and: - return V3D_TMU_OP_WRITE_AND_READ_INC; - case nir_intrinsic_ssbo_atomic_or: - case nir_intrinsic_shared_atomic_or: - return V3D_TMU_OP_WRITE_OR_READ_DEC; - case nir_intrinsic_ssbo_atomic_xor: - case nir_intrinsic_shared_atomic_xor: - return V3D_TMU_OP_WRITE_XOR_READ_NOT; - case nir_intrinsic_ssbo_atomic_exchange: - case nir_intrinsic_shared_atomic_exchange: - return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH; - case nir_intrinsic_ssbo_atomic_comp_swap: - case nir_intrinsic_shared_atomic_comp_swap: - return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH; + + case nir_intrinsic_ssbo_atomic: + case nir_intrinsic_ssbo_atomic_swap: + case nir_intrinsic_shared_atomic: + case nir_intrinsic_shared_atomic_swap: + case nir_intrinsic_global_atomic_2x32: + case nir_intrinsic_global_atomic_swap_2x32: + return v3d_general_tmu_op_for_atomic(instr); + default: unreachable("unknown intrinsic op"); } @@ -270,13 +272,13 @@ ntq_flush_tmu(struct v3d_compile *c) bool emitted_tmuwt = false; for (int i = 0; i < c->tmu.flush_count; i++) { if (c->tmu.flush[i].component_mask > 0) { - nir_dest *dest = c->tmu.flush[i].dest; - assert(dest); + nir_def *def = c->tmu.flush[i].def; + assert(def); for (int j = 0; j < 4; j++) { if (c->tmu.flush[i].component_mask & (1 << j)) { - ntq_store_dest(c, dest, j, - vir_MOV(c, vir_LDTMU(c))); + ntq_store_def(c, def, j, + vir_MOV(c, vir_LDTMU(c))); } } } else if (!emitted_tmuwt) { @@ -292,12 +294,12 @@ ntq_flush_tmu(struct v3d_compile *c) /** * Queues a pending thread switch + LDTMU/TMUWT for a TMU operation. 
The caller - * is reponsible for ensuring that doing this doesn't overflow the TMU fifos, + * is responsible for ensuring that doing this doesn't overflow the TMU fifos, * and more specifically, the output fifo, since that can't stall. */ void ntq_add_pending_tmu_flush(struct v3d_compile *c, - nir_dest *dest, + nir_def *def, uint32_t component_mask) { const uint32_t num_components = util_bitcount(component_mask); @@ -305,13 +307,18 @@ ntq_add_pending_tmu_flush(struct v3d_compile *c, if (num_components > 0) { c->tmu.output_fifo_size += num_components; - if (!dest->is_ssa) - _mesa_set_add(c->tmu.outstanding_regs, dest->reg.reg); + + nir_intrinsic_instr *store = nir_store_reg_for_def(def); + if (store != NULL) { + nir_def *reg = store->src[1].ssa; + _mesa_set_add(c->tmu.outstanding_regs, reg); + } } - c->tmu.flush[c->tmu.flush_count].dest = dest; + c->tmu.flush[c->tmu.flush_count].def = def; c->tmu.flush[c->tmu.flush_count].component_mask = component_mask; c->tmu.flush_count++; + c->tmu.total_count++; if (c->disable_tmu_pipelining) ntq_flush_tmu(c); @@ -342,6 +349,7 @@ emit_tmu_general_store_writes(struct v3d_compile *c, uint32_t base_const_offset, uint32_t *writemask, uint32_t *const_offset, + uint32_t *type_size, uint32_t *tmu_writes) { struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD); @@ -371,7 +379,9 @@ emit_tmu_general_store_writes(struct v3d_compile *c, /* Update the offset for the TMU write based on the * the first component we are writing. */ - *const_offset = base_const_offset + first_component * 4; + *type_size = nir_src_bit_size(instr->src[0]) / 8; + *const_offset = + base_const_offset + first_component * (*type_size); /* Clear these components from the writemask */ uint32_t written_mask = @@ -433,6 +443,7 @@ emit_tmu_general_address_write(struct v3d_compile *c, int offset_src, struct qreg base_offset, uint32_t const_offset, + uint32_t dest_components, uint32_t *tmu_writes) { if (mode == MODE_COUNT) { @@ -478,6 +489,8 @@ emit_tmu_general_address_write(struct v3d_compile *c, if (vir_in_nonuniform_control_flow(c)) vir_set_cond(tmu, V3D_QPU_COND_IFA); + + tmu->ldtmu_count = dest_components; } /** @@ -486,7 +499,7 @@ emit_tmu_general_address_write(struct v3d_compile *c, */ static void ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, - bool is_shared_or_scratch) + bool is_shared_or_scratch, bool is_global) { uint32_t tmu_op = v3d_general_tmu_op(instr); @@ -495,25 +508,32 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, * amount to add/sub, as that is implicit. 
*/ bool atomic_add_replaced = - ((instr->intrinsic == nir_intrinsic_ssbo_atomic_add || - instr->intrinsic == nir_intrinsic_shared_atomic_add) && + (instr->intrinsic == nir_intrinsic_ssbo_atomic || + instr->intrinsic == nir_intrinsic_shared_atomic || + instr->intrinsic == nir_intrinsic_global_atomic_2x32) && + nir_intrinsic_atomic_op(instr) == nir_atomic_op_iadd && (tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC || - tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC)); + tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC); bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo || instr->intrinsic == nir_intrinsic_store_scratch || - instr->intrinsic == nir_intrinsic_store_shared); + instr->intrinsic == nir_intrinsic_store_shared || + instr->intrinsic == nir_intrinsic_store_global_2x32); bool is_load = (instr->intrinsic == nir_intrinsic_load_uniform || instr->intrinsic == nir_intrinsic_load_ubo || instr->intrinsic == nir_intrinsic_load_ssbo || instr->intrinsic == nir_intrinsic_load_scratch || - instr->intrinsic == nir_intrinsic_load_shared); + instr->intrinsic == nir_intrinsic_load_shared || + instr->intrinsic == nir_intrinsic_load_global_2x32); if (!is_load) c->tmu_dirty_rcl = true; - bool has_index = !is_shared_or_scratch; + if (is_global) + c->has_global_address = true; + + bool has_index = !is_shared_or_scratch && !is_global; int offset_src; if (instr->intrinsic == nir_intrinsic_load_uniform) { @@ -522,6 +542,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, instr->intrinsic == nir_intrinsic_load_ubo || instr->intrinsic == nir_intrinsic_load_scratch || instr->intrinsic == nir_intrinsic_load_shared || + instr->intrinsic == nir_intrinsic_load_global_2x32 || atomic_add_replaced) { offset_src = 0 + has_index; } else if (is_store) { @@ -542,13 +563,11 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, v3d_unit_data_create(0, const_offset)); const_offset = 0; } else if (instr->intrinsic == nir_intrinsic_load_ubo) { - uint32_t index = nir_src_as_uint(instr->src[0]); - /* On OpenGL QUNIFORM_UBO_ADDR takes a UBO index - * shifted up by 1 (0 is gallium's constant buffer 0). + /* QUNIFORM_UBO_ADDR takes a UBO index shifted up by 1 (0 + * is gallium's constant buffer 0 in GL and push constants + * in Vulkan)). */ - if (c->key->environment == V3D_ENVIRONMENT_OPENGL) - index++; - + uint32_t index = nir_src_as_uint(instr->src[0]) + 1; base_offset = vir_uniform(c, QUNIFORM_UBO_ADDR, v3d_unit_data_create(index, const_offset)); @@ -565,10 +584,16 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, base_offset = c->cs_shared_offset; const_offset += nir_intrinsic_base(instr); } + } else if (is_global) { + /* Global load/store intrinsics use gloal addresses, so the + * offset is the target address and we don't need to add it + * to a base offset. + */ + base_offset = vir_uniform_ui(c, 0); } else { + uint32_t idx = is_store ? 1 : 0; base_offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET, - nir_src_as_uint(instr->src[is_store ? 
- 1 : 0])); + nir_src_comp_as_uint(instr->src[idx], 0)); } /* We are ready to emit TMU register writes now, but before we actually @@ -588,16 +613,21 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, for (enum emit_mode mode = MODE_COUNT; mode != MODE_LAST; mode++) { assert(mode == MODE_COUNT || tmu_writes > 0); + uint32_t type_size = 4; + if (is_store) { emit_tmu_general_store_writes(c, mode, instr, base_const_offset, &writemask, &const_offset, + &type_size, &tmu_writes); } else if (!is_load && !atomic_add_replaced) { - emit_tmu_general_atomic_writes(c, mode, instr, - tmu_op, has_index, - &tmu_writes); + emit_tmu_general_atomic_writes(c, mode, instr, + tmu_op, has_index, + &tmu_writes); + } else if (is_load) { + type_size = instr->def.bit_size / 8; } /* For atomics we use 32bit except for CMPXCHG, that we need @@ -618,17 +648,40 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, v3d_tmu_get_type_from_op(tmu_op, !is_load) == V3D_TMU_OP_TYPE_ATOMIC; + /* Only load per-quad if we can be certain that all + * lines in the quad are active. Notice that demoted + * invocations, unlike terminated ones, are still + * active: we want to skip memory writes for them but + * loads should still work. + */ uint32_t perquad = - is_load && !vir_in_nonuniform_control_flow(c) - ? GENERAL_TMU_LOOKUP_PER_QUAD - : GENERAL_TMU_LOOKUP_PER_PIXEL; + is_load && !vir_in_nonuniform_control_flow(c) && + ((c->s->info.stage == MESA_SHADER_FRAGMENT && + c->s->info.fs.needs_quad_helper_invocations && + !c->emitted_discard) || + c->s->info.uses_wide_subgroup_intrinsics) ? + GENERAL_TMU_LOOKUP_PER_QUAD : + GENERAL_TMU_LOOKUP_PER_PIXEL; config = 0xffffff00 | tmu_op << 3 | perquad; if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) { config |= GENERAL_TMU_LOOKUP_TYPE_VEC2; } else if (is_atomic || num_components == 1) { - config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI; + switch (type_size) { + case 4: + config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI; + break; + case 2: + config |= GENERAL_TMU_LOOKUP_TYPE_16BIT_UI; + break; + case 1: + config |= GENERAL_TMU_LOOKUP_TYPE_8BIT_UI; + break; + default: + unreachable("Unsupported bitsize"); + } } else { + assert(type_size == 4); config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 + num_components - 2; } @@ -637,7 +690,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, emit_tmu_general_address_write(c, mode, instr, config, dynamic_src, offset_src, base_offset, const_offset, - &tmu_writes); + dest_components, &tmu_writes); assert(tmu_writes > 0); if (mode == MODE_COUNT) { @@ -660,7 +713,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, */ const uint32_t component_mask = (1 << dest_components) - 1; - ntq_add_pending_tmu_flush(c, &instr->dest, + ntq_add_pending_tmu_flush(c, &instr->def, component_mask); } } @@ -673,7 +726,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, } static struct qreg * -ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def) +ntq_init_ssa_def(struct v3d_compile *c, nir_def *def) { struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, def->num_components); @@ -717,8 +770,8 @@ is_ldunif_signal(const struct v3d_qpu_sig *sig) * its destination to be the NIR reg's destination */ void -ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan, - struct qreg result) +ntq_store_def(struct v3d_compile *c, nir_def *def, int chan, + struct qreg result) { struct qinst *last_inst = NULL; if (!list_is_empty(&c->cur_block->instructions)) @@ -731,23 +784,25 @@ 
ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan, assert(result.file == QFILE_TEMP && last_inst && (last_inst == c->defs[result.index] || is_reused_uniform)); - if (dest->is_ssa) { - assert(chan < dest->ssa.num_components); + nir_intrinsic_instr *store = nir_store_reg_for_def(def); + if (store == NULL) { + assert(chan < def->num_components); struct qreg *qregs; struct hash_entry *entry = - _mesa_hash_table_search(c->def_ht, &dest->ssa); + _mesa_hash_table_search(c->def_ht, def); if (entry) qregs = entry->data; else - qregs = ntq_init_ssa_def(c, &dest->ssa); + qregs = ntq_init_ssa_def(c, def); qregs[chan] = result; } else { - nir_register *reg = dest->reg.reg; - assert(dest->reg.base_offset == 0); - assert(reg->num_array_elems == 0); + nir_def *reg = store->src[1].ssa; + ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg); + assert(nir_intrinsic_base(store) == 0); + assert(nir_intrinsic_num_array_elems(decl) == 0); struct hash_entry *entry = _mesa_hash_table_search(c->def_ht, reg); struct qreg *qregs = entry->data; @@ -802,7 +857,9 @@ struct qreg ntq_get_src(struct v3d_compile *c, nir_src src, int i) { struct hash_entry *entry; - if (src.is_ssa) { + + nir_intrinsic_instr *load = nir_load_reg_for_def(src.ssa); + if (load == NULL) { assert(i < src.ssa->num_components); entry = _mesa_hash_table_search(c->def_ht, src.ssa); @@ -811,10 +868,11 @@ ntq_get_src(struct v3d_compile *c, nir_src src, int i) entry = _mesa_hash_table_search(c->def_ht, src.ssa); } } else { - nir_register *reg = src.reg.reg; - assert(reg->num_array_elems == 0); - assert(src.reg.base_offset == 0); - assert(i < reg->num_components); + nir_def *reg = load->src[0].ssa; + ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg); + assert(nir_intrinsic_base(load) == 0); + assert(nir_intrinsic_num_array_elems(decl) == 0); + assert(i < nir_intrinsic_num_components(decl)); if (_mesa_set_search(c->tmu.outstanding_regs, reg)) ntq_flush_tmu(c); @@ -830,13 +888,8 @@ static struct qreg ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr, unsigned src) { - assert(util_is_power_of_two_or_zero(instr->dest.write_mask)); - unsigned chan = ffs(instr->dest.write_mask) - 1; struct qreg r = ntq_get_src(c, instr->src[src].src, - instr->src[src].swizzle[chan]); - - assert(!instr->src[src].abs); - assert(!instr->src[src].negate); + instr->src[src].swizzle[0]); return r; }; @@ -876,6 +929,7 @@ ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr) case GLSL_SAMPLER_DIM_3D: case GLSL_SAMPLER_DIM_CUBE: case GLSL_SAMPLER_DIM_BUF: + case GLSL_SAMPLER_DIM_EXTERNAL: /* Don't minify the array size. 
*/ if (!(instr->is_array && i == dest_size - 1)) { size = ntq_minify(c, size, lod); @@ -890,7 +944,7 @@ ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr) unreachable("Bad sampler type"); } - ntq_store_dest(c, &instr->dest, i, size); + ntq_store_def(c, &instr->def, i, size); } } @@ -905,12 +959,12 @@ ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) */ switch (instr->op) { case nir_texop_query_levels: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit)); return; case nir_texop_texture_samples: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_TEXTURE_SAMPLES, unit)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_TEXTURE_SAMPLES, unit)); return; case nir_texop_txs: ntq_emit_txs(c, instr); @@ -919,10 +973,7 @@ ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) break; } - if (c->devinfo->ver >= 40) - v3d40_vir_emit_tex(c, instr); - else - v3d33_vir_emit_tex(c, instr); + v3d_vir_emit_tex(c, instr); } static struct qreg @@ -963,44 +1014,43 @@ emit_fragcoord_input(struct v3d_compile *c, int attr) static struct qreg emit_smooth_varying(struct v3d_compile *c, - struct qreg vary, struct qreg w, struct qreg r5) + struct qreg vary, struct qreg w, struct qreg c_reg) { - return vir_FADD(c, vir_FMUL(c, vary, w), r5); + return vir_FADD(c, vir_FMUL(c, vary, w), c_reg); } static struct qreg emit_noperspective_varying(struct v3d_compile *c, - struct qreg vary, struct qreg r5) + struct qreg vary, struct qreg c_reg) { - return vir_FADD(c, vir_MOV(c, vary), r5); + return vir_FADD(c, vir_MOV(c, vary), c_reg); } static struct qreg emit_flat_varying(struct v3d_compile *c, - struct qreg vary, struct qreg r5) + struct qreg vary, struct qreg c_reg) { vir_MOV_dest(c, c->undef, vary); - return vir_MOV(c, r5); + return vir_MOV(c, c_reg); } static struct qreg emit_fragment_varying(struct v3d_compile *c, nir_variable *var, int8_t input_idx, uint8_t swizzle, int array_index) { - struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3); - struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5); + struct qreg c_reg; /* C coefficient */ + + if (c->devinfo->has_accumulators) + c_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5); + else + c_reg = vir_reg(QFILE_REG, 0); struct qinst *ldvary = NULL; struct qreg vary; - if (c->devinfo->ver >= 41) { - ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef, - c->undef, c->undef); - ldvary->qpu.sig.ldvary = true; - vary = vir_emit_def(c, ldvary); - } else { - vir_NOP(c)->qpu.sig.ldvary = true; - vary = r3; - } + ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef, + c->undef, c->undef); + ldvary->qpu.sig.ldvary = true; + vary = vir_emit_def(c, ldvary); /* Store the input value before interpolation so we can implement * GLSL's interpolateAt functions if the shader uses them. 
@@ -1008,7 +1058,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, if (input_idx >= 0) { assert(var); c->interp[input_idx].vp = vary; - c->interp[input_idx].C = vir_MOV(c, r5); + c->interp[input_idx].C = vir_MOV(c, c_reg); c->interp[input_idx].mode = var->data.interpolation; } @@ -1018,7 +1068,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, */ if (!var) { assert(input_idx < 0); - return emit_smooth_varying(c, vary, c->payload_w, r5); + return emit_smooth_varying(c, vary, c->payload_w, c_reg); } int i = c->num_inputs++; @@ -1033,20 +1083,20 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, if (var->data.centroid) { BITSET_SET(c->centroid_flags, i); result = emit_smooth_varying(c, vary, - c->payload_w_centroid, r5); + c->payload_w_centroid, c_reg); } else { - result = emit_smooth_varying(c, vary, c->payload_w, r5); + result = emit_smooth_varying(c, vary, c->payload_w, c_reg); } break; case INTERP_MODE_NOPERSPECTIVE: BITSET_SET(c->noperspective_flags, i); - result = emit_noperspective_varying(c, vary, r5); + result = emit_noperspective_varying(c, vary, c_reg); break; case INTERP_MODE_FLAT: BITSET_SET(c->flat_shade_flags, i); - result = emit_flat_varying(c, vary, r5); + result = emit_flat_varying(c, vary, c_reg); break; default: @@ -1163,16 +1213,6 @@ ntq_emit_comparison(struct v3d_compile *c, vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC); break; - case nir_op_i2b32: - vir_set_pf(c, vir_MOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ); - cond_invert = true; - break; - - case nir_op_f2b32: - vir_set_pf(c, vir_FMOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ); - cond_invert = true; - break; - default: return false; } @@ -1188,7 +1228,7 @@ ntq_emit_comparison(struct v3d_compile *c, static struct nir_alu_instr * ntq_get_alu_parent(nir_src src) { - if (!src.is_ssa || src.ssa->parent_instr->type != nir_instr_type_alu) + if (src.ssa->parent_instr->type != nir_instr_type_alu) return NULL; nir_alu_instr *instr = nir_instr_as_alu(src.ssa->parent_instr); if (!instr) @@ -1199,7 +1239,7 @@ ntq_get_alu_parent(nir_src src) * src. */ for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { - if (!instr->src[i].src.is_ssa) + if (nir_load_reg_for_def(instr->src[i].src.ssa)) return NULL; } @@ -1242,12 +1282,78 @@ ntq_emit_cond_to_bool(struct v3d_compile *c, enum v3d_qpu_cond cond) return result; } +static struct qreg +ntq_emit_cond_to_int(struct v3d_compile *c, enum v3d_qpu_cond cond) +{ + struct qreg result = + vir_MOV(c, vir_SEL(c, cond, + vir_uniform_ui(c, 1), + vir_uniform_ui(c, 0))); + c->flags_temp = result.index; + c->flags_cond = cond; + return result; +} + +static struct qreg +f2f16_rtz(struct v3d_compile *c, struct qreg f32) +{ + /* The GPU doesn't provide a mechanism to modify the f32->f16 rounding + * method and seems to be using RTE by default, so we need to implement + * RTZ rounding in software. 
+ */ + struct qreg rf16 = vir_FMOV(c, f32); + vir_set_pack(c->defs[rf16.index], V3D_QPU_PACK_L); + + struct qreg rf32 = vir_FMOV(c, rf16); + vir_set_unpack(c->defs[rf32.index], 0, V3D_QPU_UNPACK_L); + + struct qreg f32_abs = vir_FMOV(c, f32); + vir_set_unpack(c->defs[f32_abs.index], 0, V3D_QPU_UNPACK_ABS); + + struct qreg rf32_abs = vir_FMOV(c, rf32); + vir_set_unpack(c->defs[rf32_abs.index], 0, V3D_QPU_UNPACK_ABS); + + vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), f32_abs, rf32_abs), + V3D_QPU_PF_PUSHN); + return vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA, + vir_SUB(c, rf16, vir_uniform_ui(c, 1)), rf16)); +} + +/** + * Takes the result value of a signed integer width conversion from a smaller + * type to a larger type and if needed, it applies sign extension to it. + */ +static struct qreg +sign_extend(struct v3d_compile *c, + struct qreg value, + uint32_t src_bit_size, + uint32_t dst_bit_size) +{ + assert(src_bit_size < dst_bit_size); + + struct qreg tmp = vir_MOV(c, value); + + /* Do we need to sign-extend? */ + uint32_t sign_mask = 1 << (src_bit_size - 1); + struct qinst *sign_check = + vir_AND_dest(c, vir_nop_reg(), + tmp, vir_uniform_ui(c, sign_mask)); + vir_set_pf(c, sign_check, V3D_QPU_PF_PUSHZ); + + /* If so, fill in leading sign bits */ + uint32_t extend_bits = ~(((1 << src_bit_size) - 1)) & + ((1ull << dst_bit_size) - 1); + struct qinst *extend_inst = + vir_OR_dest(c, tmp, tmp, + vir_uniform_ui(c, extend_bits)); + vir_set_cond(extend_inst, V3D_QPU_COND_IFNA); + + return tmp; +} + static void ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) { - /* This should always be lowered to ALU operations for V3D. */ - assert(!instr->dest.saturate); - /* Vectors are special in that they have non-scalarized writemasks, * and just take the first swizzle channel for each argument in order * into each writemask channel. @@ -1260,8 +1366,8 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) srcs[i] = ntq_get_src(c, instr->src[i].src, instr->src[i].swizzle[0]); for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) - ntq_store_dest(c, &instr->dest.dest, i, - vir_MOV(c, srcs[i])); + ntq_store_def(c, &instr->def, i, + vir_MOV(c, srcs[i])); return; } @@ -1327,6 +1433,94 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) result = vir_AND(c, src[0], vir_uniform_ui(c, 1)); break; + case nir_op_f2f16: + case nir_op_f2f16_rtne: + assert(nir_src_bit_size(instr->src[0].src) == 32); + result = vir_FMOV(c, src[0]); + vir_set_pack(c->defs[result.index], V3D_QPU_PACK_L); + break; + + case nir_op_f2f16_rtz: + assert(nir_src_bit_size(instr->src[0].src) == 32); + result = f2f16_rtz(c, src[0]); + break; + + case nir_op_f2f32: + assert(nir_src_bit_size(instr->src[0].src) == 16); + result = vir_FMOV(c, src[0]); + vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L); + break; + + case nir_op_i2i16: { + uint32_t bit_size = nir_src_bit_size(instr->src[0].src); + assert(bit_size == 32 || bit_size == 8); + if (bit_size == 32) { + /* We don't have integer pack/unpack methods for + * converting between 16-bit and 32-bit, so we implement + * the conversion manually by truncating the src. 
+ */ + result = vir_AND(c, src[0], vir_uniform_ui(c, 0xffff)); + } else { + struct qreg tmp = vir_AND(c, src[0], + vir_uniform_ui(c, 0xff)); + result = vir_MOV(c, sign_extend(c, tmp, bit_size, 16)); + } + break; + } + + case nir_op_u2u16: { + uint32_t bit_size = nir_src_bit_size(instr->src[0].src); + assert(bit_size == 32 || bit_size == 8); + + /* We don't have integer pack/unpack methods for converting + * between 16-bit and 32-bit, so we implement the conversion + * manually by truncating the src. For the 8-bit case, we + * want to make sure we don't copy garbage from any of the + * 24 MSB bits. + */ + if (bit_size == 32) + result = vir_AND(c, src[0], vir_uniform_ui(c, 0xffff)); + else + result = vir_AND(c, src[0], vir_uniform_ui(c, 0xff)); + break; + } + + case nir_op_i2i8: + case nir_op_u2u8: + assert(nir_src_bit_size(instr->src[0].src) == 32 || + nir_src_bit_size(instr->src[0].src) == 16); + /* We don't have integer pack/unpack methods for converting + * between 8-bit and 32-bit, so we implement the conversion + * manually by truncating the src. + */ + result = vir_AND(c, src[0], vir_uniform_ui(c, 0xff)); + break; + + case nir_op_u2u32: { + uint32_t bit_size = nir_src_bit_size(instr->src[0].src); + assert(bit_size == 16 || bit_size == 8); + + /* we don't have a native 8-bit/16-bit MOV so we copy all 32-bit + * from the src but we make sure to clear any garbage bits that + * may be present in the invalid src bits. + */ + uint32_t mask = (1 << bit_size) - 1; + result = vir_AND(c, src[0], vir_uniform_ui(c, mask)); + break; + } + + case nir_op_i2i32: { + uint32_t bit_size = nir_src_bit_size(instr->src[0].src); + assert(bit_size == 16 || bit_size == 8); + + uint32_t mask = (1 << bit_size) - 1; + struct qreg tmp = vir_AND(c, src[0], + vir_uniform_ui(c, mask)); + + result = vir_MOV(c, sign_extend(c, tmp, bit_size, 32)); + break; + } + case nir_op_iadd: result = vir_ADD(c, src[0], src[1]); break; @@ -1390,8 +1584,6 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) break; } - case nir_op_i2b32: - case nir_op_f2b32: case nir_op_feq32: case nir_op_fneu32: case nir_op_fge32: @@ -1485,13 +1677,35 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) case nir_op_uadd_carry: vir_set_pf(c, vir_ADD_dest(c, vir_nop_reg(), src[0], src[1]), V3D_QPU_PF_PUSHC); - result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA); + result = ntq_emit_cond_to_int(c, V3D_QPU_COND_IFA); + break; + + case nir_op_usub_borrow: + vir_set_pf(c, vir_SUB_dest(c, vir_nop_reg(), src[0], src[1]), + V3D_QPU_PF_PUSHC); + result = ntq_emit_cond_to_int(c, V3D_QPU_COND_IFA); break; case nir_op_pack_half_2x16_split: result = vir_VFPACK(c, src[0], src[1]); break; + case nir_op_pack_2x32_to_2x16_v3d: + result = vir_VPACK(c, src[0], src[1]); + break; + + case nir_op_pack_32_to_r11g11b10_v3d: + result = vir_V11FPACK(c, src[0], src[1]); + break; + + case nir_op_pack_uint_32_to_r10g10b10a2_v3d: + result = vir_V10PACK(c, src[0], src[1]); + break; + + case nir_op_pack_4x16_to_4x8_v3d: + result = vir_V8PACK(c, src[0], src[1]); + break; + case nir_op_unpack_half_2x16_split_x: result = vir_FMOV(c, src[0]); vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L); @@ -1502,26 +1716,29 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_H); break; - case nir_op_fquantize2f16: { - /* F32 -> F16 -> F32 conversion */ - struct qreg tmp = vir_FMOV(c, src[0]); - vir_set_pack(c->defs[tmp.index], V3D_QPU_PACK_L); - tmp = vir_FMOV(c, tmp); - vir_set_unpack(c->defs[tmp.index], 0, 
V3D_QPU_UNPACK_L); + case nir_op_pack_2x16_to_unorm_2x8_v3d: + result = vir_VFTOUNORM8(c, src[0]); + break; - /* Check for denorm */ - struct qreg abs_src = vir_FMOV(c, src[0]); - vir_set_unpack(c->defs[abs_src.index], 0, V3D_QPU_UNPACK_ABS); - struct qreg threshold = vir_uniform_f(c, ldexpf(1.0, -14)); - vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), abs_src, threshold), - V3D_QPU_PF_PUSHC); + case nir_op_pack_2x16_to_snorm_2x8_v3d: + result = vir_VFTOSNORM8(c, src[0]); + break; - /* Return +/-0 for denorms */ - struct qreg zero = - vir_AND(c, src[0], vir_uniform_ui(c, 0x80000000)); - result = vir_FMOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, tmp, zero)); + case nir_op_pack_2x16_to_unorm_2x10_v3d: + result = vir_VFTOUNORM10LO(c, src[0]); + break; + + case nir_op_pack_2x16_to_unorm_10_2_v3d: + result = vir_VFTOUNORM10HI(c, src[0]); + break; + + case nir_op_f2unorm_16_v3d: + result = vir_FTOUNORM16(c, src[0]); + break; + + case nir_op_f2snorm_16_v3d: + result = vir_FTOSNORM16(c, src[0]); break; - } default: fprintf(stderr, "unknown NIR ALU inst: "); @@ -1530,17 +1747,12 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) abort(); } - /* We have a scalar result, so the instruction should only have a - * single channel written to. - */ - assert(util_is_power_of_two_or_zero(instr->dest.write_mask)); - ntq_store_dest(c, &instr->dest.dest, - ffs(instr->dest.write_mask) - 1, result); + ntq_store_def(c, &instr->def, 0, result); } /* Each TLB read/write setup (a render target or depth buffer) takes an 8-bit * specifier. They come from a register that's preloaded with 0xffffffff - * (0xff gets you normal vec4 f16 RT0 writes), and when one is neaded the low + * (0xff gets you normal vec4 f16 RT0 writes), and when one is needed the low * 8 bits are shifted off the bottom and 0xff shifted in from the top. */ #define TLB_TYPE_F16_COLOR (3 << 6) @@ -1670,15 +1882,6 @@ vir_emit_tlb_color_write(struct v3d_compile *c, unsigned rt) static void emit_frag_end(struct v3d_compile *c) { - /* If the shader has no non-TLB side effects and doesn't write Z - * we can promote it to enabling early_fragment_tests even - * if the user didn't. - */ - if (c->output_position_index == -1 && - !(c->s->info.num_images || c->s->info.num_ssbos)) { - c->s->info.fs.early_fragment_tests = true; - } - if (c->output_sample_mask_index != -1) { vir_SETMSF_dest(c, vir_nop_reg(), vir_AND(c, @@ -1703,55 +1906,75 @@ emit_frag_end(struct v3d_compile *c) } struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU); - if (c->output_position_index != -1 && - !c->s->info.fs.early_fragment_tests) { - struct qinst *inst = vir_MOV_dest(c, tlbu_reg, - c->outputs[c->output_position_index]); - uint8_t tlb_specifier = TLB_TYPE_DEPTH; - if (c->devinfo->ver >= 42) { - tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL | - TLB_SAMPLE_MODE_PER_PIXEL); - } else - tlb_specifier |= TLB_DEPTH_TYPE_PER_PIXEL; + /* If the shader has no non-TLB side effects and doesn't write Z + * we can promote it to enabling early_fragment_tests even + * if the user didn't. 
+ */ + if (c->output_position_index == -1 && + !(c->s->info.num_images || c->s->info.num_ssbos) && + !c->s->info.fs.uses_discard && + !c->s->info.fs.uses_demote && + !c->fs_key->sample_alpha_to_coverage && + c->output_sample_mask_index == -1 && + has_any_tlb_color_write) { + c->s->info.fs.early_fragment_tests = true; + } - inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, - tlb_specifier | - 0xffffff00); + /* By default, Z buffer writes are implicit using the Z values produced + * from FEP (Z value produced from rasterization). When this is not + * desirable (shader writes Z explicitly, has discards, etc) we need + * to let the hardware know by setting c->writes_z to true, in which + * case we always need to write a Z value from the QPU, even if it is + * just the passthrough Z value produced from FEP. + * + * Also, from the V3D 4.2 spec: + * + * "If a shader performs a Z read the “Fragment shader does Z writes” + * bit in the shader record must be enabled to ensure deterministic + * results" + * + * So if c->reads_z is set we always need to write Z, even if it is + * a passthrough from the Z value produced from FEP. + */ + if (!c->s->info.fs.early_fragment_tests || c->reads_z) { c->writes_z = true; - } else if (c->s->info.fs.uses_discard || - !c->s->info.fs.early_fragment_tests || - c->fs_key->sample_alpha_to_coverage || - !has_any_tlb_color_write) { - /* Emit passthrough Z if it needed to be delayed until shader - * end due to potential discards. - * - * Since (single-threaded) fragment shaders always need a TLB - * write, emit passthrouh Z if we didn't have any color - * buffers and flag us as potentially discarding, so that we - * can use Z as the TLB write. - */ - c->s->info.fs.uses_discard = true; - - struct qinst *inst = vir_MOV_dest(c, tlbu_reg, - vir_nop_reg()); uint8_t tlb_specifier = TLB_TYPE_DEPTH; + struct qinst *inst; + + if (c->output_position_index != -1) { + /* Shader writes to gl_FragDepth, use that */ + inst = vir_MOV_dest(c, tlbu_reg, + c->outputs[c->output_position_index]); + + tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL | + TLB_SAMPLE_MODE_PER_PIXEL); + } else { + /* Shader doesn't write to gl_FragDepth, take Z from + * FEP. + */ + c->writes_z_from_fep = true; + inst = vir_MOV_dest(c, tlbu_reg, vir_nop_reg()); - if (c->devinfo->ver >= 42) { /* The spec says the PER_PIXEL flag is ignored for * invariant writes, but the simulator demands it. */ tlb_specifier |= (TLB_V42_DEPTH_TYPE_INVARIANT | TLB_SAMPLE_MODE_PER_PIXEL); - } else { - tlb_specifier |= TLB_DEPTH_TYPE_INVARIANT; + + /* Since (single-threaded) fragment shaders always need + * a TLB write, if we dond't have any we emit a + * passthrouh Z and flag us as potentially discarding, + * so that we can use Z as the required TLB write. 
+ */ + if (!has_any_tlb_color_write) + c->s->info.fs.uses_discard = true; } - inst->uniform = vir_get_uniform_index(c, - QUNIFORM_CONSTANT, + inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, tlb_specifier | 0xffffff00); - c->writes_z = true; + inst->is_tlb_z_write = true; } /* XXX: Performance improvement: Merge Z write and color writes TLB @@ -1767,7 +1990,6 @@ vir_VPM_WRITE_indirect(struct v3d_compile *c, struct qreg vpm_index, bool uniform_vpm_index) { - assert(c->devinfo->ver >= 40); if (uniform_vpm_index) vir_STVPMV(c, vpm_index, val); else @@ -1777,13 +1999,8 @@ vir_VPM_WRITE_indirect(struct v3d_compile *c, static void vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index) { - if (c->devinfo->ver >= 40) { - vir_VPM_WRITE_indirect(c, val, - vir_uniform_ui(c, vpm_index), true); - } else { - /* XXX: v3d33_vir_vpm_write_setup(c); */ - vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val); - } + vir_VPM_WRITE_indirect(c, val, + vir_uniform_ui(c, vpm_index), true); } static void @@ -1791,7 +2008,7 @@ emit_vert_end(struct v3d_compile *c) { /* GFXH-1684: VPM writes need to be complete by the end of the shader. */ - if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42) + if (c->devinfo->ver == 42) vir_VPMWT(c); } @@ -1800,7 +2017,7 @@ emit_geom_end(struct v3d_compile *c) { /* GFXH-1684: VPM writes need to be complete by the end of the shader. */ - if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42) + if (c->devinfo->ver == 42) vir_VPMWT(c); } @@ -1812,8 +2029,11 @@ mem_vectorize_callback(unsigned align_mul, unsigned align_offset, nir_intrinsic_instr *high, void *data) { - /* Our backend is 32-bit only at present */ - if (bit_size != 32) + /* TMU general access only supports 32-bit vectors */ + if (bit_size > 32) + return false; + + if ((bit_size == 8 || bit_size == 16) && num_components > 1) return false; if (align_mul % 4 != 0 || align_offset % 4 != 0) @@ -1843,7 +2063,29 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s) do { progress = false; - NIR_PASS_V(s, nir_lower_vars_to_ssa); + NIR_PASS(progress, s, nir_split_array_vars, nir_var_function_temp); + NIR_PASS(progress, s, nir_shrink_vec_array_vars, nir_var_function_temp); + NIR_PASS(progress, s, nir_opt_deref); + + NIR_PASS(progress, s, nir_lower_vars_to_ssa); + if (!s->info.var_copies_lowered) { + /* Only run this pass if nir_lower_var_copies was not called + * yet. That would lower away any copy_deref instructions and we + * don't want to introduce any more. 
+ */ + NIR_PASS(progress, s, nir_opt_find_array_copies); + } + + NIR_PASS(progress, s, nir_opt_copy_prop_vars); + NIR_PASS(progress, s, nir_opt_dead_write_vars); + NIR_PASS(progress, s, nir_opt_combine_stores, nir_var_all); + + NIR_PASS(progress, s, nir_remove_dead_variables, + (nir_variable_mode)(nir_var_function_temp | + nir_var_shader_temp | + nir_var_mem_shared), + NULL); + NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL); NIR_PASS(progress, s, nir_lower_phis_to_scalar, false); NIR_PASS(progress, s, nir_copy_prop); @@ -1851,10 +2093,39 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s) NIR_PASS(progress, s, nir_opt_dce); NIR_PASS(progress, s, nir_opt_dead_cf); NIR_PASS(progress, s, nir_opt_cse); - NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true); + NIR_PASS(progress, s, nir_opt_peephole_select, 0, false, false); + NIR_PASS(progress, s, nir_opt_peephole_select, 24, true, true); NIR_PASS(progress, s, nir_opt_algebraic); NIR_PASS(progress, s, nir_opt_constant_folding); + NIR_PASS(progress, s, nir_opt_intrinsics); + NIR_PASS(progress, s, nir_opt_idiv_const, 32); + NIR_PASS(progress, s, nir_lower_alu); + + if (nir_opt_loop(s)) { + progress = true; + NIR_PASS(progress, s, nir_copy_prop); + NIR_PASS(progress, s, nir_opt_dce); + } + + NIR_PASS(progress, s, nir_opt_conditional_discard); + + NIR_PASS(progress, s, nir_opt_remove_phis); + NIR_PASS(progress, s, nir_opt_if, false); + if (c && !c->disable_gcm) { + bool local_progress = false; + NIR_PASS(local_progress, s, nir_opt_gcm, false); + c->gcm_progress |= local_progress; + progress |= local_progress; + } + + /* Note that vectorization may undo the load/store scalarization + * pass we run for non 32-bit TMU general load/store by + * converting, for example, 2 consecutive 16-bit loads into a + * single 32-bit load. This is fine (and desirable) as long as + * the resulting 32-bit load meets 32-bit alignment requirements, + * which mem_vectorize_callback() should be enforcing. + */ nir_load_store_vectorize_options vectorize_opts = { .modes = nir_var_mem_ssbo | nir_var_mem_ubo | nir_var_mem_push_const | nir_var_mem_shared | @@ -1862,7 +2133,24 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s) .callback = mem_vectorize_callback, .robust_modes = 0, }; - NIR_PASS(progress, s, nir_opt_load_store_vectorize, &vectorize_opts); + bool vectorize_progress = false; + + + /* This requires that we have called + * nir_lower_vars_to_explicit_types / nir_lower_explicit_io + * first, which we may not have done yet if we call here too + * early durign NIR pre-processing. 
We can detect this because + * in that case we won't have a compile object + */ + if (c) { + NIR_PASS(vectorize_progress, s, nir_opt_load_store_vectorize, + &vectorize_opts); + if (vectorize_progress) { + NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL); + NIR_PASS(progress, s, nir_lower_pack); + progress = true; + } + } if (lower_flrp != 0) { bool lower_flrp_progress = false; @@ -1895,10 +2183,8 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s) nir_move_options sink_opts = nir_move_const_undef | nir_move_comparisons | nir_move_copies | - nir_move_load_ubo; + nir_move_load_ubo | nir_move_load_ssbo | nir_move_load_uniform; NIR_PASS(progress, s, nir_opt_sink, sink_opts); - - NIR_PASS(progress, s, nir_opt_move, nir_move_load_ubo); } static int @@ -1915,27 +2201,9 @@ ntq_emit_vpm_read(struct v3d_compile *c, uint32_t *remaining, uint32_t vpm_index) { - struct qreg vpm = vir_reg(QFILE_VPM, vpm_index); - - if (c->devinfo->ver >= 40 ) { - return vir_LDVPMV_IN(c, - vir_uniform_ui(c, - (*num_components_queued)++)); - } - - if (*num_components_queued != 0) { - (*num_components_queued)--; - return vir_MOV(c, vpm); - } - - uint32_t num_components = MIN2(*remaining, 32); - - v3d33_vir_vpm_read_setup(c, num_components); - - *num_components_queued = num_components - 1; - *remaining -= num_components; - - return vir_MOV(c, vpm); + return vir_LDVPMV_IN(c, + vir_uniform_ui(c, + (*num_components_queued)++)); } static void @@ -2005,31 +2273,8 @@ ntq_setup_vs_inputs(struct v3d_compile *c) } /* The actual loads will happen directly in nir_intrinsic_load_input - * on newer versions. */ - if (c->devinfo->ver >= 40) - return; - - for (int loc = 0; loc < ARRAY_SIZE(c->vattr_sizes); loc++) { - resize_qreg_array(c, &c->inputs, &c->inputs_array_size, - (loc + 1) * 4); - - for (int i = 0; i < c->vattr_sizes[loc]; i++) { - c->inputs[loc * 4 + i] = - ntq_emit_vpm_read(c, - &vpm_components_queued, - &num_components, - loc * 4 + i); - - } - } - - if (c->devinfo->ver >= 40) { - assert(vpm_components_queued == num_components); - } else { - assert(vpm_components_queued == 0); - assert(num_components == 0); - } + return; } static bool @@ -2058,14 +2303,14 @@ ntq_setup_gs_inputs(struct v3d_compile *c) */ assert(glsl_type_is_array(var->type)); const struct glsl_type *type = glsl_get_array_element(var->type); - unsigned array_len = MAX2(glsl_get_length(type), 1); + unsigned var_len = glsl_count_vec4_slots(type, false, false); unsigned loc = var->data.driver_location; resize_qreg_array(c, &c->inputs, &c->inputs_array_size, - (loc + array_len) * 4); + (loc + var_len) * 4); if (var->data.compact) { - for (unsigned j = 0; j < array_len; j++) { + for (unsigned j = 0; j < var_len; j++) { unsigned input_idx = c->num_inputs++; unsigned loc_frac = var->data.location_frac + j; unsigned loc = var->data.location + loc_frac / 4; @@ -2076,8 +2321,10 @@ ntq_setup_gs_inputs(struct v3d_compile *c) continue; } - for (unsigned j = 0; j < array_len; j++) { - unsigned num_elements = glsl_get_vector_elements(type); + for (unsigned j = 0; j < var_len; j++) { + unsigned num_elements = + glsl_type_is_struct(glsl_without_array(type)) ? 
+ 4 : glsl_get_vector_elements(type); for (unsigned k = 0; k < num_elements; k++) { unsigned chan = var->data.location_frac + k; unsigned input_idx = c->num_inputs++; @@ -2124,7 +2371,7 @@ ntq_setup_fs_inputs(struct v3d_compile *c) } else if (var->data.compact) { for (int j = 0; j < var_len; j++) emit_compact_fragment_input(c, loc, var, j); - } else if (glsl_type_is_struct(var->type)) { + } else if (glsl_type_is_struct(glsl_without_array(var->type))) { for (int j = 0; j < var_len; j++) { emit_fragment_input(c, loc, var, j, 4); } @@ -2143,12 +2390,9 @@ ntq_setup_outputs(struct v3d_compile *c) return; nir_foreach_shader_out_variable(var, c->s) { - unsigned array_len = MAX2(glsl_get_length(var->type), 1); + assert(glsl_type_is_vector_or_scalar(var->type)); unsigned loc = var->data.driver_location * 4; - assert(array_len == 1); - (void)array_len; - for (int i = 0; i < 4 - var->data.location_frac; i++) { add_output(c, loc + var->data.location_frac + i, var->data.location, @@ -2157,15 +2401,17 @@ ntq_setup_outputs(struct v3d_compile *c) switch (var->data.location) { case FRAG_RESULT_COLOR: - c->output_color_var[0] = var; - c->output_color_var[1] = var; - c->output_color_var[2] = var; - c->output_color_var[3] = var; + for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) + c->output_color_var[i] = var; break; case FRAG_RESULT_DATA0: case FRAG_RESULT_DATA1: case FRAG_RESULT_DATA2: case FRAG_RESULT_DATA3: + case FRAG_RESULT_DATA4: + case FRAG_RESULT_DATA5: + case FRAG_RESULT_DATA6: + case FRAG_RESULT_DATA7: c->output_color_var[var->data.location - FRAG_RESULT_DATA0] = var; break; @@ -2185,17 +2431,19 @@ ntq_setup_outputs(struct v3d_compile *c) * Each nir_register gets a struct qreg per 32-bit component being stored. */ static void -ntq_setup_registers(struct v3d_compile *c, struct exec_list *list) +ntq_setup_registers(struct v3d_compile *c, nir_function_impl *impl) { - foreach_list_typed(nir_register, nir_reg, node, list) { - unsigned array_len = MAX2(nir_reg->num_array_elems, 1); + nir_foreach_reg_decl(decl, impl) { + unsigned num_components = nir_intrinsic_num_components(decl); + unsigned array_len = nir_intrinsic_num_array_elems(decl); + array_len = MAX2(array_len, 1); struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, - array_len * - nir_reg->num_components); + array_len * num_components); + nir_def *nir_reg = &decl->def; _mesa_hash_table_insert(c->def_ht, nir_reg, qregs); - for (int i = 0; i < array_len * nir_reg->num_components; i++) + for (int i = 0; i < array_len * num_components; i++) qregs[i] = vir_get_temp(c); } } @@ -2222,23 +2470,23 @@ ntq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr) assert(nir_src_as_uint(instr->src[1]) == 0); - ntq_store_dest(c, &instr->dest, 0, + ntq_store_def(c, &instr->def, 0, vir_uniform(c, QUNIFORM_IMAGE_WIDTH, image_index)); if (instr->num_components > 1) { - ntq_store_dest(c, &instr->dest, 1, - vir_uniform(c, - instr->num_components == 2 && is_array ? - QUNIFORM_IMAGE_ARRAY_SIZE : - QUNIFORM_IMAGE_HEIGHT, - image_index)); + ntq_store_def(c, &instr->def, 1, + vir_uniform(c, + instr->num_components == 2 && is_array ? + QUNIFORM_IMAGE_ARRAY_SIZE : + QUNIFORM_IMAGE_HEIGHT, + image_index)); } if (instr->num_components > 2) { - ntq_store_dest(c, &instr->dest, 2, - vir_uniform(c, - is_array ? - QUNIFORM_IMAGE_ARRAY_SIZE : - QUNIFORM_IMAGE_DEPTH, - image_index)); + ntq_store_def(c, &instr->def, 2, + vir_uniform(c, + is_array ? 
+ QUNIFORM_IMAGE_ARRAY_SIZE : + QUNIFORM_IMAGE_DEPTH, + image_index)); } } @@ -2263,16 +2511,14 @@ vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr) * * To fix that, we make sure we always emit a thread switch before the * first tlb color read. If that happens to be the last thread switch - * we emit, then everything is fine, but otherwsie, if any code after + * we emit, then everything is fine, but otherwise, if any code after * this point needs to emit additional thread switches, then we will * switch the strategy to locking the scoreboard on the first thread * switch instead -- see vir_emit_thrsw(). */ if (!c->emitted_tlb_load) { - if (!c->last_thrsw_at_top_level) { - assert(c->devinfo->ver >= 41); + if (!c->last_thrsw_at_top_level) vir_emit_thrsw(c); - } c->emitted_tlb_load = true; } @@ -2371,27 +2617,96 @@ vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr) } assert(color_reads_for_sample[component].file != QFILE_NULL); - ntq_store_dest(c, &instr->dest, 0, - vir_MOV(c, color_reads_for_sample[component])); + ntq_store_def(c, &instr->def, 0, + vir_MOV(c, color_reads_for_sample[component])); +} + +static bool +ntq_emit_load_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr); + +static bool +try_emit_uniform(struct v3d_compile *c, + int offset, + int num_components, + nir_def *def, + enum quniform_contents contents) +{ + /* Even though ldunif is strictly 32-bit we can still use it + * to load scalar 8-bit/16-bit uniforms so long as their offset + * is 32-bit aligned. In this case, ldunif would still load + * 32-bit into the destination with the 8-bit/16-bit uniform + * data in the LSB and garbage in the MSB, but that is fine + * because we should only be accessing the valid bits of the + * destination. + * + * FIXME: if in the future we improve our register allocator to + * pack 2 16-bit variables in the MSB and LSB of the same + * register then this optimization would not be valid as is, + * since the load clobbers the MSB. + */ + if (offset % 4 != 0) + return false; + + /* We need dwords */ + offset = offset / 4; + + for (int i = 0; i < num_components; i++) { + ntq_store_def(c, def, i, vir_uniform(c, contents, offset + i)); + } + + return true; } static void ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr) { + /* We scalarize general TMU access for anything that is not 32-bit. 
*/ + assert(instr->def.bit_size == 32 || + instr->num_components == 1); + + /* Try to emit ldunif if possible, otherwise fallback to general TMU */ if (nir_src_is_const(instr->src[0])) { int offset = (nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0])); - assert(offset % 4 == 0); - /* We need dwords */ - offset = offset / 4; - for (int i = 0; i < instr->num_components; i++) { - ntq_store_dest(c, &instr->dest, i, - vir_uniform(c, QUNIFORM_UNIFORM, - offset + i)); + + if (try_emit_uniform(c, offset, instr->num_components, + &instr->def, QUNIFORM_UNIFORM)) { + return; + } + } + + if (!ntq_emit_load_unifa(c, instr)) { + ntq_emit_tmu_general(c, instr, false, false); + c->has_general_tmu_load = true; + } +} + +static bool +ntq_emit_inline_ubo_load(struct v3d_compile *c, nir_intrinsic_instr *instr) +{ + if (c->compiler->max_inline_uniform_buffers <= 0) + return false; + + /* Regular UBOs start after inline UBOs */ + uint32_t index = nir_src_as_uint(instr->src[0]); + if (index >= c->compiler->max_inline_uniform_buffers) + return false; + + /* We scalarize general TMU access for anything that is not 32-bit */ + assert(instr->def.bit_size == 32 || + instr->num_components == 1); + + if (nir_src_is_const(instr->src[1])) { + int offset = nir_src_as_uint(instr->src[1]); + if (try_emit_uniform(c, offset, instr->num_components, + &instr->def, + QUNIFORM_INLINE_UBO_0 + index)) { + return true; } - } else { - ntq_emit_tmu_general(c, instr, false); } + + /* Fallback to regular UBO load */ + return false; } static void @@ -2411,7 +2726,7 @@ ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr) unsigned offset = nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0]); - if (c->s->info.stage != MESA_SHADER_FRAGMENT && c->devinfo->ver >= 40) { + if (c->s->info.stage != MESA_SHADER_FRAGMENT) { /* Emit the LDVPM directly now, rather than at the top * of the shader like we did for V3D 3.x (which needs * vpmsetup when not just taking the next offset). @@ -2433,19 +2748,38 @@ ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr) SYSTEM_VALUE_VERTEX_ID)) { index++; } - for (int i = 0; i < offset; i++) - index += c->vattr_sizes[i]; + + for (int i = 0; i < offset; i++) { + /* GFXH-1602: if any builtins (vid, iid, etc) are read then + * attribute 0 must be active (size > 0). When we hit this, + * the driver is expected to program attribute 0 to have a + * size of 1, so here we need to add that. 
+ */ + if (i == 0 && c->vs_key->is_coord && + c->vattr_sizes[i] == 0 && index > 0) { + index++; + } else { + index += c->vattr_sizes[i]; + } + } + index += nir_intrinsic_component(instr); for (int i = 0; i < instr->num_components; i++) { struct qreg vpm_offset = vir_uniform_ui(c, index++); - ntq_store_dest(c, &instr->dest, i, - vir_LDVPMV_IN(c, vpm_offset)); + ntq_store_def(c, &instr->def, i, + vir_LDVPMV_IN(c, vpm_offset)); } } else { for (int i = 0; i < instr->num_components; i++) { int comp = nir_intrinsic_component(instr) + i; - ntq_store_dest(c, &instr->dest, i, - vir_MOV(c, c->inputs[offset * 4 + comp])); + struct qreg input = c->inputs[offset * 4 + comp]; + ntq_store_def(c, &instr->def, i, vir_MOV(c, input)); + + if (c->s->info.stage == MESA_SHADER_FRAGMENT && + input.file == c->payload_z.file && + input.index == c->payload_z.index) { + c->reads_z = true; + } } } } @@ -2610,18 +2944,18 @@ ntq_get_barycentric_centroid(struct v3d_compile *c, /* sN = TRUE if sample N enabled in sample mask, FALSE otherwise */ struct qreg F = vir_uniform_ui(c, 0); struct qreg T = vir_uniform_ui(c, ~0); - struct qreg s0 = vir_XOR(c, vir_AND(c, sample_mask, i1), i1); + struct qreg s0 = vir_AND(c, sample_mask, i1); vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ); - s0 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); - struct qreg s1 = vir_XOR(c, vir_AND(c, sample_mask, i2), i2); + s0 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F); + struct qreg s1 = vir_AND(c, sample_mask, i2); vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ); - s1 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); - struct qreg s2 = vir_XOR(c, vir_AND(c, sample_mask, i4), i4); + s1 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F); + struct qreg s2 = vir_AND(c, sample_mask, i4); vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ); - s2 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); - struct qreg s3 = vir_XOR(c, vir_AND(c, sample_mask, i8), i8); + s2 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F); + struct qreg s3 = vir_AND(c, sample_mask, i8); vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s3), V3D_QPU_PF_PUSHZ); - s3 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); + s3 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F); /* sample_idx = s0 ? 0 : s2 ? 2 : s1 ? 1 : 3 */ struct qreg sample_idx = i3; @@ -2708,28 +3042,142 @@ emit_ldunifa(struct v3d_compile *c, struct qreg *result) c->current_unifa_offset += 4; } -static void -ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr) +/* Checks if the value of a nir src is derived from a nir register */ +static bool +nir_src_derived_from_reg(nir_src src) +{ + nir_def *def = src.ssa; + if (nir_load_reg_for_def(def)) + return true; + + nir_instr *parent = def->parent_instr; + switch (parent->type) { + case nir_instr_type_alu: { + nir_alu_instr *alu = nir_instr_as_alu(parent); + int num_srcs = nir_op_infos[alu->op].num_inputs; + for (int i = 0; i < num_srcs; i++) { + if (nir_src_derived_from_reg(alu->src[i].src)) + return true; + } + return false; + } + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent); + int num_srcs = nir_intrinsic_infos[intr->intrinsic].num_srcs; + for (int i = 0; i < num_srcs; i++) { + if (nir_src_derived_from_reg(intr->src[i])) + return true; + } + return false; + } + case nir_instr_type_load_const: + case nir_instr_type_undef: + return false; + default: + /* By default we assume it may come from a register, the above + * cases should be able to handle the majority of situations + * though. 
+ */ + return true; + }; +} + +static bool +ntq_emit_load_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr) { + assert(instr->intrinsic == nir_intrinsic_load_ubo || + instr->intrinsic == nir_intrinsic_load_ssbo || + instr->intrinsic == nir_intrinsic_load_uniform); + + bool is_uniform = instr->intrinsic == nir_intrinsic_load_uniform; + bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo; + bool is_ssbo = instr->intrinsic == nir_intrinsic_load_ssbo; + /* Every ldunifa auto-increments the unifa address by 4 bytes, so our * current unifa offset is 4 bytes ahead of the offset of the last load. */ static const int32_t max_unifa_skip_dist = MAX_UNIFA_SKIP_DISTANCE - 4; - bool dynamic_src = !nir_src_is_const(instr->src[1]); - uint32_t const_offset = - dynamic_src ? 0 : nir_src_as_uint(instr->src[1]); + /* We can only use unifa if the offset is uniform */ + nir_src offset = is_uniform ? instr->src[0] : instr->src[1]; + if (nir_src_is_divergent(offset)) + return false; - /* On OpenGL QUNIFORM_UBO_ADDR takes a UBO index - * shifted up by 1 (0 is gallium's constant buffer 0). + /* Emitting loads from unifa may not be safe under non-uniform control + * flow. It seems the address that is used to write to the unifa + * register is taken from the first lane and if that lane is disabled + * by control flow then the value we read may be bogus and lead to + * invalid memory accesses on follow-up ldunifa instructions. However, + * ntq_store_def only emits conditional writes for nir registersas long + * we can be certain that the offset isn't derived from a load_reg we + * should be fine. + * + * The following CTS test can be used to trigger the problem, which + * causes a GMP violations in the sim without this check: + * dEQP-VK.subgroups.ballot_broadcast.graphics.subgroupbroadcastfirst_int */ - uint32_t index = nir_src_as_uint(instr->src[0]); - if (c->key->environment == V3D_ENVIRONMENT_OPENGL) + if (vir_in_nonuniform_control_flow(c) && + nir_src_derived_from_reg(offset)) { + return false; + } + + /* We can only use unifa with SSBOs if they are read-only. Otherwise + * ldunifa won't see the shader writes to that address (possibly + * because ldunifa doesn't read from the L2T cache). + */ + if (is_ssbo && !(nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE)) + return false; + + /* Just as with SSBOs, we can't use ldunifa to read indirect uniforms + * that we may have been written to scratch using the TMU. + */ + bool dynamic_src = !nir_src_is_const(offset); + if (is_uniform && dynamic_src && c->s->scratch_size > 0) + return false; + + uint32_t const_offset = dynamic_src ? 0 : nir_src_as_uint(offset); + if (is_uniform) + const_offset += nir_intrinsic_base(instr); + + /* ldunifa is a 32-bit load instruction so we can only use it with + * 32-bit aligned addresses. We always produce 32-bit aligned addresses + * except for types smaller than 32-bit, so in these cases we can only + * use ldunifa if we can verify alignment, which we can only do for + * loads with a constant offset. + */ + uint32_t bit_size = instr->def.bit_size; + uint32_t value_skips = 0; + if (bit_size < 32) { + if (dynamic_src) { + return false; + } else if (const_offset % 4 != 0) { + /* If we are loading from an unaligned offset, fix + * alignment and skip over unused elements in result. 
+ */ + value_skips = (const_offset % 4) / (bit_size / 8); + const_offset &= ~0x3; + } + } + + assert((bit_size == 32 && value_skips == 0) || + (bit_size == 16 && value_skips <= 1) || + (bit_size == 8 && value_skips <= 3)); + + /* Both Vulkan and OpenGL reserve index 0 for uniforms / push + * constants. + */ + uint32_t index = is_uniform ? 0 : nir_src_as_uint(instr->src[0]); + + /* QUNIFORM_UBO_ADDR takes a UBO index shifted up by 1 since we use + * index 0 for Gallium's constant buffer (GL) or push constants + * (Vulkan). + */ + if (is_ubo) index++; /* We can only keep track of the last unifa address we used with - * constant offset loads. If the new load targets the same UBO and + * constant offset loads. If the new load targets the same buffer and * is close enough to the previous load, we can skip the unifa register * write by emitting dummy ldunifa instructions to update the unifa * address. @@ -2739,6 +3187,7 @@ ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr) if (dynamic_src) { c->current_unifa_block = NULL; } else if (c->cur_block == c->current_unifa_block && + c->current_unifa_is_ubo == !is_ssbo && c->current_unifa_index == index && c->current_unifa_offset <= const_offset && c->current_unifa_offset + max_unifa_skip_dist >= const_offset) { @@ -2746,32 +3195,98 @@ ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr) ldunifa_skips = (const_offset - c->current_unifa_offset) / 4; } else { c->current_unifa_block = c->cur_block; + c->current_unifa_is_ubo = !is_ssbo; c->current_unifa_index = index; c->current_unifa_offset = const_offset; } if (!skip_unifa) { - struct qreg base_offset = + struct qreg base_offset = !is_ssbo ? vir_uniform(c, QUNIFORM_UBO_ADDR, - v3d_unit_data_create(index, const_offset)); + v3d_unit_data_create(index, const_offset)) : + vir_uniform(c, QUNIFORM_SSBO_OFFSET, index); struct qreg unifa = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA); if (!dynamic_src) { - vir_MOV_dest(c, unifa, base_offset); + if (!is_ssbo) { + /* Avoid the extra MOV to UNIFA by making + * ldunif load directly into it. We can't + * do this if we have not actually emitted + * ldunif and are instead reusing a previous + * one. + */ + struct qinst *inst = + (struct qinst *)c->cur_block->instructions.prev; + if (inst == c->defs[base_offset.index]) { + inst->dst = unifa; + c->defs[base_offset.index] = NULL; + } else { + vir_MOV_dest(c, unifa, base_offset); + } + } else { + vir_ADD_dest(c, unifa, base_offset, + vir_uniform_ui(c, const_offset)); + } } else { vir_ADD_dest(c, unifa, base_offset, - ntq_get_src(c, instr->src[1], 0)); + ntq_get_src(c, offset, 0)); } } else { for (int i = 0; i < ldunifa_skips; i++) emit_ldunifa(c, NULL); } - for (uint32_t i = 0; i < nir_intrinsic_dest_components(instr); i++) { + uint32_t num_components = nir_intrinsic_dest_components(instr); + for (uint32_t i = 0; i < num_components; ) { struct qreg data; emit_ldunifa(c, &data); - ntq_store_dest(c, &instr->dest, i, vir_MOV(c, data)); + + if (bit_size == 32) { + assert(value_skips == 0); + ntq_store_def(c, &instr->def, i, vir_MOV(c, data)); + i++; + } else { + assert((bit_size == 16 && value_skips <= 1) || + (bit_size == 8 && value_skips <= 3)); + + /* If we have any values to skip, shift to the first + * valid value in the ldunifa result. + */ + if (value_skips > 0) { + data = vir_SHR(c, data, + vir_uniform_ui(c, bit_size * + value_skips)); + } + + /* Check how many valid components we have discounting + * read components to skip. 
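+ *
+ * For example, a 16-bit load with value_skips = 1 has
+ * valid_count = (32 / 16) - 1 = 1: only the remaining upper
+ * half of this ldunifa word is consumed before we read the
+ * next one.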
+ */ + uint32_t valid_count = (32 / bit_size) - value_skips; + assert((bit_size == 16 && valid_count <= 2) || + (bit_size == 8 && valid_count <= 4)); + assert(valid_count > 0); + + /* Process the valid components */ + do { + struct qreg tmp; + uint32_t mask = (1 << bit_size) - 1; + tmp = vir_AND(c, vir_MOV(c, data), + vir_uniform_ui(c, mask)); + ntq_store_def(c, &instr->def, i, + vir_MOV(c, tmp)); + i++; + valid_count--; + + /* Shift to next component */ + if (i < num_components && valid_count > 0) { + data = vir_SHR(c, data, + vir_uniform_ui(c, bit_size)); + } + } while (i < num_components && valid_count > 0); + } } + + return true; } static inline struct qreg @@ -2781,187 +3296,273 @@ emit_load_local_invocation_index(struct v3d_compile *c) vir_uniform_ui(c, 32 - c->local_invocation_index_bits)); } -/* Various subgroup operations rely on the A flags, so this helper ensures that - * A flags represents currently active lanes in the subgroup. +/* For the purposes of reduction operations (ballot, alleq, allfeq, bcastf) in + * fragment shaders a lane is considered active if any sample flags are set + * for *any* lane in the same quad, however, we still need to ensure that + * terminated lanes (OpTerminate) are not included. Further, we also need to + * disable lanes that may be disabled because of non-uniform control + * flow. */ -static void -set_a_flags_for_subgroup(struct v3d_compile *c) +static enum v3d_qpu_cond +setup_subgroup_control_flow_condition(struct v3d_compile *c) { - /* MSF returns 0 for disabled lanes in compute shaders so - * PUSHZ will set A=1 for disabled lanes. We want the inverse - * of this but we don't have any means to negate the A flags - * directly, but we can do it by repeating the same operation - * with NORZ (A = ~A & ~Z). + assert(c->s->info.stage == MESA_SHADER_FRAGMENT || + c->s->info.stage == MESA_SHADER_COMPUTE); + + enum v3d_qpu_cond cond = V3D_QPU_COND_NONE; + + /* We need to make sure that terminated lanes in fragment shaders are + * not included. We can identify these lanes by comparing the inital + * sample mask with the current. This fixes: + * dEQP-VK.spirv_assembly.instruction.terminate_invocation.terminate.subgroup_* */ - assert(c->s->info.stage == MESA_SHADER_COMPUTE); - vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ); - vir_set_uf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_UF_NORZ); + if (c->s->info.stage == MESA_SHADER_FRAGMENT && c->emitted_discard) { + vir_set_pf(c, vir_AND_dest(c, vir_nop_reg(), c->start_msf, + vir_NOT(c, vir_XOR(c, c->start_msf, + vir_MSF(c)))), + V3D_QPU_PF_PUSHZ); + cond = V3D_QPU_COND_IFNA; + } - /* If we are under non-uniform control flow we also need to - * AND the A flags with the current execute mask. + /* If we are in non-uniform control-flow update the condition to + * also limit lanes to those in the current execution mask. */ if (vir_in_nonuniform_control_flow(c)) { - const uint32_t bidx = c->cur_block->index; - vir_set_uf(c, vir_XOR_dest(c, vir_nop_reg(), - c->execute, - vir_uniform_ui(c, bidx)), - V3D_QPU_UF_ANDZ); + if (cond == V3D_QPU_COND_IFNA) { + vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_UF_NORNZ); + } else { + assert(cond == V3D_QPU_COND_NONE); + vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_PF_PUSHZ); + } + cond = V3D_QPU_COND_IFA; } + + return cond; +} + +static void +emit_compute_barrier(struct v3d_compile *c) +{ + /* Ensure we flag the use of the control barrier. 
NIR's + * gather info pass usually takes care of this, but that + * requires that we call that pass after any other pass + * may emit a control barrier, so this is safer. + */ + c->s->info.uses_control_barrier = true; + + /* Emit a TSY op to get all invocations in the workgroup + * (actually supergroup) to block until the last + * invocation reaches the TSY op. + */ + vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_SYNCB)); +} + +static void +emit_barrier(struct v3d_compile *c) +{ + struct qreg eidx = vir_EIDX(c); + + /* The config for the TSY op should be setup like this: + * - Lane 0: Quorum + * - Lane 2: TSO id + * - Lane 3: TSY opcode + */ + + /* Lane 0: we want to synchronize across one subgroup. Here we write to + * all lanes unconditionally and will overwrite other lanes below. + */ + struct qreg tsy_conf = vir_uniform_ui(c, 1); + + /* Lane 2: TSO id. We choose a general purpose TSO (id=0..64) using the + * curent QPU index and thread index to ensure we get a unique one for + * this group of invocations in this core. + */ + struct qreg tso_id = + vir_AND(c, vir_TIDX(c), vir_uniform_ui(c, 0x0000003f)); + vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), eidx, vir_uniform_ui(c, 2)), + V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFA, tsy_conf, tso_id); + + /* Lane 3: TSY opcode (set_quorum_wait_inc_check) */ + struct qreg tsy_op = vir_uniform_ui(c, 16); + vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), eidx, vir_uniform_ui(c, 3)), + V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFA, tsy_conf, tsy_op); + + /* Emit TSY sync */ + vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_SYNCB), tsy_conf); } static void ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) { switch (instr->intrinsic) { + case nir_intrinsic_decl_reg: + case nir_intrinsic_load_reg: + case nir_intrinsic_store_reg: + break; /* Ignore these */ + case nir_intrinsic_load_uniform: ntq_emit_load_uniform(c, instr); break; + case nir_intrinsic_load_global_2x32: + ntq_emit_tmu_general(c, instr, false, true); + c->has_general_tmu_load = true; + break; + case nir_intrinsic_load_ubo: - if (!nir_src_is_divergent(instr->src[1])) - ntq_emit_load_ubo_unifa(c, instr); - else - ntq_emit_tmu_general(c, instr, false); - break; - - case nir_intrinsic_ssbo_atomic_add: - case nir_intrinsic_ssbo_atomic_imin: - case nir_intrinsic_ssbo_atomic_umin: - case nir_intrinsic_ssbo_atomic_imax: - case nir_intrinsic_ssbo_atomic_umax: - case nir_intrinsic_ssbo_atomic_and: - case nir_intrinsic_ssbo_atomic_or: - case nir_intrinsic_ssbo_atomic_xor: - case nir_intrinsic_ssbo_atomic_exchange: - case nir_intrinsic_ssbo_atomic_comp_swap: + if (ntq_emit_inline_ubo_load(c, instr)) + break; + FALLTHROUGH; case nir_intrinsic_load_ssbo: + if (!ntq_emit_load_unifa(c, instr)) { + ntq_emit_tmu_general(c, instr, false, false); + c->has_general_tmu_load = true; + } + break; + case nir_intrinsic_store_ssbo: - ntq_emit_tmu_general(c, instr, false); - break; - - case nir_intrinsic_shared_atomic_add: - case nir_intrinsic_shared_atomic_imin: - case nir_intrinsic_shared_atomic_umin: - case nir_intrinsic_shared_atomic_imax: - case nir_intrinsic_shared_atomic_umax: - case nir_intrinsic_shared_atomic_and: - case nir_intrinsic_shared_atomic_or: - case nir_intrinsic_shared_atomic_xor: - case nir_intrinsic_shared_atomic_exchange: - case nir_intrinsic_shared_atomic_comp_swap: - case nir_intrinsic_load_shared: + case nir_intrinsic_ssbo_atomic: + case nir_intrinsic_ssbo_atomic_swap: + ntq_emit_tmu_general(c, instr, false, false); + break; + + case 
nir_intrinsic_store_global_2x32: + case nir_intrinsic_global_atomic_2x32: + case nir_intrinsic_global_atomic_swap_2x32: + ntq_emit_tmu_general(c, instr, false, true); + break; + + case nir_intrinsic_shared_atomic: + case nir_intrinsic_shared_atomic_swap: case nir_intrinsic_store_shared: - case nir_intrinsic_load_scratch: case nir_intrinsic_store_scratch: - ntq_emit_tmu_general(c, instr, true); + ntq_emit_tmu_general(c, instr, true, false); + break; + + case nir_intrinsic_load_scratch: + case nir_intrinsic_load_shared: + ntq_emit_tmu_general(c, instr, true, false); + c->has_general_tmu_load = true; break; - case nir_intrinsic_image_load: case nir_intrinsic_image_store: - case nir_intrinsic_image_atomic_add: - case nir_intrinsic_image_atomic_imin: - case nir_intrinsic_image_atomic_umin: - case nir_intrinsic_image_atomic_imax: - case nir_intrinsic_image_atomic_umax: - case nir_intrinsic_image_atomic_and: - case nir_intrinsic_image_atomic_or: - case nir_intrinsic_image_atomic_xor: - case nir_intrinsic_image_atomic_exchange: - case nir_intrinsic_image_atomic_comp_swap: - v3d40_vir_emit_image_load_store(c, instr); + case nir_intrinsic_image_atomic: + case nir_intrinsic_image_atomic_swap: + v3d_vir_emit_image_load_store(c, instr); + break; + + case nir_intrinsic_image_load: + v3d_vir_emit_image_load_store(c, instr); + /* Not really a general TMU load, but we only use this flag + * for NIR scheduling and we do schedule these under the same + * policy as general TMU. + */ + c->has_general_tmu_load = true; break; case nir_intrinsic_get_ssbo_size: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_GET_SSBO_SIZE, - nir_src_comp_as_uint(instr->src[0], 0))); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_GET_SSBO_SIZE, + nir_src_comp_as_uint(instr->src[0], 0))); break; case nir_intrinsic_get_ubo_size: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_GET_UBO_SIZE, - nir_src_comp_as_uint(instr->src[0], 0))); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_GET_UBO_SIZE, + nir_src_comp_as_uint(instr->src[0], 0))); break; case nir_intrinsic_load_user_clip_plane: for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) { - ntq_store_dest(c, &instr->dest, i, - vir_uniform(c, QUNIFORM_USER_CLIP_PLANE, - nir_intrinsic_ucp_id(instr) * - 4 + i)); + ntq_store_def(c, &instr->def, i, + vir_uniform(c, QUNIFORM_USER_CLIP_PLANE, + nir_intrinsic_ucp_id(instr) * + 4 + i)); } break; case nir_intrinsic_load_viewport_x_scale: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0)); break; case nir_intrinsic_load_viewport_y_scale: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0)); break; case nir_intrinsic_load_viewport_z_scale: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0)); break; case nir_intrinsic_load_viewport_z_offset: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0)); break; case nir_intrinsic_load_line_coord: - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->line_x)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->line_x)); break; case nir_intrinsic_load_line_width: - ntq_store_dest(c, 
&instr->dest, 0, - vir_uniform(c, QUNIFORM_LINE_WIDTH, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_LINE_WIDTH, 0)); break; case nir_intrinsic_load_aa_line_width: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0)); break; case nir_intrinsic_load_sample_mask_in: - ntq_store_dest(c, &instr->dest, 0, vir_MSF(c)); + ntq_store_def(c, &instr->def, 0, vir_MSF(c)); break; case nir_intrinsic_load_helper_invocation: vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ); struct qreg qdest = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA); - ntq_store_dest(c, &instr->dest, 0, qdest); + ntq_store_def(c, &instr->def, 0, qdest); break; case nir_intrinsic_load_front_face: /* The register contains 0 (front) or 1 (back), and we need to * turn it into a NIR bool where true means front. */ - ntq_store_dest(c, &instr->dest, 0, - vir_ADD(c, - vir_uniform_ui(c, -1), - vir_REVF(c))); + ntq_store_def(c, &instr->def, 0, + vir_ADD(c, + vir_uniform_ui(c, -1), + vir_REVF(c))); break; case nir_intrinsic_load_base_instance: - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->biid)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->biid)); break; case nir_intrinsic_load_instance_id: - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->iid)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->iid)); break; case nir_intrinsic_load_vertex_id: - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->vid)); + break; + + case nir_intrinsic_load_draw_id: + ntq_store_def(c, &instr->def, 0, vir_uniform(c, QUNIFORM_DRAW_ID, 0)); break; case nir_intrinsic_load_tlb_color_v3d: vir_emit_tlb_color_read(c, instr); break; + case nir_intrinsic_load_fep_w_v3d: + ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->payload_w)); + break; + case nir_intrinsic_load_input: ntq_emit_load_input(c, instr); break; @@ -2978,7 +3579,19 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) ntq_emit_image_size(c, instr); break; + /* FIXME: the Vulkan and SPIR-V specs specify that OpTerminate (which + * is intended to match the semantics of GLSL's discard) should + * terminate the invocation immediately. Our implementation doesn't + * do that. What we do is actually a demote by removing the invocations + * from the sample mask. Maybe we could be more strict and force an + * early termination by emitting a (maybe conditional) jump to the + * end section of the fragment shader for affected invocations. 
+ */ case nir_intrinsic_discard: + case nir_intrinsic_terminate: + c->emitted_discard = true; + FALLTHROUGH; + case nir_intrinsic_demote: ntq_flush_tmu(c); if (vir_in_nonuniform_control_flow(c)) { @@ -2993,7 +3606,11 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) } break; - case nir_intrinsic_discard_if: { + case nir_intrinsic_discard_if: + case nir_intrinsic_terminate_if: + c->emitted_discard = true; + FALLTHROUGH; + case nir_intrinsic_demote_if: { ntq_flush_tmu(c); enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, instr->src[0]); @@ -3011,102 +3628,79 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(), vir_uniform_ui(c, 0)), cond); - break; } - case nir_intrinsic_memory_barrier: - case nir_intrinsic_memory_barrier_buffer: - case nir_intrinsic_memory_barrier_image: - case nir_intrinsic_memory_barrier_shared: - case nir_intrinsic_memory_barrier_tcs_patch: - case nir_intrinsic_group_memory_barrier: - /* We don't do any instruction scheduling of these NIR - * instructions between each other, so we just need to make - * sure that the TMU operations before the barrier are flushed + case nir_intrinsic_barrier: + /* Ensure that the TMU operations before the barrier are flushed * before the ones after the barrier. */ ntq_flush_tmu(c); - break; - - case nir_intrinsic_control_barrier: - /* Emit a TSY op to get all invocations in the workgroup - * (actually supergroup) to block until the last invocation - * reaches the TSY op. - */ - ntq_flush_tmu(c); - if (c->devinfo->ver >= 42) { - vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC, - V3D_QPU_WADDR_SYNCB)); - } else { - struct qinst *sync = - vir_BARRIERID_dest(c, - vir_reg(QFILE_MAGIC, - V3D_QPU_WADDR_SYNCU)); - sync->uniform = - vir_get_uniform_index(c, QUNIFORM_CONSTANT, - 0xffffff00 | - V3D_TSY_WAIT_INC_CHECK); + if (nir_intrinsic_execution_scope(instr) != SCOPE_NONE) { + if (c->s->info.stage == MESA_SHADER_COMPUTE) + emit_compute_barrier(c); + else + emit_barrier(c); + /* The blocking of a TSY op only happens at the next + * thread switch. No texturing may be outstanding at the + * time of a TSY blocking operation. + */ + vir_emit_thrsw(c); } - - /* The blocking of a TSY op only happens at the next thread - * switch. No texturing may be outstanding at the time of a - * TSY blocking operation. 
- */ - vir_emit_thrsw(c); break; case nir_intrinsic_load_num_workgroups: for (int i = 0; i < 3; i++) { - ntq_store_dest(c, &instr->dest, i, - vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS, - i)); + ntq_store_def(c, &instr->def, i, + vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS, + i)); } break; case nir_intrinsic_load_workgroup_id: { struct qreg x = vir_AND(c, c->cs_payload[0], vir_uniform_ui(c, 0xffff)); + ntq_store_def(c, &instr->def, 0, x); struct qreg y = vir_SHR(c, c->cs_payload[0], vir_uniform_ui(c, 16)); + ntq_store_def(c, &instr->def, 1, y); struct qreg z = vir_AND(c, c->cs_payload[1], vir_uniform_ui(c, 0xffff)); + ntq_store_def(c, &instr->def, 2, z); + break; + } - /* We only support dispatch base in Vulkan */ - if (c->key->environment == V3D_ENVIRONMENT_VULKAN) { - x = vir_ADD(c, x, - vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 0)); - y = vir_ADD(c, y, - vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 1)); - z = vir_ADD(c, z, - vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 2)); - } + case nir_intrinsic_load_base_workgroup_id: { + struct qreg x = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 0); + ntq_store_def(c, &instr->def, 0, x); - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, x)); - ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, y)); - ntq_store_dest(c, &instr->dest, 2, vir_MOV(c, z)); + struct qreg y = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 1); + ntq_store_def(c, &instr->def, 1, y); + + struct qreg z = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 2); + ntq_store_def(c, &instr->def, 2, z); break; } case nir_intrinsic_load_local_invocation_index: - ntq_store_dest(c, &instr->dest, 0, - emit_load_local_invocation_index(c)); + ntq_store_def(c, &instr->def, 0, + emit_load_local_invocation_index(c)); break; case nir_intrinsic_load_subgroup_id: { /* This is basically the batch index, which is the Local * Invocation Index divided by the SIMD width). */ - STATIC_ASSERT(util_is_power_of_two_nonzero(V3D_CHANNELS)); + STATIC_ASSERT(IS_POT(V3D_CHANNELS) && V3D_CHANNELS > 0); const uint32_t divide_shift = ffs(V3D_CHANNELS) - 1; struct qreg lii = emit_load_local_invocation_index(c); - ntq_store_dest(c, &instr->dest, 0, - vir_SHR(c, lii, - vir_uniform_ui(c, divide_shift))); + ntq_store_def(c, &instr->def, 0, + vir_SHR(c, lii, + vir_uniform_ui(c, divide_shift))); break; } @@ -3143,8 +3737,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) struct qreg col = ntq_get_src(c, instr->src[0], 0); for (int i = 0; i < instr->num_components; i++) { struct qreg row = vir_uniform_ui(c, row_idx++); - ntq_store_dest(c, &instr->dest, i, - vir_LDVPMG_IN(c, row, col)); + ntq_store_def(c, &instr->def, i, + vir_LDVPMG_IN(c, row, col)); } break; } @@ -3160,47 +3754,47 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) * using ldvpm(v,d)_in (See Table 71). 
*/ assert(c->s->info.stage == MESA_SHADER_GEOMETRY); - ntq_store_dest(c, &instr->dest, 0, - vir_LDVPMV_IN(c, vir_uniform_ui(c, 0))); + ntq_store_def(c, &instr->def, 0, + vir_LDVPMV_IN(c, vir_uniform_ui(c, 0))); break; } case nir_intrinsic_load_invocation_id: - ntq_store_dest(c, &instr->dest, 0, vir_IID(c)); + ntq_store_def(c, &instr->def, 0, vir_IID(c)); break; case nir_intrinsic_load_fb_layers_v3d: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_FB_LAYERS, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_FB_LAYERS, 0)); break; case nir_intrinsic_load_sample_id: - ntq_store_dest(c, &instr->dest, 0, vir_SAMPID(c)); + ntq_store_def(c, &instr->def, 0, vir_SAMPID(c)); break; case nir_intrinsic_load_sample_pos: - ntq_store_dest(c, &instr->dest, 0, - vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)))); - ntq_store_dest(c, &instr->dest, 1, - vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)))); + ntq_store_def(c, &instr->def, 0, + vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)))); + ntq_store_def(c, &instr->def, 1, + vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)))); break; case nir_intrinsic_load_barycentric_at_offset: - ntq_store_dest(c, &instr->dest, 0, - vir_MOV(c, ntq_get_src(c, instr->src[0], 0))); - ntq_store_dest(c, &instr->dest, 1, - vir_MOV(c, ntq_get_src(c, instr->src[0], 1))); + ntq_store_def(c, &instr->def, 0, + vir_MOV(c, ntq_get_src(c, instr->src[0], 0))); + ntq_store_def(c, &instr->def, 1, + vir_MOV(c, ntq_get_src(c, instr->src[0], 1))); break; case nir_intrinsic_load_barycentric_pixel: - ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f)); - ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f)); + ntq_store_def(c, &instr->def, 0, vir_uniform_f(c, 0.0f)); + ntq_store_def(c, &instr->def, 1, vir_uniform_f(c, 0.0f)); break; case nir_intrinsic_load_barycentric_at_sample: { if (!c->fs_key->msaa) { - ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f)); - ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f)); + ntq_store_def(c, &instr->def, 0, vir_uniform_f(c, 0.0f)); + ntq_store_def(c, &instr->def, 1, vir_uniform_f(c, 0.0f)); return; } @@ -3208,8 +3802,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) struct qreg sample_idx = ntq_get_src(c, instr->src[0], 0); ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y); - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x)); - ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, offset_x)); + ntq_store_def(c, &instr->def, 1, vir_MOV(c, offset_y)); break; } @@ -3219,18 +3813,18 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) struct qreg offset_y = vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c))); - ntq_store_dest(c, &instr->dest, 0, - vir_FSUB(c, offset_x, vir_uniform_f(c, 0.5f))); - ntq_store_dest(c, &instr->dest, 1, - vir_FSUB(c, offset_y, vir_uniform_f(c, 0.5f))); + ntq_store_def(c, &instr->def, 0, + vir_FSUB(c, offset_x, vir_uniform_f(c, 0.5f))); + ntq_store_def(c, &instr->def, 1, + vir_FSUB(c, offset_y, vir_uniform_f(c, 0.5f))); break; } case nir_intrinsic_load_barycentric_centroid: { struct qreg offset_x, offset_y; ntq_get_barycentric_centroid(c, &offset_x, &offset_y); - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x)); - ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, offset_x)); + ntq_store_def(c, &instr->def, 1, vir_MOV(c, offset_y)); break; } @@ -3249,8 +3843,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, 
nir_intrinsic_instr *instr) */ if (!c->fs_key->msaa || c->interp[input_idx].vp.file == QFILE_NULL) { - ntq_store_dest(c, &instr->dest, i, - vir_MOV(c, c->inputs[input_idx])); + ntq_store_def(c, &instr->def, i, + vir_MOV(c, c->inputs[input_idx])); continue; } @@ -3268,30 +3862,150 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) ntq_emit_load_interpolated_input(c, p, C, offset_x, offset_y, interp_mode); - ntq_store_dest(c, &instr->dest, i, result); + ntq_store_def(c, &instr->def, i, result); } break; } case nir_intrinsic_load_subgroup_size: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform_ui(c, V3D_CHANNELS)); + ntq_store_def(c, &instr->def, 0, + vir_uniform_ui(c, V3D_CHANNELS)); break; case nir_intrinsic_load_subgroup_invocation: - ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c)); + ntq_store_def(c, &instr->def, 0, vir_EIDX(c)); break; case nir_intrinsic_elect: { - set_a_flags_for_subgroup(c); - struct qreg first = vir_FLAFIRST(c); + struct qreg first; + if (vir_in_nonuniform_control_flow(c)) { + /* Sets A=1 for lanes enabled in the execution mask */ + vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_PF_PUSHZ); + /* Updates A ANDing with lanes enabled in MSF */ + vir_set_uf(c, vir_MSF_dest(c, vir_nop_reg()), + V3D_QPU_UF_ANDNZ); + first = vir_FLAFIRST(c); + } else { + /* Sets A=1 for inactive lanes */ + vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), + V3D_QPU_PF_PUSHZ); + first = vir_FLNAFIRST(c); + } - /* Produce a boolean result from Flafirst */ + /* Produce a boolean result */ vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), first, vir_uniform_ui(c, 1)), V3D_QPU_PF_PUSHZ); struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA); - ntq_store_dest(c, &instr->dest, 0, result); + ntq_store_def(c, &instr->def, 0, result); + break; + } + + case nir_intrinsic_ballot: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c); + struct qreg res = vir_get_temp(c); + vir_set_cond(vir_BALLOT_dest(c, res, value), cond); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, res)); + break; + } + + case nir_intrinsic_read_invocation: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + struct qreg index = ntq_get_src(c, instr->src[1], 0); + struct qreg res = vir_SHUFFLE(c, value, index); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, res)); + break; + } + + case nir_intrinsic_read_first_invocation: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c); + struct qreg res = vir_get_temp(c); + vir_set_cond(vir_BCASTF_dest(c, res, value), cond); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, res)); + break; + } + + case nir_intrinsic_shuffle: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + struct qreg indices = ntq_get_src(c, instr->src[1], 0); + struct qreg res = vir_SHUFFLE(c, value, indices); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, res)); + break; + } + + case nir_intrinsic_vote_feq: + case nir_intrinsic_vote_ieq: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c); + struct qreg res = vir_get_temp(c); + vir_set_cond(instr->intrinsic == nir_intrinsic_vote_ieq ? 
+ vir_ALLEQ_dest(c, res, value) : + vir_ALLFEQ_dest(c, res, value), + cond); + + /* Produce boolean result */ + vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res), + V3D_QPU_PF_PUSHZ); + struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFNA); + ntq_store_def(c, &instr->def, 0, result); + break; + } + + case nir_intrinsic_vote_all: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c); + struct qreg res = vir_get_temp(c); + vir_set_cond(vir_ALLEQ_dest(c, res, value), cond); + + /* We want to check if 'all lanes are equal (alleq != 0) and + * their value is True (value != 0)'. + * + * The first MOV.pushz generates predicate for 'alleq == 0'. + * The second MOV.NORZ generates predicate for: + * '!(alleq == 0) & !(value == 0). + */ + vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res), + V3D_QPU_PF_PUSHZ); + vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), value), + V3D_QPU_UF_NORZ); + struct qreg result = + ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA); + ntq_store_def(c, &instr->def, 0, result); + break; + } + + case nir_intrinsic_vote_any: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c); + struct qreg res = vir_get_temp(c); + vir_set_cond(vir_ALLEQ_dest(c, res, value), cond); + + /* We want to check 'not (all lanes are equal (alleq != 0)' + * and their value is False (value == 0))'. + * + * The first MOV.pushz generates predicate for 'alleq == 0'. + * The second MOV.NORNZ generates predicate for: + * '!(alleq == 0) & (value == 0). + * The IFNA condition negates the predicate when evaluated: + * '!(!alleq == 0) & (value == 0)) + */ + vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res), + V3D_QPU_PF_PUSHZ); + vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), value), + V3D_QPU_UF_NORNZ); + struct qreg result = + ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFNA); + ntq_store_def(c, &instr->def, 0, result); break; } @@ -3300,8 +4014,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) break; case nir_intrinsic_load_view_index: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_VIEW_INDEX, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_VIEW_INDEX, 0)); break; default: @@ -3329,6 +4043,36 @@ ntq_activate_execute_for_block(struct v3d_compile *c) vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0)); } +static bool +is_cheap_block(nir_block *block) +{ + int32_t cost = 3; + nir_foreach_instr(instr, block) { + switch (instr->type) { + case nir_instr_type_alu: + case nir_instr_type_undef: + case nir_instr_type_load_const: + if (--cost <= 0) + return false; + break; + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + switch (intr->intrinsic) { + case nir_intrinsic_decl_reg: + case nir_intrinsic_load_reg: + case nir_intrinsic_store_reg: + continue; + default: + return false; + } + } + default: + return false; + } + } + return true; +} + static void ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt) { @@ -3473,15 +4217,27 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt) c->execute, vir_uniform_ui(c, else_block->index)); - /* Jump to ELSE if nothing is active for THEN, otherwise fall - * through. 
+ /* Set the flags for taking the THEN block */ + vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_PF_PUSHZ); + + /* Jump to ELSE if nothing is active for THEN (unless THEN block is + * so small it won't pay off), otherwise fall through. */ - vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ); - vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA); - vir_link_blocks(c->cur_block, else_block); + bool is_cheap = exec_list_is_singular(&if_stmt->then_list) && + is_cheap_block(nir_if_first_then_block(if_stmt)); + if (!is_cheap) { + vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA); + vir_link_blocks(c->cur_block, else_block); + } vir_link_blocks(c->cur_block, then_block); - /* Process the THEN block. */ + /* Process the THEN block. + * + * Notice we don't call ntq_activate_execute_for_block here on purpose: + * c->execute is already set up to be 0 for lanes that must take the + * THEN block. + */ vir_set_emit_block(c, then_block); ntq_emit_cf_list(c, &if_stmt->then_list); @@ -3495,13 +4251,19 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt) vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, after_block->index)); - /* If everything points at ENDIF, then jump there immediately. */ - vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), - c->execute, - vir_uniform_ui(c, after_block->index)), - V3D_QPU_PF_PUSHZ); - vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA); - vir_link_blocks(c->cur_block, after_block); + /* If everything points at ENDIF, then jump there immediately + * (unless ELSE block is so small it won't pay off). + */ + bool is_cheap = exec_list_is_singular(&if_stmt->else_list) && + is_cheap_block(nir_else_block); + if (!is_cheap) { + vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), + c->execute, + vir_uniform_ui(c, after_block->index)), + V3D_QPU_PF_PUSHZ); + vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA); + vir_link_blocks(c->cur_block, after_block); + } vir_link_blocks(c->cur_block, else_block); vir_set_emit_block(c, else_block); @@ -3605,7 +4367,7 @@ ntq_emit_instr(struct v3d_compile *c, nir_instr *instr) ntq_emit_load_const(c, nir_instr_as_load_const(instr)); break; - case nir_instr_type_ssa_undef: + case nir_instr_type_undef: unreachable("Should've been lowered by nir_lower_undef_to_zero"); break; @@ -3699,7 +4461,6 @@ ntq_emit_nonuniform_loop(struct v3d_compile *c, nir_loop *loop) static void ntq_emit_uniform_loop(struct v3d_compile *c, nir_loop *loop) { - c->loop_cont_block = vir_new_block(c); c->loop_break_block = vir_new_block(c); @@ -3719,6 +4480,25 @@ ntq_emit_uniform_loop(struct v3d_compile *c, nir_loop *loop) static void ntq_emit_loop(struct v3d_compile *c, nir_loop *loop) { + assert(!nir_loop_has_continue_construct(loop)); + + /* Disable flags optimization for loop conditions. The problem here is + * that we can have code like this: + * + * // block_0 + * vec1 32 con ssa_9 = ine32 ssa_8, ssa_2 + * loop { + * // block_1 + * if ssa_9 { + * + * In this example we emit flags to compute ssa_9 and the optimization + * will skip regenerating them again for the loop condition in the + * loop continue block (block_1). However, this is not safe after the + * first iteration because the loop body can stomp the flags if it has + * any conditionals. 
+ */ + c->flags_temp = -1; + bool was_in_control_flow = c->in_control_flow; c->in_control_flow = true; @@ -3777,7 +4557,7 @@ ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list) static void ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl) { - ntq_setup_registers(c, &impl->registers); + ntq_setup_registers(c, impl); ntq_emit_cf_list(c, &impl->body); } @@ -3786,7 +4566,12 @@ nir_to_vir(struct v3d_compile *c) { switch (c->s->info.stage) { case MESA_SHADER_FRAGMENT: - c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0)); + c->start_msf = vir_MSF(c); + if (c->devinfo->ver < 71) + c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0)); + else + c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 3)); + c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1)); c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2)); @@ -3799,25 +4584,16 @@ nir_to_vir(struct v3d_compile *c) emit_fragment_varying(c, NULL, -1, 0, 0); } - if (c->fs_key->is_points && - (c->devinfo->ver < 40 || program_reads_point_coord(c))) { + if (c->fs_key->is_points && program_reads_point_coord(c)) { c->point_x = emit_fragment_varying(c, NULL, -1, 0, 0); c->point_y = emit_fragment_varying(c, NULL, -1, 0, 0); c->uses_implicit_point_line_varyings = true; } else if (c->fs_key->is_lines && - (c->devinfo->ver < 40 || - BITSET_TEST(c->s->info.system_values_read, + (BITSET_TEST(c->s->info.system_values_read, SYSTEM_VALUE_LINE_COORD))) { c->line_x = emit_fragment_varying(c, NULL, -1, 0, 0); c->uses_implicit_point_line_varyings = true; } - - c->force_per_sample_msaa = - c->s->info.fs.uses_sample_qualifier || - BITSET_TEST(c->s->info.system_values_read, - SYSTEM_VALUE_SAMPLE_ID) || - BITSET_TEST(c->s->info.system_values_read, - SYSTEM_VALUE_SAMPLE_POS); break; case MESA_SHADER_COMPUTE: /* Set up the TSO for barriers, assuming we do some. */ @@ -3826,8 +4602,13 @@ nir_to_vir(struct v3d_compile *c) V3D_QPU_WADDR_SYNC)); } - c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0)); - c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); + if (c->devinfo->ver == 42) { + c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0)); + c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); + } else if (c->devinfo->ver >= 71) { + c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 3)); + c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); + } /* Set up the division between gl_LocalInvocationIndex and * wg_in_mem in the payload reg. @@ -3889,7 +4670,7 @@ nir_to_vir(struct v3d_compile *c) /* Find the main function and emit the body. */ nir_foreach_function(function, c->s) { - assert(strcmp(function->name, "main") == 0); + assert(function->is_entrypoint); assert(function->impl); ntq_emit_impl(c, function->impl); } @@ -3932,25 +4713,12 @@ vir_emit_last_thrsw(struct v3d_compile *c, { *restore_last_thrsw = c->last_thrsw; - /* On V3D before 4.1, we need a TMU op to be outstanding when thread - * switching, so disable threads if we didn't do any TMU ops (each of - * which would have emitted a THRSW). - */ - if (!c->last_thrsw_at_top_level && c->devinfo->ver < 41) { - c->threads = 1; - if (c->last_thrsw) - vir_remove_thrsw(c); - *restore_last_thrsw = NULL; - } - /* If we're threaded and the last THRSW was in conditional code, then * we need to emit another one so that we can flag it as the last * thrsw. 
*/ - if (c->last_thrsw && !c->last_thrsw_at_top_level) { - assert(c->devinfo->ver >= 41); + if (c->last_thrsw && !c->last_thrsw_at_top_level) vir_emit_thrsw(c); - } /* If we're threaded, then we need to mark the last THRSW instruction * so we can emit a pair of them at QPU emit time. @@ -3958,10 +4726,8 @@ vir_emit_last_thrsw(struct v3d_compile *c, * For V3D 4.x, we can spawn the non-fragment shaders already in the * post-last-THRSW state, so we can skip this. */ - if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) { - assert(c->devinfo->ver >= 41); + if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) vir_emit_thrsw(c); - } /* If we have not inserted a last thread switch yet, do it now to ensure * any potential spilling we do happens before this. If we don't spill @@ -4006,8 +4772,8 @@ vir_check_payload_w(struct v3d_compile *c) vir_for_each_inst_inorder(inst, c) { for (int i = 0; i < vir_get_nsrc(inst); i++) { - if (inst->src[i].file == QFILE_REG && - inst->src[i].index == 0) { + if (inst->src[i].file == c->payload_w.file && + inst->src[i].index == c->payload_w.index) { c->uses_center_w = true; return; } @@ -4018,8 +4784,8 @@ vir_check_payload_w(struct v3d_compile *c) void v3d_nir_to_vir(struct v3d_compile *c) { - if (V3D_DEBUG & (V3D_DEBUG_NIR | - v3d_debug_flag_for_shader_stage(c->s->info.stage))) { + if (V3D_DBG(NIR) || + v3d_debug_flag_for_shader_stage(c->s->info.stage)) { fprintf(stderr, "%s prog %d/%d NIR:\n", vir_get_stage_name(c), c->program_id, c->variant_id); @@ -4053,8 +4819,8 @@ v3d_nir_to_vir(struct v3d_compile *c) unreachable("bad stage"); } - if (V3D_DEBUG & (V3D_DEBUG_VIR | - v3d_debug_flag_for_shader_stage(c->s->info.stage))) { + if (V3D_DBG(VIR) || + v3d_debug_flag_for_shader_stage(c->s->info.stage)) { fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n", vir_get_stage_name(c), c->program_id, c->variant_id); @@ -4075,8 +4841,8 @@ v3d_nir_to_vir(struct v3d_compile *c) * instructions until the results are needed. */ - if (V3D_DEBUG & (V3D_DEBUG_VIR | - v3d_debug_flag_for_shader_stage(c->s->info.stage))) { + if (V3D_DBG(VIR) || + v3d_debug_flag_for_shader_stage(c->s->info.stage)) { fprintf(stderr, "%s prog %d/%d VIR:\n", vir_get_stage_name(c), c->program_id, c->variant_id); @@ -4087,19 +4853,17 @@ v3d_nir_to_vir(struct v3d_compile *c) /* Attempt to allocate registers for the temporaries. If we fail, * reduce thread count and try again. */ - int min_threads = (c->devinfo->ver >= 41) ? 2 : 1; + int min_threads = 2; struct qpu_reg *temp_registers; while (true) { - bool spilled; - temp_registers = v3d_register_allocate(c, &spilled); - if (spilled) - continue; - - if (temp_registers) + temp_registers = v3d_register_allocate(c); + if (temp_registers) { + assert(c->spills + c->fills <= c->max_tmu_spills); break; + } if (c->threads == min_threads && - (V3D_DEBUG & V3D_DEBUG_RA)) { + V3D_DBG(RA)) { fprintf(stderr, "Failed to register allocate using %s\n", c->fallback_scheduler ? 
"the fallback scheduler:" : @@ -4116,18 +4880,20 @@ v3d_nir_to_vir(struct v3d_compile *c) } if (c->threads <= MAX2(c->min_threads_for_reg_alloc, min_threads)) { - if (V3D_DEBUG & V3D_DEBUG_PERF) { + if (V3D_DBG(PERF)) { fprintf(stderr, - "Failed to register allocate %s at " - "%d threads.\n", vir_get_stage_name(c), - c->threads); + "Failed to register allocate %s " + "prog %d/%d at %d threads.\n", + vir_get_stage_name(c), + c->program_id, c->variant_id, c->threads); } c->compilation_result = V3D_COMPILATION_FAILED_REGISTER_ALLOCATION; return; } - c->spill_count = 0; + c->spills = 0; + c->fills = 0; c->threads /= 2; if (c->threads == 1) @@ -4141,8 +4907,8 @@ v3d_nir_to_vir(struct v3d_compile *c) vir_restore_last_thrsw(c, restore_last_thrsw, restore_scoreboard_lock); if (c->spills && - (V3D_DEBUG & (V3D_DEBUG_VIR | - v3d_debug_flag_for_shader_stage(c->s->info.stage)))) { + (V3D_DBG(VIR) || + v3d_debug_flag_for_shader_stage(c->s->info.stage))) { fprintf(stderr, "%s prog %d/%d spilled VIR:\n", vir_get_stage_name(c), c->program_id, c->variant_id); diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c index c559814b9ea..ba76ac87e1e 100644 --- a/src/broadcom/compiler/qpu_schedule.c +++ b/src/broadcom/compiler/qpu_schedule.c @@ -85,6 +85,7 @@ struct schedule_state { struct schedule_node *last_unif; struct schedule_node *last_rtop; struct schedule_node *last_unifa; + struct schedule_node *last_setmsf; enum direction dir; /* Estimated cycle when the current instruction would start. */ uint32_t time; @@ -97,7 +98,7 @@ add_dep(struct schedule_state *state, bool write) { bool write_after_read = !write && state->dir == R; - void *edge_data = (void *)(uintptr_t)write_after_read; + uintptr_t edge_data = write_after_read; if (!before || !after) return; @@ -136,12 +137,14 @@ qpu_inst_is_tlb(const struct v3d_qpu_instr *inst) if (inst->type != V3D_QPU_INSTR_TYPE_ALU) return false; - if (inst->alu.add.magic_write && + if (inst->alu.add.op != V3D_QPU_A_NOP && + inst->alu.add.magic_write && (inst->alu.add.waddr == V3D_QPU_WADDR_TLB || inst->alu.add.waddr == V3D_QPU_WADDR_TLBU)) return true; - if (inst->alu.mul.magic_write && + if (inst->alu.mul.op != V3D_QPU_M_NOP && + inst->alu.mul.magic_write && (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB || inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU)) return true; @@ -153,12 +156,13 @@ static void process_mux_deps(struct schedule_state *state, struct schedule_node *n, enum v3d_qpu_mux mux) { + assert(state->devinfo->ver < 71); switch (mux) { case V3D_QPU_MUX_A: add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n); break; case V3D_QPU_MUX_B: - if (!n->inst->qpu.sig.small_imm) { + if (!n->inst->qpu.sig.small_imm_b) { add_read_dep(state, state->last_rf[n->inst->qpu.raddr_b], n); } @@ -169,6 +173,17 @@ process_mux_deps(struct schedule_state *state, struct schedule_node *n, } } + +static void +process_raddr_deps(struct schedule_state *state, struct schedule_node *n, + uint8_t raddr, bool is_small_imm) +{ + assert(state->devinfo->ver >= 71); + + if (!is_small_imm) + add_read_dep(state, state->last_rf[raddr], n); +} + static bool tmu_write_is_sequence_terminator(uint32_t waddr) { @@ -188,9 +203,6 @@ tmu_write_is_sequence_terminator(uint32_t waddr) static bool can_reorder_tmu_write(const struct v3d_device_info *devinfo, uint32_t waddr) { - if (devinfo->ver < 40) - return false; - if (tmu_write_is_sequence_terminator(waddr)) return false; @@ -253,8 +265,7 @@ process_waddr_deps(struct schedule_state *state, struct schedule_node *n, break; case 
V3D_QPU_WADDR_UNIFA: - if (state->devinfo->ver >= 40) - add_write_dep(state, &state->last_unifa, n); + add_write_dep(state, &state->last_unifa, n); break; case V3D_QPU_WADDR_NOP: @@ -283,6 +294,10 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) /* If the input and output segments are shared, then all VPM reads to * a location need to happen before all writes. We handle this by * serializing all VPM operations for now. + * + * FIXME: we are assuming that the segments are shared. That is + * correct right now as we are only using shared, but technically you + * can choose. */ bool separate_vpm_segment = false; @@ -303,15 +318,39 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) /* XXX: LOAD_IMM */ - if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) - process_mux_deps(state, n, inst->alu.add.a); - if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) - process_mux_deps(state, n, inst->alu.add.b); + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) { + if (devinfo->ver < 71) { + process_mux_deps(state, n, inst->alu.add.a.mux); + } else { + process_raddr_deps(state, n, inst->alu.add.a.raddr, + inst->sig.small_imm_a); + } + } + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) { + if (devinfo->ver < 71) { + process_mux_deps(state, n, inst->alu.add.b.mux); + } else { + process_raddr_deps(state, n, inst->alu.add.b.raddr, + inst->sig.small_imm_b); + } + } - if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) - process_mux_deps(state, n, inst->alu.mul.a); - if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) - process_mux_deps(state, n, inst->alu.mul.b); + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) { + if (devinfo->ver < 71) { + process_mux_deps(state, n, inst->alu.mul.a.mux); + } else { + process_raddr_deps(state, n, inst->alu.mul.a.raddr, + inst->sig.small_imm_c); + } + } + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) { + if (devinfo->ver < 71) { + process_mux_deps(state, n, inst->alu.mul.b.mux); + } else { + process_raddr_deps(state, n, inst->alu.mul.b.raddr, + inst->sig.small_imm_d); + } + } switch (inst->alu.add.op) { case V3D_QPU_A_VPMSETUP: @@ -340,13 +379,24 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) case V3D_QPU_A_MSF: add_read_dep(state, state->last_tlb, n); + add_read_dep(state, state->last_setmsf, n); break; case V3D_QPU_A_SETMSF: + add_write_dep(state, &state->last_setmsf, n); + add_write_dep(state, &state->last_tmu_write, n); + FALLTHROUGH; case V3D_QPU_A_SETREVF: add_write_dep(state, &state->last_tlb, n); break; + case V3D_QPU_A_BALLOT: + case V3D_QPU_A_BCASTF: + case V3D_QPU_A_ALLEQ: + case V3D_QPU_A_ALLFEQ: + add_read_dep(state, state->last_setmsf, n); + break; + default: break; } @@ -384,6 +434,8 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) add_write_dep(state, &state->last_r[4], n); if (v3d_qpu_writes_r5(devinfo, inst)) add_write_dep(state, &state->last_r[5], n); + if (v3d_qpu_writes_rf0_implicitly(devinfo, inst)) + add_write_dep(state, &state->last_rf[0], n); /* If we add any more dependencies here we should consider whether we * also need to update qpu_inst_after_thrsw_valid_in_delay_slot. 
@@ -492,9 +544,16 @@ struct choose_scoreboard { int last_thrsw_tick; int last_branch_tick; int last_setmsf_tick; - bool tlb_locked; + bool first_thrsw_emitted; + bool last_thrsw_emitted; bool fixup_ldvary; int ldvary_count; + int pending_ldtmu_count; + bool first_ldtmu_after_thrsw; + + /* V3D 7.x */ + int last_implicit_rf0_write_tick; + bool has_rf0_flops_conflict; }; static bool @@ -519,7 +578,24 @@ mux_reads_too_soon(struct choose_scoreboard *scoreboard, } static bool -reads_too_soon_after_write(struct choose_scoreboard *scoreboard, +reads_too_soon(struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst, uint8_t raddr) +{ + switch (raddr) { + case 0: /* ldvary delayed write of C coefficient to rf0 */ + if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1) + return true; + break; + default: + break; + } + + return false; +} + +static bool +reads_too_soon_after_write(const struct v3d_device_info *devinfo, + struct choose_scoreboard *scoreboard, struct qinst *qinst) { const struct v3d_qpu_instr *inst = &qinst->qpu; @@ -531,24 +607,44 @@ reads_too_soon_after_write(struct choose_scoreboard *scoreboard, assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); if (inst->alu.add.op != V3D_QPU_A_NOP) { - if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 && - mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) { - return true; + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) { + if (devinfo->ver < 71) { + if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.a.mux)) + return true; + } else { + if (reads_too_soon(scoreboard, inst, inst->alu.add.a.raddr)) + return true; + } } - if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 && - mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) { - return true; + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) { + if (devinfo->ver < 71) { + if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.b.mux)) + return true; + } else { + if (reads_too_soon(scoreboard, inst, inst->alu.add.b.raddr)) + return true; + } } } if (inst->alu.mul.op != V3D_QPU_M_NOP) { - if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 && - mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) { - return true; + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) { + if (devinfo->ver < 71) { + if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a.mux)) + return true; + } else { + if (reads_too_soon(scoreboard, inst, inst->alu.mul.a.raddr)) + return true; + } } - if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 && - mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) { - return true; + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) { + if (devinfo->ver < 71) { + if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b.mux)) + return true; + } else { + if (reads_too_soon(scoreboard, inst, inst->alu.mul.b.raddr)) + return true; + } } } @@ -572,45 +668,83 @@ writes_too_soon_after_write(const struct v3d_device_info *devinfo, v3d_qpu_writes_r4(devinfo, inst)) return true; + if (devinfo->ver == 42) + return false; + + /* Don't schedule anything that writes rf0 right after ldvary, since + * that would clash with the ldvary's delayed rf0 write (the exception + * is another ldvary, since its implicit rf0 write would also have + * one cycle of delay and would not clash). 
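+ *
+ * For example, if the ldvary is scheduled at tick N its implicit
+ * rf0 write lands at tick N + 1, so an instruction placed at tick
+ * N + 1 that writes rf0 explicitly (or implicitly, unless it is
+ * itself an ldvary) must be rejected here.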
+ */ + if (scoreboard->last_ldvary_tick + 1 == scoreboard->tick && + (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) || + (v3d_qpu_writes_rf0_implicitly(devinfo, inst) && + !inst->sig.ldvary))) { + return true; + } + return false; } static bool -pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard, +scoreboard_is_locked(struct choose_scoreboard *scoreboard, + bool lock_scoreboard_on_first_thrsw) +{ + if (lock_scoreboard_on_first_thrsw) { + return scoreboard->first_thrsw_emitted && + scoreboard->tick - scoreboard->last_thrsw_tick >= 3; + } + + return scoreboard->last_thrsw_emitted && + scoreboard->tick - scoreboard->last_thrsw_tick >= 3; +} + +static bool +pixel_scoreboard_too_soon(struct v3d_compile *c, + struct choose_scoreboard *scoreboard, const struct v3d_qpu_instr *inst) { - return (scoreboard->tick == 0 && qpu_inst_is_tlb(inst)); + return qpu_inst_is_tlb(inst) && + !scoreboard_is_locked(scoreboard, + c->lock_scoreboard_on_first_thrsw); } static bool -qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst, +qpu_instruction_uses_rf(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst, uint32_t waddr) { if (inst->type != V3D_QPU_INSTR_TYPE_ALU) return false; - if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) && - inst->raddr_a == waddr) - return true; + if (devinfo->ver < 71) { + if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) && + inst->raddr_a == waddr) + return true; - if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) && - !inst->sig.small_imm && (inst->raddr_b == waddr)) - return true; + if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) && + !inst->sig.small_imm_b && (inst->raddr_b == waddr)) + return true; + } else { + if (v3d71_qpu_reads_raddr(inst, waddr)) + return true; + } return false; } static bool -mux_read_stalls(struct choose_scoreboard *scoreboard, - const struct v3d_qpu_instr *inst) +read_stalls(const struct v3d_device_info *devinfo, + struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst) { return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 && - qpu_instruction_uses_rf(inst, + qpu_instruction_uses_rf(devinfo, inst, scoreboard->last_stallable_sfu_reg); } /* We define a max schedule priority to allow negative priorities as result of - * substracting this max when an instruction stalls. So instructions that + * subtracting this max when an instruction stalls. So instructions that * stall have lower priority than regular instructions. */ #define MAX_SCHEDULE_PRIORITY 16 @@ -628,19 +762,32 @@ get_instruction_priority(const struct v3d_device_info *devinfo, return next_score; next_score++; + /* Empirical testing shows that using priorities to hide latency of + * TMU operations when scheduling QPU leads to slightly worse + * performance, even at 2 threads. We think this is because the thread + * switching is already quite effective at hiding latency and NIR + * scheduling (and possibly TMU pipelining too) are sufficient to hide + * TMU latency, so piling up on that here doesn't provide any benefits + * and instead may cause us to postpone critical paths that depend on + * the TMU results. + */ +#if 0 /* Schedule texture read results collection late to hide latency. */ if (v3d_qpu_waits_on_tmu(inst)) return next_score; next_score++; +#endif /* Default score for things that aren't otherwise special. */ baseline_score = next_score; next_score++; +#if 0 /* Schedule texture read setup early to hide their latency better. 
*/ if (v3d_qpu_writes_tmu(devinfo, inst)) return next_score; next_score++; +#endif /* We should increase the maximum if we assert here */ assert(next_score < MAX_SCHEDULE_PRIORITY); @@ -648,48 +795,59 @@ get_instruction_priority(const struct v3d_device_info *devinfo, return baseline_score; } -static bool -qpu_magic_waddr_is_periph(const struct v3d_device_info *devinfo, - enum v3d_qpu_waddr waddr) -{ - return (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) || - v3d_qpu_magic_waddr_is_sfu(waddr) || - v3d_qpu_magic_waddr_is_tlb(waddr) || - v3d_qpu_magic_waddr_is_vpm(waddr) || - v3d_qpu_magic_waddr_is_tsy(waddr)); -} +enum { + V3D_PERIPHERAL_VPM_READ = (1 << 0), + V3D_PERIPHERAL_VPM_WRITE = (1 << 1), + V3D_PERIPHERAL_VPM_WAIT = (1 << 2), + V3D_PERIPHERAL_SFU = (1 << 3), + V3D_PERIPHERAL_TMU_WRITE = (1 << 4), + V3D_PERIPHERAL_TMU_READ = (1 << 5), + V3D_PERIPHERAL_TMU_WAIT = (1 << 6), + V3D_PERIPHERAL_TMU_WRTMUC_SIG = (1 << 7), + V3D_PERIPHERAL_TSY = (1 << 8), + V3D_PERIPHERAL_TLB_READ = (1 << 9), + V3D_PERIPHERAL_TLB_WRITE = (1 << 10), +}; -static bool -qpu_accesses_peripheral(const struct v3d_device_info *devinfo, - const struct v3d_qpu_instr *inst) +static uint32_t +qpu_peripherals(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst) { - if (v3d_qpu_uses_vpm(inst)) - return true; + uint32_t result = 0; + if (v3d_qpu_reads_vpm(inst)) + result |= V3D_PERIPHERAL_VPM_READ; + if (v3d_qpu_writes_vpm(inst)) + result |= V3D_PERIPHERAL_VPM_WRITE; + if (v3d_qpu_waits_vpm(inst)) + result |= V3D_PERIPHERAL_VPM_WAIT; + + if (v3d_qpu_writes_tmu(devinfo, inst)) + result |= V3D_PERIPHERAL_TMU_WRITE; + if (inst->sig.ldtmu) + result |= V3D_PERIPHERAL_TMU_READ; + if (inst->sig.wrtmuc) + result |= V3D_PERIPHERAL_TMU_WRTMUC_SIG; + if (v3d_qpu_uses_sfu(inst)) - return true; + result |= V3D_PERIPHERAL_SFU; + + if (v3d_qpu_reads_tlb(inst)) + result |= V3D_PERIPHERAL_TLB_READ; + if (v3d_qpu_writes_tlb(inst)) + result |= V3D_PERIPHERAL_TLB_WRITE; if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { if (inst->alu.add.op != V3D_QPU_A_NOP && inst->alu.add.magic_write && - qpu_magic_waddr_is_periph(devinfo, inst->alu.add.waddr)) { - return true; + v3d_qpu_magic_waddr_is_tsy(inst->alu.add.waddr)) { + result |= V3D_PERIPHERAL_TSY; } if (inst->alu.add.op == V3D_QPU_A_TMUWT) - return true; - - if (inst->alu.mul.op != V3D_QPU_M_NOP && - inst->alu.mul.magic_write && - qpu_magic_waddr_is_periph(devinfo, inst->alu.mul.waddr)) { - return true; - } + result |= V3D_PERIPHERAL_TMU_WAIT; } - return (inst->sig.ldvpm || - inst->sig.ldtmu || - inst->sig.ldtlb || - inst->sig.ldtlbu || - inst->sig.wrtmuc); + return result; } static bool @@ -697,30 +855,82 @@ qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *a, const struct v3d_qpu_instr *b) { - const bool a_uses_peripheral = qpu_accesses_peripheral(devinfo, a); - const bool b_uses_peripheral = qpu_accesses_peripheral(devinfo, b); + const uint32_t a_peripherals = qpu_peripherals(devinfo, a); + const uint32_t b_peripherals = qpu_peripherals(devinfo, b); /* We can always do one peripheral access per instruction. */ - if (!a_uses_peripheral || !b_uses_peripheral) + if (util_bitcount(a_peripherals) + util_bitcount(b_peripherals) <= 1) return true; - if (devinfo->ver < 41) + /* V3D 4.x can't do more than one peripheral access except in a + * few cases: + */ + if (devinfo->ver == 42) { + /* WRTMUC signal with TMU register write (other than tmuc). 
*/ + if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG && + b_peripherals == V3D_PERIPHERAL_TMU_WRITE) { + return v3d_qpu_writes_tmu_not_tmuc(devinfo, b); + } + if (b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG && + a_peripherals == V3D_PERIPHERAL_TMU_WRITE) { + return v3d_qpu_writes_tmu_not_tmuc(devinfo, a); + } + + /* TMU read with VPM read/write. */ + if (a_peripherals == V3D_PERIPHERAL_TMU_READ && + (b_peripherals == V3D_PERIPHERAL_VPM_READ || + b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) { + return true; + } + if (b_peripherals == V3D_PERIPHERAL_TMU_READ && + (a_peripherals == V3D_PERIPHERAL_VPM_READ || + a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) { + return true; + } + return false; + } - /* V3D 4.1 and later allow TMU read along with a VPM read or write, and - * WRTMUC with a TMU magic register write (other than tmuc). - */ - if ((a->sig.ldtmu && v3d_qpu_reads_or_writes_vpm(b)) || - (b->sig.ldtmu && v3d_qpu_reads_or_writes_vpm(a))) { - return true; + /* V3D 7.x can't have more than one of these restricted peripherals */ + const uint32_t restricted = V3D_PERIPHERAL_TMU_WRITE | + V3D_PERIPHERAL_TMU_WRTMUC_SIG | + V3D_PERIPHERAL_TSY | + V3D_PERIPHERAL_TLB_READ | + V3D_PERIPHERAL_SFU | + V3D_PERIPHERAL_VPM_READ | + V3D_PERIPHERAL_VPM_WRITE; + + const uint32_t a_restricted = a_peripherals & restricted; + const uint32_t b_restricted = b_peripherals & restricted; + if (a_restricted && b_restricted) { + /* WRTMUC signal with TMU register write (other than tmuc) is + * allowed though. + */ + if (!((a_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG && + b_restricted == V3D_PERIPHERAL_TMU_WRITE && + v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) || + (b_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG && + a_restricted == V3D_PERIPHERAL_TMU_WRITE && + v3d_qpu_writes_tmu_not_tmuc(devinfo, a)))) { + return false; + } } - if ((a->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) || - (b->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(devinfo, a))) { - return true; + /* Only one TMU read per instruction */ + if ((a_peripherals & V3D_PERIPHERAL_TMU_READ) && + (b_peripherals & V3D_PERIPHERAL_TMU_READ)) { + return false; } - return false; + /* Only one TLB access per instruction */ + if ((a_peripherals & (V3D_PERIPHERAL_TLB_WRITE | + V3D_PERIPHERAL_TLB_READ)) && + (b_peripherals & (V3D_PERIPHERAL_TLB_WRITE | + V3D_PERIPHERAL_TLB_READ))) { + return false; + } + + return true; } /* Compute a bitmask of which rf registers are used between @@ -736,42 +946,67 @@ qpu_raddrs_used(const struct v3d_qpu_instr *a, uint64_t raddrs_used = 0; if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A)) raddrs_used |= (1ll << a->raddr_a); - if (!a->sig.small_imm && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B)) + if (!a->sig.small_imm_b && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B)) raddrs_used |= (1ll << a->raddr_b); if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A)) raddrs_used |= (1ll << b->raddr_a); - if (!b->sig.small_imm && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) + if (!b->sig.small_imm_b && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) raddrs_used |= (1ll << b->raddr_b); return raddrs_used; } -/* Take two instructions and attempt to merge their raddr fields - * into one merged instruction. Returns false if the two instructions - * access more than two different rf registers between them, or more - * than one rf register and one small immediate. +/* Takes two instructions and attempts to merge their raddr fields (including + * small immediates) into one merged instruction. 
For V3D 4.x, returns false + * if the two instructions access more than two different rf registers between + * them, or more than one rf register and one small immediate. For 7.x returns + * false if both instructions use small immediates. */ static bool qpu_merge_raddrs(struct v3d_qpu_instr *result, const struct v3d_qpu_instr *add_instr, - const struct v3d_qpu_instr *mul_instr) + const struct v3d_qpu_instr *mul_instr, + const struct v3d_device_info *devinfo) { + if (devinfo->ver >= 71) { + assert(add_instr->sig.small_imm_a + + add_instr->sig.small_imm_b <= 1); + assert(add_instr->sig.small_imm_c + + add_instr->sig.small_imm_d == 0); + assert(mul_instr->sig.small_imm_a + + mul_instr->sig.small_imm_b == 0); + assert(mul_instr->sig.small_imm_c + + mul_instr->sig.small_imm_d <= 1); + + result->sig.small_imm_a = add_instr->sig.small_imm_a; + result->sig.small_imm_b = add_instr->sig.small_imm_b; + result->sig.small_imm_c = mul_instr->sig.small_imm_c; + result->sig.small_imm_d = mul_instr->sig.small_imm_d; + + return (result->sig.small_imm_a + + result->sig.small_imm_b + + result->sig.small_imm_c + + result->sig.small_imm_d) <= 1; + } + + assert(devinfo->ver == 42); + uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr); int naddrs = util_bitcount64(raddrs_used); if (naddrs > 2) return false; - if ((add_instr->sig.small_imm || mul_instr->sig.small_imm)) { + if ((add_instr->sig.small_imm_b || mul_instr->sig.small_imm_b)) { if (naddrs > 1) return false; - if (add_instr->sig.small_imm && mul_instr->sig.small_imm) + if (add_instr->sig.small_imm_b && mul_instr->sig.small_imm_b) if (add_instr->raddr_b != mul_instr->raddr_b) return false; - result->sig.small_imm = true; - result->raddr_b = add_instr->sig.small_imm ? + result->sig.small_imm_b = true; + result->raddr_b = add_instr->sig.small_imm_b ? 
add_instr->raddr_b : mul_instr->raddr_b; } @@ -782,23 +1017,23 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result, raddrs_used &= ~(1ll << raddr_a); result->raddr_a = raddr_a; - if (!result->sig.small_imm) { + if (!result->sig.small_imm_b) { if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) && raddr_a == add_instr->raddr_b) { - if (add_instr->alu.add.a == V3D_QPU_MUX_B) - result->alu.add.a = V3D_QPU_MUX_A; - if (add_instr->alu.add.b == V3D_QPU_MUX_B && + if (add_instr->alu.add.a.mux == V3D_QPU_MUX_B) + result->alu.add.a.mux = V3D_QPU_MUX_A; + if (add_instr->alu.add.b.mux == V3D_QPU_MUX_B && v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) { - result->alu.add.b = V3D_QPU_MUX_A; + result->alu.add.b.mux = V3D_QPU_MUX_A; } } if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) && raddr_a == mul_instr->raddr_b) { - if (mul_instr->alu.mul.a == V3D_QPU_MUX_B) - result->alu.mul.a = V3D_QPU_MUX_A; - if (mul_instr->alu.mul.b == V3D_QPU_MUX_B && + if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_B) + result->alu.mul.a.mux = V3D_QPU_MUX_A; + if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_B && v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) { - result->alu.mul.b = V3D_QPU_MUX_A; + result->alu.mul.b.mux = V3D_QPU_MUX_A; } } } @@ -809,20 +1044,20 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result, result->raddr_b = raddr_b; if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) && raddr_b == add_instr->raddr_a) { - if (add_instr->alu.add.a == V3D_QPU_MUX_A) - result->alu.add.a = V3D_QPU_MUX_B; - if (add_instr->alu.add.b == V3D_QPU_MUX_A && + if (add_instr->alu.add.a.mux == V3D_QPU_MUX_A) + result->alu.add.a.mux = V3D_QPU_MUX_B; + if (add_instr->alu.add.b.mux == V3D_QPU_MUX_A && v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) { - result->alu.add.b = V3D_QPU_MUX_B; + result->alu.add.b.mux = V3D_QPU_MUX_B; } } if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) && raddr_b == mul_instr->raddr_a) { - if (mul_instr->alu.mul.a == V3D_QPU_MUX_A) - result->alu.mul.a = V3D_QPU_MUX_B; - if (mul_instr->alu.mul.b == V3D_QPU_MUX_A && + if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_A) + result->alu.mul.a.mux = V3D_QPU_MUX_B; + if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_A && v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) { - result->alu.mul.b = V3D_QPU_MUX_B; + result->alu.mul.b.mux = V3D_QPU_MUX_B; } } @@ -855,7 +1090,8 @@ add_op_as_mul_op(enum v3d_qpu_add_op op) } static void -qpu_convert_add_to_mul(struct v3d_qpu_instr *inst) +qpu_convert_add_to_mul(const struct v3d_device_info *devinfo, + struct v3d_qpu_instr *inst) { STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add)); assert(inst->alu.add.op != V3D_QPU_A_NOP); @@ -871,6 +1107,87 @@ qpu_convert_add_to_mul(struct v3d_qpu_instr *inst) inst->flags.ac = V3D_QPU_COND_NONE; inst->flags.apf = V3D_QPU_PF_NONE; inst->flags.auf = V3D_QPU_UF_NONE; + + inst->alu.mul.output_pack = inst->alu.add.output_pack; + + inst->alu.mul.a.unpack = inst->alu.add.a.unpack; + inst->alu.mul.b.unpack = inst->alu.add.b.unpack; + inst->alu.add.output_pack = V3D_QPU_PACK_NONE; + inst->alu.add.a.unpack = V3D_QPU_UNPACK_NONE; + inst->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; + + if (devinfo->ver >= 71) { + assert(!inst->sig.small_imm_c && !inst->sig.small_imm_d); + assert(inst->sig.small_imm_a + inst->sig.small_imm_b <= 1); + if (inst->sig.small_imm_a) { + inst->sig.small_imm_c = true; + inst->sig.small_imm_a = false; + } else if (inst->sig.small_imm_b) { + inst->sig.small_imm_d = true; + inst->sig.small_imm_b = false; + } + } +} + +static bool +can_do_mul_as_add(const struct v3d_device_info *devinfo, enum 
v3d_qpu_mul_op op) +{ + switch (op) { + case V3D_QPU_M_MOV: + case V3D_QPU_M_FMOV: + return devinfo->ver >= 71; + default: + return false; + } +} + +static enum v3d_qpu_mul_op +mul_op_as_add_op(enum v3d_qpu_mul_op op) +{ + switch (op) { + case V3D_QPU_M_MOV: + return V3D_QPU_A_MOV; + case V3D_QPU_M_FMOV: + return V3D_QPU_A_FMOV; + default: + unreachable("unexpected mov opcode"); + } +} + +static void +qpu_convert_mul_to_add(struct v3d_qpu_instr *inst) +{ + STATIC_ASSERT(sizeof(inst->alu.add) == sizeof(inst->alu.mul)); + assert(inst->alu.mul.op != V3D_QPU_M_NOP); + assert(inst->alu.add.op == V3D_QPU_A_NOP); + + memcpy(&inst->alu.add, &inst->alu.mul, sizeof(inst->alu.add)); + inst->alu.add.op = mul_op_as_add_op(inst->alu.mul.op); + inst->alu.mul.op = V3D_QPU_M_NOP; + + inst->flags.ac = inst->flags.mc; + inst->flags.apf = inst->flags.mpf; + inst->flags.auf = inst->flags.muf; + inst->flags.mc = V3D_QPU_COND_NONE; + inst->flags.mpf = V3D_QPU_PF_NONE; + inst->flags.muf = V3D_QPU_UF_NONE; + + inst->alu.add.output_pack = inst->alu.mul.output_pack; + inst->alu.add.a.unpack = inst->alu.mul.a.unpack; + inst->alu.add.b.unpack = inst->alu.mul.b.unpack; + inst->alu.mul.output_pack = V3D_QPU_PACK_NONE; + inst->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE; + inst->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; + + assert(!inst->sig.small_imm_a && !inst->sig.small_imm_b); + assert(inst->sig.small_imm_c + inst->sig.small_imm_d <= 1); + if (inst->sig.small_imm_c) { + inst->sig.small_imm_a = true; + inst->sig.small_imm_c = false; + } else if (inst->sig.small_imm_d) { + inst->sig.small_imm_b = true; + inst->sig.small_imm_d = false; + } } static bool @@ -909,20 +1226,20 @@ qpu_merge_inst(const struct v3d_device_info *devinfo, else if (a->alu.mul.op == V3D_QPU_M_NOP && can_do_add_as_mul(b->alu.add.op)) { mul_inst = *b; - qpu_convert_add_to_mul(&mul_inst); + qpu_convert_add_to_mul(devinfo, &mul_inst); merge.alu.mul = mul_inst.alu.mul; - merge.flags.mc = b->flags.ac; - merge.flags.mpf = b->flags.apf; - merge.flags.muf = b->flags.auf; + merge.flags.mc = mul_inst.flags.mc; + merge.flags.mpf = mul_inst.flags.mpf; + merge.flags.muf = mul_inst.flags.muf; add_instr = a; mul_instr = &mul_inst; } else if (a->alu.mul.op == V3D_QPU_M_NOP && can_do_add_as_mul(a->alu.add.op)) { mul_inst = *a; - qpu_convert_add_to_mul(&mul_inst); + qpu_convert_add_to_mul(devinfo, &mul_inst); merge = mul_inst; merge.alu.add = b->alu.add; @@ -938,22 +1255,62 @@ qpu_merge_inst(const struct v3d_device_info *devinfo, } } + struct v3d_qpu_instr add_inst; if (b->alu.mul.op != V3D_QPU_M_NOP) { - if (a->alu.mul.op != V3D_QPU_M_NOP) - return false; - merge.alu.mul = b->alu.mul; + if (a->alu.mul.op == V3D_QPU_M_NOP) { + merge.alu.mul = b->alu.mul; - merge.flags.mc = b->flags.mc; - merge.flags.mpf = b->flags.mpf; - merge.flags.muf = b->flags.muf; + merge.flags.mc = b->flags.mc; + merge.flags.mpf = b->flags.mpf; + merge.flags.muf = b->flags.muf; - mul_instr = b; - add_instr = a; + mul_instr = b; + add_instr = a; + } + /* If a's mul op is used but its add op is not, then see if we + * can convert either a's mul op or b's mul op to an add op + * so we can merge. 
+ */
+ else if (a->alu.add.op == V3D_QPU_A_NOP &&
+ can_do_mul_as_add(devinfo, b->alu.mul.op)) {
+ add_inst = *b;
+ qpu_convert_mul_to_add(&add_inst);
+
+ merge.alu.add = add_inst.alu.add;
+
+ merge.flags.ac = add_inst.flags.ac;
+ merge.flags.apf = add_inst.flags.apf;
+ merge.flags.auf = add_inst.flags.auf;
+
+ mul_instr = a;
+ add_instr = &add_inst;
+ } else if (a->alu.add.op == V3D_QPU_A_NOP &&
+ can_do_mul_as_add(devinfo, a->alu.mul.op)) {
+ add_inst = *a;
+ qpu_convert_mul_to_add(&add_inst);
+
+ merge = add_inst;
+ merge.alu.mul = b->alu.mul;
+
+ merge.flags.mc = b->flags.mc;
+ merge.flags.mpf = b->flags.mpf;
+ merge.flags.muf = b->flags.muf;
+
+ mul_instr = b;
+ add_instr = &add_inst;
+ } else {
+ return false;
+ }
 }
+ /* V3D 4.x and earlier use muxes to select the inputs for the ALUs and
+ * they have restrictions on the number of raddrs that can be addressed
+ * in a single instruction. In V3D 7.x, we don't have that restriction,
+ * but we are still limited to a single small immediate per instruction.
+ */
 if (add_instr && mul_instr &&
- !qpu_merge_raddrs(&merge, add_instr, mul_instr)) {
- return false;
+ !qpu_merge_raddrs(&merge, add_instr, mul_instr, devinfo)) {
+ return false;
 }
 merge.sig.thrsw |= b->sig.thrsw;
@@ -964,7 +1321,6 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
 merge.sig.ldtmu |= b->sig.ldtmu;
 merge.sig.ldvary |= b->sig.ldvary;
 merge.sig.ldvpm |= b->sig.ldvpm;
- merge.sig.small_imm |= b->sig.small_imm;
 merge.sig.ldtlb |= b->sig.ldtlb;
 merge.sig.ldtlbu |= b->sig.ldtlbu;
 merge.sig.ucb |= b->sig.ucb;
@@ -1047,24 +1403,25 @@ retry:
 * regfile A or B that was written to by the previous
 * instruction."
 */
- if (reads_too_soon_after_write(scoreboard, n->inst))
+ if (reads_too_soon_after_write(c->devinfo, scoreboard, n->inst))
 continue;
 if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst))
 continue;
- /* "A scoreboard wait must not occur in the first two
- * instructions of a fragment shader. This is either the
- * explicit Wait for Scoreboard signal or an implicit wait
- * with the first tile-buffer read or write instruction."
+ /* "Before doing a TLB access a scoreboard wait must have been
+ * done. This happens either on the first or last thread
+ * switch, depending on a setting (scb_wait_on_first_thrsw) in
+ * the shader state."
 */
- if (pixel_scoreboard_too_soon(scoreboard, inst))
+ if (pixel_scoreboard_too_soon(c, scoreboard, inst))
 continue;
- /* ldunif and ldvary both write r5, but ldunif does so a tick
- * sooner. If the ldvary's r5 wasn't used, then ldunif might
+ /* ldunif and ldvary both write the same register (r5 for v42
+ * and below, rf0 for v71), but ldunif does so a tick sooner.
+ * If the ldvary's register wasn't used, then ldunif might
 * otherwise get scheduled so ldunif and ldvary try to update
- * r5 in the same tick.
+ * the register in the same tick.
 */
 if ((inst->sig.ldunif || inst->sig.ldunifa) &&
 scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
@@ -1131,24 +1488,54 @@ retry:
 continue;
 }
- /* Don't merge in something that will lock the TLB.
- * Hopwefully what we have in inst will release some
- * other instructions, allowing us to delay the
- * TLB-locking instruction until later.
+ /* Don't merge TLB instructions before we have acquired
+ * the scoreboard lock.
 */
- if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst))
+ if (pixel_scoreboard_too_soon(c, scoreboard, inst))
 continue;
- /* When we succesfully pair up an ldvary we then try
+ /* When we successfully pair up an ldvary we then try
 * to merge it into the previous instruction if
 * possible to improve pipelining. Don't pick up the
 * ldvary now if the follow-up fixup would place
 * it in the delay slots of a thrsw, which is not
 * allowed and would prevent the fixup from being
- * successul.
+ * successful. In V3D 7.x we can allow this to happen
+ * as long as it is not the last delay slot.
 */
- if (inst->sig.ldvary &&
- scoreboard->last_thrsw_tick + 2 >= scoreboard->tick - 1) {
+ if (inst->sig.ldvary) {
+ if (c->devinfo->ver == 42 &&
+ scoreboard->last_thrsw_tick + 2 >=
+ scoreboard->tick - 1) {
+ continue;
+ }
+ if (c->devinfo->ver >= 71 &&
+ scoreboard->last_thrsw_tick + 2 ==
+ scoreboard->tick - 1) {
+ continue;
+ }
+ }
+
+ /* We can emit a new tmu lookup with a previous ldtmu
+ * if doing this would free just enough space in the
+ * TMU output fifo so we don't overflow, however, this
+ * is only safe if the ldtmu cannot stall.
+ *
+ * A ldtmu can stall if it is not the first following a
+ * thread switch and corresponds to the first word of a
+ * read request.
+ *
+ * FIXME: For now we forbid pairing up a new lookup
+ * with a previous ldtmu that is not the first after a
+ * thrsw if that could overflow the TMU output fifo
+ * regardless of whether the ldtmu is reading the first
+ * word of a TMU result or not, since we don't track
+ * this aspect in the compiler yet.
+ */
+ if (prev_inst->inst->qpu.sig.ldtmu &&
+ !scoreboard->first_ldtmu_after_thrsw &&
+ (scoreboard->pending_ldtmu_count +
+ n->inst->ldtmu_count > 16 / c->threads)) {
 continue;
 }
@@ -1161,7 +1548,7 @@ retry:
 int prio = get_instruction_priority(c->devinfo, inst);
- if (mux_read_stalls(scoreboard, inst)) {
+ if (read_stalls(c->devinfo, scoreboard, inst)) {
 /* Don't merge an instruction that stalls */
 if (prev_inst)
 continue;
@@ -1225,7 +1612,7 @@ update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
 {
 if (v3d_qpu_magic_waddr_is_sfu(waddr))
 scoreboard->last_magic_sfu_write_tick = scoreboard->tick;
- else if (devinfo->ver >= 40 && waddr == V3D_QPU_WADDR_UNIFA)
+ else if (waddr == V3D_QPU_WADDR_UNIFA)
 scoreboard->last_unifa_write_tick = scoreboard->tick;
 }
@@ -1240,10 +1627,87 @@ update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard,
 }
 static void
+update_scoreboard_tmu_tracking(struct choose_scoreboard *scoreboard,
+ const struct qinst *inst)
+{
+ /* Track if we have seen any ldtmu after the last thread switch */
+ if (scoreboard->tick == scoreboard->last_thrsw_tick + 2)
+ scoreboard->first_ldtmu_after_thrsw = true;
+
+ /* Track the number of pending ldtmu instructions for outstanding
+ * TMU lookups.
+ */ + scoreboard->pending_ldtmu_count += inst->ldtmu_count; + if (inst->qpu.sig.ldtmu) { + assert(scoreboard->pending_ldtmu_count > 0); + scoreboard->pending_ldtmu_count--; + scoreboard->first_ldtmu_after_thrsw = false; + } +} + +static void +set_has_rf0_flops_conflict(struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst, + const struct v3d_device_info *devinfo) +{ + if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick && + v3d_qpu_sig_writes_address(devinfo, &inst->sig) && + !inst->sig_magic) { + scoreboard->has_rf0_flops_conflict = true; + } +} + +static void +update_scoreboard_for_rf0_flops(struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst, + const struct v3d_device_info *devinfo) +{ + if (devinfo->ver < 71) + return; + + /* Thread switch restrictions: + * + * At the point of a thread switch or thread end (when the actual + * thread switch or thread end happens, not when the signalling + * instruction is processed): + * + * - If the most recent write to rf0 was from a ldunif, ldunifa, or + * ldvary instruction in which another signal also wrote to the + * register file, and the final instruction of the thread section + * contained a signal which wrote to the register file, then the + * value of rf0 is undefined at the start of the new section + * + * Here we use the scoreboard to track if our last rf0 implicit write + * happens at the same time that another signal writes the register + * file (has_rf0_flops_conflict). We will use that information when + * scheduling thrsw instructions to avoid putting anything in their + * last delay slot which has a signal that writes to the register file. + */ + + /* Reset tracking if we have an explicit rf0 write or we are starting + * a new thread section. + */ + if (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) || + scoreboard->tick - scoreboard->last_thrsw_tick == 3) { + scoreboard->last_implicit_rf0_write_tick = -10; + scoreboard->has_rf0_flops_conflict = false; + } + + if (v3d_qpu_writes_rf0_implicitly(devinfo, inst)) { + scoreboard->last_implicit_rf0_write_tick = inst->sig.ldvary ? 
+ scoreboard->tick + 1 : scoreboard->tick; + } + + set_has_rf0_flops_conflict(scoreboard, inst, devinfo); +} + +static void update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, - const struct v3d_qpu_instr *inst, + const struct qinst *qinst, const struct v3d_device_info *devinfo) { + const struct v3d_qpu_instr *inst = &qinst->qpu; + if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) return; @@ -1271,11 +1735,18 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, } } + if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && inst->sig_magic) { + update_scoreboard_for_magic_waddr(scoreboard, + inst->sig_addr, + devinfo); + } + if (inst->sig.ldvary) scoreboard->last_ldvary_tick = scoreboard->tick; - if (qpu_inst_is_tlb(inst)) - scoreboard->tlb_locked = true; + update_scoreboard_for_rf0_flops(scoreboard, inst, devinfo); + + update_scoreboard_tmu_tracking(scoreboard, qinst); } static void @@ -1352,23 +1823,25 @@ instruction_latency(const struct v3d_device_info *devinfo, after_inst->type != V3D_QPU_INSTR_TYPE_ALU) return latency; - if (before_inst->alu.add.magic_write) { + if (v3d_qpu_instr_is_sfu(before_inst)) + return 2; + + if (before_inst->alu.add.op != V3D_QPU_A_NOP && + before_inst->alu.add.magic_write) { latency = MAX2(latency, magic_waddr_latency(devinfo, before_inst->alu.add.waddr, after_inst)); } - if (before_inst->alu.mul.magic_write) { + if (before_inst->alu.mul.op != V3D_QPU_M_NOP && + before_inst->alu.mul.magic_write) { latency = MAX2(latency, magic_waddr_latency(devinfo, before_inst->alu.mul.waddr, after_inst)); } - if (v3d_qpu_instr_is_sfu(before_inst)) - return 2; - return latency; } @@ -1437,7 +1910,7 @@ insert_scheduled_instruction(struct v3d_compile *c, { list_addtail(&inst->link, &block->instructions); - update_scoreboard_for_chosen(scoreboard, &inst->qpu, c->devinfo); + update_scoreboard_for_chosen(scoreboard, inst, c->devinfo); c->qpu_inst_count++; scoreboard->tick++; } @@ -1464,16 +1937,13 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, { const struct v3d_qpu_instr *inst = &qinst->qpu; - /* Only TLB Z writes are prohibited in the last slot, but we don't - * have those flagged so prohibit all TLB ops for now. - */ - if (slot == 2 && qpu_inst_is_tlb(inst)) + if (slot == 2 && qinst->is_tlb_z_write) return false; if (slot > 0 && qinst->uniform != ~0) return false; - if (v3d_qpu_uses_vpm(inst)) + if (c->devinfo->ver == 42 && v3d_qpu_waits_vpm(inst)) return false; if (inst->sig.ldvary) @@ -1481,36 +1951,64 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { /* GFXH-1625: TMUWT not allowed in the final instruction. */ - if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT) + if (c->devinfo->ver == 42 && slot == 2 && + inst->alu.add.op == V3D_QPU_A_TMUWT) { return false; + } - /* No writing physical registers at the end. */ - if (!inst->alu.add.magic_write || - !inst->alu.mul.magic_write) { - return false; + if (c->devinfo->ver == 42) { + /* No writing physical registers at the end. 
*/ + bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP; + bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP; + if ((!add_is_nop && !inst->alu.add.magic_write) || + (!mul_is_nop && !inst->alu.mul.magic_write)) { + return false; + } + + if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) && + !inst->sig_magic) { + return false; + } } - if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF) - return false; + if (c->devinfo->ver >= 71) { + /* The thread end instruction must not write to the + * register file via the add/mul ALUs. + */ + if (slot == 0 && + (!inst->alu.add.magic_write || + !inst->alu.mul.magic_write)) { + return false; + } + } - /* RF0-2 might be overwritten during the delay slots by - * fragment shader setup. - */ - if (inst->raddr_a < 3 && - (inst->alu.add.a == V3D_QPU_MUX_A || - inst->alu.add.b == V3D_QPU_MUX_A || - inst->alu.mul.a == V3D_QPU_MUX_A || - inst->alu.mul.b == V3D_QPU_MUX_A)) { - return false; + if (c->devinfo->ver == 42) { + /* RF0-2 might be overwritten during the delay slots by + * fragment shader setup. + */ + if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A)) + return false; + + if (inst->raddr_b < 3 && + !inst->sig.small_imm_b && + v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) { + return false; + } } - if (inst->raddr_b < 3 && - !inst->sig.small_imm && - (inst->alu.add.a == V3D_QPU_MUX_B || - inst->alu.add.b == V3D_QPU_MUX_B || - inst->alu.mul.a == V3D_QPU_MUX_B || - inst->alu.mul.b == V3D_QPU_MUX_B)) { - return false; + if (c->devinfo->ver >= 71) { + /* RF2-3 might be overwritten during the delay slots by + * fragment shader setup. + */ + if (v3d71_qpu_reads_raddr(inst, 2) || + v3d71_qpu_reads_raddr(inst, 3)) { + return false; + } + + if (v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 2) || + v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 3)) { + return false; + } } } @@ -1526,6 +2024,7 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, */ static bool qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c, + struct choose_scoreboard *scoreboard, const struct qinst *qinst, uint32_t slot) { @@ -1533,15 +2032,19 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c, * thread. The simulator complains for safety, though it * would only occur for dead code in our case. */ - if (slot > 0 && - qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && - (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) || - v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) { - return false; + if (slot > 0) { + if (c->devinfo->ver == 42 && v3d_qpu_instr_is_legacy_sfu(&qinst->qpu)) + return false; + if (c->devinfo->ver >= 71 && v3d_qpu_instr_is_sfu(&qinst->qpu)) + return false; } - if (slot > 0 && qinst->qpu.sig.ldvary) - return false; + if (qinst->qpu.sig.ldvary) { + if (c->devinfo->ver == 42 && slot > 0) + return false; + if (c->devinfo->ver >= 71 && slot == 2) + return false; + } /* unifa and the following 3 instructions can't overlap a * thread switch/end. 
The docs further clarify that this means @@ -1560,6 +2063,17 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c, if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu)) return false; + /* See comment when we set has_rf0_flops_conflict for details */ + if (c->devinfo->ver >= 71 && + slot == 2 && + v3d_qpu_sig_writes_address(c->devinfo, &qinst->qpu.sig) && + !qinst->qpu.sig_magic) { + if (scoreboard->has_rf0_flops_conflict) + return false; + if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick) + return false; + } + return true; } @@ -1579,7 +2093,7 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c, assert(slot <= 2); /* We merge thrsw instructions back into the instruction stream - * manually, so any instructions scheduled after a thrsw shold be + * manually, so any instructions scheduled after a thrsw should be * in the actual delay slots and not in the same slot as the thrsw. */ assert(slot >= 1); @@ -1592,7 +2106,7 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c, * also apply to instructions scheduled after the thrsw that we want * to place in its delay slots. */ - if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot)) + if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard, qinst, slot)) return false; /* TLB access is disallowed until scoreboard wait is executed, which @@ -1648,6 +2162,14 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c, if (v3d_qpu_writes_flags(&qinst->qpu)) return false; + /* TSY sync ops materialize at the point of the next thread switch, + * therefore, if we have a TSY sync right after a thread switch, we + * cannot place it in its delay slots, or we would be moving the sync + * to the thrsw before it instead. + */ + if (qinst->qpu.alu.add.op == V3D_QPU_A_BARRIERID) + return false; + return true; } @@ -1656,15 +2178,11 @@ valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard struct qinst *qinst, int instructions_in_sequence, bool is_thrend) { - /* No emitting our thrsw while the previous thrsw hasn't happened yet. */ - if (scoreboard->last_thrsw_tick + 3 > - scoreboard->tick - instructions_in_sequence) { - return false; - } - for (int slot = 0; slot < instructions_in_sequence; slot++) { - if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot)) + if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard, + qinst, slot)) { return false; + } if (is_thrend && !qpu_inst_valid_in_thrend_slot(c, qinst, slot)) { @@ -1714,26 +2232,77 @@ emit_thrsw(struct v3d_compile *c, /* Find how far back into previous instructions we can put the THRSW. */ int slots_filled = 0; + int invalid_sig_count = 0; + int invalid_seq_count = 0; + bool last_thrsw_after_invalid_ok = false; struct qinst *merge_inst = NULL; vir_for_each_inst_rev(prev_inst, block) { - struct v3d_qpu_sig sig = prev_inst->qpu.sig; - sig.thrsw = true; - uint32_t packed_sig; - - if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) + /* No emitting our thrsw while the previous thrsw hasn't + * happened yet. + */ + if (scoreboard->last_thrsw_tick + 3 > + scoreboard->tick - (slots_filled + 1)) { break; + } + if (!valid_thrsw_sequence(c, scoreboard, prev_inst, slots_filled + 1, is_thrend)) { - break; + /* Even if the current sequence isn't valid, we may + * be able to get a valid sequence by trying to move the + * thrsw earlier, so keep going. 
+ */ + invalid_seq_count++; + goto cont_block; + } + + struct v3d_qpu_sig sig = prev_inst->qpu.sig; + sig.thrsw = true; + uint32_t packed_sig; + if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) { + /* If we can't merge the thrsw here because of signal + * incompatibility, keep going, we might be able to + * merge it in an earlier instruction. + */ + invalid_sig_count++; + goto cont_block; } + /* For last thrsw we need 2 consecutive slots that are + * thrsw compatible, so if we have previously jumped over + * an incompatible signal, flag that we have found the first + * valid slot here and keep going. + */ + if (inst->is_last_thrsw && invalid_sig_count > 0 && + !last_thrsw_after_invalid_ok) { + last_thrsw_after_invalid_ok = true; + invalid_sig_count++; + goto cont_block; + } + + /* We can merge the thrsw in this instruction */ + last_thrsw_after_invalid_ok = false; + invalid_sig_count = 0; + invalid_seq_count = 0; merge_inst = prev_inst; + +cont_block: if (++slots_filled == 3) break; } + /* If we jumped over a signal incompatibility and did not manage to + * merge the thrsw in the end, we need to adjust slots filled to match + * the last valid merge point. + */ + assert((invalid_sig_count == 0 && invalid_seq_count == 0) || + slots_filled >= invalid_sig_count + invalid_seq_count); + if (invalid_sig_count > 0) + slots_filled -= invalid_sig_count; + if (invalid_seq_count > 0) + slots_filled -= invalid_seq_count; + bool needs_free = false; if (merge_inst) { merge_inst->qpu.sig.thrsw = true; @@ -1747,6 +2316,8 @@ emit_thrsw(struct v3d_compile *c, merge_inst = inst; } + scoreboard->first_thrsw_emitted = true; + /* If we're emitting the last THRSW (other than program end), then * signal that to the HW by emitting two THRSWs in a row. */ @@ -1758,6 +2329,7 @@ emit_thrsw(struct v3d_compile *c, struct qinst *second_inst = (struct qinst *)merge_inst->link.next; second_inst->qpu.sig.thrsw = true; + scoreboard->last_thrsw_emitted = true; } /* Make sure the thread end executes within the program lifespan */ @@ -1811,10 +2383,11 @@ emit_branch(struct v3d_compile *c, assert(scoreboard->last_branch_tick + 3 < branch_tick); assert(scoreboard->last_unifa_write_tick + 3 < branch_tick); - /* Can't place a branch with msfign != 0 and cond != 0,2,3 after + /* V3D 4.x can't place a branch with msfign != 0 and cond != 0,2,3 after * setmsf. */ bool is_safe_msf_branch = + c->devinfo->ver >= 71 || inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE || inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS || inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 || @@ -1851,6 +2424,14 @@ emit_branch(struct v3d_compile *c, break; } + /* Do not move up a branch if it can disrupt an ldvary sequence + * as that can cause stomping of the r5 register. + */ + if (scoreboard->last_ldvary_tick + 2 >= + branch_tick - slots_filled) { + break; + } + /* Can't move a conditional branch before the instruction * that writes the flags for its condition. 
*/ @@ -1890,46 +2471,72 @@ emit_branch(struct v3d_compile *c, } static bool -alu_reads_register(struct v3d_qpu_instr *inst, +alu_reads_register(const struct v3d_device_info *devinfo, + struct v3d_qpu_instr *inst, bool add, bool magic, uint32_t index) { uint32_t num_src; - enum v3d_qpu_mux mux_a, mux_b; - - if (add) { + if (add) num_src = v3d_qpu_add_op_num_src(inst->alu.add.op); - mux_a = inst->alu.add.a; - mux_b = inst->alu.add.b; - } else { + else num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op); - mux_a = inst->alu.mul.a; - mux_b = inst->alu.mul.b; - } - for (int i = 0; i < num_src; i++) { - if (magic) { - if (i == 0 && mux_a == index) - return true; - if (i == 1 && mux_b == index) - return true; + if (devinfo->ver == 42) { + enum v3d_qpu_mux mux_a, mux_b; + if (add) { + mux_a = inst->alu.add.a.mux; + mux_b = inst->alu.add.b.mux; } else { - if (i == 0 && mux_a == V3D_QPU_MUX_A && - inst->raddr_a == index) { - return true; - } - if (i == 0 && mux_a == V3D_QPU_MUX_B && - inst->raddr_b == index) { - return true; - } - if (i == 1 && mux_b == V3D_QPU_MUX_A && - inst->raddr_a == index) { - return true; - } - if (i == 1 && mux_b == V3D_QPU_MUX_B && - inst->raddr_b == index) { - return true; + mux_a = inst->alu.mul.a.mux; + mux_b = inst->alu.mul.b.mux; + } + + for (int i = 0; i < num_src; i++) { + if (magic) { + if (i == 0 && mux_a == index) + return true; + if (i == 1 && mux_b == index) + return true; + } else { + if (i == 0 && mux_a == V3D_QPU_MUX_A && + inst->raddr_a == index) { + return true; + } + if (i == 0 && mux_a == V3D_QPU_MUX_B && + inst->raddr_b == index) { + return true; + } + if (i == 1 && mux_b == V3D_QPU_MUX_A && + inst->raddr_a == index) { + return true; + } + if (i == 1 && mux_b == V3D_QPU_MUX_B && + inst->raddr_b == index) { + return true; + } } } + + return false; + } + + assert(devinfo->ver >= 71); + assert(!magic); + + uint32_t raddr_a, raddr_b; + if (add) { + raddr_a = inst->alu.add.a.raddr; + raddr_b = inst->alu.add.b.raddr; + } else { + raddr_a = inst->alu.mul.a.raddr; + raddr_b = inst->alu.mul.b.raddr; + } + + for (int i = 0; i < num_src; i++) { + if (i == 0 && raddr_a == index) + return true; + if (i == 1 && raddr_b == index) + return true; } return false; @@ -1964,7 +2571,9 @@ fixup_pipelined_ldvary(struct v3d_compile *c, struct qblock *block, struct v3d_qpu_instr *inst) { - /* We only call this if we have successfuly merged an ldvary into a + const struct v3d_device_info *devinfo = c->devinfo; + + /* We only call this if we have successfully merged an ldvary into a * previous instruction. */ assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); @@ -1976,9 +2585,20 @@ fixup_pipelined_ldvary(struct v3d_compile *c, * the ldvary destination, if it does, then moving the ldvary before * it would overwrite it. */ - if (alu_reads_register(inst, true, ldvary_magic, ldvary_index)) + if (alu_reads_register(devinfo, inst, true, ldvary_magic, ldvary_index)) return false; - if (alu_reads_register(inst, false, ldvary_magic, ldvary_index)) + if (alu_reads_register(devinfo, inst, false, ldvary_magic, ldvary_index)) + return false; + + /* The implicit ldvary destination may not be written to by a signal + * in the instruction following ldvary. Since we are planning to move + * ldvary to the previous instruction, this means we need to check if + * the current instruction has any other signal that could create this + * conflict. The only other signal that can write to the implicit + * ldvary destination that is compatible with ldvary in the same + * instruction is ldunif. 
+ */ + if (inst->sig.ldunif) return false; /* The previous instruction can't write to the same destination as the @@ -2003,7 +2623,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c, } /* The previous instruction cannot have a conflicting signal */ - if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig)) + if (v3d_qpu_sig_writes_address(devinfo, &prev->qpu.sig)) + return false; + + uint32_t sig; + struct v3d_qpu_sig new_sig = prev->qpu.sig; + new_sig.ldvary = true; + if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig)) return false; /* The previous instruction cannot use flags since ldvary uses the @@ -2016,9 +2642,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c, /* We can't put an ldvary in the delay slots of a thrsw. We should've * prevented this when pairing up the ldvary with another instruction - * and flagging it for a fixup. + * and flagging it for a fixup. In V3D 7.x this is limited only to the + * second delay slot. */ - assert(scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1); + assert((devinfo->ver == 42 && + scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1) || + (devinfo->ver >= 71 && + scoreboard->last_thrsw_tick + 2 != scoreboard->tick - 1)); /* Move the ldvary to the previous instruction and remove it from the * current one. @@ -2032,14 +2662,25 @@ fixup_pipelined_ldvary(struct v3d_compile *c, inst->sig_magic = false; inst->sig_addr = 0; - /* By moving ldvary to the previous instruction we make it update - * r5 in the current one, so nothing else in it should write r5. + /* Update rf0 flops tracking for new ldvary delayed rf0 write tick */ + if (devinfo->ver >= 71) { + scoreboard->last_implicit_rf0_write_tick = scoreboard->tick; + set_has_rf0_flops_conflict(scoreboard, inst, devinfo); + } + + /* By moving ldvary to the previous instruction we make it update r5 + * (rf0 for ver >= 71) in the current one, so nothing else in it + * should write this register. + * * This should've been prevented by our depedency tracking, which * would not allow ldvary to be paired up with an instruction that - * writes r5 (since our dependency tracking doesn't know that the - * ldvary write r5 happens in the next instruction). + * writes r5/rf0 (since our dependency tracking doesn't know that the + * ldvary write to r5/rf0 happens in the next instruction). 
*/ - assert(!v3d_qpu_writes_r5(c->devinfo, inst)); + assert(!v3d_qpu_writes_r5(devinfo, inst)); + assert(devinfo->ver == 42 || + (!v3d_qpu_writes_rf0_implicitly(devinfo, inst) && + !v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0))); return true; } @@ -2102,6 +2743,9 @@ schedule_instructions(struct v3d_compile *c, merge->inst->uniform; } + chosen->inst->ldtmu_count += + merge->inst->ldtmu_count; + if (debug) { fprintf(stderr, "t=%4d: merging: ", time); @@ -2127,7 +2771,7 @@ schedule_instructions(struct v3d_compile *c, } } } - if (mux_read_stalls(scoreboard, inst)) + if (read_stalls(c->devinfo, scoreboard, inst)) c->qpu_inst_stalled_count++; } @@ -2351,6 +2995,8 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c) scoreboard.last_branch_tick = -10; scoreboard.last_setmsf_tick = -10; scoreboard.last_stallable_sfu_tick = -10; + scoreboard.first_ldtmu_after_thrsw = true; + scoreboard.last_implicit_rf0_write_tick = - 10; if (debug) { fprintf(stderr, "Pre-schedule instructions\n"); diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c index ec9ed66650c..538b247e3e0 100644 --- a/src/broadcom/compiler/qpu_validate.c +++ b/src/broadcom/compiler/qpu_validate.c @@ -41,6 +41,7 @@ struct v3d_qpu_validate_state { int last_sfu_write; int last_branch_ip; int last_thrsw_ip; + int first_tlb_z_write; /* Set when we've found the last-THRSW signal, or if we were started * in single-segment mode. @@ -110,11 +111,58 @@ static void qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) { const struct v3d_device_info *devinfo = state->c->devinfo; + + if (qinst->is_tlb_z_write && state->ip < state->first_tlb_z_write) + state->first_tlb_z_write = state->ip; + const struct v3d_qpu_instr *inst = &qinst->qpu; + if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH && + state->first_tlb_z_write >= 0 && + state->ip > state->first_tlb_z_write && + inst->branch.msfign != V3D_QPU_MSFIGN_NONE && + inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS && + inst->branch.cond != V3D_QPU_BRANCH_COND_A0 && + inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) { + fail_instr(state, "Implicit branch MSF read after TLB Z write"); + } + if (inst->type != V3D_QPU_INSTR_TYPE_ALU) return; + if (inst->alu.add.op == V3D_QPU_A_SETMSF && + state->first_tlb_z_write >= 0 && + state->ip > state->first_tlb_z_write) { + fail_instr(state, "SETMSF after TLB Z write"); + } + + if (state->first_tlb_z_write >= 0 && + state->ip > state->first_tlb_z_write && + inst->alu.add.op == V3D_QPU_A_MSF) { + fail_instr(state, "MSF read after TLB Z write"); + } + + if (devinfo->ver < 71) { + if (inst->sig.small_imm_a || inst->sig.small_imm_c || + inst->sig.small_imm_d) { + fail_instr(state, "small imm a/c/d added after V3D 7.1"); + } + } else { + if ((inst->sig.small_imm_a || inst->sig.small_imm_b) && + !vir_is_add(qinst)) { + fail_instr(state, "small imm a/b used but no ADD inst"); + } + if ((inst->sig.small_imm_c || inst->sig.small_imm_d) && + !vir_is_mul(qinst)) { + fail_instr(state, "small imm c/d used but no MUL inst"); + } + if (inst->sig.small_imm_a + inst->sig.small_imm_b + + inst->sig.small_imm_c + inst->sig.small_imm_d > 1) { + fail_instr(state, "only one small immediate can be " + "enabled per instruction"); + } + } + /* LDVARY writes r5 two instructions later and LDUNIF writes * r5 one instruction later, which is illegal to have * together. 
@@ -128,7 +176,7 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) * * FIXME: This would not check correctly for V3D 4.2 versions lower * than V3D 4.2.14, but that is not a real issue because the simulator - * will still catch this, and we are not really targetting any such + * will still catch this, and we are not really targeting any such * versions anyway. */ if (state->c->devinfo->ver < 42) { @@ -194,8 +242,14 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) "SFU write started during THRSW delay slots "); } - if (inst->sig.ldvary) - fail_instr(state, "LDVARY during THRSW delay slots"); + if (inst->sig.ldvary) { + if (devinfo->ver == 42) + fail_instr(state, "LDVARY during THRSW delay slots"); + if (devinfo->ver >= 71 && + state->ip - state->last_thrsw_ip == 2) { + fail_instr(state, "LDVARY in 2nd THRSW delay slot"); + } + } } (void)qpu_magic_waddr_matches; /* XXX */ @@ -222,7 +276,7 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) vpm_writes + tlb_writes + tsy_writes + - inst->sig.ldtmu + + (devinfo->ver == 42 ? inst->sig.ldtmu : 0) + inst->sig.ldtlb + inst->sig.ldvpm + inst->sig.ldtlbu > 1) { @@ -262,17 +316,48 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) inst->type == V3D_QPU_INSTR_TYPE_ALU) { if ((inst->alu.add.op != V3D_QPU_A_NOP && !inst->alu.add.magic_write)) { - fail_instr(state, "RF write after THREND"); + if (devinfo->ver == 42) { + fail_instr(state, "RF write after THREND"); + } else if (devinfo->ver >= 71) { + if (state->last_thrsw_ip - state->ip == 0) { + fail_instr(state, + "ADD RF write at THREND"); + } + if (inst->alu.add.waddr == 2 || + inst->alu.add.waddr == 3) { + fail_instr(state, + "RF2-3 write after THREND"); + } + } } if ((inst->alu.mul.op != V3D_QPU_M_NOP && !inst->alu.mul.magic_write)) { - fail_instr(state, "RF write after THREND"); + if (devinfo->ver == 42) { + fail_instr(state, "RF write after THREND"); + } else if (devinfo->ver >= 71) { + if (state->last_thrsw_ip - state->ip == 0) { + fail_instr(state, + "MUL RF write at THREND"); + } + + if (inst->alu.mul.waddr == 2 || + inst->alu.mul.waddr == 3) { + fail_instr(state, + "RF2-3 write after THREND"); + } + } } if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && !inst->sig_magic) { - fail_instr(state, "RF write after THREND"); + if (devinfo->ver == 42) { + fail_instr(state, "RF write after THREND"); + } else if (devinfo->ver >= 71 && + (inst->sig_addr == 2 || + inst->sig_addr == 3)) { + fail_instr(state, "RF2-3 write after THREND"); + } } /* GFXH-1625: No TMUWT in the last instruction */ @@ -312,7 +397,7 @@ qpu_validate(struct v3d_compile *c) * keep compiling the validation code to make sure it doesn't get * broken. 
*/ -#ifndef DEBUG +#if !MESA_DEBUG return; #endif @@ -321,6 +406,7 @@ qpu_validate(struct v3d_compile *c) .last_sfu_write = -10, .last_thrsw_ip = -10, .last_branch_ip = -10, + .first_tlb_z_write = INT_MAX, .ip = 0, .last_thrsw_found = !c->last_thrsw, diff --git a/src/broadcom/compiler/v3d33_tex.c b/src/broadcom/compiler/v3d33_tex.c deleted file mode 100644 index b933635f6fe..00000000000 --- a/src/broadcom/compiler/v3d33_tex.c +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright © 2016-2018 Broadcom - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#include "v3d_compiler.h" - -/* We don't do any address packing. */ -#define __gen_user_data void -#define __gen_address_type uint32_t -#define __gen_address_offset(reloc) (*reloc) -#define __gen_emit_reloc(cl, reloc) -#include "cle/v3d_packet_v33_pack.h" - -void -v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) -{ - /* FIXME: We don't bother implementing pipelining for texture reads - * for any pre 4.x hardware. It should be straight forward to do but - * we are not really testing or even targetting this hardware at - * present. 
- */ - ntq_flush_tmu(c); - - unsigned unit = instr->texture_index; - - struct V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1 p0_unpacked = { - V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1_header, - - .fetch_sample_mode = instr->op == nir_texop_txf, - }; - - struct V3D33_TEXTURE_UNIFORM_PARAMETER_1_CFG_MODE1 p1_unpacked = { - }; - - switch (instr->sampler_dim) { - case GLSL_SAMPLER_DIM_1D: - if (instr->is_array) - p0_unpacked.lookup_type = TEXTURE_1D_ARRAY; - else - p0_unpacked.lookup_type = TEXTURE_1D; - break; - case GLSL_SAMPLER_DIM_2D: - case GLSL_SAMPLER_DIM_RECT: - if (instr->is_array) - p0_unpacked.lookup_type = TEXTURE_2D_ARRAY; - else - p0_unpacked.lookup_type = TEXTURE_2D; - break; - case GLSL_SAMPLER_DIM_3D: - p0_unpacked.lookup_type = TEXTURE_3D; - break; - case GLSL_SAMPLER_DIM_CUBE: - p0_unpacked.lookup_type = TEXTURE_CUBE_MAP; - break; - default: - unreachable("Bad sampler type"); - } - - struct qreg coords[5]; - int next_coord = 0; - for (unsigned i = 0; i < instr->num_srcs; i++) { - switch (instr->src[i].src_type) { - case nir_tex_src_coord: - for (int j = 0; j < instr->coord_components; j++) { - coords[next_coord++] = - ntq_get_src(c, instr->src[i].src, j); - } - if (instr->coord_components < 2) - coords[next_coord++] = vir_uniform_f(c, 0.5); - break; - case nir_tex_src_bias: - coords[next_coord++] = - ntq_get_src(c, instr->src[i].src, 0); - - p0_unpacked.bias_supplied = true; - break; - case nir_tex_src_lod: - coords[next_coord++] = - vir_FADD(c, - ntq_get_src(c, instr->src[i].src, 0), - vir_uniform(c, QUNIFORM_TEXTURE_FIRST_LEVEL, - unit)); - - if (instr->op != nir_texop_txf && - instr->op != nir_texop_tg4) { - p0_unpacked.disable_autolod_use_bias_only = true; - } - break; - case nir_tex_src_comparator: - coords[next_coord++] = - ntq_get_src(c, instr->src[i].src, 0); - - p0_unpacked.shadow = true; - break; - - case nir_tex_src_offset: { - p0_unpacked.texel_offset_for_s_coordinate = - nir_src_comp_as_int(instr->src[i].src, 0); - - if (instr->coord_components >= 2) - p0_unpacked.texel_offset_for_t_coordinate = - nir_src_comp_as_int(instr->src[i].src, 1); - - if (instr->coord_components >= 3) - p0_unpacked.texel_offset_for_r_coordinate = - nir_src_comp_as_int(instr->src[i].src, 2); - break; - } - - default: - unreachable("unknown texture source"); - } - } - - /* Limit the number of channels returned to both how many the NIR - * instruction writes and how many the instruction could produce. - */ - p1_unpacked.return_words_of_texture_data = - instr->dest.is_ssa ? - nir_ssa_def_components_read(&instr->dest.ssa) : - (1 << instr->dest.reg.reg->num_components) - 1; - - uint32_t p0_packed; - V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1_pack(NULL, - (uint8_t *)&p0_packed, - &p0_unpacked); - - uint32_t p1_packed; - V3D33_TEXTURE_UNIFORM_PARAMETER_1_CFG_MODE1_pack(NULL, - (uint8_t *)&p1_packed, - &p1_unpacked); - /* Load unit number into the address field, which will be be used by - * the driver to decide which texture to put in the actual address - * field. - */ - p1_packed |= unit << 5; - - /* There is no native support for GL texture rectangle coordinates, so - * we have to rescale from ([0, width], [0, height]) to ([0, 1], [0, - * 1]). 
- */ - if (instr->sampler_dim == GLSL_SAMPLER_DIM_RECT) { - coords[0] = vir_FMUL(c, coords[0], - vir_uniform(c, QUNIFORM_TEXRECT_SCALE_X, - unit)); - coords[1] = vir_FMUL(c, coords[1], - vir_uniform(c, QUNIFORM_TEXRECT_SCALE_Y, - unit)); - } - - int texture_u[] = { - vir_get_uniform_index(c, QUNIFORM_TEXTURE_CONFIG_P0_0 + unit, p0_packed), - vir_get_uniform_index(c, QUNIFORM_TEXTURE_CONFIG_P1, p1_packed), - }; - - for (int i = 0; i < next_coord; i++) { - struct qreg dst; - - if (i == next_coord - 1) - dst = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUL); - else - dst = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMU); - - struct qinst *tmu = vir_MOV_dest(c, dst, coords[i]); - - if (i < 2) - tmu->uniform = texture_u[i]; - } - - vir_emit_thrsw(c); - - for (int i = 0; i < 4; i++) { - if (p1_unpacked.return_words_of_texture_data & (1 << i)) - ntq_store_dest(c, &instr->dest, i, vir_LDTMU(c)); - } -} diff --git a/src/broadcom/compiler/v3d33_vpm_setup.c b/src/broadcom/compiler/v3d33_vpm_setup.c deleted file mode 100644 index 8bce67dfae9..00000000000 --- a/src/broadcom/compiler/v3d33_vpm_setup.c +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright © 2016-2018 Broadcom - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#include "v3d_compiler.h" - -/* We don't do any address packing. */ -#define __gen_user_data void -#define __gen_address_type uint32_t -#define __gen_address_offset(reloc) (*reloc) -#define __gen_emit_reloc(cl, reloc) -#include "broadcom/cle/v3d_packet_v33_pack.h" - -void -v3d33_vir_vpm_read_setup(struct v3d_compile *c, int num_components) -{ - struct V3D33_VPM_GENERIC_BLOCK_READ_SETUP unpacked = { - V3D33_VPM_GENERIC_BLOCK_READ_SETUP_header, - - .horiz = true, - .laned = false, - /* If the field is 0, that means a read count of 32. 
 */
- .num = num_components & 31,
- .segs = true,
- .stride = 1,
- .size = VPM_SETUP_SIZE_32_BIT,
- .addr = c->num_inputs,
- };
-
- uint32_t packed;
- V3D33_VPM_GENERIC_BLOCK_READ_SETUP_pack(NULL,
- (uint8_t *)&packed,
- &unpacked);
- vir_VPMSETUP(c, vir_uniform_ui(c, packed));
-}
-
-void
-v3d33_vir_vpm_write_setup(struct v3d_compile *c)
-{
- uint32_t packed;
- struct V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP unpacked = {
- V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP_header,
-
- .horiz = true,
- .laned = false,
- .segs = true,
- .stride = 1,
- .size = VPM_SETUP_SIZE_32_BIT,
- .addr = 0,
- };
-
- V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP_pack(NULL,
- (uint8_t *)&packed,
- &unpacked);
- vir_VPMSETUP(c, vir_uniform_ui(c, packed));
-}
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index 0c1419661d3..12aaacdc14a 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -31,6 +31,7 @@
 #include <stdint.h>
 #include <string.h>
+#include "util/blend.h"
 #include "util/macros.h"
 #include "common/v3d_debug.h"
 #include "common/v3d_device_info.h"
@@ -40,7 +41,6 @@
 #include "util/u_math.h"
 #include "qpu/qpu_instr.h"
-#include "pipe/p_state.h"
 /**
 * Maximum number of outstanding TMU operations we can queue for execution.
@@ -87,7 +87,7 @@ enum qfile {
 /** A physical register, such as the W coordinate payload. */
 QFILE_REG,
- /** One of the regsiters for fixed function interactions. */
+ /** One of the registers for fixed function interactions. */
 QFILE_MAGIC,
 /**
@@ -97,12 +97,6 @@ enum qfile {
 QFILE_TEMP,
 /**
- * VPM reads use this with an index value to say what part of the VPM
- * is being read.
- */
- QFILE_VPM,
-
- /**
 * Stores an immediate value in the index field that will be used
 * directly by qpu_load_imm().
 */
@@ -169,6 +163,19 @@ struct qinst {
 * otherwise.
 */
 int uniform;
+
+ /* If this is a TLB Z write */
+ bool is_tlb_z_write;
+
+ /* If this is a retiring TMU instruction (the last in a lookup sequence),
+ * how many ldtmu instructions are required to read the results.
+ */
+ uint32_t ldtmu_count;
+
+ /* Position of this instruction in the program. Filled in during
+ * register allocation.
+ */
+ int32_t ip;
 };
 enum quniform_contents {
@@ -330,6 +337,19 @@ enum quniform_contents {
 * Current value of gl_ViewIndex for Multiview rendering.
 */
 QUNIFORM_VIEW_INDEX,
+
+ /**
+ * Inline uniform buffers
+ */
+ QUNIFORM_INLINE_UBO_0,
+ QUNIFORM_INLINE_UBO_1,
+ QUNIFORM_INLINE_UBO_2,
+ QUNIFORM_INLINE_UBO_3,
+
+ /**
+ * Current value of DrawIndex for Multidraw
+ */
+ QUNIFORM_DRAW_ID,
 };
 static inline uint32_t v3d_unit_data_create(uint32_t unit, uint32_t value)
@@ -369,13 +389,7 @@ static inline uint8_t v3d_slot_get_component(struct v3d_varying_slot slot)
 return slot.slot_and_component & 3;
 }
-enum v3d_execution_environment {
- V3D_ENVIRONMENT_OPENGL = 0,
- V3D_ENVIRONMENT_VULKAN,
-};
-
 struct v3d_key {
- void *shader_state;
 struct {
 uint8_t swizzle[4];
 } tex[V3D_MAX_TEXTURE_SAMPLERS];
@@ -388,9 +402,9 @@ struct v3d_key {
 uint8_t num_samplers_used;
 uint8_t ucp_enables;
 bool is_last_geometry_stage;
- bool robust_buffer_access;
-
- enum v3d_execution_environment environment;
+ bool robust_uniform_access;
+ bool robust_storage_access;
+ bool robust_image_access;
 };
 struct v3d_fs_key {
@@ -400,7 +414,6 @@ struct v3d_fs_key {
 bool line_smoothing;
 bool point_coord_upper_left;
 bool msaa;
- bool sample_coverage;
 bool sample_alpha_to_coverage;
 bool sample_alpha_to_one;
 /* Mask of which color render targets are present.
*/ @@ -419,14 +432,12 @@ struct v3d_fs_key { */ struct { enum pipe_format format; - const uint8_t *swizzle; + uint8_t swizzle[4]; } color_fmt[V3D_MAX_DRAW_BUFFERS]; - uint8_t logicop_func; + enum pipe_logicop logicop_func; uint32_t point_sprite_mask; - struct pipe_rt_blend_state blend; - /* If the fragment shader reads gl_PrimitiveID then we have 2 scenarios: * * - If there is a geometry shader, then gl_PrimitiveID must be written @@ -468,7 +479,7 @@ struct v3d_vs_key { bool clamp_color; }; -/** A basic block of VIR intructions. */ +/** A basic block of VIR instructions. */ struct qblock { struct list_head link; @@ -566,6 +577,7 @@ enum v3d_compilation_result { */ struct v3d_compiler { const struct v3d_device_info *devinfo; + uint32_t max_inline_uniform_buffers; struct ra_regs *regs; struct ra_class *reg_class_any[3]; struct ra_class *reg_class_r5[3]; @@ -584,6 +596,19 @@ struct v3d_interp_input { unsigned mode; /* interpolation mode */ }; +struct v3d_ra_node_info { + struct { + uint32_t priority; + uint8_t class_bits; + bool is_program_end; + bool unused; + + /* V3D 7.x */ + bool is_ldunif_dst; + } *info; + uint32_t alloc_count; +}; + struct v3d_compile { const struct v3d_device_info *devinfo; nir_shader *s; @@ -596,7 +621,7 @@ struct v3d_compile { void *debug_output_data; /** - * Mapping from nir_register * or nir_ssa_def * to array of struct + * Mapping from nir_register * or nir_def * to array of struct * qreg for the values. */ struct hash_table *def_ht; @@ -615,11 +640,12 @@ struct v3d_compile { uint32_t output_fifo_size; struct { - nir_dest *dest; + nir_def *def; uint8_t num_components; uint8_t component_mask; } flush[MAX_TMU_QUEUE_SIZE]; uint32_t flush_count; + uint32_t total_count; } tmu; /** @@ -652,16 +678,13 @@ struct v3d_compile { bool uses_center_w; bool writes_z; + bool writes_z_from_fep; + bool reads_z; bool uses_implicit_point_line_varyings; /* True if a fragment shader reads gl_PrimitiveID */ bool fs_uses_primitive_id; - /* If the fragment shader does anything that requires to force - * per-sample MSAA, such as reading gl_SampleID. - */ - bool force_per_sample_msaa; - /* Whether we are using the fallback scheduler. This will be set after * register allocation has failed once. */ @@ -681,6 +704,11 @@ struct v3d_compile { bool disable_constant_ubo_load_sorting; bool sorted_any_ubo_loads; + /* Moves UBO/SSBO loads right before their first user (nir_opt_move). + * This can reduce register pressure. + */ + bool move_buffer_loads; + /* Emits ldunif for each new uniform, even if the uniform was already * emitted in the same block. Useful to compile shaders with high * register pressure or to disable the optimization during uniform @@ -692,6 +720,19 @@ struct v3d_compile { bool disable_loop_unrolling; bool unrolled_any_loops; + /* Disables nir_opt_gcm to reduce register pressure. */ + bool disable_gcm; + + /* If calling nir_opt_gcm made any progress. Used to skip new rebuilds + * if possible + */ + bool gcm_progress; + + /* Disables scheduling of general TMU loads (and unfiltered image load). + */ + bool disable_general_tmu_sched; + bool has_general_tmu_load; + /* Minimum number of threads we are willing to use to register allocate * a shader with the current compilation strategy. This only prevents * us from lowering the thread count to register allocate successfully, @@ -705,7 +746,9 @@ struct v3d_compile { * strategies that can reduce register pressure and hopefully reduce or * eliminate TMU spills in the shader. 
*/ - bool tmu_spilling_allowed; + uint32_t max_tmu_spills; + + uint32_t compile_strategy_idx; /* The UBO index and block used with the last unifa load, as well as the * current unifa offset *after* emitting that load. This is used to skip @@ -715,6 +758,7 @@ struct v3d_compile { struct qblock *current_unifa_block; int32_t current_unifa_index; uint32_t current_unifa_offset; + bool current_unifa_is_ubo; /* State for whether we're executing on each channel currently. 0 if * yes, otherwise a block number + 1 that the channel jumped to. @@ -749,6 +793,11 @@ struct v3d_compile { struct qreg cs_shared_offset; int local_invocation_index_bits; + /* Starting value of the sample mask in a fragment shader. We use + * this to identify lanes that have been terminated/discarded. + */ + struct qreg start_msf; + /* If the shader uses subgroup functionality */ bool has_subgroups; @@ -761,14 +810,27 @@ struct v3d_compile { uint32_t spill_size; /* Shader-db stats */ uint32_t spills, fills, loops; + + /* Whether we are in the process of spilling registers for + * register allocation + */ + bool spilling; + /** * Register spilling's per-thread base address, shared between each - * spill/fill's addressing calculations. + * spill/fill's addressing calculations (also used for scratch + * access). */ struct qreg spill_base; + /* Bit vector of which temps may be spilled */ BITSET_WORD *spillable; + /* Used during register allocation */ + int thread_index; + struct v3d_ra_node_info nodes; + struct ra_graph *g; + /** * Array of the VARYING_SLOT_* of all FS QFILE_VARY reads. * @@ -799,11 +861,16 @@ struct v3d_compile { uint32_t uniform_array_size; uint32_t num_uniforms; uint32_t output_position_index; - nir_variable *output_color_var[4]; + nir_variable *output_color_var[V3D_MAX_DRAW_BUFFERS]; uint32_t output_sample_mask_index; struct qreg undef; uint32_t num_temps; + /* Number of temps in the program right before we spill a new temp. We + * use this to know which temps existed before a spill and which were + * added with the spill itself. + */ + uint32_t spill_start_num_temps; struct vir_cursor cursor; struct list_head blocks; @@ -848,12 +915,16 @@ struct v3d_compile { bool emitted_tlb_load; bool lock_scoreboard_on_first_thrsw; - /* Total number of spilled registers in the program */ - uint32_t spill_count; - enum v3d_compilation_result compilation_result; bool tmu_dirty_rcl; + bool has_global_address; + + /* If we have processed a discard/terminate instruction. This may + * cause some lanes to be inactive even during uniform control + * flow. + */ + bool emitted_discard; }; struct v3d_uniform_list { @@ -866,6 +937,13 @@ struct v3d_prog_data { struct v3d_uniform_list uniforms; uint32_t spill_size; + uint32_t tmu_spills; + uint32_t tmu_fills; + uint32_t tmu_count; + + uint32_t qpu_read_stalls; + + uint8_t compile_strategy_idx; uint8_t threads; @@ -877,6 +955,8 @@ struct v3d_prog_data { bool tmu_dirty_rcl; bool has_control_barrier; + + bool has_global_address; }; struct v3d_vs_prog_data { @@ -964,10 +1044,15 @@ struct v3d_fs_prog_data { uint8_t num_inputs; bool writes_z; + bool writes_z_from_fep; bool disable_ez; bool uses_center_w; bool uses_implicit_point_line_varyings; bool lock_scoreboard_on_first_thrsw; + + /* If the fragment shader does anything that requires to force + * per-sample MSAA, such as reading gl_SampleID. 
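
One plausible way a driver could derive this flag from the NIR shader info, shown only as a hedged sketch (not code from this patch; here s is the nir_shader and prog_data the v3d_fs_prog_data being filled in):

    if (BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_SAMPLE_ID) ||
        BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS) ||
        s->info.fs.uses_sample_qualifier)
            prog_data->force_per_sample_msaa = true;
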
+ */ bool force_per_sample_msaa; }; @@ -998,6 +1083,10 @@ v3d_compute_vpm_config(struct v3d_device_info *devinfo, struct v3d_gs_prog_data *gs, struct vpm_config *vpm_cfg_bin, struct vpm_config *vpm_cfg); +void +v3d_pack_unnormalized_coordinates(struct v3d_device_info *devinfo, + uint32_t *p1_packed, + bool unnormalized_coordinates); static inline bool vir_has_uniform(struct qinst *inst) @@ -1005,7 +1094,8 @@ vir_has_uniform(struct qinst *inst) return inst->uniform != ~0; } -const struct v3d_compiler *v3d_compiler_init(const struct v3d_device_info *devinfo); +const struct v3d_compiler *v3d_compiler_init(const struct v3d_device_info *devinfo, + uint32_t max_inline_uniform_buffers); void v3d_compiler_free(const struct v3d_compiler *compiler); void v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s); @@ -1066,15 +1156,14 @@ bool vir_is_raw_mov(struct qinst *inst); bool vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst); bool vir_is_add(struct qinst *inst); bool vir_is_mul(struct qinst *inst); -bool vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst); -bool vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst); +bool vir_writes_r4_implicitly(const struct v3d_device_info *devinfo, struct qinst *inst); struct qreg vir_follow_movs(struct v3d_compile *c, struct qreg reg); uint8_t vir_channels_written(struct qinst *inst); struct qreg ntq_get_src(struct v3d_compile *c, nir_src src, int i); -void ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan, - struct qreg result); +void ntq_store_def(struct v3d_compile *c, nir_def *def, int chan, + struct qreg result); bool ntq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components); -void ntq_add_pending_tmu_flush(struct v3d_compile *c, nir_dest *dest, +void ntq_add_pending_tmu_flush(struct v3d_compile *c, nir_def *def, uint32_t component_mask); void ntq_flush_tmu(struct v3d_compile *c); void vir_emit_thrsw(struct v3d_compile *c); @@ -1095,32 +1184,27 @@ bool vir_opt_redundant_flags(struct v3d_compile *c); bool vir_opt_small_immediates(struct v3d_compile *c); bool vir_opt_vpm(struct v3d_compile *c); bool vir_opt_constant_alu(struct v3d_compile *c); -void v3d_nir_lower_blend(nir_shader *s, struct v3d_compile *c); -void v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c); -void v3d_nir_lower_line_smooth(nir_shader *shader); -void v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c); -void v3d_nir_lower_robust_buffer_access(nir_shader *shader, struct v3d_compile *c); -void v3d_nir_lower_scratch(nir_shader *s); -void v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c); -void v3d_nir_lower_image_load_store(nir_shader *s); -void vir_lower_uniforms(struct v3d_compile *c); - -void v3d33_vir_vpm_read_setup(struct v3d_compile *c, int num_components); -void v3d33_vir_vpm_write_setup(struct v3d_compile *c); -void v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr); -void v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr); -void v3d40_vir_emit_image_load_store(struct v3d_compile *c, - nir_intrinsic_instr *instr); +bool v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c); +bool v3d_nir_lower_line_smooth(nir_shader *shader); +bool v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c); +bool v3d_nir_lower_scratch(nir_shader *s); +bool v3d_nir_lower_txf_ms(nir_shader *s); +bool v3d_nir_lower_image_load_store(nir_shader *s, struct v3d_compile *c); +bool v3d_nir_lower_load_store_bitsize(nir_shader *s); + +void v3d_vir_emit_tex(struct 
v3d_compile *c, nir_tex_instr *instr); +void v3d_vir_emit_image_load_store(struct v3d_compile *c, + nir_intrinsic_instr *instr); void v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers); uint32_t v3d_qpu_schedule_instructions(struct v3d_compile *c); void qpu_validate(struct v3d_compile *c); -struct qpu_reg *v3d_register_allocate(struct v3d_compile *c, bool *spilled); +struct qpu_reg *v3d_register_allocate(struct v3d_compile *c); bool vir_init_reg_sets(struct v3d_compiler *compiler); int v3d_shaderdb_dump(struct v3d_compile *c, char **shaderdb_str); -bool v3d_gl_format_is_return_32(GLenum format); +bool v3d_gl_format_is_return_32(enum pipe_format format); uint32_t v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src); @@ -1220,28 +1304,35 @@ vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b) \ #define VIR_SFU(name) \ static inline struct qreg \ vir_##name(struct v3d_compile *c, struct qreg a) \ -{ \ - if (c->devinfo->ver >= 41) { \ - return vir_emit_def(c, vir_add_inst(V3D_QPU_A_##name, \ - c->undef, \ - a, c->undef)); \ - } else { \ - vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_##name), a); \ - return vir_FMOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); \ - } \ +{ \ + return vir_emit_def(c, vir_add_inst(V3D_QPU_A_##name, \ + c->undef, \ + a, c->undef)); \ } \ static inline struct qinst * \ vir_##name##_dest(struct v3d_compile *c, struct qreg dest, \ struct qreg a) \ { \ - if (c->devinfo->ver >= 41) { \ - return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_##name, \ - dest, \ - a, c->undef)); \ - } else { \ - vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_##name), a); \ - return vir_FMOV_dest(c, dest, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); \ - } \ + return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_##name, \ + dest, \ + a, c->undef)); \ +} + +#define VIR_SFU2(name) \ +static inline struct qreg \ +vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b) \ +{ \ + return vir_emit_def(c, vir_add_inst(V3D_QPU_A_##name, \ + c->undef, \ + a, b)); \ +} \ +static inline struct qinst * \ +vir_##name##_dest(struct v3d_compile *c, struct qreg dest, \ + struct qreg a, struct qreg b) \ +{ \ + return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_##name, \ + dest, \ + a, b)); \ } #define VIR_A_ALU2(name) VIR_ALU2(name, vir_add_inst, V3D_QPU_A_##name) @@ -1343,6 +1434,28 @@ VIR_SFU(LOG) VIR_SFU(SIN) VIR_SFU(RSQRT2) +VIR_SFU(BALLOT) +VIR_SFU(BCASTF) +VIR_SFU(ALLEQ) +VIR_SFU(ALLFEQ) +VIR_SFU2(ROTQ) +VIR_SFU2(ROT) +VIR_SFU2(SHUFFLE) + +VIR_A_ALU2(VPACK) +VIR_A_ALU2(V8PACK) +VIR_A_ALU2(V10PACK) +VIR_A_ALU2(V11FPACK) + +VIR_M_ALU1(FTOUNORM16) +VIR_M_ALU1(FTOSNORM16) + +VIR_M_ALU1(VFTOUNORM8) +VIR_M_ALU1(VFTOSNORM8) + +VIR_M_ALU1(VFTOUNORM10LO) +VIR_M_ALU1(VFTOUNORM10HI) + static inline struct qinst * vir_MOV_cond(struct v3d_compile *c, enum v3d_qpu_cond cond, struct qreg dest, struct qreg src) @@ -1372,16 +1485,11 @@ vir_NOP(struct v3d_compile *c) static inline struct qreg vir_LDTMU(struct v3d_compile *c) { - if (c->devinfo->ver >= 41) { - struct qinst *ldtmu = vir_add_inst(V3D_QPU_A_NOP, c->undef, - c->undef, c->undef); - ldtmu->qpu.sig.ldtmu = true; - - return vir_emit_def(c, ldtmu); - } else { - vir_NOP(c)->qpu.sig.ldtmu = true; - return vir_MOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); - } + struct qinst *ldtmu = vir_add_inst(V3D_QPU_A_NOP, c->undef, + c->undef, c->undef); + ldtmu->qpu.sig.ldtmu = true; + + return vir_emit_def(c, ldtmu); } static inline struct qreg @@ -1394,7 +1502,6 @@ vir_UMUL(struct v3d_compile *c, struct qreg src0, struct qreg 
src1) static inline struct qreg vir_TLBU_COLOR_READ(struct v3d_compile *c, uint32_t config) { - assert(c->devinfo->ver >= 41); /* XXX */ assert((config & 0xffffff00) == 0xffffff00); struct qinst *ldtlb = vir_add_inst(V3D_QPU_A_NOP, c->undef, @@ -1407,38 +1514,12 @@ vir_TLBU_COLOR_READ(struct v3d_compile *c, uint32_t config) static inline struct qreg vir_TLB_COLOR_READ(struct v3d_compile *c) { - assert(c->devinfo->ver >= 41); /* XXX */ - struct qinst *ldtlb = vir_add_inst(V3D_QPU_A_NOP, c->undef, c->undef, c->undef); ldtlb->qpu.sig.ldtlb = true; return vir_emit_def(c, ldtlb); } -/* -static inline struct qreg -vir_LOAD_IMM(struct v3d_compile *c, uint32_t val) -{ - return vir_emit_def(c, vir_inst(QOP_LOAD_IMM, c->undef, - vir_reg(QFILE_LOAD_IMM, val), c->undef)); -} - -static inline struct qreg -vir_LOAD_IMM_U2(struct v3d_compile *c, uint32_t val) -{ - return vir_emit_def(c, vir_inst(QOP_LOAD_IMM_U2, c->undef, - vir_reg(QFILE_LOAD_IMM, val), - c->undef)); -} -static inline struct qreg -vir_LOAD_IMM_I2(struct v3d_compile *c, uint32_t val) -{ - return vir_emit_def(c, vir_inst(QOP_LOAD_IMM_I2, c->undef, - vir_reg(QFILE_LOAD_IMM, val), - c->undef)); -} -*/ - static inline struct qinst * vir_BRANCH(struct v3d_compile *c, enum v3d_qpu_branch_cond cond) { diff --git a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c index 2706432d5ef..9a651bfc6a7 100644 --- a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c +++ b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c @@ -40,9 +40,20 @@ * calculations and load/store using the TMU general memory access path. */ +static const unsigned bits_8[4] = {8, 8, 8, 8}; +static const unsigned bits_16[4] = {16, 16, 16, 16}; +static const unsigned bits_1010102[4] = {10, 10, 10, 2}; + bool v3d_gl_format_is_return_32(enum pipe_format format) { + /* We can get a NONE format in Vulkan because we support the + * shaderStorageImageReadWithoutFormat feature. We consider these to + * always use 32-bit precision. + */ + if (format == PIPE_FORMAT_NONE) + return true; + const struct util_format_description *desc = util_format_description(format); const struct util_format_channel_description *chan = &desc->channel[0]; @@ -52,15 +63,17 @@ v3d_gl_format_is_return_32(enum pipe_format format) /* Packs a 32-bit vector of colors in the range [0, (1 << bits[i]) - 1] to a * 32-bit SSA value, with as many channels as necessary to store all the bits + * + * This is the generic helper, using all common nir operations. */ -static nir_ssa_def * -pack_bits(nir_builder *b, nir_ssa_def *color, const unsigned *bits, +static nir_def * +pack_bits(nir_builder *b, nir_def *color, const unsigned *bits, int num_components, bool mask) { - nir_ssa_def *results[4]; + nir_def *results[4]; int offset = 0; for (int i = 0; i < num_components; i++) { - nir_ssa_def *chan = nir_channel(b, color, i); + nir_def *chan = nir_channel(b, color, i); /* Channels being stored shouldn't cross a 32-bit boundary. 
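
A concrete walk-through of this packing loop, with illustrative numbers that are not taken from the patch:

    bits = {16, 16, 16, 16}, num_components = 4

    channel 0: offset  0  -> results[0], bits  0..15
    channel 1: offset 16  -> results[0], bits 16..31
    channel 2: offset 32  -> results[1], bits  0..15
    channel 3: offset 48  -> results[1], bits 16..31

    No channel straddles a 32-bit word, which is what the assert just
    below checks, and DIV_ROUND_UP(64, 32) = 2 result components are
    returned.
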
*/ assert((offset & ~31) == ((offset + bits[i] - 1) & ~31)); @@ -84,10 +97,187 @@ pack_bits(nir_builder *b, nir_ssa_def *color, const unsigned *bits, return nir_vec(b, results, DIV_ROUND_UP(offset, 32)); } -static void -v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr) +/* Utility wrapper as half_2x16_split is mapped to vfpack, and sometimes it is + * just easier to read vfpack on the code, specially while using the PRM as + * reference + */ +static inline nir_def * +nir_vfpack(nir_builder *b, nir_def *p1, nir_def *p2) +{ + return nir_pack_half_2x16_split(b, p1, p2); +} + +static inline nir_def * +pack_11f11f10f(nir_builder *b, nir_def *color) +{ + nir_def *p1 = nir_vfpack(b, nir_channel(b, color, 0), + nir_channel(b, color, 1)); + nir_def *undef = nir_undef(b, 1, color->bit_size); + nir_def *p2 = nir_vfpack(b, nir_channel(b, color, 2), undef); + + return nir_pack_32_to_r11g11b10_v3d(b, p1, p2); +} + +static inline nir_def * +pack_r10g10b10a2_uint(nir_builder *b, nir_def *color) +{ + nir_def *p1 = nir_pack_2x32_to_2x16_v3d(b, nir_channel(b, color, 0), + nir_channel(b, color, 1)); + nir_def *p2 = nir_pack_2x32_to_2x16_v3d(b, nir_channel(b, color, 2), + nir_channel(b, color, 3)); + + return nir_pack_uint_32_to_r10g10b10a2_v3d(b, p1, p2); +} + +static inline nir_def * +pack_r10g10b10a2_unorm(nir_builder *b, nir_def *color) +{ + nir_def *p1 = nir_vfpack(b, nir_channel(b, color, 0), + nir_channel(b, color, 1)); + p1 = nir_pack_2x16_to_unorm_2x10_v3d(b, p1); + + nir_def *p2 = nir_vfpack(b, nir_channel(b, color, 2), + nir_channel(b, color, 3)); + p2 = nir_pack_2x16_to_unorm_10_2_v3d(b, p2); + + return nir_pack_uint_32_to_r10g10b10a2_v3d(b, p1, p2); +} + +enum hw_conversion { + NONE, + TO_SNORM, + TO_UNORM +}; + +static inline nir_def * +pack_8bit(nir_builder *b, nir_def *color, + unsigned num_components, + enum hw_conversion conversion) +{ + /* Note that usually you should not use this method (that relies on + * custom packing) for 1 component if we are not doing any + * conversion. But we support also that case, and let the caller + * decide which method to use. + */ + nir_def *p1; + nir_def *p2; + + if (conversion == NONE) { + p1 = nir_pack_2x32_to_2x16_v3d(b, nir_channel(b, color, 0), + nir_channel(b, color, num_components == 1 ? 0 : 1)); + } else { + p1 = nir_vfpack(b, nir_channel(b, color, 0), + nir_channel(b, color, num_components == 1 ? 0 : 1)); + p1 = (conversion == TO_UNORM) ? + nir_pack_2x16_to_unorm_2x8_v3d(b, p1) : + nir_pack_2x16_to_snorm_2x8_v3d(b, p1); + } + if (num_components == 4) { + if (conversion == NONE) { + p2 = nir_pack_2x32_to_2x16_v3d(b, nir_channel(b, color, 2), + nir_channel(b, color, 3)); + } else { + p2 = nir_vfpack(b, nir_channel(b, color, 2), + nir_channel(b, color, 3)); + p2 = (conversion == TO_UNORM) ? + nir_pack_2x16_to_unorm_2x8_v3d(b, p2) : + nir_pack_2x16_to_snorm_2x8_v3d(b, p2); + } + } else { + /* Using an undef here would be more correct. But for this + * case we are getting worse shader-db values with some CTS + * tests, so we just reuse the first packing. 
+ */ + p2 = p1; + } + + return nir_pack_4x16_to_4x8_v3d(b, p1, p2); +} + +static inline nir_def * +pack_16bit(nir_builder *b, nir_def *color, + unsigned num_components, + enum hw_conversion conversion) +{ + nir_def *results[2] = {0}; + nir_def *channels[4] = {0}; + + for (unsigned i = 0; i < num_components; i++) { + channels[i] = nir_channel(b, color, i); + switch (conversion) { + case TO_SNORM: + channels[i] = nir_f2snorm_16_v3d(b, channels[i]); + break; + case TO_UNORM: + channels[i] = nir_f2unorm_16_v3d(b, channels[i]); + break; + default: + /* Note that usually you should not use this method + * (that relies on custom packing) if we are not doing + * any conversion. But we support also that case, and + * let the caller decide which method to use. + */ + break; + } + } + + switch (num_components) { + case 1: + results[0] = channels[0]; + break; + case 4: + results[1] = nir_pack_2x32_to_2x16_v3d(b, channels[2], channels[3]); + FALLTHROUGH; + case 2: + results[0] = nir_pack_2x32_to_2x16_v3d(b, channels[0], channels[1]); + break; + default: + unreachable("Invalid number of components"); + } + + return nir_vec(b, results, DIV_ROUND_UP(num_components, 2)); +} + +static inline nir_def * +pack_xbit(nir_builder *b, nir_def *color, + unsigned num_components, + const struct util_format_channel_description *r_chan) +{ + bool pack_mask = (r_chan->type == UTIL_FORMAT_TYPE_SIGNED); + enum hw_conversion conversion = NONE; + if (r_chan->normalized) { + conversion = + (r_chan->type == UTIL_FORMAT_TYPE_UNSIGNED) ? TO_UNORM : TO_SNORM; + } + + switch (r_chan->size) { + case 8: + if (conversion == NONE && num_components < 2) + return pack_bits(b, color, bits_8, num_components, pack_mask); + else + return pack_8bit(b, color, num_components, conversion); + break; + case 16: + /* pack_mask implies that the generic packing method would + * need to include extra operations to handle negative values, + * so in that case, even without a conversion, it is better to + * use the packing using custom hw operations. 
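
Summarizing the dispatch in pack_xbit(), derived directly from the two switch cases around this point:

    size  8, conversion NONE, single component   -> pack_bits()  (generic)
    size  8, anything else                       -> pack_8bit()  (custom hw ops)
    size 16, conversion NONE, pack_mask == false -> pack_bits()  (generic)
    size 16, anything else                       -> pack_16bit() (custom hw ops)
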
+ */ + if (conversion == NONE && !pack_mask) + return pack_bits(b, color, bits_16, num_components, pack_mask); + else + return pack_16bit(b, color, num_components, conversion); + break; + default: + unreachable("unrecognized bits"); + } +} + +static bool +v3d_nir_lower_image_store_v42(nir_builder *b, nir_intrinsic_instr *instr) { enum pipe_format format = nir_intrinsic_format(instr); + assert(format != PIPE_FORMAT_NONE); const struct util_format_description *desc = util_format_description(format); const struct util_format_channel_description *r_chan = &desc->channel[0]; @@ -95,10 +285,10 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr) b->cursor = nir_before_instr(&instr->instr); - nir_ssa_def *color = nir_channels(b, - nir_ssa_for_src(b, instr->src[3], 4), - (1 << num_components) - 1); - nir_ssa_def *formatted = NULL; + nir_def *color = nir_trim_vector(b, + instr->src[3].ssa, + num_components); + nir_def *formatted = NULL; if (format == PIPE_FORMAT_R11G11B10_FLOAT) { formatted = nir_format_pack_11f11f10f(b, color); @@ -110,9 +300,6 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr) */ formatted = color; } else { - static const unsigned bits_8[4] = {8, 8, 8, 8}; - static const unsigned bits_16[4] = {16, 16, 16, 16}; - static const unsigned bits_1010102[4] = {10, 10, 10, 2}; const unsigned *bits; switch (r_chan->size) { @@ -132,11 +319,13 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr) bool pack_mask = false; if (r_chan->pure_integer && r_chan->type == UTIL_FORMAT_TYPE_SIGNED) { - formatted = nir_format_clamp_sint(b, color, bits); + /* We don't need to do any conversion or clamping in this case */ + formatted = color; pack_mask = true; } else if (r_chan->pure_integer && r_chan->type == UTIL_FORMAT_TYPE_UNSIGNED) { - formatted = nir_format_clamp_uint(b, color, bits); + /* We don't need to do any conversion or clamping in this case */ + formatted = color; } else if (r_chan->normalized && r_chan->type == UTIL_FORMAT_TYPE_SIGNED) { formatted = nir_format_float_to_snorm(b, color, bits); @@ -154,75 +343,116 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr) pack_mask); } - nir_instr_rewrite_src(&instr->instr, &instr->src[3], - nir_src_for_ssa(formatted)); + nir_src_rewrite(&instr->src[3], formatted); instr->num_components = formatted->num_components; + + return true; } -static void + +static bool +v3d_nir_lower_image_store_v71(nir_builder *b, nir_intrinsic_instr *instr) +{ + enum pipe_format format = nir_intrinsic_format(instr); + assert(format != PIPE_FORMAT_NONE); + const struct util_format_description *desc = + util_format_description(format); + const struct util_format_channel_description *r_chan = &desc->channel[0]; + unsigned num_components = util_format_get_nr_components(format); + b->cursor = nir_before_instr(&instr->instr); + + nir_def *color = + nir_trim_vector(b, instr->src[3].ssa, num_components); + nir_def *formatted = NULL; + if (format == PIPE_FORMAT_R9G9B9E5_FLOAT) { + formatted = nir_format_pack_r9g9b9e5(b, color); + } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) { + formatted = pack_11f11f10f(b, color); + } else if (format == PIPE_FORMAT_R10G10B10A2_UINT) { + formatted = pack_r10g10b10a2_uint(b, color); + } else if (format == PIPE_FORMAT_R10G10B10A2_UNORM) { + formatted = pack_r10g10b10a2_unorm(b, color); + } else if (r_chan->size == 32) { + /* For 32-bit formats, we just have to move the vector + * across (possibly reducing the number of channels). 
+ */ + formatted = color; + } else if (r_chan->type == UTIL_FORMAT_TYPE_FLOAT) { + assert(r_chan->size == 16); + formatted = nir_format_float_to_half(b, color); + formatted = pack_bits(b, formatted, bits_16, num_components, + false); + } else { + assert(r_chan->size == 8 || r_chan->size == 16); + formatted = pack_xbit(b, color, num_components, r_chan); + } + + nir_src_rewrite(&instr->src[3], formatted); + instr->num_components = formatted->num_components; + + return true; +} + +static bool v3d_nir_lower_image_load(nir_builder *b, nir_intrinsic_instr *instr) { static const unsigned bits16[] = {16, 16, 16, 16}; enum pipe_format format = nir_intrinsic_format(instr); if (v3d_gl_format_is_return_32(format)) - return; + return false; b->cursor = nir_after_instr(&instr->instr); - assert(instr->dest.is_ssa); - nir_ssa_def *result = &instr->dest.ssa; + nir_def *result = &instr->def; if (util_format_is_pure_uint(format)) { result = nir_format_unpack_uint(b, result, bits16, 4); } else if (util_format_is_pure_sint(format)) { result = nir_format_unpack_sint(b, result, bits16, 4); } else { - nir_ssa_def *rg = nir_channel(b, result, 0); - nir_ssa_def *ba = nir_channel(b, result, 1); - result = nir_vec4(b, - nir_unpack_half_2x16_split_x(b, rg), - nir_unpack_half_2x16_split_y(b, rg), - nir_unpack_half_2x16_split_x(b, ba), - nir_unpack_half_2x16_split_y(b, ba)); + nir_def *rg = nir_channel(b, result, 0); + nir_def *ba = nir_channel(b, result, 1); + result = nir_vec4(b, + nir_unpack_half_2x16_split_x(b, rg), + nir_unpack_half_2x16_split_y(b, rg), + nir_unpack_half_2x16_split_x(b, ba), + nir_unpack_half_2x16_split_y(b, ba)); } - nir_ssa_def_rewrite_uses_after(&instr->dest.ssa, result, + nir_def_rewrite_uses_after(&instr->def, result, result->parent_instr); + + return true; } -void -v3d_nir_lower_image_load_store(nir_shader *s) +static bool +v3d_nir_lower_image_load_store_cb(nir_builder *b, + nir_intrinsic_instr *intr, + void *_state) { - nir_foreach_function(function, s) { - if (!function->impl) - continue; - - nir_builder b; - nir_builder_init(&b, function->impl); - - nir_foreach_block(block, function->impl) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intr = - nir_instr_as_intrinsic(instr); - - switch (intr->intrinsic) { - case nir_intrinsic_image_load: - v3d_nir_lower_image_load(&b, intr); - break; - case nir_intrinsic_image_store: - v3d_nir_lower_image_store(&b, intr); - break; - default: - break; - } - } - } + struct v3d_compile *c = (struct v3d_compile *) _state; - nir_metadata_preserve(function->impl, - nir_metadata_block_index | - nir_metadata_dominance); + switch (intr->intrinsic) { + case nir_intrinsic_image_load: + return v3d_nir_lower_image_load(b, intr); + case nir_intrinsic_image_store: + if (c->devinfo->ver >= 71) + return v3d_nir_lower_image_store_v71(b, intr); + else + return v3d_nir_lower_image_store_v42(b, intr); + break; + default: + return false; } + + return false; +} + +bool +v3d_nir_lower_image_load_store(nir_shader *s, struct v3d_compile *c) +{ + return nir_shader_intrinsics_pass(s, + v3d_nir_lower_image_load_store_cb, + nir_metadata_block_index | + nir_metadata_dominance, c); } diff --git a/src/broadcom/compiler/v3d_nir_lower_io.c b/src/broadcom/compiler/v3d_nir_lower_io.c index 895b1a39163..55e2e4f2e11 100644 --- a/src/broadcom/compiler/v3d_nir_lower_io.c +++ b/src/broadcom/compiler/v3d_nir_lower_io.c @@ -24,8 +24,6 @@ #include "compiler/v3d_compiler.h" #include "compiler/nir/nir_builder.h" -#include 
"util/u_helpers.h" - /** * Walks the NIR generated by TGSI-to-NIR or GLSL-to-NIR to lower its io * intrinsics into something amenable to the V3D architecture. @@ -64,7 +62,7 @@ struct v3d_nir_lower_io_state { BITSET_WORD varyings_stored[BITSET_WORDS(V3D_MAX_ANY_STAGE_INPUTS)]; - nir_ssa_def *pos[4]; + nir_def *pos[4]; }; static void @@ -72,8 +70,8 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b, struct v3d_nir_lower_io_state *state); static void -v3d_nir_store_output(nir_builder *b, int base, nir_ssa_def *offset, - nir_ssa_def *chan) +v3d_nir_store_output(nir_builder *b, int base, nir_def *offset, + nir_def *chan) { if (offset) { /* When generating the VIR instruction, the base and the offset @@ -90,29 +88,6 @@ v3d_nir_store_output(nir_builder *b, int base, nir_ssa_def *offset, nir_store_output(b, chan, offset, .base = base, .write_mask = 0x1, .component = 0); } -/* Convert the uniform offset to bytes. If it happens to be a constant, - * constant-folding will clean up the shift for us. - */ -static void -v3d_nir_lower_uniform(struct v3d_compile *c, nir_builder *b, - nir_intrinsic_instr *intr) -{ - /* On SPIR-V/Vulkan we are already getting our offsets in - * bytes. - */ - if (c->key->environment == V3D_ENVIRONMENT_VULKAN) - return; - - b->cursor = nir_before_instr(&intr->instr); - - nir_intrinsic_set_base(intr, nir_intrinsic_base(intr) * 16); - - nir_instr_rewrite_src(&intr->instr, - &intr->src[0], - nir_src_for_ssa(nir_ishl(b, intr->src[0].ssa, - nir_imm_int(b, 4)))); -} - static int v3d_varying_slot_vpm_offset(struct v3d_compile *c, unsigned location, unsigned component) { @@ -159,14 +134,13 @@ v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b, /* If this is a geometry shader we need to emit our outputs * to the current vertex offset in the VPM. */ - nir_ssa_def *offset_reg = + nir_def *offset_reg = c->s->info.stage == MESA_SHADER_GEOMETRY ? nir_load_var(b, state->gs.output_offset_var) : NULL; int start_comp = nir_intrinsic_component(intr); unsigned location = nir_intrinsic_io_semantics(intr).location; - nir_ssa_def *src = nir_ssa_for_src(b, intr->src[0], - intr->num_components); + nir_def *src = intr->src[0].ssa; /* Save off the components of the position for the setup of VPM inputs * read by fixed function HW. */ @@ -184,8 +158,8 @@ v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b, if (location == VARYING_SLOT_LAYER) { assert(c->s->info.stage == MESA_SHADER_GEOMETRY); - nir_ssa_def *header = nir_load_var(b, state->gs.header_var); - header = nir_iand(b, header, nir_imm_int(b, 0xff00ffff)); + nir_def *header = nir_load_var(b, state->gs.header_var); + header = nir_iand_imm(b, header, 0xff00ffff); /* From the GLES 3.2 spec: * @@ -205,24 +179,26 @@ v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b, * to 0 in that case (we always allocate tile state for at * least one layer). */ - nir_ssa_def *fb_layers = nir_load_fb_layers_v3d(b, 32); - nir_ssa_def *cond = nir_ige(b, src, fb_layers); - nir_ssa_def *layer_id = + nir_def *fb_layers = nir_load_fb_layers_v3d(b, 32); + nir_def *cond = nir_ige(b, src, fb_layers); + nir_def *layer_id = nir_bcsel(b, cond, nir_imm_int(b, 0), - nir_ishl(b, src, nir_imm_int(b, 16))); + nir_ishl_imm(b, src, 16)); header = nir_ior(b, header, layer_id); nir_store_var(b, state->gs.header_var, header, 0x1); } /* Scalarize outputs if it hasn't happened already, since we want to - * schedule each VPM write individually. We can skip any outut + * schedule each VPM write individually. 
We can skip any output * components not read by the FS. */ for (int i = 0; i < intr->num_components; i++) { int vpm_offset = v3d_varying_slot_vpm_offset(c, location, start_comp + i); + if (!(nir_intrinsic_write_mask(intr) & (1 << i))) + continue; if (vpm_offset == -1) continue; @@ -261,9 +237,9 @@ v3d_nir_lower_emit_vertex(struct v3d_compile *c, nir_builder *b, { b->cursor = nir_before_instr(&instr->instr); - nir_ssa_def *header = nir_load_var(b, state->gs.header_var); - nir_ssa_def *header_offset = nir_load_var(b, state->gs.header_offset_var); - nir_ssa_def *output_offset = nir_load_var(b, state->gs.output_offset_var); + nir_def *header = nir_load_var(b, state->gs.header_var); + nir_def *header_offset = nir_load_var(b, state->gs.header_offset_var); + nir_def *output_offset = nir_load_var(b, state->gs.output_offset_var); /* Emit fixed function outputs */ v3d_nir_emit_ff_vpm_outputs(c, b, state); @@ -273,13 +249,13 @@ v3d_nir_lower_emit_vertex(struct v3d_compile *c, nir_builder *b, /* Update VPM offset for next vertex output data and header */ output_offset = - nir_iadd(b, output_offset, - nir_imm_int(b, state->gs.output_vertex_data_size)); + nir_iadd_imm(b, output_offset, + state->gs.output_vertex_data_size); - header_offset = nir_iadd(b, header_offset, nir_imm_int(b, 1)); + header_offset = nir_iadd_imm(b, header_offset, 1); /* Reset the New Primitive bit */ - header = nir_iand(b, header, nir_imm_int(b, 0xfffffffe)); + header = nir_iand_imm(b, header, 0xfffffffe); nir_store_var(b, state->gs.output_offset_var, output_offset, 0x1); nir_store_var(b, state->gs.header_offset_var, header_offset, 0x1); @@ -304,7 +280,7 @@ v3d_nir_lower_end_primitive(struct v3d_compile *c, nir_builder *b, * doesn't provide means to do that, so we need to apply the swizzle in the * vertex shader. * - * This is required at least in Vulkan to support madatory vertex attribute + * This is required at least in Vulkan to support mandatory vertex attribute * format VK_FORMAT_B8G8R8A8_UNORM. */ static void @@ -327,59 +303,6 @@ v3d_nir_lower_vertex_input(struct v3d_compile *c, nir_builder *b, nir_intrinsic_set_component(instr, (comp + 2) % 4); } -/* Sometimes the origin of gl_PointCoord is in the upper left rather than the - * lower left so we need to flip it. - * - * This is needed for Vulkan, Gallium uses lower_wpos_pntc. 
- */ -static void -v3d_nir_lower_fragment_input(struct v3d_compile *c, nir_builder *b, - nir_intrinsic_instr *intr) -{ - assert(c->s->info.stage == MESA_SHADER_FRAGMENT); - - /* Gallium uses lower_wpos_pntc */ - if (c->key->environment == V3D_ENVIRONMENT_OPENGL) - return; - - b->cursor = nir_after_instr(&intr->instr); - - int comp = nir_intrinsic_component(intr); - - nir_variable *input_var = - nir_find_variable_with_driver_location(c->s, - nir_var_shader_in, - nir_intrinsic_base(intr)); - - if (input_var && util_varying_is_point_coord(input_var->data.location, - c->fs_key->point_sprite_mask)) { - assert(intr->num_components == 1); - - nir_ssa_def *result = &intr->dest.ssa; - - switch (comp) { - case 0: - case 1: - if (!c->fs_key->is_points) - result = nir_imm_float(b, 0.0); - break; - case 2: - result = nir_imm_float(b, 0.0); - break; - case 3: - result = nir_imm_float(b, 1.0); - break; - } - if (c->fs_key->point_coord_upper_left && comp == 1) - result = nir_fsub(b, nir_imm_float(b, 1.0), result); - if (result != &intr->dest.ssa) { - nir_ssa_def_rewrite_uses_after(&intr->dest.ssa, - result, - result->parent_instr); - } - } -} - static void v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b, struct nir_instr *instr, @@ -393,12 +316,6 @@ v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b, case nir_intrinsic_load_input: if (c->s->info.stage == MESA_SHADER_VERTEX) v3d_nir_lower_vertex_input(c, b, intr); - else if (c->s->info.stage == MESA_SHADER_FRAGMENT) - v3d_nir_lower_fragment_input(c, b, intr); - break; - - case nir_intrinsic_load_uniform: - v3d_nir_lower_uniform(c, b, intr); break; case nir_intrinsic_store_output: @@ -558,16 +475,16 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b, /* If this is a geometry shader we need to emit our fixed function * outputs to the current vertex offset in the VPM. */ - nir_ssa_def *offset_reg = + nir_def *offset_reg = c->s->info.stage == MESA_SHADER_GEOMETRY ? nir_load_var(b, state->gs.output_offset_var) : NULL; for (int i = 0; i < 4; i++) { if (!state->pos[i]) - state->pos[i] = nir_ssa_undef(b, 1, 32); + state->pos[i] = nir_undef(b, 1, 32); } - nir_ssa_def *rcp_wc = nir_frcp(b, state->pos[3]); + nir_def *rcp_wc = nir_frcp(b, state->pos[3]); if (state->pos_vpm_offset != -1) { for (int i = 0; i < 4; i++) { @@ -578,8 +495,8 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b, if (state->vp_vpm_offset != -1) { for (int i = 0; i < 2; i++) { - nir_ssa_def *pos; - nir_ssa_def *scale; + nir_def *pos; + nir_def *scale; pos = state->pos[i]; if (i == 0) scale = nir_load_viewport_x_scale(b); @@ -598,14 +515,18 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b, * The correct fix for this as recommended by Broadcom * is to convert to .8 fixed-point with ffloor(). 
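
A numeric illustration of that fixed-point step, assuming the viewport scale uniform already folds in the 256.0 factor for the hardware's .8 fixed-point coordinates (that factor is set up by the drivers and is not visible in this patch):

    pos                 = 100.3 * 256.0 = 25676.8   (pixels in 1/256ths)
    v4.2:  ffloor(pos)  = 25676.0  ->  100.296875 px
    v7.x:  fround_even  = 25677.0  ->  100.300781 px

    f2i32() then converts an already-integral float exactly, so the
    rounding behaviour is decided entirely by the floor/round step below.
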
*/ - pos = nir_f2i32(b, nir_ffloor(b, pos)); - v3d_nir_store_output(b, state->vp_vpm_offset + i, - offset_reg, pos); + if (c->devinfo->ver == 42) + pos = nir_f2i32(b, nir_ffloor(b, pos)); + else + pos = nir_f2i32(b, nir_fround_even(b, pos)); + + v3d_nir_store_output(b, state->vp_vpm_offset + i, + offset_reg, pos); } } if (state->zs_vpm_offset != -1) { - nir_ssa_def *z = state->pos[2]; + nir_def *z = state->pos[2]; z = nir_fmul(b, z, nir_load_viewport_z_scale(b)); z = nir_fmul(b, z, rcp_wc); z = nir_fadd(b, z, nir_load_viewport_z_offset(b)); @@ -679,21 +600,22 @@ emit_gs_vpm_output_header_prolog(struct v3d_compile *c, nir_builder *b, * have a variable just to keep track of the number of vertices we * emitted and instead we can just compute it here from the header * offset variable by removing the one generic header slot that always - * goes at the begining of out header. + * goes at the beginning of out header. */ - nir_ssa_def *header_offset = + nir_def *header_offset = nir_load_var(b, state->gs.header_offset_var); - nir_ssa_def *vertex_count = - nir_isub(b, header_offset, nir_imm_int(b, 1)); - nir_ssa_def *header = - nir_ior(b, nir_imm_int(b, state->gs.output_header_size), - nir_ishl(b, vertex_count, - nir_imm_int(b, VERTEX_COUNT_OFFSET))); + nir_def *vertex_count = + nir_iadd_imm(b, header_offset, -1); + nir_def *header = + nir_ior_imm(b, + nir_ishl_imm(b, vertex_count, + VERTEX_COUNT_OFFSET), + state->gs.output_header_size); v3d_nir_store_output(b, 0, NULL, header); } -void +bool v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c) { struct v3d_nir_lower_io_state state = { 0 }; @@ -713,36 +635,39 @@ v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c) unreachable("Unsupported shader stage"); } - nir_foreach_function(function, s) { - if (function->impl) { - nir_builder b; - nir_builder_init(&b, function->impl); - - if (c->s->info.stage == MESA_SHADER_GEOMETRY) - emit_gs_prolog(c, &b, function->impl, &state); - - nir_foreach_block(block, function->impl) { - nir_foreach_instr_safe(instr, block) - v3d_nir_lower_io_instr(c, &b, instr, - &state); - } - - nir_block *last = nir_impl_last_block(function->impl); - b.cursor = nir_after_block(last); - if (s->info.stage == MESA_SHADER_VERTEX) { - v3d_nir_emit_ff_vpm_outputs(c, &b, &state); - } else if (s->info.stage == MESA_SHADER_GEOMETRY) { - emit_gs_vpm_output_header_prolog(c, &b, &state); - } - - nir_metadata_preserve(function->impl, - nir_metadata_block_index | - nir_metadata_dominance); + nir_foreach_function_impl(impl, s) { + nir_builder b = nir_builder_create(impl); + + if (c->s->info.stage == MESA_SHADER_GEOMETRY) + emit_gs_prolog(c, &b, impl, &state); + + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) + v3d_nir_lower_io_instr(c, &b, instr, + &state); } + + nir_block *last = nir_impl_last_block(impl); + b.cursor = nir_after_block(last); + if (s->info.stage == MESA_SHADER_VERTEX) { + v3d_nir_emit_ff_vpm_outputs(c, &b, &state); + } else if (s->info.stage == MESA_SHADER_GEOMETRY) { + emit_gs_vpm_output_header_prolog(c, &b, &state); + } + + nir_metadata_preserve(impl, + nir_metadata_block_index | + nir_metadata_dominance); } if (s->info.stage == MESA_SHADER_VERTEX || s->info.stage == MESA_SHADER_GEOMETRY) { v3d_nir_lower_io_update_output_var_base(c, &state); } + + /* It is really unlikely that we don't get progress here, and fully + * filtering when not would make code more complex, but we are still + * interested on getting this lowering going through NIR_PASS + */ + return true; } diff --git 
a/src/broadcom/compiler/v3d_nir_lower_line_smooth.c b/src/broadcom/compiler/v3d_nir_lower_line_smooth.c index 8f6e7d4e648..05b5224bc52 100644 --- a/src/broadcom/compiler/v3d_nir_lower_line_smooth.c +++ b/src/broadcom/compiler/v3d_nir_lower_line_smooth.c @@ -1,5 +1,5 @@ /* - * Copyright © 2020 Raspberry Pi + * Copyright © 2020 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -42,25 +42,23 @@ lower_line_smooth_intrinsic(struct lower_line_smooth_state *state, { b->cursor = nir_before_instr(&intr->instr); - nir_ssa_def *one = nir_imm_float(b, 1.0f); + nir_def *one = nir_imm_float(b, 1.0f); - nir_ssa_def *coverage = nir_load_var(b, state->coverage); + nir_def *coverage = nir_load_var(b, state->coverage); - nir_ssa_def *new_val = nir_fmul(b, nir_vec4(b, one, one, one, coverage), + nir_def *new_val = nir_fmul(b, nir_vec4(b, one, one, one, coverage), intr->src[0].ssa); - nir_instr_rewrite_src(&intr->instr, - &intr->src[0], - nir_src_for_ssa(new_val)); + nir_src_rewrite(&intr->src[0], new_val); } -static void +static bool lower_line_smooth_func(struct lower_line_smooth_state *state, nir_function_impl *impl) { - nir_builder b; + bool progress = false; - nir_builder_init(&b, impl); + nir_builder b = nir_builder_create(impl); nir_foreach_block(block, impl) { nir_foreach_instr_safe(instr, block) { @@ -72,58 +70,66 @@ lower_line_smooth_func(struct lower_line_smooth_state *state, if (intr->intrinsic != nir_intrinsic_store_output || nir_intrinsic_base(intr) != 0 || - intr->num_components != 4 || - !intr->src[0].is_ssa) + intr->num_components != 4) continue; lower_line_smooth_intrinsic(state, &b, intr); + progress = true; } } + + return progress; } static void initialise_coverage_var(struct lower_line_smooth_state *state, nir_function_impl *impl) { - nir_builder b; - - nir_builder_init(&b, impl); + nir_builder b = nir_builder_at(nir_before_impl(impl)); - b.cursor = nir_before_block(nir_start_block(impl)); + nir_def *line_width = nir_load_line_width(&b); - nir_ssa_def *line_width = nir_load_line_width(&b); + nir_def *real_line_width = nir_load_aa_line_width(&b); - nir_ssa_def *real_line_width = nir_load_aa_line_width(&b); - - /* The line coord varies from 0.0 to 1.0 across the width of the line */ - nir_ssa_def *line_coord = nir_load_line_coord(&b); + /* According to the PRM, the line coord varies from 0.0 to 1.0 across + * the width of the line. But actually, when a perspective projection + * is used, it is also applied to the line coords, so the values end + * up being between [min_coord, 1], based on the Wc coordinate. We + * need to re-map the values to be between [0.0, 1.0]. 
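
The re-mapping implemented just below reduces to the following, with Wc read through nir_load_fep_w_v3d:

    min_coord        = 1.0 - Wc
    normalized_coord = (line_coord - min_coord) / (1.0 - min_coord)
                     = (line_coord - (1.0 - Wc)) / Wc

    For example, with Wc = 0.5 the raw coordinate spans [0.5, 1.0], and a
    value of 0.75 re-maps to (0.75 - 0.5) / 0.5 = 0.5.
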
+ */ + nir_def *line_coord = nir_load_line_coord(&b); + nir_def *wc = nir_load_fep_w_v3d(&b, 32); + nir_def *min_coord_val = nir_fsub(&b, nir_imm_float(&b, 1.0f), wc); + nir_def *normalized_line_coord = nir_fdiv(&b, + nir_fsub(&b, line_coord, min_coord_val), + nir_fsub_imm(&b, 1.0, min_coord_val));; /* fabs(line_coord - 0.5) * real_line_width */ - nir_ssa_def *pixels_from_center = + nir_def *pixels_from_center = nir_fmul(&b, real_line_width, - nir_fabs(&b, nir_fsub(&b, line_coord, + nir_fabs(&b, nir_fsub(&b, normalized_line_coord, nir_imm_float(&b, 0.5f)))); /* 0.5 - 1/√2 * (pixels_from_center - line_width * 0.5) */ - nir_ssa_def *coverage = + nir_def *coverage = nir_fsub(&b, nir_imm_float(&b, 0.5f), nir_fmul(&b, nir_imm_float(&b, 1.0f / M_SQRT2), nir_fsub(&b, pixels_from_center, - nir_fmul(&b, - line_width, - nir_imm_float(&b, 0.5f))))); + nir_fmul_imm(&b, + line_width, + 0.5f)))); /* Discard fragments that aren’t covered at all by the line */ - nir_ssa_def *outside = nir_fge(&b, nir_imm_float(&b, 0.0f), coverage); + nir_def *outside = nir_fle_imm(&b, coverage, 0.0f); nir_discard_if(&b, outside); /* Clamp to at most 1.0. If it was less than 0.0 then the fragment will * be discarded so we don’t need to handle that. */ - nir_ssa_def *clamped = nir_fmin(&b, coverage, nir_imm_float(&b, 1.0f)); + nir_def *clamped = nir_fmin(&b, coverage, nir_imm_float(&b, 1.0f)); nir_store_var(&b, state->coverage, clamped, 0x1 /* writemask */); } @@ -140,9 +146,11 @@ make_coverage_var(nir_shader *s) return var; } -void +bool v3d_nir_lower_line_smooth(nir_shader *s) { + bool progress = false; + assert(s->info.stage == MESA_SHADER_FRAGMENT); struct lower_line_smooth_state state = { @@ -150,10 +158,20 @@ v3d_nir_lower_line_smooth(nir_shader *s) .coverage = make_coverage_var(s), }; - nir_foreach_function(function, s) { + nir_foreach_function_with_impl(function, impl, s) { if (function->is_entrypoint) - initialise_coverage_var(&state, function->impl); + initialise_coverage_var(&state, impl); + + progress |= lower_line_smooth_func(&state, impl); - lower_line_smooth_func(&state, function->impl); + if (progress) { + nir_metadata_preserve(impl, + nir_metadata_block_index | + nir_metadata_dominance); + } else { + nir_metadata_preserve(impl, nir_metadata_all); + } } + + return progress; } diff --git a/src/broadcom/compiler/v3d_nir_lower_load_store_bitsize.c b/src/broadcom/compiler/v3d_nir_lower_load_store_bitsize.c new file mode 100644 index 00000000000..0caf5dbc92c --- /dev/null +++ b/src/broadcom/compiler/v3d_nir_lower_load_store_bitsize.c @@ -0,0 +1,260 @@ +/* + * Copyright © 2021 Raspberry Pi Ltd + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "compiler/v3d_compiler.h" +#include "compiler/nir/nir_builder.h" + +/** + * The V3D TMU unit can only do 32-bit general vector access so for anything + * else we need to split vector load/store instructions to scalar. + * + * Note that a vectorization pass after this lowering may be able to + * re-vectorize some of these using 32-bit load/store instructions instead, + * which we do support. + */ + +static int +value_src(nir_intrinsic_op intrinsic) +{ + switch (intrinsic) { + case nir_intrinsic_store_ssbo: + case nir_intrinsic_store_scratch: + case nir_intrinsic_store_global_2x32: + return 0; + default: + unreachable("Unsupported intrinsic"); + } +} + +static int +offset_src(nir_intrinsic_op intrinsic) +{ + switch (intrinsic) { + case nir_intrinsic_load_uniform: + case nir_intrinsic_load_shared: + case nir_intrinsic_load_scratch: + case nir_intrinsic_load_global_2x32: + return 0; + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_ssbo: + case nir_intrinsic_store_scratch: + case nir_intrinsic_store_global_2x32: + return 1; + case nir_intrinsic_store_ssbo: + return 2; + default: + unreachable("Unsupported intrinsic"); + } +} + +static nir_intrinsic_instr * +init_scalar_intrinsic(nir_builder *b, + nir_intrinsic_instr *intr, + uint32_t component, + nir_def *offset, + uint32_t bit_size, + nir_def **scalar_offset) +{ + + nir_intrinsic_instr *new_intr = + nir_intrinsic_instr_create(b->shader, intr->intrinsic); + + nir_intrinsic_copy_const_indices(new_intr, intr); + + const int offset_units = bit_size / 8; + assert(offset_units >= 1); + + if (nir_intrinsic_has_align_mul(intr)) { + assert(nir_intrinsic_has_align_offset(intr)); + unsigned align_mul = nir_intrinsic_align_mul(intr); + unsigned align_off = nir_intrinsic_align_offset(intr); + + align_off += offset_units * component; + align_off = align_off % align_mul; + + nir_intrinsic_set_align(new_intr, align_mul, align_off); + } + + *scalar_offset = offset; + unsigned offset_adj = offset_units * component; + if (nir_intrinsic_has_base(intr)) { + nir_intrinsic_set_base( + new_intr, nir_intrinsic_base(intr) + offset_adj); + } else { + *scalar_offset = + nir_iadd(b, offset, + nir_imm_intN_t(b, offset_adj, + offset->bit_size)); + } + + new_intr->num_components = 1; + + return new_intr; +} + +static bool +lower_load_bitsize(nir_builder *b, + nir_intrinsic_instr *intr) +{ + uint32_t bit_size = intr->def.bit_size; + if (bit_size == 32) + return false; + + /* No need to split if it is already scalar */ + int num_comp = nir_intrinsic_dest_components(intr); + if (num_comp <= 1) + return false; + + b->cursor = nir_before_instr(&intr->instr); + + /* For global 2x32 we ignore Y component because it must be zero */ + unsigned offset_idx = offset_src(intr->intrinsic); + nir_def *offset = nir_trim_vector(b, intr->src[offset_idx].ssa, 1); + + /* Split vector store to multiple scalar loads */ + nir_def *dest_components[4] = { NULL }; + const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic]; + for (int component = 0; component < num_comp; component++) { + nir_def *scalar_offset; + nir_intrinsic_instr *new_intr = + init_scalar_intrinsic(b, intr, component, offset, + bit_size, &scalar_offset); + + for (unsigned i = 0; i < info->num_srcs; i++) { + if (i == 
offset_idx) { + nir_def *final_offset; + final_offset = intr->intrinsic != nir_intrinsic_load_global_2x32 ? + scalar_offset : + nir_vec2(b, scalar_offset, + nir_imm_int(b, 0)); + new_intr->src[i] = nir_src_for_ssa(final_offset); + } else { + new_intr->src[i] = intr->src[i]; + } + } + + nir_def_init(&new_intr->instr, &new_intr->def, 1, + bit_size); + dest_components[component] = &new_intr->def; + + nir_builder_instr_insert(b, &new_intr->instr); + } + + nir_def *new_dst = nir_vec(b, dest_components, num_comp); + nir_def_rewrite_uses(&intr->def, new_dst); + + nir_instr_remove(&intr->instr); + return true; +} + +static bool +lower_store_bitsize(nir_builder *b, + nir_intrinsic_instr *intr) +{ + /* No need to split if it is already scalar */ + int value_idx = value_src(intr->intrinsic); + int num_comp = nir_intrinsic_src_components(intr, value_idx); + if (num_comp <= 1) + return false; + + /* No need to split if it is 32-bit */ + if (nir_src_bit_size(intr->src[value_idx]) == 32) + return false; + + nir_def *value = intr->src[value_idx].ssa; + + b->cursor = nir_before_instr(&intr->instr); + + /* For global 2x32 we ignore Y component because it must be zero */ + unsigned offset_idx = offset_src(intr->intrinsic); + nir_def *offset = nir_trim_vector(b, intr->src[offset_idx].ssa, 1); + + /* Split vector store to multiple scalar stores */ + const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic]; + unsigned wrmask = nir_intrinsic_write_mask(intr); + while (wrmask) { + unsigned component = ffs(wrmask) - 1; + + nir_def *scalar_offset; + nir_intrinsic_instr *new_intr = + init_scalar_intrinsic(b, intr, component, offset, + value->bit_size, &scalar_offset); + + nir_intrinsic_set_write_mask(new_intr, 0x1); + + for (unsigned i = 0; i < info->num_srcs; i++) { + if (i == value_idx) { + nir_def *scalar_value = + nir_channels(b, value, 1 << component); + new_intr->src[i] = nir_src_for_ssa(scalar_value); + } else if (i == offset_idx) { + nir_def *final_offset; + final_offset = intr->intrinsic != nir_intrinsic_store_global_2x32 ? 
+ scalar_offset : + nir_vec2(b, scalar_offset, + nir_imm_int(b, 0)); + new_intr->src[i] = nir_src_for_ssa(final_offset); + } else { + new_intr->src[i] = intr->src[i]; + } + } + + nir_builder_instr_insert(b, &new_intr->instr); + + wrmask &= ~(1 << component); + } + + nir_instr_remove(&intr->instr); + return true; +} + +static bool +lower_load_store_bitsize(nir_builder *b, nir_intrinsic_instr *intr, + void *data) +{ + switch (intr->intrinsic) { + case nir_intrinsic_load_ssbo: + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_uniform: + case nir_intrinsic_load_scratch: + case nir_intrinsic_load_global_2x32: + return lower_load_bitsize(b, intr); + + case nir_intrinsic_store_ssbo: + case nir_intrinsic_store_scratch: + case nir_intrinsic_store_global_2x32: + return lower_store_bitsize(b, intr); + + default: + return false; + } +} + +bool +v3d_nir_lower_load_store_bitsize(nir_shader *s) +{ + return nir_shader_intrinsics_pass(s, lower_load_store_bitsize, + nir_metadata_block_index | + nir_metadata_dominance, + NULL); +} diff --git a/src/broadcom/compiler/v3d_nir_lower_logic_ops.c b/src/broadcom/compiler/v3d_nir_lower_logic_ops.c index 11782c7348f..4affb79a7e2 100644 --- a/src/broadcom/compiler/v3d_nir_lower_logic_ops.c +++ b/src/broadcom/compiler/v3d_nir_lower_logic_ops.c @@ -36,8 +36,8 @@ #include "v3d_compiler.h" -typedef nir_ssa_def *(*nir_pack_func)(nir_builder *b, nir_ssa_def *c); -typedef nir_ssa_def *(*nir_unpack_func)(nir_builder *b, nir_ssa_def *c); +typedef nir_def *(*nir_pack_func)(nir_builder *b, nir_def *c); +typedef nir_def *(*nir_unpack_func)(nir_builder *b, nir_def *c); static bool logicop_depends_on_dst_color(int logicop_func) @@ -53,9 +53,9 @@ logicop_depends_on_dst_color(int logicop_func) } } -static nir_ssa_def * +static nir_def * v3d_logicop(nir_builder *b, int logicop_func, - nir_ssa_def *src, nir_ssa_def *dst) + nir_def *src, nir_def *dst) { switch (logicop_func) { case PIPE_LOGICOP_CLEAR: @@ -96,8 +96,8 @@ v3d_logicop(nir_builder *b, int logicop_func, } } -static nir_ssa_def * -v3d_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz) +static nir_def * +v3d_nir_get_swizzled_channel(nir_builder *b, nir_def **srcs, int swiz) { switch (swiz) { default: @@ -116,57 +116,57 @@ v3d_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz) } } -static nir_ssa_def * -v3d_nir_swizzle_and_pack(nir_builder *b, nir_ssa_def **chans, +static nir_def * +v3d_nir_swizzle_and_pack(nir_builder *b, nir_def **chans, const uint8_t *swiz, nir_pack_func pack_func) { - nir_ssa_def *c[4]; + nir_def *c[4]; for (int i = 0; i < 4; i++) c[i] = v3d_nir_get_swizzled_channel(b, chans, swiz[i]); return pack_func(b, nir_vec4(b, c[0], c[1], c[2], c[3])); } -static nir_ssa_def * -v3d_nir_unpack_and_swizzle(nir_builder *b, nir_ssa_def *packed, +static nir_def * +v3d_nir_unpack_and_swizzle(nir_builder *b, nir_def *packed, const uint8_t *swiz, nir_unpack_func unpack_func) { - nir_ssa_def *unpacked = unpack_func(b, packed); + nir_def *unpacked = unpack_func(b, packed); - nir_ssa_def *unpacked_chans[4]; + nir_def *unpacked_chans[4]; for (int i = 0; i < 4; i++) unpacked_chans[i] = nir_channel(b, unpacked, i); - nir_ssa_def *c[4]; + nir_def *c[4]; for (int i = 0; i < 4; i++) c[i] = v3d_nir_get_swizzled_channel(b, unpacked_chans, swiz[i]); return nir_vec4(b, c[0], c[1], c[2], c[3]); } -static nir_ssa_def * -pack_unorm_rgb10a2(nir_builder *b, nir_ssa_def *c) +static nir_def * +pack_unorm_rgb10a2(nir_builder *b, nir_def *c) { static const unsigned bits[4] = { 10, 10, 10, 2 }; - 
nir_ssa_def *unorm = nir_format_float_to_unorm(b, c, bits); + nir_def *unorm = nir_format_float_to_unorm(b, c, bits); - nir_ssa_def *chans[4]; + nir_def *chans[4]; for (int i = 0; i < 4; i++) chans[i] = nir_channel(b, unorm, i); - nir_ssa_def *result = nir_mov(b, chans[0]); + nir_def *result = nir_mov(b, chans[0]); int offset = bits[0]; for (int i = 1; i < 4; i++) { - nir_ssa_def *shifted_chan = - nir_ishl(b, chans[i], nir_imm_int(b, offset)); + nir_def *shifted_chan = + nir_ishl_imm(b, chans[i], offset); result = nir_ior(b, result, shifted_chan); offset += bits[i]; } return result; } -static nir_ssa_def * -unpack_unorm_rgb10a2(nir_builder *b, nir_ssa_def *c) +static nir_def * +unpack_unorm_rgb10a2(nir_builder *b, nir_def *c) { static const unsigned bits[4] = { 10, 10, 10, 2 }; const unsigned masks[4] = { BITFIELD_MASK(bits[0]), @@ -174,11 +174,11 @@ unpack_unorm_rgb10a2(nir_builder *b, nir_ssa_def *c) BITFIELD_MASK(bits[2]), BITFIELD_MASK(bits[3]) }; - nir_ssa_def *chans[4]; + nir_def *chans[4]; for (int i = 0; i < 4; i++) { - nir_ssa_def *unorm = nir_iand(b, c, nir_imm_int(b, masks[i])); + nir_def *unorm = nir_iand_imm(b, c, masks[i]); chans[i] = nir_format_unorm_to_float(b, unorm, &bits[i]); - c = nir_ushr(b, c, nir_imm_int(b, bits[i])); + c = nir_ushr_imm(b, c, bits[i]); } return nir_vec4(b, chans[0], chans[1], chans[2], chans[3]); @@ -201,13 +201,13 @@ v3d_get_format_swizzle_for_rt(struct v3d_compile *c, int rt) } } -static nir_ssa_def * +static nir_def * v3d_nir_get_tlb_color(nir_builder *b, struct v3d_compile *c, int rt, int sample) { uint32_t num_components = util_format_get_nr_components(c->fs_key->color_fmt[rt].format); - nir_ssa_def *color[4]; + nir_def *color[4]; for (int i = 0; i < 4; i++) { if (i < num_components) { color[i] = @@ -222,71 +222,68 @@ v3d_nir_get_tlb_color(nir_builder *b, struct v3d_compile *c, int rt, int sample) return nir_vec4(b, color[0], color[1], color[2], color[3]); } -static nir_ssa_def * +static nir_def * v3d_emit_logic_op_raw(struct v3d_compile *c, nir_builder *b, - nir_ssa_def **src_chans, nir_ssa_def **dst_chans, + nir_def **src_chans, nir_def **dst_chans, int rt, int sample) { const uint8_t *fmt_swz = v3d_get_format_swizzle_for_rt(c, rt); - nir_ssa_def *op_res[4]; + nir_def *op_res[4]; for (int i = 0; i < 4; i++) { - nir_ssa_def *src = src_chans[i]; - nir_ssa_def *dst = + nir_def *src = src_chans[i]; + nir_def *dst = v3d_nir_get_swizzled_channel(b, dst_chans, fmt_swz[i]); op_res[i] = v3d_logicop(b, c->fs_key->logicop_func, src, dst); - /* In Vulkan we configure our integer RTs to clamp, so we need - * to ignore result bits that don't fit in the destination RT - * component size. + /* We configure our integer RTs to clamp, so we need to ignore + * result bits that don't fit in the destination RT component + * size. 
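
For example, with an RGB10A2 integer render target (10/10/10/2 bits per component) the masking just below works out to these illustrative values:

    red/green/blue: bits = 10  ->  mask = (1u << 10) - 1 = 0x3ff
    alpha:          bits =  2  ->  mask = (1u <<  2) - 1 = 0x3

    A logic-op result of 0xffffffff in the alpha channel is therefore
    reduced to 0x3, which is all the clamped render target can hold.
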
*/ - if (c->key->environment == V3D_ENVIRONMENT_VULKAN) { - uint32_t bits = - util_format_get_component_bits( - c->fs_key->color_fmt[rt].format, - UTIL_FORMAT_COLORSPACE_RGB, i); - if (bits > 0 && bits < 32) { - nir_ssa_def *mask = - nir_imm_int(b, (1u << bits) - 1); - op_res[i] = nir_iand(b, op_res[i], mask); - } + uint32_t bits = + util_format_get_component_bits( + c->fs_key->color_fmt[rt].format, + UTIL_FORMAT_COLORSPACE_RGB, i); + if (bits > 0 && bits < 32) { + op_res[i] = + nir_iand_imm(b, op_res[i], (1u << bits) - 1); } } - nir_ssa_def *r[4]; + nir_def *r[4]; for (int i = 0; i < 4; i++) r[i] = v3d_nir_get_swizzled_channel(b, op_res, fmt_swz[i]); return nir_vec4(b, r[0], r[1], r[2], r[3]); } -static nir_ssa_def * +static nir_def * v3d_emit_logic_op_unorm(struct v3d_compile *c, nir_builder *b, - nir_ssa_def **src_chans, nir_ssa_def **dst_chans, + nir_def **src_chans, nir_def **dst_chans, int rt, int sample, nir_pack_func pack_func, nir_unpack_func unpack_func) { static const uint8_t src_swz[4] = { 0, 1, 2, 3 }; - nir_ssa_def *packed_src = + nir_def *packed_src = v3d_nir_swizzle_and_pack(b, src_chans, src_swz, pack_func); const uint8_t *fmt_swz = v3d_get_format_swizzle_for_rt(c, rt); - nir_ssa_def *packed_dst = + nir_def *packed_dst = v3d_nir_swizzle_and_pack(b, dst_chans, fmt_swz, pack_func); - nir_ssa_def *packed_result = + nir_def *packed_result = v3d_logicop(b, c->fs_key->logicop_func, packed_src, packed_dst); return v3d_nir_unpack_and_swizzle(b, packed_result, fmt_swz, unpack_func); } -static nir_ssa_def * +static nir_def * v3d_nir_emit_logic_op(struct v3d_compile *c, nir_builder *b, - nir_ssa_def *src, int rt, int sample) + nir_def *src, int rt, int sample) { - nir_ssa_def *dst = v3d_nir_get_tlb_color(b, c, rt, sample); + nir_def *dst = v3d_nir_get_tlb_color(b, c, rt, sample); - nir_ssa_def *src_chans[4], *dst_chans[4]; + nir_def *src_chans[4], *dst_chans[4]; for (unsigned i = 0; i < 4; i++) { src_chans[i] = nir_channel(b, src, i); dst_chans[i] = nir_channel(b, dst, i); @@ -309,7 +306,7 @@ v3d_nir_emit_logic_op(struct v3d_compile *c, nir_builder *b, static void v3d_emit_ms_output(nir_builder *b, - nir_ssa_def *color, nir_src *offset, + nir_def *color, nir_src *offset, nir_alu_type type, int rt, int sample) { nir_store_tlb_sample_color_v3d(b, color, nir_imm_int(b, rt), .base = sample, .component = 0, .src_type = type); @@ -321,7 +318,7 @@ v3d_nir_lower_logic_op_instr(struct v3d_compile *c, nir_intrinsic_instr *intr, int rt) { - nir_ssa_def *frag_color = intr->src[0].ssa; + nir_def *frag_color = intr->src[0].ssa; const int logic_op = c->fs_key->logicop_func; @@ -331,7 +328,7 @@ v3d_nir_lower_logic_op_instr(struct v3d_compile *c, nir_src *offset = &intr->src[1]; nir_alu_type type = nir_intrinsic_src_type(intr); for (int i = 0; i < V3D_MAX_SAMPLES; i++) { - nir_ssa_def *sample = + nir_def *sample = v3d_nir_emit_logic_op(c, b, frag_color, rt, i); v3d_emit_ms_output(b, sample, offset, type, rt, i); @@ -339,11 +336,10 @@ v3d_nir_lower_logic_op_instr(struct v3d_compile *c, nir_instr_remove(&intr->instr); } else { - nir_ssa_def *result = + nir_def *result = v3d_nir_emit_logic_op(c, b, frag_color, rt, 0); - nir_instr_rewrite_src(&intr->instr, &intr->src[0], - nir_src_for_ssa(result)); + nir_src_rewrite(&intr->src[0], result); intr->num_components = result->num_components; } } @@ -351,6 +347,8 @@ v3d_nir_lower_logic_op_instr(struct v3d_compile *c, static bool v3d_nir_lower_logic_ops_block(nir_block *block, struct v3d_compile *c) { + bool progress = false; + nir_foreach_instr_safe(instr, 
block) { if (instr->type != nir_instr_type_intrinsic) continue; @@ -384,35 +382,40 @@ v3d_nir_lower_logic_ops_block(nir_block *block, struct v3d_compile *c) continue; } - nir_function_impl *impl = - nir_cf_node_get_function(&block->cf_node); - nir_builder b; - nir_builder_init(&b, impl); - b.cursor = nir_before_instr(&intr->instr); + nir_builder b = nir_builder_at(nir_before_instr(&intr->instr)); v3d_nir_lower_logic_op_instr(c, &b, intr, rt); + + progress = true; } } - return true; + return progress; } -void +bool v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c) { + bool progress = false; + /* Nothing to do if logic op is 'copy src to dst' or if logic ops are * disabled (we set the logic op to copy in that case). */ if (c->fs_key->logicop_func == PIPE_LOGICOP_COPY) - return; + return false; - nir_foreach_function(function, s) { - if (function->impl) { - nir_foreach_block(block, function->impl) - v3d_nir_lower_logic_ops_block(block, c); + nir_foreach_function_impl(impl, s) { + nir_foreach_block(block, impl) + progress |= v3d_nir_lower_logic_ops_block(block, c); - nir_metadata_preserve(function->impl, + if (progress) { + nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance); + } else { + nir_metadata_preserve(impl, + nir_metadata_all); } } + + return progress; } diff --git a/src/broadcom/compiler/v3d_nir_lower_robust_buffer_access.c b/src/broadcom/compiler/v3d_nir_lower_robust_buffer_access.c deleted file mode 100644 index 40f1cc23b1a..00000000000 --- a/src/broadcom/compiler/v3d_nir_lower_robust_buffer_access.c +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Copyright © 2020 Raspberry Pi - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- */ - -#include "compiler/v3d_compiler.h" -#include "compiler/nir/nir_builder.h" - -static void -rewrite_offset(nir_builder *b, - nir_intrinsic_instr *instr, - uint32_t buffer_idx, - uint32_t offset_src, - nir_intrinsic_op buffer_size_op) -{ - b->cursor = nir_before_instr(&instr->instr); - - /* Get size of the buffer */ - nir_intrinsic_instr *size = - nir_intrinsic_instr_create(b->shader, buffer_size_op); - size->src[0] = nir_src_for_ssa(nir_imm_int(b, buffer_idx)); - nir_ssa_dest_init(&size->instr, &size->dest, 1, 32, NULL); - nir_builder_instr_insert(b, &size->instr); - - /* All out TMU accesses are 32-bit aligned */ - nir_ssa_def *aligned_buffer_size = - nir_iand(b, &size->dest.ssa, nir_imm_int(b, 0xfffffffc)); - - /* Rewrite offset */ - nir_ssa_def *offset = - nir_umin(b, instr->src[offset_src].ssa, aligned_buffer_size); - nir_instr_rewrite_src(&instr->instr, &instr->src[offset_src], - nir_src_for_ssa(offset)); -} - -static void -lower_load(struct v3d_compile *c, - nir_builder *b, - nir_intrinsic_instr *instr) -{ - uint32_t index = nir_src_comp_as_uint(instr->src[0], 0); - - nir_intrinsic_op op; - if (instr->intrinsic == nir_intrinsic_load_ubo) { - op = nir_intrinsic_get_ubo_size; - if (c->key->environment == V3D_ENVIRONMENT_VULKAN) - index--; - } else { - op = nir_intrinsic_get_ssbo_size; - } - - rewrite_offset(b, instr, index, 1, op); -} - -static void -lower_store(struct v3d_compile *c, - nir_builder *b, - nir_intrinsic_instr *instr) -{ - uint32_t index = nir_src_comp_as_uint(instr->src[1], 0); - rewrite_offset(b, instr, index, 2, nir_intrinsic_get_ssbo_size); -} - -static void -lower_atomic(struct v3d_compile *c, - nir_builder *b, - nir_intrinsic_instr *instr) -{ - uint32_t index = nir_src_comp_as_uint(instr->src[0], 0); - rewrite_offset(b, instr, index, 1, nir_intrinsic_get_ssbo_size); -} - -static void -lower_shared(struct v3d_compile *c, - nir_builder *b, - nir_intrinsic_instr *instr) -{ - b->cursor = nir_before_instr(&instr->instr); - nir_ssa_def *aligned_size = - nir_imm_int(b, c->s->info.shared_size & 0xfffffffc); - nir_ssa_def *offset = nir_umin(b, instr->src[0].ssa, aligned_size); - nir_instr_rewrite_src(&instr->instr, &instr->src[0], - nir_src_for_ssa(offset)); -} - -static void -lower_instr(struct v3d_compile *c, nir_builder *b, struct nir_instr *instr) -{ - if (instr->type != nir_instr_type_intrinsic) - return; - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - - switch (intr->intrinsic) { - case nir_intrinsic_load_ubo: - case nir_intrinsic_load_ssbo: - lower_load(c, b, intr); - break; - case nir_intrinsic_store_ssbo: - lower_store(c, b, intr); - break; - case nir_intrinsic_ssbo_atomic_add: - case nir_intrinsic_ssbo_atomic_imin: - case nir_intrinsic_ssbo_atomic_umin: - case nir_intrinsic_ssbo_atomic_imax: - case nir_intrinsic_ssbo_atomic_umax: - case nir_intrinsic_ssbo_atomic_and: - case nir_intrinsic_ssbo_atomic_or: - case nir_intrinsic_ssbo_atomic_xor: - case nir_intrinsic_ssbo_atomic_exchange: - case nir_intrinsic_ssbo_atomic_comp_swap: - lower_atomic(c, b, intr); - break; - case nir_intrinsic_load_shared: - case nir_intrinsic_shared_atomic_add: - case nir_intrinsic_shared_atomic_imin: - case nir_intrinsic_shared_atomic_umin: - case nir_intrinsic_shared_atomic_imax: - case nir_intrinsic_shared_atomic_umax: - case nir_intrinsic_shared_atomic_and: - case nir_intrinsic_shared_atomic_or: - case nir_intrinsic_shared_atomic_xor: - case nir_intrinsic_shared_atomic_exchange: - case nir_intrinsic_shared_atomic_comp_swap: - lower_shared(c, b, intr); - break; - 
default: - break; - } -} - -void -v3d_nir_lower_robust_buffer_access(nir_shader *s, struct v3d_compile *c) -{ - nir_foreach_function(function, s) { - if (function->impl) { - nir_builder b; - nir_builder_init(&b, function->impl); - - nir_foreach_block(block, function->impl) { - nir_foreach_instr_safe(instr, block) - lower_instr(c, &b, instr); - } - - nir_metadata_preserve(function->impl, - nir_metadata_block_index | - nir_metadata_dominance); - } - } -} diff --git a/src/broadcom/compiler/v3d_nir_lower_scratch.c b/src/broadcom/compiler/v3d_nir_lower_scratch.c index 893b6f6ae28..93ed1bb6e26 100644 --- a/src/broadcom/compiler/v3d_nir_lower_scratch.c +++ b/src/broadcom/compiler/v3d_nir_lower_scratch.c @@ -34,11 +34,11 @@ * writemasks in the process. */ -static nir_ssa_def * +static nir_def * v3d_nir_scratch_offset(nir_builder *b, nir_intrinsic_instr *instr) { bool is_store = instr->intrinsic == nir_intrinsic_store_scratch; - nir_ssa_def *offset = nir_ssa_for_src(b, instr->src[is_store ? 1 : 0], 1); + nir_def *offset = instr->src[is_store ? 1 : 0].ssa; assert(nir_intrinsic_align_mul(instr) >= 4); assert(nir_intrinsic_align_offset(instr) == 0); @@ -55,18 +55,18 @@ v3d_nir_lower_load_scratch(nir_builder *b, nir_intrinsic_instr *instr) { b->cursor = nir_before_instr(&instr->instr); - nir_ssa_def *offset = v3d_nir_scratch_offset(b,instr); + nir_def *offset = v3d_nir_scratch_offset(b,instr); - nir_ssa_def *chans[NIR_MAX_VEC_COMPONENTS]; + nir_def *chans[NIR_MAX_VEC_COMPONENTS]; for (int i = 0; i < instr->num_components; i++) { - nir_ssa_def *chan_offset = + nir_def *chan_offset = nir_iadd_imm(b, offset, V3D_CHANNELS * i * 4); nir_intrinsic_instr *chan_instr = nir_intrinsic_instr_create(b->shader, instr->intrinsic); chan_instr->num_components = 1; - nir_ssa_dest_init(&chan_instr->instr, &chan_instr->dest, 1, - instr->dest.ssa.bit_size, NULL); + nir_def_init(&chan_instr->instr, &chan_instr->def, 1, + instr->def.bit_size); chan_instr->src[0] = nir_src_for_ssa(chan_offset); @@ -74,11 +74,11 @@ v3d_nir_lower_load_scratch(nir_builder *b, nir_intrinsic_instr *instr) nir_builder_instr_insert(b, &chan_instr->instr); - chans[i] = &chan_instr->dest.ssa; + chans[i] = &chan_instr->def; } - nir_ssa_def *result = nir_vec(b, chans, instr->num_components); - nir_ssa_def_rewrite_uses(&instr->dest.ssa, result); + nir_def *result = nir_vec(b, chans, instr->num_components); + nir_def_rewrite_uses(&instr->def, result); nir_instr_remove(&instr->instr); } @@ -87,15 +87,14 @@ v3d_nir_lower_store_scratch(nir_builder *b, nir_intrinsic_instr *instr) { b->cursor = nir_before_instr(&instr->instr); - nir_ssa_def *offset = v3d_nir_scratch_offset(b, instr); - nir_ssa_def *value = nir_ssa_for_src(b, instr->src[0], - instr->num_components); + nir_def *offset = v3d_nir_scratch_offset(b, instr); + nir_def *value = instr->src[0].ssa; for (int i = 0; i < instr->num_components; i++) { if (!(nir_intrinsic_write_mask(instr) & (1 << i))) continue; - nir_ssa_def *chan_offset = + nir_def *chan_offset = nir_iadd_imm(b, offset, V3D_CHANNELS * i * 4); nir_intrinsic_instr *chan_instr = @@ -115,39 +114,29 @@ v3d_nir_lower_store_scratch(nir_builder *b, nir_intrinsic_instr *instr) nir_instr_remove(&instr->instr); } -void -v3d_nir_lower_scratch(nir_shader *s) +static bool +v3d_nir_lower_scratch_cb(nir_builder *b, + nir_intrinsic_instr *intr, + void *_state) { - nir_foreach_function(function, s) { - if (!function->impl) - continue; - - nir_builder b; - nir_builder_init(&b, function->impl); - - nir_foreach_block(block, function->impl) { - 
nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intr = - nir_instr_as_intrinsic(instr); - - switch (intr->intrinsic) { - case nir_intrinsic_load_scratch: - v3d_nir_lower_load_scratch(&b, intr); - break; - case nir_intrinsic_store_scratch: - v3d_nir_lower_store_scratch(&b, intr); - break; - default: - break; - } - } - } - - nir_metadata_preserve(function->impl, - nir_metadata_block_index | - nir_metadata_dominance); + switch (intr->intrinsic) { + case nir_intrinsic_load_scratch: + v3d_nir_lower_load_scratch(b, intr); + return true; + case nir_intrinsic_store_scratch: + v3d_nir_lower_store_scratch(b, intr); + return true; + default: + return false; } + + return false; +} + +bool +v3d_nir_lower_scratch(nir_shader *s) +{ + return nir_shader_intrinsics_pass(s, v3d_nir_lower_scratch_cb, + nir_metadata_block_index | + nir_metadata_dominance, NULL); } diff --git a/src/broadcom/compiler/v3d_nir_lower_txf_ms.c b/src/broadcom/compiler/v3d_nir_lower_txf_ms.c index d79969374d5..e78c3cb9e3e 100644 --- a/src/broadcom/compiler/v3d_nir_lower_txf_ms.c +++ b/src/broadcom/compiler/v3d_nir_lower_txf_ms.c @@ -32,25 +32,21 @@ * 2x2 quad. */ -#define V3D_MAX_SAMPLES 4 - -static nir_ssa_def * +static nir_def * v3d_nir_lower_txf_ms_instr(nir_builder *b, nir_instr *in_instr, void *data) { nir_tex_instr *instr = nir_instr_as_tex(in_instr); b->cursor = nir_before_instr(&instr->instr); - int coord_index = nir_tex_instr_src_index(instr, nir_tex_src_coord); - int sample_index = nir_tex_instr_src_index(instr, nir_tex_src_ms_index); - nir_ssa_def *coord = instr->src[coord_index].src.ssa; - nir_ssa_def *sample = instr->src[sample_index].src.ssa; + nir_def *coord = nir_steal_tex_src(instr, nir_tex_src_coord); + nir_def *sample = nir_steal_tex_src(instr, nir_tex_src_ms_index); - nir_ssa_def *one = nir_imm_int(b, 1); - nir_ssa_def *x = nir_iadd(b, + nir_def *one = nir_imm_int(b, 1); + nir_def *x = nir_iadd(b, nir_ishl(b, nir_channel(b, coord, 0), one), nir_iand(b, sample, one)); - nir_ssa_def *y = nir_iadd(b, + nir_def *y = nir_iadd(b, nir_ishl(b, nir_channel(b, coord, 1), one), nir_iand(b, nir_ushr(b, sample, one), one)); if (instr->is_array) @@ -58,10 +54,7 @@ v3d_nir_lower_txf_ms_instr(nir_builder *b, nir_instr *in_instr, void *data) else coord = nir_vec2(b, x, y); - nir_instr_rewrite_src(&instr->instr, - &instr->src[nir_tex_src_coord].src, - nir_src_for_ssa(coord)); - nir_tex_instr_remove_src(instr, sample_index); + nir_tex_instr_add_src(instr, nir_tex_src_coord, coord); instr->op = nir_texop_txf; instr->sampler_dim = GLSL_SAMPLER_DIM_2D; @@ -75,11 +68,11 @@ v3d_nir_lower_txf_ms_filter(const nir_instr *instr, const void *data) nir_instr_as_tex(instr)->op == nir_texop_txf_ms); } -void -v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c) +bool +v3d_nir_lower_txf_ms(nir_shader *s) { - nir_shader_lower_instructions(s, - v3d_nir_lower_txf_ms_filter, - v3d_nir_lower_txf_ms_instr, - NULL); + return nir_shader_lower_instructions(s, + v3d_nir_lower_txf_ms_filter, + v3d_nir_lower_txf_ms_instr, + NULL); } diff --git a/src/broadcom/compiler/v3d_packing.c b/src/broadcom/compiler/v3d_packing.c new file mode 100644 index 00000000000..46643edd5e6 --- /dev/null +++ b/src/broadcom/compiler/v3d_packing.c @@ -0,0 +1,50 @@ +/* + * Copyright © 2023 Raspberry Pi Ltd + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without 
restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "v3d_compiler.h" + +#define __gen_user_data void +#define __gen_address_type uint32_t +#define __gen_address_offset(reloc) (*reloc) +#define __gen_emit_reloc(cl, reloc) +#define __gen_unpack_address(cl, s, e) (__gen_unpack_uint(cl, s, e) << (31 - (e - s))) +#include "cle/v3d_packet_v42_pack.h" + + +/* Typically, this method would wrap calling version-specific variant of this + * method, but as TMU_CONFIG_PARAMETER_1 doesn't change between v42 and v71, + * we can assume that p1_packed is the same struct, and use the same method. + */ +void +v3d_pack_unnormalized_coordinates(struct v3d_device_info *devinfo, + uint32_t *p1_packed, + bool unnormalized_coordinates) +{ + assert(devinfo->ver == 71 || devinfo->ver == 42); + + struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked; + V3D42_TMU_CONFIG_PARAMETER_1_unpack((uint8_t *)p1_packed, &p1_unpacked); + p1_unpacked.unnormalized_coordinates = unnormalized_coordinates; + V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL, (uint8_t *)p1_packed, + &p1_unpacked); +} diff --git a/src/broadcom/compiler/v3d40_tex.c b/src/broadcom/compiler/v3d_tex.c index 7bebfe95552..643c73c4e58 100644 --- a/src/broadcom/compiler/v3d40_tex.c +++ b/src/broadcom/compiler/v3d_tex.c @@ -28,27 +28,29 @@ #define __gen_address_type uint32_t #define __gen_address_offset(reloc) (*reloc) #define __gen_emit_reloc(cl, reloc) -#include "cle/v3d_packet_v41_pack.h" +#include "cle/v3d_packet_v42_pack.h" -static inline void +static inline struct qinst * vir_TMU_WRITE(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val) { /* XXX perf: We should figure out how to merge ALU operations * producing the val with this MOV, when possible. 
*/ - vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val); + return vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val); } -static inline void +static inline struct qinst * vir_TMU_WRITE_or_count(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val, uint32_t *tmu_writes) { - if (tmu_writes) + if (tmu_writes) { (*tmu_writes)++; - else - vir_TMU_WRITE(c, waddr, val); + return NULL; + } else { + return vir_TMU_WRITE(c, waddr, val); + } } static void @@ -59,11 +61,11 @@ vir_WRTMUC(struct v3d_compile *c, enum quniform_contents contents, uint32_t data inst->uniform = vir_get_uniform_index(c, contents, data); } -static const struct V3D41_TMU_CONFIG_PARAMETER_1 p1_unpacked_default = { +static const struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked_default = { .per_pixel_mask_enable = true, }; -static const struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked_default = { +static const struct V3D42_TMU_CONFIG_PARAMETER_2 p2_unpacked_default = { .op = V3D_TMU_OP_REGULAR, }; @@ -84,7 +86,7 @@ handle_tex_src(struct v3d_compile *c, nir_tex_instr *instr, unsigned src_idx, unsigned non_array_components, - struct V3D41_TMU_CONFIG_PARAMETER_2 *p2_unpacked, + struct V3D42_TMU_CONFIG_PARAMETER_2 *p2_unpacked, struct qreg *s_out, unsigned *tmu_writes) { @@ -199,7 +201,7 @@ handle_tex_src(struct v3d_compile *c, static void vir_tex_handle_srcs(struct v3d_compile *c, nir_tex_instr *instr, - struct V3D41_TMU_CONFIG_PARAMETER_2 *p2_unpacked, + struct V3D42_TMU_CONFIG_PARAMETER_2 *p2_unpacked, struct qreg *s, unsigned *tmu_writes) { @@ -222,31 +224,62 @@ get_required_tex_tmu_writes(struct v3d_compile *c, nir_tex_instr *instr) } void -v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) +v3d_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) { - assert(instr->op != nir_texop_lod || c->devinfo->ver >= 42); - unsigned texture_idx = instr->texture_index; - unsigned sampler_idx = instr->sampler_index; - struct V3D41_TMU_CONFIG_PARAMETER_0 p0_unpacked = { + /* For instructions that don't have a sampler (i.e. txf) we bind + * default sampler state via the backend_flags to handle precision. + */ + unsigned sampler_idx = nir_tex_instr_need_sampler(instr) ? + instr->sampler_index : instr->backend_flags; + + /* Even if the texture operation doesn't need a sampler by + * itself, we still need to add the sampler configuration + * parameter if the output is 32 bit + */ + assert(sampler_idx < c->key->num_samplers_used); + bool output_type_32_bit = + c->key->sampler[sampler_idx].return_size == 32; + + struct V3D42_TMU_CONFIG_PARAMETER_0 p0_unpacked = { }; /* Limit the number of channels returned to both how many the NIR * instruction writes and how many the instruction could produce. */ - p0_unpacked.return_words_of_texture_data = - instr->dest.is_ssa ? - nir_ssa_def_components_read(&instr->dest.ssa) : - (1 << instr->dest.reg.reg->num_components) - 1; + nir_intrinsic_instr *store = nir_store_reg_for_def(&instr->def); + if (store == NULL) { + p0_unpacked.return_words_of_texture_data = + nir_def_components_read(&instr->def); + } else { + nir_def *reg = store->src[1].ssa; + nir_intrinsic_instr *decl = nir_reg_get_decl(reg); + unsigned reg_num_components = + nir_intrinsic_num_components(decl); + + /* For the non-ssa case we don't have a full equivalent to + * nir_def_components_read. This is a problem for the 16 + * bit case. nir_lower_tex will not change the destination as + * nir_tex_instr_dest_size will still return 4. 
The driver is + * just expected to not store on other channels, so we + * manually ensure that here. + */ + uint32_t num_components = output_type_32_bit ? + MIN2(reg_num_components, 4) : + MIN2(reg_num_components, 2); + + p0_unpacked.return_words_of_texture_data = (1 << num_components) - 1; + } assert(p0_unpacked.return_words_of_texture_data != 0); - struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked = { + struct V3D42_TMU_CONFIG_PARAMETER_2 p2_unpacked = { .op = V3D_TMU_OP_REGULAR, .gather_mode = instr->op == nir_texop_tg4, .gather_component = instr->component, .coefficient_mode = instr->op == nir_texop_txd, - .disable_autolod = instr->op == nir_texop_tg4 + .disable_autolod = instr->op == nir_texop_tg4, + .lod_query = instr->op == nir_texop_lod, }; const unsigned tmu_writes = get_required_tex_tmu_writes(c, instr); @@ -270,22 +303,15 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) vir_tex_handle_srcs(c, instr, &p2_unpacked, &s, NULL); uint32_t p0_packed; - V3D41_TMU_CONFIG_PARAMETER_0_pack(NULL, + V3D42_TMU_CONFIG_PARAMETER_0_pack(NULL, (uint8_t *)&p0_packed, &p0_unpacked); uint32_t p2_packed; - V3D41_TMU_CONFIG_PARAMETER_2_pack(NULL, + V3D42_TMU_CONFIG_PARAMETER_2_pack(NULL, (uint8_t *)&p2_packed, &p2_unpacked); - /* We manually set the LOD Query bit (see - * V3D42_TMU_CONFIG_PARAMETER_2) as right now is the only V42 specific - * feature over V41 we are using - */ - if (instr->op == nir_texop_lod) - p2_packed |= 1UL << 24; - /* Load texture_idx number into the high bits of the texture address field, * which will be be used by the driver to decide which texture to put * in the actual address field. @@ -294,14 +320,6 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) vir_WRTMUC(c, QUNIFORM_TMU_CONFIG_P0, p0_packed); - /* Even if the texture operation doesn't need a sampler by - * itself, we still need to add the sampler configuration - * parameter if the output is 32 bit - */ - bool output_type_32_bit = - c->key->sampler[sampler_idx].return_size == 32 && - !instr->is_shadow; - /* p1 is optional, but we can skip it only if p2 can be skipped too */ bool needs_p2_config = (instr->op == nir_texop_lod || @@ -313,7 +331,7 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) output_type_32_bit; if (non_default_p1_config) { - struct V3D41_TMU_CONFIG_PARAMETER_1 p1_unpacked = { + struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked = { .output_type_32_bit = output_type_32_bit, .unnormalized_coordinates = (instr->sampler_dim == @@ -330,7 +348,7 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) p0_unpacked.return_words_of_texture_data < (1 << 2)); uint32_t p1_packed; - V3D41_TMU_CONFIG_PARAMETER_1_pack(NULL, + V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL, (uint8_t *)&p1_packed, &p1_unpacked); @@ -358,7 +376,7 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) * address */ uint32_t p1_packed_default; - V3D41_TMU_CONFIG_PARAMETER_1_pack(NULL, + V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL, (uint8_t *)&p1_packed_default, &p1_unpacked_default); vir_WRTMUC(c, QUNIFORM_CONSTANT, p1_packed_default); @@ -368,48 +386,54 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed); /* Emit retiring TMU write */ + struct qinst *retiring; if (instr->op == nir_texop_txf) { assert(instr->sampler_dim != GLSL_SAMPLER_DIM_CUBE); - vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s); + retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s); } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { - vir_TMU_WRITE(c, 
V3D_QPU_WADDR_TMUSCM, s); + retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s); } else if (instr->op == nir_texop_txl) { - vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s); + retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s); } else { - vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s); + retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s); } - ntq_add_pending_tmu_flush(c, &instr->dest, + retiring->ldtmu_count = p0_unpacked.return_words_of_texture_data; + ntq_add_pending_tmu_flush(c, &instr->def, p0_unpacked.return_words_of_texture_data); } static uint32_t -v3d40_image_load_store_tmu_op(nir_intrinsic_instr *instr) +v3d_image_atomic_tmu_op(nir_intrinsic_instr *instr) +{ + nir_atomic_op atomic_op = nir_intrinsic_atomic_op(instr); + switch (atomic_op) { + case nir_atomic_op_iadd: return v3d_get_op_for_atomic_add(instr, 3); + case nir_atomic_op_imin: return V3D_TMU_OP_WRITE_SMIN; + case nir_atomic_op_umin: return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR; + case nir_atomic_op_imax: return V3D_TMU_OP_WRITE_SMAX; + case nir_atomic_op_umax: return V3D_TMU_OP_WRITE_UMAX; + case nir_atomic_op_iand: return V3D_TMU_OP_WRITE_AND_READ_INC; + case nir_atomic_op_ior: return V3D_TMU_OP_WRITE_OR_READ_DEC; + case nir_atomic_op_ixor: return V3D_TMU_OP_WRITE_XOR_READ_NOT; + case nir_atomic_op_xchg: return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH; + case nir_atomic_op_cmpxchg: return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH; + default: unreachable("unknown atomic op"); + } +} + +static uint32_t +v3d_image_load_store_tmu_op(nir_intrinsic_instr *instr) { switch (instr->intrinsic) { case nir_intrinsic_image_load: case nir_intrinsic_image_store: return V3D_TMU_OP_REGULAR; - case nir_intrinsic_image_atomic_add: - return v3d_get_op_for_atomic_add(instr, 3); - case nir_intrinsic_image_atomic_imin: - return V3D_TMU_OP_WRITE_SMIN; - case nir_intrinsic_image_atomic_umin: - return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR; - case nir_intrinsic_image_atomic_imax: - return V3D_TMU_OP_WRITE_SMAX; - case nir_intrinsic_image_atomic_umax: - return V3D_TMU_OP_WRITE_UMAX; - case nir_intrinsic_image_atomic_and: - return V3D_TMU_OP_WRITE_AND_READ_INC; - case nir_intrinsic_image_atomic_or: - return V3D_TMU_OP_WRITE_OR_READ_DEC; - case nir_intrinsic_image_atomic_xor: - return V3D_TMU_OP_WRITE_XOR_READ_NOT; - case nir_intrinsic_image_atomic_exchange: - return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH; - case nir_intrinsic_image_atomic_comp_swap: - return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH; + + case nir_intrinsic_image_atomic: + case nir_intrinsic_image_atomic_swap: + return v3d_image_atomic_tmu_op(instr); + default: unreachable("unknown image intrinsic"); }; @@ -427,7 +451,7 @@ v3d40_image_load_store_tmu_op(nir_intrinsic_instr *instr) * which is why we always call ntq_get_src() even if we are only interested in * register write counts. 
*/ -static void +static struct qinst * vir_image_emit_register_writes(struct v3d_compile *c, nir_intrinsic_instr *instr, bool atomic_add_replaced, @@ -480,7 +504,8 @@ vir_image_emit_register_writes(struct v3d_compile *c, } /* Second atomic argument */ - if (instr->intrinsic == nir_intrinsic_image_atomic_comp_swap) { + if (instr->intrinsic == nir_intrinsic_image_atomic_swap && + nir_intrinsic_atomic_op(instr) == nir_atomic_op_cmpxchg) { struct qreg src_4_0 = ntq_get_src(c, instr->src[4], 0); vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUD, src_4_0, tmu_writes); @@ -494,7 +519,8 @@ vir_image_emit_register_writes(struct v3d_compile *c, V3D_QPU_PF_PUSHZ); } - vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUSF, src_1_0, tmu_writes); + struct qinst *retiring = + vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUSF, src_1_0, tmu_writes); if (!tmu_writes && vir_in_nonuniform_control_flow(c) && instr->intrinsic != nir_intrinsic_image_load) { @@ -502,6 +528,8 @@ vir_image_emit_register_writes(struct v3d_compile *c, (struct qinst *)c->cur_block->instructions.prev; vir_set_cond(last_inst, V3D_QPU_COND_IFA); } + + return retiring; } static unsigned @@ -516,21 +544,21 @@ get_required_image_tmu_writes(struct v3d_compile *c, } void -v3d40_vir_emit_image_load_store(struct v3d_compile *c, - nir_intrinsic_instr *instr) +v3d_vir_emit_image_load_store(struct v3d_compile *c, + nir_intrinsic_instr *instr) { unsigned format = nir_intrinsic_format(instr); unsigned unit = nir_src_as_uint(instr->src[0]); - struct V3D41_TMU_CONFIG_PARAMETER_0 p0_unpacked = { + struct V3D42_TMU_CONFIG_PARAMETER_0 p0_unpacked = { }; - struct V3D41_TMU_CONFIG_PARAMETER_1 p1_unpacked = { + struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked = { .per_pixel_mask_enable = true, .output_type_32_bit = v3d_gl_format_is_return_32(format), }; - struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked = { 0 }; + struct V3D42_TMU_CONFIG_PARAMETER_2 p2_unpacked = { 0 }; /* Limit the number of channels returned to both how many the NIR * instruction writes and how many the instruction could produce. @@ -542,19 +570,20 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c, p0_unpacked.return_words_of_texture_data = (1 << instr_return_channels) - 1; - p2_unpacked.op = v3d40_image_load_store_tmu_op(instr); + p2_unpacked.op = v3d_image_load_store_tmu_op(instr); /* If we were able to replace atomic_add for an inc/dec, then we * need/can to do things slightly different, like not loading the * amount to add/sub, as that is implicit. 
*/ bool atomic_add_replaced = - (instr->intrinsic == nir_intrinsic_image_atomic_add && - (p2_unpacked.op == V3D_TMU_OP_WRITE_AND_READ_INC || - p2_unpacked.op == V3D_TMU_OP_WRITE_OR_READ_DEC)); + instr->intrinsic == nir_intrinsic_image_atomic && + nir_intrinsic_atomic_op(instr) == nir_atomic_op_iadd && + (p2_unpacked.op == V3D_TMU_OP_WRITE_AND_READ_INC || + p2_unpacked.op == V3D_TMU_OP_WRITE_OR_READ_DEC); uint32_t p0_packed; - V3D41_TMU_CONFIG_PARAMETER_0_pack(NULL, + V3D42_TMU_CONFIG_PARAMETER_0_pack(NULL, (uint8_t *)&p0_packed, &p0_unpacked); @@ -565,12 +594,12 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c, p0_packed |= unit << 24; uint32_t p1_packed; - V3D41_TMU_CONFIG_PARAMETER_1_pack(NULL, + V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL, (uint8_t *)&p1_packed, &p1_unpacked); uint32_t p2_packed; - V3D41_TMU_CONFIG_PARAMETER_2_pack(NULL, + V3D42_TMU_CONFIG_PARAMETER_2_pack(NULL, (uint8_t *)&p2_packed, &p2_unpacked); @@ -599,8 +628,9 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c, if (memcmp(&p2_unpacked, &p2_unpacked_default, sizeof(p2_unpacked))) vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed); - vir_image_emit_register_writes(c, instr, atomic_add_replaced, NULL); - - ntq_add_pending_tmu_flush(c, &instr->dest, + struct qinst *retiring = + vir_image_emit_register_writes(c, instr, atomic_add_replaced, NULL); + retiring->ldtmu_count = p0_unpacked.return_words_of_texture_data; + ntq_add_pending_tmu_flush(c, &instr->def, p0_unpacked.return_words_of_texture_data); } diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c index 27869a35a3b..c59a8aac434 100644 --- a/src/broadcom/compiler/vir.c +++ b/src/broadcom/compiler/vir.c @@ -23,7 +23,6 @@ #include "broadcom/common/v3d_device_info.h" #include "v3d_compiler.h" -#include "util/u_prim.h" #include "compiler/nir/nir_schedule.h" #include "compiler/nir/nir_builder.h" @@ -89,7 +88,7 @@ vir_has_side_effects(struct v3d_compile *c, struct qinst *inst) * pointer, so each read has a side effect (we don't care for ldunif * because we reconstruct the uniform stream buffer after compiling * with the surviving uniforms), so allowing DCE to remove - * one would break follow-up loads. We could fix this by emiting a + * one would break follow-up loads. We could fix this by emitting a * unifa for each ldunifa, but each unifa requires 3 delay slots * before a ldunifa, so that would be quite expensive. 
*/ @@ -113,10 +112,10 @@ vir_is_raw_mov(struct qinst *inst) return false; } - if (inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE || - inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE || - inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE || - inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) { + if (inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE || + inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE || + inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE || + inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) { return false; } @@ -156,30 +155,12 @@ vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst) } bool -vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst) +vir_writes_r4_implicitly(const struct v3d_device_info *devinfo, + struct qinst *inst) { - for (int i = 0; i < vir_get_nsrc(inst); i++) { - switch (inst->src[i].file) { - case QFILE_VPM: - return true; - default: - break; - } - } - - if (devinfo->ver < 41 && (inst->qpu.sig.ldvary || - inst->qpu.sig.ldtlb || - inst->qpu.sig.ldtlbu || - inst->qpu.sig.ldvpm)) { - return true; - } - - return false; -} + if (!devinfo->has_accumulators) + return false; -bool -vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst) -{ switch (inst->dst.file) { case QFILE_MAGIC: switch (inst->dst.index) { @@ -195,9 +176,6 @@ vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst) break; } - if (devinfo->ver < 41 && inst->qpu.sig.ldtmu) - return true; - return false; } @@ -209,15 +187,15 @@ vir_set_unpack(struct qinst *inst, int src, if (vir_is_add(inst)) { if (src == 0) - inst->qpu.alu.add.a_unpack = unpack; + inst->qpu.alu.add.a.unpack = unpack; else - inst->qpu.alu.add.b_unpack = unpack; + inst->qpu.alu.add.b.unpack = unpack; } else { assert(vir_is_mul(inst)); if (src == 0) - inst->qpu.alu.mul.a_unpack = unpack; + inst->qpu.alu.mul.a.unpack = unpack; else - inst->qpu.alu.mul.b_unpack = unpack; + inst->qpu.alu.mul.b.unpack = unpack; } } @@ -369,6 +347,8 @@ vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst, struct qreg src0, struct q inst->src[1] = src1; inst->uniform = ~0; + inst->ip = -1; + return inst; } @@ -385,6 +365,8 @@ vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct q inst->src[1] = src1; inst->uniform = ~0; + inst->ip = -1; + return inst; } @@ -404,12 +386,16 @@ vir_branch_inst(struct v3d_compile *c, enum v3d_qpu_branch_cond cond) inst->dst = vir_nop_reg(); inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, 0); + inst->ip = -1; + return inst; } static void vir_emit(struct v3d_compile *c, struct qinst *inst) { + inst->ip = -1; + switch (c->cursor.mode) { case vir_cursor_add: list_add(&inst->link, c->cursor.link); @@ -509,13 +495,15 @@ vir_link_blocks(struct qblock *predecessor, struct qblock *successor) } const struct v3d_compiler * -v3d_compiler_init(const struct v3d_device_info *devinfo) +v3d_compiler_init(const struct v3d_device_info *devinfo, + uint32_t max_inline_uniform_buffers) { struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler); if (!compiler) return NULL; compiler->devinfo = devinfo; + compiler->max_inline_uniform_buffers = max_inline_uniform_buffers; if (!vir_init_reg_sets(compiler)) { ralloc_free(compiler); @@ -531,6 +519,19 @@ v3d_compiler_free(const struct v3d_compiler *compiler) ralloc_free((void *)compiler); } +struct v3d_compiler_strategy { + const char *name; + uint32_t max_threads; + uint32_t min_threads; + bool disable_general_tmu_sched; + bool disable_gcm; + bool disable_loop_unrolling; + bool 
disable_ubo_load_sorting; + bool move_buffer_loads; + bool disable_tmu_pipelining; + uint32_t max_tmu_spills; +}; + static struct v3d_compile * vir_compile_init(const struct v3d_compiler *compiler, struct v3d_key *key, @@ -539,12 +540,8 @@ vir_compile_init(const struct v3d_compiler *compiler, void *debug_output_data), void *debug_output_data, int program_id, int variant_id, - uint32_t max_threads, - uint32_t min_threads_for_reg_alloc, - bool tmu_spilling_allowed, - bool disable_loop_unrolling, - bool disable_constant_ubo_load_sorting, - bool disable_tmu_pipelining, + uint32_t compile_strategy_idx, + const struct v3d_compiler_strategy *strategy, bool fallback_scheduler) { struct v3d_compile *c = rzalloc(NULL, struct v3d_compile); @@ -554,17 +551,22 @@ vir_compile_init(const struct v3d_compiler *compiler, c->key = key; c->program_id = program_id; c->variant_id = variant_id; - c->threads = max_threads; + c->compile_strategy_idx = compile_strategy_idx; + c->threads = strategy->max_threads; c->debug_output = debug_output; c->debug_output_data = debug_output_data; c->compilation_result = V3D_COMPILATION_SUCCEEDED; - c->min_threads_for_reg_alloc = min_threads_for_reg_alloc; - c->tmu_spilling_allowed = tmu_spilling_allowed; + c->min_threads_for_reg_alloc = strategy->min_threads; + c->max_tmu_spills = strategy->max_tmu_spills; c->fallback_scheduler = fallback_scheduler; - c->disable_tmu_pipelining = disable_tmu_pipelining; - c->disable_constant_ubo_load_sorting = disable_constant_ubo_load_sorting; - c->disable_loop_unrolling = V3D_DEBUG & V3D_DEBUG_NO_LOOP_UNROLL - ? true : disable_loop_unrolling; + c->disable_general_tmu_sched = strategy->disable_general_tmu_sched; + c->disable_tmu_pipelining = strategy->disable_tmu_pipelining; + c->disable_constant_ubo_load_sorting = strategy->disable_ubo_load_sorting; + c->move_buffer_loads = strategy->move_buffer_loads; + c->disable_gcm = strategy->disable_gcm; + c->disable_loop_unrolling = V3D_DBG(NO_LOOP_UNROLL) + ? true : strategy->disable_loop_unrolling; + s = nir_shader_clone(c, s); c->s = s; @@ -590,17 +592,107 @@ type_size_vec4(const struct glsl_type *type, bool bindless) return glsl_count_attribute_slots(type, false); } +static enum nir_lower_tex_packing +lower_tex_packing_cb(const nir_tex_instr *tex, const void *data) +{ + struct v3d_compile *c = (struct v3d_compile *) data; + + int sampler_index = nir_tex_instr_need_sampler(tex) ? + tex->sampler_index : tex->backend_flags; + + assert(sampler_index < c->key->num_samplers_used); + return c->key->sampler[sampler_index].return_size == 16 ? 
+ nir_lower_tex_packing_16 : nir_lower_tex_packing_none; +} + +static bool +v3d_nir_lower_null_pointers_cb(nir_builder *b, + nir_intrinsic_instr *intr, + void *_state) +{ + uint32_t buffer_src_idx; + + switch (intr->intrinsic) { + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_ssbo: + buffer_src_idx = 0; + break; + case nir_intrinsic_store_ssbo: + buffer_src_idx = 1; + break; + default: + return false; + } + + /* If index is constant we are good */ + nir_src *src = &intr->src[buffer_src_idx]; + if (nir_src_is_const(*src)) + return false; + + /* Otherwise, see if it comes from a bcsel including a null pointer */ + if (src->ssa->parent_instr->type != nir_instr_type_alu) + return false; + + nir_alu_instr *alu = nir_instr_as_alu(src->ssa->parent_instr); + if (alu->op != nir_op_bcsel) + return false; + + /* A null pointer is specified using block index 0xffffffff */ + int32_t null_src_idx = -1; + for (int i = 1; i < 3; i++) { + /* FIXME: since we are running this before optimization maybe + * we need to also handle the case where we may have bcsel + * chain that we need to recurse? + */ + if (!nir_src_is_const(alu->src[i].src)) + continue; + if (nir_src_comp_as_uint(alu->src[i].src, 0) != 0xffffffff) + continue; + + /* One of the bcsel srcs is a null pointer reference */ + null_src_idx = i; + break; + } + + if (null_src_idx < 0) + return false; + + assert(null_src_idx == 1 || null_src_idx == 2); + int32_t copy_src_idx = null_src_idx == 1 ? 2 : 1; + + /* Rewrite the null pointer reference so we use the same buffer index + * as the other bcsel branch. This will allow optimization to remove + * the bcsel and we should then end up with a constant buffer index + * like we need. + */ + b->cursor = nir_before_instr(&alu->instr); + nir_def *copy = nir_mov(b, alu->src[copy_src_idx].src.ssa); + nir_src_rewrite(&alu->src[null_src_idx].src, copy); + + return true; +} + +static bool +v3d_nir_lower_null_pointers(nir_shader *s) +{ + return nir_shader_intrinsics_pass(s, v3d_nir_lower_null_pointers_cb, + nir_metadata_block_index | + nir_metadata_dominance, NULL); +} + static void v3d_lower_nir(struct v3d_compile *c) { struct nir_lower_tex_options tex_options = { .lower_txd = true, + .lower_tg4_offsets = true, .lower_tg4_broadcom_swizzle = true, .lower_rect = false, /* XXX: Use this on V3D 3.x */ .lower_txp = ~0, /* Apply swizzles to all samplers. */ .swizzle_result = ~0, + .lower_invalid_implicit_lod = true, }; /* Lower the format swizzle and (for 32-bit returns) @@ -612,38 +704,35 @@ v3d_lower_nir(struct v3d_compile *c) tex_options.swizzles[i][j] = c->key->tex[i].swizzle[j]; } - assert(c->key->num_samplers_used <= ARRAY_SIZE(c->key->sampler)); - for (int i = 0; i < c->key->num_samplers_used; i++) { - if (c->key->sampler[i].return_size == 16) { - tex_options.lower_tex_packing[i] = - nir_lower_tex_packing_16; - } - } - - /* CS textures may not have return_size reflecting the shadow state. 
*/ - nir_foreach_uniform_variable(var, c->s) { - const struct glsl_type *type = glsl_without_array(var->type); - unsigned array_len = MAX2(glsl_get_length(var->type), 1); + tex_options.lower_tex_packing_cb = lower_tex_packing_cb; + tex_options.lower_tex_packing_data = c; - if (!glsl_type_is_sampler(type) || - !glsl_sampler_type_is_shadow(type)) - continue; + NIR_PASS(_, c->s, nir_lower_tex, &tex_options); + NIR_PASS(_, c->s, nir_lower_system_values); - for (int i = 0; i < array_len; i++) { - tex_options.lower_tex_packing[var->data.binding + i] = - nir_lower_tex_packing_16; - } + if (c->s->info.zero_initialize_shared_memory && + c->s->info.shared_size > 0) { + /* All our BOs allocate full pages, so the underlying allocation + * for shared memory will always be a multiple of 4KB. This + * ensures that we can do an exact number of full chunk_size + * writes to initialize the memory independently of the actual + * shared_size used by the shader, which is a requirement of + * the initialization pass. + */ + const unsigned chunk_size = 16; /* max single store size */ + NIR_PASS(_, c->s, nir_zero_initialize_shared_memory, + align(c->s->info.shared_size, chunk_size), chunk_size); } - NIR_PASS_V(c->s, nir_lower_tex, &tex_options); - NIR_PASS_V(c->s, nir_lower_system_values); - NIR_PASS_V(c->s, nir_lower_compute_system_values, NULL); + NIR_PASS(_, c->s, nir_lower_compute_system_values, NULL); - NIR_PASS_V(c->s, nir_lower_vars_to_scratch, - nir_var_function_temp, - 0, - glsl_get_natural_size_align_bytes); - NIR_PASS_V(c->s, v3d_nir_lower_scratch); + NIR_PASS(_, c->s, nir_lower_vars_to_scratch, + nir_var_function_temp, + 0, + glsl_get_natural_size_align_bytes); + NIR_PASS(_, c->s, nir_lower_is_helper_invocation); + NIR_PASS(_, c->s, v3d_nir_lower_scratch); + NIR_PASS(_, c->s, v3d_nir_lower_null_pointers); } static void @@ -711,6 +800,10 @@ v3d_vs_set_prog_data(struct v3d_compile *c, /* Set us up for shared input/output segments. This is apparently * necessary for our VCM setup to avoid varying corruption. + * + * FIXME: initial testing on V3D 7.1 seems to work fine when using + * separate segments. So we could try to reevaluate in the future, if + * there is any advantage of using separate segments. 
*/ prog_data->separate_segments = false; prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size, @@ -807,13 +900,14 @@ v3d_fs_set_prog_data(struct v3d_compile *c, { v3d_set_fs_prog_data_inputs(c, prog_data); prog_data->writes_z = c->writes_z; + prog_data->writes_z_from_fep = c->writes_z_from_fep; prog_data->disable_ez = !c->s->info.fs.early_fragment_tests; prog_data->uses_center_w = c->uses_center_w; prog_data->uses_implicit_point_line_varyings = c->uses_implicit_point_line_varyings; prog_data->lock_scoreboard_on_first_thrsw = c->lock_scoreboard_on_first_thrsw; - prog_data->force_per_sample_msaa = c->force_per_sample_msaa; + prog_data->force_per_sample_msaa = c->s->info.fs.uses_sample_shading; prog_data->uses_pid = c->fs_uses_primitive_id; } @@ -837,8 +931,14 @@ v3d_set_prog_data(struct v3d_compile *c, prog_data->threads = c->threads; prog_data->single_seg = !c->last_thrsw; prog_data->spill_size = c->spill_size; + prog_data->tmu_spills = c->spills; + prog_data->tmu_fills = c->fills; + prog_data->tmu_count = c->tmu.total_count; + prog_data->qpu_read_stalls = c->qpu_inst_stalled_count; + prog_data->compile_strategy_idx = c->compile_strategy_idx; prog_data->tmu_dirty_rcl = c->tmu_dirty_rcl; prog_data->has_control_barrier = c->s->info.uses_control_barrier; + prog_data->has_global_address = c->has_global_address; v3d_set_prog_data_uniforms(c, prog_data); @@ -882,32 +982,32 @@ v3d_nir_lower_vs_early(struct v3d_compile *c) /* Split our I/O vars and dead code eliminate the unused * components. */ - NIR_PASS_V(c->s, nir_lower_io_to_scalar_early, - nir_var_shader_in | nir_var_shader_out); + NIR_PASS(_, c->s, nir_lower_io_to_scalar_early, + nir_var_shader_in | nir_var_shader_out); uint64_t used_outputs[4] = {0}; for (int i = 0; i < c->vs_key->num_used_outputs; i++) { int slot = v3d_slot_get_slot(c->vs_key->used_outputs[i]); int comp = v3d_slot_get_component(c->vs_key->used_outputs[i]); used_outputs[comp] |= 1ull << slot; } - NIR_PASS_V(c->s, nir_remove_unused_io_vars, - nir_var_shader_out, used_outputs, NULL); /* demotes to globals */ - NIR_PASS_V(c->s, nir_lower_global_vars_to_local); + NIR_PASS(_, c->s, nir_remove_unused_io_vars, + nir_var_shader_out, used_outputs, NULL); /* demotes to globals */ + NIR_PASS(_, c->s, nir_lower_global_vars_to_local); v3d_optimize_nir(c, c->s); - NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in, NULL); + NIR_PASS(_, c->s, nir_remove_dead_variables, nir_var_shader_in, NULL); /* This must go before nir_lower_io */ if (c->vs_key->per_vertex_point_size) - NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f); + NIR_PASS(_, c->s, nir_lower_point_size, 1.0f, 0.0f); - NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out, - type_size_vec4, - (nir_lower_io_options)0); + NIR_PASS(_, c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out, + type_size_vec4, + (nir_lower_io_options)0); /* clean up nir_lower_io's deref_var remains and do a constant folding pass * on the code it generated. */ - NIR_PASS_V(c->s, nir_opt_dce); - NIR_PASS_V(c->s, nir_opt_constant_folding); + NIR_PASS(_, c->s, nir_opt_dce); + NIR_PASS(_, c->s, nir_opt_constant_folding); } static void @@ -916,29 +1016,32 @@ v3d_nir_lower_gs_early(struct v3d_compile *c) /* Split our I/O vars and dead code eliminate the unused * components. 
*/ - NIR_PASS_V(c->s, nir_lower_io_to_scalar_early, - nir_var_shader_in | nir_var_shader_out); + NIR_PASS(_, c->s, nir_lower_io_to_scalar_early, + nir_var_shader_in | nir_var_shader_out); uint64_t used_outputs[4] = {0}; for (int i = 0; i < c->gs_key->num_used_outputs; i++) { int slot = v3d_slot_get_slot(c->gs_key->used_outputs[i]); int comp = v3d_slot_get_component(c->gs_key->used_outputs[i]); used_outputs[comp] |= 1ull << slot; } - NIR_PASS_V(c->s, nir_remove_unused_io_vars, - nir_var_shader_out, used_outputs, NULL); /* demotes to globals */ - NIR_PASS_V(c->s, nir_lower_global_vars_to_local); + NIR_PASS(_, c->s, nir_remove_unused_io_vars, + nir_var_shader_out, used_outputs, NULL); /* demotes to globals */ + NIR_PASS(_, c->s, nir_lower_global_vars_to_local); v3d_optimize_nir(c, c->s); - NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in, NULL); + NIR_PASS(_, c->s, nir_remove_dead_variables, nir_var_shader_in, NULL); /* This must go before nir_lower_io */ if (c->gs_key->per_vertex_point_size) - NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f); + NIR_PASS(_, c->s, nir_lower_point_size, 1.0f, 0.0f); - NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out, - type_size_vec4, - (nir_lower_io_options)0); - /* clean up nir_lower_io's deref_var remains */ - NIR_PASS_V(c->s, nir_opt_dce); + NIR_PASS(_, c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out, + type_size_vec4, + (nir_lower_io_options)0); + /* clean up nir_lower_io's deref_var remains and do a constant folding pass + * on the code it generated. + */ + NIR_PASS(_, c->s, nir_opt_dce); + NIR_PASS(_, c->s, nir_opt_constant_folding); } static void @@ -977,11 +1080,11 @@ v3d_nir_lower_fs_early(struct v3d_compile *c) if (c->fs_key->int_color_rb || c->fs_key->uint_color_rb) v3d_fixup_fs_output_types(c); - NIR_PASS_V(c->s, v3d_nir_lower_logic_ops, c); + NIR_PASS(_, c->s, v3d_nir_lower_logic_ops, c); if (c->fs_key->line_smoothing) { - v3d_nir_lower_line_smooth(c->s); - NIR_PASS_V(c->s, nir_lower_global_vars_to_local); + NIR_PASS(_, c->s, v3d_nir_lower_line_smooth); + NIR_PASS(_, c->s, nir_lower_global_vars_to_local); /* The lowering pass can introduce new sysval reads */ nir_shader_gather_info(c->s, nir_shader_get_entrypoint(c->s)); } @@ -991,26 +1094,26 @@ static void v3d_nir_lower_gs_late(struct v3d_compile *c) { if (c->key->ucp_enables) { - NIR_PASS_V(c->s, nir_lower_clip_gs, c->key->ucp_enables, - false, NULL); + NIR_PASS(_, c->s, nir_lower_clip_gs, c->key->ucp_enables, + true, NULL); } /* Note: GS output scalarizing must happen after nir_lower_clip_gs. */ - NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out); + NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL); } static void v3d_nir_lower_vs_late(struct v3d_compile *c) { if (c->key->ucp_enables) { - NIR_PASS_V(c->s, nir_lower_clip_vs, c->key->ucp_enables, - false, false, NULL); + NIR_PASS(_, c->s, nir_lower_clip_vs, c->key->ucp_enables, + false, true, NULL); NIR_PASS_V(c->s, nir_lower_io_to_scalar, - nir_var_shader_out); + nir_var_shader_out, NULL, NULL); } /* Note: VS output scalarizing must happen after nir_lower_clip_vs. */ - NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out); + NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL); } static void @@ -1024,9 +1127,9 @@ v3d_nir_lower_fs_late(struct v3d_compile *c) * are using. 
*/ if (c->key->ucp_enables) - NIR_PASS_V(c->s, nir_lower_clip_fs, c->key->ucp_enables, true); + NIR_PASS(_, c->s, nir_lower_clip_fs, c->key->ucp_enables, true); - NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in); + NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in, NULL, NULL); } static uint32_t @@ -1107,6 +1210,69 @@ v3d_intrinsic_dependency_cb(nir_intrinsic_instr *intr, return false; } +static unsigned +v3d_instr_delay_cb(nir_instr *instr, void *data) +{ + struct v3d_compile *c = (struct v3d_compile *) data; + + switch (instr->type) { + case nir_instr_type_undef: + case nir_instr_type_load_const: + case nir_instr_type_alu: + case nir_instr_type_deref: + case nir_instr_type_jump: + case nir_instr_type_parallel_copy: + case nir_instr_type_call: + case nir_instr_type_phi: + return 1; + + /* We should not use very large delays for TMU instructions. Typically, + * thread switches will be sufficient to hide all or most of the latency, + * so we typically only need a little bit of extra room. If we over-estimate + * the latency here we may end up unnecessarily delaying the critical path in + * the shader, which would have a negative effect in performance, so here + * we are trying to strike a balance based on empirical testing. + */ + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (!c->disable_general_tmu_sched) { + switch (intr->intrinsic) { + case nir_intrinsic_decl_reg: + case nir_intrinsic_load_reg: + case nir_intrinsic_store_reg: + return 0; + case nir_intrinsic_load_ssbo: + case nir_intrinsic_load_scratch: + case nir_intrinsic_load_shared: + case nir_intrinsic_image_load: + return 3; + case nir_intrinsic_load_ubo: + if (nir_src_is_divergent(intr->src[1])) + return 3; + FALLTHROUGH; + default: + return 1; + } + } else { + switch (intr->intrinsic) { + case nir_intrinsic_decl_reg: + case nir_intrinsic_load_reg: + case nir_intrinsic_store_reg: + return 0; + default: + return 1; + } + } + break; + } + + case nir_instr_type_tex: + return 5; + } + + return 0; +} + static bool should_split_wrmask(const nir_instr *instr, const void *data) { @@ -1197,7 +1363,7 @@ v3d_nir_sort_constant_ubo_load(nir_block *block, nir_intrinsic_instr *ref) * reference offset, since otherwise we would not be able to * skip the unifa write for them. See ntq_emit_load_ubo_unifa. 
*/ - if (abs(ref_offset - offset) > MAX_UNIFA_SKIP_DISTANCE) + if (abs((int)(ref_offset - offset)) > MAX_UNIFA_SKIP_DISTANCE) continue; /* We will move this load if its offset is smaller than ref's @@ -1349,16 +1515,14 @@ v3d_nir_sort_constant_ubo_loads_block(struct v3d_compile *c, static bool v3d_nir_sort_constant_ubo_loads(nir_shader *s, struct v3d_compile *c) { - nir_foreach_function(function, s) { - if (function->impl) { - nir_foreach_block(block, function->impl) { - c->sorted_any_ubo_loads |= - v3d_nir_sort_constant_ubo_loads_block(c, block); - } - nir_metadata_preserve(function->impl, - nir_metadata_block_index | - nir_metadata_dominance); + nir_foreach_function_impl(impl, s) { + nir_foreach_block(block, impl) { + c->sorted_any_ubo_loads |= + v3d_nir_sort_constant_ubo_loads_block(c, block); } + nir_metadata_preserve(impl, + nir_metadata_block_index | + nir_metadata_dominance); } return c->sorted_any_ubo_loads; } @@ -1376,8 +1540,8 @@ lower_load_num_subgroups(struct v3d_compile *c, DIV_ROUND_UP(c->s->info.workgroup_size[0] * c->s->info.workgroup_size[1] * c->s->info.workgroup_size[2], V3D_CHANNELS); - nir_ssa_def *result = nir_imm_int(b, num_subgroups); - nir_ssa_def_rewrite_uses(&intr->dest.ssa, result); + nir_def *result = nir_imm_int(b, num_subgroups); + nir_def_rewrite_uses(&intr->def, result); nir_instr_remove(&intr->instr); } @@ -1404,6 +1568,36 @@ lower_subgroup_intrinsics(struct v3d_compile *c, case nir_intrinsic_load_subgroup_size: case nir_intrinsic_load_subgroup_invocation: case nir_intrinsic_elect: + case nir_intrinsic_ballot: + case nir_intrinsic_inverse_ballot: + case nir_intrinsic_ballot_bitfield_extract: + case nir_intrinsic_ballot_bit_count_reduce: + case nir_intrinsic_ballot_find_lsb: + case nir_intrinsic_ballot_find_msb: + case nir_intrinsic_ballot_bit_count_exclusive: + case nir_intrinsic_ballot_bit_count_inclusive: + case nir_intrinsic_reduce: + case nir_intrinsic_inclusive_scan: + case nir_intrinsic_exclusive_scan: + case nir_intrinsic_read_invocation: + case nir_intrinsic_read_first_invocation: + case nir_intrinsic_load_subgroup_eq_mask: + case nir_intrinsic_load_subgroup_ge_mask: + case nir_intrinsic_load_subgroup_gt_mask: + case nir_intrinsic_load_subgroup_le_mask: + case nir_intrinsic_load_subgroup_lt_mask: + case nir_intrinsic_shuffle: + case nir_intrinsic_shuffle_xor: + case nir_intrinsic_shuffle_up: + case nir_intrinsic_shuffle_down: + case nir_intrinsic_vote_all: + case nir_intrinsic_vote_any: + case nir_intrinsic_vote_feq: + case nir_intrinsic_vote_ieq: + case nir_intrinsic_quad_broadcast: + case nir_intrinsic_quad_swap_horizontal: + case nir_intrinsic_quad_swap_vertical: + case nir_intrinsic_quad_swap_diagonal: c->has_subgroups = true; break; default: @@ -1418,18 +1612,15 @@ static bool v3d_nir_lower_subgroup_intrinsics(nir_shader *s, struct v3d_compile *c) { bool progress = false; - nir_foreach_function(function, s) { - if (function->impl) { - nir_builder b; - nir_builder_init(&b, function->impl); + nir_foreach_function_impl(impl, s) { + nir_builder b = nir_builder_create(impl); - nir_foreach_block(block, function->impl) - progress |= lower_subgroup_intrinsics(c, block, &b); + nir_foreach_block(block, impl) + progress |= lower_subgroup_intrinsics(c, block, &b); - nir_metadata_preserve(function->impl, - nir_metadata_block_index | - nir_metadata_dominance); - } + nir_metadata_preserve(impl, + nir_metadata_block_index | + nir_metadata_dominance); } return progress; } @@ -1483,30 +1674,54 @@ v3d_attempt_compile(struct v3d_compile *c) break; } - 
NIR_PASS_V(c->s, v3d_nir_lower_io, c); - NIR_PASS_V(c->s, v3d_nir_lower_txf_ms, c); - NIR_PASS_V(c->s, v3d_nir_lower_image_load_store); + NIR_PASS(_, c->s, v3d_nir_lower_io, c); + NIR_PASS(_, c->s, v3d_nir_lower_txf_ms); + NIR_PASS(_, c->s, v3d_nir_lower_image_load_store, c); + + NIR_PASS(_, c->s, nir_opt_idiv_const, 8); nir_lower_idiv_options idiv_options = { - .imprecise_32bit_lowering = true, .allow_fp16 = true, }; - NIR_PASS_V(c->s, nir_lower_idiv, &idiv_options); - - if (c->key->robust_buffer_access) { - /* v3d_nir_lower_robust_buffer_access assumes constant buffer - * indices on ubo/ssbo intrinsics so run copy propagation and - * constant folding passes before we run the lowering to warrant - * this. We also want to run the lowering before v3d_optimize to - * clean-up redundant get_buffer_size calls produced in the pass. - */ - NIR_PASS_V(c->s, nir_copy_prop); - NIR_PASS_V(c->s, nir_opt_constant_folding); - NIR_PASS_V(c->s, v3d_nir_lower_robust_buffer_access, c); + NIR_PASS(_, c->s, nir_lower_idiv, &idiv_options); + NIR_PASS(_, c->s, nir_lower_alu); + + if (c->key->robust_uniform_access || c->key->robust_storage_access || + c->key->robust_image_access) { + /* nir_lower_robust_access assumes constant buffer + * indices on ubo/ssbo intrinsics so run copy propagation and + * constant folding passes before we run the lowering to warrant + * this. We also want to run the lowering before v3d_optimize to + * clean-up redundant get_buffer_size calls produced in the pass. + */ + NIR_PASS(_, c->s, nir_copy_prop); + NIR_PASS(_, c->s, nir_opt_constant_folding); + + nir_lower_robust_access_options opts = { + .lower_image = c->key->robust_image_access, + .lower_ssbo = c->key->robust_storage_access, + .lower_ubo = c->key->robust_uniform_access, + }; + + NIR_PASS(_, c->s, nir_lower_robust_access, &opts); } - NIR_PASS_V(c->s, nir_lower_wrmasks, should_split_wrmask, c->s); + NIR_PASS(_, c->s, nir_lower_wrmasks, should_split_wrmask, c->s); - NIR_PASS_V(c->s, v3d_nir_lower_subgroup_intrinsics, c); + NIR_PASS(_, c->s, v3d_nir_lower_load_store_bitsize); + + NIR_PASS(_, c->s, v3d_nir_lower_subgroup_intrinsics, c); + + const nir_lower_subgroups_options subgroup_opts = { + .subgroup_size = V3D_CHANNELS, + .ballot_components = 1, + .ballot_bit_size = 32, + .lower_to_scalar = true, + .lower_inverse_ballot = true, + .lower_subgroup_masks = true, + .lower_relative_shuffle = true, + .lower_quad = true, + }; + NIR_PASS(_, c->s, nir_lower_subgroups, &subgroup_opts); v3d_optimize_nir(c, c->s); @@ -1519,25 +1734,25 @@ v3d_attempt_compile(struct v3d_compile *c) while (more_late_algebraic) { more_late_algebraic = false; NIR_PASS(more_late_algebraic, c->s, nir_opt_algebraic_late); - NIR_PASS_V(c->s, nir_opt_constant_folding); - NIR_PASS_V(c->s, nir_copy_prop); - NIR_PASS_V(c->s, nir_opt_dce); - NIR_PASS_V(c->s, nir_opt_cse); + NIR_PASS(_, c->s, nir_opt_constant_folding); + NIR_PASS(_, c->s, nir_copy_prop); + NIR_PASS(_, c->s, nir_opt_dce); + NIR_PASS(_, c->s, nir_opt_cse); } - NIR_PASS_V(c->s, nir_lower_bool_to_int32); - nir_convert_to_lcssa(c->s, true, true); + NIR_PASS(_, c->s, nir_lower_bool_to_int32); + NIR_PASS(_, c->s, nir_convert_to_lcssa, true, true); NIR_PASS_V(c->s, nir_divergence_analysis); - NIR_PASS_V(c->s, nir_convert_from_ssa, true); + NIR_PASS(_, c->s, nir_convert_from_ssa, true); struct nir_schedule_options schedule_options = { /* Schedule for about half our register space, to enable more * shaders to hit 4 threads. */ - .threshold = 24, + .threshold = c->threads == 4 ? 
24 : 48, /* Vertex shaders share the same memory for inputs and outputs, - * fragement and geometry shaders do not. + * fragment and geometry shaders do not. */ .stages_with_shared_io_memory = (((1 << MESA_ALL_SHADER_STAGES) - 1) & @@ -1548,11 +1763,22 @@ v3d_attempt_compile(struct v3d_compile *c) .intrinsic_cb = v3d_intrinsic_dependency_cb, .intrinsic_cb_data = c, + + .instr_delay_cb = v3d_instr_delay_cb, + .instr_delay_cb_data = c, }; NIR_PASS_V(c->s, nir_schedule, &schedule_options); if (!c->disable_constant_ubo_load_sorting) - NIR_PASS_V(c->s, v3d_nir_sort_constant_ubo_loads, c); + NIR_PASS(_, c->s, v3d_nir_sort_constant_ubo_loads, c); + + const nir_move_options buffer_opts = c->move_buffer_loads ? + (nir_move_load_ubo | nir_move_load_ssbo) : 0; + NIR_PASS(_, c->s, nir_opt_move, nir_move_load_uniform | + nir_move_const_undef | + buffer_opts); + + NIR_PASS_V(c->s, nir_trivialize_registers); v3d_nir_to_vir(c); } @@ -1611,32 +1837,28 @@ int v3d_shaderdb_dump(struct v3d_compile *c, * register allocation to any particular thread count). This is fine * because v3d_nir_to_vir will cap this to the actual minimum. */ -struct v3d_compiler_strategy { - const char *name; - uint32_t max_threads; - uint32_t min_threads; - bool disable_loop_unrolling; - bool disable_ubo_load_sorting; - bool disable_tmu_pipelining; - bool tmu_spilling_allowed; -} static const strategies[] = { - /*0*/ { "default", 4, 4, false, false, false, false }, - /*1*/ { "disable loop unrolling", 4, 4, true, false, false, false }, - /*2*/ { "disable UBO load sorting", 4, 4, true, true, false, false }, - /*3*/ { "disable TMU pipelining", 4, 4, true, true, true, false }, - /*4*/ { "lower thread count", 2, 1, false, false, false, false }, - /*5*/ { "disable loop unrolling (ltc)", 2, 1, true, false, false, false }, - /*6*/ { "disable UBO load sorting (ltc)", 2, 1, true, true, false, false }, - /*7*/ { "disable TMU pipelining (ltc)", 2, 1, true, true, true, true }, - /*8*/ { "fallback scheduler", 2, 1, true, true, true, true } +static const struct v3d_compiler_strategy strategies[] = { + /*0*/ { "default", 4, 4, false, false, false, false, false, false, 0 }, + /*1*/ { "disable general TMU sched", 4, 4, true, false, false, false, false, false, 0 }, + /*2*/ { "disable gcm", 4, 4, true, true, false, false, false, false, 0 }, + /*3*/ { "disable loop unrolling", 4, 4, true, true, true, false, false, false, 0 }, + /*4*/ { "disable UBO load sorting", 4, 4, true, true, true, true, false, false, 0 }, + /*5*/ { "disable TMU pipelining", 4, 4, true, true, true, true, false, true, 0 }, + /*6*/ { "lower thread count", 2, 1, false, false, false, false, false, false, -1 }, + /*7*/ { "disable general TMU sched (2t)", 2, 1, true, false, false, false, false, false, -1 }, + /*8*/ { "disable gcm (2t)", 2, 1, true, true, false, false, false, false, -1 }, + /*9*/ { "disable loop unrolling (2t)", 2, 1, true, true, true, false, false, false, -1 }, + /*10*/ { "Move buffer loads (2t)", 2, 1, true, true, true, true, true, false, -1 }, + /*11*/ { "disable TMU pipelining (2t)", 2, 1, true, true, true, true, true, true, -1 }, + /*12*/ { "fallback scheduler", 2, 1, true, true, true, true, true, true, -1 } }; /** * If a particular optimization didn't make any progress during a compile - * attempt disabling it alone won't allow us to compile the shader successfuly, + * attempt disabling it alone won't allow us to compile the shader successfully, * since we'll end up with the same code. Detect these scenarios so we can * avoid wasting time with useless compiles. 
We should also consider if the - * strategy changes other aspects of the compilation process though, like + * strategy changes other aspects of the compilation process though, like * spilling, and not skip it in that case. */ static bool @@ -1649,31 +1871,55 @@ skip_compile_strategy(struct v3d_compile *c, uint32_t idx) assert(idx > 0); /* Don't skip a strategy that changes spilling behavior */ - if (strategies[idx].tmu_spilling_allowed != - strategies[idx - 1].tmu_spilling_allowed) { + if (strategies[idx].max_tmu_spills != + strategies[idx - 1].max_tmu_spills) { return false; } switch (idx) { - /* Loop unrolling: skip if we didn't unroll any loops */ + /* General TMU sched.: skip if we didn't emit any TMU loads */ case 1: - case 5: + case 7: + return !c->has_general_tmu_load; + /* Global code motion: skip if nir_opt_gcm didn't make any progress */ + case 2: + case 8: + return !c->gcm_progress; + /* Loop unrolling: skip if we didn't unroll any loops */ + case 3: + case 9: return !c->unrolled_any_loops; /* UBO load sorting: skip if we didn't sort any loads */ - case 2: - case 6: + case 4: return !c->sorted_any_ubo_loads; + /* Move buffer loads: we assume any shader with difficult RA + * most likely has UBO / SSBO loads so we never try to skip. + * For now, we only try this for 2-thread compiles since it + * is expected to impact instruction counts and latency. + */ + case 10: + assert(c->threads < 4); + return false; /* TMU pipelining: skip if we didn't pipeline any TMU ops */ - case 3: - case 7: + case 5: + case 11: return !c->pipelined_any_tmu; /* Lower thread count: skip if we already tried less that 4 threads */ - case 4: + case 6: return c->threads < 4; default: return false; }; } + +static inline void +set_best_compile(struct v3d_compile **best, struct v3d_compile *c) +{ + if (*best) + vir_compile_destroy(*best); + *best = c; +} + uint64_t *v3d_compile(const struct v3d_compiler *compiler, struct v3d_key *key, struct v3d_prog_data **out_prog_data, @@ -1685,58 +1931,106 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler, uint32_t *final_assembly_size) { struct v3d_compile *c = NULL; - for (int i = 0; i < ARRAY_SIZE(strategies); i++) { + + uint32_t best_spill_fill_count = UINT32_MAX; + struct v3d_compile *best_c = NULL; + for (int32_t strat = 0; strat < ARRAY_SIZE(strategies); strat++) { /* Fallback strategy */ - if (i > 0) { + if (strat > 0) { assert(c); - if (skip_compile_strategy(c, i)) + if (skip_compile_strategy(c, strat)) continue; char *debug_msg; int ret = asprintf(&debug_msg, - "Falling back to strategy '%s' for %s", - strategies[i].name, - vir_get_stage_name(c)); + "Falling back to strategy '%s' " + "for %s prog %d/%d", + strategies[strat].name, + vir_get_stage_name(c), + c->program_id, c->variant_id); if (ret >= 0) { - if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF)) + if (V3D_DBG(PERF)) fprintf(stderr, "%s\n", debug_msg); c->debug_output(debug_msg, c->debug_output_data); free(debug_msg); } - vir_compile_destroy(c); + if (c != best_c) + vir_compile_destroy(c); } c = vir_compile_init(compiler, key, s, debug_output, debug_output_data, program_id, variant_id, - strategies[i].max_threads, - strategies[i].min_threads, - strategies[i].tmu_spilling_allowed, - strategies[i].disable_loop_unrolling, - strategies[i].disable_ubo_load_sorting, - strategies[i].disable_tmu_pipelining, - i == ARRAY_SIZE(strategies) - 1); + strat, &strategies[strat], + strat == ARRAY_SIZE(strategies) - 1); v3d_attempt_compile(c); - if (i >= ARRAY_SIZE(strategies) - 1 || - c->compilation_result != - 
V3D_COMPILATION_FAILED_REGISTER_ALLOCATION) { + /* Broken shader or driver bug */ + if (c->compilation_result == V3D_COMPILATION_FAILED) break; + + /* If we compiled without spills, choose this. + * Otherwise if this is a 4-thread compile, choose this (these + * have a very low cap on the allowed TMU spills so we assume + * it will be better than a 2-thread compile without spills). + * Otherwise, keep going while tracking the strategy with the + * lowest spill count. + */ + if (c->compilation_result == V3D_COMPILATION_SUCCEEDED) { + if (c->spills == 0 || + strategies[strat].min_threads == 4 || + V3D_DBG(OPT_COMPILE_TIME)) { + set_best_compile(&best_c, c); + break; + } else if (c->spills + c->fills < + best_spill_fill_count) { + set_best_compile(&best_c, c); + best_spill_fill_count = c->spills + c->fills; + } + + if (V3D_DBG(PERF)) { + char *debug_msg; + int ret = asprintf(&debug_msg, + "Compiled %s prog %d/%d with %d " + "spills and %d fills. Will try " + "more strategies.", + vir_get_stage_name(c), + c->program_id, c->variant_id, + c->spills, c->fills); + if (ret >= 0) { + fprintf(stderr, "%s\n", debug_msg); + c->debug_output(debug_msg, c->debug_output_data); + free(debug_msg); + } + } } + + /* Only try next streategy if we failed to register allocate + * or we had to spill. + */ + assert(c->compilation_result == + V3D_COMPILATION_FAILED_REGISTER_ALLOCATION || + c->spills > 0); } - if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF) && + /* If the best strategy was not the last, choose that */ + if (best_c && c != best_c) + set_best_compile(&c, best_c); + + if (V3D_DBG(PERF) && c->compilation_result != V3D_COMPILATION_FAILED_REGISTER_ALLOCATION && c->spills > 0) { char *debug_msg; int ret = asprintf(&debug_msg, - "Compiled %s with %d spills and %d fills", + "Compiled %s prog %d/%d with %d " + "spills and %d fills", vir_get_stage_name(c), + c->program_id, c->variant_id, c->spills, c->fills); fprintf(stderr, "%s\n", debug_msg); @@ -1747,8 +2041,12 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler, } if (c->compilation_result != V3D_COMPILATION_SUCCEEDED) { - fprintf(stderr, "Failed to compile %s with any strategy.\n", - vir_get_stage_name(c)); + fprintf(stderr, "Failed to compile %s prog %d/%d " + "with any strategy.\n", + vir_get_stage_name(c), c->program_id, c->variant_id); + + vir_compile_destroy(c); + return NULL; } struct v3d_prog_data *prog_data; @@ -1762,8 +2060,8 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler, char *shaderdb; int ret = v3d_shaderdb_dump(c, &shaderdb); if (ret >= 0) { - if (V3D_DEBUG & V3D_DEBUG_SHADERDB) - fprintf(stderr, "SHADER-DB: %s\n", shaderdb); + if (V3D_DBG(SHADERDB)) + fprintf(stderr, "SHADER-DB-%s - %s\n", s->info.name, shaderdb); c->debug_output(shaderdb, c->debug_output_data); free(shaderdb); @@ -1872,8 +2170,11 @@ try_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif) struct qinst *prev_inst = NULL; assert(c->cur_block); -#ifdef DEBUG - /* Check if the current instruction is part of the current block */ +#if MESA_DEBUG + /* We can only reuse a uniform if it was emitted in the same block, + * so callers must make sure the current instruction is being emitted + * in the current block. 
+ */ bool found = false; vir_for_each_inst(inst, c->cur_block) { if (&inst->link == c->cursor.link) { @@ -1882,7 +2183,7 @@ try_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif) } } - assert(found || list_is_empty(&c->cur_block->instructions)); + assert(found || &c->cur_block->instructions == c->cursor.link); #endif list_for_each_entry_from_rev(struct qinst, inst, c->cursor.link->prev, @@ -1900,6 +2201,12 @@ try_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif) if (!prev_inst) return false; + /* Only reuse the ldunif result if it was written to a temp register, + * otherwise there may be special restrictions (for example, ldunif + * may write directly to unifa, which is a write-only register). + */ + if (prev_inst->dst.file != QFILE_TEMP) + return false; list_for_each_entry_from(struct qinst, inst, prev_inst->link.next, &c->cur_block->instructions, link) { diff --git a/src/broadcom/compiler/vir_dump.c b/src/broadcom/compiler/vir_dump.c index 5c47bbdc1b0..631eeee52ab 100644 --- a/src/broadcom/compiler/vir_dump.c +++ b/src/broadcom/compiler/vir_dump.c @@ -182,11 +182,6 @@ vir_print_reg(struct v3d_compile *c, const struct qinst *inst, break; } - case QFILE_VPM: - fprintf(stderr, "vpm%d.%d", - reg.index / 4, reg.index % 4); - break; - case QFILE_TEMP: fprintf(stderr, "t%d", reg.index); break; @@ -197,9 +192,6 @@ static void vir_dump_sig_addr(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *instr) { - if (devinfo->ver < 41) - return; - if (!instr->sig_magic) fprintf(stderr, ".rf%d", instr->sig_addr); else { @@ -270,8 +262,8 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst) vir_print_reg(c, inst, inst->dst); fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.add.output_pack)); - unpack[0] = instr->alu.add.a_unpack; - unpack[1] = instr->alu.add.b_unpack; + unpack[0] = instr->alu.add.a.unpack; + unpack[1] = instr->alu.add.b.unpack; } else { fprintf(stderr, "%s", v3d_qpu_mul_op_name(instr->alu.mul.op)); fprintf(stderr, "%s", v3d_qpu_cond_name(instr->flags.mc)); @@ -282,8 +274,8 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst) vir_print_reg(c, inst, inst->dst); fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.mul.output_pack)); - unpack[0] = instr->alu.mul.a_unpack; - unpack[1] = instr->alu.mul.b_unpack; + unpack[0] = instr->alu.mul.a.unpack; + unpack[1] = instr->alu.mul.b.unpack; } for (int i = 0; i < nsrc; i++) { diff --git a/src/broadcom/compiler/vir_live_variables.c b/src/broadcom/compiler/vir_live_variables.c index 2fd6430a0f4..d1f44aa9cf7 100644 --- a/src/broadcom/compiler/vir_live_variables.c +++ b/src/broadcom/compiler/vir_live_variables.c @@ -179,17 +179,22 @@ vir_setup_def_use(struct v3d_compile *c) flags_inst = NULL; } - /* Payload registers: r0/1/2 contain W, centroid W, - * and Z at program start. Register allocation will - * force their nodes to R0/1/2. + /* Payload registers: for fragment shaders, W, + * centroid W, and Z will be initialized in r0/1/2 + * until v42, or r1/r2/r3 since v71. + * + * For compute shaders, payload is in r0/r2 up to v42, + * r2/r3 since v71. + * + * Register allocation will force their nodes to those + * registers. */ if (inst->src[0].file == QFILE_REG) { - switch (inst->src[0].index) { - case 0: - case 1: - case 2: + uint32_t min_payload_r = c->devinfo->ver >= 71 ? 1 : 0; + uint32_t max_payload_r = c->devinfo->ver >= 71 ? 
3 : 2; + if (inst->src[0].index >= min_payload_r && + inst->src[0].index <= max_payload_r) { c->temp_start[inst->dst.index] = 0; - break; } } @@ -306,6 +311,8 @@ vir_calculate_live_intervals(struct v3d_compile *c) vir_for_each_block(block, c) { ralloc_free(block->def); + ralloc_free(block->defin); + ralloc_free(block->defout); ralloc_free(block->use); ralloc_free(block->live_in); ralloc_free(block->live_out); diff --git a/src/broadcom/compiler/vir_opt_constant_alu.c b/src/broadcom/compiler/vir_opt_constant_alu.c index 483646f882e..dc4c8a65026 100644 --- a/src/broadcom/compiler/vir_opt_constant_alu.c +++ b/src/broadcom/compiler/vir_opt_constant_alu.c @@ -1,5 +1,5 @@ /* - * Copyright © 2021 Raspberry Pi + * Copyright © 2021 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -155,6 +155,7 @@ vir_opt_constant_alu(struct v3d_compile *c) { bool progress = false; vir_for_each_block(block, c) { + c->cur_block = block; vir_for_each_inst_safe(inst, block) { progress = try_opt_constant_alu(c, inst) || progress; } diff --git a/src/broadcom/compiler/vir_opt_copy_propagate.c b/src/broadcom/compiler/vir_opt_copy_propagate.c index c5bb6112173..611c4693ed3 100644 --- a/src/broadcom/compiler/vir_opt_copy_propagate.c +++ b/src/broadcom/compiler/vir_opt_copy_propagate.c @@ -35,7 +35,7 @@ #include "v3d_compiler.h" static bool -is_copy_mov(struct qinst *inst) +is_copy_mov(const struct v3d_device_info *devinfo, struct qinst *inst) { if (!inst) return false; @@ -62,36 +62,65 @@ is_copy_mov(struct qinst *inst) return false; } - switch (inst->src[0].file) { - case QFILE_MAGIC: - /* No copy propagating from R3/R4/R5 -- the MOVs from those - * are there to register allocate values produced into R3/4/5 - * to other regs (though hopefully r3/4/5). - */ - switch (inst->src[0].index) { - case V3D_QPU_WADDR_R3: - case V3D_QPU_WADDR_R4: - case V3D_QPU_WADDR_R5: - return false; + if (devinfo->ver == 42) { + switch (inst->src[0].file) { + case QFILE_MAGIC: + /* No copy propagating from R3/R4/R5 -- the MOVs from + * those are there to register allocate values produced + * into R3/4/5 to other regs (though hopefully r3/4/5). + */ + switch (inst->src[0].index) { + case V3D_QPU_WADDR_R3: + case V3D_QPU_WADDR_R4: + case V3D_QPU_WADDR_R5: + return false; + default: + break; + } + break; + + case QFILE_REG: + switch (inst->src[0].index) { + case 0: + case 1: + case 2: + /* MOVs from rf0/1/2 are only to track the live + * intervals for W/centroid W/Z. + */ + return false; + } + break; + default: break; } - break; - - case QFILE_REG: - switch (inst->src[0].index) { - case 0: - case 1: - case 2: - /* MOVs from rf0/1/2 are only to track the live + } else { + assert(devinfo->ver >= 71); + switch (inst->src[0].file) { + case QFILE_REG: + switch (inst->src[0].index) { + /* MOVs from rf1/2/3 are only to track the live * intervals for W/centroid W/Z. + * + * Note: rf0 can be implicitly written by ldvary + * (no temp involved), so it is not an SSA value and + * could clash with writes to other temps that are + * also allocated to rf0. In theory, that would mean + * that we can't copy propagate from it, but we handle + * this at register allocation time, preventing temps + * from being allocated to rf0 while the rf0 value from + * ldvary is still live. 
*/ - return false; - } - break; + case 1: + case 2: + case 3: + return false; + } + break; - default: - break; + default: + break; + } } return true; @@ -104,14 +133,14 @@ vir_has_unpack(struct qinst *inst, int chan) if (vir_is_add(inst)) { if (chan == 0) - return inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE; + return inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE; else - return inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE; + return inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE; } else { if (chan == 0) - return inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE; + return inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE; else - return inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE; + return inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE; } } @@ -135,7 +164,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs) */ struct qinst *mov = movs[inst->src[i].index]; if (!mov) { - if (!is_copy_mov(c->defs[inst->src[i].index])) + if (!is_copy_mov(c->devinfo, c->defs[inst->src[i].index])) continue; mov = c->defs[inst->src[i].index]; @@ -161,7 +190,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs) continue; /* these ops can't represent abs. */ - if (mov->qpu.alu.mul.a_unpack == V3D_QPU_UNPACK_ABS) { + if (mov->qpu.alu.mul.a.unpack == V3D_QPU_UNPACK_ABS) { switch (inst->qpu.alu.add.op) { case V3D_QPU_A_VFPACK: case V3D_QPU_A_FROUND: @@ -189,7 +218,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs) inst->src[i] = mov->src[0]; if (vir_has_unpack(mov, 0)) { - enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a_unpack; + enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a.unpack; vir_set_unpack(inst, i, unpack); } @@ -238,12 +267,14 @@ vir_opt_copy_propagate(struct v3d_compile *c) */ memset(movs, 0, sizeof(struct qinst *) * c->num_temps); + c->cur_block = block; vir_for_each_inst(inst, block) { + progress = try_copy_prop(c, inst, movs) || progress; apply_kills(c, movs, inst); - if (is_copy_mov(inst)) + if (is_copy_mov(c->devinfo, inst)) movs[inst->dst.index] = inst; } } diff --git a/src/broadcom/compiler/vir_opt_dead_code.c b/src/broadcom/compiler/vir_opt_dead_code.c index 64c762c88db..fd1af944427 100644 --- a/src/broadcom/compiler/vir_opt_dead_code.c +++ b/src/broadcom/compiler/vir_opt_dead_code.c @@ -52,21 +52,10 @@ dce(struct v3d_compile *c, struct qinst *inst) } static bool -has_nonremovable_reads(struct v3d_compile *c, struct qinst *inst) -{ - for (int i = 0; i < vir_get_nsrc(inst); i++) { - if (inst->src[i].file == QFILE_VPM) - return true; - } - - return false; -} - -static bool can_write_to_null(struct v3d_compile *c, struct qinst *inst) { /* The SFU instructions must write to a physical register. 
*/ - if (c->devinfo->ver >= 41 && v3d_qpu_uses_sfu(&inst->qpu)) + if (v3d_qpu_uses_sfu(&inst->qpu)) return false; return true; @@ -149,30 +138,25 @@ check_first_ldunifa(struct v3d_compile *c, } static bool -increment_unifa_address(struct v3d_compile *c, struct qblock *block, struct qinst *unifa) +increment_unifa_address(struct v3d_compile *c, struct qinst *unifa) { - struct qblock *current_block = c->cur_block; if (unifa->qpu.type == V3D_QPU_INSTR_TYPE_ALU && unifa->qpu.alu.mul.op == V3D_QPU_M_MOV) { c->cursor = vir_after_inst(unifa); - c->cur_block = block; struct qreg unifa_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA); vir_ADD_dest(c, unifa_reg, unifa->src[0], vir_uniform_ui(c, 4u)); vir_remove_instruction(c, unifa); - c->cur_block = current_block; return true; } if (unifa->qpu.type == V3D_QPU_INSTR_TYPE_ALU && unifa->qpu.alu.add.op == V3D_QPU_A_ADD) { c->cursor = vir_after_inst(unifa); - c->cur_block = block; struct qreg unifa_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA); struct qreg tmp = vir_ADD(c, unifa->src[1], vir_uniform_ui(c, 4u)); vir_ADD_dest(c, unifa_reg, unifa->src[0], tmp); vir_remove_instruction(c, unifa); - c->cur_block = current_block; return true; } @@ -200,7 +184,7 @@ vir_opt_dead_code(struct v3d_compile *c) vir_for_each_block(block, c) { struct qinst *last_flags_write = NULL; - + c->cur_block = block; vir_for_each_inst_safe(inst, block) { /* If this instruction reads the flags, we can't * remove the flags generation for it. @@ -246,7 +230,6 @@ vir_opt_dead_code(struct v3d_compile *c) } if (v3d_qpu_writes_flags(&inst->qpu) || - has_nonremovable_reads(c, inst) || (is_ldunifa && !is_first_ldunifa && !is_last_ldunifa)) { /* If we can't remove the instruction, but we * don't need its destination value, just @@ -276,7 +259,7 @@ vir_opt_dead_code(struct v3d_compile *c) */ if (is_first_ldunifa) { assert(unifa); - if (!increment_unifa_address(c, block, unifa)) + if (!increment_unifa_address(c, unifa)) continue; } diff --git a/src/broadcom/compiler/vir_opt_redundant_flags.c b/src/broadcom/compiler/vir_opt_redundant_flags.c index 4609ef9c361..6b61ed6a39a 100644 --- a/src/broadcom/compiler/vir_opt_redundant_flags.c +++ b/src/broadcom/compiler/vir_opt_redundant_flags.c @@ -81,11 +81,11 @@ vir_instr_flags_op_equal(struct qinst *a, struct qinst *b) a->qpu.flags.mpf != b->qpu.flags.mpf || a->qpu.alu.add.op != b->qpu.alu.add.op || a->qpu.alu.mul.op != b->qpu.alu.mul.op || - a->qpu.alu.add.a_unpack != b->qpu.alu.add.a_unpack || - a->qpu.alu.add.b_unpack != b->qpu.alu.add.b_unpack || + a->qpu.alu.add.a.unpack != b->qpu.alu.add.a.unpack || + a->qpu.alu.add.b.unpack != b->qpu.alu.add.b.unpack || a->qpu.alu.add.output_pack != b->qpu.alu.add.output_pack || - a->qpu.alu.mul.a_unpack != b->qpu.alu.mul.a_unpack || - a->qpu.alu.mul.b_unpack != b->qpu.alu.mul.b_unpack || + a->qpu.alu.mul.a.unpack != b->qpu.alu.mul.a.unpack || + a->qpu.alu.mul.b.unpack != b->qpu.alu.mul.b.unpack || a->qpu.alu.mul.output_pack != b->qpu.alu.mul.output_pack) { return false; } @@ -99,6 +99,7 @@ vir_opt_redundant_flags_block(struct v3d_compile *c, struct qblock *block) struct qinst *last_flags = NULL; bool progress = false; + c->cur_block = block; vir_for_each_inst(inst, block) { if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU || inst->qpu.flags.auf != V3D_QPU_UF_NONE || diff --git a/src/broadcom/compiler/vir_opt_small_immediates.c b/src/broadcom/compiler/vir_opt_small_immediates.c index 47d7722968d..56f0bf20706 100644 --- a/src/broadcom/compiler/vir_opt_small_immediates.c +++ 
b/src/broadcom/compiler/vir_opt_small_immediates.c @@ -44,7 +44,9 @@ vir_opt_small_immediates(struct v3d_compile *c) /* The small immediate value sits in the raddr B field, so we * can't have 2 small immediates in one instruction (unless * they're the same value, but that should be optimized away - * elsewhere). + * elsewhere). Since 7.x we can encode small immediates in + * any raddr field, but each instruction can still only use + * one. */ bool uses_small_imm = false; for (int i = 0; i < vir_get_nsrc(inst); i++) { @@ -80,7 +82,22 @@ vir_opt_small_immediates(struct v3d_compile *c) */ struct v3d_qpu_sig new_sig = inst->qpu.sig; uint32_t sig_packed; - new_sig.small_imm = true; + if (c->devinfo->ver == 42) { + new_sig.small_imm_b = true; + } else { + if (vir_is_add(inst)) { + if (i == 0) + new_sig.small_imm_a = true; + else + new_sig.small_imm_b = true; + } else { + if (i == 0) + new_sig.small_imm_c = true; + else + new_sig.small_imm_d = true; + } + } + if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig_packed)) continue; @@ -89,7 +106,10 @@ vir_opt_small_immediates(struct v3d_compile *c) vir_dump_inst(c, inst); fprintf(stderr, "\n"); } - inst->qpu.sig.small_imm = true; + inst->qpu.sig.small_imm_a = new_sig.small_imm_a; + inst->qpu.sig.small_imm_b = new_sig.small_imm_b; + inst->qpu.sig.small_imm_c = new_sig.small_imm_c; + inst->qpu.sig.small_imm_d = new_sig.small_imm_d; inst->qpu.raddr_b = packed; inst->src[i].file = QFILE_SMALL_IMM; diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c index 08698b4ece1..53e84840899 100644 --- a/src/broadcom/compiler/vir_register_allocate.c +++ b/src/broadcom/compiler/vir_register_allocate.c @@ -26,12 +26,100 @@ #include "common/v3d_device_info.h" #include "v3d_compiler.h" -#define QPU_R(i) { .magic = false, .index = i } - #define ACC_INDEX 0 #define ACC_COUNT 6 -#define PHYS_INDEX (ACC_INDEX + ACC_COUNT) -#define PHYS_COUNT 64 + +/* RA nodes used to track RF registers with implicit writes */ +#define IMPLICIT_RF_COUNT 1 + +#define PHYS_COUNT 64 + +static uint8_t +get_phys_index(const struct v3d_device_info *devinfo) +{ + if (devinfo->has_accumulators) + return ACC_INDEX + ACC_COUNT; + else + return 0; +} + +/* ACC as accumulator */ +#define CLASS_BITS_PHYS (1 << 0) +#define CLASS_BITS_ACC (1 << 1) +#define CLASS_BITS_R5 (1 << 4) + +static uint8_t +get_class_bit_any(const struct v3d_device_info *devinfo) +{ + if (devinfo->has_accumulators) + return (CLASS_BITS_PHYS | CLASS_BITS_ACC | CLASS_BITS_R5); + else + return CLASS_BITS_PHYS; +} + +static uint8_t +filter_class_bits(const struct v3d_device_info *devinfo, uint8_t class_bits) +{ + if (!devinfo->has_accumulators) { + assert(class_bits & CLASS_BITS_PHYS); + class_bits = CLASS_BITS_PHYS; + } + return class_bits; +} + +static inline uint32_t +temp_to_node(struct v3d_compile *c, uint32_t temp) +{ + return temp + (c->devinfo->has_accumulators ? ACC_COUNT : + IMPLICIT_RF_COUNT); +} + +static inline uint32_t +node_to_temp(struct v3d_compile *c, uint32_t node) +{ + assert((c->devinfo->has_accumulators && node >= ACC_COUNT) || + (!c->devinfo->has_accumulators && node >= IMPLICIT_RF_COUNT)); + return node - (c->devinfo->has_accumulators ? 
ACC_COUNT : + IMPLICIT_RF_COUNT); +} + +static inline uint8_t +get_temp_class_bits(struct v3d_compile *c, + uint32_t temp) +{ + return c->nodes.info[temp_to_node(c, temp)].class_bits; +} + +static inline void +set_temp_class_bits(struct v3d_compile *c, + uint32_t temp, uint8_t class_bits) +{ + c->nodes.info[temp_to_node(c, temp)].class_bits = class_bits; +} + +static struct ra_class * +choose_reg_class(struct v3d_compile *c, uint8_t class_bits) +{ + if (class_bits == CLASS_BITS_PHYS) { + return c->compiler->reg_class_phys[c->thread_index]; + } else if (class_bits == (CLASS_BITS_R5)) { + assert(c->devinfo->has_accumulators); + return c->compiler->reg_class_r5[c->thread_index]; + } else if (class_bits == (CLASS_BITS_PHYS | CLASS_BITS_ACC)) { + assert(c->devinfo->has_accumulators); + return c->compiler->reg_class_phys_or_acc[c->thread_index]; + } else { + assert(class_bits == get_class_bit_any(c->devinfo)); + return c->compiler->reg_class_any[c->thread_index]; + } +} + +static inline struct ra_class * +choose_reg_class_for_temp(struct v3d_compile *c, uint32_t temp) +{ + assert(temp < c->num_temps && temp < c->nodes.alloc_count); + return choose_reg_class(c, get_temp_class_bits(c, temp)); +} static inline bool qinst_writes_tmu(const struct v3d_device_info *devinfo, @@ -46,23 +134,22 @@ static bool is_end_of_tmu_sequence(const struct v3d_device_info *devinfo, struct qinst *inst, struct qblock *block) { - if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && - inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) { - return true; - } - - if (!inst->qpu.sig.ldtmu) + /* Only tmuwt and ldtmu can finish TMU sequences */ + bool is_tmuwt = inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && + inst->qpu.alu.add.op == V3D_QPU_A_TMUWT; + bool is_ldtmu = inst->qpu.sig.ldtmu; + if (!is_tmuwt && !is_ldtmu) return false; + /* Check if this is the last tmuwt or ldtmu in the sequence */ list_for_each_entry_from(struct qinst, scan_inst, inst->link.next, &block->instructions, link) { - if (scan_inst->qpu.sig.ldtmu) - return false; + is_tmuwt = scan_inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && + scan_inst->qpu.alu.add.op == V3D_QPU_A_TMUWT; + is_ldtmu = scan_inst->qpu.sig.ldtmu; - if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && - inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) { - return true; - } + if (is_tmuwt || is_ldtmu) + return false; if (qinst_writes_tmu(devinfo, scan_inst)) return true; @@ -79,11 +166,101 @@ vir_is_mov_uniform(struct v3d_compile *c, int temp) return def && def->qpu.sig.ldunif; } +static bool +can_reconstruct_inst(struct qinst *inst) +{ + assert(inst); + + if (vir_is_add(inst)) { + switch (inst->qpu.alu.add.op) { + case V3D_QPU_A_FXCD: + case V3D_QPU_A_FYCD: + case V3D_QPU_A_XCD: + case V3D_QPU_A_YCD: + case V3D_QPU_A_IID: + case V3D_QPU_A_EIDX: + case V3D_QPU_A_TIDX: + case V3D_QPU_A_SAMPID: + /* No need to check input unpacks because none of these + * opcodes read sources. FXCD,FYCD have pack variants. 
+ */ + return inst->qpu.flags.ac == V3D_QPU_COND_NONE && + inst->qpu.flags.auf == V3D_QPU_UF_NONE && + inst->qpu.flags.apf == V3D_QPU_PF_NONE && + inst->qpu.alu.add.output_pack == V3D_QPU_PACK_NONE; + default: + return false; + } + } + + return false; +} + +static bool +can_reconstruct_temp(struct v3d_compile *c, int temp) +{ + struct qinst *def = c->defs[temp]; + return def && can_reconstruct_inst(def); +} + +static struct qreg +reconstruct_temp(struct v3d_compile *c, enum v3d_qpu_add_op op) +{ + struct qreg dest; + switch (op) { + case V3D_QPU_A_FXCD: + dest = vir_FXCD(c); + break; + case V3D_QPU_A_FYCD: + dest = vir_FYCD(c); + break; + case V3D_QPU_A_XCD: + dest = vir_XCD(c); + break; + case V3D_QPU_A_YCD: + dest = vir_YCD(c); + break; + case V3D_QPU_A_IID: + dest = vir_IID(c); + break; + case V3D_QPU_A_EIDX: + dest = vir_EIDX(c); + break; + case V3D_QPU_A_TIDX: + dest = vir_TIDX(c); + break; + case V3D_QPU_A_SAMPID: + dest = vir_SAMPID(c); + break; + default: + unreachable("Unexpected opcode for reconstruction"); + } + + return dest; +} + +enum temp_spill_type { + SPILL_TYPE_UNIFORM, + SPILL_TYPE_RECONSTRUCT, + SPILL_TYPE_TMU +}; + +static enum temp_spill_type +get_spill_type_for_temp(struct v3d_compile *c, int temp) +{ + if (vir_is_mov_uniform(c, temp)) + return SPILL_TYPE_UNIFORM; + + if (can_reconstruct_temp(c, temp)) + return SPILL_TYPE_RECONSTRUCT; + + return SPILL_TYPE_TMU; +} + static int -v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, - uint32_t *temp_to_node) +v3d_choose_spill_node(struct v3d_compile *c) { - const float tmu_scale = 5; + const float tmu_scale = 10; float block_scale = 1.0; float spill_costs[c->num_temps]; bool in_tmu_operation = false; @@ -99,7 +276,8 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, * starting output writes. */ bool no_spilling = - c->threads > 1 && started_last_seg; + (c->threads > 1 && started_last_seg) || + (c->max_tmu_spills == 0); /* Discourage spilling of TMU operations */ for (int i = 0; i < vir_get_nsrc(inst); i++) { @@ -107,7 +285,10 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, continue; int temp = inst->src[i].index; - if (vir_is_mov_uniform(c, temp)) { + enum temp_spill_type spill_type = + get_spill_type_for_temp(c, temp); + + if (spill_type != SPILL_TYPE_TMU) { spill_costs[temp] += block_scale; } else if (!no_spilling) { float tmu_op_scale = in_tmu_operation ? @@ -122,11 +303,11 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, if (inst->dst.file == QFILE_TEMP) { int temp = inst->dst.index; + enum temp_spill_type spill_type = + get_spill_type_for_temp(c, temp); - if (vir_is_mov_uniform(c, temp)) { - /* We just rematerialize the unform - * later. - */ + if (spill_type != SPILL_TYPE_TMU) { + /* We just rematerialize it later */ } else if (!no_spilling) { spill_costs[temp] += (block_scale * tmu_scale); @@ -147,10 +328,6 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, if (inst->is_last_thrsw) started_last_seg = true; - if (v3d_qpu_writes_vpm(&inst->qpu) || - v3d_qpu_uses_tlb(&inst->qpu)) - started_last_seg = true; - /* Track when we're in between a TMU setup and the * final LDTMU or TMUWT from that TMU setup. We * penalize spills during that time. @@ -163,12 +340,53 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, } } + /* We always emit a "last thrsw" to ensure all our spilling occurs + * before the last thread section. See vir_emit_last_thrsw. 
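An aside on the cost model in v3d_choose_spill_node() (not part of the patch): the pass accumulates a per-temp cost and passes it to ra_set_node_spill_cost() just below, so a higher accumulated cost makes a temp less likely to be picked by ra_get_best_spill_node(). A minimal sketch of the per-access increment, with hypothetical names and ignoring the extra penalty applied to uses inside TMU sequences:

static float
spill_cost_increment(float block_scale, bool needs_tmu_spill, bool is_def)
{
        const float tmu_scale = 10.0f; /* mirrors the constant above */

        if (!needs_tmu_spill)
                return is_def ? 0.0f : block_scale; /* rematerialized later */

        /* Real TMU spill/fill candidates: writes are penalized the most */
        return is_def ? block_scale * tmu_scale : block_scale;
}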
+ */ + assert(started_last_seg); + for (unsigned i = 0; i < c->num_temps; i++) { - if (BITSET_TEST(c->spillable, i)) - ra_set_node_spill_cost(g, temp_to_node[i], spill_costs[i]); + if (BITSET_TEST(c->spillable, i)) { + ra_set_node_spill_cost(c->g, temp_to_node(c, i), + spill_costs[i]); + } } - return ra_get_best_spill_node(g); + return ra_get_best_spill_node(c->g); +} + +static void +ensure_nodes(struct v3d_compile *c) +{ + if (c->num_temps < c->nodes.alloc_count) + return; + + c->nodes.alloc_count *= 2; + c->nodes.info = reralloc_array_size(c, + c->nodes.info, + sizeof(c->nodes.info[0]), + c->nodes.alloc_count + + MAX2(ACC_COUNT, IMPLICIT_RF_COUNT)); +} + +/* Creates the interference node for a new temp. We use this to keep the node + * list updated during the spilling process, which generates new temps/nodes. + */ +static void +add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits) +{ + ensure_nodes(c); + + int node = ra_add_node(c->g, choose_reg_class(c, class_bits)); + assert(c->devinfo->has_accumulators ? node == temp + ACC_COUNT : + node == temp + IMPLICIT_RF_COUNT); + + /* We fill the node priority after we are done inserting spills */ + c->nodes.info[node].class_bits = class_bits; + c->nodes.info[node].priority = 0; + c->nodes.info[node].is_ldunif_dst = false; + c->nodes.info[node].is_program_end = false; + c->nodes.info[node].unused = false; } /* The spill offset for this thread takes a bit of setup, so do it once at @@ -206,79 +424,224 @@ v3d_setup_spill_base(struct v3d_compile *c) vir_uniform(c, QUNIFORM_SPILL_OFFSET, 0)); /* Make sure that we don't spill the spilling setup instructions. */ - for (int i = start_num_temps; i < c->num_temps; i++) + for (int i = start_num_temps; i < c->num_temps; i++) { BITSET_CLEAR(c->spillable, i); + /* If we are spilling, update the RA map with the temps added + * by the spill setup. Our spill_base register can never be an + * accumulator because it is used for TMU spill/fill and thus + * needs to persist across thread switches. + */ + if (c->spilling) { + int temp_class = CLASS_BITS_PHYS; + if (c->devinfo->has_accumulators && + i != c->spill_base.index) { + temp_class |= CLASS_BITS_ACC; + } + add_node(c, i, temp_class); + } + } + /* Restore the current block. */ c->cur_block = current_block; c->cursor = vir_after_block(c->cur_block); } -static struct qinst * -v3d_emit_spill_tmua(struct v3d_compile *c, uint32_t spill_offset) +/** + * Computes the address for a spill/fill sequence and completes the spill/fill + * sequence by emitting the following code: + * + * ldunif.spill_offset + * add tmua spill_base spill_offset + * thrsw + * + * If the sequence is for a spill, then it will emit a tmuwt after the thrsw, + * otherwise it will emit an ldtmu to load the fill result into 'fill_dst'. + * + * The parameter 'ip' represents the ip at which the spill/fill is happening. + * This is used to disallow accumulators on temps that cross this ip boundary + * due to the new thrsw itroduced in the sequence above. + */ +static void +v3d_emit_spill_tmua(struct v3d_compile *c, + uint32_t spill_offset, + enum v3d_qpu_cond cond, + int32_t ip, + struct qreg *fill_dst) { - return vir_ADD_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA), - c->spill_base, vir_uniform_ui(c, spill_offset)); -} + assert(ip >= 0); + + /* Load a uniform with the spill offset and add it to the spill base + * to obtain the TMUA address. It can be of class ANY because we know + * we are consuming it immediately without thrsw in between. 
+ */ + assert(c->disable_ldunif_opt); + struct qreg offset = vir_uniform_ui(c, spill_offset); + add_node(c, offset.index, get_class_bit_any(c->devinfo)); + /* We always enable per-quad on spills/fills to ensure we spill + * any channels involved with helper invocations. + */ + struct qreg tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU); + struct qinst *inst = vir_ADD_dest(c, tmua, c->spill_base, offset); + inst->qpu.flags.ac = cond; + inst->ldtmu_count = 1; + inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, + 0xffffff7f); /* per-quad */ + + vir_emit_thrsw(c); + + /* If this is for a spill, emit a TMUWT otherwise a LDTMU to load the + * result of the fill. The TMUWT temp is not really read, the ldtmu + * temp will be used immediately so just like the uniform above we + * can allow accumulators. + */ + int temp_class = + filter_class_bits(c->devinfo, CLASS_BITS_PHYS | CLASS_BITS_ACC); + if (!fill_dst) { + struct qreg dst = vir_TMUWT(c); + assert(dst.file == QFILE_TEMP); + add_node(c, dst.index, temp_class); + } else { + *fill_dst = vir_LDTMU(c); + assert(fill_dst->file == QFILE_TEMP); + add_node(c, fill_dst->index, temp_class); + } + + /* Temps across the thread switch we injected can't be assigned to + * accumulators. + * + * Fills inject code before ip, so anything that starts at ip or later + * is not affected by the thrsw. Something that ends at ip will be + * affected though. + * + * Spills inject code after ip, so anything that starts strictly later + * than ip is not affected (the temp starting at ip is usually the + * spilled temp except for postponed spills). Something that ends at ip + * won't be affected either. + */ + for (int i = 0; i < c->spill_start_num_temps; i++) { + bool thrsw_cross = fill_dst ? + c->temp_start[i] < ip && c->temp_end[i] >= ip : + c->temp_start[i] <= ip && c->temp_end[i] > ip; + if (thrsw_cross) { + ra_set_node_class(c->g, temp_to_node(c, i), + choose_reg_class(c, CLASS_BITS_PHYS)); + } + } +} static void -v3d_emit_tmu_spill(struct v3d_compile *c, struct qinst *inst, - struct qinst *position, uint32_t spill_offset) +v3d_emit_tmu_spill(struct v3d_compile *c, + struct qinst *inst, + struct qreg spill_temp, + struct qinst *position, + uint32_t ip, + uint32_t spill_offset) { assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU); + assert(inst->dst.file == QFILE_TEMP); c->cursor = vir_after_inst(position); - inst->dst = vir_get_temp(c); + enum v3d_qpu_cond cond = vir_get_cond(inst); + + /* If inst and position don't match, this is a postponed spill, + * in which case we have already allocated the temp for the spill + * and we should use that, otherwise create a new temp with the + * same register class bits as the original. + */ + if (inst == position) { + uint8_t class_bits = get_temp_class_bits(c, inst->dst.index); + inst->dst = vir_get_temp(c); + add_node(c, inst->dst.index, class_bits); + } else { + inst->dst = spill_temp; + + /* If this is a postponed spill the register being spilled may + * have been written more than once including conditional + * writes, so ignore predication on the spill instruction and + * always spill the full register. 
+ */ + cond = V3D_QPU_COND_NONE; + } + struct qinst *tmp = vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD), inst->dst); tmp->qpu.flags.mc = cond; - tmp = v3d_emit_spill_tmua(c, spill_offset); - tmp->qpu.flags.ac = cond; - vir_emit_thrsw(c); - vir_TMUWT(c); + + v3d_emit_spill_tmua(c, spill_offset, cond, ip, NULL); + c->spills++; c->tmu_dirty_rcl = true; } +static inline bool +interferes(int32_t t0_start, int32_t t0_end, int32_t t1_start, int32_t t1_end) +{ + return !(t0_start >= t1_end || t1_start >= t0_end); +} + static void -v3d_spill_reg(struct v3d_compile *c, int spill_temp) +v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int *implicit_rf_nodes, + int spill_temp) { - c->spill_count++; + c->spill_start_num_temps = c->num_temps; + c->spilling = true; - bool is_uniform = vir_is_mov_uniform(c, spill_temp); + enum temp_spill_type spill_type = get_spill_type_for_temp(c, spill_temp); uint32_t spill_offset = 0; - - if (!is_uniform) { + if (spill_type == SPILL_TYPE_TMU) { spill_offset = c->spill_size; c->spill_size += V3D_CHANNELS * sizeof(uint32_t); - if (spill_offset == 0) + if (spill_offset == 0) { v3d_setup_spill_base(c); + + /* Don't allocate our spill base to rf0 to avoid + * conflicts with instructions doing implicit writes + * to that register. + */ + if (!c->devinfo->has_accumulators) { + ra_add_node_interference( + c->g, + temp_to_node(c, c->spill_base.index), + implicit_rf_nodes[0]); + } + } } struct qinst *last_thrsw = c->last_thrsw; assert(last_thrsw && last_thrsw->is_last_thrsw); - int start_num_temps = c->num_temps; - int uniform_index = ~0; - if (is_uniform) { + if (spill_type == SPILL_TYPE_UNIFORM) { struct qinst *orig_unif = c->defs[spill_temp]; uniform_index = orig_unif->uniform; } + enum v3d_qpu_add_op reconstruct_op = V3D_QPU_A_NOP; + if (spill_type == SPILL_TYPE_RECONSTRUCT) { + struct qinst *orig_def = c->defs[spill_temp]; + assert(vir_is_add(orig_def)); + reconstruct_op = orig_def->qpu.alu.add.op; + } + + uint32_t spill_node = temp_to_node(c, spill_temp); + /* We must disable the ldunif optimization if we are spilling uniforms */ bool had_disable_ldunif_opt = c->disable_ldunif_opt; c->disable_ldunif_opt = true; struct qinst *start_of_tmu_sequence = NULL; struct qinst *postponed_spill = NULL; + struct qreg postponed_spill_temp = { 0 }; vir_for_each_block(block, c) { vir_for_each_inst_safe(inst, block) { + int32_t ip = inst->ip; + /* Track when we're in between a TMU setup and the final * LDTMU or TMUWT from that TMU setup. We can't spill/fill any * temps during that time, because that involves inserting a @@ -289,7 +652,8 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp) if (is_end_of_tmu_sequence(c->devinfo, inst, block)) { if (postponed_spill) { v3d_emit_tmu_spill(c, postponed_spill, - inst, spill_offset); + postponed_spill_temp, + inst, ip, spill_offset); } start_of_tmu_sequence = NULL; @@ -302,49 +666,103 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp) } /* fills */ + int filled_src = -1; for (int i = 0; i < vir_get_nsrc(inst); i++) { if (inst->src[i].file != QFILE_TEMP || inst->src[i].index != spill_temp) { continue; } + if (filled_src >= 0) { + inst->src[i] = inst->src[filled_src]; + continue; + } + c->cursor = vir_before_inst(inst); - if (is_uniform) { + if (spill_type == SPILL_TYPE_UNIFORM) { struct qreg unif = vir_uniform(c, c->uniform_contents[uniform_index], c->uniform_data[uniform_index]); inst->src[i] = unif; + /* We are using the uniform in the + * instruction immediately after, so + * we can use any register class for it. 
+ */ + add_node(c, unif.index, + get_class_bit_any(c->devinfo)); + } else if (spill_type == SPILL_TYPE_RECONSTRUCT) { + struct qreg temp = + reconstruct_temp(c, reconstruct_op); + inst->src[i] = temp; + /* We are using the temp in the + * instruction immediately after so we + * can use ACC. + */ + int temp_class = + filter_class_bits(c->devinfo, CLASS_BITS_PHYS | + CLASS_BITS_ACC); + add_node(c, temp.index, temp_class); } else { - /* If we have a postponed spill, we don't need - * a fill as the temp would not have been - * spilled yet. + /* If we have a postponed spill, we + * don't need a fill as the temp would + * not have been spilled yet, however, + * we need to update the temp index. */ - if (postponed_spill) - continue; - if (start_of_tmu_sequence) - c->cursor = vir_before_inst(start_of_tmu_sequence); - - v3d_emit_spill_tmua(c, spill_offset); - vir_emit_thrsw(c); - inst->src[i] = vir_LDTMU(c); - c->fills++; + if (postponed_spill) { + inst->src[i] = + postponed_spill_temp; + } else { + int32_t fill_ip = ip; + if (start_of_tmu_sequence) { + c->cursor = vir_before_inst(start_of_tmu_sequence); + fill_ip = start_of_tmu_sequence->ip; + } + + v3d_emit_spill_tmua(c, spill_offset, + V3D_QPU_COND_NONE, + fill_ip, &inst->src[i]); + c->fills++; + } } + + filled_src = i; } /* spills */ if (inst->dst.file == QFILE_TEMP && inst->dst.index == spill_temp) { - if (is_uniform) { + if (spill_type != SPILL_TYPE_TMU) { c->cursor.link = NULL; vir_remove_instruction(c, inst); } else { - if (start_of_tmu_sequence) + /* If we are in the middle of a TMU + * sequence, we postpone the actual + * spill until we have finished it. We, + * still need to replace the spill temp + * with a new temp though. + */ + if (start_of_tmu_sequence) { + if (postponed_spill) { + postponed_spill->dst = + postponed_spill_temp; + } + if (!postponed_spill || + vir_get_cond(inst) == V3D_QPU_COND_NONE) { + postponed_spill_temp = + vir_get_temp(c); + add_node(c, + postponed_spill_temp.index, + c->nodes.info[spill_node].class_bits); + } postponed_spill = inst; - else - v3d_emit_tmu_spill(c, inst, inst, + } else { + v3d_emit_tmu_spill(c, inst, + postponed_spill_temp, + inst, ip, spill_offset); + } } } } @@ -358,21 +776,64 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp) /* Don't allow spilling of our spilling instructions. There's no way * they can help get things colored. */ - for (int i = start_num_temps; i < c->num_temps; i++) + for (int i = c->spill_start_num_temps; i < c->num_temps; i++) BITSET_CLEAR(c->spillable, i); + /* Reset interference for spilled node */ + ra_set_node_spill_cost(c->g, spill_node, 0); + ra_reset_node_interference(c->g, spill_node); + BITSET_CLEAR(c->spillable, spill_temp); + + /* Rebuild program ips */ + int32_t ip = 0; + vir_for_each_inst_inorder(inst, c) + inst->ip = ip++; + + /* Rebuild liveness */ + vir_calculate_live_intervals(c); + + /* Add interferences for the new spilled temps and update interferences + * for c->spill_base (since we may have modified its liveness). Also, + * update node priorities based one new liveness data. 
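A quick sanity check on the interferes() helper used by the interference rebuild below (not part of the patch): it treats the two live ranges as half-open intervals, so ranges that merely touch do not conflict:

        interferes(0, 4, 4, 8)  /* false: [0,4) and [4,8) only touch */
        interferes(0, 5, 4, 8)  /* true:  they overlap on [4,5)      */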
+ */ + uint32_t sb_temp =c->spill_base.index; + uint32_t sb_node = temp_to_node(c, sb_temp); + for (uint32_t i = 0; i < c->num_temps; i++) { + if (c->temp_end[i] == -1) + continue; + + uint32_t node_i = temp_to_node(c, i); + c->nodes.info[node_i].priority = + c->temp_end[i] - c->temp_start[i]; + + for (uint32_t j = MAX2(i + 1, c->spill_start_num_temps); + j < c->num_temps; j++) { + if (interferes(c->temp_start[i], c->temp_end[i], + c->temp_start[j], c->temp_end[j])) { + uint32_t node_j = temp_to_node(c, j); + ra_add_node_interference(c->g, node_i, node_j); + } + } + + if (spill_type == SPILL_TYPE_TMU) { + if (i != sb_temp && + interferes(c->temp_start[i], c->temp_end[i], + c->temp_start[sb_temp], c->temp_end[sb_temp])) { + ra_add_node_interference(c->g, node_i, sb_node); + } + } + } + c->disable_ldunif_opt = had_disable_ldunif_opt; + c->spilling = false; } -struct node_to_temp_map { - uint32_t temp; - uint32_t priority; -}; - struct v3d_ra_select_callback_data { + uint32_t phys_index; uint32_t next_acc; uint32_t next_phys; - struct node_to_temp_map *map; + struct v3d_ra_node_info *nodes; + const struct v3d_device_info *devinfo; }; /* Choosing accumulators improves chances of merging QPU instructions @@ -384,6 +845,9 @@ v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra, BITSET_WORD *regs, int priority) { + if (!v3d_ra->devinfo->has_accumulators) + return false; + /* Favor accumulators if we have less that this number of physical * registers. Accumulators have more restrictions (like being * invalidated through thrsw), so running out of physical registers @@ -393,7 +857,7 @@ v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra, static const int available_rf_threshold = 5; int available_rf = 0 ; for (int i = 0; i < PHYS_COUNT; i++) { - if (BITSET_TEST(regs, PHYS_INDEX + i)) + if (BITSET_TEST(regs, v3d_ra->phys_index + i)) available_rf++; if (available_rf >= available_rf_threshold) break; @@ -419,6 +883,19 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra, BITSET_WORD *regs, unsigned int *out) { + if (!v3d_ra->devinfo->has_accumulators) + return false; + + /* Choose r5 for our ldunifs if possible (nobody else can load to that + * reg, and it keeps the QPU cond field free from being occupied by + * ldunifrf). + */ + int r5 = ACC_INDEX + 5; + if (BITSET_TEST(regs, r5)) { + *out = r5; + return true; + } + /* Round-robin through our accumulators to give post-RA instruction * selection more options. */ @@ -438,12 +915,47 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra, static bool v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra, + unsigned int node, BITSET_WORD *regs, unsigned int *out) { + /* If this node is for an unused temp, ignore. */ + if (v3d_ra->nodes->info[node].unused) { + *out = 0; + return true; + } + + /* In V3D 7.x, try to assign rf0 to temps used as ldunif's dst + * so we can avoid turning them into ldunifrf (which uses the + * cond field to encode the dst and would prevent merge with + * instructions that use cond flags). + */ + if (v3d_ra->nodes->info[node].is_ldunif_dst && + BITSET_TEST(regs, v3d_ra->phys_index)) { + assert(v3d_ra->devinfo->ver >= 71); + *out = v3d_ra->phys_index; + return true; + } + + /* The last 3 instructions in a shader can't use some specific registers + * (usually early rf registers, depends on v3d version) so try to + * avoid allocating these to registers used by the last instructions + * in the shader. + */ + const uint32_t safe_rf_start = v3d_ra->devinfo->ver == 42 ? 
3 : 4; + if (v3d_ra->nodes->info[node].is_program_end && + v3d_ra->next_phys < safe_rf_start) { + v3d_ra->next_phys = safe_rf_start; + } + for (int i = 0; i < PHYS_COUNT; i++) { int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT; - int phys = PHYS_INDEX + phys_off; + + /* Try to keep rf0 available for ldunif in 7.x (see above). */ + if (v3d_ra->devinfo->ver >= 71 && phys_off == 0) + continue; + + int phys = v3d_ra->phys_index + phys_off; if (BITSET_TEST(regs, phys)) { v3d_ra->next_phys = phys_off + 1; @@ -452,6 +964,14 @@ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra, } } + /* If we couldn't allocate, do try to assign rf0 if it is available. */ + if (v3d_ra->devinfo->ver >= 71 && + BITSET_TEST(regs, v3d_ra->phys_index)) { + v3d_ra->next_phys = 1; + *out = v3d_ra->phys_index; + return true; + } + return false; } @@ -459,22 +979,14 @@ static unsigned int v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data) { struct v3d_ra_select_callback_data *v3d_ra = data; - int r5 = ACC_INDEX + 5; - - /* Choose r5 for our ldunifs if possible (nobody else can load to that - * reg, and it keeps the QPU cond field free from being occupied by - * ldunifrf). - */ - if (BITSET_TEST(regs, r5)) - return r5; unsigned int reg; - if (v3d_ra_favor_accum(v3d_ra, regs, v3d_ra->map[n].priority) && + if (v3d_ra_favor_accum(v3d_ra, regs, v3d_ra->nodes->info[n].priority) && v3d_ra_select_accum(v3d_ra, regs, ®)) { return reg; } - if (v3d_ra_select_rf(v3d_ra, regs, ®)) + if (v3d_ra_select_rf(v3d_ra, n, regs, ®)) return reg; /* If we ran out of physical registers try to assign an accumulator @@ -492,9 +1004,10 @@ vir_init_reg_sets(struct v3d_compiler *compiler) /* Allocate up to 3 regfile classes, for the ways the physical * register file can be divided up for fragment shader threading. */ - int max_thread_index = (compiler->devinfo->ver >= 40 ? 
2 : 3); + int max_thread_index = 2; + uint8_t phys_index = get_phys_index(compiler->devinfo); - compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT, + compiler->regs = ra_alloc_reg_set(compiler, phys_index + PHYS_COUNT, false); if (!compiler->regs) return false; @@ -502,31 +1015,38 @@ vir_init_reg_sets(struct v3d_compiler *compiler) for (int threads = 0; threads < max_thread_index; threads++) { compiler->reg_class_any[threads] = ra_alloc_contig_reg_class(compiler->regs, 1); - compiler->reg_class_r5[threads] = - ra_alloc_contig_reg_class(compiler->regs, 1); - compiler->reg_class_phys_or_acc[threads] = - ra_alloc_contig_reg_class(compiler->regs, 1); + if (compiler->devinfo->has_accumulators) { + compiler->reg_class_r5[threads] = + ra_alloc_contig_reg_class(compiler->regs, 1); + compiler->reg_class_phys_or_acc[threads] = + ra_alloc_contig_reg_class(compiler->regs, 1); + } compiler->reg_class_phys[threads] = ra_alloc_contig_reg_class(compiler->regs, 1); - for (int i = PHYS_INDEX; - i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) { - ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); + /* Init physical regs */ + for (int i = phys_index; + i < phys_index + (PHYS_COUNT >> threads); i++) { + if (compiler->devinfo->has_accumulators) + ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); ra_class_add_reg(compiler->reg_class_phys[threads], i); ra_class_add_reg(compiler->reg_class_any[threads], i); } - for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) { - ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); - ra_class_add_reg(compiler->reg_class_any[threads], i); + /* Init accumulator regs */ + if (compiler->devinfo->has_accumulators) { + for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) { + ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); + ra_class_add_reg(compiler->reg_class_any[threads], i); + } + /* r5 can only store a single 32-bit value, so not much can + * use it. + */ + ra_class_add_reg(compiler->reg_class_r5[threads], + ACC_INDEX + 5); + ra_class_add_reg(compiler->reg_class_any[threads], + ACC_INDEX + 5); } - /* r5 can only store a single 32-bit value, so not much can - * use it. - */ - ra_class_add_reg(compiler->reg_class_r5[threads], - ACC_INDEX + 5); - ra_class_add_reg(compiler->reg_class_any[threads], - ACC_INDEX + 5); } ra_set_finalize(compiler->regs, NULL); @@ -534,52 +1054,220 @@ vir_init_reg_sets(struct v3d_compiler *compiler) return true; } -static int -node_to_temp_priority(const void *in_a, const void *in_b) +static inline bool +tmu_spilling_allowed(struct v3d_compile *c) { - const struct node_to_temp_map *a = in_a; - const struct node_to_temp_map *b = in_b; - - return a->priority - b->priority; + return c->spills + c->fills < c->max_tmu_spills; } -/** - * Computes the number of registers to spill in a batch after a register - * allocation failure. - */ -static uint32_t -get_spill_batch_size(struct v3d_compile *c) -{ - /* Allow up to 10 spills in batches of 1 in any case to avoid any chance of - * over-spilling if the program requires few spills to compile. - */ - if (c->spill_count < 10) - return 1; - - /* If we have to spill more than that we assume performance is not going to - * be great and we shift focus to batching spills to cut down compile - * time at the expense of over-spilling. - */ - return 20; -} - -/* Don't emit spills using the TMU until we've dropped thread count first. 
We, - * may also disable spilling when certain optimizations that are known to - * increase register pressure are active so we favor recompiling with - * optimizations disabled instead of spilling. - */ -static inline bool -tmu_spilling_allowed(struct v3d_compile *c, int thread_index) +static void +update_graph_and_reg_classes_for_inst(struct v3d_compile *c, + int *acc_nodes, + int *implicit_rf_nodes, + int last_ldvary_ip, + struct qinst *inst) { - return thread_index == 0 && c->tmu_spilling_allowed; + int32_t ip = inst->ip; + assert(ip >= 0); + + /* If the instruction writes r4 (and optionally moves its + * result to a temp), nothing else can be stored in r4 across + * it. + */ + if (vir_writes_r4_implicitly(c->devinfo, inst)) { + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && c->temp_end[i] > ip) { + ra_add_node_interference(c->g, + temp_to_node(c, i), + acc_nodes[4]); + } + } + } + + /* If any instruction writes to a physical register implicitly + * nothing else can write the same register across it. + */ + if (v3d_qpu_writes_rf0_implicitly(c->devinfo, &inst->qpu)) { + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && c->temp_end[i] > ip) { + ra_add_node_interference(c->g, + temp_to_node(c, i), + implicit_rf_nodes[0]); + } + } + } + + if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) { + switch (inst->qpu.alu.add.op) { + case V3D_QPU_A_LDVPMV_IN: + case V3D_QPU_A_LDVPMV_OUT: + case V3D_QPU_A_LDVPMD_IN: + case V3D_QPU_A_LDVPMD_OUT: + case V3D_QPU_A_LDVPMP: + case V3D_QPU_A_LDVPMG_IN: + case V3D_QPU_A_LDVPMG_OUT: { + /* LDVPMs only store to temps (the MA flag + * decides whether the LDVPM is in or out) + */ + assert(inst->dst.file == QFILE_TEMP); + set_temp_class_bits(c, inst->dst.index, + CLASS_BITS_PHYS); + break; + } + + case V3D_QPU_A_RECIP: + case V3D_QPU_A_RSQRT: + case V3D_QPU_A_EXP: + case V3D_QPU_A_LOG: + case V3D_QPU_A_SIN: + case V3D_QPU_A_RSQRT2: { + /* The SFU instructions write directly to the + * phys regfile. + */ + assert(inst->dst.file == QFILE_TEMP); + set_temp_class_bits(c, inst->dst.index, + CLASS_BITS_PHYS); + break; + } + + default: + break; + } + } + + if (inst->src[0].file == QFILE_REG) { + switch (inst->src[0].index) { + case 0: + /* V3D 7.x doesn't use rf0 for thread payload */ + if (c->devinfo->ver >= 71) + break; + else + FALLTHROUGH; + case 1: + case 2: + case 3: { + /* Payload setup instructions: Force allocate + * the dst to the given register (so the MOV + * will disappear). + */ + assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV); + assert(inst->dst.file == QFILE_TEMP); + uint32_t node = temp_to_node(c, inst->dst.index); + ra_set_node_reg(c->g, node, + get_phys_index(c->devinfo) + + inst->src[0].index); + break; + } + } + } + + /* Don't allocate rf0 to temps that cross ranges where we have + * live implicit rf0 writes from ldvary. We can identify these + * by tracking the last ldvary instruction and explicit reads + * of rf0. + */ + if (c->devinfo->ver >= 71 && + ((inst->src[0].file == QFILE_REG && inst->src[0].index == 0) || + (vir_get_nsrc(inst) > 1 && + inst->src[1].file == QFILE_REG && inst->src[1].index == 0))) { + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && + c->temp_end[i] > last_ldvary_ip) { + ra_add_node_interference(c->g, + temp_to_node(c, i), + implicit_rf_nodes[0]); + } + } + } + + if (inst->dst.file == QFILE_TEMP) { + /* Only a ldunif gets to write to R5, which only has a + * single 32-bit channel of storage. 
+ * + * NOTE: ldunifa is subject to the same, however, going by + * shader-db it is best to keep r5 exclusive to ldunif, probably + * because ldunif has usually a shorter lifespan, allowing for + * more accumulator reuse and QPU merges. + */ + if (c->devinfo->has_accumulators) { + if (!inst->qpu.sig.ldunif) { + uint8_t class_bits = + get_temp_class_bits(c, inst->dst.index) & + ~CLASS_BITS_R5; + set_temp_class_bits(c, inst->dst.index, + class_bits); + + } + } else { + /* Make sure we don't allocate the ldvary's + * destination to rf0, since it would clash + * with its implicit write to that register. + */ + if (inst->qpu.sig.ldvary) { + ra_add_node_interference(c->g, + temp_to_node(c, inst->dst.index), + implicit_rf_nodes[0]); + } + /* Flag dst temps from ldunif(a) instructions + * so we can try to assign rf0 to them and avoid + * converting these to ldunif(a)rf. + */ + if (inst->qpu.sig.ldunif || inst->qpu.sig.ldunifa) { + const uint32_t dst_n = + temp_to_node(c, inst->dst.index); + c->nodes.info[dst_n].is_ldunif_dst = true; + } + } + } + + /* All accumulators are invalidated across a thread switch. */ + if (inst->qpu.sig.thrsw && c->devinfo->has_accumulators) { + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && c->temp_end[i] > ip) { + set_temp_class_bits(c, i, + CLASS_BITS_PHYS); + } + } + } } -#define CLASS_BIT_PHYS (1 << 0) -#define CLASS_BIT_ACC (1 << 1) -#define CLASS_BIT_R5 (1 << 4) -#define CLASS_BITS_ANY (CLASS_BIT_PHYS | \ - CLASS_BIT_ACC | \ - CLASS_BIT_R5) +static void +flag_program_end_nodes(struct v3d_compile *c) +{ + /* Only look for registers used in this many instructions */ + uint32_t last_set_count = 6; + + struct qblock *last_block = vir_exit_block(c); + list_for_each_entry_rev(struct qinst, inst, &last_block->instructions, link) { + if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU) + continue; + + int num_src = v3d_qpu_add_op_num_src(inst->qpu.alu.add.op); + for (int i = 0; i < num_src; i++) { + if (inst->src[i].file == QFILE_TEMP) { + int node = temp_to_node(c, inst->src[i].index); + c->nodes.info[node].is_program_end = true; + } + } + + num_src = v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op); + for (int i = 0; i < num_src; i++) { + if (inst->src[i].file == QFILE_TEMP) { + int node = temp_to_node(c, inst->src[i].index); + c->nodes.info[node].is_program_end = true; + + } + } + + if (inst->dst.file == QFILE_TEMP) { + int node = temp_to_node(c, inst->dst.index); + c->nodes.info[node].is_program_end = true; + } + + if (--last_set_count == 0) + break; + } +} /** * Returns a mapping from QFILE_TEMP indices to struct qpu_regs. @@ -587,24 +1275,37 @@ tmu_spilling_allowed(struct v3d_compile *c, int thread_index) * The return value should be freed by the caller. 
*/ struct qpu_reg * -v3d_register_allocate(struct v3d_compile *c, bool *spilled) +v3d_register_allocate(struct v3d_compile *c) { - uint32_t UNUSED start_num_temps = c->num_temps; - struct node_to_temp_map map[c->num_temps]; - uint32_t temp_to_node[c->num_temps]; - uint8_t class_bits[c->num_temps]; int acc_nodes[ACC_COUNT]; + int implicit_rf_nodes[IMPLICIT_RF_COUNT]; + + unsigned num_ra_nodes = c->num_temps; + if (c->devinfo->has_accumulators) + num_ra_nodes += ARRAY_SIZE(acc_nodes); + else + num_ra_nodes += ARRAY_SIZE(implicit_rf_nodes); + + c->nodes = (struct v3d_ra_node_info) { + .alloc_count = c->num_temps, + .info = ralloc_array_size(c, sizeof(c->nodes.info[0]), + num_ra_nodes), + }; + + uint32_t phys_index = get_phys_index(c->devinfo); + struct v3d_ra_select_callback_data callback_data = { + .phys_index = phys_index, .next_acc = 0, /* Start at RF3, to try to keep the TLB writes from using - * RF0-2. + * RF0-2. Start at RF4 in 7.x to prevent TLB writes from + * using RF2-3. */ - .next_phys = 3, - .map = map, + .next_phys = c->devinfo->ver == 42 ? 3 : 4, + .nodes = &c->nodes, + .devinfo = c->devinfo, }; - *spilled = false; - vir_calculate_live_intervals(c); /* Convert 1, 2, 4 threads to 0, 1, 2 index. @@ -612,257 +1313,163 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled) * V3D 4.x has double the physical register space, so 64 physical regs * are available at both 1x and 2x threading, and 4x has 32. */ - int thread_index = ffs(c->threads) - 1; - if (c->devinfo->ver >= 40) { - if (thread_index >= 1) - thread_index--; - } + c->thread_index = ffs(c->threads) - 1; + if (c->thread_index >= 1) + c->thread_index--; - struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs, - c->num_temps + - ARRAY_SIZE(acc_nodes)); - ra_set_select_reg_callback(g, v3d_ra_select_callback, &callback_data); + c->g = ra_alloc_interference_graph(c->compiler->regs, num_ra_nodes); + ra_set_select_reg_callback(c->g, v3d_ra_select_callback, &callback_data); /* Make some fixed nodes for the accumulators, which we will need to * interfere with when ops have implied r3/r4 writes or for the thread * switches. We could represent these as classes for the nodes to * live in, but the classes take up a lot of memory to set up, so we - * don't want to make too many. + * don't want to make too many. We use the same mechanism on platforms + * without accumulators that can have implicit writes to phys regs. 
*/ - for (int i = 0; i < ARRAY_SIZE(acc_nodes); i++) { - acc_nodes[i] = c->num_temps + i; - ra_set_node_reg(g, acc_nodes[i], ACC_INDEX + i); - } - - for (uint32_t i = 0; i < c->num_temps; i++) { - map[i].temp = i; - map[i].priority = c->temp_end[i] - c->temp_start[i]; - } - qsort(map, c->num_temps, sizeof(map[0]), node_to_temp_priority); - for (uint32_t i = 0; i < c->num_temps; i++) { - temp_to_node[map[i].temp] = i; + for (uint32_t i = 0; i < num_ra_nodes; i++) { + c->nodes.info[i].is_ldunif_dst = false; + c->nodes.info[i].is_program_end = false; + c->nodes.info[i].unused = false; + c->nodes.info[i].priority = 0; + c->nodes.info[i].class_bits = 0; + if (c->devinfo->has_accumulators && i < ACC_COUNT) { + acc_nodes[i] = i; + ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i); + } else if (!c->devinfo->has_accumulators && + i < ARRAY_SIZE(implicit_rf_nodes)) { + implicit_rf_nodes[i] = i; + ra_set_node_reg(c->g, implicit_rf_nodes[i], phys_index + i); + } else { + uint32_t t = node_to_temp(c, i); + c->nodes.info[i].priority = + c->temp_end[t] - c->temp_start[t]; + c->nodes.info[i].class_bits = + get_class_bit_any(c->devinfo); + } } - /* Figure out our register classes and preallocated registers. We - * start with any temp being able to be in any file, then instructions - * incrementally remove bits that the temp definitely can't be in. + /* Walk the instructions adding register class restrictions and + * interferences. */ - memset(class_bits, CLASS_BITS_ANY, sizeof(class_bits)); - int ip = 0; + int last_ldvary_ip = -1; vir_for_each_inst_inorder(inst, c) { - /* If the instruction writes r3/r4 (and optionally moves its - * result to a temp), nothing else can be stored in r3/r4 across - * it. + inst->ip = ip++; + + /* ldunif(a) always write to a temporary, so we have + * liveness info available to decide if rf0 is + * available for them, however, ldvary is different: + * it always writes to rf0 directly so we don't have + * liveness information for its implicit rf0 write. + * + * That means the allocator may assign rf0 to a temp + * that is defined while an implicit rf0 write from + * ldvary is still live. We fix that by manually + * tracking rf0 live ranges from ldvary instructions. */ - if (vir_writes_r3(c->devinfo, inst)) { - for (int i = 0; i < c->num_temps; i++) { - if (c->temp_start[i] < ip && - c->temp_end[i] > ip) { - ra_add_node_interference(g, - temp_to_node[i], - acc_nodes[3]); - } - } - } - if (vir_writes_r4(c->devinfo, inst)) { - for (int i = 0; i < c->num_temps; i++) { - if (c->temp_start[i] < ip && - c->temp_end[i] > ip) { - ra_add_node_interference(g, - temp_to_node[i], - acc_nodes[4]); - } - } - } - - if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) { - switch (inst->qpu.alu.add.op) { - case V3D_QPU_A_LDVPMV_IN: - case V3D_QPU_A_LDVPMV_OUT: - case V3D_QPU_A_LDVPMD_IN: - case V3D_QPU_A_LDVPMD_OUT: - case V3D_QPU_A_LDVPMP: - case V3D_QPU_A_LDVPMG_IN: - case V3D_QPU_A_LDVPMG_OUT: - /* LDVPMs only store to temps (the MA flag - * decides whether the LDVPM is in or out) - */ - assert(inst->dst.file == QFILE_TEMP); - class_bits[inst->dst.index] &= CLASS_BIT_PHYS; - break; - - case V3D_QPU_A_RECIP: - case V3D_QPU_A_RSQRT: - case V3D_QPU_A_EXP: - case V3D_QPU_A_LOG: - case V3D_QPU_A_SIN: - case V3D_QPU_A_RSQRT2: - /* The SFU instructions write directly to the - * phys regfile. 
- */ - assert(inst->dst.file == QFILE_TEMP); - class_bits[inst->dst.index] &= CLASS_BIT_PHYS; - break; - - default: - break; - } - } + if (inst->qpu.sig.ldvary) + last_ldvary_ip = ip; - if (inst->src[0].file == QFILE_REG) { - switch (inst->src[0].index) { - case 0: - case 1: - case 2: - case 3: - /* Payload setup instructions: Force allocate - * the dst to the given register (so the MOV - * will disappear). - */ - assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV); - assert(inst->dst.file == QFILE_TEMP); - ra_set_node_reg(g, - temp_to_node[inst->dst.index], - PHYS_INDEX + - inst->src[0].index); - break; - } - } - - if (inst->dst.file == QFILE_TEMP) { - /* Only a ldunif gets to write to R5, which only has a - * single 32-bit channel of storage. - */ - if (!inst->qpu.sig.ldunif) { - class_bits[inst->dst.index] &= ~CLASS_BIT_R5; - } else { - /* Until V3D 4.x, we could only load a uniform - * to r5, so we'll need to spill if uniform - * loads interfere with each other. - */ - if (c->devinfo->ver < 40) { - class_bits[inst->dst.index] &= - CLASS_BIT_R5; - } - } - } - - if (inst->qpu.sig.thrsw) { - /* All accumulators are invalidated across a thread - * switch. - */ - for (int i = 0; i < c->num_temps; i++) { - if (c->temp_start[i] < ip && c->temp_end[i] > ip) - class_bits[i] &= CLASS_BIT_PHYS; - } - } - - ip++; + update_graph_and_reg_classes_for_inst(c, acc_nodes, + implicit_rf_nodes, + last_ldvary_ip, inst); } + /* Flag the nodes that are used in the last instructions of the program + * (there are some registers that cannot be used in the last 3 + * instructions). We only do this for fragment shaders, because the idea + * is that by avoiding this conflict we may be able to emit the last + * thread switch earlier in some cases, however, in non-fragment shaders + * this won't happen because the last instructions are always VPM stores + * with a small immediate, which conflicts with other signals, + * preventing us from ever moving the thrsw earlier. + */ + if (c->s->info.stage == MESA_SHADER_FRAGMENT) + flag_program_end_nodes(c); + + /* Set the register classes for all our temporaries in the graph */ for (uint32_t i = 0; i < c->num_temps; i++) { - if (class_bits[i] == CLASS_BIT_PHYS) { - ra_set_node_class(g, temp_to_node[i], - c->compiler->reg_class_phys[thread_index]); - } else if (class_bits[i] == (CLASS_BIT_R5)) { - ra_set_node_class(g, temp_to_node[i], - c->compiler->reg_class_r5[thread_index]); - } else if (class_bits[i] == (CLASS_BIT_PHYS | CLASS_BIT_ACC)) { - ra_set_node_class(g, temp_to_node[i], - c->compiler->reg_class_phys_or_acc[thread_index]); - } else { - assert(class_bits[i] == CLASS_BITS_ANY); - ra_set_node_class(g, temp_to_node[i], - c->compiler->reg_class_any[thread_index]); - } + ra_set_node_class(c->g, temp_to_node(c, i), + choose_reg_class_for_temp(c, i)); } + /* Add register interferences based on liveness data */ for (uint32_t i = 0; i < c->num_temps; i++) { + /* And while we are here, let's also flag nodes for + * unused temps. 
+ */ + if (c->temp_start[i] > c->temp_end[i]) + c->nodes.info[temp_to_node(c, i)].unused = true; + for (uint32_t j = i + 1; j < c->num_temps; j++) { - if (!(c->temp_start[i] >= c->temp_end[j] || - c->temp_start[j] >= c->temp_end[i])) { - ra_add_node_interference(g, - temp_to_node[i], - temp_to_node[j]); + if (interferes(c->temp_start[i], c->temp_end[i], + c->temp_start[j], c->temp_end[j])) { + ra_add_node_interference(c->g, + temp_to_node(c, i), + temp_to_node(c, j)); } } } - /* Debug code to force a bit of register spilling, for running across - * conformance tests to make sure that spilling works. + /* Debug option to force a bit of TMU spilling, for running + * across conformance tests to make sure that spilling works. */ - int force_register_spills = 0; - if (c->spill_size < - V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) { - int node = v3d_choose_spill_node(c, g, temp_to_node); - if (node != -1) { - v3d_spill_reg(c, map[node].temp); - ralloc_free(g); - *spilled = true; - return NULL; + const int force_register_spills = 0; + if (force_register_spills > 0) + c->max_tmu_spills = UINT32_MAX; + + struct qpu_reg *temp_registers = NULL; + while (true) { + if (c->spill_size < + V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) { + int node = v3d_choose_spill_node(c); + uint32_t temp = node_to_temp(c, node); + if (node != -1) { + v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp); + continue; + } } - } - - bool ok = ra_allocate(g); - if (!ok) { - const uint32_t spill_batch_size = get_spill_batch_size(c); - - for (uint32_t i = 0; i < spill_batch_size; i++) { - int node = v3d_choose_spill_node(c, g, temp_to_node); - if (node == -1) - break; - - /* TMU spills inject thrsw signals that invalidate - * accumulators, so we can't batch them. - */ - bool is_uniform = vir_is_mov_uniform(c, map[node].temp); - if (i > 0 && !is_uniform) - break; - if (is_uniform || tmu_spilling_allowed(c, thread_index)) { - v3d_spill_reg(c, map[node].temp); - - /* Ask the outer loop to call back in. */ - *spilled = true; + if (ra_allocate(c->g)) + break; - /* See comment above about batching TMU spills. - */ - if (!is_uniform) { - assert(i == 0); - break; - } - } else { - break; - } + /* Failed allocation, try to spill */ + int node = v3d_choose_spill_node(c); + if (node == -1) + goto spill_fail; + + uint32_t temp = node_to_temp(c, node); + enum temp_spill_type spill_type = + get_spill_type_for_temp(c, temp); + if (spill_type != SPILL_TYPE_TMU || tmu_spilling_allowed(c)) { + v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp); + if (c->spills + c->fills > c->max_tmu_spills) + goto spill_fail; + } else { + goto spill_fail; } - - ralloc_free(g); - return NULL; } - /* Ensure that we are not accessing temp_to_node out of bounds. We - * should never trigger this assertion because `c->num_temps` only - * grows when we spill, in which case we return early and don't get - * here. 
- */ - assert(start_num_temps == c->num_temps); - struct qpu_reg *temp_registers = calloc(c->num_temps, - sizeof(*temp_registers)); - + /* Allocation was successful, build the 'temp -> reg' map */ + temp_registers = calloc(c->num_temps, sizeof(*temp_registers)); for (uint32_t i = 0; i < c->num_temps; i++) { - int ra_reg = ra_get_node_reg(g, temp_to_node[i]); - if (ra_reg < PHYS_INDEX) { + int ra_reg = ra_get_node_reg(c->g, temp_to_node(c, i)); + if (ra_reg < phys_index) { temp_registers[i].magic = true; temp_registers[i].index = (V3D_QPU_WADDR_R0 + ra_reg - ACC_INDEX); } else { temp_registers[i].magic = false; - temp_registers[i].index = ra_reg - PHYS_INDEX; + temp_registers[i].index = ra_reg - phys_index; } } - ralloc_free(g); - +spill_fail: + ralloc_free(c->nodes.info); + c->nodes.info = NULL; + c->nodes.alloc_count = 0; + ralloc_free(c->g); + c->g = NULL; return temp_registers; } diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c index aa33545420e..605c3e4c7d5 100644 --- a/src/broadcom/compiler/vir_to_qpu.c +++ b/src/broadcom/compiler/vir_to_qpu.c @@ -45,12 +45,6 @@ qpu_magic(enum v3d_qpu_waddr waddr) return reg; } -static inline struct qpu_reg -qpu_acc(int acc) -{ - return qpu_magic(V3D_QPU_WADDR_R0 + acc); -} - struct v3d_qpu_instr v3d_qpu_nop(void) { @@ -92,15 +86,32 @@ new_qpu_nop_before(struct qinst *inst) return q; } +static void +v3d71_set_src(struct v3d_qpu_instr *instr, uint8_t *raddr, struct qpu_reg src) +{ + /* If we have a small immediate move it from inst->raddr_b to the + * corresponding raddr. + */ + if (src.smimm) { + assert(instr->sig.small_imm_a || instr->sig.small_imm_b || + instr->sig.small_imm_c || instr->sig.small_imm_d); + *raddr = instr->raddr_b; + return; + } + + assert(!src.magic); + *raddr = src.index; +} + /** * Allocates the src register (accumulator or register file) into the RADDR * fields of the instruction. */ static void -set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) +v3d42_set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) { if (src.smimm) { - assert(instr->sig.small_imm); + assert(instr->sig.small_imm_b); *mux = V3D_QPU_MUX_B; return; } @@ -112,20 +123,20 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) return; } - if (instr->alu.add.a != V3D_QPU_MUX_A && - instr->alu.add.b != V3D_QPU_MUX_A && - instr->alu.mul.a != V3D_QPU_MUX_A && - instr->alu.mul.b != V3D_QPU_MUX_A) { + if (instr->alu.add.a.mux != V3D_QPU_MUX_A && + instr->alu.add.b.mux != V3D_QPU_MUX_A && + instr->alu.mul.a.mux != V3D_QPU_MUX_A && + instr->alu.mul.b.mux != V3D_QPU_MUX_A) { instr->raddr_a = src.index; *mux = V3D_QPU_MUX_A; } else { if (instr->raddr_a == src.index) { *mux = V3D_QPU_MUX_A; } else { - assert(!(instr->alu.add.a == V3D_QPU_MUX_B && - instr->alu.add.b == V3D_QPU_MUX_B && - instr->alu.mul.a == V3D_QPU_MUX_B && - instr->alu.mul.b == V3D_QPU_MUX_B) || + assert(!(instr->alu.add.a.mux == V3D_QPU_MUX_B && + instr->alu.add.b.mux == V3D_QPU_MUX_B && + instr->alu.mul.a.mux == V3D_QPU_MUX_B && + instr->alu.mul.b.mux == V3D_QPU_MUX_B) || src.index == instr->raddr_b); instr->raddr_b = src.index; @@ -134,33 +145,40 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) } } -static bool -is_no_op_mov(struct qinst *qinst) +/* + * The main purpose of the following wrapper is to make calling set_src + * cleaner. This is the reason it receives both mux and raddr pointers. Those + * will be filled or not based on the device version. 
+ */ +static void +set_src(struct v3d_qpu_instr *instr, + enum v3d_qpu_mux *mux, + uint8_t *raddr, + struct qpu_reg src, + const struct v3d_device_info *devinfo) { - static const struct v3d_qpu_sig no_sig = {0}; - - /* Make sure it's just a lone MOV. */ - if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU || - qinst->qpu.alu.mul.op != V3D_QPU_M_MOV || - qinst->qpu.alu.add.op != V3D_QPU_A_NOP || - memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) { - return false; - } + if (devinfo->ver < 71) + return v3d42_set_src(instr, mux, src); + else + return v3d71_set_src(instr, raddr, src); +} - /* Check if it's a MOV from a register to itself. */ +static bool +v3d42_mov_src_and_dst_equal(struct qinst *qinst) +{ enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr; if (qinst->qpu.alu.mul.magic_write) { if (waddr < V3D_QPU_WADDR_R0 || waddr > V3D_QPU_WADDR_R4) return false; - if (qinst->qpu.alu.mul.a != + if (qinst->qpu.alu.mul.a.mux != V3D_QPU_MUX_R0 + (waddr - V3D_QPU_WADDR_R0)) { return false; } } else { int raddr; - switch (qinst->qpu.alu.mul.a) { + switch (qinst->qpu.alu.mul.a.mux) { case V3D_QPU_MUX_A: raddr = qinst->qpu.raddr_a; break; @@ -174,10 +192,61 @@ is_no_op_mov(struct qinst *qinst) return false; } + return true; +} + +static bool +v3d71_mov_src_and_dst_equal(struct qinst *qinst) +{ + if (qinst->qpu.alu.mul.magic_write) + return false; + + enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr; + int raddr; + + raddr = qinst->qpu.alu.mul.a.raddr; + if (raddr != waddr) + return false; + + return true; +} + +static bool +mov_src_and_dst_equal(struct qinst *qinst, + const struct v3d_device_info *devinfo) +{ + if (devinfo->ver < 71) + return v3d42_mov_src_and_dst_equal(qinst); + else + return v3d71_mov_src_and_dst_equal(qinst); +} + + +static bool +is_no_op_mov(struct qinst *qinst, + const struct v3d_device_info *devinfo) +{ + static const struct v3d_qpu_sig no_sig = {0}; + + /* Make sure it's just a lone MOV. We only check for M_MOV. Although + * for V3D 7.x there is also A_MOV, we don't need to check for it as + * we always emit using M_MOV. We could use A_MOV later on the + * squedule to improve performance + */ + if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU || + qinst->qpu.alu.mul.op != V3D_QPU_M_MOV || + qinst->qpu.alu.add.op != V3D_QPU_A_NOP || + memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) { + return false; + } + + if (!mov_src_and_dst_equal(qinst, devinfo)) + return false; + /* No packing or flags updates, or we need to execute the * instruction. */ - if (qinst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE || + if (qinst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE || qinst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE || qinst->qpu.flags.mc != V3D_QPU_COND_NONE || qinst->qpu.flags.mpf != V3D_QPU_PF_NONE || @@ -193,8 +262,6 @@ v3d_generate_code_block(struct v3d_compile *c, struct qblock *block, struct qpu_reg *temp_registers) { - int last_vpm_read_index = -1; - vir_for_each_inst_safe(qinst, block) { #if 0 fprintf(stderr, "translating qinst to qpu: "); @@ -202,8 +269,6 @@ v3d_generate_code_block(struct v3d_compile *c, fprintf(stderr, "\n"); #endif - struct qinst *temp; - if (vir_has_uniform(qinst)) c->num_uniforms++; @@ -219,8 +284,14 @@ v3d_generate_code_block(struct v3d_compile *c, src[i] = qpu_magic(qinst->src[i].index); break; case QFILE_NULL: + /* QFILE_NULL is an undef, so we can load + * anything. Using a reg that doesn't have + * sched. restrictions. 
+ */ + src[i] = qpu_reg(5); + break; case QFILE_LOAD_IMM: - src[i] = qpu_acc(0); + assert(!"not reached"); break; case QFILE_TEMP: src[i] = temp_registers[index]; @@ -228,18 +299,6 @@ v3d_generate_code_block(struct v3d_compile *c, case QFILE_SMALL_IMM: src[i].smimm = true; break; - - case QFILE_VPM: - assert((int)qinst->src[i].index >= - last_vpm_read_index); - (void)last_vpm_read_index; - last_vpm_read_index = qinst->src[i].index; - - temp = new_qpu_nop_before(qinst); - temp->qpu.sig.ldvpm = true; - - src[i] = qpu_acc(3); - break; } } @@ -261,10 +320,6 @@ v3d_generate_code_block(struct v3d_compile *c, dst = temp_registers[qinst->dst.index]; break; - case QFILE_VPM: - dst = qpu_magic(V3D_QPU_WADDR_VPM); - break; - case QFILE_SMALL_IMM: case QFILE_LOAD_IMM: assert(!"not reached"); @@ -276,10 +331,15 @@ v3d_generate_code_block(struct v3d_compile *c, assert(qinst->qpu.alu.add.op == V3D_QPU_A_NOP); assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP); - if (!dst.magic || - dst.index != V3D_QPU_WADDR_R5) { - assert(c->devinfo->ver >= 40); + bool use_rf; + if (c->devinfo->has_accumulators) { + use_rf = !dst.magic || + dst.index != V3D_QPU_WADDR_R5; + } else { + use_rf = dst.magic || dst.index != 0; + } + if (use_rf) { if (qinst->qpu.sig.ldunif) { qinst->qpu.sig.ldunif = false; qinst->qpu.sig.ldunifrf = true; @@ -299,13 +359,18 @@ v3d_generate_code_block(struct v3d_compile *c, qinst->qpu.sig_magic = dst.magic; } else if (qinst->qpu.alu.add.op != V3D_QPU_A_NOP) { assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP); + if (nsrc >= 1) { set_src(&qinst->qpu, - &qinst->qpu.alu.add.a, src[0]); + &qinst->qpu.alu.add.a.mux, + &qinst->qpu.alu.add.a.raddr, + src[0], c->devinfo); } if (nsrc >= 2) { set_src(&qinst->qpu, - &qinst->qpu.alu.add.b, src[1]); + &qinst->qpu.alu.add.b.mux, + &qinst->qpu.alu.add.b.raddr, + src[1], c->devinfo); } qinst->qpu.alu.add.waddr = dst.index; @@ -313,17 +378,21 @@ v3d_generate_code_block(struct v3d_compile *c, } else { if (nsrc >= 1) { set_src(&qinst->qpu, - &qinst->qpu.alu.mul.a, src[0]); + &qinst->qpu.alu.mul.a.mux, + &qinst->qpu.alu.mul.a.raddr, + src[0], c->devinfo); } if (nsrc >= 2) { set_src(&qinst->qpu, - &qinst->qpu.alu.mul.b, src[1]); + &qinst->qpu.alu.mul.b.mux, + &qinst->qpu.alu.mul.b.raddr, + src[1], c->devinfo); } qinst->qpu.alu.mul.waddr = dst.index; qinst->qpu.alu.mul.magic_write = dst.magic; - if (is_no_op_mov(qinst)) { + if (is_no_op_mov(qinst, c->devinfo)) { vir_remove_instruction(c, qinst); continue; } @@ -378,11 +447,7 @@ v3d_dump_qpu(struct v3d_compile *c) const char *str = v3d_qpu_disasm(c->devinfo, c->qpu_insts[i]); fprintf(stderr, "0x%016"PRIx64" %s", c->qpu_insts[i], str); - /* We can only do this on 4.x, because we're not tracking TMU - * implicit uniforms here on 3.x. - */ - if (c->devinfo->ver >= 40 && - reads_uniform(c->devinfo, c->qpu_insts[i])) { + if (reads_uniform(c->devinfo, c->qpu_insts[i])) { fprintf(stderr, " ("); vir_dump_uniform(c->uniform_contents[next_uniform], c->uniform_data[next_uniform]); @@ -394,8 +459,7 @@ v3d_dump_qpu(struct v3d_compile *c) } /* Make sure our dumping lined up. */ - if (c->devinfo->ver >= 40) - assert(next_uniform == c->num_uniforms); + assert(next_uniform == c->num_uniforms); fprintf(stderr, "\n"); } @@ -431,8 +495,8 @@ v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers) } assert(i == c->qpu_inst_count); - if (V3D_DEBUG & (V3D_DEBUG_QPU | - v3d_debug_flag_for_shader_stage(c->s->info.stage))) { + if (V3D_DBG(QPU) || + v3d_debug_flag_for_shader_stage(c->s->info.stage)) { v3d_dump_qpu(c); } |
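Editor's note on the new liveness check: the interference loop in v3d_register_allocate() now calls an interferes() helper instead of open-coding the overlap test on temp_start/temp_end. The helper's definition is not part of the hunks shown above; the following is a minimal sketch, assuming it simply wraps the condition the patch removes from the i/j loop (the name and arguments are taken from the call site, everything else is an assumption):

/* Sketch only: two live ranges interfere unless one of them ends at or
 * before the point where the other starts. The start/end values come from
 * vir_calculate_live_intervals().
 */
static inline bool
interferes(int a_start, int a_end, int b_start, int b_end)
{
        return !(a_start >= b_end || b_start >= a_end);
}

This is equivalent to the old inline check, so temps with disjoint live ranges can still share a register while overlapping temps get an interference edge in the graph.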
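Editor's note on the new allocation entry point: v3d_register_allocate() no longer reports spills through a bool out-parameter. It takes only the compile context, retries internally after each spill until ra_allocate() succeeds or the budget in c->max_tmu_spills is exhausted, and returns NULL on failure; per the doc comment above, the returned array must be freed by the caller. A hedged caller-side sketch follows; the helper name and the retry-with-fewer-threads fallback are assumptions about the surrounding driver code, not part of this patch:

#include <stdlib.h>
#include "v3d_compiler.h"   /* struct v3d_compile, struct qpu_reg, prototypes */

/* Hypothetical helper: illustrates only the new calling convention. */
static bool
compile_assign_registers(struct v3d_compile *c)
{
        struct qpu_reg *temp_registers = v3d_register_allocate(c);
        if (!temp_registers) {
                /* Spilling within the allowed budget was not enough; a caller
                 * would typically drop the thread count, or recompile with
                 * pressure-increasing optimizations disabled, and try again.
                 */
                return false;
        }

        /* ... emit QPU code using the temp -> register map ... */

        free(temp_registers);
        return true;
}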