Diffstat (limited to 'src/broadcom/compiler/nir_to_vir.c')
 -rw-r--r--  src/broadcom/compiler/nir_to_vir.c | 1994
 1 file changed, 1380 insertions(+), 614 deletions(-)
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c index d0a89f1a7d4..0303ca96103 100644 --- a/src/broadcom/compiler/nir_to_vir.c +++ b/src/broadcom/compiler/nir_to_vir.c @@ -38,7 +38,7 @@ #define __gen_address_type uint32_t #define __gen_address_offset(reloc) (*reloc) #define __gen_emit_reloc(cl, reloc) -#include "cle/v3d_packet_v41_pack.h" +#include "cle/v3d_packet_v42_pack.h" #define GENERAL_TMU_LOOKUP_PER_QUAD (0 << 7) #define GENERAL_TMU_LOOKUP_PER_PIXEL (1 << 7) @@ -164,7 +164,7 @@ vir_emit_thrsw(struct v3d_compile *c) c->last_thrsw->qpu.sig.thrsw = true; c->last_thrsw_at_top_level = !c->in_control_flow; - /* We need to lock the scoreboard before any tlb acess happens. If this + /* We need to lock the scoreboard before any tlb access happens. If this * thread switch comes after we have emitted a tlb load, then it means * that we can't lock on the last thread switch any more. */ @@ -187,6 +187,28 @@ v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src) } static uint32_t +v3d_general_tmu_op_for_atomic(nir_intrinsic_instr *instr) +{ + nir_atomic_op atomic_op = nir_intrinsic_atomic_op(instr); + switch (atomic_op) { + case nir_atomic_op_iadd: + return instr->intrinsic == nir_intrinsic_ssbo_atomic ? + v3d_get_op_for_atomic_add(instr, 2) : + v3d_get_op_for_atomic_add(instr, 1); + case nir_atomic_op_imin: return V3D_TMU_OP_WRITE_SMIN; + case nir_atomic_op_umin: return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR; + case nir_atomic_op_imax: return V3D_TMU_OP_WRITE_SMAX; + case nir_atomic_op_umax: return V3D_TMU_OP_WRITE_UMAX; + case nir_atomic_op_iand: return V3D_TMU_OP_WRITE_AND_READ_INC; + case nir_atomic_op_ior: return V3D_TMU_OP_WRITE_OR_READ_DEC; + case nir_atomic_op_ixor: return V3D_TMU_OP_WRITE_XOR_READ_NOT; + case nir_atomic_op_xchg: return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH; + case nir_atomic_op_cmpxchg: return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH; + default: unreachable("unknown atomic op"); + } +} + +static uint32_t v3d_general_tmu_op(nir_intrinsic_instr *instr) { switch (instr->intrinsic) { @@ -195,41 +217,21 @@ v3d_general_tmu_op(nir_intrinsic_instr *instr) case nir_intrinsic_load_uniform: case nir_intrinsic_load_shared: case nir_intrinsic_load_scratch: + case nir_intrinsic_load_global_2x32: case nir_intrinsic_store_ssbo: case nir_intrinsic_store_shared: case nir_intrinsic_store_scratch: + case nir_intrinsic_store_global_2x32: return V3D_TMU_OP_REGULAR; - case nir_intrinsic_ssbo_atomic_add: - return v3d_get_op_for_atomic_add(instr, 2); - case nir_intrinsic_shared_atomic_add: - return v3d_get_op_for_atomic_add(instr, 1); - case nir_intrinsic_ssbo_atomic_imin: - case nir_intrinsic_shared_atomic_imin: - return V3D_TMU_OP_WRITE_SMIN; - case nir_intrinsic_ssbo_atomic_umin: - case nir_intrinsic_shared_atomic_umin: - return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR; - case nir_intrinsic_ssbo_atomic_imax: - case nir_intrinsic_shared_atomic_imax: - return V3D_TMU_OP_WRITE_SMAX; - case nir_intrinsic_ssbo_atomic_umax: - case nir_intrinsic_shared_atomic_umax: - return V3D_TMU_OP_WRITE_UMAX; - case nir_intrinsic_ssbo_atomic_and: - case nir_intrinsic_shared_atomic_and: - return V3D_TMU_OP_WRITE_AND_READ_INC; - case nir_intrinsic_ssbo_atomic_or: - case nir_intrinsic_shared_atomic_or: - return V3D_TMU_OP_WRITE_OR_READ_DEC; - case nir_intrinsic_ssbo_atomic_xor: - case nir_intrinsic_shared_atomic_xor: - return V3D_TMU_OP_WRITE_XOR_READ_NOT; - case nir_intrinsic_ssbo_atomic_exchange: - case nir_intrinsic_shared_atomic_exchange: - return 
V3D_TMU_OP_WRITE_XCHG_READ_FLUSH; - case nir_intrinsic_ssbo_atomic_comp_swap: - case nir_intrinsic_shared_atomic_comp_swap: - return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH; + + case nir_intrinsic_ssbo_atomic: + case nir_intrinsic_ssbo_atomic_swap: + case nir_intrinsic_shared_atomic: + case nir_intrinsic_shared_atomic_swap: + case nir_intrinsic_global_atomic_2x32: + case nir_intrinsic_global_atomic_swap_2x32: + return v3d_general_tmu_op_for_atomic(instr); + default: unreachable("unknown intrinsic op"); } @@ -270,13 +272,13 @@ ntq_flush_tmu(struct v3d_compile *c) bool emitted_tmuwt = false; for (int i = 0; i < c->tmu.flush_count; i++) { if (c->tmu.flush[i].component_mask > 0) { - nir_dest *dest = c->tmu.flush[i].dest; - assert(dest); + nir_def *def = c->tmu.flush[i].def; + assert(def); for (int j = 0; j < 4; j++) { if (c->tmu.flush[i].component_mask & (1 << j)) { - ntq_store_dest(c, dest, j, - vir_MOV(c, vir_LDTMU(c))); + ntq_store_def(c, def, j, + vir_MOV(c, vir_LDTMU(c))); } } } else if (!emitted_tmuwt) { @@ -292,12 +294,12 @@ ntq_flush_tmu(struct v3d_compile *c) /** * Queues a pending thread switch + LDTMU/TMUWT for a TMU operation. The caller - * is reponsible for ensuring that doing this doesn't overflow the TMU fifos, + * is responsible for ensuring that doing this doesn't overflow the TMU fifos, * and more specifically, the output fifo, since that can't stall. */ void ntq_add_pending_tmu_flush(struct v3d_compile *c, - nir_dest *dest, + nir_def *def, uint32_t component_mask) { const uint32_t num_components = util_bitcount(component_mask); @@ -305,13 +307,18 @@ ntq_add_pending_tmu_flush(struct v3d_compile *c, if (num_components > 0) { c->tmu.output_fifo_size += num_components; - if (!dest->is_ssa) - _mesa_set_add(c->tmu.outstanding_regs, dest->reg.reg); + + nir_intrinsic_instr *store = nir_store_reg_for_def(def); + if (store != NULL) { + nir_def *reg = store->src[1].ssa; + _mesa_set_add(c->tmu.outstanding_regs, reg); + } } - c->tmu.flush[c->tmu.flush_count].dest = dest; + c->tmu.flush[c->tmu.flush_count].def = def; c->tmu.flush[c->tmu.flush_count].component_mask = component_mask; c->tmu.flush_count++; + c->tmu.total_count++; if (c->disable_tmu_pipelining) ntq_flush_tmu(c); @@ -342,6 +349,7 @@ emit_tmu_general_store_writes(struct v3d_compile *c, uint32_t base_const_offset, uint32_t *writemask, uint32_t *const_offset, + uint32_t *type_size, uint32_t *tmu_writes) { struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD); @@ -371,7 +379,9 @@ emit_tmu_general_store_writes(struct v3d_compile *c, /* Update the offset for the TMU write based on the * the first component we are writing. 
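
(For reference, the offset update described in the comment above reduces to simple arithmetic; a hedged host-side C sketch, not part of the patch:)

#include <stdint.h>

/* Hypothetical standalone model of the writemask/offset arithmetic in
 * emit_tmu_general_store_writes(): the TMU write offset advances to the
 * first component actually written, scaled by the source type size. */
static uint32_t tmu_store_const_offset(uint32_t base_const_offset,
                                       uint32_t writemask,
                                       uint32_t bit_size)
{
        uint32_t type_size = bit_size / 8;
        uint32_t first_component = __builtin_ffs(writemask) - 1;
        return base_const_offset + first_component * type_size;
}

/* A 16-bit store with writemask 0xc writes components 2 and 3, so the
 * offset advances by 2 * 2 bytes: tmu_store_const_offset(0, 0xc, 16) == 4. */
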
*/ - *const_offset = base_const_offset + first_component * 4; + *type_size = nir_src_bit_size(instr->src[0]) / 8; + *const_offset = + base_const_offset + first_component * (*type_size); /* Clear these components from the writemask */ uint32_t written_mask = @@ -433,6 +443,7 @@ emit_tmu_general_address_write(struct v3d_compile *c, int offset_src, struct qreg base_offset, uint32_t const_offset, + uint32_t dest_components, uint32_t *tmu_writes) { if (mode == MODE_COUNT) { @@ -478,6 +489,8 @@ emit_tmu_general_address_write(struct v3d_compile *c, if (vir_in_nonuniform_control_flow(c)) vir_set_cond(tmu, V3D_QPU_COND_IFA); + + tmu->ldtmu_count = dest_components; } /** @@ -486,7 +499,7 @@ emit_tmu_general_address_write(struct v3d_compile *c, */ static void ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, - bool is_shared_or_scratch) + bool is_shared_or_scratch, bool is_global) { uint32_t tmu_op = v3d_general_tmu_op(instr); @@ -495,25 +508,32 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, * amount to add/sub, as that is implicit. */ bool atomic_add_replaced = - ((instr->intrinsic == nir_intrinsic_ssbo_atomic_add || - instr->intrinsic == nir_intrinsic_shared_atomic_add) && + (instr->intrinsic == nir_intrinsic_ssbo_atomic || + instr->intrinsic == nir_intrinsic_shared_atomic || + instr->intrinsic == nir_intrinsic_global_atomic_2x32) && + nir_intrinsic_atomic_op(instr) == nir_atomic_op_iadd && (tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC || - tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC)); + tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC); bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo || instr->intrinsic == nir_intrinsic_store_scratch || - instr->intrinsic == nir_intrinsic_store_shared); + instr->intrinsic == nir_intrinsic_store_shared || + instr->intrinsic == nir_intrinsic_store_global_2x32); bool is_load = (instr->intrinsic == nir_intrinsic_load_uniform || instr->intrinsic == nir_intrinsic_load_ubo || instr->intrinsic == nir_intrinsic_load_ssbo || instr->intrinsic == nir_intrinsic_load_scratch || - instr->intrinsic == nir_intrinsic_load_shared); + instr->intrinsic == nir_intrinsic_load_shared || + instr->intrinsic == nir_intrinsic_load_global_2x32); if (!is_load) c->tmu_dirty_rcl = true; - bool has_index = !is_shared_or_scratch; + if (is_global) + c->has_global_address = true; + + bool has_index = !is_shared_or_scratch && !is_global; int offset_src; if (instr->intrinsic == nir_intrinsic_load_uniform) { @@ -522,6 +542,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, instr->intrinsic == nir_intrinsic_load_ubo || instr->intrinsic == nir_intrinsic_load_scratch || instr->intrinsic == nir_intrinsic_load_shared || + instr->intrinsic == nir_intrinsic_load_global_2x32 || atomic_add_replaced) { offset_src = 0 + has_index; } else if (is_store) { @@ -542,13 +563,11 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, v3d_unit_data_create(0, const_offset)); const_offset = 0; } else if (instr->intrinsic == nir_intrinsic_load_ubo) { - uint32_t index = nir_src_as_uint(instr->src[0]); - /* On OpenGL QUNIFORM_UBO_ADDR takes a UBO index - * shifted up by 1 (0 is gallium's constant buffer 0). + /* QUNIFORM_UBO_ADDR takes a UBO index shifted up by 1 (0 + * is gallium's constant buffer 0 in GL and push constants + * in Vulkan)). 
*/ - if (c->key->environment == V3D_ENVIRONMENT_OPENGL) - index++; - + uint32_t index = nir_src_as_uint(instr->src[0]) + 1; base_offset = vir_uniform(c, QUNIFORM_UBO_ADDR, v3d_unit_data_create(index, const_offset)); @@ -565,10 +584,16 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, base_offset = c->cs_shared_offset; const_offset += nir_intrinsic_base(instr); } + } else if (is_global) { + /* Global load/store intrinsics use global addresses, so the + * offset is the target address and we don't need to add it + * to a base offset. + */ + base_offset = vir_uniform_ui(c, 0); } else { + uint32_t idx = is_store ? 1 : 0; base_offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET, - nir_src_as_uint(instr->src[is_store ? - 1 : 0])); + nir_src_comp_as_uint(instr->src[idx], 0)); } /* We are ready to emit TMU register writes now, but before we actually @@ -588,16 +613,21 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, for (enum emit_mode mode = MODE_COUNT; mode != MODE_LAST; mode++) { assert(mode == MODE_COUNT || tmu_writes > 0); + uint32_t type_size = 4; + if (is_store) { emit_tmu_general_store_writes(c, mode, instr, base_const_offset, &writemask, &const_offset, + &type_size, &tmu_writes); } else if (!is_load && !atomic_add_replaced) { - emit_tmu_general_atomic_writes(c, mode, instr, - tmu_op, has_index, - &tmu_writes); + emit_tmu_general_atomic_writes(c, mode, instr, + tmu_op, has_index, + &tmu_writes); + } else if (is_load) { + type_size = instr->def.bit_size / 8; } /* For atomics we use 32bit except for CMPXCHG, that we need @@ -618,17 +648,39 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, v3d_tmu_get_type_from_op(tmu_op, !is_load) == V3D_TMU_OP_TYPE_ATOMIC; + /* Only load per-quad if we can be certain that all + * lines in the quad are active. Notice that demoted + * invocations, unlike terminated ones, are still + * active: we want to skip memory writes for them but + * loads should still work. + */ uint32_t perquad = - is_load && !vir_in_nonuniform_control_flow(c) - ? GENERAL_TMU_LOOKUP_PER_QUAD - : GENERAL_TMU_LOOKUP_PER_PIXEL; + is_load && !vir_in_nonuniform_control_flow(c) && + c->s->info.stage == MESA_SHADER_FRAGMENT && + c->s->info.fs.needs_quad_helper_invocations && + !c->emitted_discard ?
+ GENERAL_TMU_LOOKUP_PER_QUAD : + GENERAL_TMU_LOOKUP_PER_PIXEL; config = 0xffffff00 | tmu_op << 3 | perquad; if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) { config |= GENERAL_TMU_LOOKUP_TYPE_VEC2; } else if (is_atomic || num_components == 1) { - config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI; + switch (type_size) { + case 4: + config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI; + break; + case 2: + config |= GENERAL_TMU_LOOKUP_TYPE_16BIT_UI; + break; + case 1: + config |= GENERAL_TMU_LOOKUP_TYPE_8BIT_UI; + break; + default: + unreachable("Unsupported bitsize"); + } } else { + assert(type_size == 4); config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 + num_components - 2; } @@ -637,7 +689,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, emit_tmu_general_address_write(c, mode, instr, config, dynamic_src, offset_src, base_offset, const_offset, - &tmu_writes); + dest_components, &tmu_writes); assert(tmu_writes > 0); if (mode == MODE_COUNT) { @@ -660,7 +712,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, */ const uint32_t component_mask = (1 << dest_components) - 1; - ntq_add_pending_tmu_flush(c, &instr->dest, + ntq_add_pending_tmu_flush(c, &instr->def, component_mask); } } @@ -673,7 +725,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, } static struct qreg * -ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def) +ntq_init_ssa_def(struct v3d_compile *c, nir_def *def) { struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, def->num_components); @@ -717,8 +769,8 @@ is_ldunif_signal(const struct v3d_qpu_sig *sig) * its destination to be the NIR reg's destination */ void -ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan, - struct qreg result) +ntq_store_def(struct v3d_compile *c, nir_def *def, int chan, + struct qreg result) { struct qinst *last_inst = NULL; if (!list_is_empty(&c->cur_block->instructions)) @@ -731,23 +783,25 @@ ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan, assert(result.file == QFILE_TEMP && last_inst && (last_inst == c->defs[result.index] || is_reused_uniform)); - if (dest->is_ssa) { - assert(chan < dest->ssa.num_components); + nir_intrinsic_instr *store = nir_store_reg_for_def(def); + if (store == NULL) { + assert(chan < def->num_components); struct qreg *qregs; struct hash_entry *entry = - _mesa_hash_table_search(c->def_ht, &dest->ssa); + _mesa_hash_table_search(c->def_ht, def); if (entry) qregs = entry->data; else - qregs = ntq_init_ssa_def(c, &dest->ssa); + qregs = ntq_init_ssa_def(c, def); qregs[chan] = result; } else { - nir_register *reg = dest->reg.reg; - assert(dest->reg.base_offset == 0); - assert(reg->num_array_elems == 0); + nir_def *reg = store->src[1].ssa; + ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg); + assert(nir_intrinsic_base(store) == 0); + assert(nir_intrinsic_num_array_elems(decl) == 0); struct hash_entry *entry = _mesa_hash_table_search(c->def_ht, reg); struct qreg *qregs = entry->data; @@ -802,7 +856,9 @@ struct qreg ntq_get_src(struct v3d_compile *c, nir_src src, int i) { struct hash_entry *entry; - if (src.is_ssa) { + + nir_intrinsic_instr *load = nir_load_reg_for_def(src.ssa); + if (load == NULL) { assert(i < src.ssa->num_components); entry = _mesa_hash_table_search(c->def_ht, src.ssa); @@ -811,10 +867,11 @@ ntq_get_src(struct v3d_compile *c, nir_src src, int i) entry = _mesa_hash_table_search(c->def_ht, src.ssa); } } else { - nir_register *reg = src.reg.reg; - assert(reg->num_array_elems == 0); - assert(src.reg.base_offset == 0); - 
assert(i < reg->num_components); + nir_def *reg = load->src[0].ssa; + ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg); + assert(nir_intrinsic_base(load) == 0); + assert(nir_intrinsic_num_array_elems(decl) == 0); + assert(i < nir_intrinsic_num_components(decl)); if (_mesa_set_search(c->tmu.outstanding_regs, reg)) ntq_flush_tmu(c); @@ -830,13 +887,8 @@ static struct qreg ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr, unsigned src) { - assert(util_is_power_of_two_or_zero(instr->dest.write_mask)); - unsigned chan = ffs(instr->dest.write_mask) - 1; struct qreg r = ntq_get_src(c, instr->src[src].src, - instr->src[src].swizzle[chan]); - - assert(!instr->src[src].abs); - assert(!instr->src[src].negate); + instr->src[src].swizzle[0]); return r; }; @@ -876,6 +928,7 @@ ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr) case GLSL_SAMPLER_DIM_3D: case GLSL_SAMPLER_DIM_CUBE: case GLSL_SAMPLER_DIM_BUF: + case GLSL_SAMPLER_DIM_EXTERNAL: /* Don't minify the array size. */ if (!(instr->is_array && i == dest_size - 1)) { size = ntq_minify(c, size, lod); @@ -890,7 +943,7 @@ ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr) unreachable("Bad sampler type"); } - ntq_store_dest(c, &instr->dest, i, size); + ntq_store_def(c, &instr->def, i, size); } } @@ -905,12 +958,12 @@ ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) */ switch (instr->op) { case nir_texop_query_levels: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit)); return; case nir_texop_texture_samples: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_TEXTURE_SAMPLES, unit)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_TEXTURE_SAMPLES, unit)); return; case nir_texop_txs: ntq_emit_txs(c, instr); @@ -919,10 +972,7 @@ ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) break; } - if (c->devinfo->ver >= 40) - v3d40_vir_emit_tex(c, instr); - else - v3d33_vir_emit_tex(c, instr); + v3d_vir_emit_tex(c, instr); } static struct qreg @@ -963,44 +1013,43 @@ emit_fragcoord_input(struct v3d_compile *c, int attr) static struct qreg emit_smooth_varying(struct v3d_compile *c, - struct qreg vary, struct qreg w, struct qreg r5) + struct qreg vary, struct qreg w, struct qreg c_reg) { - return vir_FADD(c, vir_FMUL(c, vary, w), r5); + return vir_FADD(c, vir_FMUL(c, vary, w), c_reg); } static struct qreg emit_noperspective_varying(struct v3d_compile *c, - struct qreg vary, struct qreg r5) + struct qreg vary, struct qreg c_reg) { - return vir_FADD(c, vir_MOV(c, vary), r5); + return vir_FADD(c, vir_MOV(c, vary), c_reg); } static struct qreg emit_flat_varying(struct v3d_compile *c, - struct qreg vary, struct qreg r5) + struct qreg vary, struct qreg c_reg) { vir_MOV_dest(c, c->undef, vary); - return vir_MOV(c, r5); + return vir_MOV(c, c_reg); } static struct qreg emit_fragment_varying(struct v3d_compile *c, nir_variable *var, int8_t input_idx, uint8_t swizzle, int array_index) { - struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3); - struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5); + struct qreg c_reg; /* C coefficient */ + + if (c->devinfo->has_accumulators) + c_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5); + else + c_reg = vir_reg(QFILE_REG, 0); struct qinst *ldvary = NULL; struct qreg vary; - if (c->devinfo->ver >= 41) { - ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef, - c->undef, c->undef); - ldvary->qpu.sig.ldvary = true; - vary = vir_emit_def(c, ldvary); - } 
else { - vir_NOP(c)->qpu.sig.ldvary = true; - vary = r3; - } + ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef, + c->undef, c->undef); + ldvary->qpu.sig.ldvary = true; + vary = vir_emit_def(c, ldvary); /* Store the input value before interpolation so we can implement * GLSL's interpolateAt functions if the shader uses them. @@ -1008,7 +1057,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, if (input_idx >= 0) { assert(var); c->interp[input_idx].vp = vary; - c->interp[input_idx].C = vir_MOV(c, r5); + c->interp[input_idx].C = vir_MOV(c, c_reg); c->interp[input_idx].mode = var->data.interpolation; } @@ -1018,7 +1067,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, */ if (!var) { assert(input_idx < 0); - return emit_smooth_varying(c, vary, c->payload_w, r5); + return emit_smooth_varying(c, vary, c->payload_w, c_reg); } int i = c->num_inputs++; @@ -1033,20 +1082,20 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, if (var->data.centroid) { BITSET_SET(c->centroid_flags, i); result = emit_smooth_varying(c, vary, - c->payload_w_centroid, r5); + c->payload_w_centroid, c_reg); } else { - result = emit_smooth_varying(c, vary, c->payload_w, r5); + result = emit_smooth_varying(c, vary, c->payload_w, c_reg); } break; case INTERP_MODE_NOPERSPECTIVE: BITSET_SET(c->noperspective_flags, i); - result = emit_noperspective_varying(c, vary, r5); + result = emit_noperspective_varying(c, vary, c_reg); break; case INTERP_MODE_FLAT: BITSET_SET(c->flat_shade_flags, i); - result = emit_flat_varying(c, vary, r5); + result = emit_flat_varying(c, vary, c_reg); break; default: @@ -1163,16 +1212,6 @@ ntq_emit_comparison(struct v3d_compile *c, vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC); break; - case nir_op_i2b32: - vir_set_pf(c, vir_MOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ); - cond_invert = true; - break; - - case nir_op_f2b32: - vir_set_pf(c, vir_FMOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ); - cond_invert = true; - break; - default: return false; } @@ -1188,7 +1227,7 @@ ntq_emit_comparison(struct v3d_compile *c, static struct nir_alu_instr * ntq_get_alu_parent(nir_src src) { - if (!src.is_ssa || src.ssa->parent_instr->type != nir_instr_type_alu) + if (src.ssa->parent_instr->type != nir_instr_type_alu) return NULL; nir_alu_instr *instr = nir_instr_as_alu(src.ssa->parent_instr); if (!instr) @@ -1199,7 +1238,7 @@ ntq_get_alu_parent(nir_src src) * src. */ for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { - if (!instr->src[i].src.is_ssa) + if (nir_load_reg_for_def(instr->src[i].src.ssa)) return NULL; } @@ -1242,12 +1281,78 @@ ntq_emit_cond_to_bool(struct v3d_compile *c, enum v3d_qpu_cond cond) return result; } +static struct qreg +ntq_emit_cond_to_int(struct v3d_compile *c, enum v3d_qpu_cond cond) +{ + struct qreg result = + vir_MOV(c, vir_SEL(c, cond, + vir_uniform_ui(c, 1), + vir_uniform_ui(c, 0))); + c->flags_temp = result.index; + c->flags_cond = cond; + return result; +} + +static struct qreg +f2f16_rtz(struct v3d_compile *c, struct qreg f32) +{ + /* The GPU doesn't provide a mechanism to modify the f32->f16 rounding + * method and seems to be using RTE by default, so we need to implement + * RTZ rounding in software. 
+ */ + struct qreg rf16 = vir_FMOV(c, f32); + vir_set_pack(c->defs[rf16.index], V3D_QPU_PACK_L); + + struct qreg rf32 = vir_FMOV(c, rf16); + vir_set_unpack(c->defs[rf32.index], 0, V3D_QPU_UNPACK_L); + + struct qreg f32_abs = vir_FMOV(c, f32); + vir_set_unpack(c->defs[f32_abs.index], 0, V3D_QPU_UNPACK_ABS); + + struct qreg rf32_abs = vir_FMOV(c, rf32); + vir_set_unpack(c->defs[rf32_abs.index], 0, V3D_QPU_UNPACK_ABS); + + vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), f32_abs, rf32_abs), + V3D_QPU_PF_PUSHN); + return vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA, + vir_SUB(c, rf16, vir_uniform_ui(c, 1)), rf16)); +} + +/** + * Takes the result value of a signed integer width conversion from a smaller + * type to a larger type and if needed, it applies sign extension to it. + */ +static struct qreg +sign_extend(struct v3d_compile *c, + struct qreg value, + uint32_t src_bit_size, + uint32_t dst_bit_size) +{ + assert(src_bit_size < dst_bit_size); + + struct qreg tmp = vir_MOV(c, value); + + /* Do we need to sign-extend? */ + uint32_t sign_mask = 1 << (src_bit_size - 1); + struct qinst *sign_check = + vir_AND_dest(c, vir_nop_reg(), + tmp, vir_uniform_ui(c, sign_mask)); + vir_set_pf(c, sign_check, V3D_QPU_PF_PUSHZ); + + /* If so, fill in leading sign bits */ + uint32_t extend_bits = ~(((1 << src_bit_size) - 1)) & + ((1ull << dst_bit_size) - 1); + struct qinst *extend_inst = + vir_OR_dest(c, tmp, tmp, + vir_uniform_ui(c, extend_bits)); + vir_set_cond(extend_inst, V3D_QPU_COND_IFNA); + + return tmp; +} + static void ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) { - /* This should always be lowered to ALU operations for V3D. */ - assert(!instr->dest.saturate); - /* Vectors are special in that they have non-scalarized writemasks, * and just take the first swizzle channel for each argument in order * into each writemask channel. @@ -1260,8 +1365,8 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) srcs[i] = ntq_get_src(c, instr->src[i].src, instr->src[i].swizzle[0]); for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) - ntq_store_dest(c, &instr->dest.dest, i, - vir_MOV(c, srcs[i])); + ntq_store_def(c, &instr->def, i, + vir_MOV(c, srcs[i])); return; } @@ -1327,6 +1432,94 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) result = vir_AND(c, src[0], vir_uniform_ui(c, 1)); break; + case nir_op_f2f16: + case nir_op_f2f16_rtne: + assert(nir_src_bit_size(instr->src[0].src) == 32); + result = vir_FMOV(c, src[0]); + vir_set_pack(c->defs[result.index], V3D_QPU_PACK_L); + break; + + case nir_op_f2f16_rtz: + assert(nir_src_bit_size(instr->src[0].src) == 32); + result = f2f16_rtz(c, src[0]); + break; + + case nir_op_f2f32: + assert(nir_src_bit_size(instr->src[0].src) == 16); + result = vir_FMOV(c, src[0]); + vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L); + break; + + case nir_op_i2i16: { + uint32_t bit_size = nir_src_bit_size(instr->src[0].src); + assert(bit_size == 32 || bit_size == 8); + if (bit_size == 32) { + /* We don't have integer pack/unpack methods for + * converting between 16-bit and 32-bit, so we implement + * the conversion manually by truncating the src. 
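
(The two helpers above reduce to simple scalar idioms. Below is a hedged host-side C model of both the RTZ-by-correction rounding and the flag-based sign extension; f32_to_f16_rte() and f16_to_f32() are hypothetical stand-ins for the QPU pack/unpack-L conversions, and this is a sketch of the idea, not driver code.)

#include <stdint.h>
#include <math.h>

extern uint16_t f32_to_f16_rte(float f); /* hypothetical: HW default RTE pack */
extern float f16_to_f32(uint16_t h);     /* hypothetical unpack */

static uint16_t f32_to_f16_rtz_model(float f)
{
        uint16_t h = f32_to_f16_rte(f);
        /* If RTE rounded away from zero, stepping the f16 bit pattern
         * down by one ULP moves the value back toward zero. */
        if (fabsf(f) < fabsf(f16_to_f32(h)))
                h -= 1;
        return h;
}

static uint32_t sign_extend_model(uint32_t value, uint32_t src_bits,
                                  uint32_t dst_bits)
{
        /* Mirrors sign_extend(): test the sign bit and, only when it is
         * set, OR in the missing leading bits. */
        uint32_t sign_mask = 1u << (src_bits - 1);
        uint32_t extend_bits = ~((1u << src_bits) - 1) &
                               (uint32_t)((1ull << dst_bits) - 1);
        return (value & sign_mask) ? (value | extend_bits) : value;
}
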
+ */ + result = vir_AND(c, src[0], vir_uniform_ui(c, 0xffff)); + } else { + struct qreg tmp = vir_AND(c, src[0], + vir_uniform_ui(c, 0xff)); + result = vir_MOV(c, sign_extend(c, tmp, bit_size, 16)); + } + break; + } + + case nir_op_u2u16: { + uint32_t bit_size = nir_src_bit_size(instr->src[0].src); + assert(bit_size == 32 || bit_size == 8); + + /* We don't have integer pack/unpack methods for converting + * between 16-bit and 32-bit, so we implement the conversion + * manually by truncating the src. For the 8-bit case, we + * want to make sure we don't copy garbage from any of the + * 24 MSB bits. + */ + if (bit_size == 32) + result = vir_AND(c, src[0], vir_uniform_ui(c, 0xffff)); + else + result = vir_AND(c, src[0], vir_uniform_ui(c, 0xff)); + break; + } + + case nir_op_i2i8: + case nir_op_u2u8: + assert(nir_src_bit_size(instr->src[0].src) == 32 || + nir_src_bit_size(instr->src[0].src) == 16); + /* We don't have integer pack/unpack methods for converting + * between 8-bit and 32-bit, so we implement the conversion + * manually by truncating the src. + */ + result = vir_AND(c, src[0], vir_uniform_ui(c, 0xff)); + break; + + case nir_op_u2u32: { + uint32_t bit_size = nir_src_bit_size(instr->src[0].src); + assert(bit_size == 16 || bit_size == 8); + + /* we don't have a native 8-bit/16-bit MOV so we copy all 32-bit + * from the src but we make sure to clear any garbage bits that + * may be present in the invalid src bits. + */ + uint32_t mask = (1 << bit_size) - 1; + result = vir_AND(c, src[0], vir_uniform_ui(c, mask)); + break; + } + + case nir_op_i2i32: { + uint32_t bit_size = nir_src_bit_size(instr->src[0].src); + assert(bit_size == 16 || bit_size == 8); + + uint32_t mask = (1 << bit_size) - 1; + struct qreg tmp = vir_AND(c, src[0], + vir_uniform_ui(c, mask)); + + result = vir_MOV(c, sign_extend(c, tmp, bit_size, 32)); + break; + } + case nir_op_iadd: result = vir_ADD(c, src[0], src[1]); break; @@ -1390,8 +1583,6 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) break; } - case nir_op_i2b32: - case nir_op_f2b32: case nir_op_feq32: case nir_op_fneu32: case nir_op_fge32: @@ -1485,13 +1676,35 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) case nir_op_uadd_carry: vir_set_pf(c, vir_ADD_dest(c, vir_nop_reg(), src[0], src[1]), V3D_QPU_PF_PUSHC); - result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA); + result = ntq_emit_cond_to_int(c, V3D_QPU_COND_IFA); + break; + + case nir_op_usub_borrow: + vir_set_pf(c, vir_SUB_dest(c, vir_nop_reg(), src[0], src[1]), + V3D_QPU_PF_PUSHC); + result = ntq_emit_cond_to_int(c, V3D_QPU_COND_IFA); break; case nir_op_pack_half_2x16_split: result = vir_VFPACK(c, src[0], src[1]); break; + case nir_op_pack_2x32_to_2x16_v3d: + result = vir_VPACK(c, src[0], src[1]); + break; + + case nir_op_pack_32_to_r11g11b10_v3d: + result = vir_V11FPACK(c, src[0], src[1]); + break; + + case nir_op_pack_uint_32_to_r10g10b10a2_v3d: + result = vir_V10PACK(c, src[0], src[1]); + break; + + case nir_op_pack_4x16_to_4x8_v3d: + result = vir_V8PACK(c, src[0], src[1]); + break; + case nir_op_unpack_half_2x16_split_x: result = vir_FMOV(c, src[0]); vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L); @@ -1502,26 +1715,29 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_H); break; - case nir_op_fquantize2f16: { - /* F32 -> F16 -> F32 conversion */ - struct qreg tmp = vir_FMOV(c, src[0]); - vir_set_pack(c->defs[tmp.index], V3D_QPU_PACK_L); - tmp = vir_FMOV(c, tmp); - vir_set_unpack(c->defs[tmp.index], 0, 
V3D_QPU_UNPACK_L); + case nir_op_pack_2x16_to_unorm_2x8_v3d: + result = vir_VFTOUNORM8(c, src[0]); + break; - /* Check for denorm */ - struct qreg abs_src = vir_FMOV(c, src[0]); - vir_set_unpack(c->defs[abs_src.index], 0, V3D_QPU_UNPACK_ABS); - struct qreg threshold = vir_uniform_f(c, ldexpf(1.0, -14)); - vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), abs_src, threshold), - V3D_QPU_PF_PUSHC); + case nir_op_pack_2x16_to_snorm_2x8_v3d: + result = vir_VFTOSNORM8(c, src[0]); + break; - /* Return +/-0 for denorms */ - struct qreg zero = - vir_AND(c, src[0], vir_uniform_ui(c, 0x80000000)); - result = vir_FMOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, tmp, zero)); + case nir_op_pack_2x16_to_unorm_2x10_v3d: + result = vir_VFTOUNORM10LO(c, src[0]); + break; + + case nir_op_pack_2x16_to_unorm_10_2_v3d: + result = vir_VFTOUNORM10HI(c, src[0]); + break; + + case nir_op_f2unorm_16_v3d: + result = vir_FTOUNORM16(c, src[0]); + break; + + case nir_op_f2snorm_16_v3d: + result = vir_FTOSNORM16(c, src[0]); break; - } default: fprintf(stderr, "unknown NIR ALU inst: "); @@ -1530,17 +1746,12 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) abort(); } - /* We have a scalar result, so the instruction should only have a - * single channel written to. - */ - assert(util_is_power_of_two_or_zero(instr->dest.write_mask)); - ntq_store_dest(c, &instr->dest.dest, - ffs(instr->dest.write_mask) - 1, result); + ntq_store_def(c, &instr->def, 0, result); } /* Each TLB read/write setup (a render target or depth buffer) takes an 8-bit * specifier. They come from a register that's preloaded with 0xffffffff - * (0xff gets you normal vec4 f16 RT0 writes), and when one is neaded the low + * (0xff gets you normal vec4 f16 RT0 writes), and when one is needed the low * 8 bits are shifted off the bottom and 0xff shifted in from the top. */ #define TLB_TYPE_F16_COLOR (3 << 6) @@ -1670,15 +1881,6 @@ vir_emit_tlb_color_write(struct v3d_compile *c, unsigned rt) static void emit_frag_end(struct v3d_compile *c) { - /* If the shader has no non-TLB side effects and doesn't write Z - * we can promote it to enabling early_fragment_tests even - * if the user didn't. - */ - if (c->output_position_index == -1 && - !(c->s->info.num_images || c->s->info.num_ssbos)) { - c->s->info.fs.early_fragment_tests = true; - } - if (c->output_sample_mask_index != -1) { vir_SETMSF_dest(c, vir_nop_reg(), vir_AND(c, @@ -1703,55 +1905,75 @@ emit_frag_end(struct v3d_compile *c) } struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU); - if (c->output_position_index != -1 && - !c->s->info.fs.early_fragment_tests) { - struct qinst *inst = vir_MOV_dest(c, tlbu_reg, - c->outputs[c->output_position_index]); - uint8_t tlb_specifier = TLB_TYPE_DEPTH; - if (c->devinfo->ver >= 42) { - tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL | - TLB_SAMPLE_MODE_PER_PIXEL); - } else - tlb_specifier |= TLB_DEPTH_TYPE_PER_PIXEL; + /* If the shader has no non-TLB side effects and doesn't write Z + * we can promote it to enabling early_fragment_tests even + * if the user didn't. 
+ */ + if (c->output_position_index == -1 && + !(c->s->info.num_images || c->s->info.num_ssbos) && + !c->s->info.fs.uses_discard && + !c->s->info.fs.uses_demote && + !c->fs_key->sample_alpha_to_coverage && + c->output_sample_mask_index == -1 && + has_any_tlb_color_write) { + c->s->info.fs.early_fragment_tests = true; + } - inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, - tlb_specifier | - 0xffffff00); + /* By default, Z buffer writes are implicit using the Z values produced + * from FEP (Z value produced from rasterization). When this is not + * desirable (shader writes Z explicitly, has discards, etc) we need + * to let the hardware know by setting c->writes_z to true, in which + * case we always need to write a Z value from the QPU, even if it is + * just the passthrough Z value produced from FEP. + * + * Also, from the V3D 4.2 spec: + * + * "If a shader performs a Z read the “Fragment shader does Z writes” + * bit in the shader record must be enabled to ensure deterministic + * results" + * + * So if c->reads_z is set we always need to write Z, even if it is + * a passthrough from the Z value produced from FEP. + */ + if (!c->s->info.fs.early_fragment_tests || c->reads_z) { c->writes_z = true; - } else if (c->s->info.fs.uses_discard || - !c->s->info.fs.early_fragment_tests || - c->fs_key->sample_alpha_to_coverage || - !has_any_tlb_color_write) { - /* Emit passthrough Z if it needed to be delayed until shader - * end due to potential discards. - * - * Since (single-threaded) fragment shaders always need a TLB - * write, emit passthrouh Z if we didn't have any color - * buffers and flag us as potentially discarding, so that we - * can use Z as the TLB write. - */ - c->s->info.fs.uses_discard = true; - - struct qinst *inst = vir_MOV_dest(c, tlbu_reg, - vir_nop_reg()); uint8_t tlb_specifier = TLB_TYPE_DEPTH; + struct qinst *inst; + + if (c->output_position_index != -1) { + /* Shader writes to gl_FragDepth, use that */ + inst = vir_MOV_dest(c, tlbu_reg, + c->outputs[c->output_position_index]); + + tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL | + TLB_SAMPLE_MODE_PER_PIXEL); + } else { + /* Shader doesn't write to gl_FragDepth, take Z from + * FEP. + */ + c->writes_z_from_fep = true; + inst = vir_MOV_dest(c, tlbu_reg, vir_nop_reg()); - if (c->devinfo->ver >= 42) { /* The spec says the PER_PIXEL flag is ignored for * invariant writes, but the simulator demands it. */ tlb_specifier |= (TLB_V42_DEPTH_TYPE_INVARIANT | TLB_SAMPLE_MODE_PER_PIXEL); - } else { - tlb_specifier |= TLB_DEPTH_TYPE_INVARIANT; + + /* Since (single-threaded) fragment shaders always need + * a TLB write, if we don't have any we emit a + * passthrough Z and flag us as potentially discarding, + * so that we can use Z as the required TLB write.
+ */ + if (!has_any_tlb_color_write) + c->s->info.fs.uses_discard = true; } - inst->uniform = vir_get_uniform_index(c, - QUNIFORM_CONSTANT, + inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, tlb_specifier | 0xffffff00); - c->writes_z = true; + inst->is_tlb_z_write = true; } /* XXX: Performance improvement: Merge Z write and color writes TLB @@ -1767,7 +1989,6 @@ vir_VPM_WRITE_indirect(struct v3d_compile *c, struct qreg vpm_index, bool uniform_vpm_index) { - assert(c->devinfo->ver >= 40); if (uniform_vpm_index) vir_STVPMV(c, vpm_index, val); else @@ -1777,13 +1998,8 @@ vir_VPM_WRITE_indirect(struct v3d_compile *c, static void vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index) { - if (c->devinfo->ver >= 40) { - vir_VPM_WRITE_indirect(c, val, - vir_uniform_ui(c, vpm_index), true); - } else { - /* XXX: v3d33_vir_vpm_write_setup(c); */ - vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val); - } + vir_VPM_WRITE_indirect(c, val, + vir_uniform_ui(c, vpm_index), true); } static void @@ -1791,7 +2007,7 @@ emit_vert_end(struct v3d_compile *c) { /* GFXH-1684: VPM writes need to be complete by the end of the shader. */ - if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42) + if (c->devinfo->ver == 42) vir_VPMWT(c); } @@ -1800,7 +2016,7 @@ emit_geom_end(struct v3d_compile *c) { /* GFXH-1684: VPM writes need to be complete by the end of the shader. */ - if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42) + if (c->devinfo->ver == 42) vir_VPMWT(c); } @@ -1812,8 +2028,11 @@ mem_vectorize_callback(unsigned align_mul, unsigned align_offset, nir_intrinsic_instr *high, void *data) { - /* Our backend is 32-bit only at present */ - if (bit_size != 32) + /* TMU general access only supports 32-bit vectors */ + if (bit_size > 32) + return false; + + if ((bit_size == 8 || bit_size == 16) && num_components > 1) return false; if (align_mul % 4 != 0 || align_offset % 4 != 0) @@ -1843,7 +2062,29 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s) do { progress = false; - NIR_PASS_V(s, nir_lower_vars_to_ssa); + NIR_PASS(progress, s, nir_split_array_vars, nir_var_function_temp); + NIR_PASS(progress, s, nir_shrink_vec_array_vars, nir_var_function_temp); + NIR_PASS(progress, s, nir_opt_deref); + + NIR_PASS(progress, s, nir_lower_vars_to_ssa); + if (!s->info.var_copies_lowered) { + /* Only run this pass if nir_lower_var_copies was not called + * yet. That would lower away any copy_deref instructions and we + * don't want to introduce any more. 
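
(The mem_vectorize_callback() change earlier in this chunk can be restated on its own; a hedged summary of only the checks visible in this hunk, as standalone C:)

#include <stdbool.h>

/* Accept rule as visible above: no accesses wider than 32 bits per
 * component, no multi-component 8/16-bit vectors, and 4-byte alignment
 * throughout. */
static bool v3d_vectorize_ok(unsigned align_mul, unsigned align_offset,
                             unsigned bit_size, unsigned num_components)
{
        if (bit_size > 32)
                return false;
        if ((bit_size == 8 || bit_size == 16) && num_components > 1)
                return false;
        return align_mul % 4 == 0 && align_offset % 4 == 0;
}

/* Two adjacent 16-bit loads may still merge when the vectorizer can
 * re-express them as one aligned 32-bit single-component load, which
 * passes the rule above; that is the case the later comment about
 * vectorization undoing scalarization refers to. */
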
+ */ + NIR_PASS(progress, s, nir_opt_find_array_copies); + } + + NIR_PASS(progress, s, nir_opt_copy_prop_vars); + NIR_PASS(progress, s, nir_opt_dead_write_vars); + NIR_PASS(progress, s, nir_opt_combine_stores, nir_var_all); + + NIR_PASS(progress, s, nir_remove_dead_variables, + (nir_variable_mode)(nir_var_function_temp | + nir_var_shader_temp | + nir_var_mem_shared), + NULL); + NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL); NIR_PASS(progress, s, nir_lower_phis_to_scalar, false); NIR_PASS(progress, s, nir_copy_prop); @@ -1851,10 +2092,39 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s) NIR_PASS(progress, s, nir_opt_dce); NIR_PASS(progress, s, nir_opt_dead_cf); NIR_PASS(progress, s, nir_opt_cse); - NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true); + NIR_PASS(progress, s, nir_opt_peephole_select, 0, false, false); + NIR_PASS(progress, s, nir_opt_peephole_select, 24, true, true); NIR_PASS(progress, s, nir_opt_algebraic); NIR_PASS(progress, s, nir_opt_constant_folding); + NIR_PASS(progress, s, nir_opt_intrinsics); + NIR_PASS(progress, s, nir_opt_idiv_const, 32); + NIR_PASS(progress, s, nir_lower_alu); + + if (nir_opt_loop(s)) { + progress = true; + NIR_PASS(progress, s, nir_copy_prop); + NIR_PASS(progress, s, nir_opt_dce); + } + + NIR_PASS(progress, s, nir_opt_conditional_discard); + + NIR_PASS(progress, s, nir_opt_remove_phis); + NIR_PASS(progress, s, nir_opt_if, false); + if (c && !c->disable_gcm) { + bool local_progress = false; + NIR_PASS(local_progress, s, nir_opt_gcm, false); + c->gcm_progress |= local_progress; + progress |= local_progress; + } + + /* Note that vectorization may undo the load/store scalarization + * pass we run for non 32-bit TMU general load/store by + * converting, for example, 2 consecutive 16-bit loads into a + * single 32-bit load. This is fine (and desirable) as long as + * the resulting 32-bit load meets 32-bit alignment requirements, + * which mem_vectorize_callback() should be enforcing. + */ nir_load_store_vectorize_options vectorize_opts = { .modes = nir_var_mem_ssbo | nir_var_mem_ubo | nir_var_mem_push_const | nir_var_mem_shared | @@ -1862,7 +2132,24 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s) .callback = mem_vectorize_callback, .robust_modes = 0, }; - NIR_PASS(progress, s, nir_opt_load_store_vectorize, &vectorize_opts); + bool vectorize_progress = false; + + + /* This requires that we have called + * nir_lower_vars_to_explicit_types / nir_lower_explicit_io + * first, which we may not have done yet if we call here too + * early during NIR pre-processing.
We can detect this because + * in that case we won't have a compile object + */ + if (c) { + NIR_PASS(vectorize_progress, s, nir_opt_load_store_vectorize, + &vectorize_opts); + if (vectorize_progress) { + NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL); + NIR_PASS(progress, s, nir_lower_pack); + progress = true; + } + } if (lower_flrp != 0) { bool lower_flrp_progress = false; @@ -1895,10 +2182,8 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s) nir_move_options sink_opts = nir_move_const_undef | nir_move_comparisons | nir_move_copies | - nir_move_load_ubo; + nir_move_load_ubo | nir_move_load_ssbo | nir_move_load_uniform; NIR_PASS(progress, s, nir_opt_sink, sink_opts); - - NIR_PASS(progress, s, nir_opt_move, nir_move_load_ubo); } static int @@ -1915,27 +2200,9 @@ ntq_emit_vpm_read(struct v3d_compile *c, uint32_t *remaining, uint32_t vpm_index) { - struct qreg vpm = vir_reg(QFILE_VPM, vpm_index); - - if (c->devinfo->ver >= 40 ) { - return vir_LDVPMV_IN(c, - vir_uniform_ui(c, - (*num_components_queued)++)); - } - - if (*num_components_queued != 0) { - (*num_components_queued)--; - return vir_MOV(c, vpm); - } - - uint32_t num_components = MIN2(*remaining, 32); - - v3d33_vir_vpm_read_setup(c, num_components); - - *num_components_queued = num_components - 1; - *remaining -= num_components; - - return vir_MOV(c, vpm); + return vir_LDVPMV_IN(c, + vir_uniform_ui(c, + (*num_components_queued)++)); } static void @@ -2005,31 +2272,8 @@ ntq_setup_vs_inputs(struct v3d_compile *c) } /* The actual loads will happen directly in nir_intrinsic_load_input - * on newer versions. */ - if (c->devinfo->ver >= 40) - return; - - for (int loc = 0; loc < ARRAY_SIZE(c->vattr_sizes); loc++) { - resize_qreg_array(c, &c->inputs, &c->inputs_array_size, - (loc + 1) * 4); - - for (int i = 0; i < c->vattr_sizes[loc]; i++) { - c->inputs[loc * 4 + i] = - ntq_emit_vpm_read(c, - &vpm_components_queued, - &num_components, - loc * 4 + i); - - } - } - - if (c->devinfo->ver >= 40) { - assert(vpm_components_queued == num_components); - } else { - assert(vpm_components_queued == 0); - assert(num_components == 0); - } + return; } static bool @@ -2058,14 +2302,14 @@ ntq_setup_gs_inputs(struct v3d_compile *c) */ assert(glsl_type_is_array(var->type)); const struct glsl_type *type = glsl_get_array_element(var->type); - unsigned array_len = MAX2(glsl_get_length(type), 1); + unsigned var_len = glsl_count_vec4_slots(type, false, false); unsigned loc = var->data.driver_location; resize_qreg_array(c, &c->inputs, &c->inputs_array_size, - (loc + array_len) * 4); + (loc + var_len) * 4); if (var->data.compact) { - for (unsigned j = 0; j < array_len; j++) { + for (unsigned j = 0; j < var_len; j++) { unsigned input_idx = c->num_inputs++; unsigned loc_frac = var->data.location_frac + j; unsigned loc = var->data.location + loc_frac / 4; @@ -2076,8 +2320,10 @@ ntq_setup_gs_inputs(struct v3d_compile *c) continue; } - for (unsigned j = 0; j < array_len; j++) { - unsigned num_elements = glsl_get_vector_elements(type); + for (unsigned j = 0; j < var_len; j++) { + unsigned num_elements = + glsl_type_is_struct(glsl_without_array(type)) ? 
+ 4 : glsl_get_vector_elements(type); for (unsigned k = 0; k < num_elements; k++) { unsigned chan = var->data.location_frac + k; unsigned input_idx = c->num_inputs++; @@ -2124,7 +2370,7 @@ ntq_setup_fs_inputs(struct v3d_compile *c) } else if (var->data.compact) { for (int j = 0; j < var_len; j++) emit_compact_fragment_input(c, loc, var, j); - } else if (glsl_type_is_struct(var->type)) { + } else if (glsl_type_is_struct(glsl_without_array(var->type))) { for (int j = 0; j < var_len; j++) { emit_fragment_input(c, loc, var, j, 4); } @@ -2143,12 +2389,9 @@ ntq_setup_outputs(struct v3d_compile *c) return; nir_foreach_shader_out_variable(var, c->s) { - unsigned array_len = MAX2(glsl_get_length(var->type), 1); + assert(glsl_type_is_vector_or_scalar(var->type)); unsigned loc = var->data.driver_location * 4; - assert(array_len == 1); - (void)array_len; - for (int i = 0; i < 4 - var->data.location_frac; i++) { add_output(c, loc + var->data.location_frac + i, var->data.location, @@ -2157,15 +2400,17 @@ ntq_setup_outputs(struct v3d_compile *c) switch (var->data.location) { case FRAG_RESULT_COLOR: - c->output_color_var[0] = var; - c->output_color_var[1] = var; - c->output_color_var[2] = var; - c->output_color_var[3] = var; + for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) + c->output_color_var[i] = var; break; case FRAG_RESULT_DATA0: case FRAG_RESULT_DATA1: case FRAG_RESULT_DATA2: case FRAG_RESULT_DATA3: + case FRAG_RESULT_DATA4: + case FRAG_RESULT_DATA5: + case FRAG_RESULT_DATA6: + case FRAG_RESULT_DATA7: c->output_color_var[var->data.location - FRAG_RESULT_DATA0] = var; break; @@ -2185,17 +2430,19 @@ ntq_setup_outputs(struct v3d_compile *c) * Each nir_register gets a struct qreg per 32-bit component being stored. */ static void -ntq_setup_registers(struct v3d_compile *c, struct exec_list *list) +ntq_setup_registers(struct v3d_compile *c, nir_function_impl *impl) { - foreach_list_typed(nir_register, nir_reg, node, list) { - unsigned array_len = MAX2(nir_reg->num_array_elems, 1); + nir_foreach_reg_decl(decl, impl) { + unsigned num_components = nir_intrinsic_num_components(decl); + unsigned array_len = nir_intrinsic_num_array_elems(decl); + array_len = MAX2(array_len, 1); struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, - array_len * - nir_reg->num_components); + array_len * num_components); + nir_def *nir_reg = &decl->def; _mesa_hash_table_insert(c->def_ht, nir_reg, qregs); - for (int i = 0; i < array_len * nir_reg->num_components; i++) + for (int i = 0; i < array_len * num_components; i++) qregs[i] = vir_get_temp(c); } } @@ -2222,23 +2469,23 @@ ntq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr) assert(nir_src_as_uint(instr->src[1]) == 0); - ntq_store_dest(c, &instr->dest, 0, + ntq_store_def(c, &instr->def, 0, vir_uniform(c, QUNIFORM_IMAGE_WIDTH, image_index)); if (instr->num_components > 1) { - ntq_store_dest(c, &instr->dest, 1, - vir_uniform(c, - instr->num_components == 2 && is_array ? - QUNIFORM_IMAGE_ARRAY_SIZE : - QUNIFORM_IMAGE_HEIGHT, - image_index)); + ntq_store_def(c, &instr->def, 1, + vir_uniform(c, + instr->num_components == 2 && is_array ? + QUNIFORM_IMAGE_ARRAY_SIZE : + QUNIFORM_IMAGE_HEIGHT, + image_index)); } if (instr->num_components > 2) { - ntq_store_dest(c, &instr->dest, 2, - vir_uniform(c, - is_array ? - QUNIFORM_IMAGE_ARRAY_SIZE : - QUNIFORM_IMAGE_DEPTH, - image_index)); + ntq_store_def(c, &instr->def, 2, + vir_uniform(c, + is_array ? 
+ QUNIFORM_IMAGE_ARRAY_SIZE : + QUNIFORM_IMAGE_DEPTH, + image_index)); } } @@ -2263,16 +2510,14 @@ vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr) * * To fix that, we make sure we always emit a thread switch before the * first tlb color read. If that happens to be the last thread switch - * we emit, then everything is fine, but otherwsie, if any code after + * we emit, then everything is fine, but otherwise, if any code after * this point needs to emit additional thread switches, then we will * switch the strategy to locking the scoreboard on the first thread * switch instead -- see vir_emit_thrsw(). */ if (!c->emitted_tlb_load) { - if (!c->last_thrsw_at_top_level) { - assert(c->devinfo->ver >= 41); + if (!c->last_thrsw_at_top_level) vir_emit_thrsw(c); - } c->emitted_tlb_load = true; } @@ -2371,27 +2616,96 @@ vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr) } assert(color_reads_for_sample[component].file != QFILE_NULL); - ntq_store_dest(c, &instr->dest, 0, - vir_MOV(c, color_reads_for_sample[component])); + ntq_store_def(c, &instr->def, 0, + vir_MOV(c, color_reads_for_sample[component])); +} + +static bool +ntq_emit_load_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr); + +static bool +try_emit_uniform(struct v3d_compile *c, + int offset, + int num_components, + nir_def *def, + enum quniform_contents contents) +{ + /* Even though ldunif is strictly 32-bit we can still use it + * to load scalar 8-bit/16-bit uniforms so long as their offset + * is 32-bit aligned. In this case, ldunif would still load + * 32-bit into the destination with the 8-bit/16-bit uniform + * data in the LSB and garbage in the MSB, but that is fine + * because we should only be accessing the valid bits of the + * destination. + * + * FIXME: if in the future we improve our register allocator to + * pack 2 16-bit variables in the MSB and LSB of the same + * register then this optimization would not be valid as is, + * since the load clobbers the MSB. + */ + if (offset % 4 != 0) + return false; + + /* We need dwords */ + offset = offset / 4; + + for (int i = 0; i < num_components; i++) { + ntq_store_def(c, def, i, vir_uniform(c, contents, offset + i)); + } + + return true; } static void ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr) { + /* We scalarize general TMU access for anything that is not 32-bit. 
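
(The gating in try_emit_uniform() above comes down to dword alignment; a small hedged C model, not backend code:)

#include <stdbool.h>
#include <stdint.h>

/* ldunif can serve a uniform load only when the byte offset is 32-bit
 * aligned; the uniform stream is then indexed in dwords. */
static bool uniform_byte_to_dword_index(uint32_t byte_offset,
                                        uint32_t *dword_index)
{
        if (byte_offset % 4 != 0)
                return false; /* caller falls back to a general TMU load */
        *dword_index = byte_offset / 4;
        return true;
}

/* A scalar 16-bit uniform at byte offset 8 can use ldunif (dword 2, data
 * in the LSBs); at byte offset 10 it must take the TMU path instead. */
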
*/ + assert(instr->def.bit_size == 32 || + instr->num_components == 1); + + /* Try to emit ldunif if possible, otherwise fallback to general TMU */ if (nir_src_is_const(instr->src[0])) { int offset = (nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0])); - assert(offset % 4 == 0); - /* We need dwords */ - offset = offset / 4; - for (int i = 0; i < instr->num_components; i++) { - ntq_store_dest(c, &instr->dest, i, - vir_uniform(c, QUNIFORM_UNIFORM, - offset + i)); + + if (try_emit_uniform(c, offset, instr->num_components, + &instr->def, QUNIFORM_UNIFORM)) { + return; + } + } + + if (!ntq_emit_load_unifa(c, instr)) { + ntq_emit_tmu_general(c, instr, false, false); + c->has_general_tmu_load = true; + } +} + +static bool +ntq_emit_inline_ubo_load(struct v3d_compile *c, nir_intrinsic_instr *instr) +{ + if (c->compiler->max_inline_uniform_buffers <= 0) + return false; + + /* Regular UBOs start after inline UBOs */ + uint32_t index = nir_src_as_uint(instr->src[0]); + if (index >= c->compiler->max_inline_uniform_buffers) + return false; + + /* We scalarize general TMU access for anything that is not 32-bit */ + assert(instr->def.bit_size == 32 || + instr->num_components == 1); + + if (nir_src_is_const(instr->src[1])) { + int offset = nir_src_as_uint(instr->src[1]); + if (try_emit_uniform(c, offset, instr->num_components, + &instr->def, + QUNIFORM_INLINE_UBO_0 + index)) { + return true; } - } else { - ntq_emit_tmu_general(c, instr, false); } + + /* Fallback to regular UBO load */ + return false; } static void @@ -2411,7 +2725,7 @@ ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr) unsigned offset = nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0]); - if (c->s->info.stage != MESA_SHADER_FRAGMENT && c->devinfo->ver >= 40) { + if (c->s->info.stage != MESA_SHADER_FRAGMENT) { /* Emit the LDVPM directly now, rather than at the top * of the shader like we did for V3D 3.x (which needs * vpmsetup when not just taking the next offset). @@ -2433,19 +2747,38 @@ ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr) SYSTEM_VALUE_VERTEX_ID)) { index++; } - for (int i = 0; i < offset; i++) - index += c->vattr_sizes[i]; + + for (int i = 0; i < offset; i++) { + /* GFXH-1602: if any builtins (vid, iid, etc) are read then + * attribute 0 must be active (size > 0). When we hit this, + * the driver is expected to program attribute 0 to have a + * size of 1, so here we need to add that. 
+ */ + if (i == 0 && c->vs_key->is_coord && + c->vattr_sizes[i] == 0 && index > 0) { + index++; + } else { + index += c->vattr_sizes[i]; + } + } + index += nir_intrinsic_component(instr); for (int i = 0; i < instr->num_components; i++) { struct qreg vpm_offset = vir_uniform_ui(c, index++); - ntq_store_dest(c, &instr->dest, i, - vir_LDVPMV_IN(c, vpm_offset)); + ntq_store_def(c, &instr->def, i, + vir_LDVPMV_IN(c, vpm_offset)); } } else { for (int i = 0; i < instr->num_components; i++) { int comp = nir_intrinsic_component(instr) + i; - ntq_store_dest(c, &instr->dest, i, - vir_MOV(c, c->inputs[offset * 4 + comp])); + struct qreg input = c->inputs[offset * 4 + comp]; + ntq_store_def(c, &instr->def, i, vir_MOV(c, input)); + + if (c->s->info.stage == MESA_SHADER_FRAGMENT && + input.file == c->payload_z.file && + input.index == c->payload_z.index) { + c->reads_z = true; + } } } } @@ -2610,18 +2943,18 @@ ntq_get_barycentric_centroid(struct v3d_compile *c, /* sN = TRUE if sample N enabled in sample mask, FALSE otherwise */ struct qreg F = vir_uniform_ui(c, 0); struct qreg T = vir_uniform_ui(c, ~0); - struct qreg s0 = vir_XOR(c, vir_AND(c, sample_mask, i1), i1); + struct qreg s0 = vir_AND(c, sample_mask, i1); vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ); - s0 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); - struct qreg s1 = vir_XOR(c, vir_AND(c, sample_mask, i2), i2); + s0 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F); + struct qreg s1 = vir_AND(c, sample_mask, i2); vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ); - s1 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); - struct qreg s2 = vir_XOR(c, vir_AND(c, sample_mask, i4), i4); + s1 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F); + struct qreg s2 = vir_AND(c, sample_mask, i4); vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ); - s2 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); - struct qreg s3 = vir_XOR(c, vir_AND(c, sample_mask, i8), i8); + s2 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F); + struct qreg s3 = vir_AND(c, sample_mask, i8); vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s3), V3D_QPU_PF_PUSHZ); - s3 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); + s3 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F); /* sample_idx = s0 ? 0 : s2 ? 2 : s1 ? 1 : 3 */ struct qreg sample_idx = i3; @@ -2708,28 +3041,142 @@ emit_ldunifa(struct v3d_compile *c, struct qreg *result) c->current_unifa_offset += 4; } -static void -ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr) +/* Checks if the value of a nir src is derived from a nir register */ +static bool +nir_src_derived_from_reg(nir_src src) +{ + nir_def *def = src.ssa; + if (nir_load_reg_for_def(def)) + return true; + + nir_instr *parent = def->parent_instr; + switch (parent->type) { + case nir_instr_type_alu: { + nir_alu_instr *alu = nir_instr_as_alu(parent); + int num_srcs = nir_op_infos[alu->op].num_inputs; + for (int i = 0; i < num_srcs; i++) { + if (nir_src_derived_from_reg(alu->src[i].src)) + return true; + } + return false; + } + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent); + int num_srcs = nir_intrinsic_infos[intr->intrinsic].num_srcs; + for (int i = 0; i < num_srcs; i++) { + if (nir_src_derived_from_reg(intr->src[i])) + return true; + } + return false; + } + case nir_instr_type_load_const: + case nir_instr_type_undef: + return false; + default: + /* By default we assume it may come from a register, the above + * cases should be able to handle the majority of situations + * though. 
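
(The sample-mask rework in ntq_get_barycentric_centroid() earlier in this chunk picks the first lit sample in the fixed order 0, 2, 1, 3; a hedged scalar C restatement of that selection:)

#include <stdint.h>

/* A sample is "on" when its mask bit is set (AND + PUSHZ + IFNA in the
 * VIR version); the centroid falls back to sample 3 when none is lit. */
static int pick_centroid_sample(uint32_t sample_mask)
{
        if (sample_mask & 1)
                return 0;
        if (sample_mask & 4)
                return 2;
        if (sample_mask & 2)
                return 1;
        return 3;
}
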
+ */ + return true; + }; } + +static bool +ntq_emit_load_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr) { + assert(instr->intrinsic == nir_intrinsic_load_ubo || + instr->intrinsic == nir_intrinsic_load_ssbo || + instr->intrinsic == nir_intrinsic_load_uniform); + + bool is_uniform = instr->intrinsic == nir_intrinsic_load_uniform; + bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo; + bool is_ssbo = instr->intrinsic == nir_intrinsic_load_ssbo; + /* Every ldunifa auto-increments the unifa address by 4 bytes, so our * current unifa offset is 4 bytes ahead of the offset of the last load. */ static const int32_t max_unifa_skip_dist = MAX_UNIFA_SKIP_DISTANCE - 4; - bool dynamic_src = !nir_src_is_const(instr->src[1]); - uint32_t const_offset = - dynamic_src ? 0 : nir_src_as_uint(instr->src[1]); + /* We can only use unifa if the offset is uniform */ + nir_src offset = is_uniform ? instr->src[0] : instr->src[1]; + if (nir_src_is_divergent(offset)) + return false; - /* On OpenGL QUNIFORM_UBO_ADDR takes a UBO index - * shifted up by 1 (0 is gallium's constant buffer 0). + /* Emitting loads from unifa may not be safe under non-uniform control + * flow. It seems the address that is used to write to the unifa + * register is taken from the first lane and if that lane is disabled + * by control flow then the value we read may be bogus and lead to + * invalid memory accesses on follow-up ldunifa instructions. However, + * ntq_store_def only emits conditional writes for nir registers, so as + * long as we can be certain that the offset isn't derived from a + * load_reg we should be fine. + * + * The following CTS test can be used to trigger the problem, which + * causes GMP violations in the sim without this check: + * dEQP-VK.subgroups.ballot_broadcast.graphics.subgroupbroadcastfirst_int */ - uint32_t index = nir_src_as_uint(instr->src[0]); - if (c->key->environment == V3D_ENVIRONMENT_OPENGL) + if (vir_in_nonuniform_control_flow(c) && + nir_src_derived_from_reg(offset)) { + return false; + } + + /* We can only use unifa with SSBOs if they are read-only. Otherwise + * ldunifa won't see the shader writes to that address (possibly + * because ldunifa doesn't read from the L2T cache). + */ + if (is_ssbo && !(nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE)) + return false; + + /* Just as with SSBOs, we can't use ldunifa to read indirect uniforms + * that may have been written to scratch using the TMU. + */ + bool dynamic_src = !nir_src_is_const(offset); + if (is_uniform && dynamic_src && c->s->scratch_size > 0) + return false; + + uint32_t const_offset = dynamic_src ? 0 : nir_src_as_uint(offset); + if (is_uniform) + const_offset += nir_intrinsic_base(instr); + + /* ldunifa is a 32-bit load instruction so we can only use it with + * 32-bit aligned addresses. We always produce 32-bit aligned addresses + * except for types smaller than 32-bit, so in these cases we can only + * use ldunifa if we can verify alignment, which we can only do for + * loads with a constant offset. + */ + uint32_t bit_size = instr->def.bit_size; + uint32_t value_skips = 0; + if (bit_size < 32) { + if (dynamic_src) { + return false; + } else if (const_offset % 4 != 0) { + /* If we are loading from an unaligned offset, fix + * alignment and skip over unused elements in result.
+ */ + value_skips = (const_offset % 4) / (bit_size / 8); + const_offset &= ~0x3; + } + } + + assert((bit_size == 32 && value_skips == 0) || + (bit_size == 16 && value_skips <= 1) || + (bit_size == 8 && value_skips <= 3)); + + /* Both Vulkan and OpenGL reserve index 0 for uniforms / push + * constants. + */ + uint32_t index = is_uniform ? 0 : nir_src_as_uint(instr->src[0]); + + /* QUNIFORM_UBO_ADDR takes a UBO index shifted up by 1 since we use + * index 0 for Gallium's constant buffer (GL) or push constants + * (Vulkan). + */ + if (is_ubo) index++; /* We can only keep track of the last unifa address we used with - * constant offset loads. If the new load targets the same UBO and + * constant offset loads. If the new load targets the same buffer and * is close enough to the previous load, we can skip the unifa register * write by emitting dummy ldunifa instructions to update the unifa * address. @@ -2739,6 +3186,7 @@ ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr) if (dynamic_src) { c->current_unifa_block = NULL; } else if (c->cur_block == c->current_unifa_block && + c->current_unifa_is_ubo == !is_ssbo && c->current_unifa_index == index && c->current_unifa_offset <= const_offset && c->current_unifa_offset + max_unifa_skip_dist >= const_offset) { @@ -2746,32 +3194,98 @@ ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr) ldunifa_skips = (const_offset - c->current_unifa_offset) / 4; } else { c->current_unifa_block = c->cur_block; + c->current_unifa_is_ubo = !is_ssbo; c->current_unifa_index = index; c->current_unifa_offset = const_offset; } if (!skip_unifa) { - struct qreg base_offset = + struct qreg base_offset = !is_ssbo ? vir_uniform(c, QUNIFORM_UBO_ADDR, - v3d_unit_data_create(index, const_offset)); + v3d_unit_data_create(index, const_offset)) : + vir_uniform(c, QUNIFORM_SSBO_OFFSET, index); struct qreg unifa = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA); if (!dynamic_src) { - vir_MOV_dest(c, unifa, base_offset); + if (!is_ssbo) { + /* Avoid the extra MOV to UNIFA by making + * ldunif load directly into it. We can't + * do this if we have not actually emitted + * ldunif and are instead reusing a previous + * one. + */ + struct qinst *inst = + (struct qinst *)c->cur_block->instructions.prev; + if (inst == c->defs[base_offset.index]) { + inst->dst = unifa; + c->defs[base_offset.index] = NULL; + } else { + vir_MOV_dest(c, unifa, base_offset); + } + } else { + vir_ADD_dest(c, unifa, base_offset, + vir_uniform_ui(c, const_offset)); + } } else { vir_ADD_dest(c, unifa, base_offset, - ntq_get_src(c, instr->src[1], 0)); + ntq_get_src(c, offset, 0)); } } else { for (int i = 0; i < ldunifa_skips; i++) emit_ldunifa(c, NULL); } - for (uint32_t i = 0; i < nir_intrinsic_dest_components(instr); i++) { + uint32_t num_components = nir_intrinsic_dest_components(instr); + for (uint32_t i = 0; i < num_components; ) { struct qreg data; emit_ldunifa(c, &data); - ntq_store_dest(c, &instr->dest, i, vir_MOV(c, data)); + + if (bit_size == 32) { + assert(value_skips == 0); + ntq_store_def(c, &instr->def, i, vir_MOV(c, data)); + i++; + } else { + assert((bit_size == 16 && value_skips <= 1) || + (bit_size == 8 && value_skips <= 3)); + + /* If we have any values to skip, shift to the first + * valid value in the ldunifa result. + */ + if (value_skips > 0) { + data = vir_SHR(c, data, + vir_uniform_ui(c, bit_size * + value_skips)); + } + + /* Check how many valid components we have discounting + * read components to skip. 
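[Editor's note: a scalar model of the mask-and-shift loop that follows, assuming 8-bit components and one skipped leading element; the values are hypothetical, not driver code:]

    uint32_t data = 0xddccbbaa; /* assumed 32-bit ldunifa result */
    uint32_t bit_size = 8;
    uint32_t value_skips = 1;
    data >>= bit_size * value_skips; /* skip the unaligned leading byte */
    uint32_t valid_count = (32 / bit_size) - value_skips; /* 3 components */
    for (uint32_t i = 0; i < valid_count; i++) {
            uint32_t mask = (1u << bit_size) - 1;
            uint32_t component = data & mask; /* 0xbb, then 0xcc, then 0xdd */
            (void)component; /* the patch stores this via ntq_store_def */
            data >>= bit_size; /* shift to the next component */
    }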
+ */
+ uint32_t valid_count = (32 / bit_size) - value_skips;
+ assert((bit_size == 16 && valid_count <= 2) ||
+ (bit_size == 8 && valid_count <= 4));
+ assert(valid_count > 0);
+
+ /* Process the valid components */
+ do {
+ struct qreg tmp;
+ uint32_t mask = (1 << bit_size) - 1;
+ tmp = vir_AND(c, vir_MOV(c, data),
+ vir_uniform_ui(c, mask));
+ ntq_store_def(c, &instr->def, i,
+ vir_MOV(c, tmp));
+ i++;
+ valid_count--;
+
+ /* Shift to next component */
+ if (i < num_components && valid_count > 0) {
+ data = vir_SHR(c, data,
+ vir_uniform_ui(c, bit_size));
+ }
+ } while (i < num_components && valid_count > 0);
+ }
 }
+
+ return true;
 }

 static inline struct qreg
@@ -2781,187 +3295,273 @@ emit_load_local_invocation_index(struct v3d_compile *c)
 vir_uniform_ui(c, 32 - c->local_invocation_index_bits));
 }

-/* Various subgroup operations rely on the A flags, so this helper ensures that
- * A flags represents currently active lanes in the subgroup.
+/* For the purposes of reduction operations (ballot, alleq, allfeq, bcastf) in
+ * fragment shaders a lane is considered active if any sample flags are set
+ * for *any* lane in the same quad; however, we still need to ensure that
+ * terminated lanes (OpTerminate) are not included. Further, we also need to
+ * disable lanes that may be disabled because of non-uniform control
+ * flow.
 */
-static void
-set_a_flags_for_subgroup(struct v3d_compile *c)
+static enum v3d_qpu_cond
+setup_subgroup_control_flow_condition(struct v3d_compile *c)
 {
- /* MSF returns 0 for disabled lanes in compute shaders so
- * PUSHZ will set A=1 for disabled lanes. We want the inverse
- * of this but we don't have any means to negate the A flags
- * directly, but we can do it by repeating the same operation
- * with NORZ (A = ~A & ~Z).
+ assert(c->s->info.stage == MESA_SHADER_FRAGMENT ||
+ c->s->info.stage == MESA_SHADER_COMPUTE);
+
+ enum v3d_qpu_cond cond = V3D_QPU_COND_NONE;
+
+ /* We need to make sure that terminated lanes in fragment shaders are
+ * not included. We can identify these lanes by comparing the initial
+ * sample mask with the current. This fixes:
+ * dEQP-VK.spirv_assembly.instruction.terminate_invocation.terminate.subgroup_*
 */
- assert(c->s->info.stage == MESA_SHADER_COMPUTE);
- vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ);
- vir_set_uf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_UF_NORZ);
+ if (c->s->info.stage == MESA_SHADER_FRAGMENT && c->emitted_discard) {
+ vir_set_pf(c, vir_AND_dest(c, vir_nop_reg(), c->start_msf,
+ vir_NOT(c, vir_XOR(c, c->start_msf,
+ vir_MSF(c)))),
+ V3D_QPU_PF_PUSHZ);
+ cond = V3D_QPU_COND_IFNA;
+ }

- /* If we are under non-uniform control flow we also need to
- * AND the A flags with the current execute mask.
+ /* If we are in non-uniform control-flow update the condition to
+ * also limit lanes to those in the current execution mask.
 */
 if (vir_in_nonuniform_control_flow(c)) {
- const uint32_t bidx = c->cur_block->index;
- vir_set_uf(c, vir_XOR_dest(c, vir_nop_reg(),
- c->execute,
- vir_uniform_ui(c, bidx)),
- V3D_QPU_UF_ANDZ);
+ if (cond == V3D_QPU_COND_IFNA) {
+ vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_UF_NORNZ);
+ } else {
+ assert(cond == V3D_QPU_COND_NONE);
+ vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_PF_PUSHZ);
+ }
+ cond = V3D_QPU_COND_IFA;
 }
+
+ return cond;
+}
+
+static void
+emit_compute_barrier(struct v3d_compile *c)
+{
+ /* Ensure we flag the use of the control barrier. NIR's
+ * gather info pass usually takes care of this, but that
+ * requires that we call that pass after any other pass
+ * may emit a control barrier, so this is safer.
+ */
+ c->s->info.uses_control_barrier = true;
+
+ /* Emit a TSY op to get all invocations in the workgroup
+ * (actually supergroup) to block until the last
+ * invocation reaches the TSY op.
+ */
+ vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_SYNCB));
+}
+
+static void
+emit_barrier(struct v3d_compile *c)
+{
+ struct qreg eidx = vir_EIDX(c);
+
+ /* The config for the TSY op should be set up like this:
+ * - Lane 0: Quorum
+ * - Lane 2: TSO id
+ * - Lane 3: TSY opcode
+ */
+
+ /* Lane 0: we want to synchronize across one subgroup. Here we write to
+ * all lanes unconditionally and will overwrite other lanes below.
+ */
+ struct qreg tsy_conf = vir_uniform_ui(c, 1);
+
+ /* Lane 2: TSO id. We choose a general purpose TSO (id=0..63) using the
+ * current QPU index and thread index to ensure we get a unique one for
+ * this group of invocations in this core.
+ */
+ struct qreg tso_id =
+ vir_AND(c, vir_TIDX(c), vir_uniform_ui(c, 0x0000003f));
+ vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), eidx, vir_uniform_ui(c, 2)),
+ V3D_QPU_PF_PUSHZ);
+ vir_MOV_cond(c, V3D_QPU_COND_IFA, tsy_conf, tso_id);
+
+ /* Lane 3: TSY opcode (set_quorum_wait_inc_check) */
+ struct qreg tsy_op = vir_uniform_ui(c, 16);
+ vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), eidx, vir_uniform_ui(c, 3)),
+ V3D_QPU_PF_PUSHZ);
+ vir_MOV_cond(c, V3D_QPU_COND_IFA, tsy_conf, tsy_op);
+
+ /* Emit TSY sync */
+ vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_SYNCB), tsy_conf);
+}

 static void
 ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
 {
 switch (instr->intrinsic) {
+ case nir_intrinsic_decl_reg:
+ case nir_intrinsic_load_reg:
+ case nir_intrinsic_store_reg:
+ break; /* Ignore these */
+
 case nir_intrinsic_load_uniform:
 ntq_emit_load_uniform(c, instr);
 break;

+ case nir_intrinsic_load_global_2x32:
+ ntq_emit_tmu_general(c, instr, false, true);
+ c->has_general_tmu_load = true;
+ break;
+
 case nir_intrinsic_load_ubo:
- if (!nir_src_is_divergent(instr->src[1]))
- ntq_emit_load_ubo_unifa(c, instr);
- else
- ntq_emit_tmu_general(c, instr, false);
- break;
-
- case nir_intrinsic_ssbo_atomic_add:
- case nir_intrinsic_ssbo_atomic_imin:
- case nir_intrinsic_ssbo_atomic_umin:
- case nir_intrinsic_ssbo_atomic_imax:
- case nir_intrinsic_ssbo_atomic_umax:
- case nir_intrinsic_ssbo_atomic_and:
- case nir_intrinsic_ssbo_atomic_or:
- case nir_intrinsic_ssbo_atomic_xor:
- case nir_intrinsic_ssbo_atomic_exchange:
- case nir_intrinsic_ssbo_atomic_comp_swap:
+ if (ntq_emit_inline_ubo_load(c, instr))
+ break;
+ FALLTHROUGH;
 case nir_intrinsic_load_ssbo:
+ if (!ntq_emit_load_unifa(c, instr)) {
+ ntq_emit_tmu_general(c, instr, false, false);
+ c->has_general_tmu_load = true;
+ }
+ break;
+
 case nir_intrinsic_store_ssbo:
- ntq_emit_tmu_general(c, instr, false);
- break;
-
- case nir_intrinsic_shared_atomic_add:
- case nir_intrinsic_shared_atomic_imin:
- case nir_intrinsic_shared_atomic_umin:
- case nir_intrinsic_shared_atomic_imax:
- case nir_intrinsic_shared_atomic_umax:
- case nir_intrinsic_shared_atomic_and:
- case nir_intrinsic_shared_atomic_or:
- case nir_intrinsic_shared_atomic_xor:
- case nir_intrinsic_shared_atomic_exchange:
- case nir_intrinsic_shared_atomic_comp_swap:
- case nir_intrinsic_load_shared:
+ case nir_intrinsic_ssbo_atomic:
+ case nir_intrinsic_ssbo_atomic_swap:
+ ntq_emit_tmu_general(c, instr, false, false);
+ break;
+
+ case
nir_intrinsic_store_global_2x32: + case nir_intrinsic_global_atomic_2x32: + case nir_intrinsic_global_atomic_swap_2x32: + ntq_emit_tmu_general(c, instr, false, true); + break; + + case nir_intrinsic_shared_atomic: + case nir_intrinsic_shared_atomic_swap: case nir_intrinsic_store_shared: - case nir_intrinsic_load_scratch: case nir_intrinsic_store_scratch: - ntq_emit_tmu_general(c, instr, true); + ntq_emit_tmu_general(c, instr, true, false); + break; + + case nir_intrinsic_load_scratch: + case nir_intrinsic_load_shared: + ntq_emit_tmu_general(c, instr, true, false); + c->has_general_tmu_load = true; break; - case nir_intrinsic_image_load: case nir_intrinsic_image_store: - case nir_intrinsic_image_atomic_add: - case nir_intrinsic_image_atomic_imin: - case nir_intrinsic_image_atomic_umin: - case nir_intrinsic_image_atomic_imax: - case nir_intrinsic_image_atomic_umax: - case nir_intrinsic_image_atomic_and: - case nir_intrinsic_image_atomic_or: - case nir_intrinsic_image_atomic_xor: - case nir_intrinsic_image_atomic_exchange: - case nir_intrinsic_image_atomic_comp_swap: - v3d40_vir_emit_image_load_store(c, instr); + case nir_intrinsic_image_atomic: + case nir_intrinsic_image_atomic_swap: + v3d_vir_emit_image_load_store(c, instr); + break; + + case nir_intrinsic_image_load: + v3d_vir_emit_image_load_store(c, instr); + /* Not really a general TMU load, but we only use this flag + * for NIR scheduling and we do schedule these under the same + * policy as general TMU. + */ + c->has_general_tmu_load = true; break; case nir_intrinsic_get_ssbo_size: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_GET_SSBO_SIZE, - nir_src_comp_as_uint(instr->src[0], 0))); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_GET_SSBO_SIZE, + nir_src_comp_as_uint(instr->src[0], 0))); break; case nir_intrinsic_get_ubo_size: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_GET_UBO_SIZE, - nir_src_comp_as_uint(instr->src[0], 0))); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_GET_UBO_SIZE, + nir_src_comp_as_uint(instr->src[0], 0))); break; case nir_intrinsic_load_user_clip_plane: for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) { - ntq_store_dest(c, &instr->dest, i, - vir_uniform(c, QUNIFORM_USER_CLIP_PLANE, - nir_intrinsic_ucp_id(instr) * - 4 + i)); + ntq_store_def(c, &instr->def, i, + vir_uniform(c, QUNIFORM_USER_CLIP_PLANE, + nir_intrinsic_ucp_id(instr) * + 4 + i)); } break; case nir_intrinsic_load_viewport_x_scale: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0)); break; case nir_intrinsic_load_viewport_y_scale: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0)); break; case nir_intrinsic_load_viewport_z_scale: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0)); break; case nir_intrinsic_load_viewport_z_offset: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0)); break; case nir_intrinsic_load_line_coord: - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->line_x)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->line_x)); break; case nir_intrinsic_load_line_width: - ntq_store_dest(c, 
&instr->dest, 0, - vir_uniform(c, QUNIFORM_LINE_WIDTH, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_LINE_WIDTH, 0)); break; case nir_intrinsic_load_aa_line_width: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0)); break; case nir_intrinsic_load_sample_mask_in: - ntq_store_dest(c, &instr->dest, 0, vir_MSF(c)); + ntq_store_def(c, &instr->def, 0, vir_MSF(c)); break; case nir_intrinsic_load_helper_invocation: vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ); struct qreg qdest = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA); - ntq_store_dest(c, &instr->dest, 0, qdest); + ntq_store_def(c, &instr->def, 0, qdest); break; case nir_intrinsic_load_front_face: /* The register contains 0 (front) or 1 (back), and we need to * turn it into a NIR bool where true means front. */ - ntq_store_dest(c, &instr->dest, 0, - vir_ADD(c, - vir_uniform_ui(c, -1), - vir_REVF(c))); + ntq_store_def(c, &instr->def, 0, + vir_ADD(c, + vir_uniform_ui(c, -1), + vir_REVF(c))); break; case nir_intrinsic_load_base_instance: - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->biid)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->biid)); break; case nir_intrinsic_load_instance_id: - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->iid)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->iid)); break; case nir_intrinsic_load_vertex_id: - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->vid)); + break; + + case nir_intrinsic_load_draw_id: + ntq_store_def(c, &instr->def, 0, vir_uniform(c, QUNIFORM_DRAW_ID, 0)); break; case nir_intrinsic_load_tlb_color_v3d: vir_emit_tlb_color_read(c, instr); break; + case nir_intrinsic_load_fep_w_v3d: + ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->payload_w)); + break; + case nir_intrinsic_load_input: ntq_emit_load_input(c, instr); break; @@ -2978,7 +3578,19 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) ntq_emit_image_size(c, instr); break; + /* FIXME: the Vulkan and SPIR-V specs specify that OpTerminate (which + * is intended to match the semantics of GLSL's discard) should + * terminate the invocation immediately. Our implementation doesn't + * do that. What we do is actually a demote by removing the invocations + * from the sample mask. Maybe we could be more strict and force an + * early termination by emitting a (maybe conditional) jump to the + * end section of the fragment shader for affected invocations. 
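[Editor's note: a minimal sketch of the demote semantics the FIXME above describes; the helper and its parameters are hypothetical, not driver code:]

    /* demote: the lane keeps executing (so derivatives in its quad stay
     * valid) but stops covering samples. A strict OpTerminate would, in
     * addition, end the lane's execution immediately. */
    static void
    demote_lane(uint32_t *sample_mask, uint32_t lane_sample_bits)
    {
            *sample_mask &= ~lane_sample_bits;
    }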
+ */ case nir_intrinsic_discard: + case nir_intrinsic_terminate: + c->emitted_discard = true; + FALLTHROUGH; + case nir_intrinsic_demote: ntq_flush_tmu(c); if (vir_in_nonuniform_control_flow(c)) { @@ -2993,7 +3605,11 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) } break; - case nir_intrinsic_discard_if: { + case nir_intrinsic_discard_if: + case nir_intrinsic_terminate_if: + c->emitted_discard = true; + FALLTHROUGH; + case nir_intrinsic_demote_if: { ntq_flush_tmu(c); enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, instr->src[0]); @@ -3011,102 +3627,80 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(), vir_uniform_ui(c, 0)), cond); - break; } - case nir_intrinsic_memory_barrier: - case nir_intrinsic_memory_barrier_buffer: - case nir_intrinsic_memory_barrier_image: - case nir_intrinsic_memory_barrier_shared: - case nir_intrinsic_memory_barrier_tcs_patch: - case nir_intrinsic_group_memory_barrier: - /* We don't do any instruction scheduling of these NIR - * instructions between each other, so we just need to make - * sure that the TMU operations before the barrier are flushed + case nir_intrinsic_barrier: + /* Ensure that the TMU operations before the barrier are flushed * before the ones after the barrier. */ ntq_flush_tmu(c); - break; - - case nir_intrinsic_control_barrier: - /* Emit a TSY op to get all invocations in the workgroup - * (actually supergroup) to block until the last invocation - * reaches the TSY op. - */ - ntq_flush_tmu(c); - if (c->devinfo->ver >= 42) { - vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC, - V3D_QPU_WADDR_SYNCB)); - } else { - struct qinst *sync = - vir_BARRIERID_dest(c, - vir_reg(QFILE_MAGIC, - V3D_QPU_WADDR_SYNCU)); - sync->uniform = - vir_get_uniform_index(c, QUNIFORM_CONSTANT, - 0xffffff00 | - V3D_TSY_WAIT_INC_CHECK); + if (nir_intrinsic_execution_scope(instr) != SCOPE_NONE) { + if (c->s->info.stage == MESA_SHADER_COMPUTE) + emit_compute_barrier(c); + else + emit_barrier(c); + /* The blocking of a TSY op only happens at the next + * thread switch. No texturing may be outstanding at the + * time of a TSY blocking operation. + */ + vir_emit_thrsw(c); } - - /* The blocking of a TSY op only happens at the next thread - * switch. No texturing may be outstanding at the time of a - * TSY blocking operation. 
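[Editor's note: a scalar view of the per-lane TSY configuration vector that emit_barrier() earlier in this patch assembles; the 16-lane width and thread_index value are illustrative assumptions:]

    uint32_t thread_index = 0; /* assumed TIDX value */
    uint32_t tsy_conf[16];
    for (int lane = 0; lane < 16; lane++)
            tsy_conf[lane] = 1; /* lane 0 is read as the quorum */
    tsy_conf[2] = thread_index & 0x3f; /* general-purpose TSO id */
    tsy_conf[3] = 16; /* set_quorum_wait_inc_check opcode */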
- */ - vir_emit_thrsw(c); break; case nir_intrinsic_load_num_workgroups: for (int i = 0; i < 3; i++) { - ntq_store_dest(c, &instr->dest, i, - vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS, - i)); + ntq_store_def(c, &instr->def, i, + vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS, + i)); } break; + case nir_intrinsic_load_workgroup_id_zero_base: case nir_intrinsic_load_workgroup_id: { struct qreg x = vir_AND(c, c->cs_payload[0], vir_uniform_ui(c, 0xffff)); + ntq_store_def(c, &instr->def, 0, x); struct qreg y = vir_SHR(c, c->cs_payload[0], vir_uniform_ui(c, 16)); + ntq_store_def(c, &instr->def, 1, y); struct qreg z = vir_AND(c, c->cs_payload[1], vir_uniform_ui(c, 0xffff)); + ntq_store_def(c, &instr->def, 2, z); + break; + } - /* We only support dispatch base in Vulkan */ - if (c->key->environment == V3D_ENVIRONMENT_VULKAN) { - x = vir_ADD(c, x, - vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 0)); - y = vir_ADD(c, y, - vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 1)); - z = vir_ADD(c, z, - vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 2)); - } + case nir_intrinsic_load_base_workgroup_id: { + struct qreg x = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 0); + ntq_store_def(c, &instr->def, 0, x); - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, x)); - ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, y)); - ntq_store_dest(c, &instr->dest, 2, vir_MOV(c, z)); + struct qreg y = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 1); + ntq_store_def(c, &instr->def, 1, y); + + struct qreg z = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 2); + ntq_store_def(c, &instr->def, 2, z); break; } case nir_intrinsic_load_local_invocation_index: - ntq_store_dest(c, &instr->dest, 0, - emit_load_local_invocation_index(c)); + ntq_store_def(c, &instr->def, 0, + emit_load_local_invocation_index(c)); break; case nir_intrinsic_load_subgroup_id: { /* This is basically the batch index, which is the Local * Invocation Index divided by the SIMD width). */ - STATIC_ASSERT(util_is_power_of_two_nonzero(V3D_CHANNELS)); + STATIC_ASSERT(IS_POT(V3D_CHANNELS) && V3D_CHANNELS > 0); const uint32_t divide_shift = ffs(V3D_CHANNELS) - 1; struct qreg lii = emit_load_local_invocation_index(c); - ntq_store_dest(c, &instr->dest, 0, - vir_SHR(c, lii, - vir_uniform_ui(c, divide_shift))); + ntq_store_def(c, &instr->def, 0, + vir_SHR(c, lii, + vir_uniform_ui(c, divide_shift))); break; } @@ -3143,8 +3737,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) struct qreg col = ntq_get_src(c, instr->src[0], 0); for (int i = 0; i < instr->num_components; i++) { struct qreg row = vir_uniform_ui(c, row_idx++); - ntq_store_dest(c, &instr->dest, i, - vir_LDVPMG_IN(c, row, col)); + ntq_store_def(c, &instr->def, i, + vir_LDVPMG_IN(c, row, col)); } break; } @@ -3160,47 +3754,47 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) * using ldvpm(v,d)_in (See Table 71). 
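[Editor's note: the cs_payload unpacking in the load_workgroup_id hunk above, restated as a plain scalar helper; a sketch of the packing scheme, not driver code:]

    /* gl_WorkGroupID is packed into two 32-bit payload registers:
     * payload0 = (y << 16) | x, and payload1 carries z in its low half. */
    static void
    unpack_workgroup_id(uint32_t payload0, uint32_t payload1, uint32_t wgid[3])
    {
            wgid[0] = payload0 & 0xffff; /* x */
            wgid[1] = payload0 >> 16;    /* y */
            wgid[2] = payload1 & 0xffff; /* z */
    }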
*/ assert(c->s->info.stage == MESA_SHADER_GEOMETRY); - ntq_store_dest(c, &instr->dest, 0, - vir_LDVPMV_IN(c, vir_uniform_ui(c, 0))); + ntq_store_def(c, &instr->def, 0, + vir_LDVPMV_IN(c, vir_uniform_ui(c, 0))); break; } case nir_intrinsic_load_invocation_id: - ntq_store_dest(c, &instr->dest, 0, vir_IID(c)); + ntq_store_def(c, &instr->def, 0, vir_IID(c)); break; case nir_intrinsic_load_fb_layers_v3d: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_FB_LAYERS, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_FB_LAYERS, 0)); break; case nir_intrinsic_load_sample_id: - ntq_store_dest(c, &instr->dest, 0, vir_SAMPID(c)); + ntq_store_def(c, &instr->def, 0, vir_SAMPID(c)); break; case nir_intrinsic_load_sample_pos: - ntq_store_dest(c, &instr->dest, 0, - vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)))); - ntq_store_dest(c, &instr->dest, 1, - vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)))); + ntq_store_def(c, &instr->def, 0, + vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)))); + ntq_store_def(c, &instr->def, 1, + vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)))); break; case nir_intrinsic_load_barycentric_at_offset: - ntq_store_dest(c, &instr->dest, 0, - vir_MOV(c, ntq_get_src(c, instr->src[0], 0))); - ntq_store_dest(c, &instr->dest, 1, - vir_MOV(c, ntq_get_src(c, instr->src[0], 1))); + ntq_store_def(c, &instr->def, 0, + vir_MOV(c, ntq_get_src(c, instr->src[0], 0))); + ntq_store_def(c, &instr->def, 1, + vir_MOV(c, ntq_get_src(c, instr->src[0], 1))); break; case nir_intrinsic_load_barycentric_pixel: - ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f)); - ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f)); + ntq_store_def(c, &instr->def, 0, vir_uniform_f(c, 0.0f)); + ntq_store_def(c, &instr->def, 1, vir_uniform_f(c, 0.0f)); break; case nir_intrinsic_load_barycentric_at_sample: { if (!c->fs_key->msaa) { - ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f)); - ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f)); + ntq_store_def(c, &instr->def, 0, vir_uniform_f(c, 0.0f)); + ntq_store_def(c, &instr->def, 1, vir_uniform_f(c, 0.0f)); return; } @@ -3208,8 +3802,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) struct qreg sample_idx = ntq_get_src(c, instr->src[0], 0); ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y); - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x)); - ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, offset_x)); + ntq_store_def(c, &instr->def, 1, vir_MOV(c, offset_y)); break; } @@ -3219,18 +3813,18 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) struct qreg offset_y = vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c))); - ntq_store_dest(c, &instr->dest, 0, - vir_FSUB(c, offset_x, vir_uniform_f(c, 0.5f))); - ntq_store_dest(c, &instr->dest, 1, - vir_FSUB(c, offset_y, vir_uniform_f(c, 0.5f))); + ntq_store_def(c, &instr->def, 0, + vir_FSUB(c, offset_x, vir_uniform_f(c, 0.5f))); + ntq_store_def(c, &instr->def, 1, + vir_FSUB(c, offset_y, vir_uniform_f(c, 0.5f))); break; } case nir_intrinsic_load_barycentric_centroid: { struct qreg offset_x, offset_y; ntq_get_barycentric_centroid(c, &offset_x, &offset_y); - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x)); - ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, offset_x)); + ntq_store_def(c, &instr->def, 1, vir_MOV(c, offset_y)); break; } @@ -3249,8 +3843,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, 
nir_intrinsic_instr *instr) */ if (!c->fs_key->msaa || c->interp[input_idx].vp.file == QFILE_NULL) { - ntq_store_dest(c, &instr->dest, i, - vir_MOV(c, c->inputs[input_idx])); + ntq_store_def(c, &instr->def, i, + vir_MOV(c, c->inputs[input_idx])); continue; } @@ -3268,30 +3862,150 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) ntq_emit_load_interpolated_input(c, p, C, offset_x, offset_y, interp_mode); - ntq_store_dest(c, &instr->dest, i, result); + ntq_store_def(c, &instr->def, i, result); } break; } case nir_intrinsic_load_subgroup_size: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform_ui(c, V3D_CHANNELS)); + ntq_store_def(c, &instr->def, 0, + vir_uniform_ui(c, V3D_CHANNELS)); break; case nir_intrinsic_load_subgroup_invocation: - ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c)); + ntq_store_def(c, &instr->def, 0, vir_EIDX(c)); break; case nir_intrinsic_elect: { - set_a_flags_for_subgroup(c); - struct qreg first = vir_FLAFIRST(c); + struct qreg first; + if (vir_in_nonuniform_control_flow(c)) { + /* Sets A=1 for lanes enabled in the execution mask */ + vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_PF_PUSHZ); + /* Updates A ANDing with lanes enabled in MSF */ + vir_set_uf(c, vir_MSF_dest(c, vir_nop_reg()), + V3D_QPU_UF_ANDNZ); + first = vir_FLAFIRST(c); + } else { + /* Sets A=1 for inactive lanes */ + vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), + V3D_QPU_PF_PUSHZ); + first = vir_FLNAFIRST(c); + } - /* Produce a boolean result from Flafirst */ + /* Produce a boolean result */ vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), first, vir_uniform_ui(c, 1)), V3D_QPU_PF_PUSHZ); struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA); - ntq_store_dest(c, &instr->dest, 0, result); + ntq_store_def(c, &instr->def, 0, result); + break; + } + + case nir_intrinsic_ballot: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c); + struct qreg res = vir_get_temp(c); + vir_set_cond(vir_BALLOT_dest(c, res, value), cond); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, res)); + break; + } + + case nir_intrinsic_read_invocation: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + struct qreg index = ntq_get_src(c, instr->src[1], 0); + struct qreg res = vir_SHUFFLE(c, value, index); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, res)); + break; + } + + case nir_intrinsic_read_first_invocation: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c); + struct qreg res = vir_get_temp(c); + vir_set_cond(vir_BCASTF_dest(c, res, value), cond); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, res)); + break; + } + + case nir_intrinsic_shuffle: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + struct qreg indices = ntq_get_src(c, instr->src[1], 0); + struct qreg res = vir_SHUFFLE(c, value, indices); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, res)); + break; + } + + case nir_intrinsic_vote_feq: + case nir_intrinsic_vote_ieq: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c); + struct qreg res = vir_get_temp(c); + vir_set_cond(instr->intrinsic == nir_intrinsic_vote_ieq ? 
+ vir_ALLEQ_dest(c, res, value) :
+ vir_ALLFEQ_dest(c, res, value),
+ cond);
+
+ /* Produce boolean result */
+ vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
+ V3D_QPU_PF_PUSHZ);
+ struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFNA);
+ ntq_store_def(c, &instr->def, 0, result);
+ break;
+ }
+
+ case nir_intrinsic_vote_all: {
+ assert(c->devinfo->ver >= 71);
+ struct qreg value = ntq_get_src(c, instr->src[0], 0);
+ enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
+ struct qreg res = vir_get_temp(c);
+ vir_set_cond(vir_ALLEQ_dest(c, res, value), cond);
+
+ /* We want to check if 'all lanes are equal (alleq != 0) and
+ * their value is True (value != 0)'.
+ *
+ * The first MOV.pushz generates predicate for 'alleq == 0'.
+ * The second MOV.NORZ generates predicate for:
+ * '!(alleq == 0) & !(value == 0)'.
+ */
+ vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
+ V3D_QPU_PF_PUSHZ);
+ vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), value),
+ V3D_QPU_UF_NORZ);
+ struct qreg result =
+ ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
+ ntq_store_def(c, &instr->def, 0, result);
+ break;
+ }
+
+ case nir_intrinsic_vote_any: {
+ assert(c->devinfo->ver >= 71);
+ struct qreg value = ntq_get_src(c, instr->src[0], 0);
+ enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
+ struct qreg res = vir_get_temp(c);
+ vir_set_cond(vir_ALLEQ_dest(c, res, value), cond);
+
+ /* We want to check 'not (all lanes are equal (alleq != 0)
+ * and their value is False (value == 0))'.
+ *
+ * The first MOV.pushz generates predicate for 'alleq == 0'.
+ * The second MOV.NORNZ generates predicate for:
+ * '!(alleq == 0) & (value == 0)'.
+ * The IFNA condition negates the predicate when evaluated:
+ * '!(!(alleq == 0) & (value == 0))'.
+ */
+ vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
+ V3D_QPU_PF_PUSHZ);
+ vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), value),
+ V3D_QPU_UF_NORNZ);
+ struct qreg result =
+ ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFNA);
+ ntq_store_def(c, &instr->def, 0, result);
 break;
 }

@@ -3300,8 +4014,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
 break;

 case nir_intrinsic_load_view_index:
- ntq_store_dest(c, &instr->dest, 0,
- vir_uniform(c, QUNIFORM_VIEW_INDEX, 0));
+ ntq_store_def(c, &instr->def, 0,
+ vir_uniform(c, QUNIFORM_VIEW_INDEX, 0));
 break;

 default:
@@ -3329,6 +4043,36 @@ ntq_activate_execute_for_block(struct v3d_compile *c)
 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
 }

+static bool
+is_cheap_block(nir_block *block)
+{
+ int32_t cost = 3;
+ nir_foreach_instr(instr, block) {
+ switch (instr->type) {
+ case nir_instr_type_alu:
+ case nir_instr_type_undef:
+ case nir_instr_type_load_const:
+ if (--cost <= 0)
+ return false;
+ break;
+ case nir_instr_type_intrinsic: {
+ nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+ switch (intr->intrinsic) {
+ case nir_intrinsic_decl_reg:
+ case nir_intrinsic_load_reg:
+ case nir_intrinsic_store_reg:
+ continue;
+ default:
+ return false;
+ }
+ }
+ default:
+ return false;
+ }
+ }
+ return true;
+}
+
 static void
 ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt)
 {
@@ -3473,15 +4217,27 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
 c->execute,
 vir_uniform_ui(c, else_block->index));

- /* Jump to ELSE if nothing is active for THEN, otherwise fall
- * through.
+ /* Set the flags for taking the THEN block */ + vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_PF_PUSHZ); + + /* Jump to ELSE if nothing is active for THEN (unless THEN block is + * so small it won't pay off), otherwise fall through. */ - vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ); - vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA); - vir_link_blocks(c->cur_block, else_block); + bool is_cheap = exec_list_is_singular(&if_stmt->then_list) && + is_cheap_block(nir_if_first_then_block(if_stmt)); + if (!is_cheap) { + vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA); + vir_link_blocks(c->cur_block, else_block); + } vir_link_blocks(c->cur_block, then_block); - /* Process the THEN block. */ + /* Process the THEN block. + * + * Notice we don't call ntq_activate_execute_for_block here on purpose: + * c->execute is already set up to be 0 for lanes that must take the + * THEN block. + */ vir_set_emit_block(c, then_block); ntq_emit_cf_list(c, &if_stmt->then_list); @@ -3495,13 +4251,19 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt) vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, after_block->index)); - /* If everything points at ENDIF, then jump there immediately. */ - vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), - c->execute, - vir_uniform_ui(c, after_block->index)), - V3D_QPU_PF_PUSHZ); - vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA); - vir_link_blocks(c->cur_block, after_block); + /* If everything points at ENDIF, then jump there immediately + * (unless ELSE block is so small it won't pay off). + */ + bool is_cheap = exec_list_is_singular(&if_stmt->else_list) && + is_cheap_block(nir_else_block); + if (!is_cheap) { + vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), + c->execute, + vir_uniform_ui(c, after_block->index)), + V3D_QPU_PF_PUSHZ); + vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA); + vir_link_blocks(c->cur_block, after_block); + } vir_link_blocks(c->cur_block, else_block); vir_set_emit_block(c, else_block); @@ -3605,7 +4367,7 @@ ntq_emit_instr(struct v3d_compile *c, nir_instr *instr) ntq_emit_load_const(c, nir_instr_as_load_const(instr)); break; - case nir_instr_type_ssa_undef: + case nir_instr_type_undef: unreachable("Should've been lowered by nir_lower_undef_to_zero"); break; @@ -3699,7 +4461,6 @@ ntq_emit_nonuniform_loop(struct v3d_compile *c, nir_loop *loop) static void ntq_emit_uniform_loop(struct v3d_compile *c, nir_loop *loop) { - c->loop_cont_block = vir_new_block(c); c->loop_break_block = vir_new_block(c); @@ -3719,6 +4480,25 @@ ntq_emit_uniform_loop(struct v3d_compile *c, nir_loop *loop) static void ntq_emit_loop(struct v3d_compile *c, nir_loop *loop) { + assert(!nir_loop_has_continue_construct(loop)); + + /* Disable flags optimization for loop conditions. The problem here is + * that we can have code like this: + * + * // block_0 + * vec1 32 con ssa_9 = ine32 ssa_8, ssa_2 + * loop { + * // block_1 + * if ssa_9 { + * + * In this example we emit flags to compute ssa_9 and the optimization + * will skip regenerating them again for the loop condition in the + * loop continue block (block_1). However, this is not safe after the + * first iteration because the loop body can stomp the flags if it has + * any conditionals. 
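[Editor's note: condensing the is_cheap_block() heuristic above: register bookkeeping intrinsics are ignored, any other intrinsic or instruction type disqualifies the block, and the third ALU/load_const/undef instruction makes `--cost <= 0` fire, so at most two such instructions qualify. A scalar model of that threshold:]

    /* Equivalent predicate to is_cheap_block(), assuming the instruction
     * counts have been gathered separately (editor's sketch). */
    static bool
    is_cheap_model(int num_alu_like, int num_other_instrs)
    {
            return num_other_instrs == 0 && num_alu_like <= 2;
    }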
+ */ + c->flags_temp = -1; + bool was_in_control_flow = c->in_control_flow; c->in_control_flow = true; @@ -3777,7 +4557,7 @@ ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list) static void ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl) { - ntq_setup_registers(c, &impl->registers); + ntq_setup_registers(c, impl); ntq_emit_cf_list(c, &impl->body); } @@ -3786,7 +4566,12 @@ nir_to_vir(struct v3d_compile *c) { switch (c->s->info.stage) { case MESA_SHADER_FRAGMENT: - c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0)); + c->start_msf = vir_MSF(c); + if (c->devinfo->ver < 71) + c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0)); + else + c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 3)); + c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1)); c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2)); @@ -3799,25 +4584,16 @@ nir_to_vir(struct v3d_compile *c) emit_fragment_varying(c, NULL, -1, 0, 0); } - if (c->fs_key->is_points && - (c->devinfo->ver < 40 || program_reads_point_coord(c))) { + if (c->fs_key->is_points && program_reads_point_coord(c)) { c->point_x = emit_fragment_varying(c, NULL, -1, 0, 0); c->point_y = emit_fragment_varying(c, NULL, -1, 0, 0); c->uses_implicit_point_line_varyings = true; } else if (c->fs_key->is_lines && - (c->devinfo->ver < 40 || - BITSET_TEST(c->s->info.system_values_read, + (BITSET_TEST(c->s->info.system_values_read, SYSTEM_VALUE_LINE_COORD))) { c->line_x = emit_fragment_varying(c, NULL, -1, 0, 0); c->uses_implicit_point_line_varyings = true; } - - c->force_per_sample_msaa = - c->s->info.fs.uses_sample_qualifier || - BITSET_TEST(c->s->info.system_values_read, - SYSTEM_VALUE_SAMPLE_ID) || - BITSET_TEST(c->s->info.system_values_read, - SYSTEM_VALUE_SAMPLE_POS); break; case MESA_SHADER_COMPUTE: /* Set up the TSO for barriers, assuming we do some. */ @@ -3826,8 +4602,13 @@ nir_to_vir(struct v3d_compile *c) V3D_QPU_WADDR_SYNC)); } - c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0)); - c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); + if (c->devinfo->ver == 42) { + c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0)); + c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); + } else if (c->devinfo->ver >= 71) { + c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 3)); + c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); + } /* Set up the division between gl_LocalInvocationIndex and * wg_in_mem in the payload reg. @@ -3889,7 +4670,7 @@ nir_to_vir(struct v3d_compile *c) /* Find the main function and emit the body. */ nir_foreach_function(function, c->s) { - assert(strcmp(function->name, "main") == 0); + assert(function->is_entrypoint); assert(function->impl); ntq_emit_impl(c, function->impl); } @@ -3932,25 +4713,12 @@ vir_emit_last_thrsw(struct v3d_compile *c, { *restore_last_thrsw = c->last_thrsw; - /* On V3D before 4.1, we need a TMU op to be outstanding when thread - * switching, so disable threads if we didn't do any TMU ops (each of - * which would have emitted a THRSW). - */ - if (!c->last_thrsw_at_top_level && c->devinfo->ver < 41) { - c->threads = 1; - if (c->last_thrsw) - vir_remove_thrsw(c); - *restore_last_thrsw = NULL; - } - /* If we're threaded and the last THRSW was in conditional code, then * we need to emit another one so that we can flag it as the last * thrsw. 
*/ - if (c->last_thrsw && !c->last_thrsw_at_top_level) { - assert(c->devinfo->ver >= 41); + if (c->last_thrsw && !c->last_thrsw_at_top_level) vir_emit_thrsw(c); - } /* If we're threaded, then we need to mark the last THRSW instruction * so we can emit a pair of them at QPU emit time. @@ -3958,10 +4726,8 @@ vir_emit_last_thrsw(struct v3d_compile *c, * For V3D 4.x, we can spawn the non-fragment shaders already in the * post-last-THRSW state, so we can skip this. */ - if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) { - assert(c->devinfo->ver >= 41); + if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) vir_emit_thrsw(c); - } /* If we have not inserted a last thread switch yet, do it now to ensure * any potential spilling we do happens before this. If we don't spill @@ -4006,8 +4772,8 @@ vir_check_payload_w(struct v3d_compile *c) vir_for_each_inst_inorder(inst, c) { for (int i = 0; i < vir_get_nsrc(inst); i++) { - if (inst->src[i].file == QFILE_REG && - inst->src[i].index == 0) { + if (inst->src[i].file == c->payload_w.file && + inst->src[i].index == c->payload_w.index) { c->uses_center_w = true; return; } @@ -4018,8 +4784,8 @@ vir_check_payload_w(struct v3d_compile *c) void v3d_nir_to_vir(struct v3d_compile *c) { - if (V3D_DEBUG & (V3D_DEBUG_NIR | - v3d_debug_flag_for_shader_stage(c->s->info.stage))) { + if (V3D_DBG(NIR) || + v3d_debug_flag_for_shader_stage(c->s->info.stage)) { fprintf(stderr, "%s prog %d/%d NIR:\n", vir_get_stage_name(c), c->program_id, c->variant_id); @@ -4053,8 +4819,8 @@ v3d_nir_to_vir(struct v3d_compile *c) unreachable("bad stage"); } - if (V3D_DEBUG & (V3D_DEBUG_VIR | - v3d_debug_flag_for_shader_stage(c->s->info.stage))) { + if (V3D_DBG(VIR) || + v3d_debug_flag_for_shader_stage(c->s->info.stage)) { fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n", vir_get_stage_name(c), c->program_id, c->variant_id); @@ -4075,8 +4841,8 @@ v3d_nir_to_vir(struct v3d_compile *c) * instructions until the results are needed. */ - if (V3D_DEBUG & (V3D_DEBUG_VIR | - v3d_debug_flag_for_shader_stage(c->s->info.stage))) { + if (V3D_DBG(VIR) || + v3d_debug_flag_for_shader_stage(c->s->info.stage)) { fprintf(stderr, "%s prog %d/%d VIR:\n", vir_get_stage_name(c), c->program_id, c->variant_id); @@ -4087,19 +4853,17 @@ v3d_nir_to_vir(struct v3d_compile *c) /* Attempt to allocate registers for the temporaries. If we fail, * reduce thread count and try again. */ - int min_threads = (c->devinfo->ver >= 41) ? 2 : 1; + int min_threads = 2; struct qpu_reg *temp_registers; while (true) { - bool spilled; - temp_registers = v3d_register_allocate(c, &spilled); - if (spilled) - continue; - - if (temp_registers) + temp_registers = v3d_register_allocate(c); + if (temp_registers) { + assert(c->spills + c->fills <= c->max_tmu_spills); break; + } if (c->threads == min_threads && - (V3D_DEBUG & V3D_DEBUG_RA)) { + V3D_DBG(RA)) { fprintf(stderr, "Failed to register allocate using %s\n", c->fallback_scheduler ? 
"the fallback scheduler:" : @@ -4116,18 +4880,20 @@ v3d_nir_to_vir(struct v3d_compile *c) } if (c->threads <= MAX2(c->min_threads_for_reg_alloc, min_threads)) { - if (V3D_DEBUG & V3D_DEBUG_PERF) { + if (V3D_DBG(PERF)) { fprintf(stderr, - "Failed to register allocate %s at " - "%d threads.\n", vir_get_stage_name(c), - c->threads); + "Failed to register allocate %s " + "prog %d/%d at %d threads.\n", + vir_get_stage_name(c), + c->program_id, c->variant_id, c->threads); } c->compilation_result = V3D_COMPILATION_FAILED_REGISTER_ALLOCATION; return; } - c->spill_count = 0; + c->spills = 0; + c->fills = 0; c->threads /= 2; if (c->threads == 1) @@ -4141,8 +4907,8 @@ v3d_nir_to_vir(struct v3d_compile *c) vir_restore_last_thrsw(c, restore_last_thrsw, restore_scoreboard_lock); if (c->spills && - (V3D_DEBUG & (V3D_DEBUG_VIR | - v3d_debug_flag_for_shader_stage(c->s->info.stage)))) { + (V3D_DBG(VIR) || + v3d_debug_flag_for_shader_stage(c->s->info.stage))) { fprintf(stderr, "%s prog %d/%d spilled VIR:\n", vir_get_stage_name(c), c->program_id, c->variant_id); |