Diffstat (limited to 'src/broadcom/compiler/nir_to_vir.c')
 -rw-r--r--  src/broadcom/compiler/nir_to_vir.c | 1994
 1 file changed, 1380 insertions(+), 614 deletions(-)
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c index d0a89f1a7d4..0303ca96103 100644 --- a/src/broadcom/compiler/nir_to_vir.c +++ b/src/broadcom/compiler/nir_to_vir.c @@ -38,7 +38,7 @@ #define __gen_address_type uint32_t #define __gen_address_offset(reloc) (*reloc) #define __gen_emit_reloc(cl, reloc) -#include "cle/v3d_packet_v41_pack.h" +#include "cle/v3d_packet_v42_pack.h" #define GENERAL_TMU_LOOKUP_PER_QUAD (0 << 7) #define GENERAL_TMU_LOOKUP_PER_PIXEL (1 << 7) @@ -164,7 +164,7 @@ vir_emit_thrsw(struct v3d_compile *c) c->last_thrsw->qpu.sig.thrsw = true; c->last_thrsw_at_top_level = !c->in_control_flow; - /* We need to lock the scoreboard before any tlb acess happens. If this + /* We need to lock the scoreboard before any tlb access happens. If this * thread switch comes after we have emitted a tlb load, then it means * that we can't lock on the last thread switch any more. */ @@ -187,6 +187,28 @@ v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src) } static uint32_t +v3d_general_tmu_op_for_atomic(nir_intrinsic_instr *instr) +{ + nir_atomic_op atomic_op = nir_intrinsic_atomic_op(instr); + switch (atomic_op) { + case nir_atomic_op_iadd: + return instr->intrinsic == nir_intrinsic_ssbo_atomic ? + v3d_get_op_for_atomic_add(instr, 2) : + v3d_get_op_for_atomic_add(instr, 1); + case nir_atomic_op_imin: return V3D_TMU_OP_WRITE_SMIN; + case nir_atomic_op_umin: return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR; + case nir_atomic_op_imax: return V3D_TMU_OP_WRITE_SMAX; + case nir_atomic_op_umax: return V3D_TMU_OP_WRITE_UMAX; + case nir_atomic_op_iand: return V3D_TMU_OP_WRITE_AND_READ_INC; + case nir_atomic_op_ior: return V3D_TMU_OP_WRITE_OR_READ_DEC; + case nir_atomic_op_ixor: return V3D_TMU_OP_WRITE_XOR_READ_NOT; + case nir_atomic_op_xchg: return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH; + case nir_atomic_op_cmpxchg: return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH; + default: unreachable("unknown atomic op"); + } +} + +static uint32_t v3d_general_tmu_op(nir_intrinsic_instr *instr) { switch (instr->intrinsic) { @@ -195,41 +217,21 @@ v3d_general_tmu_op(nir_intrinsic_instr *instr) case nir_intrinsic_load_uniform: case nir_intrinsic_load_shared: case nir_intrinsic_load_scratch: + case nir_intrinsic_load_global_2x32: case nir_intrinsic_store_ssbo: case nir_intrinsic_store_shared: case nir_intrinsic_store_scratch: + case nir_intrinsic_store_global_2x32: return V3D_TMU_OP_REGULAR; - case nir_intrinsic_ssbo_atomic_add: - return v3d_get_op_for_atomic_add(instr, 2); - case nir_intrinsic_shared_atomic_add: - return v3d_get_op_for_atomic_add(instr, 1); - case nir_intrinsic_ssbo_atomic_imin: - case nir_intrinsic_shared_atomic_imin: - return V3D_TMU_OP_WRITE_SMIN; - case nir_intrinsic_ssbo_atomic_umin: - case nir_intrinsic_shared_atomic_umin: - return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR; - case nir_intrinsic_ssbo_atomic_imax: - case nir_intrinsic_shared_atomic_imax: - return V3D_TMU_OP_WRITE_SMAX; - case nir_intrinsic_ssbo_atomic_umax: - case nir_intrinsic_shared_atomic_umax: - return V3D_TMU_OP_WRITE_UMAX; - case nir_intrinsic_ssbo_atomic_and: - case nir_intrinsic_shared_atomic_and: - return V3D_TMU_OP_WRITE_AND_READ_INC; - case nir_intrinsic_ssbo_atomic_or: - case nir_intrinsic_shared_atomic_or: - return V3D_TMU_OP_WRITE_OR_READ_DEC; - case nir_intrinsic_ssbo_atomic_xor: - case nir_intrinsic_shared_atomic_xor: - return V3D_TMU_OP_WRITE_XOR_READ_NOT; - case nir_intrinsic_ssbo_atomic_exchange: - case nir_intrinsic_shared_atomic_exchange: - return 
V3D_TMU_OP_WRITE_XCHG_READ_FLUSH; - case nir_intrinsic_ssbo_atomic_comp_swap: - case nir_intrinsic_shared_atomic_comp_swap: - return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH; + + case nir_intrinsic_ssbo_atomic: + case nir_intrinsic_ssbo_atomic_swap: + case nir_intrinsic_shared_atomic: + case nir_intrinsic_shared_atomic_swap: + case nir_intrinsic_global_atomic_2x32: + case nir_intrinsic_global_atomic_swap_2x32: + return v3d_general_tmu_op_for_atomic(instr); + default: unreachable("unknown intrinsic op"); } @@ -270,13 +272,13 @@ ntq_flush_tmu(struct v3d_compile *c) bool emitted_tmuwt = false; for (int i = 0; i < c->tmu.flush_count; i++) { if (c->tmu.flush[i].component_mask > 0) { - nir_dest *dest = c->tmu.flush[i].dest; - assert(dest); + nir_def *def = c->tmu.flush[i].def; + assert(def); for (int j = 0; j < 4; j++) { if (c->tmu.flush[i].component_mask & (1 << j)) { - ntq_store_dest(c, dest, j, - vir_MOV(c, vir_LDTMU(c))); + ntq_store_def(c, def, j, + vir_MOV(c, vir_LDTMU(c))); } } } else if (!emitted_tmuwt) { @@ -292,12 +294,12 @@ ntq_flush_tmu(struct v3d_compile *c) /** * Queues a pending thread switch + LDTMU/TMUWT for a TMU operation. The caller - * is reponsible for ensuring that doing this doesn't overflow the TMU fifos, + * is responsible for ensuring that doing this doesn't overflow the TMU fifos, * and more specifically, the output fifo, since that can't stall. */ void ntq_add_pending_tmu_flush(struct v3d_compile *c, - nir_dest *dest, + nir_def *def, uint32_t component_mask) { const uint32_t num_components = util_bitcount(component_mask); @@ -305,13 +307,18 @@ ntq_add_pending_tmu_flush(struct v3d_compile *c, if (num_components > 0) { c->tmu.output_fifo_size += num_components; - if (!dest->is_ssa) - _mesa_set_add(c->tmu.outstanding_regs, dest->reg.reg); + + nir_intrinsic_instr *store = nir_store_reg_for_def(def); + if (store != NULL) { + nir_def *reg = store->src[1].ssa; + _mesa_set_add(c->tmu.outstanding_regs, reg); + } } - c->tmu.flush[c->tmu.flush_count].dest = dest; + c->tmu.flush[c->tmu.flush_count].def = def; c->tmu.flush[c->tmu.flush_count].component_mask = component_mask; c->tmu.flush_count++; + c->tmu.total_count++; if (c->disable_tmu_pipelining) ntq_flush_tmu(c); @@ -342,6 +349,7 @@ emit_tmu_general_store_writes(struct v3d_compile *c, uint32_t base_const_offset, uint32_t *writemask, uint32_t *const_offset, + uint32_t *type_size, uint32_t *tmu_writes) { struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD); @@ -371,7 +379,9 @@ emit_tmu_general_store_writes(struct v3d_compile *c, /* Update the offset for the TMU write based on the * the first component we are writing. 
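
(For reference, the offset update described in the comment above reduces to simple arithmetic; a hedged host-side C sketch, not part of the patch:)

#include <stdint.h>

/* Hypothetical standalone model of the writemask/offset arithmetic in
 * emit_tmu_general_store_writes(): the TMU write offset advances to the
 * first component actually written, scaled by the source type size. */
static uint32_t tmu_store_const_offset(uint32_t base_const_offset,
                                       uint32_t writemask,
                                       uint32_t bit_size)
{
        uint32_t type_size = bit_size / 8;
        uint32_t first_component = __builtin_ffs(writemask) - 1;
        return base_const_offset + first_component * type_size;
}

/* A 16-bit store with writemask 0xc writes components 2 and 3, so the
 * offset advances by 2 * 2 bytes: tmu_store_const_offset(0, 0xc, 16) == 4. */
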
*/ - *const_offset = base_const_offset + first_component * 4; + *type_size = nir_src_bit_size(instr->src[0]) / 8; + *const_offset = + base_const_offset + first_component * (*type_size); /* Clear these components from the writemask */ uint32_t written_mask = @@ -433,6 +443,7 @@ emit_tmu_general_address_write(struct v3d_compile *c, int offset_src, struct qreg base_offset, uint32_t const_offset, + uint32_t dest_components, uint32_t *tmu_writes) { if (mode == MODE_COUNT) { @@ -478,6 +489,8 @@ emit_tmu_general_address_write(struct v3d_compile *c, if (vir_in_nonuniform_control_flow(c)) vir_set_cond(tmu, V3D_QPU_COND_IFA); + + tmu->ldtmu_count = dest_components; } /** @@ -486,7 +499,7 @@ emit_tmu_general_address_write(struct v3d_compile *c, */ static void ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, - bool is_shared_or_scratch) + bool is_shared_or_scratch, bool is_global) { uint32_t tmu_op = v3d_general_tmu_op(instr); @@ -495,25 +508,32 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, * amount to add/sub, as that is implicit. */ bool atomic_add_replaced = - ((instr->intrinsic == nir_intrinsic_ssbo_atomic_add || - instr->intrinsic == nir_intrinsic_shared_atomic_add) && + (instr->intrinsic == nir_intrinsic_ssbo_atomic || + instr->intrinsic == nir_intrinsic_shared_atomic || + instr->intrinsic == nir_intrinsic_global_atomic_2x32) && + nir_intrinsic_atomic_op(instr) == nir_atomic_op_iadd && (tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC || - tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC)); + tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC); bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo || instr->intrinsic == nir_intrinsic_store_scratch || - instr->intrinsic == nir_intrinsic_store_shared); + instr->intrinsic == nir_intrinsic_store_shared || + instr->intrinsic == nir_intrinsic_store_global_2x32); bool is_load = (instr->intrinsic == nir_intrinsic_load_uniform || instr->intrinsic == nir_intrinsic_load_ubo || instr->intrinsic == nir_intrinsic_load_ssbo || instr->intrinsic == nir_intrinsic_load_scratch || - instr->intrinsic == nir_intrinsic_load_shared); + instr->intrinsic == nir_intrinsic_load_shared || + instr->intrinsic == nir_intrinsic_load_global_2x32); if (!is_load) c->tmu_dirty_rcl = true; - bool has_index = !is_shared_or_scratch; + if (is_global) + c->has_global_address = true; + + bool has_index = !is_shared_or_scratch && !is_global; int offset_src; if (instr->intrinsic == nir_intrinsic_load_uniform) { @@ -522,6 +542,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, instr->intrinsic == nir_intrinsic_load_ubo || instr->intrinsic == nir_intrinsic_load_scratch || instr->intrinsic == nir_intrinsic_load_shared || + instr->intrinsic == nir_intrinsic_load_global_2x32 || atomic_add_replaced) { offset_src = 0 + has_index; } else if (is_store) { @@ -542,13 +563,11 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, v3d_unit_data_create(0, const_offset)); const_offset = 0; } else if (instr->intrinsic == nir_intrinsic_load_ubo) { - uint32_t index = nir_src_as_uint(instr->src[0]); - /* On OpenGL QUNIFORM_UBO_ADDR takes a UBO index - * shifted up by 1 (0 is gallium's constant buffer 0). + /* QUNIFORM_UBO_ADDR takes a UBO index shifted up by 1 (0 + * is gallium's constant buffer 0 in GL and push constants + * in Vulkan)). 
*/ - if (c->key->environment == V3D_ENVIRONMENT_OPENGL) - index++; - + uint32_t index = nir_src_as_uint(instr->src[0]) + 1; base_offset = vir_uniform(c, QUNIFORM_UBO_ADDR, v3d_unit_data_create(index, const_offset)); @@ -565,10 +584,16 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, base_offset = c->cs_shared_offset; const_offset += nir_intrinsic_base(instr); } + } else if (is_global) { + /* Global load/store intrinsics use global addresses, so the + * offset is the target address and we don't need to add it + * to a base offset. + */ + base_offset = vir_uniform_ui(c, 0); } else { + uint32_t idx = is_store ? 1 : 0; base_offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET, - nir_src_as_uint(instr->src[is_store ? - 1 : 0])); + nir_src_comp_as_uint(instr->src[idx], 0)); } /* We are ready to emit TMU register writes now, but before we actually @@ -588,16 +613,21 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, for (enum emit_mode mode = MODE_COUNT; mode != MODE_LAST; mode++) { assert(mode == MODE_COUNT || tmu_writes > 0); + uint32_t type_size = 4; + if (is_store) { emit_tmu_general_store_writes(c, mode, instr, base_const_offset, &writemask, &const_offset, + &type_size, &tmu_writes); } else if (!is_load && !atomic_add_replaced) { - emit_tmu_general_atomic_writes(c, mode, instr, - tmu_op, has_index, - &tmu_writes); + emit_tmu_general_atomic_writes(c, mode, instr, + tmu_op, has_index, + &tmu_writes); + } else if (is_load) { + type_size = instr->def.bit_size / 8; } /* For atomics we use 32bit except for CMPXCHG, that we need @@ -618,17 +648,39 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, v3d_tmu_get_type_from_op(tmu_op, !is_load) == V3D_TMU_OP_TYPE_ATOMIC; + /* Only load per-quad if we can be certain that all + * lines in the quad are active. Notice that demoted + * invocations, unlike terminated ones, are still + * active: we want to skip memory writes for them but + * loads should still work. + */ uint32_t perquad = - is_load && !vir_in_nonuniform_control_flow(c) - ? GENERAL_TMU_LOOKUP_PER_QUAD - : GENERAL_TMU_LOOKUP_PER_PIXEL; + is_load && !vir_in_nonuniform_control_flow(c) && + c->s->info.stage == MESA_SHADER_FRAGMENT && + c->s->info.fs.needs_quad_helper_invocations && + !c->emitted_discard ?
+ GENERAL_TMU_LOOKUP_PER_QUAD : + GENERAL_TMU_LOOKUP_PER_PIXEL; config = 0xffffff00 | tmu_op << 3 | perquad; if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) { config |= GENERAL_TMU_LOOKUP_TYPE_VEC2; } else if (is_atomic || num_components == 1) { - config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI; + switch (type_size) { + case 4: + config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI; + break; + case 2: + config |= GENERAL_TMU_LOOKUP_TYPE_16BIT_UI; + break; + case 1: + config |= GENERAL_TMU_LOOKUP_TYPE_8BIT_UI; + break; + default: + unreachable("Unsupported bitsize"); + } } else { + assert(type_size == 4); config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 + num_components - 2; } @@ -637,7 +689,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, emit_tmu_general_address_write(c, mode, instr, config, dynamic_src, offset_src, base_offset, const_offset, - &tmu_writes); + dest_components, &tmu_writes); assert(tmu_writes > 0); if (mode == MODE_COUNT) { @@ -660,7 +712,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, */ const uint32_t component_mask = (1 << dest_components) - 1; - ntq_add_pending_tmu_flush(c, &instr->dest, + ntq_add_pending_tmu_flush(c, &instr->def, component_mask); } } @@ -673,7 +725,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, } static struct qreg * -ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def) +ntq_init_ssa_def(struct v3d_compile *c, nir_def *def) { struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, def->num_components); @@ -717,8 +769,8 @@ is_ldunif_signal(const struct v3d_qpu_sig *sig) * its destination to be the NIR reg's destination */ void -ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan, - struct qreg result) +ntq_store_def(struct v3d_compile *c, nir_def *def, int chan, + struct qreg result) { struct qinst *last_inst = NULL; if (!list_is_empty(&c->cur_block->instructions)) @@ -731,23 +783,25 @@ ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan, assert(result.file == QFILE_TEMP && last_inst && (last_inst == c->defs[result.index] || is_reused_uniform)); - if (dest->is_ssa) { - assert(chan < dest->ssa.num_components); + nir_intrinsic_instr *store = nir_store_reg_for_def(def); + if (store == NULL) { + assert(chan < def->num_components); struct qreg *qregs; struct hash_entry *entry = - _mesa_hash_table_search(c->def_ht, &dest->ssa); + _mesa_hash_table_search(c->def_ht, def); if (entry) qregs = entry->data; else - qregs = ntq_init_ssa_def(c, &dest->ssa); + qregs = ntq_init_ssa_def(c, def); qregs[chan] = result; } else { - nir_register *reg = dest->reg.reg; - assert(dest->reg.base_offset == 0); - assert(reg->num_array_elems == 0); + nir_def *reg = store->src[1].ssa; + ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg); + assert(nir_intrinsic_base(store) == 0); + assert(nir_intrinsic_num_array_elems(decl) == 0); struct hash_entry *entry = _mesa_hash_table_search(c->def_ht, reg); struct qreg *qregs = entry->data; @@ -802,7 +856,9 @@ struct qreg ntq_get_src(struct v3d_compile *c, nir_src src, int i) { struct hash_entry *entry; - if (src.is_ssa) { + + nir_intrinsic_instr *load = nir_load_reg_for_def(src.ssa); + if (load == NULL) { assert(i < src.ssa->num_components); entry = _mesa_hash_table_search(c->def_ht, src.ssa); @@ -811,10 +867,11 @@ ntq_get_src(struct v3d_compile *c, nir_src src, int i) entry = _mesa_hash_table_search(c->def_ht, src.ssa); } } else { - nir_register *reg = src.reg.reg; - assert(reg->num_array_elems == 0); - assert(src.reg.base_offset == 0); - 
assert(i < reg->num_components); + nir_def *reg = load->src[0].ssa; + ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg); + assert(nir_intrinsic_base(load) == 0); + assert(nir_intrinsic_num_array_elems(decl) == 0); + assert(i < nir_intrinsic_num_components(decl)); if (_mesa_set_search(c->tmu.outstanding_regs, reg)) ntq_flush_tmu(c); @@ -830,13 +887,8 @@ static struct qreg ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr, unsigned src) { - assert(util_is_power_of_two_or_zero(instr->dest.write_mask)); - unsigned chan = ffs(instr->dest.write_mask) - 1; struct qreg r = ntq_get_src(c, instr->src[src].src, - instr->src[src].swizzle[chan]); - - assert(!instr->src[src].abs); - assert(!instr->src[src].negate); + instr->src[src].swizzle[0]); return r; }; @@ -876,6 +928,7 @@ ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr) case GLSL_SAMPLER_DIM_3D: case GLSL_SAMPLER_DIM_CUBE: case GLSL_SAMPLER_DIM_BUF: + case GLSL_SAMPLER_DIM_EXTERNAL: /* Don't minify the array size. */ if (!(instr->is_array && i == dest_size - 1)) { size = ntq_minify(c, size, lod); @@ -890,7 +943,7 @@ ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr) unreachable("Bad sampler type"); } - ntq_store_dest(c, &instr->dest, i, size); + ntq_store_def(c, &instr->def, i, size); } } @@ -905,12 +958,12 @@ ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) */ switch (instr->op) { case nir_texop_query_levels: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit)); return; case nir_texop_texture_samples: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_TEXTURE_SAMPLES, unit)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_TEXTURE_SAMPLES, unit)); return; case nir_texop_txs: ntq_emit_txs(c, instr); @@ -919,10 +972,7 @@ ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) break; } - if (c->devinfo->ver >= 40) - v3d40_vir_emit_tex(c, instr); - else - v3d33_vir_emit_tex(c, instr); + v3d_vir_emit_tex(c, instr); } static struct qreg @@ -963,44 +1013,43 @@ emit_fragcoord_input(struct v3d_compile *c, int attr) static struct qreg emit_smooth_varying(struct v3d_compile *c, - struct qreg vary, struct qreg w, struct qreg r5) + struct qreg vary, struct qreg w, struct qreg c_reg) { - return vir_FADD(c, vir_FMUL(c, vary, w), r5); + return vir_FADD(c, vir_FMUL(c, vary, w), c_reg); } static struct qreg emit_noperspective_varying(struct v3d_compile *c, - struct qreg vary, struct qreg r5) + struct qreg vary, struct qreg c_reg) { - return vir_FADD(c, vir_MOV(c, vary), r5); + return vir_FADD(c, vir_MOV(c, vary), c_reg); } static struct qreg emit_flat_varying(struct v3d_compile *c, - struct qreg vary, struct qreg r5) + struct qreg vary, struct qreg c_reg) { vir_MOV_dest(c, c->undef, vary); - return vir_MOV(c, r5); + return vir_MOV(c, c_reg); } static struct qreg emit_fragment_varying(struct v3d_compile *c, nir_variable *var, int8_t input_idx, uint8_t swizzle, int array_index) { - struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3); - struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5); + struct qreg c_reg; /* C coefficient */ + + if (c->devinfo->has_accumulators) + c_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5); + else + c_reg = vir_reg(QFILE_REG, 0); struct qinst *ldvary = NULL; struct qreg vary; - if (c->devinfo->ver >= 41) { - ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef, - c->undef, c->undef); - ldvary->qpu.sig.ldvary = true; - vary = vir_emit_def(c, ldvary); - } 
else { - vir_NOP(c)->qpu.sig.ldvary = true; - vary = r3; - } + ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef, + c->undef, c->undef); + ldvary->qpu.sig.ldvary = true; + vary = vir_emit_def(c, ldvary); /* Store the input value before interpolation so we can implement * GLSL's interpolateAt functions if the shader uses them. @@ -1008,7 +1057,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, if (input_idx >= 0) { assert(var); c->interp[input_idx].vp = vary; - c->interp[input_idx].C = vir_MOV(c, r5); + c->interp[input_idx].C = vir_MOV(c, c_reg); c->interp[input_idx].mode = var->data.interpolation; } @@ -1018,7 +1067,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, */ if (!var) { assert(input_idx < 0); - return emit_smooth_varying(c, vary, c->payload_w, r5); + return emit_smooth_varying(c, vary, c->payload_w, c_reg); } int i = c->num_inputs++; @@ -1033,20 +1082,20 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, if (var->data.centroid) { BITSET_SET(c->centroid_flags, i); result = emit_smooth_varying(c, vary, - c->payload_w_centroid, r5); + c->payload_w_centroid, c_reg); } else { - result = emit_smooth_varying(c, vary, c->payload_w, r5); + result = emit_smooth_varying(c, vary, c->payload_w, c_reg); } break; case INTERP_MODE_NOPERSPECTIVE: BITSET_SET(c->noperspective_flags, i); - result = emit_noperspective_varying(c, vary, r5); + result = emit_noperspective_varying(c, vary, c_reg); break; case INTERP_MODE_FLAT: BITSET_SET(c->flat_shade_flags, i); - result = emit_flat_varying(c, vary, r5); + result = emit_flat_varying(c, vary, c_reg); break; default: @@ -1163,16 +1212,6 @@ ntq_emit_comparison(struct v3d_compile *c, vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC); break; - case nir_op_i2b32: - vir_set_pf(c, vir_MOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ); - cond_invert = true; - break; - - case nir_op_f2b32: - vir_set_pf(c, vir_FMOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ); - cond_invert = true; - break; - default: return false; } @@ -1188,7 +1227,7 @@ ntq_emit_comparison(struct v3d_compile *c, static struct nir_alu_instr * ntq_get_alu_parent(nir_src src) { - if (!src.is_ssa || src.ssa->parent_instr->type != nir_instr_type_alu) + if (src.ssa->parent_instr->type != nir_instr_type_alu) return NULL; nir_alu_instr *instr = nir_instr_as_alu(src.ssa->parent_instr); if (!instr) @@ -1199,7 +1238,7 @@ ntq_get_alu_parent(nir_src src) * src. */ for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { - if (!instr->src[i].src.is_ssa) + if (nir_load_reg_for_def(instr->src[i].src.ssa)) return NULL; } @@ -1242,12 +1281,78 @@ ntq_emit_cond_to_bool(struct v3d_compile *c, enum v3d_qpu_cond cond) return result; } +static struct qreg +ntq_emit_cond_to_int(struct v3d_compile *c, enum v3d_qpu_cond cond) +{ + struct qreg result = + vir_MOV(c, vir_SEL(c, cond, + vir_uniform_ui(c, 1), + vir_uniform_ui(c, 0))); + c->flags_temp = result.index; + c->flags_cond = cond; + return result; +} + +static struct qreg +f2f16_rtz(struct v3d_compile *c, struct qreg f32) +{ + /* The GPU doesn't provide a mechanism to modify the f32->f16 rounding + * method and seems to be using RTE by default, so we need to implement + * RTZ rounding in software. 
+ */ + struct qreg rf16 = vir_FMOV(c, f32); + vir_set_pack(c->defs[rf16.index], V3D_QPU_PACK_L); + + struct qreg rf32 = vir_FMOV(c, rf16); + vir_set_unpack(c->defs[rf32.index], 0, V3D_QPU_UNPACK_L); + + struct qreg f32_abs = vir_FMOV(c, f32); + vir_set_unpack(c->defs[f32_abs.index], 0, V3D_QPU_UNPACK_ABS); + + struct qreg rf32_abs = vir_FMOV(c, rf32); + vir_set_unpack(c->defs[rf32_abs.index], 0, V3D_QPU_UNPACK_ABS); + + vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), f32_abs, rf32_abs), + V3D_QPU_PF_PUSHN); + return vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA, + vir_SUB(c, rf16, vir_uniform_ui(c, 1)), rf16)); +} + +/** + * Takes the result value of a signed integer width conversion from a smaller + * type to a larger type and if needed, it applies sign extension to it. + */ +static struct qreg +sign_extend(struct v3d_compile *c, + struct qreg value, + uint32_t src_bit_size, + uint32_t dst_bit_size) +{ + assert(src_bit_size < dst_bit_size); + + struct qreg tmp = vir_MOV(c, value); + + /* Do we need to sign-extend? */ + uint32_t sign_mask = 1 << (src_bit_size - 1); + struct qinst *sign_check = + vir_AND_dest(c, vir_nop_reg(), + tmp, vir_uniform_ui(c, sign_mask)); + vir_set_pf(c, sign_check, V3D_QPU_PF_PUSHZ); + + /* If so, fill in leading sign bits */ + uint32_t extend_bits = ~(((1 << src_bit_size) - 1)) & + ((1ull << dst_bit_size) - 1); + struct qinst *extend_inst = + vir_OR_dest(c, tmp, tmp, + vir_uniform_ui(c, extend_bits)); + vir_set_cond(extend_inst, V3D_QPU_COND_IFNA); + + return tmp; +} + static void ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) { - /* This should always be lowered to ALU operations for V3D. */ - assert(!instr->dest.saturate); - /* Vectors are special in that they have non-scalarized writemasks, * and just take the first swizzle channel for each argument in order * into each writemask channel. @@ -1260,8 +1365,8 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) srcs[i] = ntq_get_src(c, instr->src[i].src, instr->src[i].swizzle[0]); for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) - ntq_store_dest(c, &instr->dest.dest, i, - vir_MOV(c, srcs[i])); + ntq_store_def(c, &instr->def, i, + vir_MOV(c, srcs[i])); return; } @@ -1327,6 +1432,94 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) result = vir_AND(c, src[0], vir_uniform_ui(c, 1)); break; + case nir_op_f2f16: + case nir_op_f2f16_rtne: + assert(nir_src_bit_size(instr->src[0].src) == 32); + result = vir_FMOV(c, src[0]); + vir_set_pack(c->defs[result.index], V3D_QPU_PACK_L); + break; + + case nir_op_f2f16_rtz: + assert(nir_src_bit_size(instr->src[0].src) == 32); + result = f2f16_rtz(c, src[0]); + break; + + case nir_op_f2f32: + assert(nir_src_bit_size(instr->src[0].src) == 16); + result = vir_FMOV(c, src[0]); + vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L); + break; + + case nir_op_i2i16: { + uint32_t bit_size = nir_src_bit_size(instr->src[0].src); + assert(bit_size == 32 || bit_size == 8); + if (bit_size == 32) { + /* We don't have integer pack/unpack methods for + * converting between 16-bit and 32-bit, so we implement + * the conversion manually by truncating the src. 
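
(The two helpers above reduce to simple scalar idioms. Below is a hedged host-side C model of both the RTZ-by-correction rounding and the flag-based sign extension; f32_to_f16_rte() and f16_to_f32() are hypothetical stand-ins for the QPU pack/unpack-L conversions, and this is a sketch of the idea, not driver code.)

#include <stdint.h>
#include <math.h>

extern uint16_t f32_to_f16_rte(float f); /* hypothetical: HW default RTE pack */
extern float f16_to_f32(uint16_t h);     /* hypothetical unpack */

static uint16_t f32_to_f16_rtz_model(float f)
{
        uint16_t h = f32_to_f16_rte(f);
        /* If RTE rounded away from zero, stepping the f16 bit pattern
         * down by one ULP moves the value back toward zero. */
        if (fabsf(f) < fabsf(f16_to_f32(h)))
                h -= 1;
        return h;
}

static uint32_t sign_extend_model(uint32_t value, uint32_t src_bits,
                                  uint32_t dst_bits)
{
        /* Mirrors sign_extend(): test the sign bit and, only when it is
         * set, OR in the missing leading bits. */
        uint32_t sign_mask = 1u << (src_bits - 1);
        uint32_t extend_bits = ~((1u << src_bits) - 1) &
                               (uint32_t)((1ull << dst_bits) - 1);
        return (value & sign_mask) ? (value | extend_bits) : value;
}
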
+ */ + result = vir_AND(c, src[0], vir_uniform_ui(c, 0xffff)); + } else { + struct qreg tmp = vir_AND(c, src[0], + vir_uniform_ui(c, 0xff)); + result = vir_MOV(c, sign_extend(c, tmp, bit_size, 16)); + } + break; + } + + case nir_op_u2u16: { + uint32_t bit_size = nir_src_bit_size(instr->src[0].src); + assert(bit_size == 32 || bit_size == 8); + + /* We don't have integer pack/unpack methods for converting + * between 16-bit and 32-bit, so we implement the conversion + * manually by truncating the src. For the 8-bit case, we + * want to make sure we don't copy garbage from any of the + * 24 MSB bits. + */ + if (bit_size == 32) + result = vir_AND(c, src[0], vir_uniform_ui(c, 0xffff)); + else + result = vir_AND(c, src[0], vir_uniform_ui(c, 0xff)); + break; + } + + case nir_op_i2i8: + case nir_op_u2u8: + assert(nir_src_bit_size(instr->src[0].src) == 32 || + nir_src_bit_size(instr->src[0].src) == 16); + /* We don't have integer pack/unpack methods for converting + * between 8-bit and 32-bit, so we implement the conversion + * manually by truncating the src. + */ + result = vir_AND(c, src[0], vir_uniform_ui(c, 0xff)); + break; + + case nir_op_u2u32: { + uint32_t bit_size = nir_src_bit_size(instr->src[0].src); + assert(bit_size == 16 || bit_size == 8); + + /* we don't have a native 8-bit/16-bit MOV so we copy all 32-bit + * from the src but we make sure to clear any garbage bits that + * may be present in the invalid src bits. + */ + uint32_t mask = (1 << bit_size) - 1; + result = vir_AND(c, src[0], vir_uniform_ui(c, mask)); + break; + } + + case nir_op_i2i32: { + uint32_t bit_size = nir_src_bit_size(instr->src[0].src); + assert(bit_size == 16 || bit_size == 8); + + uint32_t mask = (1 << bit_size) - 1; + struct qreg tmp = vir_AND(c, src[0], + vir_uniform_ui(c, mask)); + + result = vir_MOV(c, sign_extend(c, tmp, bit_size, 32)); + break; + } + case nir_op_iadd: result = vir_ADD(c, src[0], src[1]); break; @@ -1390,8 +1583,6 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) break; } - case nir_op_i2b32: - case nir_op_f2b32: case nir_op_feq32: case nir_op_fneu32: case nir_op_fge32: @@ -1485,13 +1676,35 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) case nir_op_uadd_carry: vir_set_pf(c, vir_ADD_dest(c, vir_nop_reg(), src[0], src[1]), V3D_QPU_PF_PUSHC); - result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA); + result = ntq_emit_cond_to_int(c, V3D_QPU_COND_IFA); + break; + + case nir_op_usub_borrow: + vir_set_pf(c, vir_SUB_dest(c, vir_nop_reg(), src[0], src[1]), + V3D_QPU_PF_PUSHC); + result = ntq_emit_cond_to_int(c, V3D_QPU_COND_IFA); break; case nir_op_pack_half_2x16_split: result = vir_VFPACK(c, src[0], src[1]); break; + case nir_op_pack_2x32_to_2x16_v3d: + result = vir_VPACK(c, src[0], src[1]); + break; + + case nir_op_pack_32_to_r11g11b10_v3d: + result = vir_V11FPACK(c, src[0], src[1]); + break; + + case nir_op_pack_uint_32_to_r10g10b10a2_v3d: + result = vir_V10PACK(c, src[0], src[1]); + break; + + case nir_op_pack_4x16_to_4x8_v3d: + result = vir_V8PACK(c, src[0], src[1]); + break; + case nir_op_unpack_half_2x16_split_x: result = vir_FMOV(c, src[0]); vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L); @@ -1502,26 +1715,29 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_H); break; - case nir_op_fquantize2f16: { - /* F32 -> F16 -> F32 conversion */ - struct qreg tmp = vir_FMOV(c, src[0]); - vir_set_pack(c->defs[tmp.index], V3D_QPU_PACK_L); - tmp = vir_FMOV(c, tmp); - vir_set_unpack(c->defs[tmp.index], 0, 
V3D_QPU_UNPACK_L); + case nir_op_pack_2x16_to_unorm_2x8_v3d: + result = vir_VFTOUNORM8(c, src[0]); + break; - /* Check for denorm */ - struct qreg abs_src = vir_FMOV(c, src[0]); - vir_set_unpack(c->defs[abs_src.index], 0, V3D_QPU_UNPACK_ABS); - struct qreg threshold = vir_uniform_f(c, ldexpf(1.0, -14)); - vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), abs_src, threshold), - V3D_QPU_PF_PUSHC); + case nir_op_pack_2x16_to_snorm_2x8_v3d: + result = vir_VFTOSNORM8(c, src[0]); + break; - /* Return +/-0 for denorms */ - struct qreg zero = - vir_AND(c, src[0], vir_uniform_ui(c, 0x80000000)); - result = vir_FMOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, tmp, zero)); + case nir_op_pack_2x16_to_unorm_2x10_v3d: + result = vir_VFTOUNORM10LO(c, src[0]); + break; + + case nir_op_pack_2x16_to_unorm_10_2_v3d: + result = vir_VFTOUNORM10HI(c, src[0]); + break; + + case nir_op_f2unorm_16_v3d: + result = vir_FTOUNORM16(c, src[0]); + break; + + case nir_op_f2snorm_16_v3d: + result = vir_FTOSNORM16(c, src[0]); break; - } default: fprintf(stderr, "unknown NIR ALU inst: "); @@ -1530,17 +1746,12 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) abort(); } - /* We have a scalar result, so the instruction should only have a - * single channel written to. - */ - assert(util_is_power_of_two_or_zero(instr->dest.write_mask)); - ntq_store_dest(c, &instr->dest.dest, - ffs(instr->dest.write_mask) - 1, result); + ntq_store_def(c, &instr->def, 0, result); } /* Each TLB read/write setup (a render target or depth buffer) takes an 8-bit * specifier. They come from a register that's preloaded with 0xffffffff - * (0xff gets you normal vec4 f16 RT0 writes), and when one is neaded the low + * (0xff gets you normal vec4 f16 RT0 writes), and when one is needed the low * 8 bits are shifted off the bottom and 0xff shifted in from the top. */ #define TLB_TYPE_F16_COLOR (3 << 6) @@ -1670,15 +1881,6 @@ vir_emit_tlb_color_write(struct v3d_compile *c, unsigned rt) static void emit_frag_end(struct v3d_compile *c) { - /* If the shader has no non-TLB side effects and doesn't write Z - * we can promote it to enabling early_fragment_tests even - * if the user didn't. - */ - if (c->output_position_index == -1 && - !(c->s->info.num_images || c->s->info.num_ssbos)) { - c->s->info.fs.early_fragment_tests = true; - } - if (c->output_sample_mask_index != -1) { vir_SETMSF_dest(c, vir_nop_reg(), vir_AND(c, @@ -1703,55 +1905,75 @@ emit_frag_end(struct v3d_compile *c) } struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU); - if (c->output_position_index != -1 && - !c->s->info.fs.early_fragment_tests) { - struct qinst *inst = vir_MOV_dest(c, tlbu_reg, - c->outputs[c->output_position_index]); - uint8_t tlb_specifier = TLB_TYPE_DEPTH; - if (c->devinfo->ver >= 42) { - tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL | - TLB_SAMPLE_MODE_PER_PIXEL); - } else - tlb_specifier |= TLB_DEPTH_TYPE_PER_PIXEL; + /* If the shader has no non-TLB side effects and doesn't write Z + * we can promote it to enabling early_fragment_tests even + * if the user didn't. 
+ */ + if (c->output_position_index == -1 && + !(c->s->info.num_images || c->s->info.num_ssbos) && + !c->s->info.fs.uses_discard && + !c->s->info.fs.uses_demote && + !c->fs_key->sample_alpha_to_coverage && + c->output_sample_mask_index == -1 && + has_any_tlb_color_write) { + c->s->info.fs.early_fragment_tests = true; + } - inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, - tlb_specifier | - 0xffffff00); + /* By default, Z buffer writes are implicit using the Z values produced + * from FEP (Z value produced from rasterization). When this is not + * desirable (shader writes Z explicitly, has discards, etc) we need + * to let the hardware know by setting c->writes_z to true, in which + * case we always need to write a Z value from the QPU, even if it is + * just the passthrough Z value produced from FEP. + * + * Also, from the V3D 4.2 spec: + * + * "If a shader performs a Z read the “Fragment shader does Z writes” + * bit in the shader record must be enabled to ensure deterministic + * results" + * + * So if c->reads_z is set we always need to write Z, even if it is + * a passthrough from the Z value produced from FEP. + */ + if (!c->s->info.fs.early_fragment_tests || c->reads_z) { c->writes_z = true; - } else if (c->s->info.fs.uses_discard || - !c->s->info.fs.early_fragment_tests || - c->fs_key->sample_alpha_to_coverage || - !has_any_tlb_color_write) { - /* Emit passthrough Z if it needed to be delayed until shader - * end due to potential discards. - * - * Since (single-threaded) fragment shaders always need a TLB - * write, emit passthrouh Z if we didn't have any color - * buffers and flag us as potentially discarding, so that we - * can use Z as the TLB write. - */ - c->s->info.fs.uses_discard = true; - - struct qinst *inst = vir_MOV_dest(c, tlbu_reg, - vir_nop_reg()); uint8_t tlb_specifier = TLB_TYPE_DEPTH; + struct qinst *inst; + + if (c->output_position_index != -1) { + /* Shader writes to gl_FragDepth, use that */ + inst = vir_MOV_dest(c, tlbu_reg, + c->outputs[c->output_position_index]); + + tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL | + TLB_SAMPLE_MODE_PER_PIXEL); + } else { + /* Shader doesn't write to gl_FragDepth, take Z from + * FEP. + */ + c->writes_z_from_fep = true; + inst = vir_MOV_dest(c, tlbu_reg, vir_nop_reg()); - if (c->devinfo->ver >= 42) { /* The spec says the PER_PIXEL flag is ignored for * invariant writes, but the simulator demands it. */ tlb_specifier |= (TLB_V42_DEPTH_TYPE_INVARIANT | TLB_SAMPLE_MODE_PER_PIXEL); - } else { - tlb_specifier |= TLB_DEPTH_TYPE_INVARIANT; + + /* Since (single-threaded) fragment shaders always need + * a TLB write, if we don't have any we emit a + * passthrough Z and flag us as potentially discarding, + * so that we can use Z as the required TLB write.
+ */ + if (!has_any_tlb_color_write) + c->s->info.fs.uses_discard = true; } - inst->uniform = vir_get_uniform_index(c, - QUNIFORM_CONSTANT, + inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, tlb_specifier | 0xffffff00); - c->writes_z = true; + inst->is_tlb_z_write = true; } /* XXX: Performance improvement: Merge Z write and color writes TLB @@ -1767,7 +1989,6 @@ vir_VPM_WRITE_indirect(struct v3d_compile *c, struct qreg vpm_index, bool uniform_vpm_index) { - assert(c->devinfo->ver >= 40); if (uniform_vpm_index) vir_STVPMV(c, vpm_index, val); else @@ -1777,13 +1998,8 @@ vir_VPM_WRITE_indirect(struct v3d_compile *c, static void vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index) { - if (c->devinfo->ver >= 40) { - vir_VPM_WRITE_indirect(c, val, - vir_uniform_ui(c, vpm_index), true); - } else { - /* XXX: v3d33_vir_vpm_write_setup(c); */ - vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val); - } + vir_VPM_WRITE_indirect(c, val, + vir_uniform_ui(c, vpm_index), true); } static void @@ -1791,7 +2007,7 @@ emit_vert_end(struct v3d_compile *c) { /* GFXH-1684: VPM writes need to be complete by the end of the shader. */ - if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42) + if (c->devinfo->ver == 42) vir_VPMWT(c); } @@ -1800,7 +2016,7 @@ emit_geom_end(struct v3d_compile *c) { /* GFXH-1684: VPM writes need to be complete by the end of the shader. */ - if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42) + if (c->devinfo->ver == 42) vir_VPMWT(c); } @@ -1812,8 +2028,11 @@ mem_vectorize_callback(unsigned align_mul, unsigned align_offset, nir_intrinsic_instr *high, void *data) { - /* Our backend is 32-bit only at present */ - if (bit_size != 32) + /* TMU general access only supports 32-bit vectors */ + if (bit_size > 32) + return false; + + if ((bit_size == 8 || bit_size == 16) && num_components > 1) return false; if (align_mul % 4 != 0 || align_offset % 4 != 0) @@ -1843,7 +2062,29 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s) do { progress = false; - NIR_PASS_V(s, nir_lower_vars_to_ssa); + NIR_PASS(progress, s, nir_split_array_vars, nir_var_function_temp); + NIR_PASS(progress, s, nir_shrink_vec_array_vars, nir_var_function_temp); + NIR_PASS(progress, s, nir_opt_deref); + + NIR_PASS(progress, s, nir_lower_vars_to_ssa); + if (!s->info.var_copies_lowered) { + /* Only run this pass if nir_lower_var_copies was not called + * yet. That would lower away any copy_deref instructions and we + * don't want to introduce any more. 
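
(The mem_vectorize_callback() change earlier in this chunk can be restated on its own; a hedged summary of only the checks visible in this hunk, as standalone C:)

#include <stdbool.h>

/* Accept rule as visible above: no accesses wider than 32 bits per
 * component, no multi-component 8/16-bit vectors, and 4-byte alignment
 * throughout. */
static bool v3d_vectorize_ok(unsigned align_mul, unsigned align_offset,
                             unsigned bit_size, unsigned num_components)
{
        if (bit_size > 32)
                return false;
        if ((bit_size == 8 || bit_size == 16) && num_components > 1)
                return false;
        return align_mul % 4 == 0 && align_offset % 4 == 0;
}

/* Two adjacent 16-bit loads may still merge when the vectorizer can
 * re-express them as one aligned 32-bit single-component load, which
 * passes the rule above; that is the case the later comment about
 * vectorization undoing scalarization refers to. */
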
+ */ + NIR_PASS(progress, s, nir_opt_find_array_copies); + } + + NIR_PASS(progress, s, nir_opt_copy_prop_vars); + NIR_PASS(progress, s, nir_opt_dead_write_vars); + NIR_PASS(progress, s, nir_opt_combine_stores, nir_var_all); + + NIR_PASS(progress, s, nir_remove_dead_variables, + (nir_variable_mode)(nir_var_function_temp | + nir_var_shader_temp | + nir_var_mem_shared), + NULL); + NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL); NIR_PASS(progress, s, nir_lower_phis_to_scalar, false); NIR_PASS(progress, s, nir_copy_prop); @@ -1851,10 +2092,39 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s) NIR_PASS(progress, s, nir_opt_dce); NIR_PASS(progress, s, nir_opt_dead_cf); NIR_PASS(progress, s, nir_opt_cse); - NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true); + NIR_PASS(progress, s, nir_opt_peephole_select, 0, false, false); + NIR_PASS(progress, s, nir_opt_peephole_select, 24, true, true); NIR_PASS(progress, s, nir_opt_algebraic); NIR_PASS(progress, s, nir_opt_constant_folding); + NIR_PASS(progress, s, nir_opt_intrinsics); + NIR_PASS(progress, s, nir_opt_idiv_const, 32); + NIR_PASS(progress, s, nir_lower_alu); + + if (nir_opt_loop(s)) { + progress = true; + NIR_PASS(progress, s, nir_copy_prop); + NIR_PASS(progress, s, nir_opt_dce); + } + + NIR_PASS(progress, s, nir_opt_conditional_discard); + + NIR_PASS(progress, s, nir_opt_remove_phis); + NIR_PASS(progress, s, nir_opt_if, false); + if (c && !c->disable_gcm) { + bool local_progress = false; + NIR_PASS(local_progress, s, nir_opt_gcm, false); + c->gcm_progress |= local_progress; + progress |= local_progress; + } + + /* Note that vectorization may undo the load/store scalarization + * pass we run for non 32-bit TMU general load/store by + * converting, for example, 2 consecutive 16-bit loads into a + * single 32-bit load. This is fine (and desirable) as long as + * the resulting 32-bit load meets 32-bit alignment requirements, + * which mem_vectorize_callback() should be enforcing. + */ nir_load_store_vectorize_options vectorize_opts = { .modes = nir_var_mem_ssbo | nir_var_mem_ubo | nir_var_mem_push_const | nir_var_mem_shared | @@ -1862,7 +2132,24 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s) .callback = mem_vectorize_callback, .robust_modes = 0, }; - NIR_PASS(progress, s, nir_opt_load_store_vectorize, &vectorize_opts); + bool vectorize_progress = false; + + + /* This requires that we have called + * nir_lower_vars_to_explicit_types / nir_lower_explicit_io + * first, which we may not have done yet if we call here too + * early during NIR pre-processing.
We can detect this because + * in that case we won't have a compile object + */ + if (c) { + NIR_PASS(vectorize_progress, s, nir_opt_load_store_vectorize, + &vectorize_opts); + if (vectorize_progress) { + NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL); + NIR_PASS(progress, s, nir_lower_pack); + progress = true; + } + } if (lower_flrp != 0) { bool lower_flrp_progress = false; @@ -1895,10 +2182,8 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s) nir_move_options sink_opts = nir_move_const_undef | nir_move_comparisons | nir_move_copies | - nir_move_load_ubo; + nir_move_load_ubo | nir_move_load_ssbo | nir_move_load_uniform; NIR_PASS(progress, s, nir_opt_sink, sink_opts); - - NIR_PASS(progress, s, nir_opt_move, nir_move_load_ubo); } static int @@ -1915,27 +2200,9 @@ ntq_emit_vpm_read(struct v3d_compile *c, uint32_t *remaining, uint32_t vpm_index) { - struct qreg vpm = vir_reg(QFILE_VPM, vpm_index); - - if (c->devinfo->ver >= 40 ) { - return vir_LDVPMV_IN(c, - vir_uniform_ui(c, - (*num_components_queued)++)); - } - - if (*num_components_queued != 0) { - (*num_components_queued)--; - return vir_MOV(c, vpm); - } - - uint32_t num_components = MIN2(*remaining, 32); - - v3d33_vir_vpm_read_setup(c, num_components); - - *num_components_queued = num_components - 1; - *remaining -= num_components; - - return vir_MOV(c, vpm); + return vir_LDVPMV_IN(c, + vir_uniform_ui(c, + (*num_components_queued)++)); } static void @@ -2005,31 +2272,8 @@ ntq_setup_vs_inputs(struct v3d_compile *c) } /* The actual loads will happen directly in nir_intrinsic_load_input - * on newer versions. */ - if (c->devinfo->ver >= 40) - return; - - for (int loc = 0; loc < ARRAY_SIZE(c->vattr_sizes); loc++) { - resize_qreg_array(c, &c->inputs, &c->inputs_array_size, - (loc + 1) * 4); - - for (int i = 0; i < c->vattr_sizes[loc]; i++) { - c->inputs[loc * 4 + i] = - ntq_emit_vpm_read(c, - &vpm_components_queued, - &num_components, - loc * 4 + i); - - } - } - - if (c->devinfo->ver >= 40) { - assert(vpm_components_queued == num_components); - } else { - assert(vpm_components_queued == 0); - assert(num_components == 0); - } + return; } static bool @@ -2058,14 +2302,14 @@ ntq_setup_gs_inputs(struct v3d_compile *c) */ assert(glsl_type_is_array(var->type)); const struct glsl_type *type = glsl_get_array_element(var->type); - unsigned array_len = MAX2(glsl_get_length(type), 1); + unsigned var_len = glsl_count_vec4_slots(type, false, false); unsigned loc = var->data.driver_location; resize_qreg_array(c, &c->inputs, &c->inputs_array_size, - (loc + array_len) * 4); + (loc + var_len) * 4); if (var->data.compact) { - for (unsigned j = 0; j < array_len; j++) { + for (unsigned j = 0; j < var_len; j++) { unsigned input_idx = c->num_inputs++; unsigned loc_frac = var->data.location_frac + j; unsigned loc = var->data.location + loc_frac / 4; @@ -2076,8 +2320,10 @@ ntq_setup_gs_inputs(struct v3d_compile *c) continue; } - for (unsigned j = 0; j < array_len; j++) { - unsigned num_elements = glsl_get_vector_elements(type); + for (unsigned j = 0; j < var_len; j++) { + unsigned num_elements = + glsl_type_is_struct(glsl_without_array(type)) ? 
+ 4 : glsl_get_vector_elements(type); for (unsigned k = 0; k < num_elements; k++) { unsigned chan = var->data.location_frac + k; unsigned input_idx = c->num_inputs++; @@ -2124,7 +2370,7 @@ ntq_setup_fs_inputs(struct v3d_compile *c) } else if (var->data.compact) { for (int j = 0; j < var_len; j++) emit_compact_fragment_input(c, loc, var, j); - } else if (glsl_type_is_struct(var->type)) { + } else if (glsl_type_is_struct(glsl_without_array(var->type))) { for (int j = 0; j < var_len; j++) { emit_fragment_input(c, loc, var, j, 4); } @@ -2143,12 +2389,9 @@ ntq_setup_outputs(struct v3d_compile *c) return; nir_foreach_shader_out_variable(var, c->s) { - unsigned array_len = MAX2(glsl_get_length(var->type), 1); + assert(glsl_type_is_vector_or_scalar(var->type)); unsigned loc = var->data.driver_location * 4; - assert(array_len == 1); - (void)array_len; - for (int i = 0; i < 4 - var->data.location_frac; i++) { add_output(c, loc + var->data.location_frac + i, var->data.location, @@ -2157,15 +2400,17 @@ ntq_setup_outputs(struct v3d_compile *c) switch (var->data.location) { case FRAG_RESULT_COLOR: - c->output_color_var[0] = var; - c->output_color_var[1] = var; - c->output_color_var[2] = var; - c->output_color_var[3] = var; + for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) + c->output_color_var[i] = var; break; case FRAG_RESULT_DATA0: case FRAG_RESULT_DATA1: case FRAG_RESULT_DATA2: case FRAG_RESULT_DATA3: + case FRAG_RESULT_DATA4: + case FRAG_RESULT_DATA5: + case FRAG_RESULT_DATA6: + case FRAG_RESULT_DATA7: c->output_color_var[var->data.location - FRAG_RESULT_DATA0] = var; break; @@ -2185,17 +2430,19 @@ ntq_setup_outputs(struct v3d_compile *c) * Each nir_register gets a struct qreg per 32-bit component being stored. */ static void -ntq_setup_registers(struct v3d_compile *c, struct exec_list *list) +ntq_setup_registers(struct v3d_compile *c, nir_function_impl *impl) { - foreach_list_typed(nir_register, nir_reg, node, list) { - unsigned array_len = MAX2(nir_reg->num_array_elems, 1); + nir_foreach_reg_decl(decl, impl) { + unsigned num_components = nir_intrinsic_num_components(decl); + unsigned array_len = nir_intrinsic_num_array_elems(decl); + array_len = MAX2(array_len, 1); struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, - array_len * - nir_reg->num_components); + array_len * num_components); + nir_def *nir_reg = &decl->def; _mesa_hash_table_insert(c->def_ht, nir_reg, qregs); - for (int i = 0; i < array_len * nir_reg->num_components; i++) + for (int i = 0; i < array_len * num_components; i++) qregs[i] = vir_get_temp(c); } } @@ -2222,23 +2469,23 @@ ntq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr) assert(nir_src_as_uint(instr->src[1]) == 0); - ntq_store_dest(c, &instr->dest, 0, + ntq_store_def(c, &instr->def, 0, vir_uniform(c, QUNIFORM_IMAGE_WIDTH, image_index)); if (instr->num_components > 1) { - ntq_store_dest(c, &instr->dest, 1, - vir_uniform(c, - instr->num_components == 2 && is_array ? - QUNIFORM_IMAGE_ARRAY_SIZE : - QUNIFORM_IMAGE_HEIGHT, - image_index)); + ntq_store_def(c, &instr->def, 1, + vir_uniform(c, + instr->num_components == 2 && is_array ? + QUNIFORM_IMAGE_ARRAY_SIZE : + QUNIFORM_IMAGE_HEIGHT, + image_index)); } if (instr->num_components > 2) { - ntq_store_dest(c, &instr->dest, 2, - vir_uniform(c, - is_array ? - QUNIFORM_IMAGE_ARRAY_SIZE : - QUNIFORM_IMAGE_DEPTH, - image_index)); + ntq_store_def(c, &instr->def, 2, + vir_uniform(c, + is_array ? 
+ QUNIFORM_IMAGE_ARRAY_SIZE : + QUNIFORM_IMAGE_DEPTH, + image_index)); } } @@ -2263,16 +2510,14 @@ vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr) * * To fix that, we make sure we always emit a thread switch before the * first tlb color read. If that happens to be the last thread switch - * we emit, then everything is fine, but otherwsie, if any code after + * we emit, then everything is fine, but otherwise, if any code after * this point needs to emit additional thread switches, then we will * switch the strategy to locking the scoreboard on the first thread * switch instead -- see vir_emit_thrsw(). */ if (!c->emitted_tlb_load) { - if (!c->last_thrsw_at_top_level) { - assert(c->devinfo->ver >= 41); + if (!c->last_thrsw_at_top_level) vir_emit_thrsw(c); - } c->emitted_tlb_load = true; } @@ -2371,27 +2616,96 @@ vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr) } assert(color_reads_for_sample[component].file != QFILE_NULL); - ntq_store_dest(c, &instr->dest, 0, - vir_MOV(c, color_reads_for_sample[component])); + ntq_store_def(c, &instr->def, 0, + vir_MOV(c, color_reads_for_sample[component])); +} + +static bool +ntq_emit_load_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr); + +static bool +try_emit_uniform(struct v3d_compile *c, + int offset, + int num_components, + nir_def *def, + enum quniform_contents contents) +{ + /* Even though ldunif is strictly 32-bit we can still use it + * to load scalar 8-bit/16-bit uniforms so long as their offset + * is 32-bit aligned. In this case, ldunif would still load + * 32-bit into the destination with the 8-bit/16-bit uniform + * data in the LSB and garbage in the MSB, but that is fine + * because we should only be accessing the valid bits of the + * destination. + * + * FIXME: if in the future we improve our register allocator to + * pack 2 16-bit variables in the MSB and LSB of the same + * register then this optimization would not be valid as is, + * since the load clobbers the MSB. + */ + if (offset % 4 != 0) + return false; + + /* We need dwords */ + offset = offset / 4; + + for (int i = 0; i < num_components; i++) { + ntq_store_def(c, def, i, vir_uniform(c, contents, offset + i)); + } + + return true; } static void ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr) { + /* We scalarize general TMU access for anything that is not 32-bit. 
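
(The gating in try_emit_uniform() above comes down to dword alignment; a small hedged C model, not backend code:)

#include <stdbool.h>
#include <stdint.h>

/* ldunif can serve a uniform load only when the byte offset is 32-bit
 * aligned; the uniform stream is then indexed in dwords. */
static bool uniform_byte_to_dword_index(uint32_t byte_offset,
                                        uint32_t *dword_index)
{
        if (byte_offset % 4 != 0)
                return false; /* caller falls back to a general TMU load */
        *dword_index = byte_offset / 4;
        return true;
}

/* A scalar 16-bit uniform at byte offset 8 can use ldunif (dword 2, data
 * in the LSBs); at byte offset 10 it must take the TMU path instead. */
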
*/ + assert(instr->def.bit_size == 32 || + instr->num_components == 1); + + /* Try to emit ldunif if possible, otherwise fallback to general TMU */ if (nir_src_is_const(instr->src[0])) { int offset = (nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0])); - assert(offset % 4 == 0); - /* We need dwords */ - offset = offset / 4; - for (int i = 0; i < instr->num_components; i++) { - ntq_store_dest(c, &instr->dest, i, - vir_uniform(c, QUNIFORM_UNIFORM, - offset + i)); + + if (try_emit_uniform(c, offset, instr->num_components, + &instr->def, QUNIFORM_UNIFORM)) { + return; + } + } + + if (!ntq_emit_load_unifa(c, instr)) { + ntq_emit_tmu_general(c, instr, false, false); + c->has_general_tmu_load = true; + } +} + +static bool +ntq_emit_inline_ubo_load(struct v3d_compile *c, nir_intrinsic_instr *instr) +{ + if (c->compiler->max_inline_uniform_buffers <= 0) + return false; + + /* Regular UBOs start after inline UBOs */ + uint32_t index = nir_src_as_uint(instr->src[0]); + if (index >= c->compiler->max_inline_uniform_buffers) + return false; + + /* We scalarize general TMU access for anything that is not 32-bit */ + assert(instr->def.bit_size == 32 || + instr->num_components == 1); + + if (nir_src_is_const(instr->src[1])) { + int offset = nir_src_as_uint(instr->src[1]); + if (try_emit_uniform(c, offset, instr->num_components, + &instr->def, + QUNIFORM_INLINE_UBO_0 + index)) { + return true; } - } else { - ntq_emit_tmu_general(c, instr, false); } + + /* Fallback to regular UBO load */ + return false; } static void @@ -2411,7 +2725,7 @@ ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr) unsigned offset = nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0]); - if (c->s->info.stage != MESA_SHADER_FRAGMENT && c->devinfo->ver >= 40) { + if (c->s->info.stage != MESA_SHADER_FRAGMENT) { /* Emit the LDVPM directly now, rather than at the top * of the shader like we did for V3D 3.x (which needs * vpmsetup when not just taking the next offset). @@ -2433,19 +2747,38 @@ ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr) SYSTEM_VALUE_VERTEX_ID)) { index++; } - for (int i = 0; i < offset; i++) - index += c->vattr_sizes[i]; + + for (int i = 0; i < offset; i++) { + /* GFXH-1602: if any builtins (vid, iid, etc) are read then + * attribute 0 must be active (size > 0). When we hit this, + * the driver is expected to program attribute 0 to have a + * size of 1, so here we need to add that. 
+ */ + if (i == 0 && c->vs_key->is_coord && + c->vattr_sizes[i] == 0 && index > 0) { + index++; + } else { + index += c->vattr_sizes[i]; + } + } + index += nir_intrinsic_component(instr); for (int i = 0; i < instr->num_components; i++) { struct qreg vpm_offset = vir_uniform_ui(c, index++); - ntq_store_dest(c, &instr->dest, i, - vir_LDVPMV_IN(c, vpm_offset)); + ntq_store_def(c, &instr->def, i, + vir_LDVPMV_IN(c, vpm_offset)); } } else { for (int i = 0; i < instr->num_components; i++) { int comp = nir_intrinsic_component(instr) + i; - ntq_store_dest(c, &instr->dest, i, - vir_MOV(c, c->inputs[offset * 4 + comp])); + struct qreg input = c->inputs[offset * 4 + comp]; + ntq_store_def(c, &instr->def, i, vir_MOV(c, input)); + + if (c->s->info.stage == MESA_SHADER_FRAGMENT && + input.file == c->payload_z.file && + input.index == c->payload_z.index) { + c->reads_z = true; + } } } } @@ -2610,18 +2943,18 @@ ntq_get_barycentric_centroid(struct v3d_compile *c, /* sN = TRUE if sample N enabled in sample mask, FALSE otherwise */ struct qreg F = vir_uniform_ui(c, 0); struct qreg T = vir_uniform_ui(c, ~0); - struct qreg s0 = vir_XOR(c, vir_AND(c, sample_mask, i1), i1); + struct qreg s0 = vir_AND(c, sample_mask, i1); vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ); - s0 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); - struct qreg s1 = vir_XOR(c, vir_AND(c, sample_mask, i2), i2); + s0 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F); + struct qreg s1 = vir_AND(c, sample_mask, i2); vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ); - s1 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); - struct qreg s2 = vir_XOR(c, vir_AND(c, sample_mask, i4), i4); + s1 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F); + struct qreg s2 = vir_AND(c, sample_mask, i4); vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ); - s2 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); - struct qreg s3 = vir_XOR(c, vir_AND(c, sample_mask, i8), i8); + s2 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F); + struct qreg s3 = vir_AND(c, sample_mask, i8); vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s3), V3D_QPU_PF_PUSHZ); - s3 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); + s3 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F); /* sample_idx = s0 ? 0 : s2 ? 2 : s1 ? 1 : 3 */ struct qreg sample_idx = i3; @@ -2708,28 +3041,142 @@ emit_ldunifa(struct v3d_compile *c, struct qreg *result) c->current_unifa_offset += 4; } -static void -ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr) +/* Checks if the value of a nir src is derived from a nir register */ +static bool +nir_src_derived_from_reg(nir_src src) +{ + nir_def *def = src.ssa; + if (nir_load_reg_for_def(def)) + return true; + + nir_instr *parent = def->parent_instr; + switch (parent->type) { + case nir_instr_type_alu: { + nir_alu_instr *alu = nir_instr_as_alu(parent); + int num_srcs = nir_op_infos[alu->op].num_inputs; + for (int i = 0; i < num_srcs; i++) { + if (nir_src_derived_from_reg(alu->src[i].src)) + return true; + } + return false; + } + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent); + int num_srcs = nir_intrinsic_infos[intr->intrinsic].num_srcs; + for (int i = 0; i < num_srcs; i++) { + if (nir_src_derived_from_reg(intr->src[i])) + return true; + } + return false; + } + case nir_instr_type_load_const: + case nir_instr_type_undef: + return false; + default: + /* By default we assume it may come from a register, the above + * cases should be able to handle the majority of situations + * though. 
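
(The sample-mask rework in ntq_get_barycentric_centroid() earlier in this chunk picks the first lit sample in the fixed order 0, 2, 1, 3; a hedged scalar C restatement of that selection:)

#include <stdint.h>

/* A sample is "on" when its mask bit is set (AND + PUSHZ + IFNA in the
 * VIR version); the centroid falls back to sample 3 when none is lit. */
static int pick_centroid_sample(uint32_t sample_mask)
{
        if (sample_mask & 1)
                return 0;
        if (sample_mask & 4)
                return 2;
        if (sample_mask & 2)
                return 1;
        return 3;
}
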
+ */ + return true; + }; } + +static bool +ntq_emit_load_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr) { + assert(instr->intrinsic == nir_intrinsic_load_ubo || + instr->intrinsic == nir_intrinsic_load_ssbo || + instr->intrinsic == nir_intrinsic_load_uniform); + + bool is_uniform = instr->intrinsic == nir_intrinsic_load_uniform; + bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo; + bool is_ssbo = instr->intrinsic == nir_intrinsic_load_ssbo; + /* Every ldunifa auto-increments the unifa address by 4 bytes, so our * current unifa offset is 4 bytes ahead of the offset of the last load. */ static const int32_t max_unifa_skip_dist = MAX_UNIFA_SKIP_DISTANCE - 4; - bool dynamic_src = !nir_src_is_const(instr->src[1]); - uint32_t const_offset = - dynamic_src ? 0 : nir_src_as_uint(instr->src[1]); + /* We can only use unifa if the offset is uniform */ + nir_src offset = is_uniform ? instr->src[0] : instr->src[1]; + if (nir_src_is_divergent(offset)) + return false; - /* On OpenGL QUNIFORM_UBO_ADDR takes a UBO index - * shifted up by 1 (0 is gallium's constant buffer 0). + /* Emitting loads from unifa may not be safe under non-uniform control + * flow. It seems the address that is used to write to the unifa + * register is taken from the first lane and if that lane is disabled + * by control flow then the value we read may be bogus and lead to + * invalid memory accesses on follow-up ldunifa instructions. However, + * ntq_store_def only emits conditional writes for nir registers, so as + * long as we can be certain that the offset isn't derived from a + * load_reg we should be fine. + * + * The following CTS test can be used to trigger the problem, which + * causes GMP violations in the sim without this check: + * dEQP-VK.subgroups.ballot_broadcast.graphics.subgroupbroadcastfirst_int */ - uint32_t index = nir_src_as_uint(instr->src[0]); - if (c->key->environment == V3D_ENVIRONMENT_OPENGL) + if (vir_in_nonuniform_control_flow(c) && + nir_src_derived_from_reg(offset)) { + return false; + } + + /* We can only use unifa with SSBOs if they are read-only. Otherwise + * ldunifa won't see the shader writes to that address (possibly + * because ldunifa doesn't read from the L2T cache). + */ + if (is_ssbo && !(nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE)) + return false; + + /* Just as with SSBOs, we can't use ldunifa to read indirect uniforms + * that may have been written to scratch using the TMU. + */ + bool dynamic_src = !nir_src_is_const(offset); + if (is_uniform && dynamic_src && c->s->scratch_size > 0) + return false; + + uint32_t const_offset = dynamic_src ? 0 : nir_src_as_uint(offset); + if (is_uniform) + const_offset += nir_intrinsic_base(instr); + + /* ldunifa is a 32-bit load instruction so we can only use it with + * 32-bit aligned addresses. We always produce 32-bit aligned addresses + * except for types smaller than 32-bit, so in these cases we can only + * use ldunifa if we can verify alignment, which we can only do for + * loads with a constant offset. + */ + uint32_t bit_size = instr->def.bit_size; + uint32_t value_skips = 0; + if (bit_size < 32) { + if (dynamic_src) { + return false; + } else if (const_offset % 4 != 0) { + /* If we are loading from an unaligned offset, fix + * alignment and skip over unused elements in result.
+ */ + value_skips = (const_offset % 4) / (bit_size / 8); + const_offset &= ~0x3; + } + } + + assert((bit_size == 32 && value_skips == 0) || + (bit_size == 16 && value_skips <= 1) || + (bit_size == 8 && value_skips <= 3)); + + /* Both Vulkan and OpenGL reserve index 0 for uniforms / push + * constants. + */ + uint32_t index = is_uniform ? 0 : nir_src_as_uint(instr->src[0]); + + /* QUNIFORM_UBO_ADDR takes a UBO index shifted up by 1 since we use + * index 0 for Gallium's constant buffer (GL) or push constants + * (Vulkan). + */ + if (is_ubo) index++; /* We can only keep track of the last unifa address we used with - * constant offset loads. If the new load targets the same UBO and + * constant offset loads. If the new load targets the same buffer and * is close enough to the previous load, we can skip the unifa register * write by emitting dummy ldunifa instructions to update the unifa * address. @@ -2739,6 +3186,7 @@ ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr) if (dynamic_src) { c->current_unifa_block = NULL; } else if (c->cur_block == c->current_unifa_block && + c->current_unifa_is_ubo == !is_ssbo && c->current_unifa_index == index && c->current_unifa_offset <= const_offset && c->current_unifa_offset + max_unifa_skip_dist >= const_offset) { @@ -2746,32 +3194,98 @@ ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr) ldunifa_skips = (const_offset - c->current_unifa_offset) / 4; } else { c->current_unifa_block = c->cur_block; + c->current_unifa_is_ubo = !is_ssbo; c->current_unifa_index = index; c->current_unifa_offset = const_offset; } if (!skip_unifa) { - struct qreg base_offset = + struct qreg base_offset = !is_ssbo ? vir_uniform(c, QUNIFORM_UBO_ADDR, - v3d_unit_data_create(index, const_offset)); + v3d_unit_data_create(index, const_offset)) : + vir_uniform(c, QUNIFORM_SSBO_OFFSET, index); struct qreg unifa = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA); if (!dynamic_src) { - vir_MOV_dest(c, unifa, base_offset); + if (!is_ssbo) { + /* Avoid the extra MOV to UNIFA by making + * ldunif load directly into it. We can't + * do this if we have not actually emitted + * ldunif and are instead reusing a previous + * one. + */ + struct qinst *inst = + (struct qinst *)c->cur_block->instructions.prev; + if (inst == c->defs[base_offset.index]) { + inst->dst = unifa; + c->defs[base_offset.index] = NULL; + } else { + vir_MOV_dest(c, unifa, base_offset); + } + } else { + vir_ADD_dest(c, unifa, base_offset, + vir_uniform_ui(c, const_offset)); + } } else { vir_ADD_dest(c, unifa, base_offset, - ntq_get_src(c, instr->src[1], 0)); + ntq_get_src(c, offset, 0)); } } else { for (int i = 0; i < ldunifa_skips; i++) emit_ldunifa(c, NULL); } - for (uint32_t i = 0; i < nir_intrinsic_dest_components(instr); i++) { + uint32_t num_components = nir_intrinsic_dest_components(instr); + for (uint32_t i = 0; i < num_components; ) { struct qreg data; emit_ldunifa(c, &data); - ntq_store_dest(c, &instr->dest, i, vir_MOV(c, data)); + + if (bit_size == 32) { + assert(value_skips == 0); + ntq_store_def(c, &instr->def, i, vir_MOV(c, data)); + i++; + } else { + assert((bit_size == 16 && value_skips <= 1) || + (bit_size == 8 && value_skips <= 3)); + + /* If we have any values to skip, shift to the first + * valid value in the ldunifa result. + */ + if (value_skips > 0) { + data = vir_SHR(c, data, + vir_uniform_ui(c, bit_size * + value_skips)); + } + + /* Check how many valid components we have discounting + * read components to skip. 
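[Editor's note: a scalar model of the mask-and-shift loop that follows, assuming 8-bit components and one skipped leading element; the values are hypothetical, not driver code:]

    uint32_t data = 0xddccbbaa; /* assumed 32-bit ldunifa result */
    uint32_t bit_size = 8;
    uint32_t value_skips = 1;
    data >>= bit_size * value_skips; /* skip the unaligned leading byte */
    uint32_t valid_count = (32 / bit_size) - value_skips; /* 3 components */
    for (uint32_t i = 0; i < valid_count; i++) {
            uint32_t mask = (1u << bit_size) - 1;
            uint32_t component = data & mask; /* 0xbb, then 0xcc, then 0xdd */
            (void)component; /* the patch stores this via ntq_store_def */
            data >>= bit_size; /* shift to the next component */
    }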
+ */
+ uint32_t valid_count = (32 / bit_size) - value_skips;
+ assert((bit_size == 16 && valid_count <= 2) ||
+ (bit_size == 8 && valid_count <= 4));
+ assert(valid_count > 0);
+
+ /* Process the valid components */
+ do {
+ struct qreg tmp;
+ uint32_t mask = (1 << bit_size) - 1;
+ tmp = vir_AND(c, vir_MOV(c, data),
+ vir_uniform_ui(c, mask));
+ ntq_store_def(c, &instr->def, i,
+ vir_MOV(c, tmp));
+ i++;
+ valid_count--;
+
+ /* Shift to next component */
+ if (i < num_components && valid_count > 0) {
+ data = vir_SHR(c, data,
+ vir_uniform_ui(c, bit_size));
+ }
+ } while (i < num_components && valid_count > 0);
+ }
 }
+
+ return true;
 }

 static inline struct qreg
@@ -2781,187 +3295,273 @@ emit_load_local_invocation_index(struct v3d_compile *c)
 vir_uniform_ui(c, 32 - c->local_invocation_index_bits));
 }

-/* Various subgroup operations rely on the A flags, so this helper ensures that
- * A flags represents currently active lanes in the subgroup.
+/* For the purposes of reduction operations (ballot, alleq, allfeq, bcastf) in
+ * fragment shaders a lane is considered active if any sample flags are set
+ * for *any* lane in the same quad; however, we still need to ensure that
+ * terminated lanes (OpTerminate) are not included. Further, we also need to
+ * disable lanes that may be disabled because of non-uniform control
+ * flow.
 */
-static void
-set_a_flags_for_subgroup(struct v3d_compile *c)
+static enum v3d_qpu_cond
+setup_subgroup_control_flow_condition(struct v3d_compile *c)
 {
- /* MSF returns 0 for disabled lanes in compute shaders so
- * PUSHZ will set A=1 for disabled lanes. We want the inverse
- * of this but we don't have any means to negate the A flags
- * directly, but we can do it by repeating the same operation
- * with NORZ (A = ~A & ~Z).
+ assert(c->s->info.stage == MESA_SHADER_FRAGMENT ||
+ c->s->info.stage == MESA_SHADER_COMPUTE);
+
+ enum v3d_qpu_cond cond = V3D_QPU_COND_NONE;
+
+ /* We need to make sure that terminated lanes in fragment shaders are
+ * not included. We can identify these lanes by comparing the initial
+ * sample mask with the current. This fixes:
+ * dEQP-VK.spirv_assembly.instruction.terminate_invocation.terminate.subgroup_*
 */
- assert(c->s->info.stage == MESA_SHADER_COMPUTE);
- vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ);
- vir_set_uf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_UF_NORZ);
+ if (c->s->info.stage == MESA_SHADER_FRAGMENT && c->emitted_discard) {
+ vir_set_pf(c, vir_AND_dest(c, vir_nop_reg(), c->start_msf,
+ vir_NOT(c, vir_XOR(c, c->start_msf,
+ vir_MSF(c)))),
+ V3D_QPU_PF_PUSHZ);
+ cond = V3D_QPU_COND_IFNA;
+ }

- /* If we are under non-uniform control flow we also need to
- * AND the A flags with the current execute mask.
+ /* If we are in non-uniform control-flow update the condition to
+ * also limit lanes to those in the current execution mask.
 */
 if (vir_in_nonuniform_control_flow(c)) {
- const uint32_t bidx = c->cur_block->index;
- vir_set_uf(c, vir_XOR_dest(c, vir_nop_reg(),
- c->execute,
- vir_uniform_ui(c, bidx)),
- V3D_QPU_UF_ANDZ);
+ if (cond == V3D_QPU_COND_IFNA) {
+ vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_UF_NORNZ);
+ } else {
+ assert(cond == V3D_QPU_COND_NONE);
+ vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_PF_PUSHZ);
+ }
+ cond = V3D_QPU_COND_IFA;
 }
+
+ return cond;
+}
+
+static void
+emit_compute_barrier(struct v3d_compile *c)
+{
+ /* Ensure we flag the use of the control barrier. NIR's
+ * gather info pass usually takes care of this, but that
+ * requires that we call that pass after any other pass
+ * may emit a control barrier, so this is safer.
+ */
+ c->s->info.uses_control_barrier = true;
+
+ /* Emit a TSY op to get all invocations in the workgroup
+ * (actually supergroup) to block until the last
+ * invocation reaches the TSY op.
+ */
+ vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_SYNCB));
+}
+
+static void
+emit_barrier(struct v3d_compile *c)
+{
+ struct qreg eidx = vir_EIDX(c);
+
+ /* The config for the TSY op should be set up like this:
+ * - Lane 0: Quorum
+ * - Lane 2: TSO id
+ * - Lane 3: TSY opcode
+ */
+
+ /* Lane 0: we want to synchronize across one subgroup. Here we write to
+ * all lanes unconditionally and will overwrite other lanes below.
+ */
+ struct qreg tsy_conf = vir_uniform_ui(c, 1);
+
+ /* Lane 2: TSO id. We choose a general purpose TSO (id=0..63) using the
+ * current QPU index and thread index to ensure we get a unique one for
+ * this group of invocations in this core.
+ */
+ struct qreg tso_id =
+ vir_AND(c, vir_TIDX(c), vir_uniform_ui(c, 0x0000003f));
+ vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), eidx, vir_uniform_ui(c, 2)),
+ V3D_QPU_PF_PUSHZ);
+ vir_MOV_cond(c, V3D_QPU_COND_IFA, tsy_conf, tso_id);
+
+ /* Lane 3: TSY opcode (set_quorum_wait_inc_check) */
+ struct qreg tsy_op = vir_uniform_ui(c, 16);
+ vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), eidx, vir_uniform_ui(c, 3)),
+ V3D_QPU_PF_PUSHZ);
+ vir_MOV_cond(c, V3D_QPU_COND_IFA, tsy_conf, tsy_op);
+
+ /* Emit TSY sync */
+ vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_SYNCB), tsy_conf);
+}

 static void
 ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
 {
 switch (instr->intrinsic) {
+ case nir_intrinsic_decl_reg:
+ case nir_intrinsic_load_reg:
+ case nir_intrinsic_store_reg:
+ break; /* Ignore these */
+
 case nir_intrinsic_load_uniform:
 ntq_emit_load_uniform(c, instr);
 break;

+ case nir_intrinsic_load_global_2x32:
+ ntq_emit_tmu_general(c, instr, false, true);
+ c->has_general_tmu_load = true;
+ break;
+
 case nir_intrinsic_load_ubo:
- if (!nir_src_is_divergent(instr->src[1]))
- ntq_emit_load_ubo_unifa(c, instr);
- else
- ntq_emit_tmu_general(c, instr, false);
- break;
-
- case nir_intrinsic_ssbo_atomic_add:
- case nir_intrinsic_ssbo_atomic_imin:
- case nir_intrinsic_ssbo_atomic_umin:
- case nir_intrinsic_ssbo_atomic_imax:
- case nir_intrinsic_ssbo_atomic_umax:
- case nir_intrinsic_ssbo_atomic_and:
- case nir_intrinsic_ssbo_atomic_or:
- case nir_intrinsic_ssbo_atomic_xor:
- case nir_intrinsic_ssbo_atomic_exchange:
- case nir_intrinsic_ssbo_atomic_comp_swap:
+ if (ntq_emit_inline_ubo_load(c, instr))
+ break;
+ FALLTHROUGH;
 case nir_intrinsic_load_ssbo:
+ if (!ntq_emit_load_unifa(c, instr)) {
+ ntq_emit_tmu_general(c, instr, false, false);
+ c->has_general_tmu_load = true;
+ }
+ break;
+
 case nir_intrinsic_store_ssbo:
- ntq_emit_tmu_general(c, instr, false);
- break;
-
- case nir_intrinsic_shared_atomic_add:
- case nir_intrinsic_shared_atomic_imin:
- case nir_intrinsic_shared_atomic_umin:
- case nir_intrinsic_shared_atomic_imax:
- case nir_intrinsic_shared_atomic_umax:
- case nir_intrinsic_shared_atomic_and:
- case nir_intrinsic_shared_atomic_or:
- case nir_intrinsic_shared_atomic_xor:
- case nir_intrinsic_shared_atomic_exchange:
- case nir_intrinsic_shared_atomic_comp_swap:
- case nir_intrinsic_load_shared:
+ case nir_intrinsic_ssbo_atomic:
+ case nir_intrinsic_ssbo_atomic_swap:
+ ntq_emit_tmu_general(c, instr, false, false);
+ break;
+
+ case
nir_intrinsic_store_global_2x32: + case nir_intrinsic_global_atomic_2x32: + case nir_intrinsic_global_atomic_swap_2x32: + ntq_emit_tmu_general(c, instr, false, true); + break; + + case nir_intrinsic_shared_atomic: + case nir_intrinsic_shared_atomic_swap: case nir_intrinsic_store_shared: - case nir_intrinsic_load_scratch: case nir_intrinsic_store_scratch: - ntq_emit_tmu_general(c, instr, true); + ntq_emit_tmu_general(c, instr, true, false); + break; + + case nir_intrinsic_load_scratch: + case nir_intrinsic_load_shared: + ntq_emit_tmu_general(c, instr, true, false); + c->has_general_tmu_load = true; break; - case nir_intrinsic_image_load: case nir_intrinsic_image_store: - case nir_intrinsic_image_atomic_add: - case nir_intrinsic_image_atomic_imin: - case nir_intrinsic_image_atomic_umin: - case nir_intrinsic_image_atomic_imax: - case nir_intrinsic_image_atomic_umax: - case nir_intrinsic_image_atomic_and: - case nir_intrinsic_image_atomic_or: - case nir_intrinsic_image_atomic_xor: - case nir_intrinsic_image_atomic_exchange: - case nir_intrinsic_image_atomic_comp_swap: - v3d40_vir_emit_image_load_store(c, instr); + case nir_intrinsic_image_atomic: + case nir_intrinsic_image_atomic_swap: + v3d_vir_emit_image_load_store(c, instr); + break; + + case nir_intrinsic_image_load: + v3d_vir_emit_image_load_store(c, instr); + /* Not really a general TMU load, but we only use this flag + * for NIR scheduling and we do schedule these under the same + * policy as general TMU. + */ + c->has_general_tmu_load = true; break; case nir_intrinsic_get_ssbo_size: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_GET_SSBO_SIZE, - nir_src_comp_as_uint(instr->src[0], 0))); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_GET_SSBO_SIZE, + nir_src_comp_as_uint(instr->src[0], 0))); break; case nir_intrinsic_get_ubo_size: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_GET_UBO_SIZE, - nir_src_comp_as_uint(instr->src[0], 0))); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_GET_UBO_SIZE, + nir_src_comp_as_uint(instr->src[0], 0))); break; case nir_intrinsic_load_user_clip_plane: for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) { - ntq_store_dest(c, &instr->dest, i, - vir_uniform(c, QUNIFORM_USER_CLIP_PLANE, - nir_intrinsic_ucp_id(instr) * - 4 + i)); + ntq_store_def(c, &instr->def, i, + vir_uniform(c, QUNIFORM_USER_CLIP_PLANE, + nir_intrinsic_ucp_id(instr) * + 4 + i)); } break; case nir_intrinsic_load_viewport_x_scale: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0)); break; case nir_intrinsic_load_viewport_y_scale: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0)); break; case nir_intrinsic_load_viewport_z_scale: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0)); break; case nir_intrinsic_load_viewport_z_offset: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0)); break; case nir_intrinsic_load_line_coord: - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->line_x)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->line_x)); break; case nir_intrinsic_load_line_width: - ntq_store_dest(c, 
&instr->dest, 0, - vir_uniform(c, QUNIFORM_LINE_WIDTH, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_LINE_WIDTH, 0)); break; case nir_intrinsic_load_aa_line_width: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0)); break; case nir_intrinsic_load_sample_mask_in: - ntq_store_dest(c, &instr->dest, 0, vir_MSF(c)); + ntq_store_def(c, &instr->def, 0, vir_MSF(c)); break; case nir_intrinsic_load_helper_invocation: vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ); struct qreg qdest = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA); - ntq_store_dest(c, &instr->dest, 0, qdest); + ntq_store_def(c, &instr->def, 0, qdest); break; case nir_intrinsic_load_front_face: /* The register contains 0 (front) or 1 (back), and we need to * turn it into a NIR bool where true means front. */ - ntq_store_dest(c, &instr->dest, 0, - vir_ADD(c, - vir_uniform_ui(c, -1), - vir_REVF(c))); + ntq_store_def(c, &instr->def, 0, + vir_ADD(c, + vir_uniform_ui(c, -1), + vir_REVF(c))); break; case nir_intrinsic_load_base_instance: - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->biid)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->biid)); break; case nir_intrinsic_load_instance_id: - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->iid)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->iid)); break; case nir_intrinsic_load_vertex_id: - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->vid)); + break; + + case nir_intrinsic_load_draw_id: + ntq_store_def(c, &instr->def, 0, vir_uniform(c, QUNIFORM_DRAW_ID, 0)); break; case nir_intrinsic_load_tlb_color_v3d: vir_emit_tlb_color_read(c, instr); break; + case nir_intrinsic_load_fep_w_v3d: + ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->payload_w)); + break; + case nir_intrinsic_load_input: ntq_emit_load_input(c, instr); break; @@ -2978,7 +3578,19 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) ntq_emit_image_size(c, instr); break; + /* FIXME: the Vulkan and SPIR-V specs specify that OpTerminate (which + * is intended to match the semantics of GLSL's discard) should + * terminate the invocation immediately. Our implementation doesn't + * do that. What we do is actually a demote by removing the invocations + * from the sample mask. Maybe we could be more strict and force an + * early termination by emitting a (maybe conditional) jump to the + * end section of the fragment shader for affected invocations. 
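[Editor's note: a minimal sketch of the demote semantics the FIXME above describes; the helper and its parameters are hypothetical, not driver code:]

    /* demote: the lane keeps executing (so derivatives in its quad stay
     * valid) but stops covering samples. A strict OpTerminate would, in
     * addition, end the lane's execution immediately. */
    static void
    demote_lane(uint32_t *sample_mask, uint32_t lane_sample_bits)
    {
            *sample_mask &= ~lane_sample_bits;
    }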
+ */ case nir_intrinsic_discard: + case nir_intrinsic_terminate: + c->emitted_discard = true; + FALLTHROUGH; + case nir_intrinsic_demote: ntq_flush_tmu(c); if (vir_in_nonuniform_control_flow(c)) { @@ -2993,7 +3605,11 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) } break; - case nir_intrinsic_discard_if: { + case nir_intrinsic_discard_if: + case nir_intrinsic_terminate_if: + c->emitted_discard = true; + FALLTHROUGH; + case nir_intrinsic_demote_if: { ntq_flush_tmu(c); enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, instr->src[0]); @@ -3011,102 +3627,80 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(), vir_uniform_ui(c, 0)), cond); - break; } - case nir_intrinsic_memory_barrier: - case nir_intrinsic_memory_barrier_buffer: - case nir_intrinsic_memory_barrier_image: - case nir_intrinsic_memory_barrier_shared: - case nir_intrinsic_memory_barrier_tcs_patch: - case nir_intrinsic_group_memory_barrier: - /* We don't do any instruction scheduling of these NIR - * instructions between each other, so we just need to make - * sure that the TMU operations before the barrier are flushed + case nir_intrinsic_barrier: + /* Ensure that the TMU operations before the barrier are flushed * before the ones after the barrier. */ ntq_flush_tmu(c); - break; - - case nir_intrinsic_control_barrier: - /* Emit a TSY op to get all invocations in the workgroup - * (actually supergroup) to block until the last invocation - * reaches the TSY op. - */ - ntq_flush_tmu(c); - if (c->devinfo->ver >= 42) { - vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC, - V3D_QPU_WADDR_SYNCB)); - } else { - struct qinst *sync = - vir_BARRIERID_dest(c, - vir_reg(QFILE_MAGIC, - V3D_QPU_WADDR_SYNCU)); - sync->uniform = - vir_get_uniform_index(c, QUNIFORM_CONSTANT, - 0xffffff00 | - V3D_TSY_WAIT_INC_CHECK); + if (nir_intrinsic_execution_scope(instr) != SCOPE_NONE) { + if (c->s->info.stage == MESA_SHADER_COMPUTE) + emit_compute_barrier(c); + else + emit_barrier(c); + /* The blocking of a TSY op only happens at the next + * thread switch. No texturing may be outstanding at the + * time of a TSY blocking operation. + */ + vir_emit_thrsw(c); } - - /* The blocking of a TSY op only happens at the next thread - * switch. No texturing may be outstanding at the time of a - * TSY blocking operation. 
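[Editor's note: a scalar view of the per-lane TSY configuration vector that emit_barrier() earlier in this patch assembles; the 16-lane width and thread_index value are illustrative assumptions:]

    uint32_t thread_index = 0; /* assumed TIDX value */
    uint32_t tsy_conf[16];
    for (int lane = 0; lane < 16; lane++)
            tsy_conf[lane] = 1; /* lane 0 is read as the quorum */
    tsy_conf[2] = thread_index & 0x3f; /* general-purpose TSO id */
    tsy_conf[3] = 16; /* set_quorum_wait_inc_check opcode */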
- */ - vir_emit_thrsw(c); break; case nir_intrinsic_load_num_workgroups: for (int i = 0; i < 3; i++) { - ntq_store_dest(c, &instr->dest, i, - vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS, - i)); + ntq_store_def(c, &instr->def, i, + vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS, + i)); } break; + case nir_intrinsic_load_workgroup_id_zero_base: case nir_intrinsic_load_workgroup_id: { struct qreg x = vir_AND(c, c->cs_payload[0], vir_uniform_ui(c, 0xffff)); + ntq_store_def(c, &instr->def, 0, x); struct qreg y = vir_SHR(c, c->cs_payload[0], vir_uniform_ui(c, 16)); + ntq_store_def(c, &instr->def, 1, y); struct qreg z = vir_AND(c, c->cs_payload[1], vir_uniform_ui(c, 0xffff)); + ntq_store_def(c, &instr->def, 2, z); + break; + } - /* We only support dispatch base in Vulkan */ - if (c->key->environment == V3D_ENVIRONMENT_VULKAN) { - x = vir_ADD(c, x, - vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 0)); - y = vir_ADD(c, y, - vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 1)); - z = vir_ADD(c, z, - vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 2)); - } + case nir_intrinsic_load_base_workgroup_id: { + struct qreg x = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 0); + ntq_store_def(c, &instr->def, 0, x); - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, x)); - ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, y)); - ntq_store_dest(c, &instr->dest, 2, vir_MOV(c, z)); + struct qreg y = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 1); + ntq_store_def(c, &instr->def, 1, y); + + struct qreg z = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 2); + ntq_store_def(c, &instr->def, 2, z); break; } case nir_intrinsic_load_local_invocation_index: - ntq_store_dest(c, &instr->dest, 0, - emit_load_local_invocation_index(c)); + ntq_store_def(c, &instr->def, 0, + emit_load_local_invocation_index(c)); break; case nir_intrinsic_load_subgroup_id: { /* This is basically the batch index, which is the Local * Invocation Index divided by the SIMD width). */ - STATIC_ASSERT(util_is_power_of_two_nonzero(V3D_CHANNELS)); + STATIC_ASSERT(IS_POT(V3D_CHANNELS) && V3D_CHANNELS > 0); const uint32_t divide_shift = ffs(V3D_CHANNELS) - 1; struct qreg lii = emit_load_local_invocation_index(c); - ntq_store_dest(c, &instr->dest, 0, - vir_SHR(c, lii, - vir_uniform_ui(c, divide_shift))); + ntq_store_def(c, &instr->def, 0, + vir_SHR(c, lii, + vir_uniform_ui(c, divide_shift))); break; } @@ -3143,8 +3737,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) struct qreg col = ntq_get_src(c, instr->src[0], 0); for (int i = 0; i < instr->num_components; i++) { struct qreg row = vir_uniform_ui(c, row_idx++); - ntq_store_dest(c, &instr->dest, i, - vir_LDVPMG_IN(c, row, col)); + ntq_store_def(c, &instr->def, i, + vir_LDVPMG_IN(c, row, col)); } break; } @@ -3160,47 +3754,47 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) * using ldvpm(v,d)_in (See Table 71). 
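[Editor's note: the cs_payload unpacking in the load_workgroup_id hunk above, restated as a plain scalar helper; a sketch of the packing scheme, not driver code:]

    /* gl_WorkGroupID is packed into two 32-bit payload registers:
     * payload0 = (y << 16) | x, and payload1 carries z in its low half. */
    static void
    unpack_workgroup_id(uint32_t payload0, uint32_t payload1, uint32_t wgid[3])
    {
            wgid[0] = payload0 & 0xffff; /* x */
            wgid[1] = payload0 >> 16;    /* y */
            wgid[2] = payload1 & 0xffff; /* z */
    }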
*/ assert(c->s->info.stage == MESA_SHADER_GEOMETRY); - ntq_store_dest(c, &instr->dest, 0, - vir_LDVPMV_IN(c, vir_uniform_ui(c, 0))); + ntq_store_def(c, &instr->def, 0, + vir_LDVPMV_IN(c, vir_uniform_ui(c, 0))); break; } case nir_intrinsic_load_invocation_id: - ntq_store_dest(c, &instr->dest, 0, vir_IID(c)); + ntq_store_def(c, &instr->def, 0, vir_IID(c)); break; case nir_intrinsic_load_fb_layers_v3d: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_FB_LAYERS, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_FB_LAYERS, 0)); break; case nir_intrinsic_load_sample_id: - ntq_store_dest(c, &instr->dest, 0, vir_SAMPID(c)); + ntq_store_def(c, &instr->def, 0, vir_SAMPID(c)); break; case nir_intrinsic_load_sample_pos: - ntq_store_dest(c, &instr->dest, 0, - vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)))); - ntq_store_dest(c, &instr->dest, 1, - vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)))); + ntq_store_def(c, &instr->def, 0, + vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)))); + ntq_store_def(c, &instr->def, 1, + vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)))); break; case nir_intrinsic_load_barycentric_at_offset: - ntq_store_dest(c, &instr->dest, 0, - vir_MOV(c, ntq_get_src(c, instr->src[0], 0))); - ntq_store_dest(c, &instr->dest, 1, - vir_MOV(c, ntq_get_src(c, instr->src[0], 1))); + ntq_store_def(c, &instr->def, 0, + vir_MOV(c, ntq_get_src(c, instr->src[0], 0))); + ntq_store_def(c, &instr->def, 1, + vir_MOV(c, ntq_get_src(c, instr->src[0], 1))); break; case nir_intrinsic_load_barycentric_pixel: - ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f)); - ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f)); + ntq_store_def(c, &instr->def, 0, vir_uniform_f(c, 0.0f)); + ntq_store_def(c, &instr->def, 1, vir_uniform_f(c, 0.0f)); break; case nir_intrinsic_load_barycentric_at_sample: { if (!c->fs_key->msaa) { - ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f)); - ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f)); + ntq_store_def(c, &instr->def, 0, vir_uniform_f(c, 0.0f)); + ntq_store_def(c, &instr->def, 1, vir_uniform_f(c, 0.0f)); return; } @@ -3208,8 +3802,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) struct qreg sample_idx = ntq_get_src(c, instr->src[0], 0); ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y); - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x)); - ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, offset_x)); + ntq_store_def(c, &instr->def, 1, vir_MOV(c, offset_y)); break; } @@ -3219,18 +3813,18 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) struct qreg offset_y = vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c))); - ntq_store_dest(c, &instr->dest, 0, - vir_FSUB(c, offset_x, vir_uniform_f(c, 0.5f))); - ntq_store_dest(c, &instr->dest, 1, - vir_FSUB(c, offset_y, vir_uniform_f(c, 0.5f))); + ntq_store_def(c, &instr->def, 0, + vir_FSUB(c, offset_x, vir_uniform_f(c, 0.5f))); + ntq_store_def(c, &instr->def, 1, + vir_FSUB(c, offset_y, vir_uniform_f(c, 0.5f))); break; } case nir_intrinsic_load_barycentric_centroid: { struct qreg offset_x, offset_y; ntq_get_barycentric_centroid(c, &offset_x, &offset_y); - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x)); - ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, offset_x)); + ntq_store_def(c, &instr->def, 1, vir_MOV(c, offset_y)); break; } @@ -3249,8 +3843,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, 
nir_intrinsic_instr *instr) */ if (!c->fs_key->msaa || c->interp[input_idx].vp.file == QFILE_NULL) { - ntq_store_dest(c, &instr->dest, i, - vir_MOV(c, c->inputs[input_idx])); + ntq_store_def(c, &instr->def, i, + vir_MOV(c, c->inputs[input_idx])); continue; } @@ -3268,30 +3862,150 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) ntq_emit_load_interpolated_input(c, p, C, offset_x, offset_y, interp_mode); - ntq_store_dest(c, &instr->dest, i, result); + ntq_store_def(c, &instr->def, i, result); } break; } case nir_intrinsic_load_subgroup_size: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform_ui(c, V3D_CHANNELS)); + ntq_store_def(c, &instr->def, 0, + vir_uniform_ui(c, V3D_CHANNELS)); break; case nir_intrinsic_load_subgroup_invocation: - ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c)); + ntq_store_def(c, &instr->def, 0, vir_EIDX(c)); break; case nir_intrinsic_elect: { - set_a_flags_for_subgroup(c); - struct qreg first = vir_FLAFIRST(c); + struct qreg first; + if (vir_in_nonuniform_control_flow(c)) { + /* Sets A=1 for lanes enabled in the execution mask */ + vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_PF_PUSHZ); + /* Updates A ANDing with lanes enabled in MSF */ + vir_set_uf(c, vir_MSF_dest(c, vir_nop_reg()), + V3D_QPU_UF_ANDNZ); + first = vir_FLAFIRST(c); + } else { + /* Sets A=1 for inactive lanes */ + vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), + V3D_QPU_PF_PUSHZ); + first = vir_FLNAFIRST(c); + } - /* Produce a boolean result from Flafirst */ + /* Produce a boolean result */ vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), first, vir_uniform_ui(c, 1)), V3D_QPU_PF_PUSHZ); struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA); - ntq_store_dest(c, &instr->dest, 0, result); + ntq_store_def(c, &instr->def, 0, result); + break; + } + + case nir_intrinsic_ballot: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c); + struct qreg res = vir_get_temp(c); + vir_set_cond(vir_BALLOT_dest(c, res, value), cond); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, res)); + break; + } + + case nir_intrinsic_read_invocation: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + struct qreg index = ntq_get_src(c, instr->src[1], 0); + struct qreg res = vir_SHUFFLE(c, value, index); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, res)); + break; + } + + case nir_intrinsic_read_first_invocation: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c); + struct qreg res = vir_get_temp(c); + vir_set_cond(vir_BCASTF_dest(c, res, value), cond); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, res)); + break; + } + + case nir_intrinsic_shuffle: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + struct qreg indices = ntq_get_src(c, instr->src[1], 0); + struct qreg res = vir_SHUFFLE(c, value, indices); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, res)); + break; + } + + case nir_intrinsic_vote_feq: + case nir_intrinsic_vote_ieq: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c); + struct qreg res = vir_get_temp(c); + vir_set_cond(instr->intrinsic == nir_intrinsic_vote_ieq ? 
+ vir_ALLEQ_dest(c, res, value) :
+ vir_ALLFEQ_dest(c, res, value),
+ cond);
+
+ /* Produce boolean result */
+ vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
+ V3D_QPU_PF_PUSHZ);
+ struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFNA);
+ ntq_store_def(c, &instr->def, 0, result);
+ break;
+ }
+
+ case nir_intrinsic_vote_all: {
+ assert(c->devinfo->ver >= 71);
+ struct qreg value = ntq_get_src(c, instr->src[0], 0);
+ enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
+ struct qreg res = vir_get_temp(c);
+ vir_set_cond(vir_ALLEQ_dest(c, res, value), cond);
+
+ /* We want to check if 'all lanes are equal (alleq != 0) and
+ * their value is True (value != 0)'.
+ *
+ * The first MOV.pushz generates predicate for 'alleq == 0'.
+ * The second MOV.NORZ generates predicate for:
+ * '!(alleq == 0) & !(value == 0)'.
+ */
+ vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
+ V3D_QPU_PF_PUSHZ);
+ vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), value),
+ V3D_QPU_UF_NORZ);
+ struct qreg result =
+ ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
+ ntq_store_def(c, &instr->def, 0, result);
+ break;
+ }
+
+ case nir_intrinsic_vote_any: {
+ assert(c->devinfo->ver >= 71);
+ struct qreg value = ntq_get_src(c, instr->src[0], 0);
+ enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
+ struct qreg res = vir_get_temp(c);
+ vir_set_cond(vir_ALLEQ_dest(c, res, value), cond);
+
+ /* We want to check 'not (all lanes are equal (alleq != 0)
+ * and their value is False (value == 0))'.
+ *
+ * The first MOV.pushz generates predicate for 'alleq == 0'.
+ * The second MOV.NORNZ generates predicate for:
+ * '!(alleq == 0) & (value == 0)'.
+ * The IFNA condition negates the predicate when evaluated:
+ * '!(!(alleq == 0) & (value == 0))'.
+ */
+ vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
+ V3D_QPU_PF_PUSHZ);
+ vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), value),
+ V3D_QPU_UF_NORNZ);
+ struct qreg result =
+ ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFNA);
+ ntq_store_def(c, &instr->def, 0, result);
 break;
 }

@@ -3300,8 +4014,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
 break;

 case nir_intrinsic_load_view_index:
- ntq_store_dest(c, &instr->dest, 0,
- vir_uniform(c, QUNIFORM_VIEW_INDEX, 0));
+ ntq_store_def(c, &instr->def, 0,
+ vir_uniform(c, QUNIFORM_VIEW_INDEX, 0));
 break;

 default:
@@ -3329,6 +4043,36 @@ ntq_activate_execute_for_block(struct v3d_compile *c)
 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
 }

+static bool
+is_cheap_block(nir_block *block)
+{
+ int32_t cost = 3;
+ nir_foreach_instr(instr, block) {
+ switch (instr->type) {
+ case nir_instr_type_alu:
+ case nir_instr_type_undef:
+ case nir_instr_type_load_const:
+ if (--cost <= 0)
+ return false;
+ break;
+ case nir_instr_type_intrinsic: {
+ nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+ switch (intr->intrinsic) {
+ case nir_intrinsic_decl_reg:
+ case nir_intrinsic_load_reg:
+ case nir_intrinsic_store_reg:
+ continue;
+ default:
+ return false;
+ }
+ }
+ default:
+ return false;
+ }
+ }
+ return true;
+}
+
 static void
 ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt)
 {
@@ -3473,15 +4217,27 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
 c->execute,
 vir_uniform_ui(c, else_block->index));

- /* Jump to ELSE if nothing is active for THEN, otherwise fall
- * through.
+ /* Set the flags for taking the THEN block */ + vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_PF_PUSHZ); + + /* Jump to ELSE if nothing is active for THEN (unless THEN block is + * so small it won't pay off), otherwise fall through. */ - vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ); - vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA); - vir_link_blocks(c->cur_block, else_block); + bool is_cheap = exec_list_is_singular(&if_stmt->then_list) && + is_cheap_block(nir_if_first_then_block(if_stmt)); + if (!is_cheap) { + vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA); + vir_link_blocks(c->cur_block, else_block); + } vir_link_blocks(c->cur_block, then_block); - /* Process the THEN block. */ + /* Process the THEN block. + * + * Notice we don't call ntq_activate_execute_for_block here on purpose: + * c->execute is already set up to be 0 for lanes that must take the + * THEN block. + */ vir_set_emit_block(c, then_block); ntq_emit_cf_list(c, &if_stmt->then_list); @@ -3495,13 +4251,19 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt) vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, after_block->index)); - /* If everything points at ENDIF, then jump there immediately. */ - vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), - c->execute, - vir_uniform_ui(c, after_block->index)), - V3D_QPU_PF_PUSHZ); - vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA); - vir_link_blocks(c->cur_block, after_block); + /* If everything points at ENDIF, then jump there immediately + * (unless ELSE block is so small it won't pay off). + */ + bool is_cheap = exec_list_is_singular(&if_stmt->else_list) && + is_cheap_block(nir_else_block); + if (!is_cheap) { + vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), + c->execute, + vir_uniform_ui(c, after_block->index)), + V3D_QPU_PF_PUSHZ); + vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA); + vir_link_blocks(c->cur_block, after_block); + } vir_link_blocks(c->cur_block, else_block); vir_set_emit_block(c, else_block); @@ -3605,7 +4367,7 @@ ntq_emit_instr(struct v3d_compile *c, nir_instr *instr) ntq_emit_load_const(c, nir_instr_as_load_const(instr)); break; - case nir_instr_type_ssa_undef: + case nir_instr_type_undef: unreachable("Should've been lowered by nir_lower_undef_to_zero"); break; @@ -3699,7 +4461,6 @@ ntq_emit_nonuniform_loop(struct v3d_compile *c, nir_loop *loop) static void ntq_emit_uniform_loop(struct v3d_compile *c, nir_loop *loop) { - c->loop_cont_block = vir_new_block(c); c->loop_break_block = vir_new_block(c); @@ -3719,6 +4480,25 @@ ntq_emit_uniform_loop(struct v3d_compile *c, nir_loop *loop) static void ntq_emit_loop(struct v3d_compile *c, nir_loop *loop) { + assert(!nir_loop_has_continue_construct(loop)); + + /* Disable flags optimization for loop conditions. The problem here is + * that we can have code like this: + * + * // block_0 + * vec1 32 con ssa_9 = ine32 ssa_8, ssa_2 + * loop { + * // block_1 + * if ssa_9 { + * + * In this example we emit flags to compute ssa_9 and the optimization + * will skip regenerating them again for the loop condition in the + * loop continue block (block_1). However, this is not safe after the + * first iteration because the loop body can stomp the flags if it has + * any conditionals. 
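[Editor's note: condensing the is_cheap_block() heuristic above: register bookkeeping intrinsics are ignored, any other intrinsic or instruction type disqualifies the block, and the third ALU/load_const/undef instruction makes `--cost <= 0` fire, so at most two such instructions qualify. A scalar model of that threshold:]

    /* Equivalent predicate to is_cheap_block(), assuming the instruction
     * counts have been gathered separately (editor's sketch). */
    static bool
    is_cheap_model(int num_alu_like, int num_other_instrs)
    {
            return num_other_instrs == 0 && num_alu_like <= 2;
    }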
+ */ + c->flags_temp = -1; + bool was_in_control_flow = c->in_control_flow; c->in_control_flow = true; @@ -3777,7 +4557,7 @@ ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list) static void ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl) { - ntq_setup_registers(c, &impl->registers); + ntq_setup_registers(c, impl); ntq_emit_cf_list(c, &impl->body); } @@ -3786,7 +4566,12 @@ nir_to_vir(struct v3d_compile *c) { switch (c->s->info.stage) { case MESA_SHADER_FRAGMENT: - c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0)); + c->start_msf = vir_MSF(c); + if (c->devinfo->ver < 71) + c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0)); + else + c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 3)); + c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1)); c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2)); @@ -3799,25 +4584,16 @@ nir_to_vir(struct v3d_compile *c) emit_fragment_varying(c, NULL, -1, 0, 0); } - if (c->fs_key->is_points && - (c->devinfo->ver < 40 || program_reads_point_coord(c))) { + if (c->fs_key->is_points && program_reads_point_coord(c)) { c->point_x = emit_fragment_varying(c, NULL, -1, 0, 0); c->point_y = emit_fragment_varying(c, NULL, -1, 0, 0); c->uses_implicit_point_line_varyings = true; } else if (c->fs_key->is_lines && - (c->devinfo->ver < 40 || - BITSET_TEST(c->s->info.system_values_read, + (BITSET_TEST(c->s->info.system_values_read, SYSTEM_VALUE_LINE_COORD))) { c->line_x = emit_fragment_varying(c, NULL, -1, 0, 0); c->uses_implicit_point_line_varyings = true; } - - c->force_per_sample_msaa = - c->s->info.fs.uses_sample_qualifier || - BITSET_TEST(c->s->info.system_values_read, - SYSTEM_VALUE_SAMPLE_ID) || - BITSET_TEST(c->s->info.system_values_read, - SYSTEM_VALUE_SAMPLE_POS); break; case MESA_SHADER_COMPUTE: /* Set up the TSO for barriers, assuming we do some. */ @@ -3826,8 +4602,13 @@ nir_to_vir(struct v3d_compile *c) V3D_QPU_WADDR_SYNC)); } - c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0)); - c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); + if (c->devinfo->ver == 42) { + c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0)); + c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); + } else if (c->devinfo->ver >= 71) { + c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 3)); + c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); + } /* Set up the division between gl_LocalInvocationIndex and * wg_in_mem in the payload reg. @@ -3889,7 +4670,7 @@ nir_to_vir(struct v3d_compile *c) /* Find the main function and emit the body. */ nir_foreach_function(function, c->s) { - assert(strcmp(function->name, "main") == 0); + assert(function->is_entrypoint); assert(function->impl); ntq_emit_impl(c, function->impl); } @@ -3932,25 +4713,12 @@ vir_emit_last_thrsw(struct v3d_compile *c, { *restore_last_thrsw = c->last_thrsw; - /* On V3D before 4.1, we need a TMU op to be outstanding when thread - * switching, so disable threads if we didn't do any TMU ops (each of - * which would have emitted a THRSW). - */ - if (!c->last_thrsw_at_top_level && c->devinfo->ver < 41) { - c->threads = 1; - if (c->last_thrsw) - vir_remove_thrsw(c); - *restore_last_thrsw = NULL; - } - /* If we're threaded and the last THRSW was in conditional code, then * we need to emit another one so that we can flag it as the last * thrsw. 
*/ - if (c->last_thrsw && !c->last_thrsw_at_top_level) { - assert(c->devinfo->ver >= 41); + if (c->last_thrsw && !c->last_thrsw_at_top_level) vir_emit_thrsw(c); - } /* If we're threaded, then we need to mark the last THRSW instruction * so we can emit a pair of them at QPU emit time. @@ -3958,10 +4726,8 @@ vir_emit_last_thrsw(struct v3d_compile *c, * For V3D 4.x, we can spawn the non-fragment shaders already in the * post-last-THRSW state, so we can skip this. */ - if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) { - assert(c->devinfo->ver >= 41); + if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) vir_emit_thrsw(c); - } /* If we have not inserted a last thread switch yet, do it now to ensure * any potential spilling we do happens before this. If we don't spill @@ -4006,8 +4772,8 @@ vir_check_payload_w(struct v3d_compile *c) vir_for_each_inst_inorder(inst, c) { for (int i = 0; i < vir_get_nsrc(inst); i++) { - if (inst->src[i].file == QFILE_REG && - inst->src[i].index == 0) { + if (inst->src[i].file == c->payload_w.file && + inst->src[i].index == c->payload_w.index) { c->uses_center_w = true; return; } @@ -4018,8 +4784,8 @@ vir_check_payload_w(struct v3d_compile *c) void v3d_nir_to_vir(struct v3d_compile *c) { - if (V3D_DEBUG & (V3D_DEBUG_NIR | - v3d_debug_flag_for_shader_stage(c->s->info.stage))) { + if (V3D_DBG(NIR) || + v3d_debug_flag_for_shader_stage(c->s->info.stage)) { fprintf(stderr, "%s prog %d/%d NIR:\n", vir_get_stage_name(c), c->program_id, c->variant_id); @@ -4053,8 +4819,8 @@ v3d_nir_to_vir(struct v3d_compile *c) unreachable("bad stage"); } - if (V3D_DEBUG & (V3D_DEBUG_VIR | - v3d_debug_flag_for_shader_stage(c->s->info.stage))) { + if (V3D_DBG(VIR) || + v3d_debug_flag_for_shader_stage(c->s->info.stage)) { fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n", vir_get_stage_name(c), c->program_id, c->variant_id); @@ -4075,8 +4841,8 @@ v3d_nir_to_vir(struct v3d_compile *c) * instructions until the results are needed. */ - if (V3D_DEBUG & (V3D_DEBUG_VIR | - v3d_debug_flag_for_shader_stage(c->s->info.stage))) { + if (V3D_DBG(VIR) || + v3d_debug_flag_for_shader_stage(c->s->info.stage)) { fprintf(stderr, "%s prog %d/%d VIR:\n", vir_get_stage_name(c), c->program_id, c->variant_id); @@ -4087,19 +4853,17 @@ v3d_nir_to_vir(struct v3d_compile *c) /* Attempt to allocate registers for the temporaries. If we fail, * reduce thread count and try again. */ - int min_threads = (c->devinfo->ver >= 41) ? 2 : 1; + int min_threads = 2; struct qpu_reg *temp_registers; while (true) { - bool spilled; - temp_registers = v3d_register_allocate(c, &spilled); - if (spilled) - continue; - - if (temp_registers) + temp_registers = v3d_register_allocate(c); + if (temp_registers) { + assert(c->spills + c->fills <= c->max_tmu_spills); break; + } if (c->threads == min_threads && - (V3D_DEBUG & V3D_DEBUG_RA)) { + V3D_DBG(RA)) { fprintf(stderr, "Failed to register allocate using %s\n", c->fallback_scheduler ? 
"the fallback scheduler:" : @@ -4116,18 +4880,20 @@ v3d_nir_to_vir(struct v3d_compile *c) } if (c->threads <= MAX2(c->min_threads_for_reg_alloc, min_threads)) { - if (V3D_DEBUG & V3D_DEBUG_PERF) { + if (V3D_DBG(PERF)) { fprintf(stderr, - "Failed to register allocate %s at " - "%d threads.\n", vir_get_stage_name(c), - c->threads); + "Failed to register allocate %s " + "prog %d/%d at %d threads.\n", + vir_get_stage_name(c), + c->program_id, c->variant_id, c->threads); } c->compilation_result = V3D_COMPILATION_FAILED_REGISTER_ALLOCATION; return; } - c->spill_count = 0; + c->spills = 0; + c->fills = 0; c->threads /= 2; if (c->threads == 1) @@ -4141,8 +4907,8 @@ v3d_nir_to_vir(struct v3d_compile *c) vir_restore_last_thrsw(c, restore_last_thrsw, restore_scoreboard_lock); if (c->spills && - (V3D_DEBUG & (V3D_DEBUG_VIR | - v3d_debug_flag_for_shader_stage(c->s->info.stage)))) { + (V3D_DBG(VIR) || + v3d_debug_flag_for_shader_stage(c->s->info.stage))) { fprintf(stderr, "%s prog %d/%d spilled VIR:\n", vir_get_stage_name(c), c->program_id, c->variant_id); |