Diffstat (limited to 'src/broadcom/compiler')
27 files changed, 5299 insertions, 2647 deletions
diff --git a/src/broadcom/compiler/meson.build b/src/broadcom/compiler/meson.build index 95156140ad9..d5aafb3879e 100644 --- a/src/broadcom/compiler/meson.build +++ b/src/broadcom/compiler/meson.build @@ -32,23 +32,22 @@ libbroadcom_compiler_files = files( 'vir_to_qpu.c', 'qpu_schedule.c', 'qpu_validate.c', - 'v3d33_tex.c', - 'v3d40_tex.c', - 'v3d33_vpm_setup.c', + 'v3d_tex.c', 'v3d_compiler.h', 'v3d_nir_lower_io.c', 'v3d_nir_lower_image_load_store.c', 'v3d_nir_lower_line_smooth.c', + 'v3d_nir_lower_load_store_bitsize.c', 'v3d_nir_lower_logic_ops.c', - 'v3d_nir_lower_robust_buffer_access.c', 'v3d_nir_lower_scratch.c', 'v3d_nir_lower_txf_ms.c', + 'v3d_packing.c', ) libbroadcom_compiler = static_library( - ['broadcom_compiler', v3d_xml_pack], - libbroadcom_compiler_files, - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom], + 'broadcom_compiler', + [libbroadcom_compiler_files, v3d_xml_pack], + include_directories : [inc_include, inc_src, inc_gallium, inc_gallium_aux, inc_broadcom], c_args : [no_override_init_args], gnu_symbol_visibility : 'hidden', dependencies : [dep_libdrm, dep_valgrind, idep_nir_headers], diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c index d0a89f1a7d4..acc62a092f2 100644 --- a/src/broadcom/compiler/nir_to_vir.c +++ b/src/broadcom/compiler/nir_to_vir.c @@ -38,7 +38,7 @@ #define __gen_address_type uint32_t #define __gen_address_offset(reloc) (*reloc) #define __gen_emit_reloc(cl, reloc) -#include "cle/v3d_packet_v41_pack.h" +#include "cle/v3d_packet_v42_pack.h" #define GENERAL_TMU_LOOKUP_PER_QUAD (0 << 7) #define GENERAL_TMU_LOOKUP_PER_PIXEL (1 << 7) @@ -164,7 +164,7 @@ vir_emit_thrsw(struct v3d_compile *c) c->last_thrsw->qpu.sig.thrsw = true; c->last_thrsw_at_top_level = !c->in_control_flow; - /* We need to lock the scoreboard before any tlb acess happens. If this + /* We need to lock the scoreboard before any tlb access happens. If this * thread switch comes after we have emitted a tlb load, then it means * that we can't lock on the last thread switch any more. */ @@ -187,6 +187,28 @@ v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src) } static uint32_t +v3d_general_tmu_op_for_atomic(nir_intrinsic_instr *instr) +{ + nir_atomic_op atomic_op = nir_intrinsic_atomic_op(instr); + switch (atomic_op) { + case nir_atomic_op_iadd: + return instr->intrinsic == nir_intrinsic_ssbo_atomic ? 
+ v3d_get_op_for_atomic_add(instr, 2) : + v3d_get_op_for_atomic_add(instr, 1); + case nir_atomic_op_imin: return V3D_TMU_OP_WRITE_SMIN; + case nir_atomic_op_umin: return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR; + case nir_atomic_op_imax: return V3D_TMU_OP_WRITE_SMAX; + case nir_atomic_op_umax: return V3D_TMU_OP_WRITE_UMAX; + case nir_atomic_op_iand: return V3D_TMU_OP_WRITE_AND_READ_INC; + case nir_atomic_op_ior: return V3D_TMU_OP_WRITE_OR_READ_DEC; + case nir_atomic_op_ixor: return V3D_TMU_OP_WRITE_XOR_READ_NOT; + case nir_atomic_op_xchg: return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH; + case nir_atomic_op_cmpxchg: return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH; + default: unreachable("unknown atomic op"); + } +} + +static uint32_t v3d_general_tmu_op(nir_intrinsic_instr *instr) { switch (instr->intrinsic) { @@ -195,41 +217,21 @@ v3d_general_tmu_op(nir_intrinsic_instr *instr) case nir_intrinsic_load_uniform: case nir_intrinsic_load_shared: case nir_intrinsic_load_scratch: + case nir_intrinsic_load_global_2x32: case nir_intrinsic_store_ssbo: case nir_intrinsic_store_shared: case nir_intrinsic_store_scratch: + case nir_intrinsic_store_global_2x32: return V3D_TMU_OP_REGULAR; - case nir_intrinsic_ssbo_atomic_add: - return v3d_get_op_for_atomic_add(instr, 2); - case nir_intrinsic_shared_atomic_add: - return v3d_get_op_for_atomic_add(instr, 1); - case nir_intrinsic_ssbo_atomic_imin: - case nir_intrinsic_shared_atomic_imin: - return V3D_TMU_OP_WRITE_SMIN; - case nir_intrinsic_ssbo_atomic_umin: - case nir_intrinsic_shared_atomic_umin: - return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR; - case nir_intrinsic_ssbo_atomic_imax: - case nir_intrinsic_shared_atomic_imax: - return V3D_TMU_OP_WRITE_SMAX; - case nir_intrinsic_ssbo_atomic_umax: - case nir_intrinsic_shared_atomic_umax: - return V3D_TMU_OP_WRITE_UMAX; - case nir_intrinsic_ssbo_atomic_and: - case nir_intrinsic_shared_atomic_and: - return V3D_TMU_OP_WRITE_AND_READ_INC; - case nir_intrinsic_ssbo_atomic_or: - case nir_intrinsic_shared_atomic_or: - return V3D_TMU_OP_WRITE_OR_READ_DEC; - case nir_intrinsic_ssbo_atomic_xor: - case nir_intrinsic_shared_atomic_xor: - return V3D_TMU_OP_WRITE_XOR_READ_NOT; - case nir_intrinsic_ssbo_atomic_exchange: - case nir_intrinsic_shared_atomic_exchange: - return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH; - case nir_intrinsic_ssbo_atomic_comp_swap: - case nir_intrinsic_shared_atomic_comp_swap: - return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH; + + case nir_intrinsic_ssbo_atomic: + case nir_intrinsic_ssbo_atomic_swap: + case nir_intrinsic_shared_atomic: + case nir_intrinsic_shared_atomic_swap: + case nir_intrinsic_global_atomic_2x32: + case nir_intrinsic_global_atomic_swap_2x32: + return v3d_general_tmu_op_for_atomic(instr); + default: unreachable("unknown intrinsic op"); } @@ -270,13 +272,13 @@ ntq_flush_tmu(struct v3d_compile *c) bool emitted_tmuwt = false; for (int i = 0; i < c->tmu.flush_count; i++) { if (c->tmu.flush[i].component_mask > 0) { - nir_dest *dest = c->tmu.flush[i].dest; - assert(dest); + nir_def *def = c->tmu.flush[i].def; + assert(def); for (int j = 0; j < 4; j++) { if (c->tmu.flush[i].component_mask & (1 << j)) { - ntq_store_dest(c, dest, j, - vir_MOV(c, vir_LDTMU(c))); + ntq_store_def(c, def, j, + vir_MOV(c, vir_LDTMU(c))); } } } else if (!emitted_tmuwt) { @@ -292,12 +294,12 @@ ntq_flush_tmu(struct v3d_compile *c) /** * Queues a pending thread switch + LDTMU/TMUWT for a TMU operation. 
The caller - * is reponsible for ensuring that doing this doesn't overflow the TMU fifos, + * is responsible for ensuring that doing this doesn't overflow the TMU fifos, * and more specifically, the output fifo, since that can't stall. */ void ntq_add_pending_tmu_flush(struct v3d_compile *c, - nir_dest *dest, + nir_def *def, uint32_t component_mask) { const uint32_t num_components = util_bitcount(component_mask); @@ -305,13 +307,18 @@ ntq_add_pending_tmu_flush(struct v3d_compile *c, if (num_components > 0) { c->tmu.output_fifo_size += num_components; - if (!dest->is_ssa) - _mesa_set_add(c->tmu.outstanding_regs, dest->reg.reg); + + nir_intrinsic_instr *store = nir_store_reg_for_def(def); + if (store != NULL) { + nir_def *reg = store->src[1].ssa; + _mesa_set_add(c->tmu.outstanding_regs, reg); + } } - c->tmu.flush[c->tmu.flush_count].dest = dest; + c->tmu.flush[c->tmu.flush_count].def = def; c->tmu.flush[c->tmu.flush_count].component_mask = component_mask; c->tmu.flush_count++; + c->tmu.total_count++; if (c->disable_tmu_pipelining) ntq_flush_tmu(c); @@ -342,6 +349,7 @@ emit_tmu_general_store_writes(struct v3d_compile *c, uint32_t base_const_offset, uint32_t *writemask, uint32_t *const_offset, + uint32_t *type_size, uint32_t *tmu_writes) { struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD); @@ -371,7 +379,9 @@ emit_tmu_general_store_writes(struct v3d_compile *c, /* Update the offset for the TMU write based on the * the first component we are writing. */ - *const_offset = base_const_offset + first_component * 4; + *type_size = nir_src_bit_size(instr->src[0]) / 8; + *const_offset = + base_const_offset + first_component * (*type_size); /* Clear these components from the writemask */ uint32_t written_mask = @@ -433,6 +443,7 @@ emit_tmu_general_address_write(struct v3d_compile *c, int offset_src, struct qreg base_offset, uint32_t const_offset, + uint32_t dest_components, uint32_t *tmu_writes) { if (mode == MODE_COUNT) { @@ -478,6 +489,8 @@ emit_tmu_general_address_write(struct v3d_compile *c, if (vir_in_nonuniform_control_flow(c)) vir_set_cond(tmu, V3D_QPU_COND_IFA); + + tmu->ldtmu_count = dest_components; } /** @@ -486,7 +499,7 @@ emit_tmu_general_address_write(struct v3d_compile *c, */ static void ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, - bool is_shared_or_scratch) + bool is_shared_or_scratch, bool is_global) { uint32_t tmu_op = v3d_general_tmu_op(instr); @@ -495,25 +508,32 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, * amount to add/sub, as that is implicit. 
*/ bool atomic_add_replaced = - ((instr->intrinsic == nir_intrinsic_ssbo_atomic_add || - instr->intrinsic == nir_intrinsic_shared_atomic_add) && + (instr->intrinsic == nir_intrinsic_ssbo_atomic || + instr->intrinsic == nir_intrinsic_shared_atomic || + instr->intrinsic == nir_intrinsic_global_atomic_2x32) && + nir_intrinsic_atomic_op(instr) == nir_atomic_op_iadd && (tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC || - tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC)); + tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC); bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo || instr->intrinsic == nir_intrinsic_store_scratch || - instr->intrinsic == nir_intrinsic_store_shared); + instr->intrinsic == nir_intrinsic_store_shared || + instr->intrinsic == nir_intrinsic_store_global_2x32); bool is_load = (instr->intrinsic == nir_intrinsic_load_uniform || instr->intrinsic == nir_intrinsic_load_ubo || instr->intrinsic == nir_intrinsic_load_ssbo || instr->intrinsic == nir_intrinsic_load_scratch || - instr->intrinsic == nir_intrinsic_load_shared); + instr->intrinsic == nir_intrinsic_load_shared || + instr->intrinsic == nir_intrinsic_load_global_2x32); if (!is_load) c->tmu_dirty_rcl = true; - bool has_index = !is_shared_or_scratch; + if (is_global) + c->has_global_address = true; + + bool has_index = !is_shared_or_scratch && !is_global; int offset_src; if (instr->intrinsic == nir_intrinsic_load_uniform) { @@ -522,6 +542,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, instr->intrinsic == nir_intrinsic_load_ubo || instr->intrinsic == nir_intrinsic_load_scratch || instr->intrinsic == nir_intrinsic_load_shared || + instr->intrinsic == nir_intrinsic_load_global_2x32 || atomic_add_replaced) { offset_src = 0 + has_index; } else if (is_store) { @@ -542,13 +563,11 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, v3d_unit_data_create(0, const_offset)); const_offset = 0; } else if (instr->intrinsic == nir_intrinsic_load_ubo) { - uint32_t index = nir_src_as_uint(instr->src[0]); - /* On OpenGL QUNIFORM_UBO_ADDR takes a UBO index - * shifted up by 1 (0 is gallium's constant buffer 0). + /* QUNIFORM_UBO_ADDR takes a UBO index shifted up by 1 (0 + * is gallium's constant buffer 0 in GL and push constants + * in Vulkan)). */ - if (c->key->environment == V3D_ENVIRONMENT_OPENGL) - index++; - + uint32_t index = nir_src_as_uint(instr->src[0]) + 1; base_offset = vir_uniform(c, QUNIFORM_UBO_ADDR, v3d_unit_data_create(index, const_offset)); @@ -565,10 +584,16 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, base_offset = c->cs_shared_offset; const_offset += nir_intrinsic_base(instr); } + } else if (is_global) { + /* Global load/store intrinsics use gloal addresses, so the + * offset is the target address and we don't need to add it + * to a base offset. + */ + base_offset = vir_uniform_ui(c, 0); } else { + uint32_t idx = is_store ? 1 : 0; base_offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET, - nir_src_as_uint(instr->src[is_store ? 
- 1 : 0])); + nir_src_comp_as_uint(instr->src[idx], 0)); } /* We are ready to emit TMU register writes now, but before we actually @@ -588,16 +613,21 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, for (enum emit_mode mode = MODE_COUNT; mode != MODE_LAST; mode++) { assert(mode == MODE_COUNT || tmu_writes > 0); + uint32_t type_size = 4; + if (is_store) { emit_tmu_general_store_writes(c, mode, instr, base_const_offset, &writemask, &const_offset, + &type_size, &tmu_writes); } else if (!is_load && !atomic_add_replaced) { - emit_tmu_general_atomic_writes(c, mode, instr, - tmu_op, has_index, - &tmu_writes); + emit_tmu_general_atomic_writes(c, mode, instr, + tmu_op, has_index, + &tmu_writes); + } else if (is_load) { + type_size = instr->def.bit_size / 8; } /* For atomics we use 32bit except for CMPXCHG, that we need @@ -618,17 +648,40 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, v3d_tmu_get_type_from_op(tmu_op, !is_load) == V3D_TMU_OP_TYPE_ATOMIC; + /* Only load per-quad if we can be certain that all + * lines in the quad are active. Notice that demoted + * invocations, unlike terminated ones, are still + * active: we want to skip memory writes for them but + * loads should still work. + */ uint32_t perquad = - is_load && !vir_in_nonuniform_control_flow(c) - ? GENERAL_TMU_LOOKUP_PER_QUAD - : GENERAL_TMU_LOOKUP_PER_PIXEL; + is_load && !vir_in_nonuniform_control_flow(c) && + ((c->s->info.stage == MESA_SHADER_FRAGMENT && + c->s->info.fs.needs_quad_helper_invocations && + !c->emitted_discard) || + c->s->info.uses_wide_subgroup_intrinsics) ? + GENERAL_TMU_LOOKUP_PER_QUAD : + GENERAL_TMU_LOOKUP_PER_PIXEL; config = 0xffffff00 | tmu_op << 3 | perquad; if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) { config |= GENERAL_TMU_LOOKUP_TYPE_VEC2; } else if (is_atomic || num_components == 1) { - config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI; + switch (type_size) { + case 4: + config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI; + break; + case 2: + config |= GENERAL_TMU_LOOKUP_TYPE_16BIT_UI; + break; + case 1: + config |= GENERAL_TMU_LOOKUP_TYPE_8BIT_UI; + break; + default: + unreachable("Unsupported bitsize"); + } } else { + assert(type_size == 4); config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 + num_components - 2; } @@ -637,7 +690,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, emit_tmu_general_address_write(c, mode, instr, config, dynamic_src, offset_src, base_offset, const_offset, - &tmu_writes); + dest_components, &tmu_writes); assert(tmu_writes > 0); if (mode == MODE_COUNT) { @@ -660,7 +713,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, */ const uint32_t component_mask = (1 << dest_components) - 1; - ntq_add_pending_tmu_flush(c, &instr->dest, + ntq_add_pending_tmu_flush(c, &instr->def, component_mask); } } @@ -673,7 +726,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, } static struct qreg * -ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def) +ntq_init_ssa_def(struct v3d_compile *c, nir_def *def) { struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, def->num_components); @@ -717,8 +770,8 @@ is_ldunif_signal(const struct v3d_qpu_sig *sig) * its destination to be the NIR reg's destination */ void -ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan, - struct qreg result) +ntq_store_def(struct v3d_compile *c, nir_def *def, int chan, + struct qreg result) { struct qinst *last_inst = NULL; if (!list_is_empty(&c->cur_block->instructions)) @@ -731,23 +784,25 @@ 
ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan, assert(result.file == QFILE_TEMP && last_inst && (last_inst == c->defs[result.index] || is_reused_uniform)); - if (dest->is_ssa) { - assert(chan < dest->ssa.num_components); + nir_intrinsic_instr *store = nir_store_reg_for_def(def); + if (store == NULL) { + assert(chan < def->num_components); struct qreg *qregs; struct hash_entry *entry = - _mesa_hash_table_search(c->def_ht, &dest->ssa); + _mesa_hash_table_search(c->def_ht, def); if (entry) qregs = entry->data; else - qregs = ntq_init_ssa_def(c, &dest->ssa); + qregs = ntq_init_ssa_def(c, def); qregs[chan] = result; } else { - nir_register *reg = dest->reg.reg; - assert(dest->reg.base_offset == 0); - assert(reg->num_array_elems == 0); + nir_def *reg = store->src[1].ssa; + ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg); + assert(nir_intrinsic_base(store) == 0); + assert(nir_intrinsic_num_array_elems(decl) == 0); struct hash_entry *entry = _mesa_hash_table_search(c->def_ht, reg); struct qreg *qregs = entry->data; @@ -802,7 +857,9 @@ struct qreg ntq_get_src(struct v3d_compile *c, nir_src src, int i) { struct hash_entry *entry; - if (src.is_ssa) { + + nir_intrinsic_instr *load = nir_load_reg_for_def(src.ssa); + if (load == NULL) { assert(i < src.ssa->num_components); entry = _mesa_hash_table_search(c->def_ht, src.ssa); @@ -811,10 +868,11 @@ ntq_get_src(struct v3d_compile *c, nir_src src, int i) entry = _mesa_hash_table_search(c->def_ht, src.ssa); } } else { - nir_register *reg = src.reg.reg; - assert(reg->num_array_elems == 0); - assert(src.reg.base_offset == 0); - assert(i < reg->num_components); + nir_def *reg = load->src[0].ssa; + ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg); + assert(nir_intrinsic_base(load) == 0); + assert(nir_intrinsic_num_array_elems(decl) == 0); + assert(i < nir_intrinsic_num_components(decl)); if (_mesa_set_search(c->tmu.outstanding_regs, reg)) ntq_flush_tmu(c); @@ -830,13 +888,8 @@ static struct qreg ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr, unsigned src) { - assert(util_is_power_of_two_or_zero(instr->dest.write_mask)); - unsigned chan = ffs(instr->dest.write_mask) - 1; struct qreg r = ntq_get_src(c, instr->src[src].src, - instr->src[src].swizzle[chan]); - - assert(!instr->src[src].abs); - assert(!instr->src[src].negate); + instr->src[src].swizzle[0]); return r; }; @@ -876,6 +929,7 @@ ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr) case GLSL_SAMPLER_DIM_3D: case GLSL_SAMPLER_DIM_CUBE: case GLSL_SAMPLER_DIM_BUF: + case GLSL_SAMPLER_DIM_EXTERNAL: /* Don't minify the array size. 
*/ if (!(instr->is_array && i == dest_size - 1)) { size = ntq_minify(c, size, lod); @@ -890,7 +944,7 @@ ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr) unreachable("Bad sampler type"); } - ntq_store_dest(c, &instr->dest, i, size); + ntq_store_def(c, &instr->def, i, size); } } @@ -905,12 +959,12 @@ ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) */ switch (instr->op) { case nir_texop_query_levels: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit)); return; case nir_texop_texture_samples: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_TEXTURE_SAMPLES, unit)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_TEXTURE_SAMPLES, unit)); return; case nir_texop_txs: ntq_emit_txs(c, instr); @@ -919,10 +973,7 @@ ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) break; } - if (c->devinfo->ver >= 40) - v3d40_vir_emit_tex(c, instr); - else - v3d33_vir_emit_tex(c, instr); + v3d_vir_emit_tex(c, instr); } static struct qreg @@ -963,44 +1014,43 @@ emit_fragcoord_input(struct v3d_compile *c, int attr) static struct qreg emit_smooth_varying(struct v3d_compile *c, - struct qreg vary, struct qreg w, struct qreg r5) + struct qreg vary, struct qreg w, struct qreg c_reg) { - return vir_FADD(c, vir_FMUL(c, vary, w), r5); + return vir_FADD(c, vir_FMUL(c, vary, w), c_reg); } static struct qreg emit_noperspective_varying(struct v3d_compile *c, - struct qreg vary, struct qreg r5) + struct qreg vary, struct qreg c_reg) { - return vir_FADD(c, vir_MOV(c, vary), r5); + return vir_FADD(c, vir_MOV(c, vary), c_reg); } static struct qreg emit_flat_varying(struct v3d_compile *c, - struct qreg vary, struct qreg r5) + struct qreg vary, struct qreg c_reg) { vir_MOV_dest(c, c->undef, vary); - return vir_MOV(c, r5); + return vir_MOV(c, c_reg); } static struct qreg emit_fragment_varying(struct v3d_compile *c, nir_variable *var, int8_t input_idx, uint8_t swizzle, int array_index) { - struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3); - struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5); + struct qreg c_reg; /* C coefficient */ + + if (c->devinfo->has_accumulators) + c_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5); + else + c_reg = vir_reg(QFILE_REG, 0); struct qinst *ldvary = NULL; struct qreg vary; - if (c->devinfo->ver >= 41) { - ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef, - c->undef, c->undef); - ldvary->qpu.sig.ldvary = true; - vary = vir_emit_def(c, ldvary); - } else { - vir_NOP(c)->qpu.sig.ldvary = true; - vary = r3; - } + ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef, + c->undef, c->undef); + ldvary->qpu.sig.ldvary = true; + vary = vir_emit_def(c, ldvary); /* Store the input value before interpolation so we can implement * GLSL's interpolateAt functions if the shader uses them. 
@@ -1008,7 +1058,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, if (input_idx >= 0) { assert(var); c->interp[input_idx].vp = vary; - c->interp[input_idx].C = vir_MOV(c, r5); + c->interp[input_idx].C = vir_MOV(c, c_reg); c->interp[input_idx].mode = var->data.interpolation; } @@ -1018,7 +1068,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, */ if (!var) { assert(input_idx < 0); - return emit_smooth_varying(c, vary, c->payload_w, r5); + return emit_smooth_varying(c, vary, c->payload_w, c_reg); } int i = c->num_inputs++; @@ -1033,20 +1083,20 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, if (var->data.centroid) { BITSET_SET(c->centroid_flags, i); result = emit_smooth_varying(c, vary, - c->payload_w_centroid, r5); + c->payload_w_centroid, c_reg); } else { - result = emit_smooth_varying(c, vary, c->payload_w, r5); + result = emit_smooth_varying(c, vary, c->payload_w, c_reg); } break; case INTERP_MODE_NOPERSPECTIVE: BITSET_SET(c->noperspective_flags, i); - result = emit_noperspective_varying(c, vary, r5); + result = emit_noperspective_varying(c, vary, c_reg); break; case INTERP_MODE_FLAT: BITSET_SET(c->flat_shade_flags, i); - result = emit_flat_varying(c, vary, r5); + result = emit_flat_varying(c, vary, c_reg); break; default: @@ -1163,16 +1213,6 @@ ntq_emit_comparison(struct v3d_compile *c, vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC); break; - case nir_op_i2b32: - vir_set_pf(c, vir_MOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ); - cond_invert = true; - break; - - case nir_op_f2b32: - vir_set_pf(c, vir_FMOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ); - cond_invert = true; - break; - default: return false; } @@ -1188,7 +1228,7 @@ ntq_emit_comparison(struct v3d_compile *c, static struct nir_alu_instr * ntq_get_alu_parent(nir_src src) { - if (!src.is_ssa || src.ssa->parent_instr->type != nir_instr_type_alu) + if (src.ssa->parent_instr->type != nir_instr_type_alu) return NULL; nir_alu_instr *instr = nir_instr_as_alu(src.ssa->parent_instr); if (!instr) @@ -1199,7 +1239,7 @@ ntq_get_alu_parent(nir_src src) * src. */ for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { - if (!instr->src[i].src.is_ssa) + if (nir_load_reg_for_def(instr->src[i].src.ssa)) return NULL; } @@ -1242,12 +1282,78 @@ ntq_emit_cond_to_bool(struct v3d_compile *c, enum v3d_qpu_cond cond) return result; } +static struct qreg +ntq_emit_cond_to_int(struct v3d_compile *c, enum v3d_qpu_cond cond) +{ + struct qreg result = + vir_MOV(c, vir_SEL(c, cond, + vir_uniform_ui(c, 1), + vir_uniform_ui(c, 0))); + c->flags_temp = result.index; + c->flags_cond = cond; + return result; +} + +static struct qreg +f2f16_rtz(struct v3d_compile *c, struct qreg f32) +{ + /* The GPU doesn't provide a mechanism to modify the f32->f16 rounding + * method and seems to be using RTE by default, so we need to implement + * RTZ rounding in software. 
+ */ + struct qreg rf16 = vir_FMOV(c, f32); + vir_set_pack(c->defs[rf16.index], V3D_QPU_PACK_L); + + struct qreg rf32 = vir_FMOV(c, rf16); + vir_set_unpack(c->defs[rf32.index], 0, V3D_QPU_UNPACK_L); + + struct qreg f32_abs = vir_FMOV(c, f32); + vir_set_unpack(c->defs[f32_abs.index], 0, V3D_QPU_UNPACK_ABS); + + struct qreg rf32_abs = vir_FMOV(c, rf32); + vir_set_unpack(c->defs[rf32_abs.index], 0, V3D_QPU_UNPACK_ABS); + + vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), f32_abs, rf32_abs), + V3D_QPU_PF_PUSHN); + return vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA, + vir_SUB(c, rf16, vir_uniform_ui(c, 1)), rf16)); +} + +/** + * Takes the result value of a signed integer width conversion from a smaller + * type to a larger type and if needed, it applies sign extension to it. + */ +static struct qreg +sign_extend(struct v3d_compile *c, + struct qreg value, + uint32_t src_bit_size, + uint32_t dst_bit_size) +{ + assert(src_bit_size < dst_bit_size); + + struct qreg tmp = vir_MOV(c, value); + + /* Do we need to sign-extend? */ + uint32_t sign_mask = 1 << (src_bit_size - 1); + struct qinst *sign_check = + vir_AND_dest(c, vir_nop_reg(), + tmp, vir_uniform_ui(c, sign_mask)); + vir_set_pf(c, sign_check, V3D_QPU_PF_PUSHZ); + + /* If so, fill in leading sign bits */ + uint32_t extend_bits = ~(((1 << src_bit_size) - 1)) & + ((1ull << dst_bit_size) - 1); + struct qinst *extend_inst = + vir_OR_dest(c, tmp, tmp, + vir_uniform_ui(c, extend_bits)); + vir_set_cond(extend_inst, V3D_QPU_COND_IFNA); + + return tmp; +} + static void ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) { - /* This should always be lowered to ALU operations for V3D. */ - assert(!instr->dest.saturate); - /* Vectors are special in that they have non-scalarized writemasks, * and just take the first swizzle channel for each argument in order * into each writemask channel. @@ -1260,8 +1366,8 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) srcs[i] = ntq_get_src(c, instr->src[i].src, instr->src[i].swizzle[0]); for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) - ntq_store_dest(c, &instr->dest.dest, i, - vir_MOV(c, srcs[i])); + ntq_store_def(c, &instr->def, i, + vir_MOV(c, srcs[i])); return; } @@ -1327,6 +1433,94 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) result = vir_AND(c, src[0], vir_uniform_ui(c, 1)); break; + case nir_op_f2f16: + case nir_op_f2f16_rtne: + assert(nir_src_bit_size(instr->src[0].src) == 32); + result = vir_FMOV(c, src[0]); + vir_set_pack(c->defs[result.index], V3D_QPU_PACK_L); + break; + + case nir_op_f2f16_rtz: + assert(nir_src_bit_size(instr->src[0].src) == 32); + result = f2f16_rtz(c, src[0]); + break; + + case nir_op_f2f32: + assert(nir_src_bit_size(instr->src[0].src) == 16); + result = vir_FMOV(c, src[0]); + vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L); + break; + + case nir_op_i2i16: { + uint32_t bit_size = nir_src_bit_size(instr->src[0].src); + assert(bit_size == 32 || bit_size == 8); + if (bit_size == 32) { + /* We don't have integer pack/unpack methods for + * converting between 16-bit and 32-bit, so we implement + * the conversion manually by truncating the src. 
+ */ + result = vir_AND(c, src[0], vir_uniform_ui(c, 0xffff)); + } else { + struct qreg tmp = vir_AND(c, src[0], + vir_uniform_ui(c, 0xff)); + result = vir_MOV(c, sign_extend(c, tmp, bit_size, 16)); + } + break; + } + + case nir_op_u2u16: { + uint32_t bit_size = nir_src_bit_size(instr->src[0].src); + assert(bit_size == 32 || bit_size == 8); + + /* We don't have integer pack/unpack methods for converting + * between 16-bit and 32-bit, so we implement the conversion + * manually by truncating the src. For the 8-bit case, we + * want to make sure we don't copy garbage from any of the + * 24 MSB bits. + */ + if (bit_size == 32) + result = vir_AND(c, src[0], vir_uniform_ui(c, 0xffff)); + else + result = vir_AND(c, src[0], vir_uniform_ui(c, 0xff)); + break; + } + + case nir_op_i2i8: + case nir_op_u2u8: + assert(nir_src_bit_size(instr->src[0].src) == 32 || + nir_src_bit_size(instr->src[0].src) == 16); + /* We don't have integer pack/unpack methods for converting + * between 8-bit and 32-bit, so we implement the conversion + * manually by truncating the src. + */ + result = vir_AND(c, src[0], vir_uniform_ui(c, 0xff)); + break; + + case nir_op_u2u32: { + uint32_t bit_size = nir_src_bit_size(instr->src[0].src); + assert(bit_size == 16 || bit_size == 8); + + /* we don't have a native 8-bit/16-bit MOV so we copy all 32-bit + * from the src but we make sure to clear any garbage bits that + * may be present in the invalid src bits. + */ + uint32_t mask = (1 << bit_size) - 1; + result = vir_AND(c, src[0], vir_uniform_ui(c, mask)); + break; + } + + case nir_op_i2i32: { + uint32_t bit_size = nir_src_bit_size(instr->src[0].src); + assert(bit_size == 16 || bit_size == 8); + + uint32_t mask = (1 << bit_size) - 1; + struct qreg tmp = vir_AND(c, src[0], + vir_uniform_ui(c, mask)); + + result = vir_MOV(c, sign_extend(c, tmp, bit_size, 32)); + break; + } + case nir_op_iadd: result = vir_ADD(c, src[0], src[1]); break; @@ -1390,8 +1584,6 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) break; } - case nir_op_i2b32: - case nir_op_f2b32: case nir_op_feq32: case nir_op_fneu32: case nir_op_fge32: @@ -1485,13 +1677,35 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) case nir_op_uadd_carry: vir_set_pf(c, vir_ADD_dest(c, vir_nop_reg(), src[0], src[1]), V3D_QPU_PF_PUSHC); - result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA); + result = ntq_emit_cond_to_int(c, V3D_QPU_COND_IFA); + break; + + case nir_op_usub_borrow: + vir_set_pf(c, vir_SUB_dest(c, vir_nop_reg(), src[0], src[1]), + V3D_QPU_PF_PUSHC); + result = ntq_emit_cond_to_int(c, V3D_QPU_COND_IFA); break; case nir_op_pack_half_2x16_split: result = vir_VFPACK(c, src[0], src[1]); break; + case nir_op_pack_2x32_to_2x16_v3d: + result = vir_VPACK(c, src[0], src[1]); + break; + + case nir_op_pack_32_to_r11g11b10_v3d: + result = vir_V11FPACK(c, src[0], src[1]); + break; + + case nir_op_pack_uint_32_to_r10g10b10a2_v3d: + result = vir_V10PACK(c, src[0], src[1]); + break; + + case nir_op_pack_4x16_to_4x8_v3d: + result = vir_V8PACK(c, src[0], src[1]); + break; + case nir_op_unpack_half_2x16_split_x: result = vir_FMOV(c, src[0]); vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L); @@ -1502,26 +1716,29 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_H); break; - case nir_op_fquantize2f16: { - /* F32 -> F16 -> F32 conversion */ - struct qreg tmp = vir_FMOV(c, src[0]); - vir_set_pack(c->defs[tmp.index], V3D_QPU_PACK_L); - tmp = vir_FMOV(c, tmp); - vir_set_unpack(c->defs[tmp.index], 0, 
V3D_QPU_UNPACK_L); + case nir_op_pack_2x16_to_unorm_2x8_v3d: + result = vir_VFTOUNORM8(c, src[0]); + break; - /* Check for denorm */ - struct qreg abs_src = vir_FMOV(c, src[0]); - vir_set_unpack(c->defs[abs_src.index], 0, V3D_QPU_UNPACK_ABS); - struct qreg threshold = vir_uniform_f(c, ldexpf(1.0, -14)); - vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), abs_src, threshold), - V3D_QPU_PF_PUSHC); + case nir_op_pack_2x16_to_snorm_2x8_v3d: + result = vir_VFTOSNORM8(c, src[0]); + break; - /* Return +/-0 for denorms */ - struct qreg zero = - vir_AND(c, src[0], vir_uniform_ui(c, 0x80000000)); - result = vir_FMOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, tmp, zero)); + case nir_op_pack_2x16_to_unorm_2x10_v3d: + result = vir_VFTOUNORM10LO(c, src[0]); + break; + + case nir_op_pack_2x16_to_unorm_10_2_v3d: + result = vir_VFTOUNORM10HI(c, src[0]); + break; + + case nir_op_f2unorm_16_v3d: + result = vir_FTOUNORM16(c, src[0]); + break; + + case nir_op_f2snorm_16_v3d: + result = vir_FTOSNORM16(c, src[0]); break; - } default: fprintf(stderr, "unknown NIR ALU inst: "); @@ -1530,17 +1747,12 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) abort(); } - /* We have a scalar result, so the instruction should only have a - * single channel written to. - */ - assert(util_is_power_of_two_or_zero(instr->dest.write_mask)); - ntq_store_dest(c, &instr->dest.dest, - ffs(instr->dest.write_mask) - 1, result); + ntq_store_def(c, &instr->def, 0, result); } /* Each TLB read/write setup (a render target or depth buffer) takes an 8-bit * specifier. They come from a register that's preloaded with 0xffffffff - * (0xff gets you normal vec4 f16 RT0 writes), and when one is neaded the low + * (0xff gets you normal vec4 f16 RT0 writes), and when one is needed the low * 8 bits are shifted off the bottom and 0xff shifted in from the top. */ #define TLB_TYPE_F16_COLOR (3 << 6) @@ -1670,15 +1882,6 @@ vir_emit_tlb_color_write(struct v3d_compile *c, unsigned rt) static void emit_frag_end(struct v3d_compile *c) { - /* If the shader has no non-TLB side effects and doesn't write Z - * we can promote it to enabling early_fragment_tests even - * if the user didn't. - */ - if (c->output_position_index == -1 && - !(c->s->info.num_images || c->s->info.num_ssbos)) { - c->s->info.fs.early_fragment_tests = true; - } - if (c->output_sample_mask_index != -1) { vir_SETMSF_dest(c, vir_nop_reg(), vir_AND(c, @@ -1703,55 +1906,75 @@ emit_frag_end(struct v3d_compile *c) } struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU); - if (c->output_position_index != -1 && - !c->s->info.fs.early_fragment_tests) { - struct qinst *inst = vir_MOV_dest(c, tlbu_reg, - c->outputs[c->output_position_index]); - uint8_t tlb_specifier = TLB_TYPE_DEPTH; - if (c->devinfo->ver >= 42) { - tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL | - TLB_SAMPLE_MODE_PER_PIXEL); - } else - tlb_specifier |= TLB_DEPTH_TYPE_PER_PIXEL; + /* If the shader has no non-TLB side effects and doesn't write Z + * we can promote it to enabling early_fragment_tests even + * if the user didn't. 
+ */ + if (c->output_position_index == -1 && + !(c->s->info.num_images || c->s->info.num_ssbos) && + !c->s->info.fs.uses_discard && + !c->s->info.fs.uses_demote && + !c->fs_key->sample_alpha_to_coverage && + c->output_sample_mask_index == -1 && + has_any_tlb_color_write) { + c->s->info.fs.early_fragment_tests = true; + } - inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, - tlb_specifier | - 0xffffff00); + /* By default, Z buffer writes are implicit using the Z values produced + * from FEP (Z value produced from rasterization). When this is not + * desirable (shader writes Z explicitly, has discards, etc) we need + * to let the hardware know by setting c->writes_z to true, in which + * case we always need to write a Z value from the QPU, even if it is + * just the passthrough Z value produced from FEP. + * + * Also, from the V3D 4.2 spec: + * + * "If a shader performs a Z read the “Fragment shader does Z writes” + * bit in the shader record must be enabled to ensure deterministic + * results" + * + * So if c->reads_z is set we always need to write Z, even if it is + * a passthrough from the Z value produced from FEP. + */ + if (!c->s->info.fs.early_fragment_tests || c->reads_z) { c->writes_z = true; - } else if (c->s->info.fs.uses_discard || - !c->s->info.fs.early_fragment_tests || - c->fs_key->sample_alpha_to_coverage || - !has_any_tlb_color_write) { - /* Emit passthrough Z if it needed to be delayed until shader - * end due to potential discards. - * - * Since (single-threaded) fragment shaders always need a TLB - * write, emit passthrouh Z if we didn't have any color - * buffers and flag us as potentially discarding, so that we - * can use Z as the TLB write. - */ - c->s->info.fs.uses_discard = true; - - struct qinst *inst = vir_MOV_dest(c, tlbu_reg, - vir_nop_reg()); uint8_t tlb_specifier = TLB_TYPE_DEPTH; + struct qinst *inst; + + if (c->output_position_index != -1) { + /* Shader writes to gl_FragDepth, use that */ + inst = vir_MOV_dest(c, tlbu_reg, + c->outputs[c->output_position_index]); + + tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL | + TLB_SAMPLE_MODE_PER_PIXEL); + } else { + /* Shader doesn't write to gl_FragDepth, take Z from + * FEP. + */ + c->writes_z_from_fep = true; + inst = vir_MOV_dest(c, tlbu_reg, vir_nop_reg()); - if (c->devinfo->ver >= 42) { /* The spec says the PER_PIXEL flag is ignored for * invariant writes, but the simulator demands it. */ tlb_specifier |= (TLB_V42_DEPTH_TYPE_INVARIANT | TLB_SAMPLE_MODE_PER_PIXEL); - } else { - tlb_specifier |= TLB_DEPTH_TYPE_INVARIANT; + + /* Since (single-threaded) fragment shaders always need + * a TLB write, if we dond't have any we emit a + * passthrouh Z and flag us as potentially discarding, + * so that we can use Z as the required TLB write. 
+ */ + if (!has_any_tlb_color_write) + c->s->info.fs.uses_discard = true; } - inst->uniform = vir_get_uniform_index(c, - QUNIFORM_CONSTANT, + inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, tlb_specifier | 0xffffff00); - c->writes_z = true; + inst->is_tlb_z_write = true; } /* XXX: Performance improvement: Merge Z write and color writes TLB @@ -1767,7 +1990,6 @@ vir_VPM_WRITE_indirect(struct v3d_compile *c, struct qreg vpm_index, bool uniform_vpm_index) { - assert(c->devinfo->ver >= 40); if (uniform_vpm_index) vir_STVPMV(c, vpm_index, val); else @@ -1777,13 +1999,8 @@ vir_VPM_WRITE_indirect(struct v3d_compile *c, static void vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index) { - if (c->devinfo->ver >= 40) { - vir_VPM_WRITE_indirect(c, val, - vir_uniform_ui(c, vpm_index), true); - } else { - /* XXX: v3d33_vir_vpm_write_setup(c); */ - vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val); - } + vir_VPM_WRITE_indirect(c, val, + vir_uniform_ui(c, vpm_index), true); } static void @@ -1791,7 +2008,7 @@ emit_vert_end(struct v3d_compile *c) { /* GFXH-1684: VPM writes need to be complete by the end of the shader. */ - if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42) + if (c->devinfo->ver == 42) vir_VPMWT(c); } @@ -1800,7 +2017,7 @@ emit_geom_end(struct v3d_compile *c) { /* GFXH-1684: VPM writes need to be complete by the end of the shader. */ - if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42) + if (c->devinfo->ver == 42) vir_VPMWT(c); } @@ -1812,8 +2029,11 @@ mem_vectorize_callback(unsigned align_mul, unsigned align_offset, nir_intrinsic_instr *high, void *data) { - /* Our backend is 32-bit only at present */ - if (bit_size != 32) + /* TMU general access only supports 32-bit vectors */ + if (bit_size > 32) + return false; + + if ((bit_size == 8 || bit_size == 16) && num_components > 1) return false; if (align_mul % 4 != 0 || align_offset % 4 != 0) @@ -1843,7 +2063,29 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s) do { progress = false; - NIR_PASS_V(s, nir_lower_vars_to_ssa); + NIR_PASS(progress, s, nir_split_array_vars, nir_var_function_temp); + NIR_PASS(progress, s, nir_shrink_vec_array_vars, nir_var_function_temp); + NIR_PASS(progress, s, nir_opt_deref); + + NIR_PASS(progress, s, nir_lower_vars_to_ssa); + if (!s->info.var_copies_lowered) { + /* Only run this pass if nir_lower_var_copies was not called + * yet. That would lower away any copy_deref instructions and we + * don't want to introduce any more. 
+ */ + NIR_PASS(progress, s, nir_opt_find_array_copies); + } + + NIR_PASS(progress, s, nir_opt_copy_prop_vars); + NIR_PASS(progress, s, nir_opt_dead_write_vars); + NIR_PASS(progress, s, nir_opt_combine_stores, nir_var_all); + + NIR_PASS(progress, s, nir_remove_dead_variables, + (nir_variable_mode)(nir_var_function_temp | + nir_var_shader_temp | + nir_var_mem_shared), + NULL); + NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL); NIR_PASS(progress, s, nir_lower_phis_to_scalar, false); NIR_PASS(progress, s, nir_copy_prop); @@ -1851,10 +2093,39 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s) NIR_PASS(progress, s, nir_opt_dce); NIR_PASS(progress, s, nir_opt_dead_cf); NIR_PASS(progress, s, nir_opt_cse); - NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true); + NIR_PASS(progress, s, nir_opt_peephole_select, 0, false, false); + NIR_PASS(progress, s, nir_opt_peephole_select, 24, true, true); NIR_PASS(progress, s, nir_opt_algebraic); NIR_PASS(progress, s, nir_opt_constant_folding); + NIR_PASS(progress, s, nir_opt_intrinsics); + NIR_PASS(progress, s, nir_opt_idiv_const, 32); + NIR_PASS(progress, s, nir_lower_alu); + + if (nir_opt_loop(s)) { + progress = true; + NIR_PASS(progress, s, nir_copy_prop); + NIR_PASS(progress, s, nir_opt_dce); + } + + NIR_PASS(progress, s, nir_opt_conditional_discard); + + NIR_PASS(progress, s, nir_opt_remove_phis); + NIR_PASS(progress, s, nir_opt_if, false); + if (c && !c->disable_gcm) { + bool local_progress = false; + NIR_PASS(local_progress, s, nir_opt_gcm, false); + c->gcm_progress |= local_progress; + progress |= local_progress; + } + + /* Note that vectorization may undo the load/store scalarization + * pass we run for non 32-bit TMU general load/store by + * converting, for example, 2 consecutive 16-bit loads into a + * single 32-bit load. This is fine (and desirable) as long as + * the resulting 32-bit load meets 32-bit alignment requirements, + * which mem_vectorize_callback() should be enforcing. + */ nir_load_store_vectorize_options vectorize_opts = { .modes = nir_var_mem_ssbo | nir_var_mem_ubo | nir_var_mem_push_const | nir_var_mem_shared | @@ -1862,7 +2133,24 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s) .callback = mem_vectorize_callback, .robust_modes = 0, }; - NIR_PASS(progress, s, nir_opt_load_store_vectorize, &vectorize_opts); + bool vectorize_progress = false; + + + /* This requires that we have called + * nir_lower_vars_to_explicit_types / nir_lower_explicit_io + * first, which we may not have done yet if we call here too + * early durign NIR pre-processing. 
We can detect this because + * in that case we won't have a compile object + */ + if (c) { + NIR_PASS(vectorize_progress, s, nir_opt_load_store_vectorize, + &vectorize_opts); + if (vectorize_progress) { + NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL); + NIR_PASS(progress, s, nir_lower_pack); + progress = true; + } + } if (lower_flrp != 0) { bool lower_flrp_progress = false; @@ -1895,10 +2183,8 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s) nir_move_options sink_opts = nir_move_const_undef | nir_move_comparisons | nir_move_copies | - nir_move_load_ubo; + nir_move_load_ubo | nir_move_load_ssbo | nir_move_load_uniform; NIR_PASS(progress, s, nir_opt_sink, sink_opts); - - NIR_PASS(progress, s, nir_opt_move, nir_move_load_ubo); } static int @@ -1915,27 +2201,9 @@ ntq_emit_vpm_read(struct v3d_compile *c, uint32_t *remaining, uint32_t vpm_index) { - struct qreg vpm = vir_reg(QFILE_VPM, vpm_index); - - if (c->devinfo->ver >= 40 ) { - return vir_LDVPMV_IN(c, - vir_uniform_ui(c, - (*num_components_queued)++)); - } - - if (*num_components_queued != 0) { - (*num_components_queued)--; - return vir_MOV(c, vpm); - } - - uint32_t num_components = MIN2(*remaining, 32); - - v3d33_vir_vpm_read_setup(c, num_components); - - *num_components_queued = num_components - 1; - *remaining -= num_components; - - return vir_MOV(c, vpm); + return vir_LDVPMV_IN(c, + vir_uniform_ui(c, + (*num_components_queued)++)); } static void @@ -2005,31 +2273,8 @@ ntq_setup_vs_inputs(struct v3d_compile *c) } /* The actual loads will happen directly in nir_intrinsic_load_input - * on newer versions. */ - if (c->devinfo->ver >= 40) - return; - - for (int loc = 0; loc < ARRAY_SIZE(c->vattr_sizes); loc++) { - resize_qreg_array(c, &c->inputs, &c->inputs_array_size, - (loc + 1) * 4); - - for (int i = 0; i < c->vattr_sizes[loc]; i++) { - c->inputs[loc * 4 + i] = - ntq_emit_vpm_read(c, - &vpm_components_queued, - &num_components, - loc * 4 + i); - - } - } - - if (c->devinfo->ver >= 40) { - assert(vpm_components_queued == num_components); - } else { - assert(vpm_components_queued == 0); - assert(num_components == 0); - } + return; } static bool @@ -2058,14 +2303,14 @@ ntq_setup_gs_inputs(struct v3d_compile *c) */ assert(glsl_type_is_array(var->type)); const struct glsl_type *type = glsl_get_array_element(var->type); - unsigned array_len = MAX2(glsl_get_length(type), 1); + unsigned var_len = glsl_count_vec4_slots(type, false, false); unsigned loc = var->data.driver_location; resize_qreg_array(c, &c->inputs, &c->inputs_array_size, - (loc + array_len) * 4); + (loc + var_len) * 4); if (var->data.compact) { - for (unsigned j = 0; j < array_len; j++) { + for (unsigned j = 0; j < var_len; j++) { unsigned input_idx = c->num_inputs++; unsigned loc_frac = var->data.location_frac + j; unsigned loc = var->data.location + loc_frac / 4; @@ -2076,8 +2321,10 @@ ntq_setup_gs_inputs(struct v3d_compile *c) continue; } - for (unsigned j = 0; j < array_len; j++) { - unsigned num_elements = glsl_get_vector_elements(type); + for (unsigned j = 0; j < var_len; j++) { + unsigned num_elements = + glsl_type_is_struct(glsl_without_array(type)) ? 
+ 4 : glsl_get_vector_elements(type); for (unsigned k = 0; k < num_elements; k++) { unsigned chan = var->data.location_frac + k; unsigned input_idx = c->num_inputs++; @@ -2124,7 +2371,7 @@ ntq_setup_fs_inputs(struct v3d_compile *c) } else if (var->data.compact) { for (int j = 0; j < var_len; j++) emit_compact_fragment_input(c, loc, var, j); - } else if (glsl_type_is_struct(var->type)) { + } else if (glsl_type_is_struct(glsl_without_array(var->type))) { for (int j = 0; j < var_len; j++) { emit_fragment_input(c, loc, var, j, 4); } @@ -2143,12 +2390,9 @@ ntq_setup_outputs(struct v3d_compile *c) return; nir_foreach_shader_out_variable(var, c->s) { - unsigned array_len = MAX2(glsl_get_length(var->type), 1); + assert(glsl_type_is_vector_or_scalar(var->type)); unsigned loc = var->data.driver_location * 4; - assert(array_len == 1); - (void)array_len; - for (int i = 0; i < 4 - var->data.location_frac; i++) { add_output(c, loc + var->data.location_frac + i, var->data.location, @@ -2157,15 +2401,17 @@ ntq_setup_outputs(struct v3d_compile *c) switch (var->data.location) { case FRAG_RESULT_COLOR: - c->output_color_var[0] = var; - c->output_color_var[1] = var; - c->output_color_var[2] = var; - c->output_color_var[3] = var; + for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) + c->output_color_var[i] = var; break; case FRAG_RESULT_DATA0: case FRAG_RESULT_DATA1: case FRAG_RESULT_DATA2: case FRAG_RESULT_DATA3: + case FRAG_RESULT_DATA4: + case FRAG_RESULT_DATA5: + case FRAG_RESULT_DATA6: + case FRAG_RESULT_DATA7: c->output_color_var[var->data.location - FRAG_RESULT_DATA0] = var; break; @@ -2185,17 +2431,19 @@ ntq_setup_outputs(struct v3d_compile *c) * Each nir_register gets a struct qreg per 32-bit component being stored. */ static void -ntq_setup_registers(struct v3d_compile *c, struct exec_list *list) +ntq_setup_registers(struct v3d_compile *c, nir_function_impl *impl) { - foreach_list_typed(nir_register, nir_reg, node, list) { - unsigned array_len = MAX2(nir_reg->num_array_elems, 1); + nir_foreach_reg_decl(decl, impl) { + unsigned num_components = nir_intrinsic_num_components(decl); + unsigned array_len = nir_intrinsic_num_array_elems(decl); + array_len = MAX2(array_len, 1); struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, - array_len * - nir_reg->num_components); + array_len * num_components); + nir_def *nir_reg = &decl->def; _mesa_hash_table_insert(c->def_ht, nir_reg, qregs); - for (int i = 0; i < array_len * nir_reg->num_components; i++) + for (int i = 0; i < array_len * num_components; i++) qregs[i] = vir_get_temp(c); } } @@ -2222,23 +2470,23 @@ ntq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr) assert(nir_src_as_uint(instr->src[1]) == 0); - ntq_store_dest(c, &instr->dest, 0, + ntq_store_def(c, &instr->def, 0, vir_uniform(c, QUNIFORM_IMAGE_WIDTH, image_index)); if (instr->num_components > 1) { - ntq_store_dest(c, &instr->dest, 1, - vir_uniform(c, - instr->num_components == 2 && is_array ? - QUNIFORM_IMAGE_ARRAY_SIZE : - QUNIFORM_IMAGE_HEIGHT, - image_index)); + ntq_store_def(c, &instr->def, 1, + vir_uniform(c, + instr->num_components == 2 && is_array ? + QUNIFORM_IMAGE_ARRAY_SIZE : + QUNIFORM_IMAGE_HEIGHT, + image_index)); } if (instr->num_components > 2) { - ntq_store_dest(c, &instr->dest, 2, - vir_uniform(c, - is_array ? - QUNIFORM_IMAGE_ARRAY_SIZE : - QUNIFORM_IMAGE_DEPTH, - image_index)); + ntq_store_def(c, &instr->def, 2, + vir_uniform(c, + is_array ? 
+ QUNIFORM_IMAGE_ARRAY_SIZE : + QUNIFORM_IMAGE_DEPTH, + image_index)); } } @@ -2263,16 +2511,14 @@ vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr) * * To fix that, we make sure we always emit a thread switch before the * first tlb color read. If that happens to be the last thread switch - * we emit, then everything is fine, but otherwsie, if any code after + * we emit, then everything is fine, but otherwise, if any code after * this point needs to emit additional thread switches, then we will * switch the strategy to locking the scoreboard on the first thread * switch instead -- see vir_emit_thrsw(). */ if (!c->emitted_tlb_load) { - if (!c->last_thrsw_at_top_level) { - assert(c->devinfo->ver >= 41); + if (!c->last_thrsw_at_top_level) vir_emit_thrsw(c); - } c->emitted_tlb_load = true; } @@ -2371,27 +2617,96 @@ vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr) } assert(color_reads_for_sample[component].file != QFILE_NULL); - ntq_store_dest(c, &instr->dest, 0, - vir_MOV(c, color_reads_for_sample[component])); + ntq_store_def(c, &instr->def, 0, + vir_MOV(c, color_reads_for_sample[component])); +} + +static bool +ntq_emit_load_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr); + +static bool +try_emit_uniform(struct v3d_compile *c, + int offset, + int num_components, + nir_def *def, + enum quniform_contents contents) +{ + /* Even though ldunif is strictly 32-bit we can still use it + * to load scalar 8-bit/16-bit uniforms so long as their offset + * is 32-bit aligned. In this case, ldunif would still load + * 32-bit into the destination with the 8-bit/16-bit uniform + * data in the LSB and garbage in the MSB, but that is fine + * because we should only be accessing the valid bits of the + * destination. + * + * FIXME: if in the future we improve our register allocator to + * pack 2 16-bit variables in the MSB and LSB of the same + * register then this optimization would not be valid as is, + * since the load clobbers the MSB. + */ + if (offset % 4 != 0) + return false; + + /* We need dwords */ + offset = offset / 4; + + for (int i = 0; i < num_components; i++) { + ntq_store_def(c, def, i, vir_uniform(c, contents, offset + i)); + } + + return true; } static void ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr) { + /* We scalarize general TMU access for anything that is not 32-bit. 
*/ + assert(instr->def.bit_size == 32 || + instr->num_components == 1); + + /* Try to emit ldunif if possible, otherwise fallback to general TMU */ if (nir_src_is_const(instr->src[0])) { int offset = (nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0])); - assert(offset % 4 == 0); - /* We need dwords */ - offset = offset / 4; - for (int i = 0; i < instr->num_components; i++) { - ntq_store_dest(c, &instr->dest, i, - vir_uniform(c, QUNIFORM_UNIFORM, - offset + i)); + + if (try_emit_uniform(c, offset, instr->num_components, + &instr->def, QUNIFORM_UNIFORM)) { + return; + } + } + + if (!ntq_emit_load_unifa(c, instr)) { + ntq_emit_tmu_general(c, instr, false, false); + c->has_general_tmu_load = true; + } +} + +static bool +ntq_emit_inline_ubo_load(struct v3d_compile *c, nir_intrinsic_instr *instr) +{ + if (c->compiler->max_inline_uniform_buffers <= 0) + return false; + + /* Regular UBOs start after inline UBOs */ + uint32_t index = nir_src_as_uint(instr->src[0]); + if (index >= c->compiler->max_inline_uniform_buffers) + return false; + + /* We scalarize general TMU access for anything that is not 32-bit */ + assert(instr->def.bit_size == 32 || + instr->num_components == 1); + + if (nir_src_is_const(instr->src[1])) { + int offset = nir_src_as_uint(instr->src[1]); + if (try_emit_uniform(c, offset, instr->num_components, + &instr->def, + QUNIFORM_INLINE_UBO_0 + index)) { + return true; } - } else { - ntq_emit_tmu_general(c, instr, false); } + + /* Fallback to regular UBO load */ + return false; } static void @@ -2411,7 +2726,7 @@ ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr) unsigned offset = nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0]); - if (c->s->info.stage != MESA_SHADER_FRAGMENT && c->devinfo->ver >= 40) { + if (c->s->info.stage != MESA_SHADER_FRAGMENT) { /* Emit the LDVPM directly now, rather than at the top * of the shader like we did for V3D 3.x (which needs * vpmsetup when not just taking the next offset). @@ -2433,19 +2748,38 @@ ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr) SYSTEM_VALUE_VERTEX_ID)) { index++; } - for (int i = 0; i < offset; i++) - index += c->vattr_sizes[i]; + + for (int i = 0; i < offset; i++) { + /* GFXH-1602: if any builtins (vid, iid, etc) are read then + * attribute 0 must be active (size > 0). When we hit this, + * the driver is expected to program attribute 0 to have a + * size of 1, so here we need to add that. 
+ */ + if (i == 0 && c->vs_key->is_coord && + c->vattr_sizes[i] == 0 && index > 0) { + index++; + } else { + index += c->vattr_sizes[i]; + } + } + index += nir_intrinsic_component(instr); for (int i = 0; i < instr->num_components; i++) { struct qreg vpm_offset = vir_uniform_ui(c, index++); - ntq_store_dest(c, &instr->dest, i, - vir_LDVPMV_IN(c, vpm_offset)); + ntq_store_def(c, &instr->def, i, + vir_LDVPMV_IN(c, vpm_offset)); } } else { for (int i = 0; i < instr->num_components; i++) { int comp = nir_intrinsic_component(instr) + i; - ntq_store_dest(c, &instr->dest, i, - vir_MOV(c, c->inputs[offset * 4 + comp])); + struct qreg input = c->inputs[offset * 4 + comp]; + ntq_store_def(c, &instr->def, i, vir_MOV(c, input)); + + if (c->s->info.stage == MESA_SHADER_FRAGMENT && + input.file == c->payload_z.file && + input.index == c->payload_z.index) { + c->reads_z = true; + } } } } @@ -2610,18 +2944,18 @@ ntq_get_barycentric_centroid(struct v3d_compile *c, /* sN = TRUE if sample N enabled in sample mask, FALSE otherwise */ struct qreg F = vir_uniform_ui(c, 0); struct qreg T = vir_uniform_ui(c, ~0); - struct qreg s0 = vir_XOR(c, vir_AND(c, sample_mask, i1), i1); + struct qreg s0 = vir_AND(c, sample_mask, i1); vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ); - s0 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); - struct qreg s1 = vir_XOR(c, vir_AND(c, sample_mask, i2), i2); + s0 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F); + struct qreg s1 = vir_AND(c, sample_mask, i2); vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ); - s1 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); - struct qreg s2 = vir_XOR(c, vir_AND(c, sample_mask, i4), i4); + s1 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F); + struct qreg s2 = vir_AND(c, sample_mask, i4); vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ); - s2 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); - struct qreg s3 = vir_XOR(c, vir_AND(c, sample_mask, i8), i8); + s2 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F); + struct qreg s3 = vir_AND(c, sample_mask, i8); vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s3), V3D_QPU_PF_PUSHZ); - s3 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); + s3 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F); /* sample_idx = s0 ? 0 : s2 ? 2 : s1 ? 1 : 3 */ struct qreg sample_idx = i3; @@ -2708,28 +3042,142 @@ emit_ldunifa(struct v3d_compile *c, struct qreg *result) c->current_unifa_offset += 4; } -static void -ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr) +/* Checks if the value of a nir src is derived from a nir register */ +static bool +nir_src_derived_from_reg(nir_src src) +{ + nir_def *def = src.ssa; + if (nir_load_reg_for_def(def)) + return true; + + nir_instr *parent = def->parent_instr; + switch (parent->type) { + case nir_instr_type_alu: { + nir_alu_instr *alu = nir_instr_as_alu(parent); + int num_srcs = nir_op_infos[alu->op].num_inputs; + for (int i = 0; i < num_srcs; i++) { + if (nir_src_derived_from_reg(alu->src[i].src)) + return true; + } + return false; + } + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent); + int num_srcs = nir_intrinsic_infos[intr->intrinsic].num_srcs; + for (int i = 0; i < num_srcs; i++) { + if (nir_src_derived_from_reg(intr->src[i])) + return true; + } + return false; + } + case nir_instr_type_load_const: + case nir_instr_type_undef: + return false; + default: + /* By default we assume it may come from a register, the above + * cases should be able to handle the majority of situations + * though. 
+ */ + return true; + }; +} + +static bool +ntq_emit_load_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr) { + assert(instr->intrinsic == nir_intrinsic_load_ubo || + instr->intrinsic == nir_intrinsic_load_ssbo || + instr->intrinsic == nir_intrinsic_load_uniform); + + bool is_uniform = instr->intrinsic == nir_intrinsic_load_uniform; + bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo; + bool is_ssbo = instr->intrinsic == nir_intrinsic_load_ssbo; + /* Every ldunifa auto-increments the unifa address by 4 bytes, so our * current unifa offset is 4 bytes ahead of the offset of the last load. */ static const int32_t max_unifa_skip_dist = MAX_UNIFA_SKIP_DISTANCE - 4; - bool dynamic_src = !nir_src_is_const(instr->src[1]); - uint32_t const_offset = - dynamic_src ? 0 : nir_src_as_uint(instr->src[1]); + /* We can only use unifa if the offset is uniform */ + nir_src offset = is_uniform ? instr->src[0] : instr->src[1]; + if (nir_src_is_divergent(offset)) + return false; - /* On OpenGL QUNIFORM_UBO_ADDR takes a UBO index - * shifted up by 1 (0 is gallium's constant buffer 0). + /* Emitting loads from unifa may not be safe under non-uniform control + * flow. It seems the address that is used to write to the unifa + * register is taken from the first lane and if that lane is disabled + * by control flow then the value we read may be bogus and lead to + * invalid memory accesses on follow-up ldunifa instructions. However, + * ntq_store_def only emits conditional writes for nir registersas long + * we can be certain that the offset isn't derived from a load_reg we + * should be fine. + * + * The following CTS test can be used to trigger the problem, which + * causes a GMP violations in the sim without this check: + * dEQP-VK.subgroups.ballot_broadcast.graphics.subgroupbroadcastfirst_int */ - uint32_t index = nir_src_as_uint(instr->src[0]); - if (c->key->environment == V3D_ENVIRONMENT_OPENGL) + if (vir_in_nonuniform_control_flow(c) && + nir_src_derived_from_reg(offset)) { + return false; + } + + /* We can only use unifa with SSBOs if they are read-only. Otherwise + * ldunifa won't see the shader writes to that address (possibly + * because ldunifa doesn't read from the L2T cache). + */ + if (is_ssbo && !(nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE)) + return false; + + /* Just as with SSBOs, we can't use ldunifa to read indirect uniforms + * that we may have been written to scratch using the TMU. + */ + bool dynamic_src = !nir_src_is_const(offset); + if (is_uniform && dynamic_src && c->s->scratch_size > 0) + return false; + + uint32_t const_offset = dynamic_src ? 0 : nir_src_as_uint(offset); + if (is_uniform) + const_offset += nir_intrinsic_base(instr); + + /* ldunifa is a 32-bit load instruction so we can only use it with + * 32-bit aligned addresses. We always produce 32-bit aligned addresses + * except for types smaller than 32-bit, so in these cases we can only + * use ldunifa if we can verify alignment, which we can only do for + * loads with a constant offset. + */ + uint32_t bit_size = instr->def.bit_size; + uint32_t value_skips = 0; + if (bit_size < 32) { + if (dynamic_src) { + return false; + } else if (const_offset % 4 != 0) { + /* If we are loading from an unaligned offset, fix + * alignment and skip over unused elements in result. 
+ */ + value_skips = (const_offset % 4) / (bit_size / 8); + const_offset &= ~0x3; + } + } + + assert((bit_size == 32 && value_skips == 0) || + (bit_size == 16 && value_skips <= 1) || + (bit_size == 8 && value_skips <= 3)); + + /* Both Vulkan and OpenGL reserve index 0 for uniforms / push + * constants. + */ + uint32_t index = is_uniform ? 0 : nir_src_as_uint(instr->src[0]); + + /* QUNIFORM_UBO_ADDR takes a UBO index shifted up by 1 since we use + * index 0 for Gallium's constant buffer (GL) or push constants + * (Vulkan). + */ + if (is_ubo) index++; /* We can only keep track of the last unifa address we used with - * constant offset loads. If the new load targets the same UBO and + * constant offset loads. If the new load targets the same buffer and * is close enough to the previous load, we can skip the unifa register * write by emitting dummy ldunifa instructions to update the unifa * address. @@ -2739,6 +3187,7 @@ ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr) if (dynamic_src) { c->current_unifa_block = NULL; } else if (c->cur_block == c->current_unifa_block && + c->current_unifa_is_ubo == !is_ssbo && c->current_unifa_index == index && c->current_unifa_offset <= const_offset && c->current_unifa_offset + max_unifa_skip_dist >= const_offset) { @@ -2746,32 +3195,98 @@ ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr) ldunifa_skips = (const_offset - c->current_unifa_offset) / 4; } else { c->current_unifa_block = c->cur_block; + c->current_unifa_is_ubo = !is_ssbo; c->current_unifa_index = index; c->current_unifa_offset = const_offset; } if (!skip_unifa) { - struct qreg base_offset = + struct qreg base_offset = !is_ssbo ? vir_uniform(c, QUNIFORM_UBO_ADDR, - v3d_unit_data_create(index, const_offset)); + v3d_unit_data_create(index, const_offset)) : + vir_uniform(c, QUNIFORM_SSBO_OFFSET, index); struct qreg unifa = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA); if (!dynamic_src) { - vir_MOV_dest(c, unifa, base_offset); + if (!is_ssbo) { + /* Avoid the extra MOV to UNIFA by making + * ldunif load directly into it. We can't + * do this if we have not actually emitted + * ldunif and are instead reusing a previous + * one. + */ + struct qinst *inst = + (struct qinst *)c->cur_block->instructions.prev; + if (inst == c->defs[base_offset.index]) { + inst->dst = unifa; + c->defs[base_offset.index] = NULL; + } else { + vir_MOV_dest(c, unifa, base_offset); + } + } else { + vir_ADD_dest(c, unifa, base_offset, + vir_uniform_ui(c, const_offset)); + } } else { vir_ADD_dest(c, unifa, base_offset, - ntq_get_src(c, instr->src[1], 0)); + ntq_get_src(c, offset, 0)); } } else { for (int i = 0; i < ldunifa_skips; i++) emit_ldunifa(c, NULL); } - for (uint32_t i = 0; i < nir_intrinsic_dest_components(instr); i++) { + uint32_t num_components = nir_intrinsic_dest_components(instr); + for (uint32_t i = 0; i < num_components; ) { struct qreg data; emit_ldunifa(c, &data); - ntq_store_dest(c, &instr->dest, i, vir_MOV(c, data)); + + if (bit_size == 32) { + assert(value_skips == 0); + ntq_store_def(c, &instr->def, i, vir_MOV(c, data)); + i++; + } else { + assert((bit_size == 16 && value_skips <= 1) || + (bit_size == 8 && value_skips <= 3)); + + /* If we have any values to skip, shift to the first + * valid value in the ldunifa result. + */ + if (value_skips > 0) { + data = vir_SHR(c, data, + vir_uniform_ui(c, bit_size * + value_skips)); + } + + /* Check how many valid components we have discounting + * read components to skip. 
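+ *
+ * For example, a 16-bit load with value_skips = 1 has
+ * valid_count = (32 / 16) - 1 = 1: only the remaining upper
+ * half of this ldunifa word is consumed before we read the
+ * next one.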
+ */ + uint32_t valid_count = (32 / bit_size) - value_skips; + assert((bit_size == 16 && valid_count <= 2) || + (bit_size == 8 && valid_count <= 4)); + assert(valid_count > 0); + + /* Process the valid components */ + do { + struct qreg tmp; + uint32_t mask = (1 << bit_size) - 1; + tmp = vir_AND(c, vir_MOV(c, data), + vir_uniform_ui(c, mask)); + ntq_store_def(c, &instr->def, i, + vir_MOV(c, tmp)); + i++; + valid_count--; + + /* Shift to next component */ + if (i < num_components && valid_count > 0) { + data = vir_SHR(c, data, + vir_uniform_ui(c, bit_size)); + } + } while (i < num_components && valid_count > 0); + } } + + return true; } static inline struct qreg @@ -2781,187 +3296,273 @@ emit_load_local_invocation_index(struct v3d_compile *c) vir_uniform_ui(c, 32 - c->local_invocation_index_bits)); } -/* Various subgroup operations rely on the A flags, so this helper ensures that - * A flags represents currently active lanes in the subgroup. +/* For the purposes of reduction operations (ballot, alleq, allfeq, bcastf) in + * fragment shaders a lane is considered active if any sample flags are set + * for *any* lane in the same quad, however, we still need to ensure that + * terminated lanes (OpTerminate) are not included. Further, we also need to + * disable lanes that may be disabled because of non-uniform control + * flow. */ -static void -set_a_flags_for_subgroup(struct v3d_compile *c) +static enum v3d_qpu_cond +setup_subgroup_control_flow_condition(struct v3d_compile *c) { - /* MSF returns 0 for disabled lanes in compute shaders so - * PUSHZ will set A=1 for disabled lanes. We want the inverse - * of this but we don't have any means to negate the A flags - * directly, but we can do it by repeating the same operation - * with NORZ (A = ~A & ~Z). + assert(c->s->info.stage == MESA_SHADER_FRAGMENT || + c->s->info.stage == MESA_SHADER_COMPUTE); + + enum v3d_qpu_cond cond = V3D_QPU_COND_NONE; + + /* We need to make sure that terminated lanes in fragment shaders are + * not included. We can identify these lanes by comparing the inital + * sample mask with the current. This fixes: + * dEQP-VK.spirv_assembly.instruction.terminate_invocation.terminate.subgroup_* */ - assert(c->s->info.stage == MESA_SHADER_COMPUTE); - vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ); - vir_set_uf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_UF_NORZ); + if (c->s->info.stage == MESA_SHADER_FRAGMENT && c->emitted_discard) { + vir_set_pf(c, vir_AND_dest(c, vir_nop_reg(), c->start_msf, + vir_NOT(c, vir_XOR(c, c->start_msf, + vir_MSF(c)))), + V3D_QPU_PF_PUSHZ); + cond = V3D_QPU_COND_IFNA; + } - /* If we are under non-uniform control flow we also need to - * AND the A flags with the current execute mask. + /* If we are in non-uniform control-flow update the condition to + * also limit lanes to those in the current execution mask. */ if (vir_in_nonuniform_control_flow(c)) { - const uint32_t bidx = c->cur_block->index; - vir_set_uf(c, vir_XOR_dest(c, vir_nop_reg(), - c->execute, - vir_uniform_ui(c, bidx)), - V3D_QPU_UF_ANDZ); + if (cond == V3D_QPU_COND_IFNA) { + vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_UF_NORNZ); + } else { + assert(cond == V3D_QPU_COND_NONE); + vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_PF_PUSHZ); + } + cond = V3D_QPU_COND_IFA; } + + return cond; +} + +static void +emit_compute_barrier(struct v3d_compile *c) +{ + /* Ensure we flag the use of the control barrier. 
NIR's + * gather info pass usually takes care of this, but that + * requires that we call that pass after any other pass + * may emit a control barrier, so this is safer. + */ + c->s->info.uses_control_barrier = true; + + /* Emit a TSY op to get all invocations in the workgroup + * (actually supergroup) to block until the last + * invocation reaches the TSY op. + */ + vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_SYNCB)); +} + +static void +emit_barrier(struct v3d_compile *c) +{ + struct qreg eidx = vir_EIDX(c); + + /* The config for the TSY op should be setup like this: + * - Lane 0: Quorum + * - Lane 2: TSO id + * - Lane 3: TSY opcode + */ + + /* Lane 0: we want to synchronize across one subgroup. Here we write to + * all lanes unconditionally and will overwrite other lanes below. + */ + struct qreg tsy_conf = vir_uniform_ui(c, 1); + + /* Lane 2: TSO id. We choose a general purpose TSO (id=0..64) using the + * curent QPU index and thread index to ensure we get a unique one for + * this group of invocations in this core. + */ + struct qreg tso_id = + vir_AND(c, vir_TIDX(c), vir_uniform_ui(c, 0x0000003f)); + vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), eidx, vir_uniform_ui(c, 2)), + V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFA, tsy_conf, tso_id); + + /* Lane 3: TSY opcode (set_quorum_wait_inc_check) */ + struct qreg tsy_op = vir_uniform_ui(c, 16); + vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), eidx, vir_uniform_ui(c, 3)), + V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFA, tsy_conf, tsy_op); + + /* Emit TSY sync */ + vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_SYNCB), tsy_conf); } static void ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) { switch (instr->intrinsic) { + case nir_intrinsic_decl_reg: + case nir_intrinsic_load_reg: + case nir_intrinsic_store_reg: + break; /* Ignore these */ + case nir_intrinsic_load_uniform: ntq_emit_load_uniform(c, instr); break; + case nir_intrinsic_load_global_2x32: + ntq_emit_tmu_general(c, instr, false, true); + c->has_general_tmu_load = true; + break; + case nir_intrinsic_load_ubo: - if (!nir_src_is_divergent(instr->src[1])) - ntq_emit_load_ubo_unifa(c, instr); - else - ntq_emit_tmu_general(c, instr, false); - break; - - case nir_intrinsic_ssbo_atomic_add: - case nir_intrinsic_ssbo_atomic_imin: - case nir_intrinsic_ssbo_atomic_umin: - case nir_intrinsic_ssbo_atomic_imax: - case nir_intrinsic_ssbo_atomic_umax: - case nir_intrinsic_ssbo_atomic_and: - case nir_intrinsic_ssbo_atomic_or: - case nir_intrinsic_ssbo_atomic_xor: - case nir_intrinsic_ssbo_atomic_exchange: - case nir_intrinsic_ssbo_atomic_comp_swap: + if (ntq_emit_inline_ubo_load(c, instr)) + break; + FALLTHROUGH; case nir_intrinsic_load_ssbo: + if (!ntq_emit_load_unifa(c, instr)) { + ntq_emit_tmu_general(c, instr, false, false); + c->has_general_tmu_load = true; + } + break; + case nir_intrinsic_store_ssbo: - ntq_emit_tmu_general(c, instr, false); - break; - - case nir_intrinsic_shared_atomic_add: - case nir_intrinsic_shared_atomic_imin: - case nir_intrinsic_shared_atomic_umin: - case nir_intrinsic_shared_atomic_imax: - case nir_intrinsic_shared_atomic_umax: - case nir_intrinsic_shared_atomic_and: - case nir_intrinsic_shared_atomic_or: - case nir_intrinsic_shared_atomic_xor: - case nir_intrinsic_shared_atomic_exchange: - case nir_intrinsic_shared_atomic_comp_swap: - case nir_intrinsic_load_shared: + case nir_intrinsic_ssbo_atomic: + case nir_intrinsic_ssbo_atomic_swap: + ntq_emit_tmu_general(c, instr, false, false); + break; + + case 
nir_intrinsic_store_global_2x32: + case nir_intrinsic_global_atomic_2x32: + case nir_intrinsic_global_atomic_swap_2x32: + ntq_emit_tmu_general(c, instr, false, true); + break; + + case nir_intrinsic_shared_atomic: + case nir_intrinsic_shared_atomic_swap: case nir_intrinsic_store_shared: - case nir_intrinsic_load_scratch: case nir_intrinsic_store_scratch: - ntq_emit_tmu_general(c, instr, true); + ntq_emit_tmu_general(c, instr, true, false); + break; + + case nir_intrinsic_load_scratch: + case nir_intrinsic_load_shared: + ntq_emit_tmu_general(c, instr, true, false); + c->has_general_tmu_load = true; break; - case nir_intrinsic_image_load: case nir_intrinsic_image_store: - case nir_intrinsic_image_atomic_add: - case nir_intrinsic_image_atomic_imin: - case nir_intrinsic_image_atomic_umin: - case nir_intrinsic_image_atomic_imax: - case nir_intrinsic_image_atomic_umax: - case nir_intrinsic_image_atomic_and: - case nir_intrinsic_image_atomic_or: - case nir_intrinsic_image_atomic_xor: - case nir_intrinsic_image_atomic_exchange: - case nir_intrinsic_image_atomic_comp_swap: - v3d40_vir_emit_image_load_store(c, instr); + case nir_intrinsic_image_atomic: + case nir_intrinsic_image_atomic_swap: + v3d_vir_emit_image_load_store(c, instr); + break; + + case nir_intrinsic_image_load: + v3d_vir_emit_image_load_store(c, instr); + /* Not really a general TMU load, but we only use this flag + * for NIR scheduling and we do schedule these under the same + * policy as general TMU. + */ + c->has_general_tmu_load = true; break; case nir_intrinsic_get_ssbo_size: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_GET_SSBO_SIZE, - nir_src_comp_as_uint(instr->src[0], 0))); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_GET_SSBO_SIZE, + nir_src_comp_as_uint(instr->src[0], 0))); break; case nir_intrinsic_get_ubo_size: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_GET_UBO_SIZE, - nir_src_comp_as_uint(instr->src[0], 0))); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_GET_UBO_SIZE, + nir_src_comp_as_uint(instr->src[0], 0))); break; case nir_intrinsic_load_user_clip_plane: for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) { - ntq_store_dest(c, &instr->dest, i, - vir_uniform(c, QUNIFORM_USER_CLIP_PLANE, - nir_intrinsic_ucp_id(instr) * - 4 + i)); + ntq_store_def(c, &instr->def, i, + vir_uniform(c, QUNIFORM_USER_CLIP_PLANE, + nir_intrinsic_ucp_id(instr) * + 4 + i)); } break; case nir_intrinsic_load_viewport_x_scale: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0)); break; case nir_intrinsic_load_viewport_y_scale: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0)); break; case nir_intrinsic_load_viewport_z_scale: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0)); break; case nir_intrinsic_load_viewport_z_offset: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0)); break; case nir_intrinsic_load_line_coord: - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->line_x)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->line_x)); break; case nir_intrinsic_load_line_width: - ntq_store_dest(c, 
&instr->dest, 0, - vir_uniform(c, QUNIFORM_LINE_WIDTH, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_LINE_WIDTH, 0)); break; case nir_intrinsic_load_aa_line_width: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0)); break; case nir_intrinsic_load_sample_mask_in: - ntq_store_dest(c, &instr->dest, 0, vir_MSF(c)); + ntq_store_def(c, &instr->def, 0, vir_MSF(c)); break; case nir_intrinsic_load_helper_invocation: vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ); struct qreg qdest = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA); - ntq_store_dest(c, &instr->dest, 0, qdest); + ntq_store_def(c, &instr->def, 0, qdest); break; case nir_intrinsic_load_front_face: /* The register contains 0 (front) or 1 (back), and we need to * turn it into a NIR bool where true means front. */ - ntq_store_dest(c, &instr->dest, 0, - vir_ADD(c, - vir_uniform_ui(c, -1), - vir_REVF(c))); + ntq_store_def(c, &instr->def, 0, + vir_ADD(c, + vir_uniform_ui(c, -1), + vir_REVF(c))); break; case nir_intrinsic_load_base_instance: - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->biid)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->biid)); break; case nir_intrinsic_load_instance_id: - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->iid)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->iid)); break; case nir_intrinsic_load_vertex_id: - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->vid)); + break; + + case nir_intrinsic_load_draw_id: + ntq_store_def(c, &instr->def, 0, vir_uniform(c, QUNIFORM_DRAW_ID, 0)); break; case nir_intrinsic_load_tlb_color_v3d: vir_emit_tlb_color_read(c, instr); break; + case nir_intrinsic_load_fep_w_v3d: + ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->payload_w)); + break; + case nir_intrinsic_load_input: ntq_emit_load_input(c, instr); break; @@ -2978,7 +3579,19 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) ntq_emit_image_size(c, instr); break; + /* FIXME: the Vulkan and SPIR-V specs specify that OpTerminate (which + * is intended to match the semantics of GLSL's discard) should + * terminate the invocation immediately. Our implementation doesn't + * do that. What we do is actually a demote by removing the invocations + * from the sample mask. Maybe we could be more strict and force an + * early termination by emitting a (maybe conditional) jump to the + * end section of the fragment shader for affected invocations. 
+ */ case nir_intrinsic_discard: + case nir_intrinsic_terminate: + c->emitted_discard = true; + FALLTHROUGH; + case nir_intrinsic_demote: ntq_flush_tmu(c); if (vir_in_nonuniform_control_flow(c)) { @@ -2993,7 +3606,11 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) } break; - case nir_intrinsic_discard_if: { + case nir_intrinsic_discard_if: + case nir_intrinsic_terminate_if: + c->emitted_discard = true; + FALLTHROUGH; + case nir_intrinsic_demote_if: { ntq_flush_tmu(c); enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, instr->src[0]); @@ -3011,102 +3628,79 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(), vir_uniform_ui(c, 0)), cond); - break; } - case nir_intrinsic_memory_barrier: - case nir_intrinsic_memory_barrier_buffer: - case nir_intrinsic_memory_barrier_image: - case nir_intrinsic_memory_barrier_shared: - case nir_intrinsic_memory_barrier_tcs_patch: - case nir_intrinsic_group_memory_barrier: - /* We don't do any instruction scheduling of these NIR - * instructions between each other, so we just need to make - * sure that the TMU operations before the barrier are flushed + case nir_intrinsic_barrier: + /* Ensure that the TMU operations before the barrier are flushed * before the ones after the barrier. */ ntq_flush_tmu(c); - break; - - case nir_intrinsic_control_barrier: - /* Emit a TSY op to get all invocations in the workgroup - * (actually supergroup) to block until the last invocation - * reaches the TSY op. - */ - ntq_flush_tmu(c); - if (c->devinfo->ver >= 42) { - vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC, - V3D_QPU_WADDR_SYNCB)); - } else { - struct qinst *sync = - vir_BARRIERID_dest(c, - vir_reg(QFILE_MAGIC, - V3D_QPU_WADDR_SYNCU)); - sync->uniform = - vir_get_uniform_index(c, QUNIFORM_CONSTANT, - 0xffffff00 | - V3D_TSY_WAIT_INC_CHECK); + if (nir_intrinsic_execution_scope(instr) != SCOPE_NONE) { + if (c->s->info.stage == MESA_SHADER_COMPUTE) + emit_compute_barrier(c); + else + emit_barrier(c); + /* The blocking of a TSY op only happens at the next + * thread switch. No texturing may be outstanding at the + * time of a TSY blocking operation. + */ + vir_emit_thrsw(c); } - - /* The blocking of a TSY op only happens at the next thread - * switch. No texturing may be outstanding at the time of a - * TSY blocking operation. 
- */ - vir_emit_thrsw(c); break; case nir_intrinsic_load_num_workgroups: for (int i = 0; i < 3; i++) { - ntq_store_dest(c, &instr->dest, i, - vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS, - i)); + ntq_store_def(c, &instr->def, i, + vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS, + i)); } break; case nir_intrinsic_load_workgroup_id: { struct qreg x = vir_AND(c, c->cs_payload[0], vir_uniform_ui(c, 0xffff)); + ntq_store_def(c, &instr->def, 0, x); struct qreg y = vir_SHR(c, c->cs_payload[0], vir_uniform_ui(c, 16)); + ntq_store_def(c, &instr->def, 1, y); struct qreg z = vir_AND(c, c->cs_payload[1], vir_uniform_ui(c, 0xffff)); + ntq_store_def(c, &instr->def, 2, z); + break; + } - /* We only support dispatch base in Vulkan */ - if (c->key->environment == V3D_ENVIRONMENT_VULKAN) { - x = vir_ADD(c, x, - vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 0)); - y = vir_ADD(c, y, - vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 1)); - z = vir_ADD(c, z, - vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 2)); - } + case nir_intrinsic_load_base_workgroup_id: { + struct qreg x = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 0); + ntq_store_def(c, &instr->def, 0, x); - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, x)); - ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, y)); - ntq_store_dest(c, &instr->dest, 2, vir_MOV(c, z)); + struct qreg y = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 1); + ntq_store_def(c, &instr->def, 1, y); + + struct qreg z = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 2); + ntq_store_def(c, &instr->def, 2, z); break; } case nir_intrinsic_load_local_invocation_index: - ntq_store_dest(c, &instr->dest, 0, - emit_load_local_invocation_index(c)); + ntq_store_def(c, &instr->def, 0, + emit_load_local_invocation_index(c)); break; case nir_intrinsic_load_subgroup_id: { /* This is basically the batch index, which is the Local * Invocation Index divided by the SIMD width). */ - STATIC_ASSERT(util_is_power_of_two_nonzero(V3D_CHANNELS)); + STATIC_ASSERT(IS_POT(V3D_CHANNELS) && V3D_CHANNELS > 0); const uint32_t divide_shift = ffs(V3D_CHANNELS) - 1; struct qreg lii = emit_load_local_invocation_index(c); - ntq_store_dest(c, &instr->dest, 0, - vir_SHR(c, lii, - vir_uniform_ui(c, divide_shift))); + ntq_store_def(c, &instr->def, 0, + vir_SHR(c, lii, + vir_uniform_ui(c, divide_shift))); break; } @@ -3143,8 +3737,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) struct qreg col = ntq_get_src(c, instr->src[0], 0); for (int i = 0; i < instr->num_components; i++) { struct qreg row = vir_uniform_ui(c, row_idx++); - ntq_store_dest(c, &instr->dest, i, - vir_LDVPMG_IN(c, row, col)); + ntq_store_def(c, &instr->def, i, + vir_LDVPMG_IN(c, row, col)); } break; } @@ -3160,47 +3754,47 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) * using ldvpm(v,d)_in (See Table 71). 
*/ assert(c->s->info.stage == MESA_SHADER_GEOMETRY); - ntq_store_dest(c, &instr->dest, 0, - vir_LDVPMV_IN(c, vir_uniform_ui(c, 0))); + ntq_store_def(c, &instr->def, 0, + vir_LDVPMV_IN(c, vir_uniform_ui(c, 0))); break; } case nir_intrinsic_load_invocation_id: - ntq_store_dest(c, &instr->dest, 0, vir_IID(c)); + ntq_store_def(c, &instr->def, 0, vir_IID(c)); break; case nir_intrinsic_load_fb_layers_v3d: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_FB_LAYERS, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_FB_LAYERS, 0)); break; case nir_intrinsic_load_sample_id: - ntq_store_dest(c, &instr->dest, 0, vir_SAMPID(c)); + ntq_store_def(c, &instr->def, 0, vir_SAMPID(c)); break; case nir_intrinsic_load_sample_pos: - ntq_store_dest(c, &instr->dest, 0, - vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)))); - ntq_store_dest(c, &instr->dest, 1, - vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)))); + ntq_store_def(c, &instr->def, 0, + vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)))); + ntq_store_def(c, &instr->def, 1, + vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)))); break; case nir_intrinsic_load_barycentric_at_offset: - ntq_store_dest(c, &instr->dest, 0, - vir_MOV(c, ntq_get_src(c, instr->src[0], 0))); - ntq_store_dest(c, &instr->dest, 1, - vir_MOV(c, ntq_get_src(c, instr->src[0], 1))); + ntq_store_def(c, &instr->def, 0, + vir_MOV(c, ntq_get_src(c, instr->src[0], 0))); + ntq_store_def(c, &instr->def, 1, + vir_MOV(c, ntq_get_src(c, instr->src[0], 1))); break; case nir_intrinsic_load_barycentric_pixel: - ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f)); - ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f)); + ntq_store_def(c, &instr->def, 0, vir_uniform_f(c, 0.0f)); + ntq_store_def(c, &instr->def, 1, vir_uniform_f(c, 0.0f)); break; case nir_intrinsic_load_barycentric_at_sample: { if (!c->fs_key->msaa) { - ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f)); - ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f)); + ntq_store_def(c, &instr->def, 0, vir_uniform_f(c, 0.0f)); + ntq_store_def(c, &instr->def, 1, vir_uniform_f(c, 0.0f)); return; } @@ -3208,8 +3802,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) struct qreg sample_idx = ntq_get_src(c, instr->src[0], 0); ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y); - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x)); - ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, offset_x)); + ntq_store_def(c, &instr->def, 1, vir_MOV(c, offset_y)); break; } @@ -3219,18 +3813,18 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) struct qreg offset_y = vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c))); - ntq_store_dest(c, &instr->dest, 0, - vir_FSUB(c, offset_x, vir_uniform_f(c, 0.5f))); - ntq_store_dest(c, &instr->dest, 1, - vir_FSUB(c, offset_y, vir_uniform_f(c, 0.5f))); + ntq_store_def(c, &instr->def, 0, + vir_FSUB(c, offset_x, vir_uniform_f(c, 0.5f))); + ntq_store_def(c, &instr->def, 1, + vir_FSUB(c, offset_y, vir_uniform_f(c, 0.5f))); break; } case nir_intrinsic_load_barycentric_centroid: { struct qreg offset_x, offset_y; ntq_get_barycentric_centroid(c, &offset_x, &offset_y); - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x)); - ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, offset_x)); + ntq_store_def(c, &instr->def, 1, vir_MOV(c, offset_y)); break; } @@ -3249,8 +3843,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, 
nir_intrinsic_instr *instr) */ if (!c->fs_key->msaa || c->interp[input_idx].vp.file == QFILE_NULL) { - ntq_store_dest(c, &instr->dest, i, - vir_MOV(c, c->inputs[input_idx])); + ntq_store_def(c, &instr->def, i, + vir_MOV(c, c->inputs[input_idx])); continue; } @@ -3268,30 +3862,150 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) ntq_emit_load_interpolated_input(c, p, C, offset_x, offset_y, interp_mode); - ntq_store_dest(c, &instr->dest, i, result); + ntq_store_def(c, &instr->def, i, result); } break; } case nir_intrinsic_load_subgroup_size: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform_ui(c, V3D_CHANNELS)); + ntq_store_def(c, &instr->def, 0, + vir_uniform_ui(c, V3D_CHANNELS)); break; case nir_intrinsic_load_subgroup_invocation: - ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c)); + ntq_store_def(c, &instr->def, 0, vir_EIDX(c)); break; case nir_intrinsic_elect: { - set_a_flags_for_subgroup(c); - struct qreg first = vir_FLAFIRST(c); + struct qreg first; + if (vir_in_nonuniform_control_flow(c)) { + /* Sets A=1 for lanes enabled in the execution mask */ + vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_PF_PUSHZ); + /* Updates A ANDing with lanes enabled in MSF */ + vir_set_uf(c, vir_MSF_dest(c, vir_nop_reg()), + V3D_QPU_UF_ANDNZ); + first = vir_FLAFIRST(c); + } else { + /* Sets A=1 for inactive lanes */ + vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), + V3D_QPU_PF_PUSHZ); + first = vir_FLNAFIRST(c); + } - /* Produce a boolean result from Flafirst */ + /* Produce a boolean result */ vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), first, vir_uniform_ui(c, 1)), V3D_QPU_PF_PUSHZ); struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA); - ntq_store_dest(c, &instr->dest, 0, result); + ntq_store_def(c, &instr->def, 0, result); + break; + } + + case nir_intrinsic_ballot: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c); + struct qreg res = vir_get_temp(c); + vir_set_cond(vir_BALLOT_dest(c, res, value), cond); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, res)); + break; + } + + case nir_intrinsic_read_invocation: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + struct qreg index = ntq_get_src(c, instr->src[1], 0); + struct qreg res = vir_SHUFFLE(c, value, index); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, res)); + break; + } + + case nir_intrinsic_read_first_invocation: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c); + struct qreg res = vir_get_temp(c); + vir_set_cond(vir_BCASTF_dest(c, res, value), cond); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, res)); + break; + } + + case nir_intrinsic_shuffle: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + struct qreg indices = ntq_get_src(c, instr->src[1], 0); + struct qreg res = vir_SHUFFLE(c, value, indices); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, res)); + break; + } + + case nir_intrinsic_vote_feq: + case nir_intrinsic_vote_ieq: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c); + struct qreg res = vir_get_temp(c); + vir_set_cond(instr->intrinsic == nir_intrinsic_vote_ieq ? 
+ vir_ALLEQ_dest(c, res, value) : + vir_ALLFEQ_dest(c, res, value), + cond); + + /* Produce boolean result */ + vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res), + V3D_QPU_PF_PUSHZ); + struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFNA); + ntq_store_def(c, &instr->def, 0, result); + break; + } + + case nir_intrinsic_vote_all: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c); + struct qreg res = vir_get_temp(c); + vir_set_cond(vir_ALLEQ_dest(c, res, value), cond); + + /* We want to check if 'all lanes are equal (alleq != 0) and + * their value is True (value != 0)'. + * + * The first MOV.pushz generates predicate for 'alleq == 0'. + * The second MOV.NORZ generates predicate for: + * '!(alleq == 0) & !(value == 0). + */ + vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res), + V3D_QPU_PF_PUSHZ); + vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), value), + V3D_QPU_UF_NORZ); + struct qreg result = + ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA); + ntq_store_def(c, &instr->def, 0, result); + break; + } + + case nir_intrinsic_vote_any: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c); + struct qreg res = vir_get_temp(c); + vir_set_cond(vir_ALLEQ_dest(c, res, value), cond); + + /* We want to check 'not (all lanes are equal (alleq != 0)' + * and their value is False (value == 0))'. + * + * The first MOV.pushz generates predicate for 'alleq == 0'. + * The second MOV.NORNZ generates predicate for: + * '!(alleq == 0) & (value == 0). + * The IFNA condition negates the predicate when evaluated: + * '!(!alleq == 0) & (value == 0)) + */ + vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res), + V3D_QPU_PF_PUSHZ); + vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), value), + V3D_QPU_UF_NORNZ); + struct qreg result = + ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFNA); + ntq_store_def(c, &instr->def, 0, result); break; } @@ -3300,8 +4014,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) break; case nir_intrinsic_load_view_index: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_VIEW_INDEX, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_VIEW_INDEX, 0)); break; default: @@ -3329,6 +4043,36 @@ ntq_activate_execute_for_block(struct v3d_compile *c) vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0)); } +static bool +is_cheap_block(nir_block *block) +{ + int32_t cost = 3; + nir_foreach_instr(instr, block) { + switch (instr->type) { + case nir_instr_type_alu: + case nir_instr_type_undef: + case nir_instr_type_load_const: + if (--cost <= 0) + return false; + break; + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + switch (intr->intrinsic) { + case nir_intrinsic_decl_reg: + case nir_intrinsic_load_reg: + case nir_intrinsic_store_reg: + continue; + default: + return false; + } + } + default: + return false; + } + } + return true; +} + static void ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt) { @@ -3473,15 +4217,27 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt) c->execute, vir_uniform_ui(c, else_block->index)); - /* Jump to ELSE if nothing is active for THEN, otherwise fall - * through. 
+ /* Set the flags for taking the THEN block */ + vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_PF_PUSHZ); + + /* Jump to ELSE if nothing is active for THEN (unless THEN block is + * so small it won't pay off), otherwise fall through. */ - vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ); - vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA); - vir_link_blocks(c->cur_block, else_block); + bool is_cheap = exec_list_is_singular(&if_stmt->then_list) && + is_cheap_block(nir_if_first_then_block(if_stmt)); + if (!is_cheap) { + vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA); + vir_link_blocks(c->cur_block, else_block); + } vir_link_blocks(c->cur_block, then_block); - /* Process the THEN block. */ + /* Process the THEN block. + * + * Notice we don't call ntq_activate_execute_for_block here on purpose: + * c->execute is already set up to be 0 for lanes that must take the + * THEN block. + */ vir_set_emit_block(c, then_block); ntq_emit_cf_list(c, &if_stmt->then_list); @@ -3495,13 +4251,19 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt) vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, after_block->index)); - /* If everything points at ENDIF, then jump there immediately. */ - vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), - c->execute, - vir_uniform_ui(c, after_block->index)), - V3D_QPU_PF_PUSHZ); - vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA); - vir_link_blocks(c->cur_block, after_block); + /* If everything points at ENDIF, then jump there immediately + * (unless ELSE block is so small it won't pay off). + */ + bool is_cheap = exec_list_is_singular(&if_stmt->else_list) && + is_cheap_block(nir_else_block); + if (!is_cheap) { + vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), + c->execute, + vir_uniform_ui(c, after_block->index)), + V3D_QPU_PF_PUSHZ); + vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA); + vir_link_blocks(c->cur_block, after_block); + } vir_link_blocks(c->cur_block, else_block); vir_set_emit_block(c, else_block); @@ -3605,7 +4367,7 @@ ntq_emit_instr(struct v3d_compile *c, nir_instr *instr) ntq_emit_load_const(c, nir_instr_as_load_const(instr)); break; - case nir_instr_type_ssa_undef: + case nir_instr_type_undef: unreachable("Should've been lowered by nir_lower_undef_to_zero"); break; @@ -3699,7 +4461,6 @@ ntq_emit_nonuniform_loop(struct v3d_compile *c, nir_loop *loop) static void ntq_emit_uniform_loop(struct v3d_compile *c, nir_loop *loop) { - c->loop_cont_block = vir_new_block(c); c->loop_break_block = vir_new_block(c); @@ -3719,6 +4480,25 @@ ntq_emit_uniform_loop(struct v3d_compile *c, nir_loop *loop) static void ntq_emit_loop(struct v3d_compile *c, nir_loop *loop) { + assert(!nir_loop_has_continue_construct(loop)); + + /* Disable flags optimization for loop conditions. The problem here is + * that we can have code like this: + * + * // block_0 + * vec1 32 con ssa_9 = ine32 ssa_8, ssa_2 + * loop { + * // block_1 + * if ssa_9 { + * + * In this example we emit flags to compute ssa_9 and the optimization + * will skip regenerating them again for the loop condition in the + * loop continue block (block_1). However, this is not safe after the + * first iteration because the loop body can stomp the flags if it has + * any conditionals. 
+ */ + c->flags_temp = -1; + bool was_in_control_flow = c->in_control_flow; c->in_control_flow = true; @@ -3777,7 +4557,7 @@ ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list) static void ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl) { - ntq_setup_registers(c, &impl->registers); + ntq_setup_registers(c, impl); ntq_emit_cf_list(c, &impl->body); } @@ -3786,7 +4566,12 @@ nir_to_vir(struct v3d_compile *c) { switch (c->s->info.stage) { case MESA_SHADER_FRAGMENT: - c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0)); + c->start_msf = vir_MSF(c); + if (c->devinfo->ver < 71) + c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0)); + else + c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 3)); + c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1)); c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2)); @@ -3799,25 +4584,16 @@ nir_to_vir(struct v3d_compile *c) emit_fragment_varying(c, NULL, -1, 0, 0); } - if (c->fs_key->is_points && - (c->devinfo->ver < 40 || program_reads_point_coord(c))) { + if (c->fs_key->is_points && program_reads_point_coord(c)) { c->point_x = emit_fragment_varying(c, NULL, -1, 0, 0); c->point_y = emit_fragment_varying(c, NULL, -1, 0, 0); c->uses_implicit_point_line_varyings = true; } else if (c->fs_key->is_lines && - (c->devinfo->ver < 40 || - BITSET_TEST(c->s->info.system_values_read, + (BITSET_TEST(c->s->info.system_values_read, SYSTEM_VALUE_LINE_COORD))) { c->line_x = emit_fragment_varying(c, NULL, -1, 0, 0); c->uses_implicit_point_line_varyings = true; } - - c->force_per_sample_msaa = - c->s->info.fs.uses_sample_qualifier || - BITSET_TEST(c->s->info.system_values_read, - SYSTEM_VALUE_SAMPLE_ID) || - BITSET_TEST(c->s->info.system_values_read, - SYSTEM_VALUE_SAMPLE_POS); break; case MESA_SHADER_COMPUTE: /* Set up the TSO for barriers, assuming we do some. */ @@ -3826,8 +4602,13 @@ nir_to_vir(struct v3d_compile *c) V3D_QPU_WADDR_SYNC)); } - c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0)); - c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); + if (c->devinfo->ver == 42) { + c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0)); + c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); + } else if (c->devinfo->ver >= 71) { + c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 3)); + c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); + } /* Set up the division between gl_LocalInvocationIndex and * wg_in_mem in the payload reg. @@ -3889,7 +4670,7 @@ nir_to_vir(struct v3d_compile *c) /* Find the main function and emit the body. */ nir_foreach_function(function, c->s) { - assert(strcmp(function->name, "main") == 0); + assert(function->is_entrypoint); assert(function->impl); ntq_emit_impl(c, function->impl); } @@ -3932,25 +4713,12 @@ vir_emit_last_thrsw(struct v3d_compile *c, { *restore_last_thrsw = c->last_thrsw; - /* On V3D before 4.1, we need a TMU op to be outstanding when thread - * switching, so disable threads if we didn't do any TMU ops (each of - * which would have emitted a THRSW). - */ - if (!c->last_thrsw_at_top_level && c->devinfo->ver < 41) { - c->threads = 1; - if (c->last_thrsw) - vir_remove_thrsw(c); - *restore_last_thrsw = NULL; - } - /* If we're threaded and the last THRSW was in conditional code, then * we need to emit another one so that we can flag it as the last * thrsw. 
*/ - if (c->last_thrsw && !c->last_thrsw_at_top_level) { - assert(c->devinfo->ver >= 41); + if (c->last_thrsw && !c->last_thrsw_at_top_level) vir_emit_thrsw(c); - } /* If we're threaded, then we need to mark the last THRSW instruction * so we can emit a pair of them at QPU emit time. @@ -3958,10 +4726,8 @@ vir_emit_last_thrsw(struct v3d_compile *c, * For V3D 4.x, we can spawn the non-fragment shaders already in the * post-last-THRSW state, so we can skip this. */ - if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) { - assert(c->devinfo->ver >= 41); + if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) vir_emit_thrsw(c); - } /* If we have not inserted a last thread switch yet, do it now to ensure * any potential spilling we do happens before this. If we don't spill @@ -4006,8 +4772,8 @@ vir_check_payload_w(struct v3d_compile *c) vir_for_each_inst_inorder(inst, c) { for (int i = 0; i < vir_get_nsrc(inst); i++) { - if (inst->src[i].file == QFILE_REG && - inst->src[i].index == 0) { + if (inst->src[i].file == c->payload_w.file && + inst->src[i].index == c->payload_w.index) { c->uses_center_w = true; return; } @@ -4018,8 +4784,8 @@ vir_check_payload_w(struct v3d_compile *c) void v3d_nir_to_vir(struct v3d_compile *c) { - if (V3D_DEBUG & (V3D_DEBUG_NIR | - v3d_debug_flag_for_shader_stage(c->s->info.stage))) { + if (V3D_DBG(NIR) || + v3d_debug_flag_for_shader_stage(c->s->info.stage)) { fprintf(stderr, "%s prog %d/%d NIR:\n", vir_get_stage_name(c), c->program_id, c->variant_id); @@ -4053,8 +4819,8 @@ v3d_nir_to_vir(struct v3d_compile *c) unreachable("bad stage"); } - if (V3D_DEBUG & (V3D_DEBUG_VIR | - v3d_debug_flag_for_shader_stage(c->s->info.stage))) { + if (V3D_DBG(VIR) || + v3d_debug_flag_for_shader_stage(c->s->info.stage)) { fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n", vir_get_stage_name(c), c->program_id, c->variant_id); @@ -4075,8 +4841,8 @@ v3d_nir_to_vir(struct v3d_compile *c) * instructions until the results are needed. */ - if (V3D_DEBUG & (V3D_DEBUG_VIR | - v3d_debug_flag_for_shader_stage(c->s->info.stage))) { + if (V3D_DBG(VIR) || + v3d_debug_flag_for_shader_stage(c->s->info.stage)) { fprintf(stderr, "%s prog %d/%d VIR:\n", vir_get_stage_name(c), c->program_id, c->variant_id); @@ -4087,19 +4853,17 @@ v3d_nir_to_vir(struct v3d_compile *c) /* Attempt to allocate registers for the temporaries. If we fail, * reduce thread count and try again. */ - int min_threads = (c->devinfo->ver >= 41) ? 2 : 1; + int min_threads = 2; struct qpu_reg *temp_registers; while (true) { - bool spilled; - temp_registers = v3d_register_allocate(c, &spilled); - if (spilled) - continue; - - if (temp_registers) + temp_registers = v3d_register_allocate(c); + if (temp_registers) { + assert(c->spills + c->fills <= c->max_tmu_spills); break; + } if (c->threads == min_threads && - (V3D_DEBUG & V3D_DEBUG_RA)) { + V3D_DBG(RA)) { fprintf(stderr, "Failed to register allocate using %s\n", c->fallback_scheduler ? 
"the fallback scheduler:" : @@ -4116,18 +4880,20 @@ v3d_nir_to_vir(struct v3d_compile *c) } if (c->threads <= MAX2(c->min_threads_for_reg_alloc, min_threads)) { - if (V3D_DEBUG & V3D_DEBUG_PERF) { + if (V3D_DBG(PERF)) { fprintf(stderr, - "Failed to register allocate %s at " - "%d threads.\n", vir_get_stage_name(c), - c->threads); + "Failed to register allocate %s " + "prog %d/%d at %d threads.\n", + vir_get_stage_name(c), + c->program_id, c->variant_id, c->threads); } c->compilation_result = V3D_COMPILATION_FAILED_REGISTER_ALLOCATION; return; } - c->spill_count = 0; + c->spills = 0; + c->fills = 0; c->threads /= 2; if (c->threads == 1) @@ -4141,8 +4907,8 @@ v3d_nir_to_vir(struct v3d_compile *c) vir_restore_last_thrsw(c, restore_last_thrsw, restore_scoreboard_lock); if (c->spills && - (V3D_DEBUG & (V3D_DEBUG_VIR | - v3d_debug_flag_for_shader_stage(c->s->info.stage)))) { + (V3D_DBG(VIR) || + v3d_debug_flag_for_shader_stage(c->s->info.stage))) { fprintf(stderr, "%s prog %d/%d spilled VIR:\n", vir_get_stage_name(c), c->program_id, c->variant_id); diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c index c559814b9ea..ba76ac87e1e 100644 --- a/src/broadcom/compiler/qpu_schedule.c +++ b/src/broadcom/compiler/qpu_schedule.c @@ -85,6 +85,7 @@ struct schedule_state { struct schedule_node *last_unif; struct schedule_node *last_rtop; struct schedule_node *last_unifa; + struct schedule_node *last_setmsf; enum direction dir; /* Estimated cycle when the current instruction would start. */ uint32_t time; @@ -97,7 +98,7 @@ add_dep(struct schedule_state *state, bool write) { bool write_after_read = !write && state->dir == R; - void *edge_data = (void *)(uintptr_t)write_after_read; + uintptr_t edge_data = write_after_read; if (!before || !after) return; @@ -136,12 +137,14 @@ qpu_inst_is_tlb(const struct v3d_qpu_instr *inst) if (inst->type != V3D_QPU_INSTR_TYPE_ALU) return false; - if (inst->alu.add.magic_write && + if (inst->alu.add.op != V3D_QPU_A_NOP && + inst->alu.add.magic_write && (inst->alu.add.waddr == V3D_QPU_WADDR_TLB || inst->alu.add.waddr == V3D_QPU_WADDR_TLBU)) return true; - if (inst->alu.mul.magic_write && + if (inst->alu.mul.op != V3D_QPU_M_NOP && + inst->alu.mul.magic_write && (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB || inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU)) return true; @@ -153,12 +156,13 @@ static void process_mux_deps(struct schedule_state *state, struct schedule_node *n, enum v3d_qpu_mux mux) { + assert(state->devinfo->ver < 71); switch (mux) { case V3D_QPU_MUX_A: add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n); break; case V3D_QPU_MUX_B: - if (!n->inst->qpu.sig.small_imm) { + if (!n->inst->qpu.sig.small_imm_b) { add_read_dep(state, state->last_rf[n->inst->qpu.raddr_b], n); } @@ -169,6 +173,17 @@ process_mux_deps(struct schedule_state *state, struct schedule_node *n, } } + +static void +process_raddr_deps(struct schedule_state *state, struct schedule_node *n, + uint8_t raddr, bool is_small_imm) +{ + assert(state->devinfo->ver >= 71); + + if (!is_small_imm) + add_read_dep(state, state->last_rf[raddr], n); +} + static bool tmu_write_is_sequence_terminator(uint32_t waddr) { @@ -188,9 +203,6 @@ tmu_write_is_sequence_terminator(uint32_t waddr) static bool can_reorder_tmu_write(const struct v3d_device_info *devinfo, uint32_t waddr) { - if (devinfo->ver < 40) - return false; - if (tmu_write_is_sequence_terminator(waddr)) return false; @@ -253,8 +265,7 @@ process_waddr_deps(struct schedule_state *state, struct schedule_node *n, break; case 
V3D_QPU_WADDR_UNIFA: - if (state->devinfo->ver >= 40) - add_write_dep(state, &state->last_unifa, n); + add_write_dep(state, &state->last_unifa, n); break; case V3D_QPU_WADDR_NOP: @@ -283,6 +294,10 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) /* If the input and output segments are shared, then all VPM reads to * a location need to happen before all writes. We handle this by * serializing all VPM operations for now. + * + * FIXME: we are assuming that the segments are shared. That is + * correct right now as we are only using shared, but technically you + * can choose. */ bool separate_vpm_segment = false; @@ -303,15 +318,39 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) /* XXX: LOAD_IMM */ - if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) - process_mux_deps(state, n, inst->alu.add.a); - if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) - process_mux_deps(state, n, inst->alu.add.b); + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) { + if (devinfo->ver < 71) { + process_mux_deps(state, n, inst->alu.add.a.mux); + } else { + process_raddr_deps(state, n, inst->alu.add.a.raddr, + inst->sig.small_imm_a); + } + } + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) { + if (devinfo->ver < 71) { + process_mux_deps(state, n, inst->alu.add.b.mux); + } else { + process_raddr_deps(state, n, inst->alu.add.b.raddr, + inst->sig.small_imm_b); + } + } - if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) - process_mux_deps(state, n, inst->alu.mul.a); - if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) - process_mux_deps(state, n, inst->alu.mul.b); + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) { + if (devinfo->ver < 71) { + process_mux_deps(state, n, inst->alu.mul.a.mux); + } else { + process_raddr_deps(state, n, inst->alu.mul.a.raddr, + inst->sig.small_imm_c); + } + } + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) { + if (devinfo->ver < 71) { + process_mux_deps(state, n, inst->alu.mul.b.mux); + } else { + process_raddr_deps(state, n, inst->alu.mul.b.raddr, + inst->sig.small_imm_d); + } + } switch (inst->alu.add.op) { case V3D_QPU_A_VPMSETUP: @@ -340,13 +379,24 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) case V3D_QPU_A_MSF: add_read_dep(state, state->last_tlb, n); + add_read_dep(state, state->last_setmsf, n); break; case V3D_QPU_A_SETMSF: + add_write_dep(state, &state->last_setmsf, n); + add_write_dep(state, &state->last_tmu_write, n); + FALLTHROUGH; case V3D_QPU_A_SETREVF: add_write_dep(state, &state->last_tlb, n); break; + case V3D_QPU_A_BALLOT: + case V3D_QPU_A_BCASTF: + case V3D_QPU_A_ALLEQ: + case V3D_QPU_A_ALLFEQ: + add_read_dep(state, state->last_setmsf, n); + break; + default: break; } @@ -384,6 +434,8 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) add_write_dep(state, &state->last_r[4], n); if (v3d_qpu_writes_r5(devinfo, inst)) add_write_dep(state, &state->last_r[5], n); + if (v3d_qpu_writes_rf0_implicitly(devinfo, inst)) + add_write_dep(state, &state->last_rf[0], n); /* If we add any more dependencies here we should consider whether we * also need to update qpu_inst_after_thrsw_valid_in_delay_slot. 
@@ -492,9 +544,16 @@ struct choose_scoreboard { int last_thrsw_tick; int last_branch_tick; int last_setmsf_tick; - bool tlb_locked; + bool first_thrsw_emitted; + bool last_thrsw_emitted; bool fixup_ldvary; int ldvary_count; + int pending_ldtmu_count; + bool first_ldtmu_after_thrsw; + + /* V3D 7.x */ + int last_implicit_rf0_write_tick; + bool has_rf0_flops_conflict; }; static bool @@ -519,7 +578,24 @@ mux_reads_too_soon(struct choose_scoreboard *scoreboard, } static bool -reads_too_soon_after_write(struct choose_scoreboard *scoreboard, +reads_too_soon(struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst, uint8_t raddr) +{ + switch (raddr) { + case 0: /* ldvary delayed write of C coefficient to rf0 */ + if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1) + return true; + break; + default: + break; + } + + return false; +} + +static bool +reads_too_soon_after_write(const struct v3d_device_info *devinfo, + struct choose_scoreboard *scoreboard, struct qinst *qinst) { const struct v3d_qpu_instr *inst = &qinst->qpu; @@ -531,24 +607,44 @@ reads_too_soon_after_write(struct choose_scoreboard *scoreboard, assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); if (inst->alu.add.op != V3D_QPU_A_NOP) { - if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 && - mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) { - return true; + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) { + if (devinfo->ver < 71) { + if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.a.mux)) + return true; + } else { + if (reads_too_soon(scoreboard, inst, inst->alu.add.a.raddr)) + return true; + } } - if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 && - mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) { - return true; + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) { + if (devinfo->ver < 71) { + if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.b.mux)) + return true; + } else { + if (reads_too_soon(scoreboard, inst, inst->alu.add.b.raddr)) + return true; + } } } if (inst->alu.mul.op != V3D_QPU_M_NOP) { - if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 && - mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) { - return true; + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) { + if (devinfo->ver < 71) { + if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a.mux)) + return true; + } else { + if (reads_too_soon(scoreboard, inst, inst->alu.mul.a.raddr)) + return true; + } } - if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 && - mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) { - return true; + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) { + if (devinfo->ver < 71) { + if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b.mux)) + return true; + } else { + if (reads_too_soon(scoreboard, inst, inst->alu.mul.b.raddr)) + return true; + } } } @@ -572,45 +668,83 @@ writes_too_soon_after_write(const struct v3d_device_info *devinfo, v3d_qpu_writes_r4(devinfo, inst)) return true; + if (devinfo->ver == 42) + return false; + + /* Don't schedule anything that writes rf0 right after ldvary, since + * that would clash with the ldvary's delayed rf0 write (the exception + * is another ldvary, since its implicit rf0 write would also have + * one cycle of delay and would not clash). 
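+ *
+ * For example, if the ldvary is scheduled at tick N its implicit
+ * rf0 write lands at tick N + 1, so an instruction placed at tick
+ * N + 1 that writes rf0 explicitly (or implicitly, unless it is
+ * itself an ldvary) must be rejected here.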
+ */ + if (scoreboard->last_ldvary_tick + 1 == scoreboard->tick && + (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) || + (v3d_qpu_writes_rf0_implicitly(devinfo, inst) && + !inst->sig.ldvary))) { + return true; + } + return false; } static bool -pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard, +scoreboard_is_locked(struct choose_scoreboard *scoreboard, + bool lock_scoreboard_on_first_thrsw) +{ + if (lock_scoreboard_on_first_thrsw) { + return scoreboard->first_thrsw_emitted && + scoreboard->tick - scoreboard->last_thrsw_tick >= 3; + } + + return scoreboard->last_thrsw_emitted && + scoreboard->tick - scoreboard->last_thrsw_tick >= 3; +} + +static bool +pixel_scoreboard_too_soon(struct v3d_compile *c, + struct choose_scoreboard *scoreboard, const struct v3d_qpu_instr *inst) { - return (scoreboard->tick == 0 && qpu_inst_is_tlb(inst)); + return qpu_inst_is_tlb(inst) && + !scoreboard_is_locked(scoreboard, + c->lock_scoreboard_on_first_thrsw); } static bool -qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst, +qpu_instruction_uses_rf(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst, uint32_t waddr) { if (inst->type != V3D_QPU_INSTR_TYPE_ALU) return false; - if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) && - inst->raddr_a == waddr) - return true; + if (devinfo->ver < 71) { + if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) && + inst->raddr_a == waddr) + return true; - if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) && - !inst->sig.small_imm && (inst->raddr_b == waddr)) - return true; + if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) && + !inst->sig.small_imm_b && (inst->raddr_b == waddr)) + return true; + } else { + if (v3d71_qpu_reads_raddr(inst, waddr)) + return true; + } return false; } static bool -mux_read_stalls(struct choose_scoreboard *scoreboard, - const struct v3d_qpu_instr *inst) +read_stalls(const struct v3d_device_info *devinfo, + struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst) { return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 && - qpu_instruction_uses_rf(inst, + qpu_instruction_uses_rf(devinfo, inst, scoreboard->last_stallable_sfu_reg); } /* We define a max schedule priority to allow negative priorities as result of - * substracting this max when an instruction stalls. So instructions that + * subtracting this max when an instruction stalls. So instructions that * stall have lower priority than regular instructions. */ #define MAX_SCHEDULE_PRIORITY 16 @@ -628,19 +762,32 @@ get_instruction_priority(const struct v3d_device_info *devinfo, return next_score; next_score++; + /* Empirical testing shows that using priorities to hide latency of + * TMU operations when scheduling QPU leads to slightly worse + * performance, even at 2 threads. We think this is because the thread + * switching is already quite effective at hiding latency and NIR + * scheduling (and possibly TMU pipelining too) are sufficient to hide + * TMU latency, so piling up on that here doesn't provide any benefits + * and instead may cause us to postpone critical paths that depend on + * the TMU results. + */ +#if 0 /* Schedule texture read results collection late to hide latency. */ if (v3d_qpu_waits_on_tmu(inst)) return next_score; next_score++; +#endif /* Default score for things that aren't otherwise special. */ baseline_score = next_score; next_score++; +#if 0 /* Schedule texture read setup early to hide their latency better. 
*/ if (v3d_qpu_writes_tmu(devinfo, inst)) return next_score; next_score++; +#endif /* We should increase the maximum if we assert here */ assert(next_score < MAX_SCHEDULE_PRIORITY); @@ -648,48 +795,59 @@ get_instruction_priority(const struct v3d_device_info *devinfo, return baseline_score; } -static bool -qpu_magic_waddr_is_periph(const struct v3d_device_info *devinfo, - enum v3d_qpu_waddr waddr) -{ - return (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) || - v3d_qpu_magic_waddr_is_sfu(waddr) || - v3d_qpu_magic_waddr_is_tlb(waddr) || - v3d_qpu_magic_waddr_is_vpm(waddr) || - v3d_qpu_magic_waddr_is_tsy(waddr)); -} +enum { + V3D_PERIPHERAL_VPM_READ = (1 << 0), + V3D_PERIPHERAL_VPM_WRITE = (1 << 1), + V3D_PERIPHERAL_VPM_WAIT = (1 << 2), + V3D_PERIPHERAL_SFU = (1 << 3), + V3D_PERIPHERAL_TMU_WRITE = (1 << 4), + V3D_PERIPHERAL_TMU_READ = (1 << 5), + V3D_PERIPHERAL_TMU_WAIT = (1 << 6), + V3D_PERIPHERAL_TMU_WRTMUC_SIG = (1 << 7), + V3D_PERIPHERAL_TSY = (1 << 8), + V3D_PERIPHERAL_TLB_READ = (1 << 9), + V3D_PERIPHERAL_TLB_WRITE = (1 << 10), +}; -static bool -qpu_accesses_peripheral(const struct v3d_device_info *devinfo, - const struct v3d_qpu_instr *inst) +static uint32_t +qpu_peripherals(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst) { - if (v3d_qpu_uses_vpm(inst)) - return true; + uint32_t result = 0; + if (v3d_qpu_reads_vpm(inst)) + result |= V3D_PERIPHERAL_VPM_READ; + if (v3d_qpu_writes_vpm(inst)) + result |= V3D_PERIPHERAL_VPM_WRITE; + if (v3d_qpu_waits_vpm(inst)) + result |= V3D_PERIPHERAL_VPM_WAIT; + + if (v3d_qpu_writes_tmu(devinfo, inst)) + result |= V3D_PERIPHERAL_TMU_WRITE; + if (inst->sig.ldtmu) + result |= V3D_PERIPHERAL_TMU_READ; + if (inst->sig.wrtmuc) + result |= V3D_PERIPHERAL_TMU_WRTMUC_SIG; + if (v3d_qpu_uses_sfu(inst)) - return true; + result |= V3D_PERIPHERAL_SFU; + + if (v3d_qpu_reads_tlb(inst)) + result |= V3D_PERIPHERAL_TLB_READ; + if (v3d_qpu_writes_tlb(inst)) + result |= V3D_PERIPHERAL_TLB_WRITE; if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { if (inst->alu.add.op != V3D_QPU_A_NOP && inst->alu.add.magic_write && - qpu_magic_waddr_is_periph(devinfo, inst->alu.add.waddr)) { - return true; + v3d_qpu_magic_waddr_is_tsy(inst->alu.add.waddr)) { + result |= V3D_PERIPHERAL_TSY; } if (inst->alu.add.op == V3D_QPU_A_TMUWT) - return true; - - if (inst->alu.mul.op != V3D_QPU_M_NOP && - inst->alu.mul.magic_write && - qpu_magic_waddr_is_periph(devinfo, inst->alu.mul.waddr)) { - return true; - } + result |= V3D_PERIPHERAL_TMU_WAIT; } - return (inst->sig.ldvpm || - inst->sig.ldtmu || - inst->sig.ldtlb || - inst->sig.ldtlbu || - inst->sig.wrtmuc); + return result; } static bool @@ -697,30 +855,82 @@ qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *a, const struct v3d_qpu_instr *b) { - const bool a_uses_peripheral = qpu_accesses_peripheral(devinfo, a); - const bool b_uses_peripheral = qpu_accesses_peripheral(devinfo, b); + const uint32_t a_peripherals = qpu_peripherals(devinfo, a); + const uint32_t b_peripherals = qpu_peripherals(devinfo, b); /* We can always do one peripheral access per instruction. */ - if (!a_uses_peripheral || !b_uses_peripheral) + if (util_bitcount(a_peripherals) + util_bitcount(b_peripherals) <= 1) return true; - if (devinfo->ver < 41) + /* V3D 4.x can't do more than one peripheral access except in a + * few cases: + */ + if (devinfo->ver == 42) { + /* WRTMUC signal with TMU register write (other than tmuc). 
*/ + if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG && + b_peripherals == V3D_PERIPHERAL_TMU_WRITE) { + return v3d_qpu_writes_tmu_not_tmuc(devinfo, b); + } + if (b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG && + a_peripherals == V3D_PERIPHERAL_TMU_WRITE) { + return v3d_qpu_writes_tmu_not_tmuc(devinfo, a); + } + + /* TMU read with VPM read/write. */ + if (a_peripherals == V3D_PERIPHERAL_TMU_READ && + (b_peripherals == V3D_PERIPHERAL_VPM_READ || + b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) { + return true; + } + if (b_peripherals == V3D_PERIPHERAL_TMU_READ && + (a_peripherals == V3D_PERIPHERAL_VPM_READ || + a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) { + return true; + } + return false; + } - /* V3D 4.1 and later allow TMU read along with a VPM read or write, and - * WRTMUC with a TMU magic register write (other than tmuc). - */ - if ((a->sig.ldtmu && v3d_qpu_reads_or_writes_vpm(b)) || - (b->sig.ldtmu && v3d_qpu_reads_or_writes_vpm(a))) { - return true; + /* V3D 7.x can't have more than one of these restricted peripherals */ + const uint32_t restricted = V3D_PERIPHERAL_TMU_WRITE | + V3D_PERIPHERAL_TMU_WRTMUC_SIG | + V3D_PERIPHERAL_TSY | + V3D_PERIPHERAL_TLB_READ | + V3D_PERIPHERAL_SFU | + V3D_PERIPHERAL_VPM_READ | + V3D_PERIPHERAL_VPM_WRITE; + + const uint32_t a_restricted = a_peripherals & restricted; + const uint32_t b_restricted = b_peripherals & restricted; + if (a_restricted && b_restricted) { + /* WRTMUC signal with TMU register write (other than tmuc) is + * allowed though. + */ + if (!((a_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG && + b_restricted == V3D_PERIPHERAL_TMU_WRITE && + v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) || + (b_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG && + a_restricted == V3D_PERIPHERAL_TMU_WRITE && + v3d_qpu_writes_tmu_not_tmuc(devinfo, a)))) { + return false; + } } - if ((a->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) || - (b->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(devinfo, a))) { - return true; + /* Only one TMU read per instruction */ + if ((a_peripherals & V3D_PERIPHERAL_TMU_READ) && + (b_peripherals & V3D_PERIPHERAL_TMU_READ)) { + return false; } - return false; + /* Only one TLB access per instruction */ + if ((a_peripherals & (V3D_PERIPHERAL_TLB_WRITE | + V3D_PERIPHERAL_TLB_READ)) && + (b_peripherals & (V3D_PERIPHERAL_TLB_WRITE | + V3D_PERIPHERAL_TLB_READ))) { + return false; + } + + return true; } /* Compute a bitmask of which rf registers are used between @@ -736,42 +946,67 @@ qpu_raddrs_used(const struct v3d_qpu_instr *a, uint64_t raddrs_used = 0; if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A)) raddrs_used |= (1ll << a->raddr_a); - if (!a->sig.small_imm && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B)) + if (!a->sig.small_imm_b && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B)) raddrs_used |= (1ll << a->raddr_b); if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A)) raddrs_used |= (1ll << b->raddr_a); - if (!b->sig.small_imm && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) + if (!b->sig.small_imm_b && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) raddrs_used |= (1ll << b->raddr_b); return raddrs_used; } -/* Take two instructions and attempt to merge their raddr fields - * into one merged instruction. Returns false if the two instructions - * access more than two different rf registers between them, or more - * than one rf register and one small immediate. +/* Takes two instructions and attempts to merge their raddr fields (including + * small immediates) into one merged instruction. 
For V3D 4.x, returns false + * if the two instructions access more than two different rf registers between + * them, or more than one rf register and one small immediate. For 7.x returns + * false if both instructions use small immediates. */ static bool qpu_merge_raddrs(struct v3d_qpu_instr *result, const struct v3d_qpu_instr *add_instr, - const struct v3d_qpu_instr *mul_instr) + const struct v3d_qpu_instr *mul_instr, + const struct v3d_device_info *devinfo) { + if (devinfo->ver >= 71) { + assert(add_instr->sig.small_imm_a + + add_instr->sig.small_imm_b <= 1); + assert(add_instr->sig.small_imm_c + + add_instr->sig.small_imm_d == 0); + assert(mul_instr->sig.small_imm_a + + mul_instr->sig.small_imm_b == 0); + assert(mul_instr->sig.small_imm_c + + mul_instr->sig.small_imm_d <= 1); + + result->sig.small_imm_a = add_instr->sig.small_imm_a; + result->sig.small_imm_b = add_instr->sig.small_imm_b; + result->sig.small_imm_c = mul_instr->sig.small_imm_c; + result->sig.small_imm_d = mul_instr->sig.small_imm_d; + + return (result->sig.small_imm_a + + result->sig.small_imm_b + + result->sig.small_imm_c + + result->sig.small_imm_d) <= 1; + } + + assert(devinfo->ver == 42); + uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr); int naddrs = util_bitcount64(raddrs_used); if (naddrs > 2) return false; - if ((add_instr->sig.small_imm || mul_instr->sig.small_imm)) { + if ((add_instr->sig.small_imm_b || mul_instr->sig.small_imm_b)) { if (naddrs > 1) return false; - if (add_instr->sig.small_imm && mul_instr->sig.small_imm) + if (add_instr->sig.small_imm_b && mul_instr->sig.small_imm_b) if (add_instr->raddr_b != mul_instr->raddr_b) return false; - result->sig.small_imm = true; - result->raddr_b = add_instr->sig.small_imm ? + result->sig.small_imm_b = true; + result->raddr_b = add_instr->sig.small_imm_b ? 
add_instr->raddr_b : mul_instr->raddr_b; } @@ -782,23 +1017,23 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result, raddrs_used &= ~(1ll << raddr_a); result->raddr_a = raddr_a; - if (!result->sig.small_imm) { + if (!result->sig.small_imm_b) { if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) && raddr_a == add_instr->raddr_b) { - if (add_instr->alu.add.a == V3D_QPU_MUX_B) - result->alu.add.a = V3D_QPU_MUX_A; - if (add_instr->alu.add.b == V3D_QPU_MUX_B && + if (add_instr->alu.add.a.mux == V3D_QPU_MUX_B) + result->alu.add.a.mux = V3D_QPU_MUX_A; + if (add_instr->alu.add.b.mux == V3D_QPU_MUX_B && v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) { - result->alu.add.b = V3D_QPU_MUX_A; + result->alu.add.b.mux = V3D_QPU_MUX_A; } } if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) && raddr_a == mul_instr->raddr_b) { - if (mul_instr->alu.mul.a == V3D_QPU_MUX_B) - result->alu.mul.a = V3D_QPU_MUX_A; - if (mul_instr->alu.mul.b == V3D_QPU_MUX_B && + if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_B) + result->alu.mul.a.mux = V3D_QPU_MUX_A; + if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_B && v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) { - result->alu.mul.b = V3D_QPU_MUX_A; + result->alu.mul.b.mux = V3D_QPU_MUX_A; } } } @@ -809,20 +1044,20 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result, result->raddr_b = raddr_b; if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) && raddr_b == add_instr->raddr_a) { - if (add_instr->alu.add.a == V3D_QPU_MUX_A) - result->alu.add.a = V3D_QPU_MUX_B; - if (add_instr->alu.add.b == V3D_QPU_MUX_A && + if (add_instr->alu.add.a.mux == V3D_QPU_MUX_A) + result->alu.add.a.mux = V3D_QPU_MUX_B; + if (add_instr->alu.add.b.mux == V3D_QPU_MUX_A && v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) { - result->alu.add.b = V3D_QPU_MUX_B; + result->alu.add.b.mux = V3D_QPU_MUX_B; } } if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) && raddr_b == mul_instr->raddr_a) { - if (mul_instr->alu.mul.a == V3D_QPU_MUX_A) - result->alu.mul.a = V3D_QPU_MUX_B; - if (mul_instr->alu.mul.b == V3D_QPU_MUX_A && + if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_A) + result->alu.mul.a.mux = V3D_QPU_MUX_B; + if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_A && v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) { - result->alu.mul.b = V3D_QPU_MUX_B; + result->alu.mul.b.mux = V3D_QPU_MUX_B; } } @@ -855,7 +1090,8 @@ add_op_as_mul_op(enum v3d_qpu_add_op op) } static void -qpu_convert_add_to_mul(struct v3d_qpu_instr *inst) +qpu_convert_add_to_mul(const struct v3d_device_info *devinfo, + struct v3d_qpu_instr *inst) { STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add)); assert(inst->alu.add.op != V3D_QPU_A_NOP); @@ -871,6 +1107,87 @@ qpu_convert_add_to_mul(struct v3d_qpu_instr *inst) inst->flags.ac = V3D_QPU_COND_NONE; inst->flags.apf = V3D_QPU_PF_NONE; inst->flags.auf = V3D_QPU_UF_NONE; + + inst->alu.mul.output_pack = inst->alu.add.output_pack; + + inst->alu.mul.a.unpack = inst->alu.add.a.unpack; + inst->alu.mul.b.unpack = inst->alu.add.b.unpack; + inst->alu.add.output_pack = V3D_QPU_PACK_NONE; + inst->alu.add.a.unpack = V3D_QPU_UNPACK_NONE; + inst->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; + + if (devinfo->ver >= 71) { + assert(!inst->sig.small_imm_c && !inst->sig.small_imm_d); + assert(inst->sig.small_imm_a + inst->sig.small_imm_b <= 1); + if (inst->sig.small_imm_a) { + inst->sig.small_imm_c = true; + inst->sig.small_imm_a = false; + } else if (inst->sig.small_imm_b) { + inst->sig.small_imm_d = true; + inst->sig.small_imm_b = false; + } + } +} + +static bool +can_do_mul_as_add(const struct v3d_device_info *devinfo, enum 
v3d_qpu_mul_op op) +{ + switch (op) { + case V3D_QPU_M_MOV: + case V3D_QPU_M_FMOV: + return devinfo->ver >= 71; + default: + return false; + } +} + +static enum v3d_qpu_mul_op +mul_op_as_add_op(enum v3d_qpu_mul_op op) +{ + switch (op) { + case V3D_QPU_M_MOV: + return V3D_QPU_A_MOV; + case V3D_QPU_M_FMOV: + return V3D_QPU_A_FMOV; + default: + unreachable("unexpected mov opcode"); + } +} + +static void +qpu_convert_mul_to_add(struct v3d_qpu_instr *inst) +{ + STATIC_ASSERT(sizeof(inst->alu.add) == sizeof(inst->alu.mul)); + assert(inst->alu.mul.op != V3D_QPU_M_NOP); + assert(inst->alu.add.op == V3D_QPU_A_NOP); + + memcpy(&inst->alu.add, &inst->alu.mul, sizeof(inst->alu.add)); + inst->alu.add.op = mul_op_as_add_op(inst->alu.mul.op); + inst->alu.mul.op = V3D_QPU_M_NOP; + + inst->flags.ac = inst->flags.mc; + inst->flags.apf = inst->flags.mpf; + inst->flags.auf = inst->flags.muf; + inst->flags.mc = V3D_QPU_COND_NONE; + inst->flags.mpf = V3D_QPU_PF_NONE; + inst->flags.muf = V3D_QPU_UF_NONE; + + inst->alu.add.output_pack = inst->alu.mul.output_pack; + inst->alu.add.a.unpack = inst->alu.mul.a.unpack; + inst->alu.add.b.unpack = inst->alu.mul.b.unpack; + inst->alu.mul.output_pack = V3D_QPU_PACK_NONE; + inst->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE; + inst->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; + + assert(!inst->sig.small_imm_a && !inst->sig.small_imm_b); + assert(inst->sig.small_imm_c + inst->sig.small_imm_d <= 1); + if (inst->sig.small_imm_c) { + inst->sig.small_imm_a = true; + inst->sig.small_imm_c = false; + } else if (inst->sig.small_imm_d) { + inst->sig.small_imm_b = true; + inst->sig.small_imm_d = false; + } } static bool @@ -909,20 +1226,20 @@ qpu_merge_inst(const struct v3d_device_info *devinfo, else if (a->alu.mul.op == V3D_QPU_M_NOP && can_do_add_as_mul(b->alu.add.op)) { mul_inst = *b; - qpu_convert_add_to_mul(&mul_inst); + qpu_convert_add_to_mul(devinfo, &mul_inst); merge.alu.mul = mul_inst.alu.mul; - merge.flags.mc = b->flags.ac; - merge.flags.mpf = b->flags.apf; - merge.flags.muf = b->flags.auf; + merge.flags.mc = mul_inst.flags.mc; + merge.flags.mpf = mul_inst.flags.mpf; + merge.flags.muf = mul_inst.flags.muf; add_instr = a; mul_instr = &mul_inst; } else if (a->alu.mul.op == V3D_QPU_M_NOP && can_do_add_as_mul(a->alu.add.op)) { mul_inst = *a; - qpu_convert_add_to_mul(&mul_inst); + qpu_convert_add_to_mul(devinfo, &mul_inst); merge = mul_inst; merge.alu.add = b->alu.add; @@ -938,22 +1255,62 @@ qpu_merge_inst(const struct v3d_device_info *devinfo, } } + struct v3d_qpu_instr add_inst; if (b->alu.mul.op != V3D_QPU_M_NOP) { - if (a->alu.mul.op != V3D_QPU_M_NOP) - return false; - merge.alu.mul = b->alu.mul; + if (a->alu.mul.op == V3D_QPU_M_NOP) { + merge.alu.mul = b->alu.mul; - merge.flags.mc = b->flags.mc; - merge.flags.mpf = b->flags.mpf; - merge.flags.muf = b->flags.muf; + merge.flags.mc = b->flags.mc; + merge.flags.mpf = b->flags.mpf; + merge.flags.muf = b->flags.muf; - mul_instr = b; - add_instr = a; + mul_instr = b; + add_instr = a; + } + /* If a's mul op is used but its add op is not, then see if we + * can convert either a's mul op or b's mul op to an add op + * so we can merge. 
+ */
+ else if (a->alu.add.op == V3D_QPU_A_NOP &&
+ can_do_mul_as_add(devinfo, b->alu.mul.op)) {
+ add_inst = *b;
+ qpu_convert_mul_to_add(&add_inst);
+
+ merge.alu.add = add_inst.alu.add;
+
+ merge.flags.ac = add_inst.flags.ac;
+ merge.flags.apf = add_inst.flags.apf;
+ merge.flags.auf = add_inst.flags.auf;
+
+ mul_instr = a;
+ add_instr = &add_inst;
+ } else if (a->alu.add.op == V3D_QPU_A_NOP &&
+ can_do_mul_as_add(devinfo, a->alu.mul.op)) {
+ add_inst = *a;
+ qpu_convert_mul_to_add(&add_inst);
+
+ merge = add_inst;
+ merge.alu.mul = b->alu.mul;
+
+ merge.flags.mc = b->flags.mc;
+ merge.flags.mpf = b->flags.mpf;
+ merge.flags.muf = b->flags.muf;
+
+ mul_instr = b;
+ add_instr = &add_inst;
+ } else {
+ return false;
+ }
 }
+ /* V3D 4.x and earlier use muxes to select the inputs for the ALUs and
+ * they have restrictions on the number of raddrs that can be addressed
+ * in a single instruction. In V3D 7.x, we don't have that restriction,
+ * but we are still limited to a single small immediate per instruction.
+ */
 if (add_instr && mul_instr &&
- !qpu_merge_raddrs(&merge, add_instr, mul_instr)) {
- return false;
+ !qpu_merge_raddrs(&merge, add_instr, mul_instr, devinfo)) {
+ return false;
 }
 merge.sig.thrsw |= b->sig.thrsw;
@@ -964,7 +1321,6 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
 merge.sig.ldtmu |= b->sig.ldtmu;
 merge.sig.ldvary |= b->sig.ldvary;
 merge.sig.ldvpm |= b->sig.ldvpm;
- merge.sig.small_imm |= b->sig.small_imm;
 merge.sig.ldtlb |= b->sig.ldtlb;
 merge.sig.ldtlbu |= b->sig.ldtlbu;
 merge.sig.ucb |= b->sig.ucb;
@@ -1047,24 +1403,25 @@ retry:
 * regfile A or B that was written to by the previous
 * instruction."
 */
- if (reads_too_soon_after_write(scoreboard, n->inst))
+ if (reads_too_soon_after_write(c->devinfo, scoreboard, n->inst))
 continue;
 if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst))
 continue;
- /* "A scoreboard wait must not occur in the first two
- * instructions of a fragment shader. This is either the
- * explicit Wait for Scoreboard signal or an implicit wait
- * with the first tile-buffer read or write instruction."
+ /* "Before doing a TLB access a scoreboard wait must have been
+ * done. This happens either on the first or last thread
+ * switch, depending on a setting (scb_wait_on_first_thrsw) in
+ * the shader state."
 */
- if (pixel_scoreboard_too_soon(scoreboard, inst))
+ if (pixel_scoreboard_too_soon(c, scoreboard, inst))
 continue;
- /* ldunif and ldvary both write r5, but ldunif does so a tick
- * sooner. If the ldvary's r5 wasn't used, then ldunif might
+ /* ldunif and ldvary both write the same register (r5 for v42
+ * and below, rf0 for v71), but ldunif does so a tick sooner.
+ * If the ldvary's register wasn't used, then ldunif might
 * otherwise get scheduled so ldunif and ldvary try to update
- * r5 in the same tick.
+ * the register in the same tick.
 */
 if ((inst->sig.ldunif || inst->sig.ldunifa) &&
 scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
@@ -1131,24 +1488,54 @@ retry:
 continue;
 }
- /* Don't merge in something that will lock the TLB.
- * Hopwefully what we have in inst will release some
- * other instructions, allowing us to delay the
- * TLB-locking instruction until later.
+ /* Don't merge TLB instructions before we have acquired
+ * the scoreboard lock.
 */
- if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst))
+ if (pixel_scoreboard_too_soon(c, scoreboard, inst))
 continue;
- /* When we succesfully pair up an ldvary we then try
+ /* When we successfully pair up an ldvary we then try
 * to merge it into the previous instruction if
 * possible to improve pipelining. Don't pick up the
 * ldvary now if the follow-up fixup would place
 * it in the delay slots of a thrsw, which is not
 * allowed and would prevent the fixup from being
- * successul.
+ * successful. In V3D 7.x we can allow this to happen
+ * as long as it is not the last delay slot.
 */
- if (inst->sig.ldvary &&
- scoreboard->last_thrsw_tick + 2 >= scoreboard->tick - 1) {
+ if (inst->sig.ldvary) {
+ if (c->devinfo->ver == 42 &&
+ scoreboard->last_thrsw_tick + 2 >=
+ scoreboard->tick - 1) {
+ continue;
+ }
+ if (c->devinfo->ver >= 71 &&
+ scoreboard->last_thrsw_tick + 2 ==
+ scoreboard->tick - 1) {
+ continue;
+ }
+ }
+
+ /* We can emit a new tmu lookup with a previous ldtmu
+ * if doing this would free just enough space in the
+ * TMU output fifo so we don't overflow, however, this
+ * is only safe if the ldtmu cannot stall.
+ *
+ * A ldtmu can stall if it is not the first following a
+ * thread switch and corresponds to the first word of a
+ * read request.
+ *
+ * FIXME: For now we forbid pairing up a new lookup
+ * with a previous ldtmu that is not the first after a
+ * thrsw if that could overflow the TMU output fifo
+ * regardless of whether the ldtmu is reading the first
+ * word of a TMU result or not, since we don't track
+ * this aspect in the compiler yet.
+ */
+ if (prev_inst->inst->qpu.sig.ldtmu &&
+ !scoreboard->first_ldtmu_after_thrsw &&
+ (scoreboard->pending_ldtmu_count +
+ n->inst->ldtmu_count > 16 / c->threads)) {
 continue;
 }
@@ -1161,7 +1548,7 @@ retry:
 int prio = get_instruction_priority(c->devinfo, inst);
- if (mux_read_stalls(scoreboard, inst)) {
+ if (read_stalls(c->devinfo, scoreboard, inst)) {
 /* Don't merge an instruction that stalls */
 if (prev_inst)
 continue;
@@ -1225,7 +1612,7 @@ update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
 {
 if (v3d_qpu_magic_waddr_is_sfu(waddr))
 scoreboard->last_magic_sfu_write_tick = scoreboard->tick;
- else if (devinfo->ver >= 40 && waddr == V3D_QPU_WADDR_UNIFA)
+ else if (waddr == V3D_QPU_WADDR_UNIFA)
 scoreboard->last_unifa_write_tick = scoreboard->tick;
 }
@@ -1240,10 +1627,87 @@ update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard,
 }
 static void
+update_scoreboard_tmu_tracking(struct choose_scoreboard *scoreboard,
+ const struct qinst *inst)
+{
+ /* Track if we have seen any ldtmu after the last thread switch */
+ if (scoreboard->tick == scoreboard->last_thrsw_tick + 2)
+ scoreboard->first_ldtmu_after_thrsw = true;
+
+ /* Track the number of pending ldtmu instructions for outstanding
+ * TMU lookups.
+ */ + scoreboard->pending_ldtmu_count += inst->ldtmu_count; + if (inst->qpu.sig.ldtmu) { + assert(scoreboard->pending_ldtmu_count > 0); + scoreboard->pending_ldtmu_count--; + scoreboard->first_ldtmu_after_thrsw = false; + } +} + +static void +set_has_rf0_flops_conflict(struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst, + const struct v3d_device_info *devinfo) +{ + if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick && + v3d_qpu_sig_writes_address(devinfo, &inst->sig) && + !inst->sig_magic) { + scoreboard->has_rf0_flops_conflict = true; + } +} + +static void +update_scoreboard_for_rf0_flops(struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst, + const struct v3d_device_info *devinfo) +{ + if (devinfo->ver < 71) + return; + + /* Thread switch restrictions: + * + * At the point of a thread switch or thread end (when the actual + * thread switch or thread end happens, not when the signalling + * instruction is processed): + * + * - If the most recent write to rf0 was from a ldunif, ldunifa, or + * ldvary instruction in which another signal also wrote to the + * register file, and the final instruction of the thread section + * contained a signal which wrote to the register file, then the + * value of rf0 is undefined at the start of the new section + * + * Here we use the scoreboard to track if our last rf0 implicit write + * happens at the same time that another signal writes the register + * file (has_rf0_flops_conflict). We will use that information when + * scheduling thrsw instructions to avoid putting anything in their + * last delay slot which has a signal that writes to the register file. + */ + + /* Reset tracking if we have an explicit rf0 write or we are starting + * a new thread section. + */ + if (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) || + scoreboard->tick - scoreboard->last_thrsw_tick == 3) { + scoreboard->last_implicit_rf0_write_tick = -10; + scoreboard->has_rf0_flops_conflict = false; + } + + if (v3d_qpu_writes_rf0_implicitly(devinfo, inst)) { + scoreboard->last_implicit_rf0_write_tick = inst->sig.ldvary ? 
+ scoreboard->tick + 1 : scoreboard->tick; + } + + set_has_rf0_flops_conflict(scoreboard, inst, devinfo); +} + +static void update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, - const struct v3d_qpu_instr *inst, + const struct qinst *qinst, const struct v3d_device_info *devinfo) { + const struct v3d_qpu_instr *inst = &qinst->qpu; + if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) return; @@ -1271,11 +1735,18 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, } } + if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && inst->sig_magic) { + update_scoreboard_for_magic_waddr(scoreboard, + inst->sig_addr, + devinfo); + } + if (inst->sig.ldvary) scoreboard->last_ldvary_tick = scoreboard->tick; - if (qpu_inst_is_tlb(inst)) - scoreboard->tlb_locked = true; + update_scoreboard_for_rf0_flops(scoreboard, inst, devinfo); + + update_scoreboard_tmu_tracking(scoreboard, qinst); } static void @@ -1352,23 +1823,25 @@ instruction_latency(const struct v3d_device_info *devinfo, after_inst->type != V3D_QPU_INSTR_TYPE_ALU) return latency; - if (before_inst->alu.add.magic_write) { + if (v3d_qpu_instr_is_sfu(before_inst)) + return 2; + + if (before_inst->alu.add.op != V3D_QPU_A_NOP && + before_inst->alu.add.magic_write) { latency = MAX2(latency, magic_waddr_latency(devinfo, before_inst->alu.add.waddr, after_inst)); } - if (before_inst->alu.mul.magic_write) { + if (before_inst->alu.mul.op != V3D_QPU_M_NOP && + before_inst->alu.mul.magic_write) { latency = MAX2(latency, magic_waddr_latency(devinfo, before_inst->alu.mul.waddr, after_inst)); } - if (v3d_qpu_instr_is_sfu(before_inst)) - return 2; - return latency; } @@ -1437,7 +1910,7 @@ insert_scheduled_instruction(struct v3d_compile *c, { list_addtail(&inst->link, &block->instructions); - update_scoreboard_for_chosen(scoreboard, &inst->qpu, c->devinfo); + update_scoreboard_for_chosen(scoreboard, inst, c->devinfo); c->qpu_inst_count++; scoreboard->tick++; } @@ -1464,16 +1937,13 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, { const struct v3d_qpu_instr *inst = &qinst->qpu; - /* Only TLB Z writes are prohibited in the last slot, but we don't - * have those flagged so prohibit all TLB ops for now. - */ - if (slot == 2 && qpu_inst_is_tlb(inst)) + if (slot == 2 && qinst->is_tlb_z_write) return false; if (slot > 0 && qinst->uniform != ~0) return false; - if (v3d_qpu_uses_vpm(inst)) + if (c->devinfo->ver == 42 && v3d_qpu_waits_vpm(inst)) return false; if (inst->sig.ldvary) @@ -1481,36 +1951,64 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { /* GFXH-1625: TMUWT not allowed in the final instruction. */ - if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT) + if (c->devinfo->ver == 42 && slot == 2 && + inst->alu.add.op == V3D_QPU_A_TMUWT) { return false; + } - /* No writing physical registers at the end. */ - if (!inst->alu.add.magic_write || - !inst->alu.mul.magic_write) { - return false; + if (c->devinfo->ver == 42) { + /* No writing physical registers at the end. 
*/ + bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP; + bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP; + if ((!add_is_nop && !inst->alu.add.magic_write) || + (!mul_is_nop && !inst->alu.mul.magic_write)) { + return false; + } + + if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) && + !inst->sig_magic) { + return false; + } } - if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF) - return false; + if (c->devinfo->ver >= 71) { + /* The thread end instruction must not write to the + * register file via the add/mul ALUs. + */ + if (slot == 0 && + (!inst->alu.add.magic_write || + !inst->alu.mul.magic_write)) { + return false; + } + } - /* RF0-2 might be overwritten during the delay slots by - * fragment shader setup. - */ - if (inst->raddr_a < 3 && - (inst->alu.add.a == V3D_QPU_MUX_A || - inst->alu.add.b == V3D_QPU_MUX_A || - inst->alu.mul.a == V3D_QPU_MUX_A || - inst->alu.mul.b == V3D_QPU_MUX_A)) { - return false; + if (c->devinfo->ver == 42) { + /* RF0-2 might be overwritten during the delay slots by + * fragment shader setup. + */ + if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A)) + return false; + + if (inst->raddr_b < 3 && + !inst->sig.small_imm_b && + v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) { + return false; + } } - if (inst->raddr_b < 3 && - !inst->sig.small_imm && - (inst->alu.add.a == V3D_QPU_MUX_B || - inst->alu.add.b == V3D_QPU_MUX_B || - inst->alu.mul.a == V3D_QPU_MUX_B || - inst->alu.mul.b == V3D_QPU_MUX_B)) { - return false; + if (c->devinfo->ver >= 71) { + /* RF2-3 might be overwritten during the delay slots by + * fragment shader setup. + */ + if (v3d71_qpu_reads_raddr(inst, 2) || + v3d71_qpu_reads_raddr(inst, 3)) { + return false; + } + + if (v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 2) || + v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 3)) { + return false; + } } } @@ -1526,6 +2024,7 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, */ static bool qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c, + struct choose_scoreboard *scoreboard, const struct qinst *qinst, uint32_t slot) { @@ -1533,15 +2032,19 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c, * thread. The simulator complains for safety, though it * would only occur for dead code in our case. */ - if (slot > 0 && - qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && - (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) || - v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) { - return false; + if (slot > 0) { + if (c->devinfo->ver == 42 && v3d_qpu_instr_is_legacy_sfu(&qinst->qpu)) + return false; + if (c->devinfo->ver >= 71 && v3d_qpu_instr_is_sfu(&qinst->qpu)) + return false; } - if (slot > 0 && qinst->qpu.sig.ldvary) - return false; + if (qinst->qpu.sig.ldvary) { + if (c->devinfo->ver == 42 && slot > 0) + return false; + if (c->devinfo->ver >= 71 && slot == 2) + return false; + } /* unifa and the following 3 instructions can't overlap a * thread switch/end. 
The docs further clarify that this means @@ -1560,6 +2063,17 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c, if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu)) return false; + /* See comment when we set has_rf0_flops_conflict for details */ + if (c->devinfo->ver >= 71 && + slot == 2 && + v3d_qpu_sig_writes_address(c->devinfo, &qinst->qpu.sig) && + !qinst->qpu.sig_magic) { + if (scoreboard->has_rf0_flops_conflict) + return false; + if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick) + return false; + } + return true; } @@ -1579,7 +2093,7 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c, assert(slot <= 2); /* We merge thrsw instructions back into the instruction stream - * manually, so any instructions scheduled after a thrsw shold be + * manually, so any instructions scheduled after a thrsw should be * in the actual delay slots and not in the same slot as the thrsw. */ assert(slot >= 1); @@ -1592,7 +2106,7 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c, * also apply to instructions scheduled after the thrsw that we want * to place in its delay slots. */ - if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot)) + if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard, qinst, slot)) return false; /* TLB access is disallowed until scoreboard wait is executed, which @@ -1648,6 +2162,14 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c, if (v3d_qpu_writes_flags(&qinst->qpu)) return false; + /* TSY sync ops materialize at the point of the next thread switch, + * therefore, if we have a TSY sync right after a thread switch, we + * cannot place it in its delay slots, or we would be moving the sync + * to the thrsw before it instead. + */ + if (qinst->qpu.alu.add.op == V3D_QPU_A_BARRIERID) + return false; + return true; } @@ -1656,15 +2178,11 @@ valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard struct qinst *qinst, int instructions_in_sequence, bool is_thrend) { - /* No emitting our thrsw while the previous thrsw hasn't happened yet. */ - if (scoreboard->last_thrsw_tick + 3 > - scoreboard->tick - instructions_in_sequence) { - return false; - } - for (int slot = 0; slot < instructions_in_sequence; slot++) { - if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot)) + if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard, + qinst, slot)) { return false; + } if (is_thrend && !qpu_inst_valid_in_thrend_slot(c, qinst, slot)) { @@ -1714,26 +2232,77 @@ emit_thrsw(struct v3d_compile *c, /* Find how far back into previous instructions we can put the THRSW. */ int slots_filled = 0; + int invalid_sig_count = 0; + int invalid_seq_count = 0; + bool last_thrsw_after_invalid_ok = false; struct qinst *merge_inst = NULL; vir_for_each_inst_rev(prev_inst, block) { - struct v3d_qpu_sig sig = prev_inst->qpu.sig; - sig.thrsw = true; - uint32_t packed_sig; - - if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) + /* No emitting our thrsw while the previous thrsw hasn't + * happened yet. + */ + if (scoreboard->last_thrsw_tick + 3 > + scoreboard->tick - (slots_filled + 1)) { break; + } + if (!valid_thrsw_sequence(c, scoreboard, prev_inst, slots_filled + 1, is_thrend)) { - break; + /* Even if the current sequence isn't valid, we may + * be able to get a valid sequence by trying to move the + * thrsw earlier, so keep going. 
+ */ + invalid_seq_count++; + goto cont_block; + } + + struct v3d_qpu_sig sig = prev_inst->qpu.sig; + sig.thrsw = true; + uint32_t packed_sig; + if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) { + /* If we can't merge the thrsw here because of signal + * incompatibility, keep going, we might be able to + * merge it in an earlier instruction. + */ + invalid_sig_count++; + goto cont_block; } + /* For last thrsw we need 2 consecutive slots that are + * thrsw compatible, so if we have previously jumped over + * an incompatible signal, flag that we have found the first + * valid slot here and keep going. + */ + if (inst->is_last_thrsw && invalid_sig_count > 0 && + !last_thrsw_after_invalid_ok) { + last_thrsw_after_invalid_ok = true; + invalid_sig_count++; + goto cont_block; + } + + /* We can merge the thrsw in this instruction */ + last_thrsw_after_invalid_ok = false; + invalid_sig_count = 0; + invalid_seq_count = 0; merge_inst = prev_inst; + +cont_block: if (++slots_filled == 3) break; } + /* If we jumped over a signal incompatibility and did not manage to + * merge the thrsw in the end, we need to adjust slots filled to match + * the last valid merge point. + */ + assert((invalid_sig_count == 0 && invalid_seq_count == 0) || + slots_filled >= invalid_sig_count + invalid_seq_count); + if (invalid_sig_count > 0) + slots_filled -= invalid_sig_count; + if (invalid_seq_count > 0) + slots_filled -= invalid_seq_count; + bool needs_free = false; if (merge_inst) { merge_inst->qpu.sig.thrsw = true; @@ -1747,6 +2316,8 @@ emit_thrsw(struct v3d_compile *c, merge_inst = inst; } + scoreboard->first_thrsw_emitted = true; + /* If we're emitting the last THRSW (other than program end), then * signal that to the HW by emitting two THRSWs in a row. */ @@ -1758,6 +2329,7 @@ emit_thrsw(struct v3d_compile *c, struct qinst *second_inst = (struct qinst *)merge_inst->link.next; second_inst->qpu.sig.thrsw = true; + scoreboard->last_thrsw_emitted = true; } /* Make sure the thread end executes within the program lifespan */ @@ -1811,10 +2383,11 @@ emit_branch(struct v3d_compile *c, assert(scoreboard->last_branch_tick + 3 < branch_tick); assert(scoreboard->last_unifa_write_tick + 3 < branch_tick); - /* Can't place a branch with msfign != 0 and cond != 0,2,3 after + /* V3D 4.x can't place a branch with msfign != 0 and cond != 0,2,3 after * setmsf. */ bool is_safe_msf_branch = + c->devinfo->ver >= 71 || inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE || inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS || inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 || @@ -1851,6 +2424,14 @@ emit_branch(struct v3d_compile *c, break; } + /* Do not move up a branch if it can disrupt an ldvary sequence + * as that can cause stomping of the r5 register. + */ + if (scoreboard->last_ldvary_tick + 2 >= + branch_tick - slots_filled) { + break; + } + /* Can't move a conditional branch before the instruction * that writes the flags for its condition. 
*/ @@ -1890,46 +2471,72 @@ emit_branch(struct v3d_compile *c, } static bool -alu_reads_register(struct v3d_qpu_instr *inst, +alu_reads_register(const struct v3d_device_info *devinfo, + struct v3d_qpu_instr *inst, bool add, bool magic, uint32_t index) { uint32_t num_src; - enum v3d_qpu_mux mux_a, mux_b; - - if (add) { + if (add) num_src = v3d_qpu_add_op_num_src(inst->alu.add.op); - mux_a = inst->alu.add.a; - mux_b = inst->alu.add.b; - } else { + else num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op); - mux_a = inst->alu.mul.a; - mux_b = inst->alu.mul.b; - } - for (int i = 0; i < num_src; i++) { - if (magic) { - if (i == 0 && mux_a == index) - return true; - if (i == 1 && mux_b == index) - return true; + if (devinfo->ver == 42) { + enum v3d_qpu_mux mux_a, mux_b; + if (add) { + mux_a = inst->alu.add.a.mux; + mux_b = inst->alu.add.b.mux; } else { - if (i == 0 && mux_a == V3D_QPU_MUX_A && - inst->raddr_a == index) { - return true; - } - if (i == 0 && mux_a == V3D_QPU_MUX_B && - inst->raddr_b == index) { - return true; - } - if (i == 1 && mux_b == V3D_QPU_MUX_A && - inst->raddr_a == index) { - return true; - } - if (i == 1 && mux_b == V3D_QPU_MUX_B && - inst->raddr_b == index) { - return true; + mux_a = inst->alu.mul.a.mux; + mux_b = inst->alu.mul.b.mux; + } + + for (int i = 0; i < num_src; i++) { + if (magic) { + if (i == 0 && mux_a == index) + return true; + if (i == 1 && mux_b == index) + return true; + } else { + if (i == 0 && mux_a == V3D_QPU_MUX_A && + inst->raddr_a == index) { + return true; + } + if (i == 0 && mux_a == V3D_QPU_MUX_B && + inst->raddr_b == index) { + return true; + } + if (i == 1 && mux_b == V3D_QPU_MUX_A && + inst->raddr_a == index) { + return true; + } + if (i == 1 && mux_b == V3D_QPU_MUX_B && + inst->raddr_b == index) { + return true; + } } } + + return false; + } + + assert(devinfo->ver >= 71); + assert(!magic); + + uint32_t raddr_a, raddr_b; + if (add) { + raddr_a = inst->alu.add.a.raddr; + raddr_b = inst->alu.add.b.raddr; + } else { + raddr_a = inst->alu.mul.a.raddr; + raddr_b = inst->alu.mul.b.raddr; + } + + for (int i = 0; i < num_src; i++) { + if (i == 0 && raddr_a == index) + return true; + if (i == 1 && raddr_b == index) + return true; } return false; @@ -1964,7 +2571,9 @@ fixup_pipelined_ldvary(struct v3d_compile *c, struct qblock *block, struct v3d_qpu_instr *inst) { - /* We only call this if we have successfuly merged an ldvary into a + const struct v3d_device_info *devinfo = c->devinfo; + + /* We only call this if we have successfully merged an ldvary into a * previous instruction. */ assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); @@ -1976,9 +2585,20 @@ fixup_pipelined_ldvary(struct v3d_compile *c, * the ldvary destination, if it does, then moving the ldvary before * it would overwrite it. */ - if (alu_reads_register(inst, true, ldvary_magic, ldvary_index)) + if (alu_reads_register(devinfo, inst, true, ldvary_magic, ldvary_index)) return false; - if (alu_reads_register(inst, false, ldvary_magic, ldvary_index)) + if (alu_reads_register(devinfo, inst, false, ldvary_magic, ldvary_index)) + return false; + + /* The implicit ldvary destination may not be written to by a signal + * in the instruction following ldvary. Since we are planning to move + * ldvary to the previous instruction, this means we need to check if + * the current instruction has any other signal that could create this + * conflict. The only other signal that can write to the implicit + * ldvary destination that is compatible with ldvary in the same + * instruction is ldunif. 
+ */ + if (inst->sig.ldunif) return false; /* The previous instruction can't write to the same destination as the @@ -2003,7 +2623,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c, } /* The previous instruction cannot have a conflicting signal */ - if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig)) + if (v3d_qpu_sig_writes_address(devinfo, &prev->qpu.sig)) + return false; + + uint32_t sig; + struct v3d_qpu_sig new_sig = prev->qpu.sig; + new_sig.ldvary = true; + if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig)) return false; /* The previous instruction cannot use flags since ldvary uses the @@ -2016,9 +2642,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c, /* We can't put an ldvary in the delay slots of a thrsw. We should've * prevented this when pairing up the ldvary with another instruction - * and flagging it for a fixup. + * and flagging it for a fixup. In V3D 7.x this is limited only to the + * second delay slot. */ - assert(scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1); + assert((devinfo->ver == 42 && + scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1) || + (devinfo->ver >= 71 && + scoreboard->last_thrsw_tick + 2 != scoreboard->tick - 1)); /* Move the ldvary to the previous instruction and remove it from the * current one. @@ -2032,14 +2662,25 @@ fixup_pipelined_ldvary(struct v3d_compile *c, inst->sig_magic = false; inst->sig_addr = 0; - /* By moving ldvary to the previous instruction we make it update - * r5 in the current one, so nothing else in it should write r5. + /* Update rf0 flops tracking for new ldvary delayed rf0 write tick */ + if (devinfo->ver >= 71) { + scoreboard->last_implicit_rf0_write_tick = scoreboard->tick; + set_has_rf0_flops_conflict(scoreboard, inst, devinfo); + } + + /* By moving ldvary to the previous instruction we make it update r5 + * (rf0 for ver >= 71) in the current one, so nothing else in it + * should write this register. + * * This should've been prevented by our depedency tracking, which * would not allow ldvary to be paired up with an instruction that - * writes r5 (since our dependency tracking doesn't know that the - * ldvary write r5 happens in the next instruction). + * writes r5/rf0 (since our dependency tracking doesn't know that the + * ldvary write to r5/rf0 happens in the next instruction). 
*/ - assert(!v3d_qpu_writes_r5(c->devinfo, inst)); + assert(!v3d_qpu_writes_r5(devinfo, inst)); + assert(devinfo->ver == 42 || + (!v3d_qpu_writes_rf0_implicitly(devinfo, inst) && + !v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0))); return true; } @@ -2102,6 +2743,9 @@ schedule_instructions(struct v3d_compile *c, merge->inst->uniform; } + chosen->inst->ldtmu_count += + merge->inst->ldtmu_count; + if (debug) { fprintf(stderr, "t=%4d: merging: ", time); @@ -2127,7 +2771,7 @@ schedule_instructions(struct v3d_compile *c, } } } - if (mux_read_stalls(scoreboard, inst)) + if (read_stalls(c->devinfo, scoreboard, inst)) c->qpu_inst_stalled_count++; } @@ -2351,6 +2995,8 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c) scoreboard.last_branch_tick = -10; scoreboard.last_setmsf_tick = -10; scoreboard.last_stallable_sfu_tick = -10; + scoreboard.first_ldtmu_after_thrsw = true; + scoreboard.last_implicit_rf0_write_tick = - 10; if (debug) { fprintf(stderr, "Pre-schedule instructions\n"); diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c index ec9ed66650c..538b247e3e0 100644 --- a/src/broadcom/compiler/qpu_validate.c +++ b/src/broadcom/compiler/qpu_validate.c @@ -41,6 +41,7 @@ struct v3d_qpu_validate_state { int last_sfu_write; int last_branch_ip; int last_thrsw_ip; + int first_tlb_z_write; /* Set when we've found the last-THRSW signal, or if we were started * in single-segment mode. @@ -110,11 +111,58 @@ static void qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) { const struct v3d_device_info *devinfo = state->c->devinfo; + + if (qinst->is_tlb_z_write && state->ip < state->first_tlb_z_write) + state->first_tlb_z_write = state->ip; + const struct v3d_qpu_instr *inst = &qinst->qpu; + if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH && + state->first_tlb_z_write >= 0 && + state->ip > state->first_tlb_z_write && + inst->branch.msfign != V3D_QPU_MSFIGN_NONE && + inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS && + inst->branch.cond != V3D_QPU_BRANCH_COND_A0 && + inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) { + fail_instr(state, "Implicit branch MSF read after TLB Z write"); + } + if (inst->type != V3D_QPU_INSTR_TYPE_ALU) return; + if (inst->alu.add.op == V3D_QPU_A_SETMSF && + state->first_tlb_z_write >= 0 && + state->ip > state->first_tlb_z_write) { + fail_instr(state, "SETMSF after TLB Z write"); + } + + if (state->first_tlb_z_write >= 0 && + state->ip > state->first_tlb_z_write && + inst->alu.add.op == V3D_QPU_A_MSF) { + fail_instr(state, "MSF read after TLB Z write"); + } + + if (devinfo->ver < 71) { + if (inst->sig.small_imm_a || inst->sig.small_imm_c || + inst->sig.small_imm_d) { + fail_instr(state, "small imm a/c/d added after V3D 7.1"); + } + } else { + if ((inst->sig.small_imm_a || inst->sig.small_imm_b) && + !vir_is_add(qinst)) { + fail_instr(state, "small imm a/b used but no ADD inst"); + } + if ((inst->sig.small_imm_c || inst->sig.small_imm_d) && + !vir_is_mul(qinst)) { + fail_instr(state, "small imm c/d used but no MUL inst"); + } + if (inst->sig.small_imm_a + inst->sig.small_imm_b + + inst->sig.small_imm_c + inst->sig.small_imm_d > 1) { + fail_instr(state, "only one small immediate can be " + "enabled per instruction"); + } + } + /* LDVARY writes r5 two instructions later and LDUNIF writes * r5 one instruction later, which is illegal to have * together. 
@@ -128,7 +176,7 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) * * FIXME: This would not check correctly for V3D 4.2 versions lower * than V3D 4.2.14, but that is not a real issue because the simulator - * will still catch this, and we are not really targetting any such + * will still catch this, and we are not really targeting any such * versions anyway. */ if (state->c->devinfo->ver < 42) { @@ -194,8 +242,14 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) "SFU write started during THRSW delay slots "); } - if (inst->sig.ldvary) - fail_instr(state, "LDVARY during THRSW delay slots"); + if (inst->sig.ldvary) { + if (devinfo->ver == 42) + fail_instr(state, "LDVARY during THRSW delay slots"); + if (devinfo->ver >= 71 && + state->ip - state->last_thrsw_ip == 2) { + fail_instr(state, "LDVARY in 2nd THRSW delay slot"); + } + } } (void)qpu_magic_waddr_matches; /* XXX */ @@ -222,7 +276,7 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) vpm_writes + tlb_writes + tsy_writes + - inst->sig.ldtmu + + (devinfo->ver == 42 ? inst->sig.ldtmu : 0) + inst->sig.ldtlb + inst->sig.ldvpm + inst->sig.ldtlbu > 1) { @@ -262,17 +316,48 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) inst->type == V3D_QPU_INSTR_TYPE_ALU) { if ((inst->alu.add.op != V3D_QPU_A_NOP && !inst->alu.add.magic_write)) { - fail_instr(state, "RF write after THREND"); + if (devinfo->ver == 42) { + fail_instr(state, "RF write after THREND"); + } else if (devinfo->ver >= 71) { + if (state->last_thrsw_ip - state->ip == 0) { + fail_instr(state, + "ADD RF write at THREND"); + } + if (inst->alu.add.waddr == 2 || + inst->alu.add.waddr == 3) { + fail_instr(state, + "RF2-3 write after THREND"); + } + } } if ((inst->alu.mul.op != V3D_QPU_M_NOP && !inst->alu.mul.magic_write)) { - fail_instr(state, "RF write after THREND"); + if (devinfo->ver == 42) { + fail_instr(state, "RF write after THREND"); + } else if (devinfo->ver >= 71) { + if (state->last_thrsw_ip - state->ip == 0) { + fail_instr(state, + "MUL RF write at THREND"); + } + + if (inst->alu.mul.waddr == 2 || + inst->alu.mul.waddr == 3) { + fail_instr(state, + "RF2-3 write after THREND"); + } + } } if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && !inst->sig_magic) { - fail_instr(state, "RF write after THREND"); + if (devinfo->ver == 42) { + fail_instr(state, "RF write after THREND"); + } else if (devinfo->ver >= 71 && + (inst->sig_addr == 2 || + inst->sig_addr == 3)) { + fail_instr(state, "RF2-3 write after THREND"); + } } /* GFXH-1625: No TMUWT in the last instruction */ @@ -312,7 +397,7 @@ qpu_validate(struct v3d_compile *c) * keep compiling the validation code to make sure it doesn't get * broken. 
*/ -#ifndef DEBUG +#if !MESA_DEBUG return; #endif @@ -321,6 +406,7 @@ qpu_validate(struct v3d_compile *c) .last_sfu_write = -10, .last_thrsw_ip = -10, .last_branch_ip = -10, + .first_tlb_z_write = INT_MAX, .ip = 0, .last_thrsw_found = !c->last_thrsw, diff --git a/src/broadcom/compiler/v3d33_tex.c b/src/broadcom/compiler/v3d33_tex.c deleted file mode 100644 index b933635f6fe..00000000000 --- a/src/broadcom/compiler/v3d33_tex.c +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright © 2016-2018 Broadcom - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#include "v3d_compiler.h" - -/* We don't do any address packing. */ -#define __gen_user_data void -#define __gen_address_type uint32_t -#define __gen_address_offset(reloc) (*reloc) -#define __gen_emit_reloc(cl, reloc) -#include "cle/v3d_packet_v33_pack.h" - -void -v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) -{ - /* FIXME: We don't bother implementing pipelining for texture reads - * for any pre 4.x hardware. It should be straight forward to do but - * we are not really testing or even targetting this hardware at - * present. 
- */ - ntq_flush_tmu(c); - - unsigned unit = instr->texture_index; - - struct V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1 p0_unpacked = { - V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1_header, - - .fetch_sample_mode = instr->op == nir_texop_txf, - }; - - struct V3D33_TEXTURE_UNIFORM_PARAMETER_1_CFG_MODE1 p1_unpacked = { - }; - - switch (instr->sampler_dim) { - case GLSL_SAMPLER_DIM_1D: - if (instr->is_array) - p0_unpacked.lookup_type = TEXTURE_1D_ARRAY; - else - p0_unpacked.lookup_type = TEXTURE_1D; - break; - case GLSL_SAMPLER_DIM_2D: - case GLSL_SAMPLER_DIM_RECT: - if (instr->is_array) - p0_unpacked.lookup_type = TEXTURE_2D_ARRAY; - else - p0_unpacked.lookup_type = TEXTURE_2D; - break; - case GLSL_SAMPLER_DIM_3D: - p0_unpacked.lookup_type = TEXTURE_3D; - break; - case GLSL_SAMPLER_DIM_CUBE: - p0_unpacked.lookup_type = TEXTURE_CUBE_MAP; - break; - default: - unreachable("Bad sampler type"); - } - - struct qreg coords[5]; - int next_coord = 0; - for (unsigned i = 0; i < instr->num_srcs; i++) { - switch (instr->src[i].src_type) { - case nir_tex_src_coord: - for (int j = 0; j < instr->coord_components; j++) { - coords[next_coord++] = - ntq_get_src(c, instr->src[i].src, j); - } - if (instr->coord_components < 2) - coords[next_coord++] = vir_uniform_f(c, 0.5); - break; - case nir_tex_src_bias: - coords[next_coord++] = - ntq_get_src(c, instr->src[i].src, 0); - - p0_unpacked.bias_supplied = true; - break; - case nir_tex_src_lod: - coords[next_coord++] = - vir_FADD(c, - ntq_get_src(c, instr->src[i].src, 0), - vir_uniform(c, QUNIFORM_TEXTURE_FIRST_LEVEL, - unit)); - - if (instr->op != nir_texop_txf && - instr->op != nir_texop_tg4) { - p0_unpacked.disable_autolod_use_bias_only = true; - } - break; - case nir_tex_src_comparator: - coords[next_coord++] = - ntq_get_src(c, instr->src[i].src, 0); - - p0_unpacked.shadow = true; - break; - - case nir_tex_src_offset: { - p0_unpacked.texel_offset_for_s_coordinate = - nir_src_comp_as_int(instr->src[i].src, 0); - - if (instr->coord_components >= 2) - p0_unpacked.texel_offset_for_t_coordinate = - nir_src_comp_as_int(instr->src[i].src, 1); - - if (instr->coord_components >= 3) - p0_unpacked.texel_offset_for_r_coordinate = - nir_src_comp_as_int(instr->src[i].src, 2); - break; - } - - default: - unreachable("unknown texture source"); - } - } - - /* Limit the number of channels returned to both how many the NIR - * instruction writes and how many the instruction could produce. - */ - p1_unpacked.return_words_of_texture_data = - instr->dest.is_ssa ? - nir_ssa_def_components_read(&instr->dest.ssa) : - (1 << instr->dest.reg.reg->num_components) - 1; - - uint32_t p0_packed; - V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1_pack(NULL, - (uint8_t *)&p0_packed, - &p0_unpacked); - - uint32_t p1_packed; - V3D33_TEXTURE_UNIFORM_PARAMETER_1_CFG_MODE1_pack(NULL, - (uint8_t *)&p1_packed, - &p1_unpacked); - /* Load unit number into the address field, which will be be used by - * the driver to decide which texture to put in the actual address - * field. - */ - p1_packed |= unit << 5; - - /* There is no native support for GL texture rectangle coordinates, so - * we have to rescale from ([0, width], [0, height]) to ([0, 1], [0, - * 1]). 
- */ - if (instr->sampler_dim == GLSL_SAMPLER_DIM_RECT) { - coords[0] = vir_FMUL(c, coords[0], - vir_uniform(c, QUNIFORM_TEXRECT_SCALE_X, - unit)); - coords[1] = vir_FMUL(c, coords[1], - vir_uniform(c, QUNIFORM_TEXRECT_SCALE_Y, - unit)); - } - - int texture_u[] = { - vir_get_uniform_index(c, QUNIFORM_TEXTURE_CONFIG_P0_0 + unit, p0_packed), - vir_get_uniform_index(c, QUNIFORM_TEXTURE_CONFIG_P1, p1_packed), - }; - - for (int i = 0; i < next_coord; i++) { - struct qreg dst; - - if (i == next_coord - 1) - dst = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUL); - else - dst = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMU); - - struct qinst *tmu = vir_MOV_dest(c, dst, coords[i]); - - if (i < 2) - tmu->uniform = texture_u[i]; - } - - vir_emit_thrsw(c); - - for (int i = 0; i < 4; i++) { - if (p1_unpacked.return_words_of_texture_data & (1 << i)) - ntq_store_dest(c, &instr->dest, i, vir_LDTMU(c)); - } -} diff --git a/src/broadcom/compiler/v3d33_vpm_setup.c b/src/broadcom/compiler/v3d33_vpm_setup.c deleted file mode 100644 index 8bce67dfae9..00000000000 --- a/src/broadcom/compiler/v3d33_vpm_setup.c +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright © 2016-2018 Broadcom - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#include "v3d_compiler.h" - -/* We don't do any address packing. */ -#define __gen_user_data void -#define __gen_address_type uint32_t -#define __gen_address_offset(reloc) (*reloc) -#define __gen_emit_reloc(cl, reloc) -#include "broadcom/cle/v3d_packet_v33_pack.h" - -void -v3d33_vir_vpm_read_setup(struct v3d_compile *c, int num_components) -{ - struct V3D33_VPM_GENERIC_BLOCK_READ_SETUP unpacked = { - V3D33_VPM_GENERIC_BLOCK_READ_SETUP_header, - - .horiz = true, - .laned = false, - /* If the field is 0, that means a read count of 32. 
 */
- .num = num_components & 31,
- .segs = true,
- .stride = 1,
- .size = VPM_SETUP_SIZE_32_BIT,
- .addr = c->num_inputs,
- };
-
- uint32_t packed;
- V3D33_VPM_GENERIC_BLOCK_READ_SETUP_pack(NULL,
- (uint8_t *)&packed,
- &unpacked);
- vir_VPMSETUP(c, vir_uniform_ui(c, packed));
-}
-
-void
-v3d33_vir_vpm_write_setup(struct v3d_compile *c)
-{
- uint32_t packed;
- struct V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP unpacked = {
- V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP_header,
-
- .horiz = true,
- .laned = false,
- .segs = true,
- .stride = 1,
- .size = VPM_SETUP_SIZE_32_BIT,
- .addr = 0,
- };
-
- V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP_pack(NULL,
- (uint8_t *)&packed,
- &unpacked);
- vir_VPMSETUP(c, vir_uniform_ui(c, packed));
-}
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index 0c1419661d3..12aaacdc14a 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -31,6 +31,7 @@
 #include <stdint.h>
 #include <string.h>
+#include "util/blend.h"
 #include "util/macros.h"
 #include "common/v3d_debug.h"
 #include "common/v3d_device_info.h"
@@ -40,7 +41,6 @@
 #include "util/u_math.h"
 #include "qpu/qpu_instr.h"
-#include "pipe/p_state.h"
 /**
 * Maximum number of outstanding TMU operations we can queue for execution.
@@ -87,7 +87,7 @@ enum qfile {
 /** A physical register, such as the W coordinate payload. */
 QFILE_REG,
- /** One of the regsiters for fixed function interactions. */
+ /** One of the registers for fixed function interactions. */
 QFILE_MAGIC,
 /**
@@ -97,12 +97,6 @@ enum qfile {
 QFILE_TEMP,
 /**
- * VPM reads use this with an index value to say what part of the VPM
- * is being read.
- */
- QFILE_VPM,
-
- /**
 * Stores an immediate value in the index field that will be used
 * directly by qpu_load_imm().
 */
@@ -169,6 +163,19 @@ struct qinst {
 * otherwise.
 */
 int uniform;
+
+ /* If this is a TLB Z write */
+ bool is_tlb_z_write;
+
+ /* If this is a retiring TMU instruction (the last in a lookup sequence),
+ * how many ldtmu instructions are required to read the results.
+ */
+ uint32_t ldtmu_count;
+
+ /* Position of this instruction in the program. Filled in during
+ * register allocation.
+ */
+ int32_t ip;
 };
 enum quniform_contents {
@@ -330,6 +337,19 @@ enum quniform_contents {
 * Current value of gl_ViewIndex for Multiview rendering.
 */
 QUNIFORM_VIEW_INDEX,
+
+ /**
+ * Inline uniform buffers
+ */
+ QUNIFORM_INLINE_UBO_0,
+ QUNIFORM_INLINE_UBO_1,
+ QUNIFORM_INLINE_UBO_2,
+ QUNIFORM_INLINE_UBO_3,
+
+ /**
+ * Current value of DrawIndex for Multidraw
+ */
+ QUNIFORM_DRAW_ID,
 };
 static inline uint32_t v3d_unit_data_create(uint32_t unit, uint32_t value)
@@ -369,13 +389,7 @@ static inline uint8_t v3d_slot_get_component(struct v3d_varying_slot slot)
 return slot.slot_and_component & 3;
 }
-enum v3d_execution_environment {
- V3D_ENVIRONMENT_OPENGL = 0,
- V3D_ENVIRONMENT_VULKAN,
-};
-
 struct v3d_key {
- void *shader_state;
 struct {
 uint8_t swizzle[4];
 } tex[V3D_MAX_TEXTURE_SAMPLERS];
@@ -388,9 +402,9 @@ struct v3d_key {
 uint8_t num_samplers_used;
 uint8_t ucp_enables;
 bool is_last_geometry_stage;
- bool robust_buffer_access;
-
- enum v3d_execution_environment environment;
+ bool robust_uniform_access;
+ bool robust_storage_access;
+ bool robust_image_access;
 };
 struct v3d_fs_key {
@@ -400,7 +414,6 @@ struct v3d_fs_key {
 bool line_smoothing;
 bool point_coord_upper_left;
 bool msaa;
- bool sample_coverage;
 bool sample_alpha_to_coverage;
 bool sample_alpha_to_one;
 /* Mask of which color render targets are present.
*/ @@ -419,14 +432,12 @@ struct v3d_fs_key { */ struct { enum pipe_format format; - const uint8_t *swizzle; + uint8_t swizzle[4]; } color_fmt[V3D_MAX_DRAW_BUFFERS]; - uint8_t logicop_func; + enum pipe_logicop logicop_func; uint32_t point_sprite_mask; - struct pipe_rt_blend_state blend; - /* If the fragment shader reads gl_PrimitiveID then we have 2 scenarios: * * - If there is a geometry shader, then gl_PrimitiveID must be written @@ -468,7 +479,7 @@ struct v3d_vs_key { bool clamp_color; }; -/** A basic block of VIR intructions. */ +/** A basic block of VIR instructions. */ struct qblock { struct list_head link; @@ -566,6 +577,7 @@ enum v3d_compilation_result { */ struct v3d_compiler { const struct v3d_device_info *devinfo; + uint32_t max_inline_uniform_buffers; struct ra_regs *regs; struct ra_class *reg_class_any[3]; struct ra_class *reg_class_r5[3]; @@ -584,6 +596,19 @@ struct v3d_interp_input { unsigned mode; /* interpolation mode */ }; +struct v3d_ra_node_info { + struct { + uint32_t priority; + uint8_t class_bits; + bool is_program_end; + bool unused; + + /* V3D 7.x */ + bool is_ldunif_dst; + } *info; + uint32_t alloc_count; +}; + struct v3d_compile { const struct v3d_device_info *devinfo; nir_shader *s; @@ -596,7 +621,7 @@ struct v3d_compile { void *debug_output_data; /** - * Mapping from nir_register * or nir_ssa_def * to array of struct + * Mapping from nir_register * or nir_def * to array of struct * qreg for the values. */ struct hash_table *def_ht; @@ -615,11 +640,12 @@ struct v3d_compile { uint32_t output_fifo_size; struct { - nir_dest *dest; + nir_def *def; uint8_t num_components; uint8_t component_mask; } flush[MAX_TMU_QUEUE_SIZE]; uint32_t flush_count; + uint32_t total_count; } tmu; /** @@ -652,16 +678,13 @@ struct v3d_compile { bool uses_center_w; bool writes_z; + bool writes_z_from_fep; + bool reads_z; bool uses_implicit_point_line_varyings; /* True if a fragment shader reads gl_PrimitiveID */ bool fs_uses_primitive_id; - /* If the fragment shader does anything that requires to force - * per-sample MSAA, such as reading gl_SampleID. - */ - bool force_per_sample_msaa; - /* Whether we are using the fallback scheduler. This will be set after * register allocation has failed once. */ @@ -681,6 +704,11 @@ struct v3d_compile { bool disable_constant_ubo_load_sorting; bool sorted_any_ubo_loads; + /* Moves UBO/SSBO loads right before their first user (nir_opt_move). + * This can reduce register pressure. + */ + bool move_buffer_loads; + /* Emits ldunif for each new uniform, even if the uniform was already * emitted in the same block. Useful to compile shaders with high * register pressure or to disable the optimization during uniform @@ -692,6 +720,19 @@ struct v3d_compile { bool disable_loop_unrolling; bool unrolled_any_loops; + /* Disables nir_opt_gcm to reduce register pressure. */ + bool disable_gcm; + + /* If calling nir_opt_gcm made any progress. Used to skip new rebuilds + * if possible + */ + bool gcm_progress; + + /* Disables scheduling of general TMU loads (and unfiltered image load). + */ + bool disable_general_tmu_sched; + bool has_general_tmu_load; + /* Minimum number of threads we are willing to use to register allocate * a shader with the current compilation strategy. This only prevents * us from lowering the thread count to register allocate successfully, @@ -705,7 +746,9 @@ struct v3d_compile { * strategies that can reduce register pressure and hopefully reduce or * eliminate TMU spills in the shader. 
*/ - bool tmu_spilling_allowed; + uint32_t max_tmu_spills; + + uint32_t compile_strategy_idx; /* The UBO index and block used with the last unifa load, as well as the * current unifa offset *after* emitting that load. This is used to skip @@ -715,6 +758,7 @@ struct v3d_compile { struct qblock *current_unifa_block; int32_t current_unifa_index; uint32_t current_unifa_offset; + bool current_unifa_is_ubo; /* State for whether we're executing on each channel currently. 0 if * yes, otherwise a block number + 1 that the channel jumped to. @@ -749,6 +793,11 @@ struct v3d_compile { struct qreg cs_shared_offset; int local_invocation_index_bits; + /* Starting value of the sample mask in a fragment shader. We use + * this to identify lanes that have been terminated/discarded. + */ + struct qreg start_msf; + /* If the shader uses subgroup functionality */ bool has_subgroups; @@ -761,14 +810,27 @@ struct v3d_compile { uint32_t spill_size; /* Shader-db stats */ uint32_t spills, fills, loops; + + /* Whether we are in the process of spilling registers for + * register allocation + */ + bool spilling; + /** * Register spilling's per-thread base address, shared between each - * spill/fill's addressing calculations. + * spill/fill's addressing calculations (also used for scratch + * access). */ struct qreg spill_base; + /* Bit vector of which temps may be spilled */ BITSET_WORD *spillable; + /* Used during register allocation */ + int thread_index; + struct v3d_ra_node_info nodes; + struct ra_graph *g; + /** * Array of the VARYING_SLOT_* of all FS QFILE_VARY reads. * @@ -799,11 +861,16 @@ struct v3d_compile { uint32_t uniform_array_size; uint32_t num_uniforms; uint32_t output_position_index; - nir_variable *output_color_var[4]; + nir_variable *output_color_var[V3D_MAX_DRAW_BUFFERS]; uint32_t output_sample_mask_index; struct qreg undef; uint32_t num_temps; + /* Number of temps in the program right before we spill a new temp. We + * use this to know which temps existed before a spill and which were + * added with the spill itself. + */ + uint32_t spill_start_num_temps; struct vir_cursor cursor; struct list_head blocks; @@ -848,12 +915,16 @@ struct v3d_compile { bool emitted_tlb_load; bool lock_scoreboard_on_first_thrsw; - /* Total number of spilled registers in the program */ - uint32_t spill_count; - enum v3d_compilation_result compilation_result; bool tmu_dirty_rcl; + bool has_global_address; + + /* If we have processed a discard/terminate instruction. This may + * cause some lanes to be inactive even during uniform control + * flow. + */ + bool emitted_discard; }; struct v3d_uniform_list { @@ -866,6 +937,13 @@ struct v3d_prog_data { struct v3d_uniform_list uniforms; uint32_t spill_size; + uint32_t tmu_spills; + uint32_t tmu_fills; + uint32_t tmu_count; + + uint32_t qpu_read_stalls; + + uint8_t compile_strategy_idx; uint8_t threads; @@ -877,6 +955,8 @@ struct v3d_prog_data { bool tmu_dirty_rcl; bool has_control_barrier; + + bool has_global_address; }; struct v3d_vs_prog_data { @@ -964,10 +1044,15 @@ struct v3d_fs_prog_data { uint8_t num_inputs; bool writes_z; + bool writes_z_from_fep; bool disable_ez; bool uses_center_w; bool uses_implicit_point_line_varyings; bool lock_scoreboard_on_first_thrsw; + + /* If the fragment shader does anything that requires to force + * per-sample MSAA, such as reading gl_SampleID. 
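
One plausible way a driver could derive this flag from the NIR shader info, shown only as a hedged sketch (not code from this patch; here s is the nir_shader and prog_data the v3d_fs_prog_data being filled in):

    if (BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_SAMPLE_ID) ||
        BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS) ||
        s->info.fs.uses_sample_qualifier)
            prog_data->force_per_sample_msaa = true;
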
+ */ bool force_per_sample_msaa; }; @@ -998,6 +1083,10 @@ v3d_compute_vpm_config(struct v3d_device_info *devinfo, struct v3d_gs_prog_data *gs, struct vpm_config *vpm_cfg_bin, struct vpm_config *vpm_cfg); +void +v3d_pack_unnormalized_coordinates(struct v3d_device_info *devinfo, + uint32_t *p1_packed, + bool unnormalized_coordinates); static inline bool vir_has_uniform(struct qinst *inst) @@ -1005,7 +1094,8 @@ vir_has_uniform(struct qinst *inst) return inst->uniform != ~0; } -const struct v3d_compiler *v3d_compiler_init(const struct v3d_device_info *devinfo); +const struct v3d_compiler *v3d_compiler_init(const struct v3d_device_info *devinfo, + uint32_t max_inline_uniform_buffers); void v3d_compiler_free(const struct v3d_compiler *compiler); void v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s); @@ -1066,15 +1156,14 @@ bool vir_is_raw_mov(struct qinst *inst); bool vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst); bool vir_is_add(struct qinst *inst); bool vir_is_mul(struct qinst *inst); -bool vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst); -bool vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst); +bool vir_writes_r4_implicitly(const struct v3d_device_info *devinfo, struct qinst *inst); struct qreg vir_follow_movs(struct v3d_compile *c, struct qreg reg); uint8_t vir_channels_written(struct qinst *inst); struct qreg ntq_get_src(struct v3d_compile *c, nir_src src, int i); -void ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan, - struct qreg result); +void ntq_store_def(struct v3d_compile *c, nir_def *def, int chan, + struct qreg result); bool ntq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components); -void ntq_add_pending_tmu_flush(struct v3d_compile *c, nir_dest *dest, +void ntq_add_pending_tmu_flush(struct v3d_compile *c, nir_def *def, uint32_t component_mask); void ntq_flush_tmu(struct v3d_compile *c); void vir_emit_thrsw(struct v3d_compile *c); @@ -1095,32 +1184,27 @@ bool vir_opt_redundant_flags(struct v3d_compile *c); bool vir_opt_small_immediates(struct v3d_compile *c); bool vir_opt_vpm(struct v3d_compile *c); bool vir_opt_constant_alu(struct v3d_compile *c); -void v3d_nir_lower_blend(nir_shader *s, struct v3d_compile *c); -void v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c); -void v3d_nir_lower_line_smooth(nir_shader *shader); -void v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c); -void v3d_nir_lower_robust_buffer_access(nir_shader *shader, struct v3d_compile *c); -void v3d_nir_lower_scratch(nir_shader *s); -void v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c); -void v3d_nir_lower_image_load_store(nir_shader *s); -void vir_lower_uniforms(struct v3d_compile *c); - -void v3d33_vir_vpm_read_setup(struct v3d_compile *c, int num_components); -void v3d33_vir_vpm_write_setup(struct v3d_compile *c); -void v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr); -void v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr); -void v3d40_vir_emit_image_load_store(struct v3d_compile *c, - nir_intrinsic_instr *instr); +bool v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c); +bool v3d_nir_lower_line_smooth(nir_shader *shader); +bool v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c); +bool v3d_nir_lower_scratch(nir_shader *s); +bool v3d_nir_lower_txf_ms(nir_shader *s); +bool v3d_nir_lower_image_load_store(nir_shader *s, struct v3d_compile *c); +bool v3d_nir_lower_load_store_bitsize(nir_shader *s); + +void v3d_vir_emit_tex(struct 
v3d_compile *c, nir_tex_instr *instr); +void v3d_vir_emit_image_load_store(struct v3d_compile *c, + nir_intrinsic_instr *instr); void v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers); uint32_t v3d_qpu_schedule_instructions(struct v3d_compile *c); void qpu_validate(struct v3d_compile *c); -struct qpu_reg *v3d_register_allocate(struct v3d_compile *c, bool *spilled); +struct qpu_reg *v3d_register_allocate(struct v3d_compile *c); bool vir_init_reg_sets(struct v3d_compiler *compiler); int v3d_shaderdb_dump(struct v3d_compile *c, char **shaderdb_str); -bool v3d_gl_format_is_return_32(GLenum format); +bool v3d_gl_format_is_return_32(enum pipe_format format); uint32_t v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src); @@ -1220,28 +1304,35 @@ vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b) \ #define VIR_SFU(name) \ static inline struct qreg \ vir_##name(struct v3d_compile *c, struct qreg a) \ -{ \ - if (c->devinfo->ver >= 41) { \ - return vir_emit_def(c, vir_add_inst(V3D_QPU_A_##name, \ - c->undef, \ - a, c->undef)); \ - } else { \ - vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_##name), a); \ - return vir_FMOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); \ - } \ +{ \ + return vir_emit_def(c, vir_add_inst(V3D_QPU_A_##name, \ + c->undef, \ + a, c->undef)); \ } \ static inline struct qinst * \ vir_##name##_dest(struct v3d_compile *c, struct qreg dest, \ struct qreg a) \ { \ - if (c->devinfo->ver >= 41) { \ - return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_##name, \ - dest, \ - a, c->undef)); \ - } else { \ - vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_##name), a); \ - return vir_FMOV_dest(c, dest, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); \ - } \ + return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_##name, \ + dest, \ + a, c->undef)); \ +} + +#define VIR_SFU2(name) \ +static inline struct qreg \ +vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b) \ +{ \ + return vir_emit_def(c, vir_add_inst(V3D_QPU_A_##name, \ + c->undef, \ + a, b)); \ +} \ +static inline struct qinst * \ +vir_##name##_dest(struct v3d_compile *c, struct qreg dest, \ + struct qreg a, struct qreg b) \ +{ \ + return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_##name, \ + dest, \ + a, b)); \ } #define VIR_A_ALU2(name) VIR_ALU2(name, vir_add_inst, V3D_QPU_A_##name) @@ -1343,6 +1434,28 @@ VIR_SFU(LOG) VIR_SFU(SIN) VIR_SFU(RSQRT2) +VIR_SFU(BALLOT) +VIR_SFU(BCASTF) +VIR_SFU(ALLEQ) +VIR_SFU(ALLFEQ) +VIR_SFU2(ROTQ) +VIR_SFU2(ROT) +VIR_SFU2(SHUFFLE) + +VIR_A_ALU2(VPACK) +VIR_A_ALU2(V8PACK) +VIR_A_ALU2(V10PACK) +VIR_A_ALU2(V11FPACK) + +VIR_M_ALU1(FTOUNORM16) +VIR_M_ALU1(FTOSNORM16) + +VIR_M_ALU1(VFTOUNORM8) +VIR_M_ALU1(VFTOSNORM8) + +VIR_M_ALU1(VFTOUNORM10LO) +VIR_M_ALU1(VFTOUNORM10HI) + static inline struct qinst * vir_MOV_cond(struct v3d_compile *c, enum v3d_qpu_cond cond, struct qreg dest, struct qreg src) @@ -1372,16 +1485,11 @@ vir_NOP(struct v3d_compile *c) static inline struct qreg vir_LDTMU(struct v3d_compile *c) { - if (c->devinfo->ver >= 41) { - struct qinst *ldtmu = vir_add_inst(V3D_QPU_A_NOP, c->undef, - c->undef, c->undef); - ldtmu->qpu.sig.ldtmu = true; - - return vir_emit_def(c, ldtmu); - } else { - vir_NOP(c)->qpu.sig.ldtmu = true; - return vir_MOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); - } + struct qinst *ldtmu = vir_add_inst(V3D_QPU_A_NOP, c->undef, + c->undef, c->undef); + ldtmu->qpu.sig.ldtmu = true; + + return vir_emit_def(c, ldtmu); } static inline struct qreg @@ -1394,7 +1502,6 @@ vir_UMUL(struct v3d_compile *c, struct qreg src0, struct qreg 
src1) static inline struct qreg vir_TLBU_COLOR_READ(struct v3d_compile *c, uint32_t config) { - assert(c->devinfo->ver >= 41); /* XXX */ assert((config & 0xffffff00) == 0xffffff00); struct qinst *ldtlb = vir_add_inst(V3D_QPU_A_NOP, c->undef, @@ -1407,38 +1514,12 @@ vir_TLBU_COLOR_READ(struct v3d_compile *c, uint32_t config) static inline struct qreg vir_TLB_COLOR_READ(struct v3d_compile *c) { - assert(c->devinfo->ver >= 41); /* XXX */ - struct qinst *ldtlb = vir_add_inst(V3D_QPU_A_NOP, c->undef, c->undef, c->undef); ldtlb->qpu.sig.ldtlb = true; return vir_emit_def(c, ldtlb); } -/* -static inline struct qreg -vir_LOAD_IMM(struct v3d_compile *c, uint32_t val) -{ - return vir_emit_def(c, vir_inst(QOP_LOAD_IMM, c->undef, - vir_reg(QFILE_LOAD_IMM, val), c->undef)); -} - -static inline struct qreg -vir_LOAD_IMM_U2(struct v3d_compile *c, uint32_t val) -{ - return vir_emit_def(c, vir_inst(QOP_LOAD_IMM_U2, c->undef, - vir_reg(QFILE_LOAD_IMM, val), - c->undef)); -} -static inline struct qreg -vir_LOAD_IMM_I2(struct v3d_compile *c, uint32_t val) -{ - return vir_emit_def(c, vir_inst(QOP_LOAD_IMM_I2, c->undef, - vir_reg(QFILE_LOAD_IMM, val), - c->undef)); -} -*/ - static inline struct qinst * vir_BRANCH(struct v3d_compile *c, enum v3d_qpu_branch_cond cond) { diff --git a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c index 2706432d5ef..9a651bfc6a7 100644 --- a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c +++ b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c @@ -40,9 +40,20 @@ * calculations and load/store using the TMU general memory access path. */ +static const unsigned bits_8[4] = {8, 8, 8, 8}; +static const unsigned bits_16[4] = {16, 16, 16, 16}; +static const unsigned bits_1010102[4] = {10, 10, 10, 2}; + bool v3d_gl_format_is_return_32(enum pipe_format format) { + /* We can get a NONE format in Vulkan because we support the + * shaderStorageImageReadWithoutFormat feature. We consider these to + * always use 32-bit precision. + */ + if (format == PIPE_FORMAT_NONE) + return true; + const struct util_format_description *desc = util_format_description(format); const struct util_format_channel_description *chan = &desc->channel[0]; @@ -52,15 +63,17 @@ v3d_gl_format_is_return_32(enum pipe_format format) /* Packs a 32-bit vector of colors in the range [0, (1 << bits[i]) - 1] to a * 32-bit SSA value, with as many channels as necessary to store all the bits + * + * This is the generic helper, using all common nir operations. */ -static nir_ssa_def * -pack_bits(nir_builder *b, nir_ssa_def *color, const unsigned *bits, +static nir_def * +pack_bits(nir_builder *b, nir_def *color, const unsigned *bits, int num_components, bool mask) { - nir_ssa_def *results[4]; + nir_def *results[4]; int offset = 0; for (int i = 0; i < num_components; i++) { - nir_ssa_def *chan = nir_channel(b, color, i); + nir_def *chan = nir_channel(b, color, i); /* Channels being stored shouldn't cross a 32-bit boundary. 
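
A concrete walk-through of this packing loop, with illustrative numbers that are not taken from the patch:

    bits = {16, 16, 16, 16}, num_components = 4

    channel 0: offset  0  -> results[0], bits  0..15
    channel 1: offset 16  -> results[0], bits 16..31
    channel 2: offset 32  -> results[1], bits  0..15
    channel 3: offset 48  -> results[1], bits 16..31

    No channel straddles a 32-bit word, which is what the assert just
    below checks, and DIV_ROUND_UP(64, 32) = 2 result components are
    returned.
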
*/ assert((offset & ~31) == ((offset + bits[i] - 1) & ~31)); @@ -84,10 +97,187 @@ pack_bits(nir_builder *b, nir_ssa_def *color, const unsigned *bits, return nir_vec(b, results, DIV_ROUND_UP(offset, 32)); } -static void -v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr) +/* Utility wrapper as half_2x16_split is mapped to vfpack, and sometimes it is + * just easier to read vfpack on the code, specially while using the PRM as + * reference + */ +static inline nir_def * +nir_vfpack(nir_builder *b, nir_def *p1, nir_def *p2) +{ + return nir_pack_half_2x16_split(b, p1, p2); +} + +static inline nir_def * +pack_11f11f10f(nir_builder *b, nir_def *color) +{ + nir_def *p1 = nir_vfpack(b, nir_channel(b, color, 0), + nir_channel(b, color, 1)); + nir_def *undef = nir_undef(b, 1, color->bit_size); + nir_def *p2 = nir_vfpack(b, nir_channel(b, color, 2), undef); + + return nir_pack_32_to_r11g11b10_v3d(b, p1, p2); +} + +static inline nir_def * +pack_r10g10b10a2_uint(nir_builder *b, nir_def *color) +{ + nir_def *p1 = nir_pack_2x32_to_2x16_v3d(b, nir_channel(b, color, 0), + nir_channel(b, color, 1)); + nir_def *p2 = nir_pack_2x32_to_2x16_v3d(b, nir_channel(b, color, 2), + nir_channel(b, color, 3)); + + return nir_pack_uint_32_to_r10g10b10a2_v3d(b, p1, p2); +} + +static inline nir_def * +pack_r10g10b10a2_unorm(nir_builder *b, nir_def *color) +{ + nir_def *p1 = nir_vfpack(b, nir_channel(b, color, 0), + nir_channel(b, color, 1)); + p1 = nir_pack_2x16_to_unorm_2x10_v3d(b, p1); + + nir_def *p2 = nir_vfpack(b, nir_channel(b, color, 2), + nir_channel(b, color, 3)); + p2 = nir_pack_2x16_to_unorm_10_2_v3d(b, p2); + + return nir_pack_uint_32_to_r10g10b10a2_v3d(b, p1, p2); +} + +enum hw_conversion { + NONE, + TO_SNORM, + TO_UNORM +}; + +static inline nir_def * +pack_8bit(nir_builder *b, nir_def *color, + unsigned num_components, + enum hw_conversion conversion) +{ + /* Note that usually you should not use this method (that relies on + * custom packing) for 1 component if we are not doing any + * conversion. But we support also that case, and let the caller + * decide which method to use. + */ + nir_def *p1; + nir_def *p2; + + if (conversion == NONE) { + p1 = nir_pack_2x32_to_2x16_v3d(b, nir_channel(b, color, 0), + nir_channel(b, color, num_components == 1 ? 0 : 1)); + } else { + p1 = nir_vfpack(b, nir_channel(b, color, 0), + nir_channel(b, color, num_components == 1 ? 0 : 1)); + p1 = (conversion == TO_UNORM) ? + nir_pack_2x16_to_unorm_2x8_v3d(b, p1) : + nir_pack_2x16_to_snorm_2x8_v3d(b, p1); + } + if (num_components == 4) { + if (conversion == NONE) { + p2 = nir_pack_2x32_to_2x16_v3d(b, nir_channel(b, color, 2), + nir_channel(b, color, 3)); + } else { + p2 = nir_vfpack(b, nir_channel(b, color, 2), + nir_channel(b, color, 3)); + p2 = (conversion == TO_UNORM) ? + nir_pack_2x16_to_unorm_2x8_v3d(b, p2) : + nir_pack_2x16_to_snorm_2x8_v3d(b, p2); + } + } else { + /* Using an undef here would be more correct. But for this + * case we are getting worse shader-db values with some CTS + * tests, so we just reuse the first packing. 
+ */ + p2 = p1; + } + + return nir_pack_4x16_to_4x8_v3d(b, p1, p2); +} + +static inline nir_def * +pack_16bit(nir_builder *b, nir_def *color, + unsigned num_components, + enum hw_conversion conversion) +{ + nir_def *results[2] = {0}; + nir_def *channels[4] = {0}; + + for (unsigned i = 0; i < num_components; i++) { + channels[i] = nir_channel(b, color, i); + switch (conversion) { + case TO_SNORM: + channels[i] = nir_f2snorm_16_v3d(b, channels[i]); + break; + case TO_UNORM: + channels[i] = nir_f2unorm_16_v3d(b, channels[i]); + break; + default: + /* Note that usually you should not use this method + * (that relies on custom packing) if we are not doing + * any conversion. But we support also that case, and + * let the caller decide which method to use. + */ + break; + } + } + + switch (num_components) { + case 1: + results[0] = channels[0]; + break; + case 4: + results[1] = nir_pack_2x32_to_2x16_v3d(b, channels[2], channels[3]); + FALLTHROUGH; + case 2: + results[0] = nir_pack_2x32_to_2x16_v3d(b, channels[0], channels[1]); + break; + default: + unreachable("Invalid number of components"); + } + + return nir_vec(b, results, DIV_ROUND_UP(num_components, 2)); +} + +static inline nir_def * +pack_xbit(nir_builder *b, nir_def *color, + unsigned num_components, + const struct util_format_channel_description *r_chan) +{ + bool pack_mask = (r_chan->type == UTIL_FORMAT_TYPE_SIGNED); + enum hw_conversion conversion = NONE; + if (r_chan->normalized) { + conversion = + (r_chan->type == UTIL_FORMAT_TYPE_UNSIGNED) ? TO_UNORM : TO_SNORM; + } + + switch (r_chan->size) { + case 8: + if (conversion == NONE && num_components < 2) + return pack_bits(b, color, bits_8, num_components, pack_mask); + else + return pack_8bit(b, color, num_components, conversion); + break; + case 16: + /* pack_mask implies that the generic packing method would + * need to include extra operations to handle negative values, + * so in that case, even without a conversion, it is better to + * use the packing using custom hw operations. 
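
Summarizing the dispatch in pack_xbit(), derived directly from the two switch cases around this point:

    size  8, conversion NONE, single component   -> pack_bits()  (generic)
    size  8, anything else                       -> pack_8bit()  (custom hw ops)
    size 16, conversion NONE, pack_mask == false -> pack_bits()  (generic)
    size 16, anything else                       -> pack_16bit() (custom hw ops)
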
+ */ + if (conversion == NONE && !pack_mask) + return pack_bits(b, color, bits_16, num_components, pack_mask); + else + return pack_16bit(b, color, num_components, conversion); + break; + default: + unreachable("unrecognized bits"); + } +} + +static bool +v3d_nir_lower_image_store_v42(nir_builder *b, nir_intrinsic_instr *instr) { enum pipe_format format = nir_intrinsic_format(instr); + assert(format != PIPE_FORMAT_NONE); const struct util_format_description *desc = util_format_description(format); const struct util_format_channel_description *r_chan = &desc->channel[0]; @@ -95,10 +285,10 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr) b->cursor = nir_before_instr(&instr->instr); - nir_ssa_def *color = nir_channels(b, - nir_ssa_for_src(b, instr->src[3], 4), - (1 << num_components) - 1); - nir_ssa_def *formatted = NULL; + nir_def *color = nir_trim_vector(b, + instr->src[3].ssa, + num_components); + nir_def *formatted = NULL; if (format == PIPE_FORMAT_R11G11B10_FLOAT) { formatted = nir_format_pack_11f11f10f(b, color); @@ -110,9 +300,6 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr) */ formatted = color; } else { - static const unsigned bits_8[4] = {8, 8, 8, 8}; - static const unsigned bits_16[4] = {16, 16, 16, 16}; - static const unsigned bits_1010102[4] = {10, 10, 10, 2}; const unsigned *bits; switch (r_chan->size) { @@ -132,11 +319,13 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr) bool pack_mask = false; if (r_chan->pure_integer && r_chan->type == UTIL_FORMAT_TYPE_SIGNED) { - formatted = nir_format_clamp_sint(b, color, bits); + /* We don't need to do any conversion or clamping in this case */ + formatted = color; pack_mask = true; } else if (r_chan->pure_integer && r_chan->type == UTIL_FORMAT_TYPE_UNSIGNED) { - formatted = nir_format_clamp_uint(b, color, bits); + /* We don't need to do any conversion or clamping in this case */ + formatted = color; } else if (r_chan->normalized && r_chan->type == UTIL_FORMAT_TYPE_SIGNED) { formatted = nir_format_float_to_snorm(b, color, bits); @@ -154,75 +343,116 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr) pack_mask); } - nir_instr_rewrite_src(&instr->instr, &instr->src[3], - nir_src_for_ssa(formatted)); + nir_src_rewrite(&instr->src[3], formatted); instr->num_components = formatted->num_components; + + return true; } -static void + +static bool +v3d_nir_lower_image_store_v71(nir_builder *b, nir_intrinsic_instr *instr) +{ + enum pipe_format format = nir_intrinsic_format(instr); + assert(format != PIPE_FORMAT_NONE); + const struct util_format_description *desc = + util_format_description(format); + const struct util_format_channel_description *r_chan = &desc->channel[0]; + unsigned num_components = util_format_get_nr_components(format); + b->cursor = nir_before_instr(&instr->instr); + + nir_def *color = + nir_trim_vector(b, instr->src[3].ssa, num_components); + nir_def *formatted = NULL; + if (format == PIPE_FORMAT_R9G9B9E5_FLOAT) { + formatted = nir_format_pack_r9g9b9e5(b, color); + } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) { + formatted = pack_11f11f10f(b, color); + } else if (format == PIPE_FORMAT_R10G10B10A2_UINT) { + formatted = pack_r10g10b10a2_uint(b, color); + } else if (format == PIPE_FORMAT_R10G10B10A2_UNORM) { + formatted = pack_r10g10b10a2_unorm(b, color); + } else if (r_chan->size == 32) { + /* For 32-bit formats, we just have to move the vector + * across (possibly reducing the number of channels). 
+ */ + formatted = color; + } else if (r_chan->type == UTIL_FORMAT_TYPE_FLOAT) { + assert(r_chan->size == 16); + formatted = nir_format_float_to_half(b, color); + formatted = pack_bits(b, formatted, bits_16, num_components, + false); + } else { + assert(r_chan->size == 8 || r_chan->size == 16); + formatted = pack_xbit(b, color, num_components, r_chan); + } + + nir_src_rewrite(&instr->src[3], formatted); + instr->num_components = formatted->num_components; + + return true; +} + +static bool v3d_nir_lower_image_load(nir_builder *b, nir_intrinsic_instr *instr) { static const unsigned bits16[] = {16, 16, 16, 16}; enum pipe_format format = nir_intrinsic_format(instr); if (v3d_gl_format_is_return_32(format)) - return; + return false; b->cursor = nir_after_instr(&instr->instr); - assert(instr->dest.is_ssa); - nir_ssa_def *result = &instr->dest.ssa; + nir_def *result = &instr->def; if (util_format_is_pure_uint(format)) { result = nir_format_unpack_uint(b, result, bits16, 4); } else if (util_format_is_pure_sint(format)) { result = nir_format_unpack_sint(b, result, bits16, 4); } else { - nir_ssa_def *rg = nir_channel(b, result, 0); - nir_ssa_def *ba = nir_channel(b, result, 1); - result = nir_vec4(b, - nir_unpack_half_2x16_split_x(b, rg), - nir_unpack_half_2x16_split_y(b, rg), - nir_unpack_half_2x16_split_x(b, ba), - nir_unpack_half_2x16_split_y(b, ba)); + nir_def *rg = nir_channel(b, result, 0); + nir_def *ba = nir_channel(b, result, 1); + result = nir_vec4(b, + nir_unpack_half_2x16_split_x(b, rg), + nir_unpack_half_2x16_split_y(b, rg), + nir_unpack_half_2x16_split_x(b, ba), + nir_unpack_half_2x16_split_y(b, ba)); } - nir_ssa_def_rewrite_uses_after(&instr->dest.ssa, result, + nir_def_rewrite_uses_after(&instr->def, result, result->parent_instr); + + return true; } -void -v3d_nir_lower_image_load_store(nir_shader *s) +static bool +v3d_nir_lower_image_load_store_cb(nir_builder *b, + nir_intrinsic_instr *intr, + void *_state) { - nir_foreach_function(function, s) { - if (!function->impl) - continue; - - nir_builder b; - nir_builder_init(&b, function->impl); - - nir_foreach_block(block, function->impl) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intr = - nir_instr_as_intrinsic(instr); - - switch (intr->intrinsic) { - case nir_intrinsic_image_load: - v3d_nir_lower_image_load(&b, intr); - break; - case nir_intrinsic_image_store: - v3d_nir_lower_image_store(&b, intr); - break; - default: - break; - } - } - } + struct v3d_compile *c = (struct v3d_compile *) _state; - nir_metadata_preserve(function->impl, - nir_metadata_block_index | - nir_metadata_dominance); + switch (intr->intrinsic) { + case nir_intrinsic_image_load: + return v3d_nir_lower_image_load(b, intr); + case nir_intrinsic_image_store: + if (c->devinfo->ver >= 71) + return v3d_nir_lower_image_store_v71(b, intr); + else + return v3d_nir_lower_image_store_v42(b, intr); + break; + default: + return false; } + + return false; +} + +bool +v3d_nir_lower_image_load_store(nir_shader *s, struct v3d_compile *c) +{ + return nir_shader_intrinsics_pass(s, + v3d_nir_lower_image_load_store_cb, + nir_metadata_block_index | + nir_metadata_dominance, c); } diff --git a/src/broadcom/compiler/v3d_nir_lower_io.c b/src/broadcom/compiler/v3d_nir_lower_io.c index 895b1a39163..55e2e4f2e11 100644 --- a/src/broadcom/compiler/v3d_nir_lower_io.c +++ b/src/broadcom/compiler/v3d_nir_lower_io.c @@ -24,8 +24,6 @@ #include "compiler/v3d_compiler.h" #include "compiler/nir/nir_builder.h" -#include 
"util/u_helpers.h" - /** * Walks the NIR generated by TGSI-to-NIR or GLSL-to-NIR to lower its io * intrinsics into something amenable to the V3D architecture. @@ -64,7 +62,7 @@ struct v3d_nir_lower_io_state { BITSET_WORD varyings_stored[BITSET_WORDS(V3D_MAX_ANY_STAGE_INPUTS)]; - nir_ssa_def *pos[4]; + nir_def *pos[4]; }; static void @@ -72,8 +70,8 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b, struct v3d_nir_lower_io_state *state); static void -v3d_nir_store_output(nir_builder *b, int base, nir_ssa_def *offset, - nir_ssa_def *chan) +v3d_nir_store_output(nir_builder *b, int base, nir_def *offset, + nir_def *chan) { if (offset) { /* When generating the VIR instruction, the base and the offset @@ -90,29 +88,6 @@ v3d_nir_store_output(nir_builder *b, int base, nir_ssa_def *offset, nir_store_output(b, chan, offset, .base = base, .write_mask = 0x1, .component = 0); } -/* Convert the uniform offset to bytes. If it happens to be a constant, - * constant-folding will clean up the shift for us. - */ -static void -v3d_nir_lower_uniform(struct v3d_compile *c, nir_builder *b, - nir_intrinsic_instr *intr) -{ - /* On SPIR-V/Vulkan we are already getting our offsets in - * bytes. - */ - if (c->key->environment == V3D_ENVIRONMENT_VULKAN) - return; - - b->cursor = nir_before_instr(&intr->instr); - - nir_intrinsic_set_base(intr, nir_intrinsic_base(intr) * 16); - - nir_instr_rewrite_src(&intr->instr, - &intr->src[0], - nir_src_for_ssa(nir_ishl(b, intr->src[0].ssa, - nir_imm_int(b, 4)))); -} - static int v3d_varying_slot_vpm_offset(struct v3d_compile *c, unsigned location, unsigned component) { @@ -159,14 +134,13 @@ v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b, /* If this is a geometry shader we need to emit our outputs * to the current vertex offset in the VPM. */ - nir_ssa_def *offset_reg = + nir_def *offset_reg = c->s->info.stage == MESA_SHADER_GEOMETRY ? nir_load_var(b, state->gs.output_offset_var) : NULL; int start_comp = nir_intrinsic_component(intr); unsigned location = nir_intrinsic_io_semantics(intr).location; - nir_ssa_def *src = nir_ssa_for_src(b, intr->src[0], - intr->num_components); + nir_def *src = intr->src[0].ssa; /* Save off the components of the position for the setup of VPM inputs * read by fixed function HW. */ @@ -184,8 +158,8 @@ v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b, if (location == VARYING_SLOT_LAYER) { assert(c->s->info.stage == MESA_SHADER_GEOMETRY); - nir_ssa_def *header = nir_load_var(b, state->gs.header_var); - header = nir_iand(b, header, nir_imm_int(b, 0xff00ffff)); + nir_def *header = nir_load_var(b, state->gs.header_var); + header = nir_iand_imm(b, header, 0xff00ffff); /* From the GLES 3.2 spec: * @@ -205,24 +179,26 @@ v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b, * to 0 in that case (we always allocate tile state for at * least one layer). */ - nir_ssa_def *fb_layers = nir_load_fb_layers_v3d(b, 32); - nir_ssa_def *cond = nir_ige(b, src, fb_layers); - nir_ssa_def *layer_id = + nir_def *fb_layers = nir_load_fb_layers_v3d(b, 32); + nir_def *cond = nir_ige(b, src, fb_layers); + nir_def *layer_id = nir_bcsel(b, cond, nir_imm_int(b, 0), - nir_ishl(b, src, nir_imm_int(b, 16))); + nir_ishl_imm(b, src, 16)); header = nir_ior(b, header, layer_id); nir_store_var(b, state->gs.header_var, header, 0x1); } /* Scalarize outputs if it hasn't happened already, since we want to - * schedule each VPM write individually. We can skip any outut + * schedule each VPM write individually. 
We can skip any output * components not read by the FS. */ for (int i = 0; i < intr->num_components; i++) { int vpm_offset = v3d_varying_slot_vpm_offset(c, location, start_comp + i); + if (!(nir_intrinsic_write_mask(intr) & (1 << i))) + continue; if (vpm_offset == -1) continue; @@ -261,9 +237,9 @@ v3d_nir_lower_emit_vertex(struct v3d_compile *c, nir_builder *b, { b->cursor = nir_before_instr(&instr->instr); - nir_ssa_def *header = nir_load_var(b, state->gs.header_var); - nir_ssa_def *header_offset = nir_load_var(b, state->gs.header_offset_var); - nir_ssa_def *output_offset = nir_load_var(b, state->gs.output_offset_var); + nir_def *header = nir_load_var(b, state->gs.header_var); + nir_def *header_offset = nir_load_var(b, state->gs.header_offset_var); + nir_def *output_offset = nir_load_var(b, state->gs.output_offset_var); /* Emit fixed function outputs */ v3d_nir_emit_ff_vpm_outputs(c, b, state); @@ -273,13 +249,13 @@ v3d_nir_lower_emit_vertex(struct v3d_compile *c, nir_builder *b, /* Update VPM offset for next vertex output data and header */ output_offset = - nir_iadd(b, output_offset, - nir_imm_int(b, state->gs.output_vertex_data_size)); + nir_iadd_imm(b, output_offset, + state->gs.output_vertex_data_size); - header_offset = nir_iadd(b, header_offset, nir_imm_int(b, 1)); + header_offset = nir_iadd_imm(b, header_offset, 1); /* Reset the New Primitive bit */ - header = nir_iand(b, header, nir_imm_int(b, 0xfffffffe)); + header = nir_iand_imm(b, header, 0xfffffffe); nir_store_var(b, state->gs.output_offset_var, output_offset, 0x1); nir_store_var(b, state->gs.header_offset_var, header_offset, 0x1); @@ -304,7 +280,7 @@ v3d_nir_lower_end_primitive(struct v3d_compile *c, nir_builder *b, * doesn't provide means to do that, so we need to apply the swizzle in the * vertex shader. * - * This is required at least in Vulkan to support madatory vertex attribute + * This is required at least in Vulkan to support mandatory vertex attribute * format VK_FORMAT_B8G8R8A8_UNORM. */ static void @@ -327,59 +303,6 @@ v3d_nir_lower_vertex_input(struct v3d_compile *c, nir_builder *b, nir_intrinsic_set_component(instr, (comp + 2) % 4); } -/* Sometimes the origin of gl_PointCoord is in the upper left rather than the - * lower left so we need to flip it. - * - * This is needed for Vulkan, Gallium uses lower_wpos_pntc. 
- */ -static void -v3d_nir_lower_fragment_input(struct v3d_compile *c, nir_builder *b, - nir_intrinsic_instr *intr) -{ - assert(c->s->info.stage == MESA_SHADER_FRAGMENT); - - /* Gallium uses lower_wpos_pntc */ - if (c->key->environment == V3D_ENVIRONMENT_OPENGL) - return; - - b->cursor = nir_after_instr(&intr->instr); - - int comp = nir_intrinsic_component(intr); - - nir_variable *input_var = - nir_find_variable_with_driver_location(c->s, - nir_var_shader_in, - nir_intrinsic_base(intr)); - - if (input_var && util_varying_is_point_coord(input_var->data.location, - c->fs_key->point_sprite_mask)) { - assert(intr->num_components == 1); - - nir_ssa_def *result = &intr->dest.ssa; - - switch (comp) { - case 0: - case 1: - if (!c->fs_key->is_points) - result = nir_imm_float(b, 0.0); - break; - case 2: - result = nir_imm_float(b, 0.0); - break; - case 3: - result = nir_imm_float(b, 1.0); - break; - } - if (c->fs_key->point_coord_upper_left && comp == 1) - result = nir_fsub(b, nir_imm_float(b, 1.0), result); - if (result != &intr->dest.ssa) { - nir_ssa_def_rewrite_uses_after(&intr->dest.ssa, - result, - result->parent_instr); - } - } -} - static void v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b, struct nir_instr *instr, @@ -393,12 +316,6 @@ v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b, case nir_intrinsic_load_input: if (c->s->info.stage == MESA_SHADER_VERTEX) v3d_nir_lower_vertex_input(c, b, intr); - else if (c->s->info.stage == MESA_SHADER_FRAGMENT) - v3d_nir_lower_fragment_input(c, b, intr); - break; - - case nir_intrinsic_load_uniform: - v3d_nir_lower_uniform(c, b, intr); break; case nir_intrinsic_store_output: @@ -558,16 +475,16 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b, /* If this is a geometry shader we need to emit our fixed function * outputs to the current vertex offset in the VPM. */ - nir_ssa_def *offset_reg = + nir_def *offset_reg = c->s->info.stage == MESA_SHADER_GEOMETRY ? nir_load_var(b, state->gs.output_offset_var) : NULL; for (int i = 0; i < 4; i++) { if (!state->pos[i]) - state->pos[i] = nir_ssa_undef(b, 1, 32); + state->pos[i] = nir_undef(b, 1, 32); } - nir_ssa_def *rcp_wc = nir_frcp(b, state->pos[3]); + nir_def *rcp_wc = nir_frcp(b, state->pos[3]); if (state->pos_vpm_offset != -1) { for (int i = 0; i < 4; i++) { @@ -578,8 +495,8 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b, if (state->vp_vpm_offset != -1) { for (int i = 0; i < 2; i++) { - nir_ssa_def *pos; - nir_ssa_def *scale; + nir_def *pos; + nir_def *scale; pos = state->pos[i]; if (i == 0) scale = nir_load_viewport_x_scale(b); @@ -598,14 +515,18 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b, * The correct fix for this as recommended by Broadcom * is to convert to .8 fixed-point with ffloor(). 
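
A numeric illustration of that fixed-point step, assuming the viewport scale uniform already folds in the 256.0 factor for the hardware's .8 fixed-point coordinates (that factor is set up by the drivers and is not visible in this patch):

    pos                 = 100.3 * 256.0 = 25676.8   (pixels in 1/256ths)
    v4.2:  ffloor(pos)  = 25676.0  ->  100.296875 px
    v7.x:  fround_even  = 25677.0  ->  100.300781 px

    f2i32() then converts an already-integral float exactly, so the
    rounding behaviour is decided entirely by the floor/round step below.
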
*/ - pos = nir_f2i32(b, nir_ffloor(b, pos)); - v3d_nir_store_output(b, state->vp_vpm_offset + i, - offset_reg, pos); + if (c->devinfo->ver == 42) + pos = nir_f2i32(b, nir_ffloor(b, pos)); + else + pos = nir_f2i32(b, nir_fround_even(b, pos)); + + v3d_nir_store_output(b, state->vp_vpm_offset + i, + offset_reg, pos); } } if (state->zs_vpm_offset != -1) { - nir_ssa_def *z = state->pos[2]; + nir_def *z = state->pos[2]; z = nir_fmul(b, z, nir_load_viewport_z_scale(b)); z = nir_fmul(b, z, rcp_wc); z = nir_fadd(b, z, nir_load_viewport_z_offset(b)); @@ -679,21 +600,22 @@ emit_gs_vpm_output_header_prolog(struct v3d_compile *c, nir_builder *b, * have a variable just to keep track of the number of vertices we * emitted and instead we can just compute it here from the header * offset variable by removing the one generic header slot that always - * goes at the begining of out header. + * goes at the beginning of out header. */ - nir_ssa_def *header_offset = + nir_def *header_offset = nir_load_var(b, state->gs.header_offset_var); - nir_ssa_def *vertex_count = - nir_isub(b, header_offset, nir_imm_int(b, 1)); - nir_ssa_def *header = - nir_ior(b, nir_imm_int(b, state->gs.output_header_size), - nir_ishl(b, vertex_count, - nir_imm_int(b, VERTEX_COUNT_OFFSET))); + nir_def *vertex_count = + nir_iadd_imm(b, header_offset, -1); + nir_def *header = + nir_ior_imm(b, + nir_ishl_imm(b, vertex_count, + VERTEX_COUNT_OFFSET), + state->gs.output_header_size); v3d_nir_store_output(b, 0, NULL, header); } -void +bool v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c) { struct v3d_nir_lower_io_state state = { 0 }; @@ -713,36 +635,39 @@ v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c) unreachable("Unsupported shader stage"); } - nir_foreach_function(function, s) { - if (function->impl) { - nir_builder b; - nir_builder_init(&b, function->impl); - - if (c->s->info.stage == MESA_SHADER_GEOMETRY) - emit_gs_prolog(c, &b, function->impl, &state); - - nir_foreach_block(block, function->impl) { - nir_foreach_instr_safe(instr, block) - v3d_nir_lower_io_instr(c, &b, instr, - &state); - } - - nir_block *last = nir_impl_last_block(function->impl); - b.cursor = nir_after_block(last); - if (s->info.stage == MESA_SHADER_VERTEX) { - v3d_nir_emit_ff_vpm_outputs(c, &b, &state); - } else if (s->info.stage == MESA_SHADER_GEOMETRY) { - emit_gs_vpm_output_header_prolog(c, &b, &state); - } - - nir_metadata_preserve(function->impl, - nir_metadata_block_index | - nir_metadata_dominance); + nir_foreach_function_impl(impl, s) { + nir_builder b = nir_builder_create(impl); + + if (c->s->info.stage == MESA_SHADER_GEOMETRY) + emit_gs_prolog(c, &b, impl, &state); + + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) + v3d_nir_lower_io_instr(c, &b, instr, + &state); } + + nir_block *last = nir_impl_last_block(impl); + b.cursor = nir_after_block(last); + if (s->info.stage == MESA_SHADER_VERTEX) { + v3d_nir_emit_ff_vpm_outputs(c, &b, &state); + } else if (s->info.stage == MESA_SHADER_GEOMETRY) { + emit_gs_vpm_output_header_prolog(c, &b, &state); + } + + nir_metadata_preserve(impl, + nir_metadata_block_index | + nir_metadata_dominance); } if (s->info.stage == MESA_SHADER_VERTEX || s->info.stage == MESA_SHADER_GEOMETRY) { v3d_nir_lower_io_update_output_var_base(c, &state); } + + /* It is really unlikely that we don't get progress here, and fully + * filtering when not would make code more complex, but we are still + * interested on getting this lowering going through NIR_PASS + */ + return true; } diff --git 
a/src/broadcom/compiler/v3d_nir_lower_line_smooth.c b/src/broadcom/compiler/v3d_nir_lower_line_smooth.c index 8f6e7d4e648..05b5224bc52 100644 --- a/src/broadcom/compiler/v3d_nir_lower_line_smooth.c +++ b/src/broadcom/compiler/v3d_nir_lower_line_smooth.c @@ -1,5 +1,5 @@ /* - * Copyright © 2020 Raspberry Pi + * Copyright © 2020 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -42,25 +42,23 @@ lower_line_smooth_intrinsic(struct lower_line_smooth_state *state, { b->cursor = nir_before_instr(&intr->instr); - nir_ssa_def *one = nir_imm_float(b, 1.0f); + nir_def *one = nir_imm_float(b, 1.0f); - nir_ssa_def *coverage = nir_load_var(b, state->coverage); + nir_def *coverage = nir_load_var(b, state->coverage); - nir_ssa_def *new_val = nir_fmul(b, nir_vec4(b, one, one, one, coverage), + nir_def *new_val = nir_fmul(b, nir_vec4(b, one, one, one, coverage), intr->src[0].ssa); - nir_instr_rewrite_src(&intr->instr, - &intr->src[0], - nir_src_for_ssa(new_val)); + nir_src_rewrite(&intr->src[0], new_val); } -static void +static bool lower_line_smooth_func(struct lower_line_smooth_state *state, nir_function_impl *impl) { - nir_builder b; + bool progress = false; - nir_builder_init(&b, impl); + nir_builder b = nir_builder_create(impl); nir_foreach_block(block, impl) { nir_foreach_instr_safe(instr, block) { @@ -72,58 +70,66 @@ lower_line_smooth_func(struct lower_line_smooth_state *state, if (intr->intrinsic != nir_intrinsic_store_output || nir_intrinsic_base(intr) != 0 || - intr->num_components != 4 || - !intr->src[0].is_ssa) + intr->num_components != 4) continue; lower_line_smooth_intrinsic(state, &b, intr); + progress = true; } } + + return progress; } static void initialise_coverage_var(struct lower_line_smooth_state *state, nir_function_impl *impl) { - nir_builder b; - - nir_builder_init(&b, impl); + nir_builder b = nir_builder_at(nir_before_impl(impl)); - b.cursor = nir_before_block(nir_start_block(impl)); + nir_def *line_width = nir_load_line_width(&b); - nir_ssa_def *line_width = nir_load_line_width(&b); + nir_def *real_line_width = nir_load_aa_line_width(&b); - nir_ssa_def *real_line_width = nir_load_aa_line_width(&b); - - /* The line coord varies from 0.0 to 1.0 across the width of the line */ - nir_ssa_def *line_coord = nir_load_line_coord(&b); + /* According to the PRM, the line coord varies from 0.0 to 1.0 across + * the width of the line. But actually, when a perspective projection + * is used, it is also applied to the line coords, so the values end + * up being between [min_coord, 1], based on the Wc coordinate. We + * need to re-map the values to be between [0.0, 1.0]. 
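
The re-mapping implemented just below reduces to the following, with Wc read through nir_load_fep_w_v3d:

    min_coord        = 1.0 - Wc
    normalized_coord = (line_coord - min_coord) / (1.0 - min_coord)
                     = (line_coord - (1.0 - Wc)) / Wc

    For example, with Wc = 0.5 the raw coordinate spans [0.5, 1.0], and a
    value of 0.75 re-maps to (0.75 - 0.5) / 0.5 = 0.5.
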
+ */ + nir_def *line_coord = nir_load_line_coord(&b); + nir_def *wc = nir_load_fep_w_v3d(&b, 32); + nir_def *min_coord_val = nir_fsub(&b, nir_imm_float(&b, 1.0f), wc); + nir_def *normalized_line_coord = nir_fdiv(&b, + nir_fsub(&b, line_coord, min_coord_val), + nir_fsub_imm(&b, 1.0, min_coord_val));; /* fabs(line_coord - 0.5) * real_line_width */ - nir_ssa_def *pixels_from_center = + nir_def *pixels_from_center = nir_fmul(&b, real_line_width, - nir_fabs(&b, nir_fsub(&b, line_coord, + nir_fabs(&b, nir_fsub(&b, normalized_line_coord, nir_imm_float(&b, 0.5f)))); /* 0.5 - 1/√2 * (pixels_from_center - line_width * 0.5) */ - nir_ssa_def *coverage = + nir_def *coverage = nir_fsub(&b, nir_imm_float(&b, 0.5f), nir_fmul(&b, nir_imm_float(&b, 1.0f / M_SQRT2), nir_fsub(&b, pixels_from_center, - nir_fmul(&b, - line_width, - nir_imm_float(&b, 0.5f))))); + nir_fmul_imm(&b, + line_width, + 0.5f)))); /* Discard fragments that aren’t covered at all by the line */ - nir_ssa_def *outside = nir_fge(&b, nir_imm_float(&b, 0.0f), coverage); + nir_def *outside = nir_fle_imm(&b, coverage, 0.0f); nir_discard_if(&b, outside); /* Clamp to at most 1.0. If it was less than 0.0 then the fragment will * be discarded so we don’t need to handle that. */ - nir_ssa_def *clamped = nir_fmin(&b, coverage, nir_imm_float(&b, 1.0f)); + nir_def *clamped = nir_fmin(&b, coverage, nir_imm_float(&b, 1.0f)); nir_store_var(&b, state->coverage, clamped, 0x1 /* writemask */); } @@ -140,9 +146,11 @@ make_coverage_var(nir_shader *s) return var; } -void +bool v3d_nir_lower_line_smooth(nir_shader *s) { + bool progress = false; + assert(s->info.stage == MESA_SHADER_FRAGMENT); struct lower_line_smooth_state state = { @@ -150,10 +158,20 @@ v3d_nir_lower_line_smooth(nir_shader *s) .coverage = make_coverage_var(s), }; - nir_foreach_function(function, s) { + nir_foreach_function_with_impl(function, impl, s) { if (function->is_entrypoint) - initialise_coverage_var(&state, function->impl); + initialise_coverage_var(&state, impl); + + progress |= lower_line_smooth_func(&state, impl); - lower_line_smooth_func(&state, function->impl); + if (progress) { + nir_metadata_preserve(impl, + nir_metadata_block_index | + nir_metadata_dominance); + } else { + nir_metadata_preserve(impl, nir_metadata_all); + } } + + return progress; } diff --git a/src/broadcom/compiler/v3d_nir_lower_load_store_bitsize.c b/src/broadcom/compiler/v3d_nir_lower_load_store_bitsize.c new file mode 100644 index 00000000000..0caf5dbc92c --- /dev/null +++ b/src/broadcom/compiler/v3d_nir_lower_load_store_bitsize.c @@ -0,0 +1,260 @@ +/* + * Copyright © 2021 Raspberry Pi Ltd + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "compiler/v3d_compiler.h" +#include "compiler/nir/nir_builder.h" + +/** + * The V3D TMU unit can only do 32-bit general vector access so for anything + * else we need to split vector load/store instructions to scalar. + * + * Note that a vectorization pass after this lowering may be able to + * re-vectorize some of these using 32-bit load/store instructions instead, + * which we do support. + */ + +static int +value_src(nir_intrinsic_op intrinsic) +{ + switch (intrinsic) { + case nir_intrinsic_store_ssbo: + case nir_intrinsic_store_scratch: + case nir_intrinsic_store_global_2x32: + return 0; + default: + unreachable("Unsupported intrinsic"); + } +} + +static int +offset_src(nir_intrinsic_op intrinsic) +{ + switch (intrinsic) { + case nir_intrinsic_load_uniform: + case nir_intrinsic_load_shared: + case nir_intrinsic_load_scratch: + case nir_intrinsic_load_global_2x32: + return 0; + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_ssbo: + case nir_intrinsic_store_scratch: + case nir_intrinsic_store_global_2x32: + return 1; + case nir_intrinsic_store_ssbo: + return 2; + default: + unreachable("Unsupported intrinsic"); + } +} + +static nir_intrinsic_instr * +init_scalar_intrinsic(nir_builder *b, + nir_intrinsic_instr *intr, + uint32_t component, + nir_def *offset, + uint32_t bit_size, + nir_def **scalar_offset) +{ + + nir_intrinsic_instr *new_intr = + nir_intrinsic_instr_create(b->shader, intr->intrinsic); + + nir_intrinsic_copy_const_indices(new_intr, intr); + + const int offset_units = bit_size / 8; + assert(offset_units >= 1); + + if (nir_intrinsic_has_align_mul(intr)) { + assert(nir_intrinsic_has_align_offset(intr)); + unsigned align_mul = nir_intrinsic_align_mul(intr); + unsigned align_off = nir_intrinsic_align_offset(intr); + + align_off += offset_units * component; + align_off = align_off % align_mul; + + nir_intrinsic_set_align(new_intr, align_mul, align_off); + } + + *scalar_offset = offset; + unsigned offset_adj = offset_units * component; + if (nir_intrinsic_has_base(intr)) { + nir_intrinsic_set_base( + new_intr, nir_intrinsic_base(intr) + offset_adj); + } else { + *scalar_offset = + nir_iadd(b, offset, + nir_imm_intN_t(b, offset_adj, + offset->bit_size)); + } + + new_intr->num_components = 1; + + return new_intr; +} + +static bool +lower_load_bitsize(nir_builder *b, + nir_intrinsic_instr *intr) +{ + uint32_t bit_size = intr->def.bit_size; + if (bit_size == 32) + return false; + + /* No need to split if it is already scalar */ + int num_comp = nir_intrinsic_dest_components(intr); + if (num_comp <= 1) + return false; + + b->cursor = nir_before_instr(&intr->instr); + + /* For global 2x32 we ignore Y component because it must be zero */ + unsigned offset_idx = offset_src(intr->intrinsic); + nir_def *offset = nir_trim_vector(b, intr->src[offset_idx].ssa, 1); + + /* Split vector store to multiple scalar loads */ + nir_def *dest_components[4] = { NULL }; + const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic]; + for (int component = 0; component < num_comp; component++) { + nir_def *scalar_offset; + nir_intrinsic_instr *new_intr = + init_scalar_intrinsic(b, intr, component, offset, + bit_size, &scalar_offset); + + for (unsigned i = 0; i < info->num_srcs; i++) { + if (i == 
offset_idx) { + nir_def *final_offset; + final_offset = intr->intrinsic != nir_intrinsic_load_global_2x32 ? + scalar_offset : + nir_vec2(b, scalar_offset, + nir_imm_int(b, 0)); + new_intr->src[i] = nir_src_for_ssa(final_offset); + } else { + new_intr->src[i] = intr->src[i]; + } + } + + nir_def_init(&new_intr->instr, &new_intr->def, 1, + bit_size); + dest_components[component] = &new_intr->def; + + nir_builder_instr_insert(b, &new_intr->instr); + } + + nir_def *new_dst = nir_vec(b, dest_components, num_comp); + nir_def_rewrite_uses(&intr->def, new_dst); + + nir_instr_remove(&intr->instr); + return true; +} + +static bool +lower_store_bitsize(nir_builder *b, + nir_intrinsic_instr *intr) +{ + /* No need to split if it is already scalar */ + int value_idx = value_src(intr->intrinsic); + int num_comp = nir_intrinsic_src_components(intr, value_idx); + if (num_comp <= 1) + return false; + + /* No need to split if it is 32-bit */ + if (nir_src_bit_size(intr->src[value_idx]) == 32) + return false; + + nir_def *value = intr->src[value_idx].ssa; + + b->cursor = nir_before_instr(&intr->instr); + + /* For global 2x32 we ignore Y component because it must be zero */ + unsigned offset_idx = offset_src(intr->intrinsic); + nir_def *offset = nir_trim_vector(b, intr->src[offset_idx].ssa, 1); + + /* Split vector store to multiple scalar stores */ + const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic]; + unsigned wrmask = nir_intrinsic_write_mask(intr); + while (wrmask) { + unsigned component = ffs(wrmask) - 1; + + nir_def *scalar_offset; + nir_intrinsic_instr *new_intr = + init_scalar_intrinsic(b, intr, component, offset, + value->bit_size, &scalar_offset); + + nir_intrinsic_set_write_mask(new_intr, 0x1); + + for (unsigned i = 0; i < info->num_srcs; i++) { + if (i == value_idx) { + nir_def *scalar_value = + nir_channels(b, value, 1 << component); + new_intr->src[i] = nir_src_for_ssa(scalar_value); + } else if (i == offset_idx) { + nir_def *final_offset; + final_offset = intr->intrinsic != nir_intrinsic_store_global_2x32 ? 
+ scalar_offset : + nir_vec2(b, scalar_offset, + nir_imm_int(b, 0)); + new_intr->src[i] = nir_src_for_ssa(final_offset); + } else { + new_intr->src[i] = intr->src[i]; + } + } + + nir_builder_instr_insert(b, &new_intr->instr); + + wrmask &= ~(1 << component); + } + + nir_instr_remove(&intr->instr); + return true; +} + +static bool +lower_load_store_bitsize(nir_builder *b, nir_intrinsic_instr *intr, + void *data) +{ + switch (intr->intrinsic) { + case nir_intrinsic_load_ssbo: + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_uniform: + case nir_intrinsic_load_scratch: + case nir_intrinsic_load_global_2x32: + return lower_load_bitsize(b, intr); + + case nir_intrinsic_store_ssbo: + case nir_intrinsic_store_scratch: + case nir_intrinsic_store_global_2x32: + return lower_store_bitsize(b, intr); + + default: + return false; + } +} + +bool +v3d_nir_lower_load_store_bitsize(nir_shader *s) +{ + return nir_shader_intrinsics_pass(s, lower_load_store_bitsize, + nir_metadata_block_index | + nir_metadata_dominance, + NULL); +} diff --git a/src/broadcom/compiler/v3d_nir_lower_logic_ops.c b/src/broadcom/compiler/v3d_nir_lower_logic_ops.c index 11782c7348f..4affb79a7e2 100644 --- a/src/broadcom/compiler/v3d_nir_lower_logic_ops.c +++ b/src/broadcom/compiler/v3d_nir_lower_logic_ops.c @@ -36,8 +36,8 @@ #include "v3d_compiler.h" -typedef nir_ssa_def *(*nir_pack_func)(nir_builder *b, nir_ssa_def *c); -typedef nir_ssa_def *(*nir_unpack_func)(nir_builder *b, nir_ssa_def *c); +typedef nir_def *(*nir_pack_func)(nir_builder *b, nir_def *c); +typedef nir_def *(*nir_unpack_func)(nir_builder *b, nir_def *c); static bool logicop_depends_on_dst_color(int logicop_func) @@ -53,9 +53,9 @@ logicop_depends_on_dst_color(int logicop_func) } } -static nir_ssa_def * +static nir_def * v3d_logicop(nir_builder *b, int logicop_func, - nir_ssa_def *src, nir_ssa_def *dst) + nir_def *src, nir_def *dst) { switch (logicop_func) { case PIPE_LOGICOP_CLEAR: @@ -96,8 +96,8 @@ v3d_logicop(nir_builder *b, int logicop_func, } } -static nir_ssa_def * -v3d_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz) +static nir_def * +v3d_nir_get_swizzled_channel(nir_builder *b, nir_def **srcs, int swiz) { switch (swiz) { default: @@ -116,57 +116,57 @@ v3d_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz) } } -static nir_ssa_def * -v3d_nir_swizzle_and_pack(nir_builder *b, nir_ssa_def **chans, +static nir_def * +v3d_nir_swizzle_and_pack(nir_builder *b, nir_def **chans, const uint8_t *swiz, nir_pack_func pack_func) { - nir_ssa_def *c[4]; + nir_def *c[4]; for (int i = 0; i < 4; i++) c[i] = v3d_nir_get_swizzled_channel(b, chans, swiz[i]); return pack_func(b, nir_vec4(b, c[0], c[1], c[2], c[3])); } -static nir_ssa_def * -v3d_nir_unpack_and_swizzle(nir_builder *b, nir_ssa_def *packed, +static nir_def * +v3d_nir_unpack_and_swizzle(nir_builder *b, nir_def *packed, const uint8_t *swiz, nir_unpack_func unpack_func) { - nir_ssa_def *unpacked = unpack_func(b, packed); + nir_def *unpacked = unpack_func(b, packed); - nir_ssa_def *unpacked_chans[4]; + nir_def *unpacked_chans[4]; for (int i = 0; i < 4; i++) unpacked_chans[i] = nir_channel(b, unpacked, i); - nir_ssa_def *c[4]; + nir_def *c[4]; for (int i = 0; i < 4; i++) c[i] = v3d_nir_get_swizzled_channel(b, unpacked_chans, swiz[i]); return nir_vec4(b, c[0], c[1], c[2], c[3]); } -static nir_ssa_def * -pack_unorm_rgb10a2(nir_builder *b, nir_ssa_def *c) +static nir_def * +pack_unorm_rgb10a2(nir_builder *b, nir_def *c) { static const unsigned bits[4] = { 10, 10, 10, 2 }; - 
nir_ssa_def *unorm = nir_format_float_to_unorm(b, c, bits); + nir_def *unorm = nir_format_float_to_unorm(b, c, bits); - nir_ssa_def *chans[4]; + nir_def *chans[4]; for (int i = 0; i < 4; i++) chans[i] = nir_channel(b, unorm, i); - nir_ssa_def *result = nir_mov(b, chans[0]); + nir_def *result = nir_mov(b, chans[0]); int offset = bits[0]; for (int i = 1; i < 4; i++) { - nir_ssa_def *shifted_chan = - nir_ishl(b, chans[i], nir_imm_int(b, offset)); + nir_def *shifted_chan = + nir_ishl_imm(b, chans[i], offset); result = nir_ior(b, result, shifted_chan); offset += bits[i]; } return result; } -static nir_ssa_def * -unpack_unorm_rgb10a2(nir_builder *b, nir_ssa_def *c) +static nir_def * +unpack_unorm_rgb10a2(nir_builder *b, nir_def *c) { static const unsigned bits[4] = { 10, 10, 10, 2 }; const unsigned masks[4] = { BITFIELD_MASK(bits[0]), @@ -174,11 +174,11 @@ unpack_unorm_rgb10a2(nir_builder *b, nir_ssa_def *c) BITFIELD_MASK(bits[2]), BITFIELD_MASK(bits[3]) }; - nir_ssa_def *chans[4]; + nir_def *chans[4]; for (int i = 0; i < 4; i++) { - nir_ssa_def *unorm = nir_iand(b, c, nir_imm_int(b, masks[i])); + nir_def *unorm = nir_iand_imm(b, c, masks[i]); chans[i] = nir_format_unorm_to_float(b, unorm, &bits[i]); - c = nir_ushr(b, c, nir_imm_int(b, bits[i])); + c = nir_ushr_imm(b, c, bits[i]); } return nir_vec4(b, chans[0], chans[1], chans[2], chans[3]); @@ -201,13 +201,13 @@ v3d_get_format_swizzle_for_rt(struct v3d_compile *c, int rt) } } -static nir_ssa_def * +static nir_def * v3d_nir_get_tlb_color(nir_builder *b, struct v3d_compile *c, int rt, int sample) { uint32_t num_components = util_format_get_nr_components(c->fs_key->color_fmt[rt].format); - nir_ssa_def *color[4]; + nir_def *color[4]; for (int i = 0; i < 4; i++) { if (i < num_components) { color[i] = @@ -222,71 +222,68 @@ v3d_nir_get_tlb_color(nir_builder *b, struct v3d_compile *c, int rt, int sample) return nir_vec4(b, color[0], color[1], color[2], color[3]); } -static nir_ssa_def * +static nir_def * v3d_emit_logic_op_raw(struct v3d_compile *c, nir_builder *b, - nir_ssa_def **src_chans, nir_ssa_def **dst_chans, + nir_def **src_chans, nir_def **dst_chans, int rt, int sample) { const uint8_t *fmt_swz = v3d_get_format_swizzle_for_rt(c, rt); - nir_ssa_def *op_res[4]; + nir_def *op_res[4]; for (int i = 0; i < 4; i++) { - nir_ssa_def *src = src_chans[i]; - nir_ssa_def *dst = + nir_def *src = src_chans[i]; + nir_def *dst = v3d_nir_get_swizzled_channel(b, dst_chans, fmt_swz[i]); op_res[i] = v3d_logicop(b, c->fs_key->logicop_func, src, dst); - /* In Vulkan we configure our integer RTs to clamp, so we need - * to ignore result bits that don't fit in the destination RT - * component size. + /* We configure our integer RTs to clamp, so we need to ignore + * result bits that don't fit in the destination RT component + * size. 
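
For example, with an RGB10A2 integer render target (10/10/10/2 bits per component) the masking just below works out to these illustrative values:

    red/green/blue: bits = 10  ->  mask = (1u << 10) - 1 = 0x3ff
    alpha:          bits =  2  ->  mask = (1u <<  2) - 1 = 0x3

    A logic-op result of 0xffffffff in the alpha channel is therefore
    reduced to 0x3, which is all the clamped render target can hold.
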
*/ - if (c->key->environment == V3D_ENVIRONMENT_VULKAN) { - uint32_t bits = - util_format_get_component_bits( - c->fs_key->color_fmt[rt].format, - UTIL_FORMAT_COLORSPACE_RGB, i); - if (bits > 0 && bits < 32) { - nir_ssa_def *mask = - nir_imm_int(b, (1u << bits) - 1); - op_res[i] = nir_iand(b, op_res[i], mask); - } + uint32_t bits = + util_format_get_component_bits( + c->fs_key->color_fmt[rt].format, + UTIL_FORMAT_COLORSPACE_RGB, i); + if (bits > 0 && bits < 32) { + op_res[i] = + nir_iand_imm(b, op_res[i], (1u << bits) - 1); } } - nir_ssa_def *r[4]; + nir_def *r[4]; for (int i = 0; i < 4; i++) r[i] = v3d_nir_get_swizzled_channel(b, op_res, fmt_swz[i]); return nir_vec4(b, r[0], r[1], r[2], r[3]); } -static nir_ssa_def * +static nir_def * v3d_emit_logic_op_unorm(struct v3d_compile *c, nir_builder *b, - nir_ssa_def **src_chans, nir_ssa_def **dst_chans, + nir_def **src_chans, nir_def **dst_chans, int rt, int sample, nir_pack_func pack_func, nir_unpack_func unpack_func) { static const uint8_t src_swz[4] = { 0, 1, 2, 3 }; - nir_ssa_def *packed_src = + nir_def *packed_src = v3d_nir_swizzle_and_pack(b, src_chans, src_swz, pack_func); const uint8_t *fmt_swz = v3d_get_format_swizzle_for_rt(c, rt); - nir_ssa_def *packed_dst = + nir_def *packed_dst = v3d_nir_swizzle_and_pack(b, dst_chans, fmt_swz, pack_func); - nir_ssa_def *packed_result = + nir_def *packed_result = v3d_logicop(b, c->fs_key->logicop_func, packed_src, packed_dst); return v3d_nir_unpack_and_swizzle(b, packed_result, fmt_swz, unpack_func); } -static nir_ssa_def * +static nir_def * v3d_nir_emit_logic_op(struct v3d_compile *c, nir_builder *b, - nir_ssa_def *src, int rt, int sample) + nir_def *src, int rt, int sample) { - nir_ssa_def *dst = v3d_nir_get_tlb_color(b, c, rt, sample); + nir_def *dst = v3d_nir_get_tlb_color(b, c, rt, sample); - nir_ssa_def *src_chans[4], *dst_chans[4]; + nir_def *src_chans[4], *dst_chans[4]; for (unsigned i = 0; i < 4; i++) { src_chans[i] = nir_channel(b, src, i); dst_chans[i] = nir_channel(b, dst, i); @@ -309,7 +306,7 @@ v3d_nir_emit_logic_op(struct v3d_compile *c, nir_builder *b, static void v3d_emit_ms_output(nir_builder *b, - nir_ssa_def *color, nir_src *offset, + nir_def *color, nir_src *offset, nir_alu_type type, int rt, int sample) { nir_store_tlb_sample_color_v3d(b, color, nir_imm_int(b, rt), .base = sample, .component = 0, .src_type = type); @@ -321,7 +318,7 @@ v3d_nir_lower_logic_op_instr(struct v3d_compile *c, nir_intrinsic_instr *intr, int rt) { - nir_ssa_def *frag_color = intr->src[0].ssa; + nir_def *frag_color = intr->src[0].ssa; const int logic_op = c->fs_key->logicop_func; @@ -331,7 +328,7 @@ v3d_nir_lower_logic_op_instr(struct v3d_compile *c, nir_src *offset = &intr->src[1]; nir_alu_type type = nir_intrinsic_src_type(intr); for (int i = 0; i < V3D_MAX_SAMPLES; i++) { - nir_ssa_def *sample = + nir_def *sample = v3d_nir_emit_logic_op(c, b, frag_color, rt, i); v3d_emit_ms_output(b, sample, offset, type, rt, i); @@ -339,11 +336,10 @@ v3d_nir_lower_logic_op_instr(struct v3d_compile *c, nir_instr_remove(&intr->instr); } else { - nir_ssa_def *result = + nir_def *result = v3d_nir_emit_logic_op(c, b, frag_color, rt, 0); - nir_instr_rewrite_src(&intr->instr, &intr->src[0], - nir_src_for_ssa(result)); + nir_src_rewrite(&intr->src[0], result); intr->num_components = result->num_components; } } @@ -351,6 +347,8 @@ v3d_nir_lower_logic_op_instr(struct v3d_compile *c, static bool v3d_nir_lower_logic_ops_block(nir_block *block, struct v3d_compile *c) { + bool progress = false; + nir_foreach_instr_safe(instr, 
block) { if (instr->type != nir_instr_type_intrinsic) continue; @@ -384,35 +382,40 @@ v3d_nir_lower_logic_ops_block(nir_block *block, struct v3d_compile *c) continue; } - nir_function_impl *impl = - nir_cf_node_get_function(&block->cf_node); - nir_builder b; - nir_builder_init(&b, impl); - b.cursor = nir_before_instr(&intr->instr); + nir_builder b = nir_builder_at(nir_before_instr(&intr->instr)); v3d_nir_lower_logic_op_instr(c, &b, intr, rt); + + progress = true; } } - return true; + return progress; } -void +bool v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c) { + bool progress = false; + /* Nothing to do if logic op is 'copy src to dst' or if logic ops are * disabled (we set the logic op to copy in that case). */ if (c->fs_key->logicop_func == PIPE_LOGICOP_COPY) - return; + return false; - nir_foreach_function(function, s) { - if (function->impl) { - nir_foreach_block(block, function->impl) - v3d_nir_lower_logic_ops_block(block, c); + nir_foreach_function_impl(impl, s) { + nir_foreach_block(block, impl) + progress |= v3d_nir_lower_logic_ops_block(block, c); - nir_metadata_preserve(function->impl, + if (progress) { + nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance); + } else { + nir_metadata_preserve(impl, + nir_metadata_all); } } + + return progress; } diff --git a/src/broadcom/compiler/v3d_nir_lower_robust_buffer_access.c b/src/broadcom/compiler/v3d_nir_lower_robust_buffer_access.c deleted file mode 100644 index 40f1cc23b1a..00000000000 --- a/src/broadcom/compiler/v3d_nir_lower_robust_buffer_access.c +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Copyright © 2020 Raspberry Pi - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- */ - -#include "compiler/v3d_compiler.h" -#include "compiler/nir/nir_builder.h" - -static void -rewrite_offset(nir_builder *b, - nir_intrinsic_instr *instr, - uint32_t buffer_idx, - uint32_t offset_src, - nir_intrinsic_op buffer_size_op) -{ - b->cursor = nir_before_instr(&instr->instr); - - /* Get size of the buffer */ - nir_intrinsic_instr *size = - nir_intrinsic_instr_create(b->shader, buffer_size_op); - size->src[0] = nir_src_for_ssa(nir_imm_int(b, buffer_idx)); - nir_ssa_dest_init(&size->instr, &size->dest, 1, 32, NULL); - nir_builder_instr_insert(b, &size->instr); - - /* All out TMU accesses are 32-bit aligned */ - nir_ssa_def *aligned_buffer_size = - nir_iand(b, &size->dest.ssa, nir_imm_int(b, 0xfffffffc)); - - /* Rewrite offset */ - nir_ssa_def *offset = - nir_umin(b, instr->src[offset_src].ssa, aligned_buffer_size); - nir_instr_rewrite_src(&instr->instr, &instr->src[offset_src], - nir_src_for_ssa(offset)); -} - -static void -lower_load(struct v3d_compile *c, - nir_builder *b, - nir_intrinsic_instr *instr) -{ - uint32_t index = nir_src_comp_as_uint(instr->src[0], 0); - - nir_intrinsic_op op; - if (instr->intrinsic == nir_intrinsic_load_ubo) { - op = nir_intrinsic_get_ubo_size; - if (c->key->environment == V3D_ENVIRONMENT_VULKAN) - index--; - } else { - op = nir_intrinsic_get_ssbo_size; - } - - rewrite_offset(b, instr, index, 1, op); -} - -static void -lower_store(struct v3d_compile *c, - nir_builder *b, - nir_intrinsic_instr *instr) -{ - uint32_t index = nir_src_comp_as_uint(instr->src[1], 0); - rewrite_offset(b, instr, index, 2, nir_intrinsic_get_ssbo_size); -} - -static void -lower_atomic(struct v3d_compile *c, - nir_builder *b, - nir_intrinsic_instr *instr) -{ - uint32_t index = nir_src_comp_as_uint(instr->src[0], 0); - rewrite_offset(b, instr, index, 1, nir_intrinsic_get_ssbo_size); -} - -static void -lower_shared(struct v3d_compile *c, - nir_builder *b, - nir_intrinsic_instr *instr) -{ - b->cursor = nir_before_instr(&instr->instr); - nir_ssa_def *aligned_size = - nir_imm_int(b, c->s->info.shared_size & 0xfffffffc); - nir_ssa_def *offset = nir_umin(b, instr->src[0].ssa, aligned_size); - nir_instr_rewrite_src(&instr->instr, &instr->src[0], - nir_src_for_ssa(offset)); -} - -static void -lower_instr(struct v3d_compile *c, nir_builder *b, struct nir_instr *instr) -{ - if (instr->type != nir_instr_type_intrinsic) - return; - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - - switch (intr->intrinsic) { - case nir_intrinsic_load_ubo: - case nir_intrinsic_load_ssbo: - lower_load(c, b, intr); - break; - case nir_intrinsic_store_ssbo: - lower_store(c, b, intr); - break; - case nir_intrinsic_ssbo_atomic_add: - case nir_intrinsic_ssbo_atomic_imin: - case nir_intrinsic_ssbo_atomic_umin: - case nir_intrinsic_ssbo_atomic_imax: - case nir_intrinsic_ssbo_atomic_umax: - case nir_intrinsic_ssbo_atomic_and: - case nir_intrinsic_ssbo_atomic_or: - case nir_intrinsic_ssbo_atomic_xor: - case nir_intrinsic_ssbo_atomic_exchange: - case nir_intrinsic_ssbo_atomic_comp_swap: - lower_atomic(c, b, intr); - break; - case nir_intrinsic_load_shared: - case nir_intrinsic_shared_atomic_add: - case nir_intrinsic_shared_atomic_imin: - case nir_intrinsic_shared_atomic_umin: - case nir_intrinsic_shared_atomic_imax: - case nir_intrinsic_shared_atomic_umax: - case nir_intrinsic_shared_atomic_and: - case nir_intrinsic_shared_atomic_or: - case nir_intrinsic_shared_atomic_xor: - case nir_intrinsic_shared_atomic_exchange: - case nir_intrinsic_shared_atomic_comp_swap: - lower_shared(c, b, intr); - break; - 
default: - break; - } -} - -void -v3d_nir_lower_robust_buffer_access(nir_shader *s, struct v3d_compile *c) -{ - nir_foreach_function(function, s) { - if (function->impl) { - nir_builder b; - nir_builder_init(&b, function->impl); - - nir_foreach_block(block, function->impl) { - nir_foreach_instr_safe(instr, block) - lower_instr(c, &b, instr); - } - - nir_metadata_preserve(function->impl, - nir_metadata_block_index | - nir_metadata_dominance); - } - } -} diff --git a/src/broadcom/compiler/v3d_nir_lower_scratch.c b/src/broadcom/compiler/v3d_nir_lower_scratch.c index 893b6f6ae28..93ed1bb6e26 100644 --- a/src/broadcom/compiler/v3d_nir_lower_scratch.c +++ b/src/broadcom/compiler/v3d_nir_lower_scratch.c @@ -34,11 +34,11 @@ * writemasks in the process. */ -static nir_ssa_def * +static nir_def * v3d_nir_scratch_offset(nir_builder *b, nir_intrinsic_instr *instr) { bool is_store = instr->intrinsic == nir_intrinsic_store_scratch; - nir_ssa_def *offset = nir_ssa_for_src(b, instr->src[is_store ? 1 : 0], 1); + nir_def *offset = instr->src[is_store ? 1 : 0].ssa; assert(nir_intrinsic_align_mul(instr) >= 4); assert(nir_intrinsic_align_offset(instr) == 0); @@ -55,18 +55,18 @@ v3d_nir_lower_load_scratch(nir_builder *b, nir_intrinsic_instr *instr) { b->cursor = nir_before_instr(&instr->instr); - nir_ssa_def *offset = v3d_nir_scratch_offset(b,instr); + nir_def *offset = v3d_nir_scratch_offset(b,instr); - nir_ssa_def *chans[NIR_MAX_VEC_COMPONENTS]; + nir_def *chans[NIR_MAX_VEC_COMPONENTS]; for (int i = 0; i < instr->num_components; i++) { - nir_ssa_def *chan_offset = + nir_def *chan_offset = nir_iadd_imm(b, offset, V3D_CHANNELS * i * 4); nir_intrinsic_instr *chan_instr = nir_intrinsic_instr_create(b->shader, instr->intrinsic); chan_instr->num_components = 1; - nir_ssa_dest_init(&chan_instr->instr, &chan_instr->dest, 1, - instr->dest.ssa.bit_size, NULL); + nir_def_init(&chan_instr->instr, &chan_instr->def, 1, + instr->def.bit_size); chan_instr->src[0] = nir_src_for_ssa(chan_offset); @@ -74,11 +74,11 @@ v3d_nir_lower_load_scratch(nir_builder *b, nir_intrinsic_instr *instr) nir_builder_instr_insert(b, &chan_instr->instr); - chans[i] = &chan_instr->dest.ssa; + chans[i] = &chan_instr->def; } - nir_ssa_def *result = nir_vec(b, chans, instr->num_components); - nir_ssa_def_rewrite_uses(&instr->dest.ssa, result); + nir_def *result = nir_vec(b, chans, instr->num_components); + nir_def_rewrite_uses(&instr->def, result); nir_instr_remove(&instr->instr); } @@ -87,15 +87,14 @@ v3d_nir_lower_store_scratch(nir_builder *b, nir_intrinsic_instr *instr) { b->cursor = nir_before_instr(&instr->instr); - nir_ssa_def *offset = v3d_nir_scratch_offset(b, instr); - nir_ssa_def *value = nir_ssa_for_src(b, instr->src[0], - instr->num_components); + nir_def *offset = v3d_nir_scratch_offset(b, instr); + nir_def *value = instr->src[0].ssa; for (int i = 0; i < instr->num_components; i++) { if (!(nir_intrinsic_write_mask(instr) & (1 << i))) continue; - nir_ssa_def *chan_offset = + nir_def *chan_offset = nir_iadd_imm(b, offset, V3D_CHANNELS * i * 4); nir_intrinsic_instr *chan_instr = @@ -115,39 +114,29 @@ v3d_nir_lower_store_scratch(nir_builder *b, nir_intrinsic_instr *instr) nir_instr_remove(&instr->instr); } -void -v3d_nir_lower_scratch(nir_shader *s) +static bool +v3d_nir_lower_scratch_cb(nir_builder *b, + nir_intrinsic_instr *intr, + void *_state) { - nir_foreach_function(function, s) { - if (!function->impl) - continue; - - nir_builder b; - nir_builder_init(&b, function->impl); - - nir_foreach_block(block, function->impl) { - 
nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intr = - nir_instr_as_intrinsic(instr); - - switch (intr->intrinsic) { - case nir_intrinsic_load_scratch: - v3d_nir_lower_load_scratch(&b, intr); - break; - case nir_intrinsic_store_scratch: - v3d_nir_lower_store_scratch(&b, intr); - break; - default: - break; - } - } - } - - nir_metadata_preserve(function->impl, - nir_metadata_block_index | - nir_metadata_dominance); + switch (intr->intrinsic) { + case nir_intrinsic_load_scratch: + v3d_nir_lower_load_scratch(b, intr); + return true; + case nir_intrinsic_store_scratch: + v3d_nir_lower_store_scratch(b, intr); + return true; + default: + return false; } + + return false; +} + +bool +v3d_nir_lower_scratch(nir_shader *s) +{ + return nir_shader_intrinsics_pass(s, v3d_nir_lower_scratch_cb, + nir_metadata_block_index | + nir_metadata_dominance, NULL); } diff --git a/src/broadcom/compiler/v3d_nir_lower_txf_ms.c b/src/broadcom/compiler/v3d_nir_lower_txf_ms.c index d79969374d5..e78c3cb9e3e 100644 --- a/src/broadcom/compiler/v3d_nir_lower_txf_ms.c +++ b/src/broadcom/compiler/v3d_nir_lower_txf_ms.c @@ -32,25 +32,21 @@ * 2x2 quad. */ -#define V3D_MAX_SAMPLES 4 - -static nir_ssa_def * +static nir_def * v3d_nir_lower_txf_ms_instr(nir_builder *b, nir_instr *in_instr, void *data) { nir_tex_instr *instr = nir_instr_as_tex(in_instr); b->cursor = nir_before_instr(&instr->instr); - int coord_index = nir_tex_instr_src_index(instr, nir_tex_src_coord); - int sample_index = nir_tex_instr_src_index(instr, nir_tex_src_ms_index); - nir_ssa_def *coord = instr->src[coord_index].src.ssa; - nir_ssa_def *sample = instr->src[sample_index].src.ssa; + nir_def *coord = nir_steal_tex_src(instr, nir_tex_src_coord); + nir_def *sample = nir_steal_tex_src(instr, nir_tex_src_ms_index); - nir_ssa_def *one = nir_imm_int(b, 1); - nir_ssa_def *x = nir_iadd(b, + nir_def *one = nir_imm_int(b, 1); + nir_def *x = nir_iadd(b, nir_ishl(b, nir_channel(b, coord, 0), one), nir_iand(b, sample, one)); - nir_ssa_def *y = nir_iadd(b, + nir_def *y = nir_iadd(b, nir_ishl(b, nir_channel(b, coord, 1), one), nir_iand(b, nir_ushr(b, sample, one), one)); if (instr->is_array) @@ -58,10 +54,7 @@ v3d_nir_lower_txf_ms_instr(nir_builder *b, nir_instr *in_instr, void *data) else coord = nir_vec2(b, x, y); - nir_instr_rewrite_src(&instr->instr, - &instr->src[nir_tex_src_coord].src, - nir_src_for_ssa(coord)); - nir_tex_instr_remove_src(instr, sample_index); + nir_tex_instr_add_src(instr, nir_tex_src_coord, coord); instr->op = nir_texop_txf; instr->sampler_dim = GLSL_SAMPLER_DIM_2D; @@ -75,11 +68,11 @@ v3d_nir_lower_txf_ms_filter(const nir_instr *instr, const void *data) nir_instr_as_tex(instr)->op == nir_texop_txf_ms); } -void -v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c) +bool +v3d_nir_lower_txf_ms(nir_shader *s) { - nir_shader_lower_instructions(s, - v3d_nir_lower_txf_ms_filter, - v3d_nir_lower_txf_ms_instr, - NULL); + return nir_shader_lower_instructions(s, + v3d_nir_lower_txf_ms_filter, + v3d_nir_lower_txf_ms_instr, + NULL); } diff --git a/src/broadcom/compiler/v3d_packing.c b/src/broadcom/compiler/v3d_packing.c new file mode 100644 index 00000000000..46643edd5e6 --- /dev/null +++ b/src/broadcom/compiler/v3d_packing.c @@ -0,0 +1,50 @@ +/* + * Copyright © 2023 Raspberry Pi Ltd + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without 
restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "v3d_compiler.h" + +#define __gen_user_data void +#define __gen_address_type uint32_t +#define __gen_address_offset(reloc) (*reloc) +#define __gen_emit_reloc(cl, reloc) +#define __gen_unpack_address(cl, s, e) (__gen_unpack_uint(cl, s, e) << (31 - (e - s))) +#include "cle/v3d_packet_v42_pack.h" + + +/* Typically, this method would wrap calling version-specific variant of this + * method, but as TMU_CONFIG_PARAMETER_1 doesn't change between v42 and v71, + * we can assume that p1_packed is the same struct, and use the same method. + */ +void +v3d_pack_unnormalized_coordinates(struct v3d_device_info *devinfo, + uint32_t *p1_packed, + bool unnormalized_coordinates) +{ + assert(devinfo->ver == 71 || devinfo->ver == 42); + + struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked; + V3D42_TMU_CONFIG_PARAMETER_1_unpack((uint8_t *)p1_packed, &p1_unpacked); + p1_unpacked.unnormalized_coordinates = unnormalized_coordinates; + V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL, (uint8_t *)p1_packed, + &p1_unpacked); +} diff --git a/src/broadcom/compiler/v3d40_tex.c b/src/broadcom/compiler/v3d_tex.c index 7bebfe95552..643c73c4e58 100644 --- a/src/broadcom/compiler/v3d40_tex.c +++ b/src/broadcom/compiler/v3d_tex.c @@ -28,27 +28,29 @@ #define __gen_address_type uint32_t #define __gen_address_offset(reloc) (*reloc) #define __gen_emit_reloc(cl, reloc) -#include "cle/v3d_packet_v41_pack.h" +#include "cle/v3d_packet_v42_pack.h" -static inline void +static inline struct qinst * vir_TMU_WRITE(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val) { /* XXX perf: We should figure out how to merge ALU operations * producing the val with this MOV, when possible. 
*/ - vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val); + return vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val); } -static inline void +static inline struct qinst * vir_TMU_WRITE_or_count(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val, uint32_t *tmu_writes) { - if (tmu_writes) + if (tmu_writes) { (*tmu_writes)++; - else - vir_TMU_WRITE(c, waddr, val); + return NULL; + } else { + return vir_TMU_WRITE(c, waddr, val); + } } static void @@ -59,11 +61,11 @@ vir_WRTMUC(struct v3d_compile *c, enum quniform_contents contents, uint32_t data inst->uniform = vir_get_uniform_index(c, contents, data); } -static const struct V3D41_TMU_CONFIG_PARAMETER_1 p1_unpacked_default = { +static const struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked_default = { .per_pixel_mask_enable = true, }; -static const struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked_default = { +static const struct V3D42_TMU_CONFIG_PARAMETER_2 p2_unpacked_default = { .op = V3D_TMU_OP_REGULAR, }; @@ -84,7 +86,7 @@ handle_tex_src(struct v3d_compile *c, nir_tex_instr *instr, unsigned src_idx, unsigned non_array_components, - struct V3D41_TMU_CONFIG_PARAMETER_2 *p2_unpacked, + struct V3D42_TMU_CONFIG_PARAMETER_2 *p2_unpacked, struct qreg *s_out, unsigned *tmu_writes) { @@ -199,7 +201,7 @@ handle_tex_src(struct v3d_compile *c, static void vir_tex_handle_srcs(struct v3d_compile *c, nir_tex_instr *instr, - struct V3D41_TMU_CONFIG_PARAMETER_2 *p2_unpacked, + struct V3D42_TMU_CONFIG_PARAMETER_2 *p2_unpacked, struct qreg *s, unsigned *tmu_writes) { @@ -222,31 +224,62 @@ get_required_tex_tmu_writes(struct v3d_compile *c, nir_tex_instr *instr) } void -v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) +v3d_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) { - assert(instr->op != nir_texop_lod || c->devinfo->ver >= 42); - unsigned texture_idx = instr->texture_index; - unsigned sampler_idx = instr->sampler_index; - struct V3D41_TMU_CONFIG_PARAMETER_0 p0_unpacked = { + /* For instructions that don't have a sampler (i.e. txf) we bind + * default sampler state via the backend_flags to handle precision. + */ + unsigned sampler_idx = nir_tex_instr_need_sampler(instr) ? + instr->sampler_index : instr->backend_flags; + + /* Even if the texture operation doesn't need a sampler by + * itself, we still need to add the sampler configuration + * parameter if the output is 32 bit + */ + assert(sampler_idx < c->key->num_samplers_used); + bool output_type_32_bit = + c->key->sampler[sampler_idx].return_size == 32; + + struct V3D42_TMU_CONFIG_PARAMETER_0 p0_unpacked = { }; /* Limit the number of channels returned to both how many the NIR * instruction writes and how many the instruction could produce. */ - p0_unpacked.return_words_of_texture_data = - instr->dest.is_ssa ? - nir_ssa_def_components_read(&instr->dest.ssa) : - (1 << instr->dest.reg.reg->num_components) - 1; + nir_intrinsic_instr *store = nir_store_reg_for_def(&instr->def); + if (store == NULL) { + p0_unpacked.return_words_of_texture_data = + nir_def_components_read(&instr->def); + } else { + nir_def *reg = store->src[1].ssa; + nir_intrinsic_instr *decl = nir_reg_get_decl(reg); + unsigned reg_num_components = + nir_intrinsic_num_components(decl); + + /* For the non-ssa case we don't have a full equivalent to + * nir_def_components_read. This is a problem for the 16 + * bit case. nir_lower_tex will not change the destination as + * nir_tex_instr_dest_size will still return 4. 
The driver is + * just expected to not store on other channels, so we + * manually ensure that here. + */ + uint32_t num_components = output_type_32_bit ? + MIN2(reg_num_components, 4) : + MIN2(reg_num_components, 2); + + p0_unpacked.return_words_of_texture_data = (1 << num_components) - 1; + } assert(p0_unpacked.return_words_of_texture_data != 0); - struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked = { + struct V3D42_TMU_CONFIG_PARAMETER_2 p2_unpacked = { .op = V3D_TMU_OP_REGULAR, .gather_mode = instr->op == nir_texop_tg4, .gather_component = instr->component, .coefficient_mode = instr->op == nir_texop_txd, - .disable_autolod = instr->op == nir_texop_tg4 + .disable_autolod = instr->op == nir_texop_tg4, + .lod_query = instr->op == nir_texop_lod, }; const unsigned tmu_writes = get_required_tex_tmu_writes(c, instr); @@ -270,22 +303,15 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) vir_tex_handle_srcs(c, instr, &p2_unpacked, &s, NULL); uint32_t p0_packed; - V3D41_TMU_CONFIG_PARAMETER_0_pack(NULL, + V3D42_TMU_CONFIG_PARAMETER_0_pack(NULL, (uint8_t *)&p0_packed, &p0_unpacked); uint32_t p2_packed; - V3D41_TMU_CONFIG_PARAMETER_2_pack(NULL, + V3D42_TMU_CONFIG_PARAMETER_2_pack(NULL, (uint8_t *)&p2_packed, &p2_unpacked); - /* We manually set the LOD Query bit (see - * V3D42_TMU_CONFIG_PARAMETER_2) as right now is the only V42 specific - * feature over V41 we are using - */ - if (instr->op == nir_texop_lod) - p2_packed |= 1UL << 24; - /* Load texture_idx number into the high bits of the texture address field, * which will be be used by the driver to decide which texture to put * in the actual address field. @@ -294,14 +320,6 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) vir_WRTMUC(c, QUNIFORM_TMU_CONFIG_P0, p0_packed); - /* Even if the texture operation doesn't need a sampler by - * itself, we still need to add the sampler configuration - * parameter if the output is 32 bit - */ - bool output_type_32_bit = - c->key->sampler[sampler_idx].return_size == 32 && - !instr->is_shadow; - /* p1 is optional, but we can skip it only if p2 can be skipped too */ bool needs_p2_config = (instr->op == nir_texop_lod || @@ -313,7 +331,7 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) output_type_32_bit; if (non_default_p1_config) { - struct V3D41_TMU_CONFIG_PARAMETER_1 p1_unpacked = { + struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked = { .output_type_32_bit = output_type_32_bit, .unnormalized_coordinates = (instr->sampler_dim == @@ -330,7 +348,7 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) p0_unpacked.return_words_of_texture_data < (1 << 2)); uint32_t p1_packed; - V3D41_TMU_CONFIG_PARAMETER_1_pack(NULL, + V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL, (uint8_t *)&p1_packed, &p1_unpacked); @@ -358,7 +376,7 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) * address */ uint32_t p1_packed_default; - V3D41_TMU_CONFIG_PARAMETER_1_pack(NULL, + V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL, (uint8_t *)&p1_packed_default, &p1_unpacked_default); vir_WRTMUC(c, QUNIFORM_CONSTANT, p1_packed_default); @@ -368,48 +386,54 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed); /* Emit retiring TMU write */ + struct qinst *retiring; if (instr->op == nir_texop_txf) { assert(instr->sampler_dim != GLSL_SAMPLER_DIM_CUBE); - vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s); + retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s); } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { - vir_TMU_WRITE(c, 
V3D_QPU_WADDR_TMUSCM, s); + retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s); } else if (instr->op == nir_texop_txl) { - vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s); + retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s); } else { - vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s); + retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s); } - ntq_add_pending_tmu_flush(c, &instr->dest, + retiring->ldtmu_count = p0_unpacked.return_words_of_texture_data; + ntq_add_pending_tmu_flush(c, &instr->def, p0_unpacked.return_words_of_texture_data); } static uint32_t -v3d40_image_load_store_tmu_op(nir_intrinsic_instr *instr) +v3d_image_atomic_tmu_op(nir_intrinsic_instr *instr) +{ + nir_atomic_op atomic_op = nir_intrinsic_atomic_op(instr); + switch (atomic_op) { + case nir_atomic_op_iadd: return v3d_get_op_for_atomic_add(instr, 3); + case nir_atomic_op_imin: return V3D_TMU_OP_WRITE_SMIN; + case nir_atomic_op_umin: return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR; + case nir_atomic_op_imax: return V3D_TMU_OP_WRITE_SMAX; + case nir_atomic_op_umax: return V3D_TMU_OP_WRITE_UMAX; + case nir_atomic_op_iand: return V3D_TMU_OP_WRITE_AND_READ_INC; + case nir_atomic_op_ior: return V3D_TMU_OP_WRITE_OR_READ_DEC; + case nir_atomic_op_ixor: return V3D_TMU_OP_WRITE_XOR_READ_NOT; + case nir_atomic_op_xchg: return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH; + case nir_atomic_op_cmpxchg: return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH; + default: unreachable("unknown atomic op"); + } +} + +static uint32_t +v3d_image_load_store_tmu_op(nir_intrinsic_instr *instr) { switch (instr->intrinsic) { case nir_intrinsic_image_load: case nir_intrinsic_image_store: return V3D_TMU_OP_REGULAR; - case nir_intrinsic_image_atomic_add: - return v3d_get_op_for_atomic_add(instr, 3); - case nir_intrinsic_image_atomic_imin: - return V3D_TMU_OP_WRITE_SMIN; - case nir_intrinsic_image_atomic_umin: - return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR; - case nir_intrinsic_image_atomic_imax: - return V3D_TMU_OP_WRITE_SMAX; - case nir_intrinsic_image_atomic_umax: - return V3D_TMU_OP_WRITE_UMAX; - case nir_intrinsic_image_atomic_and: - return V3D_TMU_OP_WRITE_AND_READ_INC; - case nir_intrinsic_image_atomic_or: - return V3D_TMU_OP_WRITE_OR_READ_DEC; - case nir_intrinsic_image_atomic_xor: - return V3D_TMU_OP_WRITE_XOR_READ_NOT; - case nir_intrinsic_image_atomic_exchange: - return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH; - case nir_intrinsic_image_atomic_comp_swap: - return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH; + + case nir_intrinsic_image_atomic: + case nir_intrinsic_image_atomic_swap: + return v3d_image_atomic_tmu_op(instr); + default: unreachable("unknown image intrinsic"); }; @@ -427,7 +451,7 @@ v3d40_image_load_store_tmu_op(nir_intrinsic_instr *instr) * which is why we always call ntq_get_src() even if we are only interested in * register write counts. 
*/ -static void +static struct qinst * vir_image_emit_register_writes(struct v3d_compile *c, nir_intrinsic_instr *instr, bool atomic_add_replaced, @@ -480,7 +504,8 @@ vir_image_emit_register_writes(struct v3d_compile *c, } /* Second atomic argument */ - if (instr->intrinsic == nir_intrinsic_image_atomic_comp_swap) { + if (instr->intrinsic == nir_intrinsic_image_atomic_swap && + nir_intrinsic_atomic_op(instr) == nir_atomic_op_cmpxchg) { struct qreg src_4_0 = ntq_get_src(c, instr->src[4], 0); vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUD, src_4_0, tmu_writes); @@ -494,7 +519,8 @@ vir_image_emit_register_writes(struct v3d_compile *c, V3D_QPU_PF_PUSHZ); } - vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUSF, src_1_0, tmu_writes); + struct qinst *retiring = + vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUSF, src_1_0, tmu_writes); if (!tmu_writes && vir_in_nonuniform_control_flow(c) && instr->intrinsic != nir_intrinsic_image_load) { @@ -502,6 +528,8 @@ vir_image_emit_register_writes(struct v3d_compile *c, (struct qinst *)c->cur_block->instructions.prev; vir_set_cond(last_inst, V3D_QPU_COND_IFA); } + + return retiring; } static unsigned @@ -516,21 +544,21 @@ get_required_image_tmu_writes(struct v3d_compile *c, } void -v3d40_vir_emit_image_load_store(struct v3d_compile *c, - nir_intrinsic_instr *instr) +v3d_vir_emit_image_load_store(struct v3d_compile *c, + nir_intrinsic_instr *instr) { unsigned format = nir_intrinsic_format(instr); unsigned unit = nir_src_as_uint(instr->src[0]); - struct V3D41_TMU_CONFIG_PARAMETER_0 p0_unpacked = { + struct V3D42_TMU_CONFIG_PARAMETER_0 p0_unpacked = { }; - struct V3D41_TMU_CONFIG_PARAMETER_1 p1_unpacked = { + struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked = { .per_pixel_mask_enable = true, .output_type_32_bit = v3d_gl_format_is_return_32(format), }; - struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked = { 0 }; + struct V3D42_TMU_CONFIG_PARAMETER_2 p2_unpacked = { 0 }; /* Limit the number of channels returned to both how many the NIR * instruction writes and how many the instruction could produce. @@ -542,19 +570,20 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c, p0_unpacked.return_words_of_texture_data = (1 << instr_return_channels) - 1; - p2_unpacked.op = v3d40_image_load_store_tmu_op(instr); + p2_unpacked.op = v3d_image_load_store_tmu_op(instr); /* If we were able to replace atomic_add for an inc/dec, then we * need/can to do things slightly different, like not loading the * amount to add/sub, as that is implicit. 
*/ bool atomic_add_replaced = - (instr->intrinsic == nir_intrinsic_image_atomic_add && - (p2_unpacked.op == V3D_TMU_OP_WRITE_AND_READ_INC || - p2_unpacked.op == V3D_TMU_OP_WRITE_OR_READ_DEC)); + instr->intrinsic == nir_intrinsic_image_atomic && + nir_intrinsic_atomic_op(instr) == nir_atomic_op_iadd && + (p2_unpacked.op == V3D_TMU_OP_WRITE_AND_READ_INC || + p2_unpacked.op == V3D_TMU_OP_WRITE_OR_READ_DEC); uint32_t p0_packed; - V3D41_TMU_CONFIG_PARAMETER_0_pack(NULL, + V3D42_TMU_CONFIG_PARAMETER_0_pack(NULL, (uint8_t *)&p0_packed, &p0_unpacked); @@ -565,12 +594,12 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c, p0_packed |= unit << 24; uint32_t p1_packed; - V3D41_TMU_CONFIG_PARAMETER_1_pack(NULL, + V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL, (uint8_t *)&p1_packed, &p1_unpacked); uint32_t p2_packed; - V3D41_TMU_CONFIG_PARAMETER_2_pack(NULL, + V3D42_TMU_CONFIG_PARAMETER_2_pack(NULL, (uint8_t *)&p2_packed, &p2_unpacked); @@ -599,8 +628,9 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c, if (memcmp(&p2_unpacked, &p2_unpacked_default, sizeof(p2_unpacked))) vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed); - vir_image_emit_register_writes(c, instr, atomic_add_replaced, NULL); - - ntq_add_pending_tmu_flush(c, &instr->dest, + struct qinst *retiring = + vir_image_emit_register_writes(c, instr, atomic_add_replaced, NULL); + retiring->ldtmu_count = p0_unpacked.return_words_of_texture_data; + ntq_add_pending_tmu_flush(c, &instr->def, p0_unpacked.return_words_of_texture_data); } diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c index 27869a35a3b..c59a8aac434 100644 --- a/src/broadcom/compiler/vir.c +++ b/src/broadcom/compiler/vir.c @@ -23,7 +23,6 @@ #include "broadcom/common/v3d_device_info.h" #include "v3d_compiler.h" -#include "util/u_prim.h" #include "compiler/nir/nir_schedule.h" #include "compiler/nir/nir_builder.h" @@ -89,7 +88,7 @@ vir_has_side_effects(struct v3d_compile *c, struct qinst *inst) * pointer, so each read has a side effect (we don't care for ldunif * because we reconstruct the uniform stream buffer after compiling * with the surviving uniforms), so allowing DCE to remove - * one would break follow-up loads. We could fix this by emiting a + * one would break follow-up loads. We could fix this by emitting a * unifa for each ldunifa, but each unifa requires 3 delay slots * before a ldunifa, so that would be quite expensive. 
*/ @@ -113,10 +112,10 @@ vir_is_raw_mov(struct qinst *inst) return false; } - if (inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE || - inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE || - inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE || - inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) { + if (inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE || + inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE || + inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE || + inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) { return false; } @@ -156,30 +155,12 @@ vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst) } bool -vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst) +vir_writes_r4_implicitly(const struct v3d_device_info *devinfo, + struct qinst *inst) { - for (int i = 0; i < vir_get_nsrc(inst); i++) { - switch (inst->src[i].file) { - case QFILE_VPM: - return true; - default: - break; - } - } - - if (devinfo->ver < 41 && (inst->qpu.sig.ldvary || - inst->qpu.sig.ldtlb || - inst->qpu.sig.ldtlbu || - inst->qpu.sig.ldvpm)) { - return true; - } - - return false; -} + if (!devinfo->has_accumulators) + return false; -bool -vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst) -{ switch (inst->dst.file) { case QFILE_MAGIC: switch (inst->dst.index) { @@ -195,9 +176,6 @@ vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst) break; } - if (devinfo->ver < 41 && inst->qpu.sig.ldtmu) - return true; - return false; } @@ -209,15 +187,15 @@ vir_set_unpack(struct qinst *inst, int src, if (vir_is_add(inst)) { if (src == 0) - inst->qpu.alu.add.a_unpack = unpack; + inst->qpu.alu.add.a.unpack = unpack; else - inst->qpu.alu.add.b_unpack = unpack; + inst->qpu.alu.add.b.unpack = unpack; } else { assert(vir_is_mul(inst)); if (src == 0) - inst->qpu.alu.mul.a_unpack = unpack; + inst->qpu.alu.mul.a.unpack = unpack; else - inst->qpu.alu.mul.b_unpack = unpack; + inst->qpu.alu.mul.b.unpack = unpack; } } @@ -369,6 +347,8 @@ vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst, struct qreg src0, struct q inst->src[1] = src1; inst->uniform = ~0; + inst->ip = -1; + return inst; } @@ -385,6 +365,8 @@ vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct q inst->src[1] = src1; inst->uniform = ~0; + inst->ip = -1; + return inst; } @@ -404,12 +386,16 @@ vir_branch_inst(struct v3d_compile *c, enum v3d_qpu_branch_cond cond) inst->dst = vir_nop_reg(); inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, 0); + inst->ip = -1; + return inst; } static void vir_emit(struct v3d_compile *c, struct qinst *inst) { + inst->ip = -1; + switch (c->cursor.mode) { case vir_cursor_add: list_add(&inst->link, c->cursor.link); @@ -509,13 +495,15 @@ vir_link_blocks(struct qblock *predecessor, struct qblock *successor) } const struct v3d_compiler * -v3d_compiler_init(const struct v3d_device_info *devinfo) +v3d_compiler_init(const struct v3d_device_info *devinfo, + uint32_t max_inline_uniform_buffers) { struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler); if (!compiler) return NULL; compiler->devinfo = devinfo; + compiler->max_inline_uniform_buffers = max_inline_uniform_buffers; if (!vir_init_reg_sets(compiler)) { ralloc_free(compiler); @@ -531,6 +519,19 @@ v3d_compiler_free(const struct v3d_compiler *compiler) ralloc_free((void *)compiler); } +struct v3d_compiler_strategy { + const char *name; + uint32_t max_threads; + uint32_t min_threads; + bool disable_general_tmu_sched; + bool disable_gcm; + bool disable_loop_unrolling; + bool 
disable_ubo_load_sorting; + bool move_buffer_loads; + bool disable_tmu_pipelining; + uint32_t max_tmu_spills; +}; + static struct v3d_compile * vir_compile_init(const struct v3d_compiler *compiler, struct v3d_key *key, @@ -539,12 +540,8 @@ vir_compile_init(const struct v3d_compiler *compiler, void *debug_output_data), void *debug_output_data, int program_id, int variant_id, - uint32_t max_threads, - uint32_t min_threads_for_reg_alloc, - bool tmu_spilling_allowed, - bool disable_loop_unrolling, - bool disable_constant_ubo_load_sorting, - bool disable_tmu_pipelining, + uint32_t compile_strategy_idx, + const struct v3d_compiler_strategy *strategy, bool fallback_scheduler) { struct v3d_compile *c = rzalloc(NULL, struct v3d_compile); @@ -554,17 +551,22 @@ vir_compile_init(const struct v3d_compiler *compiler, c->key = key; c->program_id = program_id; c->variant_id = variant_id; - c->threads = max_threads; + c->compile_strategy_idx = compile_strategy_idx; + c->threads = strategy->max_threads; c->debug_output = debug_output; c->debug_output_data = debug_output_data; c->compilation_result = V3D_COMPILATION_SUCCEEDED; - c->min_threads_for_reg_alloc = min_threads_for_reg_alloc; - c->tmu_spilling_allowed = tmu_spilling_allowed; + c->min_threads_for_reg_alloc = strategy->min_threads; + c->max_tmu_spills = strategy->max_tmu_spills; c->fallback_scheduler = fallback_scheduler; - c->disable_tmu_pipelining = disable_tmu_pipelining; - c->disable_constant_ubo_load_sorting = disable_constant_ubo_load_sorting; - c->disable_loop_unrolling = V3D_DEBUG & V3D_DEBUG_NO_LOOP_UNROLL - ? true : disable_loop_unrolling; + c->disable_general_tmu_sched = strategy->disable_general_tmu_sched; + c->disable_tmu_pipelining = strategy->disable_tmu_pipelining; + c->disable_constant_ubo_load_sorting = strategy->disable_ubo_load_sorting; + c->move_buffer_loads = strategy->move_buffer_loads; + c->disable_gcm = strategy->disable_gcm; + c->disable_loop_unrolling = V3D_DBG(NO_LOOP_UNROLL) + ? true : strategy->disable_loop_unrolling; + s = nir_shader_clone(c, s); c->s = s; @@ -590,17 +592,107 @@ type_size_vec4(const struct glsl_type *type, bool bindless) return glsl_count_attribute_slots(type, false); } +static enum nir_lower_tex_packing +lower_tex_packing_cb(const nir_tex_instr *tex, const void *data) +{ + struct v3d_compile *c = (struct v3d_compile *) data; + + int sampler_index = nir_tex_instr_need_sampler(tex) ? + tex->sampler_index : tex->backend_flags; + + assert(sampler_index < c->key->num_samplers_used); + return c->key->sampler[sampler_index].return_size == 16 ? 
+ nir_lower_tex_packing_16 : nir_lower_tex_packing_none; +} + +static bool +v3d_nir_lower_null_pointers_cb(nir_builder *b, + nir_intrinsic_instr *intr, + void *_state) +{ + uint32_t buffer_src_idx; + + switch (intr->intrinsic) { + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_ssbo: + buffer_src_idx = 0; + break; + case nir_intrinsic_store_ssbo: + buffer_src_idx = 1; + break; + default: + return false; + } + + /* If index is constant we are good */ + nir_src *src = &intr->src[buffer_src_idx]; + if (nir_src_is_const(*src)) + return false; + + /* Otherwise, see if it comes from a bcsel including a null pointer */ + if (src->ssa->parent_instr->type != nir_instr_type_alu) + return false; + + nir_alu_instr *alu = nir_instr_as_alu(src->ssa->parent_instr); + if (alu->op != nir_op_bcsel) + return false; + + /* A null pointer is specified using block index 0xffffffff */ + int32_t null_src_idx = -1; + for (int i = 1; i < 3; i++) { + /* FIXME: since we are running this before optimization maybe + * we need to also handle the case where we may have bcsel + * chain that we need to recurse? + */ + if (!nir_src_is_const(alu->src[i].src)) + continue; + if (nir_src_comp_as_uint(alu->src[i].src, 0) != 0xffffffff) + continue; + + /* One of the bcsel srcs is a null pointer reference */ + null_src_idx = i; + break; + } + + if (null_src_idx < 0) + return false; + + assert(null_src_idx == 1 || null_src_idx == 2); + int32_t copy_src_idx = null_src_idx == 1 ? 2 : 1; + + /* Rewrite the null pointer reference so we use the same buffer index + * as the other bcsel branch. This will allow optimization to remove + * the bcsel and we should then end up with a constant buffer index + * like we need. + */ + b->cursor = nir_before_instr(&alu->instr); + nir_def *copy = nir_mov(b, alu->src[copy_src_idx].src.ssa); + nir_src_rewrite(&alu->src[null_src_idx].src, copy); + + return true; +} + +static bool +v3d_nir_lower_null_pointers(nir_shader *s) +{ + return nir_shader_intrinsics_pass(s, v3d_nir_lower_null_pointers_cb, + nir_metadata_block_index | + nir_metadata_dominance, NULL); +} + static void v3d_lower_nir(struct v3d_compile *c) { struct nir_lower_tex_options tex_options = { .lower_txd = true, + .lower_tg4_offsets = true, .lower_tg4_broadcom_swizzle = true, .lower_rect = false, /* XXX: Use this on V3D 3.x */ .lower_txp = ~0, /* Apply swizzles to all samplers. */ .swizzle_result = ~0, + .lower_invalid_implicit_lod = true, }; /* Lower the format swizzle and (for 32-bit returns) @@ -612,38 +704,35 @@ v3d_lower_nir(struct v3d_compile *c) tex_options.swizzles[i][j] = c->key->tex[i].swizzle[j]; } - assert(c->key->num_samplers_used <= ARRAY_SIZE(c->key->sampler)); - for (int i = 0; i < c->key->num_samplers_used; i++) { - if (c->key->sampler[i].return_size == 16) { - tex_options.lower_tex_packing[i] = - nir_lower_tex_packing_16; - } - } - - /* CS textures may not have return_size reflecting the shadow state. 
*/ - nir_foreach_uniform_variable(var, c->s) { - const struct glsl_type *type = glsl_without_array(var->type); - unsigned array_len = MAX2(glsl_get_length(var->type), 1); + tex_options.lower_tex_packing_cb = lower_tex_packing_cb; + tex_options.lower_tex_packing_data = c; - if (!glsl_type_is_sampler(type) || - !glsl_sampler_type_is_shadow(type)) - continue; + NIR_PASS(_, c->s, nir_lower_tex, &tex_options); + NIR_PASS(_, c->s, nir_lower_system_values); - for (int i = 0; i < array_len; i++) { - tex_options.lower_tex_packing[var->data.binding + i] = - nir_lower_tex_packing_16; - } + if (c->s->info.zero_initialize_shared_memory && + c->s->info.shared_size > 0) { + /* All our BOs allocate full pages, so the underlying allocation + * for shared memory will always be a multiple of 4KB. This + * ensures that we can do an exact number of full chunk_size + * writes to initialize the memory independently of the actual + * shared_size used by the shader, which is a requirement of + * the initialization pass. + */ + const unsigned chunk_size = 16; /* max single store size */ + NIR_PASS(_, c->s, nir_zero_initialize_shared_memory, + align(c->s->info.shared_size, chunk_size), chunk_size); } - NIR_PASS_V(c->s, nir_lower_tex, &tex_options); - NIR_PASS_V(c->s, nir_lower_system_values); - NIR_PASS_V(c->s, nir_lower_compute_system_values, NULL); + NIR_PASS(_, c->s, nir_lower_compute_system_values, NULL); - NIR_PASS_V(c->s, nir_lower_vars_to_scratch, - nir_var_function_temp, - 0, - glsl_get_natural_size_align_bytes); - NIR_PASS_V(c->s, v3d_nir_lower_scratch); + NIR_PASS(_, c->s, nir_lower_vars_to_scratch, + nir_var_function_temp, + 0, + glsl_get_natural_size_align_bytes); + NIR_PASS(_, c->s, nir_lower_is_helper_invocation); + NIR_PASS(_, c->s, v3d_nir_lower_scratch); + NIR_PASS(_, c->s, v3d_nir_lower_null_pointers); } static void @@ -711,6 +800,10 @@ v3d_vs_set_prog_data(struct v3d_compile *c, /* Set us up for shared input/output segments. This is apparently * necessary for our VCM setup to avoid varying corruption. + * + * FIXME: initial testing on V3D 7.1 seems to work fine when using + * separate segments. So we could try to reevaluate in the future, if + * there is any advantage of using separate segments. 
*/ prog_data->separate_segments = false; prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size, @@ -807,13 +900,14 @@ v3d_fs_set_prog_data(struct v3d_compile *c, { v3d_set_fs_prog_data_inputs(c, prog_data); prog_data->writes_z = c->writes_z; + prog_data->writes_z_from_fep = c->writes_z_from_fep; prog_data->disable_ez = !c->s->info.fs.early_fragment_tests; prog_data->uses_center_w = c->uses_center_w; prog_data->uses_implicit_point_line_varyings = c->uses_implicit_point_line_varyings; prog_data->lock_scoreboard_on_first_thrsw = c->lock_scoreboard_on_first_thrsw; - prog_data->force_per_sample_msaa = c->force_per_sample_msaa; + prog_data->force_per_sample_msaa = c->s->info.fs.uses_sample_shading; prog_data->uses_pid = c->fs_uses_primitive_id; } @@ -837,8 +931,14 @@ v3d_set_prog_data(struct v3d_compile *c, prog_data->threads = c->threads; prog_data->single_seg = !c->last_thrsw; prog_data->spill_size = c->spill_size; + prog_data->tmu_spills = c->spills; + prog_data->tmu_fills = c->fills; + prog_data->tmu_count = c->tmu.total_count; + prog_data->qpu_read_stalls = c->qpu_inst_stalled_count; + prog_data->compile_strategy_idx = c->compile_strategy_idx; prog_data->tmu_dirty_rcl = c->tmu_dirty_rcl; prog_data->has_control_barrier = c->s->info.uses_control_barrier; + prog_data->has_global_address = c->has_global_address; v3d_set_prog_data_uniforms(c, prog_data); @@ -882,32 +982,32 @@ v3d_nir_lower_vs_early(struct v3d_compile *c) /* Split our I/O vars and dead code eliminate the unused * components. */ - NIR_PASS_V(c->s, nir_lower_io_to_scalar_early, - nir_var_shader_in | nir_var_shader_out); + NIR_PASS(_, c->s, nir_lower_io_to_scalar_early, + nir_var_shader_in | nir_var_shader_out); uint64_t used_outputs[4] = {0}; for (int i = 0; i < c->vs_key->num_used_outputs; i++) { int slot = v3d_slot_get_slot(c->vs_key->used_outputs[i]); int comp = v3d_slot_get_component(c->vs_key->used_outputs[i]); used_outputs[comp] |= 1ull << slot; } - NIR_PASS_V(c->s, nir_remove_unused_io_vars, - nir_var_shader_out, used_outputs, NULL); /* demotes to globals */ - NIR_PASS_V(c->s, nir_lower_global_vars_to_local); + NIR_PASS(_, c->s, nir_remove_unused_io_vars, + nir_var_shader_out, used_outputs, NULL); /* demotes to globals */ + NIR_PASS(_, c->s, nir_lower_global_vars_to_local); v3d_optimize_nir(c, c->s); - NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in, NULL); + NIR_PASS(_, c->s, nir_remove_dead_variables, nir_var_shader_in, NULL); /* This must go before nir_lower_io */ if (c->vs_key->per_vertex_point_size) - NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f); + NIR_PASS(_, c->s, nir_lower_point_size, 1.0f, 0.0f); - NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out, - type_size_vec4, - (nir_lower_io_options)0); + NIR_PASS(_, c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out, + type_size_vec4, + (nir_lower_io_options)0); /* clean up nir_lower_io's deref_var remains and do a constant folding pass * on the code it generated. */ - NIR_PASS_V(c->s, nir_opt_dce); - NIR_PASS_V(c->s, nir_opt_constant_folding); + NIR_PASS(_, c->s, nir_opt_dce); + NIR_PASS(_, c->s, nir_opt_constant_folding); } static void @@ -916,29 +1016,32 @@ v3d_nir_lower_gs_early(struct v3d_compile *c) /* Split our I/O vars and dead code eliminate the unused * components. 
*/ - NIR_PASS_V(c->s, nir_lower_io_to_scalar_early, - nir_var_shader_in | nir_var_shader_out); + NIR_PASS(_, c->s, nir_lower_io_to_scalar_early, + nir_var_shader_in | nir_var_shader_out); uint64_t used_outputs[4] = {0}; for (int i = 0; i < c->gs_key->num_used_outputs; i++) { int slot = v3d_slot_get_slot(c->gs_key->used_outputs[i]); int comp = v3d_slot_get_component(c->gs_key->used_outputs[i]); used_outputs[comp] |= 1ull << slot; } - NIR_PASS_V(c->s, nir_remove_unused_io_vars, - nir_var_shader_out, used_outputs, NULL); /* demotes to globals */ - NIR_PASS_V(c->s, nir_lower_global_vars_to_local); + NIR_PASS(_, c->s, nir_remove_unused_io_vars, + nir_var_shader_out, used_outputs, NULL); /* demotes to globals */ + NIR_PASS(_, c->s, nir_lower_global_vars_to_local); v3d_optimize_nir(c, c->s); - NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in, NULL); + NIR_PASS(_, c->s, nir_remove_dead_variables, nir_var_shader_in, NULL); /* This must go before nir_lower_io */ if (c->gs_key->per_vertex_point_size) - NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f); + NIR_PASS(_, c->s, nir_lower_point_size, 1.0f, 0.0f); - NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out, - type_size_vec4, - (nir_lower_io_options)0); - /* clean up nir_lower_io's deref_var remains */ - NIR_PASS_V(c->s, nir_opt_dce); + NIR_PASS(_, c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out, + type_size_vec4, + (nir_lower_io_options)0); + /* clean up nir_lower_io's deref_var remains and do a constant folding pass + * on the code it generated. + */ + NIR_PASS(_, c->s, nir_opt_dce); + NIR_PASS(_, c->s, nir_opt_constant_folding); } static void @@ -977,11 +1080,11 @@ v3d_nir_lower_fs_early(struct v3d_compile *c) if (c->fs_key->int_color_rb || c->fs_key->uint_color_rb) v3d_fixup_fs_output_types(c); - NIR_PASS_V(c->s, v3d_nir_lower_logic_ops, c); + NIR_PASS(_, c->s, v3d_nir_lower_logic_ops, c); if (c->fs_key->line_smoothing) { - v3d_nir_lower_line_smooth(c->s); - NIR_PASS_V(c->s, nir_lower_global_vars_to_local); + NIR_PASS(_, c->s, v3d_nir_lower_line_smooth); + NIR_PASS(_, c->s, nir_lower_global_vars_to_local); /* The lowering pass can introduce new sysval reads */ nir_shader_gather_info(c->s, nir_shader_get_entrypoint(c->s)); } @@ -991,26 +1094,26 @@ static void v3d_nir_lower_gs_late(struct v3d_compile *c) { if (c->key->ucp_enables) { - NIR_PASS_V(c->s, nir_lower_clip_gs, c->key->ucp_enables, - false, NULL); + NIR_PASS(_, c->s, nir_lower_clip_gs, c->key->ucp_enables, + true, NULL); } /* Note: GS output scalarizing must happen after nir_lower_clip_gs. */ - NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out); + NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL); } static void v3d_nir_lower_vs_late(struct v3d_compile *c) { if (c->key->ucp_enables) { - NIR_PASS_V(c->s, nir_lower_clip_vs, c->key->ucp_enables, - false, false, NULL); + NIR_PASS(_, c->s, nir_lower_clip_vs, c->key->ucp_enables, + false, true, NULL); NIR_PASS_V(c->s, nir_lower_io_to_scalar, - nir_var_shader_out); + nir_var_shader_out, NULL, NULL); } /* Note: VS output scalarizing must happen after nir_lower_clip_vs. */ - NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out); + NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL); } static void @@ -1024,9 +1127,9 @@ v3d_nir_lower_fs_late(struct v3d_compile *c) * are using. 
*/ if (c->key->ucp_enables) - NIR_PASS_V(c->s, nir_lower_clip_fs, c->key->ucp_enables, true); + NIR_PASS(_, c->s, nir_lower_clip_fs, c->key->ucp_enables, true); - NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in); + NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in, NULL, NULL); } static uint32_t @@ -1107,6 +1210,69 @@ v3d_intrinsic_dependency_cb(nir_intrinsic_instr *intr, return false; } +static unsigned +v3d_instr_delay_cb(nir_instr *instr, void *data) +{ + struct v3d_compile *c = (struct v3d_compile *) data; + + switch (instr->type) { + case nir_instr_type_undef: + case nir_instr_type_load_const: + case nir_instr_type_alu: + case nir_instr_type_deref: + case nir_instr_type_jump: + case nir_instr_type_parallel_copy: + case nir_instr_type_call: + case nir_instr_type_phi: + return 1; + + /* We should not use very large delays for TMU instructions. Typically, + * thread switches will be sufficient to hide all or most of the latency, + * so we typically only need a little bit of extra room. If we over-estimate + * the latency here we may end up unnecessarily delaying the critical path in + * the shader, which would have a negative effect in performance, so here + * we are trying to strike a balance based on empirical testing. + */ + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (!c->disable_general_tmu_sched) { + switch (intr->intrinsic) { + case nir_intrinsic_decl_reg: + case nir_intrinsic_load_reg: + case nir_intrinsic_store_reg: + return 0; + case nir_intrinsic_load_ssbo: + case nir_intrinsic_load_scratch: + case nir_intrinsic_load_shared: + case nir_intrinsic_image_load: + return 3; + case nir_intrinsic_load_ubo: + if (nir_src_is_divergent(intr->src[1])) + return 3; + FALLTHROUGH; + default: + return 1; + } + } else { + switch (intr->intrinsic) { + case nir_intrinsic_decl_reg: + case nir_intrinsic_load_reg: + case nir_intrinsic_store_reg: + return 0; + default: + return 1; + } + } + break; + } + + case nir_instr_type_tex: + return 5; + } + + return 0; +} + static bool should_split_wrmask(const nir_instr *instr, const void *data) { @@ -1197,7 +1363,7 @@ v3d_nir_sort_constant_ubo_load(nir_block *block, nir_intrinsic_instr *ref) * reference offset, since otherwise we would not be able to * skip the unifa write for them. See ntq_emit_load_ubo_unifa. 
*/ - if (abs(ref_offset - offset) > MAX_UNIFA_SKIP_DISTANCE) + if (abs((int)(ref_offset - offset)) > MAX_UNIFA_SKIP_DISTANCE) continue; /* We will move this load if its offset is smaller than ref's @@ -1349,16 +1515,14 @@ v3d_nir_sort_constant_ubo_loads_block(struct v3d_compile *c, static bool v3d_nir_sort_constant_ubo_loads(nir_shader *s, struct v3d_compile *c) { - nir_foreach_function(function, s) { - if (function->impl) { - nir_foreach_block(block, function->impl) { - c->sorted_any_ubo_loads |= - v3d_nir_sort_constant_ubo_loads_block(c, block); - } - nir_metadata_preserve(function->impl, - nir_metadata_block_index | - nir_metadata_dominance); + nir_foreach_function_impl(impl, s) { + nir_foreach_block(block, impl) { + c->sorted_any_ubo_loads |= + v3d_nir_sort_constant_ubo_loads_block(c, block); } + nir_metadata_preserve(impl, + nir_metadata_block_index | + nir_metadata_dominance); } return c->sorted_any_ubo_loads; } @@ -1376,8 +1540,8 @@ lower_load_num_subgroups(struct v3d_compile *c, DIV_ROUND_UP(c->s->info.workgroup_size[0] * c->s->info.workgroup_size[1] * c->s->info.workgroup_size[2], V3D_CHANNELS); - nir_ssa_def *result = nir_imm_int(b, num_subgroups); - nir_ssa_def_rewrite_uses(&intr->dest.ssa, result); + nir_def *result = nir_imm_int(b, num_subgroups); + nir_def_rewrite_uses(&intr->def, result); nir_instr_remove(&intr->instr); } @@ -1404,6 +1568,36 @@ lower_subgroup_intrinsics(struct v3d_compile *c, case nir_intrinsic_load_subgroup_size: case nir_intrinsic_load_subgroup_invocation: case nir_intrinsic_elect: + case nir_intrinsic_ballot: + case nir_intrinsic_inverse_ballot: + case nir_intrinsic_ballot_bitfield_extract: + case nir_intrinsic_ballot_bit_count_reduce: + case nir_intrinsic_ballot_find_lsb: + case nir_intrinsic_ballot_find_msb: + case nir_intrinsic_ballot_bit_count_exclusive: + case nir_intrinsic_ballot_bit_count_inclusive: + case nir_intrinsic_reduce: + case nir_intrinsic_inclusive_scan: + case nir_intrinsic_exclusive_scan: + case nir_intrinsic_read_invocation: + case nir_intrinsic_read_first_invocation: + case nir_intrinsic_load_subgroup_eq_mask: + case nir_intrinsic_load_subgroup_ge_mask: + case nir_intrinsic_load_subgroup_gt_mask: + case nir_intrinsic_load_subgroup_le_mask: + case nir_intrinsic_load_subgroup_lt_mask: + case nir_intrinsic_shuffle: + case nir_intrinsic_shuffle_xor: + case nir_intrinsic_shuffle_up: + case nir_intrinsic_shuffle_down: + case nir_intrinsic_vote_all: + case nir_intrinsic_vote_any: + case nir_intrinsic_vote_feq: + case nir_intrinsic_vote_ieq: + case nir_intrinsic_quad_broadcast: + case nir_intrinsic_quad_swap_horizontal: + case nir_intrinsic_quad_swap_vertical: + case nir_intrinsic_quad_swap_diagonal: c->has_subgroups = true; break; default: @@ -1418,18 +1612,15 @@ static bool v3d_nir_lower_subgroup_intrinsics(nir_shader *s, struct v3d_compile *c) { bool progress = false; - nir_foreach_function(function, s) { - if (function->impl) { - nir_builder b; - nir_builder_init(&b, function->impl); + nir_foreach_function_impl(impl, s) { + nir_builder b = nir_builder_create(impl); - nir_foreach_block(block, function->impl) - progress |= lower_subgroup_intrinsics(c, block, &b); + nir_foreach_block(block, impl) + progress |= lower_subgroup_intrinsics(c, block, &b); - nir_metadata_preserve(function->impl, - nir_metadata_block_index | - nir_metadata_dominance); - } + nir_metadata_preserve(impl, + nir_metadata_block_index | + nir_metadata_dominance); } return progress; } @@ -1483,30 +1674,54 @@ v3d_attempt_compile(struct v3d_compile *c) break; } - 
NIR_PASS_V(c->s, v3d_nir_lower_io, c); - NIR_PASS_V(c->s, v3d_nir_lower_txf_ms, c); - NIR_PASS_V(c->s, v3d_nir_lower_image_load_store); + NIR_PASS(_, c->s, v3d_nir_lower_io, c); + NIR_PASS(_, c->s, v3d_nir_lower_txf_ms); + NIR_PASS(_, c->s, v3d_nir_lower_image_load_store, c); + + NIR_PASS(_, c->s, nir_opt_idiv_const, 8); nir_lower_idiv_options idiv_options = { - .imprecise_32bit_lowering = true, .allow_fp16 = true, }; - NIR_PASS_V(c->s, nir_lower_idiv, &idiv_options); - - if (c->key->robust_buffer_access) { - /* v3d_nir_lower_robust_buffer_access assumes constant buffer - * indices on ubo/ssbo intrinsics so run copy propagation and - * constant folding passes before we run the lowering to warrant - * this. We also want to run the lowering before v3d_optimize to - * clean-up redundant get_buffer_size calls produced in the pass. - */ - NIR_PASS_V(c->s, nir_copy_prop); - NIR_PASS_V(c->s, nir_opt_constant_folding); - NIR_PASS_V(c->s, v3d_nir_lower_robust_buffer_access, c); + NIR_PASS(_, c->s, nir_lower_idiv, &idiv_options); + NIR_PASS(_, c->s, nir_lower_alu); + + if (c->key->robust_uniform_access || c->key->robust_storage_access || + c->key->robust_image_access) { + /* nir_lower_robust_access assumes constant buffer + * indices on ubo/ssbo intrinsics so run copy propagation and + * constant folding passes before we run the lowering to warrant + * this. We also want to run the lowering before v3d_optimize to + * clean-up redundant get_buffer_size calls produced in the pass. + */ + NIR_PASS(_, c->s, nir_copy_prop); + NIR_PASS(_, c->s, nir_opt_constant_folding); + + nir_lower_robust_access_options opts = { + .lower_image = c->key->robust_image_access, + .lower_ssbo = c->key->robust_storage_access, + .lower_ubo = c->key->robust_uniform_access, + }; + + NIR_PASS(_, c->s, nir_lower_robust_access, &opts); } - NIR_PASS_V(c->s, nir_lower_wrmasks, should_split_wrmask, c->s); + NIR_PASS(_, c->s, nir_lower_wrmasks, should_split_wrmask, c->s); - NIR_PASS_V(c->s, v3d_nir_lower_subgroup_intrinsics, c); + NIR_PASS(_, c->s, v3d_nir_lower_load_store_bitsize); + + NIR_PASS(_, c->s, v3d_nir_lower_subgroup_intrinsics, c); + + const nir_lower_subgroups_options subgroup_opts = { + .subgroup_size = V3D_CHANNELS, + .ballot_components = 1, + .ballot_bit_size = 32, + .lower_to_scalar = true, + .lower_inverse_ballot = true, + .lower_subgroup_masks = true, + .lower_relative_shuffle = true, + .lower_quad = true, + }; + NIR_PASS(_, c->s, nir_lower_subgroups, &subgroup_opts); v3d_optimize_nir(c, c->s); @@ -1519,25 +1734,25 @@ v3d_attempt_compile(struct v3d_compile *c) while (more_late_algebraic) { more_late_algebraic = false; NIR_PASS(more_late_algebraic, c->s, nir_opt_algebraic_late); - NIR_PASS_V(c->s, nir_opt_constant_folding); - NIR_PASS_V(c->s, nir_copy_prop); - NIR_PASS_V(c->s, nir_opt_dce); - NIR_PASS_V(c->s, nir_opt_cse); + NIR_PASS(_, c->s, nir_opt_constant_folding); + NIR_PASS(_, c->s, nir_copy_prop); + NIR_PASS(_, c->s, nir_opt_dce); + NIR_PASS(_, c->s, nir_opt_cse); } - NIR_PASS_V(c->s, nir_lower_bool_to_int32); - nir_convert_to_lcssa(c->s, true, true); + NIR_PASS(_, c->s, nir_lower_bool_to_int32); + NIR_PASS(_, c->s, nir_convert_to_lcssa, true, true); NIR_PASS_V(c->s, nir_divergence_analysis); - NIR_PASS_V(c->s, nir_convert_from_ssa, true); + NIR_PASS(_, c->s, nir_convert_from_ssa, true); struct nir_schedule_options schedule_options = { /* Schedule for about half our register space, to enable more * shaders to hit 4 threads. */ - .threshold = 24, + .threshold = c->threads == 4 ? 
24 : 48, /* Vertex shaders share the same memory for inputs and outputs, - * fragement and geometry shaders do not. + * fragment and geometry shaders do not. */ .stages_with_shared_io_memory = (((1 << MESA_ALL_SHADER_STAGES) - 1) & @@ -1548,11 +1763,22 @@ v3d_attempt_compile(struct v3d_compile *c) .intrinsic_cb = v3d_intrinsic_dependency_cb, .intrinsic_cb_data = c, + + .instr_delay_cb = v3d_instr_delay_cb, + .instr_delay_cb_data = c, }; NIR_PASS_V(c->s, nir_schedule, &schedule_options); if (!c->disable_constant_ubo_load_sorting) - NIR_PASS_V(c->s, v3d_nir_sort_constant_ubo_loads, c); + NIR_PASS(_, c->s, v3d_nir_sort_constant_ubo_loads, c); + + const nir_move_options buffer_opts = c->move_buffer_loads ? + (nir_move_load_ubo | nir_move_load_ssbo) : 0; + NIR_PASS(_, c->s, nir_opt_move, nir_move_load_uniform | + nir_move_const_undef | + buffer_opts); + + NIR_PASS_V(c->s, nir_trivialize_registers); v3d_nir_to_vir(c); } @@ -1611,32 +1837,28 @@ int v3d_shaderdb_dump(struct v3d_compile *c, * register allocation to any particular thread count). This is fine * because v3d_nir_to_vir will cap this to the actual minimum. */ -struct v3d_compiler_strategy { - const char *name; - uint32_t max_threads; - uint32_t min_threads; - bool disable_loop_unrolling; - bool disable_ubo_load_sorting; - bool disable_tmu_pipelining; - bool tmu_spilling_allowed; -} static const strategies[] = { - /*0*/ { "default", 4, 4, false, false, false, false }, - /*1*/ { "disable loop unrolling", 4, 4, true, false, false, false }, - /*2*/ { "disable UBO load sorting", 4, 4, true, true, false, false }, - /*3*/ { "disable TMU pipelining", 4, 4, true, true, true, false }, - /*4*/ { "lower thread count", 2, 1, false, false, false, false }, - /*5*/ { "disable loop unrolling (ltc)", 2, 1, true, false, false, false }, - /*6*/ { "disable UBO load sorting (ltc)", 2, 1, true, true, false, false }, - /*7*/ { "disable TMU pipelining (ltc)", 2, 1, true, true, true, true }, - /*8*/ { "fallback scheduler", 2, 1, true, true, true, true } +static const struct v3d_compiler_strategy strategies[] = { + /*0*/ { "default", 4, 4, false, false, false, false, false, false, 0 }, + /*1*/ { "disable general TMU sched", 4, 4, true, false, false, false, false, false, 0 }, + /*2*/ { "disable gcm", 4, 4, true, true, false, false, false, false, 0 }, + /*3*/ { "disable loop unrolling", 4, 4, true, true, true, false, false, false, 0 }, + /*4*/ { "disable UBO load sorting", 4, 4, true, true, true, true, false, false, 0 }, + /*5*/ { "disable TMU pipelining", 4, 4, true, true, true, true, false, true, 0 }, + /*6*/ { "lower thread count", 2, 1, false, false, false, false, false, false, -1 }, + /*7*/ { "disable general TMU sched (2t)", 2, 1, true, false, false, false, false, false, -1 }, + /*8*/ { "disable gcm (2t)", 2, 1, true, true, false, false, false, false, -1 }, + /*9*/ { "disable loop unrolling (2t)", 2, 1, true, true, true, false, false, false, -1 }, + /*10*/ { "Move buffer loads (2t)", 2, 1, true, true, true, true, true, false, -1 }, + /*11*/ { "disable TMU pipelining (2t)", 2, 1, true, true, true, true, true, true, -1 }, + /*12*/ { "fallback scheduler", 2, 1, true, true, true, true, true, true, -1 } }; /** * If a particular optimization didn't make any progress during a compile - * attempt disabling it alone won't allow us to compile the shader successfuly, + * attempt disabling it alone won't allow us to compile the shader successfully, * since we'll end up with the same code. Detect these scenarios so we can * avoid wasting time with useless compiles. 
We should also consider if the - * strategy changes other aspects of the compilation process though, like + * strategy changes other aspects of the compilation process though, like * spilling, and not skip it in that case. */ static bool @@ -1649,31 +1871,55 @@ skip_compile_strategy(struct v3d_compile *c, uint32_t idx) assert(idx > 0); /* Don't skip a strategy that changes spilling behavior */ - if (strategies[idx].tmu_spilling_allowed != - strategies[idx - 1].tmu_spilling_allowed) { + if (strategies[idx].max_tmu_spills != + strategies[idx - 1].max_tmu_spills) { return false; } switch (idx) { - /* Loop unrolling: skip if we didn't unroll any loops */ + /* General TMU sched.: skip if we didn't emit any TMU loads */ case 1: - case 5: + case 7: + return !c->has_general_tmu_load; + /* Global code motion: skip if nir_opt_gcm didn't make any progress */ + case 2: + case 8: + return !c->gcm_progress; + /* Loop unrolling: skip if we didn't unroll any loops */ + case 3: + case 9: return !c->unrolled_any_loops; /* UBO load sorting: skip if we didn't sort any loads */ - case 2: - case 6: + case 4: return !c->sorted_any_ubo_loads; + /* Move buffer loads: we assume any shader with difficult RA + * most likely has UBO / SSBO loads so we never try to skip. + * For now, we only try this for 2-thread compiles since it + * is expected to impact instruction counts and latency. + */ + case 10: + assert(c->threads < 4); + return false; /* TMU pipelining: skip if we didn't pipeline any TMU ops */ - case 3: - case 7: + case 5: + case 11: return !c->pipelined_any_tmu; /* Lower thread count: skip if we already tried less that 4 threads */ - case 4: + case 6: return c->threads < 4; default: return false; }; } + +static inline void +set_best_compile(struct v3d_compile **best, struct v3d_compile *c) +{ + if (*best) + vir_compile_destroy(*best); + *best = c; +} + uint64_t *v3d_compile(const struct v3d_compiler *compiler, struct v3d_key *key, struct v3d_prog_data **out_prog_data, @@ -1685,58 +1931,106 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler, uint32_t *final_assembly_size) { struct v3d_compile *c = NULL; - for (int i = 0; i < ARRAY_SIZE(strategies); i++) { + + uint32_t best_spill_fill_count = UINT32_MAX; + struct v3d_compile *best_c = NULL; + for (int32_t strat = 0; strat < ARRAY_SIZE(strategies); strat++) { /* Fallback strategy */ - if (i > 0) { + if (strat > 0) { assert(c); - if (skip_compile_strategy(c, i)) + if (skip_compile_strategy(c, strat)) continue; char *debug_msg; int ret = asprintf(&debug_msg, - "Falling back to strategy '%s' for %s", - strategies[i].name, - vir_get_stage_name(c)); + "Falling back to strategy '%s' " + "for %s prog %d/%d", + strategies[strat].name, + vir_get_stage_name(c), + c->program_id, c->variant_id); if (ret >= 0) { - if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF)) + if (V3D_DBG(PERF)) fprintf(stderr, "%s\n", debug_msg); c->debug_output(debug_msg, c->debug_output_data); free(debug_msg); } - vir_compile_destroy(c); + if (c != best_c) + vir_compile_destroy(c); } c = vir_compile_init(compiler, key, s, debug_output, debug_output_data, program_id, variant_id, - strategies[i].max_threads, - strategies[i].min_threads, - strategies[i].tmu_spilling_allowed, - strategies[i].disable_loop_unrolling, - strategies[i].disable_ubo_load_sorting, - strategies[i].disable_tmu_pipelining, - i == ARRAY_SIZE(strategies) - 1); + strat, &strategies[strat], + strat == ARRAY_SIZE(strategies) - 1); v3d_attempt_compile(c); - if (i >= ARRAY_SIZE(strategies) - 1 || - c->compilation_result != - 
V3D_COMPILATION_FAILED_REGISTER_ALLOCATION) { + /* Broken shader or driver bug */ + if (c->compilation_result == V3D_COMPILATION_FAILED) break; + + /* If we compiled without spills, choose this. + * Otherwise if this is a 4-thread compile, choose this (these + * have a very low cap on the allowed TMU spills so we assume + * it will be better than a 2-thread compile without spills). + * Otherwise, keep going while tracking the strategy with the + * lowest spill count. + */ + if (c->compilation_result == V3D_COMPILATION_SUCCEEDED) { + if (c->spills == 0 || + strategies[strat].min_threads == 4 || + V3D_DBG(OPT_COMPILE_TIME)) { + set_best_compile(&best_c, c); + break; + } else if (c->spills + c->fills < + best_spill_fill_count) { + set_best_compile(&best_c, c); + best_spill_fill_count = c->spills + c->fills; + } + + if (V3D_DBG(PERF)) { + char *debug_msg; + int ret = asprintf(&debug_msg, + "Compiled %s prog %d/%d with %d " + "spills and %d fills. Will try " + "more strategies.", + vir_get_stage_name(c), + c->program_id, c->variant_id, + c->spills, c->fills); + if (ret >= 0) { + fprintf(stderr, "%s\n", debug_msg); + c->debug_output(debug_msg, c->debug_output_data); + free(debug_msg); + } + } } + + /* Only try next streategy if we failed to register allocate + * or we had to spill. + */ + assert(c->compilation_result == + V3D_COMPILATION_FAILED_REGISTER_ALLOCATION || + c->spills > 0); } - if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF) && + /* If the best strategy was not the last, choose that */ + if (best_c && c != best_c) + set_best_compile(&c, best_c); + + if (V3D_DBG(PERF) && c->compilation_result != V3D_COMPILATION_FAILED_REGISTER_ALLOCATION && c->spills > 0) { char *debug_msg; int ret = asprintf(&debug_msg, - "Compiled %s with %d spills and %d fills", + "Compiled %s prog %d/%d with %d " + "spills and %d fills", vir_get_stage_name(c), + c->program_id, c->variant_id, c->spills, c->fills); fprintf(stderr, "%s\n", debug_msg); @@ -1747,8 +2041,12 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler, } if (c->compilation_result != V3D_COMPILATION_SUCCEEDED) { - fprintf(stderr, "Failed to compile %s with any strategy.\n", - vir_get_stage_name(c)); + fprintf(stderr, "Failed to compile %s prog %d/%d " + "with any strategy.\n", + vir_get_stage_name(c), c->program_id, c->variant_id); + + vir_compile_destroy(c); + return NULL; } struct v3d_prog_data *prog_data; @@ -1762,8 +2060,8 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler, char *shaderdb; int ret = v3d_shaderdb_dump(c, &shaderdb); if (ret >= 0) { - if (V3D_DEBUG & V3D_DEBUG_SHADERDB) - fprintf(stderr, "SHADER-DB: %s\n", shaderdb); + if (V3D_DBG(SHADERDB)) + fprintf(stderr, "SHADER-DB-%s - %s\n", s->info.name, shaderdb); c->debug_output(shaderdb, c->debug_output_data); free(shaderdb); @@ -1872,8 +2170,11 @@ try_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif) struct qinst *prev_inst = NULL; assert(c->cur_block); -#ifdef DEBUG - /* Check if the current instruction is part of the current block */ +#if MESA_DEBUG + /* We can only reuse a uniform if it was emitted in the same block, + * so callers must make sure the current instruction is being emitted + * in the current block. 
+ */ bool found = false; vir_for_each_inst(inst, c->cur_block) { if (&inst->link == c->cursor.link) { @@ -1882,7 +2183,7 @@ try_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif) } } - assert(found || list_is_empty(&c->cur_block->instructions)); + assert(found || &c->cur_block->instructions == c->cursor.link); #endif list_for_each_entry_from_rev(struct qinst, inst, c->cursor.link->prev, @@ -1900,6 +2201,12 @@ try_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif) if (!prev_inst) return false; + /* Only reuse the ldunif result if it was written to a temp register, + * otherwise there may be special restrictions (for example, ldunif + * may write directly to unifa, which is a write-only register). + */ + if (prev_inst->dst.file != QFILE_TEMP) + return false; list_for_each_entry_from(struct qinst, inst, prev_inst->link.next, &c->cur_block->instructions, link) { diff --git a/src/broadcom/compiler/vir_dump.c b/src/broadcom/compiler/vir_dump.c index 5c47bbdc1b0..631eeee52ab 100644 --- a/src/broadcom/compiler/vir_dump.c +++ b/src/broadcom/compiler/vir_dump.c @@ -182,11 +182,6 @@ vir_print_reg(struct v3d_compile *c, const struct qinst *inst, break; } - case QFILE_VPM: - fprintf(stderr, "vpm%d.%d", - reg.index / 4, reg.index % 4); - break; - case QFILE_TEMP: fprintf(stderr, "t%d", reg.index); break; @@ -197,9 +192,6 @@ static void vir_dump_sig_addr(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *instr) { - if (devinfo->ver < 41) - return; - if (!instr->sig_magic) fprintf(stderr, ".rf%d", instr->sig_addr); else { @@ -270,8 +262,8 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst) vir_print_reg(c, inst, inst->dst); fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.add.output_pack)); - unpack[0] = instr->alu.add.a_unpack; - unpack[1] = instr->alu.add.b_unpack; + unpack[0] = instr->alu.add.a.unpack; + unpack[1] = instr->alu.add.b.unpack; } else { fprintf(stderr, "%s", v3d_qpu_mul_op_name(instr->alu.mul.op)); fprintf(stderr, "%s", v3d_qpu_cond_name(instr->flags.mc)); @@ -282,8 +274,8 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst) vir_print_reg(c, inst, inst->dst); fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.mul.output_pack)); - unpack[0] = instr->alu.mul.a_unpack; - unpack[1] = instr->alu.mul.b_unpack; + unpack[0] = instr->alu.mul.a.unpack; + unpack[1] = instr->alu.mul.b.unpack; } for (int i = 0; i < nsrc; i++) { diff --git a/src/broadcom/compiler/vir_live_variables.c b/src/broadcom/compiler/vir_live_variables.c index 2fd6430a0f4..d1f44aa9cf7 100644 --- a/src/broadcom/compiler/vir_live_variables.c +++ b/src/broadcom/compiler/vir_live_variables.c @@ -179,17 +179,22 @@ vir_setup_def_use(struct v3d_compile *c) flags_inst = NULL; } - /* Payload registers: r0/1/2 contain W, centroid W, - * and Z at program start. Register allocation will - * force their nodes to R0/1/2. + /* Payload registers: for fragment shaders, W, + * centroid W, and Z will be initialized in r0/1/2 + * until v42, or r1/r2/r3 since v71. + * + * For compute shaders, payload is in r0/r2 up to v42, + * r2/r3 since v71. + * + * Register allocation will force their nodes to those + * registers. */ if (inst->src[0].file == QFILE_REG) { - switch (inst->src[0].index) { - case 0: - case 1: - case 2: + uint32_t min_payload_r = c->devinfo->ver >= 71 ? 1 : 0; + uint32_t max_payload_r = c->devinfo->ver >= 71 ? 
3 : 2; + if (inst->src[0].index >= min_payload_r && + inst->src[0].index <= max_payload_r) { c->temp_start[inst->dst.index] = 0; - break; } } @@ -306,6 +311,8 @@ vir_calculate_live_intervals(struct v3d_compile *c) vir_for_each_block(block, c) { ralloc_free(block->def); + ralloc_free(block->defin); + ralloc_free(block->defout); ralloc_free(block->use); ralloc_free(block->live_in); ralloc_free(block->live_out); diff --git a/src/broadcom/compiler/vir_opt_constant_alu.c b/src/broadcom/compiler/vir_opt_constant_alu.c index 483646f882e..dc4c8a65026 100644 --- a/src/broadcom/compiler/vir_opt_constant_alu.c +++ b/src/broadcom/compiler/vir_opt_constant_alu.c @@ -1,5 +1,5 @@ /* - * Copyright © 2021 Raspberry Pi + * Copyright © 2021 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -155,6 +155,7 @@ vir_opt_constant_alu(struct v3d_compile *c) { bool progress = false; vir_for_each_block(block, c) { + c->cur_block = block; vir_for_each_inst_safe(inst, block) { progress = try_opt_constant_alu(c, inst) || progress; } diff --git a/src/broadcom/compiler/vir_opt_copy_propagate.c b/src/broadcom/compiler/vir_opt_copy_propagate.c index c5bb6112173..611c4693ed3 100644 --- a/src/broadcom/compiler/vir_opt_copy_propagate.c +++ b/src/broadcom/compiler/vir_opt_copy_propagate.c @@ -35,7 +35,7 @@ #include "v3d_compiler.h" static bool -is_copy_mov(struct qinst *inst) +is_copy_mov(const struct v3d_device_info *devinfo, struct qinst *inst) { if (!inst) return false; @@ -62,36 +62,65 @@ is_copy_mov(struct qinst *inst) return false; } - switch (inst->src[0].file) { - case QFILE_MAGIC: - /* No copy propagating from R3/R4/R5 -- the MOVs from those - * are there to register allocate values produced into R3/4/5 - * to other regs (though hopefully r3/4/5). - */ - switch (inst->src[0].index) { - case V3D_QPU_WADDR_R3: - case V3D_QPU_WADDR_R4: - case V3D_QPU_WADDR_R5: - return false; + if (devinfo->ver == 42) { + switch (inst->src[0].file) { + case QFILE_MAGIC: + /* No copy propagating from R3/R4/R5 -- the MOVs from + * those are there to register allocate values produced + * into R3/4/5 to other regs (though hopefully r3/4/5). + */ + switch (inst->src[0].index) { + case V3D_QPU_WADDR_R3: + case V3D_QPU_WADDR_R4: + case V3D_QPU_WADDR_R5: + return false; + default: + break; + } + break; + + case QFILE_REG: + switch (inst->src[0].index) { + case 0: + case 1: + case 2: + /* MOVs from rf0/1/2 are only to track the live + * intervals for W/centroid W/Z. + */ + return false; + } + break; + default: break; } - break; - - case QFILE_REG: - switch (inst->src[0].index) { - case 0: - case 1: - case 2: - /* MOVs from rf0/1/2 are only to track the live + } else { + assert(devinfo->ver >= 71); + switch (inst->src[0].file) { + case QFILE_REG: + switch (inst->src[0].index) { + /* MOVs from rf1/2/3 are only to track the live * intervals for W/centroid W/Z. + * + * Note: rf0 can be implicitly written by ldvary + * (no temp involved), so it is not an SSA value and + * could clash with writes to other temps that are + * also allocated to rf0. In theory, that would mean + * that we can't copy propagate from it, but we handle + * this at register allocation time, preventing temps + * from being allocated to rf0 while the rf0 value from + * ldvary is still live. 
*/ - return false; - } - break; + case 1: + case 2: + case 3: + return false; + } + break; - default: - break; + default: + break; + } } return true; @@ -104,14 +133,14 @@ vir_has_unpack(struct qinst *inst, int chan) if (vir_is_add(inst)) { if (chan == 0) - return inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE; + return inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE; else - return inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE; + return inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE; } else { if (chan == 0) - return inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE; + return inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE; else - return inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE; + return inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE; } } @@ -135,7 +164,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs) */ struct qinst *mov = movs[inst->src[i].index]; if (!mov) { - if (!is_copy_mov(c->defs[inst->src[i].index])) + if (!is_copy_mov(c->devinfo, c->defs[inst->src[i].index])) continue; mov = c->defs[inst->src[i].index]; @@ -161,7 +190,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs) continue; /* these ops can't represent abs. */ - if (mov->qpu.alu.mul.a_unpack == V3D_QPU_UNPACK_ABS) { + if (mov->qpu.alu.mul.a.unpack == V3D_QPU_UNPACK_ABS) { switch (inst->qpu.alu.add.op) { case V3D_QPU_A_VFPACK: case V3D_QPU_A_FROUND: @@ -189,7 +218,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs) inst->src[i] = mov->src[0]; if (vir_has_unpack(mov, 0)) { - enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a_unpack; + enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a.unpack; vir_set_unpack(inst, i, unpack); } @@ -238,12 +267,14 @@ vir_opt_copy_propagate(struct v3d_compile *c) */ memset(movs, 0, sizeof(struct qinst *) * c->num_temps); + c->cur_block = block; vir_for_each_inst(inst, block) { + progress = try_copy_prop(c, inst, movs) || progress; apply_kills(c, movs, inst); - if (is_copy_mov(inst)) + if (is_copy_mov(c->devinfo, inst)) movs[inst->dst.index] = inst; } } diff --git a/src/broadcom/compiler/vir_opt_dead_code.c b/src/broadcom/compiler/vir_opt_dead_code.c index 64c762c88db..fd1af944427 100644 --- a/src/broadcom/compiler/vir_opt_dead_code.c +++ b/src/broadcom/compiler/vir_opt_dead_code.c @@ -52,21 +52,10 @@ dce(struct v3d_compile *c, struct qinst *inst) } static bool -has_nonremovable_reads(struct v3d_compile *c, struct qinst *inst) -{ - for (int i = 0; i < vir_get_nsrc(inst); i++) { - if (inst->src[i].file == QFILE_VPM) - return true; - } - - return false; -} - -static bool can_write_to_null(struct v3d_compile *c, struct qinst *inst) { /* The SFU instructions must write to a physical register. 
*/ - if (c->devinfo->ver >= 41 && v3d_qpu_uses_sfu(&inst->qpu)) + if (v3d_qpu_uses_sfu(&inst->qpu)) return false; return true; @@ -149,30 +138,25 @@ check_first_ldunifa(struct v3d_compile *c, } static bool -increment_unifa_address(struct v3d_compile *c, struct qblock *block, struct qinst *unifa) +increment_unifa_address(struct v3d_compile *c, struct qinst *unifa) { - struct qblock *current_block = c->cur_block; if (unifa->qpu.type == V3D_QPU_INSTR_TYPE_ALU && unifa->qpu.alu.mul.op == V3D_QPU_M_MOV) { c->cursor = vir_after_inst(unifa); - c->cur_block = block; struct qreg unifa_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA); vir_ADD_dest(c, unifa_reg, unifa->src[0], vir_uniform_ui(c, 4u)); vir_remove_instruction(c, unifa); - c->cur_block = current_block; return true; } if (unifa->qpu.type == V3D_QPU_INSTR_TYPE_ALU && unifa->qpu.alu.add.op == V3D_QPU_A_ADD) { c->cursor = vir_after_inst(unifa); - c->cur_block = block; struct qreg unifa_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA); struct qreg tmp = vir_ADD(c, unifa->src[1], vir_uniform_ui(c, 4u)); vir_ADD_dest(c, unifa_reg, unifa->src[0], tmp); vir_remove_instruction(c, unifa); - c->cur_block = current_block; return true; } @@ -200,7 +184,7 @@ vir_opt_dead_code(struct v3d_compile *c) vir_for_each_block(block, c) { struct qinst *last_flags_write = NULL; - + c->cur_block = block; vir_for_each_inst_safe(inst, block) { /* If this instruction reads the flags, we can't * remove the flags generation for it. @@ -246,7 +230,6 @@ vir_opt_dead_code(struct v3d_compile *c) } if (v3d_qpu_writes_flags(&inst->qpu) || - has_nonremovable_reads(c, inst) || (is_ldunifa && !is_first_ldunifa && !is_last_ldunifa)) { /* If we can't remove the instruction, but we * don't need its destination value, just @@ -276,7 +259,7 @@ vir_opt_dead_code(struct v3d_compile *c) */ if (is_first_ldunifa) { assert(unifa); - if (!increment_unifa_address(c, block, unifa)) + if (!increment_unifa_address(c, unifa)) continue; } diff --git a/src/broadcom/compiler/vir_opt_redundant_flags.c b/src/broadcom/compiler/vir_opt_redundant_flags.c index 4609ef9c361..6b61ed6a39a 100644 --- a/src/broadcom/compiler/vir_opt_redundant_flags.c +++ b/src/broadcom/compiler/vir_opt_redundant_flags.c @@ -81,11 +81,11 @@ vir_instr_flags_op_equal(struct qinst *a, struct qinst *b) a->qpu.flags.mpf != b->qpu.flags.mpf || a->qpu.alu.add.op != b->qpu.alu.add.op || a->qpu.alu.mul.op != b->qpu.alu.mul.op || - a->qpu.alu.add.a_unpack != b->qpu.alu.add.a_unpack || - a->qpu.alu.add.b_unpack != b->qpu.alu.add.b_unpack || + a->qpu.alu.add.a.unpack != b->qpu.alu.add.a.unpack || + a->qpu.alu.add.b.unpack != b->qpu.alu.add.b.unpack || a->qpu.alu.add.output_pack != b->qpu.alu.add.output_pack || - a->qpu.alu.mul.a_unpack != b->qpu.alu.mul.a_unpack || - a->qpu.alu.mul.b_unpack != b->qpu.alu.mul.b_unpack || + a->qpu.alu.mul.a.unpack != b->qpu.alu.mul.a.unpack || + a->qpu.alu.mul.b.unpack != b->qpu.alu.mul.b.unpack || a->qpu.alu.mul.output_pack != b->qpu.alu.mul.output_pack) { return false; } @@ -99,6 +99,7 @@ vir_opt_redundant_flags_block(struct v3d_compile *c, struct qblock *block) struct qinst *last_flags = NULL; bool progress = false; + c->cur_block = block; vir_for_each_inst(inst, block) { if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU || inst->qpu.flags.auf != V3D_QPU_UF_NONE || diff --git a/src/broadcom/compiler/vir_opt_small_immediates.c b/src/broadcom/compiler/vir_opt_small_immediates.c index 47d7722968d..56f0bf20706 100644 --- a/src/broadcom/compiler/vir_opt_small_immediates.c +++ 
b/src/broadcom/compiler/vir_opt_small_immediates.c @@ -44,7 +44,9 @@ vir_opt_small_immediates(struct v3d_compile *c) /* The small immediate value sits in the raddr B field, so we * can't have 2 small immediates in one instruction (unless * they're the same value, but that should be optimized away - * elsewhere). + * elsewhere). Since 7.x we can encode small immediates in + * any raddr field, but each instruction can still only use + * one. */ bool uses_small_imm = false; for (int i = 0; i < vir_get_nsrc(inst); i++) { @@ -80,7 +82,22 @@ vir_opt_small_immediates(struct v3d_compile *c) */ struct v3d_qpu_sig new_sig = inst->qpu.sig; uint32_t sig_packed; - new_sig.small_imm = true; + if (c->devinfo->ver == 42) { + new_sig.small_imm_b = true; + } else { + if (vir_is_add(inst)) { + if (i == 0) + new_sig.small_imm_a = true; + else + new_sig.small_imm_b = true; + } else { + if (i == 0) + new_sig.small_imm_c = true; + else + new_sig.small_imm_d = true; + } + } + if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig_packed)) continue; @@ -89,7 +106,10 @@ vir_opt_small_immediates(struct v3d_compile *c) vir_dump_inst(c, inst); fprintf(stderr, "\n"); } - inst->qpu.sig.small_imm = true; + inst->qpu.sig.small_imm_a = new_sig.small_imm_a; + inst->qpu.sig.small_imm_b = new_sig.small_imm_b; + inst->qpu.sig.small_imm_c = new_sig.small_imm_c; + inst->qpu.sig.small_imm_d = new_sig.small_imm_d; inst->qpu.raddr_b = packed; inst->src[i].file = QFILE_SMALL_IMM; diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c index 08698b4ece1..53e84840899 100644 --- a/src/broadcom/compiler/vir_register_allocate.c +++ b/src/broadcom/compiler/vir_register_allocate.c @@ -26,12 +26,100 @@ #include "common/v3d_device_info.h" #include "v3d_compiler.h" -#define QPU_R(i) { .magic = false, .index = i } - #define ACC_INDEX 0 #define ACC_COUNT 6 -#define PHYS_INDEX (ACC_INDEX + ACC_COUNT) -#define PHYS_COUNT 64 + +/* RA nodes used to track RF registers with implicit writes */ +#define IMPLICIT_RF_COUNT 1 + +#define PHYS_COUNT 64 + +static uint8_t +get_phys_index(const struct v3d_device_info *devinfo) +{ + if (devinfo->has_accumulators) + return ACC_INDEX + ACC_COUNT; + else + return 0; +} + +/* ACC as accumulator */ +#define CLASS_BITS_PHYS (1 << 0) +#define CLASS_BITS_ACC (1 << 1) +#define CLASS_BITS_R5 (1 << 4) + +static uint8_t +get_class_bit_any(const struct v3d_device_info *devinfo) +{ + if (devinfo->has_accumulators) + return (CLASS_BITS_PHYS | CLASS_BITS_ACC | CLASS_BITS_R5); + else + return CLASS_BITS_PHYS; +} + +static uint8_t +filter_class_bits(const struct v3d_device_info *devinfo, uint8_t class_bits) +{ + if (!devinfo->has_accumulators) { + assert(class_bits & CLASS_BITS_PHYS); + class_bits = CLASS_BITS_PHYS; + } + return class_bits; +} + +static inline uint32_t +temp_to_node(struct v3d_compile *c, uint32_t temp) +{ + return temp + (c->devinfo->has_accumulators ? ACC_COUNT : + IMPLICIT_RF_COUNT); +} + +static inline uint32_t +node_to_temp(struct v3d_compile *c, uint32_t node) +{ + assert((c->devinfo->has_accumulators && node >= ACC_COUNT) || + (!c->devinfo->has_accumulators && node >= IMPLICIT_RF_COUNT)); + return node - (c->devinfo->has_accumulators ? 
ACC_COUNT : + IMPLICIT_RF_COUNT); +} + +static inline uint8_t +get_temp_class_bits(struct v3d_compile *c, + uint32_t temp) +{ + return c->nodes.info[temp_to_node(c, temp)].class_bits; +} + +static inline void +set_temp_class_bits(struct v3d_compile *c, + uint32_t temp, uint8_t class_bits) +{ + c->nodes.info[temp_to_node(c, temp)].class_bits = class_bits; +} + +static struct ra_class * +choose_reg_class(struct v3d_compile *c, uint8_t class_bits) +{ + if (class_bits == CLASS_BITS_PHYS) { + return c->compiler->reg_class_phys[c->thread_index]; + } else if (class_bits == (CLASS_BITS_R5)) { + assert(c->devinfo->has_accumulators); + return c->compiler->reg_class_r5[c->thread_index]; + } else if (class_bits == (CLASS_BITS_PHYS | CLASS_BITS_ACC)) { + assert(c->devinfo->has_accumulators); + return c->compiler->reg_class_phys_or_acc[c->thread_index]; + } else { + assert(class_bits == get_class_bit_any(c->devinfo)); + return c->compiler->reg_class_any[c->thread_index]; + } +} + +static inline struct ra_class * +choose_reg_class_for_temp(struct v3d_compile *c, uint32_t temp) +{ + assert(temp < c->num_temps && temp < c->nodes.alloc_count); + return choose_reg_class(c, get_temp_class_bits(c, temp)); +} static inline bool qinst_writes_tmu(const struct v3d_device_info *devinfo, @@ -46,23 +134,22 @@ static bool is_end_of_tmu_sequence(const struct v3d_device_info *devinfo, struct qinst *inst, struct qblock *block) { - if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && - inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) { - return true; - } - - if (!inst->qpu.sig.ldtmu) + /* Only tmuwt and ldtmu can finish TMU sequences */ + bool is_tmuwt = inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && + inst->qpu.alu.add.op == V3D_QPU_A_TMUWT; + bool is_ldtmu = inst->qpu.sig.ldtmu; + if (!is_tmuwt && !is_ldtmu) return false; + /* Check if this is the last tmuwt or ldtmu in the sequence */ list_for_each_entry_from(struct qinst, scan_inst, inst->link.next, &block->instructions, link) { - if (scan_inst->qpu.sig.ldtmu) - return false; + is_tmuwt = scan_inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && + scan_inst->qpu.alu.add.op == V3D_QPU_A_TMUWT; + is_ldtmu = scan_inst->qpu.sig.ldtmu; - if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && - inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) { - return true; - } + if (is_tmuwt || is_ldtmu) + return false; if (qinst_writes_tmu(devinfo, scan_inst)) return true; @@ -79,11 +166,101 @@ vir_is_mov_uniform(struct v3d_compile *c, int temp) return def && def->qpu.sig.ldunif; } +static bool +can_reconstruct_inst(struct qinst *inst) +{ + assert(inst); + + if (vir_is_add(inst)) { + switch (inst->qpu.alu.add.op) { + case V3D_QPU_A_FXCD: + case V3D_QPU_A_FYCD: + case V3D_QPU_A_XCD: + case V3D_QPU_A_YCD: + case V3D_QPU_A_IID: + case V3D_QPU_A_EIDX: + case V3D_QPU_A_TIDX: + case V3D_QPU_A_SAMPID: + /* No need to check input unpacks because none of these + * opcodes read sources. FXCD,FYCD have pack variants. 
+ */ + return inst->qpu.flags.ac == V3D_QPU_COND_NONE && + inst->qpu.flags.auf == V3D_QPU_UF_NONE && + inst->qpu.flags.apf == V3D_QPU_PF_NONE && + inst->qpu.alu.add.output_pack == V3D_QPU_PACK_NONE; + default: + return false; + } + } + + return false; +} + +static bool +can_reconstruct_temp(struct v3d_compile *c, int temp) +{ + struct qinst *def = c->defs[temp]; + return def && can_reconstruct_inst(def); +} + +static struct qreg +reconstruct_temp(struct v3d_compile *c, enum v3d_qpu_add_op op) +{ + struct qreg dest; + switch (op) { + case V3D_QPU_A_FXCD: + dest = vir_FXCD(c); + break; + case V3D_QPU_A_FYCD: + dest = vir_FYCD(c); + break; + case V3D_QPU_A_XCD: + dest = vir_XCD(c); + break; + case V3D_QPU_A_YCD: + dest = vir_YCD(c); + break; + case V3D_QPU_A_IID: + dest = vir_IID(c); + break; + case V3D_QPU_A_EIDX: + dest = vir_EIDX(c); + break; + case V3D_QPU_A_TIDX: + dest = vir_TIDX(c); + break; + case V3D_QPU_A_SAMPID: + dest = vir_SAMPID(c); + break; + default: + unreachable("Unexpected opcode for reconstruction"); + } + + return dest; +} + +enum temp_spill_type { + SPILL_TYPE_UNIFORM, + SPILL_TYPE_RECONSTRUCT, + SPILL_TYPE_TMU +}; + +static enum temp_spill_type +get_spill_type_for_temp(struct v3d_compile *c, int temp) +{ + if (vir_is_mov_uniform(c, temp)) + return SPILL_TYPE_UNIFORM; + + if (can_reconstruct_temp(c, temp)) + return SPILL_TYPE_RECONSTRUCT; + + return SPILL_TYPE_TMU; +} + static int -v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, - uint32_t *temp_to_node) +v3d_choose_spill_node(struct v3d_compile *c) { - const float tmu_scale = 5; + const float tmu_scale = 10; float block_scale = 1.0; float spill_costs[c->num_temps]; bool in_tmu_operation = false; @@ -99,7 +276,8 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, * starting output writes. */ bool no_spilling = - c->threads > 1 && started_last_seg; + (c->threads > 1 && started_last_seg) || + (c->max_tmu_spills == 0); /* Discourage spilling of TMU operations */ for (int i = 0; i < vir_get_nsrc(inst); i++) { @@ -107,7 +285,10 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, continue; int temp = inst->src[i].index; - if (vir_is_mov_uniform(c, temp)) { + enum temp_spill_type spill_type = + get_spill_type_for_temp(c, temp); + + if (spill_type != SPILL_TYPE_TMU) { spill_costs[temp] += block_scale; } else if (!no_spilling) { float tmu_op_scale = in_tmu_operation ? @@ -122,11 +303,11 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, if (inst->dst.file == QFILE_TEMP) { int temp = inst->dst.index; + enum temp_spill_type spill_type = + get_spill_type_for_temp(c, temp); - if (vir_is_mov_uniform(c, temp)) { - /* We just rematerialize the unform - * later. - */ + if (spill_type != SPILL_TYPE_TMU) { + /* We just rematerialize it later */ } else if (!no_spilling) { spill_costs[temp] += (block_scale * tmu_scale); @@ -147,10 +328,6 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, if (inst->is_last_thrsw) started_last_seg = true; - if (v3d_qpu_writes_vpm(&inst->qpu) || - v3d_qpu_uses_tlb(&inst->qpu)) - started_last_seg = true; - /* Track when we're in between a TMU setup and the * final LDTMU or TMUWT from that TMU setup. We * penalize spills during that time. @@ -163,12 +340,53 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, } } + /* We always emit a "last thrsw" to ensure all our spilling occurs + * before the last thread section. See vir_emit_last_thrsw. 
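An aside on the cost model in v3d_choose_spill_node() (not part of the patch): the pass accumulates a per-temp cost and passes it to ra_set_node_spill_cost() just below, so a higher accumulated cost makes a temp less likely to be picked by ra_get_best_spill_node(). A minimal sketch of the per-access increment, with hypothetical names and ignoring the extra penalty applied to uses inside TMU sequences:

static float
spill_cost_increment(float block_scale, bool needs_tmu_spill, bool is_def)
{
        const float tmu_scale = 10.0f; /* mirrors the constant above */

        if (!needs_tmu_spill)
                return is_def ? 0.0f : block_scale; /* rematerialized later */

        /* Real TMU spill/fill candidates: writes are penalized the most */
        return is_def ? block_scale * tmu_scale : block_scale;
}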
+ */ + assert(started_last_seg); + for (unsigned i = 0; i < c->num_temps; i++) { - if (BITSET_TEST(c->spillable, i)) - ra_set_node_spill_cost(g, temp_to_node[i], spill_costs[i]); + if (BITSET_TEST(c->spillable, i)) { + ra_set_node_spill_cost(c->g, temp_to_node(c, i), + spill_costs[i]); + } } - return ra_get_best_spill_node(g); + return ra_get_best_spill_node(c->g); +} + +static void +ensure_nodes(struct v3d_compile *c) +{ + if (c->num_temps < c->nodes.alloc_count) + return; + + c->nodes.alloc_count *= 2; + c->nodes.info = reralloc_array_size(c, + c->nodes.info, + sizeof(c->nodes.info[0]), + c->nodes.alloc_count + + MAX2(ACC_COUNT, IMPLICIT_RF_COUNT)); +} + +/* Creates the interference node for a new temp. We use this to keep the node + * list updated during the spilling process, which generates new temps/nodes. + */ +static void +add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits) +{ + ensure_nodes(c); + + int node = ra_add_node(c->g, choose_reg_class(c, class_bits)); + assert(c->devinfo->has_accumulators ? node == temp + ACC_COUNT : + node == temp + IMPLICIT_RF_COUNT); + + /* We fill the node priority after we are done inserting spills */ + c->nodes.info[node].class_bits = class_bits; + c->nodes.info[node].priority = 0; + c->nodes.info[node].is_ldunif_dst = false; + c->nodes.info[node].is_program_end = false; + c->nodes.info[node].unused = false; } /* The spill offset for this thread takes a bit of setup, so do it once at @@ -206,79 +424,224 @@ v3d_setup_spill_base(struct v3d_compile *c) vir_uniform(c, QUNIFORM_SPILL_OFFSET, 0)); /* Make sure that we don't spill the spilling setup instructions. */ - for (int i = start_num_temps; i < c->num_temps; i++) + for (int i = start_num_temps; i < c->num_temps; i++) { BITSET_CLEAR(c->spillable, i); + /* If we are spilling, update the RA map with the temps added + * by the spill setup. Our spill_base register can never be an + * accumulator because it is used for TMU spill/fill and thus + * needs to persist across thread switches. + */ + if (c->spilling) { + int temp_class = CLASS_BITS_PHYS; + if (c->devinfo->has_accumulators && + i != c->spill_base.index) { + temp_class |= CLASS_BITS_ACC; + } + add_node(c, i, temp_class); + } + } + /* Restore the current block. */ c->cur_block = current_block; c->cursor = vir_after_block(c->cur_block); } -static struct qinst * -v3d_emit_spill_tmua(struct v3d_compile *c, uint32_t spill_offset) +/** + * Computes the address for a spill/fill sequence and completes the spill/fill + * sequence by emitting the following code: + * + * ldunif.spill_offset + * add tmua spill_base spill_offset + * thrsw + * + * If the sequence is for a spill, then it will emit a tmuwt after the thrsw, + * otherwise it will emit an ldtmu to load the fill result into 'fill_dst'. + * + * The parameter 'ip' represents the ip at which the spill/fill is happening. + * This is used to disallow accumulators on temps that cross this ip boundary + * due to the new thrsw itroduced in the sequence above. + */ +static void +v3d_emit_spill_tmua(struct v3d_compile *c, + uint32_t spill_offset, + enum v3d_qpu_cond cond, + int32_t ip, + struct qreg *fill_dst) { - return vir_ADD_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA), - c->spill_base, vir_uniform_ui(c, spill_offset)); -} + assert(ip >= 0); + + /* Load a uniform with the spill offset and add it to the spill base + * to obtain the TMUA address. It can be of class ANY because we know + * we are consuming it immediately without thrsw in between. 
+ */ + assert(c->disable_ldunif_opt); + struct qreg offset = vir_uniform_ui(c, spill_offset); + add_node(c, offset.index, get_class_bit_any(c->devinfo)); + /* We always enable per-quad on spills/fills to ensure we spill + * any channels involved with helper invocations. + */ + struct qreg tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU); + struct qinst *inst = vir_ADD_dest(c, tmua, c->spill_base, offset); + inst->qpu.flags.ac = cond; + inst->ldtmu_count = 1; + inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, + 0xffffff7f); /* per-quad */ + + vir_emit_thrsw(c); + + /* If this is for a spill, emit a TMUWT otherwise a LDTMU to load the + * result of the fill. The TMUWT temp is not really read, the ldtmu + * temp will be used immediately so just like the uniform above we + * can allow accumulators. + */ + int temp_class = + filter_class_bits(c->devinfo, CLASS_BITS_PHYS | CLASS_BITS_ACC); + if (!fill_dst) { + struct qreg dst = vir_TMUWT(c); + assert(dst.file == QFILE_TEMP); + add_node(c, dst.index, temp_class); + } else { + *fill_dst = vir_LDTMU(c); + assert(fill_dst->file == QFILE_TEMP); + add_node(c, fill_dst->index, temp_class); + } + + /* Temps across the thread switch we injected can't be assigned to + * accumulators. + * + * Fills inject code before ip, so anything that starts at ip or later + * is not affected by the thrsw. Something that ends at ip will be + * affected though. + * + * Spills inject code after ip, so anything that starts strictly later + * than ip is not affected (the temp starting at ip is usually the + * spilled temp except for postponed spills). Something that ends at ip + * won't be affected either. + */ + for (int i = 0; i < c->spill_start_num_temps; i++) { + bool thrsw_cross = fill_dst ? + c->temp_start[i] < ip && c->temp_end[i] >= ip : + c->temp_start[i] <= ip && c->temp_end[i] > ip; + if (thrsw_cross) { + ra_set_node_class(c->g, temp_to_node(c, i), + choose_reg_class(c, CLASS_BITS_PHYS)); + } + } +} static void -v3d_emit_tmu_spill(struct v3d_compile *c, struct qinst *inst, - struct qinst *position, uint32_t spill_offset) +v3d_emit_tmu_spill(struct v3d_compile *c, + struct qinst *inst, + struct qreg spill_temp, + struct qinst *position, + uint32_t ip, + uint32_t spill_offset) { assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU); + assert(inst->dst.file == QFILE_TEMP); c->cursor = vir_after_inst(position); - inst->dst = vir_get_temp(c); + enum v3d_qpu_cond cond = vir_get_cond(inst); + + /* If inst and position don't match, this is a postponed spill, + * in which case we have already allocated the temp for the spill + * and we should use that, otherwise create a new temp with the + * same register class bits as the original. + */ + if (inst == position) { + uint8_t class_bits = get_temp_class_bits(c, inst->dst.index); + inst->dst = vir_get_temp(c); + add_node(c, inst->dst.index, class_bits); + } else { + inst->dst = spill_temp; + + /* If this is a postponed spill the register being spilled may + * have been written more than once including conditional + * writes, so ignore predication on the spill instruction and + * always spill the full register. 
+ */ + cond = V3D_QPU_COND_NONE; + } + struct qinst *tmp = vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD), inst->dst); tmp->qpu.flags.mc = cond; - tmp = v3d_emit_spill_tmua(c, spill_offset); - tmp->qpu.flags.ac = cond; - vir_emit_thrsw(c); - vir_TMUWT(c); + + v3d_emit_spill_tmua(c, spill_offset, cond, ip, NULL); + c->spills++; c->tmu_dirty_rcl = true; } +static inline bool +interferes(int32_t t0_start, int32_t t0_end, int32_t t1_start, int32_t t1_end) +{ + return !(t0_start >= t1_end || t1_start >= t0_end); +} + static void -v3d_spill_reg(struct v3d_compile *c, int spill_temp) +v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int *implicit_rf_nodes, + int spill_temp) { - c->spill_count++; + c->spill_start_num_temps = c->num_temps; + c->spilling = true; - bool is_uniform = vir_is_mov_uniform(c, spill_temp); + enum temp_spill_type spill_type = get_spill_type_for_temp(c, spill_temp); uint32_t spill_offset = 0; - - if (!is_uniform) { + if (spill_type == SPILL_TYPE_TMU) { spill_offset = c->spill_size; c->spill_size += V3D_CHANNELS * sizeof(uint32_t); - if (spill_offset == 0) + if (spill_offset == 0) { v3d_setup_spill_base(c); + + /* Don't allocate our spill base to rf0 to avoid + * conflicts with instructions doing implicit writes + * to that register. + */ + if (!c->devinfo->has_accumulators) { + ra_add_node_interference( + c->g, + temp_to_node(c, c->spill_base.index), + implicit_rf_nodes[0]); + } + } } struct qinst *last_thrsw = c->last_thrsw; assert(last_thrsw && last_thrsw->is_last_thrsw); - int start_num_temps = c->num_temps; - int uniform_index = ~0; - if (is_uniform) { + if (spill_type == SPILL_TYPE_UNIFORM) { struct qinst *orig_unif = c->defs[spill_temp]; uniform_index = orig_unif->uniform; } + enum v3d_qpu_add_op reconstruct_op = V3D_QPU_A_NOP; + if (spill_type == SPILL_TYPE_RECONSTRUCT) { + struct qinst *orig_def = c->defs[spill_temp]; + assert(vir_is_add(orig_def)); + reconstruct_op = orig_def->qpu.alu.add.op; + } + + uint32_t spill_node = temp_to_node(c, spill_temp); + /* We must disable the ldunif optimization if we are spilling uniforms */ bool had_disable_ldunif_opt = c->disable_ldunif_opt; c->disable_ldunif_opt = true; struct qinst *start_of_tmu_sequence = NULL; struct qinst *postponed_spill = NULL; + struct qreg postponed_spill_temp = { 0 }; vir_for_each_block(block, c) { vir_for_each_inst_safe(inst, block) { + int32_t ip = inst->ip; + /* Track when we're in between a TMU setup and the final * LDTMU or TMUWT from that TMU setup. We can't spill/fill any * temps during that time, because that involves inserting a @@ -289,7 +652,8 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp) if (is_end_of_tmu_sequence(c->devinfo, inst, block)) { if (postponed_spill) { v3d_emit_tmu_spill(c, postponed_spill, - inst, spill_offset); + postponed_spill_temp, + inst, ip, spill_offset); } start_of_tmu_sequence = NULL; @@ -302,49 +666,103 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp) } /* fills */ + int filled_src = -1; for (int i = 0; i < vir_get_nsrc(inst); i++) { if (inst->src[i].file != QFILE_TEMP || inst->src[i].index != spill_temp) { continue; } + if (filled_src >= 0) { + inst->src[i] = inst->src[filled_src]; + continue; + } + c->cursor = vir_before_inst(inst); - if (is_uniform) { + if (spill_type == SPILL_TYPE_UNIFORM) { struct qreg unif = vir_uniform(c, c->uniform_contents[uniform_index], c->uniform_data[uniform_index]); inst->src[i] = unif; + /* We are using the uniform in the + * instruction immediately after, so + * we can use any register class for it. 
+ */ + add_node(c, unif.index, + get_class_bit_any(c->devinfo)); + } else if (spill_type == SPILL_TYPE_RECONSTRUCT) { + struct qreg temp = + reconstruct_temp(c, reconstruct_op); + inst->src[i] = temp; + /* We are using the temp in the + * instruction immediately after so we + * can use ACC. + */ + int temp_class = + filter_class_bits(c->devinfo, CLASS_BITS_PHYS | + CLASS_BITS_ACC); + add_node(c, temp.index, temp_class); } else { - /* If we have a postponed spill, we don't need - * a fill as the temp would not have been - * spilled yet. + /* If we have a postponed spill, we + * don't need a fill as the temp would + * not have been spilled yet, however, + * we need to update the temp index. */ - if (postponed_spill) - continue; - if (start_of_tmu_sequence) - c->cursor = vir_before_inst(start_of_tmu_sequence); - - v3d_emit_spill_tmua(c, spill_offset); - vir_emit_thrsw(c); - inst->src[i] = vir_LDTMU(c); - c->fills++; + if (postponed_spill) { + inst->src[i] = + postponed_spill_temp; + } else { + int32_t fill_ip = ip; + if (start_of_tmu_sequence) { + c->cursor = vir_before_inst(start_of_tmu_sequence); + fill_ip = start_of_tmu_sequence->ip; + } + + v3d_emit_spill_tmua(c, spill_offset, + V3D_QPU_COND_NONE, + fill_ip, &inst->src[i]); + c->fills++; + } } + + filled_src = i; } /* spills */ if (inst->dst.file == QFILE_TEMP && inst->dst.index == spill_temp) { - if (is_uniform) { + if (spill_type != SPILL_TYPE_TMU) { c->cursor.link = NULL; vir_remove_instruction(c, inst); } else { - if (start_of_tmu_sequence) + /* If we are in the middle of a TMU + * sequence, we postpone the actual + * spill until we have finished it. We, + * still need to replace the spill temp + * with a new temp though. + */ + if (start_of_tmu_sequence) { + if (postponed_spill) { + postponed_spill->dst = + postponed_spill_temp; + } + if (!postponed_spill || + vir_get_cond(inst) == V3D_QPU_COND_NONE) { + postponed_spill_temp = + vir_get_temp(c); + add_node(c, + postponed_spill_temp.index, + c->nodes.info[spill_node].class_bits); + } postponed_spill = inst; - else - v3d_emit_tmu_spill(c, inst, inst, + } else { + v3d_emit_tmu_spill(c, inst, + postponed_spill_temp, + inst, ip, spill_offset); + } } } } @@ -358,21 +776,64 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp) /* Don't allow spilling of our spilling instructions. There's no way * they can help get things colored. */ - for (int i = start_num_temps; i < c->num_temps; i++) + for (int i = c->spill_start_num_temps; i < c->num_temps; i++) BITSET_CLEAR(c->spillable, i); + /* Reset interference for spilled node */ + ra_set_node_spill_cost(c->g, spill_node, 0); + ra_reset_node_interference(c->g, spill_node); + BITSET_CLEAR(c->spillable, spill_temp); + + /* Rebuild program ips */ + int32_t ip = 0; + vir_for_each_inst_inorder(inst, c) + inst->ip = ip++; + + /* Rebuild liveness */ + vir_calculate_live_intervals(c); + + /* Add interferences for the new spilled temps and update interferences + * for c->spill_base (since we may have modified its liveness). Also, + * update node priorities based one new liveness data. 
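A quick sanity check on the interferes() helper used by the interference rebuild below (not part of the patch): it treats the two live ranges as half-open intervals, so ranges that merely touch do not conflict:

        interferes(0, 4, 4, 8)  /* false: [0,4) and [4,8) only touch */
        interferes(0, 5, 4, 8)  /* true:  they overlap on [4,5)      */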
+ */ + uint32_t sb_temp =c->spill_base.index; + uint32_t sb_node = temp_to_node(c, sb_temp); + for (uint32_t i = 0; i < c->num_temps; i++) { + if (c->temp_end[i] == -1) + continue; + + uint32_t node_i = temp_to_node(c, i); + c->nodes.info[node_i].priority = + c->temp_end[i] - c->temp_start[i]; + + for (uint32_t j = MAX2(i + 1, c->spill_start_num_temps); + j < c->num_temps; j++) { + if (interferes(c->temp_start[i], c->temp_end[i], + c->temp_start[j], c->temp_end[j])) { + uint32_t node_j = temp_to_node(c, j); + ra_add_node_interference(c->g, node_i, node_j); + } + } + + if (spill_type == SPILL_TYPE_TMU) { + if (i != sb_temp && + interferes(c->temp_start[i], c->temp_end[i], + c->temp_start[sb_temp], c->temp_end[sb_temp])) { + ra_add_node_interference(c->g, node_i, sb_node); + } + } + } + c->disable_ldunif_opt = had_disable_ldunif_opt; + c->spilling = false; } -struct node_to_temp_map { - uint32_t temp; - uint32_t priority; -}; - struct v3d_ra_select_callback_data { + uint32_t phys_index; uint32_t next_acc; uint32_t next_phys; - struct node_to_temp_map *map; + struct v3d_ra_node_info *nodes; + const struct v3d_device_info *devinfo; }; /* Choosing accumulators improves chances of merging QPU instructions @@ -384,6 +845,9 @@ v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra, BITSET_WORD *regs, int priority) { + if (!v3d_ra->devinfo->has_accumulators) + return false; + /* Favor accumulators if we have less that this number of physical * registers. Accumulators have more restrictions (like being * invalidated through thrsw), so running out of physical registers @@ -393,7 +857,7 @@ v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra, static const int available_rf_threshold = 5; int available_rf = 0 ; for (int i = 0; i < PHYS_COUNT; i++) { - if (BITSET_TEST(regs, PHYS_INDEX + i)) + if (BITSET_TEST(regs, v3d_ra->phys_index + i)) available_rf++; if (available_rf >= available_rf_threshold) break; @@ -419,6 +883,19 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra, BITSET_WORD *regs, unsigned int *out) { + if (!v3d_ra->devinfo->has_accumulators) + return false; + + /* Choose r5 for our ldunifs if possible (nobody else can load to that + * reg, and it keeps the QPU cond field free from being occupied by + * ldunifrf). + */ + int r5 = ACC_INDEX + 5; + if (BITSET_TEST(regs, r5)) { + *out = r5; + return true; + } + /* Round-robin through our accumulators to give post-RA instruction * selection more options. */ @@ -438,12 +915,47 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra, static bool v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra, + unsigned int node, BITSET_WORD *regs, unsigned int *out) { + /* If this node is for an unused temp, ignore. */ + if (v3d_ra->nodes->info[node].unused) { + *out = 0; + return true; + } + + /* In V3D 7.x, try to assign rf0 to temps used as ldunif's dst + * so we can avoid turning them into ldunifrf (which uses the + * cond field to encode the dst and would prevent merge with + * instructions that use cond flags). + */ + if (v3d_ra->nodes->info[node].is_ldunif_dst && + BITSET_TEST(regs, v3d_ra->phys_index)) { + assert(v3d_ra->devinfo->ver >= 71); + *out = v3d_ra->phys_index; + return true; + } + + /* The last 3 instructions in a shader can't use some specific registers + * (usually early rf registers, depends on v3d version) so try to + * avoid allocating these to registers used by the last instructions + * in the shader. + */ + const uint32_t safe_rf_start = v3d_ra->devinfo->ver == 42 ? 
3 : 4; + if (v3d_ra->nodes->info[node].is_program_end && + v3d_ra->next_phys < safe_rf_start) { + v3d_ra->next_phys = safe_rf_start; + } + for (int i = 0; i < PHYS_COUNT; i++) { int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT; - int phys = PHYS_INDEX + phys_off; + + /* Try to keep rf0 available for ldunif in 7.x (see above). */ + if (v3d_ra->devinfo->ver >= 71 && phys_off == 0) + continue; + + int phys = v3d_ra->phys_index + phys_off; if (BITSET_TEST(regs, phys)) { v3d_ra->next_phys = phys_off + 1; @@ -452,6 +964,14 @@ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra, } } + /* If we couldn't allocate, do try to assign rf0 if it is available. */ + if (v3d_ra->devinfo->ver >= 71 && + BITSET_TEST(regs, v3d_ra->phys_index)) { + v3d_ra->next_phys = 1; + *out = v3d_ra->phys_index; + return true; + } + return false; } @@ -459,22 +979,14 @@ static unsigned int v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data) { struct v3d_ra_select_callback_data *v3d_ra = data; - int r5 = ACC_INDEX + 5; - - /* Choose r5 for our ldunifs if possible (nobody else can load to that - * reg, and it keeps the QPU cond field free from being occupied by - * ldunifrf). - */ - if (BITSET_TEST(regs, r5)) - return r5; unsigned int reg; - if (v3d_ra_favor_accum(v3d_ra, regs, v3d_ra->map[n].priority) && + if (v3d_ra_favor_accum(v3d_ra, regs, v3d_ra->nodes->info[n].priority) && v3d_ra_select_accum(v3d_ra, regs, ®)) { return reg; } - if (v3d_ra_select_rf(v3d_ra, regs, ®)) + if (v3d_ra_select_rf(v3d_ra, n, regs, ®)) return reg; /* If we ran out of physical registers try to assign an accumulator @@ -492,9 +1004,10 @@ vir_init_reg_sets(struct v3d_compiler *compiler) /* Allocate up to 3 regfile classes, for the ways the physical * register file can be divided up for fragment shader threading. */ - int max_thread_index = (compiler->devinfo->ver >= 40 ? 
2 : 3); + int max_thread_index = 2; + uint8_t phys_index = get_phys_index(compiler->devinfo); - compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT, + compiler->regs = ra_alloc_reg_set(compiler, phys_index + PHYS_COUNT, false); if (!compiler->regs) return false; @@ -502,31 +1015,38 @@ vir_init_reg_sets(struct v3d_compiler *compiler) for (int threads = 0; threads < max_thread_index; threads++) { compiler->reg_class_any[threads] = ra_alloc_contig_reg_class(compiler->regs, 1); - compiler->reg_class_r5[threads] = - ra_alloc_contig_reg_class(compiler->regs, 1); - compiler->reg_class_phys_or_acc[threads] = - ra_alloc_contig_reg_class(compiler->regs, 1); + if (compiler->devinfo->has_accumulators) { + compiler->reg_class_r5[threads] = + ra_alloc_contig_reg_class(compiler->regs, 1); + compiler->reg_class_phys_or_acc[threads] = + ra_alloc_contig_reg_class(compiler->regs, 1); + } compiler->reg_class_phys[threads] = ra_alloc_contig_reg_class(compiler->regs, 1); - for (int i = PHYS_INDEX; - i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) { - ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); + /* Init physical regs */ + for (int i = phys_index; + i < phys_index + (PHYS_COUNT >> threads); i++) { + if (compiler->devinfo->has_accumulators) + ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); ra_class_add_reg(compiler->reg_class_phys[threads], i); ra_class_add_reg(compiler->reg_class_any[threads], i); } - for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) { - ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); - ra_class_add_reg(compiler->reg_class_any[threads], i); + /* Init accumulator regs */ + if (compiler->devinfo->has_accumulators) { + for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) { + ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); + ra_class_add_reg(compiler->reg_class_any[threads], i); + } + /* r5 can only store a single 32-bit value, so not much can + * use it. + */ + ra_class_add_reg(compiler->reg_class_r5[threads], + ACC_INDEX + 5); + ra_class_add_reg(compiler->reg_class_any[threads], + ACC_INDEX + 5); } - /* r5 can only store a single 32-bit value, so not much can - * use it. - */ - ra_class_add_reg(compiler->reg_class_r5[threads], - ACC_INDEX + 5); - ra_class_add_reg(compiler->reg_class_any[threads], - ACC_INDEX + 5); } ra_set_finalize(compiler->regs, NULL); @@ -534,52 +1054,220 @@ vir_init_reg_sets(struct v3d_compiler *compiler) return true; } -static int -node_to_temp_priority(const void *in_a, const void *in_b) +static inline bool +tmu_spilling_allowed(struct v3d_compile *c) { - const struct node_to_temp_map *a = in_a; - const struct node_to_temp_map *b = in_b; - - return a->priority - b->priority; + return c->spills + c->fills < c->max_tmu_spills; } -/** - * Computes the number of registers to spill in a batch after a register - * allocation failure. - */ -static uint32_t -get_spill_batch_size(struct v3d_compile *c) -{ - /* Allow up to 10 spills in batches of 1 in any case to avoid any chance of - * over-spilling if the program requires few spills to compile. - */ - if (c->spill_count < 10) - return 1; - - /* If we have to spill more than that we assume performance is not going to - * be great and we shift focus to batching spills to cut down compile - * time at the expense of over-spilling. - */ - return 20; -} - -/* Don't emit spills using the TMU until we've dropped thread count first. 
We, - * may also disable spilling when certain optimizations that are known to - * increase register pressure are active so we favor recompiling with - * optimizations disabled instead of spilling. - */ -static inline bool -tmu_spilling_allowed(struct v3d_compile *c, int thread_index) +static void +update_graph_and_reg_classes_for_inst(struct v3d_compile *c, + int *acc_nodes, + int *implicit_rf_nodes, + int last_ldvary_ip, + struct qinst *inst) { - return thread_index == 0 && c->tmu_spilling_allowed; + int32_t ip = inst->ip; + assert(ip >= 0); + + /* If the instruction writes r4 (and optionally moves its + * result to a temp), nothing else can be stored in r4 across + * it. + */ + if (vir_writes_r4_implicitly(c->devinfo, inst)) { + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && c->temp_end[i] > ip) { + ra_add_node_interference(c->g, + temp_to_node(c, i), + acc_nodes[4]); + } + } + } + + /* If any instruction writes to a physical register implicitly + * nothing else can write the same register across it. + */ + if (v3d_qpu_writes_rf0_implicitly(c->devinfo, &inst->qpu)) { + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && c->temp_end[i] > ip) { + ra_add_node_interference(c->g, + temp_to_node(c, i), + implicit_rf_nodes[0]); + } + } + } + + if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) { + switch (inst->qpu.alu.add.op) { + case V3D_QPU_A_LDVPMV_IN: + case V3D_QPU_A_LDVPMV_OUT: + case V3D_QPU_A_LDVPMD_IN: + case V3D_QPU_A_LDVPMD_OUT: + case V3D_QPU_A_LDVPMP: + case V3D_QPU_A_LDVPMG_IN: + case V3D_QPU_A_LDVPMG_OUT: { + /* LDVPMs only store to temps (the MA flag + * decides whether the LDVPM is in or out) + */ + assert(inst->dst.file == QFILE_TEMP); + set_temp_class_bits(c, inst->dst.index, + CLASS_BITS_PHYS); + break; + } + + case V3D_QPU_A_RECIP: + case V3D_QPU_A_RSQRT: + case V3D_QPU_A_EXP: + case V3D_QPU_A_LOG: + case V3D_QPU_A_SIN: + case V3D_QPU_A_RSQRT2: { + /* The SFU instructions write directly to the + * phys regfile. + */ + assert(inst->dst.file == QFILE_TEMP); + set_temp_class_bits(c, inst->dst.index, + CLASS_BITS_PHYS); + break; + } + + default: + break; + } + } + + if (inst->src[0].file == QFILE_REG) { + switch (inst->src[0].index) { + case 0: + /* V3D 7.x doesn't use rf0 for thread payload */ + if (c->devinfo->ver >= 71) + break; + else + FALLTHROUGH; + case 1: + case 2: + case 3: { + /* Payload setup instructions: Force allocate + * the dst to the given register (so the MOV + * will disappear). + */ + assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV); + assert(inst->dst.file == QFILE_TEMP); + uint32_t node = temp_to_node(c, inst->dst.index); + ra_set_node_reg(c->g, node, + get_phys_index(c->devinfo) + + inst->src[0].index); + break; + } + } + } + + /* Don't allocate rf0 to temps that cross ranges where we have + * live implicit rf0 writes from ldvary. We can identify these + * by tracking the last ldvary instruction and explicit reads + * of rf0. + */ + if (c->devinfo->ver >= 71 && + ((inst->src[0].file == QFILE_REG && inst->src[0].index == 0) || + (vir_get_nsrc(inst) > 1 && + inst->src[1].file == QFILE_REG && inst->src[1].index == 0))) { + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && + c->temp_end[i] > last_ldvary_ip) { + ra_add_node_interference(c->g, + temp_to_node(c, i), + implicit_rf_nodes[0]); + } + } + } + + if (inst->dst.file == QFILE_TEMP) { + /* Only a ldunif gets to write to R5, which only has a + * single 32-bit channel of storage. 
+ * + * NOTE: ldunifa is subject to the same, however, going by + * shader-db it is best to keep r5 exclusive to ldunif, probably + * because ldunif has usually a shorter lifespan, allowing for + * more accumulator reuse and QPU merges. + */ + if (c->devinfo->has_accumulators) { + if (!inst->qpu.sig.ldunif) { + uint8_t class_bits = + get_temp_class_bits(c, inst->dst.index) & + ~CLASS_BITS_R5; + set_temp_class_bits(c, inst->dst.index, + class_bits); + + } + } else { + /* Make sure we don't allocate the ldvary's + * destination to rf0, since it would clash + * with its implicit write to that register. + */ + if (inst->qpu.sig.ldvary) { + ra_add_node_interference(c->g, + temp_to_node(c, inst->dst.index), + implicit_rf_nodes[0]); + } + /* Flag dst temps from ldunif(a) instructions + * so we can try to assign rf0 to them and avoid + * converting these to ldunif(a)rf. + */ + if (inst->qpu.sig.ldunif || inst->qpu.sig.ldunifa) { + const uint32_t dst_n = + temp_to_node(c, inst->dst.index); + c->nodes.info[dst_n].is_ldunif_dst = true; + } + } + } + + /* All accumulators are invalidated across a thread switch. */ + if (inst->qpu.sig.thrsw && c->devinfo->has_accumulators) { + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && c->temp_end[i] > ip) { + set_temp_class_bits(c, i, + CLASS_BITS_PHYS); + } + } + } } -#define CLASS_BIT_PHYS (1 << 0) -#define CLASS_BIT_ACC (1 << 1) -#define CLASS_BIT_R5 (1 << 4) -#define CLASS_BITS_ANY (CLASS_BIT_PHYS | \ - CLASS_BIT_ACC | \ - CLASS_BIT_R5) +static void +flag_program_end_nodes(struct v3d_compile *c) +{ + /* Only look for registers used in this many instructions */ + uint32_t last_set_count = 6; + + struct qblock *last_block = vir_exit_block(c); + list_for_each_entry_rev(struct qinst, inst, &last_block->instructions, link) { + if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU) + continue; + + int num_src = v3d_qpu_add_op_num_src(inst->qpu.alu.add.op); + for (int i = 0; i < num_src; i++) { + if (inst->src[i].file == QFILE_TEMP) { + int node = temp_to_node(c, inst->src[i].index); + c->nodes.info[node].is_program_end = true; + } + } + + num_src = v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op); + for (int i = 0; i < num_src; i++) { + if (inst->src[i].file == QFILE_TEMP) { + int node = temp_to_node(c, inst->src[i].index); + c->nodes.info[node].is_program_end = true; + + } + } + + if (inst->dst.file == QFILE_TEMP) { + int node = temp_to_node(c, inst->dst.index); + c->nodes.info[node].is_program_end = true; + } + + if (--last_set_count == 0) + break; + } +} /** * Returns a mapping from QFILE_TEMP indices to struct qpu_regs. @@ -587,24 +1275,37 @@ tmu_spilling_allowed(struct v3d_compile *c, int thread_index) * The return value should be freed by the caller. 
*/ struct qpu_reg * -v3d_register_allocate(struct v3d_compile *c, bool *spilled) +v3d_register_allocate(struct v3d_compile *c) { - uint32_t UNUSED start_num_temps = c->num_temps; - struct node_to_temp_map map[c->num_temps]; - uint32_t temp_to_node[c->num_temps]; - uint8_t class_bits[c->num_temps]; int acc_nodes[ACC_COUNT]; + int implicit_rf_nodes[IMPLICIT_RF_COUNT]; + + unsigned num_ra_nodes = c->num_temps; + if (c->devinfo->has_accumulators) + num_ra_nodes += ARRAY_SIZE(acc_nodes); + else + num_ra_nodes += ARRAY_SIZE(implicit_rf_nodes); + + c->nodes = (struct v3d_ra_node_info) { + .alloc_count = c->num_temps, + .info = ralloc_array_size(c, sizeof(c->nodes.info[0]), + num_ra_nodes), + }; + + uint32_t phys_index = get_phys_index(c->devinfo); + struct v3d_ra_select_callback_data callback_data = { + .phys_index = phys_index, .next_acc = 0, /* Start at RF3, to try to keep the TLB writes from using - * RF0-2. + * RF0-2. Start at RF4 in 7.x to prevent TLB writes from + * using RF2-3. */ - .next_phys = 3, - .map = map, + .next_phys = c->devinfo->ver == 42 ? 3 : 4, + .nodes = &c->nodes, + .devinfo = c->devinfo, }; - *spilled = false; - vir_calculate_live_intervals(c); /* Convert 1, 2, 4 threads to 0, 1, 2 index. @@ -612,257 +1313,163 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled) * V3D 4.x has double the physical register space, so 64 physical regs * are available at both 1x and 2x threading, and 4x has 32. */ - int thread_index = ffs(c->threads) - 1; - if (c->devinfo->ver >= 40) { - if (thread_index >= 1) - thread_index--; - } + c->thread_index = ffs(c->threads) - 1; + if (c->thread_index >= 1) + c->thread_index--; - struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs, - c->num_temps + - ARRAY_SIZE(acc_nodes)); - ra_set_select_reg_callback(g, v3d_ra_select_callback, &callback_data); + c->g = ra_alloc_interference_graph(c->compiler->regs, num_ra_nodes); + ra_set_select_reg_callback(c->g, v3d_ra_select_callback, &callback_data); /* Make some fixed nodes for the accumulators, which we will need to * interfere with when ops have implied r3/r4 writes or for the thread * switches. We could represent these as classes for the nodes to * live in, but the classes take up a lot of memory to set up, so we - * don't want to make too many. + * don't want to make too many. We use the same mechanism on platforms + * without accumulators that can have implicit writes to phys regs. 
*/ - for (int i = 0; i < ARRAY_SIZE(acc_nodes); i++) { - acc_nodes[i] = c->num_temps + i; - ra_set_node_reg(g, acc_nodes[i], ACC_INDEX + i); - } - - for (uint32_t i = 0; i < c->num_temps; i++) { - map[i].temp = i; - map[i].priority = c->temp_end[i] - c->temp_start[i]; - } - qsort(map, c->num_temps, sizeof(map[0]), node_to_temp_priority); - for (uint32_t i = 0; i < c->num_temps; i++) { - temp_to_node[map[i].temp] = i; + for (uint32_t i = 0; i < num_ra_nodes; i++) { + c->nodes.info[i].is_ldunif_dst = false; + c->nodes.info[i].is_program_end = false; + c->nodes.info[i].unused = false; + c->nodes.info[i].priority = 0; + c->nodes.info[i].class_bits = 0; + if (c->devinfo->has_accumulators && i < ACC_COUNT) { + acc_nodes[i] = i; + ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i); + } else if (!c->devinfo->has_accumulators && + i < ARRAY_SIZE(implicit_rf_nodes)) { + implicit_rf_nodes[i] = i; + ra_set_node_reg(c->g, implicit_rf_nodes[i], phys_index + i); + } else { + uint32_t t = node_to_temp(c, i); + c->nodes.info[i].priority = + c->temp_end[t] - c->temp_start[t]; + c->nodes.info[i].class_bits = + get_class_bit_any(c->devinfo); + } } - /* Figure out our register classes and preallocated registers. We - * start with any temp being able to be in any file, then instructions - * incrementally remove bits that the temp definitely can't be in. + /* Walk the instructions adding register class restrictions and + * interferences. */ - memset(class_bits, CLASS_BITS_ANY, sizeof(class_bits)); - int ip = 0; + int last_ldvary_ip = -1; vir_for_each_inst_inorder(inst, c) { - /* If the instruction writes r3/r4 (and optionally moves its - * result to a temp), nothing else can be stored in r3/r4 across - * it. + inst->ip = ip++; + + /* ldunif(a) always write to a temporary, so we have + * liveness info available to decide if rf0 is + * available for them, however, ldvary is different: + * it always writes to rf0 directly so we don't have + * liveness information for its implicit rf0 write. + * + * That means the allocator may assign rf0 to a temp + * that is defined while an implicit rf0 write from + * ldvary is still live. We fix that by manually + * tracking rf0 live ranges from ldvary instructions. */ - if (vir_writes_r3(c->devinfo, inst)) { - for (int i = 0; i < c->num_temps; i++) { - if (c->temp_start[i] < ip && - c->temp_end[i] > ip) { - ra_add_node_interference(g, - temp_to_node[i], - acc_nodes[3]); - } - } - } - if (vir_writes_r4(c->devinfo, inst)) { - for (int i = 0; i < c->num_temps; i++) { - if (c->temp_start[i] < ip && - c->temp_end[i] > ip) { - ra_add_node_interference(g, - temp_to_node[i], - acc_nodes[4]); - } - } - } - - if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) { - switch (inst->qpu.alu.add.op) { - case V3D_QPU_A_LDVPMV_IN: - case V3D_QPU_A_LDVPMV_OUT: - case V3D_QPU_A_LDVPMD_IN: - case V3D_QPU_A_LDVPMD_OUT: - case V3D_QPU_A_LDVPMP: - case V3D_QPU_A_LDVPMG_IN: - case V3D_QPU_A_LDVPMG_OUT: - /* LDVPMs only store to temps (the MA flag - * decides whether the LDVPM is in or out) - */ - assert(inst->dst.file == QFILE_TEMP); - class_bits[inst->dst.index] &= CLASS_BIT_PHYS; - break; - - case V3D_QPU_A_RECIP: - case V3D_QPU_A_RSQRT: - case V3D_QPU_A_EXP: - case V3D_QPU_A_LOG: - case V3D_QPU_A_SIN: - case V3D_QPU_A_RSQRT2: - /* The SFU instructions write directly to the - * phys regfile. 
- */ - assert(inst->dst.file == QFILE_TEMP); - class_bits[inst->dst.index] &= CLASS_BIT_PHYS; - break; - - default: - break; - } - } + if (inst->qpu.sig.ldvary) + last_ldvary_ip = ip; - if (inst->src[0].file == QFILE_REG) { - switch (inst->src[0].index) { - case 0: - case 1: - case 2: - case 3: - /* Payload setup instructions: Force allocate - * the dst to the given register (so the MOV - * will disappear). - */ - assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV); - assert(inst->dst.file == QFILE_TEMP); - ra_set_node_reg(g, - temp_to_node[inst->dst.index], - PHYS_INDEX + - inst->src[0].index); - break; - } - } - - if (inst->dst.file == QFILE_TEMP) { - /* Only a ldunif gets to write to R5, which only has a - * single 32-bit channel of storage. - */ - if (!inst->qpu.sig.ldunif) { - class_bits[inst->dst.index] &= ~CLASS_BIT_R5; - } else { - /* Until V3D 4.x, we could only load a uniform - * to r5, so we'll need to spill if uniform - * loads interfere with each other. - */ - if (c->devinfo->ver < 40) { - class_bits[inst->dst.index] &= - CLASS_BIT_R5; - } - } - } - - if (inst->qpu.sig.thrsw) { - /* All accumulators are invalidated across a thread - * switch. - */ - for (int i = 0; i < c->num_temps; i++) { - if (c->temp_start[i] < ip && c->temp_end[i] > ip) - class_bits[i] &= CLASS_BIT_PHYS; - } - } - - ip++; + update_graph_and_reg_classes_for_inst(c, acc_nodes, + implicit_rf_nodes, + last_ldvary_ip, inst); } + /* Flag the nodes that are used in the last instructions of the program + * (there are some registers that cannot be used in the last 3 + * instructions). We only do this for fragment shaders, because the idea + * is that by avoiding this conflict we may be able to emit the last + * thread switch earlier in some cases, however, in non-fragment shaders + * this won't happen because the last instructions are always VPM stores + * with a small immediate, which conflicts with other signals, + * preventing us from ever moving the thrsw earlier. + */ + if (c->s->info.stage == MESA_SHADER_FRAGMENT) + flag_program_end_nodes(c); + + /* Set the register classes for all our temporaries in the graph */ for (uint32_t i = 0; i < c->num_temps; i++) { - if (class_bits[i] == CLASS_BIT_PHYS) { - ra_set_node_class(g, temp_to_node[i], - c->compiler->reg_class_phys[thread_index]); - } else if (class_bits[i] == (CLASS_BIT_R5)) { - ra_set_node_class(g, temp_to_node[i], - c->compiler->reg_class_r5[thread_index]); - } else if (class_bits[i] == (CLASS_BIT_PHYS | CLASS_BIT_ACC)) { - ra_set_node_class(g, temp_to_node[i], - c->compiler->reg_class_phys_or_acc[thread_index]); - } else { - assert(class_bits[i] == CLASS_BITS_ANY); - ra_set_node_class(g, temp_to_node[i], - c->compiler->reg_class_any[thread_index]); - } + ra_set_node_class(c->g, temp_to_node(c, i), + choose_reg_class_for_temp(c, i)); } + /* Add register interferences based on liveness data */ for (uint32_t i = 0; i < c->num_temps; i++) { + /* And while we are here, let's also flag nodes for + * unused temps. 
+ */ + if (c->temp_start[i] > c->temp_end[i]) + c->nodes.info[temp_to_node(c, i)].unused = true; + for (uint32_t j = i + 1; j < c->num_temps; j++) { - if (!(c->temp_start[i] >= c->temp_end[j] || - c->temp_start[j] >= c->temp_end[i])) { - ra_add_node_interference(g, - temp_to_node[i], - temp_to_node[j]); + if (interferes(c->temp_start[i], c->temp_end[i], + c->temp_start[j], c->temp_end[j])) { + ra_add_node_interference(c->g, + temp_to_node(c, i), + temp_to_node(c, j)); } } } - /* Debug code to force a bit of register spilling, for running across - * conformance tests to make sure that spilling works. + /* Debug option to force a bit of TMU spilling, for running + * across conformance tests to make sure that spilling works. */ - int force_register_spills = 0; - if (c->spill_size < - V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) { - int node = v3d_choose_spill_node(c, g, temp_to_node); - if (node != -1) { - v3d_spill_reg(c, map[node].temp); - ralloc_free(g); - *spilled = true; - return NULL; + const int force_register_spills = 0; + if (force_register_spills > 0) + c->max_tmu_spills = UINT32_MAX; + + struct qpu_reg *temp_registers = NULL; + while (true) { + if (c->spill_size < + V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) { + int node = v3d_choose_spill_node(c); + uint32_t temp = node_to_temp(c, node); + if (node != -1) { + v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp); + continue; + } } - } - - bool ok = ra_allocate(g); - if (!ok) { - const uint32_t spill_batch_size = get_spill_batch_size(c); - - for (uint32_t i = 0; i < spill_batch_size; i++) { - int node = v3d_choose_spill_node(c, g, temp_to_node); - if (node == -1) - break; - - /* TMU spills inject thrsw signals that invalidate - * accumulators, so we can't batch them. - */ - bool is_uniform = vir_is_mov_uniform(c, map[node].temp); - if (i > 0 && !is_uniform) - break; - if (is_uniform || tmu_spilling_allowed(c, thread_index)) { - v3d_spill_reg(c, map[node].temp); - - /* Ask the outer loop to call back in. */ - *spilled = true; + if (ra_allocate(c->g)) + break; - /* See comment above about batching TMU spills. - */ - if (!is_uniform) { - assert(i == 0); - break; - } - } else { - break; - } + /* Failed allocation, try to spill */ + int node = v3d_choose_spill_node(c); + if (node == -1) + goto spill_fail; + + uint32_t temp = node_to_temp(c, node); + enum temp_spill_type spill_type = + get_spill_type_for_temp(c, temp); + if (spill_type != SPILL_TYPE_TMU || tmu_spilling_allowed(c)) { + v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp); + if (c->spills + c->fills > c->max_tmu_spills) + goto spill_fail; + } else { + goto spill_fail; } - - ralloc_free(g); - return NULL; } - /* Ensure that we are not accessing temp_to_node out of bounds. We - * should never trigger this assertion because `c->num_temps` only - * grows when we spill, in which case we return early and don't get - * here. 
- */ - assert(start_num_temps == c->num_temps); - struct qpu_reg *temp_registers = calloc(c->num_temps, - sizeof(*temp_registers)); - + /* Allocation was successful, build the 'temp -> reg' map */ + temp_registers = calloc(c->num_temps, sizeof(*temp_registers)); for (uint32_t i = 0; i < c->num_temps; i++) { - int ra_reg = ra_get_node_reg(g, temp_to_node[i]); - if (ra_reg < PHYS_INDEX) { + int ra_reg = ra_get_node_reg(c->g, temp_to_node(c, i)); + if (ra_reg < phys_index) { temp_registers[i].magic = true; temp_registers[i].index = (V3D_QPU_WADDR_R0 + ra_reg - ACC_INDEX); } else { temp_registers[i].magic = false; - temp_registers[i].index = ra_reg - PHYS_INDEX; + temp_registers[i].index = ra_reg - phys_index; } } - ralloc_free(g); - +spill_fail: + ralloc_free(c->nodes.info); + c->nodes.info = NULL; + c->nodes.alloc_count = 0; + ralloc_free(c->g); + c->g = NULL; return temp_registers; } diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c index aa33545420e..605c3e4c7d5 100644 --- a/src/broadcom/compiler/vir_to_qpu.c +++ b/src/broadcom/compiler/vir_to_qpu.c @@ -45,12 +45,6 @@ qpu_magic(enum v3d_qpu_waddr waddr) return reg; } -static inline struct qpu_reg -qpu_acc(int acc) -{ - return qpu_magic(V3D_QPU_WADDR_R0 + acc); -} - struct v3d_qpu_instr v3d_qpu_nop(void) { @@ -92,15 +86,32 @@ new_qpu_nop_before(struct qinst *inst) return q; } +static void +v3d71_set_src(struct v3d_qpu_instr *instr, uint8_t *raddr, struct qpu_reg src) +{ + /* If we have a small immediate move it from inst->raddr_b to the + * corresponding raddr. + */ + if (src.smimm) { + assert(instr->sig.small_imm_a || instr->sig.small_imm_b || + instr->sig.small_imm_c || instr->sig.small_imm_d); + *raddr = instr->raddr_b; + return; + } + + assert(!src.magic); + *raddr = src.index; +} + /** * Allocates the src register (accumulator or register file) into the RADDR * fields of the instruction. */ static void -set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) +v3d42_set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) { if (src.smimm) { - assert(instr->sig.small_imm); + assert(instr->sig.small_imm_b); *mux = V3D_QPU_MUX_B; return; } @@ -112,20 +123,20 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) return; } - if (instr->alu.add.a != V3D_QPU_MUX_A && - instr->alu.add.b != V3D_QPU_MUX_A && - instr->alu.mul.a != V3D_QPU_MUX_A && - instr->alu.mul.b != V3D_QPU_MUX_A) { + if (instr->alu.add.a.mux != V3D_QPU_MUX_A && + instr->alu.add.b.mux != V3D_QPU_MUX_A && + instr->alu.mul.a.mux != V3D_QPU_MUX_A && + instr->alu.mul.b.mux != V3D_QPU_MUX_A) { instr->raddr_a = src.index; *mux = V3D_QPU_MUX_A; } else { if (instr->raddr_a == src.index) { *mux = V3D_QPU_MUX_A; } else { - assert(!(instr->alu.add.a == V3D_QPU_MUX_B && - instr->alu.add.b == V3D_QPU_MUX_B && - instr->alu.mul.a == V3D_QPU_MUX_B && - instr->alu.mul.b == V3D_QPU_MUX_B) || + assert(!(instr->alu.add.a.mux == V3D_QPU_MUX_B && + instr->alu.add.b.mux == V3D_QPU_MUX_B && + instr->alu.mul.a.mux == V3D_QPU_MUX_B && + instr->alu.mul.b.mux == V3D_QPU_MUX_B) || src.index == instr->raddr_b); instr->raddr_b = src.index; @@ -134,33 +145,40 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) } } -static bool -is_no_op_mov(struct qinst *qinst) +/* + * The main purpose of the following wrapper is to make calling set_src + * cleaner. This is the reason it receives both mux and raddr pointers. Those + * will be filled or not based on the device version. 
+ */ +static void +set_src(struct v3d_qpu_instr *instr, + enum v3d_qpu_mux *mux, + uint8_t *raddr, + struct qpu_reg src, + const struct v3d_device_info *devinfo) { - static const struct v3d_qpu_sig no_sig = {0}; - - /* Make sure it's just a lone MOV. */ - if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU || - qinst->qpu.alu.mul.op != V3D_QPU_M_MOV || - qinst->qpu.alu.add.op != V3D_QPU_A_NOP || - memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) { - return false; - } + if (devinfo->ver < 71) + return v3d42_set_src(instr, mux, src); + else + return v3d71_set_src(instr, raddr, src); +} - /* Check if it's a MOV from a register to itself. */ +static bool +v3d42_mov_src_and_dst_equal(struct qinst *qinst) +{ enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr; if (qinst->qpu.alu.mul.magic_write) { if (waddr < V3D_QPU_WADDR_R0 || waddr > V3D_QPU_WADDR_R4) return false; - if (qinst->qpu.alu.mul.a != + if (qinst->qpu.alu.mul.a.mux != V3D_QPU_MUX_R0 + (waddr - V3D_QPU_WADDR_R0)) { return false; } } else { int raddr; - switch (qinst->qpu.alu.mul.a) { + switch (qinst->qpu.alu.mul.a.mux) { case V3D_QPU_MUX_A: raddr = qinst->qpu.raddr_a; break; @@ -174,10 +192,61 @@ is_no_op_mov(struct qinst *qinst) return false; } + return true; +} + +static bool +v3d71_mov_src_and_dst_equal(struct qinst *qinst) +{ + if (qinst->qpu.alu.mul.magic_write) + return false; + + enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr; + int raddr; + + raddr = qinst->qpu.alu.mul.a.raddr; + if (raddr != waddr) + return false; + + return true; +} + +static bool +mov_src_and_dst_equal(struct qinst *qinst, + const struct v3d_device_info *devinfo) +{ + if (devinfo->ver < 71) + return v3d42_mov_src_and_dst_equal(qinst); + else + return v3d71_mov_src_and_dst_equal(qinst); +} + + +static bool +is_no_op_mov(struct qinst *qinst, + const struct v3d_device_info *devinfo) +{ + static const struct v3d_qpu_sig no_sig = {0}; + + /* Make sure it's just a lone MOV. We only check for M_MOV. Although + * for V3D 7.x there is also A_MOV, we don't need to check for it as + * we always emit using M_MOV. We could use A_MOV later on the + * squedule to improve performance + */ + if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU || + qinst->qpu.alu.mul.op != V3D_QPU_M_MOV || + qinst->qpu.alu.add.op != V3D_QPU_A_NOP || + memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) { + return false; + } + + if (!mov_src_and_dst_equal(qinst, devinfo)) + return false; + /* No packing or flags updates, or we need to execute the * instruction. */ - if (qinst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE || + if (qinst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE || qinst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE || qinst->qpu.flags.mc != V3D_QPU_COND_NONE || qinst->qpu.flags.mpf != V3D_QPU_PF_NONE || @@ -193,8 +262,6 @@ v3d_generate_code_block(struct v3d_compile *c, struct qblock *block, struct qpu_reg *temp_registers) { - int last_vpm_read_index = -1; - vir_for_each_inst_safe(qinst, block) { #if 0 fprintf(stderr, "translating qinst to qpu: "); @@ -202,8 +269,6 @@ v3d_generate_code_block(struct v3d_compile *c, fprintf(stderr, "\n"); #endif - struct qinst *temp; - if (vir_has_uniform(qinst)) c->num_uniforms++; @@ -219,8 +284,14 @@ v3d_generate_code_block(struct v3d_compile *c, src[i] = qpu_magic(qinst->src[i].index); break; case QFILE_NULL: + /* QFILE_NULL is an undef, so we can load + * anything. Using a reg that doesn't have + * sched. restrictions. 
+ */ + src[i] = qpu_reg(5); + break; case QFILE_LOAD_IMM: - src[i] = qpu_acc(0); + assert(!"not reached"); break; case QFILE_TEMP: src[i] = temp_registers[index]; @@ -228,18 +299,6 @@ v3d_generate_code_block(struct v3d_compile *c, case QFILE_SMALL_IMM: src[i].smimm = true; break; - - case QFILE_VPM: - assert((int)qinst->src[i].index >= - last_vpm_read_index); - (void)last_vpm_read_index; - last_vpm_read_index = qinst->src[i].index; - - temp = new_qpu_nop_before(qinst); - temp->qpu.sig.ldvpm = true; - - src[i] = qpu_acc(3); - break; } } @@ -261,10 +320,6 @@ v3d_generate_code_block(struct v3d_compile *c, dst = temp_registers[qinst->dst.index]; break; - case QFILE_VPM: - dst = qpu_magic(V3D_QPU_WADDR_VPM); - break; - case QFILE_SMALL_IMM: case QFILE_LOAD_IMM: assert(!"not reached"); @@ -276,10 +331,15 @@ v3d_generate_code_block(struct v3d_compile *c, assert(qinst->qpu.alu.add.op == V3D_QPU_A_NOP); assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP); - if (!dst.magic || - dst.index != V3D_QPU_WADDR_R5) { - assert(c->devinfo->ver >= 40); + bool use_rf; + if (c->devinfo->has_accumulators) { + use_rf = !dst.magic || + dst.index != V3D_QPU_WADDR_R5; + } else { + use_rf = dst.magic || dst.index != 0; + } + if (use_rf) { if (qinst->qpu.sig.ldunif) { qinst->qpu.sig.ldunif = false; qinst->qpu.sig.ldunifrf = true; @@ -299,13 +359,18 @@ v3d_generate_code_block(struct v3d_compile *c, qinst->qpu.sig_magic = dst.magic; } else if (qinst->qpu.alu.add.op != V3D_QPU_A_NOP) { assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP); + if (nsrc >= 1) { set_src(&qinst->qpu, - &qinst->qpu.alu.add.a, src[0]); + &qinst->qpu.alu.add.a.mux, + &qinst->qpu.alu.add.a.raddr, + src[0], c->devinfo); } if (nsrc >= 2) { set_src(&qinst->qpu, - &qinst->qpu.alu.add.b, src[1]); + &qinst->qpu.alu.add.b.mux, + &qinst->qpu.alu.add.b.raddr, + src[1], c->devinfo); } qinst->qpu.alu.add.waddr = dst.index; @@ -313,17 +378,21 @@ v3d_generate_code_block(struct v3d_compile *c, } else { if (nsrc >= 1) { set_src(&qinst->qpu, - &qinst->qpu.alu.mul.a, src[0]); + &qinst->qpu.alu.mul.a.mux, + &qinst->qpu.alu.mul.a.raddr, + src[0], c->devinfo); } if (nsrc >= 2) { set_src(&qinst->qpu, - &qinst->qpu.alu.mul.b, src[1]); + &qinst->qpu.alu.mul.b.mux, + &qinst->qpu.alu.mul.b.raddr, + src[1], c->devinfo); } qinst->qpu.alu.mul.waddr = dst.index; qinst->qpu.alu.mul.magic_write = dst.magic; - if (is_no_op_mov(qinst)) { + if (is_no_op_mov(qinst, c->devinfo)) { vir_remove_instruction(c, qinst); continue; } @@ -378,11 +447,7 @@ v3d_dump_qpu(struct v3d_compile *c) const char *str = v3d_qpu_disasm(c->devinfo, c->qpu_insts[i]); fprintf(stderr, "0x%016"PRIx64" %s", c->qpu_insts[i], str); - /* We can only do this on 4.x, because we're not tracking TMU - * implicit uniforms here on 3.x. - */ - if (c->devinfo->ver >= 40 && - reads_uniform(c->devinfo, c->qpu_insts[i])) { + if (reads_uniform(c->devinfo, c->qpu_insts[i])) { fprintf(stderr, " ("); vir_dump_uniform(c->uniform_contents[next_uniform], c->uniform_data[next_uniform]); @@ -394,8 +459,7 @@ v3d_dump_qpu(struct v3d_compile *c) } /* Make sure our dumping lined up. */ - if (c->devinfo->ver >= 40) - assert(next_uniform == c->num_uniforms); + assert(next_uniform == c->num_uniforms); fprintf(stderr, "\n"); } @@ -431,8 +495,8 @@ v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers) } assert(i == c->qpu_inst_count); - if (V3D_DEBUG & (V3D_DEBUG_QPU | - v3d_debug_flag_for_shader_stage(c->s->info.stage))) { + if (V3D_DBG(QPU) || + v3d_debug_flag_for_shader_stage(c->s->info.stage)) { v3d_dump_qpu(c); } |
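Editor's note on the new liveness check: the interference loop in v3d_register_allocate() now calls an interferes() helper instead of open-coding the overlap test on temp_start/temp_end. The helper's definition is not part of the hunks shown above; the following is a minimal sketch, assuming it simply wraps the condition the patch removes from the i/j loop (the name and arguments are taken from the call site, everything else is an assumption):

/* Sketch only: two live ranges interfere unless one of them ends at or
 * before the point where the other starts. The start/end values come from
 * vir_calculate_live_intervals().
 */
static inline bool
interferes(int a_start, int a_end, int b_start, int b_end)
{
        return !(a_start >= b_end || b_start >= a_end);
}

This is equivalent to the old inline check, so temps with disjoint live ranges can still share a register while overlapping temps get an interference edge in the graph.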
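Editor's note on the new allocation entry point: v3d_register_allocate() no longer reports spills through a bool out-parameter. It takes only the compile context, retries internally after each spill until ra_allocate() succeeds or the budget in c->max_tmu_spills is exhausted, and returns NULL on failure; per the doc comment above, the returned array must be freed by the caller. A hedged caller-side sketch follows; the helper name and the retry-with-fewer-threads fallback are assumptions about the surrounding driver code, not part of this patch:

#include <stdlib.h>
#include "v3d_compiler.h"   /* struct v3d_compile, struct qpu_reg, prototypes */

/* Hypothetical helper: illustrates only the new calling convention. */
static bool
compile_assign_registers(struct v3d_compile *c)
{
        struct qpu_reg *temp_registers = v3d_register_allocate(c);
        if (!temp_registers) {
                /* Spilling within the allowed budget was not enough; a caller
                 * would typically drop the thread count, or recompile with
                 * pressure-increasing optimizations disabled, and try again.
                 */
                return false;
        }

        /* ... emit QPU code using the temp -> register map ... */

        free(temp_registers);
        return true;
}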