Diffstat (limited to 'src/panfrost/util')
-rw-r--r--  src/panfrost/util/lcra.c                        | 287
-rw-r--r--  src/panfrost/util/lcra.h                        | 119
-rw-r--r--  src/panfrost/util/meson.build                   |  10
-rw-r--r--  src/panfrost/util/nir_mod_helpers.c             | 128
-rw-r--r--  src/panfrost/util/pan_collect_varyings.c        | 190
-rw-r--r--  src/panfrost/util/pan_ir.c                      | 180
-rw-r--r--  src/panfrost/util/pan_ir.h                      | 564
-rw-r--r--  src/panfrost/util/pan_liveness.c                | 144
-rw-r--r--  src/panfrost/util/pan_lower_64bit_intrin.c      |  78
-rw-r--r--  src/panfrost/util/pan_lower_framebuffer.c       | 862
-rw-r--r--  src/panfrost/util/pan_lower_framebuffer.h       |  16
-rw-r--r--  src/panfrost/util/pan_lower_helper_invocation.c |  30
-rw-r--r--  src/panfrost/util/pan_lower_image_index.c       |  55
-rw-r--r--  src/panfrost/util/pan_lower_image_ms.c          |  72
-rw-r--r--  src/panfrost/util/pan_lower_sample_position.c   |  51
-rw-r--r--  src/panfrost/util/pan_lower_store_component.c   |  98
-rw-r--r--  src/panfrost/util/pan_lower_writeout.c          | 306
-rw-r--r--  src/panfrost/util/pan_lower_xfb.c               | 105
-rw-r--r--  src/panfrost/util/pan_sysval.c                  | 162
19 files changed, 1835 insertions, 1622 deletions
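Most of the churn in the hunks below is a mechanical reindentation of the Panfrost utility code to three-space clang-format style, alongside the file additions and removals listed above. As orientation for the lcra.c hunks that open the diff, here is a minimal, hypothetical sketch of how a backend drives the LCRA API; `try_allocate` and all of its constants are invented for illustration and are not part of this commit:

#include "lcra.h"

/* Hypothetical driver loop (illustrative only): allocate equations for four
 * nodes in one register class, constrain them, then solve, falling back to
 * spilling on failure. */
static bool
try_allocate(void)
{
   struct lcra_state *l = lcra_alloc_equations(4, 1);

   /* One 16-byte class starting at register 0; class_start[] and
    * class_size[] are public fields the backend fills in directly. */
   l->class_start[0] = 0;
   l->class_size[0] = 16;

   for (unsigned i = 0; i < 4; ++i) {
      /* Each node is a 4-byte value, 4-byte aligned (log2 = 2), bounded to
       * the 16-byte window, and spillable at unit cost. */
      lcra_set_alignment(l, i, 2, 16);
      lcra_restrict_range(l, i, 4);
      lcra_set_node_spill_cost(l, i, 1);
   }

   /* Nodes 0 and 1 are simultaneously live in all four components */
   lcra_add_node_interference(l, 0, 0xF, 1, 0xF);

   bool ok = lcra_solve(l);

   if (!ok) {
      /* Out of registers: pick the cheapest node in the failing class,
       * rewrite the program to spill it, and retry (rewrite omitted). */
      signed spill = lcra_get_best_spill_node(l);
      (void)spill;
   }

   lcra_free(l);
   return ok;
}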
diff --git a/src/panfrost/util/lcra.c b/src/panfrost/util/lcra.c index 45cff37e61f..00585c646a6 100644 --- a/src/panfrost/util/lcra.c +++ b/src/panfrost/util/lcra.c @@ -24,14 +24,14 @@ * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com> */ -#include <stdio.h> +#include "lcra.h" #include <assert.h> +#include <limits.h> +#include <stdio.h> #include <stdlib.h> #include <string.h> -#include <limits.h> #include "util/macros.h" #include "util/u_math.h" -#include "lcra.h" /* This module is the reference implementation of "Linearly Constrained * Register Allocation". The paper is available in PDF form @@ -40,161 +40,168 @@ */ struct lcra_state * -lcra_alloc_equations( - unsigned node_count, unsigned class_count) +lcra_alloc_equations(unsigned node_count, unsigned class_count) { - struct lcra_state *l = calloc(1, sizeof(*l)); + struct lcra_state *l = calloc(1, sizeof(*l)); - l->node_count = node_count; - l->class_count = class_count; + l->node_count = node_count; + l->class_count = class_count; - l->alignment = calloc(sizeof(l->alignment[0]), node_count); - l->linear = calloc(sizeof(l->linear[0]), node_count * node_count); - l->modulus = calloc(sizeof(l->modulus[0]), node_count); - l->class = calloc(sizeof(l->class[0]), node_count); - l->class_start = calloc(sizeof(l->class_start[0]), class_count); - l->class_disjoint = calloc(sizeof(l->class_disjoint[0]), class_count * class_count); - l->class_size = calloc(sizeof(l->class_size[0]), class_count); - l->spill_cost = calloc(sizeof(l->spill_cost[0]), node_count); - l->solutions = calloc(sizeof(l->solutions[0]), node_count); + l->alignment = calloc(sizeof(l->alignment[0]), node_count); + l->linear = calloc(sizeof(l->linear[0]), node_count * node_count); + l->modulus = calloc(sizeof(l->modulus[0]), node_count); + l->class = calloc(sizeof(l->class[0]), node_count); + l->class_start = calloc(sizeof(l->class_start[0]), class_count); + l->class_disjoint = + calloc(sizeof(l->class_disjoint[0]), class_count * class_count); + l->class_size = calloc(sizeof(l->class_size[0]), class_count); + l->spill_cost = calloc(sizeof(l->spill_cost[0]), node_count); + l->solutions = calloc(sizeof(l->solutions[0]), node_count); - memset(l->solutions, ~0, sizeof(l->solutions[0]) * node_count); + memset(l->solutions, ~0, sizeof(l->solutions[0]) * node_count); - return l; + return l; } void lcra_free(struct lcra_state *l) { - if (!l) - return; - - free(l->alignment); - free(l->linear); - free(l->modulus); - free(l->class); - free(l->class_start); - free(l->class_disjoint); - free(l->class_size); - free(l->spill_cost); - free(l->solutions); - - free(l); + if (!l) + return; + + free(l->alignment); + free(l->linear); + free(l->modulus); + free(l->class); + free(l->class_start); + free(l->class_disjoint); + free(l->class_size); + free(l->spill_cost); + free(l->solutions); + + free(l); } void -lcra_set_alignment(struct lcra_state *l, unsigned node, unsigned align_log2, unsigned bound) +lcra_set_alignment(struct lcra_state *l, unsigned node, unsigned align_log2, + unsigned bound) { - l->alignment[node] = (align_log2 + 1) | (bound << 16); + l->alignment[node] = (align_log2 + 1) | (bound << 16); } void lcra_set_disjoint_class(struct lcra_state *l, unsigned c1, unsigned c2) { - l->class_disjoint[(c1 * l->class_count) + c2] = true; - l->class_disjoint[(c2 * l->class_count) + c1] = true; + l->class_disjoint[(c1 * l->class_count) + c2] = true; + l->class_disjoint[(c2 * l->class_count) + c1] = true; } void lcra_restrict_range(struct lcra_state *l, unsigned node, unsigned len) { - if (node < 
l->node_count && l->alignment[node]) { - unsigned BA = l->alignment[node]; - unsigned alignment = (BA & 0xffff) - 1; - unsigned bound = BA >> 16; - l->modulus[node] = DIV_ROUND_UP(bound - len + 1, 1 << alignment); - } + if (node < l->node_count && l->alignment[node]) { + unsigned BA = l->alignment[node]; + unsigned alignment = (BA & 0xffff) - 1; + unsigned bound = BA >> 16; + l->modulus[node] = DIV_ROUND_UP(bound - len + 1, 1 << alignment); + } } void -lcra_add_node_interference(struct lcra_state *l, unsigned i, unsigned cmask_i, unsigned j, unsigned cmask_j) +lcra_add_node_interference(struct lcra_state *l, unsigned i, unsigned cmask_i, + unsigned j, unsigned cmask_j) { - if (i == j) - return; + if (i == j) + return; - if (l->class_disjoint[(l->class[i] * l->class_count) + l->class[j]]) - return; + if (l->class_disjoint[(l->class[i] * l->class_count) + l->class[j]]) + return; - uint32_t constraint_fw = 0; - uint32_t constraint_bw = 0; + uint32_t constraint_fw = 0; + uint32_t constraint_bw = 0; - for (unsigned D = 0; D < 16; ++D) { - if (cmask_i & (cmask_j << D)) { - constraint_bw |= (1 << (15 + D)); - constraint_fw |= (1 << (15 - D)); - } + for (unsigned D = 0; D < 16; ++D) { + if (cmask_i & (cmask_j << D)) { + constraint_bw |= (1 << (15 + D)); + constraint_fw |= (1 << (15 - D)); + } - if (cmask_i & (cmask_j >> D)) { - constraint_fw |= (1 << (15 + D)); - constraint_bw |= (1 << (15 - D)); - } - } + if (cmask_i & (cmask_j >> D)) { + constraint_fw |= (1 << (15 + D)); + constraint_bw |= (1 << (15 - D)); + } + } - l->linear[j * l->node_count + i] |= constraint_fw; - l->linear[i * l->node_count + j] |= constraint_bw; + l->linear[j * l->node_count + i] |= constraint_fw; + l->linear[i * l->node_count + j] |= constraint_bw; } static bool lcra_test_linear(struct lcra_state *l, unsigned *solutions, unsigned i) { - unsigned *row = &l->linear[i * l->node_count]; - signed constant = solutions[i]; + unsigned *row = &l->linear[i * l->node_count]; + signed constant = solutions[i]; - for (unsigned j = 0; j < l->node_count; ++j) { - if (solutions[j] == ~0) continue; + for (unsigned j = 0; j < l->node_count; ++j) { + if (solutions[j] == ~0) + continue; - signed lhs = solutions[j] - constant; + signed lhs = solutions[j] - constant; - if (lhs < -15 || lhs > 15) - continue; + if (lhs < -15 || lhs > 15) + continue; - if (row[j] & (1 << (lhs + 15))) - return false; - } + if (row[j] & (1 << (lhs + 15))) + return false; + } - return true; + return true; } bool lcra_solve(struct lcra_state *l) { - for (unsigned step = 0; step < l->node_count; ++step) { - if (l->solutions[step] != ~0) continue; - if (l->alignment[step] == 0) continue; - - unsigned _class = l->class[step]; - unsigned class_start = l->class_start[_class]; - - unsigned BA = l->alignment[step]; - unsigned shift = (BA & 0xffff) - 1; - unsigned bound = BA >> 16; - - unsigned P = bound >> shift; - unsigned Q = l->modulus[step]; - unsigned r_max = l->class_size[_class]; - unsigned k_max = r_max >> shift; - unsigned m_max = k_max / P; - bool succ = false; - - for (unsigned m = 0; m < m_max; ++m) { - for (unsigned n = 0; n < Q; ++n) { - l->solutions[step] = ((m * P + n) << shift) + class_start; - succ = lcra_test_linear(l, l->solutions, step); - - if (succ) break; - } - - if (succ) break; - } - - /* Out of registers - prepare to spill */ - if (!succ) { - l->spill_class = l->class[step]; - return false; - } - } - - return true; + for (unsigned step = 0; step < l->node_count; ++step) { + if (l->solutions[step] != ~0) + continue; + if (l->alignment[step] == 0) 
+ continue; + + unsigned _class = l->class[step]; + unsigned class_start = l->class_start[_class]; + + unsigned BA = l->alignment[step]; + unsigned shift = (BA & 0xffff) - 1; + unsigned bound = BA >> 16; + + unsigned P = bound >> shift; + unsigned Q = l->modulus[step]; + unsigned r_max = l->class_size[_class]; + unsigned k_max = r_max >> shift; + unsigned m_max = k_max / P; + bool succ = false; + + for (unsigned m = 0; m < m_max; ++m) { + for (unsigned n = 0; n < Q; ++n) { + l->solutions[step] = ((m * P + n) << shift) + class_start; + succ = lcra_test_linear(l, l->solutions, step); + + if (succ) + break; + } + + if (succ) + break; + } + + /* Out of registers - prepare to spill */ + if (!succ) { + l->spill_class = l->class[step]; + return false; + } + } + + return true; } /* Register spilling is implemented with a cost-benefit system. Costs are set @@ -203,45 +210,47 @@ lcra_solve(struct lcra_state *l) void lcra_set_node_spill_cost(struct lcra_state *l, unsigned node, signed cost) { - if (node < l->node_count) - l->spill_cost[node] = cost; + if (node < l->node_count) + l->spill_cost[node] = cost; } static unsigned lcra_count_constraints(struct lcra_state *l, unsigned i) { - unsigned count = 0; - unsigned *constraints = &l->linear[i * l->node_count]; + unsigned count = 0; + unsigned *constraints = &l->linear[i * l->node_count]; - for (unsigned j = 0; j < l->node_count; ++j) - count += util_bitcount(constraints[j]); + for (unsigned j = 0; j < l->node_count; ++j) + count += util_bitcount(constraints[j]); - return count; + return count; } signed lcra_get_best_spill_node(struct lcra_state *l) { - /* If there are no constraints on a node, do not pick it to spill under - * any circumstance, or else we would hang rather than fail RA */ - float best_benefit = 0.0; - signed best_node = -1; - - for (unsigned i = 0; i < l->node_count; ++i) { - /* Find spillable nodes */ - if (l->class[i] != l->spill_class) continue; - if (l->spill_cost[i] < 0) continue; - - /* Adapted from Chaitin's heuristic */ - float constraints = lcra_count_constraints(l, i); - float cost = (l->spill_cost[i] + 1); - float benefit = constraints / cost; - - if (benefit > best_benefit) { - best_benefit = benefit; - best_node = i; - } - } - - return best_node; + /* If there are no constraints on a node, do not pick it to spill under + * any circumstance, or else we would hang rather than fail RA */ + float best_benefit = 0.0; + signed best_node = -1; + + for (unsigned i = 0; i < l->node_count; ++i) { + /* Find spillable nodes */ + if (l->class[i] != l->spill_class) + continue; + if (l->spill_cost[i] < 0) + continue; + + /* Adapted from Chaitin's heuristic */ + float constraints = lcra_count_constraints(l, i); + float cost = (l->spill_cost[i] + 1); + float benefit = constraints / cost; + + if (benefit > best_benefit) { + best_benefit = benefit; + best_node = i; + } + } + + return best_node; } diff --git a/src/panfrost/util/lcra.h b/src/panfrost/util/lcra.h index fd47fdc3543..0b1ed13400f 100644 --- a/src/panfrost/util/lcra.h +++ b/src/panfrost/util/lcra.h @@ -31,78 +31,71 @@ #include <stdint.h> struct lcra_state { - unsigned node_count; - - /* Alignment for node in log2(bytes)+1. Since alignment must be - * non-negative power-of-two, the elements are strictly positive - * integers. Zero is the sentinel for a missing node. In upper word, - * bound. */ - unsigned *alignment; - - /* Linear constraints imposed. Nested array sized upfront, organized as - * linear[node_left][node_right]. 
That is, calculate indices as: - * - * Each element is itself a bit field denoting whether (c_j - c_i) bias - * is present or not, including negative biases. - * - * Note for Midgard, there are 16 components so the bias is in range - * [-15, 15] so encoded by 32-bit field. */ - - uint32_t *linear; - - /* Per node max modulus constraints */ - uint8_t *modulus; - - /* Classes allow nodes to be partitioned with a starting register. - * Classes cannot interfere; that is, they are true partitions in the - * usual sense of the word. class_count is the number of classes. - * class[] is indexed by a node to get the mapped class. class_start is - * biased to all solutions in the class. */ - - unsigned class_count; - unsigned *class; - unsigned *class_start; - unsigned *class_size; - bool *class_disjoint; - - /* Before solving, forced registers; after solving, solutions. */ - unsigned *solutions; - - /* For register spilling, the costs to spill nodes (as set by the user) - * are in spill_cost[], negative if a node is unspillable. Internally, - * spill_class specifies which class to spill (whichever class failed - * to allocate) */ - - signed *spill_cost; - unsigned spill_class; + unsigned node_count; + + /* Alignment for node in log2(bytes)+1. Since alignment must be + * non-negative power-of-two, the elements are strictly positive + * integers. Zero is the sentinel for a missing node. In upper word, + * bound. */ + unsigned *alignment; + + /* Linear constraints imposed. Nested array sized upfront, organized as + * linear[node_left][node_right]. That is, calculate indices as: + * + * Each element is itself a bit field denoting whether (c_j - c_i) bias + * is present or not, including negative biases. + * + * Note for Midgard, there are 16 components so the bias is in range + * [-15, 15] so encoded by 32-bit field. */ + + uint32_t *linear; + + /* Per node max modulus constraints */ + uint8_t *modulus; + + /* Classes allow nodes to be partitioned with a starting register. + * Classes cannot interfere; that is, they are true partitions in the + * usual sense of the word. class_count is the number of classes. + * class[] is indexed by a node to get the mapped class. class_start is + * biased to all solutions in the class. */ + + unsigned class_count; + unsigned *class; + unsigned *class_start; + unsigned *class_size; + bool *class_disjoint; + + /* Before solving, forced registers; after solving, solutions. */ + unsigned *solutions; + + /* For register spilling, the costs to spill nodes (as set by the user) + * are in spill_cost[], negative if a node is unspillable. 
Internally, + * spill_class specifies which class to spill (whichever class failed + * to allocate) */ + + signed *spill_cost; + unsigned spill_class; }; -struct lcra_state * -lcra_alloc_equations( - unsigned node_count, unsigned class_count); +struct lcra_state *lcra_alloc_equations(unsigned node_count, + unsigned class_count); -void -lcra_free(struct lcra_state *l); +void lcra_free(struct lcra_state *l); -void -lcra_set_disjoint_class(struct lcra_state *l, unsigned c1, unsigned c2); +void lcra_set_disjoint_class(struct lcra_state *l, unsigned c1, unsigned c2); -void -lcra_set_alignment(struct lcra_state *l, unsigned node, unsigned align_log2, unsigned bound); +void lcra_set_alignment(struct lcra_state *l, unsigned node, + unsigned align_log2, unsigned bound); -void -lcra_restrict_range(struct lcra_state *l, unsigned node, unsigned len); +void lcra_restrict_range(struct lcra_state *l, unsigned node, unsigned len); -void -lcra_add_node_interference(struct lcra_state *l, unsigned i, unsigned cmask_i, unsigned j, unsigned cmask_j); +void lcra_add_node_interference(struct lcra_state *l, unsigned i, + unsigned cmask_i, unsigned j, unsigned cmask_j); -bool -lcra_solve(struct lcra_state *l); +bool lcra_solve(struct lcra_state *l); -void -lcra_set_node_spill_cost(struct lcra_state *l, unsigned node, signed cost); +void lcra_set_node_spill_cost(struct lcra_state *l, unsigned node, signed cost); -signed -lcra_get_best_spill_node(struct lcra_state *l); +signed lcra_get_best_spill_node(struct lcra_state *l); #endif diff --git a/src/panfrost/util/meson.build b/src/panfrost/util/meson.build index 75f9024eafb..d2d730fa587 100644 --- a/src/panfrost/util/meson.build +++ b/src/panfrost/util/meson.build @@ -22,22 +22,24 @@ libpanfrost_util_files = files( 'lcra.c', 'lcra.h', - 'nir_mod_helpers.c', + 'pan_collect_varyings.c', 'pan_ir.c', 'pan_ir.h', 'pan_liveness.c', 'pan_lower_framebuffer.c', 'pan_lower_helper_invocation.c', + 'pan_lower_image_ms.c', + 'pan_lower_image_index.c', 'pan_lower_sample_position.c', + 'pan_lower_store_component.c', 'pan_lower_writeout.c', - 'pan_lower_64bit_intrin.c', - 'pan_sysval.c', + 'pan_lower_xfb.c', ) libpanfrost_util = static_library( 'panfrost_util', [libpanfrost_util_files], - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_panfrost_hw], + include_directories : [inc_include, inc_src, inc_panfrost_hw], dependencies: [idep_nir], c_args : [no_override_init_args], gnu_symbol_visibility : 'hidden', diff --git a/src/panfrost/util/nir_mod_helpers.c b/src/panfrost/util/nir_mod_helpers.c deleted file mode 100644 index 2fe7b4fabe5..00000000000 --- a/src/panfrost/util/nir_mod_helpers.c +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (C) 2020 Collabora, Ltd. - * Copyright (C) 2014 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * Authors: - * Alyssa Rosenzweig <alyssa@collabora.com> - * Jason Ekstrand (jason@jlekstrand.net) - * - */ - -#include "nir.h" -#include "pan_ir.h" - -/* Check if a given ALU source is the result of a particular componentwise 1-op - * ALU source (principally fneg or fabs). If so, return true and rewrite the - * source to be the argument, respecting swizzles as needed. If not (or it - * cannot be proven), return false and leave the source untouched. -*/ - -bool -pan_has_source_mod(nir_alu_src *src, nir_op op) -{ - if (!src->src.is_ssa || src->src.ssa->parent_instr->type != nir_instr_type_alu) - return false; - - nir_alu_instr *alu = nir_instr_as_alu(src->src.ssa->parent_instr); - - if (alu->op != op) - return false; - - /* This only works for unary ops */ - assert(nir_op_infos[op].num_inputs == 1); - - /* If the copied source is not SSA, moving it might not be valid */ - if (!alu->src[0].src.is_ssa) - return false; - - /* Okay - we've found the modifier we wanted. Let's construct the new ALU - * src. In a scalar world, this is just psrc, but for vector archs we need - * to respect the swizzle, so we compose. - */ - - nir_alu_src nsrc = { - .src = alu->src[0].src, - }; - - for (unsigned i = 0; i < NIR_MAX_VEC_COMPONENTS; ++i) { - /* (a o b)(i) = a(b(i)) ... swizzle composition is intense. */ - nsrc.swizzle[i] = alu->src[0].swizzle[src->swizzle[i]]; - } - - *src = nsrc; - return true; -} - -/* Check if a given instruction's result will be fed into a - * componentwise 1-op ALU instruction (principally fsat without - * swizzles). If so, return true and rewrite the destination. The - * backend will need to track the new destinations to avoid - * incorrect double-emits. */ - -bool -pan_has_dest_mod(nir_dest **odest, nir_op op) -{ - /* This only works for unary ops */ - assert(nir_op_infos[op].num_inputs == 1); - - /* If not SSA, this might not be legal */ - nir_dest *dest = *odest; - if (!dest->is_ssa) - return false; - - /* Check the uses. 
We want a single use, with the op `op` */ - if (!list_is_empty(&dest->ssa.if_uses)) - return false; - - if (!list_is_singular(&dest->ssa.uses)) - return false; - - nir_src *use = list_first_entry(&dest->ssa.uses, nir_src, use_link); - nir_instr *parent = use->parent_instr; - - /* Check if the op is `op` */ - if (parent->type != nir_instr_type_alu) - return false; - - nir_alu_instr *alu = nir_instr_as_alu(parent); - if (alu->op != op) - return false; - - /* We can't do expansions without a move in the middle */ - unsigned nr_components = nir_dest_num_components(alu->dest.dest); - - if (nir_dest_num_components(*dest) != nr_components) - return false; - - /* We don't handle swizzles here, so check for the identity */ - for (unsigned i = 0; i < nr_components; ++i) { - if (alu->src[0].swizzle[i] != i) - return false; - } - - if (!alu->dest.dest.is_ssa) - return false; - - /* Otherwise, we're good */ - *odest = &alu->dest.dest; - return true; -} diff --git a/src/panfrost/util/pan_collect_varyings.c b/src/panfrost/util/pan_collect_varyings.c new file mode 100644 index 00000000000..0134ecfb67b --- /dev/null +++ b/src/panfrost/util/pan_collect_varyings.c @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. + * Copyright (C) 2019-2022 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "compiler/nir/nir.h" +#include "compiler/nir/nir_builder.h" +#include "pan_ir.h" + +static enum pipe_format +varying_format(nir_alu_type t, unsigned ncomps) +{ + assert(ncomps >= 1 && ncomps <= 4); + +#define VARYING_FORMAT(ntype, nsz, ptype, psz) \ + { \ + .type = nir_type_##ntype##nsz, .formats = { \ + PIPE_FORMAT_R##psz##_##ptype, \ + PIPE_FORMAT_R##psz##G##psz##_##ptype, \ + PIPE_FORMAT_R##psz##G##psz##B##psz##_##ptype, \ + PIPE_FORMAT_R##psz##G##psz##B##psz##A##psz##_##ptype, \ + } \ + } + + static const struct { + nir_alu_type type; + enum pipe_format formats[4]; + } conv[] = { + VARYING_FORMAT(float, 32, FLOAT, 32), + VARYING_FORMAT(uint, 32, UINT, 32), + VARYING_FORMAT(float, 16, FLOAT, 16), + }; +#undef VARYING_FORMAT + + assert(ncomps > 0 && ncomps <= ARRAY_SIZE(conv[0].formats)); + + for (unsigned i = 0; i < ARRAY_SIZE(conv); i++) { + if (conv[i].type == t) + return conv[i].formats[ncomps - 1]; + } + + unreachable("Invalid type"); +} + +struct slot_info { + nir_alu_type type; + unsigned count; + unsigned index; +}; + +static bool +walk_varyings(UNUSED nir_builder *b, nir_instr *instr, void *data) +{ + struct slot_info *slots = data; + + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + unsigned count; + + /* Only consider intrinsics that access varyings */ + switch (intr->intrinsic) { + case nir_intrinsic_store_output: + if (b->shader->info.stage != MESA_SHADER_VERTEX) + return false; + + count = nir_src_num_components(intr->src[0]); + break; + + case nir_intrinsic_load_input: + case nir_intrinsic_load_interpolated_input: + if (b->shader->info.stage != MESA_SHADER_FRAGMENT) + return false; + + count = intr->def.num_components; + break; + + default: + return false; + } + + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + + if (sem.no_varying) + return false; + + /* In a fragment shader, flat shading is lowered to load_input but + * interpolation is lowered to load_interpolated_input, so we can check + * the intrinsic to distinguish. + * + * In a vertex shader, we consider everything flat, as the information + * will not contribute to the final linked varyings -- flatness is used + * only to determine the type, and the GL linker uses the type from the + * fragment shader instead. + */ + bool flat = (intr->intrinsic != nir_intrinsic_load_interpolated_input); + nir_alu_type type = flat ? nir_type_uint : nir_type_float; + + /* Demote interpolated float varyings to fp16 where possible. We do not + * demote flat varyings, including integer varyings, due to various + * issues with the Midgard hardware behaviour and TGSI shaders, as well + * as having no demonstrable benefit in practice. + */ + if (type == nir_type_float && sem.medium_precision) + type |= 16; + else + type |= 32; + + /* Count currently contains the number of components accessed by this + * intrinsics. However, we may be accessing a fractional location, + * indicating by the NIR component. Add that in. The final value be the + * maximum (component + count), an upper bound on the number of + * components possibly used. 
+ */ + count += nir_intrinsic_component(intr); + + /* Consider each slot separately */ + for (unsigned offset = 0; offset < sem.num_slots; ++offset) { + unsigned location = sem.location + offset; + unsigned index = pan_res_handle_get_index(nir_intrinsic_base(intr)) + offset; + + if (slots[location].type) { + assert(slots[location].type == type); + assert(slots[location].index == index); + } else { + slots[location].type = type; + slots[location].index = index; + } + + slots[location].count = MAX2(slots[location].count, count); + } + + return false; +} + +void +pan_nir_collect_varyings(nir_shader *s, struct pan_shader_info *info) +{ + if (s->info.stage != MESA_SHADER_VERTEX && + s->info.stage != MESA_SHADER_FRAGMENT) + return; + + struct slot_info slots[64] = {0}; + nir_shader_instructions_pass(s, walk_varyings, nir_metadata_all, slots); + + struct pan_shader_varying *varyings = (s->info.stage == MESA_SHADER_VERTEX) + ? info->varyings.output + : info->varyings.input; + + unsigned count = 0; + + for (unsigned i = 0; i < ARRAY_SIZE(slots); ++i) { + if (!slots[i].type) + continue; + + enum pipe_format format = varying_format(slots[i].type, slots[i].count); + assert(format != PIPE_FORMAT_NONE); + + unsigned index = slots[i].index; + count = MAX2(count, index + 1); + + varyings[index].location = i; + varyings[index].format = format; + } + + if (s->info.stage == MESA_SHADER_VERTEX) + info->varyings.output_count = count; + else + info->varyings.input_count = count; +} diff --git a/src/panfrost/util/pan_ir.c b/src/panfrost/util/pan_ir.c index c469274933f..8524e08bc84 100644 --- a/src/panfrost/util/pan_ir.c +++ b/src/panfrost/util/pan_ir.c @@ -32,73 +32,66 @@ uint16_t pan_to_bytemask(unsigned bytes, unsigned mask) { - switch (bytes) { - case 0: - assert(mask == 0); - return 0; - - case 8: - return mask; - - case 16: { - unsigned space = - (mask & 0x1) | - ((mask & 0x2) << (2 - 1)) | - ((mask & 0x4) << (4 - 2)) | - ((mask & 0x8) << (6 - 3)) | - ((mask & 0x10) << (8 - 4)) | - ((mask & 0x20) << (10 - 5)) | - ((mask & 0x40) << (12 - 6)) | - ((mask & 0x80) << (14 - 7)); - - return space | (space << 1); - } - - case 32: { - unsigned space = - (mask & 0x1) | - ((mask & 0x2) << (4 - 1)) | - ((mask & 0x4) << (8 - 2)) | - ((mask & 0x8) << (12 - 3)); - - return space | (space << 1) | (space << 2) | (space << 3); - } - - case 64: { - unsigned A = (mask & 0x1) ? 0xFF : 0x00; - unsigned B = (mask & 0x2) ? 0xFF : 0x00; - return A | (B << 8); - } - - default: - unreachable("Invalid register mode"); - } + switch (bytes) { + case 0: + assert(mask == 0); + return 0; + + case 8: + return mask; + + case 16: { + unsigned space = + (mask & 0x1) | ((mask & 0x2) << (2 - 1)) | ((mask & 0x4) << (4 - 2)) | + ((mask & 0x8) << (6 - 3)) | ((mask & 0x10) << (8 - 4)) | + ((mask & 0x20) << (10 - 5)) | ((mask & 0x40) << (12 - 6)) | + ((mask & 0x80) << (14 - 7)); + + return space | (space << 1); + } + + case 32: { + unsigned space = (mask & 0x1) | ((mask & 0x2) << (4 - 1)) | + ((mask & 0x4) << (8 - 2)) | ((mask & 0x8) << (12 - 3)); + + return space | (space << 1) | (space << 2) | (space << 3); + } + + case 64: { + unsigned A = (mask & 0x1) ? 0xFF : 0x00; + unsigned B = (mask & 0x2) ? 
0xFF : 0x00; + return A | (B << 8); + } + + default: + unreachable("Invalid register mode"); + } } void pan_block_add_successor(pan_block *block, pan_block *successor) { - assert(block); - assert(successor); - - /* Cull impossible edges */ - if (block->unconditional_jumps) - return; - - for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) { - if (block->successors[i]) { - if (block->successors[i] == successor) - return; - else - continue; - } - - block->successors[i] = successor; - _mesa_set_add(successor->predecessors, block); - return; - } - - unreachable("Too many successors"); + assert(block); + assert(successor); + + /* Cull impossible edges */ + if (block->unconditional_jumps) + return; + + for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) { + if (block->successors[i]) { + if (block->successors[i] == successor) + return; + else + continue; + } + + block->successors[i] = successor; + _mesa_set_add(successor->predecessors, block); + return; + } + + unreachable("Too many successors"); } /* Prints a NIR ALU type in Bifrost-style ".f32" ".i8" etc */ @@ -106,45 +99,42 @@ pan_block_add_successor(pan_block *block, pan_block *successor) void pan_print_alu_type(nir_alu_type t, FILE *fp) { - unsigned size = nir_alu_type_get_type_size(t); - nir_alu_type base = nir_alu_type_get_base_type(t); - - switch (base) { - case nir_type_int: - fprintf(fp, ".i"); - break; - case nir_type_uint: - fprintf(fp, ".u"); - break; - case nir_type_bool: - fprintf(fp, ".b"); - break; - case nir_type_float: - fprintf(fp, ".f"); - break; - default: - fprintf(fp, ".unknown"); - break; - } - - fprintf(fp, "%u", size); + unsigned size = nir_alu_type_get_type_size(t); + nir_alu_type base = nir_alu_type_get_base_type(t); + + switch (base) { + case nir_type_int: + fprintf(fp, ".i"); + break; + case nir_type_uint: + fprintf(fp, ".u"); + break; + case nir_type_bool: + fprintf(fp, ".b"); + break; + case nir_type_float: + fprintf(fp, ".f"); + break; + default: + fprintf(fp, ".unknown"); + break; + } + + fprintf(fp, "%u", size); } /* Could optimize with a better data structure if anyone cares, TODO: profile */ unsigned -pan_lookup_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo, unsigned offs) +pan_lookup_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo, + unsigned offs) { - struct panfrost_ubo_word word = { - .ubo = ubo, - .offset = offs - }; + struct panfrost_ubo_word word = {.ubo = ubo, .offset = offs}; - for (unsigned i = 0; i < push->count; ++i) { - if (memcmp(push->words + i, &word, sizeof(word)) == 0) - return i; - } - - unreachable("UBO not pushed"); + for (unsigned i = 0; i < push->count; ++i) { + if (memcmp(push->words + i, &word, sizeof(word)) == 0) + return i; + } + unreachable("UBO not pushed"); } diff --git a/src/panfrost/util/pan_ir.h b/src/panfrost/util/pan_ir.h index 1ff240b3070..7b24fb6922e 100644 --- a/src/panfrost/util/pan_ir.h +++ b/src/panfrost/util/pan_ir.h @@ -26,326 +26,420 @@ #include <stdint.h> #include "compiler/nir/nir.h" -#include "util/u_dynarray.h" #include "util/hash_table.h" +#include "util/u_dynarray.h" -/* Define the general compiler entry point */ - -#define MAX_SYSVAL_COUNT 32 - -/* Allow 2D of sysval IDs, while allowing nonparametric sysvals to equal - * their class for equal comparison */ - -#define PAN_SYSVAL(type, no) (((no) << 16) | PAN_SYSVAL_##type) -#define PAN_SYSVAL_TYPE(sysval) ((sysval) & 0xffff) -#define PAN_SYSVAL_ID(sysval) ((sysval) >> 16) - -/* Define some common types. 
We start at one for easy indexing of hash - * tables internal to the compiler */ - -enum { - PAN_SYSVAL_VIEWPORT_SCALE = 1, - PAN_SYSVAL_VIEWPORT_OFFSET = 2, - PAN_SYSVAL_TEXTURE_SIZE = 3, - PAN_SYSVAL_SSBO = 4, - PAN_SYSVAL_NUM_WORK_GROUPS = 5, - PAN_SYSVAL_SAMPLER = 7, - PAN_SYSVAL_LOCAL_GROUP_SIZE = 8, - PAN_SYSVAL_WORK_DIM = 9, - PAN_SYSVAL_IMAGE_SIZE = 10, - PAN_SYSVAL_SAMPLE_POSITIONS = 11, - PAN_SYSVAL_MULTISAMPLED = 12, - PAN_SYSVAL_RT_CONVERSION = 13, - PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS = 14, - PAN_SYSVAL_DRAWID = 15, -}; +/* Indices for named (non-XFB) varyings that are present. These are packed + * tightly so they correspond to a bitfield present (P) indexed by (1 << + * PAN_VARY_*). This has the nice property that you can lookup the buffer index + * of a given special field given a shift S by: + * + * idx = popcount(P & ((1 << S) - 1)) + * + * That is... look at all of the varyings that come earlier and count them, the + * count is the new index since plus one. Likewise, the total number of special + * buffers required is simply popcount(P) + */ + +enum pan_special_varying { + PAN_VARY_GENERAL = 0, + PAN_VARY_POSITION = 1, + PAN_VARY_PSIZ = 2, + PAN_VARY_PNTCOORD = 3, + PAN_VARY_FACE = 4, + PAN_VARY_FRAGCOORD = 5, -#define PAN_TXS_SYSVAL_ID(texidx, dim, is_array) \ - ((texidx) | ((dim) << 7) | ((is_array) ? (1 << 9) : 0)) + /* Keep last */ + PAN_VARY_MAX, +}; -#define PAN_SYSVAL_ID_TO_TXS_TEX_IDX(id) ((id) & 0x7f) -#define PAN_SYSVAL_ID_TO_TXS_DIM(id) (((id) >> 7) & 0x3) -#define PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(id) !!((id) & (1 << 9)) +/* Maximum number of attribute descriptors required for varyings. These include + * up to MAX_VARYING source level varyings plus a descriptor each non-GENERAL + * special varying */ +#define PAN_MAX_VARYINGS (MAX_VARYING + PAN_VARY_MAX - 1) /* Special attribute slots for vertex builtins. Sort of arbitrary but let's be * consistent with the blob so we can compare traces easier. */ -enum { - PAN_VERTEX_ID = 16, - PAN_INSTANCE_ID = 17, - PAN_MAX_ATTRIBUTE -}; +enum { PAN_VERTEX_ID = 16, PAN_INSTANCE_ID = 17, PAN_MAX_ATTRIBUTE }; -struct panfrost_sysvals { - /* The mapping of sysvals to uniforms, the count, and the off-by-one inverse */ - unsigned sysvals[MAX_SYSVAL_COUNT]; - unsigned sysval_count; -}; - -/* Technically Midgard could go up to 92 in a pathological case but we don't - * take advantage of that. Likewise Bifrost's FAU encoding can address 128 - * words but actual implementations (G72, G76) are capped at 64 */ - -#define PAN_MAX_PUSH 64 +/* Architecturally, Bifrost/Valhall can address 128 FAU slots of 64-bits each. + * In practice, the maximum number of FAU slots is limited by implementation. + * All known Bifrost and Valhall devices limit to 64 FAU slots. Therefore the + * maximum number of 32-bit words is 128, since there are 2 words per FAU slot. + * + * Midgard can push at most 92 words, so this bound suffices. The Midgard + * compiler pushes less than this, as Midgard uses register-mapped uniforms + * instead of FAU, preventing large numbers of uniforms to be pushed for + * nontrivial programs. + */ +#define PAN_MAX_PUSH 128 /* Architectural invariants (Midgard and Bifrost): UBO must be <= 2^16 bytes so * an offset to a word must be < 2^16. 
There are less than 2^8 UBOs */ struct panfrost_ubo_word { - uint16_t ubo; - uint16_t offset; + uint16_t ubo; + uint16_t offset; }; struct panfrost_ubo_push { - unsigned count; - struct panfrost_ubo_word words[PAN_MAX_PUSH]; + unsigned count; + struct panfrost_ubo_word words[PAN_MAX_PUSH]; }; /* Helper for searching the above. Note this is O(N) to the number of pushed * constants, do not run in the draw call hot path */ -unsigned -pan_lookup_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo, unsigned offs); - -struct hash_table_u64 * -panfrost_init_sysvals(struct panfrost_sysvals *sysvals, void *memctx); - -unsigned -pan_lookup_sysval(struct hash_table_u64 *sysval_to_id, - struct panfrost_sysvals *sysvals, - int sysval); - -int -panfrost_sysval_for_instr(nir_instr *instr, nir_dest *dest); +unsigned pan_lookup_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo, + unsigned offs); struct panfrost_compile_inputs { - unsigned gpu_id; - bool is_blend, is_blit; - struct { - unsigned rt; - unsigned nr_samples; - uint64_t bifrost_blend_desc; - } blend; - unsigned sysval_ubo; - bool shaderdb; - bool no_ubo_to_push; - - enum pipe_format rt_formats[8]; - unsigned nr_cbufs; + struct util_debug_callback *debug; + + unsigned gpu_id; + bool is_blend, is_blit; + struct { + unsigned nr_samples; + uint64_t bifrost_blend_desc; + } blend; + bool no_idvs; + bool no_ubo_to_push; + + /* Used on Valhall. + * + * Bit mask of special desktop-only varyings (e.g VARYING_SLOT_TEX0) + * written by the previous stage (fragment shader) or written by this + * stage (vertex shader). Bits are slots from gl_varying_slot. + * + * For modern APIs (GLES or VK), this should be 0. + */ + uint32_t fixed_varying_mask; + + union { + struct { + uint32_t rt_conv[8]; + } bifrost; + }; }; struct pan_shader_varying { - gl_varying_slot location; - enum pipe_format format; + gl_varying_slot location; + enum pipe_format format; }; struct bifrost_shader_blend_info { - nir_alu_type type; - uint32_t return_offset; + nir_alu_type type; + uint32_t return_offset; - /* mali_bifrost_register_file_format corresponding to nir_alu_type */ - unsigned format; + /* mali_bifrost_register_file_format corresponding to nir_alu_type */ + unsigned format; }; -struct bifrost_shader_info { - struct bifrost_shader_blend_info blend[8]; - nir_alu_type blend_src1_type; - bool wait_6, wait_7; +/* + * Unpacked form of a v7 message preload descriptor, produced by the compiler's + * message preload optimization. By splitting out this struct, the compiler does + * not need to know about data structure packing, avoiding a dependency on + * GenXML. + */ +struct bifrost_message_preload { + /* Whether to preload this message */ + bool enabled; + + /* Varying to load from */ + unsigned varying_index; + + /* Register type, FP32 otherwise */ + bool fp16; - /* Packed, preloaded message descriptors */ - uint16_t messages[2]; + /* Number of components, ignored if texturing */ + unsigned num_components; + + /* If texture is set, performs a texture instruction according to + * texture_index, skip, and zero_lod. If texture is unset, only the + * varying load is performed. + */ + bool texture, skip, zero_lod; + unsigned texture_index; +}; + +struct bifrost_shader_info { + struct bifrost_shader_blend_info blend[8]; + nir_alu_type blend_src1_type; + bool wait_6, wait_7; + struct bifrost_message_preload messages[2]; + + /* Whether any flat varyings are loaded. 
This may disable optimizations + * that change the provoking vertex, since that would load incorrect + * values for flat varyings. + */ + bool uses_flat_shading; }; struct midgard_shader_info { - unsigned first_tag; + unsigned first_tag; }; struct pan_shader_info { - gl_shader_stage stage; - unsigned work_reg_count; - unsigned tls_size; - unsigned wls_size; - - union { - struct { - bool reads_frag_coord; - bool reads_point_coord; - bool reads_face; - bool helper_invocations; - bool can_discard; - bool writes_depth; - bool writes_stencil; - bool writes_coverage; - bool sidefx; - bool reads_sample_id; - bool reads_sample_pos; - bool reads_sample_mask_in; - bool reads_helper_invocation; - bool sample_shading; - bool early_fragment_tests; - bool can_early_z, can_fpk; - BITSET_WORD outputs_read; - BITSET_WORD outputs_written; - } fs; - - struct { - bool writes_point_size; - } vs; - }; - - bool separable; - bool contains_barrier; - bool writes_global; - uint64_t outputs_written; - - unsigned sampler_count; - unsigned texture_count; - unsigned ubo_count; - unsigned attribute_count; - - struct { - unsigned input_count; - struct pan_shader_varying input[MAX_VARYING]; - unsigned output_count; - struct pan_shader_varying output[MAX_VARYING]; - } varyings; - - struct panfrost_sysvals sysvals; - - /* UBOs to push to Register Mapped Uniforms (Midgard) or Fast Access - * Uniforms (Bifrost) */ - struct panfrost_ubo_push push; - - uint32_t ubo_mask; - - union { - struct bifrost_shader_info bifrost; - struct midgard_shader_info midgard; - }; + gl_shader_stage stage; + unsigned work_reg_count; + unsigned tls_size; + unsigned wls_size; + + /* Bit mask of preloaded registers */ + uint64_t preload; + + union { + struct { + bool reads_frag_coord; + bool reads_point_coord; + bool reads_face; + bool can_discard; + bool writes_depth; + bool writes_stencil; + bool writes_coverage; + bool sidefx; + bool sample_shading; + bool early_fragment_tests; + bool can_early_z, can_fpk; + bool untyped_color_outputs; + BITSET_WORD outputs_read; + BITSET_WORD outputs_written; + } fs; + + struct { + bool writes_point_size; + + /* If the primary shader writes point size, the Valhall + * driver may need a variant that does not write point + * size. Offset to such a shader in the program binary. + * + * Zero if no such variant is required. + * + * Only used with IDVS on Valhall. + */ + unsigned no_psiz_offset; + + /* Set if Index-Driven Vertex Shading is in use */ + bool idvs; + + /* If IDVS is used, whether a varying shader is used */ + bool secondary_enable; + + /* If a varying shader is used, the varying shader's + * offset in the program binary + */ + unsigned secondary_offset; + + /* If IDVS is in use, number of work registers used by + * the varying shader + */ + unsigned secondary_work_reg_count; + + /* If IDVS is in use, bit mask of preloaded registers + * used by the varying shader + */ + uint64_t secondary_preload; + } vs; + + struct { + /* Is it legal to merge workgroups? This is true if the + * shader uses neither barriers nor shared memory. This + * requires caution: if the API allows specifying shared + * memory at launch time (instead of compile time), that + * memory will not be accounted for by the compiler. + * + * Used by the Valhall hardware. + */ + bool allow_merging_workgroups; + } cs; + }; + + /* Does the shader contains a barrier? or (for fragment shaders) does it + * require helper invocations, which demand the same ordering guarantees + * of the hardware? 
These notions are unified in the hardware, so we + * unify them here as well. + */ + bool contains_barrier; + bool separable; + bool writes_global; + uint64_t outputs_written; + + /* Floating point controls that the driver should try to honour */ + bool ftz_fp16, ftz_fp32; + + unsigned sampler_count; + unsigned texture_count; + unsigned ubo_count; + unsigned attributes_read_count; + unsigned attribute_count; + unsigned attributes_read; + + struct { + unsigned input_count; + struct pan_shader_varying input[PAN_MAX_VARYINGS]; + unsigned output_count; + struct pan_shader_varying output[PAN_MAX_VARYINGS]; + } varyings; + + /* UBOs to push to Register Mapped Uniforms (Midgard) or Fast Access + * Uniforms (Bifrost) */ + struct panfrost_ubo_push push; + + uint32_t ubo_mask; + + union { + struct bifrost_shader_info bifrost; + struct midgard_shader_info midgard; + }; }; typedef struct pan_block { - /* Link to next block. Must be first for mir_get_block */ - struct list_head link; + /* Link to next block. Must be first for mir_get_block */ + struct list_head link; - /* List of instructions emitted for the current block */ - struct list_head instructions; + /* List of instructions emitted for the current block */ + struct list_head instructions; - /* Index of the block in source order */ - unsigned name; + /* Index of the block in source order */ + unsigned name; - /* Control flow graph */ - struct pan_block *successors[2]; - struct set *predecessors; - bool unconditional_jumps; + /* Control flow graph */ + struct pan_block *successors[2]; + struct set *predecessors; + bool unconditional_jumps; - /* In liveness analysis, these are live masks (per-component) for - * indices for the block. Scalar compilers have the luxury of using - * simple bit fields, but for us, liveness is a vector idea. */ - uint16_t *live_in; - uint16_t *live_out; + /* In liveness analysis, these are live masks (per-component) for + * indices for the block. Scalar compilers have the luxury of using + * simple bit fields, but for us, liveness is a vector idea. */ + uint16_t *live_in; + uint16_t *live_out; } pan_block; struct pan_instruction { - struct list_head link; + struct list_head link; }; -#define pan_foreach_instr_in_block_rev(block, v) \ - list_for_each_entry_rev(struct pan_instruction, v, &block->instructions, link) - -#define pan_foreach_successor(blk, v) \ - pan_block *v; \ - pan_block **_v; \ - for (_v = (pan_block **) &blk->successors[0], \ - v = *_v; \ - v != NULL && _v < (pan_block **) &blk->successors[2]; \ - _v++, v = *_v) \ - -#define pan_foreach_predecessor(blk, v) \ - struct set_entry *_entry_##v; \ - struct pan_block *v; \ - for (_entry_##v = _mesa_set_next_entry(blk->predecessors, NULL), \ - v = (struct pan_block *) (_entry_##v ? _entry_##v->key : NULL); \ - _entry_##v != NULL; \ - _entry_##v = _mesa_set_next_entry(blk->predecessors, _entry_##v), \ - v = (struct pan_block *) (_entry_##v ? _entry_##v->key : NULL)) +#define pan_foreach_instr_in_block_rev(block, v) \ + list_for_each_entry_rev(struct pan_instruction, v, &block->instructions, \ + link) + +#define pan_foreach_successor(blk, v) \ + pan_block *v; \ + pan_block **_v; \ + for (_v = (pan_block **)&blk->successors[0], v = *_v; \ + v != NULL && _v < (pan_block **)&blk->successors[2]; _v++, v = *_v) + +#define pan_foreach_predecessor(blk, v) \ + struct set_entry *_entry_##v; \ + struct pan_block *v; \ + for (_entry_##v = _mesa_set_next_entry(blk->predecessors, NULL), \ + v = (struct pan_block *)(_entry_##v ? 
_entry_##v->key : NULL); \ + _entry_##v != NULL; \ + _entry_##v = _mesa_set_next_entry(blk->predecessors, _entry_##v), \ + v = (struct pan_block *)(_entry_##v ? _entry_##v->key : NULL)) static inline pan_block * pan_exit_block(struct list_head *blocks) { - pan_block *last = list_last_entry(blocks, pan_block, link); - assert(!last->successors[0] && !last->successors[1]); - return last; + pan_block *last = list_last_entry(blocks, pan_block, link); + assert(!last->successors[0] && !last->successors[1]); + return last; } typedef void (*pan_liveness_update)(uint16_t *, void *, unsigned max); -void pan_liveness_gen(uint16_t *live, unsigned node, unsigned max, uint16_t mask); -void pan_liveness_kill(uint16_t *live, unsigned node, unsigned max, uint16_t mask); +void pan_liveness_gen(uint16_t *live, unsigned node, unsigned max, + uint16_t mask); +void pan_liveness_kill(uint16_t *live, unsigned node, unsigned max, + uint16_t mask); bool pan_liveness_get(uint16_t *live, unsigned node, uint16_t max); -void pan_compute_liveness(struct list_head *blocks, - unsigned temp_count, - pan_liveness_update callback); +void pan_compute_liveness(struct list_head *blocks, unsigned temp_count, + pan_liveness_update callback); void pan_free_liveness(struct list_head *blocks); -uint16_t -pan_to_bytemask(unsigned bytes, unsigned mask); +uint16_t pan_to_bytemask(unsigned bytes, unsigned mask); void pan_block_add_successor(pan_block *block, pan_block *successor); /* IR indexing */ #define PAN_IS_REG (1) -static inline unsigned -pan_ssa_index(nir_ssa_def *ssa) -{ - /* Off-by-one ensures BIR_NO_ARG is skipped */ - return ((ssa->index + 1) << 1) | 0; -} - -static inline unsigned -pan_src_index(nir_src *src) -{ - if (src->is_ssa) - return pan_ssa_index(src->ssa); - else { - assert(!src->reg.indirect); - return (src->reg.reg->index << 1) | PAN_IS_REG; - } -} - -static inline unsigned -pan_dest_index(nir_dest *dst) -{ - if (dst->is_ssa) - return pan_ssa_index(&dst->ssa); - else { - assert(!dst->reg.indirect); - return (dst->reg.reg->index << 1) | PAN_IS_REG; - } -} - /* IR printing helpers */ void pan_print_alu_type(nir_alu_type t, FILE *fp); -/* Until it can be upstreamed.. */ -bool pan_has_source_mod(nir_alu_src *src, nir_op op); -bool pan_has_dest_mod(nir_dest **dest, nir_op op); - /* NIR passes to do some backend-specific lowering */ #define PAN_WRITEOUT_C 1 #define PAN_WRITEOUT_Z 2 #define PAN_WRITEOUT_S 4 +#define PAN_WRITEOUT_2 8 -bool pan_nir_reorder_writeout(nir_shader *nir); bool pan_nir_lower_zs_store(nir_shader *nir); +bool pan_nir_lower_store_component(nir_shader *shader); +bool pan_nir_lower_image_ms(nir_shader *shader); bool pan_nir_lower_64bit_intrin(nir_shader *shader); bool pan_lower_helper_invocation(nir_shader *shader); bool pan_lower_sample_pos(nir_shader *shader); +bool pan_lower_xfb(nir_shader *nir); + +bool pan_lower_image_index(nir_shader *shader, unsigned vs_img_attrib_offset); + +void pan_nir_collect_varyings(nir_shader *s, struct pan_shader_info *info); + +/* + * Helper returning the subgroup size. Generally, this is equal to the number of + * threads in a warp. For Midgard (including warping models), this returns 1, as + * subgroups are not supported. + */ +static inline unsigned +pan_subgroup_size(unsigned arch) +{ + if (arch >= 9) + return 16; + else if (arch >= 7) + return 8; + else if (arch >= 6) + return 4; + else + return 1; +} + +/* + * Helper extracting the table from a given handle of Valhall descriptor model. 
+ */ +static inline unsigned +pan_res_handle_get_table(unsigned handle) +{ + unsigned table = handle >> 24; + + assert(table < 64); + return table; +} + +/* + * Helper returning the index from a given handle of Valhall descriptor model. + */ +static inline unsigned +pan_res_handle_get_index(unsigned handle) +{ + return handle & BITFIELD_MASK(24); +} + +/* + * Helper creating an handle for Valhall descriptor model. + */ +static inline unsigned +pan_res_handle(unsigned table, unsigned index) +{ + assert(table < 64); + assert(index < (1u << 24)); + + return (table << 24) | index; +} #endif diff --git a/src/panfrost/util/pan_liveness.c b/src/panfrost/util/pan_liveness.c index 0ec9652b59d..e299bc29d75 100644 --- a/src/panfrost/util/pan_liveness.c +++ b/src/panfrost/util/pan_liveness.c @@ -21,10 +21,10 @@ * SOFTWARE. */ -#include "pan_ir.h" -#include "util/u_memory.h" #include "util/list.h" #include "util/set.h" +#include "util/u_memory.h" +#include "pan_ir.h" /* Routines for liveness analysis. Liveness is tracked per byte per node. Per * byte granularity is necessary for proper handling of int8 */ @@ -32,28 +32,28 @@ void pan_liveness_gen(uint16_t *live, unsigned node, unsigned max, uint16_t mask) { - if (node >= max) - return; + if (node >= max) + return; - live[node] |= mask; + live[node] |= mask; } void pan_liveness_kill(uint16_t *live, unsigned node, unsigned max, uint16_t mask) { - if (node >= max) - return; + if (node >= max) + return; - live[node] &= ~mask; + live[node] &= ~mask; } bool pan_liveness_get(uint16_t *live, unsigned node, uint16_t max) { - if (node >= max) - return false; + if (node >= max) + return false; - return live[node]; + return live[node]; } /* live_out[s] = sum { p in succ[s] } ( live_in[p] ) */ @@ -61,10 +61,10 @@ pan_liveness_get(uint16_t *live, unsigned node, uint16_t max) static void liveness_block_live_out(pan_block *blk, unsigned temp_count) { - pan_foreach_successor(blk, succ) { - for (unsigned i = 0; i < temp_count; ++i) - blk->live_out[i] |= succ->live_in[i]; - } + pan_foreach_successor(blk, succ) { + for (unsigned i = 0; i < temp_count; ++i) + blk->live_out[i] |= succ->live_in[i]; + } } /* Liveness analysis is a backwards-may dataflow analysis pass. Within a block, @@ -72,32 +72,30 @@ liveness_block_live_out(pan_block *blk, unsigned temp_count) * returns whether progress was made. 
*/ static bool -liveness_block_update( - pan_block *blk, unsigned temp_count, - pan_liveness_update callback) +liveness_block_update(pan_block *blk, unsigned temp_count, + pan_liveness_update callback) { - bool progress = false; + bool progress = false; - liveness_block_live_out(blk, temp_count); + liveness_block_live_out(blk, temp_count); - uint16_t *live = ralloc_array(blk, uint16_t, temp_count); - memcpy(live, blk->live_out, temp_count * sizeof(uint16_t)); + uint16_t *live = ralloc_array(blk, uint16_t, temp_count); + memcpy(live, blk->live_out, temp_count * sizeof(uint16_t)); - pan_foreach_instr_in_block_rev(blk, ins) - callback(live, (void *) ins, temp_count); + pan_foreach_instr_in_block_rev(blk, ins) + callback(live, (void *)ins, temp_count); - /* To figure out progress, diff live_in */ + /* To figure out progress, diff live_in */ - for (unsigned i = 0; (i < temp_count) && !progress; ++i) - progress |= (blk->live_in[i] != live[i]); + for (unsigned i = 0; (i < temp_count) && !progress; ++i) + progress |= (blk->live_in[i] != live[i]); - ralloc_free(blk->live_in); - blk->live_in = live; + ralloc_free(blk->live_in); + blk->live_in = live; - return progress; + return progress; } - /* Globally, liveness analysis uses a fixed-point algorithm based on a * worklist. We initialize a work list with the exit block. We iterate the work * list to compute live_in from live_out for each block on the work list, @@ -105,70 +103,66 @@ liveness_block_update( */ void -pan_compute_liveness( - struct list_head *blocks, - unsigned temp_count, - pan_liveness_update callback) +pan_compute_liveness(struct list_head *blocks, unsigned temp_count, + pan_liveness_update callback) { - /* Set of pan_block */ - struct set *work_list = _mesa_set_create(NULL, - _mesa_hash_pointer, - _mesa_key_pointer_equal); + /* Set of pan_block */ + struct set *work_list = + _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); - struct set *visited = _mesa_set_create(NULL, - _mesa_hash_pointer, - _mesa_key_pointer_equal); + struct set *visited = + _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); - /* Free any previous liveness, and allocate */ + /* Free any previous liveness, and allocate */ - pan_free_liveness(blocks); + pan_free_liveness(blocks); - list_for_each_entry(pan_block, block, blocks, link) { - block->live_in = rzalloc_array(block, uint16_t, temp_count); - block->live_out = rzalloc_array(block, uint16_t, temp_count); - } + list_for_each_entry(pan_block, block, blocks, link) { + block->live_in = rzalloc_array(block, uint16_t, temp_count); + block->live_out = rzalloc_array(block, uint16_t, temp_count); + } - /* Initialize the work list with the exit block */ - struct set_entry *cur; + /* Initialize the work list with the exit block */ + struct set_entry *cur; - cur = _mesa_set_add(work_list, pan_exit_block(blocks)); + cur = _mesa_set_add(work_list, pan_exit_block(blocks)); - /* Iterate the work list */ + /* Iterate the work list */ - do { - /* Pop off a block */ - pan_block *blk = (struct pan_block *) cur->key; - _mesa_set_remove(work_list, cur); + do { + /* Pop off a block */ + pan_block *blk = (struct pan_block *)cur->key; + _mesa_set_remove(work_list, cur); - /* Update its liveness information */ - bool progress = liveness_block_update(blk, temp_count, callback); + /* Update its liveness information */ + bool progress = liveness_block_update(blk, temp_count, callback); - /* If we made progress, we need to process the predecessors */ + /* If we made progress, we need to process the 
predecessors */ - if (progress || !_mesa_set_search(visited, blk)) { - pan_foreach_predecessor(blk, pred) - _mesa_set_add(work_list, pred); - } + if (progress || !_mesa_set_search(visited, blk)) { + pan_foreach_predecessor(blk, pred) + _mesa_set_add(work_list, pred); + } - _mesa_set_add(visited, blk); - } while((cur = _mesa_set_next_entry(work_list, NULL)) != NULL); + _mesa_set_add(visited, blk); + } while ((cur = _mesa_set_next_entry(work_list, NULL)) != NULL); - _mesa_set_destroy(visited, NULL); - _mesa_set_destroy(work_list, NULL); + _mesa_set_destroy(visited, NULL); + _mesa_set_destroy(work_list, NULL); } void pan_free_liveness(struct list_head *blocks) { - list_for_each_entry(pan_block, block, blocks, link) { - if (block->live_in) - ralloc_free(block->live_in); + list_for_each_entry(pan_block, block, blocks, link) { + if (block->live_in) + ralloc_free(block->live_in); - if (block->live_out) - ralloc_free(block->live_out); + if (block->live_out) + ralloc_free(block->live_out); - block->live_in = NULL; - block->live_out = NULL; - } + block->live_in = NULL; + block->live_out = NULL; + } } diff --git a/src/panfrost/util/pan_lower_64bit_intrin.c b/src/panfrost/util/pan_lower_64bit_intrin.c deleted file mode 100644 index 7c4edcfa9d7..00000000000 --- a/src/panfrost/util/pan_lower_64bit_intrin.c +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (C) 2020 Icecream95 <ixn@disroot.org> - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "pan_ir.h" -#include "compiler/nir/nir_builder.h" - -/* OpenCL uses 64-bit types for some intrinsic functions, including - * global_invocation_id(). This could be worked around during conversion to - * MIR, except that global_invocation_id is a vec3, and the 128-bit registers - * on Midgard can only hold a 64-bit vec2. - * Rather than attempting to add hacky 64-bit vec3 support, convert these - * intrinsics to 32-bit and add a cast back to 64-bit, and rely on NIR not - * vectorizing back to vec3. 
- */ - -static bool -nir_lower_64bit_intrin_instr(nir_builder *b, nir_instr *instr, void *data) -{ - if (instr->type != nir_instr_type_intrinsic) - return false; - - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - - switch (intr->intrinsic) { - case nir_intrinsic_load_global_invocation_id: - case nir_intrinsic_load_global_invocation_id_zero_base: - case nir_intrinsic_load_workgroup_id: - case nir_intrinsic_load_num_workgroups: - break; - - default: - return false; - } - - if (nir_dest_bit_size(intr->dest) != 64) - return false; - - b->cursor = nir_after_instr(instr); - - assert(intr->dest.is_ssa); - intr->dest.ssa.bit_size = 32; - - nir_ssa_def *conv = nir_u2u64(b, &intr->dest.ssa); - - nir_ssa_def_rewrite_uses_after(&intr->dest.ssa, conv, - conv->parent_instr); - - return true; -} - -bool -pan_nir_lower_64bit_intrin(nir_shader *shader) -{ - return nir_shader_instructions_pass(shader, - nir_lower_64bit_intrin_instr, - nir_metadata_block_index | nir_metadata_dominance, - NULL); -} diff --git a/src/panfrost/util/pan_lower_framebuffer.c b/src/panfrost/util/pan_lower_framebuffer.c index 71795fd9416..96e24683b25 100644 --- a/src/panfrost/util/pan_lower_framebuffer.c +++ b/src/panfrost/util/pan_lower_framebuffer.c @@ -47,12 +47,11 @@ * smallest precision necessary to store the pixel losslessly. */ +#include "pan_lower_framebuffer.h" #include "compiler/nir/nir.h" #include "compiler/nir/nir_builder.h" #include "compiler/nir/nir_format_convert.h" #include "util/format/u_format.h" -#include "pan_lower_framebuffer.h" -#include "panfrost-quirks.h" /* Determines the unpacked type best suiting a given format, so the rest of the * pipeline may be adjusted accordingly */ @@ -60,86 +59,54 @@ nir_alu_type pan_unpacked_type_for_format(const struct util_format_description *desc) { - int c = util_format_get_first_non_void_channel(desc->format); - - if (c == -1) - unreachable("Void format not renderable"); - - bool large = (desc->channel[c].size > 16); - bool large_norm = (desc->channel[c].size > 8); - bool bit8 = (desc->channel[c].size == 8); - assert(desc->channel[c].size <= 32); - - if (desc->channel[c].normalized) - return large_norm ? nir_type_float32 : nir_type_float16; - - switch (desc->channel[c].type) { - case UTIL_FORMAT_TYPE_UNSIGNED: - return bit8 ? nir_type_uint8 : - large ? nir_type_uint32 : nir_type_uint16; - case UTIL_FORMAT_TYPE_SIGNED: - return bit8 ? nir_type_int8 : - large ? nir_type_int32 : nir_type_int16; - case UTIL_FORMAT_TYPE_FLOAT: - return large ? nir_type_float32 : nir_type_float16; - default: - unreachable("Format not renderable"); - } + int c = util_format_get_first_non_void_channel(desc->format); + + if (c == -1) + unreachable("Void format not renderable"); + + bool large = (desc->channel[c].size > 16); + bool large_norm = (desc->channel[c].size > 8); + bool bit8 = (desc->channel[c].size == 8); + assert(desc->channel[c].size <= 32); + + if (desc->channel[c].normalized) + return large_norm ? nir_type_float32 : nir_type_float16; + + switch (desc->channel[c].type) { + case UTIL_FORMAT_TYPE_UNSIGNED: + return bit8 ? nir_type_uint8 : large ? nir_type_uint32 : nir_type_uint16; + case UTIL_FORMAT_TYPE_SIGNED: + return bit8 ? nir_type_int8 : large ? nir_type_int32 : nir_type_int16; + case UTIL_FORMAT_TYPE_FLOAT: + return large ? 
nir_type_float32 : nir_type_float16; + default: + unreachable("Format not renderable"); + } } -static enum pan_format_class -pan_format_class_load(const struct util_format_description *desc, unsigned quirks) +static bool +pan_is_format_native(const struct util_format_description *desc, + bool broken_ld_special, bool is_store) { - /* Pure integers can be loaded via EXT_framebuffer_fetch and should be - * handled as a raw load with a size conversion (it's cheap). Likewise, - * since float framebuffers are internally implemented as raw (i.e. - * integer) framebuffers with blend shaders to go back and forth, they - * should be s/w as well */ - - if (util_format_is_pure_integer(desc->format) || util_format_is_float(desc->format)) - return PAN_FORMAT_SOFTWARE; - - /* Check if we can do anything better than software architecturally */ - if (quirks & MIDGARD_NO_TYPED_BLEND_LOADS) { - return (quirks & NO_BLEND_PACKS) - ? PAN_FORMAT_SOFTWARE : PAN_FORMAT_PACK; - } - - /* Some formats are missing as typed on some GPUs but have unpacks */ - if (quirks & MIDGARD_MISSING_LOADS) { - switch (desc->format) { - case PIPE_FORMAT_R11G11B10_FLOAT: - return PAN_FORMAT_PACK; - default: - return PAN_FORMAT_NATIVE; - } - } - - /* Otherwise, we can do native */ - return PAN_FORMAT_NATIVE; -} + if (is_store || broken_ld_special) + return false; -static enum pan_format_class -pan_format_class_store(const struct util_format_description *desc, unsigned quirks) -{ - /* Check if we can do anything better than software architecturally */ - if (quirks & MIDGARD_NO_TYPED_BLEND_STORES) { - return (quirks & NO_BLEND_PACKS) - ? PAN_FORMAT_SOFTWARE : PAN_FORMAT_PACK; - } + if (util_format_is_pure_integer(desc->format) || + util_format_is_float(desc->format)) + return false; - return PAN_FORMAT_NATIVE; -} + /* Some formats are missing as typed but have unpacks */ + if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT) + return false; -/* Convenience method */ + if (desc->is_array) { + int c = util_format_get_first_non_void_channel(desc->format); + assert(c >= 0); + if (desc->channel[c].size > 8) + return false; + } -static enum pan_format_class -pan_format_class(const struct util_format_description *desc, unsigned quirks, bool is_store) -{ - if (is_store) - return pan_format_class_store(desc, quirks); - else - return pan_format_class_load(desc, quirks); + return true; } /* Software packs/unpacks, by format class. Packs take in the pixel value typed @@ -147,430 +114,523 @@ pan_format_class(const struct util_format_description *desc, unsigned quirks, bo * suitable for storing (with components replicated to fill). Unpacks do the * reverse but cannot rely on replication. 
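
A minimal scalar model of the pack-side replication convention (it mirrors the pan_replicate() helper below, with no NIR types involved):

#include <stdint.h>

/* Fill a 4-wide output by repeating the source components in order,
 * e.g. a 2-component source (a, b) becomes (a, b, a, b). */
static inline void
replicate4(uint32_t out[4], const uint32_t *in, unsigned num_components)
{
   for (unsigned i = 0; i < 4; ++i)
      out[i] = in[i % num_components];
}
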
*/ -static nir_ssa_def * -pan_replicate(nir_builder *b, nir_ssa_def *v, unsigned num_components) +static nir_def * +pan_replicate(nir_builder *b, nir_def *v, unsigned num_components) { - nir_ssa_def *replicated[4]; - - for (unsigned i = 0; i < 4; ++i) - replicated[i] = nir_channel(b, v, i % num_components); + nir_def *replicated[4]; - return nir_vec(b, replicated, 4); -} + for (unsigned i = 0; i < 4; ++i) + replicated[i] = nir_channel(b, v, i % num_components); -static nir_ssa_def * -pan_unpack_pure_32(nir_builder *b, nir_ssa_def *pack, unsigned num_components) -{ - return nir_channels(b, pack, (1 << num_components) - 1); + return nir_vec(b, replicated, 4); } /* Pure x16 formats are x16 unpacked, so it's similar, but we need to pack * upper/lower halves of course */ -static nir_ssa_def * -pan_pack_pure_16(nir_builder *b, nir_ssa_def *v, unsigned num_components) +static nir_def * +pan_pack_pure_16(nir_builder *b, nir_def *v, unsigned num_components) { - nir_ssa_def *v4 = pan_replicate(b, v, num_components); + nir_def *v4 = pan_replicate(b, v, num_components); - nir_ssa_def *lo = nir_pack_32_2x16(b, nir_channels(b, v4, 0x3 << 0)); - nir_ssa_def *hi = nir_pack_32_2x16(b, nir_channels(b, v4, 0x3 << 2)); + nir_def *lo = nir_pack_32_2x16(b, nir_channels(b, v4, 0x3 << 0)); + nir_def *hi = nir_pack_32_2x16(b, nir_channels(b, v4, 0x3 << 2)); - return nir_vec4(b, lo, hi, lo, hi); + return nir_vec4(b, lo, hi, lo, hi); } -static nir_ssa_def * -pan_unpack_pure_16(nir_builder *b, nir_ssa_def *pack, unsigned num_components) +static nir_def * +pan_unpack_pure_16(nir_builder *b, nir_def *pack, unsigned num_components) { - nir_ssa_def *unpacked[4]; + nir_def *unpacked[4]; + + assert(num_components <= 4); + + for (unsigned i = 0; i < num_components; i += 2) { + nir_def *halves = nir_unpack_32_2x16(b, nir_channel(b, pack, i >> 1)); - assert(num_components <= 4); + unpacked[i + 0] = nir_channel(b, halves, 0); + unpacked[i + 1] = nir_channel(b, halves, 1); + } - for (unsigned i = 0; i < num_components; i += 2) { - nir_ssa_def *halves = - nir_unpack_32_2x16(b, nir_channel(b, pack, i >> 1)); + return nir_pad_vec4(b, nir_vec(b, unpacked, num_components)); +} + +static nir_def * +pan_pack_reorder(nir_builder *b, const struct util_format_description *desc, + nir_def *v) +{ + unsigned swizzle[4] = {0, 1, 2, 3}; - unpacked[i + 0] = nir_channel(b, halves, 0); - unpacked[i + 1] = nir_channel(b, halves, 1); - } + for (unsigned i = 0; i < v->num_components; i++) { + if (desc->swizzle[i] <= PIPE_SWIZZLE_W) + swizzle[i] = desc->swizzle[i]; + } - return nir_pad_vec4(b, nir_vec(b, unpacked, num_components)); + return nir_swizzle(b, v, swizzle, v->num_components); } -static nir_ssa_def * -pan_replicate_4(nir_builder *b, nir_ssa_def *v) +static nir_def * +pan_unpack_reorder(nir_builder *b, const struct util_format_description *desc, + nir_def *v) { - return nir_vec4(b, v, v, v, v); + unsigned swizzle[4] = {0, 1, 2, 3}; + + for (unsigned i = 0; i < v->num_components; i++) { + if (desc->swizzle[i] <= PIPE_SWIZZLE_W) + swizzle[desc->swizzle[i]] = i; + } + + return nir_swizzle(b, v, swizzle, v->num_components); } -static nir_ssa_def * -pan_pack_pure_8(nir_builder *b, nir_ssa_def *v, unsigned num_components) +static nir_def * +pan_pack_pure_8(nir_builder *b, nir_def *v, unsigned num_components) { - return pan_replicate_4(b, nir_pack_32_4x8(b, pan_replicate(b, v, num_components))); + return nir_replicate( + b, nir_pack_32_4x8(b, pan_replicate(b, v, num_components)), 4); } -static nir_ssa_def * -pan_unpack_pure_8(nir_builder *b, 
nir_ssa_def *pack, unsigned num_components) +static nir_def * +pan_unpack_pure_8(nir_builder *b, nir_def *pack, unsigned num_components) { - nir_ssa_def *unpacked = nir_unpack_32_4x8(b, nir_channel(b, pack, 0)); - return nir_channels(b, unpacked, (1 << num_components) - 1); + nir_def *unpacked = nir_unpack_32_4x8(b, nir_channel(b, pack, 0)); + return nir_trim_vector(b, unpacked, num_components); } -/* For <= 8-bits per channel, UNORM formats are packed like UNORM 8, with - * zeroes spacing out each component as needed */ +static nir_def * +pan_fsat(nir_builder *b, nir_def *v, bool is_signed) +{ + if (is_signed) + return nir_fsat_signed_mali(b, v); + else + return nir_fsat(b, v); +} -static nir_ssa_def * -pan_pack_unorm(nir_builder *b, nir_ssa_def *v, - unsigned x, unsigned y, unsigned z, unsigned w) +static float +norm_scale(bool snorm, unsigned bits) { - /* If a channel has N bits, 1.0 is encoded as 2^N - 1 */ - nir_ssa_def *scales = nir_imm_vec4_16(b, - (1 << x) - 1, (1 << y) - 1, - (1 << z) - 1, (1 << w) - 1); + if (snorm) + return (1 << (bits - 1)) - 1; + else + return (1 << bits) - 1; +} - /* If a channel has N bits, we pad out to the byte by (8 - N) bits */ - nir_ssa_def *shifts = nir_imm_ivec4(b, 8 - x, 8 - y, 8 - z, 8 - w); +/* For <= 8-bits per channel, [U,S]NORM formats are packed like [U,S]NORM 8, + * with zeroes spacing out each component as needed */ - nir_ssa_def *f = nir_fmul(b, nir_fsat(b, nir_pad_vec4(b, v)), scales); - nir_ssa_def *u8 = nir_f2u8(b, nir_fround_even(b, f)); - nir_ssa_def *s = nir_ishl(b, u8, shifts); - nir_ssa_def *repl = nir_pack_32_4x8(b, s); +static nir_def * +pan_pack_norm(nir_builder *b, nir_def *v, unsigned x, unsigned y, unsigned z, + unsigned w, bool is_signed) +{ + /* If a channel has N bits, 1.0 is encoded as 2^N - 1 for UNORMs and + * 2^(N-1) - 1 for SNORMs */ + nir_def *scales = + is_signed ? nir_imm_vec4_16(b, (1 << (x - 1)) - 1, (1 << (y - 1)) - 1, + (1 << (z - 1)) - 1, (1 << (w - 1)) - 1) + : nir_imm_vec4_16(b, (1 << x) - 1, (1 << y) - 1, (1 << z) - 1, + (1 << w) - 1); + + /* If a channel has N bits, we pad out to the byte by (8 - N) bits */ + nir_def *shifts = nir_imm_ivec4(b, 8 - x, 8 - y, 8 - z, 8 - w); + nir_def *clamped = pan_fsat(b, nir_pad_vec4(b, v), is_signed); + + nir_def *f = nir_fmul(b, clamped, scales); + nir_def *u8 = nir_f2u8(b, nir_fround_even(b, f)); + nir_def *s = nir_ishl(b, u8, shifts); + nir_def *repl = nir_pack_32_4x8(b, s); + + return nir_replicate(b, repl, 4); +} - return pan_replicate_4(b, repl); +static nir_def * +pan_pack_unorm(nir_builder *b, nir_def *v, unsigned x, unsigned y, unsigned z, + unsigned w) +{ + return pan_pack_norm(b, v, x, y, z, w, false); } /* RGB10_A2 is packed in the tilebuffer as the bottom 3 bytes being the top * 8-bits of RGB and the top byte being RGBA as 2-bits packed. 
As imirkin * pointed out, this means free conversion to RGBX8 */ -static nir_ssa_def * -pan_pack_unorm_1010102(nir_builder *b, nir_ssa_def *v) +static nir_def * +pan_pack_unorm_1010102(nir_builder *b, nir_def *v) { - nir_ssa_def *scale = nir_imm_vec4(b, 1023.0, 1023.0, 1023.0, 3.0); - nir_ssa_def *s = nir_f2u32(b, nir_fround_even(b, nir_fmul(b, nir_fsat(b, v), scale))); + nir_def *scale = nir_imm_vec4(b, 1023.0, 1023.0, 1023.0, 3.0); + nir_def *s = + nir_f2u32(b, nir_fround_even(b, nir_fmul(b, nir_fsat(b, v), scale))); - nir_ssa_def *top8 = nir_ushr(b, s, nir_imm_ivec4(b, 0x2, 0x2, 0x2, 0x2)); - nir_ssa_def *top8_rgb = nir_pack_32_4x8(b, nir_u2u8(b, top8)); + nir_def *top8 = nir_ushr(b, s, nir_imm_ivec4(b, 0x2, 0x2, 0x2, 0x2)); + nir_def *top8_rgb = nir_pack_32_4x8(b, nir_u2u8(b, top8)); - nir_ssa_def *bottom2 = nir_iand(b, s, nir_imm_ivec4(b, 0x3, 0x3, 0x3, 0x3)); + nir_def *bottom2 = nir_iand(b, s, nir_imm_ivec4(b, 0x3, 0x3, 0x3, 0x3)); - nir_ssa_def *top = - nir_ior(b, - nir_ior(b, - nir_ishl(b, nir_channel(b, bottom2, 0), nir_imm_int(b, 24 + 0)), - nir_ishl(b, nir_channel(b, bottom2, 1), nir_imm_int(b, 24 + 2))), - nir_ior(b, - nir_ishl(b, nir_channel(b, bottom2, 2), nir_imm_int(b, 24 + 4)), - nir_ishl(b, nir_channel(b, bottom2, 3), nir_imm_int(b, 24 + 6)))); + nir_def *top = + nir_ior(b, + nir_ior(b, nir_ishl_imm(b, nir_channel(b, bottom2, 0), 24 + 0), + nir_ishl_imm(b, nir_channel(b, bottom2, 1), 24 + 2)), + nir_ior(b, nir_ishl_imm(b, nir_channel(b, bottom2, 2), 24 + 4), + nir_ishl_imm(b, nir_channel(b, bottom2, 3), 24 + 6))); - nir_ssa_def *p = nir_ior(b, top, top8_rgb); - return pan_replicate_4(b, p); + nir_def *p = nir_ior(b, top, top8_rgb); + return nir_replicate(b, p, 4); } /* On the other hand, the pure int RGB10_A2 is identical to the spec */ -static nir_ssa_def * -pan_pack_uint_1010102(nir_builder *b, nir_ssa_def *v) +static nir_def * +pan_pack_int_1010102(nir_builder *b, nir_def *v, bool is_signed) { - nir_ssa_def *shift = nir_ishl(b, nir_u2u32(b, v), - nir_imm_ivec4(b, 0, 10, 20, 30)); + v = nir_u2u32(b, v); + + /* Clamp the values */ + if (is_signed) { + v = nir_imin(b, v, nir_imm_ivec4(b, 511, 511, 511, 1)); + v = nir_imax(b, v, nir_imm_ivec4(b, -512, -512, -512, -2)); + } else { + v = nir_umin(b, v, nir_imm_ivec4(b, 1023, 1023, 1023, 3)); + } - nir_ssa_def *p = nir_ior(b, - nir_ior(b, nir_channel(b, shift, 0), nir_channel(b, shift, 1)), - nir_ior(b, nir_channel(b, shift, 2), nir_channel(b, shift, 3))); + v = nir_ishl(b, v, nir_imm_ivec4(b, 0, 10, 20, 30)); + v = nir_ior(b, nir_ior(b, nir_channel(b, v, 0), nir_channel(b, v, 1)), + nir_ior(b, nir_channel(b, v, 2), nir_channel(b, v, 3))); - return pan_replicate_4(b, p); + return nir_replicate(b, v, 4); } -static nir_ssa_def * -pan_unpack_uint_1010102(nir_builder *b, nir_ssa_def *packed) +static nir_def * +pan_unpack_int_1010102(nir_builder *b, nir_def *packed, bool is_signed) { - nir_ssa_def *chan = nir_channel(b, packed, 0); + nir_def *v = nir_replicate(b, nir_channel(b, packed, 0), 4); - nir_ssa_def *shift = nir_ushr(b, pan_replicate_4(b, chan), - nir_imm_ivec4(b, 0, 10, 20, 30)); + /* Left shift all components so the sign bit is on the MSB, and + * can be extended by ishr(). The ishl()+[u,i]shr() combination + * sets all unused bits to 0 without requiring a mask. 
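
A scalar model of that shift pair, worked for the green channel (bits 19:10) of a packed RGB10_A2 word; plain C, nothing Mali-specific:

#include <stdint.h>

static inline int32_t
unpack_g10_sint(uint32_t w)
{
   /* bits 19:10 -> bits 31:22, then the arithmetic shift sign-extends */
   return (int32_t)(w << 12) >> 22;
}

static inline uint32_t
unpack_g10_uint(uint32_t w)
{
   /* same shifts, but the logical shift zero-extends instead */
   return (w << 12) >> 22;
}
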
+ */ + v = nir_ishl(b, v, nir_imm_ivec4(b, 22, 12, 2, 0)); - nir_ssa_def *mask = nir_iand(b, shift, - nir_imm_ivec4(b, 0x3ff, 0x3ff, 0x3ff, 0x3)); + if (is_signed) + v = nir_ishr(b, v, nir_imm_ivec4(b, 22, 22, 22, 30)); + else + v = nir_ushr(b, v, nir_imm_ivec4(b, 22, 22, 22, 30)); - return nir_i2i16(b, mask); + return nir_i2i16(b, v); } /* NIR means we can *finally* catch a break */ -static nir_ssa_def * -pan_pack_r11g11b10(nir_builder *b, nir_ssa_def *v) +static nir_def * +pan_pack_r11g11b10(nir_builder *b, nir_def *v) { - return pan_replicate_4(b, nir_format_pack_11f11f10f(b, - nir_f2f32(b, v))); + return nir_replicate(b, nir_format_pack_11f11f10f(b, nir_f2f32(b, v)), 4); } -static nir_ssa_def * -pan_unpack_r11g11b10(nir_builder *b, nir_ssa_def *v) +static nir_def * +pan_unpack_r11g11b10(nir_builder *b, nir_def *v) { - nir_ssa_def *f32 = nir_format_unpack_11f11f10f(b, nir_channel(b, v, 0)); - nir_ssa_def *f16 = nir_f2fmp(b, f32); - - /* Extend to vec4 with alpha */ - nir_ssa_def *components[4] = { - nir_channel(b, f16, 0), - nir_channel(b, f16, 1), - nir_channel(b, f16, 2), - nir_imm_float16(b, 1.0) - }; - - return nir_vec(b, components, 4); + nir_def *f32 = nir_format_unpack_11f11f10f(b, nir_channel(b, v, 0)); + nir_def *f16 = nir_f2fmp(b, f32); + + /* Extend to vec4 with alpha */ + nir_def *components[4] = {nir_channel(b, f16, 0), nir_channel(b, f16, 1), + nir_channel(b, f16, 2), nir_imm_float16(b, 1.0)}; + + return nir_vec(b, components, 4); } /* Wrapper around sRGB conversion */ -static nir_ssa_def * -pan_linear_to_srgb(nir_builder *b, nir_ssa_def *linear) +static nir_def * +pan_linear_to_srgb(nir_builder *b, nir_def *linear) { - nir_ssa_def *rgb = nir_channels(b, linear, 0x7); + nir_def *rgb = nir_trim_vector(b, linear, 3); - /* TODO: fp16 native conversion */ - nir_ssa_def *srgb = nir_f2fmp(b, - nir_format_linear_to_srgb(b, nir_f2f32(b, rgb))); + /* TODO: fp16 native conversion */ + nir_def *srgb = + nir_f2fmp(b, nir_format_linear_to_srgb(b, nir_f2f32(b, rgb))); - nir_ssa_def *comp[4] = { - nir_channel(b, srgb, 0), - nir_channel(b, srgb, 1), - nir_channel(b, srgb, 2), - nir_channel(b, linear, 3), - }; + nir_def *comp[4] = { + nir_channel(b, srgb, 0), + nir_channel(b, srgb, 1), + nir_channel(b, srgb, 2), + nir_channel(b, linear, 3), + }; - return nir_vec(b, comp, 4); + return nir_vec(b, comp, 4); +} + +static nir_def * +pan_unpack_pure(nir_builder *b, nir_def *packed, unsigned size, unsigned nr) +{ + switch (size) { + case 32: + return nir_trim_vector(b, packed, nr); + case 16: + return pan_unpack_pure_16(b, packed, nr); + case 8: + return pan_unpack_pure_8(b, packed, nr); + default: + unreachable("Unrenderable size"); + } } /* Generic dispatches for un/pack regardless of format */ -static nir_ssa_def * -pan_unpack(nir_builder *b, - const struct util_format_description *desc, - nir_ssa_def *packed) +static nir_def * +pan_unpack(nir_builder *b, const struct util_format_description *desc, + nir_def *packed) { - if (desc->is_array) { - int c = util_format_get_first_non_void_channel(desc->format); - assert(c >= 0); - struct util_format_channel_description d = desc->channel[c]; - - if (d.size == 32 || d.size == 16) { - assert(!d.normalized); - assert(d.type == UTIL_FORMAT_TYPE_FLOAT || d.pure_integer); - - return d.size == 32 ? 
pan_unpack_pure_32(b, packed, desc->nr_channels) : - pan_unpack_pure_16(b, packed, desc->nr_channels); - } else if (d.size == 8) { - assert(d.pure_integer); - return pan_unpack_pure_8(b, packed, desc->nr_channels); - } else { - unreachable("Unrenderable size"); - } - } - - switch (desc->format) { - case PIPE_FORMAT_R10G10B10A2_UINT: - return pan_unpack_uint_1010102(b, packed); - case PIPE_FORMAT_R11G11B10_FLOAT: - return pan_unpack_r11g11b10(b, packed); - default: - break; - } - - fprintf(stderr, "%s\n", desc->name); - unreachable("Unknown format"); + if (desc->is_array) { + int c = util_format_get_first_non_void_channel(desc->format); + assert(c >= 0); + struct util_format_channel_description d = desc->channel[c]; + nir_def *unpacked = pan_unpack_pure(b, packed, d.size, desc->nr_channels); + + /* Normalized formats are unpacked as integers. We need to + * convert to float for the final result. + */ + if (d.normalized) { + bool snorm = desc->is_snorm; + unsigned float_sz = (d.size <= 8 ? 16 : 32); + float multiplier = norm_scale(snorm, d.size); + + nir_def *as_float = snorm ? nir_i2fN(b, unpacked, float_sz) + : nir_u2fN(b, unpacked, float_sz); + + return nir_fmul_imm(b, as_float, 1.0 / multiplier); + } else { + return unpacked; + } + } + + switch (desc->format) { + case PIPE_FORMAT_R10G10B10A2_UINT: + case PIPE_FORMAT_B10G10R10A2_UINT: + return pan_unpack_int_1010102(b, packed, false); + case PIPE_FORMAT_R10G10B10A2_SINT: + case PIPE_FORMAT_B10G10R10A2_SINT: + return pan_unpack_int_1010102(b, packed, true); + case PIPE_FORMAT_R11G11B10_FLOAT: + return pan_unpack_r11g11b10(b, packed); + default: + break; + } + + fprintf(stderr, "%s\n", desc->name); + unreachable("Unknown format"); } -static nir_ssa_def * -pan_pack(nir_builder *b, - const struct util_format_description *desc, - nir_ssa_def *unpacked) +static nir_def *pan_pack(nir_builder *b, + const struct util_format_description *desc, + nir_def * unpacked) { - if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) - unpacked = pan_linear_to_srgb(b, unpacked); - - if (util_format_is_unorm8(desc)) - return pan_pack_unorm(b, unpacked, 8, 8, 8, 8); - - if (desc->is_array) { - int c = util_format_get_first_non_void_channel(desc->format); - assert(c >= 0); - struct util_format_channel_description d = desc->channel[c]; - - if (d.size == 32 || d.size == 16) { - assert(!d.normalized); - assert(d.type == UTIL_FORMAT_TYPE_FLOAT || d.pure_integer); - - return d.size == 32 ? 
- pan_replicate(b, unpacked, desc->nr_channels) : - pan_pack_pure_16(b, unpacked, desc->nr_channels); - } else if (d.size == 8) { - assert(d.pure_integer); - return pan_pack_pure_8(b, unpacked, desc->nr_channels); - } else { - unreachable("Unrenderable size"); - } - } - - switch (desc->format) { - case PIPE_FORMAT_B4G4R4A4_UNORM: - case PIPE_FORMAT_B4G4R4X4_UNORM: - case PIPE_FORMAT_A4R4_UNORM: - case PIPE_FORMAT_R4A4_UNORM: - case PIPE_FORMAT_A4B4G4R4_UNORM: - case PIPE_FORMAT_R4G4B4A4_UNORM: - return pan_pack_unorm(b, unpacked, 4, 4, 4, 4); - case PIPE_FORMAT_B5G5R5A1_UNORM: - case PIPE_FORMAT_R5G5B5A1_UNORM: - return pan_pack_unorm(b, unpacked, 5, 6, 5, 1); - case PIPE_FORMAT_R5G6B5_UNORM: - case PIPE_FORMAT_B5G6R5_UNORM: - return pan_pack_unorm(b, unpacked, 5, 6, 5, 0); - case PIPE_FORMAT_R10G10B10A2_UNORM: - case PIPE_FORMAT_B10G10R10A2_UNORM: - return pan_pack_unorm_1010102(b, unpacked); - case PIPE_FORMAT_R10G10B10A2_UINT: - return pan_pack_uint_1010102(b, unpacked); - case PIPE_FORMAT_R11G11B10_FLOAT: - return pan_pack_r11g11b10(b, unpacked); - default: - break; - } - - fprintf(stderr, "%s\n", desc->name); - unreachable("Unknown format"); + if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) + unpacked = pan_linear_to_srgb(b, unpacked); + + if (desc->is_array) { + int c = util_format_get_first_non_void_channel(desc->format); + assert(c >= 0); + struct util_format_channel_description d = desc->channel[c]; + + /* Pure formats are packed as-is */ + nir_def *raw = unpacked; + + /* Normalized formats get normalized first */ + if (d.normalized) { + bool snorm = desc->is_snorm; + float multiplier = norm_scale(snorm, d.size); + nir_def *clamped = pan_fsat(b, unpacked, snorm); + nir_def *normed = nir_fmul_imm(b, clamped, multiplier); + + raw = nir_f2uN(b, normed, d.size); + } + + /* Pack the raw format */ + switch (d.size) { + case 32: + return pan_replicate(b, raw, desc->nr_channels); + case 16: + return pan_pack_pure_16(b, raw, desc->nr_channels); + case 8: + return pan_pack_pure_8(b, raw, desc->nr_channels); + default: + unreachable("Unrenderable size"); + } + } + + switch (desc->format) { + case PIPE_FORMAT_B4G4R4A4_UNORM: + case PIPE_FORMAT_B4G4R4X4_UNORM: + case PIPE_FORMAT_A4R4_UNORM: + case PIPE_FORMAT_R4A4_UNORM: + case PIPE_FORMAT_A4B4G4R4_UNORM: + case PIPE_FORMAT_R4G4B4A4_UNORM: + return pan_pack_unorm(b, unpacked, 4, 4, 4, 4); + case PIPE_FORMAT_B5G5R5A1_UNORM: + case PIPE_FORMAT_R5G5B5A1_UNORM: + return pan_pack_unorm(b, unpacked, 5, 6, 5, 1); + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_B5G6R5_UNORM: + return pan_pack_unorm(b, unpacked, 5, 6, 5, 0); + case PIPE_FORMAT_R10G10B10A2_UNORM: + case PIPE_FORMAT_B10G10R10A2_UNORM: + return pan_pack_unorm_1010102(b, unpacked); + case PIPE_FORMAT_R10G10B10A2_UINT: + case PIPE_FORMAT_B10G10R10A2_UINT: + return pan_pack_int_1010102(b, unpacked, false); + case PIPE_FORMAT_R10G10B10A2_SINT: + case PIPE_FORMAT_B10G10R10A2_SINT: + return pan_pack_int_1010102(b, unpacked, true); + case PIPE_FORMAT_R11G11B10_FLOAT: + return pan_pack_r11g11b10(b, unpacked); + default: + break; + } + + fprintf(stderr, "%s\n", desc->name); + unreachable("Unknown format"); } static void -pan_lower_fb_store(nir_shader *shader, - nir_builder *b, - nir_intrinsic_instr *intr, - const struct util_format_description *desc, - unsigned quirks) +pan_lower_fb_store(nir_builder *b, nir_intrinsic_instr *intr, + const struct util_format_description *desc, + bool reorder_comps, unsigned nr_samples) { - /* For stores, add conversion before */ - nir_ssa_def *unpacked = 
nir_ssa_for_src(b, intr->src[1], 4); - nir_ssa_def *packed = pan_pack(b, desc, unpacked); + /* For stores, add conversion before */ + nir_def *unpacked = intr->src[0].ssa; + unpacked = nir_pad_vec4(b, unpacked); - nir_store_raw_output_pan(b, packed); + /* Re-order the components */ + if (reorder_comps) + unpacked = pan_pack_reorder(b, desc, unpacked); + + nir_def *packed = pan_pack(b, desc, unpacked); + + /* We have to split writeout in 128 bit chunks */ + unsigned iterations = DIV_ROUND_UP(desc->block.bits * nr_samples, 128); + + for (unsigned s = 0; s < iterations; ++s) { + nir_store_raw_output_pan(b, packed, + .io_semantics = nir_intrinsic_io_semantics(intr), + .base = s); + } } -static nir_ssa_def * +static nir_def * pan_sample_id(nir_builder *b, int sample) { - return (sample >= 0) ? nir_imm_int(b, sample) : nir_load_sample_id(b); + return (sample >= 0) ? nir_imm_int(b, sample) : nir_load_sample_id(b); } static void -pan_lower_fb_load(nir_shader *shader, - nir_builder *b, - nir_intrinsic_instr *intr, - const struct util_format_description *desc, - unsigned base, int sample, unsigned quirks) +pan_lower_fb_load(nir_builder *b, nir_intrinsic_instr *intr, + const struct util_format_description *desc, + bool reorder_comps, int sample) +{ + nir_def *packed = + nir_load_raw_output_pan(b, 4, 32, pan_sample_id(b, sample), + .io_semantics = nir_intrinsic_io_semantics(intr)); + + /* Convert the raw value */ + nir_def *unpacked = pan_unpack(b, desc, packed); + + /* Convert to the size of the load intrinsic. + * + * We can assume that the type will match with the framebuffer format: + * + * Page 170 of the PDF of the OpenGL ES 3.0.6 spec says: + * + * If [UNORM or SNORM, convert to fixed-point]; otherwise no type + * conversion is applied. If the values written by the fragment shader + * do not match the format(s) of the corresponding color buffer(s), + * the result is undefined. + */ + + unsigned bits = intr->def.bit_size; + + nir_alu_type src_type = + nir_alu_type_get_base_type(pan_unpacked_type_for_format(desc)); + + unpacked = nir_convert_to_bit_size(b, unpacked, src_type, bits); + unpacked = nir_resize_vector(b, unpacked, intr->def.num_components); + + /* Reorder the components */ + if (reorder_comps) + unpacked = pan_unpack_reorder(b, desc, unpacked); + + nir_def_rewrite_uses_after(&intr->def, unpacked, &intr->instr); +} + +struct inputs { + const enum pipe_format *rt_fmts; + uint8_t raw_fmt_mask; + bool is_blend; + bool broken_ld_special; + unsigned nr_samples; +}; + +static bool +lower(nir_builder *b, nir_instr *instr, void *data) { - nir_ssa_def *packed = - nir_load_raw_output_pan(b, 4, 32, pan_sample_id(b, sample), - .base = base); - - /* Convert the raw value */ - nir_ssa_def *unpacked = pan_unpack(b, desc, packed); - - /* Convert to the size of the load intrinsic. - * - * We can assume that the type will match with the framebuffer format: - * - * Page 170 of the PDF of the OpenGL ES 3.0.6 spec says: - * - * If [UNORM or SNORM, convert to fixed-point]; otherwise no type - * conversion is applied. If the values written by the fragment shader - * do not match the format(s) of the corresponding color buffer(s), - * the result is undefined. 
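
A worked instance of the conversion step that follows (the RGBA16F choice is just an example; the calls are the ones this pass uses):

/* Example: RGBA16F render target, 32-bit load in the shader. The unpacked
 * value is a vec4 of f16 (pan_unpacked_type_for_format() -> nir_type_float16),
 * so it is widened to the intrinsic's bit size before rewriting uses: */
unpacked = nir_convert_to_bit_size(b, unpacked, nir_type_float, 32);
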
- */ - - unsigned bits = nir_dest_bit_size(intr->dest); - - nir_alu_type src_type = nir_alu_type_get_base_type( - pan_unpacked_type_for_format(desc)); - - unpacked = nir_convert_to_bit_size(b, unpacked, src_type, bits); - unpacked = nir_pad_vector(b, unpacked, nir_dest_num_components(intr->dest)); - - nir_ssa_def_rewrite_uses_after(&intr->dest.ssa, unpacked, &intr->instr); + struct inputs *inputs = data; + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + bool is_load = intr->intrinsic == nir_intrinsic_load_output; + bool is_store = intr->intrinsic == nir_intrinsic_store_output; + + if (!(is_load || (is_store && inputs->is_blend))) + return false; + + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + if (sem.location < FRAG_RESULT_DATA0) + return false; + + unsigned rt = sem.location - FRAG_RESULT_DATA0; + if (inputs->rt_fmts[rt] == PIPE_FORMAT_NONE) + return false; + + const struct util_format_description *desc = + util_format_description(inputs->rt_fmts[rt]); + + /* Don't lower */ + if (pan_is_format_native(desc, inputs->broken_ld_special, is_store)) + return false; + + /* EXT_shader_framebuffer_fetch requires per-sample loads. MSAA blend + * shaders are not yet handled, so for now always load sample 0. + */ + int sample = inputs->is_blend ? 0 : -1; + bool reorder_comps = inputs->raw_fmt_mask & BITFIELD_BIT(rt); + + if (is_store) { + b->cursor = nir_before_instr(instr); + pan_lower_fb_store(b, intr, desc, reorder_comps, inputs->nr_samples); + } else { + b->cursor = nir_after_instr(instr); + pan_lower_fb_load(b, intr, desc, reorder_comps, sample); + } + + nir_instr_remove(instr); + return true; } bool pan_lower_framebuffer(nir_shader *shader, const enum pipe_format *rt_fmts, - bool is_blend, unsigned quirks) + uint8_t raw_fmt_mask, unsigned blend_shader_nr_samples, + bool broken_ld_special) { - if (shader->info.stage != MESA_SHADER_FRAGMENT) - return false; - - bool progress = false; - - nir_foreach_function(func, shader) { - nir_foreach_block(block, func->impl) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - - bool is_load = intr->intrinsic == nir_intrinsic_load_deref; - bool is_store = intr->intrinsic == nir_intrinsic_store_deref; - - if (!(is_load || (is_store && is_blend))) - continue; - - nir_variable *var = nir_intrinsic_get_var(intr, 0); - - if (var->data.mode != nir_var_shader_out) - continue; - - if (var->data.location < FRAG_RESULT_DATA0) - continue; - - unsigned base = var->data.driver_location; - unsigned rt = var->data.location - FRAG_RESULT_DATA0; - - if (rt_fmts[rt] == PIPE_FORMAT_NONE) - continue; - - const struct util_format_description *desc = - util_format_description(rt_fmts[rt]); - - enum pan_format_class fmt_class = - pan_format_class(desc, quirks, is_store); - - /* Don't lower */ - if (fmt_class == PAN_FORMAT_NATIVE) - continue; - - /* EXT_shader_framebuffer_fetch requires - * per-sample loads. - * MSAA blend shaders are not yet handled, so - * for now always load sample 0. */ - int sample = is_blend ? 
0 : -1; - - nir_builder b; - nir_builder_init(&b, func->impl); - - if (is_store) { - b.cursor = nir_before_instr(instr); - pan_lower_fb_store(shader, &b, intr, desc, quirks); - } else { - b.cursor = nir_after_instr(instr); - pan_lower_fb_load(shader, &b, intr, desc, base, sample, quirks); - } - - nir_instr_remove(instr); - - progress = true; - } - } - - nir_metadata_preserve(func->impl, nir_metadata_block_index | - nir_metadata_dominance); - } - - return progress; + assert(shader->info.stage == MESA_SHADER_FRAGMENT); + + return nir_shader_instructions_pass( + shader, lower, nir_metadata_block_index | nir_metadata_dominance, + &(struct inputs){ + .rt_fmts = rt_fmts, + .raw_fmt_mask = raw_fmt_mask, + .nr_samples = blend_shader_nr_samples, + .is_blend = blend_shader_nr_samples > 0, + .broken_ld_special = broken_ld_special, + }); } diff --git a/src/panfrost/util/pan_lower_framebuffer.h b/src/panfrost/util/pan_lower_framebuffer.h index 5491cd346b1..bf6509175bb 100644 --- a/src/panfrost/util/pan_lower_framebuffer.h +++ b/src/panfrost/util/pan_lower_framebuffer.h @@ -30,18 +30,12 @@ #include "compiler/nir/nir.h" #include "util/format/u_format.h" -/* NATIVE formats can use a typed load/store. PACK formats cannot but can use a - * typed pack/unpack instruction. SOFTWARE formats are lowered */ - -enum pan_format_class { - PAN_FORMAT_NATIVE, - PAN_FORMAT_PACK, - PAN_FORMAT_SOFTWARE -}; - -nir_alu_type pan_unpacked_type_for_format(const struct util_format_description *desc); +nir_alu_type +pan_unpacked_type_for_format(const struct util_format_description *desc); bool pan_lower_framebuffer(nir_shader *shader, const enum pipe_format *rt_fmts, - bool is_blend, unsigned quirks); + uint8_t raw_fmt_mask, + unsigned blend_shader_nr_samples, + bool broken_ld_special); #endif diff --git a/src/panfrost/util/pan_lower_helper_invocation.c b/src/panfrost/util/pan_lower_helper_invocation.c index 23a37a15dd3..d4d1614f0f4 100644 --- a/src/panfrost/util/pan_lower_helper_invocation.c +++ b/src/panfrost/util/pan_lower_helper_invocation.c @@ -21,8 +21,8 @@ * SOFTWARE. */ -#include "pan_ir.h" #include "compiler/nir/nir_builder.h" +#include "pan_ir.h" /* Lower gl_HelperInvocation to (gl_SampleMaskIn == 0), this depends on * architectural details but is required for correct operation with @@ -30,29 +30,25 @@ * way to implement load_sample_id_no_per_sample. 
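
The whole lowering reduces to one equivalence, sketched here in plain C terms (sample_mask_in stands for the gl_SampleMaskIn value the pass reads):

/* Helper invocations never generate coverage, so their incoming sample
 * mask is empty; real invocations always cover at least one sample. */
bool helper_invocation = (sample_mask_in == 0);
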
*/ static bool -pan_lower_helper_invocation_instr(nir_builder *b, nir_instr *instr, void *data) +pan_lower_helper_invocation_instr(nir_builder *b, nir_intrinsic_instr *intr, + void *data) { - if (instr->type != nir_instr_type_intrinsic) - return false; - - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic != nir_intrinsic_load_helper_invocation) - return false; + if (intr->intrinsic != nir_intrinsic_load_helper_invocation) + return false; - b->cursor = nir_before_instr(instr); + b->cursor = nir_before_instr(&intr->instr); - nir_ssa_def *mask = nir_load_sample_mask_in(b); - nir_ssa_def *eq = nir_ieq(b, mask, nir_imm_int(b, 0)); - nir_ssa_def_rewrite_uses(&intr->dest.ssa, eq); + nir_def *mask = nir_load_sample_mask_in(b); + nir_def *eq = nir_ieq_imm(b, mask, 0); + nir_def_rewrite_uses(&intr->def, eq); - return true; + return true; } bool pan_lower_helper_invocation(nir_shader *shader) { - return nir_shader_instructions_pass(shader, - pan_lower_helper_invocation_instr, - nir_metadata_block_index | nir_metadata_dominance, - NULL); + return nir_shader_intrinsics_pass( + shader, pan_lower_helper_invocation_instr, + nir_metadata_block_index | nir_metadata_dominance, NULL); } diff --git a/src/panfrost/util/pan_lower_image_index.c b/src/panfrost/util/pan_lower_image_index.c new file mode 100644 index 00000000000..4f3655af6e4 --- /dev/null +++ b/src/panfrost/util/pan_lower_image_index.c @@ -0,0 +1,55 @@ +/* + * Copyright (C) 2024 Collabora, Ltd. + * SPDX-License-Identifier: MIT + */ + +#include "compiler/nir/nir_builder.h" +#include "pan_ir.h" + +/* Vertex shader gets passed image attribute descriptors through the + * vertex attribute descriptor array. This forces us to apply an offset + * to all image access to get the actual attribute offset. + * + * The gallium driver emits the vertex attributes on each draw, and puts + * image attributes right after the vertex attributes, which implies passing + * vs_img_attrib_offset = util_bitcount64(nir->info.inputs_read). + * + * The Vulkan driver, on the other hand, uses + * VkVertexInputAttributeDescription to build a table of attributes passed + * to the shader. While there's no reason for the app to define more + * attributes than it actually uses in the vertex shader, it doesn't seem + * to be disallowed either. Not to mention that vkCmdSetVertexInputEXT() + * allows one to dynamically change the vertex input configuration, and + * possibly pass more attributes than referenced by the vertex shader bound to + * the command buffer at draw time. Of course, we could carry this information + * at the pipeline level, and re-emit the attribute array, but emitting only + * when the vertex input configuration is flagged dirty is simpler. + * In order for this to work, we use a fixed image attribute offset. 
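
A worked example of the gallium case described above (the attribute names are illustrative only):

/* A vertex shader reading POSITION and TEXCOORD0 has two bits set in
 * nir->info.inputs_read, so: */
unsigned vs_img_attrib_offset = util_bitcount64(nir->info.inputs_read); /* 2 */
/* ...and the pass rewrites image index 0 -> attribute 2, 1 -> 3, etc. */
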
+ */ +static bool +lower_image_intr(struct nir_builder *b, nir_intrinsic_instr *intr, void *data) +{ + if (intr->intrinsic != nir_intrinsic_image_load && + intr->intrinsic != nir_intrinsic_image_store) + return false; + + unsigned img_attr_offset = *(unsigned *)data; + nir_def *index = intr->src[0].ssa; + + b->cursor = nir_before_instr(&intr->instr); + + index = nir_iadd_imm(b, index, img_attr_offset); + nir_src_rewrite(&intr->src[0], index); + return true; +} + +bool +pan_lower_image_index(nir_shader *shader, unsigned vs_img_attrib_offset) +{ + if (shader->info.stage != MESA_SHADER_VERTEX) + return false; + + return nir_shader_intrinsics_pass( + shader, lower_image_intr, + nir_metadata_block_index | nir_metadata_dominance, &vs_img_attrib_offset); +} diff --git a/src/panfrost/util/pan_lower_image_ms.c b/src/panfrost/util/pan_lower_image_ms.c new file mode 100644 index 00000000000..0296cf839cd --- /dev/null +++ b/src/panfrost/util/pan_lower_image_ms.c @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2024 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors (Collabora): + * Eric Smith <eric.smith@collabora.com> + */ + +#include "compiler/nir/nir.h" +#include "compiler/nir/nir_builder.h" +#include "pan_ir.h" + +static bool +nir_lower_image_ms(nir_builder *b, nir_intrinsic_instr *intr, + UNUSED void *data) +{ + switch (intr->intrinsic) { + case nir_intrinsic_image_load: + case nir_intrinsic_image_deref_load: + case nir_intrinsic_image_store: + case nir_intrinsic_image_deref_store: + case nir_intrinsic_image_texel_address: + break; + default: + return false; + } + + if (nir_intrinsic_image_dim(intr) != GLSL_SAMPLER_DIM_MS) + return false; + + b->cursor = nir_before_instr(&intr->instr); + + nir_def *coord = intr->src[1].ssa; + nir_def *index = intr->src[2].ssa; + + /* image2DMS is treated by panfrost as if it were a 3D image, so + * the sample index is in src[2]. 
We need to put this into the coordinates + * in the Z component + */ + nir_src_rewrite(&intr->src[1], + nir_vector_insert_imm(b, coord, + nir_channel(b, index, 0), + 2) ); + nir_intrinsic_set_image_dim(intr, GLSL_SAMPLER_DIM_3D); + return true; +} + +bool +pan_nir_lower_image_ms(nir_shader *shader) +{ + return nir_shader_intrinsics_pass( + shader, nir_lower_image_ms, + nir_metadata_block_index | nir_metadata_dominance, NULL); +} diff --git a/src/panfrost/util/pan_lower_sample_position.c b/src/panfrost/util/pan_lower_sample_position.c index 12a0c47bbff..1bf4efd3de7 100644 --- a/src/panfrost/util/pan_lower_sample_position.c +++ b/src/panfrost/util/pan_lower_sample_position.c @@ -21,8 +21,8 @@ * SOFTWARE. */ -#include "pan_ir.h" #include "compiler/nir/nir_builder.h" +#include "pan_ir.h" /* Sample positions are supplied in a packed 8:8 fixed-point vec2 format in GPU * memory indexed by the sample. We lower in NIR to take advantage of possible @@ -33,43 +33,38 @@ * it's a pretty trivial difference */ static bool -pan_lower_sample_pos_impl(struct nir_builder *b, - nir_instr *instr, UNUSED void *data) +pan_lower_sample_pos_impl(struct nir_builder *b, nir_intrinsic_instr *intr, + UNUSED void *data) { - if (instr->type != nir_instr_type_intrinsic) - return false; - - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic != nir_intrinsic_load_sample_pos) - return false; + if (intr->intrinsic != nir_intrinsic_load_sample_pos) + return false; - b->cursor = nir_before_instr(instr); + b->cursor = nir_before_instr(&intr->instr); - /* Elements are 4 bytes */ - nir_ssa_def *addr = nir_iadd(b, - nir_load_sample_positions_pan(b), - nir_u2u64(b, nir_imul_imm(b, nir_load_sample_id(b), 4))); + /* Elements are 4 bytes */ + nir_def *addr = + nir_iadd(b, nir_load_sample_positions_pan(b), + nir_u2u64(b, nir_imul_imm(b, nir_load_sample_id(b), 4))); - /* Decode 8:8 fixed-point */ - nir_ssa_def *raw = nir_load_global(b, addr, 2, 2, 16); - nir_ssa_def *decoded = nir_fmul_imm(b, nir_i2f16(b, raw), 1.0 / 256.0); + /* Decode 8:8 fixed-point */ + nir_def *raw = nir_load_global(b, addr, 2, 2, 16); + nir_def *decoded = nir_fmul_imm(b, nir_i2f16(b, raw), 1.0 / 256.0); - /* Make NIR validator happy */ - if (decoded->bit_size != nir_dest_bit_size(intr->dest)) - decoded = nir_f2fN(b, decoded, nir_dest_bit_size(intr->dest)); + /* Make NIR validator happy */ + if (decoded->bit_size != intr->def.bit_size) + decoded = nir_f2fN(b, decoded, intr->def.bit_size); - nir_ssa_def_rewrite_uses(&intr->dest.ssa, decoded); - return true; + nir_def_rewrite_uses(&intr->def, decoded); + return true; } bool pan_lower_sample_pos(nir_shader *shader) { - if (shader->info.stage != MESA_SHADER_FRAGMENT) - return false; + if (shader->info.stage != MESA_SHADER_FRAGMENT) + return false; - return nir_shader_instructions_pass(shader, - pan_lower_sample_pos_impl, - nir_metadata_block_index | nir_metadata_dominance, - NULL); + return nir_shader_intrinsics_pass( + shader, pan_lower_sample_pos_impl, + nir_metadata_block_index | nir_metadata_dominance, NULL); } diff --git a/src/panfrost/util/pan_lower_store_component.c b/src/panfrost/util/pan_lower_store_component.c new file mode 100644 index 00000000000..d591aa9234d --- /dev/null +++ b/src/panfrost/util/pan_lower_store_component.c @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2020-2022 Collabora Ltd. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors (Collabora): + * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com> + */ + +#include "compiler/nir/nir_builder.h" +#include "pan_ir.h" + +/* + * If the shader packs multiple varyings into the same location with different + * location_frac, we'll need to lower to a single varying store that collects + * all of the channels together. This is because the varying instruction on + * Midgard and Bifrost is slot-based, writing out an entire vec4 slot at a time. + */ +static bool +lower_store_component(nir_builder *b, nir_intrinsic_instr *intr, void *data) +{ + if (intr->intrinsic != nir_intrinsic_store_output) + return false; + + struct hash_table_u64 *slots = data; + unsigned component = nir_intrinsic_component(intr); + nir_src *slot_src = nir_get_io_offset_src(intr); + uint64_t slot = nir_src_as_uint(*slot_src) + nir_intrinsic_base(intr); + + nir_intrinsic_instr *prev = _mesa_hash_table_u64_search(slots, slot); + unsigned mask = (prev ? 
nir_intrinsic_write_mask(prev) : 0); + + nir_def *value = intr->src[0].ssa; + b->cursor = nir_before_instr(&intr->instr); + + nir_def *undef = nir_undef(b, 1, value->bit_size); + nir_def *channels[4] = {undef, undef, undef, undef}; + + /* Copy old */ + u_foreach_bit(i, mask) { + assert(prev != NULL); + nir_def *prev_ssa = prev->src[0].ssa; + channels[i] = nir_channel(b, prev_ssa, i); + } + + /* Copy new */ + unsigned new_mask = nir_intrinsic_write_mask(intr); + mask |= (new_mask << component); + + u_foreach_bit(i, new_mask) { + assert(component + i < 4); + channels[component + i] = nir_channel(b, value, i); + } + + intr->num_components = util_last_bit(mask); + nir_src_rewrite(&intr->src[0], nir_vec(b, channels, intr->num_components)); + + nir_intrinsic_set_component(intr, 0); + nir_intrinsic_set_write_mask(intr, mask); + + if (prev) { + _mesa_hash_table_u64_remove(slots, slot); + nir_instr_remove(&prev->instr); + } + + _mesa_hash_table_u64_insert(slots, slot, intr); + return false; +} + +bool +pan_nir_lower_store_component(nir_shader *s) +{ + assert(s->info.stage == MESA_SHADER_VERTEX); + + struct hash_table_u64 *stores = _mesa_hash_table_u64_create(NULL); + bool progress = nir_shader_intrinsics_pass( + s, lower_store_component, + nir_metadata_block_index | nir_metadata_dominance, stores); + _mesa_hash_table_u64_destroy(stores); + return progress; +} diff --git a/src/panfrost/util/pan_lower_writeout.c b/src/panfrost/util/pan_lower_writeout.c index 7c298208515..eb528ce3bf3 100644 --- a/src/panfrost/util/pan_lower_writeout.c +++ b/src/panfrost/util/pan_lower_writeout.c @@ -22,8 +22,8 @@ * SOFTWARE. */ -#include "pan_ir.h" #include "compiler/nir/nir_builder.h" +#include "pan_ir.h" /* Midgard can write all of color, depth and stencil in a single writeout * operation, so we merge depth/stencil stores with color stores. @@ -33,224 +33,158 @@ * +ZS_EMIT with respect to +ATEST and +BLEND, as well as combining * depth/stencil stores into a single +ZS_EMIT op. 
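
Schematically, the rewrite this file performs (intrinsic names as in the patch; operand layout per pan_nir_emit_combined_store() below):

/* before:                                after:
 *   store_output(color, RT0)               store_combined_output_pan(
 *   store_output(z, FRAG_RESULT_DEPTH)        color, color_src1,
 *   store_output(s, FRAG_RESULT_STENCIL)      z, s, dual_src_color)
 *                                          with component =
 *                                            PAN_WRITEOUT_C | _Z | _S */
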
*/ -bool -pan_nir_lower_zs_store(nir_shader *nir) -{ - if (nir->info.stage != MESA_SHADER_FRAGMENT) - return false; - - nir_variable *z_var = NULL, *s_var = NULL; - - nir_foreach_shader_out_variable(var, nir) { - if (var->data.location == FRAG_RESULT_DEPTH) - z_var = var; - else if (var->data.location == FRAG_RESULT_STENCIL) - s_var = var; - } - - if (!z_var && !s_var) - return false; - - bool progress = false; - - nir_foreach_function(function, nir) { - if (!function->impl) continue; - - nir_intrinsic_instr *z_store = NULL, *s_store = NULL; - - nir_foreach_block(block, function->impl) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic != nir_intrinsic_store_output) - continue; - - if (z_var && nir_intrinsic_base(intr) == z_var->data.driver_location) { - assert(!z_store); - z_store = intr; - } - - if (s_var && nir_intrinsic_base(intr) == s_var->data.driver_location) { - assert(!s_store); - s_store = intr; - } - } - } - - if (!z_store && !s_store) continue; - - bool replaced = false; - - nir_foreach_block(block, function->impl) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic != nir_intrinsic_store_output) - continue; - - const nir_variable *var = nir_find_variable_with_driver_location(nir, nir_var_shader_out, nir_intrinsic_base(intr)); - assert(var); - - if (var->data.location < FRAG_RESULT_DATA0) - continue; - if (var->data.index) - continue; - - assert(nir_src_is_const(intr->src[1]) && "no indirect outputs"); - - nir_builder b; - nir_builder_init(&b, function->impl); - - assert(!z_store || z_store->instr.block == instr->block); - assert(!s_store || s_store->instr.block == instr->block); - b.cursor = nir_after_block_before_jump(instr->block); - - nir_intrinsic_instr *combined_store; - combined_store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_combined_output_pan); - - combined_store->num_components = intr->src[0].ssa->num_components; - - nir_intrinsic_set_base(combined_store, nir_intrinsic_base(intr)); - nir_intrinsic_set_src_type(combined_store, nir_intrinsic_src_type(intr)); - - unsigned writeout = PAN_WRITEOUT_C; - if (z_store) - writeout |= PAN_WRITEOUT_Z; - if (s_store) - writeout |= PAN_WRITEOUT_S; - - nir_intrinsic_set_component(combined_store, writeout); +/* + * Get the type to report for a piece of a combined store, given the store it + * is combining from. If there is no store to render target #0, a dummy <0.0, + * 0.0, 0.0, 0.0> write is used, so report a matching float32 type. + */ +static nir_alu_type +pan_nir_rt_store_type(nir_intrinsic_instr *store) +{ + return store ? nir_intrinsic_src_type(store) : nir_type_float32; +} - struct nir_ssa_def *zero = nir_imm_int(&b, 0); +static void +pan_nir_emit_combined_store(nir_builder *b, nir_intrinsic_instr *rt0_store, + unsigned writeout, nir_intrinsic_instr **stores) +{ + nir_intrinsic_instr *intr = nir_intrinsic_instr_create( + b->shader, nir_intrinsic_store_combined_output_pan); - struct nir_ssa_def *src[4] = { - intr->src[0].ssa, - intr->src[1].ssa, - z_store ? z_store->src[0].ssa : zero, - s_store ? s_store->src[0].ssa : zero, - }; + intr->num_components = rt0_store ? 
rt0_store->src[0].ssa->num_components : 4; - for (int i = 0; i < 4; ++i) - combined_store->src[i] = nir_src_for_ssa(src[i]); + if (rt0_store) + nir_intrinsic_set_io_semantics(intr, + nir_intrinsic_io_semantics(rt0_store)); + nir_intrinsic_set_src_type(intr, pan_nir_rt_store_type(rt0_store)); + nir_intrinsic_set_dest_type(intr, pan_nir_rt_store_type(stores[2])); + nir_intrinsic_set_component(intr, writeout); - nir_builder_instr_insert(&b, &combined_store->instr); + nir_def *zero = nir_imm_int(b, 0); + nir_def *zero4 = nir_imm_ivec4(b, 0, 0, 0, 0); - nir_instr_remove(instr); + nir_def *src[] = { + rt0_store ? rt0_store->src[0].ssa : zero4, + rt0_store ? rt0_store->src[1].ssa : zero, + stores[0] ? stores[0]->src[0].ssa : zero, + stores[1] ? stores[1]->src[0].ssa : zero, + stores[2] ? stores[2]->src[0].ssa : zero4, + }; - replaced = true; - } - } + for (int i = 0; i < ARRAY_SIZE(src); ++i) + intr->src[i] = nir_src_for_ssa(src[i]); - /* Insert a store to the depth RT (0xff) if needed */ - if (!replaced) { - nir_builder b; - nir_builder_init(&b, function->impl); + nir_builder_instr_insert(b, &intr->instr); +} +bool +pan_nir_lower_zs_store(nir_shader *nir) +{ + bool progress = false; - nir_block *block = NULL; - if (z_store && s_store) - assert(z_store->instr.block == s_store->instr.block); + if (nir->info.stage != MESA_SHADER_FRAGMENT) + return false; - if (z_store) - block = z_store->instr.block; - else - block = s_store->instr.block; + nir_foreach_function_impl(impl, nir) { + nir_intrinsic_instr *stores[3] = {NULL}; + unsigned writeout = 0; - b.cursor = nir_after_block_before_jump(block); + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; - nir_intrinsic_instr *combined_store; - combined_store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_combined_output_pan); + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_store_output) + continue; - combined_store->num_components = 4; + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + if (sem.location == FRAG_RESULT_DEPTH) { + stores[0] = intr; + writeout |= PAN_WRITEOUT_Z; + } else if (sem.location == FRAG_RESULT_STENCIL) { + stores[1] = intr; + writeout |= PAN_WRITEOUT_S; + } else if (sem.dual_source_blend_index) { + assert(!stores[2]); /* there should be only 1 source for dual blending */ + stores[2] = intr; + writeout |= PAN_WRITEOUT_2; + } + } + } - unsigned base; - if (z_store) - base = nir_intrinsic_base(z_store); - else - base = nir_intrinsic_base(s_store); - nir_intrinsic_set_base(combined_store, base); - nir_intrinsic_set_src_type(combined_store, nir_type_float32); + if (!writeout) + continue; - unsigned writeout = 0; - if (z_store) - writeout |= PAN_WRITEOUT_Z; - if (s_store) - writeout |= PAN_WRITEOUT_S; + nir_block *common_block = NULL; - nir_intrinsic_set_component(combined_store, writeout); + /* Ensure all stores are in the same block */ + for (unsigned i = 0; i < ARRAY_SIZE(stores); ++i) { + if (!stores[i]) + continue; - struct nir_ssa_def *zero = nir_imm_int(&b, 0); + nir_block *block = stores[i]->instr.block; - struct nir_ssa_def *src[4] = { - nir_imm_vec4(&b, 0, 0, 0, 0), - zero, - z_store ? z_store->src[0].ssa : zero, - s_store ? 
s_store->src[0].ssa : zero, - }; + if (common_block) + assert(common_block == block); + else + common_block = block; + } - for (int i = 0; i < 4; ++i) - combined_store->src[i] = nir_src_for_ssa(src[i]); + bool replaced = false; - nir_builder_instr_insert(&b, &combined_store->instr); - } + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; - if (z_store) - nir_instr_remove(&z_store->instr); + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_store_output) + continue; - if (s_store) - nir_instr_remove(&s_store->instr); + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); - nir_metadata_preserve(function->impl, nir_metadata_block_index | nir_metadata_dominance); - progress = true; - } + if (sem.location < FRAG_RESULT_DATA0) + continue; - return progress; -} + if (sem.dual_source_blend_index) + continue; -/* Real writeout stores, which break execution, need to be moved to after - * dual-source stores, which are just standard register writes. */ -bool -pan_nir_reorder_writeout(nir_shader *nir) -{ - bool progress = false; + assert(nir_src_is_const(intr->src[1]) && "no indirect outputs"); - nir_foreach_function(function, nir) { - if (!function->impl) continue; + nir_builder b = + nir_builder_at(nir_after_block_before_jump(instr->block)); - nir_foreach_block(block, function->impl) { - nir_instr *last_writeout = NULL; + /* Trying to write depth twice results in the + * wrong blend shader being executed on + * Midgard */ + unsigned this_store = PAN_WRITEOUT_C | (replaced ? 0 : writeout); - nir_foreach_instr_reverse_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; + pan_nir_emit_combined_store(&b, intr, this_store, stores); - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic != nir_intrinsic_store_output) - continue; + nir_instr_remove(instr); - const nir_variable *var = nir_find_variable_with_driver_location(nir, nir_var_shader_out, nir_intrinsic_base(intr)); + replaced = true; + } + } - if (var->data.index) { - if (!last_writeout) - last_writeout = instr; - continue; - } + /* Insert a store to the depth RT (0xff) if needed */ + if (!replaced) { + nir_builder b = + nir_builder_at(nir_after_block_before_jump(common_block)); - if (!last_writeout) - continue; + pan_nir_emit_combined_store(&b, NULL, writeout, stores); + } - /* This is a real store, so move it to after dual-source stores */ - exec_node_remove(&instr->node); - exec_node_insert_after(&last_writeout->node, &instr->node); + for (unsigned i = 0; i < ARRAY_SIZE(stores); ++i) { + if (stores[i]) + nir_instr_remove(&stores[i]->instr); + } - progress = true; - } - } - } + nir_metadata_preserve(impl, + nir_metadata_block_index | nir_metadata_dominance); + progress = true; + } - return progress; + return progress; } diff --git a/src/panfrost/util/pan_lower_xfb.c b/src/panfrost/util/pan_lower_xfb.c new file mode 100644 index 00000000000..3fc96b2e380 --- /dev/null +++ b/src/panfrost/util/pan_lower_xfb.c @@ -0,0 +1,105 @@ +/* + * Copyright (C) 2022 Collabora Ltd. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "compiler/nir/nir_builder.h" +#include "pan_ir.h" + +static void +lower_xfb_output(nir_builder *b, nir_intrinsic_instr *intr, + unsigned start_component, unsigned num_components, + unsigned buffer, unsigned offset_words) +{ + assert(buffer < MAX_XFB_BUFFERS); + assert(nir_intrinsic_component(intr) == 0); // TODO + + /* Transform feedback info in units of words, convert to bytes. */ + uint16_t stride = b->shader->info.xfb_stride[buffer] * 4; + assert(stride != 0); + + uint16_t offset = offset_words * 4; + + nir_def *index = nir_iadd( + b, nir_imul(b, nir_load_instance_id(b), nir_load_num_vertices(b)), + nir_load_vertex_id_zero_base(b)); + + BITSET_SET(b->shader->info.system_values_read, + SYSTEM_VALUE_VERTEX_ID_ZERO_BASE); + BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID); + + nir_def *buf = nir_load_xfb_address(b, 64, .base = buffer); + nir_def *addr = nir_iadd( + b, buf, + nir_u2u64(b, nir_iadd_imm(b, nir_imul_imm(b, index, stride), offset))); + + nir_def *src = intr->src[0].ssa; + nir_def *value = + nir_channels(b, src, BITFIELD_MASK(num_components) << start_component); + nir_store_global(b, addr, 4, value, BITFIELD_MASK(num_components)); +} + +static bool +lower_xfb(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *data) +{ + /* In transform feedback programs, vertex ID becomes zero-based, so apply + * that lowering even on Valhall. + */ + if (intr->intrinsic == nir_intrinsic_load_vertex_id) { + b->cursor = nir_instr_remove(&intr->instr); + + nir_def *repl = + nir_iadd(b, nir_load_vertex_id_zero_base(b), nir_load_first_vertex(b)); + + nir_def_rewrite_uses(&intr->def, repl); + return true; + } + + if (intr->intrinsic != nir_intrinsic_store_output) + return false; + + bool progress = false; + + b->cursor = nir_before_instr(&intr->instr); + + for (unsigned i = 0; i < 2; ++i) { + nir_io_xfb xfb = + i ? 
nir_intrinsic_io_xfb2(intr) : nir_intrinsic_io_xfb(intr); + for (unsigned j = 0; j < 2; ++j) { + if (!xfb.out[j].num_components) + continue; + + lower_xfb_output(b, intr, i * 2 + j, xfb.out[j].num_components, + xfb.out[j].buffer, xfb.out[j].offset); + progress = true; + } + } + + nir_instr_remove(&intr->instr); + return progress; +} + +bool +pan_lower_xfb(nir_shader *nir) +{ + return nir_shader_intrinsics_pass( + nir, lower_xfb, nir_metadata_block_index | nir_metadata_dominance, NULL); +} diff --git a/src/panfrost/util/pan_sysval.c b/src/panfrost/util/pan_sysval.c deleted file mode 100644 index 80a509f5b8b..00000000000 --- a/src/panfrost/util/pan_sysval.c +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Copyright (C) 2020 Collabora Ltd. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors (Collabora): - * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com> - */ - -#include "pan_ir.h" -#include "compiler/nir/nir_builder.h" - -/* TODO: ssbo_size */ -static int -panfrost_sysval_for_ssbo(nir_intrinsic_instr *instr) -{ - nir_src index = instr->src[0]; - assert(nir_src_is_const(index)); - uint32_t uindex = nir_src_as_uint(index); - - return PAN_SYSVAL(SSBO, uindex); -} - -static int -panfrost_sysval_for_sampler(nir_intrinsic_instr *instr) -{ - /* TODO: indirect samplers !!! 
*/
-        nir_src index = instr->src[0];
-        assert(nir_src_is_const(index));
-        uint32_t uindex = nir_src_as_uint(index);
-
-        return PAN_SYSVAL(SAMPLER, uindex);
-}
-
-static int
-panfrost_sysval_for_image_size(nir_intrinsic_instr *instr)
-{
-        nir_src index = instr->src[0];
-        assert(nir_src_is_const(index));
-
-        bool is_array = nir_intrinsic_image_array(instr);
-        uint32_t uindex = nir_src_as_uint(index);
-        unsigned dim = nir_intrinsic_dest_components(instr) - is_array;
-
-        return PAN_SYSVAL(IMAGE_SIZE, PAN_TXS_SYSVAL_ID(uindex, dim, is_array));
-}
-
-static unsigned
-panfrost_nir_sysval_for_intrinsic(nir_intrinsic_instr *instr)
-{
-        switch (instr->intrinsic) {
-        case nir_intrinsic_load_viewport_scale:
-                return PAN_SYSVAL_VIEWPORT_SCALE;
-        case nir_intrinsic_load_viewport_offset:
-                return PAN_SYSVAL_VIEWPORT_OFFSET;
-        case nir_intrinsic_load_num_workgroups:
-                return PAN_SYSVAL_NUM_WORK_GROUPS;
-        case nir_intrinsic_load_workgroup_size:
-                return PAN_SYSVAL_LOCAL_GROUP_SIZE;
-        case nir_intrinsic_load_work_dim:
-                return PAN_SYSVAL_WORK_DIM;
-        case nir_intrinsic_load_sample_positions_pan:
-                return PAN_SYSVAL_SAMPLE_POSITIONS;
-        case nir_intrinsic_load_first_vertex:
-        case nir_intrinsic_load_base_vertex:
-        case nir_intrinsic_load_base_instance:
-                return PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS;
-        case nir_intrinsic_load_draw_id:
-                return PAN_SYSVAL_DRAWID;
-        case nir_intrinsic_load_ssbo_address:
-        case nir_intrinsic_get_ssbo_size:
-                return panfrost_sysval_for_ssbo(instr);
-        case nir_intrinsic_load_sampler_lod_parameters_pan:
-                return panfrost_sysval_for_sampler(instr);
-        case nir_intrinsic_image_size:
-                return panfrost_sysval_for_image_size(instr);
-        default:
-                return ~0;
-        }
-}
-
-int
-panfrost_sysval_for_instr(nir_instr *instr, nir_dest *dest)
-{
-        nir_intrinsic_instr *intr;
-        nir_dest *dst = NULL;
-        nir_tex_instr *tex;
-        unsigned sysval = ~0;
-
-        switch (instr->type) {
-        case nir_instr_type_intrinsic:
-                intr = nir_instr_as_intrinsic(instr);
-                sysval = panfrost_nir_sysval_for_intrinsic(intr);
-                dst = &intr->dest;
-                break;
-        case nir_instr_type_tex:
-                tex = nir_instr_as_tex(instr);
-                if (tex->op != nir_texop_txs)
-                        break;
-
-                sysval = PAN_SYSVAL(TEXTURE_SIZE,
-                                    PAN_TXS_SYSVAL_ID(tex->texture_index,
-                                                      nir_tex_instr_dest_size(tex) -
-                                                      (tex->is_array ? 1 : 0),
-                                                      tex->is_array));
-                dst = &tex->dest;
-                break;
-        default:
-                break;
-        }
-
-        if (dest && dst)
-                *dest = *dst;
-
-        return sysval;
-}
-
-unsigned
-pan_lookup_sysval(struct hash_table_u64 *sysval_to_id,
-                  struct panfrost_sysvals *sysvals,
-                  int sysval)
-{
-        /* Try to lookup */
-
-        void *cached = _mesa_hash_table_u64_search(sysval_to_id, sysval);
-
-        if (cached)
-                return ((uintptr_t) cached) - 1;
-
-        /* Else assign */
-
-        unsigned id = sysvals->sysval_count++;
-        assert(id < MAX_SYSVAL_COUNT);
-        _mesa_hash_table_u64_insert(sysval_to_id, sysval, (void *) ((uintptr_t) id + 1));
-        sysvals->sysvals[id] = sysval;
-
-        return id;
-}
-
-struct hash_table_u64 *
-panfrost_init_sysvals(struct panfrost_sysvals *sysvals, void *memctx)
-{
-        sysvals->sysval_count = 0;
-        return _mesa_hash_table_u64_create(memctx);
-}
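
For reference, the store address built by lower_xfb_output() in the new pan_lower_xfb.c is buffer_base + (instance_id * num_vertices + vertex_id) * stride + offset, computed in 32 bits and only then widened for the 64-bit global store. Below is a minimal host-side sketch of the same arithmetic; xfb_store_address and its parameter names are hypothetical, not Mesa symbols (the comments note which NIR intrinsic each value stands in for).

#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper mirroring lower_xfb_output(): the linear
 * (vertex, instance) index is scaled by the buffer stride and biased by
 * the output's byte offset in 32 bits, then widened and added to the
 * 64-bit buffer base, matching the nir_u2u64/nir_iadd order above. */
static uint64_t
xfb_store_address(uint64_t buffer_base,  /* nir_load_xfb_address(.base = buffer) */
                  uint32_t vertex_id,    /* nir_load_vertex_id_zero_base */
                  uint32_t instance_id,  /* nir_load_instance_id */
                  uint32_t num_vertices, /* nir_load_num_vertices */
                  uint32_t stride,       /* info->xfb_stride[buffer] * 4, in bytes */
                  uint32_t offset)       /* offset_words * 4, in bytes */
{
   uint32_t index = instance_id * num_vertices + vertex_id;
   return buffer_base + (uint64_t)(index * stride + offset);
}

int
main(void)
{
   /* Vertex 2 of instance 1 in a 3-vertex draw, 16-byte stride, capture
    * at word 1: (1 * 3 + 2) * 16 + 4 = 84 bytes past the buffer base. */
   printf("0x%llx\n",
          (unsigned long long)xfb_store_address(0x1000, 2, 1, 3, 16, 4));
   return 0;
}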
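
The removed pan_lookup_sysval() also shows a pattern worth noting: a pointer-valued hash table returns NULL on a miss, so slot IDs are stored biased by one to keep ID 0 distinguishable from "not found". A self-contained sketch of that bias trick follows, using a toy array-backed map rather than Mesa's hash_table_u64 API; all names below are illustrative.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_SLOTS 16

/* Toy stand-in for a pointer-valued map; not Mesa code. */
struct toy_map {
   uint64_t keys[MAX_SLOTS];
   void *values[MAX_SLOTS]; /* NULL means "no entry" */
   unsigned count;
};

static void *
toy_search(const struct toy_map *m, uint64_t key)
{
   for (unsigned i = 0; i < m->count; ++i)
      if (m->keys[i] == key)
         return m->values[i];
   return NULL;
}

static unsigned
toy_lookup_or_assign(struct toy_map *m, uint64_t key)
{
   void *cached = toy_search(m, key);
   if (cached)
      return (unsigned)((uintptr_t)cached - 1); /* undo the +1 bias */

   assert(m->count < MAX_SLOTS);
   unsigned id = m->count;
   m->keys[m->count] = key;
   m->values[m->count++] = (void *)(uintptr_t)(id + 1); /* bias: id 0 != NULL */
   return id;
}

int
main(void)
{
   struct toy_map m = {0};
   unsigned a = toy_lookup_or_assign(&m, 0xABCD); /* fresh entry: 0 */
   unsigned b = toy_lookup_or_assign(&m, 0x1234); /* fresh entry: 1 */
   unsigned c = toy_lookup_or_assign(&m, 0xABCD); /* cache hit:   0 */
   printf("%u %u %u\n", a, b, c);                 /* prints "0 1 0" */
   return 0;
}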