Diffstat (limited to 'src/microsoft/compiler/dxil_nir.c')
-rw-r--r-- | src/microsoft/compiler/dxil_nir.c | 3090
1 file changed, 2266 insertions, 824 deletions
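
Before the diff itself, a short standalone C sketch of the byte-packing arithmetic that the reworked lowering below makes the generated NIR perform once shared and scratch memory are backed by i32 arrays (see load_comps_to_vec(), lower_32b_offset_load() and lower_masked_store_vec32() in the patch). This sketch is not part of the patch; the helper names and example values are invented purely for illustration.

/* Illustration only: the run-time shift/mask math emitted by the lowering. */
#include <stdint.h>
#include <stdio.h>

/* Pack four 8-bit components into one 32-bit word, LSB first, mirroring the
 * shift/or loop in load_comps_to_vec() when src_bit_size < dst_bit_size. */
static uint32_t pack_u8x4(const uint8_t comps[4])
{
   uint32_t word = 0;
   for (unsigned i = 0; i < 4; i++)
      word |= (uint32_t)comps[i] << (i * 8);
   return word;
}

/* Read a 16-bit value at a byte offset out of a uint32_t array, mirroring the
 * "shift right by (offset & 3) * 8" fixup in lower_32b_offset_load(). */
static uint16_t load_u16_at(const uint32_t *words, uint32_t byte_offset)
{
   uint32_t word = words[byte_offset >> 2];
   return (uint16_t)(word >> ((byte_offset & 3) * 8));
}

/* Store an 8-bit value without disturbing neighbouring bytes, mirroring the
 * mask/shift logic of lower_masked_store_vec32() (shown here as a plain
 * read-modify-write for simplicity). */
static void store_u8_at(uint32_t *words, uint32_t byte_offset, uint8_t value)
{
   uint32_t shift = (byte_offset & 3) * 8;
   uint32_t mask = 0xffu << shift;
   words[byte_offset >> 2] = (words[byte_offset >> 2] & ~mask) |
                             ((uint32_t)value << shift);
}

int main(void)
{
   uint8_t comps[4] = { 0x11, 0x22, 0x33, 0x44 };
   uint32_t words[1] = { pack_u8x4(comps) };
   printf("packed word:      0x%08x\n", (unsigned)words[0]); /* 0x44332211 */
   printf("u16 at offset 2:  0x%04x\n", load_u16_at(words, 2)); /* 0x4433 */
   store_u8_at(words, 1, 0xaa);
   printf("after byte store: 0x%08x\n", (unsigned)words[0]); /* 0x4433aa11 */
   return 0;
}

For nir_var_mem_shared the patch emits the clear and merge steps as an iand/ior pair of deref atomics rather than this plain read-modify-write, so concurrent stores to neighbouring bytes of the same 32-bit cell remain safe.
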
diff --git a/src/microsoft/compiler/dxil_nir.c b/src/microsoft/compiler/dxil_nir.c index cb27054ece1..153590abd88 100644 --- a/src/microsoft/compiler/dxil_nir.c +++ b/src/microsoft/compiler/dxil_nir.c @@ -22,11 +22,14 @@ */ #include "dxil_nir.h" +#include "dxil_module.h" #include "nir_builder.h" #include "nir_deref.h" +#include "nir_worklist.h" #include "nir_to_dxil.h" #include "util/u_math.h" +#include "vulkan/vulkan_core.h" static void cl_type_size_align(const struct glsl_type *type, unsigned *size, @@ -36,527 +39,157 @@ cl_type_size_align(const struct glsl_type *type, unsigned *size, *align = glsl_get_cl_alignment(type); } -static void -extract_comps_from_vec32(nir_builder *b, nir_ssa_def *vec32, - unsigned dst_bit_size, - nir_ssa_def **dst_comps, - unsigned num_dst_comps) -{ - unsigned step = DIV_ROUND_UP(dst_bit_size, 32); - unsigned comps_per32b = 32 / dst_bit_size; - nir_ssa_def *tmp; - - for (unsigned i = 0; i < vec32->num_components; i += step) { - switch (dst_bit_size) { - case 64: - tmp = nir_pack_64_2x32_split(b, nir_channel(b, vec32, i), - nir_channel(b, vec32, i + 1)); - dst_comps[i / 2] = tmp; - break; - case 32: - dst_comps[i] = nir_channel(b, vec32, i); - break; - case 16: - case 8: { - unsigned dst_offs = i * comps_per32b; - - tmp = nir_unpack_bits(b, nir_channel(b, vec32, i), dst_bit_size); - for (unsigned j = 0; j < comps_per32b && dst_offs + j < num_dst_comps; j++) - dst_comps[dst_offs + j] = nir_channel(b, tmp, j); - } - - break; - } - } -} - -static nir_ssa_def * -load_comps_to_vec32(nir_builder *b, unsigned src_bit_size, - nir_ssa_def **src_comps, unsigned num_src_comps) +static nir_def * +load_comps_to_vec(nir_builder *b, unsigned src_bit_size, + nir_def **src_comps, unsigned num_src_comps, + unsigned dst_bit_size) { - unsigned num_vec32comps = DIV_ROUND_UP(num_src_comps * src_bit_size, 32); - unsigned step = DIV_ROUND_UP(src_bit_size, 32); - unsigned comps_per32b = 32 / src_bit_size; - nir_ssa_def *vec32comps[4]; - - for (unsigned i = 0; i < num_vec32comps; i += step) { - switch (src_bit_size) { - case 64: - vec32comps[i] = nir_unpack_64_2x32_split_x(b, src_comps[i / 2]); - vec32comps[i + 1] = nir_unpack_64_2x32_split_y(b, src_comps[i / 2]); - break; - case 32: - vec32comps[i] = src_comps[i]; - break; - case 16: - case 8: { - unsigned src_offs = i * comps_per32b; - - vec32comps[i] = nir_u2u32(b, src_comps[src_offs]); - for (unsigned j = 1; j < comps_per32b && src_offs + j < num_src_comps; j++) { - nir_ssa_def *tmp = nir_ishl(b, nir_u2u32(b, src_comps[src_offs + j]), - nir_imm_int(b, j * src_bit_size)); - vec32comps[i] = nir_ior(b, vec32comps[i], tmp); - } - break; - } + if (src_bit_size == dst_bit_size) + return nir_vec(b, src_comps, num_src_comps); + else if (src_bit_size > dst_bit_size) + return nir_extract_bits(b, src_comps, num_src_comps, 0, src_bit_size * num_src_comps / dst_bit_size, dst_bit_size); + + unsigned num_dst_comps = DIV_ROUND_UP(num_src_comps * src_bit_size, dst_bit_size); + unsigned comps_per_dst = dst_bit_size / src_bit_size; + nir_def *dst_comps[4]; + + for (unsigned i = 0; i < num_dst_comps; i++) { + unsigned src_offs = i * comps_per_dst; + + dst_comps[i] = nir_u2uN(b, src_comps[src_offs], dst_bit_size); + for (unsigned j = 1; j < comps_per_dst && src_offs + j < num_src_comps; j++) { + nir_def *tmp = nir_ishl_imm(b, nir_u2uN(b, src_comps[src_offs + j], dst_bit_size), + j * src_bit_size); + dst_comps[i] = nir_ior(b, dst_comps[i], tmp); } } - return nir_vec(b, vec32comps, num_vec32comps); -} - -static nir_ssa_def * 
-build_load_ptr_dxil(nir_builder *b, nir_deref_instr *deref, nir_ssa_def *idx) -{ - return nir_load_ptr_dxil(b, 1, 32, &deref->dest.ssa, idx); + return nir_vec(b, dst_comps, num_dst_comps); } static bool -lower_load_deref(nir_builder *b, nir_intrinsic_instr *intr) +lower_32b_offset_load(nir_builder *b, nir_intrinsic_instr *intr, nir_variable *var) { - assert(intr->dest.is_ssa); - - b->cursor = nir_before_instr(&intr->instr); - - nir_deref_instr *deref = nir_src_as_deref(intr->src[0]); - if (!nir_deref_mode_is(deref, nir_var_shader_temp)) - return false; - nir_ssa_def *ptr = nir_u2u32(b, nir_build_deref_offset(b, deref, cl_type_size_align)); - nir_ssa_def *offset = nir_iand(b, ptr, nir_inot(b, nir_imm_int(b, 3))); - - assert(intr->dest.is_ssa); - unsigned num_components = nir_dest_num_components(intr->dest); - unsigned bit_size = nir_dest_bit_size(intr->dest); - unsigned load_size = MAX2(32, bit_size); + unsigned bit_size = intr->def.bit_size; + unsigned num_components = intr->def.num_components; unsigned num_bits = num_components * bit_size; - nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS]; - unsigned comp_idx = 0; - - nir_deref_path path; - nir_deref_path_init(&path, deref, NULL); - nir_ssa_def *base_idx = nir_ishr(b, offset, nir_imm_int(b, 2 /* log2(32 / 8) */)); - - /* Split loads into 32-bit chunks */ - for (unsigned i = 0; i < num_bits; i += load_size) { - unsigned subload_num_bits = MIN2(num_bits - i, load_size); - nir_ssa_def *idx = nir_iadd(b, base_idx, nir_imm_int(b, i / 32)); - nir_ssa_def *vec32 = build_load_ptr_dxil(b, path.path[0], idx); - - if (load_size == 64) { - idx = nir_iadd(b, idx, nir_imm_int(b, 1)); - vec32 = nir_vec2(b, vec32, - build_load_ptr_dxil(b, path.path[0], idx)); - } - - /* If we have 2 bytes or less to load we need to adjust the u32 value so - * we can always extract the LSB. - */ - if (subload_num_bits <= 16) { - nir_ssa_def *shift = nir_imul(b, nir_iand(b, ptr, nir_imm_int(b, 3)), - nir_imm_int(b, 8)); - vec32 = nir_ushr(b, vec32, shift); - } - - /* And now comes the pack/unpack step to match the original type. */ - extract_comps_from_vec32(b, vec32, bit_size, &comps[comp_idx], - subload_num_bits / bit_size); - comp_idx += subload_num_bits / bit_size; - } - - nir_deref_path_finish(&path); - assert(comp_idx == num_components); - nir_ssa_def *result = nir_vec(b, comps, num_components); - nir_ssa_def_rewrite_uses(&intr->dest.ssa, result); - nir_instr_remove(&intr->instr); - return true; -} - -static nir_ssa_def * -ubo_load_select_32b_comps(nir_builder *b, nir_ssa_def *vec32, - nir_ssa_def *offset, unsigned num_bytes) -{ - assert(num_bytes == 16 || num_bytes == 12 || num_bytes == 8 || - num_bytes == 4 || num_bytes == 3 || num_bytes == 2 || - num_bytes == 1); - assert(vec32->num_components == 4); - - /* 16 and 12 byte types are always aligned on 16 bytes. */ - if (num_bytes > 8) - return vec32; - - nir_ssa_def *comps[4]; - nir_ssa_def *cond; - - for (unsigned i = 0; i < 4; i++) - comps[i] = nir_channel(b, vec32, i); - - /* If we have 8bytes or less to load, select which half the vec4 should - * be used. - */ - cond = nir_ine(b, nir_iand(b, offset, nir_imm_int(b, 0x8)), - nir_imm_int(b, 0)); - - comps[0] = nir_bcsel(b, cond, comps[2], comps[0]); - comps[1] = nir_bcsel(b, cond, comps[3], comps[1]); - - /* Thanks to the CL alignment constraints, if we want 8 bytes we're done. */ - if (num_bytes == 8) - return nir_vec(b, comps, 2); - - /* 4 bytes or less needed, select which of the 32bit component should be - * used and return it. 
The sub-32bit split is handled in - * extract_comps_from_vec32(). - */ - cond = nir_ine(b, nir_iand(b, offset, nir_imm_int(b, 0x4)), - nir_imm_int(b, 0)); - return nir_bcsel(b, cond, comps[1], comps[0]); -} - -nir_ssa_def * -build_load_ubo_dxil(nir_builder *b, nir_ssa_def *buffer, - nir_ssa_def *offset, unsigned num_components, - unsigned bit_size) -{ - nir_ssa_def *idx = nir_ushr(b, offset, nir_imm_int(b, 4)); - nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS]; - unsigned num_bits = num_components * bit_size; - unsigned comp_idx = 0; - - /* We need to split loads in 16byte chunks because that's the - * granularity of cBufferLoadLegacy(). - */ - for (unsigned i = 0; i < num_bits; i += (16 * 8)) { - /* For each 16byte chunk (or smaller) we generate a 32bit ubo vec - * load. - */ - unsigned subload_num_bits = MIN2(num_bits - i, 16 * 8); - nir_ssa_def *vec32 = - nir_load_ubo_dxil(b, 4, 32, buffer, nir_iadd(b, idx, nir_imm_int(b, i / (16 * 8)))); - /* First re-arrange the vec32 to account for intra 16-byte offset. */ - vec32 = ubo_load_select_32b_comps(b, vec32, offset, subload_num_bits / 8); - - /* If we have 2 bytes or less to load we need to adjust the u32 value so - * we can always extract the LSB. - */ - if (subload_num_bits <= 16) { - nir_ssa_def *shift = nir_imul(b, nir_iand(b, offset, - nir_imm_int(b, 3)), - nir_imm_int(b, 8)); - vec32 = nir_ushr(b, vec32, shift); - } - - /* And now comes the pack/unpack step to match the original type. */ - extract_comps_from_vec32(b, vec32, bit_size, &comps[comp_idx], - subload_num_bits / bit_size); - comp_idx += subload_num_bits / bit_size; - } - - assert(comp_idx == num_components); - return nir_vec(b, comps, num_components); -} - -static bool -lower_load_ssbo(nir_builder *b, nir_intrinsic_instr *intr) -{ - assert(intr->dest.is_ssa); - assert(intr->src[0].is_ssa); - assert(intr->src[1].is_ssa); - - b->cursor = nir_before_instr(&intr->instr); - - nir_ssa_def *buffer = intr->src[0].ssa; - nir_ssa_def *offset = nir_iand(b, intr->src[1].ssa, nir_imm_int(b, ~3)); - enum gl_access_qualifier access = nir_intrinsic_access(intr); - unsigned bit_size = nir_dest_bit_size(intr->dest); - unsigned num_components = nir_dest_num_components(intr->dest); - unsigned num_bits = num_components * bit_size; - - nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS]; - unsigned comp_idx = 0; - - /* We need to split loads in 16byte chunks because that's the optimal - * granularity of bufferLoad(). Minimum alignment is 4byte, which saves - * from us from extra complexity to extract >= 32 bit components. - */ - for (unsigned i = 0; i < num_bits; i += 4 * 32) { - /* For each 16byte chunk (or smaller) we generate a 32bit ssbo vec - * load. - */ - unsigned subload_num_bits = MIN2(num_bits - i, 4 * 32); - - /* The number of components to store depends on the number of bytes. */ - nir_ssa_def *vec32 = - nir_load_ssbo(b, DIV_ROUND_UP(subload_num_bits, 32), 32, - buffer, nir_iadd(b, offset, nir_imm_int(b, i / 8)), - .align_mul = 4, - .align_offset = 0, - .access = access); - - /* If we have 2 bytes or less to load we need to adjust the u32 value so - * we can always extract the LSB. - */ - if (subload_num_bits <= 16) { - nir_ssa_def *shift = nir_imul(b, nir_iand(b, intr->src[1].ssa, nir_imm_int(b, 3)), - nir_imm_int(b, 8)); - vec32 = nir_ushr(b, vec32, shift); - } - - /* And now comes the pack/unpack step to match the original type. 
*/ - extract_comps_from_vec32(b, vec32, bit_size, &comps[comp_idx], - subload_num_bits / bit_size); - comp_idx += subload_num_bits / bit_size; - } - - assert(comp_idx == num_components); - nir_ssa_def *result = nir_vec(b, comps, num_components); - nir_ssa_def_rewrite_uses(&intr->dest.ssa, result); - nir_instr_remove(&intr->instr); - return true; -} - -static bool -lower_store_ssbo(nir_builder *b, nir_intrinsic_instr *intr) -{ b->cursor = nir_before_instr(&intr->instr); - assert(intr->src[0].is_ssa); - assert(intr->src[1].is_ssa); - assert(intr->src[2].is_ssa); - - nir_ssa_def *val = intr->src[0].ssa; - nir_ssa_def *buffer = intr->src[1].ssa; - nir_ssa_def *offset = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, ~3)); - - unsigned bit_size = val->bit_size; - unsigned num_components = val->num_components; - unsigned num_bits = num_components * bit_size; - - nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS]; - unsigned comp_idx = 0; - - for (unsigned i = 0; i < num_components; i++) - comps[i] = nir_channel(b, val, i); - - /* We split stores in 16byte chunks because that's the optimal granularity - * of bufferStore(). Minimum alignment is 4byte, which saves from us from - * extra complexity to store >= 32 bit components. - */ - for (unsigned i = 0; i < num_bits; i += 4 * 32) { - /* For each 16byte chunk (or smaller) we generate a 32bit ssbo vec - * store. - */ - unsigned substore_num_bits = MIN2(num_bits - i, 4 * 32); - nir_ssa_def *local_offset = nir_iadd(b, offset, nir_imm_int(b, i / 8)); - nir_ssa_def *vec32 = load_comps_to_vec32(b, bit_size, &comps[comp_idx], - substore_num_bits / bit_size); - nir_intrinsic_instr *store; - - if (substore_num_bits < 32) { - nir_ssa_def *mask = nir_imm_int(b, (1 << substore_num_bits) - 1); - - /* If we have 16 bits or less to store we need to place them - * correctly in the u32 component. Anything greater than 16 bits - * (including uchar3) is naturally aligned on 32bits. - */ - if (substore_num_bits <= 16) { - nir_ssa_def *pos = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, 3)); - nir_ssa_def *shift = nir_imul_imm(b, pos, 8); - - vec32 = nir_ishl(b, vec32, shift); - mask = nir_ishl(b, mask, shift); - } - - store = nir_intrinsic_instr_create(b->shader, - nir_intrinsic_store_ssbo_masked_dxil); - store->src[0] = nir_src_for_ssa(vec32); - store->src[1] = nir_src_for_ssa(nir_inot(b, mask)); - store->src[2] = nir_src_for_ssa(buffer); - store->src[3] = nir_src_for_ssa(local_offset); - } else { - store = nir_intrinsic_instr_create(b->shader, - nir_intrinsic_store_ssbo); - store->src[0] = nir_src_for_ssa(vec32); - store->src[1] = nir_src_for_ssa(buffer); - store->src[2] = nir_src_for_ssa(local_offset); - - nir_intrinsic_set_align(store, 4, 0); - } - - /* The number of components to store depends on the number of bits. 
*/ - store->num_components = DIV_ROUND_UP(substore_num_bits, 32); - nir_builder_instr_insert(b, &store->instr); - comp_idx += substore_num_bits / bit_size; - } - - nir_instr_remove(&intr->instr); - return true; -} - -static void -lower_load_vec32(nir_builder *b, nir_ssa_def *index, unsigned num_comps, nir_ssa_def **comps, nir_intrinsic_op op) -{ - for (unsigned i = 0; i < num_comps; i++) { - nir_intrinsic_instr *load = - nir_intrinsic_instr_create(b->shader, op); - - load->num_components = 1; - load->src[0] = nir_src_for_ssa(nir_iadd(b, index, nir_imm_int(b, i))); - nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL); - nir_builder_instr_insert(b, &load->instr); - comps[i] = &load->dest.ssa; - } -} - -static bool -lower_32b_offset_load(nir_builder *b, nir_intrinsic_instr *intr) -{ - assert(intr->dest.is_ssa); - unsigned bit_size = nir_dest_bit_size(intr->dest); - unsigned num_components = nir_dest_num_components(intr->dest); - unsigned num_bits = num_components * bit_size; - - b->cursor = nir_before_instr(&intr->instr); - nir_intrinsic_op op = intr->intrinsic; - - assert(intr->src[0].is_ssa); - nir_ssa_def *offset = intr->src[0].ssa; - if (op == nir_intrinsic_load_shared) { - offset = nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_base(intr))); - op = nir_intrinsic_load_shared_dxil; - } else { + nir_def *offset = intr->src[0].ssa; + if (intr->intrinsic == nir_intrinsic_load_shared) + offset = nir_iadd_imm(b, offset, nir_intrinsic_base(intr)); + else offset = nir_u2u32(b, offset); - op = nir_intrinsic_load_scratch_dxil; - } - nir_ssa_def *index = nir_ushr(b, offset, nir_imm_int(b, 2)); - nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS]; - nir_ssa_def *comps_32bit[NIR_MAX_VEC_COMPONENTS * 2]; + nir_def *index = nir_ushr_imm(b, offset, 2); + nir_def *comps[NIR_MAX_VEC_COMPONENTS]; + nir_def *comps_32bit[NIR_MAX_VEC_COMPONENTS * 2]; /* We need to split loads in 32-bit accesses because the buffer * is an i32 array and DXIL does not support type casts. */ unsigned num_32bit_comps = DIV_ROUND_UP(num_bits, 32); - lower_load_vec32(b, index, num_32bit_comps, comps_32bit, op); + for (unsigned i = 0; i < num_32bit_comps; i++) + comps_32bit[i] = nir_load_array_var(b, var, nir_iadd_imm(b, index, i)); unsigned num_comps_per_pass = MIN2(num_32bit_comps, 4); for (unsigned i = 0; i < num_32bit_comps; i += num_comps_per_pass) { unsigned num_vec32_comps = MIN2(num_32bit_comps - i, 4); unsigned num_dest_comps = num_vec32_comps * 32 / bit_size; - nir_ssa_def *vec32 = nir_vec(b, &comps_32bit[i], num_vec32_comps); + nir_def *vec32 = nir_vec(b, &comps_32bit[i], num_vec32_comps); /* If we have 16 bits or less to load we need to adjust the u32 value so * we can always extract the LSB. */ if (num_bits <= 16) { - nir_ssa_def *shift = - nir_imul(b, nir_iand(b, offset, nir_imm_int(b, 3)), - nir_imm_int(b, 8)); + nir_def *shift = + nir_imul_imm(b, nir_iand_imm(b, offset, 3), 8); vec32 = nir_ushr(b, vec32, shift); } /* And now comes the pack/unpack step to match the original type. 
*/ unsigned dest_index = i * 32 / bit_size; - extract_comps_from_vec32(b, vec32, bit_size, &comps[dest_index], num_dest_comps); + nir_def *temp_vec = nir_extract_bits(b, &vec32, 1, 0, num_dest_comps, bit_size); + for (unsigned comp = 0; comp < num_dest_comps; ++comp, ++dest_index) + comps[dest_index] = nir_channel(b, temp_vec, comp); } - nir_ssa_def *result = nir_vec(b, comps, num_components); - nir_ssa_def_rewrite_uses(&intr->dest.ssa, result); + nir_def *result = nir_vec(b, comps, num_components); + nir_def_rewrite_uses(&intr->def, result); nir_instr_remove(&intr->instr); return true; } static void -lower_store_vec32(nir_builder *b, nir_ssa_def *index, nir_ssa_def *vec32, nir_intrinsic_op op) +lower_masked_store_vec32(nir_builder *b, nir_def *offset, nir_def *index, + nir_def *vec32, unsigned num_bits, nir_variable *var, unsigned alignment) { + nir_def *mask = nir_imm_int(b, (1 << num_bits) - 1); - for (unsigned i = 0; i < vec32->num_components; i++) { - nir_intrinsic_instr *store = - nir_intrinsic_instr_create(b->shader, op); - - store->src[0] = nir_src_for_ssa(nir_channel(b, vec32, i)); - store->src[1] = nir_src_for_ssa(nir_iadd(b, index, nir_imm_int(b, i))); - store->num_components = 1; - nir_builder_instr_insert(b, &store->instr); - } -} - -static void -lower_masked_store_vec32(nir_builder *b, nir_ssa_def *offset, nir_ssa_def *index, - nir_ssa_def *vec32, unsigned num_bits, nir_intrinsic_op op) -{ - nir_ssa_def *mask = nir_imm_int(b, (1 << num_bits) - 1); - - /* If we have 16 bits or less to store we need to place them correctly in - * the u32 component. Anything greater than 16 bits (including uchar3) is - * naturally aligned on 32bits. - */ - if (num_bits <= 16) { - nir_ssa_def *shift = - nir_imul_imm(b, nir_iand(b, offset, nir_imm_int(b, 3)), 8); + /* If we have small alignments, we need to place them correctly in the u32 component. 
*/ + if (alignment <= 2) { + nir_def *shift = + nir_imul_imm(b, nir_iand_imm(b, offset, 3), 8); vec32 = nir_ishl(b, vec32, shift); mask = nir_ishl(b, mask, shift); } - if (op == nir_intrinsic_store_shared_dxil) { + if (var->data.mode == nir_var_mem_shared) { /* Use the dedicated masked intrinsic */ - nir_store_shared_masked_dxil(b, vec32, nir_inot(b, mask), index); + nir_deref_instr *deref = nir_build_deref_array(b, nir_build_deref_var(b, var), index); + nir_deref_atomic(b, 32, &deref->def, nir_inot(b, mask), .atomic_op = nir_atomic_op_iand); + nir_deref_atomic(b, 32, &deref->def, vec32, .atomic_op = nir_atomic_op_ior); } else { /* For scratch, since we don't need atomics, just generate the read-modify-write in NIR */ - nir_ssa_def *load = nir_load_scratch_dxil(b, 1, 32, index); + nir_def *load = nir_load_array_var(b, var, index); - nir_ssa_def *new_val = nir_ior(b, vec32, + nir_def *new_val = nir_ior(b, vec32, nir_iand(b, nir_inot(b, mask), load)); - lower_store_vec32(b, index, new_val, op); + nir_store_array_var(b, var, index, new_val, 1); } } static bool -lower_32b_offset_store(nir_builder *b, nir_intrinsic_instr *intr) +lower_32b_offset_store(nir_builder *b, nir_intrinsic_instr *intr, nir_variable *var) { - assert(intr->src[0].is_ssa); unsigned num_components = nir_src_num_components(intr->src[0]); unsigned bit_size = nir_src_bit_size(intr->src[0]); unsigned num_bits = num_components * bit_size; b->cursor = nir_before_instr(&intr->instr); - nir_intrinsic_op op = intr->intrinsic; - nir_ssa_def *offset = intr->src[1].ssa; - if (op == nir_intrinsic_store_shared) { - offset = nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_base(intr))); - op = nir_intrinsic_store_shared_dxil; - } else { + nir_def *offset = intr->src[1].ssa; + if (intr->intrinsic == nir_intrinsic_store_shared) + offset = nir_iadd_imm(b, offset, nir_intrinsic_base(intr)); + else offset = nir_u2u32(b, offset); - op = nir_intrinsic_store_scratch_dxil; - } - nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS]; + nir_def *comps[NIR_MAX_VEC_COMPONENTS]; unsigned comp_idx = 0; for (unsigned i = 0; i < num_components; i++) comps[i] = nir_channel(b, intr->src[0].ssa, i); - for (unsigned i = 0; i < num_bits; i += 4 * 32) { - /* For each 4byte chunk (or smaller) we generate a 32bit scalar store. - */ - unsigned substore_num_bits = MIN2(num_bits - i, 4 * 32); - nir_ssa_def *local_offset = nir_iadd(b, offset, nir_imm_int(b, i / 8)); - nir_ssa_def *vec32 = load_comps_to_vec32(b, bit_size, &comps[comp_idx], - substore_num_bits / bit_size); - nir_ssa_def *index = nir_ushr(b, local_offset, nir_imm_int(b, 2)); + unsigned step = MAX2(bit_size, 32); + for (unsigned i = 0; i < num_bits; i += step) { + /* For each 4byte chunk (or smaller) we generate a 32bit scalar store. */ + unsigned substore_num_bits = MIN2(num_bits - i, step); + nir_def *local_offset = nir_iadd_imm(b, offset, i / 8); + nir_def *vec32 = load_comps_to_vec(b, bit_size, &comps[comp_idx], + substore_num_bits / bit_size, 32); + nir_def *index = nir_ushr_imm(b, local_offset, 2); /* For anything less than 32bits we need to use the masked version of the - * intrinsic to preserve data living in the same 32bit slot. - */ - if (num_bits < 32) { - lower_masked_store_vec32(b, local_offset, index, vec32, num_bits, op); + * intrinsic to preserve data living in the same 32bit slot. 
*/ + if (substore_num_bits < 32) { + lower_masked_store_vec32(b, local_offset, index, vec32, num_bits, var, nir_intrinsic_align(intr)); } else { - lower_store_vec32(b, index, vec32, op); + for (unsigned i = 0; i < vec32->num_components; ++i) + nir_store_array_var(b, var, nir_iadd_imm(b, index, i), nir_channel(b, vec32, i), 1); } comp_idx += substore_num_bits / bit_size; @@ -567,52 +200,22 @@ lower_32b_offset_store(nir_builder *b, nir_intrinsic_instr *intr) return true; } -static void -ubo_to_temp_patch_deref_mode(nir_deref_instr *deref) -{ - deref->modes = nir_var_shader_temp; - nir_foreach_use(use_src, &deref->dest.ssa) { - if (use_src->parent_instr->type != nir_instr_type_deref) - continue; - - nir_deref_instr *parent = nir_instr_as_deref(use_src->parent_instr); - ubo_to_temp_patch_deref_mode(parent); - } -} - -static void -ubo_to_temp_update_entry(nir_deref_instr *deref, struct hash_entry *he) -{ - assert(nir_deref_mode_is(deref, nir_var_mem_constant)); - assert(deref->dest.is_ssa); - assert(he->data); - - nir_foreach_use(use_src, &deref->dest.ssa) { - if (use_src->parent_instr->type == nir_instr_type_deref) { - ubo_to_temp_update_entry(nir_instr_as_deref(use_src->parent_instr), he); - } else if (use_src->parent_instr->type == nir_instr_type_intrinsic) { - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(use_src->parent_instr); - if (intr->intrinsic != nir_intrinsic_load_deref) - he->data = NULL; - } else { - he->data = NULL; - } - - if (!he->data) - break; - } -} +#define CONSTANT_LOCATION_UNVISITED 0 +#define CONSTANT_LOCATION_VALID 1 +#define CONSTANT_LOCATION_INVALID 2 bool -dxil_nir_lower_ubo_to_temp(nir_shader *nir) +dxil_nir_lower_constant_to_temp(nir_shader *nir) { - struct hash_table *ubo_to_temp = _mesa_pointer_hash_table_create(NULL); bool progress = false; + nir_foreach_variable_with_modes(var, nir, nir_var_mem_constant) + var->data.location = var->constant_initializer ? + CONSTANT_LOCATION_UNVISITED : CONSTANT_LOCATION_INVALID; /* First pass: collect all UBO accesses that could be turned into * shader temp accesses. */ - foreach_list_typed(nir_function, func, node, &nir->functions) { + nir_foreach_function(func, nir) { if (!func->is_entrypoint) continue; assert(func->impl); @@ -624,60 +227,55 @@ dxil_nir_lower_ubo_to_temp(nir_shader *nir) nir_deref_instr *deref = nir_instr_as_deref(instr); if (!nir_deref_mode_is(deref, nir_var_mem_constant) || - deref->deref_type != nir_deref_type_var) - continue; - - struct hash_entry *he = - _mesa_hash_table_search(ubo_to_temp, deref->var); - - if (!he) - he = _mesa_hash_table_insert(ubo_to_temp, deref->var, deref->var); - - if (!he->data) + deref->deref_type != nir_deref_type_var || + deref->var->data.location == CONSTANT_LOCATION_INVALID) continue; - ubo_to_temp_update_entry(deref, he); + deref->var->data.location = nir_deref_instr_has_complex_use(deref, 0) ? + CONSTANT_LOCATION_INVALID : CONSTANT_LOCATION_VALID; } } } - hash_table_foreach(ubo_to_temp, he) { - nir_variable *var = he->data; - - if (!var) + nir_foreach_variable_with_modes(var, nir, nir_var_mem_constant) { + if (var->data.location != CONSTANT_LOCATION_VALID) continue; /* Change the variable mode. */ var->data.mode = nir_var_shader_temp; - /* Make sure the variable has a name. - * DXIL variables must have names. - */ - if (!var->name) - var->name = ralloc_asprintf(nir, "global_%d", exec_list_length(&nir->variables)); - progress = true; } - _mesa_hash_table_destroy(ubo_to_temp, NULL); /* Second pass: patch all derefs that were accessing the converted UBOs * variables. 
*/ - foreach_list_typed(nir_function, func, node, &nir->functions) { + nir_foreach_function(func, nir) { if (!func->is_entrypoint) continue; assert(func->impl); + nir_builder b = nir_builder_create(func->impl); nir_foreach_block(block, func->impl) { nir_foreach_instr_safe(instr, block) { if (instr->type != nir_instr_type_deref) continue; nir_deref_instr *deref = nir_instr_as_deref(instr); - if (nir_deref_mode_is(deref, nir_var_mem_constant) && - deref->deref_type == nir_deref_type_var && - deref->var->data.mode == nir_var_shader_temp) - ubo_to_temp_patch_deref_mode(deref); + if (nir_deref_mode_is(deref, nir_var_mem_constant)) { + nir_deref_instr *parent = deref; + while (parent && parent->deref_type != nir_deref_type_var) + parent = nir_src_as_deref(parent->parent); + if (parent && parent->var->data.mode != nir_var_mem_constant) { + deref->modes = parent->var->data.mode; + /* Also change "pointer" size to 32-bit since this is now a logical pointer */ + deref->def.bit_size = 32; + if (deref->deref_type == nir_deref_type_array) { + b.cursor = nir_before_instr(instr); + nir_src_rewrite(&deref->arr.index, nir_u2u32(&b, deref->arr.index.ssa)); + } + } + } } } } @@ -686,147 +284,460 @@ dxil_nir_lower_ubo_to_temp(nir_shader *nir) } static bool -lower_load_ubo(nir_builder *b, nir_intrinsic_instr *intr) +flatten_var_arrays(nir_builder *b, nir_intrinsic_instr *intr, void *data) { - assert(intr->dest.is_ssa); - assert(intr->src[0].is_ssa); - assert(intr->src[1].is_ssa); + switch (intr->intrinsic) { + case nir_intrinsic_load_deref: + case nir_intrinsic_store_deref: + case nir_intrinsic_deref_atomic: + case nir_intrinsic_deref_atomic_swap: + break; + default: + return false; + } - b->cursor = nir_before_instr(&intr->instr); + nir_deref_instr *deref = nir_src_as_deref(intr->src[0]); + nir_variable *var = NULL; + for (nir_deref_instr *d = deref; d; d = nir_deref_instr_parent(d)) { + if (d->deref_type == nir_deref_type_cast) + return false; + if (d->deref_type == nir_deref_type_var) { + var = d->var; + if (d->type == var->type) + return false; + } + } + if (!var) + return false; + + nir_deref_path path; + nir_deref_path_init(&path, deref, NULL); - nir_ssa_def *result = - build_load_ubo_dxil(b, intr->src[0].ssa, intr->src[1].ssa, - nir_dest_num_components(intr->dest), - nir_dest_bit_size(intr->dest)); + assert(path.path[0]->deref_type == nir_deref_type_var); + b->cursor = nir_before_instr(&path.path[0]->instr); + nir_deref_instr *new_var_deref = nir_build_deref_var(b, var); + nir_def *index = NULL; + for (unsigned level = 1; path.path[level]; ++level) { + nir_deref_instr *arr_deref = path.path[level]; + assert(arr_deref->deref_type == nir_deref_type_array); + b->cursor = nir_before_instr(&arr_deref->instr); + nir_def *val = nir_imul_imm(b, arr_deref->arr.index.ssa, + glsl_get_component_slots(arr_deref->type)); + if (index) { + index = nir_iadd(b, index, val); + } else { + index = val; + } + } - nir_ssa_def_rewrite_uses(&intr->dest.ssa, result); - nir_instr_remove(&intr->instr); + unsigned vector_comps = intr->num_components; + if (vector_comps > 1) { + b->cursor = nir_before_instr(&intr->instr); + if (intr->intrinsic == nir_intrinsic_load_deref) { + nir_def *components[NIR_MAX_VEC_COMPONENTS]; + for (unsigned i = 0; i < vector_comps; ++i) { + nir_def *final_index = index ? 
nir_iadd_imm(b, index, i) : nir_imm_int(b, i); + nir_deref_instr *comp_deref = nir_build_deref_array(b, new_var_deref, final_index); + components[i] = nir_load_deref(b, comp_deref); + } + nir_def_rewrite_uses(&intr->def, nir_vec(b, components, vector_comps)); + } else if (intr->intrinsic == nir_intrinsic_store_deref) { + for (unsigned i = 0; i < vector_comps; ++i) { + if (((1 << i) & nir_intrinsic_write_mask(intr)) == 0) + continue; + nir_def *final_index = index ? nir_iadd_imm(b, index, i) : nir_imm_int(b, i); + nir_deref_instr *comp_deref = nir_build_deref_array(b, new_var_deref, final_index); + nir_store_deref(b, comp_deref, nir_channel(b, intr->src[1].ssa, i), 1); + } + } + nir_instr_remove(&intr->instr); + } else { + nir_src_rewrite(&intr->src[0], &nir_build_deref_array(b, new_var_deref, index)->def); + } + + nir_deref_path_finish(&path); + return true; +} + +static void +flatten_constant_initializer(nir_variable *var, nir_constant *src, nir_constant ***dest, unsigned vector_elements) +{ + if (src->num_elements == 0) { + for (unsigned i = 0; i < vector_elements; ++i) { + nir_constant *new_scalar = rzalloc(var, nir_constant); + memcpy(&new_scalar->values[0], &src->values[i], sizeof(src->values[0])); + new_scalar->is_null_constant = src->values[i].u64 == 0; + + nir_constant **array_entry = (*dest)++; + *array_entry = new_scalar; + } + } else { + for (unsigned i = 0; i < src->num_elements; ++i) + flatten_constant_initializer(var, src->elements[i], dest, vector_elements); + } +} + +static bool +flatten_var_array_types(nir_variable *var) +{ + assert(!glsl_type_is_struct(glsl_without_array(var->type))); + const struct glsl_type *matrix_type = glsl_without_array(var->type); + if (!glsl_type_is_array_of_arrays(var->type) && glsl_get_components(matrix_type) == 1) + return false; + + enum glsl_base_type base_type = glsl_get_base_type(matrix_type); + const struct glsl_type *flattened_type = glsl_array_type(glsl_scalar_type(base_type), + glsl_get_component_slots(var->type), 0); + var->type = flattened_type; + if (var->constant_initializer) { + nir_constant **new_elements = ralloc_array(var, nir_constant *, glsl_get_length(flattened_type)); + nir_constant **temp = new_elements; + flatten_constant_initializer(var, var->constant_initializer, &temp, glsl_get_vector_elements(matrix_type)); + var->constant_initializer->num_elements = glsl_get_length(flattened_type); + var->constant_initializer->elements = new_elements; + } return true; } bool -dxil_nir_lower_loads_stores_to_dxil(nir_shader *nir) +dxil_nir_flatten_var_arrays(nir_shader *shader, nir_variable_mode modes) { bool progress = false; + nir_foreach_variable_with_modes(var, shader, modes & ~nir_var_function_temp) + progress |= flatten_var_array_types(var); - foreach_list_typed(nir_function, func, node, &nir->functions) { - if (!func->is_entrypoint) - continue; - assert(func->impl); + if (modes & nir_var_function_temp) { + nir_foreach_function_impl(impl, shader) { + nir_foreach_function_temp_variable(var, impl) + progress |= flatten_var_array_types(var); + } + } - nir_builder b; - nir_builder_init(&b, func->impl); + if (!progress) + return false; - nir_foreach_block(block, func->impl) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + nir_shader_intrinsics_pass(shader, flatten_var_arrays, + nir_metadata_block_index | + nir_metadata_dominance | + nir_metadata_loop_analysis, + NULL); + nir_remove_dead_derefs(shader); + return true; +} - 
switch (intr->intrinsic) { - case nir_intrinsic_load_deref: - progress |= lower_load_deref(&b, intr); - break; - case nir_intrinsic_load_shared: - case nir_intrinsic_load_scratch: - progress |= lower_32b_offset_load(&b, intr); - break; - case nir_intrinsic_load_ssbo: - progress |= lower_load_ssbo(&b, intr); - break; - case nir_intrinsic_load_ubo: - progress |= lower_load_ubo(&b, intr); - break; - case nir_intrinsic_store_shared: - case nir_intrinsic_store_scratch: - progress |= lower_32b_offset_store(&b, intr); - break; - case nir_intrinsic_store_ssbo: - progress |= lower_store_ssbo(&b, intr); - break; - default: - break; - } +static bool +lower_deref_bit_size(nir_builder *b, nir_intrinsic_instr *intr, void *data) +{ + switch (intr->intrinsic) { + case nir_intrinsic_load_deref: + case nir_intrinsic_store_deref: + break; + default: + /* Atomics can't be smaller than 32-bit */ + return false; + } + + nir_deref_instr *deref = nir_src_as_deref(intr->src[0]); + nir_variable *var = nir_deref_instr_get_variable(deref); + /* Only interested in full deref chains */ + if (!var) + return false; + + const struct glsl_type *var_scalar_type = glsl_without_array(var->type); + if (deref->type == var_scalar_type || !glsl_type_is_scalar(var_scalar_type)) + return false; + + assert(deref->deref_type == nir_deref_type_var || deref->deref_type == nir_deref_type_array); + const struct glsl_type *old_glsl_type = deref->type; + nir_alu_type old_type = nir_get_nir_type_for_glsl_type(old_glsl_type); + nir_alu_type new_type = nir_get_nir_type_for_glsl_type(var_scalar_type); + if (glsl_get_bit_size(old_glsl_type) < glsl_get_bit_size(var_scalar_type)) { + deref->type = var_scalar_type; + if (intr->intrinsic == nir_intrinsic_load_deref) { + intr->def.bit_size = glsl_get_bit_size(var_scalar_type); + b->cursor = nir_after_instr(&intr->instr); + nir_def *downcast = nir_type_convert(b, &intr->def, new_type, old_type, nir_rounding_mode_undef); + nir_def_rewrite_uses_after(&intr->def, downcast, downcast->parent_instr); + } + else { + b->cursor = nir_before_instr(&intr->instr); + nir_def *upcast = nir_type_convert(b, intr->src[1].ssa, old_type, new_type, nir_rounding_mode_undef); + nir_src_rewrite(&intr->src[1], upcast); + } + + while (deref->deref_type == nir_deref_type_array) { + nir_deref_instr *parent = nir_deref_instr_parent(deref); + parent->type = glsl_type_wrap_in_arrays(deref->type, parent->type); + deref = parent; + } + } else { + /* Assumed arrays are already flattened */ + b->cursor = nir_before_instr(&deref->instr); + nir_deref_instr *parent = nir_build_deref_var(b, var); + if (deref->deref_type == nir_deref_type_array) + deref = nir_build_deref_array(b, parent, nir_imul_imm(b, deref->arr.index.ssa, 2)); + else + deref = nir_build_deref_array_imm(b, parent, 0); + nir_deref_instr *deref2 = nir_build_deref_array(b, parent, + nir_iadd_imm(b, deref->arr.index.ssa, 1)); + b->cursor = nir_before_instr(&intr->instr); + if (intr->intrinsic == nir_intrinsic_load_deref) { + nir_def *src1 = nir_load_deref(b, deref); + nir_def *src2 = nir_load_deref(b, deref2); + nir_def_rewrite_uses(&intr->def, nir_pack_64_2x32_split(b, src1, src2)); + } else { + nir_def *src1 = nir_unpack_64_2x32_split_x(b, intr->src[1].ssa); + nir_def *src2 = nir_unpack_64_2x32_split_y(b, intr->src[1].ssa); + nir_store_deref(b, deref, src1, 1); + nir_store_deref(b, deref, src2, 1); + } + nir_instr_remove(&intr->instr); + } + return true; +} + +static bool +lower_var_bit_size_types(nir_variable *var, unsigned min_bit_size, unsigned max_bit_size) +{ + 
assert(!glsl_type_is_array_of_arrays(var->type) && !glsl_type_is_struct(var->type)); + const struct glsl_type *type = glsl_without_array(var->type); + assert(glsl_type_is_scalar(type)); + enum glsl_base_type base_type = glsl_get_base_type(type); + if (glsl_base_type_get_bit_size(base_type) < min_bit_size) { + switch (min_bit_size) { + case 16: + switch (base_type) { + case GLSL_TYPE_BOOL: + base_type = GLSL_TYPE_UINT16; + for (unsigned i = 0; i < (var->constant_initializer ? var->constant_initializer->num_elements : 0); ++i) + var->constant_initializer->elements[i]->values[0].u16 = var->constant_initializer->elements[i]->values[0].b ? 0xffff : 0; + break; + case GLSL_TYPE_INT8: + base_type = GLSL_TYPE_INT16; + for (unsigned i = 0; i < (var->constant_initializer ? var->constant_initializer->num_elements : 0); ++i) + var->constant_initializer->elements[i]->values[0].i16 = var->constant_initializer->elements[i]->values[0].i8; + break; + case GLSL_TYPE_UINT8: base_type = GLSL_TYPE_UINT16; break; + default: unreachable("Unexpected base type"); + } + break; + case 32: + switch (base_type) { + case GLSL_TYPE_BOOL: + base_type = GLSL_TYPE_UINT; + for (unsigned i = 0; i < (var->constant_initializer ? var->constant_initializer->num_elements : 0); ++i) + var->constant_initializer->elements[i]->values[0].u32 = var->constant_initializer->elements[i]->values[0].b ? 0xffffffff : 0; + break; + case GLSL_TYPE_INT8: + base_type = GLSL_TYPE_INT; + for (unsigned i = 0; i < (var->constant_initializer ? var->constant_initializer->num_elements : 0); ++i) + var->constant_initializer->elements[i]->values[0].i32 = var->constant_initializer->elements[i]->values[0].i8; + break; + case GLSL_TYPE_INT16: + base_type = GLSL_TYPE_INT; + for (unsigned i = 0; i < (var->constant_initializer ? var->constant_initializer->num_elements : 0); ++i) + var->constant_initializer->elements[i]->values[0].i32 = var->constant_initializer->elements[i]->values[0].i16; + break; + case GLSL_TYPE_FLOAT16: + base_type = GLSL_TYPE_FLOAT; + for (unsigned i = 0; i < (var->constant_initializer ? var->constant_initializer->num_elements : 0); ++i) + var->constant_initializer->elements[i]->values[0].f32 = _mesa_half_to_float(var->constant_initializer->elements[i]->values[0].u16); + break; + case GLSL_TYPE_UINT8: base_type = GLSL_TYPE_UINT; break; + case GLSL_TYPE_UINT16: base_type = GLSL_TYPE_UINT; break; + default: unreachable("Unexpected base type"); + } + break; + default: unreachable("Unexpected min bit size"); + } + var->type = glsl_type_wrap_in_arrays(glsl_scalar_type(base_type), var->type); + return true; + } + if (glsl_base_type_bit_size(base_type) > max_bit_size) { + assert(!glsl_type_is_array_of_arrays(var->type)); + var->type = glsl_array_type(glsl_scalar_type(GLSL_TYPE_UINT), + glsl_type_is_array(var->type) ? glsl_get_length(var->type) * 2 : 2, + 0); + if (var->constant_initializer) { + unsigned num_elements = var->constant_initializer->num_elements ? + var->constant_initializer->num_elements * 2 : 2; + nir_constant **element_arr = ralloc_array(var, nir_constant *, num_elements); + nir_constant *elements = rzalloc_array(var, nir_constant, num_elements); + for (unsigned i = 0; i < var->constant_initializer->num_elements; ++i) { + element_arr[i*2] = &elements[i*2]; + element_arr[i*2+1] = &elements[i*2+1]; + const nir_const_value *src = var->constant_initializer->num_elements ? 
+ var->constant_initializer->elements[i]->values : var->constant_initializer->values; + elements[i*2].values[0].u32 = (uint32_t)src->u64; + elements[i*2].is_null_constant = (uint32_t)src->u64 == 0; + elements[i*2+1].values[0].u32 = (uint32_t)(src->u64 >> 32); + elements[i*2+1].is_null_constant = (uint32_t)(src->u64 >> 32) == 0; } + var->constant_initializer->num_elements = num_elements; + var->constant_initializer->elements = element_arr; } + return true; } + return false; +} - return progress; +bool +dxil_nir_lower_var_bit_size(nir_shader *shader, nir_variable_mode modes, + unsigned min_bit_size, unsigned max_bit_size) +{ + bool progress = false; + nir_foreach_variable_with_modes(var, shader, modes & ~nir_var_function_temp) + progress |= lower_var_bit_size_types(var, min_bit_size, max_bit_size); + + if (modes & nir_var_function_temp) { + nir_foreach_function_impl(impl, shader) { + nir_foreach_function_temp_variable(var, impl) + progress |= lower_var_bit_size_types(var, min_bit_size, max_bit_size); + } + } + + if (!progress) + return false; + + nir_shader_intrinsics_pass(shader, lower_deref_bit_size, + nir_metadata_block_index | + nir_metadata_dominance | + nir_metadata_loop_analysis, + NULL); + nir_remove_dead_derefs(shader); + return true; } static bool -lower_shared_atomic(nir_builder *b, nir_intrinsic_instr *intr, - nir_intrinsic_op dxil_op) +remove_oob_array_access(nir_builder *b, nir_intrinsic_instr *intr, void *data) +{ + uint32_t num_derefs = 1; + + switch (intr->intrinsic) { + case nir_intrinsic_copy_deref: + num_derefs = 2; + FALLTHROUGH; + case nir_intrinsic_load_deref: + case nir_intrinsic_store_deref: + case nir_intrinsic_deref_atomic: + case nir_intrinsic_deref_atomic_swap: + break; + default: + return false; + } + + for (uint32_t i = 0; i < num_derefs; ++i) { + if (nir_deref_instr_is_known_out_of_bounds(nir_src_as_deref(intr->src[i]))) { + switch (intr->intrinsic) { + case nir_intrinsic_load_deref: + case nir_intrinsic_deref_atomic: + case nir_intrinsic_deref_atomic_swap: + b->cursor = nir_before_instr(&intr->instr); + nir_def *undef = nir_undef(b, intr->def.num_components, intr->def.bit_size); + nir_def_rewrite_uses(&intr->def, undef); + break; + default: + break; + } + nir_instr_remove(&intr->instr); + return true; + } + } + + return false; +} + +bool +dxil_nir_remove_oob_array_accesses(nir_shader *shader) +{ + return nir_shader_intrinsics_pass(shader, remove_oob_array_access, + nir_metadata_block_index | + nir_metadata_dominance | + nir_metadata_loop_analysis, + NULL); +} + +static bool +lower_shared_atomic(nir_builder *b, nir_intrinsic_instr *intr, nir_variable *var) { b->cursor = nir_before_instr(&intr->instr); - assert(intr->src[0].is_ssa); - nir_ssa_def *offset = - nir_iadd(b, intr->src[0].ssa, nir_imm_int(b, nir_intrinsic_base(intr))); - nir_ssa_def *index = nir_ushr(b, offset, nir_imm_int(b, 2)); + nir_def *offset = + nir_iadd_imm(b, intr->src[0].ssa, nir_intrinsic_base(intr)); + nir_def *index = nir_ushr_imm(b, offset, 2); - nir_intrinsic_instr *atomic = nir_intrinsic_instr_create(b->shader, dxil_op); - atomic->src[0] = nir_src_for_ssa(index); - assert(intr->src[1].is_ssa); - atomic->src[1] = nir_src_for_ssa(intr->src[1].ssa); - if (dxil_op == nir_intrinsic_shared_atomic_comp_swap_dxil) { - assert(intr->src[2].is_ssa); - atomic->src[2] = nir_src_for_ssa(intr->src[2].ssa); - } - atomic->num_components = 0; - nir_ssa_dest_init(&atomic->instr, &atomic->dest, 1, 32, NULL); + nir_deref_instr *deref = nir_build_deref_array(b, nir_build_deref_var(b, var), index); + 
nir_def *result; + if (intr->intrinsic == nir_intrinsic_shared_atomic_swap) + result = nir_deref_atomic_swap(b, 32, &deref->def, intr->src[1].ssa, intr->src[2].ssa, + .atomic_op = nir_intrinsic_atomic_op(intr)); + else + result = nir_deref_atomic(b, 32, &deref->def, intr->src[1].ssa, + .atomic_op = nir_intrinsic_atomic_op(intr)); - nir_builder_instr_insert(b, &atomic->instr); - nir_ssa_def_rewrite_uses(&intr->dest.ssa, &atomic->dest.ssa); + nir_def_rewrite_uses(&intr->def, result); nir_instr_remove(&intr->instr); return true; } bool -dxil_nir_lower_atomics_to_dxil(nir_shader *nir) +dxil_nir_lower_loads_stores_to_dxil(nir_shader *nir, + const struct dxil_nir_lower_loads_stores_options *options) { - bool progress = false; + bool progress = nir_remove_dead_variables(nir, nir_var_function_temp | nir_var_mem_shared, NULL); + nir_variable *shared_var = NULL; + if (nir->info.shared_size) { + shared_var = nir_variable_create(nir, nir_var_mem_shared, + glsl_array_type(glsl_uint_type(), DIV_ROUND_UP(nir->info.shared_size, 4), 4), + "lowered_shared_mem"); + } - foreach_list_typed(nir_function, func, node, &nir->functions) { - if (!func->is_entrypoint) - continue; - assert(func->impl); + unsigned ptr_size = nir->info.cs.ptr_size; + if (nir->info.stage == MESA_SHADER_KERNEL) { + /* All the derefs created here will be used as GEP indices so force 32-bit */ + nir->info.cs.ptr_size = 32; + } + nir_foreach_function_impl(impl, nir) { + nir_builder b = nir_builder_create(impl); - nir_builder b; - nir_builder_init(&b, func->impl); + nir_variable *scratch_var = NULL; + if (nir->scratch_size) { + const struct glsl_type *scratch_type = glsl_array_type(glsl_uint_type(), DIV_ROUND_UP(nir->scratch_size, 4), 4); + scratch_var = nir_local_variable_create(impl, scratch_type, "lowered_scratch_mem"); + } - nir_foreach_block(block, func->impl) { + nir_foreach_block(block, impl) { nir_foreach_instr_safe(instr, block) { if (instr->type != nir_instr_type_intrinsic) continue; nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); switch (intr->intrinsic) { - -#define ATOMIC(op) \ - case nir_intrinsic_shared_atomic_##op: \ - progress |= lower_shared_atomic(&b, intr, \ - nir_intrinsic_shared_atomic_##op##_dxil); \ - break - - ATOMIC(add); - ATOMIC(imin); - ATOMIC(umin); - ATOMIC(imax); - ATOMIC(umax); - ATOMIC(and); - ATOMIC(or); - ATOMIC(xor); - ATOMIC(exchange); - ATOMIC(comp_swap); - -#undef ATOMIC + case nir_intrinsic_load_shared: + progress |= lower_32b_offset_load(&b, intr, shared_var); + break; + case nir_intrinsic_load_scratch: + progress |= lower_32b_offset_load(&b, intr, scratch_var); + break; + case nir_intrinsic_store_shared: + progress |= lower_32b_offset_store(&b, intr, shared_var); + break; + case nir_intrinsic_store_scratch: + progress |= lower_32b_offset_store(&b, intr, scratch_var); + break; + case nir_intrinsic_shared_atomic: + case nir_intrinsic_shared_atomic_swap: + progress |= lower_shared_atomic(&b, intr, shared_var); + break; default: break; } } } } + if (nir->info.stage == MESA_SHADER_KERNEL) { + nir->info.cs.ptr_size = ptr_size; + } return progress; } @@ -845,12 +756,12 @@ lower_deref_ssbo(nir_builder *b, nir_deref_instr *deref) /* We turn all deref_var into deref_cast and build a pointer value based on * the var binding which encodes the UAV id. 
*/ - nir_ssa_def *ptr = nir_imm_int64(b, (uint64_t)var->data.binding << 32); + nir_def *ptr = nir_imm_int64(b, (uint64_t)var->data.binding << 32); nir_deref_instr *deref_cast = nir_build_deref_cast(b, ptr, nir_var_mem_ssbo, deref->type, glsl_get_explicit_stride(var->type)); - nir_ssa_def_rewrite_uses(&deref->dest.ssa, - &deref_cast->dest.ssa); + nir_def_rewrite_uses(&deref->def, + &deref_cast->def); nir_instr_remove(&deref->instr); deref = deref_cast; @@ -869,8 +780,7 @@ dxil_nir_lower_deref_ssbo(nir_shader *nir) continue; assert(func->impl); - nir_builder b; - nir_builder_init(&b, func->impl); + nir_builder b = nir_builder_create(func->impl); nir_foreach_block(block, func->impl) { nir_foreach_instr_safe(instr, block) { @@ -914,10 +824,10 @@ lower_alu_deref_srcs(nir_builder *b, nir_alu_instr *alu) if (root_deref->deref_type != nir_deref_type_cast) continue; - nir_ssa_def *ptr = + nir_def *ptr = nir_iadd(b, root_deref->parent.ssa, nir_build_deref_offset(b, deref, cl_type_size_align)); - nir_instr_rewrite_src(&alu->instr, &alu->src[i].src, nir_src_for_ssa(ptr)); + nir_src_rewrite(&alu->src[i].src, ptr); progress = true; } @@ -934,9 +844,7 @@ dxil_nir_opt_alu_deref_srcs(nir_shader *nir) continue; assert(func->impl); - bool progress = false; - nir_builder b; - nir_builder_init(&b, func->impl); + nir_builder b = nir_builder_create(func->impl); nir_foreach_block(block, func->impl) { nir_foreach_instr_safe(instr, block) { @@ -952,115 +860,12 @@ dxil_nir_opt_alu_deref_srcs(nir_shader *nir) return progress; } -static nir_ssa_def * -memcpy_load_deref_elem(nir_builder *b, nir_deref_instr *parent, - nir_ssa_def *index) -{ - nir_deref_instr *deref; - - index = nir_i2i(b, index, nir_dest_bit_size(parent->dest)); - assert(parent->deref_type == nir_deref_type_cast); - deref = nir_build_deref_ptr_as_array(b, parent, index); - - return nir_load_deref(b, deref); -} - -static void -memcpy_store_deref_elem(nir_builder *b, nir_deref_instr *parent, - nir_ssa_def *index, nir_ssa_def *value) -{ - nir_deref_instr *deref; - - index = nir_i2i(b, index, nir_dest_bit_size(parent->dest)); - assert(parent->deref_type == nir_deref_type_cast); - deref = nir_build_deref_ptr_as_array(b, parent, index); - nir_store_deref(b, deref, value, 1); -} - -static bool -lower_memcpy_deref(nir_builder *b, nir_intrinsic_instr *intr) -{ - nir_deref_instr *dst_deref = nir_src_as_deref(intr->src[0]); - nir_deref_instr *src_deref = nir_src_as_deref(intr->src[1]); - assert(intr->src[2].is_ssa); - nir_ssa_def *num_bytes = intr->src[2].ssa; - - assert(dst_deref && src_deref); - - b->cursor = nir_after_instr(&intr->instr); - - dst_deref = nir_build_deref_cast(b, &dst_deref->dest.ssa, dst_deref->modes, - glsl_uint8_t_type(), 1); - src_deref = nir_build_deref_cast(b, &src_deref->dest.ssa, src_deref->modes, - glsl_uint8_t_type(), 1); - - /* - * We want to avoid 64b instructions, so let's assume we'll always be - * passed a value that fits in a 32b type and truncate the 64b value. 
- */ - num_bytes = nir_u2u32(b, num_bytes); - - nir_variable *loop_index_var = - nir_local_variable_create(b->impl, glsl_uint_type(), "loop_index"); - nir_deref_instr *loop_index_deref = nir_build_deref_var(b, loop_index_var); - nir_store_deref(b, loop_index_deref, nir_imm_int(b, 0), 1); - - nir_loop *loop = nir_push_loop(b); - nir_ssa_def *loop_index = nir_load_deref(b, loop_index_deref); - nir_ssa_def *cmp = nir_ige(b, loop_index, num_bytes); - nir_if *loop_check = nir_push_if(b, cmp); - nir_jump(b, nir_jump_break); - nir_pop_if(b, loop_check); - nir_ssa_def *val = memcpy_load_deref_elem(b, src_deref, loop_index); - memcpy_store_deref_elem(b, dst_deref, loop_index, val); - nir_store_deref(b, loop_index_deref, nir_iadd_imm(b, loop_index, 1), 1); - nir_pop_loop(b, loop); - nir_instr_remove(&intr->instr); - return true; -} - -bool -dxil_nir_lower_memcpy_deref(nir_shader *nir) -{ - bool progress = false; - - foreach_list_typed(nir_function, func, node, &nir->functions) { - if (!func->is_entrypoint) - continue; - assert(func->impl); - - nir_builder b; - nir_builder_init(&b, func->impl); - - nir_foreach_block(block, func->impl) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - - if (intr->intrinsic == nir_intrinsic_memcpy_deref) - progress |= lower_memcpy_deref(&b, intr); - } - } - } - - return progress; -} - static void cast_phi(nir_builder *b, nir_phi_instr *phi, unsigned new_bit_size) { nir_phi_instr *lowered = nir_phi_instr_create(b->shader); int num_components = 0; - int old_bit_size = phi->dest.ssa.bit_size; - - nir_op upcast_op = nir_type_conversion_op(nir_type_uint | old_bit_size, - nir_type_uint | new_bit_size, - nir_rounding_mode_undef); - nir_op downcast_op = nir_type_conversion_op(nir_type_uint | new_bit_size, - nir_type_uint | old_bit_size, - nir_rounding_mode_undef); + int old_bit_size = phi->def.bit_size; nir_foreach_phi_src(src, phi) { assert(num_components == 0 || num_components == src->src.ssa->num_components); @@ -1068,40 +873,34 @@ cast_phi(nir_builder *b, nir_phi_instr *phi, unsigned new_bit_size) b->cursor = nir_after_instr_and_phis(src->src.ssa->parent_instr); - nir_ssa_def *cast = nir_build_alu(b, upcast_op, src->src.ssa, NULL, NULL, NULL); - nir_phi_instr_add_src(lowered, src->pred, nir_src_for_ssa(cast)); + nir_def *cast = nir_u2uN(b, src->src.ssa, new_bit_size); + + nir_phi_instr_add_src(lowered, src->pred, cast); } - nir_ssa_dest_init(&lowered->instr, &lowered->dest, - num_components, new_bit_size, NULL); + nir_def_init(&lowered->instr, &lowered->def, num_components, + new_bit_size); b->cursor = nir_before_instr(&phi->instr); nir_builder_instr_insert(b, &lowered->instr); b->cursor = nir_after_phis(nir_cursor_current_block(b->cursor)); - nir_ssa_def *result = nir_build_alu(b, downcast_op, &lowered->dest.ssa, NULL, NULL, NULL); + nir_def *result = nir_u2uN(b, &lowered->def, old_bit_size); - nir_ssa_def_rewrite_uses(&phi->dest.ssa, result); + nir_def_rewrite_uses(&phi->def, result); nir_instr_remove(&phi->instr); } static bool upcast_phi_impl(nir_function_impl *impl, unsigned min_bit_size) { - nir_builder b; - nir_builder_init(&b, impl); + nir_builder b = nir_builder_create(impl); bool progress = false; nir_foreach_block_reverse(block, impl) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_phi) - continue; - - nir_phi_instr *phi = nir_instr_as_phi(instr); - assert(phi->dest.is_ssa); - - if (phi->dest.ssa.bit_size == 1 || - 
phi->dest.ssa.bit_size >= min_bit_size) + nir_foreach_phi_safe(phi, block) { + if (phi->def.bit_size == 1 || + phi->def.bit_size >= min_bit_size) continue; cast_phi(&b, phi, min_bit_size); @@ -1124,16 +923,15 @@ dxil_nir_lower_upcast_phis(nir_shader *shader, unsigned min_bit_size) { bool progress = false; - nir_foreach_function(function, shader) { - if (function->impl) - progress |= upcast_phi_impl(function->impl, min_bit_size); + nir_foreach_function_impl(impl, shader) { + progress |= upcast_phi_impl(impl, min_bit_size); } return progress; } struct dxil_nir_split_clip_cull_distance_params { - nir_variable *new_var; + nir_variable *new_var[2]; nir_shader *shader; }; @@ -1151,7 +949,6 @@ dxil_nir_split_clip_cull_distance_instr(nir_builder *b, void *cb_data) { struct dxil_nir_split_clip_cull_distance_params *params = cb_data; - nir_variable *new_var = params->new_var; if (instr->type != nir_instr_type_deref) return false; @@ -1164,6 +961,9 @@ dxil_nir_split_clip_cull_distance_instr(nir_builder *b, !var->data.compact) return false; + unsigned new_var_idx = var->data.mode == nir_var_shader_in ? 0 : 1; + nir_variable *new_var = params->new_var[new_var_idx]; + /* The location should only be inside clip distance, because clip * and cull should've been merged by nir_lower_clip_cull_distance_arrays() */ @@ -1173,15 +973,23 @@ dxil_nir_split_clip_cull_distance_instr(nir_builder *b, /* The deref chain to the clip/cull variables should be simple, just the * var and an array with a constant index, otherwise more lowering/optimization * might be needed before this pass, e.g. copy prop, lower_io_to_temporaries, - * split_var_copies, and/or lower_var_copies + * split_var_copies, and/or lower_var_copies. In the case of arrayed I/O like + * inputs to the tessellation or geometry stages, there might be a second level + * of array index. 
*/ assert(deref->deref_type == nir_deref_type_var || deref->deref_type == nir_deref_type_array); b->cursor = nir_before_instr(instr); + unsigned arrayed_io_length = 0; + const struct glsl_type *old_type = var->type; + if (nir_is_arrayed_io(var, b->shader->info.stage)) { + arrayed_io_length = glsl_array_size(old_type); + old_type = glsl_get_array_element(old_type); + } if (!new_var) { /* Update lengths for new and old vars */ - int old_length = glsl_array_size(var->type); + int old_length = glsl_array_size(old_type); int new_length = (old_length + var->data.location_frac) - 4; old_length -= new_length; @@ -1191,12 +999,16 @@ dxil_nir_split_clip_cull_distance_instr(nir_builder *b, new_var = nir_variable_clone(var, params->shader); nir_shader_add_variable(params->shader, new_var); - assert(glsl_get_base_type(glsl_get_array_element(var->type)) == GLSL_TYPE_FLOAT); + assert(glsl_get_base_type(glsl_get_array_element(old_type)) == GLSL_TYPE_FLOAT); var->type = glsl_array_type(glsl_float_type(), old_length, 0); new_var->type = glsl_array_type(glsl_float_type(), new_length, 0); + if (arrayed_io_length) { + var->type = glsl_array_type(var->type, arrayed_io_length, 0); + new_var->type = glsl_array_type(new_var->type, arrayed_io_length, 0); + } new_var->data.location++; new_var->data.location_frac = 0; - params->new_var = new_var; + params->new_var[new_var_idx] = new_var; } /* Update the type for derefs of the old var */ @@ -1205,6 +1017,14 @@ dxil_nir_split_clip_cull_distance_instr(nir_builder *b, return false; } + if (glsl_type_is_array(deref->type)) { + assert(arrayed_io_length > 0); + deref->type = glsl_get_array_element(var->type); + return false; + } + + assert(glsl_get_base_type(deref->type) == GLSL_TYPE_FLOAT); + nir_const_value *index = nir_src_as_const_value(deref->arr.index); assert(index); @@ -1218,8 +1038,14 @@ dxil_nir_split_clip_cull_distance_instr(nir_builder *b, return false; nir_deref_instr *new_var_deref = nir_build_deref_var(b, new_var); - nir_deref_instr *new_array_deref = nir_build_deref_array(b, new_var_deref, nir_imm_int(b, total_index % 4)); - nir_ssa_def_rewrite_uses(&deref->dest.ssa, &new_array_deref->dest.ssa); + nir_deref_instr *new_intermediate_deref = new_var_deref; + if (arrayed_io_length) { + nir_deref_instr *parent = nir_src_as_deref(deref->parent); + assert(parent->deref_type == nir_deref_type_array); + new_intermediate_deref = nir_build_deref_array(b, new_intermediate_deref, parent->arr.index.ssa); + } + nir_deref_instr *new_array_deref = nir_build_deref_array(b, new_intermediate_deref, nir_imm_int(b, total_index % 4)); + nir_def_rewrite_uses(&deref->def, &new_array_deref->def); return true; } @@ -1227,7 +1053,7 @@ bool dxil_nir_split_clip_cull_distance(nir_shader *shader) { struct dxil_nir_split_clip_cull_distance_params params = { - .new_var = NULL, + .new_var = { NULL, NULL }, .shader = shader, }; nir_shader_instructions_pass(shader, @@ -1236,7 +1062,7 @@ dxil_nir_split_clip_cull_distance(nir_shader *shader) nir_metadata_dominance | nir_metadata_loop_analysis, ¶ms); - return params.new_var != NULL; + return params.new_var[0] != NULL || params.new_var[1] != NULL; } static bool @@ -1244,6 +1070,36 @@ dxil_nir_lower_double_math_instr(nir_builder *b, nir_instr *instr, UNUSED void *cb_data) { + if (instr->type == nir_instr_type_intrinsic) { + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + switch (intr->intrinsic) { + case nir_intrinsic_reduce: + case nir_intrinsic_exclusive_scan: + case nir_intrinsic_inclusive_scan: + break; + default: + return false; + 
} + if (intr->def.bit_size != 64) + return false; + nir_op reduction = nir_intrinsic_reduction_op(intr); + switch (reduction) { + case nir_op_fmul: + case nir_op_fadd: + case nir_op_fmin: + case nir_op_fmax: + break; + default: + return false; + } + b->cursor = nir_before_instr(instr); + nir_src_rewrite(&intr->src[0], nir_pack_double_2x32_dxil(b, nir_unpack_64_2x32(b, intr->src[0].ssa))); + b->cursor = nir_after_instr(instr); + nir_def *result = nir_pack_64_2x32(b, nir_unpack_double_2x32_dxil(b, &intr->def)); + nir_def_rewrite_uses_after(&intr->def, result, result->parent_instr); + return true; + } + if (instr->type != nir_instr_type_alu) return false; @@ -1262,22 +1118,33 @@ dxil_nir_lower_double_math_instr(nir_builder *b, for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; ++i) { if (nir_alu_type_get_base_type(nir_op_infos[alu->op].input_types[i]) == nir_type_float && alu->src[i].src.ssa->bit_size == 64) { - nir_ssa_def *packed_double = nir_channel(b, alu->src[i].src.ssa, alu->src[i].swizzle[0]); - nir_ssa_def *unpacked_double = nir_unpack_64_2x32(b, packed_double); - nir_ssa_def *repacked_double = nir_pack_double_2x32_dxil(b, unpacked_double); - nir_instr_rewrite_src_ssa(instr, &alu->src[i].src, repacked_double); - memset(alu->src[i].swizzle, 0, ARRAY_SIZE(alu->src[i].swizzle)); + unsigned num_components = nir_op_infos[alu->op].input_sizes[i]; + if (!num_components) + num_components = alu->def.num_components; + nir_def *components[NIR_MAX_VEC_COMPONENTS]; + for (unsigned c = 0; c < num_components; ++c) { + nir_def *packed_double = nir_channel(b, alu->src[i].src.ssa, alu->src[i].swizzle[c]); + nir_def *unpacked_double = nir_unpack_64_2x32(b, packed_double); + components[c] = nir_pack_double_2x32_dxil(b, unpacked_double); + alu->src[i].swizzle[c] = c; + } + nir_src_rewrite(&alu->src[i].src, + nir_vec(b, components, num_components)); progress = true; } } if (nir_alu_type_get_base_type(nir_op_infos[alu->op].output_type) == nir_type_float && - alu->dest.dest.ssa.bit_size == 64) { + alu->def.bit_size == 64) { b->cursor = nir_after_instr(&alu->instr); - nir_ssa_def *packed_double = &alu->dest.dest.ssa; - nir_ssa_def *unpacked_double = nir_unpack_double_2x32_dxil(b, packed_double); - nir_ssa_def *repacked_double = nir_pack_64_2x32(b, unpacked_double); - nir_ssa_def_rewrite_uses_after(packed_double, repacked_double, unpacked_double->parent_instr); + nir_def *components[NIR_MAX_VEC_COMPONENTS]; + for (unsigned c = 0; c < alu->def.num_components; ++c) { + nir_def *packed_double = nir_channel(b, &alu->def, c); + nir_def *unpacked_double = nir_unpack_double_2x32_dxil(b, packed_double); + components[c] = nir_pack_64_2x32(b, unpacked_double); + } + nir_def *repacked_dvec = nir_vec(b, components, alu->def.num_components); + nir_def_rewrite_uses_after(&alu->def, repacked_dvec, repacked_dvec->parent_instr); progress = true; } @@ -1313,8 +1180,6 @@ lower_system_value_to_zero_filter(const nir_instr* instr, const void* cb_state) if (!nir_intrinsic_infos[intrin->intrinsic].has_dest) return false; - assert(intrin->dest.is_ssa); - zero_system_values_state* state = (zero_system_values_state*)cb_state; for (uint32_t i = 0; i < state->count; ++i) { gl_system_value value = state->values[i]; @@ -1337,7 +1202,7 @@ lower_system_value_to_zero_filter(const nir_instr* instr, const void* cb_state) return false; } -static nir_ssa_def* +static nir_def* lower_system_value_to_zero_instr(nir_builder* b, nir_instr* instr, void* _state) { return nir_imm_int(b, 0); @@ -1355,22 +1220,57 @@ 
dxil_nir_lower_system_values_to_zero(nir_shader* shader, &state); } -static const struct glsl_type * -get_bare_samplers_for_type(const struct glsl_type *type) +static void +lower_load_local_group_size(nir_builder *b, nir_intrinsic_instr *intr) { - if (glsl_type_is_sampler(type)) { - if (glsl_sampler_type_is_shadow(type)) - return glsl_bare_shadow_sampler_type(); - else - return glsl_bare_sampler_type(); - } else if (glsl_type_is_array(type)) { - return glsl_array_type( - get_bare_samplers_for_type(glsl_get_array_element(type)), - glsl_get_length(type), - 0 /*explicit size*/); + b->cursor = nir_after_instr(&intr->instr); + + nir_const_value v[3] = { + nir_const_value_for_int(b->shader->info.workgroup_size[0], 32), + nir_const_value_for_int(b->shader->info.workgroup_size[1], 32), + nir_const_value_for_int(b->shader->info.workgroup_size[2], 32) + }; + nir_def *size = nir_build_imm(b, 3, 32, v); + nir_def_rewrite_uses(&intr->def, size); + nir_instr_remove(&intr->instr); +} + +static bool +lower_system_values_impl(nir_builder *b, nir_intrinsic_instr *intr, + void *_state) +{ + switch (intr->intrinsic) { + case nir_intrinsic_load_workgroup_size: + lower_load_local_group_size(b, intr); + return true; + default: + return false; } - assert(!"Unexpected type"); - return NULL; +} + +bool +dxil_nir_lower_system_values(nir_shader *shader) +{ + return nir_shader_intrinsics_pass(shader, lower_system_values_impl, + nir_metadata_block_index | nir_metadata_dominance | nir_metadata_loop_analysis, + NULL); +} + +static const struct glsl_type * +get_bare_samplers_for_type(const struct glsl_type *type, bool is_shadow) +{ + const struct glsl_type *base_sampler_type = + is_shadow ? + glsl_bare_shadow_sampler_type() : glsl_bare_sampler_type(); + return glsl_type_wrap_in_arrays(base_sampler_type, type); +} + +static const struct glsl_type * +get_textures_for_sampler_type(const struct glsl_type *type) +{ + return glsl_type_wrap_in_arrays( + glsl_sampler_type_to_texture( + glsl_without_array(type)), type); } static bool @@ -1380,37 +1280,49 @@ redirect_sampler_derefs(struct nir_builder *b, nir_instr *instr, void *data) return false; nir_tex_instr *tex = nir_instr_as_tex(instr); - if (!nir_tex_instr_need_sampler(tex)) - return false; int sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_deref); if (sampler_idx == -1) { - /* No derefs, must be using indices */ - struct hash_entry *hash_entry = _mesa_hash_table_u64_search(data, tex->sampler_index); + /* No sampler deref - does this instruction even need a sampler? If not, + * sampler_index doesn't necessarily point to a sampler, so early-out. 
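+       * (For example, texel fetches and texture size/level queries never read
+       * sampler state, so nir_tex_instr_need_sampler() returns false for them.)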
+ */ + if (!nir_tex_instr_need_sampler(tex)) + return false; + + /* No derefs but needs a sampler, must be using indices */ + nir_variable *bare_sampler = _mesa_hash_table_u64_search(data, tex->sampler_index); /* Already have a bare sampler here */ - if (hash_entry) + if (bare_sampler) return false; - nir_variable *typed_sampler = NULL; + nir_variable *old_sampler = NULL; nir_foreach_variable_with_modes(var, b->shader, nir_var_uniform) { if (var->data.binding <= tex->sampler_index && - var->data.binding + glsl_type_get_sampler_count(var->type) > tex->sampler_index) { - /* Already have a bare sampler for this binding, add it to the table */ - if (glsl_get_sampler_result_type(glsl_without_array(var->type)) == GLSL_TYPE_VOID) { + var->data.binding + glsl_type_get_sampler_count(var->type) > + tex->sampler_index) { + + /* Already have a bare sampler for this binding and it is of the + * correct type, add it to the table */ + if (glsl_type_is_bare_sampler(glsl_without_array(var->type)) && + glsl_sampler_type_is_shadow(glsl_without_array(var->type)) == + tex->is_shadow) { _mesa_hash_table_u64_insert(data, tex->sampler_index, var); return false; } - typed_sampler = var; + old_sampler = var; } } - /* Clone the typed sampler to a bare sampler and we're done */ - assert(typed_sampler); - nir_variable *bare_sampler = nir_variable_clone(typed_sampler, b->shader); - bare_sampler->type = get_bare_samplers_for_type(typed_sampler->type); + assert(old_sampler); + + /* Clone the original sampler to a bare sampler of the correct type */ + bare_sampler = nir_variable_clone(old_sampler, b->shader); nir_shader_add_variable(b->shader, bare_sampler); + + bare_sampler->type = + get_bare_samplers_for_type(old_sampler->type, tex->is_shadow); _mesa_hash_table_u64_insert(data, tex->sampler_index, bare_sampler); return true; } @@ -1423,20 +1335,101 @@ redirect_sampler_derefs(struct nir_builder *b, nir_instr *instr, void *data) nir_deref_instr *old_tail = path.path[0]; assert(old_tail->deref_type == nir_deref_type_var); nir_variable *old_var = old_tail->var; - if (glsl_get_sampler_result_type(glsl_without_array(old_var->type)) == GLSL_TYPE_VOID) { + if (glsl_type_is_bare_sampler(glsl_without_array(old_var->type)) && + glsl_sampler_type_is_shadow(glsl_without_array(old_var->type)) == + tex->is_shadow) { nir_deref_path_finish(&path); return false; } - struct hash_entry *hash_entry = _mesa_hash_table_u64_search(data, old_var->data.binding); - nir_variable *new_var; - if (hash_entry) { - new_var = hash_entry->data; - } else { + uint64_t var_key = ((uint64_t)old_var->data.descriptor_set << 32) | + old_var->data.binding; + nir_variable *new_var = _mesa_hash_table_u64_search(data, var_key); + if (!new_var) { + new_var = nir_variable_clone(old_var, b->shader); + nir_shader_add_variable(b->shader, new_var); + new_var->type = + get_bare_samplers_for_type(old_var->type, tex->is_shadow); + _mesa_hash_table_u64_insert(data, var_key, new_var); + } + + b->cursor = nir_after_instr(&old_tail->instr); + nir_deref_instr *new_tail = nir_build_deref_var(b, new_var); + + for (unsigned i = 1; path.path[i]; ++i) { + b->cursor = nir_after_instr(&path.path[i]->instr); + new_tail = nir_build_deref_follower(b, new_tail, path.path[i]); + } + + nir_deref_path_finish(&path); + nir_src_rewrite(&tex->src[sampler_idx].src, &new_tail->def); + return true; +} + +static bool +redirect_texture_derefs(struct nir_builder *b, nir_instr *instr, void *data) +{ + if (instr->type != nir_instr_type_tex) + return false; + + nir_tex_instr *tex = 
nir_instr_as_tex(instr); + + int texture_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref); + if (texture_idx == -1) { + /* No derefs, must be using indices */ + nir_variable *bare_sampler = _mesa_hash_table_u64_search(data, tex->texture_index); + + /* Already have a texture here */ + if (bare_sampler) + return false; + + nir_variable *typed_sampler = NULL; + nir_foreach_variable_with_modes(var, b->shader, nir_var_uniform) { + if (var->data.binding <= tex->texture_index && + var->data.binding + glsl_type_get_texture_count(var->type) > tex->texture_index) { + /* Already have a texture for this binding, add it to the table */ + _mesa_hash_table_u64_insert(data, tex->texture_index, var); + return false; + } + + if (var->data.binding <= tex->texture_index && + var->data.binding + glsl_type_get_sampler_count(var->type) > tex->texture_index && + !glsl_type_is_bare_sampler(glsl_without_array(var->type))) { + typed_sampler = var; + } + } + + /* Clone the typed sampler to a texture and we're done */ + assert(typed_sampler); + bare_sampler = nir_variable_clone(typed_sampler, b->shader); + bare_sampler->type = get_textures_for_sampler_type(typed_sampler->type); + nir_shader_add_variable(b->shader, bare_sampler); + _mesa_hash_table_u64_insert(data, tex->texture_index, bare_sampler); + return true; + } + + /* Using derefs, means we have to rewrite the deref chain in addition to cloning */ + nir_deref_instr *final_deref = nir_src_as_deref(tex->src[texture_idx].src); + nir_deref_path path; + nir_deref_path_init(&path, final_deref, NULL); + + nir_deref_instr *old_tail = path.path[0]; + assert(old_tail->deref_type == nir_deref_type_var); + nir_variable *old_var = old_tail->var; + if (glsl_type_is_texture(glsl_without_array(old_var->type)) || + glsl_type_is_image(glsl_without_array(old_var->type))) { + nir_deref_path_finish(&path); + return false; + } + + uint64_t var_key = ((uint64_t)old_var->data.descriptor_set << 32) | + old_var->data.binding; + nir_variable *new_var = _mesa_hash_table_u64_search(data, var_key); + if (!new_var) { new_var = nir_variable_clone(old_var, b->shader); - new_var->type = get_bare_samplers_for_type(old_var->type); + new_var->type = get_textures_for_sampler_type(old_var->type); nir_shader_add_variable(b->shader, new_var); - _mesa_hash_table_u64_insert(data, old_var->data.binding, new_var); + _mesa_hash_table_u64_insert(data, var_key, new_var); } b->cursor = nir_after_instr(&old_tail->instr); @@ -1448,49 +1441,102 @@ redirect_sampler_derefs(struct nir_builder *b, nir_instr *instr, void *data) } nir_deref_path_finish(&path); - nir_instr_rewrite_src_ssa(&tex->instr, &tex->src[sampler_idx].src, &new_tail->dest.ssa); + nir_src_rewrite(&tex->src[texture_idx].src, &new_tail->def); return true; } bool -dxil_nir_create_bare_samplers(nir_shader *nir) +dxil_nir_split_typed_samplers(nir_shader *nir) { - struct hash_table_u64 *sampler_to_bare = _mesa_hash_table_u64_create(NULL); + struct hash_table_u64 *hash_table = _mesa_hash_table_u64_create(NULL); bool progress = nir_shader_instructions_pass(nir, redirect_sampler_derefs, - nir_metadata_block_index | nir_metadata_dominance | nir_metadata_loop_analysis, sampler_to_bare); + nir_metadata_block_index | nir_metadata_dominance | nir_metadata_loop_analysis, hash_table); + + _mesa_hash_table_u64_clear(hash_table); - _mesa_hash_table_u64_destroy(sampler_to_bare); + progress |= nir_shader_instructions_pass(nir, redirect_texture_derefs, + nir_metadata_block_index | nir_metadata_dominance | nir_metadata_loop_analysis, hash_table); + + 
_mesa_hash_table_u64_destroy(hash_table); return progress; } +static bool +lower_sysval_to_load_input_impl(nir_builder *b, nir_intrinsic_instr *intr, + void *data) +{ + gl_system_value sysval = SYSTEM_VALUE_MAX; + switch (intr->intrinsic) { + case nir_intrinsic_load_instance_id: + sysval = SYSTEM_VALUE_INSTANCE_ID; + break; + case nir_intrinsic_load_vertex_id_zero_base: + sysval = SYSTEM_VALUE_VERTEX_ID_ZERO_BASE; + break; + default: + return false; + } + + nir_variable **sysval_vars = (nir_variable **)data; + nir_variable *var = sysval_vars[sysval]; + assert(var); + + const nir_alu_type dest_type = nir_get_nir_type_for_glsl_type(var->type); + const unsigned bit_size = intr->def.bit_size; + + b->cursor = nir_before_instr(&intr->instr); + nir_def *result = nir_load_input(b, intr->def.num_components, bit_size, nir_imm_int(b, 0), + .base = var->data.driver_location, .dest_type = dest_type); + + nir_def_rewrite_uses(&intr->def, result); + return true; +} + +bool +dxil_nir_lower_sysval_to_load_input(nir_shader *s, nir_variable **sysval_vars) +{ + return nir_shader_intrinsics_pass(s, lower_sysval_to_load_input_impl, + nir_metadata_block_index | nir_metadata_dominance, + sysval_vars); +} + /* Comparison function to sort io values so that first come normal varyings, * then system values, and then system generated values. */ static int variable_location_cmp(const nir_variable* a, const nir_variable* b) { - // Sort by driver_location, location, then index - return a->data.driver_location != b->data.driver_location ? - a->data.driver_location - b->data.driver_location : - a->data.location != b->data.location ? - a->data.location - b->data.location : - a->data.index - b->data.index; + // Sort by stream, driver_location, location, location_frac, then index + // If all else is equal, sort full vectors before partial ones + unsigned a_location = a->data.location; + if (a_location >= VARYING_SLOT_PATCH0) + a_location -= VARYING_SLOT_PATCH0; + unsigned b_location = b->data.location; + if (b_location >= VARYING_SLOT_PATCH0) + b_location -= VARYING_SLOT_PATCH0; + unsigned a_stream = a->data.stream & ~NIR_STREAM_PACKED; + unsigned b_stream = b->data.stream & ~NIR_STREAM_PACKED; + return a_stream != b_stream ? + a_stream - b_stream : + a->data.driver_location != b->data.driver_location ? + a->data.driver_location - b->data.driver_location : + a_location != b_location ? + a_location - b_location : + a->data.location_frac != b->data.location_frac ? + a->data.location_frac - b->data.location_frac : + a->data.index != b->data.index ? + a->data.index - b->data.index : + glsl_get_component_slots(b->type) - glsl_get_component_slots(a->type); } /* Order varyings according to driver location */ -uint64_t +void dxil_sort_by_driver_location(nir_shader* s, nir_variable_mode modes) { nir_sort_variables_with_modes(s, variable_location_cmp, modes); - - uint64_t result = 0; - nir_foreach_variable_with_modes(var, s, modes) { - result |= 1ull << var->data.location; - } - return result; } /* Sort PS outputs so that color outputs come first */ @@ -1521,31 +1567,1427 @@ dxil_sort_ps_outputs(nir_shader* s) unsigned driver_loc = 0; nir_foreach_variable_with_modes(var, s, nir_var_shader_out) { - var->data.driver_location = driver_loc++; + /* Fractional vars should use the same driver_location as the base. These will + * get fully merged during signature processing. + */ + var->data.driver_location = var->data.location_frac ? 
driver_loc - 1 : driver_loc++; + } +} + +enum dxil_sysvalue_type { + DXIL_NO_SYSVALUE = 0, + DXIL_USED_SYSVALUE, + DXIL_UNUSED_NO_SYSVALUE, + DXIL_SYSVALUE, + DXIL_GENERATED_SYSVALUE, +}; + +static enum dxil_sysvalue_type +nir_var_to_dxil_sysvalue_type(nir_variable *var, uint64_t other_stage_mask, + const BITSET_WORD *other_stage_frac_mask) +{ + switch (var->data.location) { + case VARYING_SLOT_FACE: + return DXIL_GENERATED_SYSVALUE; + case VARYING_SLOT_POS: + case VARYING_SLOT_PRIMITIVE_ID: + case VARYING_SLOT_CLIP_DIST0: + case VARYING_SLOT_CLIP_DIST1: + case VARYING_SLOT_PSIZ: + case VARYING_SLOT_TESS_LEVEL_INNER: + case VARYING_SLOT_TESS_LEVEL_OUTER: + case VARYING_SLOT_VIEWPORT: + case VARYING_SLOT_LAYER: + case VARYING_SLOT_VIEW_INDEX: + if (!((1ull << var->data.location) & other_stage_mask)) + return DXIL_SYSVALUE; + return DXIL_USED_SYSVALUE; + default: + if (var->data.location < VARYING_SLOT_PATCH0 && + !((1ull << var->data.location) & other_stage_mask)) + return DXIL_UNUSED_NO_SYSVALUE; + if (var->data.location_frac && other_stage_frac_mask && + var->data.location >= VARYING_SLOT_VAR0 && + !BITSET_TEST(other_stage_frac_mask, ((var->data.location - VARYING_SLOT_VAR0) * 4 + var->data.location_frac))) + return DXIL_UNUSED_NO_SYSVALUE; + return DXIL_NO_SYSVALUE; } } /* Order between stage values so that normal varyings come first, * then sysvalues and then system generated values. */ -uint64_t +void dxil_reassign_driver_locations(nir_shader* s, nir_variable_mode modes, - uint64_t other_stage_mask) + uint64_t other_stage_mask, const BITSET_WORD *other_stage_frac_mask) { nir_foreach_variable_with_modes_safe(var, s, modes) { /* We use the driver_location here to avoid introducing a new * struct or member variable here. The true, updated driver location * will be written below, after sorting */ - var->data.driver_location = nir_var_to_dxil_sysvalue_type(var, other_stage_mask); + var->data.driver_location = nir_var_to_dxil_sysvalue_type(var, other_stage_mask, other_stage_frac_mask); } nir_sort_variables_with_modes(s, variable_location_cmp, modes); - uint64_t result = 0; - unsigned driver_loc = 0; + unsigned driver_loc = 0, driver_patch_loc = 0; + nir_foreach_variable_with_modes(var, s, modes) { + /* Overlap patches with non-patch */ + var->data.driver_location = var->data.patch ? + driver_patch_loc++ : driver_loc++; + } +} + +static bool +lower_ubo_array_one_to_static(struct nir_builder *b, + nir_intrinsic_instr *intrin, + void *cb_data) +{ + if (intrin->intrinsic != nir_intrinsic_load_vulkan_descriptor) + return false; + + nir_variable *var = + nir_get_binding_variable(b->shader, nir_chase_binding(intrin->src[0])); + + if (!var) + return false; + + if (!glsl_type_is_array(var->type) || glsl_array_size(var->type) != 1) + return false; + + nir_intrinsic_instr *index = nir_src_as_intrinsic(intrin->src[0]); + /* We currently do not support reindex */ + assert(index && index->intrinsic == nir_intrinsic_vulkan_resource_index); + + if (nir_src_is_const(index->src[0]) && nir_src_as_uint(index->src[0]) == 0) + return false; + + if (nir_intrinsic_desc_type(index) != VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) + return false; + + b->cursor = nir_instr_remove(&index->instr); + + // Indexing out of bounds on array of UBOs is considered undefined + // behavior. Therefore, we just hardcode all the index to 0. 
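+   // With the index forced to 0, the one-element UBO array can be treated as a
+   // single static binding, which is the whole point of this pass.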
+ uint8_t bit_size = index->def.bit_size; + nir_def *zero = nir_imm_intN_t(b, 0, bit_size); + nir_def *dest = + nir_vulkan_resource_index(b, index->num_components, bit_size, zero, + .desc_set = nir_intrinsic_desc_set(index), + .binding = nir_intrinsic_binding(index), + .desc_type = nir_intrinsic_desc_type(index)); + + nir_def_rewrite_uses(&index->def, dest); + + return true; +} + +bool +dxil_nir_lower_ubo_array_one_to_static(nir_shader *s) +{ + bool progress = nir_shader_intrinsics_pass(s, + lower_ubo_array_one_to_static, + nir_metadata_none, NULL); + + return progress; +} + +static bool +is_fquantize2f16(const nir_instr *instr, const void *data) +{ + if (instr->type != nir_instr_type_alu) + return false; + + nir_alu_instr *alu = nir_instr_as_alu(instr); + return alu->op == nir_op_fquantize2f16; +} + +static nir_def * +lower_fquantize2f16(struct nir_builder *b, nir_instr *instr, void *data) +{ + /* + * SpvOpQuantizeToF16 documentation says: + * + * " + * If Value is an infinity, the result is the same infinity. + * If Value is a NaN, the result is a NaN, but not necessarily the same NaN. + * If Value is positive with a magnitude too large to represent as a 16-bit + * floating-point value, the result is positive infinity. If Value is negative + * with a magnitude too large to represent as a 16-bit floating-point value, + * the result is negative infinity. If the magnitude of Value is too small to + * represent as a normalized 16-bit floating-point value, the result may be + * either +0 or -0. + * " + * + * which we turn into: + * + * if (val < MIN_FLOAT16) + * return -INFINITY; + * else if (val > MAX_FLOAT16) + * return -INFINITY; + * else if (fabs(val) < SMALLEST_NORMALIZED_FLOAT16 && sign(val) != 0) + * return -0.0f; + * else if (fabs(val) < SMALLEST_NORMALIZED_FLOAT16 && sign(val) == 0) + * return +0.0f; + * else + * return round(val); + */ + nir_alu_instr *alu = nir_instr_as_alu(instr); + nir_def *src = + alu->src[0].src.ssa; + + nir_def *neg_inf_cond = + nir_flt_imm(b, src, -65504.0f); + nir_def *pos_inf_cond = + nir_fgt_imm(b, src, 65504.0f); + nir_def *zero_cond = + nir_flt_imm(b, nir_fabs(b, src), ldexpf(1.0, -14)); + nir_def *zero = nir_iand_imm(b, src, 1 << 31); + nir_def *round = nir_iand_imm(b, src, ~BITFIELD_MASK(13)); + + nir_def *res = + nir_bcsel(b, neg_inf_cond, nir_imm_float(b, -INFINITY), round); + res = nir_bcsel(b, pos_inf_cond, nir_imm_float(b, INFINITY), res); + res = nir_bcsel(b, zero_cond, zero, res); + return res; +} + +bool +dxil_nir_lower_fquantize2f16(nir_shader *s) +{ + return nir_shader_lower_instructions(s, is_fquantize2f16, lower_fquantize2f16, NULL); +} + +static bool +fix_io_uint_deref_types(struct nir_builder *builder, nir_instr *instr, void *data) +{ + if (instr->type != nir_instr_type_deref) + return false; + + nir_deref_instr *deref = nir_instr_as_deref(instr); + nir_variable *var = nir_deref_instr_get_variable(deref); + + if (var == data) { + deref->type = glsl_type_wrap_in_arrays(glsl_uint_type(), deref->type); + return true; + } + + return false; +} + +static bool +fix_io_uint_type(nir_shader *s, nir_variable_mode modes, int slot) +{ + nir_variable *fixed_var = NULL; + nir_foreach_variable_with_modes(var, s, modes) { + if (var->data.location == slot) { + const struct glsl_type *plain_type = glsl_without_array(var->type); + if (plain_type == glsl_uint_type()) + return false; + + assert(plain_type == glsl_int_type()); + var->type = glsl_type_wrap_in_arrays(glsl_uint_type(), var->type); + fixed_var = var; + break; + } + } + + assert(fixed_var); + + 
return nir_shader_instructions_pass(s, fix_io_uint_deref_types, + nir_metadata_all, fixed_var); +} + +bool +dxil_nir_fix_io_uint_type(nir_shader *s, uint64_t in_mask, uint64_t out_mask) +{ + if (!(s->info.outputs_written & out_mask) && + !(s->info.inputs_read & in_mask)) + return false; + + bool progress = false; + + while (in_mask) { + int slot = u_bit_scan64(&in_mask); + progress |= (s->info.inputs_read & (1ull << slot)) && + fix_io_uint_type(s, nir_var_shader_in, slot); + } + + while (out_mask) { + int slot = u_bit_scan64(&out_mask); + progress |= (s->info.outputs_written & (1ull << slot)) && + fix_io_uint_type(s, nir_var_shader_out, slot); + } + + return progress; +} + +static bool +lower_kill(struct nir_builder *builder, nir_intrinsic_instr *intr, + void *_cb_data) +{ + if (intr->intrinsic != nir_intrinsic_discard && + intr->intrinsic != nir_intrinsic_terminate && + intr->intrinsic != nir_intrinsic_discard_if && + intr->intrinsic != nir_intrinsic_terminate_if) + return false; + + builder->cursor = nir_instr_remove(&intr->instr); + nir_def *condition; + + if (intr->intrinsic == nir_intrinsic_discard || + intr->intrinsic == nir_intrinsic_terminate) { + nir_demote(builder); + condition = nir_imm_true(builder); + } else { + nir_demote_if(builder, intr->src[0].ssa); + condition = intr->src[0].ssa; + } + + /* Create a new block by branching on the discard condition so that this return + * is definitely the last instruction in its own block */ + nir_if *nif = nir_push_if(builder, condition); + nir_jump(builder, nir_jump_return); + nir_pop_if(builder, nif); + + return true; +} + +bool +dxil_nir_lower_discard_and_terminate(nir_shader *s) +{ + if (s->info.stage != MESA_SHADER_FRAGMENT) + return false; + + // This pass only works if all functions have been inlined + assert(exec_list_length(&s->functions) == 1); + return nir_shader_intrinsics_pass(s, lower_kill, nir_metadata_none, NULL); +} + +static bool +update_writes(struct nir_builder *b, nir_intrinsic_instr *intr, void *_state) +{ + if (intr->intrinsic != nir_intrinsic_store_output) + return false; + + nir_io_semantics io = nir_intrinsic_io_semantics(intr); + if (io.location != VARYING_SLOT_POS) + return false; + + nir_def *src = intr->src[0].ssa; + unsigned write_mask = nir_intrinsic_write_mask(intr); + if (src->num_components == 4 && write_mask == 0xf) + return false; + + b->cursor = nir_before_instr(&intr->instr); + unsigned first_comp = nir_intrinsic_component(intr); + nir_def *channels[4] = { NULL, NULL, NULL, NULL }; + assert(first_comp + src->num_components <= ARRAY_SIZE(channels)); + for (unsigned i = 0; i < src->num_components; ++i) + if (write_mask & (1 << i)) + channels[i + first_comp] = nir_channel(b, src, i); + for (unsigned i = 0; i < 4; ++i) + if (!channels[i]) + channels[i] = nir_imm_intN_t(b, 0, src->bit_size); + + intr->num_components = 4; + nir_src_rewrite(&intr->src[0], nir_vec(b, channels, 4)); + nir_intrinsic_set_component(intr, 0); + nir_intrinsic_set_write_mask(intr, 0xf); + return true; +} + +bool +dxil_nir_ensure_position_writes(nir_shader *s) +{ + if (s->info.stage != MESA_SHADER_VERTEX && + s->info.stage != MESA_SHADER_GEOMETRY && + s->info.stage != MESA_SHADER_TESS_EVAL) + return false; + if ((s->info.outputs_written & VARYING_BIT_POS) == 0) + return false; + + return nir_shader_intrinsics_pass(s, update_writes, + nir_metadata_block_index | nir_metadata_dominance, + NULL); +} + +static bool +is_sample_pos(const nir_instr *instr, const void *_data) +{ + if (instr->type != nir_instr_type_intrinsic) + return false; 
+ nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + return intr->intrinsic == nir_intrinsic_load_sample_pos; +} + +static nir_def * +lower_sample_pos(nir_builder *b, nir_instr *instr, void *_data) +{ + return nir_load_sample_pos_from_id(b, 32, nir_load_sample_id(b)); +} + +bool +dxil_nir_lower_sample_pos(nir_shader *s) +{ + return nir_shader_lower_instructions(s, is_sample_pos, lower_sample_pos, NULL); +} + +static bool +lower_subgroup_id(nir_builder *b, nir_intrinsic_instr *intr, void *data) +{ + if (intr->intrinsic != nir_intrinsic_load_subgroup_id) + return false; + + b->cursor = nir_before_impl(b->impl); + if (b->shader->info.stage == MESA_SHADER_COMPUTE && + b->shader->info.workgroup_size[1] == 1 && + b->shader->info.workgroup_size[2] == 1) { + /* When using Nx1x1 groups, use a simple stable algorithm + * which is almost guaranteed to be correct. */ + nir_def *subgroup_id = nir_udiv(b, nir_load_local_invocation_index(b), nir_load_subgroup_size(b)); + nir_def_rewrite_uses(&intr->def, subgroup_id); + return true; + } + + nir_def **subgroup_id = (nir_def **)data; + if (*subgroup_id == NULL) { + nir_variable *subgroup_id_counter = nir_variable_create(b->shader, nir_var_mem_shared, glsl_uint_type(), "dxil_SubgroupID_counter"); + nir_variable *subgroup_id_local = nir_local_variable_create(b->impl, glsl_uint_type(), "dxil_SubgroupID_local"); + nir_store_var(b, subgroup_id_local, nir_imm_int(b, 0), 1); + + nir_deref_instr *counter_deref = nir_build_deref_var(b, subgroup_id_counter); + nir_def *tid = nir_load_local_invocation_index(b); + nir_if *nif = nir_push_if(b, nir_ieq_imm(b, tid, 0)); + nir_store_deref(b, counter_deref, nir_imm_int(b, 0), 1); + nir_pop_if(b, nif); + + nir_barrier(b, + .execution_scope = SCOPE_WORKGROUP, + .memory_scope = SCOPE_WORKGROUP, + .memory_semantics = NIR_MEMORY_ACQ_REL, + .memory_modes = nir_var_mem_shared); + + nif = nir_push_if(b, nir_elect(b, 1)); + nir_def *subgroup_id_first_thread = nir_deref_atomic(b, 32, &counter_deref->def, nir_imm_int(b, 1), + .atomic_op = nir_atomic_op_iadd); + nir_store_var(b, subgroup_id_local, subgroup_id_first_thread, 1); + nir_pop_if(b, nif); + + nir_def *subgroup_id_loaded = nir_load_var(b, subgroup_id_local); + *subgroup_id = nir_read_first_invocation(b, subgroup_id_loaded); + } + nir_def_rewrite_uses(&intr->def, *subgroup_id); + return true; +} + +bool +dxil_nir_lower_subgroup_id(nir_shader *s) +{ + nir_def *subgroup_id = NULL; + return nir_shader_intrinsics_pass(s, lower_subgroup_id, nir_metadata_none, + &subgroup_id); +} + +static bool +lower_num_subgroups(nir_builder *b, nir_intrinsic_instr *intr, void *data) +{ + if (intr->intrinsic != nir_intrinsic_load_num_subgroups) + return false; + + b->cursor = nir_before_instr(&intr->instr); + nir_def *subgroup_size = nir_load_subgroup_size(b); + nir_def *size_minus_one = nir_iadd_imm(b, subgroup_size, -1); + nir_def *workgroup_size_vec = nir_load_workgroup_size(b); + nir_def *workgroup_size = nir_imul(b, nir_channel(b, workgroup_size_vec, 0), + nir_imul(b, nir_channel(b, workgroup_size_vec, 1), + nir_channel(b, workgroup_size_vec, 2))); + nir_def *ret = nir_idiv(b, nir_iadd(b, workgroup_size, size_minus_one), subgroup_size); + nir_def_rewrite_uses(&intr->def, ret); + return true; +} + +bool +dxil_nir_lower_num_subgroups(nir_shader *s) +{ + return nir_shader_intrinsics_pass(s, lower_num_subgroups, + nir_metadata_block_index | + nir_metadata_dominance | + nir_metadata_loop_analysis, NULL); +} + + +static const struct glsl_type * +get_cast_type(unsigned bit_size) +{ + switch 
(bit_size) { + case 64: + return glsl_int64_t_type(); + case 32: + return glsl_int_type(); + case 16: + return glsl_int16_t_type(); + case 8: + return glsl_int8_t_type(); + } + unreachable("Invalid bit_size"); +} + +static void +split_unaligned_load(nir_builder *b, nir_intrinsic_instr *intrin, unsigned alignment) +{ + enum gl_access_qualifier access = nir_intrinsic_access(intrin); + nir_def *srcs[NIR_MAX_VEC_COMPONENTS * NIR_MAX_VEC_COMPONENTS * sizeof(int64_t) / 8]; + unsigned comp_size = intrin->def.bit_size / 8; + unsigned num_comps = intrin->def.num_components; + + b->cursor = nir_before_instr(&intrin->instr); + + nir_deref_instr *ptr = nir_src_as_deref(intrin->src[0]); + + const struct glsl_type *cast_type = get_cast_type(alignment * 8); + nir_deref_instr *cast = nir_build_deref_cast(b, &ptr->def, ptr->modes, cast_type, alignment); + + unsigned num_loads = DIV_ROUND_UP(comp_size * num_comps, alignment); + for (unsigned i = 0; i < num_loads; ++i) { + nir_deref_instr *elem = nir_build_deref_ptr_as_array(b, cast, nir_imm_intN_t(b, i, cast->def.bit_size)); + srcs[i] = nir_load_deref_with_access(b, elem, access); + } + + nir_def *new_dest = nir_extract_bits(b, srcs, num_loads, 0, num_comps, intrin->def.bit_size); + nir_def_rewrite_uses(&intrin->def, new_dest); + nir_instr_remove(&intrin->instr); +} + +static void +split_unaligned_store(nir_builder *b, nir_intrinsic_instr *intrin, unsigned alignment) +{ + enum gl_access_qualifier access = nir_intrinsic_access(intrin); + + nir_def *value = intrin->src[1].ssa; + unsigned comp_size = value->bit_size / 8; + unsigned num_comps = value->num_components; + + b->cursor = nir_before_instr(&intrin->instr); + + nir_deref_instr *ptr = nir_src_as_deref(intrin->src[0]); + + const struct glsl_type *cast_type = get_cast_type(alignment * 8); + nir_deref_instr *cast = nir_build_deref_cast(b, &ptr->def, ptr->modes, cast_type, alignment); + + unsigned num_stores = DIV_ROUND_UP(comp_size * num_comps, alignment); + for (unsigned i = 0; i < num_stores; ++i) { + nir_def *substore_val = nir_extract_bits(b, &value, 1, i * alignment * 8, 1, alignment * 8); + nir_deref_instr *elem = nir_build_deref_ptr_as_array(b, cast, nir_imm_intN_t(b, i, cast->def.bit_size)); + nir_store_deref_with_access(b, elem, substore_val, ~0, access); + } + + nir_instr_remove(&intrin->instr); +} + +bool +dxil_nir_split_unaligned_loads_stores(nir_shader *shader, nir_variable_mode modes) +{ + bool progress = false; + + nir_foreach_function_impl(impl, shader) { + nir_builder b = nir_builder_create(impl); + + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic != nir_intrinsic_load_deref && + intrin->intrinsic != nir_intrinsic_store_deref) + continue; + nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); + if (!nir_deref_mode_may_be(deref, modes)) + continue; + + unsigned align_mul = 0, align_offset = 0; + nir_get_explicit_deref_align(deref, true, &align_mul, &align_offset); + + unsigned alignment = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul; + + /* We can load anything at 4-byte alignment, except for + * UBOs (AKA CBs where the granularity is 16 bytes). + */ + unsigned req_align = (nir_deref_mode_is_one_of(deref, nir_var_mem_ubo | nir_var_mem_push_const) ? 
16 : 4); + if (alignment >= req_align) + continue; + + nir_def *val; + if (intrin->intrinsic == nir_intrinsic_load_deref) { + val = &intrin->def; + } else { + val = intrin->src[1].ssa; + } + + unsigned scalar_byte_size = glsl_type_is_boolean(deref->type) ? 4 : glsl_get_bit_size(deref->type) / 8; + unsigned num_components = + /* If the vector stride is larger than the scalar size, lower_explicit_io will + * turn this into multiple scalar loads anyway, so we don't have to split it here. */ + glsl_get_explicit_stride(deref->type) > scalar_byte_size ? 1 : + (val->num_components == 3 ? 4 : val->num_components); + unsigned natural_alignment = scalar_byte_size * num_components; + + if (alignment >= natural_alignment) + continue; + + if (intrin->intrinsic == nir_intrinsic_load_deref) + split_unaligned_load(&b, intrin, alignment); + else + split_unaligned_store(&b, intrin, alignment); + progress = true; + } + } + } + + return progress; +} + +static void +lower_inclusive_to_exclusive(nir_builder *b, nir_intrinsic_instr *intr) +{ + b->cursor = nir_after_instr(&intr->instr); + + nir_op op = nir_intrinsic_reduction_op(intr); + intr->intrinsic = nir_intrinsic_exclusive_scan; + nir_intrinsic_set_reduction_op(intr, op); + + nir_def *final_val = nir_build_alu2(b, nir_intrinsic_reduction_op(intr), + &intr->def, intr->src[0].ssa); + nir_def_rewrite_uses_after(&intr->def, final_val, final_val->parent_instr); +} + +static bool +lower_subgroup_scan(nir_builder *b, nir_intrinsic_instr *intr, void *data) +{ + switch (intr->intrinsic) { + case nir_intrinsic_exclusive_scan: + case nir_intrinsic_inclusive_scan: + switch ((nir_op)nir_intrinsic_reduction_op(intr)) { + case nir_op_iadd: + case nir_op_fadd: + case nir_op_imul: + case nir_op_fmul: + if (intr->intrinsic == nir_intrinsic_exclusive_scan) + return false; + lower_inclusive_to_exclusive(b, intr); + return true; + default: + break; + } + break; + default: + return false; + } + + b->cursor = nir_before_instr(&intr->instr); + nir_op op = nir_intrinsic_reduction_op(intr); + nir_def *subgroup_id = nir_load_subgroup_invocation(b); + nir_def *subgroup_size = nir_load_subgroup_size(b); + nir_def *active_threads = nir_ballot(b, 4, 32, nir_imm_true(b)); + nir_def *base_value; + uint32_t bit_size = intr->def.bit_size; + if (op == nir_op_iand || op == nir_op_umin) + base_value = nir_imm_intN_t(b, ~0ull, bit_size); + else if (op == nir_op_imin) + base_value = nir_imm_intN_t(b, (1ull << (bit_size - 1)) - 1, bit_size); + else if (op == nir_op_imax) + base_value = nir_imm_intN_t(b, 1ull << (bit_size - 1), bit_size); + else if (op == nir_op_fmax) + base_value = nir_imm_floatN_t(b, -INFINITY, bit_size); + else if (op == nir_op_fmin) + base_value = nir_imm_floatN_t(b, INFINITY, bit_size); + else + base_value = nir_imm_intN_t(b, 0, bit_size); + + nir_variable *loop_counter_var = nir_local_variable_create(b->impl, glsl_uint_type(), "subgroup_loop_counter"); + nir_variable *result_var = nir_local_variable_create(b->impl, + glsl_vector_type(nir_get_glsl_base_type_for_nir_type( + nir_op_infos[op].input_types[0] | bit_size), 1), + "subgroup_loop_result"); + nir_store_var(b, loop_counter_var, nir_imm_int(b, 0), 1); + nir_store_var(b, result_var, base_value, 1); + nir_loop *loop = nir_push_loop(b); + nir_def *loop_counter = nir_load_var(b, loop_counter_var); + + nir_if *nif = nir_push_if(b, nir_ilt(b, loop_counter, subgroup_size)); + nir_def *other_thread_val = nir_read_invocation(b, intr->src[0].ssa, loop_counter); + nir_def *thread_in_range = intr->intrinsic == 
nir_intrinsic_inclusive_scan ? + nir_ige(b, subgroup_id, loop_counter) : + nir_ilt(b, loop_counter, subgroup_id); + nir_def *thread_active = nir_ballot_bitfield_extract(b, 1, active_threads, loop_counter); + + nir_if *if_active_thread = nir_push_if(b, nir_iand(b, thread_in_range, thread_active)); + nir_def *result = nir_build_alu2(b, op, nir_load_var(b, result_var), other_thread_val); + nir_store_var(b, result_var, result, 1); + nir_pop_if(b, if_active_thread); + + nir_store_var(b, loop_counter_var, nir_iadd_imm(b, loop_counter, 1), 1); + nir_jump(b, nir_jump_continue); + nir_pop_if(b, nif); + + nir_jump(b, nir_jump_break); + nir_pop_loop(b, loop); + + result = nir_load_var(b, result_var); + nir_def_rewrite_uses(&intr->def, result); + return true; +} + +bool +dxil_nir_lower_unsupported_subgroup_scan(nir_shader *s) +{ + bool ret = nir_shader_intrinsics_pass(s, lower_subgroup_scan, + nir_metadata_none, NULL); + if (ret) { + /* Lower the ballot bitfield tests */ + nir_lower_subgroups_options options = { .ballot_bit_size = 32, .ballot_components = 4 }; + nir_lower_subgroups(s, &options); + } + return ret; +} + +bool +dxil_nir_forward_front_face(nir_shader *nir) +{ + assert(nir->info.stage == MESA_SHADER_FRAGMENT); + + nir_variable *var = nir_find_variable_with_location(nir, nir_var_shader_in, VARYING_SLOT_FACE); + if (var) { + var->data.location = VARYING_SLOT_VAR12; + return true; + } + return false; +} + +static bool +move_consts(nir_builder *b, nir_instr *instr, void *data) +{ + bool progress = false; + switch (instr->type) { + case nir_instr_type_load_const: { + /* Sink load_const to their uses if there's multiple */ + nir_load_const_instr *load_const = nir_instr_as_load_const(instr); + if (!list_is_singular(&load_const->def.uses)) { + nir_foreach_use_safe(src, &load_const->def) { + b->cursor = nir_before_src(src); + nir_load_const_instr *new_load = nir_load_const_instr_create(b->shader, + load_const->def.num_components, + load_const->def.bit_size); + memcpy(new_load->value, load_const->value, sizeof(load_const->value[0]) * load_const->def.num_components); + nir_builder_instr_insert(b, &new_load->instr); + nir_src_rewrite(src, &new_load->def); + progress = true; + } + } + return progress; + } + default: + return false; + } +} + +/* Sink all consts so that they have only have a single use. + * The DXIL backend will already de-dupe the constants to the + * same dxil_value if they have the same type, but this allows a single constant + * to have different types without bitcasts. 
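+ * For example, the same 32-bit immediate feeding both an integer and a float
+ * ALU source gets its own load_const per use, so each use can carry the type
+ * it needs.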
*/ +bool +dxil_nir_move_consts(nir_shader *s) +{ + return nir_shader_instructions_pass(s, move_consts, + nir_metadata_block_index | nir_metadata_dominance, + NULL); +} + +static void +clear_pass_flags(nir_function_impl *impl) +{ + nir_foreach_block(block, impl) { + nir_foreach_instr(instr, block) { + instr->pass_flags = 0; + } + } +} + +static bool +add_def_to_worklist(nir_def *def, void *state) +{ + nir_foreach_use_including_if(src, def) { + if (nir_src_is_if(src)) { + nir_if *nif = nir_src_parent_if(src); + nir_foreach_block_in_cf_node(block, &nif->cf_node) { + nir_foreach_instr(instr, block) + nir_instr_worklist_push_tail(state, instr); + } + } else + nir_instr_worklist_push_tail(state, nir_src_parent_instr(src)); + } + return true; +} + +static bool +set_input_bits(struct dxil_module *mod, nir_intrinsic_instr *intr, BITSET_WORD *input_bits, uint32_t ***tables, const uint32_t **table_sizes) +{ + if (intr->intrinsic == nir_intrinsic_load_view_index) { + BITSET_SET(input_bits, 0); + return true; + } + + bool any_bits_set = false; + nir_src *row_src = intr->intrinsic == nir_intrinsic_load_per_vertex_input ? &intr->src[1] : &intr->src[0]; + bool is_patch_constant = mod->shader_kind == DXIL_DOMAIN_SHADER && intr->intrinsic == nir_intrinsic_load_input; + const struct dxil_signature_record *sig_rec = is_patch_constant ? + &mod->patch_consts[nir_intrinsic_base(intr)] : + &mod->inputs[mod->input_mappings[nir_intrinsic_base(intr)]]; + if (is_patch_constant) { + /* Redirect to the second I/O table */ + *tables = *tables + 1; + *table_sizes = *table_sizes + 1; + } + for (uint32_t component = 0; component < intr->num_components; ++component) { + uint32_t base_element = 0; + uint32_t num_elements = sig_rec->num_elements; + if (nir_src_is_const(*row_src)) { + base_element = (uint32_t)nir_src_as_uint(*row_src); + num_elements = 1; + } + for (uint32_t element = 0; element < num_elements; ++element) { + uint32_t row = sig_rec->elements[element + base_element].reg; + if (row == 0xffffffff) + continue; + BITSET_SET(input_bits, row * 4 + component + nir_intrinsic_component(intr)); + any_bits_set = true; + } + } + return any_bits_set; +} + +static bool +set_output_bits(struct dxil_module *mod, nir_intrinsic_instr *intr, BITSET_WORD *input_bits, uint32_t **tables, const uint32_t *table_sizes) +{ + bool any_bits_set = false; + nir_src *row_src = intr->intrinsic == nir_intrinsic_store_per_vertex_output ? &intr->src[2] : &intr->src[1]; + bool is_patch_constant = mod->shader_kind == DXIL_HULL_SHADER && intr->intrinsic == nir_intrinsic_store_output; + const struct dxil_signature_record *sig_rec = is_patch_constant ? + &mod->patch_consts[nir_intrinsic_base(intr)] : + &mod->outputs[nir_intrinsic_base(intr)]; + for (uint32_t component = 0; component < intr->num_components; ++component) { + uint32_t base_element = 0; + uint32_t num_elements = sig_rec->num_elements; + if (nir_src_is_const(*row_src)) { + base_element = (uint32_t)nir_src_as_uint(*row_src); + num_elements = 1; + } + for (uint32_t element = 0; element < num_elements; ++element) { + uint32_t row = sig_rec->elements[element + base_element].reg; + if (row == 0xffffffff) + continue; + uint32_t stream = sig_rec->elements[element + base_element].stream; + uint32_t table_idx = is_patch_constant ? 
1 : stream; + uint32_t *table = tables[table_idx]; + uint32_t output_component = component + nir_intrinsic_component(intr); + uint32_t input_component; + BITSET_FOREACH_SET(input_component, input_bits, 32 * 4) { + uint32_t *table_for_input_component = table + table_sizes[table_idx] * input_component; + BITSET_SET(table_for_input_component, row * 4 + output_component); + any_bits_set = true; + } + } + } + return any_bits_set; +} + +static bool +propagate_input_to_output_dependencies(struct dxil_module *mod, nir_intrinsic_instr *load_intr, uint32_t **tables, const uint32_t *table_sizes) +{ + /* Which input components are being loaded by this instruction */ + BITSET_DECLARE(input_bits, 32 * 4) = { 0 }; + if (!set_input_bits(mod, load_intr, input_bits, &tables, &table_sizes)) + return false; + + nir_instr_worklist *worklist = nir_instr_worklist_create(); + nir_instr_worklist_push_tail(worklist, &load_intr->instr); + bool any_bits_set = false; + nir_foreach_instr_in_worklist(instr, worklist) { + if (instr->pass_flags) + continue; + + instr->pass_flags = 1; + nir_foreach_def(instr, add_def_to_worklist, worklist); + switch (instr->type) { + case nir_instr_type_jump: { + nir_jump_instr *jump = nir_instr_as_jump(instr); + switch (jump->type) { + case nir_jump_break: + case nir_jump_continue: { + nir_cf_node *parent = &instr->block->cf_node; + while (parent->type != nir_cf_node_loop) + parent = parent->parent; + nir_foreach_block_in_cf_node(block, parent) + nir_foreach_instr(i, block) + nir_instr_worklist_push_tail(worklist, i); + } + break; + default: + unreachable("Don't expect any other jumps"); + } + break; + } + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + switch (intr->intrinsic) { + case nir_intrinsic_store_output: + case nir_intrinsic_store_per_vertex_output: + any_bits_set |= set_output_bits(mod, intr, input_bits, tables, table_sizes); + break; + /* TODO: Memory writes */ + default: + break; + } + break; + } + default: + break; + } + } + + nir_instr_worklist_destroy(worklist); + return any_bits_set; +} + +/* For every input load, compute the set of output stores that it can contribute to. + * If it contributes to a store to memory, If it's used for control flow, then any + * instruction in the CFG that it impacts is considered to contribute. + * Ideally, we should also handle stores to outputs/memory and then loads from that + * output/memory, but this is non-trivial and unclear how much impact that would have. 
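+ * The result is recorded as per-input-component bitsets of dependent output
+ * components in the module's input->output dependency tables.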
*/ +bool +dxil_nir_analyze_io_dependencies(struct dxil_module *mod, nir_shader *s) +{ + bool any_outputs = false; + for (uint32_t i = 0; i < 4; ++i) + any_outputs |= mod->num_psv_outputs[i] > 0; + if (mod->shader_kind == DXIL_HULL_SHADER) + any_outputs |= mod->num_psv_patch_consts > 0; + if (!any_outputs) + return false; + + bool any_bits_set = false; + nir_foreach_function(func, s) { + assert(func->impl); + /* Hull shaders have a patch constant function */ + assert(func->is_entrypoint || s->info.stage == MESA_SHADER_TESS_CTRL); + + /* Pass 1: input/view ID -> output dependencies */ + nir_foreach_block(block, func->impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + uint32_t **tables = mod->io_dependency_table; + const uint32_t *table_sizes = mod->dependency_table_dwords_per_input; + switch (intr->intrinsic) { + case nir_intrinsic_load_view_index: + tables = mod->viewid_dependency_table; + FALLTHROUGH; + case nir_intrinsic_load_input: + case nir_intrinsic_load_per_vertex_input: + case nir_intrinsic_load_interpolated_input: + break; + default: + continue; + } + + clear_pass_flags(func->impl); + any_bits_set |= propagate_input_to_output_dependencies(mod, intr, tables, table_sizes); + } + } + + /* Pass 2: output -> output dependencies */ + /* TODO */ + } + return any_bits_set; +} + +static enum pipe_format +get_format_for_var(unsigned num_comps, enum glsl_base_type sampled_type) +{ + switch (sampled_type) { + case GLSL_TYPE_INT: + case GLSL_TYPE_INT64: + case GLSL_TYPE_INT16: + switch (num_comps) { + case 1: return PIPE_FORMAT_R32_SINT; + case 2: return PIPE_FORMAT_R32G32_SINT; + case 3: return PIPE_FORMAT_R32G32B32_SINT; + case 4: return PIPE_FORMAT_R32G32B32A32_SINT; + default: unreachable("Invalid num_comps"); + } + case GLSL_TYPE_UINT: + case GLSL_TYPE_UINT64: + case GLSL_TYPE_UINT16: + switch (num_comps) { + case 1: return PIPE_FORMAT_R32_UINT; + case 2: return PIPE_FORMAT_R32G32_UINT; + case 3: return PIPE_FORMAT_R32G32B32_UINT; + case 4: return PIPE_FORMAT_R32G32B32A32_UINT; + default: unreachable("Invalid num_comps"); + } + case GLSL_TYPE_FLOAT: + case GLSL_TYPE_FLOAT16: + case GLSL_TYPE_DOUBLE: + switch (num_comps) { + case 1: return PIPE_FORMAT_R32_FLOAT; + case 2: return PIPE_FORMAT_R32G32_FLOAT; + case 3: return PIPE_FORMAT_R32G32B32_FLOAT; + case 4: return PIPE_FORMAT_R32G32B32A32_FLOAT; + default: unreachable("Invalid num_comps"); + } + default: unreachable("Invalid sampler return type"); + } +} + +static unsigned +aoa_size(const struct glsl_type *type) +{ + return glsl_type_is_array(type) ? 
glsl_get_aoa_size(type) : 1; +} + +static bool +guess_image_format_for_var(nir_shader *s, nir_variable *var) +{ + const struct glsl_type *base_type = glsl_without_array(var->type); + if (!glsl_type_is_image(base_type)) + return false; + if (var->data.image.format != PIPE_FORMAT_NONE) + return false; + + nir_foreach_function_impl(impl, s) { + nir_foreach_block(block, impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + switch (intr->intrinsic) { + case nir_intrinsic_image_deref_load: + case nir_intrinsic_image_deref_store: + case nir_intrinsic_image_deref_atomic: + case nir_intrinsic_image_deref_atomic_swap: + if (nir_intrinsic_get_var(intr, 0) != var) + continue; + break; + case nir_intrinsic_image_load: + case nir_intrinsic_image_store: + case nir_intrinsic_image_atomic: + case nir_intrinsic_image_atomic_swap: { + unsigned binding = nir_src_as_uint(intr->src[0]); + if (binding < var->data.binding || + binding >= var->data.binding + aoa_size(var->type)) + continue; + break; + } + default: + continue; + } + break; + + switch (intr->intrinsic) { + case nir_intrinsic_image_deref_load: + case nir_intrinsic_image_load: + case nir_intrinsic_image_deref_store: + case nir_intrinsic_image_store: + /* Increase unknown formats up to 4 components if a 4-component accessor is used */ + if (intr->num_components > util_format_get_nr_components(var->data.image.format)) + var->data.image.format = get_format_for_var(intr->num_components, glsl_get_sampler_result_type(base_type)); + break; + default: + /* If an atomic is used, the image format must be 1-component; return immediately */ + var->data.image.format = get_format_for_var(1, glsl_get_sampler_result_type(base_type)); + return true; + } + } + } + } + /* Dunno what it is, assume 4-component */ + if (var->data.image.format == PIPE_FORMAT_NONE) + var->data.image.format = get_format_for_var(4, glsl_get_sampler_result_type(base_type)); + return true; +} + +static void +update_intrinsic_format_and_type(nir_intrinsic_instr *intr, nir_variable *var) +{ + nir_intrinsic_set_format(intr, var->data.image.format); + nir_alu_type alu_type = + nir_get_nir_type_for_glsl_base_type(glsl_get_sampler_result_type(glsl_without_array(var->type))); + if (nir_intrinsic_has_src_type(intr)) + nir_intrinsic_set_src_type(intr, alu_type); + else if (nir_intrinsic_has_dest_type(intr)) + nir_intrinsic_set_dest_type(intr, alu_type); +} + +static bool +update_intrinsic_formats(nir_builder *b, nir_intrinsic_instr *intr, + void *data) +{ + if (!nir_intrinsic_has_format(intr)) + return false; + nir_deref_instr *deref = nir_src_as_deref(intr->src[0]); + if (deref) { + nir_variable *var = nir_deref_instr_get_variable(deref); + if (var) + update_intrinsic_format_and_type(intr, var); + return var != NULL; + } + + if (!nir_intrinsic_has_range_base(intr)) + return false; + + unsigned binding = nir_src_as_uint(intr->src[0]); + nir_foreach_variable_with_modes(var, b->shader, nir_var_image) { + if (var->data.binding <= binding && + var->data.binding + aoa_size(var->type) > binding) { + update_intrinsic_format_and_type(intr, var); + return true; + } + } + return false; +} + +bool +dxil_nir_guess_image_formats(nir_shader *s) +{ + bool progress = false; + nir_foreach_variable_with_modes(var, s, nir_var_image) { + progress |= guess_image_format_for_var(s, var); + } + nir_shader_intrinsics_pass(s, update_intrinsic_formats, nir_metadata_all, + NULL); + return progress; +} + +static void 
+set_binding_variables_coherent(nir_shader *s, nir_binding binding, nir_variable_mode modes) +{ nir_foreach_variable_with_modes(var, s, modes) { - result |= 1ull << var->data.location; - var->data.driver_location = driver_loc++; + if (var->data.binding == binding.binding && + var->data.descriptor_set == binding.desc_set) { + var->data.access |= ACCESS_COHERENT; + } + } +} + +static void +set_deref_variables_coherent(nir_shader *s, nir_deref_instr *deref) +{ + while (deref->deref_type != nir_deref_type_var && + deref->deref_type != nir_deref_type_cast) { + deref = nir_deref_instr_parent(deref); + } + if (deref->deref_type == nir_deref_type_var) { + deref->var->data.access |= ACCESS_COHERENT; + return; + } + + /* For derefs with casts, we only support pre-lowered Vulkan accesses */ + assert(deref->deref_type == nir_deref_type_cast); + nir_intrinsic_instr *cast_src = nir_instr_as_intrinsic(deref->parent.ssa->parent_instr); + assert(cast_src->intrinsic == nir_intrinsic_load_vulkan_descriptor); + nir_binding binding = nir_chase_binding(cast_src->src[0]); + set_binding_variables_coherent(s, binding, nir_var_mem_ssbo); +} + +static nir_def * +get_atomic_for_load_store(nir_builder *b, nir_intrinsic_instr *intr, unsigned bit_size) +{ + nir_def *zero = nir_imm_intN_t(b, 0, bit_size); + switch (intr->intrinsic) { + case nir_intrinsic_load_deref: + return nir_deref_atomic(b, bit_size, intr->src[0].ssa, zero, .atomic_op = nir_atomic_op_iadd); + case nir_intrinsic_load_ssbo: + return nir_ssbo_atomic(b, bit_size, intr->src[0].ssa, intr->src[1].ssa, zero, .atomic_op = nir_atomic_op_iadd); + case nir_intrinsic_image_deref_load: + return nir_image_deref_atomic(b, bit_size, intr->src[0].ssa, intr->src[1].ssa, intr->src[2].ssa, zero, .atomic_op = nir_atomic_op_iadd); + case nir_intrinsic_image_load: + return nir_image_atomic(b, bit_size, intr->src[0].ssa, intr->src[1].ssa, intr->src[2].ssa, zero, .atomic_op = nir_atomic_op_iadd); + case nir_intrinsic_store_deref: + return nir_deref_atomic(b, bit_size, intr->src[0].ssa, intr->src[1].ssa, .atomic_op = nir_atomic_op_xchg); + case nir_intrinsic_store_ssbo: + return nir_ssbo_atomic(b, bit_size, intr->src[1].ssa, intr->src[2].ssa, intr->src[0].ssa, .atomic_op = nir_atomic_op_xchg); + case nir_intrinsic_image_deref_store: + return nir_image_deref_atomic(b, bit_size, intr->src[0].ssa, intr->src[1].ssa, intr->src[2].ssa, intr->src[3].ssa, .atomic_op = nir_atomic_op_xchg); + case nir_intrinsic_image_store: + return nir_image_atomic(b, bit_size, intr->src[0].ssa, intr->src[1].ssa, intr->src[2].ssa, intr->src[3].ssa, .atomic_op = nir_atomic_op_xchg); + default: + return NULL; + } +} + +static bool +lower_coherent_load_store(nir_builder *b, nir_intrinsic_instr *intr, void *context) +{ + if (!nir_intrinsic_has_access(intr) || (nir_intrinsic_access(intr) & ACCESS_COHERENT) == 0) + return false; + + nir_def *atomic_def = NULL; + b->cursor = nir_before_instr(&intr->instr); + switch (intr->intrinsic) { + case nir_intrinsic_load_deref: + case nir_intrinsic_load_ssbo: + case nir_intrinsic_image_deref_load: + case nir_intrinsic_image_load: { + if (intr->def.bit_size < 32 || intr->def.num_components > 1) { + if (intr->intrinsic == nir_intrinsic_load_deref) + set_deref_variables_coherent(b->shader, nir_src_as_deref(intr->src[0])); + else { + nir_binding binding = {0}; + if (nir_src_is_const(intr->src[0])) + binding.binding = nir_src_as_uint(intr->src[0]); + set_binding_variables_coherent(b->shader, binding, + intr->intrinsic == nir_intrinsic_load_ssbo ? 
nir_var_mem_ssbo : nir_var_image); + } + return false; + } + + atomic_def = get_atomic_for_load_store(b, intr, intr->def.bit_size); + nir_def_rewrite_uses(&intr->def, atomic_def); + break; + } + case nir_intrinsic_store_deref: + case nir_intrinsic_store_ssbo: + case nir_intrinsic_image_deref_store: + case nir_intrinsic_image_store: { + int resource_idx = intr->intrinsic == nir_intrinsic_store_ssbo ? 1 : 0; + int value_idx = intr->intrinsic == nir_intrinsic_store_ssbo ? 0 : + intr->intrinsic == nir_intrinsic_store_deref ? 1 : 3; + unsigned num_components = nir_intrinsic_has_write_mask(intr) ? + util_bitcount(nir_intrinsic_write_mask(intr)) : intr->src[value_idx].ssa->num_components; + if (intr->src[value_idx].ssa->bit_size < 32 || num_components > 1) { + if (intr->intrinsic == nir_intrinsic_store_deref) + set_deref_variables_coherent(b->shader, nir_src_as_deref(intr->src[resource_idx])); + else { + nir_binding binding = {0}; + if (nir_src_is_const(intr->src[resource_idx])) + binding.binding = nir_src_as_uint(intr->src[resource_idx]); + set_binding_variables_coherent(b->shader, binding, + intr->intrinsic == nir_intrinsic_store_ssbo ? nir_var_mem_ssbo : nir_var_image); + } + return false; + } + + atomic_def = get_atomic_for_load_store(b, intr, intr->src[value_idx].ssa->bit_size); + break; + } + default: + return false; + } + + nir_intrinsic_instr *atomic = nir_instr_as_intrinsic(atomic_def->parent_instr); + nir_intrinsic_set_access(atomic, nir_intrinsic_access(intr)); + if (nir_intrinsic_has_image_dim(intr)) + nir_intrinsic_set_image_dim(atomic, nir_intrinsic_image_dim(intr)); + if (nir_intrinsic_has_image_array(intr)) + nir_intrinsic_set_image_array(atomic, nir_intrinsic_image_array(intr)); + if (nir_intrinsic_has_format(intr)) + nir_intrinsic_set_format(atomic, nir_intrinsic_format(intr)); + if (nir_intrinsic_has_range_base(intr)) + nir_intrinsic_set_range_base(atomic, nir_intrinsic_range_base(intr)); + nir_instr_remove(&intr->instr); + return true; +} + +bool +dxil_nir_lower_coherent_loads_and_stores(nir_shader *s) +{ + return nir_shader_intrinsics_pass(s, lower_coherent_load_store, + nir_metadata_block_index | nir_metadata_dominance | nir_metadata_loop_analysis, + NULL); +} + +struct undefined_varying_masks { + uint64_t io_mask; + uint32_t patch_io_mask; + const BITSET_WORD *frac_io_mask; +}; + +static bool +is_dead_in_variable(nir_variable *var, void *data) +{ + switch (var->data.location) { + /* Only these values can be system generated values in addition to varyings */ + case VARYING_SLOT_PRIMITIVE_ID: + case VARYING_SLOT_FACE: + case VARYING_SLOT_VIEW_INDEX: + return false; + /* Tessellation input vars must remain untouched */ + case VARYING_SLOT_TESS_LEVEL_INNER: + case VARYING_SLOT_TESS_LEVEL_OUTER: + return false; + default: + return true; + } +} + +static bool +kill_undefined_varyings(struct nir_builder *b, + nir_instr *instr, + void *data) +{ + const struct undefined_varying_masks *masks = data; + + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + + if (intr->intrinsic != nir_intrinsic_load_deref) + return false; + + nir_variable *var = nir_intrinsic_get_var(intr, 0); + if (!var || var->data.mode != nir_var_shader_in) + return false; + + if (!is_dead_in_variable(var, NULL)) + return false; + + uint32_t loc = var->data.patch && var->data.location >= VARYING_SLOT_PATCH0 ? 
+struct undefined_varying_masks {
+   uint64_t io_mask;
+   uint32_t patch_io_mask;
+   const BITSET_WORD *frac_io_mask;
+};
+
+static bool
+is_dead_in_variable(nir_variable *var, void *data)
+{
+   switch (var->data.location) {
+   /* Only these values can be system generated values in addition to varyings */
+   case VARYING_SLOT_PRIMITIVE_ID:
+   case VARYING_SLOT_FACE:
+   case VARYING_SLOT_VIEW_INDEX:
+      return false;
+   /* Tessellation input vars must remain untouched */
+   case VARYING_SLOT_TESS_LEVEL_INNER:
+   case VARYING_SLOT_TESS_LEVEL_OUTER:
+      return false;
+   default:
+      return true;
+   }
+}
+
+static bool
+kill_undefined_varyings(struct nir_builder *b,
+                        nir_instr *instr,
+                        void *data)
+{
+   const struct undefined_varying_masks *masks = data;
+
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+   if (intr->intrinsic != nir_intrinsic_load_deref)
+      return false;
+
+   nir_variable *var = nir_intrinsic_get_var(intr, 0);
+   if (!var || var->data.mode != nir_var_shader_in)
+      return false;
+
+   if (!is_dead_in_variable(var, NULL))
+      return false;
+
+   uint32_t loc = var->data.patch && var->data.location >= VARYING_SLOT_PATCH0 ?
+      var->data.location - VARYING_SLOT_PATCH0 :
+      var->data.location;
+   uint64_t written = var->data.patch && var->data.location >= VARYING_SLOT_PATCH0 ?
+      masks->patch_io_mask : masks->io_mask;
+   if (BITFIELD64_RANGE(loc, glsl_varying_count(var->type)) & written) {
+      if (!masks->frac_io_mask || !var->data.location_frac ||
+          var->data.location < VARYING_SLOT_VAR0 ||
+          BITSET_TEST(masks->frac_io_mask, (var->data.location - VARYING_SLOT_VAR0) * 4 + var->data.location_frac))
+         return false;
+   }
+
+   b->cursor = nir_after_instr(instr);
+   /* Note: zero is used instead of undef, because optimization is not run here, but is
+    * run later on. If we load an undef here, and that undef ends up being used to store
+    * to position later on, that can cause some or all of the components in that position
+    * write to be removed, which is problematic especially in the case of all components,
+    * since that would remove the store instruction, and would make it tricky to satisfy
+    * the DXIL requirements of writing all position components.
+    */
+   nir_def *zero = nir_imm_zero(b, intr->def.num_components,
+                                intr->def.bit_size);
+   nir_def_rewrite_uses(&intr->def, zero);
+   nir_instr_remove(instr);
+   return true;
+}
+
+bool
+dxil_nir_kill_undefined_varyings(nir_shader *shader, uint64_t prev_stage_written_mask, uint32_t prev_stage_patch_written_mask,
+                                 const BITSET_WORD *prev_stage_frac_output_mask)
+{
+   struct undefined_varying_masks masks = {
+      .io_mask = prev_stage_written_mask,
+      .patch_io_mask = prev_stage_patch_written_mask,
+      .frac_io_mask = prev_stage_frac_output_mask
+   };
+   bool progress = nir_shader_instructions_pass(shader,
+                                                kill_undefined_varyings,
+                                                nir_metadata_dominance |
+                                                nir_metadata_block_index |
+                                                nir_metadata_loop_analysis,
+                                                (void *)&masks);
+   if (progress) {
+      nir_opt_dce(shader);
+      nir_remove_dead_derefs(shader);
+   }
+
+   const struct nir_remove_dead_variables_options options = {
+      .can_remove_var = is_dead_in_variable,
+      .can_remove_var_data = &masks,
+   };
+   progress |= nir_remove_dead_variables(shader, nir_var_shader_in, &options);
+   return progress;
+}
+
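Editor's note: another hypothetical sketch, not part of this change, showing a consumer-side use of the pass above; `producer` and `consumer` are assumed `nir_shader` pointers for two adjacent pipeline stages.

   /* Editorial sketch: reads of inputs the previous stage never wrote are
    * folded to zero, using the producer's shader_info masks; the optional
    * per-component mask is omitted in this sketch. */
   bool progress = dxil_nir_kill_undefined_varyings(consumer,
                                                    producer->info.outputs_written,
                                                    producer->info.patch_outputs_written,
                                                    NULL);
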
+static bool
+is_dead_out_variable(nir_variable *var, void *data)
+{
+   return !nir_slot_is_sysval_output(var->data.location, MESA_SHADER_NONE);
+}
+
+static bool
+kill_unused_outputs(struct nir_builder *b,
+                    nir_instr *instr,
+                    void *data)
+{
+   const struct undefined_varying_masks *masks = data;
+
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+   if (intr->intrinsic != nir_intrinsic_store_deref &&
+       intr->intrinsic != nir_intrinsic_load_deref)
+      return false;
+
+   nir_variable *var = nir_intrinsic_get_var(intr, 0);
+   if (!var || var->data.mode != nir_var_shader_out ||
+       /* always_active_io can mean two things: xfb or GL separable shaders. We can't delete
+        * varyings that are used for xfb (we'll just sort them last), but we must delete varyings
+        * that are mismatching between TCS and TES. Fortunately TCS can't do xfb, so we can ignore
+        * the always_active_io bit for TCS outputs. */
+       (b->shader->info.stage != MESA_SHADER_TESS_CTRL && var->data.always_active_io))
+      return false;
+
+   if (!is_dead_out_variable(var, NULL))
+      return false;
+
+   unsigned loc = var->data.patch && var->data.location >= VARYING_SLOT_PATCH0 ?
+      var->data.location - VARYING_SLOT_PATCH0 :
+      var->data.location;
+   uint64_t read = var->data.patch && var->data.location >= VARYING_SLOT_PATCH0 ?
+      masks->patch_io_mask : masks->io_mask;
+   if (BITFIELD64_RANGE(loc, glsl_varying_count(var->type)) & read) {
+      if (!masks->frac_io_mask || !var->data.location_frac ||
+          var->data.location < VARYING_SLOT_VAR0 ||
+          BITSET_TEST(masks->frac_io_mask, (var->data.location - VARYING_SLOT_VAR0) * 4 + var->data.location_frac))
+         return false;
+   }
+
+   if (intr->intrinsic == nir_intrinsic_load_deref) {
+      b->cursor = nir_after_instr(&intr->instr);
+      nir_def *zero = nir_imm_zero(b, intr->def.num_components, intr->def.bit_size);
+      nir_def_rewrite_uses(&intr->def, zero);
+   }
+   nir_instr_remove(instr);
+   return true;
+}
+
+bool
+dxil_nir_kill_unused_outputs(nir_shader *shader, uint64_t next_stage_read_mask, uint32_t next_stage_patch_read_mask,
+                             const BITSET_WORD *next_stage_frac_input_mask)
+{
+   struct undefined_varying_masks masks = {
+      .io_mask = next_stage_read_mask,
+      .patch_io_mask = next_stage_patch_read_mask,
+      .frac_io_mask = next_stage_frac_input_mask
+   };
+
+   bool progress = nir_shader_instructions_pass(shader,
+                                                kill_unused_outputs,
+                                                nir_metadata_dominance |
+                                                nir_metadata_block_index |
+                                                nir_metadata_loop_analysis,
+                                                (void *)&masks);
+
+   if (progress) {
+      nir_opt_dce(shader);
+      nir_remove_dead_derefs(shader);
    }
-   return result;
+   const struct nir_remove_dead_variables_options options = {
+      .can_remove_var = is_dead_out_variable,
+      .can_remove_var_data = &masks,
+   };
+   progress |= nir_remove_dead_variables(shader, nir_var_shader_out, &options);
+   return progress;
 }
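Editor's note: the matching producer-side sketch, again hypothetical and not part of this change, using the same assumed `producer` and `consumer` shaders as above.

   /* Editorial sketch: outputs the next stage never reads are removed from the
    * producer, keeping the two stages' varying declarations in agreement. */
   bool progress = dxil_nir_kill_unused_outputs(producer,
                                                consumer->info.inputs_read,
                                                consumer->info.patch_inputs_read,
                                                NULL /* no per-component input mask in this sketch */);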