Diffstat (limited to 'src/freedreno/ir3/ir3_ra.c')
-rw-r--r-- | src/freedreno/ir3/ir3_ra.c | 775 |
1 file changed, 607 insertions(+), 168 deletions(-)
diff --git a/src/freedreno/ir3/ir3_ra.c b/src/freedreno/ir3/ir3_ra.c
index f03ceef4b24..2debdeab167 100644
--- a/src/freedreno/ir3/ir3_ra.c
+++ b/src/freedreno/ir3/ir3_ra.c
@@ -193,6 +193,8 @@ void
 ir3_reg_interval_remove(struct ir3_reg_ctx *ctx,
                         struct ir3_reg_interval *interval)
 {
+   assert(interval->inserted);
+
    if (interval->parent) {
       rb_tree_remove(&interval->parent->children, &interval->node);
    } else {
@@ -538,8 +540,6 @@ ra_file_init(struct ra_file *file)
       BITSET_SET(file->available_to_evict, i);
    }
 
-   file->start = 0;
-
    rb_tree_init(&file->reg_ctx.intervals);
    rb_tree_init(&file->physreg_intervals);
 
@@ -686,6 +686,8 @@ ra_pop_interval(struct ra_ctx *ctx, struct ra_file *file,
                 struct ra_interval *interval)
 {
    assert(!interval->interval.parent);
+   /* shared live splitting is not allowed! */
+   assert(!(interval->interval.reg->flags & IR3_REG_SHARED));
 
    /* Check if we've already moved this reg before */
    unsigned pcopy_index;
@@ -720,6 +722,10 @@ ra_push_interval(struct ra_ctx *ctx, struct ra_file *file,
    interval->physreg_start = dst;
    interval->physreg_end = dst + removed->size;
 
+   assert(interval->physreg_end <= file->size);
+   if (interval->interval.reg->flags & IR3_REG_HALF)
+      assert(interval->physreg_end <= RA_HALF_SIZE);
+
    ir3_reg_interval_reinsert(&file->reg_ctx, &interval->interval);
 }
 
@@ -732,16 +738,76 @@ ra_move_interval(struct ra_ctx *ctx, struct ra_file *file,
    ra_push_interval(ctx, file, &temp, dst);
 }
 
+static struct ra_file *
+ra_get_file(struct ra_ctx *ctx, struct ir3_register *reg)
+{
+   if (reg->flags & IR3_REG_SHARED)
+      return &ctx->shared;
+   else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))
+      return &ctx->full;
+   else
+      return &ctx->half;
+}
+
+/* Returns true if the proposed spot for "dst" or a killed source overlaps a
+ * destination that's been allocated.
+ */
+static bool
+check_dst_overlap(struct ra_ctx *ctx, struct ra_file *file,
+                  struct ir3_register *dst, physreg_t start,
+                  physreg_t end)
+{
+   struct ir3_instruction *instr = dst->instr;
+
+   ra_foreach_dst (other_dst, instr) {
+      /* We assume only destinations before the current one have been allocated.
+       */
+      if (other_dst == dst)
+         break;
+
+      if (ra_get_file(ctx, other_dst) != file)
+         continue;
+
+      struct ra_interval *other_interval = &ctx->intervals[other_dst->name];
+      assert(!other_interval->interval.parent);
+      physreg_t other_start = other_interval->physreg_start;
+      physreg_t other_end = other_interval->physreg_end;
+
+      if (other_end > start && end > other_start)
+         return true;
+   }
+
+   return false;
+}
+
+/* True if the destination is "early-clobber," meaning that it cannot be
+ * allocated over killed sources. Some destinations always require it, but it
+ * also is implicitly true for tied destinations whose source is live-through.
+ * If the source is killed, then we skip allocating a register for the
+ * destination altogether so we don't need to worry about that case here.
+ */
+static bool
+is_early_clobber(struct ir3_register *reg)
+{
+   return (reg->flags & IR3_REG_EARLY_CLOBBER) || reg->tied;
+}
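The overlap test at the heart of check_dst_overlap() above is the standard half-open interval check. For illustration only, here is an editorial sketch (not part of the commit; the types are simplified stand-ins for ir3's physreg intervals):

#include <stdio.h>
#include <stdbool.h>

/* Half-open physreg ranges, as in ir3: [start, end). */
struct interval { unsigned start, end; };

/* Two half-open ranges overlap iff each one starts before the other ends.
 * This is the same test check_dst_overlap() applies between a candidate
 * placement and every destination allocated so far.
 */
static bool overlaps(struct interval a, struct interval b)
{
   return a.end > b.start && b.end > a.start;
}

int main(void)
{
   struct interval allocated = { 4, 8 };  /* an already-placed dst */
   struct interval partial = { 6, 10 };   /* overlaps [4, 8) */
   struct interval adjacent = { 8, 12 };  /* merely touches; no overlap */

   printf("partial:  %d\n", overlaps(partial, allocated));  /* 1 */
   printf("adjacent: %d\n", overlaps(adjacent, allocated)); /* 0 */
   return 0;
}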
 
 static bool
-get_reg_specified(struct ra_file *file, struct ir3_register *reg,
-                  physreg_t physreg, bool is_source)
+get_reg_specified(struct ra_ctx *ctx, struct ra_file *file,
+                  struct ir3_register *reg, physreg_t physreg, bool is_source)
 {
    for (unsigned i = 0; i < reg_size(reg); i++) {
-      if (!BITSET_TEST(is_source ? file->available_to_evict : file->available,
+      if (!BITSET_TEST(is_early_clobber(reg) || is_source ?
+                          file->available_to_evict : file->available,
                        physreg + i))
          return false;
    }
 
+   if (!is_source &&
+       check_dst_overlap(ctx, file, reg, physreg, physreg + reg_size(reg)))
+      return false;
+
    return true;
 }
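The two bitsets consulted above differ only in how killed sources are treated. A minimal editorial model (plain bool arrays instead of Mesa's BITSET; the slot layout is invented):

#include <stdbool.h>
#include <stdio.h>

#define N 8

int main(void)
{
   /* Slots 4-5 hold a killed source: still "available" to an ordinary
    * destination, which may be placed on top of it, but not present in the
    * stricter eviction set that early-clobber destinations and shuffled
    * sources must consult.
    */
   bool available[N]          = { 1, 1, 0, 0, 1, 1, 1, 1 };
   bool available_to_evict[N] = { 1, 1, 0, 0, 0, 0, 1, 1 };

   unsigned slot = 4;
   printf("normal dst may use slot %u: %d\n", slot, available[slot]);
   printf("early-clobber dst may use slot %u: %d\n", slot,
          available_to_evict[slot]);
   return 0;
}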
 
@@ -759,8 +825,13 @@ try_evict_regs(struct ra_ctx *ctx, struct ra_file *file,
    memcpy(available_to_evict, file->available_to_evict,
           sizeof(available_to_evict));
 
-   for (unsigned i = 0; i < reg_size(reg); i++)
+   BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
+   memcpy(available, file->available, sizeof(available));
+
+   for (unsigned i = 0; i < reg_size(reg); i++) {
       BITSET_CLEAR(available_to_evict, physreg + i);
+      BITSET_CLEAR(available, physreg + i);
+   }
 
    unsigned eviction_count = 0;
    /* Iterate over each range conflicting with physreg */
@@ -769,7 +840,7 @@ try_evict_regs(struct ra_ctx *ctx, struct ra_file *file,
         conflicting != NULL &&
         conflicting->physreg_start < physreg + reg_size(reg);
         conflicting = next, next = ra_interval_next_or_null(next)) {
-      if (!is_source && conflicting->is_killed)
+      if (!is_early_clobber(reg) && !is_source && conflicting->is_killed)
          continue;
 
       if (conflicting->frozen) {
@@ -777,10 +848,12 @@ try_evict_regs(struct ra_ctx *ctx, struct ra_file *file,
          return false;
       }
 
+      unsigned conflicting_file_size =
+         reg_file_size(file, conflicting->interval.reg);
       unsigned avail_start, avail_end;
       bool evicted = false;
       BITSET_FOREACH_RANGE (avail_start, avail_end, available_to_evict,
-                            reg_file_size(file, conflicting->interval.reg)) {
+                            conflicting_file_size) {
         unsigned size = avail_end - avail_start;
 
         /* non-half registers must be aligned */
@@ -790,7 +863,11 @@ try_evict_regs(struct ra_ctx *ctx, struct ra_file *file,
            size--;
         }
 
-        if (size >= conflicting->physreg_end - conflicting->physreg_start) {
+        unsigned conflicting_size =
+           conflicting->physreg_end - conflicting->physreg_start;
+        if (size >= conflicting_size &&
+            !check_dst_overlap(ctx, file, reg, avail_start, avail_start +
+                               conflicting_size)) {
           for (unsigned i = 0;
                i < conflicting->physreg_end - conflicting->physreg_start; i++)
              BITSET_CLEAR(available_to_evict, avail_start + i);
@@ -803,6 +880,76 @@ try_evict_regs(struct ra_ctx *ctx, struct ra_file *file,
        }
     }
 
+      if (evicted)
+         continue;
+
+      /* If we couldn't evict this range, but the register we're allocating is
+       * allowed to overlap with a killed range, then we may be able to swap it
+       * with a killed range to achieve the same effect.
+       */
+      if (is_early_clobber(reg) || is_source)
+         return false;
+
+      foreach_interval (killed, file) {
+         if (!killed->is_killed)
+            continue;
+
+         if (killed->physreg_end - killed->physreg_start !=
+             conflicting->physreg_end - conflicting->physreg_start)
+            continue;
+
+         if (killed->physreg_end > conflicting_file_size ||
+             conflicting->physreg_end > reg_file_size(file, killed->interval.reg))
+            continue;
+
+         /* We can't swap the killed range if it partially/fully overlaps the
+          * space we're trying to allocate or (in speculative mode) if it's
+          * already been swapped and will overlap when we actually evict.
+          */
+         bool killed_available = true;
+         for (unsigned i = killed->physreg_start; i < killed->physreg_end; i++) {
+            if (!BITSET_TEST(available, i)) {
+               killed_available = false;
+               break;
+            }
+         }
+
+         if (!killed_available)
+            continue;
+
+         if (check_dst_overlap(ctx, file, reg, killed->physreg_start,
+                               killed->physreg_end))
+            continue;
+
+         /* Check for alignment if one is a full reg */
+         if ((!(killed->interval.reg->flags & IR3_REG_HALF) ||
+              !(conflicting->interval.reg->flags & IR3_REG_HALF)) &&
+             (killed->physreg_start % 2 != 0 ||
+              conflicting->physreg_start % 2 != 0))
+            continue;
+
+         for (unsigned i = killed->physreg_start; i < killed->physreg_end; i++) {
+            BITSET_CLEAR(available, i);
+         }
+         /* Because this will generate swaps instead of moves, multiply the
+          * cost by 2.
+          */
+         eviction_count += (killed->physreg_end - killed->physreg_start) * 2;
+         if (!speculative) {
+            physreg_t killed_start = killed->physreg_start,
+                      conflicting_start = conflicting->physreg_start;
+            struct ra_removed_interval killed_removed =
+               ra_pop_interval(ctx, file, killed);
+            struct ra_removed_interval conflicting_removed =
+               ra_pop_interval(ctx, file, conflicting);
+            ra_push_interval(ctx, file, &killed_removed, conflicting_start);
+            ra_push_interval(ctx, file, &conflicting_removed, killed_start);
+         }
+
+         evicted = true;
+         break;
+      }
+
       if (!evicted)
          return false;
    }
@@ -819,15 +966,16 @@ removed_interval_cmp(const void *_i1, const void *_i2)
 
 /* We sort the registers as follows:
  *
- * |--------------------------------------------------------------------|
- * |                   |             |             |                    |
- * | Half live-through | Half killed | Full killed | Full live-through  |
- * |                   |             |             |                    |
- * |--------------------------------------------------------------------|
- * |                 |
- * |   Destination   |
- * |                 |
- * |-----------------|
+ * |------------------------------------------------------------------------------------------|
+ * |              |                    |        |        |                    |                |
+ * | Half         | Half early-clobber | Half   | Full   | Full early-clobber | Full           |
+ * | live-through | destination        | killed | killed | destination        | live-through   |
+ * |              |                    |        |        |                    |                |
+ * |------------------------------------------------------------------------------------------|
+ * |                 |
+ * |   Destination   |
+ * |                 |
+ * |-----------------|
  *
  * Half-registers have to be first so that they stay in the low half of
  * the register file. Then half and full killed must stay together so that
@@ -859,6 +1007,37 @@ removed_interval_cmp(const void *_i1, const void *_i2)
    return 0;
 }
 
+static int
+dsts_cmp(const void *_i1, const void *_i2)
+{
+   struct ir3_register *i1 = *(struct ir3_register *const *) _i1;
+   struct ir3_register *i2 = *(struct ir3_register *const *) _i2;
+
+   /* Treat tied destinations as if they are live-through sources, and normal
+    * destinations as killed sources.
+    */
+   unsigned i1_align = reg_elem_size(i1);
+   unsigned i2_align = reg_elem_size(i2);
+   if (i1_align > i2_align)
+      return 1;
+   if (i1_align < i2_align)
+      return -1;
+
+   if (i1_align == 1) {
+      if (!is_early_clobber(i2))
+         return -1;
+      if (!is_early_clobber(i1))
+         return 1;
+   } else {
+      if (!is_early_clobber(i2))
+         return 1;
+      if (!is_early_clobber(i1))
+         return -1;
+   }
+
+   return 0;
+}
+
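The six-column ordering in the diagram can equivalently be expressed as a single rank per interval; a self-contained editorial sketch (plain C, invented struct, not ir3's comparator):

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

/* Simplified stand-ins for the properties the sort keys use. */
struct item {
   bool half;          /* lives in the half-register file */
   bool killed;        /* killed source */
   bool early_clobber; /* early-clobber destination */
};

/* Rank per the diagram: half live-through < half early-clobber dst <
 * half killed < full killed < full early-clobber dst < full live-through.
 */
static int rank(const struct item *it)
{
   if (it->half)
      return it->killed ? 2 : (it->early_clobber ? 1 : 0);
   return it->killed ? 3 : (it->early_clobber ? 4 : 5);
}

static int cmp(const void *a, const void *b)
{
   int ra = rank((const struct item *)a);
   int rb = rank((const struct item *)b);
   return (ra > rb) - (ra < rb);
}

int main(void)
{
   struct item items[] = {
      { .half = false, .killed = false },       /* full live-through */
      { .half = true, .killed = true },         /* half killed */
      { .half = false, .early_clobber = true }, /* full early-clobber dst */
      { .half = true, .killed = false },        /* half live-through */
   };
   qsort(items, 4, sizeof(items[0]), cmp);
   for (int i = 0; i < 4; i++)
      printf("rank %d\n", rank(&items[i]));
   return 0;
}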
 /* "Compress" all the live intervals so that there is enough space for the
  * destination register. As there can be gaps when a more-aligned interval
  * follows a less-aligned interval, this also sorts them to remove such
@@ -869,29 +1048,100 @@ removed_interval_cmp(const void *_i1, const void *_i2)
  * Return the physreg to use.
  */
 static physreg_t
-compress_regs_left(struct ra_ctx *ctx, struct ra_file *file, unsigned size,
-                   unsigned align, bool is_source)
+compress_regs_left(struct ra_ctx *ctx, struct ra_file *file,
+                   struct ir3_register *reg)
 {
+   unsigned reg_align = reg_elem_size(reg);
    DECLARE_ARRAY(struct ra_removed_interval, intervals);
    intervals_count = intervals_sz = 0;
    intervals = NULL;
 
+   DECLARE_ARRAY(struct ir3_register *, dsts);
+   dsts_count = dsts_sz = 0;
+   dsts = NULL;
+   array_insert(ctx, dsts, reg);
+   bool dst_inserted[reg->instr->dsts_count];
+
+   unsigned dst_size = reg->tied ? 0 : reg_size(reg);
+   unsigned ec_dst_size = is_early_clobber(reg) ? reg_size(reg) : 0;
+   unsigned half_dst_size = 0, ec_half_dst_size = 0;
+   if (reg_align == 1) {
+      half_dst_size = dst_size;
+      ec_half_dst_size = ec_dst_size;
+   }
+
+   unsigned removed_size = 0, removed_half_size = 0;
+   unsigned removed_killed_size = 0, removed_killed_half_size = 0;
    unsigned file_size =
-      align == 1 ? MIN2(file->size, RA_HALF_SIZE) : file->size;
+      reg_align == 1 ? MIN2(file->size, RA_HALF_SIZE) : file->size;
    physreg_t start_reg = 0;
 
    foreach_interval_rev_safe (interval, file) {
+      /* We'll check if we can compact the intervals starting here. */
+      physreg_t candidate_start = interval->physreg_end;
+
+      /* Check if there are any other destinations we need to compact. */
+      ra_foreach_dst_n (other_dst, n, reg->instr) {
+         if (other_dst == reg)
+            break;
+         if (ra_get_file(ctx, other_dst) != file)
+            continue;
+         if (dst_inserted[n])
+            continue;
+
+         struct ra_interval *other_interval = &ctx->intervals[other_dst->name];
+         /* if the destination partially overlaps this interval, we need to
+          * extend candidate_start to the end.
+          */
+         if (other_interval->physreg_start < candidate_start) {
+            candidate_start = MAX2(candidate_start,
+                                   other_interval->physreg_end);
+            continue;
+         }
+
+         dst_inserted[n] = true;
+
+         /* dst intervals with a tied killed source are considered attached to
+          * that source. Don't actually insert them. This means we have to
+          * update them below if their tied source moves.
+          */
+         if (other_dst->tied) {
+            struct ra_interval *tied_interval =
+               &ctx->intervals[other_dst->tied->def->name];
+            if (tied_interval->is_killed)
+               continue;
+         }
+
+         d("popping destination %u physreg %u\n",
+           other_interval->interval.reg->name,
+           other_interval->physreg_start);
+
+         array_insert(ctx, dsts, other_dst);
+         unsigned interval_size = reg_size(other_dst);
+         if (is_early_clobber(other_dst)) {
+            ec_dst_size += interval_size;
+            if (other_interval->interval.reg->flags & IR3_REG_HALF)
+               ec_half_dst_size += interval_size;
+         } else {
+            dst_size += interval_size;
+            if (other_interval->interval.reg->flags & IR3_REG_HALF)
+               half_dst_size += interval_size;
+         }
+      }
+
       /* Check if we can sort the intervals *after* this one and have enough
-       * space leftover to accomodate "size" units. Also check that we have
-       * enough space leftover for half-registers, if we're inserting a
-       * half-register (otherwise we only shift any half-registers down so they
-       * should be safe).
+       * space leftover to accommodate all intervals, keeping in mind that
+       * killed sources overlap non-tied destinations. Also check that we have
+       * enough space leftover for half-registers, if we're inserting a
+       * half-register (otherwise we only shift any half-registers down so
+       * they should be safe).
        */
-      if (interval->physreg_end + size + removed_size <= file->size &&
-          (align != 1 ||
-           interval->physreg_end + size + removed_half_size <= file_size)) {
-         start_reg = interval->physreg_end;
+      if (candidate_start + removed_size + ec_dst_size +
+             MAX2(removed_killed_size, dst_size) <= file->size &&
+          (reg_align != 1 ||
+           candidate_start + removed_half_size + ec_half_dst_size +
+              MAX2(removed_killed_half_size, half_dst_size) <= file_size)) {
+         start_reg = candidate_start;
          break;
       }
 
@@ -900,20 +1150,23 @@ compress_regs_left(struct ra_ctx *ctx, struct ra_file *file, unsigned size,
       */
      assert(!interval->frozen);
 
-      /* Killed sources don't count because they go at the end and can
+      /* Killed sources are different because they go at the end and can
       * overlap the register we're trying to add.
       */
-      if (!interval->is_killed && !is_source) {
-         removed_size += interval->physreg_end - interval->physreg_start;
-         if (interval->interval.reg->flags & IR3_REG_HALF) {
-            removed_half_size += interval->physreg_end -
-               interval->physreg_start;
-         }
+      unsigned interval_size = interval->physreg_end - interval->physreg_start;
+      if (interval->is_killed) {
+         removed_killed_size += interval_size;
+         if (interval->interval.reg->flags & IR3_REG_HALF)
+            removed_killed_half_size += interval_size;
+      } else {
+         removed_size += interval_size;
+         if (interval->interval.reg->flags & IR3_REG_HALF)
+            removed_half_size += interval_size;
      }
 
      /* Now that we've done the accounting, pop this off */
-      d("popping interval %u physreg %u\n", interval->interval.reg->name,
-        interval->physreg_start);
+      d("popping interval %u physreg %u%s\n", interval->interval.reg->name,
+        interval->physreg_start, interval->is_killed ? ", killed" : "");
      array_insert(ctx, intervals, ra_pop_interval(ctx, file, interval));
   }
 
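The space check above adds up four bookkeeping sums; killed sources and ordinary destinations may share space, so only the larger of the two contributes. A worked numeric example, editorial only (the quantities are invented; units are half-register slots):

#include <stdio.h>

int main(void)
{
   unsigned file_size = 16;

   unsigned candidate_start = 4;     /* end of the last interval we keep */
   unsigned removed_size = 6;        /* live-through intervals popped */
   unsigned ec_dst_size = 2;         /* early-clobber destinations */
   unsigned removed_killed_size = 4; /* killed sources popped */
   unsigned dst_size = 3;            /* normal destinations */

   /* Early-clobber dsts may not overlap anything, so they always add on
    * top; killed sources and normal dsts can be laid over each other.
    */
   unsigned killed_or_dst =
      removed_killed_size > dst_size ? removed_killed_size : dst_size;
   unsigned needed =
      candidate_start + removed_size + ec_dst_size + killed_or_dst;

   printf("need %u of %u slots: %s\n", needed, file_size,
          needed <= file_size ? "fits" : "keep compacting");
   return 0;
}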
@@ -922,55 +1175,138 @@ compress_regs_left(struct ra_ctx *ctx, struct ra_file *file, unsigned size,
     */
    qsort(intervals, intervals_count, sizeof(*intervals), removed_interval_cmp);
+   qsort(dsts, dsts_count, sizeof(*dsts), dsts_cmp);
 
-   physreg_t physreg = start_reg;
+   physreg_t live_reg = start_reg;
+   physreg_t dst_reg = (physreg_t)~0;
    physreg_t ret_reg = (physreg_t)~0;
-   for (unsigned i = 0; i < intervals_count; i++) {
-      if (ret_reg == (physreg_t)~0 &&
-          ((intervals[i].interval->is_killed && !is_source) ||
-           !(intervals[i].interval->interval.reg->flags & IR3_REG_HALF))) {
-         ret_reg = ALIGN(physreg, align);
-      }
+   unsigned dst_index = 0;
+   unsigned live_index = 0;
+
+   /* We have two lists of intervals to process, live intervals and destination
+    * intervals. Process them in the order of the diagram in insert_cmp().
+    */
+   while (live_index < intervals_count || dst_index < dsts_count) {
+      bool process_dst;
+      if (live_index == intervals_count) {
+         process_dst = true;
+      } else if (dst_index == dsts_count) {
+         process_dst = false;
+      } else {
+         struct ir3_register *dst = dsts[dst_index];
+         struct ra_interval *live_interval = intervals[live_index].interval;
+
+         bool live_half = live_interval->interval.reg->flags & IR3_REG_HALF;
+         bool live_killed = live_interval->is_killed;
+         bool dst_half = dst->flags & IR3_REG_HALF;
+         bool dst_early_clobber = is_early_clobber(dst);
+
+         if (live_half && !live_killed) {
+            /* far-left of diagram. */
+            process_dst = false;
+         } else if (dst_half && dst_early_clobber) {
+            /* mid-left of diagram. */
+            process_dst = true;
+         } else if (!dst_early_clobber) {
+            /* bottom of diagram. */
+            process_dst = true;
+         } else if (live_killed) {
+            /* middle of diagram. */
+            process_dst = false;
+         } else if (!dst_half && dst_early_clobber) {
+            /* mid-right of diagram. */
+            process_dst = true;
+         } else {
+            /* far right of diagram. */
+            assert(!live_killed && !live_half);
+            process_dst = false;
+         }
+      }
 
-      if (ret_reg != (physreg_t)~0 &&
-          (is_source || !intervals[i].interval->is_killed)) {
-         physreg = MAX2(physreg, ret_reg + size);
+      struct ir3_register *cur_reg =
+         process_dst ? dsts[dst_index] :
+         intervals[live_index].interval->interval.reg;
+
+      physreg_t physreg;
+      if (process_dst && !is_early_clobber(cur_reg)) {
+         if (dst_reg == (physreg_t)~0)
+            dst_reg = live_reg;
+         physreg = dst_reg;
+      } else {
+         physreg = live_reg;
+         struct ra_interval *live_interval = intervals[live_index].interval;
+         bool live_killed = live_interval->is_killed;
+         /* If this is live-through and we've processed the destinations, we
+          * need to make sure we take into account any overlapping destinations.
+          */
+         if (!live_killed && dst_reg != (physreg_t)~0)
+            physreg = MAX2(physreg, dst_reg);
       }
 
-      if (!(intervals[i].interval->interval.reg->flags & IR3_REG_HALF)) {
+      if (!(cur_reg->flags & IR3_REG_HALF))
          physreg = ALIGN(physreg, 2);
-      }
 
-      if (physreg + intervals[i].size >
-          reg_file_size(file, intervals[i].interval->interval.reg)) {
+      d("pushing reg %u physreg %u\n", cur_reg->name, physreg);
+
+      unsigned interval_size = reg_size(cur_reg);
+      if (physreg + interval_size >
+          reg_file_size(file, cur_reg)) {
          d("ran out of room for interval %u!\n",
-           intervals[i].interval->interval.reg->name);
+           cur_reg->name);
 
          unreachable("reg pressure calculation was wrong!");
          return 0;
       }
 
-      d("pushing interval %u physreg %u\n",
-        intervals[i].interval->interval.reg->name, physreg);
-      ra_push_interval(ctx, file, &intervals[i], physreg);
+      if (process_dst) {
+         if (cur_reg == reg) {
+            ret_reg = physreg;
+         } else {
+            struct ra_interval *interval = &ctx->intervals[cur_reg->name];
+            interval->physreg_start = physreg;
+            interval->physreg_end = physreg + interval_size;
+         }
+         dst_index++;
+      } else {
+         ra_push_interval(ctx, file, &intervals[live_index], physreg);
+         live_index++;
+      }
+
+      physreg += interval_size;
 
-      physreg += intervals[i].size;
+      if (process_dst && !is_early_clobber(cur_reg)) {
+         dst_reg = physreg;
+      } else {
+         live_reg = physreg;
+      }
    }
 
-   if (ret_reg == (physreg_t)~0)
-      ret_reg = physreg;
+   /* If we shuffled around a tied source that is killed, we may have to update
+    * its corresponding destination since we didn't insert it above.
+    */
+   ra_foreach_dst (dst, reg->instr) {
+      if (dst == reg)
+         break;
 
-   ret_reg = ALIGN(ret_reg, align);
-   if (ret_reg + size > file_size) {
-      d("ran out of room for the new interval!\n");
-      unreachable("reg pressure calculation was wrong!");
-      return 0;
+      struct ir3_register *tied = dst->tied;
+      if (!tied)
+         continue;
+
+      struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
+      if (!tied_interval->is_killed)
+         continue;
+
+      struct ra_interval *dst_interval = &ctx->intervals[dst->name];
+      unsigned dst_size = reg_size(dst);
+      dst_interval->physreg_start = ra_interval_get_physreg(tied_interval);
+      dst_interval->physreg_end = dst_interval->physreg_start + dst_size;
    }
 
    return ret_reg;
 }
 
 static void
-update_affinity(struct ir3_register *reg, physreg_t physreg)
+update_affinity(struct ra_file *file, struct ir3_register *reg,
+                physreg_t physreg)
 {
    if (!reg->merge_set || reg->merge_set->preferred_reg != (physreg_t)~0)
       return;
@@ -978,6 +1314,9 @@ update_affinity(struct ir3_register *reg, physreg_t physreg)
    if (physreg < reg->merge_set_offset)
       return;
 
+   if ((physreg - reg->merge_set_offset + reg->merge_set->size) > file->size)
+      return;
+
    reg->merge_set->preferred_reg = physreg - reg->merge_set_offset;
 }
 
@@ -985,8 +1324,9 @@ update_affinity(struct ir3_register *reg, physreg_t physreg)
  * a round-robin algorithm to reduce false dependencies.
  */
 static physreg_t
-find_best_gap(struct ra_file *file, unsigned file_size, unsigned size,
-              unsigned align, bool is_source)
+find_best_gap(struct ra_ctx *ctx, struct ra_file *file,
+              struct ir3_register *dst, unsigned file_size, unsigned size,
+              unsigned alignment)
 {
    /* This can happen if we create a very large merge set. Just bail out in that
     * case.
@@ -995,9 +1335,9 @@ find_best_gap(struct ra_file *file, unsigned file_size, unsigned size,
       return (physreg_t) ~0;
 
    BITSET_WORD *available =
-      is_source ? file->available_to_evict : file->available;
+      is_early_clobber(dst) ? file->available_to_evict : file->available;
 
-   unsigned start = ALIGN(file->start, align) % (file_size - size + align);
+   unsigned start = ALIGN(file->start, alignment) % (file_size - size + alignment);
    unsigned candidate = start;
    do {
       bool is_available = true;
@@ -1009,11 +1349,16 @@ find_best_gap(struct ra_file *file, unsigned file_size, unsigned size,
       }
 
       if (is_available) {
+         is_available =
+            !check_dst_overlap(ctx, file, dst, candidate, candidate + size);
+      }
+
+      if (is_available) {
          file->start = (candidate + size) % file_size;
          return candidate;
       }
 
-      candidate += align;
+      candidate += alignment;
      if (candidate + size > file_size)
         candidate = 0;
   } while (candidate != start);
 
   return (physreg_t)~0;
 }
 
-static struct ra_file *
-ra_get_file(struct ra_ctx *ctx, struct ir3_register *reg)
-{
-   if (reg->flags & IR3_REG_SHARED)
-      return &ctx->shared;
-   else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))
-      return &ctx->full;
-   else
-      return &ctx->half;
-}
-
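The round-robin scan in find_best_gap() starts at file->start, which advances after every allocation and wraps, spreading allocations across the file to reduce false dependencies. A standalone editorial sketch (availability modeled as a plain bool array; names and sizes are invented):

#include <stdio.h>
#include <stdbool.h>

#define FILE_SIZE 8

static int find_gap(const bool *available, unsigned *start_ptr,
                    unsigned size, unsigned alignment)
{
   /* round start up to the alignment, then wrap into the searchable range */
   unsigned start = ((*start_ptr + alignment - 1) / alignment) * alignment;
   start %= (FILE_SIZE - size + alignment);
   unsigned candidate = start;
   do {
      bool ok = true;
      for (unsigned i = 0; i < size; i++)
         ok &= available[candidate + i];
      if (ok) {
         /* advance the round-robin pointer past this allocation */
         *start_ptr = (candidate + size) % FILE_SIZE;
         return (int)candidate;
      }
      candidate += alignment;
      if (candidate + size > FILE_SIZE)
         candidate = 0;
   } while (candidate != start);
   return -1; /* no gap */
}

int main(void)
{
   bool available[FILE_SIZE] = { false, false, true, true,
                                 true, true, false, true };
   unsigned start = 0;
   printf("first gap of 2: %d\n", find_gap(available, &start, 2, 2)); /* 2 */
   printf("next gap of 2:  %d\n", find_gap(available, &start, 2, 2)); /* 4 */
   return 0;
}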
 /* This is the main entrypoint for picking a register. Pick a free register
  * for "reg", shuffling around sources if necessary. In the normal case where
  * "is_source" is false, this register can overlap with killed sources
@@ -1042,16 +1376,15 @@ ra_get_file(struct ra_ctx *ctx, struct ir3_register *reg)
  */
 static physreg_t
-get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg,
-        bool is_source)
+get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg)
 {
    unsigned file_size = reg_file_size(file, reg);
    if (reg->merge_set && reg->merge_set->preferred_reg != (physreg_t)~0) {
       physreg_t preferred_reg =
          reg->merge_set->preferred_reg + reg->merge_set_offset;
-      if (preferred_reg < file_size &&
+      if (preferred_reg + reg_size(reg) <= file_size &&
           preferred_reg % reg_elem_size(reg) == 0 &&
-          get_reg_specified(file, reg, preferred_reg, is_source))
+          get_reg_specified(ctx, file, reg, preferred_reg, false))
          return preferred_reg;
    }
 
@@ -1062,8 +1395,9 @@ get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg,
    unsigned size = reg_size(reg);
    if (reg->merge_set && reg->merge_set->preferred_reg == (physreg_t)~0 &&
        size < reg->merge_set->size) {
-      physreg_t best_reg = find_best_gap(file, file_size, reg->merge_set->size,
-                                         reg->merge_set->alignment, is_source);
+      physreg_t best_reg = find_best_gap(ctx, file, reg, file_size,
+                                         reg->merge_set->size,
+                                         reg->merge_set->alignment);
       if (best_reg != (physreg_t)~0u) {
          best_reg += reg->merge_set_offset;
          return best_reg;
@@ -1085,14 +1419,14 @@ get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg,
          physreg_t src_physreg = ra_interval_get_physreg(src_interval);
          if (src_physreg % reg_elem_size(reg) == 0 &&
              src_physreg + size <= file_size &&
-             get_reg_specified(file, reg, src_physreg, is_source))
+             get_reg_specified(ctx, file, reg, src_physreg, false))
             return src_physreg;
       }
    }
 
    physreg_t best_reg =
-      find_best_gap(file, file_size, size, reg_elem_size(reg), is_source);
+      find_best_gap(ctx, file, reg, file_size, size, reg_elem_size(reg));
    if (best_reg != (physreg_t)~0u) {
       return best_reg;
    }
@@ -1104,7 +1438,7 @@ get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg,
    unsigned best_eviction_count = ~0;
    for (physreg_t i = 0; i + size <= file_size; i += reg_elem_size(reg)) {
       unsigned eviction_count;
-      if (try_evict_regs(ctx, file, reg, i, &eviction_count, is_source, true)) {
+      if (try_evict_regs(ctx, file, reg, i, &eviction_count, false, true)) {
         if (eviction_count < best_eviction_count) {
            best_eviction_count = eviction_count;
            best_reg = i;
@@ -1114,14 +1448,13 @@ get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg,
 
    if (best_eviction_count != ~0) {
       ASSERTED bool result = try_evict_regs(
-         ctx, file, reg, best_reg, &best_eviction_count, is_source, false);
+         ctx, file, reg, best_reg, &best_eviction_count, false, false);
      assert(result);
      return best_reg;
   }
 
    /* Use the dumb fallback only if try_evict_regs() fails.
     */
-   return compress_regs_left(ctx, file, reg_size(reg), reg_elem_size(reg),
-                             is_source);
+   return compress_regs_left(ctx, file, reg);
 }
 
 static void
@@ -1170,14 +1503,47 @@ allocate_dst_fixed(struct ra_ctx *ctx, struct ir3_register *dst,
                    physreg_t physreg)
 {
+   struct ra_file *file = ra_get_file(ctx, dst);
    struct ra_interval *interval = &ctx->intervals[dst->name];
-   update_affinity(dst, physreg);
+   update_affinity(file, dst, physreg);
 
    ra_interval_init(interval, dst);
    interval->physreg_start = physreg;
    interval->physreg_end = physreg + reg_size(dst);
 }
 
+/* If a tied destination interferes with its source register, we have to insert
+ * a copy beforehand to copy the source to the destination. Because we are using
+ * the parallel_copies array and not creating a separate copy, this copy will
+ * happen in parallel with any shuffling around of the tied source, so we have
+ * to copy the source *as it exists before it is shuffled around*. We do this by
+ * inserting the copy early, before any other copies are inserted. We don't
+ * actually know the destination of the copy, but that's ok because the
+ * dst_interval will be filled out later.
+ */
+static void
+insert_tied_dst_copy(struct ra_ctx *ctx, struct ir3_register *dst)
+{
+   struct ir3_register *tied = dst->tied;
+
+   if (!tied)
+      return;
+
+   struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
+   struct ra_interval *dst_interval = &ctx->intervals[dst->name];
+
+   if (tied_interval->is_killed)
+      return;
+
+   physreg_t tied_physreg = ra_interval_get_physreg(tied_interval);
+
+   array_insert(ctx, ctx->parallel_copies,
+                (struct ra_parallel_copy){
+                   .interval = dst_interval,
+                   .src = tied_physreg,
+                });
+}
+
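The comment above hinges on parallel-copy semantics: every entry reads its source simultaneously, before any of the moves land, which is why capturing the tied source's current physreg is enough. A minimal editorial sketch of such an executor (plain C, invented register model, not ir3's pcopy lowering):

#include <stdio.h>

/* Execute {regs[dst[i]] <- regs[src[i]]} as a parallel copy: snapshot all
 * sources first, then write, so later writes cannot corrupt earlier reads.
 * Assumes n <= 8 for the scratch buffer.
 */
static void parallel_copy(int *regs, const int *dst, const int *src, int n)
{
   int tmp[8];
   for (int i = 0; i < n; i++)
      tmp[i] = regs[src[i]];
   for (int i = 0; i < n; i++)
      regs[dst[i]] = tmp[i];
}

int main(void)
{
   int regs[4] = { 10, 20, 30, 40 };
   /* swap r0/r1 while also copying the old r0 into r2 */
   int dst[] = { 0, 1, 2 };
   int src[] = { 1, 0, 0 };
   parallel_copy(regs, dst, src, 3);
   printf("%d %d %d\n", regs[0], regs[1], regs[2]); /* 20 10 10 */
   return 0;
}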
 static void
 allocate_dst(struct ra_ctx *ctx, struct ir3_register *dst)
 {
@@ -1186,33 +1552,17 @@ allocate_dst(struct ra_ctx *ctx, struct ir3_register *dst)
    struct ir3_register *tied = dst->tied;
    if (tied) {
       struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
-      struct ra_interval *dst_interval = &ctx->intervals[dst->name];
-      physreg_t tied_physreg = ra_interval_get_physreg(tied_interval);
       if (tied_interval->is_killed) {
          /* The easy case: the source is killed, so we can just reuse it
           * for the destination.
           */
          allocate_dst_fixed(ctx, dst, ra_interval_get_physreg(tied_interval));
-      } else {
-         /* The source is live-through, so we need to get a free register
-          * (which is free for both the source and destination!), copy the
-          * original source to it, then use that for the source and
-          * destination.
-          */
-         physreg_t physreg = get_reg(ctx, file, dst, true);
-         allocate_dst_fixed(ctx, dst, physreg);
-         array_insert(ctx, ctx->parallel_copies,
-                      (struct ra_parallel_copy){
-                         .interval = dst_interval,
-                         .src = tied_physreg,
-                      });
+         return;
       }
-
-      return;
    }
 
    /* All the hard work is done by get_reg here. */
-   physreg_t physreg = get_reg(ctx, file, dst, false);
+   physreg_t physreg = get_reg(ctx, file, dst);
    allocate_dst_fixed(ctx, dst, physreg);
 }
 
@@ -1256,7 +1606,8 @@ insert_parallel_copy_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
       struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
       struct ir3_register *reg =
          ir3_dst_create(pcopy, INVALID_REG,
-                        entry->interval->interval.reg->flags & ~IR3_REG_SSA);
+                        entry->interval->interval.reg->flags &
+                        (IR3_REG_HALF | IR3_REG_ARRAY | IR3_REG_SHARED));
       reg->size = entry->interval->interval.reg->size;
       reg->wrmask = entry->interval->interval.reg->wrmask;
       assign_reg(pcopy, reg, ra_interval_get_num(entry->interval));
@@ -1266,7 +1617,8 @@ insert_parallel_copy_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
       struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
       struct ir3_register *reg =
         ir3_src_create(pcopy, INVALID_REG,
-                        entry->interval->interval.reg->flags & ~IR3_REG_SSA);
+                        entry->interval->interval.reg->flags &
+                        (IR3_REG_HALF | IR3_REG_ARRAY | IR3_REG_SHARED));
       reg->size = entry->interval->interval.reg->size;
       reg->wrmask = entry->interval->interval.reg->wrmask;
       assign_reg(pcopy, reg, ra_physreg_to_num(entry->src, reg->flags));
@@ -1285,6 +1637,11 @@ handle_normal_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
       mark_src_killed(ctx, src);
    }
 
+   /* Pre-insert tied dst copies. */
+   ra_foreach_dst (dst, instr) {
+      insert_tied_dst_copy(ctx, dst);
+   }
+
    /* Allocate the destination. */
    ra_foreach_dst (dst, instr) {
       allocate_dst(ctx, dst);
@@ -1312,6 +1669,9 @@ handle_split(struct ra_ctx *ctx, struct ir3_instruction *instr)
    struct ir3_register *dst = instr->dsts[0];
    struct ir3_register *src = instr->srcs[0];
 
+   if (!(dst->flags & IR3_REG_SSA))
+      return;
+
    if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) {
       handle_normal_instr(ctx, instr);
       return;
@@ -1330,6 +1690,9 @@ handle_split(struct ra_ctx *ctx, struct ir3_instruction *instr)
 static void
 handle_collect(struct ra_ctx *ctx, struct ir3_instruction *instr)
 {
+   if (!(instr->dsts[0]->flags & IR3_REG_SSA))
+      return;
+
    struct ir3_merge_set *dst_set = instr->dsts[0]->merge_set;
    unsigned dst_offset = instr->dsts[0]->merge_set_offset;
 
@@ -1362,8 +1725,14 @@ handle_collect(struct ra_ctx *ctx, struct ir3_instruction *instr)
 
       struct ra_interval *interval = &ctx->intervals[src->def->name];
 
-      if (src->def->merge_set != dst_set || interval->is_killed)
+      /* We only need special handling if the source's interval overlaps with
+       * the destination's interval.
+       */
+      if (src->def->interval_start >= instr->dsts[0]->interval_end ||
+          instr->dsts[0]->interval_start >= src->def->interval_end ||
+          interval->is_killed)
          continue;
+
       while (interval->interval.parent != NULL) {
          interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
       }
@@ -1445,13 +1814,19 @@ handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *instr)
 
 static void
 handle_precolored_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
 {
-   if (instr->dsts[0]->num == INVALID_REG)
+   if (instr->dsts[0]->num == INVALID_REG ||
+       !(instr->dsts[0]->flags & IR3_REG_SSA))
       return;
 
+   struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
    struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
    physreg_t physreg = ra_reg_get_physreg(instr->dsts[0]);
    allocate_dst_fixed(ctx, instr->dsts[0], physreg);
-   insert_dst(ctx, instr->dsts[0]);
+
+   d("insert precolored dst %u physreg %u", instr->dsts[0]->name,
+     ra_interval_get_physreg(interval));
+
+   ra_file_insert(file, interval);
    interval->frozen = true;
 }
 
@@ -1471,6 +1846,9 @@ handle_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
 static void
 assign_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
 {
+   if (!(instr->dsts[0]->flags & IR3_REG_SSA))
+      return;
+
    struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
    struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
 
@@ -1517,7 +1895,7 @@ handle_precolored_source(struct ra_ctx *ctx, struct ir3_register *src)
     * anything unless it overlaps with our precolored physreg, so we don't
     * have to worry about evicting other precolored sources.
     */
-   if (!get_reg_specified(file, src, physreg, true)) {
+   if (!get_reg_specified(ctx, file, src, physreg, true)) {
       unsigned eviction_count;
       if (!try_evict_regs(ctx, file, src, physreg, &eviction_count, true,
                           false)) {
@@ -1615,6 +1993,9 @@ handle_live_out(struct ra_ctx *ctx, struct ir3_register *def)
 static void
 handle_phi(struct ra_ctx *ctx, struct ir3_register *def)
 {
+   if (!(def->flags & IR3_REG_SSA))
+      return;
+
    struct ra_file *file = ra_get_file(ctx, def);
    struct ra_interval *interval = &ctx->intervals[def->name];
 
@@ -1630,7 +2011,7 @@ handle_phi(struct ra_ctx *ctx, struct ir3_register *def)
       physreg = ra_interval_get_physreg(parent) +
         (def->interval_start - parent_ir3->reg->interval_start);
    } else {
-      physreg = get_reg(ctx, file, def, false);
+      physreg = get_reg(ctx, file, def);
    }
 
    allocate_dst_fixed(ctx, def, physreg);
@@ -1641,6 +2022,9 @@ handle_phi(struct ra_ctx *ctx, struct ir3_register *def)
 static void
 assign_phi(struct ra_ctx *ctx, struct ir3_instruction *phi)
 {
+   if (!(phi->dsts[0]->flags & IR3_REG_SSA))
+      return;
+
    struct ra_file *file = ra_get_file(ctx, phi->dsts[0]);
    struct ra_interval *interval = &ctx->intervals[phi->dsts[0]->name];
    assert(!interval->interval.parent);
@@ -1686,12 +2070,10 @@ insert_liveout_copy(struct ir3_block *block, physreg_t dst, physreg_t src,
                     struct ir3_register *reg)
 {
    struct ir3_instruction *old_pcopy = NULL;
-   if (!list_is_empty(&block->instr_list)) {
-      struct ir3_instruction *last =
-         LIST_ENTRY(struct ir3_instruction, block->instr_list.prev, node);
-      if (last->opc == OPC_META_PARALLEL_COPY)
-         old_pcopy = last;
-   }
+   struct ir3_instruction *last = ir3_block_get_last_non_terminator(block);
+
+   if (last && last->opc == OPC_META_PARALLEL_COPY)
+      old_pcopy = last;
 
    unsigned old_pcopy_srcs = old_pcopy ? old_pcopy->srcs_count : 0;
 
    struct ir3_instruction *pcopy = ir3_instr_create(
@@ -1702,8 +2084,9 @@ insert_liveout_copy(struct ir3_block *block, physreg_t dst, physreg_t src,
       pcopy->dsts[pcopy->dsts_count++] = old_pcopy->dsts[i];
    }
 
-   struct ir3_register *dst_reg =
-      ir3_dst_create(pcopy, INVALID_REG, reg->flags & ~IR3_REG_SSA);
+   unsigned flags = reg->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
+
+   struct ir3_register *dst_reg = ir3_dst_create(pcopy, INVALID_REG, flags);
    dst_reg->wrmask = reg->wrmask;
    dst_reg->size = reg->size;
    assign_reg(pcopy, dst_reg, ra_physreg_to_num(dst, reg->flags));
@@ -1712,8 +2095,7 @@ insert_liveout_copy(struct ir3_block *block, physreg_t dst, physreg_t src,
       pcopy->srcs[pcopy->srcs_count++] = old_pcopy->srcs[i];
    }
 
-   struct ir3_register *src_reg =
-      ir3_src_create(pcopy, INVALID_REG, reg->flags & ~IR3_REG_SSA);
+   struct ir3_register *src_reg = ir3_src_create(pcopy, INVALID_REG, flags);
    src_reg->wrmask = reg->wrmask;
    src_reg->size = reg->size;
    assign_reg(pcopy, src_reg, ra_physreg_to_num(src, reg->flags));
@@ -1727,15 +2109,8 @@ insert_live_in_move(struct ra_ctx *ctx, struct ra_interval *interval)
 {
    physreg_t physreg = ra_interval_get_physreg(interval);
 
-   bool shared = interval->interval.reg->flags & IR3_REG_SHARED;
-   struct ir3_block **predecessors =
-      shared ? ctx->block->physical_predecessors : ctx->block->predecessors;
-   unsigned predecessors_count = shared
-      ? ctx->block->physical_predecessors_count
-      : ctx->block->predecessors_count;
-
-   for (unsigned i = 0; i < predecessors_count; i++) {
-      struct ir3_block *pred = predecessors[i];
+   for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
+      struct ir3_block *pred = ctx->block->predecessors[i];
       struct ra_block_state *pred_state = &ctx->blocks[pred->index];
 
       if (!pred_state->visited)
@@ -1743,28 +2118,8 @@ insert_live_in_move(struct ra_ctx *ctx, struct ra_interval *interval)
 
       physreg_t pred_reg = read_register(ctx, pred, interval->interval.reg);
       if (pred_reg != physreg) {
+         assert(!(interval->interval.reg->flags & IR3_REG_SHARED));
          insert_liveout_copy(pred, physreg, pred_reg, interval->interval.reg);
-
-         /* This is a bit tricky, but when visiting the destination of a
-          * physical-only edge, we have two predecessors (the if and the
-          * header block) and both have multiple successors. We pick the
-          * register for all live-ins from the normal edge, which should
-          * guarantee that there's no need for shuffling things around in
-          * the normal predecessor as long as there are no phi nodes, but
-          * we still may need to insert fixup code in the physical
-          * predecessor (i.e. the last block of the if) and that has
-          * another successor (the block after the if) so we need to update
-          * the renames state for when we process the other successor. This
-          * crucially depends on the other successor getting processed
-          * after this.
-          *
-          * For normal (non-physical) edges we disallow critical edges so
-          * that hacks like this aren't necessary.
-          */
-         if (!pred_state->renames)
-            pred_state->renames = _mesa_pointer_hash_table_create(ctx);
-         _mesa_hash_table_insert(pred_state->renames, interval->interval.reg,
-                                 (void *)(uintptr_t)physreg);
       }
    }
 }
 
@@ -1977,13 +2332,13 @@ calc_target_full_pressure(struct ir3_shader_variant *v, unsigned pressure)
    unsigned reg_independent_max_waves =
       ir3_get_reg_independent_max_waves(v, double_threadsize);
    unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
-      v->shader->compiler, reg_count, double_threadsize);
+      v->compiler, reg_count, double_threadsize);
    unsigned target_waves =
       MIN2(reg_independent_max_waves, reg_dependent_max_waves);
 
    while (target <= RA_FULL_SIZE / (2 * 4) &&
          ir3_should_double_threadsize(v, target) == double_threadsize &&
-          ir3_get_reg_dependent_max_waves(v->shader->compiler, target,
+          ir3_get_reg_dependent_max_waves(v->compiler, target,
                                           double_threadsize) >= target_waves)
       target++;
 
@@ -2136,17 +2491,68 @@ calc_min_limit_pressure(struct ir3_shader_variant *v,
    ralloc_free(ctx);
 }
 
+/*
+ * If barriers are used, it must be possible for all waves in the workgroup
+ * to execute concurrently. Thus we may have to reduce the registers limit.
+ */
+static void
+calc_limit_pressure_for_cs_with_barrier(struct ir3_shader_variant *v,
+                                        struct ir3_pressure *limit_pressure)
+{
+   const struct ir3_compiler *compiler = v->compiler;
+
+   unsigned threads_per_wg;
+   if (v->local_size_variable) {
+      /* We have to expect the worst case. */
+      threads_per_wg = compiler->max_variable_workgroup_size;
+   } else {
+      threads_per_wg = v->local_size[0] * v->local_size[1] * v->local_size[2];
+   }
+
+   /* The register file is grouped into reg_size_vec4 number of parts.
+    * Each part has enough registers to add a single vec4 register to
+    * each thread of a single-sized wave-pair. With double threadsize
+    * each wave-pair would consume two parts of the register file to get
+    * a single vec4 for a thread. The more active wave-pairs the less
+    * parts each could get.
+    */
+
+   bool double_threadsize = ir3_should_double_threadsize(v, 0);
+   unsigned waves_per_wg = DIV_ROUND_UP(
+      threads_per_wg, compiler->threadsize_base * (double_threadsize ? 2 : 1) *
+                         compiler->wave_granularity);
+
+   uint32_t vec4_regs_per_thread =
+      compiler->reg_size_vec4 / (waves_per_wg * (double_threadsize ? 2 : 1));
+   assert(vec4_regs_per_thread > 0);
+
+   uint32_t half_regs_per_thread = vec4_regs_per_thread * 4 * 2;
+
+   if (limit_pressure->full > half_regs_per_thread) {
+      if (v->mergedregs) {
+         limit_pressure->full = half_regs_per_thread;
+      } else {
+         /* TODO: Handle !mergedregs case, probably we would have to do this
+          * after the first register pressure pass.
+          */
+      }
+   }
+}
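A worked numeric example of the arithmetic above, editorial only (the hardware parameters are hypothetical values in the spirit of an a6xx part, not taken from the commit):

#include <stdio.h>

#define DIV_ROUND_UP(a, b) (((a) + (b) - 1) / (b))

int main(void)
{
   unsigned threads_per_wg = 1024;
   unsigned threadsize_base = 64; /* threads per single-size wave */
   unsigned wave_granularity = 2; /* waves are allocated in pairs */
   unsigned reg_size_vec4 = 96;   /* register-file parts, in vec4 units */
   int double_threadsize = 1;

   unsigned waves_per_wg = DIV_ROUND_UP(
      threads_per_wg,
      threadsize_base * (double_threadsize ? 2 : 1) * wave_granularity);

   unsigned vec4_regs_per_thread =
      reg_size_vec4 / (waves_per_wg * (double_threadsize ? 2 : 1));

   /* one vec4 = 4 full registers = 8 half-register slots */
   unsigned half_regs_per_thread = vec4_regs_per_thread * 4 * 2;

   printf("waves/wg=%u -> vec4/thread=%u -> half-reg limit=%u\n",
          waves_per_wg, vec4_regs_per_thread, half_regs_per_thread);
   /* prints: waves/wg=4 -> vec4/thread=12 -> half-reg limit=96 */
   return 0;
}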
 
 int
 ir3_ra(struct ir3_shader_variant *v)
 {
    ir3_calc_dominance(v->ir);
 
+   /* Predicate RA needs dominance. */
+   ir3_ra_predicates(v);
+
    ir3_create_parallel_copies(v->ir);
 
    struct ra_ctx *ctx = rzalloc(NULL, struct ra_ctx);
 
    ctx->merged_regs = v->mergedregs;
-   ctx->compiler = v->shader->compiler;
+   ctx->compiler = v->compiler;
    ctx->stage = v->type;
 
    struct ir3_liveness *live = ir3_calc_liveness(ctx, v->ir);
@@ -2155,6 +2561,18 @@ ir3_ra(struct ir3_shader_variant *v)
 
    ir3_merge_regs(live, v->ir);
 
+   bool has_shared_vectors = false;
+   foreach_block (block, &v->ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         ra_foreach_dst (dst, instr) {
+            if ((dst->flags & IR3_REG_SHARED) && reg_elems(dst) > 1) {
+               has_shared_vectors = true;
+               break;
+            }
+         }
+      }
+   }
+
    struct ir3_pressure max_pressure;
    ir3_calc_pressure(v, live, &max_pressure);
    d("max pressure:");
@@ -2162,26 +2580,45 @@ ir3_ra(struct ir3_shader_variant *v)
    d("\thalf: %u", max_pressure.half);
    d("\tshared: %u", max_pressure.shared);
 
-   /* TODO: calculate half/full limit correctly for CS with barrier */
    struct ir3_pressure limit_pressure;
    limit_pressure.full = RA_FULL_SIZE;
    limit_pressure.half = RA_HALF_SIZE;
    limit_pressure.shared = RA_SHARED_SIZE;
 
+   if (gl_shader_stage_is_compute(v->type) && v->has_barrier) {
+      calc_limit_pressure_for_cs_with_barrier(v, &limit_pressure);
+   }
+
+   /* If the user forces a doubled threadsize, we may have to lower the limit
+    * because on some gens the register file is not big enough to hold a
+    * double-size wave with all 48 registers in use.
+    */
+   if (v->shader_options.real_wavesize == IR3_DOUBLE_ONLY) {
+      limit_pressure.full =
+         MAX2(limit_pressure.full, ctx->compiler->reg_size_vec4 / 2 * 16);
+   }
+
    /* If requested, lower the limit so that spilling happens more often. */
    if (ir3_shader_debug & IR3_DBG_SPILLALL)
       calc_min_limit_pressure(v, live, &limit_pressure);
 
-   if (max_pressure.shared > limit_pressure.shared) {
-      /* TODO shared reg -> normal reg spilling */
-      d("shared max pressure exceeded!");
-      goto fail;
+   if (max_pressure.shared > limit_pressure.shared || has_shared_vectors) {
+      ir3_ra_shared(v, live);
+
+      /* Recalculate liveness and register pressure now that additional values
+       * have been added.
+       */
+      ralloc_free(live);
+      live = ir3_calc_liveness(ctx, v->ir);
+      ir3_calc_pressure(v, live, &max_pressure);
+
+      ir3_debug_print(v->ir, "AFTER: shared register allocation");
    }
 
    bool spilled = false;
    if (max_pressure.full > limit_pressure.full ||
        max_pressure.half > limit_pressure.half) {
-      if (!v->shader->compiler->has_pvtmem) {
+      if (!v->compiler->has_pvtmem) {
         d("max pressure exceeded!");
         goto fail;
      }
@@ -2206,10 +2643,12 @@ ir3_ra(struct ir3_shader_variant *v)
 
    ctx->shared.size = RA_SHARED_SIZE;
 
+   ctx->full.start = ctx->half.start = ctx->shared.start = 0;
+
    foreach_block (block, &v->ir->block_list)
       handle_block(ctx, block);
 
-   ir3_ra_validate(v, ctx->full.size, ctx->half.size, live->block_count);
+   ir3_ra_validate(v, ctx->full.size, ctx->half.size, live->block_count, false);
 
    /* Strip array-ness and SSA-ness at the end, because various helpers still
    * need to work even on definitions that have already been assigned. For