Diffstat (limited to 'src/gallium/drivers/radeonsi/si_compute_blit.c')
-rw-r--r-- | src/gallium/drivers/radeonsi/si_compute_blit.c | 793 |
1 file changed, 593 insertions(+), 200 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
index 696aa8d12bc..d08f99a4031 100644
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -1,41 +1,53 @@
 /*
  * Copyright 2018 Advanced Micro Devices, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
  *
+ * SPDX-License-Identifier: MIT
  */

 #include "si_pipe.h"
 #include "util/format/u_format.h"
 #include "util/format_srgb.h"
 #include "util/u_helpers.h"
+#include "util/hash_table.h"
+
+static bool si_can_use_compute_blit(struct si_context *sctx, enum pipe_format format,
+                                    unsigned num_samples, bool is_store, bool has_dcc)
+{
+   /* TODO: This format fails AMD_TEST=imagecopy. */
+   if (format == PIPE_FORMAT_A8R8_UNORM && is_store)
+      return false;
+
+   /* MSAA image stores are broken. AMD_DEBUG=nofmask fixes them, implying that the FMASK
+    * expand pass doesn't work, but let's use the gfx blit, which should be faster because
+    * it doesn't require expanding the FMASK.
+    *
+    * TODO: Broken MSAA stores can cause app issues, though this issue might only affect
+    * internal blits, not sure.
+    *
+    * EQAA image stores are also unimplemented, which should be rejected here after MSAA
+    * image stores are fixed.
+    */
+   if (num_samples > 1 && is_store)
+      return false;
+
+   if (util_format_is_depth_or_stencil(format))
+      return false;
+
+   /* Image stores support DCC since GFX10. */
+   if (has_dcc && is_store && sctx->gfx_level < GFX10)
+      return false;
+
+   return true;
+}
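To make the load/store asymmetry concrete, here is a hedged sketch of how a copy is expected to consult this helper twice, mirroring si_compute_copy_image further below (src_has_dcc/dst_has_dcc are hypothetical locals standing in for the vi_dcc_enabled() calls):

   /* Sketch only: a copy must pass the check once as a load and once as a store.
    * An MSAA destination, for example, fails the is_store case above, so the
    * copy falls back to the gfx blit. */
   bool ok = si_can_use_compute_blit(sctx, src->format, src->nr_samples, false, src_has_dcc) &&
             si_can_use_compute_blit(sctx, dst->format, dst->nr_samples, true, dst_has_dcc);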

 /* Determine the cache policy. */
 static enum si_cache_policy get_cache_policy(struct si_context *sctx, enum si_coherency coher,
                                              uint64_t size)
 {
-   if ((sctx->chip_class >= GFX9 && (coher == SI_COHERENCY_CB_META ||
+   if ((sctx->gfx_level >= GFX9 && (coher == SI_COHERENCY_CB_META ||
                                      coher == SI_COHERENCY_DB_META ||
                                      coher == SI_COHERENCY_CP)) ||
-       (sctx->chip_class >= GFX7 && coher == SI_COHERENCY_SHADER))
+       (sctx->gfx_level >= GFX7 && coher == SI_COHERENCY_SHADER))
       return L2_LRU; /* it's faster if L2 doesn't evict anything */

    return L2_BYPASS;
@@ -59,20 +71,64 @@ unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
    }
 }

-void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *info,
-                             void *shader, unsigned flags)
+static bool si_is_buffer_idle(struct si_context *sctx, struct si_resource *buf,
+                              unsigned usage)
+{
+   return !si_cs_is_buffer_referenced(sctx, buf->buf, usage) &&
+          sctx->ws->buffer_wait(sctx->ws, buf->buf, 0, usage);
+}
+
+static void si_improve_sync_flags(struct si_context *sctx, struct pipe_resource *dst,
+                                  struct pipe_resource *src, unsigned *flags)
 {
+   if (dst->target != PIPE_BUFFER || (src && src->target != PIPE_BUFFER))
+      return;
+
+   if (si_is_buffer_idle(sctx, si_resource(dst), RADEON_USAGE_READWRITE) &&
+       (!src || si_is_buffer_idle(sctx, si_resource(src), RADEON_USAGE_WRITE))) {
+      /* Idle buffers don't have to sync. */
+      *flags &= ~(SI_OP_SYNC_GE_BEFORE | SI_OP_SYNC_PS_BEFORE | SI_OP_SYNC_CS_BEFORE |
+                  SI_OP_SYNC_CPDMA_BEFORE);
+      return;
+   }
+
+   const unsigned cs_mask = SI_BIND_CONSTANT_BUFFER(PIPE_SHADER_COMPUTE) |
+                            SI_BIND_SHADER_BUFFER(PIPE_SHADER_COMPUTE) |
+                            SI_BIND_IMAGE_BUFFER(PIPE_SHADER_COMPUTE) |
+                            SI_BIND_SAMPLER_BUFFER(PIPE_SHADER_COMPUTE);
+
+   const unsigned ps_mask = SI_BIND_CONSTANT_BUFFER(PIPE_SHADER_FRAGMENT) |
+                            SI_BIND_SHADER_BUFFER(PIPE_SHADER_FRAGMENT) |
+                            SI_BIND_IMAGE_BUFFER(PIPE_SHADER_FRAGMENT) |
+                            SI_BIND_SAMPLER_BUFFER(PIPE_SHADER_FRAGMENT);
+
+   unsigned bind_history = si_resource(dst)->bind_history |
+                           (src ? si_resource(src)->bind_history : 0);
+
+   /* Clear SI_OP_SYNC_CS_BEFORE if the buffer has never been used with a CS. */
+   if (*flags & SI_OP_SYNC_CS_BEFORE && !(bind_history & cs_mask))
+      *flags &= ~SI_OP_SYNC_CS_BEFORE;
+
+   /* Clear SI_OP_SYNC_PS_BEFORE if the buffer has never been used with a PS. */
+   if (*flags & SI_OP_SYNC_PS_BEFORE && !(bind_history & ps_mask)) {
+      *flags &= ~SI_OP_SYNC_PS_BEFORE;
+      *flags |= SI_OP_SYNC_GE_BEFORE;
+   }
+}
+
+static void si_launch_grid_internal(struct si_context *sctx, const struct pipe_grid_info *info,
+                                    void *shader, unsigned flags)
+{
    /* Wait for previous shaders to finish. */
+   if (flags & SI_OP_SYNC_GE_BEFORE)
+      sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
+
    if (flags & SI_OP_SYNC_PS_BEFORE)
       sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;

    if (flags & SI_OP_SYNC_CS_BEFORE)
       sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;

-   if (!(flags & SI_OP_CS_IMAGE))
-      sctx->flags |= SI_CONTEXT_PFP_SYNC_ME;
-
    /* Invalidate L0-L1 caches. */
    /* sL0 is never invalidated, because src resources don't use it. */
    if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE))
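To see how the new sync-flag machinery is meant to compose, here is a hedged sketch of the caller pattern that si_clear_buffer and si_copy_buffer follow later in this patch (grid and buffer setup elided; SI_OP_SYNC_BEFORE_AFTER is assumed to be the conservative default used elsewhere in the driver):

   /* Sketch only: start from conservative flags, then let the helper drop what
    * the idle check or bind_history proves unnecessary. */
   unsigned flags = SI_OP_SYNC_BEFORE_AFTER;
   si_improve_sync_flags(sctx, dst, src, &flags);
   /* If the buffers were never bound for fragment shading, SI_OP_SYNC_PS_BEFORE
    * has been narrowed to SI_OP_SYNC_GE_BEFORE; if both buffers are idle, all
    * *_BEFORE bits are gone and the dispatch skips the partial flushes. */
   si_launch_grid_internal_ssbos(sctx, &info, shader, flags, coher, num_buffers, sb, 0x1);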
@@ -80,11 +136,18 @@ void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *inf
    /* Set settings for driver-internal compute dispatches. */
    sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
-   sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
+   if (sctx->num_hw_pipestat_streamout_queries)
+      sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
+
+   if (sctx->flags)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);

    if (!(flags & SI_OP_CS_RENDER_COND_ENABLE))
       sctx->render_cond_enabled = false;

+   /* Force-disable fbfetch because there are unsolvable recursion problems. */
+   si_force_disable_ps_colorbuf0_slot(sctx);
+
    /* Skip decompression to prevent infinite recursion. */
    sctx->blitter_running = true;

@@ -96,23 +159,42 @@ void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *inf
    /* Restore default settings. */
    sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS;
-   sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
+   if (sctx->num_hw_pipestat_streamout_queries)
+      sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
+
    sctx->render_cond_enabled = sctx->render_cond;
    sctx->blitter_running = false;

+   /* We force-disabled fbfetch, so recompute the state. */
+   si_update_ps_colorbuf0_slot(sctx);
+
    if (flags & SI_OP_SYNC_AFTER) {
       sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;

       if (flags & SI_OP_CS_IMAGE) {
          /* Make sure image stores are visible to CB, which doesn't use L2 on GFX6-8. */
-         sctx->flags |= sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0;
+         sctx->flags |= sctx->gfx_level <= GFX8 ? SI_CONTEXT_WB_L2 : 0;
          /* Make sure image stores are visible to all CUs. */
          sctx->flags |= SI_CONTEXT_INV_VCACHE;
+
+         /* Make sure RBs see our DCC changes. */
+         if (sctx->gfx_level >= GFX10 && sctx->screen->info.tcc_rb_non_coherent) {
+            unsigned enabled_mask = sctx->images[PIPE_SHADER_COMPUTE].enabled_mask;
+            while (enabled_mask) {
+               int i = u_bit_scan(&enabled_mask);
+               if (sctx->images[PIPE_SHADER_COMPUTE].views[i].access & SI_IMAGE_ACCESS_ALLOW_DCC_STORE) {
+                  sctx->flags |= SI_CONTEXT_INV_L2;
+                  break;
+               }
+            }
+         }
       } else {
          /* Make sure buffer stores are visible to all CUs. */
          sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | SI_CONTEXT_PFP_SYNC_ME;
       }
    }
+
+   if (sctx->flags)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
 }

 void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_info *info,
@@ -120,8 +202,10 @@ void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_inf
                                    unsigned num_buffers, const struct pipe_shader_buffer *buffers,
                                    unsigned writeable_bitmask)
 {
-   if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE))
+   if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE)) {
       sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+   }

    /* Save states. */
    struct pipe_shader_buffer saved_sb[3] = {};
@@ -136,14 +220,17 @@ void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_inf
    }

    /* Bind buffers and launch compute. */
-   sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, num_buffers, buffers,
-                              writeable_bitmask);
+   si_set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, num_buffers, buffers,
+                         writeable_bitmask,
+                         true /* don't update bind_history to prevent unnecessary syncs later */);
    si_launch_grid_internal(sctx, info, shader, flags);

    /* Do cache flushing at the end. */
    if (get_cache_policy(sctx, coher, 0) == L2_BYPASS) {
-      if (flags & SI_OP_SYNC_AFTER)
+      if (flags & SI_OP_SYNC_AFTER) {
          sctx->flags |= SI_CONTEXT_WB_L2;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+      }
    } else {
       while (writeable_bitmask)
          si_resource(buffers[u_bit_scan(&writeable_bitmask)].buffer)->TC_L2_dirty = true;
@@ -172,14 +259,14 @@ void si_compute_clear_buffer_rmw(struct si_context *sctx, struct pipe_resource *
    /* Use buffer_load_dwordx4 and buffer_store_dwordx4 per thread. */
    unsigned dwords_per_instruction = 4;
-   unsigned wave_size = sctx->screen->compute_wave_size;
-   unsigned dwords_per_wave = dwords_per_instruction * wave_size;
+   unsigned block_size = 64; /* it's always 64x1x1 */
+   unsigned dwords_per_wave = dwords_per_instruction * block_size;
    unsigned num_dwords = size / 4;
    unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);

    struct pipe_grid_info info = {};
-   info.block[0] = MIN2(wave_size, num_instructions);
+   info.block[0] = MIN2(block_size, num_instructions);
    info.block[1] = 1;
    info.block[2] = 1;
    info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
@@ -195,7 +282,7 @@ void si_compute_clear_buffer_rmw(struct si_context *sctx, struct pipe_resource *
    sctx->cs_user_data[1] = ~writebitmask;

    if (!sctx->cs_clear_buffer_rmw)
-      sctx->cs_clear_buffer_rmw = si_create_clear_buffer_rmw_cs(&sctx->b);
+      sctx->cs_clear_buffer_rmw = si_create_clear_buffer_rmw_cs(sctx);

    si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_clear_buffer_rmw, flags, coher,
                                  1, &sb, 0x1);
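The dispatch math above is easier to follow with numbers. A worked example (not part of the patch):

   /* Clearing 1 MiB with the RMW shader (4 dwords per thread, 64-thread block):
    *   num_dwords       = 1048576 / 4               = 262144
    *   num_instructions = DIV_ROUND_UP(262144, 4)   = 65536
    *   dwords_per_wave  = 4 * 64                    = 256
    *   info.block[0]    = MIN2(64, 65536)           = 64
    *   info.grid[0]     = DIV_ROUND_UP(262144, 256) = 1024 workgroups
    */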
@@ -206,8 +293,6 @@ static void si_compute_clear_12bytes_buffer(struct si_context *sctx, struct pipe
                                             const uint32_t *clear_value, unsigned flags,
                                             enum si_coherency coher)
 {
-   struct pipe_context *ctx = &sctx->b;
-
    assert(dst_offset % 4 == 0);
    assert(size % 4 == 0);
    unsigned size_12 = DIV_ROUND_UP(size, 12);
@@ -222,7 +307,7 @@ static void si_compute_clear_12bytes_buffer(struct si_context *sctx, struct pipe
    struct pipe_grid_info info = {0};

    if (!sctx->cs_clear_12bytes_buffer)
-      sctx->cs_clear_12bytes_buffer = si_clear_12bytes_buffer_shader(ctx);
+      sctx->cs_clear_12bytes_buffer = si_clear_12bytes_buffer_shader(sctx);

    info.block[0] = 64;
    info.last_block[0] = size_12 % 64;
@@ -257,14 +342,15 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res
                                    src ? SI_COMPUTE_COPY_DW_PER_THREAD : SI_COMPUTE_CLEAR_DW_PER_THREAD;
    unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4);
    unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread;
-   unsigned wave_size = sctx->screen->compute_wave_size;
-   unsigned dwords_per_wave = dwords_per_thread * wave_size;
+   /* The shader declares the block size like this: */
+   unsigned block_size = si_determine_wave_size(sctx->screen, NULL);
+   unsigned dwords_per_wave = dwords_per_thread * block_size;

    unsigned num_dwords = size / 4;
    unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);

    struct pipe_grid_info info = {};
-   info.block[0] = MIN2(wave_size, num_instructions);
+   info.block[0] = MIN2(block_size, num_instructions);
    info.block[1] = 1;
    info.block[2] = 1;
    info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
@@ -285,7 +371,7 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res
       if (!sctx->cs_copy_buffer) {
          sctx->cs_copy_buffer = si_create_dma_compute_shader(
-            &sctx->b, SI_COMPUTE_COPY_DW_PER_THREAD, shader_dst_stream_policy, true);
+            sctx, SI_COMPUTE_COPY_DW_PER_THREAD, shader_dst_stream_policy, true);
       }

       si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_copy_buffer, flags, coher,
@@ -299,7 +385,7 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res
       if (!sctx->cs_clear_buffer) {
          sctx->cs_clear_buffer = si_create_dma_compute_shader(
-            &sctx->b, SI_COMPUTE_CLEAR_DW_PER_THREAD, shader_dst_stream_policy, false);
+            sctx, SI_COMPUTE_CLEAR_DW_PER_THREAD, shader_dst_stream_policy, false);
       }

       si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_clear_buffer, flags, coher,
@@ -315,6 +401,8 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
    if (!size)
       return;

+   si_improve_sync_flags(sctx, dst, NULL, &flags);
+
    ASSERTED unsigned clear_alignment = MIN2(clear_value_size, 4);

    assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */
@@ -335,7 +423,7 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
    if (aligned_size >= 4) {
       uint64_t compute_min_size;

-      if (sctx->chip_class <= GFX8) {
+      if (sctx->gfx_level <= GFX8) {
          /* CP DMA clears are terribly slow with GTT on GFX6-8, which can always
          * happen due to BO evictions.
          */
@@ -345,7 +433,9 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
         compute_min_size = 4 * 1024;
      }

-      if (method == SI_AUTO_SELECT_CLEAR_METHOD && (
+      /* TODO: use compute for unaligned big sizes */
+      if (method == SI_AUTO_SELECT_CLEAR_METHOD &&
+          (flags & SI_OP_CS_RENDER_COND_ENABLE ||
           clear_value_size > 4 ||
           (clear_value_size == 4 && offset % 4 == 0 && size > compute_min_size))) {
          method = SI_COMPUTE_CLEAR_METHOD;
@@ -355,6 +445,7 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
                                   clear_value_size, flags, coher);
    } else {
       assert(clear_value_size == 4);
+      assert(!(flags & SI_OP_CS_RENDER_COND_ENABLE));
       si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, dst, offset, aligned_size, *clear_value,
                              flags, coher, get_cache_policy(sctx, coher, size));
    }
@@ -365,26 +456,21 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
    /* Handle non-dword alignment. */
    if (size) {
+      assert(!(flags & SI_OP_CS_RENDER_COND_ENABLE));
       assert(dst);
       assert(dst->target == PIPE_BUFFER);
       assert(size < 4);
-      pipe_buffer_write(&sctx->b, dst, offset, size, clear_value);
+      sctx->b.buffer_subdata(&sctx->b, dst,
+                             PIPE_MAP_WRITE |
+                             /* TC forbids drivers from invalidating buffers and inferring
+                              * unsynchronized mappings, so suppress those optimizations. */
+                             (sctx->tc ? TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED |
+                                         TC_TRANSFER_MAP_NO_INVALIDATE : 0),
+                             offset, size, clear_value);
    }
 }

-void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst, uint64_t offset,
-                            uint64_t size, unsigned value, unsigned flags)
-{
-   struct si_context *ctx = (struct si_context *)sscreen->aux_context;
-
-   simple_mtx_lock(&sscreen->aux_context_lock);
-   si_clear_buffer(ctx, dst, offset, size, &value, 4, flags,
-                   SI_COHERENCY_SHADER, SI_AUTO_SELECT_CLEAR_METHOD);
-   sscreen->aux_context->flush(sscreen->aux_context, NULL, 0);
-   simple_mtx_unlock(&sscreen->aux_context_lock);
-}
-
 static void si_pipe_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
                                  unsigned offset, unsigned size, const void *clear_value,
                                  int clear_value_size)
@@ -404,7 +490,10 @@ void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct p
    enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
    uint64_t compute_min_size = 8 * 1024;

+   si_improve_sync_flags(sctx, dst, src, &flags);
+
    /* Only use compute for VRAM copies on dGPUs. */
+   /* TODO: use compute for unaligned big sizes */
    if (sctx->screen->info.has_dedicated_vram && si_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
        si_resource(src)->domains & RADEON_DOMAIN_VRAM && size > compute_min_size &&
        dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0) {
@@ -416,29 +505,176 @@ void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct p
    }
 }

-void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_level,
+void si_compute_shorten_ubyte_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src,
+                                     uint64_t dst_offset, uint64_t src_offset, unsigned size, unsigned flags)
+{
+   if (!size)
+      return;
+
+   if (!sctx->cs_ubyte_to_ushort)
+      sctx->cs_ubyte_to_ushort = si_create_ubyte_to_ushort_compute_shader(sctx);
+
+   /* Use COHERENCY_NONE to get SI_CONTEXT_WB_L2 automatically used in
+    * si_launch_grid_internal_ssbos.
+    */
+   enum si_coherency coher = SI_COHERENCY_NONE;
+
+   si_improve_sync_flags(sctx, dst, src, &flags);
+
+   struct pipe_grid_info info = {};
+   info.block[0] = si_determine_wave_size(sctx->screen, NULL);
+   info.block[1] = 1;
+   info.block[2] = 1;
+   info.grid[0] = DIV_ROUND_UP(size, info.block[0]);
+   info.grid[1] = 1;
+   info.grid[2] = 1;
+   info.last_block[0] = size % info.block[0];
+
+   struct pipe_shader_buffer sb[2] = {};
+   sb[0].buffer = dst;
+   sb[0].buffer_offset = dst_offset;
+   sb[0].buffer_size = dst->width0;
+
+   sb[1].buffer = src;
+   sb[1].buffer_offset = src_offset;
+   sb[1].buffer_size = src->width0;
+
+   si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_ubyte_to_ushort, flags, coher,
+                                 2, sb, 0x1);
+}
+
+static unsigned
+set_work_size(struct pipe_grid_info *info, unsigned block_x, unsigned block_y, unsigned block_z,
+              unsigned work_x, unsigned work_y, unsigned work_z)
+{
+   info->block[0] = block_x;
+   info->block[1] = block_y;
+   info->block[2] = block_z;
+
+   unsigned work[3] = {work_x, work_y, work_z};
+   for (int i = 0; i < 3; ++i) {
+      info->last_block[i] = work[i] % info->block[i];
+      info->grid[i] = DIV_ROUND_UP(work[i], info->block[i]);
+   }
+
+   return work_z > 1 ? 3 : (work_y > 1 ? 2 : 1);
+}
+
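The conversion from a copy area to a dispatch is worth a worked example (not part of the patch):

   /* set_work_size with an 8x8x1 block over a 100x7x1 copy area:
    *   info->grid       = {DIV_ROUND_UP(100, 8), DIV_ROUND_UP(7, 8), 1} = {13, 1, 1}
    *   info->last_block = {100 % 8, 7 % 8, 1 % 1}                       = {4, 7, 0}
    *   returned wg_dim  = 2 (work_y > 1, work_z == 1)
    * Partial edges are expressed via last_block rather than by padding the grid. */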
+static void si_launch_grid_internal_images(struct si_context *sctx,
+                                           struct pipe_image_view *images,
+                                           unsigned num_images,
+                                           const struct pipe_grid_info *info,
+                                           void *shader, unsigned flags)
+{
+   struct pipe_image_view saved_image[2] = {};
+   assert(num_images <= ARRAY_SIZE(saved_image));
+
+   for (unsigned i = 0; i < num_images; i++) {
+      assert(sctx->b.screen->is_format_supported(sctx->b.screen, images[i].format,
+                                                 images[i].resource->target,
+                                                 images[i].resource->nr_samples,
+                                                 images[i].resource->nr_storage_samples,
+                                                 PIPE_BIND_SHADER_IMAGE));
+
+      /* Always allow DCC stores on gfx10+. */
+      if (sctx->gfx_level >= GFX10 &&
+          images[i].access & PIPE_IMAGE_ACCESS_WRITE &&
+          !(images[i].access & SI_IMAGE_ACCESS_DCC_OFF))
+         images[i].access |= SI_IMAGE_ACCESS_ALLOW_DCC_STORE;
+
+      /* Simplify the format according to what image stores support. */
+      if (images[i].access & PIPE_IMAGE_ACCESS_WRITE) {
+         images[i].format = util_format_linear(images[i].format); /* SRGB not supported */
+         /* Keep L8A8 formats as-is because GFX7 is unable to store into R8A8 for some reason. */
+         images[i].format = util_format_intensity_to_red(images[i].format);
+         images[i].format = util_format_rgbx_to_rgba(images[i].format); /* prevent partial writes */
+      }
+
+      /* Save the image. */
+      util_copy_image_view(&saved_image[i], &sctx->images[PIPE_SHADER_COMPUTE].views[i]);
+   }
+
+   /* This might invoke DCC decompression, so do it first. */
+   sctx->b.set_shader_images(&sctx->b, PIPE_SHADER_COMPUTE, 0, num_images, 0, images);
+
+   /* This should be done after set_shader_images. */
+   for (unsigned i = 0; i < num_images; i++) {
+      /* The driver doesn't decompress resources automatically here, so do it manually. */
+      si_decompress_subresource(&sctx->b, images[i].resource, PIPE_MASK_RGBAZS,
+                                images[i].u.tex.level, images[i].u.tex.first_layer,
+                                images[i].u.tex.last_layer,
+                                images[i].access & PIPE_IMAGE_ACCESS_WRITE);
+   }
+
+   /* This must be done before the compute shader. */
+   if (flags & SI_OP_SYNC_PS_BEFORE) {
+      for (unsigned i = 0; i < num_images; i++) {
+         si_make_CB_shader_coherent(sctx, images[i].resource->nr_samples, true,
+                                    ((struct si_texture*)images[i].resource)->surface.u.gfx9.color.dcc.pipe_aligned);
+      }
+   }
+
+   si_launch_grid_internal(sctx, info, shader, flags | SI_OP_CS_IMAGE);
+
+   /* Restore images. */
+   sctx->b.set_shader_images(&sctx->b, PIPE_SHADER_COMPUTE, 0, num_images, 0, saved_image);
+   for (unsigned i = 0; i < num_images; i++)
+      pipe_resource_reference(&saved_image[i].resource, NULL);
+}
+
+bool si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_level,
                            struct pipe_resource *src, unsigned src_level, unsigned dstx,
                            unsigned dsty, unsigned dstz, const struct pipe_box *src_box,
-                           bool is_dcc_decompress, unsigned flags)
+                           unsigned flags)
 {
-   struct pipe_context *ctx = &sctx->b;
    struct si_texture *ssrc = (struct si_texture*)src;
    struct si_texture *sdst = (struct si_texture*)dst;
-   unsigned width = src_box->width;
-   unsigned height = src_box->height;
-   unsigned depth = src_box->depth;
+
+   /* The compute copy is mandatory for compressed and subsampled formats because the gfx copy
+    * doesn't support them. In all other cases, call si_can_use_compute_blit.
+    *
+    * The format is identical (we only need to check the src format) except for compressed
+    * formats, which can be paired with an equivalent integer format.
+    */
+   if (!util_format_is_compressed(src->format) &&
+       !util_format_is_compressed(dst->format) &&
+       !util_format_is_subsampled_422(src->format)) {
+      bool src_can_use_compute_blit =
+         si_can_use_compute_blit(sctx, src->format, src->nr_samples, false,
+                                 vi_dcc_enabled(ssrc, src_level));
+
+      if (!src_can_use_compute_blit)
+         return false;
+
+      bool dst_can_use_compute_blit =
+         si_can_use_compute_blit(sctx, dst->format, dst->nr_samples, true,
+                                 vi_dcc_enabled(sdst, dst_level));
+
+      if (!dst_can_use_compute_blit && !sctx->has_graphics &&
+          si_can_use_compute_blit(sctx, dst->format, dst->nr_samples, false,
+                                  vi_dcc_enabled(sdst, dst_level))) {
+         /* Non-graphics contexts don't have a blitter, so try harder to do
+          * a compute blit by disabling DCC on the destination texture.
+          */
+         dst_can_use_compute_blit = si_texture_disable_dcc(sctx, sdst);
+      }
+
+      if (!dst_can_use_compute_blit)
+         return false;
+   }
+
    enum pipe_format src_format = util_format_linear(src->format);
    enum pipe_format dst_format = util_format_linear(dst->format);
    bool is_linear = ssrc->surface.is_linear || sdst->surface.is_linear;

    assert(util_format_is_subsampled_422(src_format) == util_format_is_subsampled_422(dst_format));

+   /* Interpret as integer values to avoid NaN issues */
    if (!vi_dcc_enabled(ssrc, src_level) && !vi_dcc_enabled(sdst, dst_level) &&
        src_format == dst_format && util_format_is_float(src_format) &&
        !util_format_is_compressed(src_format)) {
-      /* Interpret as integer values to avoid NaN issues */
       switch(util_format_get_blocksizebits(src_format)) {
       case 16:
          src_format = dst_format = PIPE_FORMAT_R16_UINT;
@@ -457,8 +693,44 @@ void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u
       }
    }

+   /* Interpret compressed formats as UINT. */
+   struct pipe_box new_box;
+   unsigned src_access = 0, dst_access = 0;
+
+   /* Note that staging copies do compressed<->UINT, so one of the formats is already UINT. */
+   if (util_format_is_compressed(src_format) || util_format_is_compressed(dst_format)) {
+      if (util_format_is_compressed(src_format))
+         src_access |= SI_IMAGE_ACCESS_BLOCK_FORMAT_AS_UINT;
+      if (util_format_is_compressed(dst_format))
+         dst_access |= SI_IMAGE_ACCESS_BLOCK_FORMAT_AS_UINT;
+
+      dstx = util_format_get_nblocksx(dst_format, dstx);
+      dsty = util_format_get_nblocksy(dst_format, dsty);
+
+      new_box.x = util_format_get_nblocksx(src_format, src_box->x);
+      new_box.y = util_format_get_nblocksy(src_format, src_box->y);
+      new_box.z = src_box->z;
+      new_box.width = util_format_get_nblocksx(src_format, src_box->width);
+      new_box.height = util_format_get_nblocksy(src_format, src_box->height);
+      new_box.depth = src_box->depth;
+      src_box = &new_box;
+
+      if (ssrc->surface.bpe == 8)
+         src_format = dst_format = PIPE_FORMAT_R16G16B16A16_UINT; /* 64-bit block */
+      else
+         src_format = dst_format = PIPE_FORMAT_R32G32B32A32_UINT; /* 128-bit block */
+   }
+
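Since this path only re-labels compressed blocks as integer texels, a worked example (not part of the patch) may help:

   /* Copying a 128x64-texel region of a BC1 texture (4x4 blocks, bpe == 8):
    *   util_format_get_nblocksx(BC1, 128) == 32
    *   util_format_get_nblocksy(BC1, 64)  == 16
    * so the copy becomes a 32x16 copy of PIPE_FORMAT_R16G16B16A16_UINT texels
    * (64-bit blocks). Formats with bpe == 16, such as BC3 or BC7, take the
    * PIPE_FORMAT_R32G32B32A32_UINT path instead. */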
+   if (util_format_is_subsampled_422(src_format)) {
+      assert(src_format == dst_format);
+
+      src_access |= SI_IMAGE_ACCESS_BLOCK_FORMAT_AS_UINT;
+      dst_access |= SI_IMAGE_ACCESS_BLOCK_FORMAT_AS_UINT;
+
+      dstx = util_format_get_nblocksx(src_format, dstx);
+      src_format = dst_format = PIPE_FORMAT_R32_UINT;
+
       /* Interpreting 422 subsampled format (16 bpp) as 32 bpp
        * should force us to divide src_box->x, dstx and width by 2.
        * But given that ac_surface allocates this format as 32 bpp
@@ -467,135 +739,69 @@ void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u
        */
    }

-   if (width == 0 || height == 0)
-      return;
-
-   /* The driver doesn't decompress resources automatically here. */
-   si_decompress_subresource(ctx, dst, PIPE_MASK_RGBAZS, dst_level, dstz,
-                             dstz + src_box->depth - 1);
-   si_decompress_subresource(ctx, src, PIPE_MASK_RGBAZS, src_level, src_box->z,
-                             src_box->z + src_box->depth - 1);
-
-   /* src and dst have the same number of samples. */
-   si_make_CB_shader_coherent(sctx, src->nr_samples, true,
-                              ssrc->surface.u.gfx9.color.dcc.pipe_aligned);
-   if (sctx->chip_class >= GFX10) {
-      /* GFX10+ uses DCC stores so si_make_CB_shader_coherent is required for dst too */
-      si_make_CB_shader_coherent(sctx, dst->nr_samples, true,
-                                 sdst->surface.u.gfx9.color.dcc.pipe_aligned);
-   }
+   /* SNORM blitting has precision issues. Use the SINT equivalent instead, which doesn't
+    * force DCC decompression.
+    */
+   if (util_format_is_snorm(dst_format))
+      src_format = dst_format = util_format_snorm_to_sint(dst_format);

-   struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
-   struct pipe_image_view saved_image[2] = {0};
-   util_copy_image_view(&saved_image[0], &images->views[0]);
-   util_copy_image_view(&saved_image[1], &images->views[1]);
+   if (src_box->width == 0 || src_box->height == 0 || src_box->depth == 0)
+      return true; /* success - nothing to do */

    struct pipe_image_view image[2] = {0};
    image[0].resource = src;
-   image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ;
+   image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ | src_access;
    image[0].format = src_format;
    image[0].u.tex.level = src_level;
    image[0].u.tex.first_layer = 0;
-   image[0].u.tex.last_layer = src->target == PIPE_TEXTURE_3D ? u_minify(src->depth0, src_level) - 1
-                                                              : (unsigned)(src->array_size - 1);
+   image[0].u.tex.last_layer = util_max_layer(src, src_level);
    image[1].resource = dst;
-   image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE;
+   image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE | dst_access;
    image[1].format = dst_format;
    image[1].u.tex.level = dst_level;
    image[1].u.tex.first_layer = 0;
-   image[1].u.tex.last_layer = dst->target == PIPE_TEXTURE_3D ? u_minify(dst->depth0, dst_level) - 1
-                                                              : (unsigned)(dst->array_size - 1);
-
-   /* SNORM8 blitting has precision issues on some chips. Use the SINT
-    * equivalent instead, which doesn't force DCC decompression.
-    */
-   if (util_format_is_snorm8(dst->format)) {
-      image[0].format = image[1].format = util_format_snorm8_to_sint8(dst->format);
-   }
-
-   if (is_dcc_decompress)
-      image[1].access |= SI_IMAGE_ACCESS_DCC_OFF;
-   else if (sctx->chip_class >= GFX10)
-      image[1].access |= SI_IMAGE_ACCESS_ALLOW_DCC_STORE;
-
-   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, 0, image);
-
-   if (!is_dcc_decompress) {
-      sctx->cs_user_data[0] = src_box->x | (dstx << 16);
-      sctx->cs_user_data[1] = src_box->y | (dsty << 16);
-      sctx->cs_user_data[2] = src_box->z | (dstz << 16);
-   }
+   image[1].u.tex.last_layer = util_max_layer(dst, dst_level);

    struct pipe_grid_info info = {0};

-   if (is_dcc_decompress) {
-      /* The DCC decompression is a normal blit where the load is compressed
-       * and the store is uncompressed. The workgroup size is either equal to
-       * the DCC block size or a multiple thereof. The shader uses a barrier
-       * between loads and stores to safely overwrite each DCC block of pixels.
-       */
-      unsigned dim[3] = {src_box->width, src_box->height, src_box->depth};
-
-      assert(src == dst);
-      assert(dst->target != PIPE_TEXTURE_1D && dst->target != PIPE_TEXTURE_1D_ARRAY);
-
-      if (!sctx->cs_dcc_decompress)
-         sctx->cs_dcc_decompress = si_create_dcc_decompress_cs(ctx);
-
-      info.block[0] = ssrc->surface.u.gfx9.color.dcc_block_width;
-      info.block[1] = ssrc->surface.u.gfx9.color.dcc_block_height;
-      info.block[2] = ssrc->surface.u.gfx9.color.dcc_block_depth;
-
-      /* Make sure the block size is at least the same as wave size. */
-      while (info.block[0] * info.block[1] * info.block[2] <
-             sctx->screen->compute_wave_size) {
-         info.block[0] *= 2;
-      }
+   bool dst_is_1d = dst->target == PIPE_TEXTURE_1D ||
+                    dst->target == PIPE_TEXTURE_1D_ARRAY;
+   bool src_is_1d = src->target == PIPE_TEXTURE_1D ||
+                    src->target == PIPE_TEXTURE_1D_ARRAY;
+   int block_x, block_y;
+   int block_z = 1;

+   /* Choose the block dimensions based on the copy area size. */
+   if (src_box->height <= 4) {
+      block_y = util_next_power_of_two(src_box->height);
+      block_x = 64 / block_y;
+   } else if (src_box->width <= 4) {
+      block_x = util_next_power_of_two(src_box->width);
+      block_y = 64 / block_x;
+   } else if (is_linear) {
+      block_x = 64;
+      block_y = 1;
+   } else {
+      block_x = 8;
+      block_y = 8;
+   }

-      for (unsigned i = 0; i < 3; i++) {
-         info.last_block[i] = dim[i] % info.block[i];
-         info.grid[i] = DIV_ROUND_UP(dim[i], info.block[i]);
-      }
+   sctx->cs_user_data[0] = src_box->x | (dstx << 16);

-      si_launch_grid_internal(sctx, &info, sctx->cs_dcc_decompress, flags | SI_OP_CS_IMAGE);
-   } else if (dst->target == PIPE_TEXTURE_1D_ARRAY && src->target == PIPE_TEXTURE_1D_ARRAY) {
-      if (!sctx->cs_copy_image_1d_array)
-         sctx->cs_copy_image_1d_array = si_create_copy_image_compute_shader_1d_array(ctx);
+   sctx->cs_user_data[1] = src_box->y | (dsty << 16);
+   sctx->cs_user_data[2] = src_box->z | (dstz << 16);

-      info.block[0] = 64;
-      info.last_block[0] = width % 64;
-      info.block[1] = 1;
-      info.block[2] = 1;
-      info.grid[0] = DIV_ROUND_UP(width, 64);
-      info.grid[1] = depth;
-      info.grid[2] = 1;
+   unsigned wg_dim =
+      set_work_size(&info, block_x, block_y, block_z,
+                    src_box->width, src_box->height, src_box->depth);

-      si_launch_grid_internal(sctx, &info, sctx->cs_copy_image_1d_array, flags | SI_OP_CS_IMAGE);
-   } else {
-      if (!sctx->cs_copy_image)
-         sctx->cs_copy_image = si_create_copy_image_compute_shader(ctx);
+   void **copy_image_cs_ptr = &sctx->cs_copy_image[wg_dim - 1][src_is_1d][dst_is_1d];
+   if (!*copy_image_cs_ptr)
+      *copy_image_cs_ptr = si_create_copy_image_cs(sctx, wg_dim, src_is_1d, dst_is_1d);

-      /* This is better for access over PCIe. */
-      if (is_linear) {
-         info.block[0] = 64;
-         info.block[1] = 1;
-      } else {
-         info.block[0] = 8;
-         info.block[1] = 8;
-      }
+   assert(*copy_image_cs_ptr);

-      info.last_block[0] = width % info.block[0];
-      info.last_block[1] = height % info.block[1];
-      info.block[2] = 1;
-      info.grid[0] = DIV_ROUND_UP(width, info.block[0]);
-      info.grid[1] = DIV_ROUND_UP(height, info.block[1]);
-      info.grid[2] = depth;
-
-      si_launch_grid_internal(sctx, &info, sctx->cs_copy_image, flags | SI_OP_CS_IMAGE);
-   }
-
-   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, 0, saved_image);
-   for (int i = 0; i < 2; i++)
-      pipe_resource_reference(&saved_image[i].resource, NULL);
+   si_launch_grid_internal_images(sctx, image, 2, &info, *copy_image_cs_ptr, flags);
+   return true;
 }

 void si_retile_dcc(struct si_context *sctx, struct si_texture *tex)
@@ -649,6 +855,8 @@ void gfx9_clear_dcc_msaa(struct si_context *sctx, struct pipe_resource *res, uin
 {
    struct si_texture *tex = (struct si_texture*)res;

+   assert(sctx->gfx_level < GFX11);
+
    /* Set the DCC buffer. */
    assert(tex->surface.meta_offset && tex->surface.meta_offset <= UINT_MAX);
    assert(tex->buffer.bo_size <= UINT_MAX);
@@ -701,6 +909,8 @@ void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex
    unsigned log_samples = util_logbase2(tex->nr_samples);
    assert(tex->nr_samples >= 2);

+   assert(sctx->gfx_level < GFX11);
+
    /* EQAA FMASK expansion is unimplemented. */
    if (tex->nr_samples != tex->nr_storage_samples)
       return;
@@ -727,7 +937,7 @@ void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex
    /* Bind the shader. */
    void **shader = &sctx->cs_fmask_expand[log_samples - 1][is_array];
    if (!*shader)
-      *shader = si_create_fmask_expand_cs(ctx, tex->nr_samples, is_array);
+      *shader = si_create_fmask_expand_cs(sctx, tex->nr_samples, is_array);

    /* Dispatch compute. */
    struct pipe_grid_info info = {0};
@@ -765,6 +975,47 @@ void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex
                    SI_COHERENCY_SHADER, SI_AUTO_SELECT_CLEAR_METHOD);
 }

+void si_compute_clear_image_dcc_single(struct si_context *sctx, struct si_texture *tex,
+                                       unsigned level, enum pipe_format format,
+                                       const union pipe_color_union *color, unsigned flags)
+{
+   assert(sctx->gfx_level >= GFX11); /* not believed to be useful on gfx10 */
+   unsigned dcc_block_width = tex->surface.u.gfx9.color.dcc_block_width;
+   unsigned dcc_block_height = tex->surface.u.gfx9.color.dcc_block_height;
+   unsigned width = DIV_ROUND_UP(u_minify(tex->buffer.b.b.width0, level), dcc_block_width);
+   unsigned height = DIV_ROUND_UP(u_minify(tex->buffer.b.b.height0, level), dcc_block_height);
+   unsigned depth = util_num_layers(&tex->buffer.b.b, level);
+   bool is_msaa = tex->buffer.b.b.nr_samples >= 2;
+
+   struct pipe_image_view image = {0};
+   image.resource = &tex->buffer.b.b;
+   image.shader_access = image.access = PIPE_IMAGE_ACCESS_WRITE | SI_IMAGE_ACCESS_DCC_OFF;
+   image.format = format;
+   image.u.tex.level = level;
+   image.u.tex.last_layer = depth - 1;
+
+   if (util_format_is_srgb(format)) {
+      union pipe_color_union color_srgb;
+      for (int i = 0; i < 3; i++)
+         color_srgb.f[i] = util_format_linear_to_srgb_float(color->f[i]);
+      color_srgb.f[3] = color->f[3];
+      memcpy(sctx->cs_user_data, color_srgb.ui, sizeof(color->ui));
+   } else {
+      memcpy(sctx->cs_user_data, color->ui, sizeof(color->ui));
+   }
+
+   sctx->cs_user_data[4] = dcc_block_width | (dcc_block_height << 16);
+
+   struct pipe_grid_info info = {0};
+   unsigned wg_dim = set_work_size(&info, 8, 8, 1, width, height, depth);
+
+   void **shader = &sctx->cs_clear_image_dcc_single[is_msaa][wg_dim];
+   if (!*shader)
+      *shader = si_clear_image_dcc_single_shader(sctx, is_msaa, wg_dim);
+
+   si_launch_grid_internal_images(sctx, &image, 1, &info, *shader, flags);
+}
+
 void si_init_compute_blit_functions(struct si_context *sctx)
 {
    sctx->b.clear_buffer = si_pipe_clear_buffer;
@@ -777,17 +1028,12 @@ void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surfac
                                     bool render_condition_enabled)
 {
    struct si_context *sctx = (struct si_context *)ctx;
-   struct si_texture *tex = (struct si_texture*)dstsurf->texture;
    unsigned num_layers = dstsurf->u.tex.last_layer - dstsurf->u.tex.first_layer + 1;
    unsigned data[4 + sizeof(color->ui)] = {dstx, dsty, dstsurf->u.tex.first_layer, 0};

    if (width == 0 || height == 0)
       return;

-   /* The driver doesn't decompress resources automatically here. */
-   si_decompress_subresource(ctx, dstsurf->texture, PIPE_MASK_RGBA, dstsurf->u.tex.level,
-                             dstsurf->u.tex.first_layer, dstsurf->u.tex.last_layer);
-
    if (util_format_is_srgb(dstsurf->format)) {
       union pipe_color_union color_srgb;
       for (int i = 0; i < 3; i++)
@@ -798,16 +1044,9 @@ void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surfac
       memcpy(data + 4, color->ui, sizeof(color->ui));
    }

-   si_make_CB_shader_coherent(sctx, dstsurf->texture->nr_samples, true,
-                              tex->surface.u.gfx9.color.dcc.pipe_aligned);
-
    struct pipe_constant_buffer saved_cb = {};
    si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);

-   struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
-   struct pipe_image_view saved_image = {0};
-   util_copy_image_view(&saved_image, &images->views[0]);
-
    struct pipe_constant_buffer cb = {};
    cb.buffer_size = sizeof(data);
    cb.user_buffer = data;
@@ -815,20 +1054,18 @@ void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surfac
    struct pipe_image_view image = {0};
    image.resource = dstsurf->texture;
-   image.shader_access = image.access = PIPE_IMAGE_ACCESS_WRITE | SI_IMAGE_ACCESS_ALLOW_DCC_STORE;
+   image.shader_access = image.access = PIPE_IMAGE_ACCESS_WRITE;
    image.format = util_format_linear(dstsurf->format);
    image.u.tex.level = dstsurf->u.tex.level;
    image.u.tex.first_layer = 0; /* 3D images ignore first_layer (BASE_ARRAY) */
    image.u.tex.last_layer = dstsurf->u.tex.last_layer;

-   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, 0, &image);
-
    struct pipe_grid_info info = {0};
    void *shader;

    if (dstsurf->texture->target != PIPE_TEXTURE_1D_ARRAY) {
       if (!sctx->cs_clear_render_target)
-         sctx->cs_clear_render_target = si_clear_render_target_shader(ctx);
+         sctx->cs_clear_render_target = si_clear_render_target_shader(sctx, PIPE_TEXTURE_2D_ARRAY);
       shader = sctx->cs_clear_render_target;

       info.block[0] = 8;
@@ -841,7 +1078,7 @@ void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surfac
       info.grid[2] = num_layers;
    } else {
       if (!sctx->cs_clear_render_target_1d_array)
-         sctx->cs_clear_render_target_1d_array = si_clear_render_target_shader_1d_array(ctx);
+         sctx->cs_clear_render_target_1d_array = si_clear_render_target_shader(sctx, PIPE_TEXTURE_1D_ARRAY);
       shader = sctx->cs_clear_render_target_1d_array;

       info.block[0] = 64;
@@ -853,10 +1090,166 @@ void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surfac
       info.grid[2] = 1;
    }

-   si_launch_grid_internal(sctx, &info, shader, SI_OP_SYNC_BEFORE_AFTER | SI_OP_CS_IMAGE |
-                           (render_condition_enabled ? SI_OP_CS_RENDER_COND_ENABLE : 0));
+   si_launch_grid_internal_images(sctx, &image, 1, &info, shader,
+                                  SI_OP_SYNC_BEFORE_AFTER |
+                                  (render_condition_enabled ? SI_OP_CS_RENDER_COND_ENABLE : 0));

-   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, 0, &saved_image);
    ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, true, &saved_cb);
-   pipe_resource_reference(&saved_image.resource, NULL);
+}
+
+/* Return the last component that a compute blit should load and store. */
+static unsigned si_format_get_last_blit_component(enum pipe_format format, bool is_dst)
+{
+   const struct util_format_description *desc = util_format_description(format);
+   unsigned num = 0;
+
+   for (unsigned i = 1; i < 4; i++) {
+      if (desc->swizzle[i] <= PIPE_SWIZZLE_W ||
+          /* If the swizzle is 1 for dst, we need to store 1 explicitly.
+           * The hardware stores 0 by default. */
+          (is_dst && desc->swizzle[i] == PIPE_SWIZZLE_1))
+         num = i;
+   }
+   return num;
+}
+
+static bool si_should_blit_clamp_xy(const struct pipe_blit_info *info)
+{
+   int src_width = u_minify(info->src.resource->width0, info->src.level);
+   int src_height = u_minify(info->src.resource->height0, info->src.level);
+   struct pipe_box box = info->src.box;
+
+   /* Eliminate negative width/height/depth. */
+   if (box.width < 0) {
+      box.x += box.width;
+      box.width *= -1;
+   }
+   if (box.height < 0) {
+      box.y += box.height;
+      box.height *= -1;
+   }
+
+   bool in_bounds = box.x >= 0 && box.x < src_width &&
+                    box.y >= 0 && box.y < src_height &&
+                    box.x + box.width > 0 && box.x + box.width <= src_width &&
+                    box.y + box.height > 0 && box.y + box.height <= src_height;
+
+   /* Return whether the box is out of bounds, i.e. whether clamping is needed. */
+   return !in_bounds;
+}
+
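The clamp decision is subtle enough to deserve a worked example (not part of the patch):

   /* With a 256x256 source level:
    *   box = {x = -2, width = 10}  -> already normalized; x < 0 is out of
    *                                  bounds, so si_should_blit_clamp_xy()
    *                                  returns true and the shader key gets
    *                                  xy_clamp_to_edge = 1.
    *   box = {x = 10, width = -10} -> normalized to {x = 0, width = 10},
    *                                  fully in bounds, so a flipped blit
    *                                  alone does not force clamping. */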
+bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info, bool testing)
+{
+   /* Compute blits require D16 right now (see the ISA).
+    *
+    * Testing on Navi21 showed that the compute blit is slightly slower than the gfx blit.
+    * The compute blit is even slower with DCC stores. VP13 CATIA_plane_pencil is a good test
+    * for that because it's mostly just blits.
+    *
+    * TODO: benchmark the performance on gfx11
+    */
+   if (sctx->gfx_level < GFX11 && !testing)
+      return false;
+
+   if (!si_can_use_compute_blit(sctx, info->dst.format, info->dst.resource->nr_samples, true,
+                                vi_dcc_enabled((struct si_texture*)info->dst.resource,
+                                               info->dst.level)) ||
+       !si_can_use_compute_blit(sctx, info->src.format, info->src.resource->nr_samples, false,
+                                vi_dcc_enabled((struct si_texture*)info->src.resource,
+                                               info->src.level)))
+      return false;
+
+   if (info->alpha_blend ||
+       info->num_window_rectangles ||
+       info->scissor_enable ||
+       /* No scaling. */
+       info->dst.box.width != abs(info->src.box.width) ||
+       info->dst.box.height != abs(info->src.box.height) ||
+       info->dst.box.depth != abs(info->src.box.depth))
+      return false;
+
+   assert(info->src.box.depth >= 0);
+
+   /* Shader images. */
+   struct pipe_image_view image[2];
+   image[0].resource = info->src.resource;
+   image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ;
+   image[0].format = info->src.format;
+   image[0].u.tex.level = info->src.level;
+   image[0].u.tex.first_layer = 0;
+   image[0].u.tex.last_layer = util_max_layer(info->src.resource, info->src.level);
+
+   image[1].resource = info->dst.resource;
+   image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE;
+   image[1].format = info->dst.format;
+   image[1].u.tex.level = info->dst.level;
+   image[1].u.tex.first_layer = 0;
+   image[1].u.tex.last_layer = util_max_layer(info->dst.resource, info->dst.level);
+
+   struct pipe_grid_info grid = {0};
+   unsigned wg_dim =
+      set_work_size(&grid, 8, 8, 1, info->dst.box.width, info->dst.box.height,
+                    info->dst.box.depth);
+
+   /* Get the shader key. */
+   const struct util_format_description *dst_desc = util_format_description(info->dst.format);
+   unsigned i = util_format_get_first_non_void_channel(info->dst.format);
+   union si_compute_blit_shader_key options;
+   options.key = 0;
+
+   options.always_true = true;
+   options.wg_dim = wg_dim;
+   options.src_is_1d = info->src.resource->target == PIPE_TEXTURE_1D ||
+                       info->src.resource->target == PIPE_TEXTURE_1D_ARRAY;
+   options.dst_is_1d = info->dst.resource->target == PIPE_TEXTURE_1D ||
+                       info->dst.resource->target == PIPE_TEXTURE_1D_ARRAY;
+   options.src_is_msaa = info->src.resource->nr_samples > 1;
+   options.dst_is_msaa = info->dst.resource->nr_samples > 1;
+   /* Resolving integer formats only copies sample 0. log2_samples is then unused. */
+   options.sample0_only = options.src_is_msaa && !options.dst_is_msaa &&
+                          util_format_is_pure_integer(info->src.format);
+   unsigned num_samples = MAX2(info->src.resource->nr_samples, info->dst.resource->nr_samples);
+   options.log2_samples = options.sample0_only ? 0 : util_logbase2(num_samples);
+   options.xy_clamp_to_edge = si_should_blit_clamp_xy(info);
+   options.flip_x = info->src.box.width < 0;
+   options.flip_y = info->src.box.height < 0;
+   options.sint_to_uint = util_format_is_pure_sint(info->src.format) &&
+                          util_format_is_pure_uint(info->dst.format);
+   options.uint_to_sint = util_format_is_pure_uint(info->src.format) &&
+                          util_format_is_pure_sint(info->dst.format);
+   options.dst_is_srgb = util_format_is_srgb(info->dst.format);
+   options.last_dst_channel = si_format_get_last_blit_component(info->dst.format, true);
+   options.last_src_channel = MIN2(si_format_get_last_blit_component(info->src.format, false),
+                                   options.last_dst_channel);
+   options.use_integer_one = util_format_is_pure_integer(info->dst.format) &&
+                             options.last_src_channel < options.last_dst_channel &&
+                             options.last_dst_channel == 3;
+
+   /* WARNING: We need this option for AMD_TEST to get results identical with the gfx blit,
+    * otherwise we wouldn't be able to fully validate whether everything else works.
+    * The test expects that the behavior is identical to u_blitter.
+    *
+    * Additionally, we need to keep this enabled even when not testing because not doing fp16_rtz
+    * breaks "piglit/bin/texsubimage -auto pbo".
+    */
+   options.fp16_rtz = !util_format_is_pure_integer(info->dst.format) &&
+                      dst_desc->channel[i].size <= 10;
+
+   struct hash_entry *entry = _mesa_hash_table_search(sctx->cs_blit_shaders,
+                                                      (void*)(uintptr_t)options.key);
+   void *shader = entry ? entry->data : NULL;
+   if (!shader) {
+      shader = si_create_blit_cs(sctx, &options);
+      _mesa_hash_table_insert(sctx->cs_blit_shaders,
+                              (void*)(uintptr_t)options.key, shader);
+   }
+
+   sctx->cs_user_data[0] = (info->src.box.x & 0xffff) | ((info->dst.box.x & 0xffff) << 16);
+   sctx->cs_user_data[1] = (info->src.box.y & 0xffff) | ((info->dst.box.y & 0xffff) << 16);
+   sctx->cs_user_data[2] = (info->src.box.z & 0xffff) | ((info->dst.box.z & 0xffff) << 16);
+
+   si_launch_grid_internal_images(sctx, image, 2, &grid, shader,
+                                  SI_OP_SYNC_BEFORE_AFTER |
+                                  (info->render_condition_enable ? SI_OP_CS_RENDER_COND_ENABLE : 0));
+   return true;
 }
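For context, options.key packs every variant-affecting switch into one integer; the union itself is defined in si_pipe.h, not in this file. The following is a hedged reconstruction from the fields assigned above — the bit widths are guesses, only the packing idea matters:

   /* Hedged reconstruction of union si_compute_blit_shader_key (the real
    * definition lives in si_pipe.h; widths here are guesses). Everything
    * aliases 'key', so the options double as the hash-table key, and
    * always_true keeps the key non-zero so it never collides with the
    * table's reserved NULL key. */
   union si_compute_blit_shader_key {
      struct {
         bool always_true:1;
         uint8_t wg_dim:2;        /* 1, 2, or 3, from set_work_size */
         bool src_is_1d:1;
         bool dst_is_1d:1;
         bool src_is_msaa:1;
         bool dst_is_msaa:1;
         bool sample0_only:1;     /* integer resolve copies only sample 0 */
         uint8_t log2_samples:4;
         bool xy_clamp_to_edge:1;
         bool flip_x:1;
         bool flip_y:1;
         bool sint_to_uint:1;
         bool uint_to_sint:1;
         bool dst_is_srgb:1;
         uint8_t last_dst_channel:2;
         uint8_t last_src_channel:2;
         bool use_integer_one:1;
         bool fp16_rtz:1;         /* match u_blitter's float16 rounding */
      };
      uint32_t key;
   };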