Diffstat (limited to 'src/gallium/drivers/radeonsi/si_compute_blit.c')
-rw-r--r--  src/gallium/drivers/radeonsi/si_compute_blit.c  793
1 file changed, 593 insertions(+), 200 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
index 696aa8d12bc..d08f99a4031 100644
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -1,41 +1,53 @@
/*
* Copyright 2018 Advanced Micro Devices, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
*
+ * SPDX-License-Identifier: MIT
*/
#include "si_pipe.h"
#include "util/format/u_format.h"
#include "util/format_srgb.h"
#include "util/u_helpers.h"
+#include "util/hash_table.h"
+
+static bool si_can_use_compute_blit(struct si_context *sctx, enum pipe_format format,
+ unsigned num_samples, bool is_store, bool has_dcc)
+{
+ /* TODO: This format fails AMD_TEST=imagecopy. */
+ if (format == PIPE_FORMAT_A8R8_UNORM && is_store)
+ return false;
+
+ /* MSAA image stores are broken. AMD_DEBUG=nofmask fixes them, implying that the FMASK
+ * expand pass doesn't work, but let's use the gfx blit, which should be faster because
+ * it doesn't require expanding the FMASK.
+ *
+ * TODO: Broken MSAA stores can cause app issues, though this issue might only affect
+ * internal blits, not sure.
+ *
+ * EQAA image stores are also unimplemented; they should still be rejected here even after
+ * MSAA image stores are fixed.
+ */
+ if (num_samples > 1 && is_store)
+ return false;
+
+ if (util_format_is_depth_or_stencil(format))
+ return false;
+
+ /* Image stores support DCC since GFX10. */
+ if (has_dcc && is_store && sctx->gfx_level < GFX10)
+ return false;
+
+ return true;
+}
/* Determine the cache policy. */
static enum si_cache_policy get_cache_policy(struct si_context *sctx, enum si_coherency coher,
uint64_t size)
{
- if ((sctx->chip_class >= GFX9 && (coher == SI_COHERENCY_CB_META ||
+ if ((sctx->gfx_level >= GFX9 && (coher == SI_COHERENCY_CB_META ||
coher == SI_COHERENCY_DB_META ||
coher == SI_COHERENCY_CP)) ||
- (sctx->chip_class >= GFX7 && coher == SI_COHERENCY_SHADER))
+ (sctx->gfx_level >= GFX7 && coher == SI_COHERENCY_SHADER))
return L2_LRU; /* it's faster if L2 doesn't evict anything */
return L2_BYPASS;
@@ -59,20 +71,64 @@ unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
}
}
-void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *info,
- void *shader, unsigned flags)
+static bool si_is_buffer_idle(struct si_context *sctx, struct si_resource *buf,
+ unsigned usage)
+{
+ return !si_cs_is_buffer_referenced(sctx, buf->buf, usage) &&
+ sctx->ws->buffer_wait(sctx->ws, buf->buf, 0, usage);
+}
+
+static void si_improve_sync_flags(struct si_context *sctx, struct pipe_resource *dst,
+ struct pipe_resource *src, unsigned *flags)
{
+ if (dst->target != PIPE_BUFFER || (src && src->target != PIPE_BUFFER))
+ return;
+
+ if (si_is_buffer_idle(sctx, si_resource(dst), RADEON_USAGE_READWRITE) &&
+ (!src || si_is_buffer_idle(sctx, si_resource(src), RADEON_USAGE_WRITE))) {
+ /* Idle buffers don't have to sync. */
+ *flags &= ~(SI_OP_SYNC_GE_BEFORE | SI_OP_SYNC_PS_BEFORE | SI_OP_SYNC_CS_BEFORE |
+ SI_OP_SYNC_CPDMA_BEFORE);
+ return;
+ }
+
+ const unsigned cs_mask = SI_BIND_CONSTANT_BUFFER(PIPE_SHADER_COMPUTE) |
+ SI_BIND_SHADER_BUFFER(PIPE_SHADER_COMPUTE) |
+ SI_BIND_IMAGE_BUFFER(PIPE_SHADER_COMPUTE) |
+ SI_BIND_SAMPLER_BUFFER(PIPE_SHADER_COMPUTE);
+
+ const unsigned ps_mask = SI_BIND_CONSTANT_BUFFER(PIPE_SHADER_FRAGMENT) |
+ SI_BIND_SHADER_BUFFER(PIPE_SHADER_FRAGMENT) |
+ SI_BIND_IMAGE_BUFFER(PIPE_SHADER_FRAGMENT) |
+ SI_BIND_SAMPLER_BUFFER(PIPE_SHADER_FRAGMENT);
+
+ unsigned bind_history = si_resource(dst)->bind_history |
+ (src ? si_resource(src)->bind_history : 0);
+
+ /* Clear SI_OP_SYNC_CS_BEFORE if the buffer has never been used with a CS. */
+ if (*flags & SI_OP_SYNC_CS_BEFORE && !(bind_history & cs_mask))
+ *flags &= ~SI_OP_SYNC_CS_BEFORE;
+
+ /* Clear SI_OP_SYNC_PS_BEFORE if the buffer has never been used with a PS. */
+ if (*flags & SI_OP_SYNC_PS_BEFORE && !(bind_history & ps_mask)) {
+ *flags &= ~SI_OP_SYNC_PS_BEFORE;
+ *flags |= SI_OP_SYNC_GE_BEFORE;
+ }
+}
+static void si_launch_grid_internal(struct si_context *sctx, const struct pipe_grid_info *info,
+ void *shader, unsigned flags)
+{
/* Wait for previous shaders to finish. */
+ if (flags & SI_OP_SYNC_GE_BEFORE)
+ sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
+
if (flags & SI_OP_SYNC_PS_BEFORE)
sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
if (flags & SI_OP_SYNC_CS_BEFORE)
sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
- if (!(flags & SI_OP_CS_IMAGE))
- sctx->flags |= SI_CONTEXT_PFP_SYNC_ME;
-
/* Invalidate L0-L1 caches. */
/* sL0 is never invalidated, because src resources don't use it. */
if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE))
@@ -80,11 +136,18 @@ void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *inf
/* Set settings for driver-internal compute dispatches. */
sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
- sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
+ if (sctx->num_hw_pipestat_streamout_queries)
+ sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
+
+ if (sctx->flags)
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
if (!(flags & SI_OP_CS_RENDER_COND_ENABLE))
sctx->render_cond_enabled = false;
+ /* Force-disable fbfetch because there are unsolvable recursion problems. */
+ si_force_disable_ps_colorbuf0_slot(sctx);
+
/* Skip decompression to prevent infinite recursion. */
sctx->blitter_running = true;
@@ -96,23 +159,42 @@ void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *inf
/* Restore default settings. */
sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS;
- sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
+ if (sctx->num_hw_pipestat_streamout_queries)
+ sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
+
sctx->render_cond_enabled = sctx->render_cond;
sctx->blitter_running = false;
+ /* We force-disabled fbfetch, so recompute the state. */
+ si_update_ps_colorbuf0_slot(sctx);
+
if (flags & SI_OP_SYNC_AFTER) {
sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
if (flags & SI_OP_CS_IMAGE) {
/* Make sure image stores are visible to CB, which doesn't use L2 on GFX6-8. */
- sctx->flags |= sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0;
+ sctx->flags |= sctx->gfx_level <= GFX8 ? SI_CONTEXT_WB_L2 : 0;
/* Make sure image stores are visible to all CUs. */
sctx->flags |= SI_CONTEXT_INV_VCACHE;
+ /* Make sure RBs see our DCC changes. */
+ if (sctx->gfx_level >= GFX10 && sctx->screen->info.tcc_rb_non_coherent) {
+ unsigned enabled_mask = sctx->images[PIPE_SHADER_COMPUTE].enabled_mask;
+ while (enabled_mask) {
+ int i = u_bit_scan(&enabled_mask);
+ if (sctx->images[PIPE_SHADER_COMPUTE].views[i].access & SI_IMAGE_ACCESS_ALLOW_DCC_STORE) {
+ sctx->flags |= SI_CONTEXT_INV_L2;
+ break;
+ }
+ }
+ }
} else {
/* Make sure buffer stores are visible to all CUs. */
sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | SI_CONTEXT_PFP_SYNC_ME;
}
}
+
+ if (sctx->flags)
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
}
void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_info *info,
@@ -120,8 +202,10 @@ void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_inf
unsigned num_buffers, const struct pipe_shader_buffer *buffers,
unsigned writeable_bitmask)
{
- if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE))
+ if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE)) {
sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+ }
/* Save states. */
struct pipe_shader_buffer saved_sb[3] = {};
@@ -136,14 +220,17 @@ void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_inf
}
/* Bind buffers and launch compute. */
- sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, num_buffers, buffers,
- writeable_bitmask);
+ si_set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, num_buffers, buffers,
+ writeable_bitmask,
+ true /* don't update bind_history to prevent unnecessary syncs later */);
si_launch_grid_internal(sctx, info, shader, flags);
/* Do cache flushing at the end. */
if (get_cache_policy(sctx, coher, 0) == L2_BYPASS) {
- if (flags & SI_OP_SYNC_AFTER)
+ if (flags & SI_OP_SYNC_AFTER) {
sctx->flags |= SI_CONTEXT_WB_L2;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+ }
} else {
while (writeable_bitmask)
si_resource(buffers[u_bit_scan(&writeable_bitmask)].buffer)->TC_L2_dirty = true;
@@ -172,14 +259,14 @@ void si_compute_clear_buffer_rmw(struct si_context *sctx, struct pipe_resource *
/* Use buffer_load_dwordx4 and buffer_store_dwordx4 per thread. */
unsigned dwords_per_instruction = 4;
- unsigned wave_size = sctx->screen->compute_wave_size;
- unsigned dwords_per_wave = dwords_per_instruction * wave_size;
+ unsigned block_size = 64; /* it's always 64x1x1 */
+ unsigned dwords_per_wave = dwords_per_instruction * block_size;
unsigned num_dwords = size / 4;
unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
struct pipe_grid_info info = {};
- info.block[0] = MIN2(wave_size, num_instructions);
+ info.block[0] = MIN2(block_size, num_instructions);
info.block[1] = 1;
info.block[2] = 1;
info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
@@ -195,7 +282,7 @@ void si_compute_clear_buffer_rmw(struct si_context *sctx, struct pipe_resource *
sctx->cs_user_data[1] = ~writebitmask;
if (!sctx->cs_clear_buffer_rmw)
- sctx->cs_clear_buffer_rmw = si_create_clear_buffer_rmw_cs(&sctx->b);
+ sctx->cs_clear_buffer_rmw = si_create_clear_buffer_rmw_cs(sctx);
si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_clear_buffer_rmw, flags, coher,
1, &sb, 0x1);
@@ -206,8 +293,6 @@ static void si_compute_clear_12bytes_buffer(struct si_context *sctx, struct pipe
const uint32_t *clear_value, unsigned flags,
enum si_coherency coher)
{
- struct pipe_context *ctx = &sctx->b;
-
assert(dst_offset % 4 == 0);
assert(size % 4 == 0);
unsigned size_12 = DIV_ROUND_UP(size, 12);
@@ -222,7 +307,7 @@ static void si_compute_clear_12bytes_buffer(struct si_context *sctx, struct pipe
struct pipe_grid_info info = {0};
if (!sctx->cs_clear_12bytes_buffer)
- sctx->cs_clear_12bytes_buffer = si_clear_12bytes_buffer_shader(ctx);
+ sctx->cs_clear_12bytes_buffer = si_clear_12bytes_buffer_shader(sctx);
info.block[0] = 64;
info.last_block[0] = size_12 % 64;
@@ -257,14 +342,15 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res
src ? SI_COMPUTE_COPY_DW_PER_THREAD : SI_COMPUTE_CLEAR_DW_PER_THREAD;
unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4);
unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread;
- unsigned wave_size = sctx->screen->compute_wave_size;
- unsigned dwords_per_wave = dwords_per_thread * wave_size;
+ /* The shader declares the block size like this: */
+ unsigned block_size = si_determine_wave_size(sctx->screen, NULL);
+ unsigned dwords_per_wave = dwords_per_thread * block_size;
unsigned num_dwords = size / 4;
unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
struct pipe_grid_info info = {};
- info.block[0] = MIN2(wave_size, num_instructions);
+ info.block[0] = MIN2(block_size, num_instructions);
info.block[1] = 1;
info.block[2] = 1;
info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
@@ -285,7 +371,7 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res
if (!sctx->cs_copy_buffer) {
sctx->cs_copy_buffer = si_create_dma_compute_shader(
- &sctx->b, SI_COMPUTE_COPY_DW_PER_THREAD, shader_dst_stream_policy, true);
+ sctx, SI_COMPUTE_COPY_DW_PER_THREAD, shader_dst_stream_policy, true);
}
si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_copy_buffer, flags, coher,
@@ -299,7 +385,7 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res
if (!sctx->cs_clear_buffer) {
sctx->cs_clear_buffer = si_create_dma_compute_shader(
- &sctx->b, SI_COMPUTE_CLEAR_DW_PER_THREAD, shader_dst_stream_policy, false);
+ sctx, SI_COMPUTE_CLEAR_DW_PER_THREAD, shader_dst_stream_policy, false);
}
si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_clear_buffer, flags, coher,
@@ -315,6 +401,8 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
if (!size)
return;
+ si_improve_sync_flags(sctx, dst, NULL, &flags);
+
ASSERTED unsigned clear_alignment = MIN2(clear_value_size, 4);
assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */
@@ -335,7 +423,7 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
if (aligned_size >= 4) {
uint64_t compute_min_size;
- if (sctx->chip_class <= GFX8) {
+ if (sctx->gfx_level <= GFX8) {
/* CP DMA clears are terribly slow with GTT on GFX6-8, which can always
* happen due to BO evictions.
*/
@@ -345,7 +433,9 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
compute_min_size = 4 * 1024;
}
- if (method == SI_AUTO_SELECT_CLEAR_METHOD && (
+ /* TODO: use compute for unaligned big sizes */
+ if (method == SI_AUTO_SELECT_CLEAR_METHOD &&
+ (flags & SI_OP_CS_RENDER_COND_ENABLE ||
clear_value_size > 4 ||
(clear_value_size == 4 && offset % 4 == 0 && size > compute_min_size))) {
method = SI_COMPUTE_CLEAR_METHOD;
@@ -355,6 +445,7 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
clear_value_size, flags, coher);
} else {
assert(clear_value_size == 4);
+ assert(!(flags & SI_OP_CS_RENDER_COND_ENABLE));
si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, dst, offset, aligned_size, *clear_value,
flags, coher, get_cache_policy(sctx, coher, size));
}
@@ -365,26 +456,21 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
/* Handle non-dword alignment. */
if (size) {
+ assert(!(flags & SI_OP_CS_RENDER_COND_ENABLE));
assert(dst);
assert(dst->target == PIPE_BUFFER);
assert(size < 4);
- pipe_buffer_write(&sctx->b, dst, offset, size, clear_value);
+ sctx->b.buffer_subdata(&sctx->b, dst,
+ PIPE_MAP_WRITE |
+ /* TC forbids drivers from invalidating buffers and inferring unsynchronized mappings,
+ * so suppress those optimizations. */
+ (sctx->tc ? TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED |
+ TC_TRANSFER_MAP_NO_INVALIDATE : 0),
+ offset, size, clear_value);
}
}
-void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst, uint64_t offset,
- uint64_t size, unsigned value, unsigned flags)
-{
- struct si_context *ctx = (struct si_context *)sscreen->aux_context;
-
- simple_mtx_lock(&sscreen->aux_context_lock);
- si_clear_buffer(ctx, dst, offset, size, &value, 4, flags,
- SI_COHERENCY_SHADER, SI_AUTO_SELECT_CLEAR_METHOD);
- sscreen->aux_context->flush(sscreen->aux_context, NULL, 0);
- simple_mtx_unlock(&sscreen->aux_context_lock);
-}
-
static void si_pipe_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
unsigned offset, unsigned size, const void *clear_value,
int clear_value_size)
@@ -404,7 +490,10 @@ void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct p
enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
uint64_t compute_min_size = 8 * 1024;
+ si_improve_sync_flags(sctx, dst, src, &flags);
+
/* Only use compute for VRAM copies on dGPUs. */
+ /* TODO: use compute for unaligned big sizes */
if (sctx->screen->info.has_dedicated_vram && si_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
si_resource(src)->domains & RADEON_DOMAIN_VRAM && size > compute_min_size &&
dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0) {
@@ -416,29 +505,176 @@ void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct p
}
}
-void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_level,
+void si_compute_shorten_ubyte_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src,
+ uint64_t dst_offset, uint64_t src_offset, unsigned size, unsigned flags)
+{
+ if (!size)
+ return;
+
+ if (!sctx->cs_ubyte_to_ushort)
+ sctx->cs_ubyte_to_ushort = si_create_ubyte_to_ushort_compute_shader(sctx);
+
+ /* Use COHERENCY_NONE to get SI_CONTEXT_WB_L2 automatically used in
+ * si_launch_grid_internal_ssbos.
+ */
+ enum si_coherency coher = SI_COHERENCY_NONE;
+
+ si_improve_sync_flags(sctx, dst, src, &flags);
+
+ struct pipe_grid_info info = {};
+ info.block[0] = si_determine_wave_size(sctx->screen, NULL);
+ info.block[1] = 1;
+ info.block[2] = 1;
+ info.grid[0] = DIV_ROUND_UP(size, info.block[0]);
+ info.grid[1] = 1;
+ info.grid[2] = 1;
+ info.last_block[0] = size % info.block[0];
+
+ struct pipe_shader_buffer sb[2] = {};
+ sb[0].buffer = dst;
+ sb[0].buffer_offset = dst_offset;
+ sb[0].buffer_size = dst->width0;
+
+ sb[1].buffer = src;
+ sb[1].buffer_offset = src_offset;
+ sb[1].buffer_size = src->width0;
+
+ si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_ubyte_to_ushort, flags, coher,
+ 2, sb, 0x1);
+}
+
+static unsigned
+set_work_size(struct pipe_grid_info *info, unsigned block_x, unsigned block_y, unsigned block_z,
+ unsigned work_x, unsigned work_y, unsigned work_z)
+{
+ info->block[0] = block_x;
+ info->block[1] = block_y;
+ info->block[2] = block_z;
+
+ unsigned work[3] = {work_x, work_y, work_z};
+ for (int i = 0; i < 3; ++i) {
+ info->last_block[i] = work[i] % info->block[i];
+ info->grid[i] = DIV_ROUND_UP(work[i], info->block[i]);
+ }
+
+ return work_z > 1 ? 3 : (work_y > 1 ? 2 : 1);
+}
+
+static void si_launch_grid_internal_images(struct si_context *sctx,
+ struct pipe_image_view *images,
+ unsigned num_images,
+ const struct pipe_grid_info *info,
+ void *shader, unsigned flags)
+{
+ struct pipe_image_view saved_image[2] = {};
+ assert(num_images <= ARRAY_SIZE(saved_image));
+
+ for (unsigned i = 0; i < num_images; i++) {
+ assert(sctx->b.screen->is_format_supported(sctx->b.screen, images[i].format,
+ images[i].resource->target,
+ images[i].resource->nr_samples,
+ images[i].resource->nr_storage_samples,
+ PIPE_BIND_SHADER_IMAGE));
+
+ /* Always allow DCC stores on gfx10+. */
+ if (sctx->gfx_level >= GFX10 &&
+ images[i].access & PIPE_IMAGE_ACCESS_WRITE &&
+ !(images[i].access & SI_IMAGE_ACCESS_DCC_OFF))
+ images[i].access |= SI_IMAGE_ACCESS_ALLOW_DCC_STORE;
+
+ /* Simplify the format according to what image stores support. */
+ if (images[i].access & PIPE_IMAGE_ACCESS_WRITE) {
+ images[i].format = util_format_linear(images[i].format); /* SRGB not supported */
+ /* Keep L8A8 formats as-is because GFX7 is unable to store into R8A8 for some reason. */
+ images[i].format = util_format_intensity_to_red(images[i].format);
+ images[i].format = util_format_rgbx_to_rgba(images[i].format); /* prevent partial writes */
+ }
+
+ /* Save the image. */
+ util_copy_image_view(&saved_image[i], &sctx->images[PIPE_SHADER_COMPUTE].views[i]);
+ }
+
+ /* This might invoke DCC decompression, so do it first. */
+ sctx->b.set_shader_images(&sctx->b, PIPE_SHADER_COMPUTE, 0, num_images, 0, images);
+
+ /* This should be done after set_shader_images. */
+ for (unsigned i = 0; i < num_images; i++) {
+ /* The driver doesn't decompress resources automatically here, so do it manually. */
+ si_decompress_subresource(&sctx->b, images[i].resource, PIPE_MASK_RGBAZS,
+ images[i].u.tex.level, images[i].u.tex.first_layer,
+ images[i].u.tex.last_layer,
+ images[i].access & PIPE_IMAGE_ACCESS_WRITE);
+ }
+
+ /* This must be done before the compute shader. */
+ if (flags & SI_OP_SYNC_PS_BEFORE) {
+ for (unsigned i = 0; i < num_images; i++) {
+ si_make_CB_shader_coherent(sctx, images[i].resource->nr_samples, true,
+ ((struct si_texture*)images[i].resource)->surface.u.gfx9.color.dcc.pipe_aligned);
+ }
+ }
+
+ si_launch_grid_internal(sctx, info, shader, flags | SI_OP_CS_IMAGE);
+
+ /* Restore images. */
+ sctx->b.set_shader_images(&sctx->b, PIPE_SHADER_COMPUTE, 0, num_images, 0, saved_image);
+ for (unsigned i = 0; i < num_images; i++)
+ pipe_resource_reference(&saved_image[i].resource, NULL);
+}
+
+bool si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_level,
struct pipe_resource *src, unsigned src_level, unsigned dstx,
unsigned dsty, unsigned dstz, const struct pipe_box *src_box,
- bool is_dcc_decompress, unsigned flags)
+ unsigned flags)
{
- struct pipe_context *ctx = &sctx->b;
struct si_texture *ssrc = (struct si_texture*)src;
struct si_texture *sdst = (struct si_texture*)dst;
- unsigned width = src_box->width;
- unsigned height = src_box->height;
- unsigned depth = src_box->depth;
+
+ /* The compute copy is mandatory for compressed and subsampled formats because the gfx copy
+ * doesn't support them. In all other cases, call si_can_use_compute_blit.
+ *
+ * The format is identical (we only need to check the src format) except compressed formats,
+ * which can be paired with an equivalent integer format.
+ */
+ if (!util_format_is_compressed(src->format) &&
+ !util_format_is_compressed(dst->format) &&
+ !util_format_is_subsampled_422(src->format)) {
+ bool src_can_use_compute_blit =
+ si_can_use_compute_blit(sctx, src->format, src->nr_samples, false,
+ vi_dcc_enabled(ssrc, src_level));
+
+ if (!src_can_use_compute_blit)
+ return false;
+
+ bool dst_can_use_compute_blit =
+ si_can_use_compute_blit(sctx, dst->format, dst->nr_samples, true,
+ vi_dcc_enabled(sdst, dst_level));
+
+ if (!dst_can_use_compute_blit && !sctx->has_graphics &&
+ si_can_use_compute_blit(sctx, dst->format, dst->nr_samples, false,
+ vi_dcc_enabled(sdst, dst_level))) {
+ /* Non-graphics contexts don't have a blitter, so try harder to do
+ * a compute blit by disabling DCC on the destination texture.
+ */
+ dst_can_use_compute_blit = si_texture_disable_dcc(sctx, sdst);
+ }
+
+ if (!dst_can_use_compute_blit)
+ return false;
+ }
+
enum pipe_format src_format = util_format_linear(src->format);
enum pipe_format dst_format = util_format_linear(dst->format);
bool is_linear = ssrc->surface.is_linear || sdst->surface.is_linear;
assert(util_format_is_subsampled_422(src_format) == util_format_is_subsampled_422(dst_format));
+ /* Interpret as integer values to avoid NaN issues */
if (!vi_dcc_enabled(ssrc, src_level) &&
!vi_dcc_enabled(sdst, dst_level) &&
src_format == dst_format &&
util_format_is_float(src_format) &&
!util_format_is_compressed(src_format)) {
- /* Interpret as integer values to avoid NaN issues */
switch(util_format_get_blocksizebits(src_format)) {
case 16:
src_format = dst_format = PIPE_FORMAT_R16_UINT;
@@ -457,8 +693,44 @@ void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u
}
}
+ /* Interpret compressed formats as UINT. */
+ struct pipe_box new_box;
+ unsigned src_access = 0, dst_access = 0;
+
+ /* Note that staging copies do compressed<->UINT, so one of the formats is already UINT. */
+ if (util_format_is_compressed(src_format) || util_format_is_compressed(dst_format)) {
+ if (util_format_is_compressed(src_format))
+ src_access |= SI_IMAGE_ACCESS_BLOCK_FORMAT_AS_UINT;
+ if (util_format_is_compressed(dst_format))
+ dst_access |= SI_IMAGE_ACCESS_BLOCK_FORMAT_AS_UINT;
+
+ dstx = util_format_get_nblocksx(dst_format, dstx);
+ dsty = util_format_get_nblocksy(dst_format, dsty);
+
+ new_box.x = util_format_get_nblocksx(src_format, src_box->x);
+ new_box.y = util_format_get_nblocksy(src_format, src_box->y);
+ new_box.z = src_box->z;
+ new_box.width = util_format_get_nblocksx(src_format, src_box->width);
+ new_box.height = util_format_get_nblocksy(src_format, src_box->height);
+ new_box.depth = src_box->depth;
+ src_box = &new_box;
+
+ if (ssrc->surface.bpe == 8)
+ src_format = dst_format = PIPE_FORMAT_R16G16B16A16_UINT; /* 64-bit block */
+ else
+ src_format = dst_format = PIPE_FORMAT_R32G32B32A32_UINT; /* 128-bit block */
+ }
+
if (util_format_is_subsampled_422(src_format)) {
+ assert(src_format == dst_format);
+
+ src_access |= SI_IMAGE_ACCESS_BLOCK_FORMAT_AS_UINT;
+ dst_access |= SI_IMAGE_ACCESS_BLOCK_FORMAT_AS_UINT;
+
+ dstx = util_format_get_nblocksx(src_format, dstx);
+
src_format = dst_format = PIPE_FORMAT_R32_UINT;
+
/* Interpreting 422 subsampled format (16 bpp) as 32 bpp
* should force us to divide src_box->x, dstx and width by 2.
* But given that ac_surface allocates this format as 32 bpp
@@ -467,135 +739,69 @@ void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u
*/
}
- if (width == 0 || height == 0)
- return;
-
- /* The driver doesn't decompress resources automatically here. */
- si_decompress_subresource(ctx, dst, PIPE_MASK_RGBAZS, dst_level, dstz,
- dstz + src_box->depth - 1);
- si_decompress_subresource(ctx, src, PIPE_MASK_RGBAZS, src_level, src_box->z,
- src_box->z + src_box->depth - 1);
-
- /* src and dst have the same number of samples. */
- si_make_CB_shader_coherent(sctx, src->nr_samples, true,
- ssrc->surface.u.gfx9.color.dcc.pipe_aligned);
- if (sctx->chip_class >= GFX10) {
- /* GFX10+ uses DCC stores so si_make_CB_shader_coherent is required for dst too */
- si_make_CB_shader_coherent(sctx, dst->nr_samples, true,
- sdst->surface.u.gfx9.color.dcc.pipe_aligned);
- }
+ /* SNORM blitting has precision issues. Use the SINT equivalent instead, which doesn't
+ * force DCC decompression.
+ */
+ if (util_format_is_snorm(dst_format))
+ src_format = dst_format = util_format_snorm_to_sint(dst_format);
- struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
- struct pipe_image_view saved_image[2] = {0};
- util_copy_image_view(&saved_image[0], &images->views[0]);
- util_copy_image_view(&saved_image[1], &images->views[1]);
+ if (src_box->width == 0 || src_box->height == 0 || src_box->depth == 0)
+ return true; /* success - nothing to do */
struct pipe_image_view image[2] = {0};
image[0].resource = src;
- image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ;
+ image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ | src_access;
image[0].format = src_format;
image[0].u.tex.level = src_level;
image[0].u.tex.first_layer = 0;
- image[0].u.tex.last_layer = src->target == PIPE_TEXTURE_3D ? u_minify(src->depth0, src_level) - 1
- : (unsigned)(src->array_size - 1);
+ image[0].u.tex.last_layer = util_max_layer(src, src_level);
image[1].resource = dst;
- image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE;
+ image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE | dst_access;
image[1].format = dst_format;
image[1].u.tex.level = dst_level;
image[1].u.tex.first_layer = 0;
- image[1].u.tex.last_layer = dst->target == PIPE_TEXTURE_3D ? u_minify(dst->depth0, dst_level) - 1
- : (unsigned)(dst->array_size - 1);
-
- /* SNORM8 blitting has precision issues on some chips. Use the SINT
- * equivalent instead, which doesn't force DCC decompression.
- */
- if (util_format_is_snorm8(dst->format)) {
- image[0].format = image[1].format = util_format_snorm8_to_sint8(dst->format);
- }
-
- if (is_dcc_decompress)
- image[1].access |= SI_IMAGE_ACCESS_DCC_OFF;
- else if (sctx->chip_class >= GFX10)
- image[1].access |= SI_IMAGE_ACCESS_ALLOW_DCC_STORE;
-
- ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, 0, image);
-
- if (!is_dcc_decompress) {
- sctx->cs_user_data[0] = src_box->x | (dstx << 16);
- sctx->cs_user_data[1] = src_box->y | (dsty << 16);
- sctx->cs_user_data[2] = src_box->z | (dstz << 16);
- }
+ image[1].u.tex.last_layer = util_max_layer(dst, dst_level);
struct pipe_grid_info info = {0};
- if (is_dcc_decompress) {
- /* The DCC decompression is a normal blit where the load is compressed
- * and the store is uncompressed. The workgroup size is either equal to
- * the DCC block size or a multiple thereof. The shader uses a barrier
- * between loads and stores to safely overwrite each DCC block of pixels.
- */
- unsigned dim[3] = {src_box->width, src_box->height, src_box->depth};
-
- assert(src == dst);
- assert(dst->target != PIPE_TEXTURE_1D && dst->target != PIPE_TEXTURE_1D_ARRAY);
-
- if (!sctx->cs_dcc_decompress)
- sctx->cs_dcc_decompress = si_create_dcc_decompress_cs(ctx);
-
- info.block[0] = ssrc->surface.u.gfx9.color.dcc_block_width;
- info.block[1] = ssrc->surface.u.gfx9.color.dcc_block_height;
- info.block[2] = ssrc->surface.u.gfx9.color.dcc_block_depth;
-
- /* Make sure the block size is at least the same as wave size. */
- while (info.block[0] * info.block[1] * info.block[2] <
- sctx->screen->compute_wave_size) {
- info.block[0] *= 2;
- }
+ bool dst_is_1d = dst->target == PIPE_TEXTURE_1D ||
+ dst->target == PIPE_TEXTURE_1D_ARRAY;
+ bool src_is_1d = src->target == PIPE_TEXTURE_1D ||
+ src->target == PIPE_TEXTURE_1D_ARRAY;
+ int block_x, block_y;
+ int block_z = 1;
+
+ /* Choose the block dimensions based on the copy area size. */
+ if (src_box->height <= 4) {
+ block_y = util_next_power_of_two(src_box->height);
+ block_x = 64 / block_y;
+ } else if (src_box->width <= 4) {
+ block_x = util_next_power_of_two(src_box->width);
+ block_y = 64 / block_x;
+ } else if (is_linear) {
+ block_x = 64;
+ block_y = 1;
+ } else {
+ block_x = 8;
+ block_y = 8;
+ }
- for (unsigned i = 0; i < 3; i++) {
- info.last_block[i] = dim[i] % info.block[i];
- info.grid[i] = DIV_ROUND_UP(dim[i], info.block[i]);
- }
+ sctx->cs_user_data[0] = src_box->x | (dstx << 16);
+ sctx->cs_user_data[1] = src_box->y | (dsty << 16);
+ sctx->cs_user_data[2] = src_box->z | (dstz << 16);
- si_launch_grid_internal(sctx, &info, sctx->cs_dcc_decompress, flags | SI_OP_CS_IMAGE);
- } else if (dst->target == PIPE_TEXTURE_1D_ARRAY && src->target == PIPE_TEXTURE_1D_ARRAY) {
- if (!sctx->cs_copy_image_1d_array)
- sctx->cs_copy_image_1d_array = si_create_copy_image_compute_shader_1d_array(ctx);
+ unsigned wg_dim =
+ set_work_size(&info, block_x, block_y, block_z,
+ src_box->width, src_box->height, src_box->depth);
- info.block[0] = 64;
- info.last_block[0] = width % 64;
- info.block[1] = 1;
- info.block[2] = 1;
- info.grid[0] = DIV_ROUND_UP(width, 64);
- info.grid[1] = depth;
- info.grid[2] = 1;
+ void **copy_image_cs_ptr = &sctx->cs_copy_image[wg_dim - 1][src_is_1d][dst_is_1d];
+ if (!*copy_image_cs_ptr)
+ *copy_image_cs_ptr = si_create_copy_image_cs(sctx, wg_dim, src_is_1d, dst_is_1d);
- si_launch_grid_internal(sctx, &info, sctx->cs_copy_image_1d_array, flags | SI_OP_CS_IMAGE);
- } else {
- if (!sctx->cs_copy_image)
- sctx->cs_copy_image = si_create_copy_image_compute_shader(ctx);
+ assert(*copy_image_cs_ptr);
- /* This is better for access over PCIe. */
- if (is_linear) {
- info.block[0] = 64;
- info.block[1] = 1;
- } else {
- info.block[0] = 8;
- info.block[1] = 8;
- }
- info.last_block[0] = width % info.block[0];
- info.last_block[1] = height % info.block[1];
- info.block[2] = 1;
- info.grid[0] = DIV_ROUND_UP(width, info.block[0]);
- info.grid[1] = DIV_ROUND_UP(height, info.block[1]);
- info.grid[2] = depth;
-
- si_launch_grid_internal(sctx, &info, sctx->cs_copy_image, flags | SI_OP_CS_IMAGE);
- }
-
- ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, 0, saved_image);
- for (int i = 0; i < 2; i++)
- pipe_resource_reference(&saved_image[i].resource, NULL);
+ si_launch_grid_internal_images(sctx, image, 2, &info, *copy_image_cs_ptr, flags);
+ return true;
}
void si_retile_dcc(struct si_context *sctx, struct si_texture *tex)
@@ -649,6 +855,8 @@ void gfx9_clear_dcc_msaa(struct si_context *sctx, struct pipe_resource *res, uin
{
struct si_texture *tex = (struct si_texture*)res;
+ assert(sctx->gfx_level < GFX11);
+
/* Set the DCC buffer. */
assert(tex->surface.meta_offset && tex->surface.meta_offset <= UINT_MAX);
assert(tex->buffer.bo_size <= UINT_MAX);
@@ -701,6 +909,8 @@ void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex
unsigned log_samples = util_logbase2(tex->nr_samples);
assert(tex->nr_samples >= 2);
+ assert(sctx->gfx_level < GFX11);
+
/* EQAA FMASK expansion is unimplemented. */
if (tex->nr_samples != tex->nr_storage_samples)
return;
@@ -727,7 +937,7 @@ void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex
/* Bind the shader. */
void **shader = &sctx->cs_fmask_expand[log_samples - 1][is_array];
if (!*shader)
- *shader = si_create_fmask_expand_cs(ctx, tex->nr_samples, is_array);
+ *shader = si_create_fmask_expand_cs(sctx, tex->nr_samples, is_array);
/* Dispatch compute. */
struct pipe_grid_info info = {0};
@@ -765,6 +975,47 @@ void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex
SI_COHERENCY_SHADER, SI_AUTO_SELECT_CLEAR_METHOD);
}
+void si_compute_clear_image_dcc_single(struct si_context *sctx, struct si_texture *tex,
+ unsigned level, enum pipe_format format,
+ const union pipe_color_union *color, unsigned flags)
+{
+ assert(sctx->gfx_level >= GFX11); /* not believed to be useful on gfx10 */
+ unsigned dcc_block_width = tex->surface.u.gfx9.color.dcc_block_width;
+ unsigned dcc_block_height = tex->surface.u.gfx9.color.dcc_block_height;
+ unsigned width = DIV_ROUND_UP(u_minify(tex->buffer.b.b.width0, level), dcc_block_width);
+ unsigned height = DIV_ROUND_UP(u_minify(tex->buffer.b.b.height0, level), dcc_block_height);
+ unsigned depth = util_num_layers(&tex->buffer.b.b, level);
+ bool is_msaa = tex->buffer.b.b.nr_samples >= 2;
+
+ struct pipe_image_view image = {0};
+ image.resource = &tex->buffer.b.b;
+ image.shader_access = image.access = PIPE_IMAGE_ACCESS_WRITE | SI_IMAGE_ACCESS_DCC_OFF;
+ image.format = format;
+ image.u.tex.level = level;
+ image.u.tex.last_layer = depth - 1;
+
+ if (util_format_is_srgb(format)) {
+ union pipe_color_union color_srgb;
+ for (int i = 0; i < 3; i++)
+ color_srgb.f[i] = util_format_linear_to_srgb_float(color->f[i]);
+ color_srgb.f[3] = color->f[3];
+ memcpy(sctx->cs_user_data, color_srgb.ui, sizeof(color->ui));
+ } else {
+ memcpy(sctx->cs_user_data, color->ui, sizeof(color->ui));
+ }
+
+ sctx->cs_user_data[4] = dcc_block_width | (dcc_block_height << 16);
+
+ struct pipe_grid_info info = {0};
+ unsigned wg_dim = set_work_size(&info, 8, 8, 1, width, height, depth);
+
+ void **shader = &sctx->cs_clear_image_dcc_single[is_msaa][wg_dim];
+ if (!*shader)
+ *shader = si_clear_image_dcc_single_shader(sctx, is_msaa, wg_dim);
+
+ si_launch_grid_internal_images(sctx, &image, 1, &info, *shader, flags);
+}
+
void si_init_compute_blit_functions(struct si_context *sctx)
{
sctx->b.clear_buffer = si_pipe_clear_buffer;
@@ -777,17 +1028,12 @@ void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surfac
bool render_condition_enabled)
{
struct si_context *sctx = (struct si_context *)ctx;
- struct si_texture *tex = (struct si_texture*)dstsurf->texture;
unsigned num_layers = dstsurf->u.tex.last_layer - dstsurf->u.tex.first_layer + 1;
unsigned data[4 + sizeof(color->ui)] = {dstx, dsty, dstsurf->u.tex.first_layer, 0};
if (width == 0 || height == 0)
return;
- /* The driver doesn't decompress resources automatically here. */
- si_decompress_subresource(ctx, dstsurf->texture, PIPE_MASK_RGBA, dstsurf->u.tex.level,
- dstsurf->u.tex.first_layer, dstsurf->u.tex.last_layer);
-
if (util_format_is_srgb(dstsurf->format)) {
union pipe_color_union color_srgb;
for (int i = 0; i < 3; i++)
@@ -798,16 +1044,9 @@ void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surfac
memcpy(data + 4, color->ui, sizeof(color->ui));
}
- si_make_CB_shader_coherent(sctx, dstsurf->texture->nr_samples, true,
- tex->surface.u.gfx9.color.dcc.pipe_aligned);
-
struct pipe_constant_buffer saved_cb = {};
si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
- struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
- struct pipe_image_view saved_image = {0};
- util_copy_image_view(&saved_image, &images->views[0]);
-
struct pipe_constant_buffer cb = {};
cb.buffer_size = sizeof(data);
cb.user_buffer = data;
@@ -815,20 +1054,18 @@ void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surfac
struct pipe_image_view image = {0};
image.resource = dstsurf->texture;
- image.shader_access = image.access = PIPE_IMAGE_ACCESS_WRITE | SI_IMAGE_ACCESS_ALLOW_DCC_STORE;
+ image.shader_access = image.access = PIPE_IMAGE_ACCESS_WRITE;
image.format = util_format_linear(dstsurf->format);
image.u.tex.level = dstsurf->u.tex.level;
image.u.tex.first_layer = 0; /* 3D images ignore first_layer (BASE_ARRAY) */
image.u.tex.last_layer = dstsurf->u.tex.last_layer;
- ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, 0, &image);
-
struct pipe_grid_info info = {0};
void *shader;
if (dstsurf->texture->target != PIPE_TEXTURE_1D_ARRAY) {
if (!sctx->cs_clear_render_target)
- sctx->cs_clear_render_target = si_clear_render_target_shader(ctx);
+ sctx->cs_clear_render_target = si_clear_render_target_shader(sctx, PIPE_TEXTURE_2D_ARRAY);
shader = sctx->cs_clear_render_target;
info.block[0] = 8;
@@ -841,7 +1078,7 @@ void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surfac
info.grid[2] = num_layers;
} else {
if (!sctx->cs_clear_render_target_1d_array)
- sctx->cs_clear_render_target_1d_array = si_clear_render_target_shader_1d_array(ctx);
+ sctx->cs_clear_render_target_1d_array = si_clear_render_target_shader(sctx, PIPE_TEXTURE_1D_ARRAY);
shader = sctx->cs_clear_render_target_1d_array;
info.block[0] = 64;
@@ -853,10 +1090,166 @@ void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surfac
info.grid[2] = 1;
}
- si_launch_grid_internal(sctx, &info, shader, SI_OP_SYNC_BEFORE_AFTER | SI_OP_CS_IMAGE |
- (render_condition_enabled ? SI_OP_CS_RENDER_COND_ENABLE : 0));
+ si_launch_grid_internal_images(sctx, &image, 1, &info, shader,
+ SI_OP_SYNC_BEFORE_AFTER |
+ (render_condition_enabled ? SI_OP_CS_RENDER_COND_ENABLE : 0));
- ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, 0, &saved_image);
ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, true, &saved_cb);
- pipe_resource_reference(&saved_image.resource, NULL);
+}
+
+/* Return the last component that a compute blit should load and store. */
+static unsigned si_format_get_last_blit_component(enum pipe_format format, bool is_dst)
+{
+ const struct util_format_description *desc = util_format_description(format);
+ unsigned num = 0;
+
+ for (unsigned i = 1; i < 4; i++) {
+ if (desc->swizzle[i] <= PIPE_SWIZZLE_W ||
+ /* If the swizzle is 1 for dst, we need to store 1 explicitly.
+ * The hardware stores 0 by default. */
+ (is_dst && desc->swizzle[i] == PIPE_SWIZZLE_1))
+ num = i;
+ }
+ return num;
+}
+
+static bool si_should_blit_clamp_xy(const struct pipe_blit_info *info)
+{
+ int src_width = u_minify(info->src.resource->width0, info->src.level);
+ int src_height = u_minify(info->src.resource->height0, info->src.level);
+ struct pipe_box box = info->src.box;
+
+ /* Eliminate negative width/height/depth. */
+ if (box.width < 0) {
+ box.x += box.width;
+ box.width *= -1;
+ }
+ if (box.height < 0) {
+ box.y += box.height;
+ box.height *= -1;
+ }
+
+ bool in_bounds = box.x >= 0 && box.x < src_width &&
+ box.y >= 0 && box.y < src_height &&
+ box.x + box.width > 0 && box.x + box.width <= src_width &&
+ box.y + box.height > 0 && box.y + box.height <= src_height;
+
+ /* Return true if the box is not fully within bounds (xy clamping to edge is needed). */
+ return !in_bounds;
+}
+
+bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info, bool testing)
+{
+ /* Compute blits require D16 right now (see the ISA).
+ *
+ * Testing on Navi21 showed that the compute blit is slightly slower than the gfx blit.
+ * The compute blit is even slower with DCC stores. VP13 CATIA_plane_pencil is a good test
+ * for that because it's mostly just blits.
+ *
+ * TODO: benchmark the performance on gfx11
+ */
+ if (sctx->gfx_level < GFX11 && !testing)
+ return false;
+
+ if (!si_can_use_compute_blit(sctx, info->dst.format, info->dst.resource->nr_samples, true,
+ vi_dcc_enabled((struct si_texture*)info->dst.resource,
+ info->dst.level)) ||
+ !si_can_use_compute_blit(sctx, info->src.format, info->src.resource->nr_samples, false,
+ vi_dcc_enabled((struct si_texture*)info->src.resource,
+ info->src.level)))
+ return false;
+
+ if (info->alpha_blend ||
+ info->num_window_rectangles ||
+ info->scissor_enable ||
+ /* No scaling. */
+ info->dst.box.width != abs(info->src.box.width) ||
+ info->dst.box.height != abs(info->src.box.height) ||
+ info->dst.box.depth != abs(info->src.box.depth))
+ return false;
+
+ assert(info->src.box.depth >= 0);
+
+ /* Shader images. */
+ struct pipe_image_view image[2];
+ image[0].resource = info->src.resource;
+ image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ;
+ image[0].format = info->src.format;
+ image[0].u.tex.level = info->src.level;
+ image[0].u.tex.first_layer = 0;
+ image[0].u.tex.last_layer = util_max_layer(info->src.resource, info->src.level);
+
+ image[1].resource = info->dst.resource;
+ image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE;
+ image[1].format = info->dst.format;
+ image[1].u.tex.level = info->dst.level;
+ image[1].u.tex.first_layer = 0;
+ image[1].u.tex.last_layer = util_max_layer(info->dst.resource, info->dst.level);
+
+ struct pipe_grid_info grid = {0};
+ unsigned wg_dim =
+ set_work_size(&grid, 8, 8, 1, info->dst.box.width, info->dst.box.height,
+ info->dst.box.depth);
+
+ /* Get the shader key. */
+ const struct util_format_description *dst_desc = util_format_description(info->dst.format);
+ unsigned i = util_format_get_first_non_void_channel(info->dst.format);
+ union si_compute_blit_shader_key options;
+ options.key = 0;
+
+ options.always_true = true;
+ options.wg_dim = wg_dim;
+ options.src_is_1d = info->src.resource->target == PIPE_TEXTURE_1D ||
+ info->src.resource->target == PIPE_TEXTURE_1D_ARRAY;
+ options.dst_is_1d = info->dst.resource->target == PIPE_TEXTURE_1D ||
+ info->dst.resource->target == PIPE_TEXTURE_1D_ARRAY;
+ options.src_is_msaa = info->src.resource->nr_samples > 1;
+ options.dst_is_msaa = info->dst.resource->nr_samples > 1;
+ /* Resolving integer formats only copies sample 0. log2_samples is then unused. */
+ options.sample0_only = options.src_is_msaa && !options.dst_is_msaa &&
+ util_format_is_pure_integer(info->src.format);
+ unsigned num_samples = MAX2(info->src.resource->nr_samples, info->dst.resource->nr_samples);
+ options.log2_samples = options.sample0_only ? 0 : util_logbase2(num_samples);
+ options.xy_clamp_to_edge = si_should_blit_clamp_xy(info);
+ options.flip_x = info->src.box.width < 0;
+ options.flip_y = info->src.box.height < 0;
+ options.sint_to_uint = util_format_is_pure_sint(info->src.format) &&
+ util_format_is_pure_uint(info->dst.format);
+ options.uint_to_sint = util_format_is_pure_uint(info->src.format) &&
+ util_format_is_pure_sint(info->dst.format);
+ options.dst_is_srgb = util_format_is_srgb(info->dst.format);
+ options.last_dst_channel = si_format_get_last_blit_component(info->dst.format, true);
+ options.last_src_channel = MIN2(si_format_get_last_blit_component(info->src.format, false),
+ options.last_dst_channel);
+ options.use_integer_one = util_format_is_pure_integer(info->dst.format) &&
+ options.last_src_channel < options.last_dst_channel &&
+ options.last_dst_channel == 3;
+
+ /* WARNING: We need this option for AMD_TEST to get results identical with the gfx blit,
+ * otherwise we wouldn't be able to fully validate whether everything else works.
+ * The test expects that the behavior is identical to u_blitter.
+ *
+ * Additionally, we need to keep this enabled even when not testing because not doing fp16_rtz
+ * breaks "piglit/bin/texsubimage -auto pbo".
+ */
+ options.fp16_rtz = !util_format_is_pure_integer(info->dst.format) &&
+ dst_desc->channel[i].size <= 10;
+
+ struct hash_entry *entry = _mesa_hash_table_search(sctx->cs_blit_shaders,
+ (void*)(uintptr_t)options.key);
+ void *shader = entry ? entry->data : NULL;
+ if (!shader) {
+ shader = si_create_blit_cs(sctx, &options);
+ _mesa_hash_table_insert(sctx->cs_blit_shaders,
+ (void*)(uintptr_t)options.key, shader);
+ }
+
+ sctx->cs_user_data[0] = (info->src.box.x & 0xffff) | ((info->dst.box.x & 0xffff) << 16);
+ sctx->cs_user_data[1] = (info->src.box.y & 0xffff) | ((info->dst.box.y & 0xffff) << 16);
+ sctx->cs_user_data[2] = (info->src.box.z & 0xffff) | ((info->dst.box.z & 0xffff) << 16);
+
+ si_launch_grid_internal_images(sctx, image, 2, &grid, shader,
+ SI_OP_SYNC_BEFORE_AFTER |
+ (info->render_condition_enable ? SI_OP_CS_RENDER_COND_ENABLE : 0));
+ return true;
}