author     Marek Olšák <marek.olsak@amd.com>      2020-12-03 17:57:45 -0500
committer  Marge Bot <eric+marge@anholt.net>      2020-12-09 00:52:26 +0000
commit     1f31a216640f294ce310898773d9b42bda5d1d47
tree       e9827d03decb361c3e580e5a72e4a0512ef3c681
parent     5b81194fee22f6b22f3448d445acd38b647cd1e8
radeonsi: remove SDMA support
There are many issues with SDMA across many generations of hardware.
A recent example is that gfx10.3 suffers from random GPU hangs if
userspace uses SDMA.
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7908>
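
The pattern applied throughout the patch: callers of the removed SDMA helpers
(si_sdma_clear_buffer, si_sdma_copy_buffer, sctx->dma_copy) switch to the
generic gallium entry points, which radeonsi services with CP DMA or compute
shaders instead of the SDMA ring. A minimal sketch of the clear path, modeled
on the radeon_video.c hunk below; "res" is a hypothetical
struct pipe_resource pointer, not a name from this patch:

    /* Before: si_sdma_clear_buffer(sctx, res, 0, res->width0, 0); */
    uint32_t zero = 0;

    /* After: clear the whole buffer with a 4-byte clear value through the
     * generic pipe_context hook; no SDMA command stream is involved. */
    sctx->b.clear_buffer(&sctx->b, res, 0 /* offset */, res->width0 /* size */,
                         &zero, 4 /* clear_value_size */);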
-rw-r--r--  docs/envvars.rst                                 |    8
-rw-r--r--  src/gallium/drivers/radeon/radeon_video.c        |    3
-rw-r--r--  src/gallium/drivers/radeonsi/Makefile.sources    |    2
-rw-r--r--  src/gallium/drivers/radeonsi/cik_sdma.c          |  554
-rw-r--r--  src/gallium/drivers/radeonsi/meson.build         |    2
-rw-r--r--  src/gallium/drivers/radeonsi/si_blit.c           |   19
-rw-r--r--  src/gallium/drivers/radeonsi/si_buffer.c         |  118
-rw-r--r--  src/gallium/drivers/radeonsi/si_compute_blit.c   |    1
-rw-r--r--  src/gallium/drivers/radeonsi/si_debug.c          |   22
-rw-r--r--  src/gallium/drivers/radeonsi/si_dma_cs.c         |  319
-rw-r--r--  src/gallium/drivers/radeonsi/si_fence.c          |   98
-rw-r--r--  src/gallium/drivers/radeonsi/si_gfx_cs.c         |   43
-rw-r--r--  src/gallium/drivers/radeonsi/si_pipe.c           |   54
-rw-r--r--  src/gallium/drivers/radeonsi/si_pipe.h           |   50
-rw-r--r--  src/gallium/drivers/radeonsi/si_query.c          |   51
-rw-r--r--  src/gallium/drivers/radeonsi/si_query.h          |    4
-rw-r--r--  src/gallium/drivers/radeonsi/si_test_dma.c       |   10
-rw-r--r--  src/gallium/drivers/radeonsi/si_test_dma_perf.c  |   47
-rw-r--r--  src/gallium/drivers/radeonsi/si_texture.c        |   75
19 files changed, 67 insertions(+), 1413 deletions(-)
diff --git a/docs/envvars.rst b/docs/envvars.rst
index 7782197e59f..9acc1cef3e4 100644
--- a/docs/envvars.rst
+++ b/docs/envvars.rst
@@ -662,12 +662,6 @@ radeonsi driver environment variables
 ``AMD_DEBUG``
    a comma-separated list of named flags, which do various things:
 
-   ``nodma``
-      Disable SDMA
-   ``nodmaclear``
-      Disable SDMA clears
-   ``nodmacopyimage``
-      Disable SDMA image copies
    ``nodcc``
       Disable DCC.
    ``nodccclear``
@@ -736,8 +730,6 @@ radeonsi driver environment variables
       Use old-style monolithic shaders compiled on demand
    ``nooptvariant``
       Disable compiling optimized shader variants.
-   ``forcedma``
-      Use SDMA for all operations when possible.
    ``nowc``
       Disable GTT write combining
    ``check_vm``
diff --git a/src/gallium/drivers/radeon/radeon_video.c b/src/gallium/drivers/radeon/radeon_video.c
index 97767872bf1..25962d73c6d 100644
--- a/src/gallium/drivers/radeon/radeon_video.c
+++ b/src/gallium/drivers/radeon/radeon_video.c
@@ -129,7 +129,8 @@ error:
 void si_vid_clear_buffer(struct pipe_context *context, struct rvid_buffer *buffer)
 {
    struct si_context *sctx = (struct si_context *)context;
+   uint32_t zero = 0;
 
-   si_sdma_clear_buffer(sctx, &buffer->res->b.b, 0, buffer->res->b.b.width0, 0);
+   sctx->b.clear_buffer(&sctx->b, &buffer->res->b.b, 0, buffer->res->b.b.width0, &zero, 4);
 
    context->flush(context, NULL, 0);
 }
diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources
index 32bcf20042a..8fd312a2b6a 100644
--- a/src/gallium/drivers/radeonsi/Makefile.sources
+++ b/src/gallium/drivers/radeonsi/Makefile.sources
@@ -1,5 +1,4 @@
 C_SOURCES := \
-	cik_sdma.c \
 	driinfo_radeonsi.h \
 	gfx10_query.c \
 	gfx10_shader_ngg.c \
@@ -15,7 +14,6 @@ C_SOURCES := \
 	si_cp_reg_shadowing.c \
 	si_debug.c \
 	si_descriptors.c \
-	si_dma_cs.c \
 	si_fence.c \
 	si_get.c \
 	si_gfx_cs.c \
diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c
deleted file mode 100644
index 1c154050ed6..00000000000
--- a/src/gallium/drivers/radeonsi/cik_sdma.c
+++ /dev/null
@@ -1,554 +0,0 @@
-/*
- * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
- * Copyright 2015 Advanced Micro Devices, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#include "si_pipe.h"
-#include "sid.h"
-
-static unsigned minify_as_blocks(unsigned width, unsigned level, unsigned blk_w)
-{
-   width = u_minify(width, level);
-   return DIV_ROUND_UP(width, blk_w);
-}
-
-static unsigned encode_tile_info(struct si_context *sctx, struct si_texture *tex, unsigned level,
-                                 bool set_bpp)
-{
-   struct radeon_info *info = &sctx->screen->info;
-   unsigned tile_index = tex->surface.u.legacy.tiling_index[level];
-   unsigned macro_tile_index = tex->surface.u.legacy.macro_tile_index;
-   unsigned tile_mode = info->si_tile_mode_array[tile_index];
-   unsigned macro_tile_mode = info->cik_macrotile_mode_array[macro_tile_index];
-
-   return (set_bpp ? util_logbase2(tex->surface.bpe) : 0) | (G_009910_ARRAY_MODE(tile_mode) << 3) |
-          (G_009910_MICRO_TILE_MODE_NEW(tile_mode) << 8) |
-          /* Non-depth modes don't have TILE_SPLIT set. */
-          ((util_logbase2(tex->surface.u.legacy.tile_split >> 6)) << 11) |
-          (G_009990_BANK_WIDTH(macro_tile_mode) << 15) |
-          (G_009990_BANK_HEIGHT(macro_tile_mode) << 18) |
-          (G_009990_NUM_BANKS(macro_tile_mode) << 21) |
-          (G_009990_MACRO_TILE_ASPECT(macro_tile_mode) << 24) |
-          (G_009910_PIPE_CONFIG(tile_mode) << 26);
-}
-
-static bool si_sdma_v4_copy_texture(struct si_context *sctx, struct pipe_resource *dst,
-                                    unsigned dst_level, unsigned dstx, unsigned dsty, unsigned dstz,
-                                    struct pipe_resource *src, unsigned src_level,
-                                    const struct pipe_box *src_box)
-{
-   struct si_texture *ssrc = (struct si_texture *)src;
-   struct si_texture *sdst = (struct si_texture *)dst;
-
-   unsigned bpp = sdst->surface.bpe;
-   uint64_t dst_address = sdst->buffer.gpu_address + sdst->surface.u.gfx9.surf_offset;
-   uint64_t src_address = ssrc->buffer.gpu_address + ssrc->surface.u.gfx9.surf_offset;
-   unsigned dst_pitch = sdst->surface.u.gfx9.surf_pitch;
-   unsigned src_pitch = ssrc->surface.u.gfx9.surf_pitch;
-   uint64_t dst_slice_pitch = ((uint64_t)sdst->surface.u.gfx9.surf_slice_size) / bpp;
-   uint64_t src_slice_pitch = ((uint64_t)ssrc->surface.u.gfx9.surf_slice_size) / bpp;
-   unsigned srcx = src_box->x / ssrc->surface.blk_w;
-   unsigned srcy = src_box->y / ssrc->surface.blk_h;
-   unsigned srcz = src_box->z;
-   unsigned copy_width = DIV_ROUND_UP(src_box->width, ssrc->surface.blk_w);
-   unsigned copy_height = DIV_ROUND_UP(src_box->height, ssrc->surface.blk_h);
-   unsigned copy_depth = src_box->depth;
-   unsigned xalign = MAX2(1, 4 / bpp);
-
-   assert(src_level <= src->last_level);
-   assert(dst_level <= dst->last_level);
-   assert(sdst->surface.u.gfx9.surf_offset + dst_slice_pitch * bpp * (dstz + src_box->depth) <=
-          sdst->buffer.buf->size);
-   assert(ssrc->surface.u.gfx9.surf_offset + src_slice_pitch * bpp * (srcz + src_box->depth) <=
-          ssrc->buffer.buf->size);
-
-   if (!si_prepare_for_dma_blit(sctx, sdst, dst_level, dstx, dsty, dstz, ssrc, src_level, src_box))
-      return false;
-
-   dstx /= sdst->surface.blk_w;
-   dsty /= sdst->surface.blk_h;
-
-   if (srcx >= (1 << 14) || srcy >= (1 << 14) || srcz >= (1 << 11) || dstx >= (1 << 14) ||
-       dsty >= (1 << 14) || dstz >= (1 << 11))
-      return false;
-
-   /* Linear -> linear sub-window copy. */
-   if (ssrc->surface.is_linear && sdst->surface.is_linear) {
-      struct radeon_cmdbuf *cs = &sctx->sdma_cs;
-
-      /* Check if everything fits into the bitfields */
-      if (!(src_pitch <= (1 << 19) && dst_pitch <= (1 << 19) && src_slice_pitch <= (1 << 28) &&
-            dst_slice_pitch <= (1 << 28) && copy_width <= (1 << 14) && copy_height <= (1 << 14) &&
-            copy_depth <= (1 << 11)))
-         return false;
-
-      si_need_dma_space(sctx, 13, &sdst->buffer, &ssrc->buffer);
-
-      src_address += ssrc->surface.u.gfx9.offset[src_level];
-      dst_address += sdst->surface.u.gfx9.offset[dst_level];
-
-      /* Check alignments */
-      if ((src_address % 4) != 0 || (dst_address % 4) != 0 || (src_pitch % xalign) != 0)
-         return false;
-
-      radeon_emit(
-         cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW,
-                             sctx->ws->cs_is_secure(cs) ? (1u << 2) : 0) |
-                (util_logbase2(bpp) << 29));
-      radeon_emit(cs, src_address);
-      radeon_emit(cs, src_address >> 32);
-      radeon_emit(cs, srcx | (srcy << 16));
-      radeon_emit(cs, srcz | ((src_pitch - 1) << 13));
-      radeon_emit(cs, src_slice_pitch - 1);
-      radeon_emit(cs, dst_address);
-      radeon_emit(cs, dst_address >> 32);
-      radeon_emit(cs, dstx | (dsty << 16));
-      radeon_emit(cs, dstz | ((dst_pitch - 1) << 13));
-      radeon_emit(cs, dst_slice_pitch - 1);
-      radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16));
-      radeon_emit(cs, (copy_depth - 1));
-      return true;
-   }
-
-   /* Linear <-> Tiled sub-window copy */
-   if (ssrc->surface.is_linear != sdst->surface.is_linear) {
-      struct si_texture *tiled = ssrc->surface.is_linear ? sdst : ssrc;
-      struct si_texture *linear = tiled == ssrc ? sdst : ssrc;
-      unsigned tiled_level = tiled == ssrc ? src_level : dst_level;
-      unsigned linear_level = linear == ssrc ? src_level : dst_level;
-      unsigned tiled_x = tiled == ssrc ? srcx : dstx;
-      unsigned linear_x = linear == ssrc ? srcx : dstx;
-      unsigned tiled_y = tiled == ssrc ? srcy : dsty;
-      unsigned linear_y = linear == ssrc ? srcy : dsty;
-      unsigned tiled_z = tiled == ssrc ? srcz : dstz;
-      unsigned linear_z = linear == ssrc ? srcz : dstz;
-      unsigned tiled_width = tiled == ssrc
-                                ? DIV_ROUND_UP(ssrc->buffer.b.b.width0, ssrc->surface.blk_w)
-                                : DIV_ROUND_UP(sdst->buffer.b.b.width0, sdst->surface.blk_w);
-      unsigned tiled_height = tiled == ssrc
-                                 ? DIV_ROUND_UP(ssrc->buffer.b.b.height0, ssrc->surface.blk_h)
-                                 : DIV_ROUND_UP(sdst->buffer.b.b.height0, sdst->surface.blk_h);
-      unsigned tiled_depth = tiled == ssrc ? ssrc->buffer.b.b.depth0 : sdst->buffer.b.b.depth0;
-      unsigned linear_pitch = linear == ssrc ? src_pitch : dst_pitch;
-      unsigned linear_slice_pitch = linear == ssrc ? src_slice_pitch : dst_slice_pitch;
-      uint64_t tiled_address = tiled == ssrc ? src_address : dst_address;
-      uint64_t linear_address = linear == ssrc ? src_address : dst_address;
-      struct radeon_cmdbuf *cs = &sctx->sdma_cs;
-
-      linear_address += linear->surface.u.gfx9.offset[linear_level];
-
-      /* Check if everything fits into the bitfields */
-      if (!(tiled_x <= (1 << 14) && tiled_y <= (1 << 14) && tiled_z <= (1 << 11) &&
-            tiled_width <= (1 << 14) && tiled_height <= (1 << 14) && tiled_depth <= (1 << 11) &&
-            linear_x <= (1 << 14) && linear_y <= (1 << 14) && linear_z <= (1 << 11) &&
-            linear_pitch <= (1 << 14) && linear_slice_pitch <= (1 << 28) &&
-            copy_width <= (1 << 14) && copy_height <= (1 << 14) && copy_depth <= (1 << 11)))
-         return false;
-
-      /* Check alignments */
-      if ((tiled_address % 256 != 0) || (linear_address % 4 != 0) || (linear_pitch % xalign != 0) ||
-          (linear_slice_pitch % xalign != 0))
-         return false;
-
-      si_need_dma_space(sctx, 14, &sdst->buffer, &ssrc->buffer);
-
-      radeon_emit(
-         cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW,
-                             sctx->ws->cs_is_secure(cs) ? (1u << 2) : 0) |
-                tiled->buffer.b.b.last_level << 20 | tiled_level << 24 |
-                (linear == sdst ? 1u : 0) << 31);
-      radeon_emit(cs, (uint32_t)tiled_address);
-      radeon_emit(cs, (uint32_t)(tiled_address >> 32));
-      radeon_emit(cs, tiled_x | (tiled_y << 16));
-      radeon_emit(cs, tiled_z | ((tiled_width - 1) << 16));
-      radeon_emit(cs, (tiled_height - 1) | (tiled_depth - 1) << 16);
-      radeon_emit(cs, util_logbase2(bpp) | tiled->surface.u.gfx9.surf.swizzle_mode << 3 |
-                         tiled->surface.u.gfx9.resource_type << 9 |
-                         tiled->surface.u.gfx9.surf.epitch << 16);
-      radeon_emit(cs, (uint32_t)linear_address);
-      radeon_emit(cs, (uint32_t)(linear_address >> 32));
-      radeon_emit(cs, linear_x | (linear_y << 16));
-      radeon_emit(cs, linear_z | ((linear_pitch - 1) << 16));
-      radeon_emit(cs, linear_slice_pitch - 1);
-      radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16));
-      radeon_emit(cs, (copy_depth - 1));
-      return true;
-   }
-
-   return false;
-}
-
-static bool cik_sdma_copy_texture(struct si_context *sctx, struct pipe_resource *dst,
-                                  unsigned dst_level, unsigned dstx, unsigned dsty, unsigned dstz,
-                                  struct pipe_resource *src, unsigned src_level,
-                                  const struct pipe_box *src_box)
-{
-   struct radeon_info *info = &sctx->screen->info;
-   struct si_texture *ssrc = (struct si_texture *)src;
-   struct si_texture *sdst = (struct si_texture *)dst;
-   unsigned bpp = sdst->surface.bpe;
-   uint64_t dst_address = sdst->buffer.gpu_address + sdst->surface.u.legacy.level[dst_level].offset;
-   uint64_t src_address = ssrc->buffer.gpu_address + ssrc->surface.u.legacy.level[src_level].offset;
-   unsigned dst_mode = sdst->surface.u.legacy.level[dst_level].mode;
-   unsigned src_mode = ssrc->surface.u.legacy.level[src_level].mode;
-   unsigned dst_tile_index = sdst->surface.u.legacy.tiling_index[dst_level];
-   unsigned src_tile_index = ssrc->surface.u.legacy.tiling_index[src_level];
-   unsigned dst_tile_mode = info->si_tile_mode_array[dst_tile_index];
-   unsigned src_tile_mode = info->si_tile_mode_array[src_tile_index];
-   unsigned dst_micro_mode = G_009910_MICRO_TILE_MODE_NEW(dst_tile_mode);
-   unsigned src_micro_mode = G_009910_MICRO_TILE_MODE_NEW(src_tile_mode);
-   unsigned dst_tile_swizzle = dst_mode == RADEON_SURF_MODE_2D ? sdst->surface.tile_swizzle : 0;
-   unsigned src_tile_swizzle = src_mode == RADEON_SURF_MODE_2D ? ssrc->surface.tile_swizzle : 0;
-   unsigned dst_pitch = sdst->surface.u.legacy.level[dst_level].nblk_x;
-   unsigned src_pitch = ssrc->surface.u.legacy.level[src_level].nblk_x;
-   uint64_t dst_slice_pitch =
-      ((uint64_t)sdst->surface.u.legacy.level[dst_level].slice_size_dw * 4) / bpp;
-   uint64_t src_slice_pitch =
-      ((uint64_t)ssrc->surface.u.legacy.level[src_level].slice_size_dw * 4) / bpp;
-   unsigned dst_width = minify_as_blocks(sdst->buffer.b.b.width0, dst_level, sdst->surface.blk_w);
-   unsigned src_width = minify_as_blocks(ssrc->buffer.b.b.width0, src_level, ssrc->surface.blk_w);
-   unsigned dst_height = minify_as_blocks(sdst->buffer.b.b.height0, dst_level, sdst->surface.blk_h);
-   unsigned src_height = minify_as_blocks(ssrc->buffer.b.b.height0, src_level, ssrc->surface.blk_h);
-   unsigned srcx = src_box->x / ssrc->surface.blk_w;
-   unsigned srcy = src_box->y / ssrc->surface.blk_h;
-   unsigned srcz = src_box->z;
-   unsigned copy_width = DIV_ROUND_UP(src_box->width, ssrc->surface.blk_w);
-   unsigned copy_height = DIV_ROUND_UP(src_box->height, ssrc->surface.blk_h);
-   unsigned copy_depth = src_box->depth;
-
-   assert(src_level <= src->last_level);
-   assert(dst_level <= dst->last_level);
-   assert(sdst->surface.u.legacy.level[dst_level].offset +
-             dst_slice_pitch * bpp * (dstz + src_box->depth) <=
-          sdst->buffer.buf->size);
-   assert(ssrc->surface.u.legacy.level[src_level].offset +
-             src_slice_pitch * bpp * (srcz + src_box->depth) <=
-          ssrc->buffer.buf->size);
-
-   if (!si_prepare_for_dma_blit(sctx, sdst, dst_level, dstx, dsty, dstz, ssrc, src_level, src_box))
-      return false;
-
-   dstx /= sdst->surface.blk_w;
-   dsty /= sdst->surface.blk_h;
-
-   if (srcx >= (1 << 14) || srcy >= (1 << 14) || srcz >= (1 << 11) || dstx >= (1 << 14) ||
-       dsty >= (1 << 14) || dstz >= (1 << 11))
-      return false;
-
-   dst_address |= dst_tile_swizzle << 8;
-   src_address |= src_tile_swizzle << 8;
-
-   /* Linear -> linear sub-window copy. */
-   if (dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED && src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED &&
-       /* check if everything fits into the bitfields */
-       src_pitch <= (1 << 14) && dst_pitch <= (1 << 14) && src_slice_pitch <= (1 << 28) &&
-       dst_slice_pitch <= (1 << 28) && copy_width <= (1 << 14) && copy_height <= (1 << 14) &&
-       copy_depth <= (1 << 11) &&
-       /* HW limitation - GFX7: */
-       (sctx->chip_class != GFX7 ||
-        (copy_width < (1 << 14) && copy_height < (1 << 14) && copy_depth < (1 << 11))) &&
-       /* HW limitation - some GFX7 parts: */
-       ((sctx->family != CHIP_BONAIRE && sctx->family != CHIP_KAVERI) ||
-        (srcx + copy_width != (1 << 14) && srcy + copy_height != (1 << 14)))) {
-      struct radeon_cmdbuf *cs = &sctx->sdma_cs;
-
-      si_need_dma_space(sctx, 13, &sdst->buffer, &ssrc->buffer);
-
-      radeon_emit(
-         cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW, 0) |
-                (util_logbase2(bpp) << 29));
-      radeon_emit(cs, src_address);
-      radeon_emit(cs, src_address >> 32);
-      radeon_emit(cs, srcx | (srcy << 16));
-      radeon_emit(cs, srcz | ((src_pitch - 1) << 16));
-      radeon_emit(cs, src_slice_pitch - 1);
-      radeon_emit(cs, dst_address);
-      radeon_emit(cs, dst_address >> 32);
-      radeon_emit(cs, dstx | (dsty << 16));
-      radeon_emit(cs, dstz | ((dst_pitch - 1) << 16));
-      radeon_emit(cs, dst_slice_pitch - 1);
-      if (sctx->chip_class == GFX7) {
-         radeon_emit(cs, copy_width | (copy_height << 16));
-         radeon_emit(cs, copy_depth);
-      } else {
-         radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16));
-         radeon_emit(cs, (copy_depth - 1));
-      }
-      return true;
-   }
-
-   /* Tiled <-> linear sub-window copy. */
-   if ((src_mode >= RADEON_SURF_MODE_1D) != (dst_mode >= RADEON_SURF_MODE_1D)) {
-      struct si_texture *tiled = src_mode >= RADEON_SURF_MODE_1D ? ssrc : sdst;
-      struct si_texture *linear = tiled == ssrc ? sdst : ssrc;
-      unsigned tiled_level = tiled == ssrc ? src_level : dst_level;
-      unsigned linear_level = linear == ssrc ? src_level : dst_level;
-      unsigned tiled_x = tiled == ssrc ? srcx : dstx;
-      unsigned linear_x = linear == ssrc ? srcx : dstx;
-      unsigned tiled_y = tiled == ssrc ? srcy : dsty;
-      unsigned linear_y = linear == ssrc ? srcy : dsty;
-      unsigned tiled_z = tiled == ssrc ? srcz : dstz;
-      unsigned linear_z = linear == ssrc ? srcz : dstz;
-      unsigned tiled_width = tiled == ssrc ? src_width : dst_width;
-      unsigned linear_width = linear == ssrc ? src_width : dst_width;
-      unsigned tiled_pitch = tiled == ssrc ? src_pitch : dst_pitch;
-      unsigned linear_pitch = linear == ssrc ? src_pitch : dst_pitch;
-      unsigned tiled_slice_pitch = tiled == ssrc ? src_slice_pitch : dst_slice_pitch;
-      unsigned linear_slice_pitch = linear == ssrc ? src_slice_pitch : dst_slice_pitch;
-      uint64_t tiled_address = tiled == ssrc ? src_address : dst_address;
-      uint64_t linear_address = linear == ssrc ? src_address : dst_address;
-      unsigned tiled_micro_mode = tiled == ssrc ? src_micro_mode : dst_micro_mode;
-
-      assert(tiled_pitch % 8 == 0);
-      assert(tiled_slice_pitch % 64 == 0);
-      unsigned pitch_tile_max = tiled_pitch / 8 - 1;
-      unsigned slice_tile_max = tiled_slice_pitch / 64 - 1;
-      unsigned xalign = MAX2(1, 4 / bpp);
-      unsigned copy_width_aligned = copy_width;
-
-      /* If the region ends at the last pixel and is unaligned, we
-       * can copy the remainder of the line that is not visible to
-       * make it aligned.
-       */
-      if (copy_width % xalign != 0 && linear_x + copy_width == linear_width &&
-          tiled_x + copy_width == tiled_width &&
-          linear_x + align(copy_width, xalign) <= linear_pitch &&
-          tiled_x + align(copy_width, xalign) <= tiled_pitch)
-         copy_width_aligned = align(copy_width, xalign);
-
-      /* HW limitations. */
-      if ((sctx->family == CHIP_BONAIRE || sctx->family == CHIP_KAVERI) &&
-          linear_pitch - 1 == 0x3fff && bpp == 16)
-         return false;
-
-      if (sctx->chip_class == GFX7 &&
-          (copy_width_aligned == (1 << 14) || copy_height == (1 << 14) || copy_depth == (1 << 11)))
-         return false;
-
-      if ((sctx->family == CHIP_BONAIRE || sctx->family == CHIP_KAVERI ||
-           sctx->family == CHIP_KABINI) &&
-          (tiled_x + copy_width == (1 << 14) || tiled_y + copy_height == (1 << 14)))
-         return false;
-
-      /* The hw can read outside of the given linear buffer bounds,
-       * or access those pages but not touch the memory in case
-       * of writes. (it still causes a VM fault)
-       *
-       * Out-of-bounds memory access or page directory access must
-       * be prevented.
-       */
-      int64_t start_linear_address, end_linear_address;
-      unsigned granularity;
-
-      /* Deduce the size of reads from the linear surface. */
-      switch (tiled_micro_mode) {
-      case V_009910_ADDR_SURF_DISPLAY_MICRO_TILING:
-         granularity = bpp == 1 ? 64 / (8 * bpp) : 128 / (8 * bpp);
-         break;
-      case V_009910_ADDR_SURF_THIN_MICRO_TILING:
-      case V_009910_ADDR_SURF_DEPTH_MICRO_TILING:
-         if (0 /* TODO: THICK microtiling */)
-            granularity =
-               bpp == 1 ? 32 / (8 * bpp)
-                        : bpp == 2 ? 64 / (8 * bpp) : bpp <= 8 ? 128 / (8 * bpp) : 256 / (8 * bpp);
-         else
-            granularity = bpp <= 2 ? 64 / (8 * bpp) : bpp <= 8 ? 128 / (8 * bpp) : 256 / (8 * bpp);
-         break;
-      default:
-         return false;
-      }
-
-      /* The linear reads start at tiled_x & ~(granularity - 1).
-       * If linear_x == 0 && tiled_x % granularity != 0, the hw
-       * starts reading from an address preceding linear_address!!!
-       */
-      start_linear_address =
-         linear->surface.u.legacy.level[linear_level].offset +
-         bpp * (linear_z * linear_slice_pitch + linear_y * linear_pitch + linear_x);
-      start_linear_address -= (int)(bpp * (tiled_x % granularity));
-
-      end_linear_address =
-         linear->surface.u.legacy.level[linear_level].offset +
-         bpp * ((linear_z + copy_depth - 1) * linear_slice_pitch +
-                (linear_y + copy_height - 1) * linear_pitch + (linear_x + copy_width));
-
-      if ((tiled_x + copy_width) % granularity)
-         end_linear_address += granularity - (tiled_x + copy_width) % granularity;
-
-      if (start_linear_address < 0 || end_linear_address > linear->surface.surf_size)
-         return false;
-
-      /* Check requirements. */
-      if (tiled_address % 256 == 0 && linear_address % 4 == 0 && linear_pitch % xalign == 0 &&
-          linear_x % xalign == 0 && tiled_x % xalign == 0 && copy_width_aligned % xalign == 0 &&
-          tiled_micro_mode != V_009910_ADDR_SURF_ROTATED_MICRO_TILING &&
-          /* check if everything fits into the bitfields */
-          tiled->surface.u.legacy.tile_split <= 4096 && pitch_tile_max < (1 << 11) &&
-          slice_tile_max < (1 << 22) && linear_pitch <= (1 << 14) &&
-          linear_slice_pitch <= (1 << 28) && copy_width_aligned <= (1 << 14) &&
-          copy_height <= (1 << 14) && copy_depth <= (1 << 11)) {
-         struct radeon_cmdbuf *cs = &sctx->sdma_cs;
-         uint32_t direction = linear == sdst ? 1u << 31 : 0;
-
-         si_need_dma_space(sctx, 14, &sdst->buffer, &ssrc->buffer);
-
-         radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
-                                         CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW, 0) |
-                            direction);
-         radeon_emit(cs, tiled_address);
-         radeon_emit(cs, tiled_address >> 32);
-         radeon_emit(cs, tiled_x | (tiled_y << 16));
-         radeon_emit(cs, tiled_z | (pitch_tile_max << 16));
-         radeon_emit(cs, slice_tile_max);
-         radeon_emit(cs, encode_tile_info(sctx, tiled, tiled_level, true));
-         radeon_emit(cs, linear_address);
-         radeon_emit(cs, linear_address >> 32);
-         radeon_emit(cs, linear_x | (linear_y << 16));
-         radeon_emit(cs, linear_z | ((linear_pitch - 1) << 16));
-         radeon_emit(cs, linear_slice_pitch - 1);
-         if (sctx->chip_class == GFX7) {
-            radeon_emit(cs, copy_width_aligned | (copy_height << 16));
-            radeon_emit(cs, copy_depth);
-         } else {
-            radeon_emit(cs, (copy_width_aligned - 1) | ((copy_height - 1) << 16));
-            radeon_emit(cs, (copy_depth - 1));
-         }
-         return true;
-      }
-   }
-
-   /* Tiled -> Tiled sub-window copy. */
-   if (dst_mode >= RADEON_SURF_MODE_1D && src_mode >= RADEON_SURF_MODE_1D &&
-       /* check if these fit into the bitfields */
-       src_address % 256 == 0 && dst_address % 256 == 0 &&
-       ssrc->surface.u.legacy.tile_split <= 4096 && sdst->surface.u.legacy.tile_split <= 4096 &&
-       dstx % 8 == 0 && dsty % 8 == 0 && srcx % 8 == 0 && srcy % 8 == 0 &&
-       /* this can either be equal, or display->rotated (GFX8+ only) */
-       (src_micro_mode == dst_micro_mode ||
-        (sctx->chip_class >= GFX8 && src_micro_mode == V_009910_ADDR_SURF_DISPLAY_MICRO_TILING &&
-         dst_micro_mode == V_009910_ADDR_SURF_ROTATED_MICRO_TILING))) {
-      assert(src_pitch % 8 == 0);
-      assert(dst_pitch % 8 == 0);
-      assert(src_slice_pitch % 64 == 0);
-      assert(dst_slice_pitch % 64 == 0);
-      unsigned src_pitch_tile_max = src_pitch / 8 - 1;
-      unsigned dst_pitch_tile_max = dst_pitch / 8 - 1;
-      unsigned src_slice_tile_max = src_slice_pitch / 64 - 1;
-      unsigned dst_slice_tile_max = dst_slice_pitch / 64 - 1;
-      unsigned copy_width_aligned = copy_width;
-      unsigned copy_height_aligned = copy_height;
-
-      /* If the region ends at the last pixel and is unaligned, we
-       * can copy the remainder of the tile that is not visible to
-       * make it aligned.
-       */
-      if (copy_width % 8 != 0 && srcx + copy_width == src_width && dstx + copy_width == dst_width)
-         copy_width_aligned = align(copy_width, 8);
-
-      if (copy_height % 8 != 0 && srcy + copy_height == src_height &&
-          dsty + copy_height == dst_height)
-         copy_height_aligned = align(copy_height, 8);
-
-      /* check if these fit into the bitfields */
-      if (src_pitch_tile_max < (1 << 11) && dst_pitch_tile_max < (1 << 11) &&
-          src_slice_tile_max < (1 << 22) && dst_slice_tile_max < (1 << 22) &&
-          copy_width_aligned <= (1 << 14) && copy_height_aligned <= (1 << 14) &&
-          copy_depth <= (1 << 11) && copy_width_aligned % 8 == 0 && copy_height_aligned % 8 == 0 &&
-          /* HW limitation - GFX7: */
-          (sctx->chip_class != GFX7 ||
-           (copy_width_aligned < (1 << 14) && copy_height_aligned < (1 << 14) &&
-            copy_depth < (1 << 11))) &&
-          /* HW limitation - some GFX7 parts: */
-          ((sctx->family != CHIP_BONAIRE && sctx->family != CHIP_KAVERI &&
-            sctx->family != CHIP_KABINI) ||
-           (srcx + copy_width_aligned != (1 << 14) && srcy + copy_height_aligned != (1 << 14) &&
-            dstx + copy_width != (1 << 14)))) {
-         struct radeon_cmdbuf *cs = &sctx->sdma_cs;
-
-         si_need_dma_space(sctx, 15, &sdst->buffer, &ssrc->buffer);
-
-         radeon_emit(
-            cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_T2T_SUB_WINDOW, 0));
-         radeon_emit(cs, src_address);
-         radeon_emit(cs, src_address >> 32);
-         radeon_emit(cs, srcx | (srcy << 16));
-         radeon_emit(cs, srcz | (src_pitch_tile_max << 16));
-         radeon_emit(cs, src_slice_tile_max);
-         radeon_emit(cs, encode_tile_info(sctx, ssrc, src_level, true));
-         radeon_emit(cs, dst_address);
-         radeon_emit(cs, dst_address >> 32);
-         radeon_emit(cs, dstx | (dsty << 16));
-         radeon_emit(cs, dstz | (dst_pitch_tile_max << 16));
-         radeon_emit(cs, dst_slice_tile_max);
-         radeon_emit(cs, encode_tile_info(sctx, sdst, dst_level, false));
-         if (sctx->chip_class == GFX7) {
-            radeon_emit(cs, copy_width_aligned | (copy_height_aligned << 16));
-            radeon_emit(cs, copy_depth);
-         } else {
-            radeon_emit(cs, (copy_width_aligned - 8) | ((copy_height_aligned - 8) << 16));
-            radeon_emit(cs, (copy_depth - 1));
-         }
-         return true;
-      }
-   }
-
-   return false;
-}
-
-static void cik_sdma_copy(struct pipe_context *ctx, struct pipe_resource *dst, unsigned dst_level,
-                          unsigned dstx, unsigned dsty, unsigned dstz, struct pipe_resource *src,
-                          unsigned src_level, const struct pipe_box *src_box)
-{
-   struct si_context *sctx = (struct si_context *)ctx;
-
-   assert(src->target != PIPE_BUFFER);
-
-   if (!sctx->sdma_cs.priv || src->flags & PIPE_RESOURCE_FLAG_SPARSE ||
-       dst->flags & PIPE_RESOURCE_FLAG_SPARSE)
-      goto fallback;
-
-   /* SDMA causes corruption. See:
-    *    https://bugs.freedesktop.org/show_bug.cgi?id=110575
-    *    https://bugs.freedesktop.org/show_bug.cgi?id=110635
-    *
-    * Keep SDMA enabled on APUs.
-    */
-   if (sctx->screen->debug_flags & DBG(FORCE_SDMA) ||
-       (!sctx->screen->info.has_dedicated_vram &&
-        !(sctx->screen->debug_flags & DBG(NO_SDMA_COPY_IMAGE)))) {
-      if ((sctx->chip_class == GFX7 || sctx->chip_class == GFX8) &&
-          cik_sdma_copy_texture(sctx, dst, dst_level, dstx, dsty, dstz, src, src_level, src_box))
-         return;
-      else if (sctx->chip_class == GFX9 && si_sdma_v4_copy_texture(sctx, dst, dst_level, dstx, dsty,
-                                                                   dstz, src, src_level, src_box))
-         return;
-   }
-
-fallback:
-   si_resource_copy_region(ctx, dst, dst_level, dstx, dsty, dstz, src, src_level, src_box);
-}
-
-void cik_init_sdma_functions(struct si_context *sctx)
-{
-   sctx->dma_copy = cik_sdma_copy;
-}
diff --git a/src/gallium/drivers/radeonsi/meson.build b/src/gallium/drivers/radeonsi/meson.build
index 22d5b1e58f3..e373cbc63a6 100644
--- a/src/gallium/drivers/radeonsi/meson.build
+++ b/src/gallium/drivers/radeonsi/meson.build
@@ -19,7 +19,6 @@
 # SOFTWARE.
 
 files_libradeonsi = files(
-  'cik_sdma.c',
   'driinfo_radeonsi.h',
   'gfx10_query.c',
  'gfx10_shader_ngg.c',
@@ -35,7 +34,6 @@ files_libradeonsi = files(
  'si_cp_reg_shadowing.c',
  'si_debug.c',
  'si_descriptors.c',
-  'si_dma_cs.c',
  'si_fence.c',
  'si_get.c',
  'si_gfx_cs.c',
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index 96c4d8b9733..4af96afd4f7 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -972,7 +972,6 @@ void si_resource_copy_region(struct pipe_context *ctx, struct pipe_resource *dst
 
    /* SNORM8 blitting has precision issues on some chips. Use the SINT
     * equivalent instead, which doesn't force DCC decompression.
-    * Note that some chips avoid this issue by using SDMA.
     */
    if (util_format_is_snorm8(dst_templ.format)) {
       dst_templ.format = src_templ.format = util_format_snorm8_to_sint8(dst_templ.format);
@@ -1137,21 +1136,18 @@ resolve_to_temp:
 static void si_blit(struct pipe_context *ctx, const struct pipe_blit_info *info)
 {
    struct si_context *sctx = (struct si_context *)ctx;
-   struct si_texture *dst = (struct si_texture *)info->dst.resource;
 
    if (do_hardware_msaa_resolve(ctx, info)) {
       return;
    }
 
-   /* Using SDMA for copying to a linear texture in GTT is much faster.
-    * This improves DRI PRIME performance.
-    *
-    * resource_copy_region can't do this yet, because dma_copy calls it
-    * on failure (recursion).
+   /* Using compute for copying to a linear texture in GTT is much faster than
+    * going through RBs (render backends). This improves DRI PRIME performance.
     */
-   if (dst->surface.is_linear && util_can_blit_via_copy_region(info, false)) {
-      sctx->dma_copy(ctx, info->dst.resource, info->dst.level, info->dst.box.x, info->dst.box.y,
-                     info->dst.box.z, info->src.resource, info->src.level, &info->src.box);
+   if (util_can_blit_via_copy_region(info, false)) {
+      si_resource_copy_region(ctx, info->dst.resource, info->dst.level,
+                              info->dst.box.x, info->dst.box.y, info->dst.box.z,
+                              info->src.resource, info->src.level, &info->src.box);
       return;
    }
 
@@ -1166,9 +1162,6 @@ static void si_blit(struct pipe_context *ctx, const struct pipe_blit_info *info)
    si_decompress_subresource(ctx, info->src.resource, PIPE_MASK_RGBAZS, info->src.level,
                              info->src.box.z, info->src.box.z + info->src.box.depth - 1);
 
-   if (sctx->screen->debug_flags & DBG(FORCE_SDMA) && util_try_blit_via_copy_region(ctx, info))
-      return;
-
    si_blitter_begin(sctx, SI_BLIT | (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND));
    util_blitter_blit(sctx->blitter, info);
    si_blitter_end(sctx);
diff --git a/src/gallium/drivers/radeonsi/si_buffer.c b/src/gallium/drivers/radeonsi/si_buffer.c
index 7c800e19098..cfdabf241ee 100644
--- a/src/gallium/drivers/radeonsi/si_buffer.c
+++ b/src/gallium/drivers/radeonsi/si_buffer.c
@@ -33,68 +33,13 @@
 bool si_cs_is_buffer_referenced(struct si_context *sctx, struct pb_buffer *buf,
                                 enum radeon_bo_usage usage)
 {
-   if (sctx->ws->cs_is_buffer_referenced(&sctx->gfx_cs, buf, usage)) {
-      return true;
-   }
-   if (radeon_emitted(&sctx->sdma_cs, 0) &&
-       sctx->ws->cs_is_buffer_referenced(&sctx->sdma_cs, buf, usage)) {
-      return true;
-   }
-   return false;
+   return sctx->ws->cs_is_buffer_referenced(&sctx->gfx_cs, buf, usage);
 }
 
 void *si_buffer_map(struct si_context *sctx, struct si_resource *resource, unsigned usage)
 {
-   enum radeon_bo_usage rusage = RADEON_USAGE_READWRITE;
-   bool busy = false;
-
-   assert(!(resource->flags & RADEON_FLAG_SPARSE));
-
-   if (usage & PIPE_MAP_UNSYNCHRONIZED) {
-      return sctx->ws->buffer_map(resource->buf, NULL, usage);
-   }
-
-   if (!(usage & PIPE_MAP_WRITE)) {
-      /* have to wait for the last write */
-      rusage = RADEON_USAGE_WRITE;
-   }
-
-   if (radeon_emitted(&sctx->gfx_cs, sctx->initial_gfx_cs_size) &&
-       sctx->ws->cs_is_buffer_referenced(&sctx->gfx_cs, resource->buf, rusage)) {
-      if (usage & PIPE_MAP_DONTBLOCK) {
-         si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
-         return NULL;
-      } else {
-         si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
-         busy = true;
-      }
-   }
-   if (radeon_emitted(&sctx->sdma_cs, 0) &&
-       sctx->ws->cs_is_buffer_referenced(&sctx->sdma_cs, resource->buf, rusage)) {
-      if (usage & PIPE_MAP_DONTBLOCK) {
-         si_flush_dma_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
-         return NULL;
-      } else {
-         si_flush_dma_cs(sctx, 0, NULL);
-         busy = true;
-      }
-   }
-
-   if (busy || !sctx->ws->buffer_wait(resource->buf, 0, rusage)) {
-      if (usage & PIPE_MAP_DONTBLOCK) {
-         return NULL;
-      } else {
-         /* We will be wait for the GPU. Wait for any offloaded
-          * CS flush to complete to avoid busy-waiting in the winsys. */
-         sctx->ws->cs_sync_flush(&sctx->gfx_cs);
-         if (sctx->sdma_cs.priv)
-            sctx->ws->cs_sync_flush(&sctx->sdma_cs);
-      }
-   }
-
-   /* Setting the CS to NULL will prevent doing checks we have done already. */
-   return sctx->ws->buffer_map(resource->buf, NULL, usage);
+   return sctx->ws->buffer_map(resource->buf, &sctx->gfx_cs, usage);
 }
 
 void si_init_resource_fields(struct si_screen *sscreen, struct si_resource *res, uint64_t size,
@@ -188,7 +133,7 @@ void si_init_resource_fields(struct si_screen *sscreen, struct si_resource *res,
       res->flags |= RADEON_FLAG_DRIVER_INTERNAL;
 
    /* For higher throughput and lower latency over PCIe assuming sequential access.
-    * Only CP DMA, SDMA, and optimized compute benefit from this.
+    * Only CP DMA and optimized compute benefit from this.
     * GFX8 and older don't support RADEON_FLAG_UNCACHED.
    */
   if (sscreen->info.chip_class >= GFX9 &&
@@ -442,13 +387,6 @@ static void *si_buffer_transfer_map(struct pipe_context *ctx, struct pipe_resour
       }
    }
 
-   if (usage & PIPE_MAP_FLUSH_EXPLICIT &&
-       buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) {
-      usage &= ~(PIPE_MAP_UNSYNCHRONIZED | PIPE_MAP_PERSISTENT);
-      usage |= PIPE_MAP_DISCARD_RANGE;
-      force_discard_range = true;
-   }
-
    if (usage & PIPE_MAP_DISCARD_RANGE &&
       ((!(usage & (PIPE_MAP_UNSYNCHRONIZED | PIPE_MAP_PERSISTENT))) ||
        (buf->flags & RADEON_FLAG_SPARSE))) {
@@ -502,8 +440,8 @@ static void *si_buffer_transfer_map(struct pipe_context *ctx, struct pipe_resour
                   box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT), 256);
       if (staging) {
          /* Copy the VRAM buffer to the staging buffer. */
-         si_sdma_copy_buffer(sctx, &staging->b.b, resource, box->x % SI_MAP_BUFFER_ALIGNMENT,
-                             box->x, box->width);
+         si_copy_buffer(sctx, &staging->b.b, resource, box->x % SI_MAP_BUFFER_ALIGNMENT,
+                        box->x, box->width);
 
         data = si_buffer_map(sctx, staging, usage & ~PIPE_MAP_UNSYNCHRONIZED);
         if (!data) {
@@ -538,45 +476,6 @@ static void si_buffer_do_flush_region(struct pipe_context *ctx, struct pipe_tran
    unsigned src_offset =
       stransfer->offset + transfer->box.x % SI_MAP_BUFFER_ALIGNMENT + (box->x - transfer->box.x);
 
-   if (buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) {
-      /* This should be true for all uploaders. */
-      assert(transfer->box.x == 0);
-
-      /* Find a previous upload and extend its range. The last
-       * upload is likely to be at the end of the list.
-       */
-      for (int i = sctx->num_sdma_uploads - 1; i >= 0; i--) {
-         struct si_sdma_upload *up = &sctx->sdma_uploads[i];
-
-         if (up->dst != buf)
-            continue;
-
-         assert(up->src == stransfer->staging);
-         assert(box->x > up->dst_offset);
-         up->size = box->x + box->width - up->dst_offset;
-         return;
-      }
-
-      /* Enlarge the array if it's full. */
-      if (sctx->num_sdma_uploads == sctx->max_sdma_uploads) {
-         unsigned size;
-
-         sctx->max_sdma_uploads += 4;
-         size = sctx->max_sdma_uploads * sizeof(sctx->sdma_uploads[0]);
-         sctx->sdma_uploads = realloc(sctx->sdma_uploads, size);
-      }
-
-      /* Add a new upload. */
-      struct si_sdma_upload *up = &sctx->sdma_uploads[sctx->num_sdma_uploads++];
-      up->dst = up->src = NULL;
-      si_resource_reference(&up->dst, buf);
-      si_resource_reference(&up->src, stransfer->staging);
-      up->dst_offset = box->x;
-      up->src_offset = src_offset;
-      up->size = box->width;
-      return;
-   }
-
    /* Copy the staging buffer into the original one. */
    si_copy_buffer(sctx, transfer->resource, &stransfer->staging->b.b, box->x, src_offset,
                   box->width);
@@ -808,13 +707,6 @@ static bool si_resource_commit(struct pipe_context *pctx, struct pipe_resource *
       ctx->ws->cs_is_buffer_referenced(&ctx->gfx_cs, res->buf, RADEON_USAGE_READWRITE)) {
      si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
    }
-   if (radeon_emitted(&ctx->sdma_cs, 0) &&
-       ctx->ws->cs_is_buffer_referenced(&ctx->sdma_cs, res->buf, RADEON_USAGE_READWRITE)) {
-      si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
-   }
-
-   if (ctx->sdma_cs.priv)
-      ctx->ws->cs_sync_flush(&ctx->sdma_cs);
 
    ctx->ws->cs_sync_flush(&ctx->gfx_cs);
 
    assert(resource->target == PIPE_BUFFER);
diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
index b350c9fe3f7..9e29283a642 100644
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -529,7 +529,6 @@ void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u
 
    /* SNORM8 blitting has precision issues on some chips. Use the SINT
     * equivalent instead, which doesn't force DCC decompression.
-    * Note that some chips avoid this issue by using SDMA.
     */
    if (util_format_is_snorm8(dst->format)) {
       image[0].format = image[1].format = util_format_snorm8_to_sint8(dst->format);
diff --git a/src/gallium/drivers/radeonsi/si_debug.c b/src/gallium/drivers/radeonsi/si_debug.c
index c242c8e5d25..2f3800f260a 100644
--- a/src/gallium/drivers/radeonsi/si_debug.c
+++ b/src/gallium/drivers/radeonsi/si_debug.c
@@ -1110,25 +1110,6 @@ void si_log_compute_state(struct si_context *sctx, struct u_log_context *log)
    si_dump_compute_descriptors(sctx, log);
 }
 
-static void si_dump_dma(struct si_context *sctx, struct radeon_saved_cs *saved, FILE *f)
-{
-   static const char ib_name[] = "sDMA IB";
-   unsigned i;
-
-   si_dump_bo_list(sctx, saved, f);
-
-   fprintf(f, "------------------ %s begin ------------------\n", ib_name);
-
-   for (i = 0; i < saved->num_dw; ++i) {
-      fprintf(f, " %08x\n", saved->ib[i]);
-   }
-
-   fprintf(f, "------------------- %s end -------------------\n", ib_name);
-   fprintf(f, "\n");
-
-   fprintf(f, "SDMA Dump Done.\n");
-}
-
 void si_check_vm_faults(struct si_context *sctx, struct radeon_saved_cs *saved, enum ring_type ring)
 {
    struct pipe_screen *screen = sctx->b.screen;
@@ -1167,9 +1148,6 @@ void si_check_vm_faults(struct si_context *sctx, struct radeon_saved_cs *saved,
      u_log_context_destroy(&log);
      break;
    }
-   case RING_DMA:
-      si_dump_dma(sctx, saved, f);
-      break;
    default:
      break;
diff --git a/src/gallium/drivers/radeonsi/si_dma_cs.c b/src/gallium/drivers/radeonsi/si_dma_cs.c
deleted file mode 100644
index 77dc9d81dc0..00000000000
--- a/src/gallium/drivers/radeonsi/si_dma_cs.c
+++ /dev/null
@@ -1,319 +0,0 @@
-/*
- * Copyright 2018 Advanced Micro Devices, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#include "si_pipe.h"
-#include "sid.h"
-
-static void si_dma_emit_wait_idle(struct si_context *sctx)
-{
-   struct radeon_cmdbuf *cs = &sctx->sdma_cs;
-
-   /* NOP waits for idle. */
-   if (sctx->chip_class >= GFX7)
-      radeon_emit(cs, 0x00000000); /* NOP */
-   else
-      radeon_emit(cs, 0xf0000000); /* NOP */
-}
-
-void si_dma_emit_timestamp(struct si_context *sctx, struct si_resource *dst, uint64_t offset)
-{
-   struct radeon_cmdbuf *cs = &sctx->sdma_cs;
-   uint64_t va = dst->gpu_address + offset;
-
-   if (sctx->chip_class == GFX6) {
-      unreachable("SI DMA doesn't support the timestamp packet.");
-      return;
-   }
-
-   /* Mark the buffer range of destination as valid (initialized),
-    * so that transfer_map knows it should wait for the GPU when mapping
-    * that range. */
-   util_range_add(&dst->b.b, &dst->valid_buffer_range, offset, offset + 8);
-
-   assert(va % 8 == 0);
-
-   si_need_dma_space(sctx, 4, dst, NULL);
-   si_dma_emit_wait_idle(sctx);
-
-   radeon_emit(
-      cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_TIMESTAMP, SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP, 0));
-   radeon_emit(cs, va);
-   radeon_emit(cs, va >> 32);
-}
-
-void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset,
-                          uint64_t size, unsigned clear_value)
-{
-   struct radeon_cmdbuf *cs = &sctx->sdma_cs;
-   unsigned i, ncopy, csize;
-   struct si_resource *sdst = si_resource(dst);
-
-   assert(offset % 4 == 0);
-   assert(size);
-   assert(size % 4 == 0);
-
-   if (!cs->priv || dst->flags & PIPE_RESOURCE_FLAG_SPARSE ||
-       sctx->screen->debug_flags & DBG(NO_SDMA_CLEARS) ||
-       unlikely(radeon_uses_secure_bos(sctx->ws))) {
-      sctx->b.clear_buffer(&sctx->b, dst, offset, size, &clear_value, 4);
-      return;
-   }
-
-   /* Mark the buffer range of destination as valid (initialized),
-    * so that transfer_map knows it should wait for the GPU when mapping
-    * that range. */
-   util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size);
-
-   offset += sdst->gpu_address;
-
-   if (sctx->chip_class == GFX6) {
-      /* the same maximum size as for copying */
-      ncopy = DIV_ROUND_UP(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
-      si_need_dma_space(sctx, ncopy * 4, sdst, NULL);
-
-      for (i = 0; i < ncopy; i++) {
-         csize = MIN2(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
-         radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_CONSTANT_FILL, 0, csize / 4));
-         radeon_emit(cs, offset);
-         radeon_emit(cs, clear_value);
-         radeon_emit(cs, (offset >> 32) << 16);
-         offset += csize;
-         size -= csize;
-      }
-      return;
-   }
-
-   /* The following code is for CI and later. */
-   /* the same maximum size as for copying */
-   unsigned max_size_per_packet = sctx->chip_class >= GFX10_3 ?
-                                     GFX103_SDMA_COPY_MAX_SIZE :
-                                     CIK_SDMA_COPY_MAX_SIZE;
-   ncopy = DIV_ROUND_UP(size, max_size_per_packet);
-   si_need_dma_space(sctx, ncopy * 5, sdst, NULL);
-
-   for (i = 0; i < ncopy; i++) {
-      csize = MIN2(size, max_size_per_packet);
-      radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_PACKET_CONSTANT_FILL, 0, 0x8000 /* dword copy */));
-      radeon_emit(cs, offset);
-      radeon_emit(cs, offset >> 32);
-      radeon_emit(cs, clear_value);
-      /* dw count */
-      radeon_emit(cs, (sctx->chip_class >= GFX9 ? csize - 1 : csize) & 0xfffffffc);
-      offset += csize;
-      size -= csize;
-   }
-}
-
-void si_sdma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
-                         struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset,
-                         uint64_t size)
-{
-   struct radeon_cmdbuf *cs = &sctx->sdma_cs;
-   unsigned i, ncopy, csize;
-   struct si_resource *sdst = si_resource(dst);
-   struct si_resource *ssrc = si_resource(src);
-
-   if (!cs->priv || dst->flags & PIPE_RESOURCE_FLAG_SPARSE ||
-       src->flags & PIPE_RESOURCE_FLAG_SPARSE ||
-       (ssrc->flags & RADEON_FLAG_ENCRYPTED) != (sdst->flags & RADEON_FLAG_ENCRYPTED)) {
-      si_copy_buffer(sctx, dst, src, dst_offset, src_offset, size);
-      return;
-   }
-
-   /* Mark the buffer range of destination as valid (initialized),
-    * so that transfer_map knows it should wait for the GPU when mapping
-    * that range. */
-   util_range_add(dst, &sdst->valid_buffer_range, dst_offset, dst_offset + size);
-
-   dst_offset += sdst->gpu_address;
-   src_offset += ssrc->gpu_address;
-
-   if (sctx->chip_class == GFX6) {
-      unsigned max_size, sub_cmd, shift;
-
-      /* see whether we should use the dword-aligned or byte-aligned copy */
-      if (!(dst_offset % 4) && !(src_offset % 4) && !(size % 4)) {
-         sub_cmd = SI_DMA_COPY_DWORD_ALIGNED;
-         shift = 2;
-         max_size = SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE;
-      } else {
-         sub_cmd = SI_DMA_COPY_BYTE_ALIGNED;
-         shift = 0;
-         max_size = SI_DMA_COPY_MAX_BYTE_ALIGNED_SIZE;
-      }
-
-      ncopy = DIV_ROUND_UP(size, max_size);
-      si_need_dma_space(sctx, ncopy * 5, sdst, ssrc);
-
-      for (i = 0; i < ncopy; i++) {
-         csize = MIN2(size, max_size);
-         radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, csize >> shift));
-         radeon_emit(cs, dst_offset);
-         radeon_emit(cs, src_offset);
-         radeon_emit(cs, (dst_offset >> 32UL) & 0xff);
-         radeon_emit(cs, (src_offset >> 32UL) & 0xff);
-         dst_offset += csize;
-         src_offset += csize;
-         size -= csize;
-      }
-      return;
-   }
-
-   /* The following code is for CI and later. */
-   unsigned max_size_per_packet = sctx->chip_class >= GFX10_3 ?
-                                     GFX103_SDMA_COPY_MAX_SIZE :
-                                     CIK_SDMA_COPY_MAX_SIZE;
-   unsigned align = ~0u;
-   ncopy = DIV_ROUND_UP(size, max_size_per_packet);
-
-   /* Align copy size to dw if src/dst address are dw aligned */
-   if ((src_offset & 0x3) == 0 && (dst_offset & 0x3) == 0 && size > 4 && (size & 3) != 0) {
-      align = ~0x3u;
-      ncopy++;
-   }
-
-   si_need_dma_space(sctx, ncopy * 7, sdst, ssrc);
-
-   for (i = 0; i < ncopy; i++) {
-      csize = size >= 4 ? MIN2(size & align, max_size_per_packet) : size;
-      radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_LINEAR,
-                                      (sctx->ws->cs_is_secure(cs) ? 1u : 0) << 2));
-      radeon_emit(cs, sctx->chip_class >= GFX9 ? csize - 1 : csize);
-      radeon_emit(cs, 0); /* src/dst endian swap */
-      radeon_emit(cs, src_offset);
-      radeon_emit(cs, src_offset >> 32);
-      radeon_emit(cs, dst_offset);
-      radeon_emit(cs, dst_offset >> 32);
-      dst_offset += csize;
-      src_offset += csize;
-      size -= csize;
-   }
-}
-
-void si_need_dma_space(struct si_context *ctx, unsigned num_dw, struct si_resource *dst,
-                       struct si_resource *src)
-{
-   struct radeon_winsys *ws = ctx->ws;
-   uint64_t vram = ctx->sdma_cs.used_vram;
-   uint64_t gtt = ctx->sdma_cs.used_gart;
-
-   if (dst) {
-      vram += dst->vram_usage;
-      gtt += dst->gart_usage;
-   }
-   if (src) {
-      vram += src->vram_usage;
-      gtt += src->gart_usage;
-   }
-
-   /* Flush the GFX IB if DMA depends on it. */
-   if (!ctx->sdma_uploads_in_progress && radeon_emitted(&ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
-       ((dst && ws->cs_is_buffer_referenced(&ctx->gfx_cs, dst->buf, RADEON_USAGE_READWRITE)) ||
-        (src && ws->cs_is_buffer_referenced(&ctx->gfx_cs, src->buf, RADEON_USAGE_WRITE))))
-      si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
-
-   bool use_secure_cmd = false;
-   if (unlikely(radeon_uses_secure_bos(ctx->ws))) {
-      if (src && src->flags & RADEON_FLAG_ENCRYPTED) {
-         assert(!dst || (dst->flags & RADEON_FLAG_ENCRYPTED));
-         use_secure_cmd = true;
-      } else if (dst && (dst->flags & RADEON_FLAG_ENCRYPTED)) {
-         use_secure_cmd = true;
-      }
-   }
-
-   /* Flush if there's not enough space, or if the memory usage per IB
-    * is too large.
-    *
-    * IBs using too little memory are limited by the IB submission overhead.
-    * IBs using too much memory are limited by the kernel/TTM overhead.
-    * Too long IBs create CPU-GPU pipeline bubbles and add latency.
-    *
-    * This heuristic makes sure that DMA requests are executed
-    * very soon after the call is made and lowers memory usage.
-    * It improves texture upload performance by keeping the DMA
-    * engine busy while uploads are being submitted.
-    */
-   num_dw++; /* for emit_wait_idle below */
-   if (!ctx->sdma_uploads_in_progress &&
-       (use_secure_cmd != ctx->ws->cs_is_secure(&ctx->sdma_cs) ||
-        !ws->cs_check_space(&ctx->sdma_cs, num_dw, false) ||
-        ctx->sdma_cs.used_vram + ctx->sdma_cs.used_gart > 64 * 1024 * 1024 ||
-        !radeon_cs_memory_below_limit(ctx->screen, &ctx->sdma_cs, vram, gtt))) {
-      si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC | RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION, NULL);
-      assert(ctx->ws->cs_is_secure(&ctx->sdma_cs) == use_secure_cmd);
-      assert((num_dw + ctx->sdma_cs.current.cdw) <= ctx->sdma_cs.current.max_dw);
-   }
-
-   /* Wait for idle if either buffer has been used in the IB before to
-    * prevent read-after-write hazards.
-    */
-   if ((dst && ws->cs_is_buffer_referenced(&ctx->sdma_cs, dst->buf, RADEON_USAGE_READWRITE)) ||
-       (src && ws->cs_is_buffer_referenced(&ctx->sdma_cs, src->buf, RADEON_USAGE_WRITE)))
-      si_dma_emit_wait_idle(ctx);
-
-   unsigned sync = ctx->sdma_uploads_in_progress ? 0 : RADEON_USAGE_SYNCHRONIZED;
-   if (dst) {
-      ws->cs_add_buffer(&ctx->sdma_cs, dst->buf, RADEON_USAGE_WRITE | sync, dst->domains, 0);
-   }
-   if (src) {
-      ws->cs_add_buffer(&ctx->sdma_cs, src->buf, RADEON_USAGE_READ | sync, src->domains, 0);
-   }
-
-   /* this function is called before all DMA calls, so increment this.
-    */
-   ctx->num_dma_calls++;
-}
-
-void si_flush_dma_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence)
-{
-   struct radeon_cmdbuf *cs = &ctx->sdma_cs;
-   struct radeon_saved_cs saved;
-   bool check_vm = (ctx->screen->debug_flags & DBG(CHECK_VM)) != 0;
-
-   if (!radeon_emitted(cs, 0) &&
-       !(flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION)) {
-      if (fence)
-         ctx->ws->fence_reference(fence, ctx->last_sdma_fence);
-      return;
-   }
-
-   if (check_vm)
-      si_save_cs(ctx->ws, cs, &saved, true);
-
-   if (ctx->is_noop)
-      flags |= RADEON_FLUSH_NOOP;
-
-   ctx->ws->cs_flush(cs, flags, &ctx->last_sdma_fence);
-   if (fence)
-      ctx->ws->fence_reference(fence, ctx->last_sdma_fence);
-
-   if (check_vm) {
-      /* Use conservative timeout 800ms, after which we won't wait any
-       * longer and assume the GPU is hung.
-       */
-      ctx->ws->fence_wait(ctx->ws, ctx->last_sdma_fence, 800 * 1000 * 1000);
-
-      si_check_vm_faults(ctx, &saved, RING_DMA);
-      si_clear_saved_cs(&saved);
-   }
-}
diff --git a/src/gallium/drivers/radeonsi/si_fence.c b/src/gallium/drivers/radeonsi/si_fence.c
index d1521485444..a2d0d1862a5 100644
--- a/src/gallium/drivers/radeonsi/si_fence.c
+++ b/src/gallium/drivers/radeonsi/si_fence.c
@@ -36,10 +36,9 @@ struct si_fine_fence {
    unsigned offset;
 };
 
-struct si_multi_fence {
+struct si_fence {
    struct pipe_reference reference;
    struct pipe_fence_handle *gfx;
-   struct pipe_fence_handle *sdma;
    struct tc_unflushed_batch_token *tc_token;
    struct util_queue_fence ready;
 
@@ -168,8 +167,6 @@ static void si_add_fence_dependency(struct si_context *sctx, struct pipe_fence_h
 {
    struct radeon_winsys *ws = sctx->ws;
 
-   if (sctx->sdma_cs.priv)
-      ws->cs_add_fence_dependency(&sctx->sdma_cs, fence, 0);
    ws->cs_add_fence_dependency(&sctx->gfx_cs, fence, 0);
 }
 
@@ -182,12 +179,11 @@ static void si_fence_reference(struct pipe_screen *screen, struct pipe_fence_han
                               struct pipe_fence_handle *src)
 {
    struct radeon_winsys *ws = ((struct si_screen *)screen)->ws;
-   struct si_multi_fence **sdst = (struct si_multi_fence **)dst;
-   struct si_multi_fence *ssrc = (struct si_multi_fence *)src;
+   struct si_fence **sdst = (struct si_fence **)dst;
+   struct si_fence *ssrc = (struct si_fence *)src;
 
    if (pipe_reference(&(*sdst)->reference, &ssrc->reference)) {
      ws->fence_reference(&(*sdst)->gfx, NULL);
-      ws->fence_reference(&(*sdst)->sdma, NULL);
      tc_unflushed_batch_token_reference(&(*sdst)->tc_token, NULL);
      si_resource_reference(&(*sdst)->fine.buf, NULL);
      FREE(*sdst);
@@ -195,9 +191,9 @@ static void si_fence_reference(struct pipe_screen *screen, struct pipe_fence_han
    *sdst = ssrc;
 }
 
-static struct si_multi_fence *si_create_multi_fence()
+static struct si_fence *si_create_multi_fence()
 {
-   struct si_multi_fence *fence = CALLOC_STRUCT(si_multi_fence);
+   struct si_fence *fence = CALLOC_STRUCT(si_fence);
    if (!fence)
      return NULL;
 
@@ -210,7 +206,7 @@ static struct si_multi_fence *si_create_multi_fence()
 struct pipe_fence_handle *si_create_fence(struct pipe_context *ctx,
                                          struct tc_unflushed_batch_token *tc_token)
 {
-   struct si_multi_fence *fence = si_create_multi_fence();
+   struct si_fence *fence = si_create_multi_fence();
    if (!fence)
      return NULL;
 
@@ -265,7 +261,7 @@ static bool si_fence_finish(struct pipe_screen *screen, struct pipe_context *ctx
                            struct pipe_fence_handle *fence, uint64_t timeout)
 {
    struct radeon_winsys *rws = ((struct si_screen *)screen)->ws;
-   struct si_multi_fence *sfence = (struct si_multi_fence *)fence;
+   struct si_fence *sfence = (struct si_fence *)fence;
    struct si_context *sctx;
    int64_t abs_timeout = os_time_get_absolute_timeout(timeout);
 
@@ -301,17 +297,6 @@ static bool si_fence_finish(struct pipe_screen *screen, struct pipe_context *ctx
      }
    }
 
-   if (sfence->sdma) {
-      if (!rws->fence_wait(rws, sfence->sdma, timeout))
-         return false;
-
-      /* Recompute the timeout after waiting. */
-      if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
-         int64_t time = os_time_get_nano();
-         timeout = abs_timeout > time ? abs_timeout - time : 0;
-      }
-   }
-
    if (!sfence->gfx)
      return true;
 
@@ -376,7 +361,7 @@ static void si_create_fence_fd(struct pipe_context *ctx, struct pipe_fence_handl
 {
    struct si_screen *sscreen = (struct si_screen *)ctx->screen;
    struct radeon_winsys *ws = sscreen->ws;
-   struct si_multi_fence *sfence;
+   struct si_fence *sfence;
 
    *pfence = NULL;
 
@@ -416,8 +401,8 @@ static int si_fence_get_fd(struct pipe_screen *screen, struct pipe_fence_handle
 {
    struct si_screen *sscreen = (struct si_screen *)screen;
    struct radeon_winsys *ws = sscreen->ws;
-   struct si_multi_fence *sfence = (struct si_multi_fence *)fence;
-   int gfx_fd = -1, sdma_fd = -1;
+   struct si_fence *sfence = (struct si_fence *)fence;
+   int gfx_fd = -1;
 
    if (!sscreen->info.has_fence_to_handle)
      return -1;
 
@@ -429,32 +414,18 @@ static int si_fence_get_fd(struct pipe_screen *screen, struct pipe_fence_handle
    if (sfence->gfx_unflushed.ctx)
      return -1;
 
-   if (sfence->sdma) {
-      sdma_fd = ws->fence_export_sync_file(ws, sfence->sdma);
-      if (sdma_fd == -1)
-         return -1;
-   }
    if (sfence->gfx) {
      gfx_fd = ws->fence_export_sync_file(ws, sfence->gfx);
      if (gfx_fd == -1) {
-         if (sdma_fd != -1)
-            close(sdma_fd);
         return -1;
      }
    }
 
    /* If we don't have FDs at this point, it means we don't have fences
    * either. */
-   if (sdma_fd == -1 && gfx_fd == -1)
-      return ws->export_signalled_sync_file(ws);
-   if (sdma_fd == -1)
-      return gfx_fd;
    if (gfx_fd == -1)
-      return sdma_fd;
+      return ws->export_signalled_sync_file(ws);
 
-   /* Get a fence that will be a combination of both fences. */
-   sync_accumulate("radeonsi", &gfx_fd, sdma_fd);
-   close(sdma_fd);
    return gfx_fd;
 }
 
@@ -466,7 +437,6 @@ static void si_flush_all_queues(struct pipe_context *ctx,
    struct si_context *sctx = (struct si_context *)ctx;
    struct radeon_winsys *ws = sctx->ws;
    struct pipe_fence_handle *gfx_fence = NULL;
-   struct pipe_fence_handle *sdma_fence = NULL;
    bool deferred_fence = false;
    struct si_fine_fence fine = {};
    unsigned rflags = PIPE_FLUSH_ASYNC;
@@ -485,10 +455,6 @@ static void si_flush_all_queues(struct pipe_context *ctx,
      si_fine_fence_set(sctx, &fine, flags);
    }
 
-   /* DMA IBs are preambles to gfx IBs, therefore must be flushed first. */
-   if (sctx->sdma_cs.priv)
-      si_flush_dma_cs(sctx, rflags, fence ? &sdma_fence : NULL);
-
    if (force_flush) {
      sctx->initial_gfx_cs_size = 0;
    }
 
@@ -515,45 +481,41 @@ static void si_flush_all_queues(struct pipe_context *ctx,
 
    /* Both engines can signal out of order, so we need to keep both fences. */
    if (fence) {
-      struct si_multi_fence *multi_fence;
+      struct si_fence *new_fence;
 
      if (flags & TC_FLUSH_ASYNC) {
-         multi_fence = (struct si_multi_fence *)*fence;
-         assert(multi_fence);
+         new_fence = (struct si_fence *)*fence;
+         assert(new_fence);
      } else {
-         multi_fence = si_create_multi_fence();
-         if (!multi_fence) {
-            ws->fence_reference(&sdma_fence, NULL);
+         new_fence = si_create_multi_fence();
+         if (!new_fence) {
            ws->fence_reference(&gfx_fence, NULL);
            goto finish;
         }
 
        screen->fence_reference(screen, fence, NULL);
-         *fence = (struct pipe_fence_handle *)multi_fence;
+         *fence = (struct pipe_fence_handle *)new_fence;
      }
 
      /* If both fences are NULL, fence_finish will always return true. */
-      multi_fence->gfx = gfx_fence;
-      multi_fence->sdma = sdma_fence;
+      new_fence->gfx = gfx_fence;
 
      if (deferred_fence) {
-         multi_fence->gfx_unflushed.ctx = sctx;
-         multi_fence->gfx_unflushed.ib_index = sctx->num_gfx_cs_flushes;
+         new_fence->gfx_unflushed.ctx = sctx;
+         new_fence->gfx_unflushed.ib_index = sctx->num_gfx_cs_flushes;
      }
 
-      multi_fence->fine = fine;
+      new_fence->fine = fine;
      fine.buf = NULL;
 
      if (flags & TC_FLUSH_ASYNC) {
-         util_queue_fence_signal(&multi_fence->ready);
-         tc_unflushed_batch_token_reference(&multi_fence->tc_token, NULL);
+         util_queue_fence_signal(&new_fence->ready);
+         tc_unflushed_batch_token_reference(&new_fence->tc_token, NULL);
      }
    }
    assert(!fine.buf);
 finish:
    if (!(flags & (PIPE_FLUSH_DEFERRED | PIPE_FLUSH_ASYNC))) {
-      if (sctx->sdma_cs.priv)
-         ws->cs_sync_flush(&sctx->sdma_cs);
      ws->cs_sync_flush(&sctx->gfx_cs);
    }
 }
@@ -567,13 +529,10 @@ static void si_flush_from_st(struct pipe_context *ctx, struct pipe_fence_handle
 static void si_fence_server_signal(struct pipe_context *ctx, struct pipe_fence_handle *fence)
 {
    struct si_context *sctx = (struct si_context *)ctx;
-   struct si_multi_fence *sfence = (struct si_multi_fence *)fence;
+   struct si_fence *sfence = (struct si_fence *)fence;
 
-   /* We should have at least one syncobj to signal */
-   assert(sfence->sdma || sfence->gfx);
+   assert(sfence->gfx);
 
-   if (sfence->sdma)
-      si_add_syncobj_signal(sctx, sfence->sdma);
    if (sfence->gfx)
      si_add_syncobj_signal(sctx, sfence->gfx);
 
@@ -595,7 +554,7 @@ static void si_fence_server_signal(struct pipe_context *ctx, struct pipe_fence_h
 static void si_fence_server_sync(struct pipe_context *ctx, struct pipe_fence_handle *fence)
 {
    struct si_context *sctx = (struct si_context *)ctx;
-   struct si_multi_fence *sfence = (struct si_multi_fence *)fence;
+   struct si_fence *sfence = (struct si_fence *)fence;
 
    util_queue_fence_wait(&sfence->ready);
 
@@ -613,10 +572,7 @@ static void si_fence_server_sync(struct pipe_context *ctx, struct pipe_fence_han
    * the time it takes to create and submit that IB, flushing decreases
    * performance. Therefore, DO NOT FLUSH.
    */
-   if (sfence->sdma)
-      si_add_fence_dependency(sctx, sfence->sdma);
-   if (sfence->gfx)
-      si_add_fence_dependency(sctx, sfence->gfx);
+   si_add_fence_dependency(sctx, sfence->gfx);
 }
 
 void si_init_fence_functions(struct si_context *ctx)
diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
index e21986fed1b..3de2dae490f 100644
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -34,13 +34,6 @@ void si_need_gfx_cs_space(struct si_context *ctx, unsigned num_draws)
 {
    struct radeon_cmdbuf *cs = &ctx->gfx_cs;
 
-   /* There is no need to flush the DMA IB here, because
-    * si_need_dma_space always flushes the GFX IB if there is
-    * a conflict, which means any unflushed DMA commands automatically
-    * precede the GFX IB (= they had no dependency on the GFX IB when
-    * they were submitted).
-    */
-
    /* There are two memory usage counters in the winsys for all buffers
    * that have been added (cs_add_buffer) and two counters in the pipe
    * driver for those that haven't been added yet.
@@ -59,15 +52,6 @@ void si_need_gfx_cs_space(struct si_context *ctx, unsigned num_draws) si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); } -void si_unref_sdma_uploads(struct si_context *sctx) -{ - for (unsigned i = 0; i < sctx->num_sdma_uploads; i++) { - si_resource_reference(&sctx->sdma_uploads[i].dst, NULL); - si_resource_reference(&sctx->sdma_uploads[i].src, NULL); - } - sctx->num_sdma_uploads = 0; -} - void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence) { struct radeon_cmdbuf *cs = &ctx->gfx_cs; @@ -120,33 +104,6 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h ctx->gfx_flush_in_progress = true; - /* If the gallium frontend is flushing the GFX IB, si_flush_from_st is - * responsible for flushing the DMA IB and merging the fences from both. - * If the driver flushes the GFX IB internally, it should never ask - * for a fence handle. - */ - assert(!radeon_emitted(&ctx->sdma_cs, 0) || fence == NULL); - - /* Update the sdma_uploads list by flushing the uploader. */ - u_upload_unmap(ctx->b.const_uploader); - - /* Execute SDMA uploads. */ - ctx->sdma_uploads_in_progress = true; - for (unsigned i = 0; i < ctx->num_sdma_uploads; i++) { - struct si_sdma_upload *up = &ctx->sdma_uploads[i]; - - assert(up->src_offset % 4 == 0 && up->dst_offset % 4 == 0 && up->size % 4 == 0); - - si_sdma_copy_buffer(ctx, &up->dst->b.b, &up->src->b.b, up->dst_offset, up->src_offset, - up->size); - } - ctx->sdma_uploads_in_progress = false; - si_unref_sdma_uploads(ctx); - - /* Flush SDMA (preamble IB). */ - if (radeon_emitted(&ctx->sdma_cs, 0)) - si_flush_dma_cs(ctx, flags, NULL); - if (radeon_emitted(&ctx->prim_discard_compute_cs, 0)) { struct radeon_cmdbuf *compute_cs = &ctx->prim_discard_compute_cs; si_compute_signal_gfx(ctx); diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 4982f08f9ee..774abef234c 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -82,10 +82,6 @@ static const struct debug_named_value debug_options[] = { {"cache_stats", DBG(CACHE_STATS), "Print shader cache statistics."}, /* Driver options: */ - {"forcedma", DBG(FORCE_SDMA), "Use SDMA for all operations when possible."}, - {"nodma", DBG(NO_SDMA), "Disable SDMA"}, - {"nodmaclear", DBG(NO_SDMA_CLEARS), "Disable SDMA clears"}, - {"nodmacopyimage", DBG(NO_SDMA_COPY_IMAGE), "Disable SDMA image copies"}, {"nowc", DBG(NO_WC), "Disable GTT write combining"}, {"check_vm", DBG(CHECK_VM), "Check VM faults and dump debug info."}, {"reserve_vmid", DBG(RESERVE_VMID), "Force VMID reservation per context."}, @@ -123,9 +119,8 @@ static const struct debug_named_value debug_options[] = { static const struct debug_named_value test_options[] = { /* Tests: */ - {"testdma", DBG(TEST_DMA), "Invoke SDMA tests and exit."}, + {"testdma", DBG(TEST_DMA), "Invoke blit tests and exit."}, {"testvmfaultcp", DBG(TEST_VMFAULT_CP), "Invoke a CP VM fault test and exit."}, - {"testvmfaultsdma", DBG(TEST_VMFAULT_SDMA), "Invoke a SDMA VM fault test and exit."}, {"testvmfaultshader", DBG(TEST_VMFAULT_SHADER), "Invoke a shader VM fault test and exit."}, {"testdmaperf", DBG(TEST_DMA_PERF), "Test DMA performance."}, {"testgds", DBG(TEST_GDS), "Test GDS."}, @@ -285,7 +280,6 @@ static void si_destroy_context(struct pipe_context *context) sctx->b.delete_compute_state(&sctx->b, sctx->sh_query_result_shader); sctx->ws->cs_destroy(&sctx->gfx_cs); - sctx->ws->cs_destroy(&sctx->sdma_cs); if 
(sctx->ctx) sctx->ws->ctx_destroy(sctx->ctx); @@ -306,7 +300,6 @@ static void si_destroy_context(struct pipe_context *context) u_suballocator_destroy(&sctx->allocator_zeroed_memory); sctx->ws->fence_reference(&sctx->last_gfx_fence, NULL); - sctx->ws->fence_reference(&sctx->last_sdma_fence, NULL); sctx->ws->fence_reference(&sctx->last_ib_barrier_fence, NULL); si_resource_reference(&sctx->eop_bug_scratch, NULL); si_resource_reference(&sctx->eop_bug_scratch_tmz, NULL); @@ -329,8 +322,6 @@ static void si_destroy_context(struct pipe_context *context) util_dynarray_fini(&sctx->resident_tex_needs_color_decompress); util_dynarray_fini(&sctx->resident_img_needs_color_decompress); util_dynarray_fini(&sctx->resident_tex_needs_depth_decompress); - si_unref_sdma_uploads(sctx); - free(sctx->sdma_uploads); FREE(sctx); } @@ -502,37 +493,12 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign if (!sctx->ctx) goto fail; - /* SDMA causes corruption on: - * - RX 580: https://gitlab.freedesktop.org/mesa/mesa/-/issues/1399, 1889 - * - gfx9 APUs: https://gitlab.freedesktop.org/mesa/mesa/-/issues/2814 - * - gfx10: https://gitlab.freedesktop.org/mesa/mesa/-/issues/1907, - https://gitlab.freedesktop.org/drm/amd/issues/892 - * - * While we could keep buffer copies and clears enabled, let's disable - * everything, because SDMA also decreases CPU performance due to its - * command submission overhead. - * - * And SDMA is disabled on all chips (instead of just the ones listed above), - * because it doesn't make sense to keep it enabled only on old chips, - * which are not tested as often as newer chips. - */ - if (sscreen->info.num_rings[RING_DMA] && !(sscreen->debug_flags & DBG(NO_SDMA)) && - sscreen->debug_flags & DBG(FORCE_SDMA)) { - sctx->ws->cs_create(&sctx->sdma_cs, sctx->ctx, RING_DMA, (void *)si_flush_dma_cs, - sctx, stop_exec_on_failure); - } - - bool use_sdma_upload = sscreen->info.has_dedicated_vram && sctx->sdma_cs.priv; sctx->b.const_uploader = u_upload_create(&sctx->b, 256 * 1024, 0, PIPE_USAGE_DEFAULT, - SI_RESOURCE_FLAG_32BIT | - (use_sdma_upload ? SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA : 0)); + SI_RESOURCE_FLAG_32BIT); if (!sctx->b.const_uploader) goto fail; - if (use_sdma_upload) - u_upload_enable_flush_explicit(sctx->b.const_uploader); - ws->cs_create(&sctx->gfx_cs, sctx->ctx, sctx->has_graphics ? RING_GFX : RING_COMPUTE, (void *)si_flush_gfx_cs, sctx, stop_exec_on_failure); @@ -615,15 +581,6 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign sctx->prim_discard_vertex_count_threshold = UINT_MAX; } - /* Initialize SDMA functions. */ - if (sctx->chip_class >= GFX7) - cik_init_sdma_functions(sctx); - else - sctx->dma_copy = si_resource_copy_region; - - if (sscreen->debug_flags & DBG(FORCE_SDMA)) - sctx->b.resource_copy_region = sctx->dma_copy; - sctx->sample_mask = 0xffff; /* Initialize multimedia functions. 
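AMD_DEBUG entries such as the ones deleted above are matched against a comma-separated environment string and OR-ed into a bitmask; the driver does this through gallium's u_debug helpers. A self-contained sketch of the same idea, with a made-up three-entry table:

#include <stdint.h>
#include <string.h>

struct named_flag { const char *name; uint64_t bit; };

static const struct named_flag flags[] = {
   {"nowc", 1ull << 0},
   {"check_vm", 1ull << 1},
   {"reserve_vmid", 1ull << 2},
};

static uint64_t parse_flags(const char *list)
{
   uint64_t out = 0;

   while (list && *list) {
      const char *comma = strchr(list, ',');
      size_t len = comma ? (size_t)(comma - list) : strlen(list);

      for (unsigned i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
         if (strlen(flags[i].name) == len && !strncmp(list, flags[i].name, len))
            out |= flags[i].bit;
      }
      list = comma ? comma + 1 : NULL;
   }
   return out;
}

Usage would be parse_flags(getenv("AMD_DEBUG")); the NULL check in the loop condition makes an unset variable a no-op.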
*/ @@ -888,11 +845,6 @@ static void si_test_vmfault(struct si_screen *sscreen, uint64_t test_flags) ctx->flush(ctx, NULL, 0); puts("VM fault test: CP - done."); } - if (test_flags & DBG(TEST_VMFAULT_SDMA)) { - si_sdma_clear_buffer(sctx, buf, 0, 4, 0); - ctx->flush(ctx, NULL, 0); - puts("VM fault test: SDMA - done."); - } if (test_flags & DBG(TEST_VMFAULT_SHADER)) { util_test_constant_buffer(ctx, buf); puts("VM fault test: Shader - done."); @@ -1343,7 +1295,7 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, si_test_dma_perf(sscreen); } - if (test_flags & (DBG(TEST_VMFAULT_CP) | DBG(TEST_VMFAULT_SDMA) | DBG(TEST_VMFAULT_SHADER))) + if (test_flags & (DBG(TEST_VMFAULT_CP) | DBG(TEST_VMFAULT_SHADER))) si_test_vmfault(sscreen, test_flags); if (test_flags & DBG(TEST_GDS)) diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 0cebbb596b2..0e8a2a6813b 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -116,8 +116,7 @@ #define SI_RESOURCE_FLAG_READ_ONLY (PIPE_RESOURCE_FLAG_DRV_PRIV << 5) #define SI_RESOURCE_FLAG_32BIT (PIPE_RESOURCE_FLAG_DRV_PRIV << 6) #define SI_RESOURCE_FLAG_CLEAR (PIPE_RESOURCE_FLAG_DRV_PRIV << 7) -/* For const_uploader, upload data via GTT and copy to VRAM on context flush via SDMA. */ -#define SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA (PIPE_RESOURCE_FLAG_DRV_PRIV << 8) +/* gap */ /* Set a micro tile mode: */ #define SI_RESOURCE_FLAG_FORCE_MICRO_TILE_MODE (PIPE_RESOURCE_FLAG_DRV_PRIV << 9) #define SI_RESOURCE_FLAG_MICRO_TILE_MODE_SHIFT (util_logbase2(PIPE_RESOURCE_FLAG_DRV_PRIV) + 10) @@ -179,10 +178,6 @@ enum DBG_CACHE_STATS, /* Driver options: */ - DBG_FORCE_SDMA, - DBG_NO_SDMA, - DBG_NO_SDMA_CLEARS, - DBG_NO_SDMA_COPY_IMAGE, DBG_NO_WC, DBG_CHECK_VM, DBG_RESERVE_VMID, @@ -223,7 +218,6 @@ enum /* Tests: */ DBG_TEST_DMA, DBG_TEST_VMFAULT_CP, - DBG_TEST_VMFAULT_SDMA, DBG_TEST_VMFAULT_SHADER, DBG_TEST_DMA_PERF, DBG_TEST_GDS, @@ -882,14 +876,6 @@ struct si_saved_cs { int64_t time_flush; }; -struct si_sdma_upload { - struct si_resource *dst; - struct si_resource *src; - unsigned src_offset; - unsigned dst_offset; - unsigned size; -}; - struct si_small_prim_cull_info { float scale[2], translate[2]; }; @@ -903,9 +889,7 @@ struct si_context { struct radeon_winsys *ws; struct radeon_winsys_ctx *ctx; struct radeon_cmdbuf gfx_cs; /* compute IB if graphics is disabled */ - struct radeon_cmdbuf sdma_cs; struct pipe_fence_handle *last_gfx_fence; - struct pipe_fence_handle *last_sdma_fence; struct si_resource *eop_bug_scratch; struct si_resource *eop_bug_scratch_tmz; struct u_upload_mgr *cached_gtt_allocator; @@ -1220,7 +1204,6 @@ struct si_context { unsigned num_spill_draw_calls; unsigned num_compute_calls; unsigned num_spill_compute_calls; - unsigned num_dma_calls; unsigned num_cp_dma_calls; unsigned num_vs_flushes; unsigned num_ps_flushes; @@ -1251,12 +1234,6 @@ struct si_context { bool render_cond_invert; bool render_cond_force_off; /* for u_blitter */ - /* For uploading data via GTT and copy to VRAM on context flush via SDMA. */ - bool sdma_uploads_in_progress; - struct si_sdma_upload *sdma_uploads; - unsigned num_sdma_uploads; - unsigned max_sdma_uploads; - /* Shader-based queries. */ struct list_head shader_query_buffers; unsigned num_active_shader_queries; @@ -1280,11 +1257,6 @@ struct si_context { bool query_active; } dcc_stats[5]; - /* Copy one resource to another using async DMA. 
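Note the '/* gap */' left at bit 8 in si_pipe.h: driver-private resource flags are defined positionally as shifts of PIPE_RESOURCE_FLAG_DRV_PRIV, so retiring one flag leaves its bit unused instead of renumbering everything after it (which would silently change the meaning of the later flags). Illustrative sketch only; the DRV_PRIV value here is a placeholder, not the real constant:

#define DRV_PRIV (1u << 8) /* placeholder for PIPE_RESOURCE_FLAG_DRV_PRIV */

#define FLAG_READ_ONLY             (DRV_PRIV << 5)
#define FLAG_32BIT                 (DRV_PRIV << 6)
#define FLAG_CLEAR                 (DRV_PRIV << 7)
/* DRV_PRIV << 8 intentionally unused after the SDMA upload flag was
 * removed: the "gap" above. */
#define FLAG_FORCE_MICRO_TILE_MODE (DRV_PRIV << 9)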
*/ - void (*dma_copy)(struct pipe_context *ctx, struct pipe_resource *dst, unsigned dst_level, - unsigned dst_x, unsigned dst_y, unsigned dst_z, struct pipe_resource *src, - unsigned src_level, const struct pipe_box *src_box); - struct si_tracked_regs tracked_regs; /* Resources that need to be flushed, but will not get an explicit @@ -1294,9 +1266,6 @@ struct si_context { struct hash_table *dirty_implicit_resources; }; -/* cik_sdma.c */ -void cik_init_sdma_functions(struct si_context *sctx); - /* si_blit.c */ enum si_blitter_op /* bitmask */ { @@ -1419,17 +1388,6 @@ void si_check_vm_faults(struct si_context *sctx, struct radeon_saved_cs *saved, enum ring_type ring); bool si_replace_shader(unsigned num, struct si_shader_binary *binary); -/* si_dma_cs.c */ -void si_dma_emit_timestamp(struct si_context *sctx, struct si_resource *dst, uint64_t offset); -void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset, - uint64_t size, unsigned clear_value); -void si_sdma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, - struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset, - uint64_t size); -void si_need_dma_space(struct si_context *ctx, unsigned num_dw, struct si_resource *dst, - struct si_resource *src); -void si_flush_dma_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence); - /* si_fence.c */ void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, unsigned event, unsigned event_flags, unsigned dst_sel, unsigned int_sel, unsigned data_sel, @@ -1452,7 +1410,6 @@ void si_allocate_gds(struct si_context *ctx); void si_set_tracked_regs_to_clear_state(struct si_context *ctx); void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs); void si_need_gfx_cs_space(struct si_context *ctx, unsigned num_draws); -void si_unref_sdma_uploads(struct si_context *sctx); /* si_gpu_load.c */ void si_gpu_load_kill_thread(struct si_screen *sscreen); @@ -1542,9 +1499,6 @@ void si_update_vs_viewport_state(struct si_context *ctx); void si_init_viewport_functions(struct si_context *ctx); /* si_texture.c */ -bool si_prepare_for_dma_blit(struct si_context *sctx, struct si_texture *dst, unsigned dst_level, - unsigned dstx, unsigned dsty, unsigned dstz, struct si_texture *src, - unsigned src_level, const struct pipe_box *src_box); void si_eliminate_fast_color_clear(struct si_context *sctx, struct si_texture *tex, bool *ctx_flushed); void si_texture_discard_cmask(struct si_screen *sscreen, struct si_texture *tex); @@ -1901,8 +1855,6 @@ static inline void radeon_add_to_buffer_list(struct si_context *sctx, struct rad * - if si_context_add_resource_size has been called for the buffer * followed by *_need_cs_space for checking the memory usage * - * - if si_need_dma_space has been called for the buffer - * * - when emitting state packets and draw packets (because preceding packets * can't be re-emitted at that point) * diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c index 6f4d651fe7c..36e02b6e86a 100644 --- a/src/gallium/drivers/radeonsi/si_query.c +++ b/src/gallium/drivers/radeonsi/si_query.c @@ -82,8 +82,6 @@ static enum radeon_value_id winsys_id_from_type(unsigned type) return RADEON_NUM_MAPPED_BUFFERS; case SI_QUERY_NUM_GFX_IBS: return RADEON_NUM_GFX_IBS; - case SI_QUERY_NUM_SDMA_IBS: - return RADEON_NUM_SDMA_IBS; case SI_QUERY_GFX_BO_LIST_SIZE: return RADEON_GFX_BO_LIST_COUNTER; case SI_QUERY_GFX_IB_SIZE: @@ -113,19 +111,6 @@ static enum radeon_value_id 
winsys_id_from_type(unsigned type) } } -static int64_t si_finish_dma_get_cpu_time(struct si_context *sctx) -{ - struct pipe_fence_handle *fence = NULL; - - si_flush_dma_cs(sctx, 0, &fence); - if (fence) { - sctx->ws->fence_wait(sctx->ws, fence, PIPE_TIMEOUT_INFINITE); - sctx->ws->fence_reference(&fence, NULL); - } - - return os_time_get_nano(); -} - static bool si_query_sw_begin(struct si_context *sctx, struct si_query *squery) { struct si_query_sw *query = (struct si_query_sw *)squery; @@ -135,9 +120,6 @@ static bool si_query_sw_begin(struct si_context *sctx, struct si_query *squery) case PIPE_QUERY_TIMESTAMP_DISJOINT: case PIPE_QUERY_GPU_FINISHED: break; - case SI_QUERY_TIME_ELAPSED_SDMA_SI: - query->begin_result = si_finish_dma_get_cpu_time(sctx); - break; case SI_QUERY_DRAW_CALLS: query->begin_result = sctx->num_draw_calls; break; @@ -159,9 +141,6 @@ static bool si_query_sw_begin(struct si_context *sctx, struct si_query *squery) case SI_QUERY_SPILL_COMPUTE_CALLS: query->begin_result = sctx->num_spill_compute_calls; break; - case SI_QUERY_DMA_CALLS: - query->begin_result = sctx->num_dma_calls; - break; case SI_QUERY_CP_DMA_CALLS: query->begin_result = sctx->num_cp_dma_calls; break; @@ -215,7 +194,6 @@ static bool si_query_sw_begin(struct si_context *sctx, struct si_query *squery) case SI_QUERY_BUFFER_WAIT_TIME: case SI_QUERY_GFX_IB_SIZE: case SI_QUERY_NUM_GFX_IBS: - case SI_QUERY_NUM_SDMA_IBS: case SI_QUERY_NUM_BYTES_MOVED: case SI_QUERY_NUM_EVICTIONS: case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: { @@ -317,9 +295,6 @@ static bool si_query_sw_end(struct si_context *sctx, struct si_query *squery) case PIPE_QUERY_GPU_FINISHED: sctx->b.flush(&sctx->b, &query->fence, PIPE_FLUSH_DEFERRED); break; - case SI_QUERY_TIME_ELAPSED_SDMA_SI: - query->end_result = si_finish_dma_get_cpu_time(sctx); - break; case SI_QUERY_DRAW_CALLS: query->end_result = sctx->num_draw_calls; break; @@ -341,9 +316,6 @@ static bool si_query_sw_end(struct si_context *sctx, struct si_query *squery) case SI_QUERY_SPILL_COMPUTE_CALLS: query->end_result = sctx->num_spill_compute_calls; break; - case SI_QUERY_DMA_CALLS: - query->end_result = sctx->num_dma_calls; - break; case SI_QUERY_CP_DMA_CALLS: query->end_result = sctx->num_cp_dma_calls; break; @@ -394,7 +366,6 @@ static bool si_query_sw_end(struct si_context *sctx, struct si_query *squery) case SI_QUERY_GFX_IB_SIZE: case SI_QUERY_NUM_MAPPED_BUFFERS: case SI_QUERY_NUM_GFX_IBS: - case SI_QUERY_NUM_SDMA_IBS: case SI_QUERY_NUM_BYTES_MOVED: case SI_QUERY_NUM_EVICTIONS: case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: { @@ -739,10 +710,6 @@ static struct pipe_query *si_query_hw_create(struct si_screen *sscreen, unsigned query->result_size += 16; /* for the fence + alignment */ query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen); break; - case SI_QUERY_TIME_ELAPSED_SDMA: - /* GET_GLOBAL_TIMESTAMP only works if the offset is a multiple of 32. 
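Every software-query case above follows the same shape: snapshot a driver counter at begin, snapshot it again at end, and report the difference. A stripped-down sketch (the sw_query_* names are hypothetical):

#include <stdint.h>

struct sw_query {
   uint64_t begin_result;
   uint64_t end_result;
};

/* 'counter' stands in for fields like sctx->num_draw_calls. */
static void sw_query_begin(struct sw_query *q, uint64_t counter)
{
   q->begin_result = counter;
}

static void sw_query_end(struct sw_query *q, uint64_t counter)
{
   q->end_result = counter;
}

static uint64_t sw_query_result(const struct sw_query *q)
{
   return q->end_result - q->begin_result;
}

Removing SI_QUERY_DMA_CALLS is just deleting one case from each switch; the pattern itself is untouched.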
*/ - query->result_size = 64; - break; case PIPE_QUERY_TIME_ELAPSED: query->result_size = 24; query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen); @@ -835,9 +802,6 @@ static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_h struct radeon_cmdbuf *cs = &sctx->gfx_cs; switch (query->b.type) { - case SI_QUERY_TIME_ELAPSED_SDMA: - si_dma_emit_timestamp(sctx, buffer, va - buffer->gpu_address); - return; case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: @@ -886,8 +850,7 @@ static void si_query_hw_emit_start(struct si_context *sctx, struct si_query_hw * if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS) sctx->num_pipeline_stat_queries++; - if (query->b.type != SI_QUERY_TIME_ELAPSED_SDMA) - si_need_gfx_cs_space(sctx, 0); + si_need_gfx_cs_space(sctx, 0); va = query->buffer.buf->gpu_address + query->buffer.results_end; query->ops->emit_start(sctx, query, query->buffer.buf, va); @@ -900,9 +863,6 @@ static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw uint64_t fence_va = 0; switch (query->b.type) { - case SI_QUERY_TIME_ELAPSED_SDMA: - si_dma_emit_timestamp(sctx, buffer, va + 32 - buffer->gpu_address); - return; case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: @@ -1144,7 +1104,7 @@ static struct pipe_query *si_create_query(struct pipe_context *ctx, unsigned que struct si_screen *sscreen = (struct si_screen *)ctx->screen; if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT || query_type == PIPE_QUERY_GPU_FINISHED || - (query_type >= PIPE_QUERY_DRIVER_SPECIFIC && query_type != SI_QUERY_TIME_ELAPSED_SDMA)) + (query_type >= PIPE_QUERY_DRIVER_SPECIFIC)) return si_query_sw_create(query_type); if (sscreen->use_ngg_streamout && @@ -1332,9 +1292,6 @@ static void si_query_hw_add_result(struct si_screen *sscreen, struct si_query_hw case PIPE_QUERY_TIME_ELAPSED: result->u64 += si_query_read_result(buffer, 0, 2, false); break; - case SI_QUERY_TIME_ELAPSED_SDMA: - result->u64 += si_query_read_result(buffer, 0, 32 / 4, false); - break; case PIPE_QUERY_TIMESTAMP: result->u64 = *(uint64_t *)buffer; break; @@ -1474,7 +1431,7 @@ bool si_query_hw_get_result(struct si_context *sctx, struct si_query *squery, bo } /* Convert the time to expected units. 
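The si_query_read_result() calls above reconstruct 64-bit begin/end values from dword pairs in the query results buffer and accumulate end minus start. A sketch of that readback under the same little-endian dword layout (read_u64 and read_result are illustrative helpers, not the driver's):

#include <stdint.h>

/* Read one 64-bit value stored as two consecutive dwords. */
static uint64_t read_u64(const uint32_t *map, unsigned dw)
{
   return (uint64_t)map[dw] | ((uint64_t)map[dw + 1] << 32);
}

/* end - start, as accumulated per query slot; for PIPE_QUERY_TIME_ELAPSED
 * the start value sits at dword 0 and the end value at dword 2. */
static uint64_t read_result(const uint32_t *map, unsigned start_dw, unsigned end_dw)
{
   return read_u64(map, end_dw) - read_u64(map, start_dw);
}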
*/ - if (squery->type == PIPE_QUERY_TIME_ELAPSED || squery->type == SI_QUERY_TIME_ELAPSED_SDMA || + if (squery->type == PIPE_QUERY_TIME_ELAPSED || squery->type == PIPE_QUERY_TIMESTAMP) { result->u64 = (1000000 * result->u64) / sscreen->info.clock_crystal_freq; } @@ -1719,7 +1676,6 @@ static struct pipe_driver_query_info si_driver_query_list[] = { X("spill-draw-calls", SPILL_DRAW_CALLS, UINT64, AVERAGE), X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE), X("spill-compute-calls", SPILL_COMPUTE_CALLS, UINT64, AVERAGE), - X("dma-calls", DMA_CALLS, UINT64, AVERAGE), X("cp-dma-calls", CP_DMA_CALLS, UINT64, AVERAGE), X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE), X("num-ps-flushes", NUM_PS_FLUSHES, UINT64, AVERAGE), @@ -1741,7 +1697,6 @@ static struct pipe_driver_query_info si_driver_query_list[] = { X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE), X("num-mapped-buffers", NUM_MAPPED_BUFFERS, UINT64, AVERAGE), X("num-GFX-IBs", NUM_GFX_IBS, UINT64, AVERAGE), - X("num-SDMA-IBs", NUM_SDMA_IBS, UINT64, AVERAGE), X("GFX-BO-list-size", GFX_BO_LIST_SIZE, UINT64, AVERAGE), X("GFX-IB-size", GFX_IB_SIZE, UINT64, AVERAGE), X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE), diff --git a/src/gallium/drivers/radeonsi/si_query.h b/src/gallium/drivers/radeonsi/si_query.h index e0be318bcac..9bdac5f83fc 100644 --- a/src/gallium/drivers/radeonsi/si_query.h +++ b/src/gallium/drivers/radeonsi/si_query.h @@ -49,7 +49,6 @@ enum SI_QUERY_SPILL_DRAW_CALLS, SI_QUERY_COMPUTE_CALLS, SI_QUERY_SPILL_COMPUTE_CALLS, - SI_QUERY_DMA_CALLS, SI_QUERY_CP_DMA_CALLS, SI_QUERY_NUM_VS_FLUSHES, SI_QUERY_NUM_PS_FLUSHES, @@ -71,7 +70,6 @@ enum SI_QUERY_BUFFER_WAIT_TIME, SI_QUERY_NUM_MAPPED_BUFFERS, SI_QUERY_NUM_GFX_IBS, - SI_QUERY_NUM_SDMA_IBS, SI_QUERY_GFX_BO_LIST_SIZE, SI_QUERY_GFX_IB_SIZE, SI_QUERY_NUM_BYTES_MOVED, @@ -112,8 +110,6 @@ enum SI_QUERY_GPIN_NUM_RB, SI_QUERY_GPIN_NUM_SPI, SI_QUERY_GPIN_NUM_SE, - SI_QUERY_TIME_ELAPSED_SDMA, - SI_QUERY_TIME_ELAPSED_SDMA_SI, /* emulated, measured on the CPU */ SI_QUERY_PD_NUM_PRIMS_ACCEPTED, SI_QUERY_PD_NUM_PRIMS_REJECTED, SI_QUERY_PD_NUM_PRIMS_INELIGIBLE, diff --git a/src/gallium/drivers/radeonsi/si_test_dma.c b/src/gallium/drivers/radeonsi/si_test_dma.c index 70e811db853..d56b0e40671 100644 --- a/src/gallium/drivers/radeonsi/si_test_dma.c +++ b/src/gallium/drivers/radeonsi/si_test_dma.c @@ -23,7 +23,7 @@ * */ -/* This file implements randomized SDMA texture blit tests. */ +/* This file implements randomized texture blit tests. */ #include "si_pipe.h" #include "util/rand_xor.h" @@ -203,7 +203,7 @@ void si_test_dma(struct si_screen *sscreen) struct si_texture *ssrc; struct cpu_texture src_cpu, dst_cpu; unsigned max_width, max_height, max_depth, j, num; - unsigned gfx_blits = 0, dma_blits = 0, cs_blits = 0, max_tex_side_gen; + unsigned gfx_blits = 0, cs_blits = 0, max_tex_side_gen; unsigned max_tex_layers; bool pass; bool do_partial_copies = rand() & 1; @@ -305,7 +305,6 @@ void si_test_dma(struct si_screen *sscreen) int srcx, srcy, srcz, dstx, dsty, dstz; struct pipe_box box; unsigned old_num_draw_calls = sctx->num_draw_calls; - unsigned old_num_dma_calls = sctx->num_dma_calls; unsigned old_num_cs_calls = sctx->num_compute_calls; if (!do_partial_copies) { @@ -357,11 +356,10 @@ void si_test_dma(struct si_screen *sscreen) /* GPU copy */ u_box_3d(srcx, srcy, srcz, width, height, depth, &box); - sctx->dma_copy(ctx, dst, 0, dstx, dsty, dstz, src, 0, &box); + si_resource_copy_region(ctx, dst, 0, dstx, dsty, dstz, src, 0, &box); /* See which engine was used. 
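The conversion kept at the start of this hunk turns GPU ticks into time: clock_crystal_freq is reported in kHz, so ticks * 1000000 / freq_khz yields nanoseconds. As arithmetic: a 100000 kHz (100 MHz) crystal and 250000 ticks give 250000 * 1000000 / 100000 = 2500000 ns, i.e. 2.5 ms. A sketch, assuming the kHz unit:

#include <stdint.h>

/* freq_khz mirrors sscreen->info.clock_crystal_freq. Note that
 * 1000000 * ticks can overflow uint64_t for extremely long intervals,
 * the same as the open-coded expression above. */
static uint64_t ticks_to_ns(uint64_t ticks, uint32_t freq_khz)
{
   return (1000000 * ticks) / freq_khz;
}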
*/ gfx_blits += sctx->num_draw_calls > old_num_draw_calls; - dma_blits += sctx->num_dma_calls > old_num_dma_calls; cs_blits += sctx->num_compute_calls > old_num_cs_calls; /* CPU copy */ @@ -376,7 +374,7 @@ void si_test_dma(struct si_screen *sscreen) else num_fail++; - printf("BLITs: GFX = %2u, DMA = %2u, CS = %2u, %s [%u/%u]\n", gfx_blits, dma_blits, cs_blits, + printf("BLITs: GFX = %2u, CS = %2u, %s [%u/%u]\n", gfx_blits, cs_blits, pass ? "pass" : "fail", num_pass, num_pass + num_fail); /* cleanup */ diff --git a/src/gallium/drivers/radeonsi/si_test_dma_perf.c b/src/gallium/drivers/radeonsi/si_test_dma_perf.c index 7135a958496..ad03f5c532c 100644 --- a/src/gallium/drivers/radeonsi/si_test_dma_perf.c +++ b/src/gallium/drivers/radeonsi/si_test_dma_perf.c @@ -48,13 +48,12 @@ void si_test_dma_perf(struct si_screen *sscreen) static const unsigned cs_waves_per_sh_list[] = {0, 4, 8, 16}; #define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list) -#define NUM_METHODS (4 + 3 * NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list)) +#define NUM_METHODS (3 + 3 * NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list)) static const char *method_str[] = { "CP MC ", "CP L2 ", "CP L2 ", - "SDMA ", }; static const char *placement_str[] = { /* Clear */ @@ -80,7 +79,6 @@ void si_test_dma_perf(struct si_screen *sscreen) struct si_result { bool is_valid; bool is_cp; - bool is_sdma; bool is_cs; unsigned cache_policy; unsigned dwords_per_thread; @@ -100,9 +98,8 @@ void si_test_dma_perf(struct si_screen *sscreen) for (unsigned method = 0; method < NUM_METHODS; method++) { bool test_cp = method <= 2; - bool test_sdma = method == 3; - bool test_cs = method >= 4; - unsigned cs_method = method - 4; + bool test_cs = method >= 3; + unsigned cs_method = method - 3; unsigned cs_waves_per_sh = test_cs ? cs_waves_per_sh_list[cs_method / (3 * NUM_SHADERS)] : 0; cs_method %= 3 * NUM_SHADERS; @@ -111,9 +108,6 @@ void si_test_dma_perf(struct si_screen *sscreen) unsigned cs_dwords_per_thread = test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0; - if (test_sdma && !sctx->sdma_cs.priv) - continue; - if (sctx->chip_class == GFX6) { /* GFX6 doesn't support CP DMA operations through L2. */ if (test_cp && cache_policy != L2_BYPASS) @@ -161,13 +155,6 @@ void si_test_dma_perf(struct si_screen *sscreen) unsigned query_type = PIPE_QUERY_TIME_ELAPSED; unsigned flags = cache_policy == L2_BYPASS ? SI_RESOURCE_FLAG_UNCACHED : 0; - if (test_sdma) { - if (sctx->chip_class == GFX6) - query_type = SI_QUERY_TIME_ELAPSED_SDMA_SI; - else - query_type = SI_QUERY_TIME_ELAPSED_SDMA; - } - if (placement == 0 || placement == 2 || placement == 4) dst_usage = PIPE_USAGE_DEFAULT; else @@ -201,13 +188,6 @@ void si_test_dma_perf(struct si_screen *sscreen) si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, dst, 0, size, clear_value, 0, SI_COHERENCY_NONE, cache_policy); } - } else if (test_sdma) { - /* SDMA */ - if (is_copy) { - si_sdma_copy_buffer(sctx, dst, src, 0, 0, size); - } else { - si_sdma_clear_buffer(sctx, dst, 0, size, clear_value); - } } else { /* Compute */ /* The memory accesses are coalesced, meaning that the 1st instruction writes @@ -252,12 +232,10 @@ void si_test_dma_perf(struct si_screen *sscreen) } /* Flush L2, so that we don't just test L2 cache performance except for L2_LRU. */ - if (!test_sdma) { - sctx->flags |= SI_CONTEXT_INV_VCACHE | - (cache_policy == L2_LRU ? 0 : SI_CONTEXT_INV_L2) | - SI_CONTEXT_CS_PARTIAL_FLUSH; - sctx->emit_cache_flush(sctx); - } + sctx->flags |= SI_CONTEXT_INV_VCACHE | + (cache_policy == L2_LRU ? 
0 : SI_CONTEXT_INV_L2) | + SI_CONTEXT_CS_PARTIAL_FLUSH; + sctx->emit_cache_flush(sctx); } ctx->end_query(ctx, q); @@ -280,7 +258,6 @@ void si_test_dma_perf(struct si_screen *sscreen) struct si_result *r = &results[util_logbase2(size)][placement][method]; r->is_valid = true; r->is_cp = test_cp; - r->is_sdma = test_sdma; r->is_cs = test_cs; r->cache_policy = cache_policy; r->dwords_per_thread = cs_dwords_per_thread; @@ -329,7 +306,7 @@ void si_test_dma_perf(struct si_screen *sscreen) bool cached = mode == 1; if (async) - puts(" if (async) { /* SDMA or async compute */"); + puts(" if (async) { /* async compute */"); else if (cached) puts(" if (cached) { /* gfx ring */"); else @@ -380,10 +357,6 @@ void si_test_dma_perf(struct si_screen *sscreen) if (r->is_cs && r->waves_per_sh == 0) continue; } else { - /* SDMA is always asynchronous */ - if (r->is_sdma) - continue; - if (cached && r->cache_policy == L2_BYPASS) continue; if (!cached && r->cache_policy == L2_LRU) @@ -420,7 +393,7 @@ void si_test_dma_perf(struct si_screen *sscreen) */ if (!best || /* If it's the same method as for the previous size: */ - (prev->is_cp == best->is_cp && prev->is_sdma == best->is_sdma && + (prev->is_cp == best->is_cp && prev->is_cs == best->is_cs && prev->cache_policy == best->cache_policy && prev->dwords_per_thread == best->dwords_per_thread && prev->waves_per_sh == best->waves_per_sh) || @@ -461,8 +434,6 @@ void si_test_dma_perf(struct si_screen *sscreen) if (best->is_cp) { printf("CP_DMA(%s);\n", cache_policy_str); } - if (best->is_sdma) - printf("SDMA;\n"); if (best->is_cs) { printf("COMPUTE(%s, %u, %u);\n", cache_policy_str, best->dwords_per_thread, best->waves_per_sh); diff --git a/src/gallium/drivers/radeonsi/si_texture.c b/src/gallium/drivers/radeonsi/si_texture.c index c77f4e28d45..a16f007e4ae 100644 --- a/src/gallium/drivers/radeonsi/si_texture.c +++ b/src/gallium/drivers/radeonsi/si_texture.c @@ -48,66 +48,6 @@ static enum radeon_surf_mode si_choose_tiling(struct si_screen *sscreen, static bool si_texture_is_aux_plane(const struct pipe_resource *resource); -bool si_prepare_for_dma_blit(struct si_context *sctx, struct si_texture *dst, unsigned dst_level, - unsigned dstx, unsigned dsty, unsigned dstz, struct si_texture *src, - unsigned src_level, const struct pipe_box *src_box) -{ - if (!sctx->sdma_cs.priv) - return false; - - if (dst->surface.bpe != src->surface.bpe) - return false; - - /* MSAA: Blits don't exist in the real world. */ - if (src->buffer.b.b.nr_samples > 1 || dst->buffer.b.b.nr_samples > 1) - return false; - - /* Depth-stencil surfaces: - * When dst is linear, the DB->CB copy preserves HTILE. - * When dst is tiled, the 3D path must be used to update HTILE. - */ - if (src->is_depth || dst->is_depth) - return false; - - /* DCC as: - * src: Use the 3D path. DCC decompression is expensive. - * dst: Use the 3D path to compress the pixels with DCC. - */ - if (vi_dcc_enabled(src, src_level) || vi_dcc_enabled(dst, dst_level)) - return false; - - /* TMZ: mixing encrypted and non-encrypted buffer in a single command - * doesn't seem supported. - */ - if ((src->buffer.flags & RADEON_FLAG_ENCRYPTED) != - (dst->buffer.flags & RADEON_FLAG_ENCRYPTED)) - return false; - - /* CMASK as: - * src: Both texture and SDMA paths need decompression. Use SDMA. - * dst: If overwriting the whole texture, discard CMASK and use - * SDMA. Otherwise, use the 3D path. - */ - if (dst->cmask_buffer && dst->dirty_level_mask & (1 << dst_level)) { - /* The CMASK clear is only enabled for the first level. 
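With the SDMA row removed from si_test_dma_perf, methods 0..2 are the CP DMA variants and every index from 3 up encodes one compute configuration. A sketch that unpacks the flat index the same way the loop above does (decode_method is a made-up helper; callers must pass method >= 3):

/* Decode a flat method index after the SDMA slot was removed:
 *   0..2     -> CP DMA, three cache-policy variants
 *   3 and up -> compute, packed as
 *               waves_idx * (3 * num_shaders) + policy * num_shaders + shader */
static void decode_method(unsigned method, unsigned num_shaders,
                          unsigned *waves_idx, unsigned *policy, unsigned *shader)
{
   unsigned cs = method - 3;            /* skip the CP methods */

   *waves_idx = cs / (3 * num_shaders); /* index into cs_waves_per_sh_list */
   cs %= 3 * num_shaders;
   *policy = cs / num_shaders;          /* one of the 3 cache policies */
   *shader = cs % num_shaders;          /* index into cs_dwords_per_thread_list */
}

Dropping the SDMA slot is why every "method - 4" and "method <= 2 / >= 4" pair in the file shifts down by one.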
*/ - assert(dst_level == 0); - if (!util_texrange_covers_whole_level(&dst->buffer.b.b, dst_level, dstx, dsty, dstz, - src_box->width, src_box->height, src_box->depth)) - return false; - - si_texture_discard_cmask(sctx->screen, dst); - } - - /* All requirements are met. Prepare textures for SDMA. */ - if (src->cmask_buffer && src->dirty_level_mask & (1 << src_level)) - sctx->b.flush_resource(&sctx->b, &src->buffer.b.b); - - assert(!(src->dirty_level_mask & (1 << src_level))); - assert(!(dst->dirty_level_mask & (1 << dst_level))); - - return true; -} - /* Same as resource_copy_region, except that both upsampling and downsampling are allowed. */ static void si_copy_region_with_blit(struct pipe_context *pipe, struct pipe_resource *dst, unsigned dst_level, unsigned dstx, unsigned dsty, @@ -141,7 +81,6 @@ static void si_copy_region_with_blit(struct pipe_context *pipe, struct pipe_reso /* Copy from a full GPU texture to a transfer's staging one. */ static void si_copy_to_staging_texture(struct pipe_context *ctx, struct si_transfer *stransfer) { - struct si_context *sctx = (struct si_context *)ctx; struct pipe_transfer *transfer = (struct pipe_transfer *)stransfer; struct pipe_resource *dst = &stransfer->staging->b.b; struct pipe_resource *src = transfer->resource; @@ -151,13 +90,12 @@ static void si_copy_to_staging_texture(struct pipe_context *ctx, struct si_trans return; } - sctx->dma_copy(ctx, dst, 0, 0, 0, 0, src, transfer->level, &transfer->box); + si_resource_copy_region(ctx, dst, 0, 0, 0, 0, src, transfer->level, &transfer->box); } /* Copy from a transfer's staging texture to a full GPU one. */ static void si_copy_from_staging_texture(struct pipe_context *ctx, struct si_transfer *stransfer) { - struct si_context *sctx = (struct si_context *)ctx; struct pipe_transfer *transfer = (struct pipe_transfer *)stransfer; struct pipe_resource *dst = transfer->resource; struct pipe_resource *src = &stransfer->staging->b.b; @@ -176,8 +114,8 @@ static void si_copy_from_staging_texture(struct pipe_context *ctx, struct si_tra sbox.height = util_format_get_nblocksx(dst->format, sbox.height); } - sctx->dma_copy(ctx, dst, transfer->level, transfer->box.x, transfer->box.y, transfer->box.z, src, - 0, &sbox); + si_resource_copy_region(ctx, dst, transfer->level, transfer->box.x, transfer->box.y, + transfer->box.z, src, 0, &sbox); } static unsigned si_texture_get_offset(struct si_screen *sscreen, struct si_texture *tex, @@ -479,7 +417,8 @@ static void si_reallocate_texture_inplace(struct si_context *sctx, struct si_tex u_box_3d(0, 0, 0, u_minify(templ.width0, i), u_minify(templ.height0, i), util_num_layers(&templ, i), &box); - sctx->dma_copy(&sctx->b, &new_tex->buffer.b.b, i, 0, 0, 0, &tex->buffer.b.b, i, &box); + si_resource_copy_region(&sctx->b, &new_tex->buffer.b.b, + i, 0, 0, 0, &tex->buffer.b.b, i, &box); } } @@ -1156,8 +1095,8 @@ static struct si_texture *si_texture_create_object(struct pipe_screen *screen, struct si_context *sctx = (struct si_context *)sscreen->aux_context; simple_mtx_lock(&sscreen->aux_context_lock); - si_sdma_copy_buffer(sctx, &tex->dcc_retile_buffer->b.b, &buf->b.b, 0, - 0, buf->b.b.width0); + si_copy_buffer(sctx, &tex->dcc_retile_buffer->b.b, &buf->b.b, 0, + 0, buf->b.b.width0); sscreen->aux_context->flush(sscreen->aux_context, NULL, 0); simple_mtx_unlock(&sscreen->aux_context_lock); |
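After this change both staging-texture directions go through si_resource_copy_region, i.e. an ordinary blit that the driver services with the gfx or compute engine instead of a dedicated DMA ring. The overall transfer shape, sketched against the generic gallium entry point (flush_staging_to_dst is a made-up helper; include paths follow the Mesa tree layout of the time):

#include "pipe/p_context.h"
#include "pipe/p_state.h"
#include "util/u_box.h"

/* Sketch: move data the CPU wrote into a linear staging texture into the
 * tiled destination. Which engine performs the copy is now entirely the
 * driver's choice inside resource_copy_region. */
static void flush_staging_to_dst(struct pipe_context *ctx,
                                 struct pipe_resource *dst, unsigned level,
                                 const struct pipe_box *dst_box,
                                 struct pipe_resource *staging)
{
   struct pipe_box src_box;

   /* The staging texture starts at the origin and matches the box size. */
   u_box_3d(0, 0, 0, dst_box->width, dst_box->height, dst_box->depth, &src_box);

   ctx->resource_copy_region(ctx, dst, level, dst_box->x, dst_box->y, dst_box->z,
                             staging, 0, &src_box);
}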