diff options
author | Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl> | 2020-10-11 17:48:19 +0200 |
---|---|---|
committer | Marge Bot <eric+marge@anholt.net> | 2021-01-13 16:27:19 +0000 |
commit | 4a783a3c7846857671a9f2e91b62850e24e01029 (patch) | |
tree | 0957f41619d3eceb4116e56b99253aeb42f484e4 /src | |
parent | 0af86341a20259932955f0386ca2d865928ea409 (diff) |
radv: Use L2 coherency on GFX9+.
Especially on GFX10 we can avoid pretty much all L2 flushes.
However, in place of those flushes we have to do L2_METADATA
invalidations. We do that every time shaders could possibly be reading
new DCC/HTILE info from the L2 cache.
Benchmark results: Basemark on the high preset with a navi10 on profile_standard
(which is slower than a navi10 on default settings, so please don't compare
these numbers to random navi10 results you find).
before:
5932
5928
5937
after:
6011
6013
6009
So this looks like a >1% increase (average 5932 -> 6011, roughly +1.3%).
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7202>
Diffstat (limited to 'src')
-rw-r--r-- | src/amd/vulkan/radv_cmd_buffer.c | 102 | ||||
-rw-r--r-- | src/amd/vulkan/radv_private.h | 25 | ||||
-rw-r--r-- | src/amd/vulkan/si_cmd_buffer.c | 31 |
3 files changed, 94 insertions, 64 deletions
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index ef196e44a85..bddbba79dea 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -3267,20 +3267,42 @@ static void radv_stage_flush(struct radv_cmd_buffer *cmd_buffer, } } +static bool +radv_image_is_l2_coherent(const struct radv_device *device, const struct radv_image *image) +{ + if (device->physical_device->rad_info.chip_class >= GFX10) { + return !device->physical_device->rad_info.tcc_harvested; + } else if (device->physical_device->rad_info.chip_class == GFX9 && image) { + if (image->info.samples == 1 && + (image->usage & (VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | + VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT)) && + !vk_format_is_stencil(image->vk_format)) { + /* Single-sample color and single-sample depth + * (not stencil) are coherent with shaders on + * GFX9. + */ + return true; + } + } + + return false; +} + enum radv_cmd_flush_bits radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags src_flags, const struct radv_image *image) { - bool flush_CB_meta = true, flush_DB_meta = true; + bool has_CB_meta = true, has_DB_meta = true; + bool image_is_coherent = radv_image_is_l2_coherent(cmd_buffer->device, image); enum radv_cmd_flush_bits flush_bits = 0; uint32_t b; if (image) { if (!radv_image_has_CB_metadata(image)) - flush_CB_meta = false; + has_CB_meta = false; if (!radv_image_has_htile(image)) - flush_DB_meta = false; + has_DB_meta = false; } for_each_bit(b, src_flags) { @@ -3296,40 +3318,44 @@ radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer, flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB; } } - flush_bits |= RADV_CMD_FLAG_WB_L2; + if (!image_is_coherent) + flush_bits |= RADV_CMD_FLAG_WB_L2; break; case VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT: case VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT: - flush_bits |= RADV_CMD_FLAG_WB_L2; + if (!image_is_coherent) + flush_bits |= RADV_CMD_FLAG_WB_L2; break; case 
VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT: flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB; - if (flush_CB_meta) + if (has_CB_meta) flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META; break; case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT: flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB; - if (flush_DB_meta) + if (has_DB_meta) flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; break; case VK_ACCESS_TRANSFER_WRITE_BIT: flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | - RADV_CMD_FLAG_FLUSH_AND_INV_DB | - RADV_CMD_FLAG_INV_L2; + RADV_CMD_FLAG_FLUSH_AND_INV_DB; - if (flush_CB_meta) + if (!image_is_coherent) + flush_bits |= RADV_CMD_FLAG_INV_L2; + if (has_CB_meta) flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META; - if (flush_DB_meta) + if (has_DB_meta) flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; break; case VK_ACCESS_MEMORY_WRITE_BIT: - flush_bits |= RADV_CMD_FLAG_INV_L2 | - RADV_CMD_FLAG_FLUSH_AND_INV_CB | + flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB; - if (flush_CB_meta) + if (!image_is_coherent) + flush_bits |= RADV_CMD_FLAG_INV_L2; + if (has_CB_meta) flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META; - if (flush_DB_meta) + if (has_DB_meta) flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; break; default: @@ -3344,10 +3370,10 @@ radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags dst_flags, const struct radv_image *image) { - bool flush_CB_meta = true, flush_DB_meta = true; + bool has_CB_meta = true, has_DB_meta = true; enum radv_cmd_flush_bits flush_bits = 0; bool flush_CB = true, flush_DB = true; - bool image_is_coherent = false; + bool image_is_coherent = radv_image_is_l2_coherent(cmd_buffer->device, image); uint32_t b; if (image) { @@ -3357,24 +3383,9 @@ radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, } if (!radv_image_has_CB_metadata(image)) - flush_CB_meta = false; + has_CB_meta = false; if (!radv_image_has_htile(image)) - flush_DB_meta = false; - - /* TODO: implement shader coherent for GFX10 */ - - 
if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) { - if (image->info.samples == 1 && - (image->usage & (VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | - VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT)) && - !vk_format_is_stencil(image->vk_format)) { - /* Single-sample color and single-sample depth - * (not stencil) are coherent with shaders on - * GFX9. - */ - image_is_coherent = true; - } - } + has_DB_meta = false; } for_each_bit(b, dst_flags) { @@ -3390,8 +3401,12 @@ radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, case VK_ACCESS_INPUT_ATTACHMENT_READ_BIT: case VK_ACCESS_TRANSFER_READ_BIT: case VK_ACCESS_TRANSFER_WRITE_BIT: - flush_bits |= RADV_CMD_FLAG_INV_VCACHE | - RADV_CMD_FLAG_INV_L2; + flush_bits |= RADV_CMD_FLAG_INV_VCACHE; + + if (has_CB_meta || has_DB_meta) + flush_bits |= RADV_CMD_FLAG_INV_L2_METADATA; + if (!image_is_coherent) + flush_bits |= RADV_CMD_FLAG_INV_L2; break; case VK_ACCESS_SHADER_READ_BIT: flush_bits |= RADV_CMD_FLAG_INV_VCACHE; @@ -3400,6 +3415,8 @@ radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, if (!cmd_buffer->device->physical_device->use_llvm && !image) flush_bits |= RADV_CMD_FLAG_INV_SCACHE; + if (has_CB_meta || has_DB_meta) + flush_bits |= RADV_CMD_FLAG_INV_L2_METADATA; if (!image_is_coherent) flush_bits |= RADV_CMD_FLAG_INV_L2; break; @@ -3409,28 +3426,29 @@ radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT: if (flush_CB) flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB; - if (flush_CB_meta) + if (has_CB_meta) flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META; break; case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT: case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT: if (flush_DB) flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB; - if (flush_DB_meta) + if (has_DB_meta) flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; break; case VK_ACCESS_MEMORY_READ_BIT: case VK_ACCESS_MEMORY_WRITE_BIT: flush_bits |= RADV_CMD_FLAG_INV_VCACHE | - RADV_CMD_FLAG_INV_SCACHE | - 
RADV_CMD_FLAG_INV_L2; + RADV_CMD_FLAG_INV_SCACHE; + if (!image_is_coherent) + flush_bits |= RADV_CMD_FLAG_INV_L2; if (flush_CB) flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB; - if (flush_CB_meta) + if (has_CB_meta) flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META; if (flush_DB) flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB; - if (flush_DB_meta) + if (has_DB_meta) flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; break; default: diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index dcf4d27dbcc..be4920d73ff 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -1066,20 +1066,23 @@ enum radv_cmd_flush_bits { * Only used for coherency with non-L2 clients like CB, DB, CP on GFX6-8. * GFX6-7 will do complete invalidation, because the writeback is unsupported. */ RADV_CMD_FLAG_WB_L2 = 1 << 4, + /* Invalidate the metadata cache. To be used when the DCC/HTILE metadata + * changed and we want to read an image from shaders. */ + RADV_CMD_FLAG_INV_L2_METADATA = 1 << 5, /* Framebuffer caches */ - RADV_CMD_FLAG_FLUSH_AND_INV_CB_META = 1 << 5, - RADV_CMD_FLAG_FLUSH_AND_INV_DB_META = 1 << 6, - RADV_CMD_FLAG_FLUSH_AND_INV_DB = 1 << 7, - RADV_CMD_FLAG_FLUSH_AND_INV_CB = 1 << 8, + RADV_CMD_FLAG_FLUSH_AND_INV_CB_META = 1 << 6, + RADV_CMD_FLAG_FLUSH_AND_INV_DB_META = 1 << 7, + RADV_CMD_FLAG_FLUSH_AND_INV_DB = 1 << 8, + RADV_CMD_FLAG_FLUSH_AND_INV_CB = 1 << 9, /* Engine synchronization. */ - RADV_CMD_FLAG_VS_PARTIAL_FLUSH = 1 << 9, - RADV_CMD_FLAG_PS_PARTIAL_FLUSH = 1 << 10, - RADV_CMD_FLAG_CS_PARTIAL_FLUSH = 1 << 11, - RADV_CMD_FLAG_VGT_FLUSH = 1 << 12, + RADV_CMD_FLAG_VS_PARTIAL_FLUSH = 1 << 10, + RADV_CMD_FLAG_PS_PARTIAL_FLUSH = 1 << 11, + RADV_CMD_FLAG_CS_PARTIAL_FLUSH = 1 << 12, + RADV_CMD_FLAG_VGT_FLUSH = 1 << 13, /* Pipeline query controls. 
*/ - RADV_CMD_FLAG_START_PIPELINE_STATS = 1 << 13, - RADV_CMD_FLAG_STOP_PIPELINE_STATS = 1 << 14, - RADV_CMD_FLAG_VGT_STREAMOUT_SYNC = 1 << 15, + RADV_CMD_FLAG_START_PIPELINE_STATS = 1 << 14, + RADV_CMD_FLAG_STOP_PIPELINE_STATS = 1 << 15, + RADV_CMD_FLAG_VGT_STREAMOUT_SYNC = 1 << 16, RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER = (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META | diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c index 28d4a01a996..4ba23c5b8d2 100644 --- a/src/amd/vulkan/si_cmd_buffer.c +++ b/src/amd/vulkan/si_cmd_buffer.c @@ -1103,12 +1103,9 @@ gfx10_cs_emit_cache_flush(struct radeon_cmdbuf *cs, S_586_GLM_WB(1) | S_586_GLM_INV(1); *sqtt_flush_bits |= RGP_FLUSH_FLUSH_L2; - } - - /* TODO: Implement this new flag for GFX9+. - else if (flush_bits & RADV_CMD_FLAG_INV_L2_METADATA) + } else if (flush_bits & RADV_CMD_FLAG_INV_L2_METADATA) { gcr_cntl |= S_586_GLM_INV(1) | S_586_GLM_WB(1); - */ + } if (flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB)) { /* TODO: trigger on RADV_CMD_FLAG_FLUSH_AND_INV_CB_META */ @@ -1356,11 +1353,18 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs, *sqtt_flush_bits |= RGP_FLUSH_CS_PARTIAL_FLUSH; } - if (chip_class == GFX9 && flush_cb_db) { + if (chip_class == GFX9 && + (flush_cb_db || (flush_bits & RADV_CMD_FLAG_INV_L2_METADATA))) { unsigned cb_db_event, tc_flags; /* Set the CB/DB flush event. */ - cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT; + if (flush_cb_db) { + cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT; + } else { + /* Besides the CB the only other thing writing HTILE + * or DCC metadata are our meta compute shaders. */ + cb_db_event = V_028A90_CS_DONE; + } /* These are the only allowed combinations. If you need to * do multiple operations at once, do them separately. @@ -1374,11 +1378,12 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs, * TC | TC_MD = writeback & invalidate L2 metadata (DCC, etc.) 
* TCL1 = invalidate L1 */ - tc_flags = EVENT_TC_ACTION_ENA | - EVENT_TC_MD_ACTION_ENA; + tc_flags = 0; - *sqtt_flush_bits |= RGP_FLUSH_FLUSH_CB | RGP_FLUSH_INVAL_CB | - RGP_FLUSH_FLUSH_DB | RGP_FLUSH_INVAL_DB; + if (flush_cb_db) { + *sqtt_flush_bits |= RGP_FLUSH_FLUSH_CB | RGP_FLUSH_INVAL_CB | + RGP_FLUSH_FLUSH_DB | RGP_FLUSH_INVAL_DB; + } /* Ideally flush TC together with CB/DB. */ if (flush_bits & RADV_CMD_FLAG_INV_L2) { @@ -1393,7 +1398,11 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs, RADV_CMD_FLAG_INV_VCACHE); *sqtt_flush_bits |= RGP_FLUSH_INVAL_L2; + } else if (flush_bits & RADV_CMD_FLAG_INV_L2_METADATA) { + tc_flags = EVENT_TC_ACTION_ENA | + EVENT_TC_MD_ACTION_ENA; } + assert(flush_cnt); (*flush_cnt)++; |