diff options
author | Samuel Pitoiset <samuel.pitoiset@gmail.com> | 2020-11-24 14:56:55 +0100 |
---|---|---|
committer | Dylan Baker <dylan.c.baker@intel.com> | 2021-01-12 15:27:32 -0800 |
commit | e6aa51a84d5b979b64b46291413aa212d11baccb (patch) | |
tree | fc1377b071b65d737f62b48cf9525ad86f71add2 | |
parent | a29f08b1e204db4342375184829984c594bced86 (diff) |
radv/llvm,aco: always split typed vertex buffer loads on GFX6 and GFX10+
This avoids any alignment issues that trigger memory violations and
eventually a GPU hang. This can happen if the stride (static or dynamic)
is unaligned and also if the VBO offset is aligned to a scalar
(e.g. stride is 8 and VBO offset is 2 for R16G16B16A16_SNORM).
The AMD Windows driver also always splits typed vertex fetches.
fossils-db (Sienna Cichlid):
Totals from 56508 (40.54% of 139391) affected shaders:
SGPRs: 2643545 -> 2664516 (+0.79%); split: -0.19%, +0.98%
VGPRs: 2007472 -> 1995408 (-0.60%); split: -0.74%, +0.13%
CodeSize: 70596372 -> 73913312 (+4.70%); split: -0.00%, +4.70%
MaxWaves: 772653 -> 774916 (+0.29%); split: +0.37%, -0.08%
Instrs: 14074162 -> 14567072 (+3.50%); split: -0.00%, +3.51%
Cycles: 69281276 -> 71253252 (+2.85%); split: -0.00%, +2.85%
VMEM: 22047039 -> 25554196 (+15.91%); split: +17.20%, -1.29%
SMEM: 4120370 -> 4360820 (+5.84%); split: +7.41%, -1.58%
VClause: 416913 -> 438361 (+5.14%); split: -1.86%, +7.01%
SClause: 536739 -> 542637 (+1.10%); split: -0.33%, +1.43%
Copies: 977194 -> 970015 (-0.73%); split: -2.43%, +1.69%
Branches: 241205 -> 241193 (-0.00%); split: -0.06%, +0.06%
PreVGPRs: 1505645 -> 1505379 (-0.02%)
This fixes GPU hangs with bin/draw-vertices from Piglit on GFX10+
with Zink.
Cc: mesa-stable
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8363>
(cherry picked from commit d2f4934121f65f2f086e4babaa0704e58503234b)
-rw-r--r-- | .pick_status.json | 2 | ||||
-rw-r--r-- | src/amd/compiler/aco_instruction_selection.cpp | 10 | ||||
-rw-r--r-- | src/amd/vulkan/radv_nir_to_llvm.c | 18 |
3 files changed, 17 insertions, 13 deletions
diff --git a/.pick_status.json b/.pick_status.json index a6ee33d800a..b903f2391d1 100644 --- a/.pick_status.json +++ b/.pick_status.json @@ -1885,7 +1885,7 @@ "description": "radv/llvm,aco: always split typed vertex buffer loads on GFX6 and GFX10+", "nominated": true, "nomination_type": 0, - "resolution": 0, + "resolution": 1, "master_sha": null, "because_sha": null }, diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 7c3b492c004..c85d173d43d 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -4553,11 +4553,17 @@ void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr bool check_vertex_fetch_size(isel_context *ctx, const ac_data_format_info *vtx_info, unsigned offset, unsigned stride, unsigned channels) { - unsigned vertex_byte_size = vtx_info->chan_byte_size * channels; if (vtx_info->chan_byte_size != 4 && channels == 3) return false; + + /* Always split typed vertex buffer loads on GFX6 and GFX10+ to avoid any + * alignment issues that triggers memory violations and eventually a GPU + * hang. This can happen if the stride (static or dynamic) is unaligned and + * also if the VBO offset is aligned to a scalar (eg. stride is 8 and VBO + * offset is 2 for R16G16B16A16_SNORM). 
+ */ return (ctx->options->chip_class >= GFX7 && ctx->options->chip_class <= GFX9) || - (offset % vertex_byte_size == 0 && stride % vertex_byte_size == 0); + (channels == 1); } uint8_t get_fetch_data_format(isel_context *ctx, const ac_data_format_info *vtx_info, diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index f94411d9bd1..b591c969d2d 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -1182,17 +1182,15 @@ handle_vs_input_decl(struct radv_shader_context *ctx, t_offset = LLVMConstInt(ctx->ac.i32, attrib_binding, false); t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset); - /* Perform per-channel vertex fetch operations if unaligned - * access are detected. Only GFX6 and GFX10 are affected. + /* Always split typed vertex buffer loads on GFX6 and GFX10+ + * to avoid any alignment issues that triggers memory + * violations and eventually a GPU hang. This can happen if + * the stride (static or dynamic) is unaligned and also if the + * VBO offset is aligned to a scalar (eg. stride is 8 and VBO + * offset is 2 for R16G16B16A16_SNORM). */ - bool unaligned_vertex_fetches = false; - if ((ctx->ac.chip_class == GFX6 || ctx->ac.chip_class >= GFX10) && - vtx_info->chan_format != data_format && - ((attrib_offset % vtx_info->element_size) || - (attrib_stride % vtx_info->element_size))) - unaligned_vertex_fetches = true; - - if (unaligned_vertex_fetches) { + if (ctx->ac.chip_class == GFX6 || + ctx->ac.chip_class >= GFX10) { unsigned chan_format = vtx_info->chan_format; LLVMValueRef values[4]; |