summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Daniel Schürmann <daniel@schuermann.dev> 2021-02-05 18:25:18 +0100
committer: Dylan Baker <dylan.c.baker@intel.com> 2021-02-23 10:13:31 -0800
commit: 17aac7926a8bf06873fc8b9b5a03a3508131714f (patch)
tree: ea8d6ee3bf7df6c66a005cfa516ae2050daf22f5
parent: a0b7f2318517124778a489e4ae0b970f89a677d8 (diff)
aco: fix shared VGPR allocation on RDNA2
VGPRs are now allocated in blocks of 8 normal or 16 shared VGPRs, respectively.

Fixes: 14a5021aff661a26d76f330fec55d400d35443a8 ('aco/gfx10: Refactor of GFX10 wave64 bpermute.')
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8921>
(cherry picked from commit bacc3b36f55ed1f0fbb8a7efdfb072a0f3ee4ee1)
-rw-r--r-- .pick_status.json 2
-rw-r--r-- src/amd/compiler/aco_instruction_selection.cpp 5
2 files changed, 4 insertions, 3 deletions
diff --git a/.pick_status.json b/.pick_status.json
index c9e47ea2620..52c59a1ed07 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -5836,7 +5836,7 @@
"description": "aco: fix shared VGPR allocation on RDNA2",
"nominated": true,
"nomination_type": 1,
- "resolution": 0,
+ "resolution": 1,
"master_sha": null,
"because_sha": "14a5021aff661a26d76f330fec55d400d35443a8"
},
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 9824324aa0a..142c75c2b48 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -196,8 +196,9 @@ static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data
/* GFX10 wave64 mode: emulate full-wave bpermute */
if (!ctx->has_gfx10_wave64_bpermute) {
ctx->has_gfx10_wave64_bpermute = true;
- ctx->program->config->num_shared_vgprs = 8; /* Shared VGPRs are allocated in groups of 8 */
- ctx->program->vgpr_limit -= 4; /* We allocate 8 shared VGPRs, so we'll have 4 fewer normal VGPRs */
+ /* Shared VGPRs are allocated in groups of 8/16 */
+ ctx->program->config->num_shared_vgprs = ctx->program->chip_class >= GFX10_3 ? 16 : 8;
+ ctx->program->vgpr_limit -= ctx->program->chip_class >= GFX10_3 ? 8 : 4;
}
Temp index_is_lo = bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand(31u), index);