summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author Daniel Schürmann <daniel@schuermann.dev> 2020-07-20 12:07:55 +0200
committer Eric Engestrom <eric@engestrom.ch> 2020-08-05 22:23:19 +0200
commit 2646170bcdc95eff636d48dc07d7e5863fb54796 (patch)
tree 1382a5645e9622b3df1e1520500097f689ace753 /src
parent 3c7d8b7f7ee97a4aa031c99f9391fad9fbe86d02 (diff)
aco: fix scratch loads which cross element_size boundaries
Previously, we set element_size == 16, which caused loads from packed vec3 arrays to cross the element boundary and return wrong data. This patch sets element_size = 4 and splits loads into single channels.
Fixes all of dEQP-VK.subgroups.ballot_broadcast.*
Cc: 20.1 <mesa-stable@lists.freedesktop.org>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5977>
(cherry picked from commit 7015d2c249e1f7814bf5681ccd049e49e4d6495c)
Diffstat (limited to 'src')
-rw-r--r-- src/amd/compiler/aco_instruction_selection.cpp 25
1 files changed, 14 insertions, 11 deletions
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index af838f0217e..0d50ca1149c 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -3194,7 +3194,9 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info)
/* align offset down if needed */
Operand aligned_offset = offset;
+ unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
if (need_to_align_offset) {
+ align = 4;
Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
if (offset.isConstant()) {
aligned_offset = Operand(offset.constantValue() & 0xfffffffcu);
@@ -3214,7 +3216,6 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info)
Temp aligned_offset_tmp = aligned_offset.isTemp() ? aligned_offset.getTemp() :
bld.copy(bld.def(s1), aligned_offset);
- unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
Temp val = callback(bld, info, aligned_offset_tmp, bytes_needed, align,
reduced_const_offset, byte_align ? Temp() : info->dst);
@@ -3279,7 +3280,7 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info)
if (num_tmps > 1) {
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)};
- for (unsigned i = 0; i < num_vals; i++)
+ for (unsigned i = 0; i < num_tmps; i++)
vec->operands[i] = Operand(tmp[i]);
tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size));
vec->definitions[0] = Definition(tmp[0]);
@@ -3478,10 +3479,10 @@ Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info,
unsigned bytes_size = 0;
aco_opcode op;
- if (bytes_needed == 1) {
+ if (bytes_needed == 1 || align_ % 2) {
bytes_size = 1;
op = aco_opcode::buffer_load_ubyte;
- } else if (bytes_needed == 2) {
+ } else if (bytes_needed == 2 || align_ % 4) {
bytes_size = 2;
op = aco_opcode::buffer_load_ushort;
} else if (bytes_needed <= 4) {
@@ -3507,7 +3508,7 @@ Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info,
mubuf->barrier = info->barrier;
mubuf->can_reorder = info->can_reorder;
mubuf->offset = const_offset;
- RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4));
+ RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
mubuf->definitions[0] = Definition(val);
bld.insert(std::move(mubuf));
@@ -3519,6 +3520,7 @@ Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info,
}
static auto emit_mubuf_load = emit_load<mubuf_load_callback, true, true, 4096>;
+static auto emit_scratch_load = emit_load<mubuf_load_callback, false, true, 4096>;
Temp get_gfx6_global_rsrc(Builder& bld, Temp addr)
{
@@ -6741,7 +6743,7 @@ Temp get_scratch_resource(isel_context *ctx)
scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand(0u));
uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) |
- S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);;
+ S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);
if (ctx->program->chip_class >= GFX10) {
rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
@@ -6752,9 +6754,9 @@ Temp get_scratch_resource(isel_context *ctx)
S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
}
- /* older generations need element size = 16 bytes. element size removed in GFX9 */
+ /* older generations need element size = 4 bytes. element size removed in GFX9 */
if (ctx->program->chip_class <= GFX8)
- rsrc_conf |= S_008F0C_ELEMENT_SIZE(3);
+ rsrc_conf |= S_008F0C_ELEMENT_SIZE(1);
return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf));
}
@@ -6769,10 +6771,10 @@ void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
instr->dest.ssa.bit_size / 8u, rsrc};
info.align_mul = nir_intrinsic_align_mul(instr);
info.align_offset = nir_intrinsic_align_offset(instr);
- info.swizzle_component_size = 16;
+ info.swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 0;
info.can_reorder = false;
info.soffset = ctx->program->scratch_offset;
- emit_mubuf_load(ctx, bld, &info);
+ emit_scratch_load(ctx, bld, &info);
}
void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
@@ -6787,8 +6789,9 @@ void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
unsigned write_count = 0;
Temp write_datas[32];
unsigned offsets[32];
+ unsigned swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 16;
split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask,
- 16, &write_count, write_datas, offsets);
+ swizzle_component_size, &write_count, write_datas, offsets);
for (unsigned i = 0; i < write_count; i++) {
aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());