summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Rhys Perry <pendingchaos02@gmail.com> 2020-10-13 13:32:38 +0100
committer Marge Bot <eric+marge@anholt.net> 2020-10-15 11:33:42 +0000
commit 1a652244e4bdc0cefa907a91c81ab1efe1eafbd3 (patch)
tree a0e212628d1a748a52d0cc67a233771b8c65943c
parent 91d9c55f3a435717224dace90b6181833ca9ea8e (diff)
aco: implement 16-bit literals
We can copy any value into a 16-bit subregister with a 3 dword
v_pack_b32_f16 on GFX10 or a v_and_b32+v_or_b32 on GFX9.

Because the generated code can depend on the register assignment and to
improve constant propagation, Builder::copy creates a p_create_vector in
the case of sub-dword literals.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7111>
-rw-r--r-- src/amd/compiler/aco_builder_h.py | 2
-rw-r--r-- src/amd/compiler/aco_lower_to_hw_instr.cpp | 41
-rw-r--r-- src/amd/compiler/aco_validate.cpp | 1
-rw-r--r-- src/amd/compiler/tests/test_to_hw_instr.cpp | 113
4 files changed, 155 insertions, 2 deletions
diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py
index a6c2113f38b..9f708c4adfc 100644
--- a/src/amd/compiler/aco_builder_h.py
+++ b/src/amd/compiler/aco_builder_h.py
@@ -415,7 +415,7 @@ public:
return sop1(aco_opcode::s_mov_b64, dst, op);
} else if (dst.regClass() == v1 || dst.regClass() == v1.as_linear()) {
return vop1(aco_opcode::v_mov_b32, dst, op);
- } else if (op.bytes() > 2) {
+ } else if (op.bytes() > 2 || (op.isLiteral() && dst.regClass().is_subdword())) {
return pseudo(aco_opcode::p_create_vector, dst, op);
} else if (op.bytes() == 1 && op.isConstant()) {
uint8_t val = op.constantValue();
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index d20a239972c..a68fa02868a 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -979,6 +979,26 @@ uint32_t get_intersection_mask(int a_start, int a_size,
return u_bit_consecutive(intersection_start, intersection_end - intersection_start) & mask;
}
+void copy_16bit_literal(lower_context *ctx, Builder& bld, Definition def, Operand op)
+{
+ if (ctx->program->chip_class < GFX10) {
+ unsigned offset = def.physReg().byte() * 8u;
+ def = Definition(PhysReg(def.physReg().reg()), v1);
+ Operand def_op(def.physReg(), v1);
+ bld.vop2(aco_opcode::v_and_b32, def, Operand(~(0xffffu << offset)), def_op);
+ bld.vop2(aco_opcode::v_or_b32, def, Operand(op.constantValue() << offset), def_op);
+ } else if (def.physReg().byte() == 2) {
+ Operand def_lo(def.physReg().advance(-2), v2b);
+ Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, def, def_lo, op);
+ static_cast<VOP3A_instruction*>(instr)->opsel = 0;
+ } else {
+ assert(def.physReg().byte() == 0);
+ Operand def_hi(def.physReg().advance(2), v2b);
+ Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, def, op, def_hi);
+ static_cast<VOP3A_instruction*>(instr)->opsel = 2;
+ }
+}
+
bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool *preserve_scc, PhysReg scratch_sgpr)
{
bool did_copy = false;
@@ -1029,6 +1049,8 @@ bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool
} else {
bld.vop1(aco_opcode::v_mov_b32, def, op);
}
+ } else if (def.regClass() == v2b && op.isLiteral()) {
+ copy_16bit_literal(ctx, bld, def, op);
} else {
bld.copy(def, op);
}
@@ -1141,6 +1163,25 @@ void do_swap(lower_context *ctx, Builder& bld, const copy_operation& copy, bool
void do_pack_2x16(lower_context *ctx, Builder& bld, Definition def, Operand lo, Operand hi)
{
+ if (lo.isConstant() && hi.isConstant()) {
+ bld.copy(def, Operand(lo.constantValue() | (hi.constantValue() << 16)));
+ return;
+ } else if (lo.isLiteral() && ctx->program->chip_class < GFX10) {
+ if (def.physReg().reg() != hi.physReg().reg())
+ bld.copy(def, Operand(lo.constantValue()));
+ bld.copy(Definition(def.physReg().advance(2), v2b), hi);
+ if (def.physReg().reg() == hi.physReg().reg()) //TODO: create better code in this case with a v_lshlrev_b32+v_or_b32
+ copy_16bit_literal(ctx, bld, Definition(def.physReg(), v2b), lo);
+ return;
+ } else if (hi.isLiteral() && ctx->program->chip_class < GFX10) {
+ if (def.physReg().reg() != lo.physReg().reg())
+ bld.copy(def, Operand(hi.constantValue() << 16));
+ bld.copy(Definition(def.physReg(), v2b), lo);
+ if (def.physReg().reg() == lo.physReg().reg())
+ copy_16bit_literal(ctx, bld, Definition(def.physReg().advance(2), v2b), hi);
+ return;
+ }
+
if (ctx->program->chip_class >= GFX9) {
Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, def, lo, hi);
/* opsel: 0 = select low half, 1 = select high half. [0] = src0, [1] = src1 */
diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp
index ac4a766133b..d8886cd31b7 100644
--- a/src/amd/compiler/aco_validate.cpp
+++ b/src/amd/compiler/aco_validate.cpp
@@ -349,7 +349,6 @@ bool validate_ir(Program* program)
check(!is_subdword || !has_const_sgpr || program->chip_class >= GFX9,
"Sub-dword pseudo instructions can only take constants or SGPRs on GFX9+", instr.get());
- check(!is_subdword || !has_literal, "Sub-dword pseudo instructions cannot take literals", instr.get());
}
if (instr->opcode == aco_opcode::p_create_vector) {
diff --git a/src/amd/compiler/tests/test_to_hw_instr.cpp b/src/amd/compiler/tests/test_to_hw_instr.cpp
index 3047a3c54d2..0fe8e168aee 100644
--- a/src/amd/compiler/tests/test_to_hw_instr.cpp
+++ b/src/amd/compiler/tests/test_to_hw_instr.cpp
@@ -384,3 +384,116 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
finish_to_hw_instr_test();
}
END_TEST
+
+BEGIN_TEST(to_hw_instr.subdword_constant)
+ PhysReg v0_lo{256};
+ PhysReg v0_hi{256};
+ PhysReg v0_b1{256};
+ PhysReg v1_hi{257};
+ v0_hi.reg_b += 2;
+ v0_b1.reg_b += 1;
+ v1_hi.reg_b += 2;
+
+ for (unsigned i = GFX9; i <= GFX10; i++) {
+ if (!setup_cs(NULL, (chip_class)i))
+ continue;
+
+ /* 16-bit pack */
+ //>> p_unit_test 0
+ //! v1: %_:v[0] = v_pack_b32_f16 0.5, hi(%_:v[1][16:32])
+ bld.pseudo(aco_opcode::p_unit_test, Operand(0u));
+ bld.pseudo(aco_opcode::p_parallelcopy,
+ Definition(v0_lo, v2b), Definition(v0_hi, v2b),
+ Operand((uint16_t)0x3800), Operand(v1_hi, v2b));
+
+ //! p_unit_test 1
+ //~gfx9! v1: %_:v[0] = v_mov_b32 0x4205
+ //~gfx9! v2b: %_:v[0][16:32] = v_mov_b32 %_:v[1][16:32] dst_preserve
+ //~gfx10! v1: %_:v[0] = v_pack_b32_f16 0x4205, hi(%_:v[1][16:32])
+ bld.pseudo(aco_opcode::p_unit_test, Operand(1u));
+ bld.pseudo(aco_opcode::p_parallelcopy,
+ Definition(v0_lo, v2b), Definition(v0_hi, v2b),
+ Operand((uint16_t)0x4205), Operand(v1_hi, v2b));
+
+ //TODO: optimize this with GFX10. do_pack_2x16() isn't used in this case
+ //! p_unit_test 2
+ //~gfx9! v2b: %_:v[0][16:32] = v_mov_b32 %_:v[0][0:16] dst_preserve
+ //~gfx9! v1: %_:v[0] = v_and_b32 0xffff0000, %_:v[0]
+ //~gfx9! v1: %_:v[0] = v_or_b32 0x4205, %_:v[0]
+ //~gfx10! v2b: %_:v[0][16:32] = v_mov_b32 %_:v[0][0:16] dst_preserve
+ //~gfx10! v2b: %_:v[0][0:16] = v_pack_b32_f16 0x4205, hi(%_:v[0][16:32])
+ bld.pseudo(aco_opcode::p_unit_test, Operand(2u));
+ bld.pseudo(aco_opcode::p_parallelcopy,
+ Definition(v0_lo, v2b), Definition(v0_hi, v2b),
+ Operand((uint16_t)0x4205), Operand(v0_lo, v2b));
+
+ //! p_unit_test 3
+ //! v1: %_:v[0] = v_mov_b32 0x3c003800
+ bld.pseudo(aco_opcode::p_unit_test, Operand(3u));
+ bld.pseudo(aco_opcode::p_parallelcopy,
+ Definition(v0_lo, v2b), Definition(v0_hi, v2b),
+ Operand((uint16_t)0x3800), Operand((uint16_t)0x3c00));
+
+ //! p_unit_test 4
+ //! v1: %_:v[0] = v_mov_b32 0x43064205
+ bld.pseudo(aco_opcode::p_unit_test, Operand(4u));
+ bld.pseudo(aco_opcode::p_parallelcopy,
+ Definition(v0_lo, v2b), Definition(v0_hi, v2b),
+ Operand((uint16_t)0x4205), Operand((uint16_t)0x4306));
+
+ //! p_unit_test 5
+ //! v1: %_:v[0] = v_mov_b32 0x38004205
+ bld.pseudo(aco_opcode::p_unit_test, Operand(5u));
+ bld.pseudo(aco_opcode::p_parallelcopy,
+ Definition(v0_lo, v2b), Definition(v0_hi, v2b),
+ Operand((uint16_t)0x4205), Operand((uint16_t)0x3800));
+
+ /* 16-bit copy */
+ //! p_unit_test 6
+ //! v2b: %_:v[0][0:16] = v_add_f16 0.5, 0 dst_preserve
+ bld.pseudo(aco_opcode::p_unit_test, Operand(6u));
+ bld.pseudo(aco_opcode::p_parallelcopy,
+ Definition(v0_lo, v2b), Operand((uint16_t)0x3800));
+
+ //! p_unit_test 7
+ //~gfx9! v1: %_:v[0] = v_and_b32 0xffff0000, %_:v[0]
+ //~gfx9! v1: %_:v[0] = v_or_b32 0x4205, %_:v[0]
+ //~gfx10! v2b: %_:v[0][0:16] = v_pack_b32_f16 0x4205, hi(%_:v[0][16:32])
+ bld.pseudo(aco_opcode::p_unit_test, Operand(7u));
+ bld.pseudo(aco_opcode::p_parallelcopy,
+ Definition(v0_lo, v2b), Operand((uint16_t)0x4205));
+
+ //! p_unit_test 8
+ //~gfx9! v1: %_:v[0] = v_and_b32 0xffff, %_:v[0]
+ //~gfx9! v1: %_:v[0] = v_or_b32 0x42050000, %_:v[0]
+ //~gfx10! v2b: %_:v[0][16:32] = v_pack_b32_f16 %_:v[0][0:16], 0x4205
+ bld.pseudo(aco_opcode::p_unit_test, Operand(8u));
+ bld.pseudo(aco_opcode::p_parallelcopy,
+ Definition(v0_hi, v2b), Operand((uint16_t)0x4205));
+
+ //! p_unit_test 9
+ //! v1b: %_:v[0][8:16] = v_mov_b32 0 dst_preserve
+ //! v1b: %_:v[0][16:24] = v_mov_b32 56 dst_preserve
+ bld.pseudo(aco_opcode::p_unit_test, Operand(9u));
+ bld.pseudo(aco_opcode::p_parallelcopy,
+ Definition(v0_b1, v2b), Operand((uint16_t)0x3800));
+
+ //! p_unit_test 10
+ //! v1b: %_:v[0][8:16] = v_mov_b32 5 dst_preserve
+ //! v1b: %_:v[0][16:24] = v_mul_u32_u24 2, 33 dst_preserve
+ bld.pseudo(aco_opcode::p_unit_test, Operand(10u));
+ bld.pseudo(aco_opcode::p_parallelcopy,
+ Definition(v0_b1, v2b), Operand((uint16_t)0x4205));
+
+ /* 8-bit copy */
+ //! p_unit_test 11
+ //! v1b: %_:v[0][0:8] = v_mul_u32_u24 2, 33 dst_preserve
+ bld.pseudo(aco_opcode::p_unit_test, Operand(11u));
+ bld.pseudo(aco_opcode::p_parallelcopy,
+ Definition(v0_lo, v1b), Operand((uint8_t)0x42));
+
+ //! s_endpgm
+
+ finish_to_hw_instr_test();
+ }
+END_TEST