summary | refs | log | tree | commit | diff
path: root/src/amd/compiler/aco_lower_to_hw_instr.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/amd/compiler/aco_lower_to_hw_instr.cpp')
-rw-r--r-- src/amd/compiler/aco_lower_to_hw_instr.cpp 41
1 files changed, 41 insertions, 0 deletions
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index d20a239972c..a68fa02868a 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -979,6 +979,26 @@ uint32_t get_intersection_mask(int a_start, int a_size,
return u_bit_consecutive(intersection_start, intersection_end - intersection_start) & mask;
}
+void copy_16bit_literal(lower_context *ctx, Builder& bld, Definition def, Operand op) /* Emits, via bld, instructions that write the constant in op into the 16-bit location named by def; the sequence chosen depends on ctx->program->chip_class and on def's byte offset. */
+{
+ if (ctx->program->chip_class < GFX10) { /* pre-GFX10 path: mask-and-or on the full 32-bit register */
+ unsigned offset = def.physReg().byte() * 8u; /* byte offset of def within its register, converted to a bit offset */
+ def = Definition(PhysReg(def.physReg().reg()), v1); /* re-target def at the containing register with class v1 (presumably byte offset 0 -- confirm PhysReg ctor) */
+ Operand def_op(def.physReg(), v1); /* the same register read back as a source operand */
+ bld.vop2(aco_opcode::v_and_b32, def, Operand(~(0xffffu << offset)), def_op); /* clear the 16 destination bits, keep the other half */
+ bld.vop2(aco_opcode::v_or_b32, def, Operand(op.constantValue() << offset), def_op); /* OR in the constant at the destination bit offset; NOTE(review): assumes constantValue() has no bits above bit 15 -- confirm */
+ } else if (def.physReg().byte() == 2) { /* GFX10 and newer, destination at byte offset 2 */
+ Operand def_lo(def.physReg().advance(-2), v2b); /* operand for the 2 bytes immediately below def (kept as src0) */
+ Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, def, def_lo, op); /* pack with src0 = existing lower half, src1 = literal */
+ static_cast<VOP3A_instruction*>(instr)->opsel = 0; /* no opsel bits set for either source */
+ } else { /* GFX10 and newer, destination at byte offset 0 */
+ assert(def.physReg().byte() == 0); /* only byte offsets 0 and 2 are expected to reach this function */
+ Operand def_hi(def.physReg().advance(2), v2b); /* operand for the 2 bytes immediately above def (kept as src1) */
+ Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, def, op, def_hi); /* pack with src0 = literal, src1 = existing upper half */
+ static_cast<VOP3A_instruction*>(instr)->opsel = 2; /* sets opsel bit 1, i.e. the bit for src1 (def_hi); presumably selects that source's high half -- confirm against the VOP3 opsel encoding */
+ }
+}
+
bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool *preserve_scc, PhysReg scratch_sgpr)
{
bool did_copy = false;
@@ -1029,6 +1049,8 @@ bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool
} else {
bld.vop1(aco_opcode::v_mov_b32, def, op);
}
+ } else if (def.regClass() == v2b && op.isLiteral()) {
+ copy_16bit_literal(ctx, bld, def, op);
} else {
bld.copy(def, op);
}
@@ -1141,6 +1163,25 @@ void do_swap(lower_context *ctx, Builder& bld, const copy_operation& copy, bool
void do_pack_2x16(lower_context *ctx, Builder& bld, Definition def, Operand lo, Operand hi)
{
+ if (lo.isConstant() && hi.isConstant()) {
+ bld.copy(def, Operand(lo.constantValue() | (hi.constantValue() << 16)));
+ return;
+ } else if (lo.isLiteral() && ctx->program->chip_class < GFX10) {
+ if (def.physReg().reg() != hi.physReg().reg())
+ bld.copy(def, Operand(lo.constantValue()));
+ bld.copy(Definition(def.physReg().advance(2), v2b), hi);
+ if (def.physReg().reg() == hi.physReg().reg()) //TODO: create better code in this case with a v_lshlrev_b32+v_or_b32
+ copy_16bit_literal(ctx, bld, Definition(def.physReg(), v2b), lo);
+ return;
+ } else if (hi.isLiteral() && ctx->program->chip_class < GFX10) {
+ if (def.physReg().reg() != lo.physReg().reg())
+ bld.copy(def, Operand(hi.constantValue() << 16));
+ bld.copy(Definition(def.physReg(), v2b), lo);
+ if (def.physReg().reg() == lo.physReg().reg())
+ copy_16bit_literal(ctx, bld, Definition(def.physReg().advance(2), v2b), hi);
+ return;
+ }
+
if (ctx->program->chip_class >= GFX9) {
Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, def, lo, hi);
/* opsel: 0 = select low half, 1 = select high half. [0] = src0, [1] = src1 */