/*
 * Copyright © 2018 Valve Corporation
 * Copyright © 2018 Google
 *
 * SPDX-License-Identifier: MIT
 */

#include "aco_builder.h"
#include "aco_ir.h"

#include <algorithm>
#include <vector>

/*
 * Insert p_start_linear_vgpr instructions right before RA to correctly allocate
 * temporaries for reductions that have to disrespect EXEC by executing in
 * WWM.
 */

namespace aco {

namespace {

using instr_iterator = std::vector<aco_ptr<Instruction>>::iterator;

/* Returns true for the instructions this pass assigns a linear VGPR temporary to:
 * pseudo reductions, p_interp_gfx11 and p_bpermute_permlane.
 */
bool
uses_linear_vgpr_tmp(const Instruction* instr)
{
   return instr->format == Format::PSEUDO_REDUCTION ||
          instr->opcode == aco_opcode::p_interp_gfx11 ||
          instr->opcode == aco_opcode::p_bpermute_permlane;
}

/* Decides whether a reduction additionally needs the second ("vector") temporary,
 * based on the reduce op, the cluster size and the hardware generation.
 * Always false for instructions that are not reductions.
 */
bool
reduction_needs_vtmp(Program* program, Instruction* instr)
{
   if (!instr->isReduction())
      return false;

   ReduceOp op = instr->reduction().reduce_op;
   unsigned cluster_size = instr->reduction().cluster_size;

   bool need_vtmp = op == imul32 || op == fadd64 || op == fmul64 || op == fmin64 ||
                    op == fmax64 || op == umin64 || op == umax64 || op == imin64 ||
                    op == imax64 || op == imul64;
   bool gfx10_need_vtmp = op == imul8 || op == imax8 || op == imin8 || op == umin8 ||
                          op == imul16 || op == imax16 || op == imin16 || op == umin16 ||
                          op == iadd64;

   if (program->gfx_level >= GFX10 && cluster_size == 64)
      need_vtmp = true;
   if (program->gfx_level >= GFX10 && gfx10_need_vtmp)
      need_vtmp = true;
   if (program->gfx_level <= GFX7)
      need_vtmp = true;

   need_vtmp |= cluster_size == 32;
   return need_vtmp;
}

/* Creates a p_start_linear_vgpr defining `tmp` and finds the right place to insert it.
 *
 * - If `block` is itself the last top-level block, the instruction is inserted right
 *   before `*it`; `it` is then advanced so that it refers to the same instruction as
 *   before the call. Returns false.
 * - Otherwise it is inserted after the p_logical_end of the last top-level block
 *   (or at the start of that block if no p_logical_end is found). `it` is left
 *   untouched because a different instruction vector is modified. Returns true.
 *
 * The return value tells the caller whether the temporary now lives in the last
 * top-level block and may be reused by later instructions.
 */
bool
insert_start_linear_vgpr(Program* program, Block& block, unsigned last_top_level_block_idx,
                         instr_iterator& it, Temp tmp)
{
   aco_ptr<Instruction> create{
      create_instruction(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
   create->definitions[0] = Definition(tmp);

   if (last_top_level_block_idx == block.index) {
      /* insert right before the current instruction */
      it = block.instructions.insert(it, std::move(create));
      ++it;
      return false;
   }

   assert(last_top_level_block_idx < block.index);
   /* insert after p_logical_end of the last top-level block */
   std::vector<aco_ptr<Instruction>>& instructions =
      program->blocks[last_top_level_block_idx].instructions;
   auto insert_point =
      std::find_if(instructions.rbegin(), instructions.rend(),
                   [](const auto& iter) { return iter->opcode == aco_opcode::p_logical_end; })
         .base();
   instructions.insert(insert_point, std::move(create));
   return true;
}

} /* end namespace */

/* Assigns linear VGPR temporaries to every pseudo reduction, p_interp_gfx11 and
 * p_bpermute_permlane instruction of the program.
 *
 * For each such instruction a p_start_linear_vgpr is emitted (either right before the
 * instruction or in the last top-level block) and the temporary is written into the
 * instruction's operands, marked late-kill. Temporaries that were started in a
 * top-level block are ended with a p_end_linear_vgpr after the phis of the next
 * top-level block.
 *
 * Does nothing if the program contains none of these instructions.
 */
void
setup_reduce_temp(Program* program)
{
   unsigned last_top_level_block_idx = 0;
   unsigned maxSize = 0;

   /* First pass: record which blocks need a temporary and the largest size required. */
   std::vector<bool> hasReductions(program->blocks.size());
   for (Block& block : program->blocks) {
      for (aco_ptr<Instruction>& instr : block.instructions) {
         if (instr->opcode == aco_opcode::p_interp_gfx11 ||
             instr->opcode == aco_opcode::p_bpermute_permlane) {
            maxSize = MAX2(maxSize, 1);
            hasReductions[block.index] = true;
         } else if (instr->format == Format::PSEUDO_REDUCTION) {
            maxSize = MAX2(maxSize, instr->operands[0].size());
            hasReductions[block.index] = true;
         }
      }
   }

   if (maxSize == 0)
      return;

   assert(maxSize == 1 || maxSize == 2);
   Temp reduceTmp(0, RegClass(RegType::vgpr, maxSize).as_linear());
   Temp vtmp(0, RegClass(RegType::vgpr, maxSize).as_linear());
   /* Index of the top-level block in which the respective temporary was started,
    * or -1 if there is currently no reusable temporary.
    */
   int inserted_at = -1;
   int vtmp_inserted_at = -1;

   for (Block& block : program->blocks) {
      if (block.kind & block_kind_top_level) {
         last_top_level_block_idx = block.index;

         /* TODO: this could be improved in this case:
          *    start_linear_vgpr
          *    if (...) {
          *       use_linear_vgpr
          *    }
          *    end_linear_vgpr
          * Here, the linear vgpr is used before any phi copies, so this isn't necessary.
          */
         if (inserted_at >= 0) {
            aco_ptr<Instruction> end{create_instruction(
               aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vtmp_inserted_at >= 0 ? 2 : 1, 0)};
            end->operands[0] = Operand(reduceTmp);
            end->operands[0].setLateKill(true);
            if (vtmp_inserted_at >= 0) {
               end->operands[1] = Operand(vtmp);
               end->operands[1].setLateKill(true);
            }
            /* insert after the phis of the block; the end() check keeps the scan in
             * bounds for a block that consists of phis only (or is empty)
             */
            instr_iterator it = block.instructions.begin();
            while (it != block.instructions.end() &&
                   ((*it)->opcode == aco_opcode::p_linear_phi ||
                    (*it)->opcode == aco_opcode::p_phi))
               ++it;
            block.instructions.insert(it, std::move(end));
            inserted_at = vtmp_inserted_at = -1;
         }
      }

      if (!hasReductions[block.index])
         continue;

      for (instr_iterator it = block.instructions.begin(); it != block.instructions.end();
           ++it) {
         /* The raw pointer stays valid across the insertions below: they only move the
          * owning aco_ptr elements inside the vector, not the instruction itself.
          */
         Instruction* instr = (*it).get();
         if (!uses_linear_vgpr_tmp(instr))
            continue;

         if ((int)last_top_level_block_idx != inserted_at) {
            reduceTmp = program->allocateTmp(reduceTmp.regClass());
            /* inserted_at is intentionally not updated when the temporary is started
             * inside the current block, so later blocks would insert at the end
             * instead of using this one.
             */
            if (insert_start_linear_vgpr(program, block, last_top_level_block_idx, it,
                                         reduceTmp))
               inserted_at = last_top_level_block_idx;
         }

         /* same as before, except for the vector temporary instead of the reduce temporary */
         const bool need_vtmp = reduction_needs_vtmp(program, instr);
         if (need_vtmp && (int)last_top_level_block_idx != vtmp_inserted_at) {
            vtmp = program->allocateTmp(vtmp.regClass());
            if (insert_start_linear_vgpr(program, block, last_top_level_block_idx, it, vtmp))
               vtmp_inserted_at = last_top_level_block_idx;
         }

         if (instr->isReduction()) {
            instr->operands[1] = Operand(reduceTmp);
            instr->operands[1].setLateKill(true);
            if (need_vtmp) {
               instr->operands[2] = Operand(vtmp);
               instr->operands[2].setLateKill(true);
            }
         } else {
            assert(instr->opcode == aco_opcode::p_interp_gfx11 ||
                   instr->opcode == aco_opcode::p_bpermute_permlane);
            instr->operands[0] = Operand(reduceTmp);
            instr->operands[0].setLateKill(true);
         }
      }
   }
}

} // namespace aco