From 45826e42c537b22a2af82786fc076e78f4e5238a Mon Sep 17 00:00:00 2001
From: Qiang Yu
Date: Sun, 9 Apr 2023 16:14:40 +0800
Subject: ac,aco: move gfx10 ngg prim count zero workaround to nir
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This simplifies both the llvm and aco backends and removes unnecessary
workaround code where the prim count is known to be nonzero.

Reviewed-by: Timur Kristóf
Signed-off-by: Qiang Yu
Part-of: 
---
 src/amd/common/ac_nir_lower_ngg.c              |  59 +++++++++++++-
 src/amd/compiler/aco_instruction_selection.cpp |  78 +++---------------
 src/amd/llvm/ac_llvm_build.c                   | 106 -------------------------
 src/amd/llvm/ac_llvm_build.h                   |   5 --
 src/amd/llvm/ac_nir_to_llvm.c                  |  20 +++--
 5 files changed, 83 insertions(+), 185 deletions(-)

diff --git a/src/amd/common/ac_nir_lower_ngg.c b/src/amd/common/ac_nir_lower_ngg.c
index 1dcdda9ae35..27d9ca4e5c8 100644
--- a/src/amd/common/ac_nir_lower_ngg.c
+++ b/src/amd/common/ac_nir_lower_ngg.c
@@ -447,6 +447,50 @@ emit_pack_ngg_prim_exp_arg(nir_builder *b, unsigned num_vertices_per_primitives,
    return arg;
 }
 
+static void
+alloc_vertices_and_primitives_gfx10_workaround(nir_builder *b,
+                                               nir_ssa_def *num_vtx,
+                                               nir_ssa_def *num_prim)
+{
+   /* HW workaround for a GPU hang with 100% culling on GFX10.
+    * We always have to export at least 1 primitive.
+    * Export a degenerate triangle using vertex 0 for all 3 vertices.
+    *
+    * NOTE: We rely on the caller to set the vertex count also to 0 when the primitive count is 0.
+    */
+   nir_ssa_def *is_prim_cnt_0 = nir_ieq_imm(b, num_prim, 0);
+   nir_if *if_prim_cnt_0 = nir_push_if(b, is_prim_cnt_0);
+   {
+      nir_ssa_def *one = nir_imm_int(b, 1);
+      nir_alloc_vertices_and_primitives_amd(b, one, one);
+
+      nir_ssa_def *tid = nir_load_subgroup_invocation(b);
+      nir_ssa_def *is_thread_0 = nir_ieq_imm(b, tid, 0);
+      nir_if *if_thread_0 = nir_push_if(b, is_thread_0);
+      {
+         /* The vertex indices are 0, 0, 0. */
+         nir_export_amd(b, nir_imm_zero(b, 4, 32),
+                        .base = V_008DFC_SQ_EXP_PRIM,
+                        .flags = AC_EXP_FLAG_DONE,
+                        .write_mask = 1);
+
+         /* The HW culls primitives with NaN. -1 is also NaN and can save
+          * a dword in binary code by inlining constant.
+          */
+         nir_export_amd(b, nir_imm_ivec4(b, -1, -1, -1, -1),
+                        .base = V_008DFC_SQ_EXP_POS,
+                        .flags = AC_EXP_FLAG_DONE,
+                        .write_mask = 0xf);
+      }
+      nir_pop_if(b, if_thread_0);
+   }
+   nir_push_else(b, if_prim_cnt_0);
+   {
+      nir_alloc_vertices_and_primitives_amd(b, num_vtx, num_prim);
+   }
+   nir_pop_if(b, if_prim_cnt_0);
+}
+
 static void
 ngg_nogs_init_vertex_indices_vars(nir_builder *b, nir_function_impl *impl, lower_ngg_nogs_state *s)
 {
@@ -1565,7 +1609,13 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
    nir_if *if_wave_0 = nir_push_if(b, nir_ieq(b, nir_load_subgroup_id(b), nir_imm_int(b, 0)));
    {
       /* Tell the final vertex and primitive count to the HW. */
-      nir_alloc_vertices_and_primitives_amd(b, num_live_vertices_in_workgroup, num_exported_prims);
+      if (s->options->gfx_level == GFX10) {
+         alloc_vertices_and_primitives_gfx10_workaround(
+            b, num_live_vertices_in_workgroup, num_exported_prims);
+      } else {
+         nir_alloc_vertices_and_primitives_amd(
+            b, num_live_vertices_in_workgroup, num_exported_prims);
+      }
    }
    nir_pop_if(b, if_wave_0);
 
@@ -3309,7 +3359,12 @@ ngg_gs_finale(nir_builder *b, lower_ngg_gs_state *s)
    /* Allocate export space. We currently don't compact primitives, just use the maximum number.
    */
   nir_if *if_wave_0 = nir_push_if(b, nir_ieq(b, nir_load_subgroup_id(b), nir_imm_zero(b, 1, 32)));
-   nir_alloc_vertices_and_primitives_amd(b, workgroup_num_vertices, max_prmcnt);
+   {
+      if (s->options->gfx_level == GFX10)
+         alloc_vertices_and_primitives_gfx10_workaround(b, workgroup_num_vertices, max_prmcnt);
+      else
+         nir_alloc_vertices_and_primitives_amd(b, workgroup_num_vertices, max_prmcnt);
+   }
    nir_pop_if(b, if_wave_0);
 
    /* Vertex compaction. This makes sure there are no gaps between threads that export vertices. */
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 38dedd5935d..d58bdbb6eaa 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -8053,7 +8053,6 @@ emit_interp_center(isel_context* ctx, Temp dst, Temp bary, Temp pos1, Temp pos2)
 
 Temp merged_wave_info_to_mask(isel_context* ctx, unsigned i);
 Temp lanecount_to_mask(isel_context* ctx, Temp count);
-void ngg_emit_sendmsg_gs_alloc_req(isel_context* ctx, Temp vtx_cnt, Temp prm_cnt);
 
 Temp
 get_interp_param(isel_context* ctx, nir_intrinsic_op intrin,
@@ -8963,7 +8962,18 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
       assert(ctx->stage.hw == HWStage::NGG);
       Temp num_vertices = get_ssa_temp(ctx, instr->src[0].ssa);
       Temp num_primitives = get_ssa_temp(ctx, instr->src[1].ssa);
-      ngg_emit_sendmsg_gs_alloc_req(ctx, num_vertices, num_primitives);
+
+      /* Put the number of vertices and primitives into m0 for the GS_ALLOC_REQ */
+      Temp tmp =
+         bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc),
+                  num_primitives, Operand::c32(12u));
+      tmp = bld.sop2(aco_opcode::s_or_b32, bld.m0(bld.def(s1)), bld.def(s1, scc),
+                     tmp, num_vertices);
+
+      /* Request the SPI to allocate space for the primitives and vertices
+       * that will be exported by the threadgroup.
+       */
+      bld.sopp(aco_opcode::s_sendmsg, bld.m0(tmp), -1, sendmsg_gs_alloc_req);
       break;
    }
    case nir_intrinsic_gds_atomic_add_amd: {
@@ -11430,70 +11440,6 @@ merged_wave_info_to_mask(isel_context* ctx, unsigned i)
    return lanecount_to_mask(ctx, count);
 }
 
-void
-ngg_emit_sendmsg_gs_alloc_req(isel_context* ctx, Temp vtx_cnt, Temp prm_cnt)
-{
-   assert(vtx_cnt.id() && prm_cnt.id());
-
-   Builder bld(ctx->program, ctx->block);
-   Temp prm_cnt_0;
-
-   if (ctx->program->gfx_level == GFX10 &&
-       (ctx->stage.has(SWStage::GS) || ctx->program->info.has_ngg_culling)) {
-      /* Navi 1x workaround: check whether the workgroup has no output.
-       * If so, change the number of exported vertices and primitives to 1.
-       */
-      prm_cnt_0 = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), prm_cnt, Operand::zero());
-      prm_cnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(1u), prm_cnt,
-                         bld.scc(prm_cnt_0));
-      vtx_cnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(1u), vtx_cnt,
-                         bld.scc(prm_cnt_0));
-   }
-
-   /* Put the number of vertices and primitives into m0 for the GS_ALLOC_REQ */
-   Temp tmp =
-      bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), prm_cnt, Operand::c32(12u));
-   tmp = bld.sop2(aco_opcode::s_or_b32, bld.m0(bld.def(s1)), bld.def(s1, scc), tmp, vtx_cnt);
-
-   /* Request the SPI to allocate space for the primitives and vertices
-    * that will be exported by the threadgroup.
-    */
-   bld.sopp(aco_opcode::s_sendmsg, bld.m0(tmp), -1, sendmsg_gs_alloc_req);
-
-   if (prm_cnt_0.id()) {
-      /* Navi 1x workaround: export a triangle with NaN coordinates when NGG has no output.
-       * It can't have all-zero positions because that would render an undesired pixel with
-       * conservative rasterization.
-       */
-      Temp first_lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
-      Temp cond = bld.sop2(Builder::s_lshl, bld.def(bld.lm), bld.def(s1, scc),
-                           Operand::c32_or_c64(1u, ctx->program->wave_size == 64), first_lane);
-      cond = bld.sop2(Builder::s_cselect, bld.def(bld.lm), cond,
-                      Operand::zero(ctx->program->wave_size == 64 ? 8 : 4), bld.scc(prm_cnt_0));
-
-      if_context ic_prim_0;
-      begin_divergent_if_then(ctx, &ic_prim_0, cond);
-      bld.reset(ctx->block);
-      ctx->block->kind |= block_kind_export_end;
-
-      /* Use zero: means that it's a triangle whose every vertex index is 0. */
-      Temp zero = bld.copy(bld.def(v1), Operand::zero());
-      /* Use NaN for the coordinates, so that the rasterizer allways culls it. */
-      Temp nan_coord = bld.copy(bld.def(v1), Operand::c32(-1u));
-
-      bld.exp(aco_opcode::exp, zero, Operand(v1), Operand(v1), Operand(v1), 1 /* enabled mask */,
-              V_008DFC_SQ_EXP_PRIM /* dest */, false /* compressed */, true /* done */,
-              false /* valid mask */);
-      bld.exp(aco_opcode::exp, nan_coord, nan_coord, nan_coord, nan_coord, 0xf /* enabled mask */,
-              V_008DFC_SQ_EXP_POS /* dest */, false /* compressed */, true /* done */,
-              true /* valid mask */);
-
-      begin_divergent_if_else(ctx, &ic_prim_0);
-      end_divergent_if(ctx, &ic_prim_0);
-      bld.reset(ctx->block);
-   }
-}
-
 } /* end namespace */
 
 void
diff --git a/src/amd/llvm/ac_llvm_build.c b/src/amd/llvm/ac_llvm_build.c
index c1b86bc4728..dde69a0aa4a 100644
--- a/src/amd/llvm/ac_llvm_build.c
+++ b/src/amd/llvm/ac_llvm_build.c
@@ -3970,65 +3970,6 @@ void ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth, LLVMValueR
    args->enabled_channels = mask;
 }
 
-/* Send GS Alloc Req message from the first wave of the group to SPI.
- * Message payload is:
- * - bits 0..10: vertices in group
- * - bits 12..22: primitives in group
- */
-void ac_build_sendmsg_gs_alloc_req(struct ac_llvm_context *ctx, LLVMValueRef wave_id,
-                                   LLVMValueRef vtx_cnt, LLVMValueRef prim_cnt)
-{
-   LLVMBuilderRef builder = ctx->builder;
-
-   if (wave_id)
-      ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, wave_id, ctx->i32_0, ""), 5020);
-
-   /* HW workaround for a GPU hang with 100% culling on GFX10.
-    * We always have to export at least 1 primitive.
-    * Export a degenerate triangle using vertex 0 for all 3 vertices.
-    *
-    * NOTE: We rely on the caller to set the vertex count also to 0 when the primitive count is 0.
-    */
-   if (ctx->gfx_level == GFX10) {
-      ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(ctx), ctx->i32_0, ""), 5021);
-      LLVMValueRef prim_cnt_is_0 = LLVMBuildICmp(builder, LLVMIntEQ, prim_cnt, ctx->i32_0, "");
-      ac_build_ifcc(ctx, prim_cnt_is_0, 5022);
-      {
-         LLVMValueRef x = LLVMBuildShl(builder, ctx->i32_1, LLVMConstInt(ctx->i32, 12, false), "");
-         x = LLVMBuildOr(builder, x, ctx->i32_1, "");
-         ac_build_sendmsg(ctx, AC_SENDMSG_GS_ALLOC_REQ, x);
-
-         /* The vertex indices are 0, 0, 0. */
-         struct ac_ngg_prim prim = {0};
-         prim.passthrough = ctx->i32_0;
-
-         /* The HW culls primitives with NaN. */
-         struct ac_export_args pos = {0};
-         pos.out[0] = pos.out[1] = pos.out[2] = pos.out[3] = LLVMConstReal(ctx->f32, NAN);
-         pos.target = V_008DFC_SQ_EXP_POS;
-         pos.enabled_channels = 0xf;
-         pos.done = true;
-
-         ac_build_export_prim(ctx, &prim);
-         ac_build_export(ctx, &pos);
-      }
-      ac_build_else(ctx, 5022);
-   }
-
-   LLVMValueRef x = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->i32, 12, false), "");
-   x = LLVMBuildOr(builder, x, vtx_cnt, "");
-   ac_build_sendmsg(ctx, AC_SENDMSG_GS_ALLOC_REQ, x);
-
-   if (ctx->gfx_level == GFX10) {
-      ac_build_endif(ctx, 5022);
-      ac_build_endif(ctx, 5021);
-   }
-
-   if (wave_id)
-      ac_build_endif(ctx, 5020);
-}
-
-
 LLVMValueRef ac_pack_edgeflags_for_export(struct ac_llvm_context *ctx,
                                           const struct ac_shader_args *args)
 {
@@ -4044,53 +3985,6 @@ LLVMValueRef ac_pack_edgeflags_for_export(struct ac_llvm_context *ctx,
    return LLVMBuildAnd(ctx->builder, tmp, LLVMConstInt(ctx->i32, 0x20080200, 0), "");
 }
 
-LLVMValueRef ac_pack_prim_export(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim)
-{
-   /* The prim export format is:
-    * - bits 0..8: index 0
-    * - bit 9: edge flag 0
-    * - bits 10..18: index 1
-    * - bit 19: edge flag 1
-    * - bits 20..28: index 2
-    * - bit 29: edge flag 2
-    * - bit 31: null primitive (skip)
-    */
-   LLVMBuilderRef builder = ctx->builder;
-   LLVMValueRef tmp = LLVMBuildZExt(builder, prim->isnull, ctx->i32, "");
-   LLVMValueRef result = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->i32, 31, false), "");
-   result = LLVMBuildOr(ctx->builder, result, prim->edgeflags, "");
-
-   for (unsigned i = 0; i < prim->num_vertices; ++i) {
-      tmp = LLVMBuildShl(builder, prim->index[i], LLVMConstInt(ctx->i32, 10 * i, false), "");
-      result = LLVMBuildOr(builder, result, tmp, "");
-   }
-   return result;
-}
-
-void ac_build_export_prim(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim)
-{
-   struct ac_export_args args;
-
-   if (prim->passthrough) {
-      args.out[0] = prim->passthrough;
-   } else {
-      args.out[0] = ac_pack_prim_export(ctx, prim);
-   }
-
-   args.out[0] = LLVMBuildBitCast(ctx->builder, args.out[0], ctx->f32, "");
-   args.out[1] = LLVMGetUndef(ctx->f32);
-   args.out[2] = LLVMGetUndef(ctx->f32);
-   args.out[3] = LLVMGetUndef(ctx->f32);
-
-   args.target = V_008DFC_SQ_EXP_PRIM;
-   args.enabled_channels = 1;
-   args.done = true;
-   args.valid_mask = false;
-   args.compr = false;
-
-   ac_build_export(ctx, &args);
-}
-
 static LLVMTypeRef arg_llvm_type(enum ac_arg_type type, unsigned size, struct ac_llvm_context *ctx)
 {
    LLVMTypeRef base;
diff --git a/src/amd/llvm/ac_llvm_build.h b/src/amd/llvm/ac_llvm_build.h
index 299fe8ff7f4..373ded524b0 100644
--- a/src/amd/llvm/ac_llvm_build.h
+++ b/src/amd/llvm/ac_llvm_build.h
@@ -552,9 +552,6 @@ void ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth, LLVMValueR
                      LLVMValueRef samplemask, LLVMValueRef mrt0_alpha, bool is_last,
                      struct ac_export_args *args);
 
-void ac_build_sendmsg_gs_alloc_req(struct ac_llvm_context *ctx, LLVMValueRef wave_id,
-                                   LLVMValueRef vtx_cnt, LLVMValueRef prim_cnt);
-
 struct ac_ngg_prim {
    unsigned num_vertices;
    LLVMValueRef isnull;
@@ -565,8 +562,6 @@ struct ac_ngg_prim {
 
 LLVMValueRef ac_pack_edgeflags_for_export(struct ac_llvm_context *ctx,
                                           const struct ac_shader_args *args);
-LLVMValueRef ac_pack_prim_export(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim);
-void ac_build_export_prim(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim);
 
 LLVMTypeRef ac_arg_type_to_pointee_type(struct ac_llvm_context *ctx, enum ac_arg_type type);
 
diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c
index e7db71522ed..e464a55457e 100644
--- a/src/amd/llvm/ac_nir_to_llvm.c
+++ b/src/amd/llvm/ac_nir_to_llvm.c
@@ -4099,14 +4099,22 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
    case nir_intrinsic_load_workgroup_num_input_primitives_amd:
       result = ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->gs_tg_info), 22, 9);
       break;
-   case nir_intrinsic_alloc_vertices_and_primitives_amd:
-      /* The caller should only call this conditionally for wave 0, so pass NULL to disable
-       * the wave 0 check inside this function.
+   case nir_intrinsic_alloc_vertices_and_primitives_amd: {
+      /* The caller should only call this conditionally for wave 0.
+       *
+       * Send GS Alloc Req message from the first wave of the group to SPI.
+       * Message payload is:
+       * - bits 0..10: vertices in group
+       * - bits 12..22: primitives in group
        */
-      ac_build_sendmsg_gs_alloc_req(&ctx->ac, NULL,
-                                    get_src(ctx, instr->src[0]),
-                                    get_src(ctx, instr->src[1]));
+      LLVMValueRef vtx_cnt = get_src(ctx, instr->src[0]);
+      LLVMValueRef prim_cnt = get_src(ctx, instr->src[1]);
+      LLVMValueRef msg = LLVMBuildShl(ctx->ac.builder, prim_cnt,
+                                      LLVMConstInt(ctx->ac.i32, 12, false), "");
+      msg = LLVMBuildOr(ctx->ac.builder, msg, vtx_cnt, "");
+      ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_ALLOC_REQ, msg);
       break;
+   }
    case nir_intrinsic_overwrite_vs_arguments_amd:
       ctx->abi->vertex_id_replaced = get_src(ctx, instr->src[0]);
       ctx->abi->instance_id_replaced = get_src(ctx, instr->src[1]);
-- 
cgit v1.2.3
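
Editor's note, not part of the patch itself: both backends now open-code the
GS_ALLOC_REQ payload described in the comments above, with the vertex count in
bits 0..10 and the primitive count in bits 12..22. The following minimal C
sketch mirrors that packing on the CPU; the pack_gs_alloc_req helper is
hypothetical and exists only for illustration:

   #include <assert.h>
   #include <stdint.h>

   /* Hypothetical mirror of the s_lshl_b32 + s_or_b32 sequence (ACO) and the
    * LLVMBuildShl + LLVMBuildOr sequence (LLVM) in the patch: bits 0..10
    * carry the vertex count, bits 12..22 the primitive count. */
   static uint32_t pack_gs_alloc_req(uint32_t vtx_cnt, uint32_t prim_cnt)
   {
      /* Each field is 11 bits wide per the documented bit layout. */
      assert(vtx_cnt < (1u << 11) && prim_cnt < (1u << 11));
      return (prim_cnt << 12) | vtx_cnt;
   }

   int main(void)
   {
      /* The GFX10 workaround path allocates 1 vertex and 1 primitive. */
      assert(pack_gs_alloc_req(1, 1) == 0x1001u);
      return 0;
   }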
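
A related note on the degenerate-triangle export: the NIR workaround writes
the integer -1 (0xffffffff) to all four position channels because that bit
pattern, reinterpreted as a 32-bit IEEE-754 float, is a NaN, and the hardware
culls primitives with NaN positions; this is also why the removed LLVM path
used LLVMConstReal(ctx->f32, NAN). A small illustrative C check of that
bit-pattern claim, not part of the patch:

   #include <assert.h>
   #include <math.h>
   #include <stdint.h>
   #include <string.h>

   int main(void)
   {
      /* -1 as a 32-bit integer is 0xffffffff; as an IEEE-754 float that is
       * sign=1, exponent=0xff, mantissa!=0, i.e. a NaN, so the rasterizer
       * culls the degenerate triangle exported by the workaround. */
      uint32_t bits = (uint32_t)-1;
      float f;
      memcpy(&f, &bits, sizeof(f));
      assert(isnan(f));
      return 0;
   }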