diff options
Diffstat (limited to 'src/gallium/drivers/radeonsi/si_shader.c')
-rw-r--r-- | src/gallium/drivers/radeonsi/si_shader.c | 3625 |
1 files changed, 2564 insertions, 1061 deletions
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 4bc70ce9a22..45a3dba6a9d 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -1,80 +1,62 @@ /* * Copyright 2012 Advanced Micro Devices, Inc. - * All Rights Reserved. * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * on the rights to use, copy, modify, merge, publish, distribute, sub - * license, and/or sell copies of the Software, and to permit persons to whom - * the Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. + * SPDX-License-Identifier: MIT */ -#include "ac_exp_param.h" +#include "si_shader.h" +#include "ac_nir.h" #include "ac_rtld.h" -#include "compiler/nir/nir.h" -#include "compiler/nir/nir_serialize.h" +#include "nir.h" +#include "nir_builder.h" +#include "nir_serialize.h" +#include "nir_xfb_info.h" #include "si_pipe.h" #include "si_shader_internal.h" #include "sid.h" #include "tgsi/tgsi_from_mesa.h" -#include "tgsi/tgsi_strings.h" #include "util/u_memory.h" +#include "util/mesa-sha1.h" +#include "util/ralloc.h" +#include "util/u_upload_mgr.h" static const char scratch_rsrc_dword0_symbol[] = "SCRATCH_RSRC_DWORD0"; static const char scratch_rsrc_dword1_symbol[] = "SCRATCH_RSRC_DWORD1"; static void si_dump_shader_key(const struct si_shader *shader, FILE *f); +static void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader); + +/* Get the number of all interpolated inputs */ +unsigned si_get_ps_num_interp(struct si_shader *ps) +{ + unsigned num_colors = !!(ps->info.ps_colors_read & 0x0f) + !!(ps->info.ps_colors_read & 0xf0); + unsigned num_interp = + ps->info.num_ps_inputs + (ps->key.ps.part.prolog.color_two_side ? num_colors : 0); + + assert(num_interp <= 32); + return MIN2(num_interp, 32); +} /** Whether the shader runs as a combination of multiple API shaders */ bool si_is_multi_part_shader(struct si_shader *shader) { - if (shader->selector->screen->info.chip_class <= GFX8) + if (shader->selector->screen->info.gfx_level <= GFX8 || + shader->selector->stage > MESA_SHADER_GEOMETRY) return false; - return shader->key.as_ls || shader->key.as_es || - shader->selector->info.stage == MESA_SHADER_TESS_CTRL || - shader->selector->info.stage == MESA_SHADER_GEOMETRY; + return shader->key.ge.as_ls || shader->key.ge.as_es || + shader->selector->stage == MESA_SHADER_TESS_CTRL || + shader->selector->stage == MESA_SHADER_GEOMETRY; } /** Whether the shader runs on a merged HW stage (LSHS or ESGS) */ bool si_is_merged_shader(struct si_shader *shader) { - return shader->key.as_ngg || si_is_multi_part_shader(shader); -} - -/** - * Returns a unique index for a per-patch semantic name and index. The index - * must be less than 32, so that a 32-bit bitmask of used inputs or outputs - * can be calculated. - */ -unsigned si_shader_io_get_unique_index_patch(unsigned semantic) -{ - switch (semantic) { - case VARYING_SLOT_TESS_LEVEL_OUTER: - return 0; - case VARYING_SLOT_TESS_LEVEL_INNER: - return 1; - default: - if (semantic >= VARYING_SLOT_PATCH0 && semantic < VARYING_SLOT_PATCH0 + 30) - return 2 + (semantic - VARYING_SLOT_PATCH0); + if (shader->selector->stage > MESA_SHADER_GEOMETRY || shader->is_gs_copy_shader) + return false; - assert(!"invalid semantic"); - return 0; - } + return shader->key.ge.as_ngg || si_is_multi_part_shader(shader); } /** @@ -82,47 +64,32 @@ unsigned si_shader_io_get_unique_index_patch(unsigned semantic) * less than 64, so that a 64-bit bitmask of used inputs or outputs can be * calculated. */ -unsigned si_shader_io_get_unique_index(unsigned semantic, bool is_varying) +unsigned si_shader_io_get_unique_index(unsigned semantic) { switch (semantic) { case VARYING_SLOT_POS: - return 0; + return SI_UNIQUE_SLOT_POS; default: - /* Since some shader stages use the highest used IO index - * to determine the size to allocate for inputs/outputs - * (in LDS, tess and GS rings). GENERIC should be placed right - * after POSITION to make that size as small as possible. - */ if (semantic >= VARYING_SLOT_VAR0 && semantic <= VARYING_SLOT_VAR31) - return 1 + (semantic - VARYING_SLOT_VAR0); /* 1..32 */ + return SI_UNIQUE_SLOT_VAR0 + (semantic - VARYING_SLOT_VAR0); - /* Put 16-bit GLES varyings after 32-bit varyings. They can use the same indices as - * legacy desktop GL varyings because they are mutually exclusive. - */ if (semantic >= VARYING_SLOT_VAR0_16BIT && semantic <= VARYING_SLOT_VAR15_16BIT) - return 33 + (semantic - VARYING_SLOT_VAR0_16BIT); /* 33..48 */ + return SI_UNIQUE_SLOT_VAR0_16BIT + (semantic - VARYING_SLOT_VAR0_16BIT); assert(!"invalid generic index"); return 0; /* Legacy desktop GL varyings. */ case VARYING_SLOT_FOGC: - return 33; + return SI_UNIQUE_SLOT_FOGC; case VARYING_SLOT_COL0: - return 34; + return SI_UNIQUE_SLOT_COL0; case VARYING_SLOT_COL1: - return 35; + return SI_UNIQUE_SLOT_COL1; case VARYING_SLOT_BFC0: - /* If it's a varying, COLOR and BCOLOR alias. */ - if (is_varying) - return 34; - else - return 36; + return SI_UNIQUE_SLOT_BFC0; case VARYING_SLOT_BFC1: - if (is_varying) - return 35; - else - return 37; + return SI_UNIQUE_SLOT_BFC1; case VARYING_SLOT_TEX0: case VARYING_SLOT_TEX1: case VARYING_SLOT_TEX2: @@ -131,84 +98,78 @@ unsigned si_shader_io_get_unique_index(unsigned semantic, bool is_varying) case VARYING_SLOT_TEX5: case VARYING_SLOT_TEX6: case VARYING_SLOT_TEX7: - return 38 + (semantic - VARYING_SLOT_TEX0); + return SI_UNIQUE_SLOT_TEX0 + (semantic - VARYING_SLOT_TEX0); case VARYING_SLOT_CLIP_VERTEX: - return 46; + return SI_UNIQUE_SLOT_CLIP_VERTEX; - /* Varyings present in both GLES and desktop GL must start at 49 after 16-bit varyings. */ + /* Varyings present in both GLES and desktop GL. */ case VARYING_SLOT_CLIP_DIST0: - return 49; + return SI_UNIQUE_SLOT_CLIP_DIST0; case VARYING_SLOT_CLIP_DIST1: - return 50; + return SI_UNIQUE_SLOT_CLIP_DIST1; case VARYING_SLOT_PSIZ: - return 51; - - /* These can't be written by LS, HS, and ES. */ + return SI_UNIQUE_SLOT_PSIZ; case VARYING_SLOT_LAYER: - return 52; + return SI_UNIQUE_SLOT_LAYER; case VARYING_SLOT_VIEWPORT: - return 53; + return SI_UNIQUE_SLOT_VIEWPORT; case VARYING_SLOT_PRIMITIVE_ID: - return 54; + return SI_UNIQUE_SLOT_PRIMITIVE_ID; } } -static void si_dump_streamout(struct pipe_stream_output_info *so) +static void declare_streamout_params(struct si_shader_args *args, struct si_shader *shader) { - unsigned i; - - if (so->num_outputs) - fprintf(stderr, "STREAMOUT\n"); - - for (i = 0; i < so->num_outputs; i++) { - unsigned mask = ((1 << so->output[i].num_components) - 1) << so->output[i].start_component; - fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n", i, so->output[i].output_buffer, - so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1, - so->output[i].register_index, mask & 1 ? "x" : "", mask & 2 ? "y" : "", - mask & 4 ? "z" : "", mask & 8 ? "w" : ""); - } -} + struct si_shader_selector *sel = shader->selector; -static void declare_streamout_params(struct si_shader_context *ctx, - struct pipe_stream_output_info *so) -{ - if (ctx->screen->use_ngg_streamout) { - if (ctx->stage == MESA_SHADER_TESS_EVAL) - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + if (shader->selector->screen->info.gfx_level >= GFX11) { + /* NGG streamout. */ + if (sel->stage == MESA_SHADER_TESS_EVAL) + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); return; } /* Streamout SGPRs. */ - if (so->num_outputs) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.streamout_config); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.streamout_write_index); - } else if (ctx->stage == MESA_SHADER_TESS_EVAL) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - } + if (si_shader_uses_streamout(shader)) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_config); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_write_index); - /* A streamout buffer offset is loaded if the stride is non-zero. */ - for (int i = 0; i < 4; i++) { - if (!so->stride[i]) - continue; + /* A streamout buffer offset is loaded if the stride is non-zero. */ + for (int i = 0; i < 4; i++) { + if (!sel->info.base.xfb_stride[i]) + continue; - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.streamout_offset[i]); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_offset[i]); + } + } else if (sel->stage == MESA_SHADER_TESS_EVAL) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); } } unsigned si_get_max_workgroup_size(const struct si_shader *shader) { - switch (shader->selector->info.stage) { + gl_shader_stage stage = shader->is_gs_copy_shader ? + MESA_SHADER_VERTEX : shader->selector->stage; + + switch (stage) { case MESA_SHADER_VERTEX: case MESA_SHADER_TESS_EVAL: - return shader->key.as_ngg ? 128 : 0; + /* Use the largest workgroup size for streamout */ + if (shader->key.ge.as_ngg) + return si_shader_uses_streamout(shader) ? 256 : 128; + + /* As part of merged shader. */ + return shader->selector->screen->info.gfx_level >= GFX9 && + (shader->key.ge.as_ls || shader->key.ge.as_es) ? 128 : 0; case MESA_SHADER_TESS_CTRL: /* Return this so that LLVM doesn't remove s_barrier * instructions on chips where we use s_barrier. */ - return shader->selector->screen->info.chip_class >= GFX7 ? 128 : 0; + return shader->selector->screen->info.gfx_level >= GFX7 ? 128 : 0; case MESA_SHADER_GEOMETRY: - return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 0; + /* GS can always generate up to 256 vertices. */ + return shader->selector->screen->info.gfx_level >= GFX9 ? 256 : 0; case MESA_SHADER_COMPUTE: break; /* see below */ @@ -229,135 +190,132 @@ unsigned si_get_max_workgroup_size(const struct si_shader *shader) return max_work_group_size; } -static void declare_const_and_shader_buffers(struct si_shader_context *ctx, bool assign_params) +static void declare_const_and_shader_buffers(struct si_shader_args *args, + struct si_shader *shader, + bool assign_params) { enum ac_arg_type const_shader_buf_type; - if (ctx->shader->selector->info.base.num_ubos == 1 && - ctx->shader->selector->info.base.num_ssbos == 0) + if (shader->selector->info.base.num_ubos == 1 && + shader->selector->info.base.num_ssbos == 0) const_shader_buf_type = AC_ARG_CONST_FLOAT_PTR; else const_shader_buf_type = AC_ARG_CONST_DESC_PTR; ac_add_arg( - &ctx->args, AC_ARG_SGPR, 1, const_shader_buf_type, - assign_params ? &ctx->const_and_shader_buffers : &ctx->other_const_and_shader_buffers); + &args->ac, AC_ARG_SGPR, 1, const_shader_buf_type, + assign_params ? &args->const_and_shader_buffers : &args->other_const_and_shader_buffers); } -static void declare_samplers_and_images(struct si_shader_context *ctx, bool assign_params) +static void declare_samplers_and_images(struct si_shader_args *args, bool assign_params) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, - assign_params ? &ctx->samplers_and_images : &ctx->other_samplers_and_images); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, + assign_params ? &args->samplers_and_images : &args->other_samplers_and_images); } -static void declare_per_stage_desc_pointers(struct si_shader_context *ctx, bool assign_params) +static void declare_per_stage_desc_pointers(struct si_shader_args *args, + struct si_shader *shader, + bool assign_params) { - declare_const_and_shader_buffers(ctx, assign_params); - declare_samplers_and_images(ctx, assign_params); + declare_const_and_shader_buffers(args, shader, assign_params); + declare_samplers_and_images(args, assign_params); } -static void declare_global_desc_pointers(struct si_shader_context *ctx) +static void declare_global_desc_pointers(struct si_shader_args *args) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->internal_bindings); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, - &ctx->bindless_samplers_and_images); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->internal_bindings); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, + &args->bindless_samplers_and_images); } -static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx) +static void declare_vb_descriptor_input_sgprs(struct si_shader_args *args, + struct si_shader *shader) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits); - if (!ctx->shader->is_gs_copy_shader) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.base_vertex); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.draw_id); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.start_instance); - } -} + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->ac.vertex_buffers); -static void declare_vb_descriptor_input_sgprs(struct si_shader_context *ctx) -{ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->args.vertex_buffers); - - unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs; + unsigned num_vbos_in_user_sgprs = shader->selector->info.num_vbos_in_user_sgprs; if (num_vbos_in_user_sgprs) { - unsigned user_sgprs = ctx->args.num_sgprs_used; + unsigned user_sgprs = args->ac.num_sgprs_used; - if (si_is_merged_shader(ctx->shader)) + if (si_is_merged_shader(shader)) user_sgprs -= 8; assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST); /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */ for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++) - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ - assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(ctx->vb_descriptors)); + assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(args->vb_descriptors)); for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++) - ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->vb_descriptors[i]); + ac_add_arg(&args->ac, AC_ARG_SGPR, 4, AC_ARG_INT, &args->vb_descriptors[i]); } } -static void declare_vs_input_vgprs(struct si_shader_context *ctx, unsigned *num_prolog_vgprs) +static void declare_vs_input_vgprs(struct si_shader_args *args, struct si_shader *shader) { - struct si_shader *shader = ctx->shader; - - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vertex_id); - if (shader->key.as_ls) { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vs_rel_patch_id); - if (ctx->screen->info.chip_class >= GFX10) { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vertex_id); + if (shader->key.ge.as_ls) { + if (shader->selector->screen->info.gfx_level >= GFX11) { + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */ + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */ + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id); + } else if (shader->selector->screen->info.gfx_level >= GFX10) { + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vs_rel_patch_id); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */ + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id); } else { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */ + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vs_rel_patch_id); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */ } - } else if (ctx->screen->info.chip_class >= GFX10) { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, - &ctx->args.vs_prim_id); /* user vgpr or PrimID (legacy) */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id); + } else if (shader->selector->screen->info.gfx_level >= GFX10) { + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */ + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, + /* user vgpr or PrimID (legacy) */ + shader->key.ge.as_ngg ? NULL : &args->ac.vs_prim_id); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id); } else { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vs_prim_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */ - } - - if (!shader->is_gs_copy_shader) { - /* Vertex load indices. */ - if (shader->selector->info.num_inputs) { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vertex_index0); - for (unsigned i = 1; i < shader->selector->info.num_inputs; i++) - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); - } - *num_prolog_vgprs += shader->selector->info.num_inputs; + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vs_prim_id); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */ } } -static void declare_vs_blit_inputs(struct si_shader_context *ctx, unsigned vs_blit_property) +static void declare_vs_blit_inputs(struct si_shader *shader, struct si_shader_args *args) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_blit_inputs); /* i16 x1, y1 */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* i16 x1, y1 */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* depth */ - - if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */ - } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */ + bool has_attribute_ring_address = shader->selector->screen->info.gfx_level >= GFX11; + + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_blit_inputs); /* i16 x1, y1 */ + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* i16 x1, y1 */ + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* depth */ + + if (shader->selector->info.base.vs.blit_sgprs_amd == + SI_VS_BLIT_SGPRS_POS_COLOR + has_attribute_ring_address) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */ + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */ + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */ + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */ + if (has_attribute_ring_address) + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* attribute ring address */ + } else if (shader->selector->info.base.vs.blit_sgprs_amd == + SI_VS_BLIT_SGPRS_POS_TEXCOORD + has_attribute_ring_address) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */ + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */ + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */ + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */ + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */ + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */ + if (has_attribute_ring_address) + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* attribute ring address */ } } -static void declare_tes_input_vgprs(struct si_shader_context *ctx) +static void declare_tes_input_vgprs(struct si_shader_args *args) { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.tes_u); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.tes_v); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_rel_patch_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_patch_id); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.tes_u); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.tes_v); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tes_rel_patch_id); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tes_patch_id); } enum @@ -374,393 +332,442 @@ void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, u ac_add_arg(args, file, registers, type, arg); } -void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader) +void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args) { - struct si_shader *shader = ctx->shader; unsigned i, num_returns, num_return_sgprs; unsigned num_prolog_vgprs = 0; - unsigned stage = ctx->stage; + struct si_shader_selector *sel = shader->selector; + unsigned stage = shader->is_gs_copy_shader ? MESA_SHADER_VERTEX : sel->stage; + unsigned stage_case = stage; - memset(&ctx->args, 0, sizeof(ctx->args)); + memset(args, 0, sizeof(*args)); /* Set MERGED shaders. */ - if (ctx->screen->info.chip_class >= GFX9) { - if (shader->key.as_ls || stage == MESA_SHADER_TESS_CTRL) - stage = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */ - else if (shader->key.as_es || shader->key.as_ngg || stage == MESA_SHADER_GEOMETRY) - stage = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY; + if (sel->screen->info.gfx_level >= GFX9 && stage <= MESA_SHADER_GEOMETRY) { + if (shader->key.ge.as_ls || stage == MESA_SHADER_TESS_CTRL) + stage_case = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */ + else if (shader->key.ge.as_es || shader->key.ge.as_ngg || stage == MESA_SHADER_GEOMETRY) + stage_case = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY; } - switch (stage) { + switch (stage_case) { case MESA_SHADER_VERTEX: - declare_global_desc_pointers(ctx); + declare_global_desc_pointers(args); - if (shader->selector->info.base.vs.blit_sgprs_amd) { - declare_vs_blit_inputs(ctx, shader->selector->info.base.vs.blit_sgprs_amd); + if (sel->info.base.vs.blit_sgprs_amd) { + declare_vs_blit_inputs(shader, args); + } else { + declare_per_stage_desc_pointers(args, shader, true); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits); - /* VGPRs */ - declare_vs_input_vgprs(ctx, &num_prolog_vgprs); - break; + if (shader->is_gs_copy_shader) { + declare_streamout_params(args, shader); + } else { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance); + declare_vb_descriptor_input_sgprs(args, shader); + + if (shader->key.ge.as_es) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.es2gs_offset); + } else if (shader->key.ge.as_ls) { + /* no extra parameters */ + } else { + declare_streamout_params(args, shader); + } + } } - declare_per_stage_desc_pointers(ctx, true); - declare_vs_specific_input_sgprs(ctx); - if (!shader->is_gs_copy_shader) - declare_vb_descriptor_input_sgprs(ctx); - - if (shader->key.as_es) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.es2gs_offset); - } else if (shader->key.as_ls) { - /* no extra parameters */ - } else { - /* The locations of the other parameters are assigned dynamically. */ - declare_streamout_params(ctx, &shader->selector->so); - } + /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */ + if (sel->screen->use_aco && sel->screen->info.gfx_level < GFX11) + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset); /* VGPRs */ - declare_vs_input_vgprs(ctx, &num_prolog_vgprs); + declare_vs_input_vgprs(args, shader); + break; case MESA_SHADER_TESS_CTRL: /* GFX6-GFX8 */ - declare_global_desc_pointers(ctx); - declare_per_stage_desc_pointers(ctx, true); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tcs_factor_offset); + declare_global_desc_pointers(args); + declare_per_stage_desc_pointers(args, shader, true); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_factor_offset); + + /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */ + if (sel->screen->use_aco && sel->screen->info.gfx_level < GFX11) + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset); /* VGPRs */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_patch_id); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_rel_ids); - /* param_tcs_offchip_offset and param_tcs_factor_offset are - * placed after the user SGPRs. + /* For monolithic shaders, the TCS epilog code is generated by + * ac_nir_lower_hs_outputs_to_mem. */ - for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++) - ac_add_return(&ctx->args, AC_ARG_SGPR); - for (i = 0; i < 11; i++) - ac_add_return(&ctx->args, AC_ARG_VGPR); + if (!shader->is_monolithic) { + /* param_tcs_offchip_offset and param_tcs_factor_offset are + * placed after the user SGPRs. + */ + for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++) + ac_add_return(&args->ac, AC_ARG_SGPR); + for (i = 0; i < 11; i++) + ac_add_return(&args->ac, AC_ARG_VGPR); + } break; case SI_SHADER_MERGED_VERTEX_TESSCTRL: /* Merged stages have 8 system SGPRs at the beginning. */ - /* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */ - declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_TESS_CTRL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.merged_wave_info); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tcs_factor_offset); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.scratch_offset); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ - - declare_global_desc_pointers(ctx); - declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_VERTEX); - declare_vs_specific_input_sgprs(ctx); - - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout); - if (ctx->stage == MESA_SHADER_VERTEX) - declare_vb_descriptor_input_sgprs(ctx); + /* Gfx9-10: SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */ + /* Gfx11+: SPI_SHADER_PGM_LO/HI_HS */ + declare_per_stage_desc_pointers(args, shader, stage == MESA_SHADER_TESS_CTRL); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.merged_wave_info); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_factor_offset); + if (sel->screen->info.gfx_level >= GFX11) + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_wave_id); + else + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ + + declare_global_desc_pointers(args); + declare_per_stage_desc_pointers(args, shader, stage == MESA_SHADER_VERTEX); + + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr); /* VGPRs (first TCS, then VS) */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_patch_id); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_rel_ids); + + if (stage == MESA_SHADER_VERTEX) { + declare_vs_input_vgprs(args, shader); - if (ctx->stage == MESA_SHADER_VERTEX) { - declare_vs_input_vgprs(ctx, &num_prolog_vgprs); + /* Need to keep LS/HS arg index same for shared args when ACO, + * so this is not able to be before shared VGPRs. + */ + declare_vb_descriptor_input_sgprs(args, shader); /* LS return values are inputs to the TCS main shader part. */ - for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++) - ac_add_return(&ctx->args, AC_ARG_SGPR); - for (i = 0; i < 2; i++) - ac_add_return(&ctx->args, AC_ARG_VGPR); - - /* VS outputs passed via VGPRs to TCS. */ - if (shader->key.opt.same_patch_vertices) { - unsigned num_outputs = util_last_bit64(shader->selector->outputs_written); - for (i = 0; i < num_outputs * 4; i++) - ac_add_return(&ctx->args, AC_ARG_VGPR); + if (!shader->is_monolithic || shader->key.ge.opt.same_patch_vertices) { + for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++) + ac_add_return(&args->ac, AC_ARG_SGPR); + for (i = 0; i < 2; i++) + ac_add_return(&args->ac, AC_ARG_VGPR); + + /* VS outputs passed via VGPRs to TCS. */ + if (shader->key.ge.opt.same_patch_vertices && !sel->screen->use_aco) { + unsigned num_outputs = util_last_bit64(shader->selector->info.outputs_written_before_tes_gs); + for (i = 0; i < num_outputs * 4; i++) + ac_add_return(&args->ac, AC_ARG_VGPR); + } } } else { /* TCS inputs are passed via VGPRs from VS. */ - if (shader->key.opt.same_patch_vertices) { - unsigned num_inputs = util_last_bit64(shader->previous_stage_sel->outputs_written); + if (shader->key.ge.opt.same_patch_vertices && !sel->screen->use_aco) { + unsigned num_inputs = util_last_bit64(shader->previous_stage_sel->info.outputs_written_before_tes_gs); for (i = 0; i < num_inputs * 4; i++) - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL); } - /* TCS return values are inputs to the TCS epilog. - * - * param_tcs_offchip_offset, param_tcs_factor_offset, - * param_tcs_offchip_layout, and internal_bindings - * should be passed to the epilog. + /* For monolithic shaders, the TCS epilog code is generated by + * ac_nir_lower_hs_outputs_to_mem. */ - for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++) - ac_add_return(&ctx->args, AC_ARG_SGPR); - for (i = 0; i < 11; i++) - ac_add_return(&ctx->args, AC_ARG_VGPR); + if (!shader->is_monolithic) { + /* TCS return values are inputs to the TCS epilog. + * + * param_tcs_offchip_offset, param_tcs_factor_offset, + * param_tcs_offchip_layout, and internal_bindings + * should be passed to the epilog. + */ + for (i = 0; i <= 8 + GFX9_SGPR_TCS_OFFCHIP_ADDR; i++) + ac_add_return(&args->ac, AC_ARG_SGPR); + for (i = 0; i < 11; i++) + ac_add_return(&args->ac, AC_ARG_VGPR); + } } break; case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY: /* Merged stages have 8 system SGPRs at the beginning. */ - /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */ - declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_GEOMETRY); + /* Gfx9-10: SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */ + /* Gfx11+: SPI_SHADER_PGM_LO/HI_GS */ + declare_per_stage_desc_pointers(args, shader, stage == MESA_SHADER_GEOMETRY); + + if (shader->key.ge.as_ngg) + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_tg_info); + else + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs2vs_offset); - if (ctx->shader->key.as_ngg) - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs_tg_info); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.merged_wave_info); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset); + if (sel->screen->info.gfx_level >= GFX11) + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_attr_offset); else - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs2vs_offset); - - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.merged_wave_info); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.scratch_offset); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, - &ctx->small_prim_cull_info); /* SPI_SHADER_PGM_LO_GS << 8 */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */ - - declare_global_desc_pointers(ctx); - if (ctx->stage != MESA_SHADER_VERTEX || !shader->selector->info.base.vs.blit_sgprs_amd) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ + + declare_global_desc_pointers(args); + if (stage != MESA_SHADER_VERTEX || !sel->info.base.vs.blit_sgprs_amd) { declare_per_stage_desc_pointers( - ctx, (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL)); + args, shader, (stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL)); } - if (ctx->stage == MESA_SHADER_VERTEX) { - if (shader->selector->info.base.vs.blit_sgprs_amd) - declare_vs_blit_inputs(ctx, shader->selector->info.base.vs.blit_sgprs_amd); - else - declare_vs_specific_input_sgprs(ctx); + if (stage == MESA_SHADER_VERTEX && sel->info.base.vs.blit_sgprs_amd) { + declare_vs_blit_inputs(shader, args); } else { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits); - - if (ctx->stage == MESA_SHADER_TESS_EVAL) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits); + + if (stage == MESA_SHADER_VERTEX) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance); + } else if (stage == MESA_SHADER_TESS_EVAL) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ + } else { + /* GS */ + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ } - } - - if (ctx->stage == MESA_SHADER_VERTEX) - declare_vb_descriptor_input_sgprs(ctx); - /* VGPRs (first GS, then VS/TES) */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[0]); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[1]); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[2]); - - if (ctx->stage == MESA_SHADER_VERTEX) { - declare_vs_input_vgprs(ctx, &num_prolog_vgprs); - } else if (ctx->stage == MESA_SHADER_TESS_EVAL) { - declare_tes_input_vgprs(ctx); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->small_prim_cull_info); + if (sel->screen->info.gfx_level >= GFX11) + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->gs_attr_address); + else + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ } - if ((ctx->shader->key.as_es || ngg_cull_shader) && - (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL)) { - unsigned num_user_sgprs, num_vgprs; + /* VGPRs (first GS, then VS/TES) */ + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_prim_id); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_invocation_id); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[2]); - if (ctx->stage == MESA_SHADER_VERTEX && ngg_cull_shader) { - /* For the NGG cull shader, add 1 SGPR to hold - * the vertex buffer pointer. - */ - num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR + 1; + if (stage == MESA_SHADER_VERTEX) { + declare_vs_input_vgprs(args, shader); - if (shader->selector->num_vbos_in_user_sgprs) { - assert(num_user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST); - num_user_sgprs = - SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->num_vbos_in_user_sgprs * 4; - } - } else if (ctx->stage == MESA_SHADER_TESS_EVAL && ngg_cull_shader) { - num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR; - } else { - num_user_sgprs = SI_NUM_VS_STATE_RESOURCE_SGPRS; - } - - /* The NGG cull shader has to return all 9 VGPRs. - * - * The normal merged ESGS shader only has to return the 5 VGPRs - * for the GS stage. + /* Need to keep ES/GS arg index same for shared args when ACO, + * so this is not able to be before shared VGPRs. */ - num_vgprs = ngg_cull_shader ? 9 : 5; + if (!sel->info.base.vs.blit_sgprs_amd) + declare_vb_descriptor_input_sgprs(args, shader); + } else if (stage == MESA_SHADER_TESS_EVAL) { + declare_tes_input_vgprs(args); + } + if (shader->key.ge.as_es && !shader->is_monolithic && + (stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL)) { /* ES return values are inputs to GS. */ - for (i = 0; i < 8 + num_user_sgprs; i++) - ac_add_return(&ctx->args, AC_ARG_SGPR); - for (i = 0; i < num_vgprs; i++) - ac_add_return(&ctx->args, AC_ARG_VGPR); + for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++) + ac_add_return(&args->ac, AC_ARG_SGPR); + for (i = 0; i < 5; i++) + ac_add_return(&args->ac, AC_ARG_VGPR); } break; case MESA_SHADER_TESS_EVAL: - declare_global_desc_pointers(ctx); - declare_per_stage_desc_pointers(ctx, true); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr); - - if (shader->key.as_es) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.es2gs_offset); + declare_global_desc_pointers(args); + declare_per_stage_desc_pointers(args, shader, true); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr); + + if (shader->key.ge.as_es) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.es2gs_offset); } else { - declare_streamout_params(ctx, &shader->selector->so); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset); + declare_streamout_params(args, shader); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset); } + /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */ + if (sel->screen->use_aco && sel->screen->info.gfx_level < GFX11) + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset); + /* VGPRs */ - declare_tes_input_vgprs(ctx); + declare_tes_input_vgprs(args); break; case MESA_SHADER_GEOMETRY: - declare_global_desc_pointers(ctx); - declare_per_stage_desc_pointers(ctx, true); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs2vs_offset); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs_wave_id); + declare_global_desc_pointers(args); + declare_per_stage_desc_pointers(args, shader, true); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs2vs_offset); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_wave_id); + + /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */ + if (sel->screen->use_aco && sel->screen->info.gfx_level < GFX11) + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset); /* VGPRs */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[0]); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[1]); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[2]); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[3]); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[4]); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[5]); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_prim_id); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[2]); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[3]); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[4]); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[5]); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_invocation_id); break; case MESA_SHADER_FRAGMENT: - declare_global_desc_pointers(ctx); - declare_per_stage_desc_pointers(ctx, true); - si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL, SI_PARAM_ALPHA_REF); - si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.prim_mask, + declare_global_desc_pointers(args); + declare_per_stage_desc_pointers(args, shader, true); + si_add_arg_checked(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->alpha_reference, + SI_PARAM_ALPHA_REF); + si_add_arg_checked(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.prim_mask, SI_PARAM_PRIM_MASK); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_sample, + si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_sample, SI_PARAM_PERSP_SAMPLE); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_center, + si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_center, SI_PARAM_PERSP_CENTER); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_centroid, + si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_centroid, SI_PARAM_PERSP_CENTROID); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, NULL, SI_PARAM_PERSP_PULL_MODEL); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_sample, + si_add_arg_checked(&args->ac, AC_ARG_VGPR, 3, AC_ARG_INT, NULL, SI_PARAM_PERSP_PULL_MODEL); + si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_sample, SI_PARAM_LINEAR_SAMPLE); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_center, + si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_center, SI_PARAM_LINEAR_CENTER); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_centroid, + si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_centroid, SI_PARAM_LINEAR_CENTROID); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL, SI_PARAM_LINE_STIPPLE_TEX); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[0], + si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL, SI_PARAM_LINE_STIPPLE_TEX); + si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[0], SI_PARAM_POS_X_FLOAT); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[1], + si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[1], SI_PARAM_POS_Y_FLOAT); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[2], + si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[2], SI_PARAM_POS_Z_FLOAT); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[3], + si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[3], SI_PARAM_POS_W_FLOAT); - shader->info.face_vgpr_index = ctx->args.num_vgprs_used; - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.front_face, + si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.front_face, SI_PARAM_FRONT_FACE); - shader->info.ancillary_vgpr_index = ctx->args.num_vgprs_used; - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.ancillary, + si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.ancillary, SI_PARAM_ANCILLARY); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.sample_coverage, + si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.sample_coverage, SI_PARAM_SAMPLE_COVERAGE); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->pos_fixed_pt, + si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.pos_fixed_pt, SI_PARAM_POS_FIXED_PT); - /* Color inputs from the prolog. */ - if (shader->selector->info.colors_read) { - unsigned num_color_elements = util_bitcount(shader->selector->info.colors_read); - - for (i = 0; i < num_color_elements; i++) - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL); + if (sel->screen->use_aco) { + ac_compact_ps_vgpr_args(&args->ac, shader->config.spi_ps_input_addr); - num_prolog_vgprs += num_color_elements; + /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */ + if (sel->screen->info.gfx_level < GFX11) + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset); } - /* Outputs for the epilog. */ - num_return_sgprs = SI_SGPR_ALPHA_REF + 1; - num_returns = num_return_sgprs + util_bitcount(shader->selector->info.colors_written) * 4 + - shader->selector->info.writes_z + shader->selector->info.writes_stencil + - shader->selector->info.writes_samplemask + 1 /* SampleMaskIn */; + /* Monolithic PS emit prolog and epilog in NIR directly. */ + if (!shader->is_monolithic) { + /* Color inputs from the prolog. */ + if (shader->selector->info.colors_read) { + unsigned num_color_elements = util_bitcount(shader->selector->info.colors_read); - num_returns = MAX2(num_returns, num_return_sgprs + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); + for (i = 0; i < num_color_elements; i++) + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, i ? NULL : &args->color_start); - for (i = 0; i < num_return_sgprs; i++) - ac_add_return(&ctx->args, AC_ARG_SGPR); - for (; i < num_returns; i++) - ac_add_return(&ctx->args, AC_ARG_VGPR); + num_prolog_vgprs += num_color_elements; + } + + /* Outputs for the epilog. */ + num_return_sgprs = SI_SGPR_ALPHA_REF + 1; + num_returns = + num_return_sgprs + util_bitcount(shader->selector->info.colors_written) * 4 + + shader->selector->info.writes_z + shader->selector->info.writes_stencil + + shader->ps.writes_samplemask + 1 /* SampleMaskIn */; + + for (i = 0; i < num_return_sgprs; i++) + ac_add_return(&args->ac, AC_ARG_SGPR); + for (; i < num_returns; i++) + ac_add_return(&args->ac, AC_ARG_VGPR); + } break; case MESA_SHADER_COMPUTE: - declare_global_desc_pointers(ctx); - declare_per_stage_desc_pointers(ctx, true); + declare_global_desc_pointers(args); + declare_per_stage_desc_pointers(args, shader, true); if (shader->selector->info.uses_grid_size) - ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->args.num_work_groups); + ac_add_arg(&args->ac, AC_ARG_SGPR, 3, AC_ARG_INT, &args->ac.num_work_groups); if (shader->selector->info.uses_variable_block_size) - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->block_size); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->block_size); unsigned cs_user_data_dwords = shader->selector->info.base.cs.user_data_components_amd; if (cs_user_data_dwords) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, cs_user_data_dwords, AC_ARG_INT, &ctx->cs_user_data); + ac_add_arg(&args->ac, AC_ARG_SGPR, cs_user_data_dwords, AC_ARG_INT, &args->cs_user_data); } /* Some descriptors can be in user SGPRs. */ /* Shader buffers in user SGPRs. */ for (unsigned i = 0; i < shader->selector->cs_num_shaderbufs_in_user_sgprs; i++) { - while (ctx->args.num_sgprs_used % 4 != 0) - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + while (args->ac.num_sgprs_used % 4 != 0) + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->cs_shaderbuf[i]); + ac_add_arg(&args->ac, AC_ARG_SGPR, 4, AC_ARG_INT, &args->cs_shaderbuf[i]); } /* Images in user SGPRs. */ for (unsigned i = 0; i < shader->selector->cs_num_images_in_user_sgprs; i++) { - unsigned num_sgprs = shader->selector->info.base.image_buffers & (1 << i) ? 4 : 8; + unsigned num_sgprs = BITSET_TEST(shader->selector->info.base.image_buffers, i) ? 4 : 8; - while (ctx->args.num_sgprs_used % num_sgprs != 0) - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + while (args->ac.num_sgprs_used % num_sgprs != 0) + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, num_sgprs, AC_ARG_INT, &ctx->cs_image[i]); + ac_add_arg(&args->ac, AC_ARG_SGPR, num_sgprs, AC_ARG_INT, &args->cs_image[i]); } /* Hardware SGPRs. */ for (i = 0; i < 3; i++) { if (shader->selector->info.uses_block_id[i]) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.workgroup_ids[i]); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.workgroup_ids[i]); } } - if (shader->selector->info.uses_subgroup_info) - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tg_size); + if (shader->selector->info.uses_tg_size) + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tg_size); + + /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */ + if (sel->screen->use_aco && sel->screen->info.gfx_level < GFX11) + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset); /* Hardware VGPRs. */ - if (!ctx->screen->info.has_graphics && ctx->screen->info.family >= CHIP_ALDEBARAN) - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.local_invocation_ids); + /* Thread IDs are packed in VGPR0, 10 bits per component or stored in 3 separate VGPRs */ + if (sel->screen->info.gfx_level >= GFX11 || + (!sel->screen->info.has_graphics && sel->screen->info.family >= CHIP_MI200)) + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.local_invocation_ids); else - ac_add_arg(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, &ctx->args.local_invocation_ids); + ac_add_arg(&args->ac, AC_ARG_VGPR, 3, AC_ARG_INT, &args->ac.local_invocation_ids); break; default: assert(0 && "unimplemented shader"); return; } - shader->info.num_input_sgprs = ctx->args.num_sgprs_used; - shader->info.num_input_vgprs = ctx->args.num_vgprs_used; + shader->info.num_input_sgprs = args->ac.num_sgprs_used; + shader->info.num_input_vgprs = args->ac.num_vgprs_used; assert(shader->info.num_input_vgprs >= num_prolog_vgprs); shader->info.num_input_vgprs -= num_prolog_vgprs; } -/* For the UMR disassembler. */ -#define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */ -#define DEBUGGER_NUM_MARKERS 5 +static unsigned get_lds_granularity(struct si_screen *screen, gl_shader_stage stage) +{ + return screen->info.gfx_level >= GFX11 && stage == MESA_SHADER_FRAGMENT ? 1024 : + screen->info.gfx_level >= GFX7 ? 512 : 256; +} -static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader, - struct ac_rtld_binary *rtld) +bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader, + struct ac_rtld_binary *rtld) { const struct si_shader_selector *sel = shader->selector; const char *part_elfs[5]; @@ -769,14 +776,13 @@ static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *sh #define add_part(shader_or_part) \ if (shader_or_part) { \ - part_elfs[num_parts] = (shader_or_part)->binary.elf_buffer; \ - part_sizes[num_parts] = (shader_or_part)->binary.elf_size; \ + part_elfs[num_parts] = (shader_or_part)->binary.code_buffer; \ + part_sizes[num_parts] = (shader_or_part)->binary.code_size; \ num_parts++; \ } add_part(shader->prolog); add_part(shader->previous_stage); - add_part(shader->prolog2); add_part(shader); add_part(shader->epilog); @@ -785,15 +791,16 @@ static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *sh struct ac_rtld_symbol lds_symbols[2]; unsigned num_lds_symbols = 0; - if (sel && screen->info.chip_class >= GFX9 && !shader->is_gs_copy_shader && - (sel->info.stage == MESA_SHADER_GEOMETRY || shader->key.as_ngg)) { + if (sel && screen->info.gfx_level >= GFX9 && !shader->is_gs_copy_shader && + (sel->stage == MESA_SHADER_GEOMETRY || + (sel->stage <= MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg))) { struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++]; sym->name = "esgs_ring"; sym->size = shader->gs_info.esgs_ring_size * 4; sym->align = 64 * 1024; } - if (shader->key.as_ngg && sel->info.stage == MESA_SHADER_GEOMETRY) { + if (sel->stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) { struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++]; sym->name = "ngg_emit"; sym->size = shader->ngg.ngg_emit_size * 4; @@ -806,8 +813,8 @@ static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *sh { .halt_at_entry = screen->options.halt_shaders, }, - .shader_type = sel->info.stage, - .wave_size = si_get_shader_wave_size(shader), + .shader_type = sel->stage, + .wave_size = shader->wave_size, .num_parts = num_parts, .elf_ptrs = part_elfs, .elf_sizes = part_sizes, @@ -815,23 +822,54 @@ static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *sh .shared_lds_symbols = lds_symbols}); if (rtld->lds_size > 0) { - unsigned alloc_granularity = screen->info.chip_class >= GFX7 ? 512 : 256; - shader->config.lds_size = align(rtld->lds_size, alloc_granularity) / alloc_granularity; + unsigned alloc_granularity = get_lds_granularity(screen, sel->stage); + shader->config.lds_size = DIV_ROUND_UP(rtld->lds_size, alloc_granularity); } return ok; } +static unsigned get_shader_binaries(struct si_shader *shader, struct si_shader_binary *bin[4]) +{ + unsigned num_bin = 0; + + if (shader->prolog) + bin[num_bin++] = &shader->prolog->binary; + + if (shader->previous_stage) + bin[num_bin++] = &shader->previous_stage->binary; + + bin[num_bin++] = &shader->binary; + + if (shader->epilog) + bin[num_bin++] = &shader->epilog->binary; + + return num_bin; +} + static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader) { - struct ac_rtld_binary rtld; - si_shader_binary_open(screen, shader, &rtld); - uint64_t size = rtld.exec_size; - ac_rtld_close(&rtld); - return size; + if (shader->binary.type == SI_SHADER_BINARY_ELF) { + struct ac_rtld_binary rtld; + si_shader_binary_open(screen, shader, &rtld); + uint64_t size = rtld.exec_size; + ac_rtld_close(&rtld); + return size; + } else { + struct si_shader_binary *bin[4]; + unsigned num_bin = get_shader_binaries(shader, bin); + + unsigned size = 0; + for (unsigned i = 0; i < num_bin; i++) { + assert(bin[i]->type == SI_SHADER_BINARY_RAW); + size += bin[i]->exec_size; + } + return size; + } } -static bool si_get_external_symbol(void *data, const char *name, uint64_t *value) +bool si_get_external_symbol(enum amd_gfx_level gfx_level, void *data, const char *name, + uint64_t *value) { uint64_t *scratch_va = data; @@ -841,27 +879,115 @@ static bool si_get_external_symbol(void *data, const char *name, uint64_t *value } if (!strcmp(scratch_rsrc_dword1_symbol, name)) { /* Enable scratch coalescing. */ - *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32) | S_008F04_SWIZZLE_ENABLE(1); + *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32); + + if (gfx_level >= GFX11) + *value |= S_008F04_SWIZZLE_ENABLE_GFX11(1); + else + *value |= S_008F04_SWIZZLE_ENABLE_GFX6(1); return true; } return false; } -bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader, - uint64_t scratch_va) +static void *pre_upload_binary(struct si_screen *sscreen, struct si_shader *shader, + unsigned binary_size, bool dma_upload, + struct si_context **upload_ctx, + struct pipe_resource **staging, + unsigned *staging_offset) { - struct ac_rtld_binary binary; - if (!si_shader_binary_open(sscreen, shader, &binary)) - return false; + unsigned aligned_size = ac_align_shader_binary_for_prefetch(&sscreen->info, binary_size); si_resource_reference(&shader->bo, NULL); shader->bo = si_aligned_buffer_create( &sscreen->b, - (sscreen->info.cpdma_prefetch_writes_memory ? 0 : SI_RESOURCE_FLAG_READ_ONLY) | - SI_RESOURCE_FLAG_DRIVER_INTERNAL | SI_RESOURCE_FLAG_32BIT, - PIPE_USAGE_IMMUTABLE, align(binary.rx_size, SI_CPDMA_ALIGNMENT), 256); + SI_RESOURCE_FLAG_DRIVER_INTERNAL | SI_RESOURCE_FLAG_32BIT | + (dma_upload || sscreen->info.cpdma_prefetch_writes_memory ? 0 : SI_RESOURCE_FLAG_READ_ONLY) | + (dma_upload ? PIPE_RESOURCE_FLAG_UNMAPPABLE : 0), + PIPE_USAGE_IMMUTABLE, align(aligned_size, SI_CPDMA_ALIGNMENT), 256); if (!shader->bo) + return NULL; + + shader->gpu_address = shader->bo->gpu_address; + + if (dma_upload) { + /* First upload into a staging buffer. */ + *upload_ctx = si_get_aux_context(&sscreen->aux_context.shader_upload); + + void *ret; + u_upload_alloc((*upload_ctx)->b.stream_uploader, 0, binary_size, 256, + staging_offset, staging, &ret); + if (!ret) + si_put_aux_context_flush(&sscreen->aux_context.shader_upload); + + return ret; + } else { + return sscreen->ws->buffer_map(sscreen->ws, + shader->bo->buf, NULL, + PIPE_MAP_READ_WRITE | PIPE_MAP_UNSYNCHRONIZED | RADEON_MAP_TEMPORARY); + } +} + +static void post_upload_binary(struct si_screen *sscreen, struct si_shader *shader, + void *code, unsigned code_size, + unsigned binary_size, bool dma_upload, + struct si_context *upload_ctx, + struct pipe_resource *staging, + unsigned staging_offset) +{ + if (sscreen->debug_flags & DBG(SQTT)) { + /* Remember the uploaded code */ + shader->binary.uploaded_code_size = code_size; + shader->binary.uploaded_code = malloc(code_size); + memcpy(shader->binary.uploaded_code, code, code_size); + } + + if (dma_upload) { + /* Then copy from the staging buffer to VRAM. + * + * We can't use the upload copy in si_buffer_transfer_unmap because that might use + * a compute shader, and we can't use shaders in the code that is responsible for making + * them available. + */ + si_cp_dma_copy_buffer(upload_ctx, &shader->bo->b.b, staging, 0, staging_offset, + binary_size, SI_OP_SYNC_AFTER, SI_COHERENCY_SHADER, + sscreen->info.gfx_level >= GFX7 ? L2_LRU : L2_BYPASS); + upload_ctx->flags |= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_L2; + +#if 0 /* debug: validate whether the copy was successful */ + uint32_t *dst_binary = malloc(binary_size); + uint32_t *src_binary = (uint32_t*)code; + pipe_buffer_read(&upload_ctx->b, &shader->bo->b.b, 0, binary_size, dst_binary); + puts("dst_binary == src_binary:"); + for (unsigned i = 0; i < binary_size / 4; i++) { + printf(" %08x == %08x\n", dst_binary[i], src_binary[i]); + } + free(dst_binary); + exit(0); +#endif + + si_put_aux_context_flush(&sscreen->aux_context.shader_upload); + pipe_resource_reference(&staging, NULL); + } else { + sscreen->ws->buffer_unmap(sscreen->ws, shader->bo->buf); + } +} + +static bool upload_binary_elf(struct si_screen *sscreen, struct si_shader *shader, + uint64_t scratch_va, bool dma_upload) +{ + struct ac_rtld_binary binary; + if (!si_shader_binary_open(sscreen, shader, &binary)) + return false; + + struct si_context *upload_ctx = NULL; + struct pipe_resource *staging = NULL; + unsigned staging_offset = 0; + + void *rx_ptr = pre_upload_binary(sscreen, shader, binary.rx_size, dma_upload, + &upload_ctx, &staging, &staging_offset); + if (!rx_ptr) return false; /* Upload. */ @@ -870,60 +996,125 @@ bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader u.get_external_symbol = si_get_external_symbol; u.cb_data = &scratch_va; u.rx_va = shader->bo->gpu_address; - u.rx_ptr = sscreen->ws->buffer_map(sscreen->ws, - shader->bo->buf, NULL, - PIPE_MAP_READ_WRITE | PIPE_MAP_UNSYNCHRONIZED | RADEON_MAP_TEMPORARY); - if (!u.rx_ptr) - return false; + u.rx_ptr = rx_ptr; int size = ac_rtld_upload(&u); - if (sscreen->debug_flags & DBG(SQTT)) { - /* Remember the uploaded code */ - shader->binary.uploaded_code_size = size; - shader->binary.uploaded_code = malloc(size); - memcpy(shader->binary.uploaded_code, u.rx_ptr, size); - } + post_upload_binary(sscreen, shader, rx_ptr, size, binary.rx_size, dma_upload, + upload_ctx, staging, staging_offset); - sscreen->ws->buffer_unmap(sscreen->ws, shader->bo->buf); ac_rtld_close(&binary); return size >= 0; } -static void si_shader_dump_disassembly(struct si_screen *screen, - const struct si_shader_binary *binary, - gl_shader_stage stage, unsigned wave_size, - struct pipe_debug_callback *debug, const char *name, - FILE *file) +static void calculate_needed_lds_size(struct si_screen *sscreen, struct si_shader *shader) { - struct ac_rtld_binary rtld_binary; + gl_shader_stage stage = + shader->is_gs_copy_shader ? MESA_SHADER_VERTEX : shader->selector->stage; + + if (sscreen->info.gfx_level >= GFX9 && stage <= MESA_SHADER_GEOMETRY && + (stage == MESA_SHADER_GEOMETRY || shader->key.ge.as_ngg)) { + unsigned size_in_dw = shader->gs_info.esgs_ring_size; + + if (stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) + size_in_dw += shader->ngg.ngg_emit_size; + + if (shader->key.ge.as_ngg) { + unsigned scratch_dw_size = gfx10_ngg_get_scratch_dw_size(shader); + if (scratch_dw_size) { + /* scratch base address needs to be 8 byte aligned */ + size_in_dw = ALIGN(size_in_dw, 2); + size_in_dw += scratch_dw_size; + } + } - if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){ - .info = &screen->info, - .shader_type = stage, - .wave_size = wave_size, - .num_parts = 1, - .elf_ptrs = &binary->elf_buffer, - .elf_sizes = &binary->elf_size})) - return; + shader->config.lds_size = + DIV_ROUND_UP(size_in_dw * 4, get_lds_granularity(sscreen, stage)); + } +} - const char *disasm; - size_t nbytes; +static bool upload_binary_raw(struct si_screen *sscreen, struct si_shader *shader, + uint64_t scratch_va, bool dma_upload) +{ + struct si_shader_binary *bin[4]; + unsigned num_bin = get_shader_binaries(shader, bin); + + unsigned code_size = 0, exec_size = 0; + for (unsigned i = 0; i < num_bin; i++) { + assert(bin[i]->type == SI_SHADER_BINARY_RAW); + code_size += bin[i]->code_size; + exec_size += bin[i]->exec_size; + } - if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes)) - goto out; + struct si_context *upload_ctx = NULL; + struct pipe_resource *staging = NULL; + unsigned staging_offset = 0; - if (nbytes > INT_MAX) - goto out; + void *rx_ptr = pre_upload_binary(sscreen, shader, code_size, dma_upload, + &upload_ctx, &staging, &staging_offset); + if (!rx_ptr) + return false; + + unsigned exec_offset = 0, data_offset = exec_size; + for (unsigned i = 0; i < num_bin; i++) { + memcpy(rx_ptr + exec_offset, bin[i]->code_buffer, bin[i]->exec_size); + + if (bin[i]->num_symbols) { + /* Offset needed to add to const data symbol because of inserting other + * shader part between exec code and const data. + */ + unsigned const_offset = data_offset - exec_offset - bin[i]->exec_size; + + /* Prolog and epilog have no symbols. */ + struct si_shader *sh = bin[i] == &shader->binary ? shader : shader->previous_stage; + assert(sh && bin[i] == &sh->binary); + + si_aco_resolve_symbols(sh, rx_ptr + exec_offset, (const uint32_t *)bin[i]->code_buffer, + scratch_va, const_offset); + } + + exec_offset += bin[i]->exec_size; + + unsigned data_size = bin[i]->code_size - bin[i]->exec_size; + if (data_size) { + memcpy(rx_ptr + data_offset, bin[i]->code_buffer + bin[i]->exec_size, data_size); + data_offset += data_size; + } + } + post_upload_binary(sscreen, shader, rx_ptr, code_size, code_size, dma_upload, + upload_ctx, staging, staging_offset); + + calculate_needed_lds_size(sscreen, shader); + return true; +} + +bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader, + uint64_t scratch_va) +{ + bool dma_upload = !(sscreen->debug_flags & DBG(NO_DMA_SHADERS)) && + sscreen->info.has_dedicated_vram && !sscreen->info.all_vram_visible; + + if (shader->binary.type == SI_SHADER_BINARY_ELF) { + return upload_binary_elf(sscreen, shader, scratch_va, dma_upload); + } else { + assert(shader->binary.type == SI_SHADER_BINARY_RAW); + return upload_binary_raw(sscreen, shader, scratch_va, dma_upload); + } +} + +static void print_disassembly(const char *disasm, size_t nbytes, + const char *name, FILE *file, + struct util_debug_callback *debug) +{ if (debug && debug->debug_message) { /* Very long debug messages are cut off, so send the * disassembly one line at a time. This causes more * overhead, but on the plus side it simplifies * parsing of resulting logs. */ - pipe_debug_message(debug, SHADER_INFO, "Shader Disassembly Begin"); + util_debug_message(debug, SHADER_INFO, "Shader Disassembly Begin"); uint64_t line = 0; while (line < nbytes) { @@ -933,19 +1124,53 @@ static void si_shader_dump_disassembly(struct si_screen *screen, count = nl - (disasm + line); if (count) { - pipe_debug_message(debug, SHADER_INFO, "%.*s", count, disasm + line); + util_debug_message(debug, SHADER_INFO, "%.*s", count, disasm + line); } line += count + 1; } - pipe_debug_message(debug, SHADER_INFO, "Shader Disassembly End"); + util_debug_message(debug, SHADER_INFO, "Shader Disassembly End"); } if (file) { fprintf(file, "Shader %s disassembly:\n", name); fprintf(file, "%*s", (int)nbytes, disasm); } +} + +static void si_shader_dump_disassembly(struct si_screen *screen, + const struct si_shader_binary *binary, + gl_shader_stage stage, unsigned wave_size, + struct util_debug_callback *debug, const char *name, + FILE *file) +{ + if (binary->type == SI_SHADER_BINARY_RAW) { + print_disassembly(binary->disasm_string, binary->disasm_size, name, file, debug); + return; + } + + struct ac_rtld_binary rtld_binary; + + if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){ + .info = &screen->info, + .shader_type = stage, + .wave_size = wave_size, + .num_parts = 1, + .elf_ptrs = &binary->code_buffer, + .elf_sizes = &binary->code_size})) + return; + + const char *disasm; + size_t nbytes; + + if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes)) + goto out; + + if (nbytes > INT_MAX) + goto out; + + print_disassembly(disasm, nbytes, name, file, debug); out: ac_rtld_close(&rtld_binary); @@ -955,15 +1180,14 @@ static void si_calculate_max_simd_waves(struct si_shader *shader) { struct si_screen *sscreen = shader->selector->screen; struct ac_shader_config *conf = &shader->config; - unsigned num_inputs = shader->selector->info.num_inputs; - unsigned lds_increment = sscreen->info.chip_class >= GFX7 ? 512 : 256; + unsigned lds_increment = get_lds_granularity(sscreen, shader->selector->stage); unsigned lds_per_wave = 0; unsigned max_simd_waves; - max_simd_waves = sscreen->info.max_wave64_per_simd; + max_simd_waves = sscreen->info.max_waves_per_simd; /* Compute LDS usage for PS. */ - switch (shader->selector->info.stage) { + switch (shader->selector->stage) { case MESA_SHADER_FRAGMENT: /* The minimum usage per wave is (num_inputs * 48). The maximum * usage is (num_inputs * 48 * 16). @@ -975,12 +1199,13 @@ static void si_calculate_max_simd_waves(struct si_shader *shader) * Other stages don't know the size at compile time or don't * allocate LDS per wave, but instead they do it per thread group. */ - lds_per_wave = conf->lds_size * lds_increment + align(num_inputs * 48, lds_increment); + lds_per_wave = conf->lds_size * lds_increment + + align(shader->info.num_ps_inputs * 48, lds_increment); break; case MESA_SHADER_COMPUTE: { unsigned max_workgroup_size = si_get_max_workgroup_size(shader); lds_per_wave = (conf->lds_size * lds_increment) / - DIV_ROUND_UP(max_workgroup_size, sscreen->compute_wave_size); + DIV_ROUND_UP(max_workgroup_size, shader->wave_size); } break; default:; @@ -993,10 +1218,24 @@ static void si_calculate_max_simd_waves(struct si_shader *shader) } if (conf->num_vgprs) { + /* GFX 10.3 internally: + * - aligns VGPRS to 16 for Wave32 and 8 for Wave64 + * - aligns LDS to 1024 + * + * For shader-db stats, set num_vgprs that the hw actually uses. + */ + unsigned num_vgprs = conf->num_vgprs; + if (sscreen->info.gfx_level >= GFX10_3) { + unsigned real_vgpr_gran = sscreen->info.num_physical_wave64_vgprs_per_simd / 64; + num_vgprs = util_align_npot(num_vgprs, real_vgpr_gran * (shader->wave_size == 32 ? 2 : 1)); + } else { + num_vgprs = align(num_vgprs, shader->wave_size == 32 ? 8 : 4); + } + /* Always print wave limits as Wave64, so that we can compare * Wave32 and Wave64 with shader-db fairly. */ unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd; - max_simd_waves = MIN2(max_simd_waves, max_vgprs / conf->num_vgprs); + max_simd_waves = MIN2(max_simd_waves, max_vgprs / num_vgprs); } unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / 4; @@ -1007,21 +1246,84 @@ static void si_calculate_max_simd_waves(struct si_shader *shader) } void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader, - struct pipe_debug_callback *debug) + struct util_debug_callback *debug) { const struct ac_shader_config *conf = &shader->config; + static const char *stages[] = {"VS", "TCS", "TES", "GS", "PS", "CS"}; if (screen->options.debug_disassembly) - si_shader_dump_disassembly(screen, &shader->binary, shader->selector->info.stage, - si_get_shader_wave_size(shader), debug, "main", NULL); + si_shader_dump_disassembly(screen, &shader->binary, shader->selector->stage, + shader->wave_size, debug, "main", NULL); + + unsigned num_ls_outputs = 0; + unsigned num_hs_outputs = 0; + unsigned num_es_outputs = 0; + unsigned num_gs_outputs = 0; + unsigned num_vs_outputs = 0; + unsigned num_ps_outputs = 0; + + if (shader->selector->stage <= MESA_SHADER_GEOMETRY) { + /* This doesn't include pos exports because only param exports are interesting + * for performance and can be optimized. + */ + if (shader->key.ge.as_ls) + num_ls_outputs = shader->selector->info.lshs_vertex_stride / 16; + else if (shader->selector->stage == MESA_SHADER_TESS_CTRL) + num_hs_outputs = util_last_bit64(shader->selector->info.outputs_written_before_tes_gs); + else if (shader->key.ge.as_es) + num_es_outputs = shader->selector->info.esgs_vertex_stride / 16; + else if (shader->gs_copy_shader) + num_gs_outputs = shader->gs_copy_shader->info.nr_param_exports; + else if (shader->selector->stage == MESA_SHADER_GEOMETRY) + num_gs_outputs = shader->info.nr_param_exports; + else if (shader->selector->stage == MESA_SHADER_VERTEX || + shader->selector->stage == MESA_SHADER_TESS_EVAL) + num_vs_outputs = shader->info.nr_param_exports; + else + unreachable("invalid shader key"); + } else if (shader->selector->stage == MESA_SHADER_FRAGMENT) { + num_ps_outputs = util_bitcount(shader->selector->info.colors_written) + + (shader->selector->info.writes_z || + shader->selector->info.writes_stencil || + shader->ps.writes_samplemask); + } - pipe_debug_message(debug, SHADER_INFO, + util_debug_message(debug, SHADER_INFO, "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d " "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d " - "Spilled VGPRs: %d PrivMem VGPRs: %d", + "Spilled VGPRs: %d PrivMem VGPRs: %d LSOutputs: %u HSOutputs: %u " + "HSPatchOuts: %u ESOutputs: %u GSOutputs: %u VSOutputs: %u PSOutputs: %u " + "InlineUniforms: %u DivergentLoop: %u (%s, W%u)", conf->num_sgprs, conf->num_vgprs, si_get_shader_binary_size(screen, shader), conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves, - conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs); + conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs, + num_ls_outputs, num_hs_outputs, + util_last_bit64(shader->selector->info.patch_outputs_written), + num_es_outputs, num_gs_outputs, num_vs_outputs, num_ps_outputs, + shader->selector->info.base.num_inlinable_uniforms, + shader->selector->info.has_divergent_loop, + stages[shader->selector->stage], shader->wave_size); +} + +bool si_can_dump_shader(struct si_screen *sscreen, gl_shader_stage stage, + enum si_shader_dump_type dump_type) +{ + static uint64_t filter[] = { + [SI_DUMP_SHADER_KEY] = DBG(NIR) | DBG(INIT_LLVM) | DBG(LLVM) | DBG(INIT_ACO) | DBG(ACO) | DBG(ASM), + [SI_DUMP_INIT_NIR] = DBG(INIT_NIR), + [SI_DUMP_NIR] = DBG(NIR), + [SI_DUMP_INIT_LLVM_IR] = DBG(INIT_LLVM), + [SI_DUMP_LLVM_IR] = DBG(LLVM), + [SI_DUMP_INIT_ACO_IR] = DBG(INIT_ACO), + [SI_DUMP_ACO_IR] = DBG(ACO), + [SI_DUMP_ASM] = DBG(ASM), + [SI_DUMP_STATS] = DBG(STATS), + [SI_DUMP_ALWAYS] = DBG(VS) | DBG(TCS) | DBG(TES) | DBG(GS) | DBG(PS) | DBG(CS), + }; + assert(dump_type < ARRAY_SIZE(filter)); + + return sscreen->debug_flags & (1 << stage) && + sscreen->debug_flags & filter[dump_type]; } static void si_shader_dump_stats(struct si_screen *sscreen, struct si_shader *shader, FILE *file, @@ -1029,51 +1331,50 @@ static void si_shader_dump_stats(struct si_screen *sscreen, struct si_shader *sh { const struct ac_shader_config *conf = &shader->config; - if (!check_debug_option || si_can_dump_shader(sscreen, shader->selector->info.stage)) { - if (shader->selector->info.stage == MESA_SHADER_FRAGMENT) { - fprintf(file, - "*** SHADER CONFIG ***\n" - "SPI_PS_INPUT_ADDR = 0x%04x\n" - "SPI_PS_INPUT_ENA = 0x%04x\n", - conf->spi_ps_input_addr, conf->spi_ps_input_ena); - } - + if (shader->selector->stage == MESA_SHADER_FRAGMENT) { fprintf(file, - "*** SHADER STATS ***\n" - "SGPRS: %d\n" - "VGPRS: %d\n" - "Spilled SGPRs: %d\n" - "Spilled VGPRs: %d\n" - "Private memory VGPRs: %d\n" - "Code Size: %d bytes\n" - "LDS: %d blocks\n" - "Scratch: %d bytes per wave\n" - "Max Waves: %d\n" - "********************\n\n\n", - conf->num_sgprs, conf->num_vgprs, conf->spilled_sgprs, conf->spilled_vgprs, - shader->info.private_mem_vgprs, si_get_shader_binary_size(sscreen, shader), - conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves); + "*** SHADER CONFIG ***\n" + "SPI_PS_INPUT_ADDR = 0x%04x\n" + "SPI_PS_INPUT_ENA = 0x%04x\n", + conf->spi_ps_input_addr, conf->spi_ps_input_ena); } + + fprintf(file, + "*** SHADER STATS ***\n" + "SGPRS: %d\n" + "VGPRS: %d\n" + "Spilled SGPRs: %d\n" + "Spilled VGPRs: %d\n" + "Private memory VGPRs: %d\n" + "Code Size: %d bytes\n" + "LDS: %d bytes\n" + "Scratch: %d bytes per wave\n" + "Max Waves: %d\n" + "********************\n\n\n", + conf->num_sgprs, conf->num_vgprs, conf->spilled_sgprs, conf->spilled_vgprs, + shader->info.private_mem_vgprs, si_get_shader_binary_size(sscreen, shader), + conf->lds_size * get_lds_granularity(sscreen, shader->selector->stage), + conf->scratch_bytes_per_wave, shader->info.max_simd_waves); } const char *si_get_shader_name(const struct si_shader *shader) { - switch (shader->selector->info.stage) { + switch (shader->selector->stage) { case MESA_SHADER_VERTEX: - if (shader->key.as_es) + if (shader->key.ge.as_es) return "Vertex Shader as ES"; - else if (shader->key.as_ls) + else if (shader->key.ge.as_ls) return "Vertex Shader as LS"; - else if (shader->key.as_ngg) + else if (shader->key.ge.as_ngg) return "Vertex Shader as ESGS"; else return "Vertex Shader as VS"; case MESA_SHADER_TESS_CTRL: return "Tessellation Control Shader"; case MESA_SHADER_TESS_EVAL: - if (shader->key.as_es) + if (shader->key.ge.as_es) return "Tessellation Evaluation Shader as ES"; - else if (shader->key.as_ngg) + else if (shader->key.ge.as_ngg) return "Tessellation Evaluation Shader as ESGS"; else return "Tessellation Evaluation Shader as VS"; @@ -1092,14 +1393,15 @@ const char *si_get_shader_name(const struct si_shader *shader) } void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader, - struct pipe_debug_callback *debug, FILE *file, bool check_debug_option) + struct util_debug_callback *debug, FILE *file, bool check_debug_option) { - gl_shader_stage stage = shader->selector->info.stage; + gl_shader_stage stage = shader->selector->stage; - if (!check_debug_option || si_can_dump_shader(sscreen, stage)) + if (!check_debug_option || si_can_dump_shader(sscreen, stage, SI_DUMP_SHADER_KEY)) si_dump_shader_key(shader, file); if (!check_debug_option && shader->binary.llvm_ir_string) { + /* This is only used with ddebug. */ if (shader->previous_stage && shader->previous_stage->binary.llvm_ir_string) { fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", si_get_shader_name(shader)); fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string); @@ -1109,47 +1411,36 @@ void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader, fprintf(file, "%s\n", shader->binary.llvm_ir_string); } - if (!check_debug_option || - (si_can_dump_shader(sscreen, stage) && !(sscreen->debug_flags & DBG(NO_ASM)))) { - unsigned wave_size = si_get_shader_wave_size(shader); - + if (!check_debug_option || (si_can_dump_shader(sscreen, stage, SI_DUMP_ASM))) { fprintf(file, "\n%s:\n", si_get_shader_name(shader)); if (shader->prolog) - si_shader_dump_disassembly(sscreen, &shader->prolog->binary, stage, wave_size, debug, + si_shader_dump_disassembly(sscreen, &shader->prolog->binary, stage, shader->wave_size, debug, "prolog", file); if (shader->previous_stage) si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary, stage, - wave_size, debug, "previous stage", file); - if (shader->prolog2) - si_shader_dump_disassembly(sscreen, &shader->prolog2->binary, stage, wave_size, - debug, "prolog2", file); - - si_shader_dump_disassembly(sscreen, &shader->binary, stage, wave_size, debug, "main", + shader->wave_size, debug, "previous stage", file); + si_shader_dump_disassembly(sscreen, &shader->binary, stage, shader->wave_size, debug, "main", file); if (shader->epilog) - si_shader_dump_disassembly(sscreen, &shader->epilog->binary, stage, wave_size, debug, + si_shader_dump_disassembly(sscreen, &shader->epilog->binary, stage, shader->wave_size, debug, "epilog", file); fprintf(file, "\n"); - } - si_shader_dump_stats(sscreen, shader, file, check_debug_option); + si_shader_dump_stats(sscreen, shader, file, check_debug_option); + } } -static void si_dump_shader_key_vs(const struct si_shader_key *key, - const struct si_vs_prolog_bits *prolog, const char *prefix, - FILE *f) +static void si_dump_shader_key_vs(const union si_shader_key *key, FILE *f) { - fprintf(f, " %s.instance_divisor_is_one = %u\n", prefix, prolog->instance_divisor_is_one); - fprintf(f, " %s.instance_divisor_is_fetched = %u\n", prefix, - prolog->instance_divisor_is_fetched); - fprintf(f, " %s.ls_vgpr_fix = %u\n", prefix, prolog->ls_vgpr_fix); - - fprintf(f, " mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode); + fprintf(f, " mono.instance_divisor_is_one = %u\n", key->ge.mono.instance_divisor_is_one); + fprintf(f, " mono.instance_divisor_is_fetched = %u\n", + key->ge.mono.instance_divisor_is_fetched); + fprintf(f, " mono.vs.fetch_opencode = %x\n", key->ge.mono.vs_fetch_opencode); fprintf(f, " mono.vs.fix_fetch = {"); for (int i = 0; i < SI_MAX_ATTRIBS; i++) { - union si_vs_fix_fetch fix = key->mono.vs_fix_fetch[i]; + union si_vs_fix_fetch fix = key->ge.mono.vs_fix_fetch[i]; if (i) fprintf(f, ", "); if (!fix.bits) @@ -1163,86 +1454,90 @@ static void si_dump_shader_key_vs(const struct si_shader_key *key, static void si_dump_shader_key(const struct si_shader *shader, FILE *f) { - const struct si_shader_key *key = &shader->key; - gl_shader_stage stage = shader->selector->info.stage; + const union si_shader_key *key = &shader->key; + gl_shader_stage stage = shader->selector->stage; fprintf(f, "SHADER KEY\n"); + fprintf(f, " source_sha1 = {"); + _mesa_sha1_print(f, shader->selector->info.base.source_sha1); + fprintf(f, "}\n"); switch (stage) { case MESA_SHADER_VERTEX: - si_dump_shader_key_vs(key, &key->part.vs.prolog, "part.vs.prolog", f); - fprintf(f, " as_es = %u\n", key->as_es); - fprintf(f, " as_ls = %u\n", key->as_ls); - fprintf(f, " as_ngg = %u\n", key->as_ngg); - fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id); + si_dump_shader_key_vs(key, f); + fprintf(f, " as_es = %u\n", key->ge.as_es); + fprintf(f, " as_ls = %u\n", key->ge.as_ls); + fprintf(f, " as_ngg = %u\n", key->ge.as_ngg); + fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->ge.mono.u.vs_export_prim_id); break; case MESA_SHADER_TESS_CTRL: - if (shader->selector->screen->info.chip_class >= GFX9) { - si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog, "part.tcs.ls_prolog", f); - } - fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode); - fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%" PRIx64 "\n", - key->mono.u.ff_tcs_inputs_to_copy); - fprintf(f, " opt.prefer_mono = %u\n", key->opt.prefer_mono); - fprintf(f, " opt.same_patch_vertices = %u\n", key->opt.same_patch_vertices); + if (shader->selector->screen->info.gfx_level >= GFX9) + si_dump_shader_key_vs(key, f); + + fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->ge.part.tcs.epilog.prim_mode); + fprintf(f, " opt.prefer_mono = %u\n", key->ge.opt.prefer_mono); + fprintf(f, " opt.same_patch_vertices = %u\n", key->ge.opt.same_patch_vertices); break; case MESA_SHADER_TESS_EVAL: - fprintf(f, " as_es = %u\n", key->as_es); - fprintf(f, " as_ngg = %u\n", key->as_ngg); - fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id); + fprintf(f, " as_es = %u\n", key->ge.as_es); + fprintf(f, " as_ngg = %u\n", key->ge.as_ngg); + fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->ge.mono.u.vs_export_prim_id); break; case MESA_SHADER_GEOMETRY: if (shader->is_gs_copy_shader) break; - if (shader->selector->screen->info.chip_class >= GFX9 && - key->part.gs.es->info.stage == MESA_SHADER_VERTEX) { - si_dump_shader_key_vs(key, &key->part.gs.vs_prolog, "part.gs.vs_prolog", f); - } - fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", - key->part.gs.prolog.tri_strip_adj_fix); - fprintf(f, " as_ngg = %u\n", key->as_ngg); + if (shader->selector->screen->info.gfx_level >= GFX9 && + key->ge.part.gs.es->stage == MESA_SHADER_VERTEX) + si_dump_shader_key_vs(key, f); + + fprintf(f, " mono.u.gs_tri_strip_adj_fix = %u\n", key->ge.mono.u.gs_tri_strip_adj_fix); + fprintf(f, " as_ngg = %u\n", key->ge.as_ngg); break; case MESA_SHADER_COMPUTE: break; case MESA_SHADER_FRAGMENT: - fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side); - fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors); - fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple); - fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n", - key->part.ps.prolog.force_persp_sample_interp); - fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n", - key->part.ps.prolog.force_linear_sample_interp); - fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n", - key->part.ps.prolog.force_persp_center_interp); - fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n", - key->part.ps.prolog.force_linear_center_interp); - fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n", - key->part.ps.prolog.bc_optimize_for_persp); - fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n", - key->part.ps.prolog.bc_optimize_for_linear); - fprintf(f, " part.ps.prolog.samplemask_log_ps_iter = %u\n", - key->part.ps.prolog.samplemask_log_ps_iter); - fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n", - key->part.ps.epilog.spi_shader_col_format); - fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8); - fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10); - fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf); - fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func); - fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one); - fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n", - key->part.ps.epilog.poly_line_smoothing); - fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color); - fprintf(f, " mono.u.ps.interpolate_at_sample_force_center = %u\n", - key->mono.u.ps.interpolate_at_sample_force_center); - fprintf(f, " mono.u.ps.fbfetch_msaa = %u\n", key->mono.u.ps.fbfetch_msaa); - fprintf(f, " mono.u.ps.fbfetch_is_1D = %u\n", key->mono.u.ps.fbfetch_is_1D); - fprintf(f, " mono.u.ps.fbfetch_layered = %u\n", key->mono.u.ps.fbfetch_layered); + fprintf(f, " prolog.color_two_side = %u\n", key->ps.part.prolog.color_two_side); + fprintf(f, " prolog.flatshade_colors = %u\n", key->ps.part.prolog.flatshade_colors); + fprintf(f, " prolog.poly_stipple = %u\n", key->ps.part.prolog.poly_stipple); + fprintf(f, " prolog.force_persp_sample_interp = %u\n", + key->ps.part.prolog.force_persp_sample_interp); + fprintf(f, " prolog.force_linear_sample_interp = %u\n", + key->ps.part.prolog.force_linear_sample_interp); + fprintf(f, " prolog.force_persp_center_interp = %u\n", + key->ps.part.prolog.force_persp_center_interp); + fprintf(f, " prolog.force_linear_center_interp = %u\n", + key->ps.part.prolog.force_linear_center_interp); + fprintf(f, " prolog.bc_optimize_for_persp = %u\n", + key->ps.part.prolog.bc_optimize_for_persp); + fprintf(f, " prolog.bc_optimize_for_linear = %u\n", + key->ps.part.prolog.bc_optimize_for_linear); + fprintf(f, " prolog.samplemask_log_ps_iter = %u\n", + key->ps.part.prolog.samplemask_log_ps_iter); + fprintf(f, " epilog.spi_shader_col_format = 0x%x\n", + key->ps.part.epilog.spi_shader_col_format); + fprintf(f, " epilog.color_is_int8 = 0x%X\n", key->ps.part.epilog.color_is_int8); + fprintf(f, " epilog.color_is_int10 = 0x%X\n", key->ps.part.epilog.color_is_int10); + fprintf(f, " epilog.last_cbuf = %u\n", key->ps.part.epilog.last_cbuf); + fprintf(f, " epilog.alpha_func = %u\n", key->ps.part.epilog.alpha_func); + fprintf(f, " epilog.alpha_to_one = %u\n", key->ps.part.epilog.alpha_to_one); + fprintf(f, " epilog.alpha_to_coverage_via_mrtz = %u\n", key->ps.part.epilog.alpha_to_coverage_via_mrtz); + fprintf(f, " epilog.clamp_color = %u\n", key->ps.part.epilog.clamp_color); + fprintf(f, " epilog.dual_src_blend_swizzle = %u\n", key->ps.part.epilog.dual_src_blend_swizzle); + fprintf(f, " epilog.rbplus_depth_only_opt = %u\n", key->ps.part.epilog.rbplus_depth_only_opt); + fprintf(f, " epilog.kill_samplemask = %u\n", key->ps.part.epilog.kill_samplemask); + fprintf(f, " mono.poly_line_smoothing = %u\n", key->ps.mono.poly_line_smoothing); + fprintf(f, " mono.point_smoothing = %u\n", key->ps.mono.point_smoothing); + fprintf(f, " mono.interpolate_at_sample_force_center = %u\n", + key->ps.mono.interpolate_at_sample_force_center); + fprintf(f, " mono.fbfetch_msaa = %u\n", key->ps.mono.fbfetch_msaa); + fprintf(f, " mono.fbfetch_is_1D = %u\n", key->ps.mono.fbfetch_is_1D); + fprintf(f, " mono.fbfetch_layered = %u\n", key->ps.mono.fbfetch_layered); break; default: @@ -1251,116 +1546,718 @@ static void si_dump_shader_key(const struct si_shader *shader, FILE *f) if ((stage == MESA_SHADER_GEOMETRY || stage == MESA_SHADER_TESS_EVAL || stage == MESA_SHADER_VERTEX) && - !key->as_es && !key->as_ls) { - fprintf(f, " opt.kill_outputs = 0x%" PRIx64 "\n", key->opt.kill_outputs); - fprintf(f, " opt.kill_pointsize = 0x%x\n", key->opt.kill_pointsize); - fprintf(f, " opt.kill_clip_distances = 0x%x\n", key->opt.kill_clip_distances); - if (stage != MESA_SHADER_GEOMETRY) - fprintf(f, " opt.ngg_culling = 0x%x\n", key->opt.ngg_culling); + !key->ge.as_es && !key->ge.as_ls) { + fprintf(f, " opt.kill_outputs = 0x%" PRIx64 "\n", key->ge.opt.kill_outputs); + fprintf(f, " opt.kill_pointsize = 0x%x\n", key->ge.opt.kill_pointsize); + fprintf(f, " opt.kill_layer = 0x%x\n", key->ge.opt.kill_layer); + fprintf(f, " opt.kill_clip_distances = 0x%x\n", key->ge.opt.kill_clip_distances); + fprintf(f, " opt.ngg_culling = 0x%x\n", key->ge.opt.ngg_culling); + fprintf(f, " opt.remove_streamout = 0x%x\n", key->ge.opt.remove_streamout); } - fprintf(f, " opt.prefer_mono = %u\n", key->opt.prefer_mono); - fprintf(f, " opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n", - key->opt.inline_uniforms, - key->opt.inlined_uniform_values[0], - key->opt.inlined_uniform_values[1], - key->opt.inlined_uniform_values[2], - key->opt.inlined_uniform_values[3]); + if (stage <= MESA_SHADER_GEOMETRY) + fprintf(f, " opt.prefer_mono = %u\n", key->ge.opt.prefer_mono); + else + fprintf(f, " opt.prefer_mono = %u\n", key->ps.opt.prefer_mono); + + if (stage <= MESA_SHADER_GEOMETRY) { + if (key->ge.opt.inline_uniforms) { + fprintf(f, " opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n", + key->ge.opt.inline_uniforms, + key->ge.opt.inlined_uniform_values[0], + key->ge.opt.inlined_uniform_values[1], + key->ge.opt.inlined_uniform_values[2], + key->ge.opt.inlined_uniform_values[3]); + } else { + fprintf(f, " opt.inline_uniforms = 0\n"); + } + } else { + if (key->ps.opt.inline_uniforms) { + fprintf(f, " opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n", + key->ps.opt.inline_uniforms, + key->ps.opt.inlined_uniform_values[0], + key->ps.opt.inlined_uniform_values[1], + key->ps.opt.inlined_uniform_values[2], + key->ps.opt.inlined_uniform_values[3]); + } else { + fprintf(f, " opt.inline_uniforms = 0\n"); + } + } } -bool si_vs_needs_prolog(const struct si_shader_selector *sel, - const struct si_vs_prolog_bits *prolog_key, - const struct si_shader_key *key, bool ngg_cull_shader) +/* TODO: convert to nir_shader_instructions_pass */ +static bool si_nir_kill_outputs(nir_shader *nir, const union si_shader_key *key) { - /* VGPR initialization fixup for Vega10 and Raven is always done in the - * VS prolog. */ - return sel->vs_needs_prolog || prolog_key->ls_vgpr_fix || - /* The 2nd VS prolog loads input VGPRs from LDS */ - (key->opt.ngg_culling && !ngg_cull_shader) || - /* The 1st VS prolog generates input VGPRs for fast launch. */ - (ngg_cull_shader && key->opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL); + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + assert(impl); + assert(nir->info.stage <= MESA_SHADER_GEOMETRY); + + if (!key->ge.opt.kill_outputs && + !key->ge.opt.kill_pointsize && + !key->ge.opt.kill_layer && + !key->ge.opt.kill_clip_distances && + !(nir->info.outputs_written & BITFIELD64_BIT(VARYING_SLOT_LAYER))) { + nir_metadata_preserve(impl, nir_metadata_all); + return false; + } + + bool progress = false; + + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_store_output) + continue; + + /* No indirect indexing allowed. */ + ASSERTED nir_src offset = *nir_get_io_offset_src(intr); + assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0); + + assert(intr->num_components == 1); /* only scalar stores expected */ + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + + if (nir_slot_is_varying(sem.location) && + key->ge.opt.kill_outputs & + (1ull << si_shader_io_get_unique_index(sem.location))) + progress |= nir_remove_varying(intr, MESA_SHADER_FRAGMENT); + + switch (sem.location) { + case VARYING_SLOT_PSIZ: + if (key->ge.opt.kill_pointsize) + progress |= nir_remove_sysval_output(intr); + break; + + case VARYING_SLOT_CLIP_VERTEX: + /* TODO: We should only kill specific clip planes as required by kill_clip_distance, + * not whole gl_ClipVertex. Lower ClipVertex in NIR. + */ + if ((key->ge.opt.kill_clip_distances & SI_USER_CLIP_PLANE_MASK) == + SI_USER_CLIP_PLANE_MASK) + progress |= nir_remove_sysval_output(intr); + break; + + case VARYING_SLOT_CLIP_DIST0: + case VARYING_SLOT_CLIP_DIST1: + if (key->ge.opt.kill_clip_distances) { + assert(nir_intrinsic_src_type(intr) == nir_type_float32); + unsigned index = (sem.location - VARYING_SLOT_CLIP_DIST0) * 4 + + nir_intrinsic_component(intr); + + if (key->ge.opt.kill_clip_distances & BITFIELD_BIT(index)) + progress |= nir_remove_sysval_output(intr); + } + break; + + case VARYING_SLOT_LAYER: + /* LAYER is never passed to FS. Instead, we load it there as a system value. */ + progress |= nir_remove_varying(intr, MESA_SHADER_FRAGMENT); + + if (key->ge.opt.kill_layer) + progress |= nir_remove_sysval_output(intr); + break; + } + } + } + + if (progress) { + nir_metadata_preserve(impl, nir_metadata_dominance | + nir_metadata_block_index); + } else { + nir_metadata_preserve(impl, nir_metadata_all); + } + + return progress; } -/** - * Compute the VS prolog key, which contains all the information needed to - * build the VS prolog function, and set shader->info bits where needed. - * - * \param info Shader info of the vertex shader. - * \param num_input_sgprs Number of input SGPRs for the vertex shader. - * \param has_old_ Whether the preceding shader part is the NGG cull shader. - * \param prolog_key Key of the VS prolog - * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS. - * \param key Output shader part key. - */ -void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_sgprs, - bool ngg_cull_shader, const struct si_vs_prolog_bits *prolog_key, - struct si_shader *shader_out, union si_shader_part_key *key) +/* Remove PS output components from NIR if they are disabled by spi_shader_col_format. */ +static bool kill_ps_outputs_cb(struct nir_builder *b, nir_instr *instr, void *_key) { - memset(key, 0, sizeof(*key)); - key->vs_prolog.states = *prolog_key; - key->vs_prolog.num_input_sgprs = num_input_sgprs; - key->vs_prolog.num_inputs = info->num_inputs; - key->vs_prolog.as_ls = shader_out->key.as_ls; - key->vs_prolog.as_es = shader_out->key.as_es; - key->vs_prolog.as_ngg = shader_out->key.as_ngg; - - if (ngg_cull_shader) { - key->vs_prolog.gs_fast_launch_tri_list = - !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST); - key->vs_prolog.gs_fast_launch_tri_strip = - !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP); - key->vs_prolog.gs_fast_launch_index_size_packed = - SI_GET_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(shader_out->key.opt.ngg_culling); - } else if (shader_out->key.opt.ngg_culling) { - key->vs_prolog.load_vgprs_after_culling = 1; - } - - if (shader_out->selector->info.stage == MESA_SHADER_TESS_CTRL) { - key->vs_prolog.as_ls = 1; - key->vs_prolog.num_merged_next_stage_vgprs = 2; - } else if (shader_out->selector->info.stage == MESA_SHADER_GEOMETRY) { - key->vs_prolog.as_es = 1; - key->vs_prolog.num_merged_next_stage_vgprs = 5; - } else if (shader_out->key.as_ngg) { - key->vs_prolog.num_merged_next_stage_vgprs = 5; - } - - /* Only one of these combinations can be set. as_ngg can be set with as_es. */ - assert(key->vs_prolog.as_ls + key->vs_prolog.as_ngg + - (key->vs_prolog.as_es && !key->vs_prolog.as_ngg) <= 1); - - /* Enable loading the InstanceID VGPR. */ - uint16_t input_mask = u_bit_consecutive(0, info->num_inputs); - - if ((key->vs_prolog.states.instance_divisor_is_one | - key->vs_prolog.states.instance_divisor_is_fetched) & - input_mask) - shader_out->info.uses_instanceid = true; -} - -struct nir_shader *si_get_nir_shader(struct si_shader_selector *sel, - const struct si_shader_key *key, - bool *free_nir) + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_store_output) + return false; + + /* No indirect indexing allowed. */ + ASSERTED nir_src offset = *nir_get_io_offset_src(intr); + assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0); + + unsigned location = nir_intrinsic_io_semantics(intr).location; + const union si_shader_key *key = _key; + + switch (location) { + case FRAG_RESULT_DEPTH: + case FRAG_RESULT_STENCIL: + return false; + + case FRAG_RESULT_SAMPLE_MASK: + if (key->ps.part.epilog.kill_samplemask) { + nir_instr_remove(instr); + return true; + } + return false; + } + + /* Color outputs. */ + unsigned comp_mask = BITFIELD_MASK(intr->num_components); + assert(nir_intrinsic_component(intr) == 0); + unsigned cb_shader_mask = ac_get_cb_shader_mask(key->ps.part.epilog.spi_shader_col_format); + + /* Preserve alpha if ALPHA_TESTING is enabled. */ + if (key->ps.part.epilog.alpha_func != PIPE_FUNC_ALWAYS || + key->ps.part.epilog.alpha_to_coverage_via_mrtz) + cb_shader_mask |= 1 << 3; + + /* If COLOR is broadcasted to multiple color buffers, combine their masks. */ + if (location == FRAG_RESULT_COLOR) { + for (unsigned i = 1; i <= key->ps.part.epilog.last_cbuf; i++) + cb_shader_mask |= (cb_shader_mask >> (i * 4)) & 0xf; + } + + unsigned index = location == FRAG_RESULT_COLOR ? 0 : location - FRAG_RESULT_DATA0; + unsigned output_mask = (cb_shader_mask >> (index * 4)) & 0xf; + + if ((output_mask & comp_mask) == comp_mask) + return false; + + if (!(output_mask & comp_mask)) { + nir_instr_remove(instr); + return true; + } + + /* Fill disabled components with undef. */ + b->cursor = nir_before_instr(instr); + nir_def *new_value = intr->src[0].ssa; + nir_def *undef = nir_undef(b, 1, new_value->bit_size); + + unsigned kill_mask = ~output_mask & comp_mask; + u_foreach_bit(i, kill_mask) { + new_value = nir_vector_insert_imm(b, new_value, undef, i); + } + + nir_src_rewrite(&intr->src[0], new_value); + return true; +} + +static bool si_nir_kill_ps_outputs(nir_shader *nir, const union si_shader_key *key) +{ + assert(nir->info.stage == MESA_SHADER_FRAGMENT); + return nir_shader_instructions_pass(nir, kill_ps_outputs_cb, + nir_metadata_dominance | + nir_metadata_block_index, (void*)key); +} + +static bool clamp_vertex_color_instr(nir_builder *b, + nir_intrinsic_instr *intrin, void *state) +{ + if (intrin->intrinsic != nir_intrinsic_store_output) + return false; + + unsigned location = nir_intrinsic_io_semantics(intrin).location; + if (location != VARYING_SLOT_COL0 && location != VARYING_SLOT_COL1 && + location != VARYING_SLOT_BFC0 && location != VARYING_SLOT_BFC1) + return false; + + /* no indirect output */ + assert(nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1])); + /* only scalar output */ + assert(intrin->src[0].ssa->num_components == 1); + + b->cursor = nir_before_instr(&intrin->instr); + + nir_def *color = intrin->src[0].ssa; + nir_def *clamp = nir_load_clamp_vertex_color_amd(b); + nir_def *new_color = nir_bcsel(b, clamp, nir_fsat(b, color), color); + nir_src_rewrite(&intrin->src[0], new_color); + + return true; +} + +static bool si_nir_clamp_vertex_color(nir_shader *nir) +{ + uint64_t mask = VARYING_BIT_COL0 | VARYING_BIT_COL1 | VARYING_BIT_BFC0 | VARYING_BIT_BFC1; + if (!(nir->info.outputs_written & mask)) + return false; + + return nir_shader_intrinsics_pass(nir, clamp_vertex_color_instr, + nir_metadata_dominance | nir_metadata_block_index, + NULL); +} + +static unsigned si_map_io_driver_location(unsigned semantic) +{ + if ((semantic >= VARYING_SLOT_PATCH0 && semantic < VARYING_SLOT_TESS_MAX) || + semantic == VARYING_SLOT_TESS_LEVEL_INNER || + semantic == VARYING_SLOT_TESS_LEVEL_OUTER) + return ac_shader_io_get_unique_index_patch(semantic); + + return si_shader_io_get_unique_index(semantic); +} + +static bool si_lower_io_to_mem(struct si_shader *shader, nir_shader *nir, + uint64_t tcs_vgpr_only_inputs) +{ + struct si_shader_selector *sel = shader->selector; + const union si_shader_key *key = &shader->key; + + if (nir->info.stage == MESA_SHADER_VERTEX) { + if (key->ge.as_ls) { + NIR_PASS_V(nir, ac_nir_lower_ls_outputs_to_mem, si_map_io_driver_location, + key->ge.opt.same_patch_vertices, tcs_vgpr_only_inputs); + return true; + } else if (key->ge.as_es) { + NIR_PASS_V(nir, ac_nir_lower_es_outputs_to_mem, si_map_io_driver_location, + sel->screen->info.gfx_level, sel->info.esgs_vertex_stride); + return true; + } + } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) { + NIR_PASS_V(nir, ac_nir_lower_hs_inputs_to_mem, si_map_io_driver_location, + key->ge.opt.same_patch_vertices); + + /* Used by hs_emit_write_tess_factors() when monolithic shader. */ + nir->info.tess._primitive_mode = key->ge.part.tcs.epilog.prim_mode; + + NIR_PASS_V(nir, ac_nir_lower_hs_outputs_to_mem, si_map_io_driver_location, + sel->screen->info.gfx_level, + ~0ULL, ~0ULL, /* no TES inputs filter */ + util_last_bit64(sel->info.outputs_written_before_tes_gs), + util_last_bit64(sel->info.patch_outputs_written), + shader->wave_size, + /* ALL TCS inputs are passed by register. */ + key->ge.opt.same_patch_vertices && + !(sel->info.base.inputs_read & ~sel->info.tcs_vgpr_only_inputs), + sel->info.tessfactors_are_def_in_all_invocs, + /* Emit tess factor writes in monolithic shaders that don't need an epilog. */ + shader->is_monolithic, + /* Only pass tess factors to epilog in registers when they are defined in all invocations. */ + !shader->is_monolithic && sel->info.tessfactors_are_def_in_all_invocs); + return true; + } else if (nir->info.stage == MESA_SHADER_TESS_EVAL) { + NIR_PASS_V(nir, ac_nir_lower_tes_inputs_to_mem, si_map_io_driver_location); + + if (key->ge.as_es) { + NIR_PASS_V(nir, ac_nir_lower_es_outputs_to_mem, si_map_io_driver_location, + sel->screen->info.gfx_level, sel->info.esgs_vertex_stride); + } + + return true; + } else if (nir->info.stage == MESA_SHADER_GEOMETRY) { + NIR_PASS_V(nir, ac_nir_lower_gs_inputs_to_mem, si_map_io_driver_location, + sel->screen->info.gfx_level, key->ge.mono.u.gs_tri_strip_adj_fix); + return true; + } + + return false; +} + +static void si_lower_ngg(struct si_shader *shader, nir_shader *nir) +{ + struct si_shader_selector *sel = shader->selector; + const union si_shader_key *key = &shader->key; + assert(key->ge.as_ngg); + + uint8_t clip_cull_dist_mask = + (sel->info.clipdist_mask & ~key->ge.opt.kill_clip_distances) | + sel->info.culldist_mask; + + ac_nir_lower_ngg_options options = { + .family = sel->screen->info.family, + .gfx_level = sel->screen->info.gfx_level, + .max_workgroup_size = si_get_max_workgroup_size(shader), + .wave_size = shader->wave_size, + .can_cull = !!key->ge.opt.ngg_culling, + .disable_streamout = !si_shader_uses_streamout(shader), + .vs_output_param_offset = shader->info.vs_output_param_offset, + .has_param_exports = shader->info.nr_param_exports, + .clip_cull_dist_mask = clip_cull_dist_mask, + .kill_pointsize = key->ge.opt.kill_pointsize, + .kill_layer = key->ge.opt.kill_layer, + .force_vrs = sel->screen->options.vrs2x2, + }; + + if (nir->info.stage == MESA_SHADER_VERTEX || + nir->info.stage == MESA_SHADER_TESS_EVAL) { + /* Per instance inputs, used to remove instance load after culling. */ + unsigned instance_rate_inputs = 0; + + if (nir->info.stage == MESA_SHADER_VERTEX) { + instance_rate_inputs = key->ge.mono.instance_divisor_is_one | + key->ge.mono.instance_divisor_is_fetched; + + /* Manually mark the instance ID used, so the shader can repack it. */ + if (instance_rate_inputs) + BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID); + } else { + /* Manually mark the primitive ID used, so the shader can repack it. */ + if (key->ge.mono.u.vs_export_prim_id) + BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID); + } + + unsigned clip_plane_enable = + SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(key->ge.opt.ngg_culling); + unsigned num_vertices = gfx10_ngg_get_vertices_per_prim(shader); + + options.num_vertices_per_primitive = num_vertices ? num_vertices : 3; + options.early_prim_export = gfx10_ngg_export_prim_early(shader); + options.passthrough = gfx10_is_ngg_passthrough(shader); + options.use_edgeflags = gfx10_edgeflags_have_effect(shader); + options.has_gen_prim_query = options.has_xfb_prim_query = + sel->screen->info.gfx_level >= GFX11 && !sel->info.base.vs.blit_sgprs_amd; + options.export_primitive_id = key->ge.mono.u.vs_export_prim_id; + options.instance_rate_inputs = instance_rate_inputs; + options.user_clip_plane_enable_mask = clip_plane_enable; + + NIR_PASS_V(nir, ac_nir_lower_ngg_nogs, &options); + } else { + assert(nir->info.stage == MESA_SHADER_GEOMETRY); + + options.gs_out_vtx_bytes = sel->info.gsvs_vertex_size; + options.has_gen_prim_query = options.has_xfb_prim_query = + sel->screen->info.gfx_level >= GFX11; + options.has_gs_invocations_query = sel->screen->info.gfx_level < GFX11; + options.has_gs_primitives_query = true; + + /* For monolithic ES/GS to add vscnt wait when GS export pos0. */ + if (key->ge.part.gs.es) + nir->info.writes_memory |= key->ge.part.gs.es->info.base.writes_memory; + + NIR_PASS_V(nir, ac_nir_lower_ngg_gs, &options); + } + + /* may generate some subgroup op like ballot */ + NIR_PASS_V(nir, nir_lower_subgroups, &si_nir_subgroups_options); + + /* may generate some vector output store */ + NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL); +} + +struct nir_shader *si_deserialize_shader(struct si_shader_selector *sel) +{ + struct pipe_screen *screen = &sel->screen->b; + const void *options = screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR, + pipe_shader_type_from_mesa(sel->stage)); + + struct blob_reader blob_reader; + blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size); + return nir_deserialize(NULL, options, &blob_reader); +} + +static void si_nir_assign_param_offsets(nir_shader *nir, struct si_shader *shader, + int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS]) +{ + struct si_shader_selector *sel = shader->selector; + struct si_shader_binary_info *info = &shader->info; + + uint64_t outputs_written = 0; + uint32_t outputs_written_16bit = 0; + + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + assert(impl); + + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_store_output) + continue; + + /* No indirect indexing allowed. */ + ASSERTED nir_src offset = *nir_get_io_offset_src(intr); + assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0); + + assert(intr->num_components == 1); /* only scalar stores expected */ + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + + if (sem.location >= VARYING_SLOT_VAR0_16BIT) + outputs_written_16bit |= BITFIELD_BIT(sem.location - VARYING_SLOT_VAR0_16BIT); + else + outputs_written |= BITFIELD64_BIT(sem.location); + + /* Assign the param index if it's unassigned. */ + if (nir_slot_is_varying(sem.location) && !sem.no_varying && + (sem.gs_streams & 0x3) == 0 && + info->vs_output_param_offset[sem.location] == AC_EXP_PARAM_DEFAULT_VAL_0000) { + /* The semantic and the base should be the same as in si_shader_info. */ + assert(sem.location == sel->info.output_semantic[nir_intrinsic_base(intr)]); + /* It must not be remapped (duplicated). */ + assert(slot_remap[sem.location] == -1); + + info->vs_output_param_offset[sem.location] = info->nr_param_exports++; + } + } + } + + /* Duplicated outputs are redirected here. */ + for (unsigned i = 0; i < NUM_TOTAL_VARYING_SLOTS; i++) { + if (slot_remap[i] >= 0) + info->vs_output_param_offset[i] = info->vs_output_param_offset[slot_remap[i]]; + } + + if (shader->key.ge.mono.u.vs_export_prim_id) { + info->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = info->nr_param_exports++; + } + + /* Update outputs written info, we may remove some outputs before. */ + nir->info.outputs_written = outputs_written; + nir->info.outputs_written_16bit = outputs_written_16bit; +} + +static void si_assign_param_offsets(nir_shader *nir, struct si_shader *shader) +{ + /* Initialize this first. */ + shader->info.nr_param_exports = 0; + + STATIC_ASSERT(sizeof(shader->info.vs_output_param_offset[0]) == 1); + memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000, + sizeof(shader->info.vs_output_param_offset)); + + /* A slot remapping table for duplicated outputs, so that 1 vertex shader output can be + * mapped to multiple fragment shader inputs. + */ + int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS]; + memset(slot_remap, -1, NUM_TOTAL_VARYING_SLOTS); + + /* This sets DEFAULT_VAL for constant outputs in vs_output_param_offset. */ + /* TODO: This doesn't affect GS. */ + NIR_PASS_V(nir, ac_nir_optimize_outputs, false, slot_remap, + shader->info.vs_output_param_offset); + + /* Assign the non-constant outputs. */ + /* TODO: Use this for the GS copy shader too. */ + si_nir_assign_param_offsets(nir, shader, slot_remap); +} + +static unsigned si_get_nr_pos_exports(const struct si_shader_selector *sel, + const union si_shader_key *key) +{ + const struct si_shader_info *info = &sel->info; + + /* Must have a position export. */ + unsigned nr_pos_exports = 1; + + if ((info->writes_psize && !key->ge.opt.kill_pointsize) || + (info->writes_edgeflag && !key->ge.as_ngg) || + (info->writes_layer && !key->ge.opt.kill_layer) || + info->writes_viewport_index || sel->screen->options.vrs2x2) { + nr_pos_exports++; + } + + unsigned clipdist_mask = + (info->clipdist_mask & ~key->ge.opt.kill_clip_distances) | info->culldist_mask; + + for (int i = 0; i < 2; i++) { + if (clipdist_mask & BITFIELD_RANGE(i * 4, 4)) + nr_pos_exports++; + } + + return nr_pos_exports; +} + +static bool lower_ps_load_color_intrinsic(nir_builder *b, nir_instr *instr, void *state) +{ + nir_def **colors = (nir_def **)state; + + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + if (intrin->intrinsic != nir_intrinsic_load_color0 && + intrin->intrinsic != nir_intrinsic_load_color1) + return false; + + unsigned index = intrin->intrinsic == nir_intrinsic_load_color0 ? 0 : 1; + assert(colors[index]); + + nir_def_rewrite_uses(&intrin->def, colors[index]); + + nir_instr_remove(&intrin->instr); + return true; +} + +static bool si_nir_lower_ps_color_input(nir_shader *nir, const union si_shader_key *key, + const struct si_shader_info *info) { + bool progress = false; + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + + nir_builder builder = nir_builder_at(nir_before_impl(impl)); + nir_builder *b = &builder; + + /* Build ready to be used colors at the beginning of the shader. */ + nir_def *colors[2] = {0}; + for (int i = 0; i < 2; i++) { + if (!(info->colors_read & (0xf << (i * 4)))) + continue; + + unsigned color_base = info->color_attr_index[i]; + /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1", + * otherwise it's at offset "num_inputs". + */ + unsigned back_color_base = info->num_inputs; + if (i == 1 && (info->colors_read & 0xf)) + back_color_base += 1; + + enum glsl_interp_mode interp_mode = info->color_interpolate[i]; + if (interp_mode == INTERP_MODE_COLOR) { + interp_mode = key->ps.part.prolog.flatshade_colors ? + INTERP_MODE_FLAT : INTERP_MODE_SMOOTH; + } + + nir_def *back_color = NULL; + if (interp_mode == INTERP_MODE_FLAT) { + colors[i] = nir_load_input(b, 4, 32, nir_imm_int(b, 0), + .base = color_base, + .io_semantics.location = VARYING_SLOT_COL0 + i, + .io_semantics.num_slots = 1); + + if (key->ps.part.prolog.color_two_side) { + back_color = nir_load_input(b, 4, 32, nir_imm_int(b, 0), + .base = back_color_base, + .io_semantics.location = VARYING_SLOT_BFC0 + i, + .io_semantics.num_slots = 1); + } + } else { + nir_intrinsic_op op = 0; + switch (info->color_interpolate_loc[i]) { + case TGSI_INTERPOLATE_LOC_CENTER: + op = nir_intrinsic_load_barycentric_pixel; + break; + case TGSI_INTERPOLATE_LOC_CENTROID: + op = nir_intrinsic_load_barycentric_centroid; + break; + case TGSI_INTERPOLATE_LOC_SAMPLE: + op = nir_intrinsic_load_barycentric_sample; + break; + default: + unreachable("invalid color interpolate location"); + break; + } + + nir_def *barycentric = nir_load_barycentric(b, op, interp_mode); + + colors[i] = + nir_load_interpolated_input(b, 4, 32, barycentric, nir_imm_int(b, 0), + .base = color_base, + .io_semantics.location = VARYING_SLOT_COL0 + i, + .io_semantics.num_slots = 1); + + if (key->ps.part.prolog.color_two_side) { + back_color = + nir_load_interpolated_input(b, 4, 32, barycentric, nir_imm_int(b, 0), + .base = back_color_base, + .io_semantics.location = VARYING_SLOT_BFC0 + i, + .io_semantics.num_slots = 1); + } + } + + if (back_color) { + nir_def *is_front_face = nir_load_front_face(b, 1); + colors[i] = nir_bcsel(b, is_front_face, colors[i], back_color); + } + + progress = true; + } + + /* lower nir_load_color0/1 to use the color value. */ + return nir_shader_instructions_pass(nir, lower_ps_load_color_intrinsic, + nir_metadata_block_index | nir_metadata_dominance, + colors) || progress; +} + +static void si_nir_emit_polygon_stipple(nir_shader *nir, struct si_shader_args *args) +{ + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + + nir_builder builder = nir_builder_at(nir_before_impl(impl)); + nir_builder *b = &builder; + + /* Load the buffer descriptor. */ + nir_def *desc = + si_nir_load_internal_binding(b, args, SI_PS_CONST_POLY_STIPPLE, 4); + + /* Use the fixed-point gl_FragCoord input. + * Since the stipple pattern is 32x32 and it repeats, just get 5 bits + * per coordinate to get the repeating effect. + */ + nir_def *pos_x = ac_nir_unpack_arg(b, &args->ac, args->ac.pos_fixed_pt, 0, 5); + nir_def *pos_y = ac_nir_unpack_arg(b, &args->ac, args->ac.pos_fixed_pt, 16, 5); + + nir_def *zero = nir_imm_int(b, 0); + /* The stipple pattern is 32x32, each row has 32 bits. */ + nir_def *offset = nir_ishl_imm(b, pos_y, 2); + nir_def *row = nir_load_buffer_amd(b, 1, 32, desc, offset, zero, zero); + nir_def *bit = nir_ubfe(b, row, pos_x, nir_imm_int(b, 1)); + + nir_def *pass = nir_i2b(b, bit); + nir_discard_if(b, nir_inot(b, pass)); +} + +bool si_should_clear_lds(struct si_screen *sscreen, const struct nir_shader *shader) +{ + return shader->info.stage == MESA_SHADER_COMPUTE && shader->info.shared_size > 0 && sscreen->options.clear_lds; +} + +struct nir_shader *si_get_nir_shader(struct si_shader *shader, + struct si_shader_args *args, + bool *free_nir, + uint64_t tcs_vgpr_only_inputs, + ac_nir_gs_output_info *output_info) +{ + struct si_shader_selector *sel = shader->selector; + const union si_shader_key *key = &shader->key; + nir_shader *nir; *free_nir = false; if (sel->nir) { nir = sel->nir; } else if (sel->nir_binary) { - struct pipe_screen *screen = &sel->screen->b; - const void *options = screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR, - pipe_shader_type_from_mesa(sel->info.stage)); - - struct blob_reader blob_reader; - blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size); + nir = si_deserialize_shader(sel); *free_nir = true; - nir = nir_deserialize(NULL, options, &blob_reader); } else { return NULL; } - if (key && key->opt.inline_uniforms) { + bool progress = false; + bool late_opts = false; + + const char *original_name = NULL; + if (unlikely(should_print_nir(nir))) { + /* Modify the shader's name so that each variant gets its own name. */ + original_name = ralloc_strdup(nir, nir->info.name); + ralloc_asprintf_append((char **)&nir->info.name, "-%08x", _mesa_hash_data(key, sizeof(*key))); + + /* Dummy pass to get the starting point. */ + printf("nir_dummy_pass\n"); + nir_print_shader(nir, stdout); + } + + /* Kill outputs according to the shader key. */ + if (sel->stage <= MESA_SHADER_GEOMETRY) + NIR_PASS(progress, nir, si_nir_kill_outputs, key); + + NIR_PASS(progress, nir, ac_nir_lower_tex, + &(ac_nir_lower_tex_options){ + .gfx_level = sel->screen->info.gfx_level, + .lower_array_layer_round_even = !sel->screen->info.conformant_trunc_coord, + }); + + if (nir->info.uses_resource_info_query) + NIR_PASS(progress, nir, ac_nir_lower_resinfo, sel->screen->info.gfx_level); + + bool inline_uniforms = false; + uint32_t *inlined_uniform_values; + si_get_inline_uniform_state((union si_shader_key*)key, sel->pipe_shader_type, + &inline_uniforms, &inlined_uniform_values); + + if (inline_uniforms) { assert(*free_nir); /* Most places use shader information from the default variant, not @@ -1372,24 +2269,22 @@ struct nir_shader *si_get_nir_shader(struct si_shader_selector *sel, * - Register usage and code size decrease (obvious) * - Eliminated PS system values are disabled by LLVM * (FragCoord, FrontFace, barycentrics) - * - VS/TES/GS outputs feeding PS are eliminated if outputs are undef. - * (thanks to an LLVM pass in Mesa - TODO: move it to NIR) - * The storage for eliminated outputs is also not allocated. + * - VS/TES/GS param exports are eliminated if they are undef. + * The param space for eliminated outputs is also not allocated. * - VS/TCS/TES/GS/PS input loads are eliminated (VS relies on DCE in LLVM) * - TCS output stores are eliminated + * - Eliminated PS inputs are removed from PS.NUM_INTERP. * * TODO: These are things the driver ignores in the final shader code * and relies on the default shader info. - * - Other system values are not eliminated - * - PS.NUM_INTERP = bitcount64(inputs_read), renumber inputs - * to remove holes + * - System values in VS, TCS, TES, GS are not eliminated * - uses_discard - if it changed to false * - writes_memory - if it changed to false * - VS->TCS, VS->GS, TES->GS output stores for the former stage are not * eliminated * - Eliminated VS/TCS/TES outputs are still allocated. (except when feeding PS) * GS outputs are eliminated except for the temporary LDS. - * Clip distances, gl_PointSize, and PS outputs are eliminated based + * Clip distances, gl_PointSize, gl_Layer and PS outputs are eliminated based * on current states, so we don't care about the shader code. * * TODO: Merged shaders don't inline uniforms for the first stage. @@ -1402,35 +2297,543 @@ struct nir_shader *si_get_nir_shader(struct si_shader_selector *sel, * TODO: The driver uses a linear search to find a shader variant. This * can be really slow if we get too many variants due to uniform inlining. */ - NIR_PASS_V(nir, nir_inline_uniforms, - nir->info.num_inlinable_uniforms, - key->opt.inlined_uniform_values, - nir->info.inlinable_uniform_dw_offsets); + NIR_PASS_V(nir, nir_inline_uniforms, nir->info.num_inlinable_uniforms, + inlined_uniform_values, nir->info.inlinable_uniform_dw_offsets); + progress = true; + } + + if (sel->stage == MESA_SHADER_FRAGMENT) { + /* This uses the epilog key, so only monolithic shaders can call this. */ + if (shader->is_monolithic) + NIR_PASS(progress, nir, si_nir_kill_ps_outputs, key); + + if (key->ps.mono.poly_line_smoothing) + NIR_PASS(progress, nir, nir_lower_poly_line_smooth, SI_NUM_SMOOTH_AA_SAMPLES); + + if (key->ps.mono.point_smoothing) + NIR_PASS(progress, nir, nir_lower_point_smooth); + } + + /* This must be before si_nir_lower_resource. */ + if (!sel->screen->info.has_image_opcodes) + NIR_PASS(progress, nir, ac_nir_lower_image_opcodes); + + /* LLVM does not work well with this, so is handled in llvm backend waterfall. */ + if (sel->screen->use_aco && sel->info.has_non_uniform_tex_access) { + nir_lower_non_uniform_access_options options = { + .types = nir_lower_non_uniform_texture_access, + }; + NIR_PASS(progress, nir, nir_lower_non_uniform_access, &options); + } + + NIR_PASS(progress, nir, si_nir_lower_resource, shader, args); + bool is_last_vgt_stage = + (sel->stage == MESA_SHADER_VERTEX || + sel->stage == MESA_SHADER_TESS_EVAL || + (sel->stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg)) && + !shader->key.ge.as_ls && !shader->key.ge.as_es; + + /* Legacy GS is not last VGT stage because it has GS copy shader. */ + bool is_legacy_gs = sel->stage == MESA_SHADER_GEOMETRY && !key->ge.as_ngg; + + if (is_last_vgt_stage || is_legacy_gs) + NIR_PASS(progress, nir, si_nir_clamp_vertex_color); + + if (progress) { si_nir_opts(sel->screen, nir, true); + late_opts = true; + progress = false; + } + + /* Lower large variables that are always constant with load_constant intrinsics, which + * get turned into PC-relative loads from a data section next to the shader. + * + * Loop unrolling caused by uniform inlining can help eliminate indirect indexing, so + * this should be done after that. + * + * The pass crashes if there are dead temps of lowered IO interface types, so remove + * them first. + */ + NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL); + NIR_PASS(progress, nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16); + + /* Loop unrolling caused by uniform inlining can help eliminate indirect indexing, so + * this should be done after that. + */ + progress |= ac_nir_lower_indirect_derefs(nir, sel->screen->info.gfx_level); + + if (sel->stage == MESA_SHADER_VERTEX) + NIR_PASS(progress, nir, si_nir_lower_vs_inputs, shader, args); + + progress |= si_lower_io_to_mem(shader, nir, tcs_vgpr_only_inputs); + + if (is_last_vgt_stage) { + /* Assign param export indices. */ + si_assign_param_offsets(nir, shader); + + /* Assign num of position exports. */ + shader->info.nr_pos_exports = si_get_nr_pos_exports(sel, key); + + if (key->ge.as_ngg) { + /* Lower last VGT NGG shader stage. */ + si_lower_ngg(shader, nir); + } else if (sel->stage == MESA_SHADER_VERTEX || sel->stage == MESA_SHADER_TESS_EVAL) { + /* Lower last VGT none-NGG VS/TES shader stage. */ + unsigned clip_cull_mask = + (sel->info.clipdist_mask & ~key->ge.opt.kill_clip_distances) | + sel->info.culldist_mask; + + NIR_PASS_V(nir, ac_nir_lower_legacy_vs, + sel->screen->info.gfx_level, + clip_cull_mask, + shader->info.vs_output_param_offset, + shader->info.nr_param_exports, + shader->key.ge.mono.u.vs_export_prim_id, + !si_shader_uses_streamout(shader), + key->ge.opt.kill_pointsize, + key->ge.opt.kill_layer, + sel->screen->options.vrs2x2); + } + progress = true; + } else if (is_legacy_gs) { + NIR_PASS_V(nir, ac_nir_lower_legacy_gs, false, sel->screen->use_ngg, output_info); + progress = true; + } else if (sel->stage == MESA_SHADER_FRAGMENT && shader->is_monolithic) { + /* Uniform inlining can eliminate PS inputs, and colormask can remove PS outputs, + * which can also cause the elimination of PS inputs. Remove holes after removed PS inputs + * by renumbering them. This can only happen with monolithic PS. Colors are unaffected + * because they are still represented by nir_intrinsic_load_color0/1. + */ + NIR_PASS_V(nir, nir_recompute_io_bases, nir_var_shader_in); + + /* Two-side color selection and interpolation: Get the latest shader info because + * uniform inlining and colormask can fully eliminate color inputs. + */ + struct si_shader_info info; + si_nir_scan_shader(sel->screen, nir, &info); + + if (info.colors_read) + NIR_PASS(progress, nir, si_nir_lower_ps_color_input, &shader->key, &info); + + /* We need to set this early for lowering nir_intrinsic_load_point_coord_maybe_flipped, + * which can only occur with monolithic PS. + */ + shader->info.num_ps_inputs = info.num_inputs; + shader->info.ps_colors_read = info.colors_read; + + ac_nir_lower_ps_options options = { + .gfx_level = sel->screen->info.gfx_level, + .family = sel->screen->info.family, + .use_aco = sel->screen->use_aco, + .uses_discard = si_shader_uses_discard(shader), + .alpha_to_coverage_via_mrtz = key->ps.part.epilog.alpha_to_coverage_via_mrtz, + .dual_src_blend_swizzle = key->ps.part.epilog.dual_src_blend_swizzle, + .spi_shader_col_format = key->ps.part.epilog.spi_shader_col_format, + .color_is_int8 = key->ps.part.epilog.color_is_int8, + .color_is_int10 = key->ps.part.epilog.color_is_int10, + .clamp_color = key->ps.part.epilog.clamp_color, + .alpha_to_one = key->ps.part.epilog.alpha_to_one, + .alpha_func = key->ps.part.epilog.alpha_func, + .broadcast_last_cbuf = key->ps.part.epilog.last_cbuf, + .kill_samplemask = key->ps.part.epilog.kill_samplemask, + + .bc_optimize_for_persp = key->ps.part.prolog.bc_optimize_for_persp, + .bc_optimize_for_linear = key->ps.part.prolog.bc_optimize_for_linear, + .force_persp_sample_interp = key->ps.part.prolog.force_persp_sample_interp, + .force_linear_sample_interp = key->ps.part.prolog.force_linear_sample_interp, + .force_persp_center_interp = key->ps.part.prolog.force_persp_center_interp, + .force_linear_center_interp = key->ps.part.prolog.force_linear_center_interp, + .ps_iter_samples = 1 << key->ps.part.prolog.samplemask_log_ps_iter, + }; + + NIR_PASS_V(nir, ac_nir_lower_ps, &options); + + if (key->ps.part.prolog.poly_stipple) + NIR_PASS_V(nir, si_nir_emit_polygon_stipple, args); + + progress = true; + } + + NIR_PASS(progress, nir, nir_opt_idiv_const, 8); + NIR_PASS(progress, nir, nir_lower_idiv, + &(nir_lower_idiv_options){ + .allow_fp16 = sel->screen->info.gfx_level >= GFX9, + }); + + if (si_should_clear_lds(sel->screen, nir)) { + const unsigned chunk_size = 16; /* max single store size */ + const unsigned shared_size = ALIGN(nir->info.shared_size, chunk_size); + NIR_PASS_V(nir, nir_clear_shared_memory, shared_size, chunk_size); + } + + NIR_PASS(progress, nir, ac_nir_lower_intrinsics_to_args, sel->screen->info.gfx_level, + si_select_hw_stage(nir->info.stage, key, sel->screen->info.gfx_level), + &args->ac); + NIR_PASS(progress, nir, si_nir_lower_abi, shader, args); + + if (progress) { + si_nir_opts(sel->screen, nir, false); + progress = false; + late_opts = true; + } + + static const nir_opt_offsets_options offset_options = { + .uniform_max = 0, + .buffer_max = ~0, + .shared_max = ~0, + }; + NIR_PASS_V(nir, nir_opt_offsets, &offset_options); + + if (late_opts) si_nir_late_opts(nir); - /* This must be done again. */ - NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in | - nir_var_shader_out); + /* aco only accept scalar const, must be done after si_nir_late_opts() + * which may generate vec const. + */ + if (sel->screen->use_aco) + NIR_PASS_V(nir, nir_lower_load_const_to_scalar); + + /* This helps LLVM form VMEM clauses and thus get more GPU cache hits. + * 200 is tuned for Viewperf. It should be done last. + */ + NIR_PASS_V(nir, nir_group_loads, nir_group_same_resource_only, 200); + + if (unlikely(original_name)) { + ralloc_free((void*)nir->info.name); + nir->info.name = original_name; } return nir; } +void si_update_shader_binary_info(struct si_shader *shader, nir_shader *nir) +{ + struct si_shader_info info; + si_nir_scan_shader(shader->selector->screen, nir, &info); + + shader->info.uses_vmem_load_other |= info.uses_vmem_load_other; + shader->info.uses_vmem_sampler_or_bvh |= info.uses_vmem_sampler_or_bvh; + + if (nir->info.stage == MESA_SHADER_FRAGMENT) { + /* Since uniform inlining can remove PS inputs, set the latest info about PS inputs here. */ + shader->info.num_ps_inputs = info.num_inputs; + shader->info.ps_colors_read = info.colors_read; + + /* A non-monolithic PS doesn't know if back colors are enabled, so copy 2 more. */ + unsigned max_interp = MIN2(info.num_inputs + 2, SI_NUM_INTERP); + memcpy(shader->info.ps_inputs, info.input, max_interp * sizeof(info.input[0])); + } +} + +/* Generate code for the hardware VS shader stage to go with a geometry shader */ +static struct si_shader * +si_nir_generate_gs_copy_shader(struct si_screen *sscreen, + struct ac_llvm_compiler *compiler, + struct si_shader *gs_shader, + nir_shader *gs_nir, + struct util_debug_callback *debug, + ac_nir_gs_output_info *output_info) +{ + struct si_shader *shader; + struct si_shader_selector *gs_selector = gs_shader->selector; + struct si_shader_info *gsinfo = &gs_selector->info; + union si_shader_key *gskey = &gs_shader->key; + + shader = CALLOC_STRUCT(si_shader); + if (!shader) + return NULL; + + /* We can leave the fence as permanently signaled because the GS copy + * shader only becomes visible globally after it has been compiled. */ + util_queue_fence_init(&shader->ready); + + shader->selector = gs_selector; + shader->is_gs_copy_shader = true; + shader->wave_size = si_determine_wave_size(sscreen, shader); + + STATIC_ASSERT(sizeof(shader->info.vs_output_param_offset[0]) == 1); + memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000, + sizeof(shader->info.vs_output_param_offset)); + + for (unsigned i = 0; i < gsinfo->num_outputs; i++) { + unsigned semantic = gsinfo->output_semantic[i]; + + /* Skip if no channel writes to stream 0. */ + if (!nir_slot_is_varying(semantic) || + (gsinfo->output_streams[i] & 0x03 && + gsinfo->output_streams[i] & 0x0c && + gsinfo->output_streams[i] & 0x30 && + gsinfo->output_streams[i] & 0xc0)) + continue; + + shader->info.vs_output_param_offset[semantic] = shader->info.nr_param_exports++; + } + + shader->info.nr_pos_exports = si_get_nr_pos_exports(gs_selector, gskey); + + unsigned clip_cull_mask = + (gsinfo->clipdist_mask & ~gskey->ge.opt.kill_clip_distances) | gsinfo->culldist_mask; + + nir_shader *nir = + ac_nir_create_gs_copy_shader(gs_nir, + sscreen->info.gfx_level, + clip_cull_mask, + shader->info.vs_output_param_offset, + shader->info.nr_param_exports, + !si_shader_uses_streamout(gs_shader), + gskey->ge.opt.kill_pointsize, + gskey->ge.opt.kill_layer, + sscreen->options.vrs2x2, + output_info); + + struct si_shader_args args; + si_init_shader_args(shader, &args); + + NIR_PASS_V(nir, ac_nir_lower_intrinsics_to_args, sscreen->info.gfx_level, AC_HW_VERTEX_SHADER, &args.ac); + NIR_PASS_V(nir, si_nir_lower_abi, shader, &args); + + si_nir_opts(gs_selector->screen, nir, false); + + /* aco only accept scalar const */ + if (sscreen->use_aco) + NIR_PASS_V(nir, nir_lower_load_const_to_scalar); + + if (si_can_dump_shader(sscreen, MESA_SHADER_GEOMETRY, SI_DUMP_NIR)) { + fprintf(stderr, "GS Copy Shader:\n"); + nir_print_shader(nir, stderr); + } + + bool ok = +#if LLVM_AVAILABLE + !sscreen->use_aco ? si_llvm_compile_shader(sscreen, compiler, shader, &args, debug, nir) : +#endif + si_aco_compile_shader(shader, &args, nir, debug); + + + if (ok) { + assert(!shader->config.scratch_bytes_per_wave); + ok = si_shader_binary_upload(sscreen, shader, 0); + si_shader_dump(sscreen, shader, debug, stderr, true); + } + ralloc_free(nir); + + if (!ok) { + FREE(shader); + shader = NULL; + } else { + si_fix_resource_usage(sscreen, shader); + } + return shader; +} + +struct si_gs_output_info { + uint8_t streams[64]; + uint8_t streams_16bit_lo[16]; + uint8_t streams_16bit_hi[16]; + + uint8_t usage_mask[64]; + uint8_t usage_mask_16bit_lo[16]; + uint8_t usage_mask_16bit_hi[16]; + + ac_nir_gs_output_info info; +}; + +static void +si_init_gs_output_info(struct si_shader_info *info, struct si_gs_output_info *out_info) +{ + for (int i = 0; i < info->num_outputs; i++) { + unsigned slot = info->output_semantic[i]; + if (slot < VARYING_SLOT_VAR0_16BIT) { + out_info->streams[slot] = info->output_streams[i]; + out_info->usage_mask[slot] = info->output_usagemask[i]; + } else { + unsigned index = slot - VARYING_SLOT_VAR0_16BIT; + /* TODO: 16bit need separated fields for lo/hi part. */ + out_info->streams_16bit_lo[index] = info->output_streams[i]; + out_info->streams_16bit_hi[index] = info->output_streams[i]; + out_info->usage_mask_16bit_lo[index] = info->output_usagemask[i]; + out_info->usage_mask_16bit_hi[index] = info->output_usagemask[i]; + } + } + + ac_nir_gs_output_info *ac_info = &out_info->info; + + ac_info->streams = out_info->streams; + ac_info->streams_16bit_lo = out_info->streams_16bit_lo; + ac_info->streams_16bit_hi = out_info->streams_16bit_hi; + + ac_info->usage_mask = out_info->usage_mask; + ac_info->usage_mask_16bit_lo = out_info->usage_mask_16bit_lo; + ac_info->usage_mask_16bit_hi = out_info->usage_mask_16bit_hi; + + /* TODO: construct 16bit slot per component store type. */ + ac_info->types_16bit_lo = ac_info->types_16bit_hi = NULL; +} + +static void si_fixup_spi_ps_input_config(struct si_shader *shader) +{ + const union si_shader_key *key = &shader->key; + + /* Enable POS_FIXED_PT if polygon stippling is enabled. */ + if (key->ps.part.prolog.poly_stipple) + shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1); + + /* Set up the enable bits for per-sample shading if needed. */ + if (key->ps.part.prolog.force_persp_sample_interp && + (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) || + G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) { + shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA; + shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA; + shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1); + } + if (key->ps.part.prolog.force_linear_sample_interp && + (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) || + G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) { + shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA; + shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA; + shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1); + } + if (key->ps.part.prolog.force_persp_center_interp && + (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) || + G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) { + shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA; + shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA; + shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); + } + if (key->ps.part.prolog.force_linear_center_interp && + (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) || + G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) { + shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA; + shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA; + shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); + } + + /* POW_W_FLOAT requires that one of the perspective weights is enabled. */ + if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) && + !(shader->config.spi_ps_input_ena & 0xf)) { + shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); + } + + /* At least one pair of interpolation weights must be enabled. */ + if (!(shader->config.spi_ps_input_ena & 0x7f)) + shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); + + /* Samplemask fixup requires the sample ID. */ + if (key->ps.part.prolog.samplemask_log_ps_iter) + shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1); +} + +static void +si_set_spi_ps_input_config(struct si_shader *shader) +{ + const struct si_shader_selector *sel = shader->selector; + const struct si_shader_info *info = &sel->info; + const union si_shader_key *key = &shader->key; + + /* TODO: This should be determined from the final NIR instead of the input NIR, + * otherwise LLVM will have a performance advantage here because it determines + * VGPR inputs for each shader variant after LLVM optimizations. + */ + shader->config.spi_ps_input_ena = + S_0286CC_PERSP_CENTER_ENA(info->uses_persp_center) | + S_0286CC_PERSP_CENTROID_ENA(info->uses_persp_centroid) | + S_0286CC_PERSP_SAMPLE_ENA(info->uses_persp_sample) | + S_0286CC_LINEAR_CENTER_ENA(info->uses_linear_center) | + S_0286CC_LINEAR_CENTROID_ENA(info->uses_linear_centroid) | + S_0286CC_LINEAR_SAMPLE_ENA(info->uses_linear_sample) | + S_0286CC_FRONT_FACE_ENA(info->uses_frontface && !key->ps.opt.force_front_face_input) | + S_0286CC_SAMPLE_COVERAGE_ENA(info->reads_samplemask) | + S_0286CC_ANCILLARY_ENA(info->uses_sampleid || info->uses_layer_id); + + uint8_t mask = info->reads_frag_coord_mask | info->reads_sample_pos_mask; + u_foreach_bit(i, mask) { + shader->config.spi_ps_input_ena |= S_0286CC_POS_X_FLOAT_ENA(1) << i; + } + + if (key->ps.part.prolog.color_two_side) + shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1); + + /* INTERP_MODE_COLOR, same as SMOOTH if flat shading is disabled. */ + if (info->uses_interp_color && !key->ps.part.prolog.flatshade_colors) { + shader->config.spi_ps_input_ena |= + S_0286CC_PERSP_SAMPLE_ENA(info->uses_persp_sample_color) | + S_0286CC_PERSP_CENTER_ENA(info->uses_persp_center_color) | + S_0286CC_PERSP_CENTROID_ENA(info->uses_persp_centroid_color); + } + + /* nir_lower_poly_line_smooth use nir_load_sample_mask_in */ + if (key->ps.mono.poly_line_smoothing) + shader->config.spi_ps_input_ena |= S_0286CC_SAMPLE_COVERAGE_ENA(1); + + /* nir_lower_point_smooth use nir_load_point_coord_maybe_flipped which is lowered + * to nir_load_barycentric_pixel and nir_load_interpolated_input. + */ + if (key->ps.mono.point_smoothing) + shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); + + /* See fetch_framebuffer() for used args when fbfetch output. */ + if (info->base.fs.uses_fbfetch_output) { + shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1); + + if (key->ps.mono.fbfetch_layered || key->ps.mono.fbfetch_msaa) + shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1); + } + + if (shader->is_monolithic) { + si_fixup_spi_ps_input_config(shader); + shader->config.spi_ps_input_addr = shader->config.spi_ps_input_ena; + } else { + /* Part mode will call si_fixup_spi_ps_input_config() when combining multi + * shader part in si_shader_select_ps_parts(). + * + * Reserve register locations for VGPR inputs the PS prolog may need. + */ + shader->config.spi_ps_input_addr = + shader->config.spi_ps_input_ena | + SI_SPI_PS_INPUT_ADDR_FOR_PROLOG; + } +} + +static void +debug_message_stderr(void *data, unsigned *id, enum util_debug_type ptype, + const char *fmt, va_list args) +{ + vfprintf(stderr, fmt, args); + fprintf(stderr, "\n"); +} + bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, - struct si_shader *shader, struct pipe_debug_callback *debug) + struct si_shader *shader, struct util_debug_callback *debug) { + bool ret = true; struct si_shader_selector *sel = shader->selector; + + /* ACO need spi_ps_input in advance to init args and used in compiler. */ + if (sel->stage == MESA_SHADER_FRAGMENT && sscreen->use_aco) + si_set_spi_ps_input_config(shader); + + /* We need this info only when legacy GS. */ + struct si_gs_output_info legacy_gs_output_info; + if (sel->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) { + memset(&legacy_gs_output_info, 0, sizeof(legacy_gs_output_info)); + si_init_gs_output_info(&sel->info, &legacy_gs_output_info); + } + + struct si_shader_args args; + si_init_shader_args(shader, &args); + bool free_nir; - struct nir_shader *nir = si_get_nir_shader(sel, &shader->key, &free_nir); + struct nir_shader *nir = + si_get_nir_shader(shader, &args, &free_nir, 0, &legacy_gs_output_info.info); /* Dump NIR before doing NIR->LLVM conversion in case the * conversion fails. */ - if (si_can_dump_shader(sscreen, sel->info.stage) && - !(sscreen->debug_flags & DBG(NO_NIR))) { + if (si_can_dump_shader(sscreen, sel->stage, SI_DUMP_NIR)) { nir_print_shader(nir, stderr); - si_dump_streamout(&sel->so); + + if (nir->xfb_info) + nir_print_xfb_info(nir->xfb_info, stderr); } /* Initialize vs_output_ps_input_cntl to default. */ @@ -1438,32 +2841,80 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi shader->info.vs_output_ps_input_cntl[i] = SI_PS_INPUT_CNTL_UNUSED; shader->info.vs_output_ps_input_cntl[VARYING_SLOT_COL0] = SI_PS_INPUT_CNTL_UNUSED_COLOR0; - shader->info.uses_instanceid = sel->info.uses_instanceid; + si_update_shader_binary_info(shader, nir); + + /* uses_instanceid may be set by si_nir_lower_vs_inputs(). */ + shader->info.uses_instanceid |= sel->info.uses_instanceid; + shader->info.private_mem_vgprs = DIV_ROUND_UP(nir->scratch_size, 4); - /* TODO: ACO could compile non-monolithic shaders here (starting - * with PS and NGG VS), but monolithic shaders should be compiled - * by LLVM due to more complicated compilation. + /* Set the FP ALU behavior. */ + /* By default, we disable denormals for FP32 and enable them for FP16 and FP64 + * for performance and correctness reasons. FP32 denormals can't be enabled because + * they break output modifiers and v_mad_f32 and are very slow on GFX6-7. + * + * float_controls_execution_mode defines the set of valid behaviors. Contradicting flags + * can be set simultaneously, which means we are allowed to choose, but not really because + * some options cause GLCTS failures. */ - if (!si_llvm_compile_shader(sscreen, compiler, shader, debug, nir, free_nir)) - return false; + unsigned float_mode = V_00B028_FP_16_64_DENORMS; + + if (!(nir->info.float_controls_execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32) && + nir->info.float_controls_execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32) + float_mode |= V_00B028_FP_32_ROUND_TOWARDS_ZERO; + + if (!(nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | + FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64)) && + nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | + FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64)) + float_mode |= V_00B028_FP_16_64_ROUND_TOWARDS_ZERO; + + if (!(nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_DENORM_PRESERVE_FP16 | + FLOAT_CONTROLS_DENORM_PRESERVE_FP64)) && + nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 | + FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64)) + float_mode &= ~V_00B028_FP_16_64_DENORMS; + + ret = +#if LLVM_AVAILABLE + !sscreen->use_aco ? si_llvm_compile_shader(sscreen, compiler, shader, &args, debug, nir) : +#endif + si_aco_compile_shader(shader, &args, nir, debug); + + if (!ret) + goto out; + + shader->config.float_mode = float_mode; + + /* The GS copy shader is compiled next. */ + if (sel->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) { + shader->gs_copy_shader = + si_nir_generate_gs_copy_shader(sscreen, compiler, shader, nir, debug, + &legacy_gs_output_info.info); + if (!shader->gs_copy_shader) { + fprintf(stderr, "radeonsi: can't create GS copy shader\n"); + ret = false; + goto out; + } + } /* Compute vs_output_ps_input_cntl. */ - if ((sel->info.stage == MESA_SHADER_VERTEX || - sel->info.stage == MESA_SHADER_TESS_EVAL || - sel->info.stage == MESA_SHADER_GEOMETRY) && - !shader->key.as_ls && !shader->key.as_es) { - ubyte *vs_output_param_offset = shader->info.vs_output_param_offset; + if ((sel->stage == MESA_SHADER_VERTEX || + sel->stage == MESA_SHADER_TESS_EVAL || + sel->stage == MESA_SHADER_GEOMETRY) && + !shader->key.ge.as_ls && !shader->key.ge.as_es) { + uint8_t *vs_output_param_offset = shader->info.vs_output_param_offset; - if (sel->info.stage == MESA_SHADER_GEOMETRY && !shader->key.as_ngg) - vs_output_param_offset = sel->gs_copy_shader->info.vs_output_param_offset; + if (sel->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) + vs_output_param_offset = shader->gs_copy_shader->info.vs_output_param_offset; + /* We must use the original shader info before the removal of duplicated shader outputs. */ /* VS and TES should also set primitive ID output if it's used. */ unsigned num_outputs_with_prim_id = sel->info.num_outputs + - shader->key.mono.u.vs_export_prim_id; + shader->key.ge.mono.u.vs_export_prim_id; for (unsigned i = 0; i < num_outputs_with_prim_id; i++) { unsigned semantic = sel->info.output_semantic[i]; - unsigned offset = vs_output_param_offset[i]; + unsigned offset = vs_output_param_offset[semantic]; unsigned ps_input_cntl; if (offset <= AC_EXP_PARAM_OFFSET_31) { @@ -1485,15 +2936,14 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi } /* Validate SGPR and VGPR usage for compute to detect compiler bugs. */ - if (sel->info.stage == MESA_SHADER_COMPUTE) { - unsigned wave_size = sscreen->compute_wave_size; + if (sel->stage == MESA_SHADER_COMPUTE) { unsigned max_vgprs = - sscreen->info.num_physical_wave64_vgprs_per_simd * (wave_size == 32 ? 2 : 1); + sscreen->info.num_physical_wave64_vgprs_per_simd * (shader->wave_size == 32 ? 2 : 1); unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd; unsigned max_sgprs_per_wave = 128; unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */ unsigned threads_per_tg = si_get_max_workgroup_size(shader); - unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, wave_size); + unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, shader->wave_size); unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg); max_vgprs = max_vgprs / waves_per_simd; @@ -1514,39 +2964,64 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi } } - /* Add the scratch offset to input SGPRs. */ - if (shader->config.scratch_bytes_per_wave && !si_is_merged_shader(shader)) - shader->info.num_input_sgprs += 1; /* scratch byte offset */ + /* Add/remove the scratch offset to/from input SGPRs. */ + if (sel->screen->info.gfx_level < GFX11 && + (sel->screen->info.family < CHIP_GFX940 || sel->screen->info.has_graphics) && + !si_is_merged_shader(shader)) { + if (sscreen->use_aco) { + /* When aco scratch_offset arg is added explicitly at the beginning. + * After compile if no scratch used, reduce the input sgpr count. + */ + if (!shader->config.scratch_bytes_per_wave) + shader->info.num_input_sgprs--; + } else { + /* scratch_offset arg is added by llvm implicitly */ + if (shader->info.num_input_sgprs) + shader->info.num_input_sgprs++; + } + } /* Calculate the number of fragment input VGPRs. */ - if (sel->info.stage == MESA_SHADER_FRAGMENT) { + if (sel->stage == MESA_SHADER_FRAGMENT) { shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt( - &shader->config, &shader->info.face_vgpr_index, &shader->info.ancillary_vgpr_index); + &shader->config, &shader->info.num_fragcoord_components); } si_calculate_max_simd_waves(shader); - si_shader_dump_stats_for_shader_db(sscreen, shader, debug); - return true; + + if (si_can_dump_shader(sscreen, sel->stage, SI_DUMP_STATS)) { + struct util_debug_callback out_stderr = { + .debug_message = debug_message_stderr, + }; + + si_shader_dump_stats_for_shader_db(sscreen, shader, &out_stderr); + } else { + si_shader_dump_stats_for_shader_db(sscreen, shader, debug); + } + +out: + if (free_nir) + ralloc_free(nir); + + return ret; } /** * Create, compile and return a shader part (prolog or epilog). * - * \param sscreen screen - * \param list list of shader parts of the same category - * \param type shader type - * \param key shader part key - * \param prolog whether the part being requested is a prolog - * \param tm LLVM target machine - * \param debug debug callback - * \param build the callback responsible for building the main function - * \return non-NULL on success + * \param sscreen screen + * \param list list of shader parts of the same category + * \param type shader type + * \param key shader part key + * \param prolog whether the part being requested is a prolog + * \param tm LLVM target machine + * \param debug debug callback + * \return non-NULL on success */ static struct si_shader_part * si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list, gl_shader_stage stage, bool prolog, union si_shader_part_key *key, - struct ac_llvm_compiler *compiler, struct pipe_debug_callback *debug, - void (*build)(struct si_shader_context *, union si_shader_part_key *), + struct ac_llvm_compiler *compiler, struct util_debug_callback *debug, const char *name) { struct si_shader_part *result; @@ -1565,121 +3040,50 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list, result = CALLOC_STRUCT(si_shader_part); result->key = *key; - struct si_shader_selector sel = {}; - sel.screen = sscreen; + bool ok = +#if LLVM_AVAILABLE + !sscreen->use_aco ? si_llvm_build_shader_part(sscreen, stage, prolog, compiler, debug, name, result) : +#endif + si_aco_build_shader_part(sscreen, stage, prolog, debug, name, result); - struct si_shader shader = {}; - shader.selector = &sel; - - switch (stage) { - case MESA_SHADER_VERTEX: - shader.key.as_ls = key->vs_prolog.as_ls; - shader.key.as_es = key->vs_prolog.as_es; - shader.key.as_ngg = key->vs_prolog.as_ngg; - shader.key.opt.ngg_culling = - (key->vs_prolog.gs_fast_launch_tri_list ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST : 0) | - (key->vs_prolog.gs_fast_launch_tri_strip ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP : 0) | - SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(key->vs_prolog.gs_fast_launch_index_size_packed); - break; - case MESA_SHADER_TESS_CTRL: - assert(!prolog); - shader.key.part.tcs.epilog = key->tcs_epilog.states; - break; - case MESA_SHADER_GEOMETRY: - assert(prolog); - shader.key.as_ngg = key->gs_prolog.as_ngg; - break; - case MESA_SHADER_FRAGMENT: - if (prolog) - shader.key.part.ps.prolog = key->ps_prolog.states; - else - shader.key.part.ps.epilog = key->ps_epilog.states; - break; - default: - unreachable("bad shader part"); - } - - struct si_shader_context ctx; - si_llvm_context_init(&ctx, sscreen, compiler, - si_get_wave_size(sscreen, stage, - shader.key.as_ngg, shader.key.as_es, - shader.key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)); - ctx.shader = &shader; - ctx.stage = stage; - - build(&ctx, key); - - /* Compile. */ - si_llvm_optimize_module(&ctx); - - if (!si_compile_llvm(sscreen, &result->binary, &result->config, compiler, &ctx.ac, debug, - ctx.stage, name, false)) { + if (ok) { + result->next = *list; + *list = result; + } else { FREE(result); result = NULL; - goto out; } - result->next = *list; - *list = result; - -out: - si_llvm_dispose(&ctx); simple_mtx_unlock(&sscreen->shader_parts_mutex); return result; } -static bool si_get_vs_prolog(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, - struct si_shader *shader, struct pipe_debug_callback *debug, - struct si_shader *main_part, const struct si_vs_prolog_bits *key) +void si_get_tcs_epilog_key(struct si_shader *shader, union si_shader_part_key *key) { - struct si_shader_selector *vs = main_part->selector; - - if (!si_vs_needs_prolog(vs, key, &shader->key, false)) - return true; - - /* Get the prolog. */ - union si_shader_part_key prolog_key; - si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, false, key, shader, - &prolog_key); - - shader->prolog = - si_get_shader_part(sscreen, &sscreen->vs_prologs, MESA_SHADER_VERTEX, true, &prolog_key, - compiler, debug, si_llvm_build_vs_prolog, "Vertex Shader Prolog"); - return shader->prolog != NULL; -} + memset(key, 0, sizeof(*key)); + key->tcs_epilog.wave32 = shader->wave_size == 32; + key->tcs_epilog.states = shader->key.ge.part.tcs.epilog; -/** - * Select and compile (or reuse) vertex shader parts (prolog & epilog). - */ -static bool si_shader_select_vs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, - struct si_shader *shader, struct pipe_debug_callback *debug) -{ - return si_get_vs_prolog(sscreen, compiler, shader, debug, shader, &shader->key.part.vs.prolog); + /* If output patches are wholly in one wave, we don't need a barrier. */ + key->tcs_epilog.noop_s_barrier = + shader->wave_size % shader->selector->info.base.tess.tcs_vertices_out == 0; } /** * Select and compile (or reuse) TCS parts (epilog). */ static bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, - struct si_shader *shader, struct pipe_debug_callback *debug) + struct si_shader *shader, struct util_debug_callback *debug) { - if (sscreen->info.chip_class >= GFX9) { - struct si_shader *ls_main_part = shader->key.part.tcs.ls->main_shader_part_ls; - - if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part, - &shader->key.part.tcs.ls_prolog)) - return false; - - shader->previous_stage = ls_main_part; - } + if (sscreen->info.gfx_level >= GFX9) + shader->previous_stage = shader->key.ge.part.tcs.ls->main_shader_part_ls; /* Get the epilog. */ union si_shader_part_key epilog_key; - memset(&epilog_key, 0, sizeof(epilog_key)); - epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; + si_get_tcs_epilog_key(shader, &epilog_key); shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, MESA_SHADER_TESS_CTRL, false, - &epilog_key, compiler, debug, si_llvm_build_tcs_epilog, + &epilog_key, compiler, debug, "Tessellation Control Shader Epilog"); return shader->epilog != NULL; } @@ -1688,52 +3092,31 @@ static bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct ac_llvm * Select and compile (or reuse) GS parts (prolog). */ static bool si_shader_select_gs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, - struct si_shader *shader, struct pipe_debug_callback *debug) + struct si_shader *shader, struct util_debug_callback *debug) { - if (sscreen->info.chip_class >= GFX9) { - struct si_shader *es_main_part; - - if (shader->key.as_ngg) - es_main_part = shader->key.part.gs.es->main_shader_part_ngg_es; + if (sscreen->info.gfx_level >= GFX9) { + if (shader->key.ge.as_ngg) + shader->previous_stage = shader->key.ge.part.gs.es->main_shader_part_ngg_es; else - es_main_part = shader->key.part.gs.es->main_shader_part_es; - - if (shader->key.part.gs.es->info.stage == MESA_SHADER_VERTEX && - !si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part, - &shader->key.part.gs.vs_prolog)) - return false; - - shader->previous_stage = es_main_part; + shader->previous_stage = shader->key.ge.part.gs.es->main_shader_part_es; } - if (!shader->key.part.gs.prolog.tri_strip_adj_fix) - return true; - - union si_shader_part_key prolog_key; - memset(&prolog_key, 0, sizeof(prolog_key)); - prolog_key.gs_prolog.states = shader->key.part.gs.prolog; - prolog_key.gs_prolog.as_ngg = shader->key.as_ngg; - - shader->prolog2 = - si_get_shader_part(sscreen, &sscreen->gs_prologs, MESA_SHADER_GEOMETRY, true, &prolog_key, - compiler, debug, si_llvm_build_gs_prolog, "Geometry Shader Prolog"); - return shader->prolog2 != NULL; + return true; } /** * Compute the PS prolog key, which contains all the information needed to * build the PS prolog function, and set related bits in shader->config. */ -void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key, - bool separate_prolog) +void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key) { struct si_shader_info *info = &shader->selector->info; memset(key, 0, sizeof(*key)); - key->ps_prolog.states = shader->key.part.ps.prolog; - key->ps_prolog.colors_read = info->colors_read; + key->ps_prolog.states = shader->key.ps.part.prolog; + key->ps_prolog.wave32 = shader->wave_size == 32; + key->ps_prolog.colors_read = shader->info.ps_colors_read; key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs; - key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs; key->ps_prolog.wqm = info->base.fs.needs_quad_helper_invocations && (key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp || @@ -1741,29 +3124,30 @@ void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *ke key->ps_prolog.states.force_persp_center_interp || key->ps_prolog.states.force_linear_center_interp || key->ps_prolog.states.bc_optimize_for_persp || key->ps_prolog.states.bc_optimize_for_linear); - key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index; + key->ps_prolog.num_fragcoord_components = shader->info.num_fragcoord_components; + + if (shader->key.ps.part.prolog.poly_stipple) + shader->info.uses_vmem_load_other = true; - if (info->colors_read) { - ubyte *color = shader->selector->color_attr_index; + if (shader->info.ps_colors_read) { + uint8_t *color = shader->selector->info.color_attr_index; - if (shader->key.part.ps.prolog.color_two_side) { + if (shader->key.ps.part.prolog.color_two_side) { /* BCOLORs are stored after the last input. */ - key->ps_prolog.num_interp_inputs = info->num_inputs; - key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index; - if (separate_prolog) - shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1); + key->ps_prolog.num_interp_inputs = shader->info.num_ps_inputs; + shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1); } for (unsigned i = 0; i < 2; i++) { unsigned interp = info->color_interpolate[i]; unsigned location = info->color_interpolate_loc[i]; - if (!(info->colors_read & (0xf << i * 4))) + if (!(shader->info.ps_colors_read & (0xf << i * 4))) continue; key->ps_prolog.color_attr_index[i] = color[i]; - if (shader->key.part.ps.prolog.flatshade_colors && interp == INTERP_MODE_COLOR) + if (shader->key.ps.part.prolog.flatshade_colors && interp == INTERP_MODE_COLOR) interp = INTERP_MODE_FLAT; switch (interp) { @@ -1773,29 +3157,23 @@ void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *ke case INTERP_MODE_SMOOTH: case INTERP_MODE_COLOR: /* Force the interpolation location for colors here. */ - if (shader->key.part.ps.prolog.force_persp_sample_interp) + if (shader->key.ps.part.prolog.force_persp_sample_interp) location = TGSI_INTERPOLATE_LOC_SAMPLE; - if (shader->key.part.ps.prolog.force_persp_center_interp) + if (shader->key.ps.part.prolog.force_persp_center_interp) location = TGSI_INTERPOLATE_LOC_CENTER; switch (location) { case TGSI_INTERPOLATE_LOC_SAMPLE: key->ps_prolog.color_interp_vgpr_index[i] = 0; - if (separate_prolog) { - shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1); - } + shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1); break; case TGSI_INTERPOLATE_LOC_CENTER: key->ps_prolog.color_interp_vgpr_index[i] = 2; - if (separate_prolog) { - shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); - } + shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); break; case TGSI_INTERPOLATE_LOC_CENTROID: key->ps_prolog.color_interp_vgpr_index[i] = 4; - if (separate_prolog) { - shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTROID_ENA(1); - } + shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTROID_ENA(1); break; default: assert(0); @@ -1803,9 +3181,9 @@ void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *ke break; case INTERP_MODE_NOPERSPECTIVE: /* Force the interpolation location for colors here. */ - if (shader->key.part.ps.prolog.force_linear_sample_interp) + if (shader->key.ps.part.prolog.force_linear_sample_interp) location = TGSI_INTERPOLATE_LOC_SAMPLE; - if (shader->key.part.ps.prolog.force_linear_center_interp) + if (shader->key.ps.part.prolog.force_linear_center_interp) location = TGSI_INTERPOLATE_LOC_CENTER; /* The VGPR assignment for non-monolithic shaders @@ -1814,22 +3192,16 @@ void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *ke */ switch (location) { case TGSI_INTERPOLATE_LOC_SAMPLE: - key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 6 : 9; - if (separate_prolog) { - shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1); - } + key->ps_prolog.color_interp_vgpr_index[i] = 6; + shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1); break; case TGSI_INTERPOLATE_LOC_CENTER: - key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 8 : 11; - if (separate_prolog) { - shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); - } + key->ps_prolog.color_interp_vgpr_index[i] = 8; + shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); break; case TGSI_INTERPOLATE_LOC_CENTROID: - key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 10 : 13; - if (separate_prolog) { - shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTROID_ENA(1); - } + key->ps_prolog.color_interp_vgpr_index[i] = 10; + shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTROID_ENA(1); break; default: assert(0); @@ -1864,31 +3236,34 @@ void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *ke { struct si_shader_info *info = &shader->selector->info; memset(key, 0, sizeof(*key)); + key->ps_epilog.wave32 = shader->wave_size == 32; + key->ps_epilog.uses_discard = si_shader_uses_discard(shader); key->ps_epilog.colors_written = info->colors_written; key->ps_epilog.color_types = info->output_color_types; key->ps_epilog.writes_z = info->writes_z; key->ps_epilog.writes_stencil = info->writes_stencil; - key->ps_epilog.writes_samplemask = info->writes_samplemask; - key->ps_epilog.states = shader->key.part.ps.epilog; + key->ps_epilog.writes_samplemask = info->writes_samplemask && + !shader->key.ps.part.epilog.kill_samplemask; + key->ps_epilog.states = shader->key.ps.part.epilog; } /** * Select and compile (or reuse) pixel shader parts (prolog & epilog). */ static bool si_shader_select_ps_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, - struct si_shader *shader, struct pipe_debug_callback *debug) + struct si_shader *shader, struct util_debug_callback *debug) { union si_shader_part_key prolog_key; union si_shader_part_key epilog_key; /* Get the prolog. */ - si_get_ps_prolog_key(shader, &prolog_key, true); + si_get_ps_prolog_key(shader, &prolog_key); /* The prolog is a no-op if these aren't set. */ if (si_need_ps_prolog(&prolog_key)) { shader->prolog = si_get_shader_part(sscreen, &sscreen->ps_prologs, MESA_SHADER_FRAGMENT, true, &prolog_key, - compiler, debug, si_llvm_build_ps_prolog, "Fragment Shader Prolog"); + compiler, debug, "Fragment Shader Prolog"); if (!shader->prolog) return false; } @@ -1898,70 +3273,16 @@ static bool si_shader_select_ps_parts(struct si_screen *sscreen, struct ac_llvm_ shader->epilog = si_get_shader_part(sscreen, &sscreen->ps_epilogs, MESA_SHADER_FRAGMENT, false, &epilog_key, - compiler, debug, si_llvm_build_ps_epilog, "Fragment Shader Epilog"); + compiler, debug, "Fragment Shader Epilog"); if (!shader->epilog) return false; - /* Enable POS_FIXED_PT if polygon stippling is enabled. */ - if (shader->key.part.ps.prolog.poly_stipple) { - shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1); - assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr)); - } + si_fixup_spi_ps_input_config(shader); - /* Set up the enable bits for per-sample shading if needed. */ - if (shader->key.part.ps.prolog.force_persp_sample_interp && - (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) || - G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) { - shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA; - shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA; - shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1); - } - if (shader->key.part.ps.prolog.force_linear_sample_interp && - (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) || - G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) { - shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA; - shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA; - shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1); - } - if (shader->key.part.ps.prolog.force_persp_center_interp && - (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) || - G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) { - shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA; - shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA; - shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); - } - if (shader->key.part.ps.prolog.force_linear_center_interp && - (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) || - G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) { - shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA; - shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA; - shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); - } - - /* POW_W_FLOAT requires that one of the perspective weights is enabled. */ - if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) && - !(shader->config.spi_ps_input_ena & 0xf)) { - shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); - assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr)); - } - - /* At least one pair of interpolation weights must be enabled. */ - if (!(shader->config.spi_ps_input_ena & 0x7f)) { - shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); - assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr)); - } - - /* Samplemask fixup requires the sample ID. */ - if (shader->key.part.ps.prolog.samplemask_log_ps_iter) { - shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1); - assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr)); - } - - /* The sample mask input is always enabled, because the API shader always - * passes it through to the epilog. Disable it here if it's unused. - */ - if (!shader->key.part.ps.epilog.poly_line_smoothing && !shader->selector->info.reads_samplemask) - shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA; + /* Make sure spi_ps_input_addr bits is superset of spi_ps_input_ena. */ + unsigned spi_ps_input_ena = shader->config.spi_ps_input_ena; + unsigned spi_ps_input_addr = shader->config.spi_ps_input_addr; + assert((spi_ps_input_ena & spi_ps_input_addr) == spi_ps_input_ena); return true; } @@ -1981,24 +3302,29 @@ void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_s *lds_size = MAX2(*lds_size, 8); } -void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader) +static void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader) { unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */ shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs); - if (shader->selector->info.stage == MESA_SHADER_COMPUTE && - si_get_max_workgroup_size(shader) > sscreen->compute_wave_size) { + if (shader->selector->stage == MESA_SHADER_COMPUTE && + si_get_max_workgroup_size(shader) > shader->wave_size) { si_multiwave_lds_size_workaround(sscreen, &shader->config.lds_size); } } bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, - struct si_shader *shader, struct pipe_debug_callback *debug) + struct si_shader *shader, struct util_debug_callback *debug) { struct si_shader_selector *sel = shader->selector; struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key); + if (sel->stage == MESA_SHADER_FRAGMENT) { + shader->ps.writes_samplemask = sel->info.writes_samplemask && + !shader->key.ps.part.epilog.kill_samplemask; + } + /* LS, ES, VS are compiled on demand if the main part hasn't been * compiled for that stage. * @@ -2037,31 +3363,40 @@ bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler shader->is_binary_shared = true; shader->binary = mainp->binary; shader->config = mainp->config; - shader->info.num_input_sgprs = mainp->info.num_input_sgprs; - shader->info.num_input_vgprs = mainp->info.num_input_vgprs; - shader->info.face_vgpr_index = mainp->info.face_vgpr_index; - shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index; - memcpy(shader->info.vs_output_ps_input_cntl, mainp->info.vs_output_ps_input_cntl, - sizeof(mainp->info.vs_output_ps_input_cntl)); - shader->info.uses_instanceid = mainp->info.uses_instanceid; - shader->info.nr_pos_exports = mainp->info.nr_pos_exports; - shader->info.nr_param_exports = mainp->info.nr_param_exports; + shader->info = mainp->info; /* Select prologs and/or epilogs. */ - switch (sel->info.stage) { - case MESA_SHADER_VERTEX: - if (!si_shader_select_vs_parts(sscreen, compiler, shader, debug)) - return false; - break; + switch (sel->stage) { case MESA_SHADER_TESS_CTRL: if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug)) return false; break; - case MESA_SHADER_TESS_EVAL: - break; case MESA_SHADER_GEOMETRY: if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug)) return false; + + /* Clone the GS copy shader for the shader variant. + * We can't just copy the pointer because we change the pm4 state and + * si_shader_selector::gs_copy_shader must be immutable because it's shared + * by multiple contexts. + */ + if (!shader->key.ge.as_ngg) { + assert(sel->main_shader_part == mainp); + assert(sel->main_shader_part->gs_copy_shader); + assert(sel->main_shader_part->gs_copy_shader->bo); + assert(!sel->main_shader_part->gs_copy_shader->previous_stage_sel); + assert(!sel->main_shader_part->gs_copy_shader->scratch_bo); + + shader->gs_copy_shader = CALLOC_STRUCT(si_shader); + memcpy(shader->gs_copy_shader, sel->main_shader_part->gs_copy_shader, + sizeof(*shader->gs_copy_shader)); + /* Increase the reference count. */ + pipe_reference(NULL, &shader->gs_copy_shader->bo->b.b.reference); + /* Initialize some fields differently. */ + shader->gs_copy_shader->shader_log = NULL; + shader->gs_copy_shader->is_binary_shared = true; + util_queue_fence_init(&shader->gs_copy_shader->ready); + } break; case MESA_SHADER_FRAGMENT: if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug)) @@ -2075,6 +3410,9 @@ bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler default:; } + assert(shader->wave_size == mainp->wave_size); + assert(!shader->previous_stage || shader->wave_size == shader->previous_stage->wave_size); + /* Update SGPR and VGPR counts. */ if (shader->prolog) { shader->config.num_sgprs = @@ -2097,12 +3435,8 @@ bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler MAX2(shader->config.scratch_bytes_per_wave, shader->previous_stage->config.scratch_bytes_per_wave); shader->info.uses_instanceid |= shader->previous_stage->info.uses_instanceid; - } - if (shader->prolog2) { - shader->config.num_sgprs = - MAX2(shader->config.num_sgprs, shader->prolog2->config.num_sgprs); - shader->config.num_vgprs = - MAX2(shader->config.num_vgprs, shader->prolog2->config.num_vgprs); + shader->info.uses_vmem_load_other |= shader->previous_stage->info.uses_vmem_load_other; + shader->info.uses_vmem_sampler_or_bvh |= shader->previous_stage->info.uses_vmem_sampler_or_bvh; } if (shader->epilog) { shader->config.num_sgprs = @@ -2113,68 +3447,70 @@ bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler si_calculate_max_simd_waves(shader); } - if (shader->key.as_ngg) { - assert(!shader->key.as_es && !shader->key.as_ls); + if (sel->stage <= MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) { + assert(!shader->key.ge.as_es && !shader->key.ge.as_ls); if (!gfx10_ngg_calculate_subgroup_info(shader)) { fprintf(stderr, "Failed to compute subgroup info\n"); return false; } - } else if (sscreen->info.chip_class >= GFX9 && sel->info.stage == MESA_SHADER_GEOMETRY) { + } else if (sscreen->info.gfx_level >= GFX9 && sel->stage == MESA_SHADER_GEOMETRY) { gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info); } shader->uses_vs_state_provoking_vertex = sscreen->use_ngg && /* Used to convert triangle strips from GS to triangles. */ - ((sel->info.stage == MESA_SHADER_GEOMETRY && + ((sel->stage == MESA_SHADER_GEOMETRY && util_rast_prim_is_triangles(sel->info.base.gs.output_primitive)) || - (sel->info.stage == MESA_SHADER_VERTEX && + (sel->stage == MESA_SHADER_VERTEX && /* Used to export PrimitiveID from the correct vertex. */ - (shader->key.mono.u.vs_export_prim_id || - /* Used to generate triangle strip vertex IDs for all threads. */ - shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP))); + shader->key.ge.mono.u.vs_export_prim_id)); - shader->uses_vs_state_outprim = sscreen->use_ngg && - /* Only used by streamout in vertex shaders. */ - sel->info.stage == MESA_SHADER_VERTEX && - sel->so.num_outputs; + shader->uses_gs_state_outprim = sscreen->use_ngg && + /* Only used by streamout and the PrimID export in vertex + * shaders. */ + sel->stage == MESA_SHADER_VERTEX && + (si_shader_uses_streamout(shader) || + shader->uses_vs_state_provoking_vertex); - if (sel->info.stage == MESA_SHADER_VERTEX) { + if (sel->stage == MESA_SHADER_VERTEX) { shader->uses_base_instance = sel->info.uses_base_instance || - shader->key.part.vs.prolog.instance_divisor_is_one || - shader->key.part.vs.prolog.instance_divisor_is_fetched; - } else if (sel->info.stage == MESA_SHADER_TESS_CTRL) { + shader->key.ge.mono.instance_divisor_is_one || + shader->key.ge.mono.instance_divisor_is_fetched; + } else if (sel->stage == MESA_SHADER_TESS_CTRL) { shader->uses_base_instance = shader->previous_stage_sel && (shader->previous_stage_sel->info.uses_base_instance || - shader->key.part.tcs.ls_prolog.instance_divisor_is_one || - shader->key.part.tcs.ls_prolog.instance_divisor_is_fetched); - } else if (sel->info.stage == MESA_SHADER_GEOMETRY) { + shader->key.ge.mono.instance_divisor_is_one || + shader->key.ge.mono.instance_divisor_is_fetched); + } else if (sel->stage == MESA_SHADER_GEOMETRY) { shader->uses_base_instance = shader->previous_stage_sel && (shader->previous_stage_sel->info.uses_base_instance || - shader->key.part.gs.vs_prolog.instance_divisor_is_one || - shader->key.part.gs.vs_prolog.instance_divisor_is_fetched); + shader->key.ge.mono.instance_divisor_is_one || + shader->key.ge.mono.instance_divisor_is_fetched); } si_fix_resource_usage(sscreen, shader); - si_shader_dump(sscreen, shader, debug, stderr, true); /* Upload. */ - if (!si_shader_binary_upload(sscreen, shader, 0)) { - fprintf(stderr, "LLVM failed to upload shader\n"); - return false; - } + bool ok = si_shader_binary_upload(sscreen, shader, 0); + si_shader_dump(sscreen, shader, debug, stderr, true); - return true; + if (!ok) + fprintf(stderr, "LLVM failed to upload shader\n"); + return ok; } void si_shader_binary_clean(struct si_shader_binary *binary) { - free((void *)binary->elf_buffer); - binary->elf_buffer = NULL; + free((void *)binary->code_buffer); + binary->code_buffer = NULL; free(binary->llvm_ir_string); binary->llvm_ir_string = NULL; + free((void *)binary->symbols); + binary->symbols = NULL; + free(binary->uploaded_code); binary->uploaded_code = NULL; binary->uploaded_code_size = 0; @@ -2192,3 +3528,170 @@ void si_shader_destroy(struct si_shader *shader) free(shader->shader_log); } + +nir_shader *si_get_prev_stage_nir_shader(struct si_shader *shader, + struct si_shader *prev_shader, + struct si_shader_args *args, + bool *free_nir) +{ + const struct si_shader_selector *sel = shader->selector; + const union si_shader_key *key = &shader->key; + + if (sel->stage == MESA_SHADER_TESS_CTRL) { + struct si_shader_selector *ls = key->ge.part.tcs.ls; + + prev_shader->selector = ls; + prev_shader->key.ge.as_ls = 1; + } else { + struct si_shader_selector *es = key->ge.part.gs.es; + + prev_shader->selector = es; + prev_shader->key.ge.as_es = 1; + prev_shader->key.ge.as_ngg = key->ge.as_ngg; + } + + prev_shader->key.ge.mono = key->ge.mono; + prev_shader->key.ge.opt = key->ge.opt; + prev_shader->key.ge.opt.inline_uniforms = false; /* only TCS/GS can inline uniforms */ + /* kill_outputs was computed based on second shader's outputs so we can't use it to + * kill first shader's outputs. + */ + prev_shader->key.ge.opt.kill_outputs = 0; + prev_shader->is_monolithic = true; + + si_init_shader_args(prev_shader, args); + + nir_shader *nir = si_get_nir_shader(prev_shader, args, free_nir, + sel->info.tcs_vgpr_only_inputs, NULL); + + si_update_shader_binary_info(shader, nir); + + shader->info.uses_instanceid |= + prev_shader->selector->info.uses_instanceid || prev_shader->info.uses_instanceid; + + return nir; +} + +unsigned si_get_tcs_out_patch_stride(const struct si_shader_info *info) +{ + unsigned tcs_out_vertices = info->base.tess.tcs_vertices_out; + unsigned vertex_stride = util_last_bit64(info->outputs_written_before_tes_gs) * 4; + unsigned num_patch_outputs = util_last_bit64(info->patch_outputs_written); + + return tcs_out_vertices * vertex_stride + num_patch_outputs * 4; +} + +void si_get_tcs_epilog_args(enum amd_gfx_level gfx_level, + struct si_shader_args *args, + struct ac_arg *rel_patch_id, + struct ac_arg *invocation_id, + struct ac_arg *tf_lds_offset, + struct ac_arg tess_factors[6]) +{ + memset(args, 0, sizeof(*args)); + + if (gfx_level >= GFX9) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* wave info */ + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_factor_offset); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr); + } else { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_factor_offset); + } + + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */ + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */ + /* patch index within the wave (REL_PATCH_ID) */ + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, rel_patch_id); + /* invocation ID within the patch */ + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, invocation_id); + /* LDS offset where tess factors should be loaded from */ + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, tf_lds_offset); + + for (unsigned i = 0; i < 6; i++) + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &tess_factors[i]); +} + +void si_get_ps_prolog_args(struct si_shader_args *args, + const union si_shader_part_key *key) +{ + memset(args, 0, sizeof(*args)); + + const unsigned num_input_sgprs = key->ps_prolog.num_input_sgprs; + + struct ac_arg input_sgprs[num_input_sgprs]; + for (unsigned i = 0; i < num_input_sgprs; i++) + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, input_sgprs + i); + + args->internal_bindings = input_sgprs[SI_SGPR_INTERNAL_BINDINGS]; + /* Use the absolute location of the input. */ + args->ac.prim_mask = input_sgprs[SI_PS_NUM_USER_SGPR]; + + ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.persp_sample); + ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.persp_center); + ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.persp_centroid); + /* skip PERSP_PULL_MODEL */ + ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.linear_sample); + ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.linear_center); + ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.linear_centroid); + /* skip LINE_STIPPLE_TEX */ + + /* POS_X|Y|Z|W_FLOAT */ + for (unsigned i = 0; i < key->ps_prolog.num_fragcoord_components; i++) + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL); + + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.front_face); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.ancillary); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.sample_coverage); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.pos_fixed_pt); +} + +void si_get_ps_epilog_args(struct si_shader_args *args, + const union si_shader_part_key *key, + struct ac_arg colors[MAX_DRAW_BUFFERS], + struct ac_arg *depth, struct ac_arg *stencil, + struct ac_arg *sample_mask) +{ + memset(args, 0, sizeof(*args)); + + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, &args->alpha_reference); + + u_foreach_bit (i, key->ps_epilog.colors_written) { + ac_add_arg(&args->ac, AC_ARG_VGPR, 4, AC_ARG_FLOAT, colors + i); + } + + if (key->ps_epilog.writes_z) + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, depth); + + if (key->ps_epilog.writes_stencil) + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, stencil); + + if (key->ps_epilog.writes_samplemask) + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, sample_mask); +} |