diff options
Diffstat (limited to 'src/gallium/drivers/radeonsi/si_shader.h')
-rw-r--r-- | src/gallium/drivers/radeonsi/si_shader.h | 703 |
1 files changed, 412 insertions, 291 deletions
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index fa32c8ed705..e0b41bc3c87 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -1,25 +1,7 @@ /* * Copyright 2012 Advanced Micro Devices, Inc. - * All Rights Reserved. * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * on the rights to use, copy, modify, merge, publish, distribute, sub - * license, and/or sell copies of the Software, and to permit persons to whom - * the Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. + * SPDX-License-Identifier: MIT */ /* The compiler middle-end architecture: Explaining (non-)monolithic shaders @@ -84,7 +66,6 @@ * shader parts per shader increased. 
The complete new list of shader parts is: * - 1st shader: prolog part * - 1st shader: main part - * - 2nd shader: prolog part * - 2nd shader: main part * - 2nd shader: epilog part */ @@ -103,10 +84,9 @@ * Name Location * * POSITION 0 - * PSIZE 1 - * CLIPDIST0..1 2..3 - * CULLDIST0..1 (not implemented) - * GENERIC0..31 4..35 + * VAR0..31 1..32 + * CLIP_DIST0..1 49..50 + * PSIZ 51 * * For example, a shader only writing GENERIC0 has the output stride of 5. * @@ -131,33 +111,28 @@ #ifndef SI_SHADER_H #define SI_SHADER_H +#include "shader_info.h" #include "ac_binary.h" -#include "ac_llvm_build.h" -#include "ac_llvm_util.h" -#include "util/simple_mtx.h" -#include "util/u_inlines.h" +#include "ac_gpu_info.h" +#include "util/mesa-sha1.h" #include "util/u_live_shader_cache.h" #include "util/u_queue.h" #include "si_pm4.h" -#include <stdio.h> - #ifdef __cplusplus extern "C" { #endif -// Use LDS symbols when supported by LLVM. Can be disabled for testing the old -// path on newer LLVM for now. Should be removed in the long term. -#define USE_LDS_SYMBOLS (true) - struct nir_shader; -struct si_shader; -struct si_context; +struct nir_instr; +struct nir_lower_subgroups_options; +#define SI_NUM_INTERP 32 #define SI_MAX_ATTRIBS 16 #define SI_MAX_VS_OUTPUTS 40 +#define SI_USER_CLIP_PLANE_MASK 0x3F -#define SI_NGG_PRIM_EDGE_FLAG_BITS ((1 << 9) | (1 << 19) | (1 << 29)) +#define INTERP_MODE_COLOR INTERP_MODE_COUNT #define SI_PS_INPUT_CNTL_0000 (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(0)) #define SI_PS_INPUT_CNTL_0001 (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(3)) @@ -165,6 +140,10 @@ struct si_context; /* D3D9 behaviour for COLOR0 requires 0001. GL is undefined. 
*/ #define SI_PS_INPUT_CNTL_UNUSED_COLOR0 SI_PS_INPUT_CNTL_0001 +#define SI_VECTOR_ARG_IS_COLOR BITFIELD_BIT(0) +#define SI_VECTOR_ARG_COLOR_COMPONENT(x) (((x) & 0x7) << 1) +#define SI_GET_VECTOR_ARG_COLOR_COMPONENT(x) (((x) >> 1) & 0x7) + /* SGPR user data indices */ enum { @@ -193,28 +172,23 @@ enum /* GFX6-8: TCS only */ GFX6_SGPR_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_SGPRS, - GFX6_SGPR_TCS_OUT_OFFSETS, - GFX6_SGPR_TCS_OUT_LAYOUT, + GFX6_SGPR_TCS_OFFCHIP_ADDR, GFX6_SGPR_TCS_IN_LAYOUT, GFX6_TCS_NUM_USER_SGPR, - /* GFX9: Merged shaders. */ - /* 2ND_CONST_AND_SHADER_BUFFERS is set in USER_DATA_ADDR_LO (SGPR0). */ - /* 2ND_SAMPLERS_AND_IMAGES is set in USER_DATA_ADDR_HI (SGPR1). */ - GFX9_MERGED_NUM_USER_SGPR = SI_VS_NUM_USER_SGPR, - /* GFX9: Merged LS-HS (VS-TCS) only. */ - GFX9_SGPR_TCS_OFFCHIP_LAYOUT = GFX9_MERGED_NUM_USER_SGPR, - GFX9_SGPR_TCS_OUT_OFFSETS, - GFX9_SGPR_TCS_OUT_LAYOUT, + GFX9_SGPR_TCS_OFFCHIP_LAYOUT = SI_VS_NUM_USER_SGPR, + GFX9_SGPR_TCS_OFFCHIP_ADDR, GFX9_TCS_NUM_USER_SGPR, /* GS limits */ GFX6_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS, - GFX9_VSGS_NUM_USER_SGPR = SI_VS_NUM_USER_SGPR, - GFX9_TESGS_NUM_USER_SGPR = SI_TES_NUM_USER_SGPR, SI_GSCOPY_NUM_USER_SGPR = SI_NUM_VS_STATE_RESOURCE_SGPRS, + GFX9_SGPR_SMALL_PRIM_CULL_INFO = MAX2(SI_VS_NUM_USER_SGPR, SI_TES_NUM_USER_SGPR), + GFX9_SGPR_ATTRIBUTE_RING_ADDR, + GFX9_GS_NUM_USER_SGPR, + /* PS only */ SI_SGPR_ALPHA_REF = SI_NUM_RESOURCE_SGPRS, SI_PS_NUM_USER_SGPR, @@ -253,23 +227,57 @@ enum SI_NUM_PARAMS = SI_PARAM_POS_FIXED_PT + 9, /* +8 for COLOR[0..1] */ }; -/* Fields of driver-defined VS state SGPR. 
*/ -#define S_VS_STATE_CLAMP_VERTEX_COLOR(x) (((unsigned)(x)&0x1) << 0) -#define C_VS_STATE_CLAMP_VERTEX_COLOR 0xFFFFFFFE -#define S_VS_STATE_INDEXED(x) (((unsigned)(x)&0x1) << 1) -#define C_VS_STATE_INDEXED 0xFFFFFFFD -#define S_VS_STATE_OUTPRIM(x) (((unsigned)(x)&0x3) << 2) -#define C_VS_STATE_OUTPRIM 0xFFFFFFF3 -#define S_VS_STATE_PROVOKING_VTX_INDEX(x) (((unsigned)(x)&0x3) << 4) -#define C_VS_STATE_PROVOKING_VTX_INDEX 0xFFFFFFCF -#define S_VS_STATE_STREAMOUT_QUERY_ENABLED(x) (((unsigned)(x)&0x1) << 6) -#define C_VS_STATE_STREAMOUT_QUERY_ENABLED 0xFFFFFFBF -#define S_VS_STATE_SMALL_PRIM_PRECISION(x) (((unsigned)(x)&0xF) << 7) -#define C_VS_STATE_SMALL_PRIM_PRECISION 0xFFFFF87F -#define S_VS_STATE_LS_OUT_PATCH_SIZE(x) (((unsigned)(x)&0x1FFF) << 11) -#define C_VS_STATE_LS_OUT_PATCH_SIZE 0xFF0007FF -#define S_VS_STATE_LS_OUT_VERTEX_SIZE(x) (((unsigned)(x)&0xFF) << 24) -#define C_VS_STATE_LS_OUT_VERTEX_SIZE 0x00FFFFFF +/* These fields are only set in current_vs_state (except INDEXED) in si_context, and they are + * accessible in the shader via vs_state_bits in VS, TES, and GS. + */ +#define VS_STATE_CLAMP_VERTEX_COLOR__SHIFT 0 +#define VS_STATE_CLAMP_VERTEX_COLOR__MASK 0x1 /* Shared by VS and GS */ +#define VS_STATE_INDEXED__SHIFT 1 +#define VS_STATE_INDEXED__MASK 0x1 /* Shared by VS and GS */ + +/* These fields are only set in current_gs_state in si_context, and they are accessible + * in the shader via vs_state_bits in legacy GS, the GS copy shader, and any NGG shader. + */ +/* bit gap */ +/* The number of ES outputs is derived from the last output index of SI_UNIQUE_SLOT_* + 1, which + * can be 55 at most. The ESGS vertex stride in dwords is: NUM_ES_OUTPUTS * 4 + 1 + * Only used by GFX9+ to compute LDS addresses of GS inputs. + */ +#define GS_STATE_NUM_ES_OUTPUTS__SHIFT 13 +#define GS_STATE_NUM_ES_OUTPUTS__MASK 0x3f +/* Small prim filter precision = num_samples / quant_mode, which can only be equal to 1/2^n + * where n is between 4 and 12. 
Knowing that, we only need to store 4 bits of the FP32 exponent. + * Set it like this: value = (fui(num_samples / quant_mode) >> 23) & 0xf; + * Expand to FP32 like this: ((0x70 | value) << 23); + * With 0x70 = 112, we get 2^(112 + value - 127) = 2^(value - 15), which is always a negative + * exponent and it's equal to 1/2^(15 - value). + */ +#define GS_STATE_SMALL_PRIM_PRECISION_NO_AA__SHIFT 19 +#define GS_STATE_SMALL_PRIM_PRECISION_NO_AA__MASK 0xf +#define GS_STATE_SMALL_PRIM_PRECISION__SHIFT 23 +#define GS_STATE_SMALL_PRIM_PRECISION__MASK 0xf +#define GS_STATE_STREAMOUT_QUERY_ENABLED__SHIFT 27 +#define GS_STATE_STREAMOUT_QUERY_ENABLED__MASK 0x1 +#define GS_STATE_PROVOKING_VTX_FIRST__SHIFT 28 +#define GS_STATE_PROVOKING_VTX_FIRST__MASK 0x1 +#define GS_STATE_OUTPRIM__SHIFT 29 +#define GS_STATE_OUTPRIM__MASK 0x3 +#define GS_STATE_PIPELINE_STATS_EMU__SHIFT 31 +#define GS_STATE_PIPELINE_STATS_EMU__MASK 0x1 + +#define ENCODE_FIELD(field, value) (((unsigned)(value) & field##__MASK) << field##__SHIFT) +#define CLEAR_FIELD(field) (~((unsigned)field##__MASK << field##__SHIFT)) + +/* This is called by functions that change states. */ +#define SET_FIELD(var, field, value) do { \ + assert((value) == ((unsigned)(value) & field##__MASK)); \ + (var) &= CLEAR_FIELD(field); \ + (var) |= ENCODE_FIELD(field, value); \ +} while (0) + +/* This is called during shader compilation and returns LLVMValueRef. 
*/ +#define GET_FIELD(ctx, field) si_unpack_param((ctx), (ctx)->args->vs_state_bits, field##__SHIFT, \ + util_bitcount(field##__MASK)) enum { @@ -277,16 +285,87 @@ enum SI_VS_BLIT_SGPRS_POS = 3, SI_VS_BLIT_SGPRS_POS_COLOR = 7, SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9, + + MAX_SI_VS_BLIT_SGPRS = 10, /* +1 for the attribute ring address */ }; -#define SI_NGG_CULL_ENABLED (1 << 0) /* this implies W, view.xy, and small prim culling */ +#define SI_NGG_CULL_TRIANGLES (1 << 0) /* this implies W, view.xy, and small prim culling */ #define SI_NGG_CULL_BACK_FACE (1 << 1) /* back faces */ #define SI_NGG_CULL_FRONT_FACE (1 << 2) /* front faces */ -#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST (1 << 3) /* GS fast launch: triangles */ -#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP (1 << 4) /* GS fast launch: triangle strip */ -#define SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(x) (((x) & 0x3) << 5) /* 0->0, 1->1, 2->2, 3->4 */ -#define SI_GET_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(x) (((x) >> 5) & 0x3) -#define SI_NGG_CULL_GS_FAST_LAUNCH_ALL (0xf << 3) /* GS fast launch (both prim types) */ +#define SI_NGG_CULL_LINES (1 << 3) /* the primitive type is lines */ +#define SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT (1 << 4) /* cull small lines according to the diamond exit rule */ +#define SI_NGG_CULL_CLIP_PLANE_ENABLE(enable) (((enable) & 0xff) << 5) +#define SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(x) (((x) >> 5) & 0xff) + +struct si_shader_profile { + uint32_t sha1[SHA1_DIGEST_LENGTH32]; + uint32_t options; +}; + +extern struct si_shader_profile si_shader_profiles[]; +unsigned si_get_num_shader_profiles(void); + +#define SI_PROFILE_WAVE32 (1 << 0) +#define SI_PROFILE_GFX10_WAVE64 (1 << 1) +/* bit gap */ +#define SI_PROFILE_VS_NO_BINNING (1 << 3) +#define SI_PROFILE_GFX9_GFX10_PS_NO_BINNING (1 << 4) +#define SI_PROFILE_CLAMP_DIV_BY_ZERO (1 << 5) +#define SI_PROFILE_NO_OPT_UNIFORM_VARYINGS (1 << 6) + +enum si_shader_dump_type { + SI_DUMP_SHADER_KEY, + SI_DUMP_INIT_NIR, /* initial input NIR when 
shaders are created (before lowering) */ + SI_DUMP_NIR, /* final NIR after lowering when shader variants are created */ + SI_DUMP_INIT_LLVM_IR, /* initial LLVM IR before optimizations */ + SI_DUMP_LLVM_IR, /* final LLVM IR */ + SI_DUMP_INIT_ACO_IR, /* initial ACO IR before optimizations */ + SI_DUMP_ACO_IR, /* final ACO IR */ + SI_DUMP_ASM, /* final asm shaders */ + SI_DUMP_STATS, /* print statistics as shader-db */ + SI_DUMP_ALWAYS, +}; + +enum { + SI_UNIQUE_SLOT_POS = 0, + + /* Since some shader stages use the highest used IO index + * to determine the size to allocate for inputs/outputs + * (in LDS, tess and GS rings), VARn should be placed right + * after POSITION to make that size as small as possible. + */ + SI_UNIQUE_SLOT_VAR0 = 1, /* 0..31 */ + + /* Put 16-bit GLES varyings after 32-bit varyings. They can use the same indices as + * legacy desktop GL varyings because they are mutually exclusive. + */ + SI_UNIQUE_SLOT_VAR0_16BIT = 33, /* 0..15 */ + + /* Legacy GL-only varyings can alias GLES-only 16-bit varyings. */ + SI_UNIQUE_SLOT_FOGC = 33, + SI_UNIQUE_SLOT_COL0, + SI_UNIQUE_SLOT_COL1, + SI_UNIQUE_SLOT_BFC0, + SI_UNIQUE_SLOT_BFC1, + SI_UNIQUE_SLOT_TEX0, + SI_UNIQUE_SLOT_TEX1, + SI_UNIQUE_SLOT_TEX2, + SI_UNIQUE_SLOT_TEX3, + SI_UNIQUE_SLOT_TEX4, + SI_UNIQUE_SLOT_TEX5, + SI_UNIQUE_SLOT_TEX6, + SI_UNIQUE_SLOT_TEX7, + SI_UNIQUE_SLOT_CLIP_VERTEX, + + /* Varyings present in both GLES and desktop GL must start at 49 after 16-bit varyings. */ + SI_UNIQUE_SLOT_CLIP_DIST0 = 49, + SI_UNIQUE_SLOT_CLIP_DIST1, + SI_UNIQUE_SLOT_PSIZ, + /* These can't be written by LS, HS, and ES. */ + SI_UNIQUE_SLOT_LAYER, + SI_UNIQUE_SLOT_VIEWPORT, + SI_UNIQUE_SLOT_PRIMITIVE_ID, +}; /** * For VS shader keys, describe any fixups required for vertex fetch. @@ -317,7 +396,7 @@ struct si_compiler_ctx_state { struct ac_llvm_compiler *compiler; /* Used if thread_index == -1 or if debug.async is true. 
*/ - struct pipe_debug_callback debug; + struct util_debug_callback debug; /* Used for creating the log string for gallium/ddebug. */ bool is_debug_context; @@ -332,10 +411,10 @@ enum si_color_output_type { union si_input_info { struct { - ubyte semantic; - ubyte interpolate; - ubyte fp16_lo_hi_valid; - ubyte usage_mask; + uint8_t semantic; + uint8_t interpolate; + uint8_t fp16_lo_hi_valid; + uint8_t usage_mask; }; uint32_t _unused; /* this just forces 4-byte alignment */ }; @@ -343,27 +422,50 @@ union si_input_info { struct si_shader_info { shader_info base; - gl_shader_stage stage; + uint32_t options; /* bitmask of SI_PROFILE_* */ - ubyte num_inputs; - ubyte num_outputs; + uint8_t num_inputs; + uint8_t num_outputs; union si_input_info input[PIPE_MAX_SHADER_INPUTS]; - ubyte output_semantic[PIPE_MAX_SHADER_OUTPUTS]; - ubyte output_usagemask[PIPE_MAX_SHADER_OUTPUTS]; - ubyte output_readmask[PIPE_MAX_SHADER_OUTPUTS]; - ubyte output_streams[PIPE_MAX_SHADER_OUTPUTS]; - ubyte output_type[PIPE_MAX_SHADER_OUTPUTS]; /* enum nir_alu_type */ + uint8_t output_semantic[PIPE_MAX_SHADER_OUTPUTS]; + uint8_t output_usagemask[PIPE_MAX_SHADER_OUTPUTS]; + uint8_t output_readmask[PIPE_MAX_SHADER_OUTPUTS]; + uint8_t output_streams[PIPE_MAX_SHADER_OUTPUTS]; + uint8_t output_type[PIPE_MAX_SHADER_OUTPUTS]; /* enum nir_alu_type */ + + uint8_t num_vs_inputs; + uint8_t num_vbos_in_user_sgprs; + uint8_t num_stream_output_components[4]; + uint16_t enabled_streamout_buffer_mask; - ubyte color_interpolate[2]; - ubyte color_interpolate_loc[2]; + uint64_t inputs_read; /* "get_unique_index" bits */ + uint64_t tcs_vgpr_only_inputs; /* TCS inputs that are only in VGPRs, not LDS. 
*/ - int constbuf0_num_slots; - ubyte num_stream_output_components[4]; + uint64_t outputs_written_before_tes_gs; /* "get_unique_index" bits */ + uint64_t outputs_written_before_ps; /* "get_unique_index" bits */ + uint32_t patch_outputs_written; /* "get_unique_index_patch" bits */ - uint num_memory_stores; + uint8_t clipdist_mask; + uint8_t culldist_mask; - ubyte colors_read; /**< which color components are read by the FS */ - ubyte colors_written; + uint16_t lshs_vertex_stride; + uint16_t esgs_vertex_stride; + uint16_t gsvs_vertex_size; + uint8_t gs_input_verts_per_prim; + unsigned max_gsvs_emit_size; + + /* Set 0xf or 0x0 (4 bits) per each written output. + * ANDed with spi_shader_col_format. + */ + unsigned colors_written_4bit; + + int constbuf0_num_slots; + uint num_memory_stores; + uint8_t color_attr_index[2]; + uint8_t color_interpolate[2]; + uint8_t color_interpolate_loc[2]; + uint8_t colors_read; /**< which color components are read by the FS */ + uint8_t colors_written; uint16_t output_color_types; /**< Each bit pair is enum si_color_output_type */ bool color0_writes_all_cbufs; /**< gl_FragColor */ bool reads_samplemask; /**< does fragment shader read sample mask? */ @@ -394,7 +496,7 @@ struct si_shader_info { bool uses_block_id[3]; bool uses_variable_block_size; bool uses_grid_size; - bool uses_subgroup_info; + bool uses_tg_size; bool writes_position; bool writes_psize; bool writes_clipvertex; @@ -404,9 +506,13 @@ struct si_shader_info { bool uses_bindless_samplers; bool uses_bindless_images; bool uses_indirect_descriptor; + bool has_divergent_loop; + bool uses_sampleid; + bool uses_layer_id; + bool has_non_uniform_tex_access; - bool uses_vmem_return_type_sampler_or_bvh; - bool uses_vmem_return_type_other; /* all other VMEM loads and atomics with return */ + bool uses_vmem_sampler_or_bvh; + bool uses_vmem_load_other; /* all other VMEM loads and atomics with return */ /** Whether all codepaths write tess factors in all invocations. 
*/ bool tessfactors_are_def_in_all_invocs; @@ -422,6 +528,10 @@ struct si_shader_info { * texunit + 1. */ uint8_t writes_1_if_tex_is_1; + + /* frag coord and sample pos per component read mask. */ + uint8_t reads_frag_coord_mask; + uint8_t reads_sample_pos_mask; }; /* A shader selector is a gallium CSO and contains shader variants and @@ -432,10 +542,13 @@ struct si_shader_selector { struct si_screen *screen; struct util_queue_fence ready; struct si_compiler_ctx_state compiler_ctx_state; + gl_shader_stage stage; simple_mtx_t mutex; - struct si_shader *first_variant; /* immutable after the first variant */ - struct si_shader *last_variant; /* mutable */ + union si_shader_key *keys; + unsigned variants_count; + unsigned variants_max_count; + struct si_shader **variants; /* The compiled NIR shader without a prolog and/or epilog (not * uploaded to a buffer object). @@ -446,58 +559,26 @@ struct si_shader_selector { struct si_shader *main_shader_part_ngg; /* as_ngg is set in the key */ struct si_shader *main_shader_part_ngg_es; /* for Wave32 TES before legacy GS */ - struct si_shader *gs_copy_shader; - struct nir_shader *nir; void *nir_binary; unsigned nir_size; - struct pipe_stream_output_info so; struct si_shader_info info; enum pipe_shader_type pipe_shader_type; - ubyte const_and_shader_buf_descriptors_index; - ubyte sampler_and_images_descriptors_index; - bool vs_needs_prolog; - ubyte cs_shaderbufs_sgpr_index; - ubyte cs_num_shaderbufs_in_user_sgprs; - ubyte cs_images_sgpr_index; - ubyte cs_images_num_sgprs; - ubyte cs_num_images_in_user_sgprs; - ubyte num_vs_inputs; - ubyte num_vbos_in_user_sgprs; - unsigned pa_cl_vs_out_cntl; + uint8_t const_and_shader_buf_descriptors_index; + uint8_t sampler_and_images_descriptors_index; + uint8_t cs_shaderbufs_sgpr_index; + uint8_t cs_num_shaderbufs_in_user_sgprs; + uint8_t cs_images_sgpr_index; + uint8_t cs_images_num_sgprs; + uint8_t cs_num_images_in_user_sgprs; unsigned ngg_cull_vert_threshold; /* UINT32_MAX = disabled */ - 
ubyte clipdist_mask; - ubyte culldist_mask; - enum pipe_prim_type rast_prim; - - /* ES parameters. */ - uint16_t esgs_itemsize; /* vertex stride */ - uint16_t lshs_vertex_stride; + enum mesa_prim rast_prim; /* GS parameters. */ - uint16_t gsvs_vertex_size; - ubyte gs_input_verts_per_prim; - unsigned max_gsvs_emit_size; - uint16_t enabled_streamout_buffer_mask; bool tess_turns_off_ngg; - /* PS parameters. */ - ubyte color_attr_index[2]; - unsigned db_shader_control; - /* Set 0xf or 0x0 (4 bits) per each written output. - * ANDed with spi_shader_col_format. - */ - unsigned colors_written_4bit; - - uint64_t outputs_written_before_ps; /* "get_unique_index" bits */ - uint64_t outputs_written; /* "get_unique_index" bits */ - uint32_t patch_outputs_written; /* "get_unique_index_patch" bits */ - - uint64_t inputs_read; /* "get_unique_index" bits */ - uint64_t tcs_vgpr_only_inputs; /* TCS inputs that are only in VGPRs, not LDS. */ - /* bitmasks of used descriptor slots */ uint64_t active_const_and_shader_buffers; uint64_t active_samplers_and_images; @@ -529,30 +610,6 @@ struct si_shader_selector { */ #pragma pack(push, 1) -/* Common VS bits between the shader key and the prolog key. */ -struct si_vs_prolog_bits { - /* - If neither "is_one" nor "is_fetched" has a bit set, the instance - * divisor is 0. - * - If "is_one" has a bit set, the instance divisor is 1. - * - If "is_fetched" has a bit set, the instance divisor will be loaded - * from the constant buffer. - */ - uint16_t instance_divisor_is_one; /* bitmask of inputs */ - uint16_t instance_divisor_is_fetched; /* bitmask of inputs */ - unsigned ls_vgpr_fix : 1; -}; - -/* Common TCS bits between the shader key and the epilog key. */ -struct si_tcs_epilog_bits { - unsigned prim_mode : 3; - unsigned invoc0_tess_factors_are_def : 1; - unsigned tes_reads_tess_factors : 1; -}; - -struct si_gs_prolog_bits { - unsigned tri_strip_adj_fix : 1; -}; - /* Common PS bits between the shader key and the prolog key. 
*/ struct si_ps_prolog_bits { unsigned color_two_side : 1; @@ -575,49 +632,30 @@ struct si_ps_epilog_bits { unsigned last_cbuf : 3; unsigned alpha_func : 3; unsigned alpha_to_one : 1; - unsigned poly_line_smoothing : 1; + unsigned alpha_to_coverage_via_mrtz : 1; /* gfx11+ */ unsigned clamp_color : 1; + unsigned dual_src_blend_swizzle : 1; /* gfx11+ */ + unsigned rbplus_depth_only_opt:1; + unsigned kill_samplemask:1; }; union si_shader_part_key { struct { - struct si_vs_prolog_bits states; - unsigned num_input_sgprs : 6; - /* For merged stages such as LS-HS, HS input VGPRs are first. */ - unsigned num_merged_next_stage_vgprs : 3; - unsigned num_inputs : 5; - unsigned as_ls : 1; - unsigned as_es : 1; - unsigned as_ngg : 1; - unsigned gs_fast_launch_tri_list : 1; /* for NGG culling */ - unsigned gs_fast_launch_tri_strip : 1; /* for NGG culling */ - unsigned gs_fast_launch_index_size_packed : 2; - unsigned load_vgprs_after_culling : 1; - /* Prologs for monolithic shaders shouldn't set EXEC. */ - unsigned is_monolithic : 1; - } vs_prolog; - struct { - struct si_tcs_epilog_bits states; - } tcs_epilog; - struct { - struct si_gs_prolog_bits states; - unsigned as_ngg : 1; - } gs_prolog; - struct { struct si_ps_prolog_bits states; + unsigned wave32 : 1; unsigned num_input_sgprs : 6; - unsigned num_input_vgprs : 5; /* Color interpolation and two-side color selection. 
*/ unsigned colors_read : 8; /* color input components read */ unsigned num_interp_inputs : 5; /* BCOLOR is at this location */ - unsigned face_vgpr_index : 5; - unsigned ancillary_vgpr_index : 5; + unsigned num_fragcoord_components : 3; unsigned wqm : 1; char color_attr_index[2]; signed char color_interp_vgpr_index[2]; /* -1 == constant */ } ps_prolog; struct { struct si_ps_epilog_bits states; + unsigned wave32 : 1; + unsigned uses_discard : 1; unsigned colors_written : 8; unsigned color_types : 16; unsigned writes_z : 1; @@ -626,26 +664,16 @@ union si_shader_part_key { } ps_epilog; }; -struct si_shader_key { +/* The shader key for geometry stages (VS, TCS, TES, GS) */ +struct si_shader_key_ge { /* Prolog and epilog flags. */ union { struct { - struct si_vs_prolog_bits prolog; - } vs; - struct { - struct si_vs_prolog_bits ls_prolog; /* for merged LS-HS */ struct si_shader_selector *ls; /* for merged LS-HS */ - struct si_tcs_epilog_bits epilog; } tcs; /* tessellation control shader */ struct { - struct si_vs_prolog_bits vs_prolog; /* for merged ES-GS */ struct si_shader_selector *es; /* for merged ES-GS */ - struct si_gs_prolog_bits prolog; } gs; - struct { - struct si_ps_prolog_bits prolog; - struct si_ps_epilog_bits epilog; - } ps; } part; /* These three are initially set according to the NEXT_SHADER property, @@ -658,6 +686,15 @@ struct si_shader_key { /* Flags for monolithic compilation only. */ struct { + /* - If neither "is_one" nor "is_fetched" has a bit set, the instance + * divisor is 0. + * - If "is_one" has a bit set, the instance divisor is 1. + * - If "is_fetched" has a bit set, the instance divisor will be loaded + * from the constant buffer. + */ + uint16_t instance_divisor_is_one; /* bitmask of inputs */ + uint16_t instance_divisor_is_fetched; /* bitmask of inputs */ + /* Whether fetch should be opencoded according to vs_fix_fetch. * Otherwise, if vs_fix_fetch is non-zero, buffer_load_format_xyzw * with minimal fixups is used. 
*/ @@ -665,15 +702,9 @@ struct si_shader_key { union si_vs_fix_fetch vs_fix_fetch[SI_MAX_ATTRIBS]; union { - uint64_t ff_tcs_inputs_to_copy; /* for fixed-func TCS */ /* When PS needs PrimID and GS is disabled. */ - unsigned vs_export_prim_id : 1; - struct { - unsigned interpolate_at_sample_force_center : 1; - unsigned fbfetch_msaa : 1; - unsigned fbfetch_is_1D : 1; - unsigned fbfetch_layered : 1; - } ps; + unsigned vs_export_prim_id : 1; /* VS and TES only */ + unsigned gs_tri_strip_adj_fix : 1; /* GS only */ } u; } mono; @@ -683,9 +714,12 @@ struct si_shader_key { uint64_t kill_outputs; /* "get_unique_index" bits */ unsigned kill_clip_distances : 8; unsigned kill_pointsize : 1; + unsigned kill_layer : 1; + unsigned remove_streamout : 1; /* For NGG VS and TES. */ - unsigned ngg_culling : 7; /* SI_NGG_CULL_* */ + unsigned ngg_culling : 13; /* SI_NGG_CULL_* */ + /* For shaders where monolithic variants have better code. * @@ -698,8 +732,50 @@ struct si_shader_key { /* VS and TCS have the same number of patch vertices. */ unsigned same_patch_vertices:1; + /* For TCS. */ + unsigned tes_prim_mode : 3; + unsigned tes_reads_tess_factors : 1; + + unsigned inline_uniforms:1; + + /* This must be kept last to limit the number of variants + * depending only on the uniform values. + */ + uint32_t inlined_uniform_values[MAX_INLINABLE_UNIFORMS]; + } opt; +}; + +struct si_shader_key_ps { + struct { + /* Prolog and epilog flags. */ + struct si_ps_prolog_bits prolog; + struct si_ps_epilog_bits epilog; + } part; + + /* Flags for monolithic compilation only. */ + struct { + unsigned poly_line_smoothing : 1; + unsigned point_smoothing : 1; + unsigned interpolate_at_sample_force_center : 1; + unsigned fbfetch_msaa : 1; + unsigned fbfetch_is_1D : 1; + unsigned fbfetch_layered : 1; + } mono; + + /* Optimization flags for asynchronous compilation only. */ + struct { + /* For shaders where monolithic variants have better code. 
+ * + * This is a flag that has no effect on code generation, + * but forces monolithic shaders to be used as soon as + * possible, because it's in the "opt" group. + */ + unsigned prefer_mono : 1; unsigned inline_uniforms:1; + /* This eliminates the FRONT_FACE input VGPR as well as shader code using it. */ + int force_front_face_input : 2; /* 0 = gl_FrontFacing, 1 = true, -1 = false */ + /* This must be kept last to limit the number of variants * depending only on the uniform values. */ @@ -707,32 +783,56 @@ struct si_shader_key { } opt; }; +union si_shader_key { + struct si_shader_key_ge ge; /* geometry engine shaders */ + struct si_shader_key_ps ps; +}; + /* Restore the pack alignment to default. */ #pragma pack(pop) /* GCN-specific shader info. */ struct si_shader_binary_info { - ubyte vs_output_param_offset[SI_MAX_VS_OUTPUTS]; + uint8_t vs_output_param_offset[NUM_TOTAL_VARYING_SLOTS]; uint32_t vs_output_ps_input_cntl[NUM_TOTAL_VARYING_SLOTS]; - ubyte num_input_sgprs; - ubyte num_input_vgprs; - signed char face_vgpr_index; - signed char ancillary_vgpr_index; + union si_input_info ps_inputs[SI_NUM_INTERP]; + uint8_t num_ps_inputs; + uint8_t ps_colors_read; + uint8_t num_input_sgprs; + uint8_t num_input_vgprs; + bool uses_vmem_load_other; /* all other VMEM loads and atomics with return */ + bool uses_vmem_sampler_or_bvh; + uint8_t num_fragcoord_components; bool uses_instanceid; - ubyte nr_pos_exports; - ubyte nr_param_exports; + uint8_t nr_pos_exports; + uint8_t nr_param_exports; unsigned private_mem_vgprs; unsigned max_simd_waves; }; +enum si_shader_binary_type { + SI_SHADER_BINARY_ELF, + SI_SHADER_BINARY_RAW, +}; + struct si_shader_binary { - const char *elf_buffer; - size_t elf_size; + enum si_shader_binary_type type; + + /* Depends on binary type, either ELF or raw buffer. 
*/ + const char *code_buffer; + size_t code_size; + uint32_t exec_size; char *uploaded_code; size_t uploaded_code_size; char *llvm_ir_string; + + const char *disasm_string; + size_t disasm_size; + + const unsigned *symbols; + unsigned num_symbols; }; struct gfx9_gs_info { @@ -743,57 +843,32 @@ struct gfx9_gs_info { unsigned esgs_ring_size; /* in bytes */ }; -#define SI_NUM_VGT_STAGES_KEY_BITS 6 -#define SI_NUM_VGT_STAGES_STATES (1 << SI_NUM_VGT_STAGES_KEY_BITS) - -/* The VGT_SHADER_STAGES key used to index the table of precomputed values. - * Some fields are set by state-change calls, most are set by draw_vbo. - */ -union si_vgt_stages_key { - struct { -#if UTIL_ARCH_LITTLE_ENDIAN - uint8_t tess : 1; - uint8_t gs : 1; - uint8_t ngg_gs_fast_launch : 1; - uint8_t ngg_passthrough : 1; - uint8_t ngg : 1; /* gfx10+ */ - uint8_t streamout : 1; /* only used with NGG */ - uint8_t _pad : 8 - SI_NUM_VGT_STAGES_KEY_BITS; -#else /* UTIL_ARCH_BIG_ENDIAN */ - uint8_t _pad : 8 - SI_NUM_VGT_STAGES_KEY_BITS; - uint8_t streamout : 1; - uint8_t ngg : 1; - uint8_t ngg_passthrough : 1; - uint8_t ngg_gs_fast_launch : 1; - uint8_t gs : 1; - uint8_t tess : 1; -#endif - } u; - uint8_t index; -}; - struct si_shader { struct si_pm4_state pm4; /* base class */ struct si_compiler_ctx_state compiler_ctx_state; struct si_shader_selector *selector; struct si_shader_selector *previous_stage_sel; /* for refcounting */ - struct si_shader *next_variant; struct si_shader_part *prolog; struct si_shader *previous_stage; /* for GFX9 */ - struct si_shader_part *prolog2; struct si_shader_part *epilog; + struct si_shader *gs_copy_shader; struct si_resource *bo; + /* gpu_address should be bo->gpu_address except if SQTT is + * in use. 
+ */ + uint64_t gpu_address; struct si_resource *scratch_bo; - struct si_shader_key key; + union si_shader_key key; struct util_queue_fence ready; bool compilation_failed; bool is_monolithic; bool is_optimized; bool is_binary_shared; bool is_gs_copy_shader; + uint8_t wave_size; /* The following data is all that's needed for binary shaders. */ struct si_shader_binary binary; @@ -802,19 +877,10 @@ struct si_shader { /* SI_SGPR_VS_STATE_BITS */ bool uses_vs_state_provoking_vertex; - bool uses_vs_state_outprim; + bool uses_gs_state_outprim; bool uses_base_instance; - struct { - uint16_t ngg_emit_size; /* in dwords */ - uint16_t hw_max_esverts; - uint16_t max_gsprims; - uint16_t max_out_verts; - uint16_t prim_amp_factor; - bool max_vert_out_per_gs_instance; - } ngg; - /* Shader key + LLVM IR + disassembly + statistics. * Generated for debug contexts only. */ @@ -823,7 +889,7 @@ struct si_shader { struct gfx9_gs_info gs_info; - /* For save precompute context registers values. */ + /* Precomputed register values. */ union { struct { unsigned vgt_gsvs_ring_offset_1; @@ -844,22 +910,27 @@ struct si_shader { } gs; struct { + /* Computed by gfx10_ngg_calculate_subgroup_info. */ + uint16_t ngg_emit_size; /* in dwords */ + uint16_t hw_max_esverts; + uint16_t max_gsprims; + uint16_t max_out_verts; + bool max_vert_out_per_gs_instance; + /* Register values. 
*/ unsigned ge_max_output_per_subgroup; unsigned ge_ngg_subgrp_cntl; unsigned vgt_primitiveid_en; unsigned vgt_gs_onchip_cntl; unsigned vgt_gs_instance_cnt; - unsigned vgt_esgs_ring_itemsize; + unsigned esgs_vertex_stride; unsigned spi_vs_out_config; - unsigned spi_shader_idx_format; unsigned spi_shader_pos_format; unsigned pa_cl_vte_cntl; - unsigned pa_cl_ngg_cntl; unsigned vgt_gs_max_vert_out; /* for API GS */ unsigned ge_pc_alloc; /* uconfig register */ unsigned spi_shader_pgm_rsrc3_gs; unsigned spi_shader_pgm_rsrc4_gs; - union si_vgt_stages_key vgt_stages; + unsigned vgt_shader_stages_en; } ngg; struct { @@ -880,11 +951,13 @@ struct si_shader { unsigned spi_shader_z_format; unsigned spi_shader_col_format; unsigned cb_shader_mask; + unsigned db_shader_control; unsigned num_interp; + bool writes_samplemask; } ps; - } ctx_reg; + }; - /*For save precompute registers value */ + /* Precomputed register values. */ unsigned vgt_tf_param; /* VGT_TF_PARAM */ unsigned vgt_vertex_reuse_block_cntl; /* VGT_VERTEX_REUSE_BLOCK_CNTL */ unsigned pa_cl_vs_out_cntl; @@ -899,54 +972,70 @@ struct si_shader_part { }; /* si_shader.c */ +struct ac_rtld_binary; + +void si_update_shader_binary_info(struct si_shader *shader, struct nir_shader *nir); bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, - struct si_shader *shader, struct pipe_debug_callback *debug); + struct si_shader *shader, struct util_debug_callback *debug); bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, - struct si_shader *shader, struct pipe_debug_callback *debug); + struct si_shader *shader, struct util_debug_callback *debug); void si_shader_destroy(struct si_shader *shader); -unsigned si_shader_io_get_unique_index_patch(unsigned semantic); -unsigned si_shader_io_get_unique_index(unsigned semantic, bool is_varying); +unsigned si_shader_io_get_unique_index(unsigned semantic); bool si_shader_binary_upload(struct si_screen *sscreen, struct 
si_shader *shader, uint64_t scratch_va); +bool si_can_dump_shader(struct si_screen *sscreen, gl_shader_stage stage, + enum si_shader_dump_type dump_type); void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader, - struct pipe_debug_callback *debug, FILE *f, bool check_debug_option); + struct util_debug_callback *debug, FILE *f, bool check_debug_option); void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader, - struct pipe_debug_callback *debug); + struct util_debug_callback *debug); void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size); const char *si_get_shader_name(const struct si_shader *shader); void si_shader_binary_clean(struct si_shader_binary *binary); - -/* si_shader_llvm_gs.c */ -struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - struct si_shader_selector *gs_selector, - struct pipe_debug_callback *debug); +struct nir_shader *si_deserialize_shader(struct si_shader_selector *sel); +unsigned si_get_ps_num_interp(struct si_shader *ps); +bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader, + struct ac_rtld_binary *rtld); +bool si_get_external_symbol(enum amd_gfx_level gfx_level, void *data, const char *name, + uint64_t *value); +unsigned si_get_shader_prefetch_size(struct si_shader *shader); + +/* si_shader_info.c */ +void si_nir_scan_shader(struct si_screen *sscreen, const struct nir_shader *nir, + struct si_shader_info *info); /* si_shader_nir.c */ -void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *info); +void si_lower_mediump_io(struct nir_shader *nir); + +bool si_alu_to_scalar_packed_math_filter(const struct nir_instr *instr, const void *data); void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first); -void si_nir_late_opts(nir_shader *nir); +void si_nir_late_opts(struct nir_shader *nir); char *si_finalize_nir(struct pipe_screen *screen, 
void *nirptr); -/* si_state_shaders.c */ +/* si_state_shaders.cpp */ +unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *shader); void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs, struct gfx9_gs_info *out); bool gfx10_is_ngg_passthrough(struct si_shader *shader); +bool si_should_clear_lds(struct si_screen *sscreen, const struct nir_shader *shader); + /* Inline helpers. */ /* Return the pointer to the main shader part's pointer. */ static inline struct si_shader **si_get_main_shader_part(struct si_shader_selector *sel, - const struct si_shader_key *key) + const union si_shader_key *key) { - if (key->as_ls) - return &sel->main_shader_part_ls; - if (key->as_es && key->as_ngg) - return &sel->main_shader_part_ngg_es; - if (key->as_es) - return &sel->main_shader_part_es; - if (key->as_ngg) - return &sel->main_shader_part_ngg; + if (sel->stage <= MESA_SHADER_GEOMETRY) { + if (key->ge.as_ls) + return &sel->main_shader_part_ls; + if (key->ge.as_es && key->ge.as_ngg) + return &sel->main_shader_part_ngg_es; + if (key->ge.as_es) + return &sel->main_shader_part_es; + if (key->ge.as_ngg) + return &sel->main_shader_part_ngg; + } return &sel->main_shader_part; } @@ -960,6 +1049,38 @@ static inline bool si_shader_uses_bindless_images(struct si_shader_selector *sel return selector ? 
selector->info.uses_bindless_images : false; } +static inline bool gfx10_edgeflags_have_effect(struct si_shader *shader) +{ + if (shader->selector->stage == MESA_SHADER_VERTEX && + !shader->selector->info.base.vs.blit_sgprs_amd && + !(shader->key.ge.opt.ngg_culling & SI_NGG_CULL_LINES)) + return true; + + return false; +} + +static inline bool gfx10_ngg_writes_user_edgeflags(struct si_shader *shader) +{ + return gfx10_edgeflags_have_effect(shader) && + shader->selector->info.writes_edgeflag; +} + +static inline bool si_shader_uses_streamout(const struct si_shader *shader) +{ + return shader->selector->stage <= MESA_SHADER_GEOMETRY && + shader->selector->info.enabled_streamout_buffer_mask && + !shader->key.ge.opt.remove_streamout; +} + +static inline bool si_shader_uses_discard(struct si_shader *shader) +{ + /* Changes to this should also update ps_modifies_zs. */ + return shader->selector->info.base.fs.uses_discard || + shader->key.ps.part.prolog.poly_stipple || + shader->key.ps.mono.point_smoothing || + shader->key.ps.part.epilog.alpha_func != PIPE_FUNC_ALWAYS; +} + #ifdef __cplusplus } #endif |