summary refs log tree commit diff
path: root/src/gallium/drivers/radeonsi/si_shader.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium/drivers/radeonsi/si_shader.h')
-rw-r--r-- src/gallium/drivers/radeonsi/si_shader.h 703
1 files changed, 412 insertions, 291 deletions
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index fa32c8ed705..e0b41bc3c87 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -1,25 +1,7 @@
/*
* Copyright 2012 Advanced Micro Devices, Inc.
- * All Rights Reserved.
*
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * SPDX-License-Identifier: MIT
*/
/* The compiler middle-end architecture: Explaining (non-)monolithic shaders
@@ -84,7 +66,6 @@
* shader parts per shader increased. The complete new list of shader parts is:
* - 1st shader: prolog part
* - 1st shader: main part
- * - 2nd shader: prolog part
* - 2nd shader: main part
* - 2nd shader: epilog part
*/
@@ -103,10 +84,9 @@
* Name Location
*
* POSITION 0
- * PSIZE 1
- * CLIPDIST0..1 2..3
- * CULLDIST0..1 (not implemented)
- * GENERIC0..31 4..35
+ * VAR0..31 1..32
+ * CLIP_DIST0..1 49..50
+ * PSIZ 51
*
* For example, a shader only writing VAR0 has the output stride of 2.
*
@@ -131,33 +111,28 @@
#ifndef SI_SHADER_H
#define SI_SHADER_H
+#include "shader_info.h"
#include "ac_binary.h"
-#include "ac_llvm_build.h"
-#include "ac_llvm_util.h"
-#include "util/simple_mtx.h"
-#include "util/u_inlines.h"
+#include "ac_gpu_info.h"
+#include "util/mesa-sha1.h"
#include "util/u_live_shader_cache.h"
#include "util/u_queue.h"
#include "si_pm4.h"
-#include <stdio.h>
-
#ifdef __cplusplus
extern "C" {
#endif
-// Use LDS symbols when supported by LLVM. Can be disabled for testing the old
-// path on newer LLVM for now. Should be removed in the long term.
-#define USE_LDS_SYMBOLS (true)
-
struct nir_shader;
-struct si_shader;
-struct si_context;
+struct nir_instr;
+struct nir_lower_subgroups_options;
+#define SI_NUM_INTERP 32
#define SI_MAX_ATTRIBS 16
#define SI_MAX_VS_OUTPUTS 40
+#define SI_USER_CLIP_PLANE_MASK 0x3F
-#define SI_NGG_PRIM_EDGE_FLAG_BITS ((1 << 9) | (1 << 19) | (1 << 29))
+#define INTERP_MODE_COLOR INTERP_MODE_COUNT
#define SI_PS_INPUT_CNTL_0000 (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(0))
#define SI_PS_INPUT_CNTL_0001 (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(3))
@@ -165,6 +140,10 @@ struct si_context;
/* D3D9 behaviour for COLOR0 requires 0001. GL is undefined. */
#define SI_PS_INPUT_CNTL_UNUSED_COLOR0 SI_PS_INPUT_CNTL_0001
+#define SI_VECTOR_ARG_IS_COLOR BITFIELD_BIT(0)
+#define SI_VECTOR_ARG_COLOR_COMPONENT(x) (((x) & 0x7) << 1)
+#define SI_GET_VECTOR_ARG_COLOR_COMPONENT(x) (((x) >> 1) & 0x7)
+
/* SGPR user data indices */
enum
{
@@ -193,28 +172,23 @@ enum
/* GFX6-8: TCS only */
GFX6_SGPR_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_SGPRS,
- GFX6_SGPR_TCS_OUT_OFFSETS,
- GFX6_SGPR_TCS_OUT_LAYOUT,
+ GFX6_SGPR_TCS_OFFCHIP_ADDR,
GFX6_SGPR_TCS_IN_LAYOUT,
GFX6_TCS_NUM_USER_SGPR,
- /* GFX9: Merged shaders. */
- /* 2ND_CONST_AND_SHADER_BUFFERS is set in USER_DATA_ADDR_LO (SGPR0). */
- /* 2ND_SAMPLERS_AND_IMAGES is set in USER_DATA_ADDR_HI (SGPR1). */
- GFX9_MERGED_NUM_USER_SGPR = SI_VS_NUM_USER_SGPR,
-
/* GFX9: Merged LS-HS (VS-TCS) only. */
- GFX9_SGPR_TCS_OFFCHIP_LAYOUT = GFX9_MERGED_NUM_USER_SGPR,
- GFX9_SGPR_TCS_OUT_OFFSETS,
- GFX9_SGPR_TCS_OUT_LAYOUT,
+ GFX9_SGPR_TCS_OFFCHIP_LAYOUT = SI_VS_NUM_USER_SGPR,
+ GFX9_SGPR_TCS_OFFCHIP_ADDR,
GFX9_TCS_NUM_USER_SGPR,
/* GS limits */
GFX6_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS,
- GFX9_VSGS_NUM_USER_SGPR = SI_VS_NUM_USER_SGPR,
- GFX9_TESGS_NUM_USER_SGPR = SI_TES_NUM_USER_SGPR,
SI_GSCOPY_NUM_USER_SGPR = SI_NUM_VS_STATE_RESOURCE_SGPRS,
+ GFX9_SGPR_SMALL_PRIM_CULL_INFO = MAX2(SI_VS_NUM_USER_SGPR, SI_TES_NUM_USER_SGPR),
+ GFX9_SGPR_ATTRIBUTE_RING_ADDR,
+ GFX9_GS_NUM_USER_SGPR,
+
/* PS only */
SI_SGPR_ALPHA_REF = SI_NUM_RESOURCE_SGPRS,
SI_PS_NUM_USER_SGPR,
@@ -253,23 +227,57 @@ enum
SI_NUM_PARAMS = SI_PARAM_POS_FIXED_PT + 9, /* +8 for COLOR[0..1] */
};
-/* Fields of driver-defined VS state SGPR. */
-#define S_VS_STATE_CLAMP_VERTEX_COLOR(x) (((unsigned)(x)&0x1) << 0)
-#define C_VS_STATE_CLAMP_VERTEX_COLOR 0xFFFFFFFE
-#define S_VS_STATE_INDEXED(x) (((unsigned)(x)&0x1) << 1)
-#define C_VS_STATE_INDEXED 0xFFFFFFFD
-#define S_VS_STATE_OUTPRIM(x) (((unsigned)(x)&0x3) << 2)
-#define C_VS_STATE_OUTPRIM 0xFFFFFFF3
-#define S_VS_STATE_PROVOKING_VTX_INDEX(x) (((unsigned)(x)&0x3) << 4)
-#define C_VS_STATE_PROVOKING_VTX_INDEX 0xFFFFFFCF
-#define S_VS_STATE_STREAMOUT_QUERY_ENABLED(x) (((unsigned)(x)&0x1) << 6)
-#define C_VS_STATE_STREAMOUT_QUERY_ENABLED 0xFFFFFFBF
-#define S_VS_STATE_SMALL_PRIM_PRECISION(x) (((unsigned)(x)&0xF) << 7)
-#define C_VS_STATE_SMALL_PRIM_PRECISION 0xFFFFF87F
-#define S_VS_STATE_LS_OUT_PATCH_SIZE(x) (((unsigned)(x)&0x1FFF) << 11)
-#define C_VS_STATE_LS_OUT_PATCH_SIZE 0xFF0007FF
-#define S_VS_STATE_LS_OUT_VERTEX_SIZE(x) (((unsigned)(x)&0xFF) << 24)
-#define C_VS_STATE_LS_OUT_VERTEX_SIZE 0x00FFFFFF
+/* These fields are only set in current_vs_state (except INDEXED) in si_context, and they are
+ * accessible in the shader via vs_state_bits in VS, TES, and GS.
+ */
+#define VS_STATE_CLAMP_VERTEX_COLOR__SHIFT 0
+#define VS_STATE_CLAMP_VERTEX_COLOR__MASK 0x1 /* Shared by VS and GS */
+#define VS_STATE_INDEXED__SHIFT 1
+#define VS_STATE_INDEXED__MASK 0x1 /* Shared by VS and GS */
+
+/* These fields are only set in current_gs_state in si_context, and they are accessible
+ * in the shader via vs_state_bits in legacy GS, the GS copy shader, and any NGG shader.
+ */
+/* bit gap */
+/* The number of ES outputs is derived from the last output index of SI_UNIQUE_SLOT_* + 1, which
+ * can be 55 at most. The ESGS vertex stride in dwords is: NUM_ES_OUTPUTS * 4 + 1
+ * Only used by GFX9+ to compute LDS addresses of GS inputs.
+ */
+#define GS_STATE_NUM_ES_OUTPUTS__SHIFT 13
+#define GS_STATE_NUM_ES_OUTPUTS__MASK 0x3f
+/* Small prim filter precision = num_samples / quant_mode, which can only be equal to 1/2^n
+ * where n is between 4 and 12. Knowing that, we only need to store 4 bits of the FP32 exponent.
+ * Set it like this: value = (fui(num_samples / quant_mode) >> 23) & 0xf;
+ * Expand to FP32 like this: ((0x70 | value) << 23);
+ * With 0x70 = 112, we get 2^(112 + value - 127) = 2^(value - 15), which is always a negative
+ * exponent and it's equal to 1/2^(15 - value).
+ */
+#define GS_STATE_SMALL_PRIM_PRECISION_NO_AA__SHIFT 19
+#define GS_STATE_SMALL_PRIM_PRECISION_NO_AA__MASK 0xf
+#define GS_STATE_SMALL_PRIM_PRECISION__SHIFT 23
+#define GS_STATE_SMALL_PRIM_PRECISION__MASK 0xf
+#define GS_STATE_STREAMOUT_QUERY_ENABLED__SHIFT 27
+#define GS_STATE_STREAMOUT_QUERY_ENABLED__MASK 0x1
+#define GS_STATE_PROVOKING_VTX_FIRST__SHIFT 28
+#define GS_STATE_PROVOKING_VTX_FIRST__MASK 0x1
+#define GS_STATE_OUTPRIM__SHIFT 29
+#define GS_STATE_OUTPRIM__MASK 0x3
+#define GS_STATE_PIPELINE_STATS_EMU__SHIFT 31
+#define GS_STATE_PIPELINE_STATS_EMU__MASK 0x1
+
+#define ENCODE_FIELD(field, value) (((unsigned)(value) & field##__MASK) << field##__SHIFT)
+#define CLEAR_FIELD(field) (~((unsigned)field##__MASK << field##__SHIFT))
+
+/* This is called by functions that change states. */
+#define SET_FIELD(var, field, value) do { \
+ assert((value) == ((unsigned)(value) & field##__MASK)); \
+ (var) &= CLEAR_FIELD(field); \
+ (var) |= ENCODE_FIELD(field, value); \
+} while (0)
+
+/* This is called during shader compilation and returns LLVMValueRef. */
+#define GET_FIELD(ctx, field) si_unpack_param((ctx), (ctx)->args->vs_state_bits, field##__SHIFT, \
+ util_bitcount(field##__MASK))
enum
{
@@ -277,16 +285,87 @@ enum
SI_VS_BLIT_SGPRS_POS = 3,
SI_VS_BLIT_SGPRS_POS_COLOR = 7,
SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9,
+
+ MAX_SI_VS_BLIT_SGPRS = 10, /* +1 for the attribute ring address */
};
-#define SI_NGG_CULL_ENABLED (1 << 0) /* this implies W, view.xy, and small prim culling */
+#define SI_NGG_CULL_TRIANGLES (1 << 0) /* this implies W, view.xy, and small prim culling */
#define SI_NGG_CULL_BACK_FACE (1 << 1) /* back faces */
#define SI_NGG_CULL_FRONT_FACE (1 << 2) /* front faces */
-#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST (1 << 3) /* GS fast launch: triangles */
-#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP (1 << 4) /* GS fast launch: triangle strip */
-#define SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(x) (((x) & 0x3) << 5) /* 0->0, 1->1, 2->2, 3->4 */
-#define SI_GET_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(x) (((x) >> 5) & 0x3)
-#define SI_NGG_CULL_GS_FAST_LAUNCH_ALL (0xf << 3) /* GS fast launch (both prim types) */
+#define SI_NGG_CULL_LINES (1 << 3) /* the primitive type is lines */
+#define SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT (1 << 4) /* cull small lines according to the diamond exit rule */
+#define SI_NGG_CULL_CLIP_PLANE_ENABLE(enable) (((enable) & 0xff) << 5)
+#define SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(x) (((x) >> 5) & 0xff)
+
+struct si_shader_profile {
+ uint32_t sha1[SHA1_DIGEST_LENGTH32];
+ uint32_t options;
+};
+
+extern struct si_shader_profile si_shader_profiles[];
+unsigned si_get_num_shader_profiles(void);
+
+#define SI_PROFILE_WAVE32 (1 << 0)
+#define SI_PROFILE_GFX10_WAVE64 (1 << 1)
+/* bit gap */
+#define SI_PROFILE_VS_NO_BINNING (1 << 3)
+#define SI_PROFILE_GFX9_GFX10_PS_NO_BINNING (1 << 4)
+#define SI_PROFILE_CLAMP_DIV_BY_ZERO (1 << 5)
+#define SI_PROFILE_NO_OPT_UNIFORM_VARYINGS (1 << 6)
+
+enum si_shader_dump_type {
+ SI_DUMP_SHADER_KEY,
+ SI_DUMP_INIT_NIR, /* initial input NIR when shaders are created (before lowering) */
+ SI_DUMP_NIR, /* final NIR after lowering when shader variants are created */
+ SI_DUMP_INIT_LLVM_IR, /* initial LLVM IR before optimizations */
+ SI_DUMP_LLVM_IR, /* final LLVM IR */
+ SI_DUMP_INIT_ACO_IR, /* initial ACO IR before optimizations */
+ SI_DUMP_ACO_IR, /* final ACO IR */
+ SI_DUMP_ASM, /* final asm shaders */
+ SI_DUMP_STATS, /* print statistics as shader-db */
+ SI_DUMP_ALWAYS,
+};
+
+enum {
+ SI_UNIQUE_SLOT_POS = 0,
+
+ /* Some shader stages use the highest used IO index
+ * to determine the size to allocate for inputs/outputs
+ * (in LDS, tess and GS rings), so VARn should be placed right
+ * after POSITION to make that size as small as possible.
+ */
+ SI_UNIQUE_SLOT_VAR0 = 1, /* 0..31 */
+
+ /* Put 16-bit GLES varyings after 32-bit varyings. They can use the same indices as
+ * legacy desktop GL varyings because they are mutually exclusive.
+ */
+ SI_UNIQUE_SLOT_VAR0_16BIT = 33, /* 0..15 */
+
+ /* Legacy GL-only varyings can alias GLES-only 16-bit varyings. */
+ SI_UNIQUE_SLOT_FOGC = 33,
+ SI_UNIQUE_SLOT_COL0,
+ SI_UNIQUE_SLOT_COL1,
+ SI_UNIQUE_SLOT_BFC0,
+ SI_UNIQUE_SLOT_BFC1,
+ SI_UNIQUE_SLOT_TEX0,
+ SI_UNIQUE_SLOT_TEX1,
+ SI_UNIQUE_SLOT_TEX2,
+ SI_UNIQUE_SLOT_TEX3,
+ SI_UNIQUE_SLOT_TEX4,
+ SI_UNIQUE_SLOT_TEX5,
+ SI_UNIQUE_SLOT_TEX6,
+ SI_UNIQUE_SLOT_TEX7,
+ SI_UNIQUE_SLOT_CLIP_VERTEX,
+
+ /* Varyings present in both GLES and desktop GL must start at 49 after 16-bit varyings. */
+ SI_UNIQUE_SLOT_CLIP_DIST0 = 49,
+ SI_UNIQUE_SLOT_CLIP_DIST1,
+ SI_UNIQUE_SLOT_PSIZ,
+ /* These can't be written by LS, HS, and ES. */
+ SI_UNIQUE_SLOT_LAYER,
+ SI_UNIQUE_SLOT_VIEWPORT,
+ SI_UNIQUE_SLOT_PRIMITIVE_ID,
+};
/**
* For VS shader keys, describe any fixups required for vertex fetch.
@@ -317,7 +396,7 @@ struct si_compiler_ctx_state {
struct ac_llvm_compiler *compiler;
/* Used if thread_index == -1 or if debug.async is true. */
- struct pipe_debug_callback debug;
+ struct util_debug_callback debug;
/* Used for creating the log string for gallium/ddebug. */
bool is_debug_context;
@@ -332,10 +411,10 @@ enum si_color_output_type {
union si_input_info {
struct {
- ubyte semantic;
- ubyte interpolate;
- ubyte fp16_lo_hi_valid;
- ubyte usage_mask;
+ uint8_t semantic;
+ uint8_t interpolate;
+ uint8_t fp16_lo_hi_valid;
+ uint8_t usage_mask;
};
uint32_t _unused; /* this just forces 4-byte alignment */
};
@@ -343,27 +422,50 @@ union si_input_info {
struct si_shader_info {
shader_info base;
- gl_shader_stage stage;
+ uint32_t options; /* bitmask of SI_PROFILE_* */
- ubyte num_inputs;
- ubyte num_outputs;
+ uint8_t num_inputs;
+ uint8_t num_outputs;
union si_input_info input[PIPE_MAX_SHADER_INPUTS];
- ubyte output_semantic[PIPE_MAX_SHADER_OUTPUTS];
- ubyte output_usagemask[PIPE_MAX_SHADER_OUTPUTS];
- ubyte output_readmask[PIPE_MAX_SHADER_OUTPUTS];
- ubyte output_streams[PIPE_MAX_SHADER_OUTPUTS];
- ubyte output_type[PIPE_MAX_SHADER_OUTPUTS]; /* enum nir_alu_type */
+ uint8_t output_semantic[PIPE_MAX_SHADER_OUTPUTS];
+ uint8_t output_usagemask[PIPE_MAX_SHADER_OUTPUTS];
+ uint8_t output_readmask[PIPE_MAX_SHADER_OUTPUTS];
+ uint8_t output_streams[PIPE_MAX_SHADER_OUTPUTS];
+ uint8_t output_type[PIPE_MAX_SHADER_OUTPUTS]; /* enum nir_alu_type */
+
+ uint8_t num_vs_inputs;
+ uint8_t num_vbos_in_user_sgprs;
+ uint8_t num_stream_output_components[4];
+ uint16_t enabled_streamout_buffer_mask;
- ubyte color_interpolate[2];
- ubyte color_interpolate_loc[2];
+ uint64_t inputs_read; /* "get_unique_index" bits */
+ uint64_t tcs_vgpr_only_inputs; /* TCS inputs that are only in VGPRs, not LDS. */
- int constbuf0_num_slots;
- ubyte num_stream_output_components[4];
+ uint64_t outputs_written_before_tes_gs; /* "get_unique_index" bits */
+ uint64_t outputs_written_before_ps; /* "get_unique_index" bits */
+ uint32_t patch_outputs_written; /* "get_unique_index_patch" bits */
- uint num_memory_stores;
+ uint8_t clipdist_mask;
+ uint8_t culldist_mask;
- ubyte colors_read; /**< which color components are read by the FS */
- ubyte colors_written;
+ uint16_t lshs_vertex_stride;
+ uint16_t esgs_vertex_stride;
+ uint16_t gsvs_vertex_size;
+ uint8_t gs_input_verts_per_prim;
+ unsigned max_gsvs_emit_size;
+
+ /* Set to 0xf or 0x0 (4 bits) for each written output.
+ * ANDed with spi_shader_col_format.
+ */
+ unsigned colors_written_4bit;
+
+ int constbuf0_num_slots;
+ uint num_memory_stores;
+ uint8_t color_attr_index[2];
+ uint8_t color_interpolate[2];
+ uint8_t color_interpolate_loc[2];
+ uint8_t colors_read; /**< which color components are read by the FS */
+ uint8_t colors_written;
uint16_t output_color_types; /**< Each bit pair is enum si_color_output_type */
bool color0_writes_all_cbufs; /**< gl_FragColor */
bool reads_samplemask; /**< does fragment shader read sample mask? */
@@ -394,7 +496,7 @@ struct si_shader_info {
bool uses_block_id[3];
bool uses_variable_block_size;
bool uses_grid_size;
- bool uses_subgroup_info;
+ bool uses_tg_size;
bool writes_position;
bool writes_psize;
bool writes_clipvertex;
@@ -404,9 +506,13 @@ struct si_shader_info {
bool uses_bindless_samplers;
bool uses_bindless_images;
bool uses_indirect_descriptor;
+ bool has_divergent_loop;
+ bool uses_sampleid;
+ bool uses_layer_id;
+ bool has_non_uniform_tex_access;
- bool uses_vmem_return_type_sampler_or_bvh;
- bool uses_vmem_return_type_other; /* all other VMEM loads and atomics with return */
+ bool uses_vmem_sampler_or_bvh;
+ bool uses_vmem_load_other; /* all other VMEM loads and atomics with return */
/** Whether all codepaths write tess factors in all invocations. */
bool tessfactors_are_def_in_all_invocs;
@@ -422,6 +528,10 @@ struct si_shader_info {
* texunit + 1.
*/
uint8_t writes_1_if_tex_is_1;
+
+ /* frag coord and sample pos per component read mask. */
+ uint8_t reads_frag_coord_mask;
+ uint8_t reads_sample_pos_mask;
};
/* A shader selector is a gallium CSO and contains shader variants and
@@ -432,10 +542,13 @@ struct si_shader_selector {
struct si_screen *screen;
struct util_queue_fence ready;
struct si_compiler_ctx_state compiler_ctx_state;
+ gl_shader_stage stage;
simple_mtx_t mutex;
- struct si_shader *first_variant; /* immutable after the first variant */
- struct si_shader *last_variant; /* mutable */
+ union si_shader_key *keys;
+ unsigned variants_count;
+ unsigned variants_max_count;
+ struct si_shader **variants;
/* The compiled NIR shader without a prolog and/or epilog (not
* uploaded to a buffer object).
@@ -446,58 +559,26 @@ struct si_shader_selector {
struct si_shader *main_shader_part_ngg; /* as_ngg is set in the key */
struct si_shader *main_shader_part_ngg_es; /* for Wave32 TES before legacy GS */
- struct si_shader *gs_copy_shader;
-
struct nir_shader *nir;
void *nir_binary;
unsigned nir_size;
- struct pipe_stream_output_info so;
struct si_shader_info info;
enum pipe_shader_type pipe_shader_type;
- ubyte const_and_shader_buf_descriptors_index;
- ubyte sampler_and_images_descriptors_index;
- bool vs_needs_prolog;
- ubyte cs_shaderbufs_sgpr_index;
- ubyte cs_num_shaderbufs_in_user_sgprs;
- ubyte cs_images_sgpr_index;
- ubyte cs_images_num_sgprs;
- ubyte cs_num_images_in_user_sgprs;
- ubyte num_vs_inputs;
- ubyte num_vbos_in_user_sgprs;
- unsigned pa_cl_vs_out_cntl;
+ uint8_t const_and_shader_buf_descriptors_index;
+ uint8_t sampler_and_images_descriptors_index;
+ uint8_t cs_shaderbufs_sgpr_index;
+ uint8_t cs_num_shaderbufs_in_user_sgprs;
+ uint8_t cs_images_sgpr_index;
+ uint8_t cs_images_num_sgprs;
+ uint8_t cs_num_images_in_user_sgprs;
unsigned ngg_cull_vert_threshold; /* UINT32_MAX = disabled */
- ubyte clipdist_mask;
- ubyte culldist_mask;
- enum pipe_prim_type rast_prim;
-
- /* ES parameters. */
- uint16_t esgs_itemsize; /* vertex stride */
- uint16_t lshs_vertex_stride;
+ enum mesa_prim rast_prim;
/* GS parameters. */
- uint16_t gsvs_vertex_size;
- ubyte gs_input_verts_per_prim;
- unsigned max_gsvs_emit_size;
- uint16_t enabled_streamout_buffer_mask;
bool tess_turns_off_ngg;
- /* PS parameters. */
- ubyte color_attr_index[2];
- unsigned db_shader_control;
- /* Set 0xf or 0x0 (4 bits) per each written output.
- * ANDed with spi_shader_col_format.
- */
- unsigned colors_written_4bit;
-
- uint64_t outputs_written_before_ps; /* "get_unique_index" bits */
- uint64_t outputs_written; /* "get_unique_index" bits */
- uint32_t patch_outputs_written; /* "get_unique_index_patch" bits */
-
- uint64_t inputs_read; /* "get_unique_index" bits */
- uint64_t tcs_vgpr_only_inputs; /* TCS inputs that are only in VGPRs, not LDS. */
-
/* bitmasks of used descriptor slots */
uint64_t active_const_and_shader_buffers;
uint64_t active_samplers_and_images;
@@ -529,30 +610,6 @@ struct si_shader_selector {
*/
#pragma pack(push, 1)
-/* Common VS bits between the shader key and the prolog key. */
-struct si_vs_prolog_bits {
- /* - If neither "is_one" nor "is_fetched" has a bit set, the instance
- * divisor is 0.
- * - If "is_one" has a bit set, the instance divisor is 1.
- * - If "is_fetched" has a bit set, the instance divisor will be loaded
- * from the constant buffer.
- */
- uint16_t instance_divisor_is_one; /* bitmask of inputs */
- uint16_t instance_divisor_is_fetched; /* bitmask of inputs */
- unsigned ls_vgpr_fix : 1;
-};
-
-/* Common TCS bits between the shader key and the epilog key. */
-struct si_tcs_epilog_bits {
- unsigned prim_mode : 3;
- unsigned invoc0_tess_factors_are_def : 1;
- unsigned tes_reads_tess_factors : 1;
-};
-
-struct si_gs_prolog_bits {
- unsigned tri_strip_adj_fix : 1;
-};
-
/* Common PS bits between the shader key and the prolog key. */
struct si_ps_prolog_bits {
unsigned color_two_side : 1;
@@ -575,49 +632,30 @@ struct si_ps_epilog_bits {
unsigned last_cbuf : 3;
unsigned alpha_func : 3;
unsigned alpha_to_one : 1;
- unsigned poly_line_smoothing : 1;
+ unsigned alpha_to_coverage_via_mrtz : 1; /* gfx11+ */
unsigned clamp_color : 1;
+ unsigned dual_src_blend_swizzle : 1; /* gfx11+ */
+ unsigned rbplus_depth_only_opt:1;
+ unsigned kill_samplemask:1;
};
union si_shader_part_key {
struct {
- struct si_vs_prolog_bits states;
- unsigned num_input_sgprs : 6;
- /* For merged stages such as LS-HS, HS input VGPRs are first. */
- unsigned num_merged_next_stage_vgprs : 3;
- unsigned num_inputs : 5;
- unsigned as_ls : 1;
- unsigned as_es : 1;
- unsigned as_ngg : 1;
- unsigned gs_fast_launch_tri_list : 1; /* for NGG culling */
- unsigned gs_fast_launch_tri_strip : 1; /* for NGG culling */
- unsigned gs_fast_launch_index_size_packed : 2;
- unsigned load_vgprs_after_culling : 1;
- /* Prologs for monolithic shaders shouldn't set EXEC. */
- unsigned is_monolithic : 1;
- } vs_prolog;
- struct {
- struct si_tcs_epilog_bits states;
- } tcs_epilog;
- struct {
- struct si_gs_prolog_bits states;
- unsigned as_ngg : 1;
- } gs_prolog;
- struct {
struct si_ps_prolog_bits states;
+ unsigned wave32 : 1;
unsigned num_input_sgprs : 6;
- unsigned num_input_vgprs : 5;
/* Color interpolation and two-side color selection. */
unsigned colors_read : 8; /* color input components read */
unsigned num_interp_inputs : 5; /* BCOLOR is at this location */
- unsigned face_vgpr_index : 5;
- unsigned ancillary_vgpr_index : 5;
+ unsigned num_fragcoord_components : 3;
unsigned wqm : 1;
char color_attr_index[2];
signed char color_interp_vgpr_index[2]; /* -1 == constant */
} ps_prolog;
struct {
struct si_ps_epilog_bits states;
+ unsigned wave32 : 1;
+ unsigned uses_discard : 1;
unsigned colors_written : 8;
unsigned color_types : 16;
unsigned writes_z : 1;
@@ -626,26 +664,16 @@ union si_shader_part_key {
} ps_epilog;
};
-struct si_shader_key {
+/* The shader key for geometry stages (VS, TCS, TES, GS) */
+struct si_shader_key_ge {
/* Prolog and epilog flags. */
union {
struct {
- struct si_vs_prolog_bits prolog;
- } vs;
- struct {
- struct si_vs_prolog_bits ls_prolog; /* for merged LS-HS */
struct si_shader_selector *ls; /* for merged LS-HS */
- struct si_tcs_epilog_bits epilog;
} tcs; /* tessellation control shader */
struct {
- struct si_vs_prolog_bits vs_prolog; /* for merged ES-GS */
struct si_shader_selector *es; /* for merged ES-GS */
- struct si_gs_prolog_bits prolog;
} gs;
- struct {
- struct si_ps_prolog_bits prolog;
- struct si_ps_epilog_bits epilog;
- } ps;
} part;
/* These three are initially set according to the NEXT_SHADER property,
@@ -658,6 +686,15 @@ struct si_shader_key {
/* Flags for monolithic compilation only. */
struct {
+ /* - If neither "is_one" nor "is_fetched" has a bit set, the instance
+ * divisor is 0.
+ * - If "is_one" has a bit set, the instance divisor is 1.
+ * - If "is_fetched" has a bit set, the instance divisor will be loaded
+ * from the constant buffer.
+ */
+ uint16_t instance_divisor_is_one; /* bitmask of inputs */
+ uint16_t instance_divisor_is_fetched; /* bitmask of inputs */
+
/* Whether fetch should be opencoded according to vs_fix_fetch.
* Otherwise, if vs_fix_fetch is non-zero, buffer_load_format_xyzw
* with minimal fixups is used. */
@@ -665,15 +702,9 @@ struct si_shader_key {
union si_vs_fix_fetch vs_fix_fetch[SI_MAX_ATTRIBS];
union {
- uint64_t ff_tcs_inputs_to_copy; /* for fixed-func TCS */
/* When PS needs PrimID and GS is disabled. */
- unsigned vs_export_prim_id : 1;
- struct {
- unsigned interpolate_at_sample_force_center : 1;
- unsigned fbfetch_msaa : 1;
- unsigned fbfetch_is_1D : 1;
- unsigned fbfetch_layered : 1;
- } ps;
+ unsigned vs_export_prim_id : 1; /* VS and TES only */
+ unsigned gs_tri_strip_adj_fix : 1; /* GS only */
} u;
} mono;
@@ -683,9 +714,12 @@ struct si_shader_key {
uint64_t kill_outputs; /* "get_unique_index" bits */
unsigned kill_clip_distances : 8;
unsigned kill_pointsize : 1;
+ unsigned kill_layer : 1;
+ unsigned remove_streamout : 1;
/* For NGG VS and TES. */
- unsigned ngg_culling : 7; /* SI_NGG_CULL_* */
+ unsigned ngg_culling : 13; /* SI_NGG_CULL_* */
+
/* For shaders where monolithic variants have better code.
*
@@ -698,8 +732,50 @@ struct si_shader_key {
/* VS and TCS have the same number of patch vertices. */
unsigned same_patch_vertices:1;
+ /* For TCS. */
+ unsigned tes_prim_mode : 3;
+ unsigned tes_reads_tess_factors : 1;
+
+ unsigned inline_uniforms:1;
+
+ /* This must be kept last to limit the number of variants
+ * depending only on the uniform values.
+ */
+ uint32_t inlined_uniform_values[MAX_INLINABLE_UNIFORMS];
+ } opt;
+};
+
+struct si_shader_key_ps {
+ struct {
+ /* Prolog and epilog flags. */
+ struct si_ps_prolog_bits prolog;
+ struct si_ps_epilog_bits epilog;
+ } part;
+
+ /* Flags for monolithic compilation only. */
+ struct {
+ unsigned poly_line_smoothing : 1;
+ unsigned point_smoothing : 1;
+ unsigned interpolate_at_sample_force_center : 1;
+ unsigned fbfetch_msaa : 1;
+ unsigned fbfetch_is_1D : 1;
+ unsigned fbfetch_layered : 1;
+ } mono;
+
+ /* Optimization flags for asynchronous compilation only. */
+ struct {
+ /* For shaders where monolithic variants have better code.
+ *
+ * This is a flag that has no effect on code generation,
+ * but forces monolithic shaders to be used as soon as
+ * possible, because it's in the "opt" group.
+ */
+ unsigned prefer_mono : 1;
unsigned inline_uniforms:1;
+ /* This eliminates the FRONT_FACE input VGPR as well as shader code using it. */
+ int force_front_face_input : 2; /* 0 = gl_FrontFacing, 1 = true, -1 = false */
+
/* This must be kept last to limit the number of variants
* depending only on the uniform values.
*/
@@ -707,32 +783,56 @@ struct si_shader_key {
} opt;
};
+union si_shader_key {
+ struct si_shader_key_ge ge; /* geometry engine shaders */
+ struct si_shader_key_ps ps;
+};
+
/* Restore the pack alignment to default. */
#pragma pack(pop)
/* GCN-specific shader info. */
struct si_shader_binary_info {
- ubyte vs_output_param_offset[SI_MAX_VS_OUTPUTS];
+ uint8_t vs_output_param_offset[NUM_TOTAL_VARYING_SLOTS];
uint32_t vs_output_ps_input_cntl[NUM_TOTAL_VARYING_SLOTS];
- ubyte num_input_sgprs;
- ubyte num_input_vgprs;
- signed char face_vgpr_index;
- signed char ancillary_vgpr_index;
+ union si_input_info ps_inputs[SI_NUM_INTERP];
+ uint8_t num_ps_inputs;
+ uint8_t ps_colors_read;
+ uint8_t num_input_sgprs;
+ uint8_t num_input_vgprs;
+ bool uses_vmem_load_other; /* all other VMEM loads and atomics with return */
+ bool uses_vmem_sampler_or_bvh;
+ uint8_t num_fragcoord_components;
bool uses_instanceid;
- ubyte nr_pos_exports;
- ubyte nr_param_exports;
+ uint8_t nr_pos_exports;
+ uint8_t nr_param_exports;
unsigned private_mem_vgprs;
unsigned max_simd_waves;
};
+enum si_shader_binary_type {
+ SI_SHADER_BINARY_ELF,
+ SI_SHADER_BINARY_RAW,
+};
+
struct si_shader_binary {
- const char *elf_buffer;
- size_t elf_size;
+ enum si_shader_binary_type type;
+
+ /* Depends on binary type, either ELF or raw buffer. */
+ const char *code_buffer;
+ size_t code_size;
+ uint32_t exec_size;
char *uploaded_code;
size_t uploaded_code_size;
char *llvm_ir_string;
+
+ const char *disasm_string;
+ size_t disasm_size;
+
+ const unsigned *symbols;
+ unsigned num_symbols;
};
struct gfx9_gs_info {
@@ -743,57 +843,32 @@ struct gfx9_gs_info {
unsigned esgs_ring_size; /* in bytes */
};
-#define SI_NUM_VGT_STAGES_KEY_BITS 6
-#define SI_NUM_VGT_STAGES_STATES (1 << SI_NUM_VGT_STAGES_KEY_BITS)
-
-/* The VGT_SHADER_STAGES key used to index the table of precomputed values.
- * Some fields are set by state-change calls, most are set by draw_vbo.
- */
-union si_vgt_stages_key {
- struct {
-#if UTIL_ARCH_LITTLE_ENDIAN
- uint8_t tess : 1;
- uint8_t gs : 1;
- uint8_t ngg_gs_fast_launch : 1;
- uint8_t ngg_passthrough : 1;
- uint8_t ngg : 1; /* gfx10+ */
- uint8_t streamout : 1; /* only used with NGG */
- uint8_t _pad : 8 - SI_NUM_VGT_STAGES_KEY_BITS;
-#else /* UTIL_ARCH_BIG_ENDIAN */
- uint8_t _pad : 8 - SI_NUM_VGT_STAGES_KEY_BITS;
- uint8_t streamout : 1;
- uint8_t ngg : 1;
- uint8_t ngg_passthrough : 1;
- uint8_t ngg_gs_fast_launch : 1;
- uint8_t gs : 1;
- uint8_t tess : 1;
-#endif
- } u;
- uint8_t index;
-};
-
struct si_shader {
struct si_pm4_state pm4; /* base class */
struct si_compiler_ctx_state compiler_ctx_state;
struct si_shader_selector *selector;
struct si_shader_selector *previous_stage_sel; /* for refcounting */
- struct si_shader *next_variant;
struct si_shader_part *prolog;
struct si_shader *previous_stage; /* for GFX9 */
- struct si_shader_part *prolog2;
struct si_shader_part *epilog;
+ struct si_shader *gs_copy_shader;
struct si_resource *bo;
+ /* gpu_address should be bo->gpu_address except if SQTT is
+ * in use.
+ */
+ uint64_t gpu_address;
struct si_resource *scratch_bo;
- struct si_shader_key key;
+ union si_shader_key key;
struct util_queue_fence ready;
bool compilation_failed;
bool is_monolithic;
bool is_optimized;
bool is_binary_shared;
bool is_gs_copy_shader;
+ uint8_t wave_size;
/* The following data is all that's needed for binary shaders. */
struct si_shader_binary binary;
@@ -802,19 +877,10 @@ struct si_shader {
/* SI_SGPR_VS_STATE_BITS */
bool uses_vs_state_provoking_vertex;
- bool uses_vs_state_outprim;
+ bool uses_gs_state_outprim;
bool uses_base_instance;
- struct {
- uint16_t ngg_emit_size; /* in dwords */
- uint16_t hw_max_esverts;
- uint16_t max_gsprims;
- uint16_t max_out_verts;
- uint16_t prim_amp_factor;
- bool max_vert_out_per_gs_instance;
- } ngg;
-
/* Shader key + LLVM IR + disassembly + statistics.
* Generated for debug contexts only.
*/
@@ -823,7 +889,7 @@ struct si_shader {
struct gfx9_gs_info gs_info;
- /* For save precompute context registers values. */
+ /* Precomputed register values. */
union {
struct {
unsigned vgt_gsvs_ring_offset_1;
@@ -844,22 +910,27 @@ struct si_shader {
} gs;
struct {
+ /* Computed by gfx10_ngg_calculate_subgroup_info. */
+ uint16_t ngg_emit_size; /* in dwords */
+ uint16_t hw_max_esverts;
+ uint16_t max_gsprims;
+ uint16_t max_out_verts;
+ bool max_vert_out_per_gs_instance;
+ /* Register values. */
unsigned ge_max_output_per_subgroup;
unsigned ge_ngg_subgrp_cntl;
unsigned vgt_primitiveid_en;
unsigned vgt_gs_onchip_cntl;
unsigned vgt_gs_instance_cnt;
- unsigned vgt_esgs_ring_itemsize;
+ unsigned esgs_vertex_stride;
unsigned spi_vs_out_config;
- unsigned spi_shader_idx_format;
unsigned spi_shader_pos_format;
unsigned pa_cl_vte_cntl;
- unsigned pa_cl_ngg_cntl;
unsigned vgt_gs_max_vert_out; /* for API GS */
unsigned ge_pc_alloc; /* uconfig register */
unsigned spi_shader_pgm_rsrc3_gs;
unsigned spi_shader_pgm_rsrc4_gs;
- union si_vgt_stages_key vgt_stages;
+ unsigned vgt_shader_stages_en;
} ngg;
struct {
@@ -880,11 +951,13 @@ struct si_shader {
unsigned spi_shader_z_format;
unsigned spi_shader_col_format;
unsigned cb_shader_mask;
+ unsigned db_shader_control;
unsigned num_interp;
+ bool writes_samplemask;
} ps;
- } ctx_reg;
+ };
- /*For save precompute registers value */
+ /* Precomputed register values. */
unsigned vgt_tf_param; /* VGT_TF_PARAM */
unsigned vgt_vertex_reuse_block_cntl; /* VGT_VERTEX_REUSE_BLOCK_CNTL */
unsigned pa_cl_vs_out_cntl;
@@ -899,54 +972,70 @@ struct si_shader_part {
};
/* si_shader.c */
+struct ac_rtld_binary;
+
+void si_update_shader_binary_info(struct si_shader *shader, struct nir_shader *nir);
bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
- struct si_shader *shader, struct pipe_debug_callback *debug);
+ struct si_shader *shader, struct util_debug_callback *debug);
bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
- struct si_shader *shader, struct pipe_debug_callback *debug);
+ struct si_shader *shader, struct util_debug_callback *debug);
void si_shader_destroy(struct si_shader *shader);
-unsigned si_shader_io_get_unique_index_patch(unsigned semantic);
-unsigned si_shader_io_get_unique_index(unsigned semantic, bool is_varying);
+unsigned si_shader_io_get_unique_index(unsigned semantic);
bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
uint64_t scratch_va);
+bool si_can_dump_shader(struct si_screen *sscreen, gl_shader_stage stage,
+ enum si_shader_dump_type dump_type);
void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
- struct pipe_debug_callback *debug, FILE *f, bool check_debug_option);
+ struct util_debug_callback *debug, FILE *f, bool check_debug_option);
void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,
- struct pipe_debug_callback *debug);
+ struct util_debug_callback *debug);
void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size);
const char *si_get_shader_name(const struct si_shader *shader);
void si_shader_binary_clean(struct si_shader_binary *binary);
-
-/* si_shader_llvm_gs.c */
-struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
- struct ac_llvm_compiler *compiler,
- struct si_shader_selector *gs_selector,
- struct pipe_debug_callback *debug);
+struct nir_shader *si_deserialize_shader(struct si_shader_selector *sel);
+unsigned si_get_ps_num_interp(struct si_shader *ps);
+bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
+ struct ac_rtld_binary *rtld);
+bool si_get_external_symbol(enum amd_gfx_level gfx_level, void *data, const char *name,
+ uint64_t *value);
+unsigned si_get_shader_prefetch_size(struct si_shader *shader);
+
+/* si_shader_info.c */
+void si_nir_scan_shader(struct si_screen *sscreen, const struct nir_shader *nir,
+ struct si_shader_info *info);
/* si_shader_nir.c */
-void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *info);
+void si_lower_mediump_io(struct nir_shader *nir);
+
+bool si_alu_to_scalar_packed_math_filter(const struct nir_instr *instr, const void *data);
void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first);
-void si_nir_late_opts(nir_shader *nir);
+void si_nir_late_opts(struct nir_shader *nir);
char *si_finalize_nir(struct pipe_screen *screen, void *nirptr);
-/* si_state_shaders.c */
+/* si_state_shaders.cpp */
+unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *shader);
void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs,
struct gfx9_gs_info *out);
bool gfx10_is_ngg_passthrough(struct si_shader *shader);
+bool si_should_clear_lds(struct si_screen *sscreen, const struct nir_shader *shader);
+
/* Inline helpers. */
/* Return the pointer to the main shader part's pointer. */
static inline struct si_shader **si_get_main_shader_part(struct si_shader_selector *sel,
- const struct si_shader_key *key)
+ const union si_shader_key *key)
{
- if (key->as_ls)
- return &sel->main_shader_part_ls;
- if (key->as_es && key->as_ngg)
- return &sel->main_shader_part_ngg_es;
- if (key->as_es)
- return &sel->main_shader_part_es;
- if (key->as_ngg)
- return &sel->main_shader_part_ngg;
+ if (sel->stage <= MESA_SHADER_GEOMETRY) {
+ if (key->ge.as_ls)
+ return &sel->main_shader_part_ls;
+ if (key->ge.as_es && key->ge.as_ngg)
+ return &sel->main_shader_part_ngg_es;
+ if (key->ge.as_es)
+ return &sel->main_shader_part_es;
+ if (key->ge.as_ngg)
+ return &sel->main_shader_part_ngg;
+ }
return &sel->main_shader_part;
}
@@ -960,6 +1049,38 @@ static inline bool si_shader_uses_bindless_images(struct si_shader_selector *sel
return selector ? selector->info.uses_bindless_images : false;
}
+static inline bool gfx10_edgeflags_have_effect(struct si_shader *shader)
+{
+ if (shader->selector->stage == MESA_SHADER_VERTEX &&
+ !shader->selector->info.base.vs.blit_sgprs_amd &&
+ !(shader->key.ge.opt.ngg_culling & SI_NGG_CULL_LINES))
+ return true;
+
+ return false;
+}
+
+static inline bool gfx10_ngg_writes_user_edgeflags(struct si_shader *shader)
+{
+ return gfx10_edgeflags_have_effect(shader) &&
+ shader->selector->info.writes_edgeflag;
+}
+
+static inline bool si_shader_uses_streamout(const struct si_shader *shader)
+{
+ return shader->selector->stage <= MESA_SHADER_GEOMETRY &&
+ shader->selector->info.enabled_streamout_buffer_mask &&
+ !shader->key.ge.opt.remove_streamout;
+}
+
+static inline bool si_shader_uses_discard(struct si_shader *shader)
+{
+ /* Changes to this should also update ps_modifies_zs. */
+ return shader->selector->info.base.fs.uses_discard ||
+ shader->key.ps.part.prolog.poly_stipple ||
+ shader->key.ps.mono.point_smoothing ||
+ shader->key.ps.part.epilog.alpha_func != PIPE_FUNC_ALWAYS;
+}
+
#ifdef __cplusplus
}
#endif