diff options
Diffstat (limited to 'src/gallium/drivers/radeonsi/si_state.c')
-rw-r--r-- | src/gallium/drivers/radeonsi/si_state.c | 3679 |
1 files changed, 2332 insertions, 1347 deletions
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 8180201bd28..67099b1e366 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -1,25 +1,7 @@ /* * Copyright 2012 Advanced Micro Devices, Inc. - * All Rights Reserved. * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * on the rights to use, copy, modify, merge, publish, distribute, sub - * license, and/or sell copies of the Software, and to permit persons to whom - * the Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. + * SPDX-License-Identifier: MIT */ #include "si_build_pm4.h" @@ -30,6 +12,7 @@ #include "util/format/u_format.h" #include "util/format/u_format_s3tc.h" #include "util/u_dual_blend.h" +#include "util/u_helpers.h" #include "util/u_memory.h" #include "util/u_resource.h" #include "util/u_upload_mgr.h" @@ -67,7 +50,7 @@ static unsigned si_pack_float_12p4(float x) * CB_TARGET_MASK is emitted here to avoid a hang with dual source blending * if there is not enough PS outputs. 
*/ -static void si_emit_cb_render_state(struct si_context *sctx) +static void si_emit_cb_render_state(struct si_context *sctx, unsigned index) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; struct si_state_blend *blend = sctx->queued.named.blend; @@ -89,46 +72,53 @@ static void si_emit_cb_render_state(struct si_context *sctx) /* GFX9: Flush DFSM when CB_TARGET_MASK changes. * I think we don't have to do anything between IBs. */ - if (sctx->screen->dpbb_allowed && sctx->last_cb_target_mask != cb_target_mask) { + if (sctx->screen->dpbb_allowed && sctx->last_cb_target_mask != cb_target_mask && + sctx->screen->pbb_context_states_per_bin > 1) { sctx->last_cb_target_mask = cb_target_mask; radeon_begin(cs); - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); radeon_end(); } - radeon_begin(cs); - radeon_opt_set_context_reg(sctx, R_028238_CB_TARGET_MASK, SI_TRACKED_CB_TARGET_MASK, - cb_target_mask); + uint32_t cb_dcc_control = 0; - if (sctx->chip_class >= GFX8) { + if (sctx->gfx_level >= GFX8) { /* DCC MSAA workaround. * Alternatively, we can set CB_COLORi_DCC_CONTROL.OVERWRITE_- * COMBINER_DISABLE, but that would be more complicated. */ bool oc_disable = blend->dcc_msaa_corruption_4bit & cb_target_mask && sctx->framebuffer.nr_samples >= 2; - unsigned watermark = sctx->framebuffer.dcc_overwrite_combiner_watermark; - radeon_opt_set_context_reg( - sctx, R_028424_CB_DCC_CONTROL, SI_TRACKED_CB_DCC_CONTROL, - S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(sctx->chip_class <= GFX9) | - S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) | + if (sctx->gfx_level >= GFX11) { + cb_dcc_control = + S_028424_SAMPLE_MASK_TRACKER_DISABLE(oc_disable) | + S_028424_SAMPLE_MASK_TRACKER_WATERMARK(sctx->screen->info.has_dedicated_vram ? 
0 : 15); + } else { + cb_dcc_control = + S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(sctx->gfx_level <= GFX9) | + S_028424_OVERWRITE_COMBINER_WATERMARK(sctx->gfx_level >= GFX10 ? 6 : 4) | S_028424_OVERWRITE_COMBINER_DISABLE(oc_disable) | - S_028424_DISABLE_CONSTANT_ENCODE_REG(sctx->screen->info.has_dcc_constant_encode)); + S_028424_DISABLE_CONSTANT_ENCODE_REG(sctx->gfx_level < GFX11 && + sctx->screen->info.has_dcc_constant_encode); + } } + uint32_t sx_ps_downconvert = 0; + uint32_t sx_blend_opt_epsilon = 0; + uint32_t sx_blend_opt_control = 0; + /* RB+ register settings. */ if (sctx->screen->info.rbplus_allowed) { unsigned spi_shader_col_format = - sctx->shader.ps.cso ? sctx->shader.ps.current->key.part.ps.epilog.spi_shader_col_format + sctx->shader.ps.cso ? sctx->shader.ps.current->key.ps.part.epilog.spi_shader_col_format : 0; - unsigned sx_ps_downconvert = 0; - unsigned sx_blend_opt_epsilon = 0; - unsigned sx_blend_opt_control = 0; + unsigned num_cbufs = util_last_bit(sctx->framebuffer.colorbuf_enabled_4bit & + blend->cb_target_enabled_4bit) / 4; - for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { + for (i = 0; i < num_cbufs; i++) { struct si_surface *surf = (struct si_surface *)sctx->framebuffer.state.cbufs[i]; unsigned format, swap, spi_format, colormask; bool has_alpha, has_rgb; @@ -143,13 +133,15 @@ static void si_emit_cb_render_state(struct si_context *sctx) continue; } - format = G_028C70_FORMAT(surf->cb_color_info); + format = sctx->gfx_level >= GFX11 ? G_028C70_FORMAT_GFX11(surf->cb_color_info): + G_028C70_FORMAT_GFX6(surf->cb_color_info); swap = G_028C70_COMP_SWAP(surf->cb_color_info); spi_format = (spi_shader_col_format >> (i * 4)) & 0xf; colormask = (cb_target_mask >> (i * 4)) & 0xf; /* Set if RGB and A are present. */ - has_alpha = !G_028C74_FORCE_DST_ALPHA_1(surf->cb_color_attrib); + has_alpha = !(sctx->gfx_level >= GFX11 ? 
G_028C74_FORCE_DST_ALPHA_1_GFX11(surf->cb_color_attrib): + G_028C74_FORCE_DST_ALPHA_1_GFX6(surf->cb_color_attrib)); if (format == V_028C70_COLOR_8 || format == V_028C70_COLOR_16 || format == V_028C70_COLOR_32) @@ -184,28 +176,29 @@ static void si_emit_cb_render_state(struct si_context *sctx) spi_format == V_028714_SPI_SHADER_UINT16_ABGR || spi_format == V_028714_SPI_SHADER_SINT16_ABGR) { sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4); - sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT << (i * 4); + if (G_028C70_NUMBER_TYPE(surf->cb_color_info) != V_028C70_NUMBER_SRGB) + sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT_0_5 << (i * 4); } break; case V_028C70_COLOR_5_6_5: if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) { sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4); - sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT << (i * 4); + sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT_0_5 << (i * 4); } break; case V_028C70_COLOR_1_5_5_5: if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) { sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4); - sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT << (i * 4); + sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT_0_5 << (i * 4); } break; case V_028C70_COLOR_4_4_4_4: if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) { sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4); - sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT << (i * 4); + sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT_0_5 << (i * 4); } break; @@ -236,9 +229,10 @@ static void si_emit_cb_render_state(struct si_context *sctx) break; case V_028C70_COLOR_2_10_10_10: + case V_028C70_COLOR_10_10_10_2: if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) { sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4); - sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4); + sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT_0_5 << (i * 4); } break; @@ -254,12 +248,37 @@ static void si_emit_cb_render_state(struct si_context *sctx) */ if (!sx_ps_downconvert) 
sx_ps_downconvert = V_028754_SX_RT_EXPORT_32_R; + } - /* SX_PS_DOWNCONVERT, SX_BLEND_OPT_EPSILON, SX_BLEND_OPT_CONTROL */ - radeon_opt_set_context_reg3(sctx, R_028754_SX_PS_DOWNCONVERT, SI_TRACKED_SX_PS_DOWNCONVERT, - sx_ps_downconvert, sx_blend_opt_epsilon, sx_blend_opt_control); + if (sctx->screen->info.has_set_context_pairs_packed) { + radeon_begin(cs); + gfx11_begin_packed_context_regs(); + gfx11_opt_set_context_reg(R_028238_CB_TARGET_MASK, SI_TRACKED_CB_TARGET_MASK, + cb_target_mask); + gfx11_opt_set_context_reg(R_028424_CB_DCC_CONTROL, SI_TRACKED_CB_DCC_CONTROL, + cb_dcc_control); + gfx11_opt_set_context_reg(R_028754_SX_PS_DOWNCONVERT, SI_TRACKED_SX_PS_DOWNCONVERT, + sx_ps_downconvert); + gfx11_opt_set_context_reg(R_028758_SX_BLEND_OPT_EPSILON, SI_TRACKED_SX_BLEND_OPT_EPSILON, + sx_blend_opt_epsilon); + gfx11_opt_set_context_reg(R_02875C_SX_BLEND_OPT_CONTROL, SI_TRACKED_SX_BLEND_OPT_CONTROL, + sx_blend_opt_control); + gfx11_end_packed_context_regs(); + radeon_end(); /* don't track context rolls on GFX11 */ + } else { + radeon_begin(cs); + radeon_opt_set_context_reg(sctx, R_028238_CB_TARGET_MASK, SI_TRACKED_CB_TARGET_MASK, + cb_target_mask); + if (sctx->gfx_level >= GFX8) { + radeon_opt_set_context_reg(sctx, R_028424_CB_DCC_CONTROL, SI_TRACKED_CB_DCC_CONTROL, + cb_dcc_control); + } + if (sctx->screen->info.rbplus_allowed) { + radeon_opt_set_context_reg3(sctx, R_028754_SX_PS_DOWNCONVERT, SI_TRACKED_SX_PS_DOWNCONVERT, + sx_ps_downconvert, sx_blend_opt_epsilon, sx_blend_opt_control); + } + radeon_end_update_context_roll(sctx); } - radeon_end_update_context_roll(sctx); } /* @@ -287,7 +306,7 @@ static uint32_t si_translate_blend_function(int blend_func) return 0; } -static uint32_t si_translate_blend_factor(int blend_fact) +static uint32_t si_translate_blend_factor(enum amd_gfx_level gfx_level, int blend_fact) { switch (blend_fact) { case PIPE_BLENDFACTOR_ONE: @@ -303,9 +322,11 @@ static uint32_t si_translate_blend_factor(int blend_fact) case 
PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: return V_028780_BLEND_SRC_ALPHA_SATURATE; case PIPE_BLENDFACTOR_CONST_COLOR: - return V_028780_BLEND_CONSTANT_COLOR; + return gfx_level >= GFX11 ? V_028780_BLEND_CONSTANT_COLOR_GFX11: + V_028780_BLEND_CONSTANT_COLOR_GFX6; case PIPE_BLENDFACTOR_CONST_ALPHA: - return V_028780_BLEND_CONSTANT_ALPHA; + return gfx_level >= GFX11 ? V_028780_BLEND_CONSTANT_ALPHA_GFX11 : + V_028780_BLEND_CONSTANT_ALPHA_GFX6; case PIPE_BLENDFACTOR_ZERO: return V_028780_BLEND_ZERO; case PIPE_BLENDFACTOR_INV_SRC_COLOR: @@ -317,17 +338,23 @@ static uint32_t si_translate_blend_factor(int blend_fact) case PIPE_BLENDFACTOR_INV_DST_COLOR: return V_028780_BLEND_ONE_MINUS_DST_COLOR; case PIPE_BLENDFACTOR_INV_CONST_COLOR: - return V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR; + return gfx_level >= GFX11 ? V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR_GFX11: + V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR_GFX6; case PIPE_BLENDFACTOR_INV_CONST_ALPHA: - return V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA; + return gfx_level >= GFX11 ? V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA_GFX11: + V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA_GFX6; case PIPE_BLENDFACTOR_SRC1_COLOR: - return V_028780_BLEND_SRC1_COLOR; + return gfx_level >= GFX11 ? V_028780_BLEND_SRC1_COLOR_GFX11: + V_028780_BLEND_SRC1_COLOR_GFX6; case PIPE_BLENDFACTOR_SRC1_ALPHA: - return V_028780_BLEND_SRC1_ALPHA; + return gfx_level >= GFX11 ? V_028780_BLEND_SRC1_ALPHA_GFX11: + V_028780_BLEND_SRC1_ALPHA_GFX6; case PIPE_BLENDFACTOR_INV_SRC1_COLOR: - return V_028780_BLEND_INV_SRC1_COLOR; + return gfx_level >= GFX11 ? V_028780_BLEND_INV_SRC1_COLOR_GFX11: + V_028780_BLEND_INV_SRC1_COLOR_GFX6; case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: - return V_028780_BLEND_INV_SRC1_ALPHA; + return gfx_level >= GFX11 ? 
V_028780_BLEND_INV_SRC1_ALPHA_GFX11: + V_028780_BLEND_INV_SRC1_ALPHA_GFX6; default: PRINT_ERR("Bad blend factor %d not supported!\n", blend_fact); assert(0); @@ -394,20 +421,9 @@ static void si_blend_check_commutativity(struct si_screen *sscreen, struct si_st (1u << PIPE_BLENDFACTOR_INV_CONST_ALPHA) | (1u << PIPE_BLENDFACTOR_INV_SRC1_COLOR) | (1u << PIPE_BLENDFACTOR_INV_SRC1_ALPHA); - if (dst == PIPE_BLENDFACTOR_ONE && (src_allowed & (1u << src))) { - /* Addition is commutative, but floating point addition isn't - * associative: subtle changes can be introduced via different - * rounding. - * - * Out-of-order is also non-deterministic, which means that - * this breaks OpenGL invariance requirements. So only enable - * out-of-order additive blending if explicitly allowed by a - * setting. - */ - if (func == PIPE_BLEND_MAX || func == PIPE_BLEND_MIN || - (func == PIPE_BLEND_ADD && sscreen->commutative_blend_add)) - blend->commutative_4bit |= chanmask; - } + if (dst == PIPE_BLENDFACTOR_ONE && (src_allowed & (1u << src)) && + (func == PIPE_BLEND_MAX || func == PIPE_BLEND_MIN)) + blend->commutative_4bit |= chanmask; } /** @@ -442,6 +458,8 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx, if (!blend) return NULL; + si_pm4_clear_state(pm4, sctx->screen, false); + blend->alpha_to_coverage = state->alpha_to_coverage; blend->alpha_to_one = state->alpha_to_one; blend->dual_src_blend = util_blend_state_is_dual(state, 0); @@ -465,26 +483,26 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx, color_control |= S_028808_ROP3(0xcc); } + unsigned db_alpha_to_mask; if (state->alpha_to_coverage && state->alpha_to_coverage_dither) { - si_pm4_set_reg(pm4, R_028B70_DB_ALPHA_TO_MASK, - S_028B70_ALPHA_TO_MASK_ENABLE(state->alpha_to_coverage) | - S_028B70_ALPHA_TO_MASK_OFFSET0(3) | S_028B70_ALPHA_TO_MASK_OFFSET1(1) | - S_028B70_ALPHA_TO_MASK_OFFSET2(0) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) | - S_028B70_OFFSET_ROUND(1)); + db_alpha_to_mask = 
S_028B70_ALPHA_TO_MASK_ENABLE(state->alpha_to_coverage) | + S_028B70_ALPHA_TO_MASK_OFFSET0(3) | S_028B70_ALPHA_TO_MASK_OFFSET1(1) | + S_028B70_ALPHA_TO_MASK_OFFSET2(0) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) | + S_028B70_OFFSET_ROUND(1); } else { - si_pm4_set_reg(pm4, R_028B70_DB_ALPHA_TO_MASK, - S_028B70_ALPHA_TO_MASK_ENABLE(state->alpha_to_coverage) | - S_028B70_ALPHA_TO_MASK_OFFSET0(2) | S_028B70_ALPHA_TO_MASK_OFFSET1(2) | - S_028B70_ALPHA_TO_MASK_OFFSET2(2) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) | - S_028B70_OFFSET_ROUND(0)); + db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_ENABLE(state->alpha_to_coverage) | + S_028B70_ALPHA_TO_MASK_OFFSET0(2) | S_028B70_ALPHA_TO_MASK_OFFSET1(2) | + S_028B70_ALPHA_TO_MASK_OFFSET2(2) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) | + S_028B70_OFFSET_ROUND(0); } - if (state->alpha_to_coverage) - blend->need_src_alpha_4bit |= 0xf; + si_pm4_set_reg(pm4, R_028B70_DB_ALPHA_TO_MASK, db_alpha_to_mask); blend->cb_target_mask = 0; blend->cb_target_enabled_4bit = 0; + unsigned last_blend_cntl; + for (int i = 0; i < num_shader_outputs; i++) { /* state->rt entries > 0 only written if independent blending */ const int j = state->independent_blend_enable ? i : 0; @@ -504,9 +522,12 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx, /* Only set dual source blending for MRT0 to avoid a hang. */ if (i >= 1 && blend->dual_src_blend) { - /* Vulkan does this for dual source blending. 
*/ - if (i == 1) - blend_cntl |= S_028780_ENABLE(1); + if (i == 1) { + if (sctx->gfx_level >= GFX11) + blend_cntl = last_blend_cntl; + else + blend_cntl = S_028780_ENABLE(1); + } si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl); continue; @@ -572,23 +593,34 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx, S_028760_ALPHA_SRC_OPT(srcA_opt) | S_028760_ALPHA_DST_OPT(dstA_opt) | S_028760_ALPHA_COMB_FCN(si_translate_blend_opt_function(eqA)); + /* Alpha-to-coverage with blending enabled, depth writes enabled, and having no MRTZ export + * should disable SX blend optimizations. + * + * TODO: Add a piglit test for this. It should fail on gfx11 without this. + */ + if (sctx->gfx_level >= GFX11 && state->alpha_to_coverage && i == 0) { + sx_mrt_blend_opt[0] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_NONE) | + S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_NONE); + } + /* Set blend state. */ blend_cntl |= S_028780_ENABLE(1); blend_cntl |= S_028780_COLOR_COMB_FCN(si_translate_blend_function(eqRGB)); - blend_cntl |= S_028780_COLOR_SRCBLEND(si_translate_blend_factor(srcRGB)); - blend_cntl |= S_028780_COLOR_DESTBLEND(si_translate_blend_factor(dstRGB)); + blend_cntl |= S_028780_COLOR_SRCBLEND(si_translate_blend_factor(sctx->gfx_level, srcRGB)); + blend_cntl |= S_028780_COLOR_DESTBLEND(si_translate_blend_factor(sctx->gfx_level, dstRGB)); if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) { blend_cntl |= S_028780_SEPARATE_ALPHA_BLEND(1); blend_cntl |= S_028780_ALPHA_COMB_FCN(si_translate_blend_function(eqA)); - blend_cntl |= S_028780_ALPHA_SRCBLEND(si_translate_blend_factor(srcA)); - blend_cntl |= S_028780_ALPHA_DESTBLEND(si_translate_blend_factor(dstA)); + blend_cntl |= S_028780_ALPHA_SRCBLEND(si_translate_blend_factor(sctx->gfx_level, srcA)); + blend_cntl |= S_028780_ALPHA_DESTBLEND(si_translate_blend_factor(sctx->gfx_level, dstA)); } si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl); + last_blend_cntl = blend_cntl; 
blend->blend_enable_4bit |= 0xfu << (i * 4); - if (sctx->chip_class >= GFX8 && sctx->chip_class <= GFX10) + if (sctx->gfx_level >= GFX8 && sctx->gfx_level <= GFX10) blend->dcc_msaa_corruption_4bit |= 0xfu << (i * 4); /* This is only important for formats without alpha. */ @@ -599,7 +631,7 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx, blend->need_src_alpha_4bit |= 0xfu << (i * 4); } - if (sctx->chip_class >= GFX8 && sctx->chip_class <= GFX10 && logicop_enable) + if (sctx->gfx_level >= GFX8 && sctx->gfx_level <= GFX10 && logicop_enable) blend->dcc_msaa_corruption_4bit |= blend->cb_target_enabled_4bit; if (blend->cb_target_mask) { @@ -628,6 +660,7 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx, } si_pm4_set_reg(pm4, R_028808_CB_COLOR_CONTROL, color_control); + si_pm4_finalize(pm4); return blend; } @@ -636,19 +669,17 @@ static void *si_create_blend_state(struct pipe_context *ctx, const struct pipe_b return si_create_blend_state_mode(ctx, state, V_028808_CB_NORMAL); } -static void si_draw_blend_dst_sampler_noop(struct pipe_context *ctx, - const struct pipe_draw_info *info, - unsigned drawid_offset, - const struct pipe_draw_indirect_info *indirect, - const struct pipe_draw_start_count_bias *draws, - unsigned num_draws) { - struct si_context *sctx = (struct si_context *)ctx; - +static bool si_check_blend_dst_sampler_noop(struct si_context *sctx) +{ if (sctx->framebuffer.state.nr_cbufs == 1) { struct si_shader_selector *sel = sctx->shader.ps.cso; - bool free_nir; + if (unlikely(sel->info.writes_1_if_tex_is_1 == 0xff)) { - struct nir_shader *nir = si_get_nir_shader(sel, NULL, &free_nir); + /* Wait for the shader to be ready. */ + util_queue_fence_wait(&sel->ready); + assert(sel->nir_binary); + + struct nir_shader *nir = si_deserialize_shader(sel); /* Determine if this fragment shader always writes vec4(1) if a specific texture * is all 1s. 
@@ -663,8 +694,7 @@ static void si_draw_blend_dst_sampler_noop(struct pipe_context *ctx, sel->info.writes_1_if_tex_is_1 = 0; } - if (free_nir) - ralloc_free(nir); + ralloc_free(nir); } if (sel->info.writes_1_if_tex_is_1 && @@ -677,16 +707,44 @@ static void si_draw_blend_dst_sampler_noop(struct pipe_context *ctx, if (tex->is_depth && tex->depth_cleared_level_mask & BITFIELD_BIT(samp->views[unit]->u.tex.first_level) && tex->depth_clear_value[0] == 1) { - return; + return false; } /* TODO: handle color textures */ } } } + return true; +} + +static void si_draw_blend_dst_sampler_noop(struct pipe_context *ctx, + const struct pipe_draw_info *info, + unsigned drawid_offset, + const struct pipe_draw_indirect_info *indirect, + const struct pipe_draw_start_count_bias *draws, + unsigned num_draws) { + struct si_context *sctx = (struct si_context *)ctx; + + if (!si_check_blend_dst_sampler_noop(sctx)) + return; + sctx->real_draw_vbo(ctx, info, drawid_offset, indirect, draws, num_draws); } +static void si_draw_vstate_blend_dst_sampler_noop(struct pipe_context *ctx, + struct pipe_vertex_state *state, + uint32_t partial_velem_mask, + struct pipe_draw_vertex_state_info info, + const struct pipe_draw_start_count_bias *draws, + unsigned num_draws) { + struct si_context *sctx = (struct si_context *)ctx; + + if (!si_check_blend_dst_sampler_noop(sctx)) + return; + + sctx->real_draw_vertex_state(ctx, state, partial_velem_mask, info, draws, num_draws); +} + static void si_bind_blend_state(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; @@ -704,17 +762,23 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state) sctx->framebuffer.has_dcc_msaa)) si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); - if (old_blend->cb_target_mask != blend->cb_target_mask || + if ((sctx->screen->info.has_export_conflict_bug && + old_blend->blend_enable_4bit != blend->blend_enable_4bit) || + (sctx->occlusion_query_mode == 
SI_OCCLUSION_QUERY_MODE_PRECISE_BOOLEAN && + !!old_blend->cb_target_mask != !!blend->cb_target_enabled_4bit)) + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + + if (old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit || old_blend->alpha_to_coverage != blend->alpha_to_coverage || old_blend->alpha_to_one != blend->alpha_to_one || old_blend->dual_src_blend != blend->dual_src_blend || old_blend->blend_enable_4bit != blend->blend_enable_4bit || - old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit) { - si_ps_key_update_framebuffer_blend(sctx); - si_ps_key_update_blend_rasterizer(sctx); + old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit) + si_ps_key_update_framebuffer_blend_rasterizer(sctx); + + if (old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit || + old_blend->alpha_to_coverage != blend->alpha_to_coverage) si_update_ps_inputs_read_or_disabled(sctx); - sctx->do_update_shaders = true; - } if (sctx->screen->dpbb_allowed && (old_blend->alpha_to_coverage != blend->alpha_to_coverage || @@ -722,18 +786,28 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state) old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit)) si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); - if (sctx->screen->has_out_of_order_rast && + if (sctx->screen->info.has_out_of_order_rast && ((old_blend->blend_enable_4bit != blend->blend_enable_4bit || old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit || old_blend->commutative_4bit != blend->commutative_4bit || old_blend->logicop_enable != blend->logicop_enable))) si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + /* RB+ depth-only rendering. See the comment where we set rbplus_depth_only_opt for more + * information. 
+ */ + if (sctx->screen->info.rbplus_allowed && + !!old_blend->cb_target_mask != !!blend->cb_target_mask) { + sctx->framebuffer.dirty_cbufs |= BITFIELD_BIT(0); + si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer); + } + if (likely(!radeon_uses_secure_bos(sctx->ws))) { if (unlikely(blend->allows_noop_optimization)) { - si_install_draw_wrapper(sctx, si_draw_blend_dst_sampler_noop); + si_install_draw_wrapper(sctx, si_draw_blend_dst_sampler_noop, + si_draw_vstate_blend_dst_sampler_noop); } else { - si_install_draw_wrapper(sctx, NULL); + si_install_draw_wrapper(sctx, NULL, NULL); } } } @@ -758,13 +832,13 @@ static void si_set_blend_color(struct pipe_context *ctx, const struct pipe_blend si_mark_atom_dirty(sctx, &sctx->atoms.s.blend_color); } -static void si_emit_blend_color(struct si_context *sctx) +static void si_emit_blend_color(struct si_context *sctx, unsigned index) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; radeon_begin(cs); - radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4); - radeon_emit_array(cs, (uint32_t *)sctx->blend_color.color, 4); + radeon_set_context_reg_seq(R_028414_CB_BLEND_RED, 4); + radeon_emit_array((uint32_t *)sctx->blend_color.color, 4); radeon_end(); } @@ -792,28 +866,27 @@ static void si_set_clip_state(struct pipe_context *ctx, const struct pipe_clip_s si_set_internal_const_buffer(sctx, SI_VS_CONST_CLIP_PLANES, &cb); } -static void si_emit_clip_state(struct si_context *sctx) +static void si_emit_clip_state(struct si_context *sctx, unsigned index) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; radeon_begin(cs); - radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP_0_X, 6 * 4); - radeon_emit_array(cs, (uint32_t *)sctx->clip_state.ucp, 6 * 4); + radeon_set_context_reg_seq(R_0285BC_PA_CL_UCP_0_X, 6 * 4); + radeon_emit_array((uint32_t *)sctx->clip_state.ucp, 6 * 4); radeon_end(); } -static void si_emit_clip_regs(struct si_context *sctx) +static void si_emit_clip_regs(struct si_context *sctx, unsigned index) { struct si_shader *vs = 
si_get_vs(sctx)->current; struct si_shader_selector *vs_sel = vs->selector; struct si_shader_info *info = &vs_sel->info; struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - bool window_space = info->stage == MESA_SHADER_VERTEX ? + bool window_space = vs_sel->stage == MESA_SHADER_VERTEX ? info->base.vs.window_space_position : 0; - unsigned clipdist_mask = vs_sel->clipdist_mask; - unsigned ucp_mask = clipdist_mask ? 0 : rs->clip_plane_enable & SIX_BITS; - unsigned culldist_mask = vs_sel->culldist_mask; - unsigned vs_out_mask = (clipdist_mask & ~vs->key.opt.kill_clip_distances) | culldist_mask; + unsigned clipdist_mask = vs_sel->info.clipdist_mask; + unsigned ucp_mask = clipdist_mask ? 0 : rs->clip_plane_enable & SI_USER_CLIP_PLANE_MASK; + unsigned culldist_mask = vs_sel->info.culldist_mask; /* Clip distances on points have no effect, so need to be implemented * as cull distances. This applies for the clipvertex case as well. @@ -824,54 +897,31 @@ static void si_emit_clip_regs(struct si_context *sctx) clipdist_mask &= rs->clip_plane_enable; culldist_mask |= clipdist_mask; - unsigned pa_cl_cntl = S_02881C_VS_OUT_CCDIST0_VEC_ENA((vs_out_mask & 0x0F) != 0) | - S_02881C_VS_OUT_CCDIST1_VEC_ENA((vs_out_mask & 0xF0) != 0) | - S_02881C_BYPASS_VTX_RATE_COMBINER(sctx->chip_class >= GFX10_3 && + unsigned pa_cl_cntl = S_02881C_BYPASS_VTX_RATE_COMBINER(sctx->gfx_level >= GFX10_3 && !sctx->screen->options.vrs2x2) | - S_02881C_BYPASS_PRIM_RATE_COMBINER(sctx->chip_class >= GFX10_3) | + S_02881C_BYPASS_PRIM_RATE_COMBINER(sctx->gfx_level >= GFX10_3) | clipdist_mask | (culldist_mask << 8); - radeon_begin(&sctx->gfx_cs); - - if (sctx->chip_class >= GFX10) { - radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL, - SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, pa_cl_cntl, - ~SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK); + unsigned pa_cl_clip_cntl = rs->pa_cl_clip_cntl | ucp_mask | + S_028810_CLIP_DISABLE(window_space); + unsigned pa_cl_vs_out_cntl = pa_cl_cntl | 
vs->pa_cl_vs_out_cntl; + + if (sctx->screen->info.has_set_context_pairs_packed) { + radeon_begin(&sctx->gfx_cs); + gfx11_begin_packed_context_regs(); + gfx11_opt_set_context_reg(R_028810_PA_CL_CLIP_CNTL, SI_TRACKED_PA_CL_CLIP_CNTL, + pa_cl_clip_cntl); + gfx11_opt_set_context_reg(R_02881C_PA_CL_VS_OUT_CNTL, SI_TRACKED_PA_CL_VS_OUT_CNTL, + pa_cl_vs_out_cntl); + gfx11_end_packed_context_regs(); + radeon_end(); /* don't track context rolls on GFX11 */ } else { - radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL, SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, - vs_sel->pa_cl_vs_out_cntl | pa_cl_cntl); - } - radeon_opt_set_context_reg(sctx, R_028810_PA_CL_CLIP_CNTL, SI_TRACKED_PA_CL_CLIP_CNTL, - rs->pa_cl_clip_cntl | ucp_mask | S_028810_CLIP_DISABLE(window_space)); - radeon_end_update_context_roll(sctx); -} - -/* - * inferred state between framebuffer and rasterizer - */ -static void si_update_poly_offset_state(struct si_context *sctx) -{ - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - - if (!rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf) { - si_pm4_bind_state(sctx, poly_offset, NULL); - return; - } - - /* Use the user format, not db_render_format, so that the polygon - * offset behaves as expected by applications. 
- */ - switch (sctx->framebuffer.state.zsbuf->texture->format) { - case PIPE_FORMAT_Z16_UNORM: - si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[0]); - break; - default: /* 24-bit */ - si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[1]); - break; - case PIPE_FORMAT_Z32_FLOAT: - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[2]); - break; + radeon_begin(&sctx->gfx_cs); + radeon_opt_set_context_reg(sctx, R_028810_PA_CL_CLIP_CNTL, SI_TRACKED_PA_CL_CLIP_CNTL, + pa_cl_clip_cntl); + radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL, SI_TRACKED_PA_CL_VS_OUT_CNTL, + pa_cl_vs_out_cntl); + radeon_end_update_context_roll(sctx); } } @@ -898,9 +948,6 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast { struct si_screen *sscreen = ((struct si_context *)ctx)->screen; struct si_state_rasterizer *rs = CALLOC_STRUCT(si_state_rasterizer); - struct si_pm4_state *pm4 = &rs->pm4; - unsigned tmp, i; - float psize_min, psize_max; if (!rs) { return NULL; @@ -918,6 +965,7 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast rs->line_smooth = state->line_smooth; rs->line_width = state->line_width; rs->poly_smooth = state->poly_smooth; + rs->point_smooth = state->point_smooth; rs->uses_poly_offset = state->offset_point || state->offset_line || state->offset_tri; rs->clamp_fragment_color = state->clamp_fragment_color; rs->clamp_vertex_color = state->clamp_vertex_color; @@ -925,31 +973,40 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast rs->flatshade_first = state->flatshade_first; rs->sprite_coord_enable = state->sprite_coord_enable; rs->rasterizer_discard = state->rasterizer_discard; + rs->bottom_edge_rule = state->bottom_edge_rule; rs->polygon_mode_is_lines = (state->fill_front == PIPE_POLYGON_MODE_LINE && !(state->cull_face & PIPE_FACE_FRONT)) || (state->fill_back == PIPE_POLYGON_MODE_LINE && !(state->cull_face & 
PIPE_FACE_BACK)); rs->polygon_mode_is_points = (state->fill_front == PIPE_POLYGON_MODE_POINT && !(state->cull_face & PIPE_FACE_FRONT)) || (state->fill_back == PIPE_POLYGON_MODE_POINT && !(state->cull_face & PIPE_FACE_BACK)); - rs->pa_sc_line_stipple = state->line_stipple_enable - ? S_028A0C_LINE_PATTERN(state->line_stipple_pattern) | - S_028A0C_REPEAT_COUNT(state->line_stipple_factor) - : 0; + rs->pa_sc_line_stipple = state->line_stipple_enable ? + S_028A0C_LINE_PATTERN(state->line_stipple_pattern) | + S_028A0C_REPEAT_COUNT(state->line_stipple_factor) : 0; + /* TODO: implement line stippling with perpendicular end caps. */ + /* Line width > 2 is an internal recommendation. */ + rs->perpendicular_end_caps = state->multisample && + state->line_width > 2 && !state->line_stipple_enable; + rs->pa_cl_clip_cntl = S_028810_DX_CLIP_SPACE_DEF(state->clip_halfz) | S_028810_ZCLIP_NEAR_DISABLE(!state->depth_clip_near) | S_028810_ZCLIP_FAR_DISABLE(!state->depth_clip_far) | S_028810_DX_RASTERIZATION_KILL(state->rasterizer_discard) | S_028810_DX_LINEAR_ATTR_CLIP_ENA(1); + rs->ngg_cull_flags_tris = SI_NGG_CULL_TRIANGLES | + SI_NGG_CULL_CLIP_PLANE_ENABLE(state->clip_plane_enable); + rs->ngg_cull_flags_tris_y_inverted = rs->ngg_cull_flags_tris; + + rs->ngg_cull_flags_lines = SI_NGG_CULL_LINES | + (!rs->perpendicular_end_caps ? 
SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT : 0) | + SI_NGG_CULL_CLIP_PLANE_ENABLE(state->clip_plane_enable); + if (rs->rasterizer_discard) { - rs->ngg_cull_flags = SI_NGG_CULL_ENABLED | - SI_NGG_CULL_FRONT_FACE | - SI_NGG_CULL_BACK_FACE; - rs->ngg_cull_flags_y_inverted = rs->ngg_cull_flags; + rs->ngg_cull_flags_tris |= SI_NGG_CULL_FRONT_FACE | + SI_NGG_CULL_BACK_FACE; + rs->ngg_cull_flags_tris_y_inverted = rs->ngg_cull_flags_tris; } else { - rs->ngg_cull_flags = SI_NGG_CULL_ENABLED; - rs->ngg_cull_flags_y_inverted = rs->ngg_cull_flags; - bool cull_front, cull_back; if (!state->front_ccw) { @@ -961,28 +1018,37 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast } if (cull_front) { - rs->ngg_cull_flags |= SI_NGG_CULL_FRONT_FACE; - rs->ngg_cull_flags_y_inverted |= SI_NGG_CULL_BACK_FACE; + rs->ngg_cull_flags_tris |= SI_NGG_CULL_FRONT_FACE; + rs->ngg_cull_flags_tris_y_inverted |= SI_NGG_CULL_BACK_FACE; } if (cull_back) { - rs->ngg_cull_flags |= SI_NGG_CULL_BACK_FACE; - rs->ngg_cull_flags_y_inverted |= SI_NGG_CULL_FRONT_FACE; + rs->ngg_cull_flags_tris |= SI_NGG_CULL_BACK_FACE; + rs->ngg_cull_flags_tris_y_inverted |= SI_NGG_CULL_FRONT_FACE; } } - si_pm4_set_reg( - pm4, R_0286D4_SPI_INTERP_CONTROL_0, - S_0286D4_FLAT_SHADE_ENA(1) | S_0286D4_PNT_SPRITE_ENA(state->point_quad_rasterization) | - S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) | - S_0286D4_PNT_SPRITE_OVRD_Y(V_0286D4_SPI_PNT_SPRITE_SEL_T) | - S_0286D4_PNT_SPRITE_OVRD_Z(V_0286D4_SPI_PNT_SPRITE_SEL_0) | - S_0286D4_PNT_SPRITE_OVRD_W(V_0286D4_SPI_PNT_SPRITE_SEL_1) | - S_0286D4_PNT_SPRITE_TOP_1(state->sprite_coord_mode != PIPE_SPRITE_COORD_UPPER_LEFT)); + /* Force gl_FrontFacing to true or false if the other face is culled. 
*/ + if (util_bitcount(state->cull_face) == 1) { + if (state->cull_face & PIPE_FACE_FRONT) + rs->force_front_face_input = -1; + else + rs->force_front_face_input = 1; + } + + rs->spi_interp_control_0 = S_0286D4_FLAT_SHADE_ENA(1) | + S_0286D4_PNT_SPRITE_ENA(state->point_quad_rasterization) | + S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) | + S_0286D4_PNT_SPRITE_OVRD_Y(V_0286D4_SPI_PNT_SPRITE_SEL_T) | + S_0286D4_PNT_SPRITE_OVRD_Z(V_0286D4_SPI_PNT_SPRITE_SEL_0) | + S_0286D4_PNT_SPRITE_OVRD_W(V_0286D4_SPI_PNT_SPRITE_SEL_1) | + S_0286D4_PNT_SPRITE_TOP_1(state->sprite_coord_mode != + PIPE_SPRITE_COORD_UPPER_LEFT); /* point size 12.4 fixed point */ - tmp = (unsigned)(state->point_size * 8.0); - si_pm4_set_reg(pm4, R_028A00_PA_SU_POINT_SIZE, S_028A00_HEIGHT(tmp) | S_028A00_WIDTH(tmp)); + float psize_min, psize_max; + unsigned tmp = (unsigned)(state->point_size * 8.0); + rs->pa_su_point_size = S_028A00_HEIGHT(tmp) | S_028A00_WIDTH(tmp); if (state->point_size_per_vertex) { psize_min = util_get_min_point_size(state); @@ -995,82 +1061,178 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast rs->max_point_size = psize_max; /* Divide by two, because 0.5 = 1 pixel. 
*/ - si_pm4_set_reg(pm4, R_028A04_PA_SU_POINT_MINMAX, - S_028A04_MIN_SIZE(si_pack_float_12p4(psize_min / 2)) | - S_028A04_MAX_SIZE(si_pack_float_12p4(psize_max / 2))); - - si_pm4_set_reg(pm4, R_028A08_PA_SU_LINE_CNTL, - S_028A08_WIDTH(si_pack_float_12p4(state->line_width / 2))); - si_pm4_set_reg( - pm4, R_028A48_PA_SC_MODE_CNTL_0, - S_028A48_LINE_STIPPLE_ENABLE(state->line_stipple_enable) | - S_028A48_MSAA_ENABLE(state->multisample || state->poly_smooth || state->line_smooth) | - S_028A48_VPORT_SCISSOR_ENABLE(1) | - S_028A48_ALTERNATE_RBS_PER_TILE(sscreen->info.chip_class >= GFX9)); + rs->pa_su_point_minmax = S_028A04_MIN_SIZE(si_pack_float_12p4(psize_min / 2)) | + S_028A04_MAX_SIZE(si_pack_float_12p4(psize_max / 2)); + rs->pa_su_line_cntl = S_028A08_WIDTH(si_pack_float_12p4(state->line_width / 2)); + + rs->pa_sc_mode_cntl_0 = S_028A48_LINE_STIPPLE_ENABLE(state->line_stipple_enable) | + S_028A48_MSAA_ENABLE(state->multisample || state->poly_smooth || + state->line_smooth) | + S_028A48_VPORT_SCISSOR_ENABLE(1) | + S_028A48_ALTERNATE_RBS_PER_TILE(sscreen->info.gfx_level >= GFX9); bool polygon_mode_enabled = (state->fill_front != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_FRONT)) || (state->fill_back != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_BACK)); - si_pm4_set_reg(pm4, R_028814_PA_SU_SC_MODE_CNTL, - S_028814_PROVOKING_VTX_LAST(!state->flatshade_first) | - S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) | - S_028814_CULL_BACK((state->cull_face & PIPE_FACE_BACK) ? 
1 : 0) | - S_028814_FACE(!state->front_ccw) | - S_028814_POLY_OFFSET_FRONT_ENABLE(util_get_offset(state, state->fill_front)) | - S_028814_POLY_OFFSET_BACK_ENABLE(util_get_offset(state, state->fill_back)) | - S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_point || state->offset_line) | - S_028814_POLY_MODE(polygon_mode_enabled) | - S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(state->fill_front)) | - S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(state->fill_back)) | - /* this must be set if POLY_MODE or PERPENDICULAR_ENDCAP_ENA is set */ - S_028814_KEEP_TOGETHER_ENABLE(sscreen->info.chip_class >= GFX10 ? polygon_mode_enabled : 0)); - - if (!rs->uses_poly_offset) - return rs; - - rs->pm4_poly_offset = CALLOC(3, sizeof(struct si_pm4_state)); - if (!rs->pm4_poly_offset) { - FREE(rs); - return NULL; + rs->pa_su_sc_mode_cntl = S_028814_PROVOKING_VTX_LAST(!state->flatshade_first) | + S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) | + S_028814_CULL_BACK((state->cull_face & PIPE_FACE_BACK) ? 1 : 0) | + S_028814_FACE(!state->front_ccw) | + S_028814_POLY_OFFSET_FRONT_ENABLE(util_get_offset(state, state->fill_front)) | + S_028814_POLY_OFFSET_BACK_ENABLE(util_get_offset(state, state->fill_back)) | + S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_point || state->offset_line) | + S_028814_POLY_MODE(polygon_mode_enabled) | + S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(state->fill_front)) | + S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(state->fill_back)) | + /* this must be set if POLY_MODE or PERPENDICULAR_ENDCAP_ENA is set */ + S_028814_KEEP_TOGETHER_ENABLE(sscreen->info.gfx_level >= GFX10 ? + polygon_mode_enabled || + rs->perpendicular_end_caps : 0); + if (sscreen->info.gfx_level >= GFX10) { + rs->pa_cl_ngg_cntl = S_028838_INDEX_BUF_EDGE_FLAG_ENA(rs->polygon_mode_is_points || + rs->polygon_mode_is_lines) | + S_028838_VERTEX_REUSE_DEPTH(sscreen->info.gfx_level >= GFX10_3 ? 
30 : 0); + } + + if (state->bottom_edge_rule) { + /* OpenGL windows should set this. */ + rs->pa_sc_edgerule = S_028230_ER_TRI(0xA) | + S_028230_ER_POINT(0x5) | + S_028230_ER_RECT(0x9) | + S_028230_ER_LINE_LR(0x2A) | + S_028230_ER_LINE_RL(0x2A) | + S_028230_ER_LINE_TB(0xA) | + S_028230_ER_LINE_BT(0xA); + } else { + /* OpenGL FBOs and Direct3D should set this. */ + rs->pa_sc_edgerule = S_028230_ER_TRI(0xA) | + S_028230_ER_POINT(0x6) | + S_028230_ER_RECT(0xA) | + S_028230_ER_LINE_LR(0x19) | + S_028230_ER_LINE_RL(0x25) | + S_028230_ER_LINE_TB(0xA) | + S_028230_ER_LINE_BT(0xA); } - /* Precalculate polygon offset states for 16-bit, 24-bit, and 32-bit zbuffers. */ - for (i = 0; i < 3; i++) { - struct si_pm4_state *pm4 = &rs->pm4_poly_offset[i]; - float offset_units = state->offset_units; - float offset_scale = state->offset_scale * 16.0f; - uint32_t pa_su_poly_offset_db_fmt_cntl = 0; + if (rs->uses_poly_offset) { + /* Calculate polygon offset states for 16-bit, 24-bit, and 32-bit zbuffers. */ + rs->pa_su_poly_offset_clamp = fui(state->offset_clamp); + rs->pa_su_poly_offset_frontback_scale = fui(state->offset_scale * 16); if (!state->offset_units_unscaled) { - switch (i) { - case 0: /* 16-bit zbuffer */ - offset_units *= 4.0f; - pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-16); - break; - case 1: /* 24-bit zbuffer */ - offset_units *= 2.0f; - pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-24); - break; - case 2: /* 32-bit zbuffer */ - offset_units *= 1.0f; - pa_su_poly_offset_db_fmt_cntl = - S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-23) | S_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT(1); - break; - } + /* 16-bit zbuffer */ + rs->pa_su_poly_offset_db_fmt_cntl[0] = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-16); + rs->pa_su_poly_offset_frontback_offset[0] = fui(state->offset_units * 4); + + /* 24-bit zbuffer */ + rs->pa_su_poly_offset_db_fmt_cntl[1] = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-24); + rs->pa_su_poly_offset_frontback_offset[1] = 
fui(state->offset_units * 2); + + /* 32-bit zbuffer */ + rs->pa_su_poly_offset_db_fmt_cntl[2] = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-23) | + S_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT(1); + rs->pa_su_poly_offset_frontback_offset[2] = fui(state->offset_units); + } else { + rs->pa_su_poly_offset_frontback_offset[0] = fui(state->offset_units); + rs->pa_su_poly_offset_frontback_offset[1] = fui(state->offset_units); + rs->pa_su_poly_offset_frontback_offset[2] = fui(state->offset_units); } - - si_pm4_set_reg(pm4, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, pa_su_poly_offset_db_fmt_cntl); - si_pm4_set_reg(pm4, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, fui(state->offset_clamp)); - si_pm4_set_reg(pm4, R_028B80_PA_SU_POLY_OFFSET_FRONT_SCALE, fui(offset_scale)); - si_pm4_set_reg(pm4, R_028B84_PA_SU_POLY_OFFSET_FRONT_OFFSET, fui(offset_units)); - si_pm4_set_reg(pm4, R_028B88_PA_SU_POLY_OFFSET_BACK_SCALE, fui(offset_scale)); - si_pm4_set_reg(pm4, R_028B8C_PA_SU_POLY_OFFSET_BACK_OFFSET, fui(offset_units)); } return rs; } +static void si_pm4_emit_rasterizer(struct si_context *sctx, unsigned index) +{ + struct si_state_rasterizer *state = sctx->queued.named.rasterizer; + + if (sctx->screen->info.has_set_context_pairs_packed) { + radeon_begin(&sctx->gfx_cs); + gfx11_begin_packed_context_regs(); + gfx11_opt_set_context_reg(R_0286D4_SPI_INTERP_CONTROL_0, SI_TRACKED_SPI_INTERP_CONTROL_0, + state->spi_interp_control_0); + gfx11_opt_set_context_reg(R_028A00_PA_SU_POINT_SIZE, SI_TRACKED_PA_SU_POINT_SIZE, + state->pa_su_point_size); + gfx11_opt_set_context_reg(R_028A04_PA_SU_POINT_MINMAX, SI_TRACKED_PA_SU_POINT_MINMAX, + state->pa_su_point_minmax); + gfx11_opt_set_context_reg(R_028A08_PA_SU_LINE_CNTL, SI_TRACKED_PA_SU_LINE_CNTL, + state->pa_su_line_cntl); + gfx11_opt_set_context_reg(R_028A48_PA_SC_MODE_CNTL_0, SI_TRACKED_PA_SC_MODE_CNTL_0, + state->pa_sc_mode_cntl_0); + gfx11_opt_set_context_reg(R_028814_PA_SU_SC_MODE_CNTL, SI_TRACKED_PA_SU_SC_MODE_CNTL, + state->pa_su_sc_mode_cntl); + 
gfx11_opt_set_context_reg(R_028838_PA_CL_NGG_CNTL, SI_TRACKED_PA_CL_NGG_CNTL, + state->pa_cl_ngg_cntl); + gfx11_opt_set_context_reg(R_028230_PA_SC_EDGERULE, SI_TRACKED_PA_SC_EDGERULE, + state->pa_sc_edgerule); + + if (state->uses_poly_offset && sctx->framebuffer.state.zsbuf) { + unsigned db_format_index = + ((struct si_surface *)sctx->framebuffer.state.zsbuf)->db_format_index; + + gfx11_opt_set_context_reg(R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, + SI_TRACKED_PA_SU_POLY_OFFSET_DB_FMT_CNTL, + state->pa_su_poly_offset_db_fmt_cntl[db_format_index]); + gfx11_opt_set_context_reg(R_028B7C_PA_SU_POLY_OFFSET_CLAMP, + SI_TRACKED_PA_SU_POLY_OFFSET_CLAMP, + state->pa_su_poly_offset_clamp); + gfx11_opt_set_context_reg(R_028B80_PA_SU_POLY_OFFSET_FRONT_SCALE, + SI_TRACKED_PA_SU_POLY_OFFSET_FRONT_SCALE, + state->pa_su_poly_offset_frontback_scale); + gfx11_opt_set_context_reg(R_028B84_PA_SU_POLY_OFFSET_FRONT_OFFSET, + SI_TRACKED_PA_SU_POLY_OFFSET_FRONT_OFFSET, + state->pa_su_poly_offset_frontback_offset[db_format_index]); + gfx11_opt_set_context_reg(R_028B88_PA_SU_POLY_OFFSET_BACK_SCALE, + SI_TRACKED_PA_SU_POLY_OFFSET_BACK_SCALE, + state->pa_su_poly_offset_frontback_scale); + gfx11_opt_set_context_reg(R_028B8C_PA_SU_POLY_OFFSET_BACK_OFFSET, + SI_TRACKED_PA_SU_POLY_OFFSET_BACK_OFFSET, + state->pa_su_poly_offset_frontback_offset[db_format_index]); + } + gfx11_end_packed_context_regs(); + radeon_end(); /* don't track context rolls on GFX11 */ + } else { + radeon_begin(&sctx->gfx_cs); + radeon_opt_set_context_reg(sctx, R_0286D4_SPI_INTERP_CONTROL_0, + SI_TRACKED_SPI_INTERP_CONTROL_0, + state->spi_interp_control_0); + radeon_opt_set_context_reg(sctx, R_028A00_PA_SU_POINT_SIZE, SI_TRACKED_PA_SU_POINT_SIZE, + state->pa_su_point_size); + radeon_opt_set_context_reg(sctx, R_028A04_PA_SU_POINT_MINMAX, SI_TRACKED_PA_SU_POINT_MINMAX, + state->pa_su_point_minmax); + radeon_opt_set_context_reg(sctx, R_028A08_PA_SU_LINE_CNTL, SI_TRACKED_PA_SU_LINE_CNTL, + state->pa_su_line_cntl); + 
radeon_opt_set_context_reg(sctx, R_028A48_PA_SC_MODE_CNTL_0, SI_TRACKED_PA_SC_MODE_CNTL_0, + state->pa_sc_mode_cntl_0); + radeon_opt_set_context_reg(sctx, R_028814_PA_SU_SC_MODE_CNTL, + SI_TRACKED_PA_SU_SC_MODE_CNTL, state->pa_su_sc_mode_cntl); + if (sctx->gfx_level >= GFX10) { + radeon_opt_set_context_reg(sctx, R_028838_PA_CL_NGG_CNTL, SI_TRACKED_PA_CL_NGG_CNTL, + state->pa_cl_ngg_cntl); + } + radeon_opt_set_context_reg(sctx, R_028230_PA_SC_EDGERULE, SI_TRACKED_PA_SC_EDGERULE, + state->pa_sc_edgerule); + + if (state->uses_poly_offset && sctx->framebuffer.state.zsbuf) { + unsigned db_format_index = + ((struct si_surface *)sctx->framebuffer.state.zsbuf)->db_format_index; + + radeon_opt_set_context_reg6(R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, + SI_TRACKED_PA_SU_POLY_OFFSET_DB_FMT_CNTL, + state->pa_su_poly_offset_db_fmt_cntl[db_format_index], + state->pa_su_poly_offset_clamp, + state->pa_su_poly_offset_frontback_scale, + state->pa_su_poly_offset_frontback_offset[db_format_index], + state->pa_su_poly_offset_frontback_scale, + state->pa_su_poly_offset_frontback_offset[db_format_index]); + } + radeon_end_update_context_roll(); + } + + sctx->emitted.named.rasterizer = state; +} + static void si_bind_rs_state(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; @@ -1081,32 +1243,41 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state) rs = (struct si_state_rasterizer *)sctx->discard_rasterizer_state; if (old_rs->multisample_enable != rs->multisample_enable) { - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); /* Update the small primitive filter workaround if necessary. 
*/ - if (sctx->screen->info.has_msaa_sample_loc_bug && sctx->framebuffer.nr_samples > 1) - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); + if (sctx->screen->info.has_small_prim_filter_sample_loc_bug && sctx->framebuffer.nr_samples > 1) + si_mark_atom_dirty(sctx, &sctx->atoms.s.sample_locations); /* NGG cull state uses multisample_enable. */ if (sctx->screen->use_ngg_culling) si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state); } - sctx->current_vs_state &= C_VS_STATE_CLAMP_VERTEX_COLOR; - sctx->current_vs_state |= S_VS_STATE_CLAMP_VERTEX_COLOR(rs->clamp_vertex_color); + if (old_rs->perpendicular_end_caps != rs->perpendicular_end_caps) + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + + if (sctx->screen->use_ngg_culling && + (old_rs->half_pixel_center != rs->half_pixel_center || + old_rs->line_width != rs->line_width)) + si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state); + + SET_FIELD(sctx->current_vs_state, VS_STATE_CLAMP_VERTEX_COLOR, rs->clamp_vertex_color); si_pm4_bind_state(sctx, rasterizer, rs); - si_update_poly_offset_state(sctx); if (old_rs->scissor_enable != rs->scissor_enable) si_mark_atom_dirty(sctx, &sctx->atoms.s.scissors); - if (old_rs->line_width != rs->line_width || old_rs->max_point_size != rs->max_point_size || - old_rs->half_pixel_center != rs->half_pixel_center) + /* This never changes for OpenGL. 
*/ + if (old_rs->half_pixel_center != rs->half_pixel_center) si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband); + if (util_prim_is_lines(sctx->current_rast_prim)) + si_set_clip_discard_distance(sctx, rs->line_width); + else if (sctx->current_rast_prim == MESA_PRIM_POINTS) + si_set_clip_discard_distance(sctx, rs->max_point_size); + if (old_rs->clip_halfz != rs->clip_halfz) si_mark_atom_dirty(sctx, &sctx->atoms.s.viewports); @@ -1118,28 +1289,49 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state) old_rs->flatshade != rs->flatshade) si_mark_atom_dirty(sctx, &sctx->atoms.s.spi_map); - if (old_rs->clip_plane_enable != rs->clip_plane_enable || - old_rs->rasterizer_discard != rs->rasterizer_discard || - old_rs->sprite_coord_enable != rs->sprite_coord_enable || - old_rs->flatshade != rs->flatshade || old_rs->two_side != rs->two_side || - old_rs->multisample_enable != rs->multisample_enable || - old_rs->poly_stipple_enable != rs->poly_stipple_enable || - old_rs->poly_smooth != rs->poly_smooth || old_rs->line_smooth != rs->line_smooth || - old_rs->clamp_fragment_color != rs->clamp_fragment_color || - old_rs->force_persample_interp != rs->force_persample_interp || - old_rs->polygon_mode_is_points != rs->polygon_mode_is_points) { - si_ps_key_update_blend_rasterizer(sctx); + if (sctx->screen->dpbb_allowed && (old_rs->bottom_edge_rule != rs->bottom_edge_rule)) + si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); + + if (old_rs->multisample_enable != rs->multisample_enable) + si_ps_key_update_framebuffer_blend_rasterizer(sctx); + + if (old_rs->flatshade != rs->flatshade || + old_rs->clamp_fragment_color != rs->clamp_fragment_color) si_ps_key_update_rasterizer(sctx); + + if (old_rs->flatshade != rs->flatshade || + old_rs->force_persample_interp != rs->force_persample_interp || + old_rs->multisample_enable != rs->multisample_enable) si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx); + + if (old_rs->rasterizer_discard != rs->rasterizer_discard || + 
old_rs->two_side != rs->two_side || + old_rs->poly_stipple_enable != rs->poly_stipple_enable || + old_rs->point_smooth != rs->point_smooth) si_update_ps_inputs_read_or_disabled(sctx); + + if (old_rs->point_smooth != rs->point_smooth || + old_rs->line_smooth != rs->line_smooth || + old_rs->poly_smooth != rs->poly_smooth || + old_rs->polygon_mode_is_points != rs->polygon_mode_is_points || + old_rs->poly_stipple_enable != rs->poly_stipple_enable || + old_rs->two_side != rs->two_side || + old_rs->force_front_face_input != rs->force_front_face_input) + si_vs_ps_key_update_rast_prim_smooth_stipple(sctx); + + /* Used by si_get_vs_key_outputs in si_update_shaders: */ + if (old_rs->clip_plane_enable != rs->clip_plane_enable) sctx->do_update_shaders = true; - } if (old_rs->line_smooth != rs->line_smooth || old_rs->poly_smooth != rs->poly_smooth || + old_rs->point_smooth != rs->point_smooth || old_rs->poly_stipple_enable != rs->poly_stipple_enable || old_rs->flatshade != rs->flatshade) si_update_vrs_flat_shading(sctx); + + if (old_rs->flatshade_first != rs->flatshade_first) + si_update_ngg_sgpr_state_provoking_vtx(sctx, si_get_vs(sctx)->current, sctx->ngg); } static void si_delete_rs_state(struct pipe_context *ctx, void *state) @@ -1150,28 +1342,28 @@ static void si_delete_rs_state(struct pipe_context *ctx, void *state) if (sctx->queued.named.rasterizer == state) si_bind_rs_state(ctx, sctx->discard_rasterizer_state); - FREE(rs->pm4_poly_offset); si_pm4_free_state(sctx, &rs->pm4, SI_STATE_IDX(rasterizer)); } /* * inferred state between dsa and stencil ref */ -static void si_emit_stencil_ref(struct si_context *sctx) +static void si_emit_stencil_ref(struct si_context *sctx, unsigned index) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; struct pipe_stencil_ref *ref = &sctx->stencil_ref.state; struct si_dsa_stencil_ref_part *dsa = &sctx->stencil_ref.dsa_part; radeon_begin(cs); - radeon_set_context_reg_seq(cs, R_028430_DB_STENCILREFMASK, 2); - radeon_emit(cs, 
S_028430_STENCILTESTVAL(ref->ref_value[0]) | - S_028430_STENCILMASK(dsa->valuemask[0]) | - S_028430_STENCILWRITEMASK(dsa->writemask[0]) | S_028430_STENCILOPVAL(1)); - radeon_emit(cs, S_028434_STENCILTESTVAL_BF(ref->ref_value[1]) | - S_028434_STENCILMASK_BF(dsa->valuemask[1]) | - S_028434_STENCILWRITEMASK_BF(dsa->writemask[1]) | - S_028434_STENCILOPVAL_BF(1)); + radeon_set_context_reg_seq(R_028430_DB_STENCILREFMASK, 2); + radeon_emit(S_028430_STENCILTESTVAL(ref->ref_value[0]) | + S_028430_STENCILMASK(dsa->valuemask[0]) | + S_028430_STENCILWRITEMASK(dsa->writemask[0]) | + S_028430_STENCILOPVAL(1)); + radeon_emit(S_028434_STENCILTESTVAL_BF(ref->ref_value[1]) | + S_028434_STENCILMASK_BF(dsa->valuemask[1]) | + S_028434_STENCILWRITEMASK_BF(dsa->writemask[1]) | + S_028434_STENCILOPVAL_BF(1)); radeon_end(); } @@ -1240,12 +1432,7 @@ static bool si_order_invariant_stencil_state(const struct pipe_stencil_state *st static void *si_create_dsa_state(struct pipe_context *ctx, const struct pipe_depth_stencil_alpha_state *state) { - struct si_context *sctx = (struct si_context *)ctx; struct si_state_dsa *dsa = CALLOC_STRUCT(si_state_dsa); - struct si_pm4_state *pm4 = &dsa->pm4; - unsigned db_depth_control; - uint32_t db_stencil_control = 0; - if (!dsa) { return NULL; } @@ -1255,57 +1442,51 @@ static void *si_create_dsa_state(struct pipe_context *ctx, dsa->stencil_ref.writemask[0] = state->stencil[0].writemask; dsa->stencil_ref.writemask[1] = state->stencil[1].writemask; - db_depth_control = + dsa->db_depth_control = S_028800_Z_ENABLE(state->depth_enabled) | S_028800_Z_WRITE_ENABLE(state->depth_writemask) | S_028800_ZFUNC(state->depth_func) | S_028800_DEPTH_BOUNDS_ENABLE(state->depth_bounds_test); /* stencil */ if (state->stencil[0].enabled) { - db_depth_control |= S_028800_STENCIL_ENABLE(1); - db_depth_control |= S_028800_STENCILFUNC(state->stencil[0].func); - db_stencil_control |= + dsa->db_depth_control |= S_028800_STENCIL_ENABLE(1); + dsa->db_depth_control |= 
S_028800_STENCILFUNC(state->stencil[0].func); + dsa->db_stencil_control |= S_02842C_STENCILFAIL(si_translate_stencil_op(state->stencil[0].fail_op)); - db_stencil_control |= + dsa->db_stencil_control |= S_02842C_STENCILZPASS(si_translate_stencil_op(state->stencil[0].zpass_op)); - db_stencil_control |= + dsa->db_stencil_control |= S_02842C_STENCILZFAIL(si_translate_stencil_op(state->stencil[0].zfail_op)); if (state->stencil[1].enabled) { - db_depth_control |= S_028800_BACKFACE_ENABLE(1); - db_depth_control |= S_028800_STENCILFUNC_BF(state->stencil[1].func); - db_stencil_control |= + dsa->db_depth_control |= S_028800_BACKFACE_ENABLE(1); + dsa->db_depth_control |= S_028800_STENCILFUNC_BF(state->stencil[1].func); + dsa->db_stencil_control |= S_02842C_STENCILFAIL_BF(si_translate_stencil_op(state->stencil[1].fail_op)); - db_stencil_control |= + dsa->db_stencil_control |= S_02842C_STENCILZPASS_BF(si_translate_stencil_op(state->stencil[1].zpass_op)); - db_stencil_control |= + dsa->db_stencil_control |= S_02842C_STENCILZFAIL_BF(si_translate_stencil_op(state->stencil[1].zfail_op)); } } + dsa->db_depth_bounds_min = fui(state->depth_bounds_min); + dsa->db_depth_bounds_max = fui(state->depth_bounds_max); + /* alpha */ if (state->alpha_enabled) { dsa->alpha_func = state->alpha_func; - - si_pm4_set_reg(pm4, R_00B030_SPI_SHADER_USER_DATA_PS_0 + SI_SGPR_ALPHA_REF * 4, - fui(state->alpha_ref_value)); + dsa->spi_shader_user_data_ps_alpha_ref = fui(state->alpha_ref_value); } else { dsa->alpha_func = PIPE_FUNC_ALWAYS; } - si_pm4_set_reg(pm4, R_028800_DB_DEPTH_CONTROL, db_depth_control); - if (state->stencil[0].enabled) - si_pm4_set_reg(pm4, R_02842C_DB_STENCIL_CONTROL, db_stencil_control); - if (state->depth_bounds_test) { - si_pm4_set_reg(pm4, R_028020_DB_DEPTH_BOUNDS_MIN, fui(state->depth_bounds_min)); - si_pm4_set_reg(pm4, R_028024_DB_DEPTH_BOUNDS_MAX, fui(state->depth_bounds_max)); - } - dsa->depth_enabled = state->depth_enabled; dsa->depth_write_enabled = state->depth_enabled && 
state->depth_writemask; dsa->stencil_enabled = state->stencil[0].enabled; dsa->stencil_write_enabled = (util_writes_stencil(&state->stencil[0]) || util_writes_stencil(&state->stencil[1])); dsa->db_can_write = dsa->depth_write_enabled || dsa->stencil_write_enabled; + dsa->depth_bounds_enabled = state->depth_bounds_test; bool zfunc_is_ordered = state->depth_func == PIPE_FUNC_NEVER || state->depth_func == PIPE_FUNC_LESS || @@ -1329,15 +1510,71 @@ static void *si_create_dsa_state(struct pipe_context *ctx, !dsa->depth_write_enabled || (state->depth_func == PIPE_FUNC_ALWAYS || state->depth_func == PIPE_FUNC_NEVER); - dsa->order_invariance[1].pass_last = sctx->screen->assume_no_z_fights && - !dsa->stencil_write_enabled && dsa->depth_write_enabled && - zfunc_is_ordered; - dsa->order_invariance[0].pass_last = - sctx->screen->assume_no_z_fights && dsa->depth_write_enabled && zfunc_is_ordered; - return dsa; } +static void si_pm4_emit_dsa(struct si_context *sctx, unsigned index) +{ + struct si_state_dsa *state = sctx->queued.named.dsa; + assert(state && state != sctx->emitted.named.dsa); + + if (sctx->screen->info.has_set_context_pairs_packed) { + radeon_begin(&sctx->gfx_cs); + gfx11_begin_packed_context_regs(); + gfx11_opt_set_context_reg(R_028800_DB_DEPTH_CONTROL, SI_TRACKED_DB_DEPTH_CONTROL, + state->db_depth_control); + if (state->stencil_enabled) { + gfx11_opt_set_context_reg(R_02842C_DB_STENCIL_CONTROL, SI_TRACKED_DB_STENCIL_CONTROL, + state->db_stencil_control); + } + if (state->depth_bounds_enabled) { + gfx11_opt_set_context_reg(R_028020_DB_DEPTH_BOUNDS_MIN, SI_TRACKED_DB_DEPTH_BOUNDS_MIN, + state->db_depth_bounds_min); + gfx11_opt_set_context_reg(R_028024_DB_DEPTH_BOUNDS_MAX, SI_TRACKED_DB_DEPTH_BOUNDS_MAX, + state->db_depth_bounds_max); + } + gfx11_end_packed_context_regs(); + + if (state->alpha_func != PIPE_FUNC_ALWAYS) { + if (sctx->screen->info.has_set_sh_pairs_packed) { + gfx11_opt_push_gfx_sh_reg(R_00B030_SPI_SHADER_USER_DATA_PS_0 + SI_SGPR_ALPHA_REF * 4, + 
SI_TRACKED_SPI_SHADER_USER_DATA_PS__ALPHA_REF, + state->spi_shader_user_data_ps_alpha_ref); + } else { + radeon_opt_set_sh_reg(sctx, R_00B030_SPI_SHADER_USER_DATA_PS_0 + SI_SGPR_ALPHA_REF * 4, + SI_TRACKED_SPI_SHADER_USER_DATA_PS__ALPHA_REF, + state->spi_shader_user_data_ps_alpha_ref); + } + } + radeon_end(); /* don't track context rolls on GFX11 */ + } else { + radeon_begin(&sctx->gfx_cs); + radeon_opt_set_context_reg(sctx, R_028800_DB_DEPTH_CONTROL, SI_TRACKED_DB_DEPTH_CONTROL, + state->db_depth_control); + if (state->stencil_enabled) { + radeon_opt_set_context_reg(sctx, R_02842C_DB_STENCIL_CONTROL, SI_TRACKED_DB_STENCIL_CONTROL, + state->db_stencil_control); + } + if (state->depth_bounds_enabled) { + radeon_opt_set_context_reg2(sctx, R_028020_DB_DEPTH_BOUNDS_MIN, + SI_TRACKED_DB_DEPTH_BOUNDS_MIN, + state->db_depth_bounds_min, + state->db_depth_bounds_max); + } + radeon_end_update_context_roll(); + + if (state->alpha_func != PIPE_FUNC_ALWAYS) { + radeon_begin(&sctx->gfx_cs); + radeon_opt_set_sh_reg(sctx, R_00B030_SPI_SHADER_USER_DATA_PS_0 + SI_SGPR_ALPHA_REF * 4, + SI_TRACKED_SPI_SHADER_USER_DATA_PS__ALPHA_REF, + state->spi_shader_user_data_ps_alpha_ref); + radeon_end(); + } + } + + sctx->emitted.named.dsa = state; +} + static void si_bind_dsa_state(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; @@ -1358,16 +1595,20 @@ static void si_bind_dsa_state(struct pipe_context *ctx, void *state) if (old_dsa->alpha_func != dsa->alpha_func) { si_ps_key_update_dsa(sctx); si_update_ps_inputs_read_or_disabled(sctx); - si_update_ps_kill_enable(sctx); sctx->do_update_shaders = true; } + if (sctx->occlusion_query_mode == SI_OCCLUSION_QUERY_MODE_PRECISE_BOOLEAN && + (old_dsa->depth_enabled != dsa->depth_enabled || + old_dsa->depth_write_enabled != dsa->depth_write_enabled)) + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + if (sctx->screen->dpbb_allowed && ((old_dsa->depth_enabled != dsa->depth_enabled || 
old_dsa->stencil_enabled != dsa->stencil_enabled || old_dsa->db_can_write != dsa->db_can_write))) si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); - if (sctx->screen->has_out_of_order_rast && + if (sctx->screen->info.has_out_of_order_rast && (memcmp(old_dsa->order_invariance, dsa->order_invariance, sizeof(old_dsa->order_invariance)))) si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); @@ -1398,11 +1639,18 @@ static void si_set_active_query_state(struct pipe_context *ctx, bool enable) /* Pipeline stat & streamout queries. */ if (enable) { - sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS; - sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS; + /* Disable pipeline stats if there are no active queries. */ + if (sctx->num_hw_pipestat_streamout_queries) { + sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS; + sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS; + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); + } } else { - sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS; - sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS; + if (sctx->num_hw_pipestat_streamout_queries) { + sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS; + sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS; + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); + } } /* Occlusion queries. 
*/ @@ -1412,16 +1660,6 @@ static void si_set_active_query_state(struct pipe_context *ctx, bool enable) } } -void si_set_occlusion_query_state(struct si_context *sctx, bool old_perfect_enable) -{ - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - - bool perfect_enable = sctx->num_perfect_occlusion_queries != 0; - - if (perfect_enable != old_perfect_enable) - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); -} - void si_save_qbo_state(struct si_context *sctx, struct si_qbo_state *st) { si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &st->saved_const0); @@ -1432,198 +1670,168 @@ void si_restore_qbo_state(struct si_context *sctx, struct si_qbo_state *st) sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, true, &st->saved_const0); } -static void si_emit_db_render_state(struct si_context *sctx) +static void si_emit_db_render_state(struct si_context *sctx, unsigned index) { - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - unsigned db_shader_control, db_render_control, db_count_control; + unsigned db_shader_control = 0, db_render_control = 0, db_count_control = 0, vrs_override_cntl = 0; /* DB_RENDER_CONTROL */ if (sctx->dbcb_depth_copy_enabled || sctx->dbcb_stencil_copy_enabled) { - db_render_control = S_028000_DEPTH_COPY(sctx->dbcb_depth_copy_enabled) | - S_028000_STENCIL_COPY(sctx->dbcb_stencil_copy_enabled) | - S_028000_COPY_CENTROID(1) | S_028000_COPY_SAMPLE(sctx->dbcb_copy_sample); + assert(sctx->gfx_level < GFX11); + db_render_control |= S_028000_DEPTH_COPY(sctx->dbcb_depth_copy_enabled) | + S_028000_STENCIL_COPY(sctx->dbcb_stencil_copy_enabled) | + S_028000_COPY_CENTROID(1) | S_028000_COPY_SAMPLE(sctx->dbcb_copy_sample); } else if (sctx->db_flush_depth_inplace || sctx->db_flush_stencil_inplace) { - db_render_control = S_028000_DEPTH_COMPRESS_DISABLE(sctx->db_flush_depth_inplace) | - S_028000_STENCIL_COMPRESS_DISABLE(sctx->db_flush_stencil_inplace); + db_render_control |= 
S_028000_DEPTH_COMPRESS_DISABLE(sctx->db_flush_depth_inplace) | + S_028000_STENCIL_COMPRESS_DISABLE(sctx->db_flush_stencil_inplace); } else { - db_render_control = S_028000_DEPTH_CLEAR_ENABLE(sctx->db_depth_clear) | - S_028000_STENCIL_CLEAR_ENABLE(sctx->db_stencil_clear); + db_render_control |= S_028000_DEPTH_CLEAR_ENABLE(sctx->db_depth_clear) | + S_028000_STENCIL_CLEAR_ENABLE(sctx->db_stencil_clear); } - /* DB_COUNT_CONTROL (occlusion queries) */ - if (sctx->num_occlusion_queries > 0 && !sctx->occlusion_queries_disabled) { - bool perfect = sctx->num_perfect_occlusion_queries > 0; - bool gfx10_perfect = sctx->chip_class >= GFX10 && perfect; - - if (sctx->chip_class >= GFX7) { - unsigned log_sample_rate = sctx->framebuffer.log_samples; + if (sctx->gfx_level >= GFX11) { + unsigned max_allowed_tiles_in_wave; - db_count_control = S_028004_PERFECT_ZPASS_COUNTS(perfect) | - S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) | - S_028004_SAMPLE_RATE(log_sample_rate) | S_028004_ZPASS_ENABLE(1) | - S_028004_SLICE_EVEN_ENABLE(1) | S_028004_SLICE_ODD_ENABLE(1); - } else { - db_count_control = S_028004_PERFECT_ZPASS_COUNTS(perfect) | - S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples); - } - } else { - /* Disable occlusion queries. 
*/ - if (sctx->chip_class >= GFX7) { - db_count_control = 0; + if (sctx->screen->info.has_dedicated_vram) { + if (sctx->framebuffer.nr_samples == 8) + max_allowed_tiles_in_wave = 6; + else if (sctx->framebuffer.nr_samples == 4) + max_allowed_tiles_in_wave = 13; + else + max_allowed_tiles_in_wave = 0; } else { - db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1); + if (sctx->framebuffer.nr_samples == 8) + max_allowed_tiles_in_wave = 7; + else if (sctx->framebuffer.nr_samples == 4) + max_allowed_tiles_in_wave = 15; + else + max_allowed_tiles_in_wave = 0; } + + db_render_control |= S_028000_MAX_ALLOWED_TILES_IN_WAVE(max_allowed_tiles_in_wave); } - radeon_begin(&sctx->gfx_cs); - radeon_opt_set_context_reg2(sctx, R_028000_DB_RENDER_CONTROL, SI_TRACKED_DB_RENDER_CONTROL, - db_render_control, db_count_control); + /* DB_COUNT_CONTROL (occlusion queries) */ + if (sctx->occlusion_query_mode == SI_OCCLUSION_QUERY_MODE_DISABLE || + sctx->occlusion_queries_disabled) { + /* Occlusion queries disabled. */ + if (sctx->gfx_level >= GFX7) + db_count_control |= S_028004_ZPASS_ENABLE(0); + else + db_count_control |= S_028004_ZPASS_INCREMENT_DISABLE(1); + } else { + /* Occlusion queries enabled. */ + db_count_control |= S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples); - /* DB_RENDER_OVERRIDE2 */ - radeon_opt_set_context_reg( - sctx, R_028010_DB_RENDER_OVERRIDE2, SI_TRACKED_DB_RENDER_OVERRIDE2, - S_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATION(sctx->db_depth_disable_expclear) | - S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(sctx->db_stencil_disable_expclear) | - S_028010_DECOMPRESS_Z_ON_FLUSH(sctx->framebuffer.nr_samples >= 4) | - S_028010_CENTROID_COMPUTATION_MODE(sctx->chip_class >= GFX10_3 ? 
1 : 0)); + if (sctx->gfx_level >= GFX7) { + db_count_control |= S_028004_ZPASS_ENABLE(1) | + S_028004_SLICE_EVEN_ENABLE(1) | + S_028004_SLICE_ODD_ENABLE(1); + } - db_shader_control = sctx->ps_db_shader_control; + if (sctx->occlusion_query_mode == SI_OCCLUSION_QUERY_MODE_PRECISE_INTEGER || + /* Boolean occlusion queries must set PERFECT_ZPASS_COUNTS for depth-only rendering + * without depth writes or when depth testing is disabled. */ + (sctx->occlusion_query_mode == SI_OCCLUSION_QUERY_MODE_PRECISE_BOOLEAN && + (!sctx->queued.named.dsa->depth_enabled || + (!sctx->queued.named.blend->cb_target_mask && + !sctx->queued.named.dsa->depth_write_enabled)))) + db_count_control |= S_028004_PERFECT_ZPASS_COUNTS(1); - /* Bug workaround for smoothing (overrasterization) on GFX6. */ - if (sctx->chip_class == GFX6 && sctx->smoothing_enabled) { - db_shader_control &= C_02880C_Z_ORDER; - db_shader_control |= S_02880C_Z_ORDER(V_02880C_LATE_Z); + if (sctx->gfx_level >= GFX10 && + sctx->occlusion_query_mode != SI_OCCLUSION_QUERY_MODE_CONSERVATIVE_BOOLEAN) + db_count_control |= S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(1); } - /* Disable the gl_SampleMask fragment shader output if MSAA is disabled. */ - if (!rs->multisample_enable) - db_shader_control &= C_02880C_MASK_EXPORT_ENABLE; + /* This should always be set on GFX11. 
*/ + if (sctx->gfx_level >= GFX11) + db_count_control |= S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(1); + + db_shader_control |= sctx->ps_db_shader_control; - if (sctx->screen->info.has_rbplus && !sctx->screen->info.rbplus_allowed) - db_shader_control |= S_02880C_DUAL_QUAD_DISABLE(1); + if (sctx->screen->info.has_export_conflict_bug && + sctx->queued.named.blend->blend_enable_4bit && + si_get_num_coverage_samples(sctx) == 1) { + db_shader_control |= S_02880C_OVERRIDE_INTRINSIC_RATE_ENABLE(1) | + S_02880C_OVERRIDE_INTRINSIC_RATE(2); + } - radeon_opt_set_context_reg(sctx, R_02880C_DB_SHADER_CONTROL, SI_TRACKED_DB_SHADER_CONTROL, - db_shader_control); + if (sctx->gfx_level >= GFX10_3) { + /* Variable rate shading. */ + unsigned mode, log_rate_x, log_rate_y; - if (sctx->chip_class >= GFX10_3) { if (sctx->allow_flat_shading) { - radeon_opt_set_context_reg(sctx, R_028064_DB_VRS_OVERRIDE_CNTL, - SI_TRACKED_DB_VRS_OVERRIDE_CNTL, - S_028064_VRS_OVERRIDE_RATE_COMBINER_MODE( - V_028064_VRS_COMB_MODE_OVERRIDE) | - S_028064_VRS_OVERRIDE_RATE_X(1) | - S_028064_VRS_OVERRIDE_RATE_Y(1)); + mode = V_028064_SC_VRS_COMB_MODE_OVERRIDE; + log_rate_x = log_rate_y = 1; /* 2x2 VRS (log2(2) == 1) */ } else { - /* If the shader is using discard, turn off coarse shading because - * discard at 2x2 pixel granularity degrades quality too much. + /* If the shader is using discard, turn off coarse shading because discarding at 2x2 pixel + * granularity degrades quality too much. * - * MIN allows sample shading but not coarse shading. + * The shader writes the VRS rate and we either pass it through or do MIN(shader, 1x1) + * to disable coarse shading. */ - unsigned mode = sctx->screen->options.vrs2x2 && G_02880C_KILL_ENABLE(db_shader_control) ? - V_028064_VRS_COMB_MODE_MIN : V_028064_VRS_COMB_MODE_PASSTHRU; + mode = sctx->screen->options.vrs2x2 && G_02880C_KILL_ENABLE(db_shader_control) ? 
+ V_028064_SC_VRS_COMB_MODE_MIN : V_028064_SC_VRS_COMB_MODE_PASSTHRU; + log_rate_x = log_rate_y = 0; /* 1x1 VRS (log2(1) == 0) */ + } + if (sctx->gfx_level >= GFX11) { + vrs_override_cntl = S_0283D0_VRS_OVERRIDE_RATE_COMBINER_MODE(mode) | + S_0283D0_VRS_RATE(log_rate_x * 4 + log_rate_y); + } else { + vrs_override_cntl = S_028064_VRS_OVERRIDE_RATE_COMBINER_MODE(mode) | + S_028064_VRS_OVERRIDE_RATE_X(log_rate_x) | + S_028064_VRS_OVERRIDE_RATE_Y(log_rate_y); + } + } + + unsigned db_render_override2 = + S_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATION(sctx->db_depth_disable_expclear) | + S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(sctx->db_stencil_disable_expclear) | + S_028010_DECOMPRESS_Z_ON_FLUSH(sctx->framebuffer.nr_samples >= 4) | + S_028010_CENTROID_COMPUTATION_MODE(sctx->gfx_level >= GFX10_3 ? 1 : 0); + + if (sctx->screen->info.has_set_context_pairs_packed) { + radeon_begin(&sctx->gfx_cs); + gfx11_begin_packed_context_regs(); + gfx11_opt_set_context_reg(R_028000_DB_RENDER_CONTROL, SI_TRACKED_DB_RENDER_CONTROL, + db_render_control); + gfx11_opt_set_context_reg(R_028004_DB_COUNT_CONTROL, SI_TRACKED_DB_COUNT_CONTROL, + db_count_control); + gfx11_opt_set_context_reg(R_028010_DB_RENDER_OVERRIDE2, SI_TRACKED_DB_RENDER_OVERRIDE2, + db_render_override2); + gfx11_opt_set_context_reg(R_02880C_DB_SHADER_CONTROL, SI_TRACKED_DB_SHADER_CONTROL, + db_shader_control); + gfx11_opt_set_context_reg(R_0283D0_PA_SC_VRS_OVERRIDE_CNTL, + SI_TRACKED_DB_PA_SC_VRS_OVERRIDE_CNTL, vrs_override_cntl); + gfx11_end_packed_context_regs(); + radeon_end(); /* don't track context rolls on GFX11 */ + } else { + radeon_begin(&sctx->gfx_cs); + radeon_opt_set_context_reg2(sctx, R_028000_DB_RENDER_CONTROL, SI_TRACKED_DB_RENDER_CONTROL, + db_render_control, db_count_control); + radeon_opt_set_context_reg(sctx, R_028010_DB_RENDER_OVERRIDE2, + SI_TRACKED_DB_RENDER_OVERRIDE2, db_render_override2); + radeon_opt_set_context_reg(sctx, R_02880C_DB_SHADER_CONTROL, SI_TRACKED_DB_SHADER_CONTROL, + 
db_shader_control); + + if (sctx->gfx_level >= GFX11) { + radeon_opt_set_context_reg(sctx, R_0283D0_PA_SC_VRS_OVERRIDE_CNTL, + SI_TRACKED_DB_PA_SC_VRS_OVERRIDE_CNTL, vrs_override_cntl); + } else if (sctx->gfx_level >= GFX10_3) { radeon_opt_set_context_reg(sctx, R_028064_DB_VRS_OVERRIDE_CNTL, - SI_TRACKED_DB_VRS_OVERRIDE_CNTL, - S_028064_VRS_OVERRIDE_RATE_COMBINER_MODE(mode) | - S_028064_VRS_OVERRIDE_RATE_X(0) | - S_028064_VRS_OVERRIDE_RATE_Y(0)); + SI_TRACKED_DB_PA_SC_VRS_OVERRIDE_CNTL, vrs_override_cntl); } + radeon_end_update_context_roll(sctx); } - radeon_end_update_context_roll(sctx); } /* * format translation */ -static uint32_t si_translate_colorformat(enum chip_class chip_class, - enum pipe_format format) -{ - const struct util_format_description *desc = util_format_description(format); - if (!desc) - return V_028C70_COLOR_INVALID; - -#define HAS_SIZE(x, y, z, w) \ - (desc->channel[0].size == (x) && desc->channel[1].size == (y) && \ - desc->channel[2].size == (z) && desc->channel[3].size == (w)) - - if (format == PIPE_FORMAT_R11G11B10_FLOAT) /* isn't plain */ - return V_028C70_COLOR_10_11_11; - - if (chip_class >= GFX10_3 && - format == PIPE_FORMAT_R9G9B9E5_FLOAT) /* isn't plain */ - return V_028C70_COLOR_5_9_9_9; - - if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) - return V_028C70_COLOR_INVALID; - - /* hw cannot support mixed formats (except depth/stencil, since - * stencil is not written to). 
*/ - if (desc->is_mixed && desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS) - return V_028C70_COLOR_INVALID; - - switch (desc->nr_channels) { - case 1: - switch (desc->channel[0].size) { - case 8: - return V_028C70_COLOR_8; - case 16: - return V_028C70_COLOR_16; - case 32: - return V_028C70_COLOR_32; - } - break; - case 2: - if (desc->channel[0].size == desc->channel[1].size) { - switch (desc->channel[0].size) { - case 8: - return V_028C70_COLOR_8_8; - case 16: - return V_028C70_COLOR_16_16; - case 32: - return V_028C70_COLOR_32_32; - } - } else if (HAS_SIZE(8, 24, 0, 0)) { - return V_028C70_COLOR_24_8; - } else if (HAS_SIZE(24, 8, 0, 0)) { - return V_028C70_COLOR_8_24; - } - break; - case 3: - if (HAS_SIZE(5, 6, 5, 0)) { - return V_028C70_COLOR_5_6_5; - } else if (HAS_SIZE(32, 8, 24, 0)) { - return V_028C70_COLOR_X24_8_32_FLOAT; - } - break; - case 4: - if (desc->channel[0].size == desc->channel[1].size && - desc->channel[0].size == desc->channel[2].size && - desc->channel[0].size == desc->channel[3].size) { - switch (desc->channel[0].size) { - case 4: - return V_028C70_COLOR_4_4_4_4; - case 8: - return V_028C70_COLOR_8_8_8_8; - case 16: - return V_028C70_COLOR_16_16_16_16; - case 32: - return V_028C70_COLOR_32_32_32_32; - } - } else if (HAS_SIZE(5, 5, 5, 1)) { - return V_028C70_COLOR_1_5_5_5; - } else if (HAS_SIZE(1, 5, 5, 5)) { - return V_028C70_COLOR_5_5_5_1; - } else if (HAS_SIZE(10, 10, 10, 2)) { - return V_028C70_COLOR_2_10_10_10; - } - break; - } - return V_028C70_COLOR_INVALID; -} static uint32_t si_colorformat_endian_swap(uint32_t colorformat) { - if (SI_BIG_ENDIAN) { + if (UTIL_ARCH_BIG_ENDIAN) { switch (colorformat) { /* 8-bit buffers. */ case V_028C70_COLOR_8: @@ -1640,6 +1848,7 @@ static uint32_t si_colorformat_endian_swap(uint32_t colorformat) /* 32-bit buffers. 
*/ case V_028C70_COLOR_8_8_8_8: case V_028C70_COLOR_2_10_10_10: + case V_028C70_COLOR_10_10_10_2: case V_028C70_COLOR_8_24: case V_028C70_COLOR_24_8: case V_028C70_COLOR_16_16: @@ -1693,7 +1902,7 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen, enum pipe_for bool uniform = true; int i; - assert(sscreen->info.chip_class <= GFX9); + assert(sscreen->info.gfx_level <= GFX9); /* Colorspace (return non-RGB formats directly). */ switch (desc->colorspace) { @@ -1709,7 +1918,7 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen, enum pipe_for * gathers in stencil sampling. This affects at least * GL45-CTS.texture_cube_map_array.sampling on GFX8. */ - if (sscreen->info.chip_class <= GFX8) + if (sscreen->info.gfx_level <= GFX8) return V_008F14_IMG_DATA_FORMAT_8_8_8_8; if (format == PIPE_FORMAT_X24S8_UINT) @@ -1746,9 +1955,6 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen, enum pipe_for } if (desc->layout == UTIL_FORMAT_LAYOUT_RGTC) { - if (!sscreen->info.has_format_bc1_through_bc7) - goto out_unknown; - switch (format) { case PIPE_FORMAT_RGTC1_SNORM: case PIPE_FORMAT_LATC1_SNORM: @@ -1791,9 +1997,6 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen, enum pipe_for } if (desc->layout == UTIL_FORMAT_LAYOUT_BPTC) { - if (!sscreen->info.has_format_bc1_through_bc7) - goto out_unknown; - switch (format) { case PIPE_FORMAT_BPTC_RGBA_UNORM: case PIPE_FORMAT_BPTC_SRGBA: @@ -1820,9 +2023,6 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen, enum pipe_for } if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) { - if (!sscreen->info.has_format_bc1_through_bc7) - goto out_unknown; - switch (format) { case PIPE_FORMAT_DXT1_RGB: case PIPE_FORMAT_DXT1_RGBA: @@ -1846,13 +2046,35 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen, enum pipe_for return V_008F14_IMG_DATA_FORMAT_10_11_11; } - /* R8G8Bx_SNORM - TODO CxV8U8 */ + /* Other "OTHER" layouts are unsupported. 
*/ + if (desc->layout == UTIL_FORMAT_LAYOUT_OTHER) + goto out_unknown; /* hw cannot support mixed formats (except depth/stencil, since only * depth is read).*/ if (desc->is_mixed && desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS) goto out_unknown; + if (first_non_void < 0 || first_non_void > 3) + goto out_unknown; + + /* Reject SCALED formats because we don't implement them for CB and do the same for texturing. */ + if ((desc->channel[first_non_void].type == UTIL_FORMAT_TYPE_UNSIGNED || + desc->channel[first_non_void].type == UTIL_FORMAT_TYPE_SIGNED) && + !desc->channel[first_non_void].normalized && + !desc->channel[first_non_void].pure_integer) + goto out_unknown; + + /* Reject unsupported 32_*NORM and FIXED formats. */ + if (desc->channel[first_non_void].size == 32 && + (desc->channel[first_non_void].normalized || + desc->channel[first_non_void].type == UTIL_FORMAT_TYPE_FIXED)) + goto out_unknown; + + /* This format fails on Gfx8/Carrizo´. */ + if (format == PIPE_FORMAT_A8R8_UNORM) + goto out_unknown; + /* See whether the components are of the same size. */ for (i = 1; i < desc->nr_channels; i++) { uniform = uniform && desc->channel[0].size == desc->channel[i].size; @@ -1868,6 +2090,13 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen, enum pipe_for } goto out_unknown; case 4: + /* 5551 and 1555 UINT formats fail on Gfx8/Carrizo´. 
*/ + if (desc->channel[1].size == 5 && + desc->channel[2].size == 5 && + desc->channel[first_non_void].type == UTIL_FORMAT_TYPE_UNSIGNED && + desc->channel[first_non_void].pure_integer) + goto out_unknown; + if (desc->channel[0].size == 5 && desc->channel[1].size == 5 && desc->channel[2].size == 5 && desc->channel[3].size == 1) { return V_008F14_IMG_DATA_FORMAT_1_5_5_5; @@ -1885,18 +2114,16 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen, enum pipe_for goto out_unknown; } - if (first_non_void < 0 || first_non_void > 3) - goto out_unknown; - /* uniform formats */ switch (desc->channel[first_non_void].size) { case 4: switch (desc->nr_channels) { -#if 0 /* Not supported for render targets */ - case 2: - return V_008F14_IMG_DATA_FORMAT_4_4; -#endif case 4: + /* 4444 UINT formats fail on Gfx8/Carrizo´. */ + if (desc->channel[first_non_void].type == UTIL_FORMAT_TYPE_UNSIGNED && + desc->channel[first_non_void].pure_integer) + goto out_unknown; + return V_008F14_IMG_DATA_FORMAT_4_4_4_4; } break; @@ -1989,8 +2216,11 @@ static unsigned si_tex_mipfilter(unsigned filter) } } -static unsigned si_tex_compare(unsigned compare) +static unsigned si_tex_compare(unsigned mode, unsigned compare) { + if (mode == PIPE_TEX_COMPARE_NONE) + return V_008F30_SQ_TEX_DEPTH_COMPARE_NEVER; + switch (compare) { default: case PIPE_FUNC_NEVER: @@ -2025,7 +2255,7 @@ static unsigned si_tex_dim(struct si_screen *sscreen, struct si_texture *tex, un /* GFX9 allocates 1D textures as 2D. 
*/ if ((res_target == PIPE_TEXTURE_1D || res_target == PIPE_TEXTURE_1D_ARRAY) && - sscreen->info.chip_class == GFX9 && + sscreen->info.gfx_level == GFX9 && tex->surface.u.gfx9.resource_type == RADEON_RESOURCE_2D) { if (res_target == PIPE_TEXTURE_1D) res_target = PIPE_TEXTURE_2D; @@ -2059,18 +2289,20 @@ static unsigned si_tex_dim(struct si_screen *sscreen, struct si_texture *tex, un static bool si_is_sampler_format_supported(struct pipe_screen *screen, enum pipe_format format) { struct si_screen *sscreen = (struct si_screen *)screen; + const struct util_format_description *desc = util_format_description(format); - if (sscreen->info.chip_class >= GFX10) { - const struct gfx10_format *fmt = &gfx10_format_table[format]; + /* Samplers don't support 64 bits per channel. */ + if (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && + desc->channel[0].size == 64) + return false; + + if (sscreen->info.gfx_level >= GFX10) { + const struct gfx10_format *fmt = &ac_get_gfx10_format_table(&sscreen->info)[format]; if (!fmt->img_format || fmt->buffers_only) return false; return true; } - const struct util_format_description *desc = util_format_description(format); - if (!desc) - return false; - return si_translate_texformat(screen, format, desc, util_format_get_first_non_void_channel(format)) != ~0U; } @@ -2081,7 +2313,7 @@ static uint32_t si_translate_buffer_dataformat(struct pipe_screen *screen, { int i; - assert(((struct si_screen *)screen)->info.chip_class <= GFX9); + assert(((struct si_screen *)screen)->info.gfx_level <= GFX9); if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT) return V_008F0C_BUF_DATA_FORMAT_10_11_11; @@ -2155,7 +2387,7 @@ static uint32_t si_translate_buffer_numformat(struct pipe_screen *screen, const struct util_format_description *desc, int first_non_void) { - assert(((struct si_screen *)screen)->info.chip_class <= GFX9); + assert(((struct si_screen *)screen)->info.gfx_level <= GFX9); if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT) return 
V_008F0C_BUF_NUM_FORMAT_FLOAT; @@ -2198,8 +2430,6 @@ static unsigned si_is_vertex_format_supported(struct pipe_screen *screen, enum p 0); desc = util_format_description(format); - if (!desc) - return 0; /* There are no native 8_8_8 or 16_16_16 data formats, and we currently * select 8_8_8_8 and 16_16_16_16 instead. This works reasonably well @@ -2217,9 +2447,11 @@ static unsigned si_is_vertex_format_supported(struct pipe_screen *screen, enum p } } - if (sscreen->info.chip_class >= GFX10) { - const struct gfx10_format *fmt = &gfx10_format_table[format]; - if (!fmt->img_format || fmt->img_format >= 128) + if (sscreen->info.gfx_level >= GFX10) { + const struct gfx10_format *fmt = &ac_get_gfx10_format_table(&sscreen->info)[format]; + unsigned first_image_only_format = sscreen->info.gfx_level >= GFX11 ? 64 : 128; + + if (!fmt->img_format || fmt->img_format >= first_image_only_format) return 0; return usage; } @@ -2232,11 +2464,11 @@ static unsigned si_is_vertex_format_supported(struct pipe_screen *screen, enum p return usage; } -static bool si_is_colorbuffer_format_supported(enum chip_class chip_class, +static bool si_is_colorbuffer_format_supported(enum amd_gfx_level gfx_level, enum pipe_format format) { - return si_translate_colorformat(chip_class, format) != V_028C70_COLOR_INVALID && - si_translate_colorswap(format, false) != ~0U; + return ac_get_cb_format(gfx_level, format) != V_028C70_COLOR_INVALID && + si_translate_colorswap(gfx_level, format, false) != ~0U; } static bool si_is_zs_format_supported(enum pipe_format format) @@ -2256,6 +2488,12 @@ static bool si_is_format_supported(struct pipe_screen *screen, enum pipe_format return false; } + /* Require PIPE_BIND_SAMPLER_VIEW support when PIPE_BIND_RENDER_TARGET + * is requested. 
+ */ + if (usage & PIPE_BIND_RENDER_TARGET) + usage |= PIPE_BIND_SAMPLER_VIEW; + if ((target == PIPE_TEXTURE_3D || target == PIPE_TEXTURE_CUBE) && !sscreen->info.has_3d_cube_border_color_mipmap) return false; @@ -2277,12 +2515,14 @@ static bool si_is_format_supported(struct pipe_screen *screen, enum pipe_format /* Chips with 1 RB don't increment occlusion queries at 16x MSAA sample rate, * so don't expose 16 samples there. + * + * EQAA also uses max 8 samples because our FMASK fetches only load 32 bits and + * would need to be changed to 64 bits for 16 samples. */ - const unsigned max_eqaa_samples = util_bitcount(sscreen->info.enabled_rb_mask) <= 1 ? 8 : 16; const unsigned max_samples = 8; /* MSAA support without framebuffer attachments. */ - if (format == PIPE_FORMAT_NONE && sample_count <= max_eqaa_samples) + if (format == PIPE_FORMAT_NONE && sample_count <= max_samples) return true; if (!sscreen->info.has_eqaa_surface_allocator || util_format_is_depth_or_stencil(format)) { @@ -2291,7 +2531,7 @@ static bool si_is_format_supported(struct pipe_screen *screen, enum pipe_format return false; } else { /* Color with EQAA. 
*/ - if (sample_count > max_eqaa_samples || storage_sample_count > max_samples) + if (sample_count > max_samples || storage_sample_count > max_samples) return false; } } @@ -2308,7 +2548,7 @@ static bool si_is_format_supported(struct pipe_screen *screen, enum pipe_format if ((usage & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT | PIPE_BIND_SHARED | PIPE_BIND_BLENDABLE)) && - si_is_colorbuffer_format_supported(sscreen->info.chip_class, format)) { + si_is_colorbuffer_format_supported(sscreen->info.gfx_level, format)) { retval |= usage & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT | PIPE_BIND_SHARED); if (!util_format_is_pure_integer(format) && !util_format_is_depth_or_stencil(format)) @@ -2357,47 +2597,20 @@ static void si_choose_spi_color_formats(struct si_surface *surf, unsigned format static void si_initialize_color_surface(struct si_context *sctx, struct si_surface *surf) { struct si_texture *tex = (struct si_texture *)surf->base.texture; - unsigned color_info, color_attrib; unsigned format, swap, ntype, endian; const struct util_format_description *desc; - int firstchan; unsigned blend_clamp = 0, blend_bypass = 0; desc = util_format_description(surf->base.format); - for (firstchan = 0; firstchan < 4; firstchan++) { - if (desc->channel[firstchan].type != UTIL_FORMAT_TYPE_VOID) { - break; - } - } - if (firstchan == 4 || desc->channel[firstchan].type == UTIL_FORMAT_TYPE_FLOAT) { - ntype = V_028C70_NUMBER_FLOAT; - } else { - ntype = V_028C70_NUMBER_UNORM; - if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) - ntype = V_028C70_NUMBER_SRGB; - else if (desc->channel[firstchan].type == UTIL_FORMAT_TYPE_SIGNED) { - if (desc->channel[firstchan].pure_integer) { - ntype = V_028C70_NUMBER_SINT; - } else { - assert(desc->channel[firstchan].normalized); - ntype = V_028C70_NUMBER_SNORM; - } - } else if (desc->channel[firstchan].type == UTIL_FORMAT_TYPE_UNSIGNED) { - if (desc->channel[firstchan].pure_integer) { - ntype = 
V_028C70_NUMBER_UINT; - } else { - assert(desc->channel[firstchan].normalized); - ntype = V_028C70_NUMBER_UNORM; - } - } - } - format = si_translate_colorformat(sctx->chip_class, surf->base.format); + ntype = ac_get_cb_number_type(surf->base.format); + format = ac_get_cb_format(sctx->gfx_level, surf->base.format); + if (format == V_028C70_COLOR_INVALID) { PRINT_ERR("Invalid CB format: %d, disabling CB.\n", surf->base.format); } assert(format != V_028C70_COLOR_INVALID); - swap = si_translate_colorswap(surf->base.format, false); + swap = si_translate_colorswap(sctx->gfx_level, surf->base.format, false); endian = si_colorformat_endian_swap(format); /* blend clamp should be set for all NORM/SRGB types */ @@ -2422,35 +2635,14 @@ static void si_initialize_color_surface(struct si_context *sctx, struct si_surfa surf->color_is_int10 = true; } - color_info = - S_028C70_FORMAT(format) | S_028C70_COMP_SWAP(swap) | S_028C70_BLEND_CLAMP(blend_clamp) | - S_028C70_BLEND_BYPASS(blend_bypass) | S_028C70_SIMPLE_FLOAT(1) | - S_028C70_ROUND_MODE(ntype != V_028C70_NUMBER_UNORM && ntype != V_028C70_NUMBER_SNORM && - ntype != V_028C70_NUMBER_SRGB && format != V_028C70_COLOR_8_24 && - format != V_028C70_COLOR_24_8) | - S_028C70_NUMBER_TYPE(ntype) | S_028C70_ENDIAN(endian); - + unsigned log_samples = util_logbase2(tex->buffer.b.b.nr_samples); + unsigned log_fragments = util_logbase2(tex->buffer.b.b.nr_storage_samples); /* Intensity is implemented as Red, so treat it that way. 
*/ - color_attrib = S_028C74_FORCE_DST_ALPHA_1(desc->swizzle[3] == PIPE_SWIZZLE_1 || - util_format_is_intensity(surf->base.format)); - - if (tex->buffer.b.b.nr_samples > 1) { - unsigned log_samples = util_logbase2(tex->buffer.b.b.nr_samples); - unsigned log_fragments = util_logbase2(tex->buffer.b.b.nr_storage_samples); - - color_attrib |= S_028C74_NUM_SAMPLES(log_samples) | S_028C74_NUM_FRAGMENTS(log_fragments); - - if (tex->surface.fmask_offset) { - color_info |= S_028C70_COMPRESSION(1); - unsigned fmask_bankh = util_logbase2(tex->surface.u.legacy.color.fmask.bankh); - - if (sctx->chip_class == GFX6) { - /* due to a hw bug, FMASK_BANK_HEIGHT must be set on GFX6 too */ - color_attrib |= S_028C74_FMASK_BANK_HEIGHT(fmask_bankh); - } - } - } - + bool force_dst_alpha_1 = desc->swizzle[3] == PIPE_SWIZZLE_1 || + util_format_is_intensity(surf->base.format); + bool round_mode = ntype != V_028C70_NUMBER_UNORM && ntype != V_028C70_NUMBER_SNORM && + ntype != V_028C70_NUMBER_SRGB && + format != V_028C70_COLOR_8_24 && format != V_028C70_COLOR_24_8; /* amdvlk: [min-compressed-block-size] should be set to 32 for dGPU and * 64 for APU because all of our APUs to date use DIMMs which have * a request granularity size of 64B while all other chips have a @@ -2459,60 +2651,112 @@ static void si_initialize_color_surface(struct si_context *sctx, struct si_surfa if (!sctx->screen->info.has_dedicated_vram) min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_64B; - if (sctx->chip_class >= GFX10) { + surf->cb_color_info = S_028C70_COMP_SWAP(swap) | + S_028C70_BLEND_CLAMP(blend_clamp) | + S_028C70_BLEND_BYPASS(blend_bypass) | + S_028C70_SIMPLE_FLOAT(1) | + S_028C70_ROUND_MODE(round_mode) | + S_028C70_NUMBER_TYPE(ntype); + + unsigned width0 = surf->width0; + + /* GFX10.3+ can set a custom pitch for 1D and 2D non-array, but it must be a multiple of + * 256B. + * + * We set the pitch in MIP0_WIDTH. 
+ */ + if (sctx->gfx_level >= GFX10_3 && tex->surface.u.gfx9.uses_custom_pitch) { + ASSERTED unsigned min_alignment = 256; + assert((tex->surface.u.gfx9.surf_pitch * tex->surface.bpe) % min_alignment == 0); + assert(tex->buffer.b.b.target == PIPE_TEXTURE_2D || + tex->buffer.b.b.target == PIPE_TEXTURE_RECT); + assert(tex->surface.is_linear); + + width0 = tex->surface.u.gfx9.surf_pitch; + + /* Subsampled images have the pitch in the units of blocks. */ + if (tex->surface.blk_w == 2) + width0 *= 2; + } + + if (sctx->gfx_level >= GFX10) { + /* Gfx10-11. */ + surf->cb_color_view = S_028C6C_SLICE_START(surf->base.u.tex.first_layer) | + S_028C6C_SLICE_MAX_GFX10(surf->base.u.tex.last_layer) | + S_028C6C_MIP_LEVEL_GFX10(surf->base.u.tex.level); + surf->cb_color_attrib = 0; + surf->cb_color_attrib2 = S_028C68_MIP0_WIDTH(width0 - 1) | + S_028C68_MIP0_HEIGHT(surf->height0 - 1) | + S_028C68_MAX_MIP(tex->buffer.b.b.last_level); + surf->cb_color_attrib3 = S_028EE0_MIP0_DEPTH(util_max_layer(&tex->buffer.b.b, 0)) | + S_028EE0_RESOURCE_TYPE(tex->surface.u.gfx9.resource_type) | + S_028EE0_RESOURCE_LEVEL(sctx->gfx_level >= GFX11 ? 
0 : 1); surf->cb_dcc_control = S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) | S_028C78_MAX_COMPRESSED_BLOCK_SIZE(tex->surface.u.gfx9.color.dcc.max_compressed_block_size) | S_028C78_MIN_COMPRESSED_BLOCK_SIZE(min_compressed_block_size) | - S_028C78_INDEPENDENT_64B_BLOCKS(tex->surface.u.gfx9.color.dcc.independent_64B_blocks) | - S_028C78_INDEPENDENT_128B_BLOCKS(tex->surface.u.gfx9.color.dcc.independent_128B_blocks); - } else if (sctx->chip_class >= GFX8) { - unsigned max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_256B; - - if (tex->buffer.b.b.nr_storage_samples > 1) { - if (tex->surface.bpe == 1) - max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B; - else if (tex->surface.bpe == 2) - max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_128B; + S_028C78_INDEPENDENT_64B_BLOCKS(tex->surface.u.gfx9.color.dcc.independent_64B_blocks); + + if (sctx->gfx_level >= GFX11) { + assert(!UTIL_ARCH_BIG_ENDIAN); + surf->cb_color_info |= S_028C70_FORMAT_GFX11(format); + surf->cb_color_attrib |= S_028C74_NUM_FRAGMENTS_GFX11(log_fragments) | + S_028C74_FORCE_DST_ALPHA_1_GFX11(force_dst_alpha_1); + surf->cb_dcc_control |= S_028C78_INDEPENDENT_128B_BLOCKS_GFX11(tex->surface.u.gfx9.color.dcc.independent_128B_blocks); + } else { + surf->cb_color_info |= S_028C70_ENDIAN(endian) | + S_028C70_FORMAT_GFX6(format) | + S_028C70_COMPRESSION(!!tex->surface.fmask_offset); + surf->cb_color_attrib |= S_028C74_NUM_SAMPLES(log_samples) | + S_028C74_NUM_FRAGMENTS_GFX6(log_fragments) | + S_028C74_FORCE_DST_ALPHA_1_GFX6(force_dst_alpha_1); + surf->cb_dcc_control |= S_028C78_INDEPENDENT_128B_BLOCKS_GFX10(tex->surface.u.gfx9.color.dcc.independent_128B_blocks); + } + } else { + /* Gfx6-9. 
*/ + surf->cb_color_info |= S_028C70_ENDIAN(endian) | + S_028C70_FORMAT_GFX6(format) | + S_028C70_COMPRESSION(!!tex->surface.fmask_offset); + surf->cb_color_view = S_028C6C_SLICE_START(surf->base.u.tex.first_layer) | + S_028C6C_SLICE_MAX_GFX6(surf->base.u.tex.last_layer); + surf->cb_color_attrib = S_028C74_NUM_SAMPLES(log_samples) | + S_028C74_NUM_FRAGMENTS_GFX6(log_fragments) | + S_028C74_FORCE_DST_ALPHA_1_GFX6(force_dst_alpha_1); + surf->cb_color_attrib2 = 0; + surf->cb_dcc_control = 0; + + if (sctx->gfx_level == GFX9) { + surf->cb_color_view |= S_028C6C_MIP_LEVEL_GFX9(surf->base.u.tex.level); + surf->cb_color_attrib |= S_028C74_MIP0_DEPTH(util_max_layer(&tex->buffer.b.b, 0)) | + S_028C74_RESOURCE_TYPE(tex->surface.u.gfx9.resource_type); + surf->cb_color_attrib2 |= S_028C68_MIP0_WIDTH(surf->width0 - 1) | + S_028C68_MIP0_HEIGHT(surf->height0 - 1) | + S_028C68_MAX_MIP(tex->buffer.b.b.last_level); } - surf->cb_dcc_control = S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(max_uncompressed_block_size) | - S_028C78_MIN_COMPRESSED_BLOCK_SIZE(min_compressed_block_size) | - S_028C78_INDEPENDENT_64B_BLOCKS(1); - } - - /* This must be set for fast clear to work without FMASK. 
*/ - if (!tex->surface.fmask_size && sctx->chip_class == GFX6) { - unsigned bankh = util_logbase2(tex->surface.u.legacy.bankh); - color_attrib |= S_028C74_FMASK_BANK_HEIGHT(bankh); - } - - /* GFX10 field has the same base shift as the GFX6 field */ - unsigned color_view = S_028C6C_SLICE_START(surf->base.u.tex.first_layer) | - S_028C6C_SLICE_MAX_GFX10(surf->base.u.tex.last_layer); - unsigned mip0_depth = util_max_layer(&tex->buffer.b.b, 0); + if (sctx->gfx_level >= GFX8) { + unsigned max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_256B; - if (sctx->chip_class >= GFX10) { - color_view |= S_028C6C_MIP_LEVEL_GFX10(surf->base.u.tex.level); + if (tex->buffer.b.b.nr_storage_samples > 1) { + if (tex->surface.bpe == 1) + max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B; + else if (tex->surface.bpe == 2) + max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_128B; + } - surf->cb_color_attrib3 = S_028EE0_MIP0_DEPTH(mip0_depth) | - S_028EE0_RESOURCE_TYPE(tex->surface.u.gfx9.resource_type) | - S_028EE0_RESOURCE_LEVEL(1); - } else if (sctx->chip_class == GFX9) { - color_view |= S_028C6C_MIP_LEVEL_GFX9(surf->base.u.tex.level); - color_attrib |= S_028C74_MIP0_DEPTH(mip0_depth) | - S_028C74_RESOURCE_TYPE(tex->surface.u.gfx9.resource_type); - } + surf->cb_dcc_control |= S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(max_uncompressed_block_size) | + S_028C78_MIN_COMPRESSED_BLOCK_SIZE(min_compressed_block_size) | + S_028C78_INDEPENDENT_64B_BLOCKS(1); + } - if (sctx->chip_class >= GFX9) { - surf->cb_color_attrib2 = S_028C68_MIP0_WIDTH(surf->width0 - 1) | - S_028C68_MIP0_HEIGHT(surf->height0 - 1) | - S_028C68_MAX_MIP(tex->buffer.b.b.last_level); + if (sctx->gfx_level == GFX6) { + /* Due to a hw bug, FMASK_BANK_HEIGHT must still be set on GFX6. (inherited from GFX5) */ + /* This must also be set for fast clear to work without FMASK. */ + unsigned fmask_bankh = tex->surface.fmask_offset ? 
tex->surface.u.legacy.color.fmask.bankh + : tex->surface.u.legacy.bankh; + surf->cb_color_attrib |= S_028C74_FMASK_BANK_HEIGHT(util_logbase2(fmask_bankh)); + } } - surf->cb_color_view = color_view; - surf->cb_color_info = color_info; - surf->cb_color_attrib = color_attrib; - /* Determine pixel shader export format */ si_choose_spi_color_formats(surf, format, swap, ntype, tex->is_depth); @@ -2524,7 +2768,6 @@ static void si_init_depth_surface(struct si_context *sctx, struct si_surface *su struct si_texture *tex = (struct si_texture *)surf->base.texture; unsigned level = surf->base.u.tex.level; unsigned format, stencil_format; - uint32_t z_info, s_info; format = si_translate_dbformat(tex->db_render_format); stencil_format = tex->surface.has_stencil ? V_028044_STENCIL_8 : V_028044_STENCIL_INVALID; @@ -2533,50 +2776,68 @@ static void si_init_depth_surface(struct si_context *sctx, struct si_surface *su if (format == V_028040_Z_INVALID) PRINT_ERR("Invalid DB format: %d, disabling DB.\n", tex->buffer.b.b.format); - surf->db_depth_view = S_028008_SLICE_START(surf->base.u.tex.first_layer) | - S_028008_SLICE_MAX(surf->base.u.tex.last_layer); - surf->db_htile_data_base = 0; - surf->db_htile_surface = 0; - - if (sctx->chip_class >= GFX10) { - surf->db_depth_view |= S_028008_SLICE_START_HI(surf->base.u.tex.first_layer >> 11) | - S_028008_SLICE_MAX_HI(surf->base.u.tex.last_layer >> 11); + /* Use the original Z format, not db_render_format, so that the polygon offset behaves as + * expected by applications. 
+ */ + switch (tex->buffer.b.b.format) { + case PIPE_FORMAT_Z16_UNORM: + surf->db_format_index = 0; + break; + default: /* 24-bit */ + surf->db_format_index = 1; + break; + case PIPE_FORMAT_Z32_FLOAT: + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + surf->db_format_index = 2; + break; } - if (sctx->chip_class >= GFX9) { + if (sctx->gfx_level >= GFX9) { + surf->db_htile_data_base = 0; + surf->db_htile_surface = 0; + surf->db_depth_view = S_028008_SLICE_START(surf->base.u.tex.first_layer) | + S_028008_SLICE_MAX(surf->base.u.tex.last_layer); + if (sctx->gfx_level >= GFX10) { + surf->db_depth_view |= S_028008_SLICE_START_HI(surf->base.u.tex.first_layer >> 11) | + S_028008_SLICE_MAX_HI(surf->base.u.tex.last_layer >> 11); + } + assert(tex->surface.u.gfx9.surf_offset == 0); surf->db_depth_base = tex->buffer.gpu_address >> 8; surf->db_stencil_base = (tex->buffer.gpu_address + tex->surface.u.gfx9.zs.stencil_offset) >> 8; - z_info = S_028038_FORMAT(format) | - S_028038_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples)) | - S_028038_SW_MODE(tex->surface.u.gfx9.swizzle_mode) | - S_028038_MAXMIP(tex->buffer.b.b.last_level); - s_info = S_02803C_FORMAT(stencil_format) | - S_02803C_SW_MODE(tex->surface.u.gfx9.zs.stencil_swizzle_mode); - - if (sctx->chip_class == GFX9) { + surf->db_z_info = S_028038_FORMAT(format) | + S_028038_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples)) | + S_028038_SW_MODE(tex->surface.u.gfx9.swizzle_mode) | + S_028038_MAXMIP(tex->buffer.b.b.last_level) | + S_028040_ITERATE_256(sctx->gfx_level >= GFX11); + surf->db_stencil_info = S_02803C_FORMAT(stencil_format) | + S_02803C_SW_MODE(tex->surface.u.gfx9.zs.stencil_swizzle_mode) | + S_028044_ITERATE_256(sctx->gfx_level >= GFX11); + + if (sctx->gfx_level == GFX9) { surf->db_z_info2 = S_028068_EPITCH(tex->surface.u.gfx9.epitch); surf->db_stencil_info2 = S_02806C_EPITCH(tex->surface.u.gfx9.zs.stencil_epitch); } surf->db_depth_view |= S_028008_MIPID(level); - surf->db_depth_size = - 
S_02801C_X_MAX(tex->buffer.b.b.width0 - 1) | S_02801C_Y_MAX(tex->buffer.b.b.height0 - 1); + surf->db_depth_size = S_02801C_X_MAX(tex->buffer.b.b.width0 - 1) | + S_02801C_Y_MAX(tex->buffer.b.b.height0 - 1); if (si_htile_enabled(tex, level, PIPE_MASK_ZS)) { - z_info |= S_028038_TILE_SURFACE_ENABLE(1) | S_028038_ALLOW_EXPCLEAR(1); - s_info |= S_02803C_TILE_STENCIL_DISABLE(tex->htile_stencil_disabled); + surf->db_z_info |= S_028038_TILE_SURFACE_ENABLE(1) | + S_028038_ALLOW_EXPCLEAR(1); + surf->db_stencil_info |= S_02803C_TILE_STENCIL_DISABLE(tex->htile_stencil_disabled); if (tex->surface.has_stencil && !tex->htile_stencil_disabled) { /* Stencil buffer workaround ported from the GFX6-GFX8 code. * See that for explanation. */ - s_info |= S_02803C_ALLOW_EXPCLEAR(tex->buffer.b.b.nr_samples <= 1); + surf->db_stencil_info |= S_02803C_ALLOW_EXPCLEAR(tex->buffer.b.b.nr_samples <= 1); } surf->db_htile_data_base = (tex->buffer.gpu_address + tex->surface.meta_offset) >> 8; - surf->db_htile_surface = - S_028ABC_FULL_CACHE(1) | S_028ABC_PIPE_ALIGNED(1); - if (sctx->chip_class == GFX9) { + surf->db_htile_surface = S_028ABC_FULL_CACHE(1) | + S_028ABC_PIPE_ALIGNED(1); + if (sctx->gfx_level == GFX9) { surf->db_htile_surface |= S_028ABC_RB_ALIGNED(1); } } @@ -2590,13 +2851,16 @@ static void si_init_depth_surface(struct si_context *sctx, struct si_surface *su (tex->buffer.gpu_address >> 8) + tex->surface.u.legacy.level[level].offset_256B; surf->db_stencil_base = (tex->buffer.gpu_address >> 8) + tex->surface.u.legacy.zs.stencil_level[level].offset_256B; - - z_info = - S_028040_FORMAT(format) | S_028040_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples)); - s_info = S_028044_FORMAT(stencil_format); + surf->db_htile_data_base = 0; + surf->db_htile_surface = 0; + surf->db_depth_view = S_028008_SLICE_START(surf->base.u.tex.first_layer) | + S_028008_SLICE_MAX(surf->base.u.tex.last_layer); + surf->db_z_info = S_028040_FORMAT(format) | + 
S_028040_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples)); + surf->db_stencil_info = S_028044_FORMAT(stencil_format); surf->db_depth_info = 0; - if (sctx->chip_class >= GFX7) { + if (sctx->gfx_level >= GFX7) { struct radeon_info *info = &sctx->screen->info; unsigned index = tex->surface.u.legacy.tiling_index[level]; unsigned stencil_index = tex->surface.u.legacy.zs.stencil_tiling_index[level]; @@ -2611,13 +2875,13 @@ static void si_init_depth_surface(struct si_context *sctx, struct si_surface *su S_02803C_BANK_HEIGHT(G_009990_BANK_HEIGHT(macro_mode)) | S_02803C_MACRO_TILE_ASPECT(G_009990_MACRO_TILE_ASPECT(macro_mode)) | S_02803C_NUM_BANKS(G_009990_NUM_BANKS(macro_mode)); - z_info |= S_028040_TILE_SPLIT(G_009910_TILE_SPLIT(tile_mode)); - s_info |= S_028044_TILE_SPLIT(G_009910_TILE_SPLIT(stencil_tile_mode)); + surf->db_z_info |= S_028040_TILE_SPLIT(G_009910_TILE_SPLIT(tile_mode)); + surf->db_stencil_info |= S_028044_TILE_SPLIT(G_009910_TILE_SPLIT(stencil_tile_mode)); } else { unsigned tile_mode_index = si_tile_mode_index(tex, level, false); - z_info |= S_028040_TILE_MODE_INDEX(tile_mode_index); + surf->db_z_info |= S_028040_TILE_MODE_INDEX(tile_mode_index); tile_mode_index = si_tile_mode_index(tex, level, true); - s_info |= S_028044_TILE_MODE_INDEX(tile_mode_index); + surf->db_stencil_info |= S_028044_TILE_MODE_INDEX(tile_mode_index); } surf->db_depth_size = S_028058_PITCH_TILE_MAX((levelinfo->nblk_x / 8) - 1) | @@ -2626,8 +2890,9 @@ static void si_init_depth_surface(struct si_context *sctx, struct si_surface *su S_02805C_SLICE_TILE_MAX((levelinfo->nblk_x * levelinfo->nblk_y) / 64 - 1); if (si_htile_enabled(tex, level, PIPE_MASK_ZS)) { - z_info |= S_028040_TILE_SURFACE_ENABLE(1) | S_028040_ALLOW_EXPCLEAR(1); - s_info |= S_028044_TILE_STENCIL_DISABLE(tex->htile_stencil_disabled); + surf->db_z_info |= S_028040_TILE_SURFACE_ENABLE(1) | + S_028040_ALLOW_EXPCLEAR(1); + surf->db_stencil_info |= S_028044_TILE_STENCIL_DISABLE(tex->htile_stencil_disabled); if 
(tex->surface.has_stencil) { /* Workaround: For a not yet understood reason, the @@ -2642,7 +2907,7 @@ static void si_init_depth_surface(struct si_context *sctx, struct si_surface *su * test if you want to try changing this. */ if (tex->buffer.b.b.nr_samples <= 1) - s_info |= S_028044_ALLOW_EXPCLEAR(1); + surf->db_stencil_info |= S_028044_ALLOW_EXPCLEAR(1); } surf->db_htile_data_base = (tex->buffer.gpu_address + tex->surface.meta_offset) >> 8; @@ -2650,12 +2915,24 @@ static void si_init_depth_surface(struct si_context *sctx, struct si_surface *su } } - surf->db_z_info = z_info; - surf->db_stencil_info = s_info; - surf->depth_initialized = true; } +void si_set_sampler_depth_decompress_mask(struct si_context *sctx, struct si_texture *tex) +{ + /* Check all sampler bindings in all shaders where depth textures are bound, and update + * which samplers should be decompressed. + */ + u_foreach_bit(sh, sctx->shader_has_depth_tex) { + u_foreach_bit(i, sctx->samplers[sh].has_depth_tex_mask) { + if (sctx->samplers[sh].views[i]->texture == &tex->buffer.b.b) { + sctx->samplers[sh].needs_depth_decompress_mask |= 1 << i; + sctx->shader_needs_decompress_mask |= 1 << sh; + } + } + } +} + void si_update_fb_dirtiness_after_rendering(struct si_context *sctx) { if (sctx->decompression_enabled) @@ -2669,6 +2946,8 @@ void si_update_fb_dirtiness_after_rendering(struct si_context *sctx) if (tex->surface.has_stencil) tex->stencil_dirty_level_mask |= 1 << surf->u.tex.level; + + si_set_sampler_depth_decompress_mask(sctx, tex); } unsigned compressed_cb_mask = sctx->framebuffer.compressed_cb_mask; @@ -2738,7 +3017,9 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, bool old_has_stencil = old_has_zsbuf && ((struct si_texture *)sctx->framebuffer.state.zsbuf->texture)->surface.has_stencil; - bool unbound = false; + uint8_t old_db_format_index = + old_has_zsbuf ? 
+ ((struct si_surface *)sctx->framebuffer.state.zsbuf)->db_format_index : -1; int i; /* Reject zero-sized framebuffers due to a hw bug on GFX6 that occurs @@ -2753,31 +3034,23 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, si_update_fb_dirtiness_after_rendering(sctx); /* Disable DCC if the formats are incompatible. */ - for (i = 0; i < state->nr_cbufs; i++) { - if (!state->cbufs[i]) - continue; - - surf = (struct si_surface *)state->cbufs[i]; - tex = (struct si_texture *)surf->base.texture; + if (sctx->gfx_level >= GFX8 && sctx->gfx_level < GFX11) { + for (i = 0; i < state->nr_cbufs; i++) { + if (!state->cbufs[i]) + continue; - if (!surf->dcc_incompatible) - continue; + surf = (struct si_surface *)state->cbufs[i]; + tex = (struct si_texture *)surf->base.texture; - /* Since the DCC decompression calls back into set_framebuffer- - * _state, we need to unbind the framebuffer, so that - * vi_separate_dcc_stop_query isn't called twice with the same - * color buffer. - */ - if (!unbound) { - util_copy_framebuffer_state(&sctx->framebuffer.state, NULL); - unbound = true; - } + if (!surf->dcc_incompatible) + continue; - if (vi_dcc_enabled(tex, surf->base.u.tex.level)) - if (!si_texture_disable_dcc(sctx, tex)) - si_decompress_dcc(sctx, tex); + if (vi_dcc_enabled(tex, surf->base.u.tex.level)) + if (!si_texture_disable_dcc(sctx, tex)) + si_decompress_dcc(sctx, tex); - surf->dcc_incompatible = false; + surf->dcc_incompatible = false; + } } /* Only flush TC when changing the framebuffer state, because @@ -2788,6 +3061,9 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, * - FB write -> shader read * - shader write -> FB read * + * Wait for draws because of possible transitions: + * - texture -> render (eg: glBlitFramebuffer(with src=dst) then glDraw*) + * * DB caches are flushed on demand (using si_decompress_textures). 
* * When MSAA is enabled, CB and TC caches are flushed on demand @@ -2803,7 +3079,8 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, sctx->framebuffer.all_DCC_pipe_aligned); } - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; + sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PS_PARTIAL_FLUSH; + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); /* u_blitter doesn't invoke depth decompression when it does multiple * blits in a row, but the only case when it matters for DB is when @@ -2813,7 +3090,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, */ if (sctx->generate_mipmap_for_depth) { si_make_DB_shader_coherent(sctx, 1, false, sctx->framebuffer.DB_has_shader_readable_metadata); - } else if (sctx->chip_class == GFX9) { + } else if (sctx->gfx_level == GFX9) { /* It appears that DB metadata "leaks" in a sequence of: * - depth clear * - DCC decompress for shader image writes (with DB disabled) @@ -2821,6 +3098,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, * Flushing DB metadata works around the problem. */ sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB_META; + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); } /* Take the maximum of the old and new count. If the new count is lower, @@ -2832,6 +3110,8 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, si_dec_framebuffer_counters(&sctx->framebuffer.state); util_copy_framebuffer_state(&sctx->framebuffer.state, state); + /* Recompute layers because frontends and utils might not set it. 
*/ + sctx->framebuffer.state.layers = util_framebuffer_get_num_layers(state); sctx->framebuffer.colorbuf_enabled_4bit = 0; sctx->framebuffer.spi_shader_col_format = 0; @@ -2852,6 +3132,8 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, sctx->framebuffer.all_DCC_pipe_aligned = true; sctx->framebuffer.has_dcc_msaa = false; sctx->framebuffer.min_bytes_per_pixel = 0; + sctx->framebuffer.disable_vrs_flat_shading = false; + sctx->framebuffer.has_stencil = false; for (i = 0; i < state->nr_cbufs; i++) { if (!state->cbufs[i]) @@ -2897,28 +3179,28 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, if (vi_dcc_enabled(tex, surf->base.u.tex.level)) { sctx->framebuffer.CB_has_shader_readable_metadata = true; - if (sctx->chip_class >= GFX9 && !tex->surface.u.gfx9.color.dcc.pipe_aligned) + if (sctx->gfx_level >= GFX9 && !tex->surface.u.gfx9.color.dcc.pipe_aligned) sctx->framebuffer.all_DCC_pipe_aligned = false; if (tex->buffer.b.b.nr_storage_samples >= 2) sctx->framebuffer.has_dcc_msaa = true; } - si_context_add_resource_size(sctx, surf->base.texture); - p_atomic_inc(&tex->framebuffers_bound); /* Update the minimum but don't keep 0. */ if (!sctx->framebuffer.min_bytes_per_pixel || tex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel) sctx->framebuffer.min_bytes_per_pixel = tex->surface.bpe; - } - /* For optimal DCC performance. */ - if (sctx->chip_class >= GFX10) - sctx->framebuffer.dcc_overwrite_combiner_watermark = 6; - else - sctx->framebuffer.dcc_overwrite_combiner_watermark = 4; + /* Disable VRS flat shading where it decreases performance. + * This gives the best results for slow clears for AMD_TEST=blitperf on Navi31. 
+ */ + if ((sctx->framebuffer.nr_samples == 8 && tex->surface.bpe != 2) || + (tex->surface.thick_tiling && tex->surface.bpe == 4 && + util_format_get_nr_components(surf->base.format) == 4)) + sctx->framebuffer.disable_vrs_flat_shading = true; + } struct si_texture *zstex = NULL; @@ -2933,16 +3215,21 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, if (vi_tc_compat_htile_enabled(zstex, surf->base.u.tex.level, PIPE_MASK_ZS)) sctx->framebuffer.DB_has_shader_readable_metadata = true; - si_context_add_resource_size(sctx, surf->base.texture); - /* Update the minimum but don't keep 0. */ if (!sctx->framebuffer.min_bytes_per_pixel || zstex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel) sctx->framebuffer.min_bytes_per_pixel = zstex->surface.bpe; + + /* Update polygon offset based on the Z format. */ + if (sctx->queued.named.rasterizer->uses_poly_offset && + surf->db_format_index != old_db_format_index) + (sctx)->dirty_atoms |= SI_STATE_BIT(rasterizer); + + if (util_format_has_stencil(util_format_description(zstex->buffer.b.b.format))) + sctx->framebuffer.has_stencil = true; } si_update_ps_colorbuf0_slot(sctx); - si_update_poly_offset_state(sctx); si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer); @@ -2956,7 +3243,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, if (sctx->framebuffer.any_dst_linear != old_any_dst_linear) si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); - if (sctx->screen->has_out_of_order_rast && + if (sctx->screen->info.has_out_of_order_rast && (sctx->framebuffer.colorbuf_enabled_4bit != old_colorbuf_enabled_4bit || !!sctx->framebuffer.state.zsbuf != old_has_zsbuf || (zstex && zstex->surface.has_stencil != old_has_stencil))) @@ -2982,19 +3269,19 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, break; case 2: constbuf.buffer_offset = - (ubyte *)sctx->sample_positions.x2 - (ubyte *)sctx->sample_positions.x1; + (uint8_t 
*)sctx->sample_positions.x2 - (uint8_t *)sctx->sample_positions.x1; break; case 4: constbuf.buffer_offset = - (ubyte *)sctx->sample_positions.x4 - (ubyte *)sctx->sample_positions.x1; + (uint8_t *)sctx->sample_positions.x4 - (uint8_t *)sctx->sample_positions.x1; break; case 8: constbuf.buffer_offset = - (ubyte *)sctx->sample_positions.x8 - (ubyte *)sctx->sample_positions.x1; + (uint8_t *)sctx->sample_positions.x8 - (uint8_t *)sctx->sample_positions.x1; break; case 16: constbuf.buffer_offset = - (ubyte *)sctx->sample_positions.x16 - (ubyte *)sctx->sample_positions.x1; + (uint8_t *)sctx->sample_positions.x16 - (uint8_t *)sctx->sample_positions.x1; break; default: PRINT_ERR("Requested an invalid number of samples %i.\n", sctx->framebuffer.nr_samples); @@ -3003,13 +3290,15 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, constbuf.buffer_size = sctx->framebuffer.nr_samples * 2 * 4; si_set_internal_const_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS, &constbuf); - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); + si_mark_atom_dirty(sctx, &sctx->atoms.s.sample_locations); } si_ps_key_update_framebuffer(sctx); - si_ps_key_update_framebuffer_blend(sctx); + si_ps_key_update_framebuffer_blend_rasterizer(sctx); si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx); + si_vs_ps_key_update_rast_prim_smooth_stipple(sctx); si_update_ps_inputs_read_or_disabled(sctx); + si_update_vrs_flat_shading(sctx); sctx->do_update_shaders = true; if (!sctx->decompression_enabled) { @@ -3020,82 +3309,91 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, } } -static void si_emit_framebuffer_state(struct si_context *sctx) +static void si_emit_framebuffer_state(struct si_context *sctx, unsigned index) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; struct pipe_framebuffer_state *state = &sctx->framebuffer.state; unsigned i, nr_cbufs = state->nr_cbufs; struct si_texture *tex = NULL; struct si_surface *cb = NULL; - unsigned cb_color_info = 0; + bool 
is_msaa_resolve = state->nr_cbufs == 2 && + state->cbufs[0] && state->cbufs[0]->texture->nr_samples > 1 && + state->cbufs[1] && state->cbufs[1]->texture->nr_samples <= 1; + + /* CB can't do MSAA resolve on gfx11. */ + assert(!is_msaa_resolve || sctx->gfx_level < GFX11); radeon_begin(cs); /* Colorbuffers. */ for (i = 0; i < nr_cbufs; i++) { - uint64_t cb_color_base, cb_color_fmask, cb_color_cmask, cb_dcc_base; - unsigned cb_color_attrib; - if (!(sctx->framebuffer.dirty_cbufs & (1 << i))) continue; + /* RB+ depth-only rendering. See the comment where we set rbplus_depth_only_opt for more + * information. + */ + if (i == 0 && + sctx->screen->info.rbplus_allowed && + !sctx->queued.named.blend->cb_target_mask) { + radeon_set_context_reg(R_028C70_CB_COLOR0_INFO + i * 0x3C, + (sctx->gfx_level >= GFX11 ? + S_028C70_FORMAT_GFX11(V_028C70_COLOR_32) : + S_028C70_FORMAT_GFX6(V_028C70_COLOR_32)) | + S_028C70_NUMBER_TYPE(V_028C70_NUMBER_FLOAT)); + continue; + } + cb = (struct si_surface *)state->cbufs[i]; if (!cb) { - radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, - S_028C70_FORMAT(V_028C70_COLOR_INVALID)); + radeon_set_context_reg(R_028C70_CB_COLOR0_INFO + i * 0x3C, + sctx->gfx_level >= GFX11 ? + S_028C70_FORMAT_GFX11(V_028C70_COLOR_INVALID) : + S_028C70_FORMAT_GFX6(V_028C70_COLOR_INVALID)); continue; } tex = (struct si_texture *)cb->base.texture; radeon_add_to_buffer_list( - sctx, &sctx->gfx_cs, &tex->buffer, RADEON_USAGE_READWRITE | RADEON_USAGE_NEEDS_IMPLICIT_SYNC, - tex->buffer.b.b.nr_samples > 1 ? RADEON_PRIO_COLOR_BUFFER_MSAA : RADEON_PRIO_COLOR_BUFFER); + sctx, &sctx->gfx_cs, &tex->buffer, RADEON_USAGE_READWRITE | RADEON_USAGE_CB_NEEDS_IMPLICIT_SYNC | + (tex->buffer.b.b.nr_samples > 1 ? 
RADEON_PRIO_COLOR_BUFFER_MSAA : RADEON_PRIO_COLOR_BUFFER)); if (tex->cmask_buffer && tex->cmask_buffer != &tex->buffer) { radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, tex->cmask_buffer, - RADEON_USAGE_READWRITE | RADEON_USAGE_NEEDS_IMPLICIT_SYNC, + RADEON_USAGE_READWRITE | RADEON_USAGE_CB_NEEDS_IMPLICIT_SYNC | RADEON_PRIO_SEPARATE_META); } /* Compute mutable surface parameters. */ - cb_color_base = tex->buffer.gpu_address >> 8; - cb_color_fmask = 0; - cb_color_cmask = tex->cmask_base_address_reg; - cb_dcc_base = 0; - cb_color_info = cb->cb_color_info | tex->cb_color_info; - cb_color_attrib = cb->cb_color_attrib; - - if (tex->swap_rgb_to_bgr) { - /* Swap R and B channels. */ - static unsigned rgb_to_bgr[4] = { - [V_028C70_SWAP_STD] = V_028C70_SWAP_ALT, - [V_028C70_SWAP_ALT] = V_028C70_SWAP_STD, - [V_028C70_SWAP_STD_REV] = V_028C70_SWAP_ALT_REV, - [V_028C70_SWAP_ALT_REV] = V_028C70_SWAP_STD_REV, - }; - unsigned swap = rgb_to_bgr[G_028C70_COMP_SWAP(cb_color_info)]; + uint64_t cb_color_base = tex->buffer.gpu_address >> 8; + uint64_t cb_dcc_base = 0; + unsigned cb_color_info = cb->cb_color_info | tex->cb_color_info; + + if (sctx->gfx_level < GFX11) { + if (tex->swap_rgb_to_bgr) { + /* Swap R and B channels. 
*/ + static unsigned rgb_to_bgr[4] = { + [V_028C70_SWAP_STD] = V_028C70_SWAP_ALT, + [V_028C70_SWAP_ALT] = V_028C70_SWAP_STD, + [V_028C70_SWAP_STD_REV] = V_028C70_SWAP_ALT_REV, + [V_028C70_SWAP_ALT_REV] = V_028C70_SWAP_STD_REV, + }; + unsigned swap = rgb_to_bgr[G_028C70_COMP_SWAP(cb_color_info)]; + + cb_color_info &= C_028C70_COMP_SWAP; + cb_color_info |= S_028C70_COMP_SWAP(swap); + } - cb_color_info &= C_028C70_COMP_SWAP; - cb_color_info |= S_028C70_COMP_SWAP(swap); - } + if (cb->base.u.tex.level > 0) + cb_color_info &= C_028C70_FAST_CLEAR; - if (cb->base.u.tex.level > 0) - cb_color_info &= C_028C70_FAST_CLEAR; - if (tex->surface.fmask_offset) { - cb_color_fmask = (tex->buffer.gpu_address + tex->surface.fmask_offset) >> 8; - cb_color_fmask |= tex->surface.fmask_tile_swizzle; + if (vi_dcc_enabled(tex, cb->base.u.tex.level) && (i != 1 || !is_msaa_resolve)) + cb_color_info |= S_028C70_DCC_ENABLE(1); } /* Set up DCC. */ if (vi_dcc_enabled(tex, cb->base.u.tex.level)) { - bool is_msaa_resolve_dst = state->cbufs[0] && state->cbufs[0]->texture->nr_samples > 1 && - state->cbufs[1] == &cb->base && - state->cbufs[1]->texture->nr_samples <= 1; - - if (!is_msaa_resolve_dst) - cb_color_info |= S_028C70_DCC_ENABLE(1); - cb_dcc_base = (tex->buffer.gpu_address + tex->surface.meta_offset) >> 8; unsigned dcc_tile_swizzle = tex->surface.tile_swizzle; @@ -3103,16 +3401,57 @@ static void si_emit_framebuffer_state(struct si_context *sctx) cb_dcc_base |= dcc_tile_swizzle; } - if (sctx->chip_class >= GFX10) { + if (sctx->gfx_level >= GFX11) { + unsigned cb_color_attrib3, cb_fdcc_control; + + /* Set mutable surface parameters. 
*/ + cb_color_base += tex->surface.u.gfx9.surf_offset >> 8; + cb_color_base |= tex->surface.tile_swizzle; + + cb_color_attrib3 = cb->cb_color_attrib3 | + S_028EE0_COLOR_SW_MODE(tex->surface.u.gfx9.swizzle_mode) | + S_028EE0_DCC_PIPE_ALIGNED(tex->surface.u.gfx9.color.dcc.pipe_aligned); + cb_fdcc_control = cb->cb_dcc_control | + S_028C78_DISABLE_CONSTANT_ENCODE_REG(1) | + S_028C78_FDCC_ENABLE(vi_dcc_enabled(tex, cb->base.u.tex.level)); + + if (sctx->family >= CHIP_GFX1103_R2) { + cb_fdcc_control |= S_028C78_ENABLE_MAX_COMP_FRAG_OVERRIDE(1) | + S_028C78_MAX_COMP_FRAGS(cb->base.texture->nr_samples >= 4); + } + + radeon_set_context_reg(R_028C60_CB_COLOR0_BASE + i * 0x3C, cb_color_base); + + radeon_set_context_reg_seq(R_028C6C_CB_COLOR0_VIEW + i * 0x3C, 4); + radeon_emit(cb->cb_color_view); /* CB_COLOR0_VIEW */ + radeon_emit(cb_color_info); /* CB_COLOR0_INFO */ + radeon_emit(cb->cb_color_attrib); /* CB_COLOR0_ATTRIB */ + radeon_emit(cb_fdcc_control); /* CB_COLOR0_FDCC_CONTROL */ + + radeon_set_context_reg(R_028C94_CB_COLOR0_DCC_BASE + i * 0x3C, cb_dcc_base); + radeon_set_context_reg(R_028E40_CB_COLOR0_BASE_EXT + i * 4, cb_color_base >> 32); + radeon_set_context_reg(R_028EA0_CB_COLOR0_DCC_BASE_EXT + i * 4, cb_dcc_base >> 32); + radeon_set_context_reg(R_028EC0_CB_COLOR0_ATTRIB2 + i * 4, cb->cb_color_attrib2); + radeon_set_context_reg(R_028EE0_CB_COLOR0_ATTRIB3 + i * 4, cb_color_attrib3); + } else if (sctx->gfx_level >= GFX10) { unsigned cb_color_attrib3; + uint64_t cb_color_fmask, cb_color_cmask; /* Set mutable surface parameters. 
*/ cb_color_base += tex->surface.u.gfx9.surf_offset >> 8; cb_color_base |= tex->surface.tile_swizzle; - if (!tex->surface.fmask_offset) + + if (tex->surface.fmask_offset) { + cb_color_fmask = (tex->buffer.gpu_address + tex->surface.fmask_offset) >> 8; + cb_color_fmask |= tex->surface.fmask_tile_swizzle; + } else { cb_color_fmask = cb_color_base; + } + if (cb->base.u.tex.level > 0) cb_color_cmask = cb_color_base; + else + cb_color_cmask = tex->cmask_base_address_reg; cb_color_attrib3 = cb->cb_color_attrib3 | S_028EE0_COLOR_SW_MODE(tex->surface.u.gfx9.swizzle_mode) | @@ -3120,35 +3459,37 @@ static void si_emit_framebuffer_state(struct si_context *sctx) S_028EE0_CMASK_PIPE_ALIGNED(1) | S_028EE0_DCC_PIPE_ALIGNED(tex->surface.u.gfx9.color.dcc.pipe_aligned); - radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 14); - radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */ - radeon_emit(cs, 0); /* hole */ - radeon_emit(cs, 0); /* hole */ - radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */ - radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */ - radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */ - radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ - radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */ - radeon_emit(cs, 0); /* hole */ - radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */ - radeon_emit(cs, 0); /* hole */ - radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ - radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ - radeon_emit(cs, cb_dcc_base); /* CB_COLOR0_DCC_BASE */ - - radeon_set_context_reg(cs, R_028E40_CB_COLOR0_BASE_EXT + i * 4, cb_color_base >> 32); - radeon_set_context_reg(cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + i * 4, + radeon_set_context_reg_seq(R_028C60_CB_COLOR0_BASE + i * 0x3C, 14); + radeon_emit(cb_color_base); /* CB_COLOR0_BASE */ + radeon_emit(0); /* hole */ + radeon_emit(0); /* hole */ + radeon_emit(cb->cb_color_view); /* CB_COLOR0_VIEW */ + 
radeon_emit(cb_color_info); /* CB_COLOR0_INFO */ + radeon_emit(cb->cb_color_attrib); /* CB_COLOR0_ATTRIB */ + radeon_emit(cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ + radeon_emit(cb_color_cmask); /* CB_COLOR0_CMASK */ + radeon_emit(0); /* hole */ + radeon_emit(cb_color_fmask); /* CB_COLOR0_FMASK */ + radeon_emit(0); /* hole */ + radeon_emit(tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ + radeon_emit(tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ + radeon_emit(cb_dcc_base); /* CB_COLOR0_DCC_BASE */ + + radeon_set_context_reg(R_028E40_CB_COLOR0_BASE_EXT + i * 4, cb_color_base >> 32); + radeon_set_context_reg(R_028E60_CB_COLOR0_CMASK_BASE_EXT + i * 4, cb_color_cmask >> 32); - radeon_set_context_reg(cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + i * 4, + radeon_set_context_reg(R_028E80_CB_COLOR0_FMASK_BASE_EXT + i * 4, cb_color_fmask >> 32); - radeon_set_context_reg(cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + i * 4, cb_dcc_base >> 32); - radeon_set_context_reg(cs, R_028EC0_CB_COLOR0_ATTRIB2 + i * 4, cb->cb_color_attrib2); - radeon_set_context_reg(cs, R_028EE0_CB_COLOR0_ATTRIB3 + i * 4, cb_color_attrib3); - } else if (sctx->chip_class == GFX9) { + radeon_set_context_reg(R_028EA0_CB_COLOR0_DCC_BASE_EXT + i * 4, cb_dcc_base >> 32); + radeon_set_context_reg(R_028EC0_CB_COLOR0_ATTRIB2 + i * 4, cb->cb_color_attrib2); + radeon_set_context_reg(R_028EE0_CB_COLOR0_ATTRIB3 + i * 4, cb_color_attrib3); + } else if (sctx->gfx_level == GFX9) { struct gfx9_surf_meta_flags meta = { .rb_aligned = 1, .pipe_aligned = 1, }; + unsigned cb_color_attrib = cb->cb_color_attrib; + uint64_t cb_color_fmask, cb_color_cmask; if (!tex->is_depth && tex->surface.meta_offset) meta = tex->surface.u.gfx9.color.dcc; @@ -3156,33 +3497,42 @@ static void si_emit_framebuffer_state(struct si_context *sctx) /* Set mutable surface parameters. 
*/ cb_color_base += tex->surface.u.gfx9.surf_offset >> 8; cb_color_base |= tex->surface.tile_swizzle; - if (!tex->surface.fmask_offset) + + if (tex->surface.fmask_offset) { + cb_color_fmask = (tex->buffer.gpu_address + tex->surface.fmask_offset) >> 8; + cb_color_fmask |= tex->surface.fmask_tile_swizzle; + } else { cb_color_fmask = cb_color_base; + } + if (cb->base.u.tex.level > 0) cb_color_cmask = cb_color_base; + else + cb_color_cmask = tex->cmask_base_address_reg; + cb_color_attrib |= S_028C74_COLOR_SW_MODE(tex->surface.u.gfx9.swizzle_mode) | S_028C74_FMASK_SW_MODE(tex->surface.u.gfx9.color.fmask_swizzle_mode) | S_028C74_RB_ALIGNED(meta.rb_aligned) | S_028C74_PIPE_ALIGNED(meta.pipe_aligned); - radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 15); - radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */ - radeon_emit(cs, S_028C64_BASE_256B(cb_color_base >> 32)); /* CB_COLOR0_BASE_EXT */ - radeon_emit(cs, cb->cb_color_attrib2); /* CB_COLOR0_ATTRIB2 */ - radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */ - radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */ - radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */ - radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ - radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */ - radeon_emit(cs, S_028C80_BASE_256B(cb_color_cmask >> 32)); /* CB_COLOR0_CMASK_BASE_EXT */ - radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */ - radeon_emit(cs, S_028C88_BASE_256B(cb_color_fmask >> 32)); /* CB_COLOR0_FMASK_BASE_EXT */ - radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ - radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ - radeon_emit(cs, cb_dcc_base); /* CB_COLOR0_DCC_BASE */ - radeon_emit(cs, S_028C98_BASE_256B(cb_dcc_base >> 32)); /* CB_COLOR0_DCC_BASE_EXT */ - - radeon_set_context_reg(cs, R_0287A0_CB_MRT0_EPITCH + i * 4, + radeon_set_context_reg_seq(R_028C60_CB_COLOR0_BASE + i * 0x3C, 15); + radeon_emit(cb_color_base); /* CB_COLOR0_BASE */ + 
radeon_emit(S_028C64_BASE_256B(cb_color_base >> 32)); /* CB_COLOR0_BASE_EXT */ + radeon_emit(cb->cb_color_attrib2); /* CB_COLOR0_ATTRIB2 */ + radeon_emit(cb->cb_color_view); /* CB_COLOR0_VIEW */ + radeon_emit(cb_color_info); /* CB_COLOR0_INFO */ + radeon_emit(cb_color_attrib); /* CB_COLOR0_ATTRIB */ + radeon_emit(cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ + radeon_emit(cb_color_cmask); /* CB_COLOR0_CMASK */ + radeon_emit(S_028C80_BASE_256B(cb_color_cmask >> 32)); /* CB_COLOR0_CMASK_BASE_EXT */ + radeon_emit(cb_color_fmask); /* CB_COLOR0_FMASK */ + radeon_emit(S_028C88_BASE_256B(cb_color_fmask >> 32)); /* CB_COLOR0_FMASK_BASE_EXT */ + radeon_emit(tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ + radeon_emit(tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ + radeon_emit(cb_dcc_base); /* CB_COLOR0_DCC_BASE */ + radeon_emit(S_028C98_BASE_256B(cb_dcc_base >> 32)); /* CB_COLOR0_DCC_BASE_EXT */ + + radeon_set_context_reg(R_0287A0_CB_MRT0_EPITCH + i * 4, S_0287A0_EPITCH(tex->surface.u.gfx9.epitch)); } else { /* Compute mutable surface parameters (GFX6-GFX8). */ @@ -3190,16 +3540,26 @@ static void si_emit_framebuffer_state(struct si_context *sctx) &tex->surface.u.legacy.level[cb->base.u.tex.level]; unsigned pitch_tile_max, slice_tile_max, tile_mode_index; unsigned cb_color_pitch, cb_color_slice, cb_color_fmask_slice; + unsigned cb_color_attrib = cb->cb_color_attrib; + uint64_t cb_color_fmask, cb_color_cmask; cb_color_base += level_info->offset_256B; /* Only macrotiled modes can set tile swizzle. 
*/ if (level_info->mode == RADEON_SURF_MODE_2D) cb_color_base |= tex->surface.tile_swizzle; - if (!tex->surface.fmask_offset) + if (tex->surface.fmask_offset) { + cb_color_fmask = (tex->buffer.gpu_address + tex->surface.fmask_offset) >> 8; + cb_color_fmask |= tex->surface.fmask_tile_swizzle; + } else { cb_color_fmask = cb_color_base; + } + if (cb->base.u.tex.level > 0) cb_color_cmask = cb_color_base; + else + cb_color_cmask = tex->cmask_base_address_reg; + if (cb_dcc_base) cb_dcc_base += tex->surface.u.legacy.color.dcc_level[cb->base.u.tex.level].dcc_offset >> 8; @@ -3212,7 +3572,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx) cb_color_slice = S_028C68_TILE_MAX(slice_tile_max); if (tex->surface.fmask_offset) { - if (sctx->chip_class >= GFX7) + if (sctx->gfx_level >= GFX7) cb_color_pitch |= S_028C64_FMASK_TILE_MAX(tex->surface.u.legacy.color.fmask.pitch_in_pixels / 8 - 1); cb_color_attrib |= @@ -3220,35 +3580,35 @@ static void si_emit_framebuffer_state(struct si_context *sctx) cb_color_fmask_slice = S_028C88_TILE_MAX(tex->surface.u.legacy.color.fmask.slice_tile_max); } else { /* This must be set for fast clear to work without FMASK. */ - if (sctx->chip_class >= GFX7) + if (sctx->gfx_level >= GFX7) cb_color_pitch |= S_028C64_FMASK_TILE_MAX(pitch_tile_max); cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(tile_mode_index); cb_color_fmask_slice = S_028C88_TILE_MAX(slice_tile_max); } - radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, - sctx->chip_class >= GFX8 ? 
14 : 13); - radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */ - radeon_emit(cs, cb_color_pitch); /* CB_COLOR0_PITCH */ - radeon_emit(cs, cb_color_slice); /* CB_COLOR0_SLICE */ - radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */ - radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */ - radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */ - radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ - radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */ - radeon_emit(cs, tex->surface.u.legacy.color.cmask_slice_tile_max); /* CB_COLOR0_CMASK_SLICE */ - radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */ - radeon_emit(cs, cb_color_fmask_slice); /* CB_COLOR0_FMASK_SLICE */ - radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ - radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ - - if (sctx->chip_class >= GFX8) /* R_028C94_CB_COLOR0_DCC_BASE */ - radeon_emit(cs, cb_dcc_base); + radeon_set_context_reg_seq(R_028C60_CB_COLOR0_BASE + i * 0x3C, + sctx->gfx_level >= GFX8 ? 
14 : 13); + radeon_emit(cb_color_base); /* CB_COLOR0_BASE */ + radeon_emit(cb_color_pitch); /* CB_COLOR0_PITCH */ + radeon_emit(cb_color_slice); /* CB_COLOR0_SLICE */ + radeon_emit(cb->cb_color_view); /* CB_COLOR0_VIEW */ + radeon_emit(cb_color_info); /* CB_COLOR0_INFO */ + radeon_emit(cb_color_attrib); /* CB_COLOR0_ATTRIB */ + radeon_emit(cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ + radeon_emit(cb_color_cmask); /* CB_COLOR0_CMASK */ + radeon_emit(tex->surface.u.legacy.color.cmask_slice_tile_max); /* CB_COLOR0_CMASK_SLICE */ + radeon_emit(cb_color_fmask); /* CB_COLOR0_FMASK */ + radeon_emit(cb_color_fmask_slice); /* CB_COLOR0_FMASK_SLICE */ + radeon_emit(tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ + radeon_emit(tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ + + if (sctx->gfx_level >= GFX8) /* R_028C94_CB_COLOR0_DCC_BASE */ + radeon_emit(cb_dcc_base); } } for (; i < 8; i++) if (sctx->framebuffer.dirty_cbufs & (1 << i)) - radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, 0); + radeon_set_context_reg(R_028C70_CB_COLOR0_INFO + i * 0x3C, 0); /* ZS buffer. */ if (state->zsbuf && sctx->framebuffer.dirty_zsbuf) { @@ -3258,75 +3618,86 @@ static void si_emit_framebuffer_state(struct si_context *sctx) unsigned db_stencil_info = zb->db_stencil_info; unsigned db_htile_surface = zb->db_htile_surface; - radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, &tex->buffer, RADEON_USAGE_READWRITE, - zb->base.texture->nr_samples > 1 ? RADEON_PRIO_DEPTH_BUFFER_MSAA - : RADEON_PRIO_DEPTH_BUFFER); + radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, &tex->buffer, RADEON_USAGE_READWRITE | + (zb->base.texture->nr_samples > 1 ? RADEON_PRIO_DEPTH_BUFFER_MSAA + : RADEON_PRIO_DEPTH_BUFFER)); + bool tc_compat_htile = vi_tc_compat_htile_enabled(tex, zb->base.u.tex.level, PIPE_MASK_ZS); /* Set fields dependent on tc_compatile_htile. 
*/ - if (sctx->chip_class >= GFX9 && - vi_tc_compat_htile_enabled(tex, zb->base.u.tex.level, PIPE_MASK_ZS)) { + if (sctx->gfx_level >= GFX9 && tc_compat_htile) { unsigned max_zplanes = 4; if (tex->db_render_format == PIPE_FORMAT_Z16_UNORM && tex->buffer.b.b.nr_samples > 1) max_zplanes = 2; - db_z_info |= S_028038_DECOMPRESS_ON_N_ZPLANES(max_zplanes + 1); - - if (sctx->chip_class >= GFX10) { - db_z_info |= S_028040_ITERATE_FLUSH(1); - db_stencil_info |= S_028044_ITERATE_FLUSH(!tex->htile_stencil_disabled); + if (sctx->gfx_level >= GFX10) { + bool iterate256 = tex->buffer.b.b.nr_samples >= 2; + db_z_info |= S_028040_ITERATE_FLUSH(1) | + S_028040_ITERATE_256(iterate256); + db_stencil_info |= S_028044_ITERATE_FLUSH(!tex->htile_stencil_disabled) | + S_028044_ITERATE_256(iterate256); + + /* Workaround for a DB hang when ITERATE_256 is set to 1. Only affects 4X MSAA D/S images. */ + if (sctx->screen->info.has_two_planes_iterate256_bug && iterate256 && + !tex->htile_stencil_disabled && tex->buffer.b.b.nr_samples == 4) { + max_zplanes = 1; + } } else { db_z_info |= S_028038_ITERATE_FLUSH(1); db_stencil_info |= S_02803C_ITERATE_FLUSH(1); } + + db_z_info |= S_028038_DECOMPRESS_ON_N_ZPLANES(max_zplanes + 1); } unsigned level = zb->base.u.tex.level; - if (sctx->chip_class >= GFX10) { - radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); - radeon_set_context_reg(cs, R_02801C_DB_DEPTH_SIZE_XY, zb->db_depth_size); + if (sctx->gfx_level >= GFX10) { + radeon_set_context_reg(R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); + radeon_set_context_reg(R_02801C_DB_DEPTH_SIZE_XY, zb->db_depth_size); - radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 7); - radeon_emit(cs, S_02803C_RESOURCE_LEVEL(1)); /* DB_DEPTH_INFO */ - radeon_emit(cs, db_z_info | /* DB_Z_INFO */ - S_028038_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0)); - radeon_emit(cs, db_stencil_info); /* DB_STENCIL_INFO */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */ - 
radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ - - radeon_set_context_reg_seq(cs, R_028068_DB_Z_READ_BASE_HI, 5); - radeon_emit(cs, zb->db_depth_base >> 32); /* DB_Z_READ_BASE_HI */ - radeon_emit(cs, zb->db_stencil_base >> 32); /* DB_STENCIL_READ_BASE_HI */ - radeon_emit(cs, zb->db_depth_base >> 32); /* DB_Z_WRITE_BASE_HI */ - radeon_emit(cs, zb->db_stencil_base >> 32); /* DB_STENCIL_WRITE_BASE_HI */ - radeon_emit(cs, zb->db_htile_data_base >> 32); /* DB_HTILE_DATA_BASE_HI */ - } else if (sctx->chip_class == GFX9) { - radeon_set_context_reg_seq(cs, R_028014_DB_HTILE_DATA_BASE, 3); - radeon_emit(cs, zb->db_htile_data_base); /* DB_HTILE_DATA_BASE */ - radeon_emit(cs, - S_028018_BASE_HI(zb->db_htile_data_base >> 32)); /* DB_HTILE_DATA_BASE_HI */ - radeon_emit(cs, zb->db_depth_size); /* DB_DEPTH_SIZE */ - - radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 10); - radeon_emit(cs, db_z_info | /* DB_Z_INFO */ - S_028038_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0)); - radeon_emit(cs, db_stencil_info); /* DB_STENCIL_INFO */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */ - radeon_emit(cs, S_028044_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_READ_BASE_HI */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ - radeon_emit(cs, S_02804C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_READ_BASE_HI */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */ - radeon_emit(cs, S_028054_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_WRITE_BASE_HI */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ - radeon_emit(cs, - S_02805C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */ - - radeon_set_context_reg_seq(cs, R_028068_DB_Z_INFO2, 2); - radeon_emit(cs, zb->db_z_info2); /* DB_Z_INFO2 */ - radeon_emit(cs, zb->db_stencil_info2); /* DB_STENCIL_INFO2 */ + if 
(sctx->gfx_level >= GFX11) { + radeon_set_context_reg_seq(R_028040_DB_Z_INFO, 6); + } else { + radeon_set_context_reg_seq(R_02803C_DB_DEPTH_INFO, 7); + radeon_emit(S_02803C_RESOURCE_LEVEL(1)); /* DB_DEPTH_INFO */ + } + radeon_emit(db_z_info | /* DB_Z_INFO */ + S_028038_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0)); + radeon_emit(db_stencil_info); /* DB_STENCIL_INFO */ + radeon_emit(zb->db_depth_base); /* DB_Z_READ_BASE */ + radeon_emit(zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ + radeon_emit(zb->db_depth_base); /* DB_Z_WRITE_BASE */ + radeon_emit(zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ + + radeon_set_context_reg_seq(R_028068_DB_Z_READ_BASE_HI, 5); + radeon_emit(zb->db_depth_base >> 32); /* DB_Z_READ_BASE_HI */ + radeon_emit(zb->db_stencil_base >> 32); /* DB_STENCIL_READ_BASE_HI */ + radeon_emit(zb->db_depth_base >> 32); /* DB_Z_WRITE_BASE_HI */ + radeon_emit(zb->db_stencil_base >> 32); /* DB_STENCIL_WRITE_BASE_HI */ + radeon_emit(zb->db_htile_data_base >> 32); /* DB_HTILE_DATA_BASE_HI */ + } else if (sctx->gfx_level == GFX9) { + radeon_set_context_reg_seq(R_028014_DB_HTILE_DATA_BASE, 3); + radeon_emit(zb->db_htile_data_base); /* DB_HTILE_DATA_BASE */ + radeon_emit(S_028018_BASE_HI(zb->db_htile_data_base >> 32)); /* DB_HTILE_DATA_BASE_HI */ + radeon_emit(zb->db_depth_size); /* DB_DEPTH_SIZE */ + + radeon_set_context_reg_seq(R_028038_DB_Z_INFO, 10); + radeon_emit(db_z_info | /* DB_Z_INFO */ + S_028038_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0)); + radeon_emit(db_stencil_info); /* DB_STENCIL_INFO */ + radeon_emit(zb->db_depth_base); /* DB_Z_READ_BASE */ + radeon_emit(S_028044_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_READ_BASE_HI */ + radeon_emit(zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ + radeon_emit(S_02804C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_READ_BASE_HI */ + radeon_emit(zb->db_depth_base); /* DB_Z_WRITE_BASE */ + radeon_emit(S_028054_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_WRITE_BASE_HI */ + 
radeon_emit(zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ + radeon_emit(S_02805C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */ + + radeon_set_context_reg_seq(R_028068_DB_Z_INFO2, 2); + radeon_emit(zb->db_z_info2); /* DB_Z_INFO2 */ + radeon_emit(zb->db_stencil_info2); /* DB_STENCIL_INFO2 */ } else { /* GFX6-GFX8 */ /* Set fields dependent on tc_compatile_htile. */ @@ -3344,46 +3715,52 @@ static void si_emit_framebuffer_state(struct si_context *sctx) } } - radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); + radeon_set_context_reg(R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); - radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 9); - radeon_emit(cs, zb->db_depth_info | /* DB_DEPTH_INFO */ + radeon_set_context_reg_seq(R_02803C_DB_DEPTH_INFO, 9); + radeon_emit(zb->db_depth_info | /* DB_DEPTH_INFO */ S_02803C_ADDR5_SWIZZLE_MASK(!tex->tc_compatible_htile)); - radeon_emit(cs, db_z_info | /* DB_Z_INFO */ - S_028040_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0)); - radeon_emit(cs, db_stencil_info); /* DB_STENCIL_INFO */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ - radeon_emit(cs, zb->db_depth_size); /* DB_DEPTH_SIZE */ - radeon_emit(cs, zb->db_depth_slice); /* DB_DEPTH_SLICE */ + radeon_emit(db_z_info | /* DB_Z_INFO */ + S_028040_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0)); + radeon_emit(db_stencil_info); /* DB_STENCIL_INFO */ + radeon_emit(zb->db_depth_base); /* DB_Z_READ_BASE */ + radeon_emit(zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ + radeon_emit(zb->db_depth_base); /* DB_Z_WRITE_BASE */ + radeon_emit(zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ + radeon_emit(zb->db_depth_size); /* DB_DEPTH_SIZE */ + radeon_emit(zb->db_depth_slice); /* DB_DEPTH_SLICE */ } - 
radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2); - radeon_emit(cs, tex->stencil_clear_value[level]); /* R_028028_DB_STENCIL_CLEAR */ - radeon_emit(cs, fui(tex->depth_clear_value[level])); /* R_02802C_DB_DEPTH_CLEAR */ + radeon_set_context_reg_seq(R_028028_DB_STENCIL_CLEAR, 2); + radeon_emit(tex->stencil_clear_value[level]); /* R_028028_DB_STENCIL_CLEAR */ + radeon_emit(fui(tex->depth_clear_value[level])); /* R_02802C_DB_DEPTH_CLEAR */ - radeon_set_context_reg(cs, R_028008_DB_DEPTH_VIEW, zb->db_depth_view); - radeon_set_context_reg(cs, R_028ABC_DB_HTILE_SURFACE, db_htile_surface); + radeon_set_context_reg(R_028008_DB_DEPTH_VIEW, zb->db_depth_view); + radeon_set_context_reg(R_028ABC_DB_HTILE_SURFACE, db_htile_surface); } else if (sctx->framebuffer.dirty_zsbuf) { - if (sctx->chip_class == GFX9) - radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 2); + if (sctx->gfx_level == GFX9) + radeon_set_context_reg_seq(R_028038_DB_Z_INFO, 2); else - radeon_set_context_reg_seq(cs, R_028040_DB_Z_INFO, 2); + radeon_set_context_reg_seq(R_028040_DB_Z_INFO, 2); - radeon_emit(cs, S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */ - radeon_emit(cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */ + /* Gfx11+: DB_Z_INFO.NUM_SAMPLES should match the framebuffer samples if no Z/S is bound. + * It determines the sample count for VRS, primitive-ordered pixel shading, and occlusion + * queries. + */ + radeon_emit(S_028040_FORMAT(V_028040_Z_INVALID) | /* DB_Z_INFO */ + S_028040_NUM_SAMPLES(sctx->gfx_level >= GFX11 ? sctx->framebuffer.log_samples : 0)); + radeon_emit(S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */ } /* Framebuffer dimensions. 
*/ - /* PA_SC_WINDOW_SCISSOR_TL is set in si_init_cs_preamble_state */ - radeon_set_context_reg(cs, R_028208_PA_SC_WINDOW_SCISSOR_BR, + /* PA_SC_WINDOW_SCISSOR_TL is set to 0,0 in gfx*_init_gfx_preamble_state */ + radeon_set_context_reg(R_028208_PA_SC_WINDOW_SCISSOR_BR, S_028208_BR_X(state->width) | S_028208_BR_Y(state->height)); - if (sctx->screen->dpbb_allowed) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); + if (sctx->screen->dpbb_allowed && + sctx->screen->pbb_context_states_per_bin > 1) { + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); } radeon_end(); @@ -3393,65 +3770,187 @@ static void si_emit_framebuffer_state(struct si_context *sctx) sctx->framebuffer.dirty_zsbuf = false; } -static void si_emit_msaa_sample_locs(struct si_context *sctx) +static void gfx11_dgpu_emit_framebuffer_state(struct si_context *sctx, unsigned index) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - unsigned nr_samples = sctx->framebuffer.nr_samples; - bool has_msaa_sample_loc_bug = sctx->screen->info.has_msaa_sample_loc_bug; + struct pipe_framebuffer_state *state = &sctx->framebuffer.state; + unsigned i, nr_cbufs = state->nr_cbufs; + struct si_texture *tex = NULL; + struct si_surface *cb = NULL; + bool is_msaa_resolve = state->nr_cbufs == 2 && + state->cbufs[0] && state->cbufs[0]->texture->nr_samples > 1 && + state->cbufs[1] && state->cbufs[1]->texture->nr_samples <= 1; - /* Smoothing (only possible with nr_samples == 1) uses the same - * sample locations as the MSAA it simulates. - */ - if (nr_samples <= 1 && sctx->smoothing_enabled) - nr_samples = SI_NUM_SMOOTH_AA_SAMPLES; - - /* On Polaris, the small primitive filter uses the sample locations - * even when MSAA is off, so we need to make sure they're set to 0. 
- * - * GFX10 uses sample locations unconditionally, so they always need - * to be set up. - */ - if ((nr_samples >= 2 || has_msaa_sample_loc_bug || sctx->chip_class >= GFX10) && - nr_samples != sctx->sample_locs_num_samples) { - sctx->sample_locs_num_samples = nr_samples; - si_emit_sample_locations(cs, nr_samples); - } + /* CB can't do MSAA resolve on gfx11. */ + assert(!is_msaa_resolve); radeon_begin(cs); + gfx11_begin_packed_context_regs(); - if (sctx->family >= CHIP_POLARIS10) { - unsigned small_prim_filter_cntl = - S_028830_SMALL_PRIM_FILTER_ENABLE(1) | - /* line bug */ - S_028830_LINE_FILTER_DISABLE(sctx->family <= CHIP_POLARIS12); + /* Colorbuffers. */ + for (i = 0; i < nr_cbufs; i++) { + if (!(sctx->framebuffer.dirty_cbufs & (1 << i))) + continue; - /* For hardware with the sample location bug, the problem is that in order to use the small - * primitive filter, we need to explicitly set the sample locations to 0. But the DB doesn't - * properly process the change of sample locations without a flush, and so we can end up - * with incorrect Z values. - * - * Instead of doing a flush, just disable the small primitive filter when MSAA is - * force-disabled. - * - * The alternative of setting sample locations to 0 would require a DB flush to avoid - * Z errors, see https://bugs.freedesktop.org/show_bug.cgi?id=96908 + /* RB+ depth-only rendering. See the comment where we set rbplus_depth_only_opt for more + * information. 
*/ - if (has_msaa_sample_loc_bug && sctx->framebuffer.nr_samples > 1 && !rs->multisample_enable) - small_prim_filter_cntl &= C_028830_SMALL_PRIM_FILTER_ENABLE; + if (i == 0 && + sctx->screen->info.rbplus_allowed && + !sctx->queued.named.blend->cb_target_mask) { + gfx11_set_context_reg(R_028C70_CB_COLOR0_INFO + i * 0x3C, + S_028C70_FORMAT_GFX11(V_028C70_COLOR_32) | + S_028C70_NUMBER_TYPE(V_028C70_NUMBER_FLOAT)); + continue; + } + + cb = (struct si_surface *)state->cbufs[i]; + if (!cb) { + gfx11_set_context_reg(R_028C70_CB_COLOR0_INFO + i * 0x3C, + S_028C70_FORMAT_GFX11(V_028C70_COLOR_INVALID)); + continue; + } + + tex = (struct si_texture *)cb->base.texture; + radeon_add_to_buffer_list( + sctx, &sctx->gfx_cs, &tex->buffer, RADEON_USAGE_READWRITE | RADEON_USAGE_CB_NEEDS_IMPLICIT_SYNC | + (tex->buffer.b.b.nr_samples > 1 ? RADEON_PRIO_COLOR_BUFFER_MSAA : RADEON_PRIO_COLOR_BUFFER)); + + if (tex->cmask_buffer && tex->cmask_buffer != &tex->buffer) { + radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, tex->cmask_buffer, + RADEON_USAGE_READWRITE | RADEON_USAGE_CB_NEEDS_IMPLICIT_SYNC | + RADEON_PRIO_SEPARATE_META); + } + + /* Compute mutable surface parameters. */ + uint64_t cb_color_base = tex->buffer.gpu_address >> 8; + uint64_t cb_dcc_base = 0; + unsigned cb_color_info = cb->cb_color_info | tex->cb_color_info; + + /* Set up DCC. */ + if (vi_dcc_enabled(tex, cb->base.u.tex.level)) { + cb_dcc_base = (tex->buffer.gpu_address + tex->surface.meta_offset) >> 8; + + unsigned dcc_tile_swizzle = tex->surface.tile_swizzle; + dcc_tile_swizzle &= ((1 << tex->surface.meta_alignment_log2) - 1) >> 8; + cb_dcc_base |= dcc_tile_swizzle; + } - radeon_opt_set_context_reg(sctx, R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL, - SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL, small_prim_filter_cntl); + unsigned cb_color_attrib3, cb_fdcc_control; + + /* Set mutable surface parameters. 
*/ + cb_color_base += tex->surface.u.gfx9.surf_offset >> 8; + cb_color_base |= tex->surface.tile_swizzle; + + cb_color_attrib3 = cb->cb_color_attrib3 | + S_028EE0_COLOR_SW_MODE(tex->surface.u.gfx9.swizzle_mode) | + S_028EE0_DCC_PIPE_ALIGNED(tex->surface.u.gfx9.color.dcc.pipe_aligned); + cb_fdcc_control = cb->cb_dcc_control | + S_028C78_DISABLE_CONSTANT_ENCODE_REG(1) | + S_028C78_FDCC_ENABLE(vi_dcc_enabled(tex, cb->base.u.tex.level)); + + if (sctx->family >= CHIP_GFX1103_R2) { + cb_fdcc_control |= S_028C78_ENABLE_MAX_COMP_FRAG_OVERRIDE(1) | + S_028C78_MAX_COMP_FRAGS(cb->base.texture->nr_samples >= 4); + } + + gfx11_set_context_reg(R_028C60_CB_COLOR0_BASE + i * 0x3C, cb_color_base); + gfx11_set_context_reg(R_028C6C_CB_COLOR0_VIEW + i * 0x3C, cb->cb_color_view); + gfx11_set_context_reg(R_028C70_CB_COLOR0_INFO + i * 0x3C, cb_color_info); + gfx11_set_context_reg(R_028C74_CB_COLOR0_ATTRIB + i * 0x3C, cb->cb_color_attrib); + gfx11_set_context_reg(R_028C78_CB_COLOR0_DCC_CONTROL + i * 0x3C, cb_fdcc_control); + gfx11_set_context_reg(R_028C94_CB_COLOR0_DCC_BASE + i * 0x3C, cb_dcc_base); + gfx11_set_context_reg(R_028E40_CB_COLOR0_BASE_EXT + i * 4, cb_color_base >> 32); + gfx11_set_context_reg(R_028EA0_CB_COLOR0_DCC_BASE_EXT + i * 4, cb_dcc_base >> 32); + gfx11_set_context_reg(R_028EC0_CB_COLOR0_ATTRIB2 + i * 4, cb->cb_color_attrib2); + gfx11_set_context_reg(R_028EE0_CB_COLOR0_ATTRIB3 + i * 4, cb_color_attrib3); } + for (; i < 8; i++) + if (sctx->framebuffer.dirty_cbufs & (1 << i)) + gfx11_set_context_reg(R_028C70_CB_COLOR0_INFO + i * 0x3C, 0); - /* The exclusion bits can be set to improve rasterization efficiency - * if no sample lies on the pixel boundary (-8 sample offset). 
- */ - bool exclusion = sctx->chip_class >= GFX7 && (!rs->multisample_enable || nr_samples != 16); - radeon_opt_set_context_reg( - sctx, R_02882C_PA_SU_PRIM_FILTER_CNTL, SI_TRACKED_PA_SU_PRIM_FILTER_CNTL, - S_02882C_XMAX_RIGHT_EXCLUSION(exclusion) | S_02882C_YMAX_BOTTOM_EXCLUSION(exclusion)); + /* ZS buffer. */ + if (state->zsbuf && sctx->framebuffer.dirty_zsbuf) { + struct si_surface *zb = (struct si_surface *)state->zsbuf; + struct si_texture *tex = (struct si_texture *)zb->base.texture; + unsigned db_z_info = zb->db_z_info; + unsigned db_stencil_info = zb->db_stencil_info; + unsigned db_htile_surface = zb->db_htile_surface; + + radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, &tex->buffer, RADEON_USAGE_READWRITE | + (zb->base.texture->nr_samples > 1 ? RADEON_PRIO_DEPTH_BUFFER_MSAA + : RADEON_PRIO_DEPTH_BUFFER)); + bool tc_compat_htile = vi_tc_compat_htile_enabled(tex, zb->base.u.tex.level, PIPE_MASK_ZS); + + /* Set fields dependent on tc_compatile_htile. */ + if (tc_compat_htile) { + unsigned max_zplanes = 4; + + if (tex->db_render_format == PIPE_FORMAT_Z16_UNORM && tex->buffer.b.b.nr_samples > 1) + max_zplanes = 2; + + bool iterate256 = tex->buffer.b.b.nr_samples >= 2; + db_z_info |= S_028040_ITERATE_FLUSH(1) | + S_028040_ITERATE_256(iterate256); + db_stencil_info |= S_028044_ITERATE_FLUSH(!tex->htile_stencil_disabled) | + S_028044_ITERATE_256(iterate256); + + /* Workaround for a DB hang when ITERATE_256 is set to 1. Only affects 4X MSAA D/S images. 
*/ + if (sctx->screen->info.has_two_planes_iterate256_bug && iterate256 && + !tex->htile_stencil_disabled && tex->buffer.b.b.nr_samples == 4) + max_zplanes = 1; + + db_z_info |= S_028038_DECOMPRESS_ON_N_ZPLANES(max_zplanes + 1); + } + + unsigned level = zb->base.u.tex.level; + + gfx11_set_context_reg(R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); + gfx11_set_context_reg(R_02801C_DB_DEPTH_SIZE_XY, zb->db_depth_size); + gfx11_set_context_reg(R_028040_DB_Z_INFO, db_z_info | + S_028038_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0)); + gfx11_set_context_reg(R_028044_DB_STENCIL_INFO, db_stencil_info); + gfx11_set_context_reg(R_028048_DB_Z_READ_BASE, zb->db_depth_base); + gfx11_set_context_reg(R_02804C_DB_STENCIL_READ_BASE, zb->db_stencil_base); + gfx11_set_context_reg(R_028050_DB_Z_WRITE_BASE, zb->db_depth_base); + gfx11_set_context_reg(R_028054_DB_STENCIL_WRITE_BASE, zb->db_stencil_base); + gfx11_set_context_reg(R_028068_DB_Z_READ_BASE_HI, zb->db_depth_base >> 32); + gfx11_set_context_reg(R_02806C_DB_STENCIL_READ_BASE_HI, zb->db_stencil_base >> 32); + gfx11_set_context_reg(R_028070_DB_Z_WRITE_BASE_HI, zb->db_depth_base >> 32); + gfx11_set_context_reg(R_028074_DB_STENCIL_WRITE_BASE_HI, zb->db_stencil_base >> 32); + gfx11_set_context_reg(R_028078_DB_HTILE_DATA_BASE_HI, zb->db_htile_data_base >> 32); + gfx11_set_context_reg(R_028028_DB_STENCIL_CLEAR, tex->stencil_clear_value[level]); + gfx11_set_context_reg(R_02802C_DB_DEPTH_CLEAR, fui(tex->depth_clear_value[level])); + gfx11_set_context_reg(R_028008_DB_DEPTH_VIEW, zb->db_depth_view); + gfx11_set_context_reg(R_028ABC_DB_HTILE_SURFACE, db_htile_surface); + } else if (sctx->framebuffer.dirty_zsbuf) { + /* Gfx11+: DB_Z_INFO.NUM_SAMPLES should match the framebuffer samples if no Z/S is bound. + * It determines the sample count for VRS, primitive-ordered pixel shading, and occlusion + * queries. 
+ */ + gfx11_set_context_reg(R_028040_DB_Z_INFO, + S_028040_FORMAT(V_028040_Z_INVALID) | + S_028040_NUM_SAMPLES(sctx->framebuffer.log_samples)); + gfx11_set_context_reg(R_028044_DB_STENCIL_INFO, S_028044_FORMAT(V_028044_STENCIL_INVALID)); + } + + /* Framebuffer dimensions. */ + /* PA_SC_WINDOW_SCISSOR_TL is set to 0,0 in gfx*_init_gfx_preamble_state */ + gfx11_set_context_reg(R_028208_PA_SC_WINDOW_SCISSOR_BR, + S_028208_BR_X(state->width) | S_028208_BR_Y(state->height)); + gfx11_end_packed_context_regs(); + + if (sctx->screen->dpbb_allowed && + sctx->screen->pbb_context_states_per_bin > 1) { + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); + } radeon_end(); + + si_update_display_dcc_dirty(sctx); + + sctx->framebuffer.dirty_cbufs = 0; + sctx->framebuffer.dirty_zsbuf = false; } static bool si_out_of_order_rasterization(struct si_context *sctx) @@ -3459,7 +3958,7 @@ static bool si_out_of_order_rasterization(struct si_context *sctx) struct si_state_blend *blend = sctx->queued.named.blend; struct si_state_dsa *dsa = sctx->queued.named.dsa; - if (!sctx->screen->has_out_of_order_rast) + if (!sctx->screen->info.has_out_of_order_rast) return false; unsigned colormask = sctx->framebuffer.colorbuf_enabled_4bit; @@ -3471,8 +3970,7 @@ static bool si_out_of_order_rasterization(struct si_context *sctx) return false; struct si_dsa_order_invariance dsa_order_invariant = {.zs = true, - .pass_set = true, - .pass_last = false}; + .pass_set = true}; if (sctx->framebuffer.state.zsbuf) { struct si_texture *zstex = (struct si_texture *)sctx->framebuffer.state.zsbuf->texture; @@ -3488,7 +3986,8 @@ static bool si_out_of_order_rasterization(struct si_context *sctx) !dsa_order_invariant.pass_set) return false; - if (sctx->num_perfect_occlusion_queries != 0 && !dsa_order_invariant.pass_set) + if (sctx->occlusion_query_mode == SI_OCCLUSION_QUERY_MODE_PRECISE_INTEGER && + !dsa_order_invariant.pass_set) return false; } @@ -3506,15 
+4005,13 @@ static bool si_out_of_order_rasterization(struct si_context *sctx) return false; } - if (colormask & ~blendmask) { - if (!dsa_order_invariant.pass_last) - return false; - } + if (colormask & ~blendmask) + return false; return true; } -static void si_emit_msaa_config(struct si_context *sctx) +static void si_emit_msaa_config(struct si_context *sctx, unsigned index) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; unsigned num_tile_pipes = sctx->screen->info.num_tile_pipes; @@ -3531,8 +4028,8 @@ static void si_emit_msaa_config(struct si_context *sctx) S_028A4C_TILE_WALK_ORDER_ENABLE(1) | S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(1) | S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) | S_028A4C_FORCE_EOV_REZ_ENABLE(1); unsigned db_eqaa = S_028804_HIGH_QUALITY_INTERSECTIONS(1) | S_028804_INCOHERENT_EQAA_READS(1) | - S_028804_INTERPOLATE_COMP_Z(1) | S_028804_STATIC_ANCHOR_ASSOCIATIONS(1); - unsigned coverage_samples, color_samples, z_samples; + S_028804_STATIC_ANCHOR_ASSOCIATIONS(1); + unsigned coverage_samples, z_samples; struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; /* S: Coverage samples (up to 16x): @@ -3576,49 +4073,45 @@ static void si_emit_msaa_config(struct si_context *sctx) * EQAA 4s 4z 2f - might look the same as 4x MSAA with low-density geometry * EQAA 2s 2z 2f = 2x MSAA */ - coverage_samples = color_samples = z_samples = si_get_num_coverage_samples(sctx); + coverage_samples = si_get_num_coverage_samples(sctx); + + /* DCC_DECOMPRESS and ELIMINATE_FAST_CLEAR require MSAA_NUM_SAMPLES=0. */ + if (sctx->gfx_level >= GFX11 && sctx->gfx11_force_msaa_num_samples_zero) + coverage_samples = 1; + + /* The DX10 diamond test is not required by GL and decreases line rasterization + * performance, so don't use it. 
+ */ + unsigned sc_line_cntl = 0; + unsigned sc_aa_config = 0; + + if (coverage_samples > 1 && (rs->multisample_enable || + sctx->smoothing_enabled)) { + unsigned log_samples = util_logbase2(coverage_samples); - if (sctx->framebuffer.nr_samples > 1 && rs->multisample_enable) { - color_samples = sctx->framebuffer.nr_color_samples; + sc_line_cntl |= S_028BDC_EXPAND_LINE_WIDTH(1) | + S_028BDC_PERPENDICULAR_ENDCAP_ENA(rs->perpendicular_end_caps) | + S_028BDC_EXTRA_DX_DY_PRECISION(rs->perpendicular_end_caps && + (sctx->family == CHIP_VEGA20 || + sctx->gfx_level >= GFX10)); + sc_aa_config = S_028BE0_MSAA_NUM_SAMPLES(log_samples) | + S_028BE0_MAX_SAMPLE_DIST(si_msaa_max_distance[log_samples]) | + S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples) | + S_028BE0_COVERED_CENTROID_IS_CENTER(sctx->gfx_level >= GFX10_3); + } + if (sctx->framebuffer.nr_samples > 1 || + sctx->smoothing_enabled) { if (sctx->framebuffer.state.zsbuf) { z_samples = sctx->framebuffer.state.zsbuf->texture->nr_samples; z_samples = MAX2(1, z_samples); } else { z_samples = coverage_samples; } - } - - /* Required by OpenGL line rasterization. - * - * TODO: We should also enable perpendicular endcaps for AA lines, - * but that requires implementing line stippling in the pixel - * shader. SC can only do line stippling with axis-aligned - * endcaps. 
- */ - unsigned sc_line_cntl = S_028BDC_DX10_DIAMOND_TEST_ENA(1); - unsigned sc_aa_config = 0; - - if (coverage_samples > 1) { - /* distance from the pixel center, indexed by log2(nr_samples) */ - static unsigned max_dist[] = { - 0, /* unused */ - 4, /* 2x MSAA */ - 6, /* 4x MSAA */ - 7, /* 8x MSAA */ - 8, /* 16x MSAA */ - }; unsigned log_samples = util_logbase2(coverage_samples); unsigned log_z_samples = util_logbase2(z_samples); unsigned ps_iter_samples = si_get_ps_iter_samples(sctx); unsigned log_ps_iter_samples = util_logbase2(ps_iter_samples); - - sc_line_cntl |= S_028BDC_EXPAND_LINE_WIDTH(1); - sc_aa_config = S_028BE0_MSAA_NUM_SAMPLES(log_samples) | - S_028BE0_MAX_SAMPLE_DIST(max_dist[log_samples]) | - S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples) | - S_028BE0_COVERED_CENTROID_IS_CENTER(sctx->chip_class >= GFX10_3); - if (sctx->framebuffer.nr_samples > 1) { db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) | S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) | @@ -3630,17 +4123,27 @@ static void si_emit_msaa_config(struct si_context *sctx) } } - radeon_begin(cs); - - /* R_028BDC_PA_SC_LINE_CNTL, R_028BE0_PA_SC_AA_CONFIG */ - radeon_opt_set_context_reg2(sctx, R_028BDC_PA_SC_LINE_CNTL, SI_TRACKED_PA_SC_LINE_CNTL, - sc_line_cntl, sc_aa_config); - /* R_028804_DB_EQAA */ - radeon_opt_set_context_reg(sctx, R_028804_DB_EQAA, SI_TRACKED_DB_EQAA, db_eqaa); - /* R_028A4C_PA_SC_MODE_CNTL_1 */ - radeon_opt_set_context_reg(sctx, R_028A4C_PA_SC_MODE_CNTL_1, SI_TRACKED_PA_SC_MODE_CNTL_1, - sc_mode_cntl_1); - radeon_end_update_context_roll(sctx); + if (sctx->screen->info.has_set_context_pairs_packed) { + radeon_begin(cs); + gfx11_begin_packed_context_regs(); + gfx11_opt_set_context_reg(R_028BDC_PA_SC_LINE_CNTL, SI_TRACKED_PA_SC_LINE_CNTL, + sc_line_cntl); + gfx11_opt_set_context_reg(R_028BE0_PA_SC_AA_CONFIG, SI_TRACKED_PA_SC_AA_CONFIG, + sc_aa_config); + gfx11_opt_set_context_reg(R_028804_DB_EQAA, SI_TRACKED_DB_EQAA, db_eqaa); + 
gfx11_opt_set_context_reg(R_028A4C_PA_SC_MODE_CNTL_1, SI_TRACKED_PA_SC_MODE_CNTL_1, + sc_mode_cntl_1); + gfx11_end_packed_context_regs(); + radeon_end(); /* don't track context rolls on GFX11 */ + } else { + radeon_begin(cs); + radeon_opt_set_context_reg2(sctx, R_028BDC_PA_SC_LINE_CNTL, SI_TRACKED_PA_SC_LINE_CNTL, + sc_line_cntl, sc_aa_config); + radeon_opt_set_context_reg(sctx, R_028804_DB_EQAA, SI_TRACKED_DB_EQAA, db_eqaa); + radeon_opt_set_context_reg(sctx, R_028A4C_PA_SC_MODE_CNTL_1, SI_TRACKED_PA_SC_MODE_CNTL_1, + sc_mode_cntl_1); + radeon_end_update_context_roll(sctx); + } } void si_update_ps_iter_samples(struct si_context *sctx) @@ -3679,7 +4182,7 @@ static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples) * @param state 256-bit descriptor; only the high 128 bits are filled in */ void si_make_buffer_descriptor(struct si_screen *screen, struct si_resource *buf, - enum pipe_format format, unsigned offset, unsigned size, + enum pipe_format format, unsigned offset, unsigned num_elements, uint32_t *state) { const struct util_format_description *desc; @@ -3689,7 +4192,7 @@ void si_make_buffer_descriptor(struct si_screen *screen, struct si_resource *buf desc = util_format_description(format); stride = desc->block.bits / 8; - num_records = size / stride; + num_records = num_elements; num_records = MIN2(num_records, (buf->b.b.width0 - offset) / stride); /* The NUM_RECORDS field has a different meaning depending on the chip, @@ -3715,7 +4218,7 @@ void si_make_buffer_descriptor(struct si_screen *screen, struct si_resource *buf * - For VMEM and inst.IDXEN == 0 or STRIDE == 0, it's in byte units. * - For VMEM and inst.IDXEN == 1 and STRIDE != 0, it's in units of STRIDE. 
*/ - if (screen->info.chip_class == GFX8) + if (screen->info.gfx_level == GFX8) num_records *= stride; state[4] = 0; @@ -3726,19 +4229,32 @@ void si_make_buffer_descriptor(struct si_screen *screen, struct si_resource *buf S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) | S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3])); - if (screen->info.chip_class >= GFX10) { - const struct gfx10_format *fmt = &gfx10_format_table[format]; + if (screen->info.gfx_level >= GFX10) { + const struct gfx10_format *fmt = &ac_get_gfx10_format_table(&screen->info)[format]; - /* OOB_SELECT chooses the out-of-bounds check: + /* OOB_SELECT chooses the out-of-bounds check. + * + * GFX10: * - 0: (index >= NUM_RECORDS) || (offset >= STRIDE) * - 1: index >= NUM_RECORDS * - 2: NUM_RECORDS == 0 - * - 3: if SWIZZLE_ENABLE == 0: offset >= NUM_RECORDS - * else: swizzle_address >= NUM_RECORDS + * - 3: if SWIZZLE_ENABLE: + * swizzle_address >= NUM_RECORDS + * else: + * offset >= NUM_RECORDS + * + * GFX11: + * - 0: (index >= NUM_RECORDS) || (offset+payload > STRIDE) + * - 1: index >= NUM_RECORDS + * - 2: NUM_RECORDS == 0 + * - 3: if SWIZZLE_ENABLE && STRIDE: + * (index >= NUM_RECORDS) || ( offset+payload > STRIDE) + * else: + * offset+payload > NUM_RECORDS */ state[7] |= S_008F0C_FORMAT(fmt->img_format) | S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) | - S_008F0C_RESOURCE_LEVEL(1); + S_008F0C_RESOURCE_LEVEL(screen->info.gfx_level < GFX11); } else { int first_non_void; unsigned num_format, data_format; @@ -3781,14 +4297,116 @@ static unsigned gfx9_border_color_swizzle(const unsigned char swizzle[4]) } /** + * Translate the parameters to an image descriptor for CDNA image emulation. + * In this function, we choose our own image descriptor format because we emulate image opcodes + * using buffer opcodes. 
+ */
+static void cdna_emu_make_image_descriptor(struct si_screen *screen, struct si_texture *tex,
+                                           bool sampler, enum pipe_texture_target target,
+                                           enum pipe_format pipe_format,
+                                           const unsigned char state_swizzle[4], unsigned first_level,
+                                           unsigned last_level, unsigned first_layer,
+                                           unsigned last_layer, unsigned width, unsigned height,
+                                           unsigned depth, uint32_t *state, uint32_t *fmask_state) /* writes state[0..7]; sampler, first_level, last_level and fmask_state are not read here */
+{
+   const struct util_format_description *desc = util_format_description(pipe_format);
+
+   /* We don't need to support these (cube maps, mipmapped or multisampled textures, non-RGB colorspaces, subsampled or compressed formats). We only need enough to support VAAPI and OpenMAX. */
+   if (target == PIPE_TEXTURE_CUBE ||
+       target == PIPE_TEXTURE_CUBE_ARRAY ||
+       tex->buffer.b.b.last_level > 0 ||
+       tex->buffer.b.b.nr_samples >= 2 ||
+       desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB ||
+       desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED ||
+       util_format_is_compressed(pipe_format)) {
+      assert(!"unexpected texture type");
+      memset(state, 0, 8 * 4); /* zero all 8 dwords of the descriptor before bailing out */
+      return;
+   }
+
+   /* Adjust the image parameters according to the texture type. */
+   switch (target) {
+   case PIPE_TEXTURE_1D:
+      height = 1;
+      FALLTHROUGH;
+   case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_RECT:
+      depth = 1;
+      break;
+
+   case PIPE_TEXTURE_1D_ARRAY:
+      height = 1;
+      FALLTHROUGH;
+   case PIPE_TEXTURE_2D_ARRAY:
+      first_layer = MIN2(first_layer, tex->buffer.b.b.array_size - 1); /* clamp to the last array layer */
+      last_layer = MIN2(last_layer, tex->buffer.b.b.array_size - 1); /* clamp to the last array layer */
+      last_layer = MAX2(last_layer, first_layer); /* keep last_layer >= first_layer */
+      depth = last_layer - first_layer + 1; /* for arrays, depth becomes the layer count */
+      break;
+
+   case PIPE_TEXTURE_3D:
+      first_layer = 0; /* the caller's depth is kept as-is */
+      break;
+
+   default:
+      unreachable("invalid texture target");
+   }
+
+   unsigned stride = desc->block.bits / 8; /* bytes per format block */
+   uint64_t num_records = tex->surface.surf_size / stride; /* surface size expressed in units of stride */
+   assert(num_records <= UINT32_MAX); /* must fit in the 32-bit state[2] below */
+
+   /* Prepare the format fields: compose the format's swizzle with state_swizzle. */
+   unsigned char swizzle[4];
+   util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle);
+
+   /* Buffer descriptor (dwords 0-3) */
+   state[0] = 0; /* NOTE(review): left zero here; presumably the address is filled in elsewhere -- confirm */
+   state[1] = S_008F04_STRIDE(stride);
+   state[2] = num_records;
+   state[3] = S_008F0C_DST_SEL_X(si_map_swizzle(swizzle[0])) |
+              S_008F0C_DST_SEL_Y(si_map_swizzle(swizzle[1])) |
+              S_008F0C_DST_SEL_Z(si_map_swizzle(swizzle[2])) |
+              S_008F0C_DST_SEL_W(si_map_swizzle(swizzle[3]));
+
+   if (screen->info.gfx_level >= GFX10) {
+      const struct gfx10_format *fmt = &ac_get_gfx10_format_table(&screen->info)[pipe_format];
+
+      state[3] |= S_008F0C_FORMAT(fmt->img_format) |
+                  S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) |
+                  S_008F0C_RESOURCE_LEVEL(screen->info.gfx_level < GFX11); /* RESOURCE_LEVEL is 1 only below GFX11 */
+   } else {
+      int first_non_void = util_format_get_first_non_void_channel(pipe_format);
+      unsigned num_format = si_translate_buffer_numformat(&screen->b, desc, first_non_void);
+      unsigned data_format = si_translate_buffer_dataformat(&screen->b, desc, first_non_void);
+
+      state[3] |= S_008F0C_NUM_FORMAT(num_format) |
+                  S_008F0C_DATA_FORMAT(data_format);
+   }
+
+   /* Additional fields used by image opcode emulation (dwords 4-7). */
+   state[4] = width | (height << 16); /* width in the low bits, height starting at bit 16 */
+   state[5] = depth | (first_layer << 16); /* depth in the low bits, first_layer starting at bit 16 */
+   state[6] = tex->surface.u.gfx9.surf_pitch;
+   state[7] = (uint32_t)tex->surface.u.gfx9.surf_pitch * tex->surface.u.gfx9.surf_height; /* surf_pitch * surf_height */
+}
+
+/**
  * Build the sampler view descriptor for a texture.
*/ static void gfx10_make_texture_descriptor( struct si_screen *screen, struct si_texture *tex, bool sampler, enum pipe_texture_target target, enum pipe_format pipe_format, const unsigned char state_swizzle[4], unsigned first_level, unsigned last_level, unsigned first_layer, unsigned last_layer, unsigned width, unsigned height, - unsigned depth, uint32_t *state, uint32_t *fmask_state) + unsigned depth, bool get_bo_metadata, uint32_t *state, uint32_t *fmask_state) { + if (!screen->info.has_image_opcodes && !get_bo_metadata) { + cdna_emu_make_image_descriptor(screen, tex, sampler, target, pipe_format, state_swizzle, + first_level, last_level, first_layer, last_layer, width, + height, depth, state, fmask_state); + return; + } + struct pipe_resource *res = &tex->buffer.b.b; const struct util_format_description *desc; unsigned img_format; @@ -3797,7 +4415,7 @@ static void gfx10_make_texture_descriptor( uint64_t va; desc = util_format_description(pipe_format); - img_format = gfx10_format_table[pipe_format].img_format; + img_format = ac_get_gfx10_format_table(&screen->info)[pipe_format].img_format; if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0}; @@ -3827,8 +4445,13 @@ static void gfx10_make_texture_descriptor( } if (tex->upgraded_depth && !is_stencil) { - assert(img_format == V_008F0C_GFX10_FORMAT_32_FLOAT); - img_format = V_008F0C_GFX10_FORMAT_32_FLOAT_CLAMP; + if (screen->info.gfx_level >= GFX11) { + assert(img_format == V_008F0C_GFX11_FORMAT_32_FLOAT); + img_format = V_008F0C_GFX11_FORMAT_32_FLOAT_CLAMP; + } else { + assert(img_format == V_008F0C_GFX10_FORMAT_32_FLOAT); + img_format = V_008F0C_GFX10_FORMAT_32_FLOAT_CLAMP; + } } } else { util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle); @@ -3855,7 +4478,8 @@ static void gfx10_make_texture_descriptor( state[0] = 0; state[1] = S_00A004_FORMAT(img_format) | S_00A004_WIDTH_LO(width - 1); state[2] = S_00A008_WIDTH_HI((width - 1) >> 2) | 
S_00A008_HEIGHT(height - 1) | - S_00A008_RESOURCE_LEVEL(1); + S_00A008_RESOURCE_LEVEL(screen->info.gfx_level < GFX11); + state[3] = S_00A00C_DST_SEL_X(si_map_swizzle(swizzle[0])) | S_00A00C_DST_SEL_Y(si_map_swizzle(swizzle[1])) | @@ -3871,9 +4495,16 @@ static void gfx10_make_texture_descriptor( S_00A010_DEPTH((type == V_008F1C_SQ_RSRC_IMG_3D && sampler) ? depth - 1 : last_layer) | S_00A010_BASE_ARRAY(first_layer); state[5] = S_00A014_ARRAY_PITCH(!!(type == V_008F1C_SQ_RSRC_IMG_3D && !sampler)) | - S_00A014_MAX_MIP(res->nr_samples > 1 ? util_logbase2(res->nr_samples) - : tex->buffer.b.b.last_level) | S_00A014_PERF_MOD(4); + + unsigned max_mip = res->nr_samples > 1 ? util_logbase2(res->nr_samples) : + tex->buffer.b.b.last_level; + + if (screen->info.gfx_level >= GFX11) { + state[1] |= S_00A004_MAX_MIP(max_mip); + } else { + state[5] |= S_00A014_MAX_MIP(max_mip); + } state[6] = 0; state[7] = 0; @@ -3960,8 +4591,16 @@ static void si_make_texture_descriptor(struct si_screen *screen, struct si_textu const unsigned char state_swizzle[4], unsigned first_level, unsigned last_level, unsigned first_layer, unsigned last_layer, unsigned width, unsigned height, - unsigned depth, uint32_t *state, uint32_t *fmask_state) + unsigned depth, bool get_bo_metadata, + uint32_t *state, uint32_t *fmask_state) { + if (!screen->info.has_image_opcodes && !get_bo_metadata) { + cdna_emu_make_image_descriptor(screen, tex, sampler, target, pipe_format, state_swizzle, + first_level, last_level, first_layer, last_layer, width, + height, depth, state, fmask_state); + return; + } + struct pipe_resource *res = &tex->buffer.b.b; const struct util_format_description *desc; unsigned char swizzle[4]; @@ -3991,7 +4630,7 @@ static void si_make_texture_descriptor(struct si_screen *screen, struct si_textu * fix texture gathers. This affects at least * GL45-CTS.texture_cube_map_array.sampling on GFX8. 
*/ - if (screen->info.chip_class <= GFX8) + if (screen->info.gfx_level <= GFX8) util_format_compose_swizzles(swizzle_wwww, state_swizzle, swizzle); else util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle); @@ -4077,11 +4716,11 @@ static void si_make_texture_descriptor(struct si_screen *screen, struct si_textu } /* S8 with Z32 HTILE needs a special format. */ - if (screen->info.chip_class == GFX9 && pipe_format == PIPE_FORMAT_S8_UINT) + if (screen->info.gfx_level == GFX9 && pipe_format == PIPE_FORMAT_S8_UINT) data_format = V_008F14_IMG_DATA_FORMAT_S8_32; if (!sampler && (res->target == PIPE_TEXTURE_CUBE || res->target == PIPE_TEXTURE_CUBE_ARRAY || - (screen->info.chip_class <= GFX8 && res->target == PIPE_TEXTURE_3D))) { + (screen->info.gfx_level <= GFX8 && res->target == PIPE_TEXTURE_3D))) { /* For the purpose of shader images, treat cube maps and 3D * textures as 2D arrays. For 3D textures, the address * calculations for mipmaps are different, so we rely on the @@ -4118,7 +4757,7 @@ static void si_make_texture_descriptor(struct si_screen *screen, struct si_textu state[6] = 0; state[7] = 0; - if (screen->info.chip_class == GFX9) { + if (screen->info.gfx_level == GFX9) { unsigned bc_swizzle = gfx9_border_color_swizzle(desc->swizzle); /* Depth is the the last accessible layer on Gfx9. @@ -4144,7 +4783,7 @@ static void si_make_texture_descriptor(struct si_screen *screen, struct si_textu /* The last dword is unused by hw. The shader uses it to clear * bits in the first dword of sampler state. 
*/ - if (screen->info.chip_class <= GFX7 && res->nr_samples <= 1) { + if (screen->info.gfx_level <= GFX7 && res->nr_samples <= 1) { if (first_level == last_level) state[7] = C_008F30_MAX_ANISO_RATIO; else @@ -4159,7 +4798,7 @@ static void si_make_texture_descriptor(struct si_screen *screen, struct si_textu va = tex->buffer.gpu_address + tex->surface.fmask_offset; #define FMASK(s, f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f))) - if (screen->info.chip_class == GFX9) { + if (screen->info.gfx_level == GFX9) { data_format = V_008F14_IMG_DATA_FORMAT_FMASK; switch (FMASK(res->nr_samples, res->nr_storage_samples)) { case FMASK(2, 1): @@ -4265,7 +4904,7 @@ static void si_make_texture_descriptor(struct si_screen *screen, struct si_textu fmask_state[6] = 0; fmask_state[7] = 0; - if (screen->info.chip_class == GFX9) { + if (screen->info.gfx_level == GFX9) { fmask_state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.color.fmask_swizzle_mode); fmask_state[4] |= S_008F20_DEPTH(last_layer) | S_008F20_PITCH(tex->surface.u.gfx9.color.fmask_epitch); @@ -4283,25 +4922,18 @@ static void si_make_texture_descriptor(struct si_screen *screen, struct si_textu /** * Create a sampler view. 
* - * @param ctx context - * @param texture texture - * @param state sampler view template - * @param width0 width0 override (for compressed textures as int) - * @param height0 height0 override (for compressed textures as int) - * @param force_level set the base address to the level (for compressed textures) + * @param ctx context + * @param texture texture + * @param state sampler view template */ -struct pipe_sampler_view *si_create_sampler_view_custom(struct pipe_context *ctx, +static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx, struct pipe_resource *texture, - const struct pipe_sampler_view *state, - unsigned width0, unsigned height0, - unsigned force_level) + const struct pipe_sampler_view *state) { struct si_context *sctx = (struct si_context *)ctx; - struct si_sampler_view *view = CALLOC_STRUCT(si_sampler_view); + struct si_sampler_view *view = CALLOC_STRUCT_CL(si_sampler_view); struct si_texture *tex = (struct si_texture *)texture; - unsigned base_level, first_level, last_level; unsigned char state_swizzle[4]; - unsigned height, depth, width; unsigned last_layer = state->u.tex.last_layer; enum pipe_format pipe_format; const struct legacy_surf_level *surflevel; @@ -4324,8 +4956,11 @@ struct pipe_sampler_view *si_create_sampler_view_custom(struct pipe_context *ctx /* Buffer resource. 
*/ if (texture->target == PIPE_BUFFER) { + uint32_t elements = si_clamp_texture_texel_count(sctx->screen->max_texel_buffer_elements, + state->format, state->u.buf.size); + si_make_buffer_descriptor(sctx->screen, si_resource(texture), state->format, - state->u.buf.offset, state->u.buf.size, view->state); + state->u.buf.offset, elements, view->state); return &view->base; } @@ -4334,23 +4969,6 @@ struct pipe_sampler_view *si_create_sampler_view_custom(struct pipe_context *ctx state_swizzle[2] = state->swizzle_b; state_swizzle[3] = state->swizzle_a; - base_level = 0; - first_level = state->u.tex.first_level; - last_level = state->u.tex.last_level; - width = width0; - height = height0; - depth = texture->depth0; - - if (sctx->chip_class <= GFX8 && force_level) { - assert(force_level == first_level && force_level == last_level); - base_level = force_level; - first_level = 0; - last_level = 0; - width = u_minify(width, force_level); - height = u_minify(height, force_level); - depth = u_minify(depth, force_level); - } - /* This is not needed if gallium frontends set last_layer correctly. 
*/ if (state->target == PIPE_TEXTURE_1D || state->target == PIPE_TEXTURE_2D || state->target == PIPE_TEXTURE_RECT || state->target == PIPE_TEXTURE_CUBE) @@ -4409,29 +5027,22 @@ struct pipe_sampler_view *si_create_sampler_view_custom(struct pipe_context *ctx vi_dcc_formats_are_incompatible(texture, state->u.tex.first_level, state->format); sctx->screen->make_texture_descriptor( - sctx->screen, tex, true, state->target, pipe_format, state_swizzle, first_level, last_level, - state->u.tex.first_layer, last_layer, width, height, depth, view->state, view->fmask_state); + sctx->screen, tex, true, state->target, pipe_format, state_swizzle, + state->u.tex.first_level, state->u.tex.last_level, + state->u.tex.first_layer, last_layer, texture->width0, texture->height0, texture->depth0, + false, view->state, view->fmask_state); - view->base_level_info = &surflevel[base_level]; - view->base_level = base_level; + view->base_level_info = &surflevel[0]; view->block_width = util_format_get_blockwidth(pipe_format); return &view->base; } -static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx, - struct pipe_resource *texture, - const struct pipe_sampler_view *state) -{ - return si_create_sampler_view_custom(ctx, texture, state, texture ? texture->width0 : 0, - texture ? texture->height0 : 0, 0); -} - static void si_sampler_view_destroy(struct pipe_context *ctx, struct pipe_sampler_view *state) { struct si_sampler_view *view = (struct si_sampler_view *)state; pipe_resource_reference(&state->texture, NULL); - FREE(view); + FREE_CL(view); } static bool wrap_mode_uses_border_color(unsigned wrap, bool linear_filter) @@ -4478,9 +5089,13 @@ static uint32_t si_translate_border_color(struct si_context *sctx, if (i >= SI_MAX_BORDER_COLORS) { /* Getting 4096 unique border colors is very unlikely. */ - fprintf(stderr, "radeonsi: The border color table is full. " - "Any new border colors will be just black. 
" - "Please file a bug.\n"); + static bool printed; + if (!printed) { + fprintf(stderr, "radeonsi: The border color table is full. " + "Any new border colors will be just black. " + "This is a hardware limitation.\n"); + printed = true; + } return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK); } @@ -4491,7 +5106,8 @@ static uint32_t si_translate_border_color(struct si_context *sctx, sctx->border_color_count++; } - return S_008F3C_BORDER_COLOR_PTR(i) | + return (sctx->screen->info.gfx_level >= GFX11 ? S_008F3C_BORDER_COLOR_PTR_GFX11(i): + S_008F3C_BORDER_COLOR_PTR_GFX6(i)) | S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_REGISTER); } @@ -4531,9 +5147,10 @@ static void *si_create_sampler_state(struct pipe_context *ctx, struct si_sampler_state *rstate = CALLOC_STRUCT(si_sampler_state); unsigned max_aniso = sscreen->force_aniso >= 0 ? sscreen->force_aniso : state->max_anisotropy; unsigned max_aniso_ratio = si_tex_aniso_filter(max_aniso); - bool trunc_coord = state->min_img_filter == PIPE_TEX_FILTER_NEAREST && - state->mag_img_filter == PIPE_TEX_FILTER_NEAREST && - state->compare_mode == PIPE_TEX_COMPARE_NONE; + bool trunc_coord = (state->min_img_filter == PIPE_TEX_FILTER_NEAREST && + state->mag_img_filter == PIPE_TEX_FILTER_NEAREST && + state->compare_mode == PIPE_TEX_COMPARE_NONE) || + sscreen->info.conformant_trunc_coord; union pipe_color_union clamped_border_color; if (!rstate) { @@ -4557,29 +5174,29 @@ static void *si_create_sampler_state(struct pipe_context *ctx, rstate->val[0] = (S_008F30_CLAMP_X(si_tex_wrap(state->wrap_s)) | S_008F30_CLAMP_Y(si_tex_wrap(state->wrap_t)) | S_008F30_CLAMP_Z(si_tex_wrap(state->wrap_r)) | S_008F30_MAX_ANISO_RATIO(max_aniso_ratio) | - S_008F30_DEPTH_COMPARE_FUNC(si_tex_compare(state->compare_func)) | - S_008F30_FORCE_UNNORMALIZED(!state->normalized_coords) | + S_008F30_DEPTH_COMPARE_FUNC(si_tex_compare(state->compare_mode, state->compare_func)) | + S_008F30_FORCE_UNNORMALIZED(state->unnormalized_coords) 
| S_008F30_ANISO_THRESHOLD(max_aniso_ratio >> 1) | S_008F30_ANISO_BIAS(max_aniso_ratio) | S_008F30_DISABLE_CUBE_WRAP(!state->seamless_cube_map) | - S_008F30_TRUNC_COORD(trunc_coord) | - S_008F30_COMPAT_MODE(sctx->chip_class == GFX8 || sctx->chip_class == GFX9)); + S_008F30_TRUNC_COORD(trunc_coord)); rstate->val[1] = (S_008F34_MIN_LOD(S_FIXED(CLAMP(state->min_lod, 0, 15), 8)) | S_008F34_MAX_LOD(S_FIXED(CLAMP(state->max_lod, 0, 15), 8)) | S_008F34_PERF_MIP(max_aniso_ratio ? max_aniso_ratio + 6 : 0)); - rstate->val[2] = (S_008F38_LOD_BIAS(S_FIXED(CLAMP(state->lod_bias, -16, 16), 8)) | - S_008F38_XY_MAG_FILTER(si_tex_filter(state->mag_img_filter, max_aniso)) | + rstate->val[2] = (S_008F38_XY_MAG_FILTER(si_tex_filter(state->mag_img_filter, max_aniso)) | S_008F38_XY_MIN_FILTER(si_tex_filter(state->min_img_filter, max_aniso)) | - S_008F38_MIP_FILTER(si_tex_mipfilter(state->min_mip_filter)) | - S_008F38_MIP_POINT_PRECLAMP(0)); + S_008F38_MIP_FILTER(si_tex_mipfilter(state->min_mip_filter))); rstate->val[3] = si_translate_border_color(sctx, state, &state->border_color, state->border_color_is_integer); - if (sscreen->info.chip_class >= GFX10) { - rstate->val[2] |= S_008F38_ANISO_OVERRIDE_GFX10(1); + if (sscreen->info.gfx_level >= GFX10) { + rstate->val[2] |= S_008F38_LOD_BIAS(S_FIXED(CLAMP(state->lod_bias, -32, 31), 8)) | + S_008F38_ANISO_OVERRIDE_GFX10(1); } else { - rstate->val[2] |= S_008F38_DISABLE_LSB_CEIL(sctx->chip_class <= GFX8) | + rstate->val[0] |= S_008F30_COMPAT_MODE(sctx->gfx_level >= GFX8); + rstate->val[2] |= S_008F38_LOD_BIAS(S_FIXED(CLAMP(state->lod_bias, -16, 16), 8)) | + S_008F38_DISABLE_LSB_CEIL(sctx->gfx_level <= GFX8) | S_008F38_FILTER_PREC_FIX(1) | - S_008F38_ANISO_OVERRIDE_GFX8(sctx->chip_class >= GFX8); + S_008F38_ANISO_OVERRIDE_GFX8(sctx->gfx_level >= GFX8); } /* Create sampler resource for upgraded depth textures. 
*/ @@ -4592,7 +5209,7 @@ static void *si_create_sampler_state(struct pipe_context *ctx, } if (memcmp(&state->border_color, &clamped_border_color, sizeof(clamped_border_color)) == 0) { - if (sscreen->info.chip_class <= GFX9) + if (sscreen->info.gfx_level <= GFX9) rstate->upgraded_depth_val[3] |= S_008F3C_UPGRADED_DEPTH(1); } else { rstate->upgraded_depth_val[3] = @@ -4613,7 +5230,7 @@ static void si_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask) si_mark_atom_dirty(sctx, &sctx->atoms.s.sample_mask); } -static void si_emit_sample_mask(struct si_context *sctx) +static void si_emit_sample_mask(struct si_context *sctx, unsigned index) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; unsigned mask = sctx->sample_mask; @@ -4626,9 +5243,9 @@ static void si_emit_sample_mask(struct si_context *sctx) (mask & 1 && sctx->blitter_running)); radeon_begin(cs); - radeon_set_context_reg_seq(cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2); - radeon_emit(cs, mask | (mask << 16)); - radeon_emit(cs, mask | (mask << 16)); + radeon_set_context_reg_seq(R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2); + radeon_emit(mask | (mask << 16)); + radeon_emit(mask | (mask << 16)); radeon_end(); } @@ -4664,8 +5281,16 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count, const struct pipe_vertex_element *elements) { struct si_screen *sscreen = (struct si_screen *)ctx->screen; + + if (sscreen->debug_flags & DBG(VERTEX_ELEMENTS)) { + for (int i = 0; i < count; ++i) { + const struct pipe_vertex_element *e = elements + i; + fprintf(stderr, "elements[%d]: offset %2d, buffer_index %d, dual_slot %d, format %3d, divisor %u\n", + i, e->src_offset, e->vertex_buffer_index, e->dual_slot, e->src_format, e->instance_divisor); + } + } + struct si_vertex_elements *v = CALLOC_STRUCT(si_vertex_elements); - bool used[SI_NUM_VERTEX_BUFFERS] = {}; struct si_fast_udiv_info32 divisor_factors[SI_MAX_ATTRIBS] = {}; STATIC_ASSERT(sizeof(struct si_fast_udiv_info32) == 16); 
STATIC_ASSERT(sizeof(divisor_factors[0].multiplier) == 4); @@ -4706,17 +5331,13 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count, } } - if (!used[vbo_index]) { - v->first_vb_use_mask |= 1 << i; - used[vbo_index] = true; - } - desc = util_format_description(elements[i].src_format); first_non_void = util_format_get_first_non_void_channel(elements[i].src_format); channel = first_non_void >= 0 ? &desc->channel[first_non_void] : NULL; - v->format_size[i] = desc->block.bits / 8; - v->src_offset[i] = elements[i].src_offset; + v->elem[i].format_size = desc->block.bits / 8; + v->elem[i].src_offset = elements[i].src_offset; + v->elem[i].stride = elements[i].src_stride; v->vertex_buffer_index[i] = vbo_index; bool always_fix = false; @@ -4773,7 +5394,7 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count, * unsigned, so a shader workaround is needed. The affected * chips are GFX8 and older except Stoney (GFX8.1). */ - always_fix = sscreen->info.chip_class <= GFX8 && sscreen->info.family != CHIP_STONEY && + always_fix = sscreen->info.gfx_level <= GFX8 && sscreen->info.family != CHIP_STONEY && channel->type == UTIL_FORMAT_TYPE_SIGNED; } else if (elements[i].src_format == PIPE_FORMAT_R11G11B10_FLOAT) { fix_fetch.u.log_size = 3; /* special encoding */ @@ -4818,10 +5439,11 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count, */ bool check_alignment = log_hw_load_size >= 1 && - (sscreen->info.chip_class == GFX6 || sscreen->info.chip_class >= GFX10); + (sscreen->info.gfx_level == GFX6 || sscreen->info.gfx_level >= GFX10); bool opencode = sscreen->options.vs_fetch_always_opencode; - if (check_alignment && (elements[i].src_offset & ((1 << log_hw_load_size) - 1)) != 0) + if (check_alignment && ((elements[i].src_offset & ((1 << log_hw_load_size) - 1)) != 0 || + elements[i].src_stride & 3)) opencode = true; if (always_fix || check_alignment || opencode) @@ -4840,20 +5462,30 @@ static void 
*si_create_vertex_elements(struct pipe_context *ctx, unsigned count, v->vb_alignment_check_mask |= 1 << vbo_index; } - v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) | - S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) | - S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) | - S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3])); - - if (sscreen->info.chip_class >= GFX10) { - const struct gfx10_format *fmt = &gfx10_format_table[elements[i].src_format]; - assert(fmt->img_format != 0 && fmt->img_format < 128); - v->rsrc_word3[i] |= S_008F0C_FORMAT(fmt->img_format) | S_008F0C_RESOURCE_LEVEL(1); + v->elem[i].rsrc_word3 = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) | + S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) | + S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) | + S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3])); + + if (sscreen->info.gfx_level >= GFX10) { + const struct gfx10_format *fmt = &ac_get_gfx10_format_table(&sscreen->info)[elements[i].src_format]; + ASSERTED unsigned last_vertex_format = sscreen->info.gfx_level >= GFX11 ? 64 : 128; + assert(fmt->img_format != 0 && fmt->img_format < last_vertex_format); + v->elem[i].rsrc_word3 |= + S_008F0C_FORMAT(fmt->img_format) | + S_008F0C_RESOURCE_LEVEL(sscreen->info.gfx_level < GFX11) | + /* OOB_SELECT chooses the out-of-bounds check: + * - 1: index >= NUM_RECORDS (Structured) + * - 3: offset >= NUM_RECORDS (Raw) + */ + S_008F0C_OOB_SELECT(v->elem[i].stride ? 
V_008F0C_OOB_SELECT_STRUCTURED + : V_008F0C_OOB_SELECT_RAW); } else { unsigned data_format, num_format; data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void); num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void); - v->rsrc_word3[i] |= S_008F0C_NUM_FORMAT(num_format) | S_008F0C_DATA_FORMAT(data_format); + v->elem[i].rsrc_word3 |= S_008F0C_NUM_FORMAT(num_format) | + S_008F0C_DATA_FORMAT(data_format); } } @@ -4884,14 +5516,7 @@ static void si_bind_vertex_elements(struct pipe_context *ctx, void *state) sctx->vertex_elements = v; sctx->num_vertex_elements = v->count; - - if (sctx->num_vertex_elements) { - sctx->vertex_buffers_dirty = true; - } else { - sctx->vertex_buffers_dirty = false; - sctx->vertex_buffer_pointer_dirty = false; - sctx->vertex_buffer_user_sgprs_dirty = false; - } + sctx->vertex_buffers_dirty = sctx->num_vertex_elements > 0; if (old->instance_divisor_is_one != v->instance_divisor_is_one || old->instance_divisor_is_fetched != v->instance_divisor_is_fetched || @@ -4934,68 +5559,44 @@ static void si_delete_vertex_element(struct pipe_context *ctx, void *state) FREE(state); } -static void si_set_vertex_buffers(struct pipe_context *ctx, unsigned start_slot, unsigned count, - unsigned unbind_num_trailing_slots, bool take_ownership, +static void si_set_vertex_buffers(struct pipe_context *ctx, unsigned count, const struct pipe_vertex_buffer *buffers) { struct si_context *sctx = (struct si_context *)ctx; - struct pipe_vertex_buffer *dst = sctx->vertex_buffer + start_slot; - unsigned updated_mask = u_bit_consecutive(start_slot, count + unbind_num_trailing_slots); - uint32_t orig_unaligned = sctx->vertex_buffer_unaligned; uint32_t unaligned = 0; - int i; + unsigned i; - assert(start_slot + count + unbind_num_trailing_slots <= ARRAY_SIZE(sctx->vertex_buffer)); + assert(count <= ARRAY_SIZE(sctx->vertex_buffer)); + assert(!count || buffers); - if (buffers) { - if (take_ownership) { - for (i = 0; i < count; i++) { - 
const struct pipe_vertex_buffer *src = buffers + i; - struct pipe_vertex_buffer *dsti = dst + i; - struct pipe_resource *buf = src->buffer.resource; - unsigned slot_bit = 1 << (start_slot + i); + for (i = 0; i < count; i++) { + const struct pipe_vertex_buffer *src = buffers + i; + struct pipe_vertex_buffer *dst = sctx->vertex_buffer + i; + struct pipe_resource *buf = src->buffer.resource; - /* Only unreference bound vertex buffers. (take_ownership) */ - pipe_resource_reference(&dsti->buffer.resource, NULL); + dst->buffer_offset = src->buffer_offset; - if (src->buffer_offset & 3 || src->stride & 3) - unaligned |= slot_bit; + /* Only unreference bound vertex buffers. */ + pipe_resource_reference(&dst->buffer.resource, NULL); + dst->buffer.resource = src->buffer.resource; - si_context_add_resource_size(sctx, buf); - if (buf) - si_resource(buf)->bind_history |= PIPE_BIND_VERTEX_BUFFER; - } - /* take_ownership allows us to copy pipe_resource pointers without refcounting. */ - memcpy(dst, buffers, count * sizeof(struct pipe_vertex_buffer)); - } else { - for (i = 0; i < count; i++) { - const struct pipe_vertex_buffer *src = buffers + i; - struct pipe_vertex_buffer *dsti = dst + i; - struct pipe_resource *buf = src->buffer.resource; - unsigned slot_bit = 1 << (start_slot + i); - - pipe_resource_reference(&dsti->buffer.resource, buf); - dsti->buffer_offset = src->buffer_offset; - dsti->stride = src->stride; - - if (dsti->buffer_offset & 3 || dsti->stride & 3) - unaligned |= slot_bit; - - si_context_add_resource_size(sctx, buf); - if (buf) - si_resource(buf)->bind_history |= PIPE_BIND_VERTEX_BUFFER; - } + if (src->buffer_offset & 3) + unaligned |= BITFIELD_BIT(i); + + if (buf) { + si_resource(buf)->bind_history |= SI_BIND_VERTEX_BUFFER; + radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(buf), + RADEON_USAGE_READ | RADEON_PRIO_VERTEX_BUFFER); } - } else { - for (i = 0; i < count; i++) - pipe_resource_reference(&dst[i].buffer.resource, NULL); } - for (i = 0; i < 
unbind_num_trailing_slots; i++) - pipe_resource_reference(&dst[count + i].buffer.resource, NULL); + unsigned last_count = sctx->num_vertex_buffers; + for (; i < last_count; i++) + pipe_resource_reference(&sctx->vertex_buffer[i].buffer.resource, NULL); + sctx->num_vertex_buffers = count; sctx->vertex_buffers_dirty = sctx->num_vertex_elements > 0; - sctx->vertex_buffer_unaligned = (orig_unaligned & ~updated_mask) | unaligned; + sctx->vertex_buffer_unaligned = unaligned; /* Check whether alignment may have changed in a way that requires * shader changes. This check is conservative: a vertex buffer can only @@ -5004,13 +5605,84 @@ static void si_set_vertex_buffers(struct pipe_context *ctx, unsigned start_slot, * whether buffers are at least dword-aligned, since that should always * be the case in well-behaved applications anyway. */ - if ((sctx->vertex_elements->vb_alignment_check_mask & - (unaligned | orig_unaligned) & updated_mask)) { + if (sctx->vertex_elements->vb_alignment_check_mask & unaligned) { si_vs_key_update_inputs(sctx); sctx->do_update_shaders = true; } } +static struct pipe_vertex_state * +si_create_vertex_state(struct pipe_screen *screen, + struct pipe_vertex_buffer *buffer, + const struct pipe_vertex_element *elements, + unsigned num_elements, + struct pipe_resource *indexbuf, + uint32_t full_velem_mask) +{ + struct si_screen *sscreen = (struct si_screen *)screen; + struct si_vertex_state *state = CALLOC_STRUCT(si_vertex_state); + + util_init_pipe_vertex_state(screen, buffer, elements, num_elements, indexbuf, full_velem_mask, + &state->b); + + /* Initialize the vertex element state in state->element. + * Do it by creating a vertex element state object and copying it there. 
+ */ + struct si_context ctx = {}; + ctx.b.screen = screen; + struct si_vertex_elements *velems = si_create_vertex_elements(&ctx.b, num_elements, elements); + state->velems = *velems; + si_delete_vertex_element(&ctx.b, velems); + + assert(!state->velems.instance_divisor_is_one); + assert(!state->velems.instance_divisor_is_fetched); + assert(!state->velems.fix_fetch_always); + assert(buffer->buffer_offset % 4 == 0); + assert(!buffer->is_user_buffer); + for (unsigned i = 0; i < num_elements; i++) { + assert(elements[i].src_offset % 4 == 0); + assert(!elements[i].dual_slot); + assert(elements[i].src_stride % 4 == 0); + } + + for (unsigned i = 0; i < num_elements; i++) { + si_set_vertex_buffer_descriptor(sscreen, &state->velems, &state->b.input.vbuffer, i, + &state->descriptors[i * 4]); + } + + return &state->b; +} + +static void si_vertex_state_destroy(struct pipe_screen *screen, + struct pipe_vertex_state *state) +{ + pipe_vertex_buffer_unreference(&state->input.vbuffer); + pipe_resource_reference(&state->input.indexbuf, NULL); + FREE(state); +} + +static struct pipe_vertex_state * +si_pipe_create_vertex_state(struct pipe_screen *screen, + struct pipe_vertex_buffer *buffer, + const struct pipe_vertex_element *elements, + unsigned num_elements, + struct pipe_resource *indexbuf, + uint32_t full_velem_mask) +{ + struct si_screen *sscreen = (struct si_screen *)screen; + + return util_vertex_state_cache_get(screen, buffer, elements, num_elements, indexbuf, + full_velem_mask, &sscreen->vertex_state_cache); +} + +static void si_pipe_vertex_state_destroy(struct pipe_screen *screen, + struct pipe_vertex_state *state) +{ + struct si_screen *sscreen = (struct si_screen *)screen; + + util_vertex_state_destroy(screen, &sscreen->vertex_state_cache, state); +} + /* * Misc */ @@ -5033,13 +5705,6 @@ static void si_set_tess_state(struct pipe_context *ctx, const float default_oute si_set_internal_const_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS, &cb); } -static void 
si_set_patch_vertices(struct pipe_context *ctx, uint8_t patch_vertices) -{ - struct si_context *sctx = (struct si_context *)ctx; - - sctx->patch_vertices = patch_vertices; -} - static void si_texture_barrier(struct pipe_context *ctx, unsigned flags) { struct si_context *sctx = (struct si_context *)ctx; @@ -5076,13 +5741,17 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags) * automatically at end of shader, but the contents of other * L1 caches might still be stale. */ sctx->flags |= SI_CONTEXT_INV_VCACHE; + + if (flags & (PIPE_BARRIER_IMAGE | PIPE_BARRIER_TEXTURE) && + sctx->screen->info.tcc_rb_non_coherent) + sctx->flags |= SI_CONTEXT_INV_L2; } if (flags & PIPE_BARRIER_INDEX_BUFFER) { /* Indices are read through TC L2 since GFX8. * L1 isn't used. */ - if (sctx->screen->info.chip_class <= GFX7) + if (sctx->screen->info.gfx_level <= GFX7) sctx->flags |= SI_CONTEXT_WB_L2; } @@ -5092,13 +5761,15 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags) if (flags & PIPE_BARRIER_FRAMEBUFFER && sctx->framebuffer.uncompressed_cb_mask) { sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB; - if (sctx->chip_class <= GFX8) + if (sctx->gfx_level <= GFX8) sctx->flags |= SI_CONTEXT_WB_L2; } /* Indirect buffers use TC L2 on GFX9, but not older hw. 
*/ - if (sctx->screen->info.chip_class <= GFX8 && flags & PIPE_BARRIER_INDIRECT_BUFFER) + if (sctx->screen->info.gfx_level <= GFX8 && flags & PIPE_BARRIER_INDIRECT_BUFFER) sctx->flags |= SI_CONTEXT_WB_L2; + + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); } static void *si_create_blend_custom(struct si_context *sctx, unsigned mode) @@ -5111,6 +5782,11 @@ static void *si_create_blend_custom(struct si_context *sctx, unsigned mode) return si_create_blend_state_mode(&sctx->b, &blend, mode); } +static void si_emit_cache_flush_state(struct si_context *sctx, unsigned index) +{ + sctx->emit_cache_flush(sctx, &sctx->gfx_cs); +} + void si_init_state_compute_functions(struct si_context *sctx) { sctx->b.create_sampler_state = si_create_sampler_state; @@ -5122,8 +5798,22 @@ void si_init_state_compute_functions(struct si_context *sctx) void si_init_state_functions(struct si_context *sctx) { - sctx->atoms.s.framebuffer.emit = si_emit_framebuffer_state; - sctx->atoms.s.msaa_sample_locs.emit = si_emit_msaa_sample_locs; + sctx->atoms.s.pm4_states[SI_STATE_IDX(blend)].emit = si_pm4_emit_state; + sctx->atoms.s.pm4_states[SI_STATE_IDX(rasterizer)].emit = si_pm4_emit_rasterizer; + sctx->atoms.s.pm4_states[SI_STATE_IDX(dsa)].emit = si_pm4_emit_dsa; + sctx->atoms.s.pm4_states[SI_STATE_IDX(sqtt_pipeline)].emit = si_pm4_emit_state; + sctx->atoms.s.pm4_states[SI_STATE_IDX(ls)].emit = si_pm4_emit_shader; + sctx->atoms.s.pm4_states[SI_STATE_IDX(hs)].emit = si_pm4_emit_shader; + sctx->atoms.s.pm4_states[SI_STATE_IDX(es)].emit = si_pm4_emit_shader; + sctx->atoms.s.pm4_states[SI_STATE_IDX(gs)].emit = si_pm4_emit_shader; + sctx->atoms.s.pm4_states[SI_STATE_IDX(vs)].emit = si_pm4_emit_shader; + sctx->atoms.s.pm4_states[SI_STATE_IDX(ps)].emit = si_pm4_emit_shader; + + if (sctx->screen->info.has_set_context_pairs_packed) + sctx->atoms.s.framebuffer.emit = gfx11_dgpu_emit_framebuffer_state; + else + sctx->atoms.s.framebuffer.emit = si_emit_framebuffer_state; + sctx->atoms.s.db_render_state.emit 
= si_emit_db_render_state; sctx->atoms.s.dpbb_state.emit = si_emit_dpbb_state; sctx->atoms.s.msaa_config.emit = si_emit_msaa_config; @@ -5133,6 +5823,7 @@ void si_init_state_functions(struct si_context *sctx) sctx->atoms.s.clip_regs.emit = si_emit_clip_regs; sctx->atoms.s.clip_state.emit = si_emit_clip_state; sctx->atoms.s.stencil_ref.emit = si_emit_stencil_ref; + sctx->atoms.s.cache_flush.emit = si_emit_cache_flush_state; sctx->b.create_blend_state = si_create_blend_state; sctx->b.bind_blend_state = si_bind_blend_state; @@ -5148,11 +5839,18 @@ void si_init_state_functions(struct si_context *sctx) sctx->b.delete_depth_stencil_alpha_state = si_delete_dsa_state; sctx->custom_dsa_flush = si_create_db_flush_dsa(sctx); - sctx->custom_blend_resolve = si_create_blend_custom(sctx, V_028808_CB_RESOLVE); - sctx->custom_blend_fmask_decompress = si_create_blend_custom(sctx, V_028808_CB_FMASK_DECOMPRESS); - sctx->custom_blend_eliminate_fastclear = - si_create_blend_custom(sctx, V_028808_CB_ELIMINATE_FAST_CLEAR); - sctx->custom_blend_dcc_decompress = si_create_blend_custom(sctx, V_028808_CB_DCC_DECOMPRESS); + + if (sctx->gfx_level < GFX11) { + sctx->custom_blend_resolve = si_create_blend_custom(sctx, V_028808_CB_RESOLVE); + sctx->custom_blend_fmask_decompress = si_create_blend_custom(sctx, V_028808_CB_FMASK_DECOMPRESS); + sctx->custom_blend_eliminate_fastclear = + si_create_blend_custom(sctx, V_028808_CB_ELIMINATE_FAST_CLEAR); + } + + sctx->custom_blend_dcc_decompress = + si_create_blend_custom(sctx, sctx->gfx_level >= GFX11 ? 
+ V_028808_CB_DCC_DECOMPRESS_GFX11 : + V_028808_CB_DCC_DECOMPRESS_GFX8); sctx->b.set_clip_state = si_set_clip_state; sctx->b.set_stencil_ref = si_set_stencil_ref; @@ -5169,7 +5867,6 @@ void si_init_state_functions(struct si_context *sctx) sctx->b.texture_barrier = si_texture_barrier; sctx->b.set_min_samples = si_set_min_samples; sctx->b.set_tess_state = si_set_tess_state; - sctx->b.set_patch_vertices = si_set_patch_vertices; sctx->b.set_active_query_state = si_set_active_query_state; } @@ -5177,17 +5874,21 @@ void si_init_state_functions(struct si_context *sctx) void si_init_screen_state_functions(struct si_screen *sscreen) { sscreen->b.is_format_supported = si_is_format_supported; + sscreen->b.create_vertex_state = si_pipe_create_vertex_state; + sscreen->b.vertex_state_destroy = si_pipe_vertex_state_destroy; - if (sscreen->info.chip_class >= GFX10) { + if (sscreen->info.gfx_level >= GFX10) sscreen->make_texture_descriptor = gfx10_make_texture_descriptor; - } else { + else sscreen->make_texture_descriptor = si_make_texture_descriptor; - } + + util_vertex_state_cache_init(&sscreen->vertex_state_cache, + si_create_vertex_state, si_vertex_state_destroy); } static void si_set_grbm_gfx_index(struct si_context *sctx, struct si_pm4_state *pm4, unsigned value) { - unsigned reg = sctx->chip_class >= GFX7 ? R_030800_GRBM_GFX_INDEX : R_00802C_GRBM_GFX_INDEX; + unsigned reg = sctx->gfx_level >= GFX7 ? 
R_030800_GRBM_GFX_INDEX : R_00802C_GRBM_GFX_INDEX; si_pm4_set_reg(pm4, reg, value); } @@ -5215,7 +5916,7 @@ static void si_write_harvested_raster_configs(struct si_context *sctx, struct si } si_set_grbm_gfx_index(sctx, pm4, ~0); - if (sctx->chip_class >= GFX7) { + if (sctx->gfx_level >= GFX7) { si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1); } } @@ -5224,43 +5925,90 @@ static void si_set_raster_config(struct si_context *sctx, struct si_pm4_state *p { struct si_screen *sscreen = sctx->screen; unsigned num_rb = MIN2(sscreen->info.max_render_backends, 16); - unsigned rb_mask = sscreen->info.enabled_rb_mask; + uint64_t rb_mask = sscreen->info.enabled_rb_mask; unsigned raster_config = sscreen->pa_sc_raster_config; unsigned raster_config_1 = sscreen->pa_sc_raster_config_1; - if (!rb_mask || util_bitcount(rb_mask) >= num_rb) { + if (!rb_mask || util_bitcount64(rb_mask) >= num_rb) { /* Always use the default config when all backends are enabled * (or when we failed to determine the enabled backends). */ si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, raster_config); - if (sctx->chip_class >= GFX7) + if (sctx->gfx_level >= GFX7) si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1); } else { si_write_harvested_raster_configs(sctx, pm4, raster_config, raster_config_1); } } -void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) +unsigned gfx103_get_cu_mask_ps(struct si_screen *sscreen) +{ + /* It's wasteful to enable all CUs for PS if shader arrays have a different + * number of CUs. The reason is that the hardware sends the same number of PS + * waves to each shader array, so the slowest shader array limits the performance. + * Disable the extra CUs for PS in other shader arrays to save power and thus + * increase clocks for busy CUs. In the future, we might disable or enable this + * tweak only for certain apps. 
+ */ + return u_bit_consecutive(0, sscreen->info.min_good_cu_per_sa); +} + +static void gfx6_init_gfx_preamble_state(struct si_context *sctx) { struct si_screen *sscreen = sctx->screen; - uint64_t border_color_va = sctx->border_color_buffer->gpu_address; + uint64_t border_color_va = + sctx->border_color_buffer ? sctx->border_color_buffer->gpu_address : 0; + uint32_t compute_cu_en = S_00B858_SH0_CU_EN(sscreen->info.spi_cu_en) | + S_00B858_SH1_CU_EN(sscreen->info.spi_cu_en); bool has_clear_state = sscreen->info.has_clear_state; - struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state); + /* We need more space because the preamble is large. */ + struct si_pm4_state *pm4 = si_pm4_create_sized(sscreen, 214, sctx->has_graphics); if (!pm4) return; - if (!uses_reg_shadowing) { + if (sctx->has_graphics && !sctx->shadowing.registers) { si_pm4_cmd_add(pm4, PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); si_pm4_cmd_add(pm4, CC0_UPDATE_LOAD_ENABLES(1)); si_pm4_cmd_add(pm4, CC1_UPDATE_SHADOW_ENABLES(1)); + if (sscreen->dpbb_allowed) { + si_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0)); + si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); + } + if (has_clear_state) { si_pm4_cmd_add(pm4, PKT3(PKT3_CLEAR_STATE, 0, 0)); si_pm4_cmd_add(pm4, 0); } } + /* Compute registers. */ + si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, S_00B834_DATA(sctx->screen->info.address32_hi >> 8)); + si_pm4_set_reg(pm4, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, compute_cu_en); + si_pm4_set_reg(pm4, R_00B85C_COMPUTE_STATIC_THREAD_MGMT_SE1, compute_cu_en); + + if (sctx->gfx_level >= GFX7) { + si_pm4_set_reg(pm4, R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, compute_cu_en); + si_pm4_set_reg(pm4, R_00B868_COMPUTE_STATIC_THREAD_MGMT_SE3, compute_cu_en); + } + + if (sctx->gfx_level >= GFX9) + si_pm4_set_reg(pm4, R_0301EC_CP_COHER_START_DELAY, 0); + + /* Set the pointer to border colors. MI200 doesn't support border colors. 
*/ + if (sctx->gfx_level >= GFX7 && sctx->border_color_buffer) { + si_pm4_set_reg(pm4, R_030E00_TA_CS_BC_BASE_ADDR, border_color_va >> 8); + si_pm4_set_reg(pm4, R_030E04_TA_CS_BC_BASE_ADDR_HI, + S_030E04_ADDRESS(border_color_va >> 40)); + } else if (sctx->gfx_level == GFX6) { + si_pm4_set_reg(pm4, R_00950C_TA_CS_BC_BASE_ADDR, border_color_va >> 8); + } + + if (!sctx->has_graphics) + goto done; + + /* Graphics registers. */ /* CLEAR_STATE doesn't restore these correctly. */ si_pm4_set_reg(pm4, R_028240_PA_SC_GENERIC_SCISSOR_TL, S_028240_WINDOW_OFFSET_DISABLE(1)); si_pm4_set_reg(pm4, R_028244_PA_SC_GENERIC_SCISSOR_BR, @@ -5271,32 +6019,50 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) si_pm4_set_reg(pm4, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0)); if (!has_clear_state) { - si_pm4_set_reg(pm4, R_028230_PA_SC_EDGERULE, - S_028230_ER_TRI(0xA) | S_028230_ER_POINT(0xA) | S_028230_ER_RECT(0xA) | - /* Required by DX10_DIAMOND_TEST_ENA: */ - S_028230_ER_LINE_LR(0x1A) | S_028230_ER_LINE_RL(0x26) | - S_028230_ER_LINE_TB(0xA) | S_028230_ER_LINE_BT(0xA)); si_pm4_set_reg(pm4, R_028820_PA_CL_NANINF_CNTL, 0); si_pm4_set_reg(pm4, R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0x0); si_pm4_set_reg(pm4, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0); si_pm4_set_reg(pm4, R_028AC8_DB_PRELOAD_CONTROL, 0x0); si_pm4_set_reg(pm4, R_02800C_DB_RENDER_OVERRIDE, 0); - si_pm4_set_reg(pm4, R_028A5C_VGT_GS_PER_VS, 0x2); si_pm4_set_reg(pm4, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0); + si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0); + si_pm4_set_reg(pm4, R_028A5C_VGT_GS_PER_VS, 0x2); si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0); } si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8); - if (sctx->chip_class >= GFX7) + if (sctx->gfx_level >= GFX7) si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI, S_028084_ADDRESS(border_color_va >> 40)); - if (sctx->chip_class == GFX6) { + if (sctx->gfx_level == GFX6) { si_pm4_set_reg(pm4, R_008A14_PA_CL_ENHANCE, 
S_008A14_NUM_CLIP_SEQ(3) | S_008A14_CLIP_VTX_REORDER_ENA(1)); } - if (sctx->chip_class <= GFX7 || !has_clear_state) { + if (sctx->gfx_level >= GFX7) { + si_pm4_set_reg(pm4, R_030A00_PA_SU_LINE_STIPPLE_VALUE, 0); + si_pm4_set_reg(pm4, R_030A04_PA_SC_LINE_STIPPLE_STATE, 0); + } else { + si_pm4_set_reg(pm4, R_008A60_PA_SU_LINE_STIPPLE_VALUE, 0); + si_pm4_set_reg(pm4, R_008B10_PA_SC_LINE_STIPPLE_STATE, 0); + } + + /* If any sample location uses the -8 coordinate, the EXCLUSION fields should be set to 0. */ + si_pm4_set_reg(pm4, R_02882C_PA_SU_PRIM_FILTER_CNTL, + S_02882C_XMAX_RIGHT_EXCLUSION(sctx->gfx_level >= GFX7) | + S_02882C_YMAX_BOTTOM_EXCLUSION(sctx->gfx_level >= GFX7)); + + if (sctx->family >= CHIP_POLARIS10 && !sctx->screen->info.has_small_prim_filter_sample_loc_bug) { + /* Polaris10-12 should disable small line culling, but those also have the sample loc bug, + * so they never enter this branch. + */ + assert(sctx->family > CHIP_POLARIS12); + si_pm4_set_reg(pm4, R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL, + S_028830_SMALL_PRIM_FILTER_ENABLE(1)); + } + + if (sctx->gfx_level <= GFX7 || !has_clear_state) { si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14); si_pm4_set_reg(pm4, R_028C5C_VGT_OUT_DEALLOC_CNTL, 16); @@ -5310,30 +6076,14 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) S_028034_BR_X(16384) | S_028034_BR_Y(16384)); } - if (sctx->chip_class >= GFX10) { - si_pm4_set_reg(pm4, R_028038_DB_DFSM_CONTROL, - S_028038_PUNCHOUT_MODE(V_028038_FORCE_OFF) | - S_028038_POPS_DRAIN_PS_ON_OVERLAP(1)); - } - - unsigned cu_mask_ps = 0xffffffff; - - /* It's wasteful to enable all CUs for PS if shader arrays have a different - * number of CUs. The reason is that the hardware sends the same number of PS - * waves to each shader array, so the slowest shader array limits the performance. - * Disable the extra CUs for PS in other shader arrays to save power and thus - * increase clocks for busy CUs. 
In the future, we might disable or enable this - * tweak only for certain apps. - */ - if (sctx->chip_class >= GFX10_3) - cu_mask_ps = u_bit_consecutive(0, sscreen->info.min_good_cu_per_sa); - - if (sctx->chip_class >= GFX7) { - si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, - S_00B01C_CU_EN(cu_mask_ps) | S_00B01C_WAVE_LIMIT(0x3F)); + if (sctx->gfx_level >= GFX7) { + si_pm4_set_reg_idx3(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, + ac_apply_cu_en(S_00B01C_CU_EN(0xffffffff) | + S_00B01C_WAVE_LIMIT(0x3F), + C_00B01C_CU_EN, 0, &sscreen->info)); } - if (sctx->chip_class <= GFX8) { + if (sctx->gfx_level <= GFX8) { si_set_raster_config(sctx, pm4); /* FIXME calculate these values somehow ??? */ @@ -5349,12 +6099,7 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) si_pm4_set_reg(pm4, R_028408_VGT_INDX_OFFSET, 0); } - if (sscreen->info.chip_class >= GFX10) { - si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, - S_00B524_MEM_BASE(sscreen->info.address32_hi >> 8)); - si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, - S_00B324_MEM_BASE(sscreen->info.address32_hi >> 8)); - } else if (sscreen->info.chip_class == GFX9) { + if (sctx->gfx_level == GFX9) { si_pm4_set_reg(pm4, R_00B414_SPI_SHADER_PGM_HI_LS, S_00B414_MEM_BASE(sscreen->info.address32_hi >> 8)); si_pm4_set_reg(pm4, R_00B214_SPI_SHADER_PGM_HI_ES, @@ -5364,12 +6109,14 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) S_00B524_MEM_BASE(sscreen->info.address32_hi >> 8)); } - if (sctx->chip_class >= GFX7 && sctx->chip_class <= GFX8) { + if (sctx->gfx_level >= GFX7 && sctx->gfx_level <= GFX8) { si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, - S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F)); + ac_apply_cu_en(S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F), + C_00B51C_CU_EN, 0, &sscreen->info)); si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, S_00B41C_WAVE_LIMIT(0x3F)); si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES, - 
S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F)); + ac_apply_cu_en(S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F), + C_00B31C_CU_EN, 0, &sscreen->info)); /* If this is 0, Bonaire can hang even if GS isn't being used. * Other chips are unaffected. These are suboptimal values, @@ -5379,140 +6126,296 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) S_028A44_ES_VERTS_PER_SUBGRP(64) | S_028A44_GS_PRIMS_PER_SUBGRP(4)); } - if (sctx->chip_class == GFX8) { + if (sctx->gfx_level >= GFX8) { unsigned vgt_tess_distribution; - vgt_tess_distribution = S_028B50_ACCUM_ISOLINE(32) | S_028B50_ACCUM_TRI(11) | - S_028B50_ACCUM_QUAD(11) | S_028B50_DONUT_SPLIT_GFX81(16); + if (sctx->gfx_level == GFX9) { + vgt_tess_distribution = S_028B50_ACCUM_ISOLINE(12) | + S_028B50_ACCUM_TRI(30) | + S_028B50_ACCUM_QUAD(24) | + S_028B50_DONUT_SPLIT_GFX9(24) | + S_028B50_TRAP_SPLIT(6); + } else { + vgt_tess_distribution = S_028B50_ACCUM_ISOLINE(32) | + S_028B50_ACCUM_TRI(11) | + S_028B50_ACCUM_QUAD(11) | + S_028B50_DONUT_SPLIT_GFX81(16); - /* Testing with Unigine Heaven extreme tesselation yielded best results - * with TRAP_SPLIT = 3. - */ - if (sctx->family == CHIP_FIJI || sctx->family >= CHIP_POLARIS10) - vgt_tess_distribution |= S_028B50_TRAP_SPLIT(3); + /* Testing with Unigine Heaven extreme tessellation yielded best results + * with TRAP_SPLIT = 3. 
+ */ + if (sctx->family == CHIP_FIJI || sctx->family >= CHIP_POLARIS10) + vgt_tess_distribution |= S_028B50_TRAP_SPLIT(3); + } si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION, vgt_tess_distribution); } - if (sscreen->info.chip_class <= GFX9) { - si_pm4_set_reg(pm4, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1); - } + si_pm4_set_reg(pm4, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1); - if (sctx->chip_class == GFX9) { + if (sctx->gfx_level == GFX9) { si_pm4_set_reg(pm4, R_030920_VGT_MAX_VTX_INDX, ~0); si_pm4_set_reg(pm4, R_030924_VGT_MIN_VTX_INDX, 0); si_pm4_set_reg(pm4, R_030928_VGT_INDX_OFFSET, 0); - si_pm4_set_reg(pm4, R_028060_DB_DFSM_CONTROL, - S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) | - S_028060_POPS_DRAIN_PS_ON_OVERLAP(1)); - } + si_pm4_set_reg(pm4, R_028060_DB_DFSM_CONTROL, S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF)); - if (sctx->chip_class >= GFX9) { - si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, - S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F)); + si_pm4_set_reg_idx3(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, + ac_apply_cu_en(S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F), + C_00B41C_CU_EN, 0, &sscreen->info)); - si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION, - S_028B50_ACCUM_ISOLINE(40) | S_028B50_ACCUM_TRI(30) | S_028B50_ACCUM_QUAD(24) | - S_028B50_DONUT_SPLIT_GFX9(24) | S_028B50_TRAP_SPLIT(6)); si_pm4_set_reg(pm4, R_028C48_PA_SC_BINNER_CNTL_1, S_028C48_MAX_ALLOC_COUNT(sscreen->info.pbb_max_alloc_count - 1) | S_028C48_MAX_PRIM_PER_BATCH(1023)); si_pm4_set_reg(pm4, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL, S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1)); + si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE, 1); si_pm4_set_reg(pm4, R_030968_VGT_INSTANCE_BASE_ID, 0); - si_pm4_set_reg(pm4, R_0301EC_CP_COHER_START_DELAY, - sctx->chip_class >= GFX10 ? 
0x20 : 0); } - if (sctx->chip_class >= GFX10) { - /* Logical CUs 16 - 31 */ - si_pm4_set_reg(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS, S_00B004_CU_EN(cu_mask_ps >> 16)); - si_pm4_set_reg(pm4, R_00B104_SPI_SHADER_PGM_RSRC4_VS, S_00B104_CU_EN(0xffff)); - si_pm4_set_reg(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS, S_00B404_CU_EN(0xffff)); +done: + si_pm4_finalize(pm4); + sctx->cs_preamble_state = pm4; + sctx->cs_preamble_state_tmz = si_pm4_clone(pm4); /* Make a copy of the preamble for TMZ. */ +} - si_pm4_set_reg(pm4, R_00B0C8_SPI_SHADER_USER_ACCUM_PS_0, 0); - si_pm4_set_reg(pm4, R_00B0CC_SPI_SHADER_USER_ACCUM_PS_1, 0); - si_pm4_set_reg(pm4, R_00B0D0_SPI_SHADER_USER_ACCUM_PS_2, 0); - si_pm4_set_reg(pm4, R_00B0D4_SPI_SHADER_USER_ACCUM_PS_3, 0); - si_pm4_set_reg(pm4, R_00B1C8_SPI_SHADER_USER_ACCUM_VS_0, 0); - si_pm4_set_reg(pm4, R_00B1CC_SPI_SHADER_USER_ACCUM_VS_1, 0); - si_pm4_set_reg(pm4, R_00B1D0_SPI_SHADER_USER_ACCUM_VS_2, 0); - si_pm4_set_reg(pm4, R_00B1D4_SPI_SHADER_USER_ACCUM_VS_3, 0); - si_pm4_set_reg(pm4, R_00B2C8_SPI_SHADER_USER_ACCUM_ESGS_0, 0); - si_pm4_set_reg(pm4, R_00B2CC_SPI_SHADER_USER_ACCUM_ESGS_1, 0); - si_pm4_set_reg(pm4, R_00B2D0_SPI_SHADER_USER_ACCUM_ESGS_2, 0); - si_pm4_set_reg(pm4, R_00B2D4_SPI_SHADER_USER_ACCUM_ESGS_3, 0); - si_pm4_set_reg(pm4, R_00B4C8_SPI_SHADER_USER_ACCUM_LSHS_0, 0); - si_pm4_set_reg(pm4, R_00B4CC_SPI_SHADER_USER_ACCUM_LSHS_1, 0); - si_pm4_set_reg(pm4, R_00B4D0_SPI_SHADER_USER_ACCUM_LSHS_2, 0); - si_pm4_set_reg(pm4, R_00B4D4_SPI_SHADER_USER_ACCUM_LSHS_3, 0); - - si_pm4_set_reg(pm4, R_00B0C0_SPI_SHADER_REQ_CTRL_PS, - S_00B0C0_SOFT_GROUPING_EN(1) | - S_00B0C0_NUMBER_OF_REQUESTS_PER_CU(4 - 1)); - si_pm4_set_reg(pm4, R_00B1C0_SPI_SHADER_REQ_CTRL_VS, 0); +static void cdna_init_compute_preamble_state(struct si_context *sctx) +{ + struct si_screen *sscreen = sctx->screen; + uint64_t border_color_va = + sctx->border_color_buffer ? 
sctx->border_color_buffer->gpu_address : 0; + uint32_t compute_cu_en = S_00B858_SH0_CU_EN(sscreen->info.spi_cu_en) | + S_00B858_SH1_CU_EN(sscreen->info.spi_cu_en); + + struct si_pm4_state *pm4 = si_pm4_create_sized(sscreen, 48, true); + if (!pm4) + return; + + /* Compute registers. */ + /* Disable profiling on compute chips. */ + si_pm4_set_reg(pm4, R_00B82C_COMPUTE_PERFCOUNT_ENABLE, 0); + si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, S_00B834_DATA(sctx->screen->info.address32_hi >> 8)); + si_pm4_set_reg(pm4, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, compute_cu_en); + si_pm4_set_reg(pm4, R_00B85C_COMPUTE_STATIC_THREAD_MGMT_SE1, compute_cu_en); + si_pm4_set_reg(pm4, R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, compute_cu_en); + si_pm4_set_reg(pm4, R_00B868_COMPUTE_STATIC_THREAD_MGMT_SE3, compute_cu_en); + si_pm4_set_reg(pm4, R_00B878_COMPUTE_THREAD_TRACE_ENABLE, 0); + + if (sscreen->info.family >= CHIP_GFX940) { + si_pm4_set_reg(pm4, R_00B89C_COMPUTE_TG_CHUNK_SIZE, 0); + si_pm4_set_reg(pm4, R_00B8B4_COMPUTE_PGM_RSRC3, 0); + } else { + si_pm4_set_reg(pm4, R_00B894_COMPUTE_STATIC_THREAD_MGMT_SE4, compute_cu_en); + si_pm4_set_reg(pm4, R_00B898_COMPUTE_STATIC_THREAD_MGMT_SE5, compute_cu_en); + si_pm4_set_reg(pm4, R_00B89C_COMPUTE_STATIC_THREAD_MGMT_SE6, compute_cu_en); + si_pm4_set_reg(pm4, R_00B8A0_COMPUTE_STATIC_THREAD_MGMT_SE7, compute_cu_en); + } + + si_pm4_set_reg(pm4, R_0301EC_CP_COHER_START_DELAY, 0); + + /* Set the pointer to border colors. Only MI100 supports border colors. */ + if (sscreen->info.family == CHIP_MI100) { + si_pm4_set_reg(pm4, R_030E00_TA_CS_BC_BASE_ADDR, border_color_va >> 8); + si_pm4_set_reg(pm4, R_030E04_TA_CS_BC_BASE_ADDR_HI, + S_030E04_ADDRESS(border_color_va >> 40)); + } + + si_pm4_finalize(pm4); + sctx->cs_preamble_state = pm4; + sctx->cs_preamble_state_tmz = si_pm4_clone(pm4); /* Make a copy of the preamble for TMZ. 
*/ +} + +static void gfx10_init_gfx_preamble_state(struct si_context *sctx) +{ + struct si_screen *sscreen = sctx->screen; + uint64_t border_color_va = + sctx->border_color_buffer ? sctx->border_color_buffer->gpu_address : 0; + uint32_t compute_cu_en = S_00B858_SH0_CU_EN(sscreen->info.spi_cu_en) | + S_00B858_SH1_CU_EN(sscreen->info.spi_cu_en); + unsigned meta_write_policy, meta_read_policy, color_write_policy, color_read_policy; + unsigned zs_write_policy, zs_read_policy; + unsigned cache_no_alloc = sctx->gfx_level >= GFX11 ? V_02807C_CACHE_NOA_GFX11: + V_02807C_CACHE_NOA_GFX10; + + if (sscreen->options.cache_rb_gl2) { + color_write_policy = V_028410_CACHE_LRU_WR; + color_read_policy = V_028410_CACHE_LRU_RD; + zs_write_policy = V_02807C_CACHE_LRU_WR; + zs_read_policy = V_02807C_CACHE_LRU_RD; + meta_write_policy = V_02807C_CACHE_LRU_WR; + meta_read_policy = V_02807C_CACHE_LRU_RD; + } else { + color_write_policy = V_028410_CACHE_STREAM; + color_read_policy = cache_no_alloc; + zs_write_policy = V_02807C_CACHE_STREAM; + zs_read_policy = cache_no_alloc; /* Enable CMASK/HTILE/DCC caching in L2 for small chips. 
*/ - unsigned meta_write_policy, meta_read_policy; if (sscreen->info.max_render_backends <= 4) { meta_write_policy = V_02807C_CACHE_LRU_WR; /* cache writes */ meta_read_policy = V_02807C_CACHE_LRU_RD; /* cache reads */ } else { meta_write_policy = V_02807C_CACHE_STREAM; /* write combine */ - meta_read_policy = V_02807C_CACHE_NOA; /* don't cache reads */ + meta_read_policy = cache_no_alloc; /* don't cache reads that miss */ } + } - si_pm4_set_reg(pm4, R_02807C_DB_RMI_L2_CACHE_CONTROL, - S_02807C_Z_WR_POLICY(V_02807C_CACHE_STREAM) | - S_02807C_S_WR_POLICY(V_02807C_CACHE_STREAM) | - S_02807C_HTILE_WR_POLICY(meta_write_policy) | - S_02807C_ZPCPSD_WR_POLICY(V_02807C_CACHE_STREAM) | - S_02807C_Z_RD_POLICY(V_02807C_CACHE_NOA) | - S_02807C_S_RD_POLICY(V_02807C_CACHE_NOA) | - S_02807C_HTILE_RD_POLICY(meta_read_policy)); - si_pm4_set_reg(pm4, R_028410_CB_RMI_GL2_CACHE_CONTROL, - S_028410_CMASK_WR_POLICY(meta_write_policy) | - S_028410_FMASK_WR_POLICY(V_028410_CACHE_STREAM) | - S_028410_DCC_WR_POLICY(meta_write_policy) | - S_028410_COLOR_WR_POLICY(V_028410_CACHE_STREAM) | - S_028410_CMASK_RD_POLICY(meta_read_policy) | - S_028410_FMASK_RD_POLICY(V_028410_CACHE_NOA) | - S_028410_DCC_RD_POLICY(meta_read_policy) | - S_028410_COLOR_RD_POLICY(V_028410_CACHE_NOA)); - - si_pm4_set_reg(pm4, R_028428_CB_COVERAGE_OUT_CONTROL, 0); - si_pm4_set_reg(pm4, R_028A98_VGT_DRAW_PAYLOAD_CNTL, 0); - - /* Break up a pixel wave if it contains deallocs for more than - * half the parameter cache. - * - * To avoid a deadlock where pixel waves aren't launched - * because they're waiting for more pixels while the frontend - * is stuck waiting for PC space, the maximum allowed value is - * the size of the PC minus the largest possible allocation for - * a single primitive shader subgroup. - */ - si_pm4_set_reg(pm4, R_028C50_PA_SC_NGG_MODE_CNTL, S_028C50_MAX_DEALLOCS_IN_WAVE(512)); - /* Reuse for legacy (non-NGG) only. 
*/ - si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14); + /* We need more space because the preamble is large. */ + struct si_pm4_state *pm4 = si_pm4_create_sized(sscreen, 214, sctx->has_graphics); + if (!pm4) + return; + + if (sctx->has_graphics && !sctx->shadowing.registers) { + si_pm4_cmd_add(pm4, PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); + si_pm4_cmd_add(pm4, CC0_UPDATE_LOAD_ENABLES(1)); + si_pm4_cmd_add(pm4, CC1_UPDATE_SHADOW_ENABLES(1)); - if (!has_clear_state) { - si_pm4_set_reg(pm4, R_02835C_PA_SC_TILE_STEERING_OVERRIDE, - sscreen->info.pa_sc_tile_steering_override); + if (sscreen->dpbb_allowed) { + si_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0)); + si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); } + si_pm4_cmd_add(pm4, PKT3(PKT3_CLEAR_STATE, 0, 0)); + si_pm4_cmd_add(pm4, 0); + } + + /* Non-graphics uconfig registers. */ + if (sctx->gfx_level < GFX11) + si_pm4_set_reg(pm4, R_0301EC_CP_COHER_START_DELAY, 0x20); + si_pm4_set_reg(pm4, R_030E00_TA_CS_BC_BASE_ADDR, border_color_va >> 8); + si_pm4_set_reg(pm4, R_030E04_TA_CS_BC_BASE_ADDR_HI, S_030E04_ADDRESS(border_color_va >> 40)); + + /* Compute registers. */ + si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, S_00B834_DATA(sscreen->info.address32_hi >> 8)); + + for (unsigned i = 0; i < 4; ++i) + si_pm4_set_reg(pm4, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 + i * 4, + i < sscreen->info.num_se ? compute_cu_en : 0x0); + + si_pm4_set_reg(pm4, R_00B890_COMPUTE_USER_ACCUM_0, 0); + si_pm4_set_reg(pm4, R_00B894_COMPUTE_USER_ACCUM_1, 0); + si_pm4_set_reg(pm4, R_00B898_COMPUTE_USER_ACCUM_2, 0); + si_pm4_set_reg(pm4, R_00B89C_COMPUTE_USER_ACCUM_3, 0); + + if (sctx->gfx_level >= GFX11) { + for (unsigned i = 4; i < 8; ++i) + si_pm4_set_reg(pm4, R_00B8AC_COMPUTE_STATIC_THREAD_MGMT_SE4 + (i - 4) * 4, + i < sscreen->info.num_se ? 
compute_cu_en : 0x0); - si_pm4_set_reg(pm4, R_030964_GE_MAX_VTX_INDX, ~0); - si_pm4_set_reg(pm4, R_030924_GE_MIN_VTX_INDX, 0); - si_pm4_set_reg(pm4, R_030928_GE_INDX_OFFSET, 0); - si_pm4_set_reg(pm4, R_03097C_GE_STEREO_CNTL, 0); - si_pm4_set_reg(pm4, R_030988_GE_USER_VGPR_EN, 0); + /* How many threads should go to 1 SE before moving onto the next. Think of GL1 cache hits. + * Only these values are valid: 0 (disabled), 64, 128, 256, 512 + * Recommendation: 64 = RT, 256 = non-RT (run benchmarks to be sure) + */ + si_pm4_set_reg(pm4, R_00B8BC_COMPUTE_DISPATCH_INTERLEAVE, S_00B8BC_INTERLEAVE(256)); + } else { + si_pm4_set_reg(pm4, R_00B8A0_COMPUTE_PGM_RSRC3, 0); + } + + si_pm4_set_reg(pm4, R_00B9F4_COMPUTE_DISPATCH_TUNNEL, 0); + + if (!sctx->has_graphics) + goto done; + + /* Shader registers - PS. */ + unsigned cu_mask_ps = sctx->gfx_level >= GFX10_3 ? gfx103_get_cu_mask_ps(sscreen) : ~0u; + if (sctx->gfx_level < GFX11) { + si_pm4_set_reg_idx3(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS, + ac_apply_cu_en(S_00B004_CU_EN(cu_mask_ps >> 16), /* CUs 16-31 */ + C_00B004_CU_EN, 16, &sscreen->info)); + } + si_pm4_set_reg_idx3(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, + ac_apply_cu_en(S_00B01C_CU_EN(cu_mask_ps) | + S_00B01C_WAVE_LIMIT(0x3F) | + S_00B01C_LDS_GROUP_SIZE(sctx->gfx_level >= GFX11), + C_00B01C_CU_EN, 0, &sscreen->info)); + si_pm4_set_reg(pm4, R_00B0C0_SPI_SHADER_REQ_CTRL_PS, + S_00B0C0_SOFT_GROUPING_EN(1) | + S_00B0C0_NUMBER_OF_REQUESTS_PER_CU(4 - 1)); + si_pm4_set_reg(pm4, R_00B0C8_SPI_SHADER_USER_ACCUM_PS_0, 0); + si_pm4_set_reg(pm4, R_00B0CC_SPI_SHADER_USER_ACCUM_PS_1, 0); + si_pm4_set_reg(pm4, R_00B0D0_SPI_SHADER_USER_ACCUM_PS_2, 0); + si_pm4_set_reg(pm4, R_00B0D4_SPI_SHADER_USER_ACCUM_PS_3, 0); + + /* Shader registers - VS. 
*/ + if (sctx->gfx_level < GFX11) { + si_pm4_set_reg_idx3(pm4, R_00B104_SPI_SHADER_PGM_RSRC4_VS, + ac_apply_cu_en(S_00B104_CU_EN(0xffff), /* CUs 16-31 */ + C_00B104_CU_EN, 16, &sscreen->info)); + si_pm4_set_reg(pm4, R_00B1C0_SPI_SHADER_REQ_CTRL_VS, 0); + si_pm4_set_reg(pm4, R_00B1C8_SPI_SHADER_USER_ACCUM_VS_0, 0); + si_pm4_set_reg(pm4, R_00B1CC_SPI_SHADER_USER_ACCUM_VS_1, 0); + si_pm4_set_reg(pm4, R_00B1D0_SPI_SHADER_USER_ACCUM_VS_2, 0); + si_pm4_set_reg(pm4, R_00B1D4_SPI_SHADER_USER_ACCUM_VS_3, 0); } - if (sctx->chip_class >= GFX10_3) { + /* Shader registers - GS. */ + si_pm4_set_reg(pm4, R_00B2C8_SPI_SHADER_USER_ACCUM_ESGS_0, 0); + si_pm4_set_reg(pm4, R_00B2CC_SPI_SHADER_USER_ACCUM_ESGS_1, 0); + si_pm4_set_reg(pm4, R_00B2D0_SPI_SHADER_USER_ACCUM_ESGS_2, 0); + si_pm4_set_reg(pm4, R_00B2D4_SPI_SHADER_USER_ACCUM_ESGS_3, 0); + si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, + S_00B324_MEM_BASE(sscreen->info.address32_hi >> 8)); + + /* Shader registers - HS. */ + if (sctx->gfx_level < GFX11) { + si_pm4_set_reg_idx3(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS, + ac_apply_cu_en(S_00B404_CU_EN(0xffff), /* CUs 16-31 */ + C_00B404_CU_EN, 16, &sscreen->info)); + } + si_pm4_set_reg_idx3(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, + ac_apply_cu_en(S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F), + C_00B41C_CU_EN, 0, &sscreen->info)); + si_pm4_set_reg(pm4, R_00B4C8_SPI_SHADER_USER_ACCUM_LSHS_0, 0); + si_pm4_set_reg(pm4, R_00B4CC_SPI_SHADER_USER_ACCUM_LSHS_1, 0); + si_pm4_set_reg(pm4, R_00B4D0_SPI_SHADER_USER_ACCUM_LSHS_2, 0); + si_pm4_set_reg(pm4, R_00B4D4_SPI_SHADER_USER_ACCUM_LSHS_3, 0); + si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, + S_00B524_MEM_BASE(sscreen->info.address32_hi >> 8)); + + /* Context registers. 
*/ + if (sctx->gfx_level < GFX11) { + si_pm4_set_reg(pm4, R_028038_DB_DFSM_CONTROL, S_028038_PUNCHOUT_MODE(V_028038_FORCE_OFF)); + } + si_pm4_set_reg(pm4, R_02807C_DB_RMI_L2_CACHE_CONTROL, + S_02807C_Z_WR_POLICY(zs_write_policy) | + S_02807C_S_WR_POLICY(zs_write_policy) | + S_02807C_HTILE_WR_POLICY(meta_write_policy) | + S_02807C_ZPCPSD_WR_POLICY(V_02807C_CACHE_STREAM) | /* occlusion query writes */ + S_02807C_Z_RD_POLICY(zs_read_policy) | + S_02807C_S_RD_POLICY(zs_read_policy) | + S_02807C_HTILE_RD_POLICY(meta_read_policy)); + si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8); + si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI, S_028084_ADDRESS(border_color_va >> 40)); + + si_pm4_set_reg(pm4, R_028410_CB_RMI_GL2_CACHE_CONTROL, + (sctx->gfx_level >= GFX11 ? + S_028410_COLOR_WR_POLICY_GFX11(color_write_policy) | + S_028410_COLOR_RD_POLICY(color_read_policy) | + S_028410_DCC_WR_POLICY_GFX11(meta_write_policy) | + S_028410_DCC_RD_POLICY(meta_read_policy) + : + S_028410_COLOR_WR_POLICY_GFX10(color_write_policy) | + S_028410_COLOR_RD_POLICY(color_read_policy)) | + S_028410_FMASK_WR_POLICY(color_write_policy) | + S_028410_FMASK_RD_POLICY(color_read_policy) | + S_028410_CMASK_WR_POLICY(meta_write_policy) | + S_028410_CMASK_RD_POLICY(meta_read_policy) | + S_028410_DCC_WR_POLICY_GFX10(meta_write_policy) | + S_028410_DCC_RD_POLICY(meta_read_policy)); + si_pm4_set_reg(pm4, R_028708_SPI_SHADER_IDX_FORMAT, + S_028708_IDX0_EXPORT_FORMAT(V_028708_SPI_SHADER_1COMP)); + + if (sctx->gfx_level >= GFX10_3) si_pm4_set_reg(pm4, R_028750_SX_PS_DOWNCONVERT_CONTROL, 0xff); + + /* If any sample location uses the -8 coordinate, the EXCLUSION fields should be set to 0. 
*/ + si_pm4_set_reg(pm4, R_02882C_PA_SU_PRIM_FILTER_CNTL, + S_02882C_XMAX_RIGHT_EXCLUSION(1) | + S_02882C_YMAX_BOTTOM_EXCLUSION(1)); + si_pm4_set_reg(pm4, R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL, + S_028830_SMALL_PRIM_FILTER_ENABLE(1)); + if (sctx->gfx_level >= GFX10_3) { /* The rate combiners have no effect if they are disabled like this: * VERTEX_RATE: BYPASS_VTX_RATE_COMBINER = 1 * PRIMITIVE_RATE: BYPASS_PRIM_RATE_COMBINER = 1 @@ -5523,9 +6426,91 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) * (e.g. enabled sample shading overrides the vertex rate) */ si_pm4_set_reg(pm4, R_028848_PA_CL_VRS_CNTL, - S_028848_VERTEX_RATE_COMBINER_MODE(V_028848_VRS_COMB_MODE_OVERRIDE) | - S_028848_SAMPLE_ITER_COMBINER_MODE(V_028848_VRS_COMB_MODE_OVERRIDE)); + S_028848_VERTEX_RATE_COMBINER_MODE(V_028848_SC_VRS_COMB_MODE_OVERRIDE) | + S_028848_SAMPLE_ITER_COMBINER_MODE(V_028848_SC_VRS_COMB_MODE_OVERRIDE)); } + si_pm4_set_reg(pm4, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64)); + si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE, 1); + si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION, + sctx->gfx_level >= GFX11 ? + S_028B50_ACCUM_ISOLINE(128) | + S_028B50_ACCUM_TRI(128) | + S_028B50_ACCUM_QUAD(128) | + S_028B50_DONUT_SPLIT_GFX9(24) | + S_028B50_TRAP_SPLIT(6) + : + S_028B50_ACCUM_ISOLINE(12) | + S_028B50_ACCUM_TRI(30) | + S_028B50_ACCUM_QUAD(24) | + S_028B50_DONUT_SPLIT_GFX9(24) | + S_028B50_TRAP_SPLIT(6)); + + /* GFX11+ shouldn't subtract 1 from pbb_max_alloc_count. */ + unsigned gfx10_one = sctx->gfx_level < GFX11; + si_pm4_set_reg(pm4, R_028C48_PA_SC_BINNER_CNTL_1, + S_028C48_MAX_ALLOC_COUNT(sscreen->info.pbb_max_alloc_count - gfx10_one) | + S_028C48_MAX_PRIM_PER_BATCH(1023)); + + if (sctx->gfx_level >= GFX11_5) + si_pm4_set_reg(pm4, R_028C54_PA_SC_BINNER_CNTL_2, + S_028C54_ENABLE_PING_PONG_BIN_ORDER(1)); + + /* Break up a pixel wave if it contains deallocs for more than + * half the parameter cache. 
+ * + * To avoid a deadlock where pixel waves aren't launched + * because they're waiting for more pixels while the frontend + * is stuck waiting for PC space, the maximum allowed value is + * the size of the PC minus the largest possible allocation for + * a single primitive shader subgroup. + */ + si_pm4_set_reg(pm4, R_028C50_PA_SC_NGG_MODE_CNTL, + S_028C50_MAX_DEALLOCS_IN_WAVE(sctx->gfx_level >= GFX11 ? 16 : 512)); + if (sctx->gfx_level < GFX11) + si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14); /* Reuse for legacy (non-NGG) only. */ + + /* Uconfig registers. */ + si_pm4_set_reg(pm4, R_030924_GE_MIN_VTX_INDX, 0); + si_pm4_set_reg(pm4, R_030928_GE_INDX_OFFSET, 0); + if (sctx->gfx_level >= GFX11) { + /* This is changed by draws for indexed draws, but we need to set DISABLE_FOR_AUTO_INDEX + * here, which disables primitive restart for all non-indexed draws, so that those draws + * won't have to set this state. + */ + si_pm4_set_reg(pm4, R_03092C_GE_MULTI_PRIM_IB_RESET_EN, S_03092C_DISABLE_FOR_AUTO_INDEX(1)); + } + si_pm4_set_reg(pm4, R_030964_GE_MAX_VTX_INDX, ~0); + si_pm4_set_reg(pm4, R_030968_VGT_INSTANCE_BASE_ID, 0); + si_pm4_set_reg(pm4, R_03097C_GE_STEREO_CNTL, 0); + si_pm4_set_reg(pm4, R_030988_GE_USER_VGPR_EN, 0); + + si_pm4_set_reg(pm4, R_030A00_PA_SU_LINE_STIPPLE_VALUE, 0); + si_pm4_set_reg(pm4, R_030A04_PA_SC_LINE_STIPPLE_STATE, 0); + + if (sctx->gfx_level >= GFX11) { + uint64_t rb_mask = BITFIELD64_MASK(sscreen->info.max_render_backends); + + si_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 2, 0)); + si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_CONTROL) | EVENT_INDEX(1)); + si_pm4_cmd_add(pm4, PIXEL_PIPE_STATE_CNTL_COUNTER_ID(0) | + PIXEL_PIPE_STATE_CNTL_STRIDE(2) | + PIXEL_PIPE_STATE_CNTL_INSTANCE_EN_LO(rb_mask)); + si_pm4_cmd_add(pm4, PIXEL_PIPE_STATE_CNTL_INSTANCE_EN_HI(rb_mask)); + } + +done: + si_pm4_finalize(pm4); sctx->cs_preamble_state = pm4; + sctx->cs_preamble_state_tmz = si_pm4_clone(pm4); /* Make a copy of the preamble for 
TMZ. */ +} + +void si_init_gfx_preamble_state(struct si_context *sctx) +{ + if (!sctx->screen->info.has_graphics) + cdna_init_compute_preamble_state(sctx); + else if (sctx->gfx_level >= GFX10) + gfx10_init_gfx_preamble_state(sctx); + else + gfx6_init_gfx_preamble_state(sctx); } |