summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Marek Olšák <marek.olsak@amd.com> 2021-02-09 18:56:04 -0500
committer: Marek Olšák <marek.olsak@amd.com> 2021-02-17 04:49:24 -0500
commit: 98ea523e007efa71adecfcce92a168efcf9b54dd (patch)
tree: b7b9397a25c5c100632385b0515f66f8cc582cd8
parent: 4fe37b850a4c00221a210481eeb3b9f5ec68e3ea (diff)
radeonsi: for tess, determine the minimum num_patches before optimizing tg size
Doing these MINs at the end could have undone optimizations for the LDS size and threadgroup size, so move the MINs up.

Reviewed-by: Zoltán Böszörményi <zboszor@gmail.com>
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9028>
-rw-r--r-- src/gallium/drivers/radeonsi/si_state_draw.cpp 32
1 files changed, 16 insertions, 16 deletions
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp
index a7aa9612e6c..62f4f3e1422 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -271,6 +271,22 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
unsigned max_verts_per_patch = MAX2(num_tcs_input_cp, num_tcs_output_cp);
*num_patches = 256 / max_verts_per_patch;
+ /* Not necessary for correctness, but higher numbers are slower.
+ * The hardware can do more, but the radeonsi shader constant is
+ * limited to 6 bits.
+ */
+ *num_patches = MIN2(*num_patches, 64); /* e.g. 64 triangles in exactly 3 waves */
+
+ /* When distributed tessellation is unsupported, switch between SEs
+ * at a higher frequency to manually balance the workload between SEs.
+ */
+ if (!sctx->screen->info.has_distributed_tess && sctx->screen->info.max_se > 1)
+ *num_patches = MIN2(*num_patches, 16); /* recommended */
+
+ /* Make sure the output data fits in the offchip buffer */
+ *num_patches =
+ MIN2(*num_patches, (sctx->screen->tess_offchip_block_dw_size * 4) / output_patch_size);
+
/* Make sure that the data fits in LDS. This assumes the shaders only
* use LDS for the inputs and outputs.
*
@@ -286,22 +302,6 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
*num_patches = MAX2(*num_patches, 1);
assert(*num_patches * lds_per_patch <= max_lds_size);
- /* Make sure the output data fits in the offchip buffer */
- *num_patches =
- MIN2(*num_patches, (sctx->screen->tess_offchip_block_dw_size * 4) / output_patch_size);
-
- /* Not necessary for correctness, but improves performance.
- * The hardware can do more, but the radeonsi shader constant is
- * limited to 6 bits.
- */
- *num_patches = MIN2(*num_patches, 64); /* triangles: 3 full waves */
-
- /* When distributed tessellation is unsupported, switch between SEs
- * at a higher frequency to compensate for it.
- */
- if (!sctx->screen->info.has_distributed_tess && sctx->screen->info.max_se > 1)
- *num_patches = MIN2(*num_patches, 16); /* recommended */
-
/* Make sure that vector lanes are reasonably occupied. It probably
* doesn't matter much because this is LS-HS, and TES is likely to
* occupy significantly more CUs.