radv: Support NGG culling with new perftest environment variable.

Currently we don't enable it on any chip by default, but we plan to enable it soon on GFX10.3 when we are comfortable with its performance. RADV_PERFTEST=nggc environment variable enables it on GFX10+ GPUs. Signed-off-by: Timur Kristóf <timur.kristof@gmail.com> Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10525>
author: Timur Kristóf <timur.kristof@gmail.com> 2021-06-08 12:32:35 +0200
committer: Marge Bot <eric+marge@anholt.net> 2021-07-13 23:56:33 +0000
commit: f30e4351de2f562955435a04cf75dd641639d31c (patch)
tree: 668a704123a1473db3ac098030a532ce516c336d
parent: 182d9b1e6072bec190cf0a52e9d93dbbdbaa850d (diff)
8 files changed, 64 insertions, 5 deletions
diff --git a/docs/envvars.rst b/docs/envvars.rst
index 1538f0a509a..df9c0434ab3 100644
--- a/docs/envvars.rst
+++ b/docs/envvars.rst
@@ -647,6 +647,8 @@ RADV driver environment variables
       disable optimizations that get enabled when all VRAM is CPU visible.
    ``pswave32``
       enable wave32 for pixel shaders (GFX10+)
+   ``nggc``
+      enable NGG culling on GFX10+ GPUs.
    ``rt``
       enable rt extensions whose implementation is still experimental.
    ``sam``
diff --git a/docs/relnotes/new_features.txt b/docs/relnotes/new_features.txt
index 495a3545f28..f00cbf6bb01 100644
--- a/docs/relnotes/new_features.txt
+++ b/docs/relnotes/new_features.txt
@@ -15,6 +15,7 @@ VK_EXT_multi_draw on ANV, lavapipe, and RADV
 VK_KHR_separate_depth_stencil_layouts on lavapipe
 VK_EXT_separate_stencil_usage on lavapipe
 VK_EXT_extended_dynamic_state2 on lavapipe
+NGG shader based primitive culling is now supported by RADV.
 Panfrost supports OpenGL ES 3.1
 New Asahi driver for the Apple M1
 GL_ARB_sample_locations on zink
diff --git a/src/amd/vulkan/radv_debug.h b/src/amd/vulkan/radv_debug.h
index 88e8c53822d..5a0f2958600 100644
--- a/src/amd/vulkan/radv_debug.h
+++ b/src/amd/vulkan/radv_debug.h
@@ -74,6 +74,7 @@ enum {
    RADV_PERFTEST_NO_SAM = 1u << 6,
    RADV_PERFTEST_SAM = 1u << 7,
    RADV_PERFTEST_RT = 1u << 8,
+   RADV_PERFTEST_NGGC = 1u << 9,
 };
 
 bool radv_init_trace(struct radv_device *device);
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 1ae5a2e7283..738f68db9b1 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -830,6 +830,7 @@ static const struct debug_control radv_perftest_options[] = {{"localbos", RADV_P
                                                              {"nosam", RADV_PERFTEST_NO_SAM},
                                                              {"sam", RADV_PERFTEST_SAM},
                                                              {"rt", RADV_PERFTEST_RT},
+                                                             {"nggc", RADV_PERFTEST_NGGC},
                                                              {NULL, 0}};
 
 const char *
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 9b08c1ba1cf..3dfaa44c4bc 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -211,6 +211,8 @@ get_hash_flags(const struct radv_device *device, bool stats)
 
    if (device->instance->debug_flags & RADV_DEBUG_NO_NGG)
       hash_flags |= RADV_HASH_SHADER_NO_NGG;
+   if (device->instance->perftest_flags & RADV_PERFTEST_NGGC)
+      hash_flags |= RADV_HASH_SHADER_FORCE_NGG_CULLING;
    if (device->physical_device->cs_wave_size == 32)
       hash_flags |= RADV_HASH_SHADER_CS_WAVE32;
    if (device->physical_device->ps_wave_size == 32)
@@ -3451,8 +3453,11 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device,
          bool io_to_mem = radv_lower_io_to_mem(device, nir[i], &infos[i], pipeline_key);
          bool lowered_ngg = pipeline_has_ngg && i == pipeline->graphics.last_vgt_api_stage &&
                             !radv_use_llvm_for_stage(device, i);
-         if (lowered_ngg)
-            radv_lower_ngg(device, nir[i], &infos[i], pipeline_key, &keys[i]);
+         if (lowered_ngg) {
+            uint64_t ps_inputs_read = nir[MESA_SHADER_FRAGMENT] ? nir[MESA_SHADER_FRAGMENT]->info.inputs_read : 0;
+            bool consider_culling = radv_consider_culling(device, nir[i], ps_inputs_read);
+            radv_lower_ngg(device, nir[i], &infos[i], pipeline_key, &keys[i], consider_culling);
+         }
 
          radv_optimize_nir_algebraic(nir[i], io_to_mem || lowered_ngg || i == MESA_SHADER_COMPUTE);
 
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index b99bea00dd1..045af48628f 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -1672,6 +1672,7 @@ struct radv_event {
 #define RADV_HASH_SHADER_FORCE_VRS_2x2     (1 << 9)
 #define RADV_HASH_SHADER_FORCE_VRS_2x1     (1 << 10)
 #define RADV_HASH_SHADER_FORCE_VRS_1x2     (1 << 11)
+#define RADV_HASH_SHADER_FORCE_NGG_CULLING (1 << 13)
 
 void radv_hash_shaders(unsigned char *hash, const VkPipelineShaderStageCreateInfo **stages,
                        const struct radv_pipeline_layout *layout,
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index 5a59e7f251a..96bdb2cabc4 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -907,10 +907,44 @@ radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir,
    return false;
 }
 
+bool
+radv_consider_culling(struct radv_device *device, struct nir_shader *nir,
+                      uint64_t ps_inputs_read)
+{
+   /* Culling doesn't make sense for meta shaders. */
+   if (!!nir->info.name)
+      return false;
+
+   /* TODO: enable by default on GFX10.3 when we're confident about performance. */
+   bool culling_enabled = device->instance->perftest_flags & RADV_PERFTEST_NGGC;
+
+   if (!culling_enabled)
+      return false;
+
+   /* Shader based culling efficiency can depend on PS throughput.
+    * Estimate an upper limit for PS input param count based on GPU info.
+    */
+   unsigned max_ps_params;
+   unsigned max_render_backends = device->physical_device->rad_info.max_render_backends;
+   unsigned max_se = device->physical_device->rad_info.max_se;
+
+   if (max_render_backends < 2)
+      return false; /* Don't use NGG culling on 1 RB chips. */
+   else if (max_render_backends / max_se == 4)
+      max_ps_params = 6; /* Sienna Cichlid and other GFX10.3 dGPUs. */
+   else
+      max_ps_params = 4; /* Navi 1x. */
+
+   /* TODO: consider other heuristics here, such as PS execution time */
+
+   return util_bitcount64(ps_inputs_read & ~VARYING_BIT_POS) <= max_ps_params;
+}
+
 void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
                     struct radv_shader_info *info,
                     const struct radv_pipeline_key *pl_key,
-                    struct radv_shader_variant_key *key)
+                    struct radv_shader_variant_key *key,
+                    bool consider_culling)
 {
    /* TODO: support the LLVM backend with the NIR lowering */
    assert(!radv_use_llvm_for_stage(device, nir->info.stage));
@@ -930,9 +964,19 @@ void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
          num_vertices_per_prim = 1;
       else if (nir->info.tess.primitive_mode == GL_ISOLINES)
          num_vertices_per_prim = 2;
+
+      /* Manually mark the primitive ID used, so the shader can repack it. */
+      if (key->vs_common_out.export_prim_id)
+         BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
+
    } else if (nir->info.stage == MESA_SHADER_VERTEX) {
       /* Need to add 1, because: V_028A6C_POINTLIST=0, V_028A6C_LINESTRIP=1, V_028A6C_TRISTRIP=2, etc. */
       num_vertices_per_prim = key->vs.outprim + 1;
+
+      /* Manually mark the instance ID used, so the shader can repack it. */
+      if (key->vs.instance_rate_inputs)
+         BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
+
    } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
       num_vertices_per_prim = nir->info.gs.vertices_in;
    } else {
@@ -964,7 +1008,7 @@ void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
             num_vertices_per_prim,
             max_workgroup_size,
             info->wave_size,
-            false,
+            consider_culling,
             key->vs_common_out.as_ngg_passthrough,
             key->vs_common_out.export_prim_id,
             key->vs.provoking_vtx_last);
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index 1ad54b93276..ab3dcac35c6 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -569,6 +569,10 @@ bool radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir,
 void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
                     struct radv_shader_info *info,
                     const struct radv_pipeline_key *pl_key,
-                    struct radv_shader_variant_key *key);
+                    struct radv_shader_variant_key *key,
+                    bool consider_culling);
+
+bool radv_consider_culling(struct radv_device *device, struct nir_shader *nir,
+                           uint64_t ps_inputs_read);
 
 #endif
author	Timur Kristóf <timur.kristof@gmail.com>	2021-06-08 12:32:35 +0200
committer	Marge Bot <eric+marge@anholt.net>	2021-07-13 23:56:33 +0000
commit	f30e4351de2f562955435a04cf75dd641639d31c (patch)
tree	668a704123a1473db3ac098030a532ce516c336d
parent	182d9b1e6072bec190cf0a52e9d93dbbdbaa850d (diff)