author	Francisco Jerez <currojerez@riseup.net>	2020-05-31 14:56:40 -0700
committer	Eric Engestrom <eric@engestrom.ch>	2020-08-05 22:23:20 +0200
commit	797ed40aa70681099d792ad7a8be699741315645 (patch)
tree	84394e64f05104518eab550c5a8b92046e455056 /src
parent	33f8d25fa4ccfcf9263e7abc58d2af58c0e59d3a (diff)
intel/ir/gen12+: Work around FS performance regressions due to SIMD32 discard divergence.
This avoids some performance regressions on Gen12 platforms caused by SIMD32 fragment shaders reported in titles like Dota2, TF2, Xonotic, and GFXBench5 Car Chase and Aztec Ruins.

The most obvious pattern in the regressing shaders I identified among these workloads is that they all had non-uniform discard statements, which are handled rather optimistically by the current IR analysis pass: No penalty is currently applied to the SIMD32 variant of the shader in the form of differing branching weights like we do for other control flow instructions in order to account for the greater likelihood of divergence of a SIMD32 shader.

Simply changing that by giving the same treatment to discard statements as we give to other branching instructions seemed to hurt more than it helped on platforms earlier than Gen12, since it reversed most of the improvement obtained from SIMD32 fragment shaders in Manhattan for no measurable benefit in other workloads (Manhattan has a handful of shaders with statically non-uniform discard statements which actually perform better in SIMD32 mode due to their approximate dynamic uniformity). For that reason this change is applied to Gen12+ platforms only.

I've been running a number of tests trying to understand the difference in behavior between Gen12 and earlier platforms, and most of the evidence I've gathered seems to point at EU fusion being the culprit: Unlike previous generations, on Gen12 EUs are arranged in pairs which execute instructions in lockstep, giving an effective warp size of 64 threads in SIMD32 mode, which seems to increase the likelihood of control flow divergence in some of the affected shaders significantly.

Fixes: 188a3659aea6dec9acf1 "intel/ir: Import shader performance analysis pass."
Reported-by: Caleb Callaway <caleb.callaway@intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5910>
(cherry picked from commit 4d73988f6fef39e9263ec0bb49cd5efff68393bc)
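For orientation, the following is a minimal standalone sketch of the weighting scheme the analysis pass uses, with hypothetical opcode and struct names rather than Mesa's actual data structures (only the discard_weight handling corresponds to what this patch adds; the rest is a simplified reading of the surrounding pass):

#include <cstddef>

enum opcode {
   OP_IF, OP_ENDIF, OP_DO, OP_WHILE,
   OP_DISCARD_JUMP, OP_PLACEHOLDER_HALT, OP_OTHER
};

struct inst {
   opcode op;
   float latency; /* estimated issue latency of this instruction */
};

/* Hypothetical, simplified model of the per-block cost loop. */
float
block_cost(const inst *insts, size_t n, unsigned dispatch_width, int gen)
{
   /* SIMD32 gets the worst-case weight of 1.0, since a dynamically
    * uniform branch is more likely to diverge across 32 channels. */
   const float branch_weight = (dispatch_width > 16 ? 1.0f : 0.5f);
   /* Gen12+ only: narrower modes get a discount after a discard
    * jump, making the SIMD32 variant look relatively more costly. */
   const float discard_weight = (dispatch_width > 16 || gen < 12 ?
                                 1.0f : 0.5f);
   const float loop_weight = 10.0f;
   unsigned discard_count = 0;
   float weight = 1.0f;
   float elapsed = 0.0f;

   for (size_t i = 0; i < n; i++) {
      /* Control flow that re-converges before this instruction. */
      if (insts[i].op == OP_ENDIF)
         weight /= branch_weight;
      else if (insts[i].op == OP_PLACEHOLDER_HALT && discard_count)
         weight /= discard_weight;

      elapsed += insts[i].latency * weight;

      /* Control flow that may diverge after this instruction. */
      if (insts[i].op == OP_IF)
         weight *= branch_weight;
      else if (insts[i].op == OP_DO)
         weight *= loop_weight;
      else if (insts[i].op == OP_WHILE)
         weight /= loop_weight;
      else if (insts[i].op == OP_DISCARD_JUMP && !discard_count++)
         weight *= discard_weight;
   }
   return elapsed;
}

In this model a weight below 1.0 discounts instructions that some channels may skip; keeping the SIMD32 discard weight at 1.0 on Gen12+ is what penalizes the wide variant relative to the narrower ones.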
Diffstat (limited to 'src')
-rw-r--r--	src/intel/compiler/brw_ir_performance.cpp	| 14
1 file changed, 14 insertions, 0 deletions
diff --git a/src/intel/compiler/brw_ir_performance.cpp b/src/intel/compiler/brw_ir_performance.cpp
index 8d02db932ba..3c39594d121 100644
--- a/src/intel/compiler/brw_ir_performance.cpp
+++ b/src/intel/compiler/brw_ir_performance.cpp
@@ -1522,9 +1522,19 @@ namespace {
* difference is the worst-case scenario branch_weight used for
* SIMD32 which accounts for the possibility of a dynamically
* uniform branch becoming divergent in SIMD32.
+ *
+ * Note that we provide slightly more pessimistic weights on
+ * Gen12+ for SIMD32, since the effective warp size on that
+ * platform is 2x the SIMD width due to EU fusion, which increases
+ * the likelihood of divergent control flow in comparison to
+ * previous generations, giving narrower SIMD modes a performance
+ * advantage in several test-cases with non-uniform discard jumps.
*/
const float branch_weight = (dispatch_width > 16 ? 1.0 : 0.5);
+ const float discard_weight = (dispatch_width > 16 || s->devinfo->gen < 12 ?
+ 1.0 : 0.5);
const float loop_weight = 10;
+ unsigned discard_count = 0;
unsigned elapsed = 0;
state st;
@@ -1538,6 +1548,8 @@ namespace {
if (inst->opcode == BRW_OPCODE_ENDIF)
st.weight /= branch_weight;
+ else if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT && discard_count)
+ st.weight /= discard_weight;
elapsed += (st.unit_ready[unit_fe] - clock0) * st.weight;
@@ -1547,6 +1559,8 @@ namespace {
st.weight *= loop_weight;
else if (inst->opcode == BRW_OPCODE_WHILE)
st.weight /= loop_weight;
+ else if (inst->opcode == FS_OPCODE_DISCARD_JUMP && !discard_count++)
+ st.weight *= discard_weight;
}
p.block_latency[block->num] = elapsed - elapsed0;
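As a back-of-the-envelope illustration of the EU-fusion argument in the commit message (an assumed independence model, not data from the commit): if every pixel independently survives a discard with probability p, a warp stays convergent only when all of its channels agree, so doubling the effective warp size from 32 to 64 sharply reduces the chance of convergence.

#include <cmath>
#include <cstdio>

/* Probability that a warp of `w` channels stays convergent through a
 * non-uniform discard, assuming each channel independently survives
 * with probability `p` (all survive, or all are discarded). */
static double convergent(double p, int w)
{
   return std::pow(p, w) + std::pow(1.0 - p, w);
}

int main()
{
   const double p = 0.95; /* arbitrary per-pixel survival rate */
   std::printf("w=16: %.3f  w=32: %.3f  w=64: %.3f\n",
               convergent(p, 16), convergent(p, 32), convergent(p, 64));
   /* Prints roughly 0.440, 0.194, 0.038: with fused EUs (w = 64 in
    * SIMD32 mode) almost every warp pays the divergence cost. */
   return 0;
}

Under this toy model, the divergence penalty that is negligible at a 32-channel warp becomes near-certain at 64 channels, consistent with applying the extra discard weight on Gen12+ only.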