summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlyssa Rosenzweig <alyssa@collabora.com>2021-12-11 12:54:01 -0500
committerMarge Bot <emma+marge@anholt.net>2022-02-24 01:35:33 +0000
commit6b2eda6b729215ca61617943dc540a8c690f2f72 (patch)
tree48ffbcd6d17286098117cefa27fa4e08d25f0479
parent6eec8fcbfa6e90431d47919479debf76fb04b8b0 (diff)
pan/bi: Reorder pushed uniforms to avoid moves
On Bifrost and Valhall, push uniforms are loaded into Fast Access Uniform Random Access Memory (FAU-RAM). FAU-RAM is organized as an array of 64-bit slots. A given tuple (Bifrost) or instruction (Valhall) may access at most a single 64-bit slot. If an instruction requires uniforms from multiple 64-bit slots, a uniform-to-register move must be inserted to avoid the hazard. However, if an instruction requires a pair of 32-bit uniforms from the same 64-bit slot, no move is required. To reduce the number of moves we emit, this commit adds an optimization pass that reorders pushed uniforms, trying to group uniforms used by the same instruction. The pass works by creating a graph of pushed uniforms, where edges denote the "both 32-bit uniforms required by the same instruction" relationship. We perform depth-first search on this graph to find the connected components, where each connected component is a cluster of uniforms that are used together. We then select pairs of uniforms from each connected component. The remaining unpaired uniforms (from components of odd sizes) are paired together arbitrarily. In principle, we should weight the graph by number of occurences and choose pairs that maximize the total selected edge weight. This is left for future work, as it is nontrivial -- selecting these edges optimally appears to be NP-hard at first blush. Implementation note: As position and varying shaders share FAU on Bifrost, extra care is taken with a `push_offset` shader stage info parameter that ensures varying shaders do not reorder uniforms selected by the previous position shader. total instructions in shared programs: 2503343 -> 2451758 (-2.06%) instructions in affected programs: 1553309 -> 1501724 (-3.32%) helped: 14256 HURT: 8 helped stats (abs) min: 1.0 max: 80.0 x̄: 3.62 x̃: 3 helped stats (rel) min: 0.06% max: 36.36% x̄: 7.31% x̃: 6.67% HURT stats (abs) min: 1.0 max: 2.0 x̄: 1.38 x̃: 1 HURT stats (rel) min: 1.30% max: 12.50% x̄: 4.99% x̃: 3.85% 95% mean confidence interval for instructions value: -3.66 -3.58 95% mean confidence interval for instructions %-change: -7.41% -7.20% Instructions are helped. total tuples in shared programs: 2008399 -> 1969627 (-1.93%) tuples in affected programs: 1146344 -> 1107572 (-3.38%) helped: 12867 HURT: 147 helped stats (abs) min: 1.0 max: 61.0 x̄: 3.03 x̃: 2 helped stats (rel) min: 0.17% max: 42.86% x̄: 6.79% x̃: 4.65% HURT stats (abs) min: 1.0 max: 3.0 x̄: 1.20 x̃: 1 HURT stats (rel) min: 0.29% max: 20.00% x̄: 2.12% x̃: 1.19% 95% mean confidence interval for tuples value: -3.03 -2.93 95% mean confidence interval for tuples %-change: -6.82% -6.57% Tuples are helped. total clauses in shared programs: 408005 -> 401708 (-1.54%) clauses in affected programs: 90760 -> 84463 (-6.94%) helped: 6006 HURT: 164 helped stats (abs) min: 1.0 max: 9.0 x̄: 1.08 x̃: 1 helped stats (rel) min: 0.45% max: 33.33% x̄: 12.44% x̃: 14.29% HURT stats (abs) min: 1.0 max: 1.0 x̄: 1.00 x̃: 1 HURT stats (rel) min: 1.64% max: 25.00% x̄: 9.81% x̃: 5.26% 95% mean confidence interval for clauses value: -1.03 -1.01 95% mean confidence interval for clauses %-change: -12.03% -11.66% Clauses are helped. total cycles in shared programs: 203308.37 -> 202737.83 (-0.28%) cycles in affected programs: 19264.71 -> 18694.17 (-2.96%) helped: 3024 HURT: 41 helped stats (abs) min: 0.041665999999999315 max: 2.5416680000000014 x̄: 0.19 x̃: 0 helped stats (rel) min: 0.17% max: 33.33% x̄: 3.83% x̃: 2.83% HURT stats (abs) min: 0.041665999999999315 max: 0.125 x̄: 0.06 x̃: 0 HURT stats (rel) min: 0.30% max: 5.88% x̄: 1.41% x̃: 0.93% 95% mean confidence interval for cycles value: -0.19 -0.18 95% mean confidence interval for cycles %-change: -3.89% -3.64% Cycles are helped. total arith in shared programs: 76265.67 -> 74669.25 (-2.09%) arith in affected programs: 45001.50 -> 43405.08 (-3.55%) helped: 12945 HURT: 97 helped stats (abs) min: 0.041665999999999315 max: 2.5416680000000014 x̄: 0.12 x̃: 0 helped stats (rel) min: 0.17% max: 50.00% x̄: 8.06% x̃: 4.88% HURT stats (abs) min: 0.041665999999999315 max: 0.125 x̄: 0.05 x̃: 0 HURT stats (rel) min: 0.21% max: 33.33% x̄: 2.16% x̃: 0.96% 95% mean confidence interval for arith value: -0.12 -0.12 95% mean confidence interval for arith %-change: -8.16% -7.81% Arith are helped. total quadwords in shared programs: 1796563 -> 1766803 (-1.66%) quadwords in affected programs: 948830 -> 919070 (-3.14%) helped: 12078 HURT: 219 helped stats (abs) min: 1.0 max: 42.0 x̄: 2.49 x̃: 2 helped stats (rel) min: 0.10% max: 33.33% x̄: 5.57% x̃: 5.26% HURT stats (abs) min: 1.0 max: 4.0 x̄: 1.21 x̃: 1 HURT stats (rel) min: 0.33% max: 6.67% x̄: 2.00% x̃: 1.14% 95% mean confidence interval for quadwords value: -2.46 -2.38 95% mean confidence interval for quadwords %-change: -5.52% -5.36% Quadwords are helped. Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14163>
-rw-r--r--src/panfrost/bifrost/bi_opt_push_ubo.c166
-rw-r--r--src/panfrost/bifrost/bifrost_compile.c4
-rw-r--r--src/panfrost/bifrost/compiler.h2
3 files changed, 171 insertions, 1 deletions
diff --git a/src/panfrost/bifrost/bi_opt_push_ubo.c b/src/panfrost/bifrost/bi_opt_push_ubo.c
index 4024a10b114..75b8dbfe107 100644
--- a/src/panfrost/bifrost/bi_opt_push_ubo.c
+++ b/src/panfrost/bifrost/bi_opt_push_ubo.c
@@ -181,3 +181,169 @@ bi_opt_push_ubo(bi_context *ctx)
free(analysis.blocks);
}
+
+typedef struct {
+ BITSET_DECLARE(row, PAN_MAX_PUSH);
+} adjacency_row;
+
+/* Find the connected component containing `node` with depth-first search */
+static void
+bi_find_component(adjacency_row *adjacency, BITSET_WORD *visited,
+ unsigned *component, unsigned *size, unsigned node)
+{
+ unsigned neighbour;
+
+ BITSET_SET(visited, node);
+ component[(*size)++] = node;
+
+ BITSET_FOREACH_SET(neighbour, adjacency[node].row, PAN_MAX_PUSH) {
+ if (!BITSET_TEST(visited, neighbour)) {
+ bi_find_component(adjacency, visited, component, size,
+ neighbour);
+ }
+ }
+}
+
+static bool
+bi_is_uniform(bi_index idx)
+{
+ return (idx.type == BI_INDEX_FAU) && (idx.value & BIR_FAU_UNIFORM);
+}
+
+/* Get the index of a uniform in 32-bit words from the start of FAU-RAM */
+static unsigned
+bi_uniform_word(bi_index idx)
+{
+ assert(bi_is_uniform(idx));
+ assert(idx.offset <= 1);
+
+ return ((idx.value & ~BIR_FAU_UNIFORM) << 1) | idx.offset;
+}
+
+/*
+ * Create an undirected graph where nodes are 32-bit uniform indices and edges
+ * represent that two nodes are used in the same instruction.
+ *
+ * The graph is constructed as an adjacency matrix stored in adjacency.
+ */
+static void
+bi_create_fau_interference_graph(bi_context *ctx, adjacency_row *adjacency)
+{
+ bi_foreach_instr_global(ctx, I) {
+ unsigned nodes[BI_MAX_SRCS] = {};
+ unsigned node_count = 0;
+
+ /* Set nodes[] to 32-bit uniforms accessed */
+ bi_foreach_src(I, s) {
+ if (bi_is_uniform(I->src[s])) {
+ unsigned word = bi_uniform_word(I->src[s]);
+
+ if (word >= ctx->info.push_offset)
+ nodes[node_count++] = word;
+ }
+ }
+
+ /* Create clique connecting nodes[] */
+ for (unsigned i = 0; i < node_count; ++i) {
+ for (unsigned j = 0; j < node_count; ++j) {
+ if (i == j)
+ continue;
+
+ unsigned x = nodes[i], y = nodes[j];
+ assert(MAX2(x, y) < ctx->info.push->count);
+
+ /* Add undirected edge between the nodes */
+ BITSET_SET(adjacency[x].row, y);
+ BITSET_SET(adjacency[y].row, x);
+ }
+ }
+ }
+}
+
+/*
+ * Optimization pass to reorder uniforms. The goal is to reduce the number of
+ * moves we emit when lowering FAU. The pass groups uniforms used by the same
+ * instruction.
+ *
+ * The pass works by creating a graph of pushed uniforms, where edges denote the
+ * "both 32-bit uniforms required by the same instruction" relationship. We
+ * perform depth-first search on this graph to find the connected components,
+ * where each connected component is a cluster of uniforms that are used
+ * together. We then select pairs of uniforms from each connected component.
+ * The remaining unpaired uniforms (from components of odd sizes) are paired
+ * together arbitrarily.
+ *
+ * After a new ordering is selected, pushed uniforms in the program and the
+ * panfrost_ubo_push data structure must be remapped to use the new ordering.
+ */
+void
+bi_opt_reorder_push(bi_context *ctx)
+{
+ adjacency_row adjacency[PAN_MAX_PUSH] = { 0 };
+ BITSET_DECLARE(visited, PAN_MAX_PUSH) = { 0 };
+
+ unsigned ordering[PAN_MAX_PUSH] = { 0 };
+ unsigned unpaired[PAN_MAX_PUSH] = { 0 };
+ unsigned pushed = 0, unpaired_count = 0;
+
+ struct panfrost_ubo_push *push = ctx->info.push;
+ unsigned push_offset = ctx->info.push_offset;
+
+ bi_create_fau_interference_graph(ctx, adjacency);
+
+ for (unsigned i = push_offset; i < push->count; ++i) {
+ if (BITSET_TEST(visited, i)) continue;
+
+ unsigned component[PAN_MAX_PUSH] = { 0 };
+ unsigned size = 0;
+ bi_find_component(adjacency, visited, component, &size, i);
+
+ /* If there is an odd number of uses, at least one use must be
+ * unpaired. Arbitrarily take the last one.
+ */
+ if (size % 2)
+ unpaired[unpaired_count++] = component[--size];
+
+ /* The rest of uses are paired */
+ assert((size % 2) == 0);
+
+ /* Push the paired uses */
+ memcpy(ordering + pushed, component, sizeof(unsigned) * size);
+ pushed += size;
+ }
+
+ /* Push unpaired nodes at the end */
+ memcpy(ordering + pushed, unpaired, sizeof(unsigned) * unpaired_count);
+ pushed += unpaired_count;
+
+ /* Ordering is a permutation. Invert it for O(1) lookup. */
+ unsigned old_to_new[PAN_MAX_PUSH] = { 0 };
+
+ for (unsigned i = 0; i < push_offset; ++i) {
+ old_to_new[i] = i;
+ }
+
+ for (unsigned i = 0; i < pushed; ++i) {
+ assert(ordering[i] >= push_offset);
+ old_to_new[ordering[i]] = push_offset + i;
+ }
+
+ /* Use new ordering throughout the program */
+ bi_foreach_instr_global(ctx, I) {
+ bi_foreach_src(I, s) {
+ if (bi_is_uniform(I->src[s])) {
+ unsigned node = bi_uniform_word(I->src[s]);
+ unsigned new_node = old_to_new[node];
+ I->src[s].value = BIR_FAU_UNIFORM | (new_node >> 1);
+ I->src[s].offset = new_node & 1;
+ }
+ }
+ }
+
+ /* Use new ordering for push */
+ struct panfrost_ubo_push old = *push;
+ for (unsigned i = 0; i < pushed; ++i)
+ push->words[push_offset + i] = old.words[ordering[i]];
+
+ push->count = push_offset + pushed;
+}
diff --git a/src/panfrost/bifrost/bifrost_compile.c b/src/panfrost/bifrost/bifrost_compile.c
index 42cb08248a5..64f2e3fe4d3 100644
--- a/src/panfrost/bifrost/bifrost_compile.c
+++ b/src/panfrost/bifrost/bifrost_compile.c
@@ -3981,6 +3981,7 @@ bi_compile_variant_nir(nir_shader *nir,
bi_opt_dead_code_eliminate(ctx);
bi_opt_cse(ctx);
bi_opt_dead_code_eliminate(ctx);
+ bi_opt_reorder_push(ctx);
bi_validate(ctx, "Optimization passes");
}
@@ -4060,7 +4061,8 @@ bi_compile_variant(nir_shader *nir,
.push = &info->push,
.bifrost = &info->bifrost,
.tls_size = info->tls_size,
- .sysvals = &info->sysvals
+ .sysvals = &info->sysvals,
+ .push_offset = info->push.count
};
unsigned offset = binary->size;
diff --git a/src/panfrost/bifrost/compiler.h b/src/panfrost/bifrost/compiler.h
index 86a8548c3dc..cc37c661c2f 100644
--- a/src/panfrost/bifrost/compiler.h
+++ b/src/panfrost/bifrost/compiler.h
@@ -682,6 +682,7 @@ struct bi_shader_info {
struct panfrost_sysvals *sysvals;
unsigned tls_size;
unsigned work_reg_count;
+ unsigned push_offset;
};
/* State of index-driven vertex shading for current shader */
@@ -986,6 +987,7 @@ void bi_opt_dead_code_eliminate(bi_context *ctx);
void bi_opt_fuse_dual_texture(bi_context *ctx);
void bi_opt_dce_post_ra(bi_context *ctx);
void bi_opt_push_ubo(bi_context *ctx);
+void bi_opt_reorder_push(bi_context *ctx);
void bi_lower_swizzle(bi_context *ctx);
void bi_lower_fau(bi_context *ctx);
void bi_assign_scoreboard(bi_context *ctx);