author     Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>    2019-07-19 13:21:11 -0700
committer  Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>    2019-07-22 08:20:34 -0700
commit     21510c253ca8f381fb39d365eb0770b47a44add0 (patch)
tree       1075f3e6b63bb6f005c8fa76fac4e1b6fe020202 /src
parent     533d65786f43f23306c9e65d1d7022cf506a143f (diff)
panfrost/midgard: Implement register spilling
Now that we run RA in a loop, before each iteration after a failed
allocation we choose a spill node and spill it to Thread Local Storage
using st_int4/ld_int4 instructions (for spills and fills respectively).

This allows us to compile complex shaders that normally would not fit
within the 16 work register limit, although it comes at a fairly steep
performance penalty.

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
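The change boils down to running register allocation in a retry loop: when
allocation fails, pick a spill node, hand it a Thread Local Storage slot,
rewrite its writes and reads into st_int4/ld_int4 accesses, and run RA again.
Below is a rough, self-contained C sketch of that loop shape only; the toy_*
helper and the pressure model are purely illustrative stand-ins, not the
driver's API, and the real logic lives in schedule_program() in the diff
that follows.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* Toy stand-in for allocate_registers(): pretend allocation succeeds
 * only once register pressure fits within the 16 work registers. */
static bool
toy_allocate_registers(unsigned live_values, bool *spilled)
{
        *spilled = live_values > 16;
        return !*spilled;
}

int
main(void)
{
        unsigned live_values = 24;   /* hypothetical register pressure */
        int iter_count = 1000;       /* max RA attempts, as in the patch */
        unsigned spill_count = 0;    /* 128-bit TLS slots handed out */
        bool spilled = false;

        do {
                if (spilled) {
                        /* In the real pass: pick the best spill node,
                         * reserve a TLS slot, and rewrite its defs into
                         * st_int4 stores and its uses into ld_int4 loads.
                         * Here we only model the reduced pressure. */
                        unsigned spill_slot = spill_count++;
                        (void) spill_slot;
                        live_values--;
                }

                toy_allocate_registers(live_values, &spilled);
        } while (spilled && (iter_count--) > 0);

        assert(iter_count > 0);
        printf("allocated after spilling %u value(s) to TLS\n", spill_count);
        return 0;
}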
Diffstat (limited to 'src')
-rw-r--r--  src/gallium/drivers/panfrost/pan_context.c  |   3
-rw-r--r--  src/panfrost/midgard/compiler.h             |   3
-rw-r--r--  src/panfrost/midgard/midgard_ra.c           |  47
-rw-r--r--  src/panfrost/midgard/midgard_schedule.c     | 159
4 files changed, 158 insertions(+), 54 deletions(-)
diff --git a/src/gallium/drivers/panfrost/pan_context.c b/src/gallium/drivers/panfrost/pan_context.c
index 26e7fca1d2f..d20f0185a42 100644
--- a/src/gallium/drivers/panfrost/pan_context.c
+++ b/src/gallium/drivers/panfrost/pan_context.c
@@ -158,6 +158,7 @@ panfrost_emit_mfbd(struct panfrost_context *ctx, unsigned vertex_count)
unsigned height = ctx->pipe_framebuffer.height;
struct bifrost_framebuffer framebuffer = {
+ .unk0 = 0x1e5, /* 1e4 if no spill */
.width1 = MALI_POSITIVE(width),
.height1 = MALI_POSITIVE(height),
.width2 = MALI_POSITIVE(width),
@@ -2663,7 +2664,7 @@ panfrost_setup_hardware(struct panfrost_context *ctx)
struct pipe_context *gallium = (struct pipe_context *) ctx;
struct panfrost_screen *screen = pan_screen(gallium->screen);
- panfrost_drm_allocate_slab(screen, &ctx->scratchpad, 64, false, 0, 0, 0);
+ panfrost_drm_allocate_slab(screen, &ctx->scratchpad, 64*4, false, 0, 0, 0);
panfrost_drm_allocate_slab(screen, &ctx->shaders, 4096, true, PAN_ALLOCATE_EXECUTE, 0, 0);
panfrost_drm_allocate_slab(screen, &ctx->tiler_heap, 4096, false, PAN_ALLOCATE_INVISIBLE | PAN_ALLOCATE_GROWABLE, 1, 128);
panfrost_drm_allocate_slab(screen, &ctx->tiler_polygon_list, 128*128, false, PAN_ALLOCATE_INVISIBLE | PAN_ALLOCATE_GROWABLE, 1, 128);
diff --git a/src/panfrost/midgard/compiler.h b/src/panfrost/midgard/compiler.h
index 3002a079dea..91ca185d628 100644
--- a/src/panfrost/midgard/compiler.h
+++ b/src/panfrost/midgard/compiler.h
@@ -429,9 +429,6 @@ mir_has_arg(midgard_instruction *ins, unsigned arg)
return false;
}
-midgard_instruction
-v_load_store_scratch(unsigned srcdest, unsigned index, bool is_store);
-
/* Scheduling */
void schedule_program(compiler_context *ctx);
diff --git a/src/panfrost/midgard/midgard_ra.c b/src/panfrost/midgard/midgard_ra.c
index dcae8183513..fdd222b88a1 100644
--- a/src/panfrost/midgard/midgard_ra.c
+++ b/src/panfrost/midgard/midgard_ra.c
@@ -113,31 +113,6 @@ compose_swizzle(unsigned swizzle, unsigned mask,
return shifted;
}
-/* When we're 'squeezing down' the values in the IR, we maintain a hash
- * as such */
-
-static unsigned
-find_or_allocate_temp(compiler_context *ctx, unsigned hash)
-{
- if ((hash < 0) || (hash >= SSA_FIXED_MINIMUM))
- return hash;
-
- unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(
- ctx->hash_to_temp, hash + 1);
-
- if (temp)
- return temp - 1;
-
- /* If no temp is found, allocate one */
- temp = ctx->temp_count++;
- ctx->max_hash = MAX2(ctx->max_hash, hash);
-
- _mesa_hash_table_u64_insert(ctx->hash_to_temp,
- hash + 1, (void *) ((uintptr_t) temp + 1));
-
- return temp;
-}
-
/* Helper to return the default phys_reg for a given register */
static struct phys_reg
@@ -242,21 +217,7 @@ allocate_registers(compiler_context *ctx, bool *spilled)
/* We're done setting up */
ra_set_finalize(regs, NULL);
- /* Transform the MIR into squeezed index form */
- mir_foreach_block(ctx, block) {
- mir_foreach_instr_in_block(block, ins) {
- if (ins->compact_branch) continue;
-
- ins->ssa_args.dest = find_or_allocate_temp(ctx, ins->ssa_args.dest);
- ins->ssa_args.src0 = find_or_allocate_temp(ctx, ins->ssa_args.src0);
-
- if (!ins->ssa_args.inline_constant)
- ins->ssa_args.src1 = find_or_allocate_temp(ctx, ins->ssa_args.src1);
-
- }
- }
-
- /* No register allocation to do with no SSA */
+ /* No register allocation to do with no SSA */
if (!ctx->temp_count)
return NULL;
@@ -381,9 +342,13 @@ allocate_registers(compiler_context *ctx, bool *spilled)
if (!ra_allocate(g)) {
*spilled = true;
- return NULL;
+ } else {
+ *spilled = false;
}
+ /* Whether we were successful or not, report the graph so we can
+ * compute spill nodes */
+
return g;
}
diff --git a/src/panfrost/midgard/midgard_schedule.c b/src/panfrost/midgard/midgard_schedule.c
index db87ab65f7f..5c03c53023a 100644
--- a/src/panfrost/midgard/midgard_schedule.c
+++ b/src/panfrost/midgard/midgard_schedule.c
@@ -24,6 +24,7 @@
#include "compiler.h"
#include "midgard_ops.h"
#include "util/u_memory.h"
+#include "util/register_allocate.h"
/* Create a mask of accessed components from a swizzle to figure out vector
* dependencies */
@@ -575,15 +576,66 @@ midgard_pair_load_store(compiler_context *ctx, midgard_block *block)
}
}
-midgard_instruction
-v_load_store_scratch(unsigned srcdest, unsigned index, bool is_store)
+/* When we're 'squeezing down' the values in the IR, we maintain a hash
+ * as such */
+
+static unsigned
+find_or_allocate_temp(compiler_context *ctx, unsigned hash)
+{
+ if ((hash < 0) || (hash >= SSA_FIXED_MINIMUM))
+ return hash;
+
+ unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(
+ ctx->hash_to_temp, hash + 1);
+
+ if (temp)
+ return temp - 1;
+
+ /* If no temp is found, allocate one */
+ temp = ctx->temp_count++;
+ ctx->max_hash = MAX2(ctx->max_hash, hash);
+
+ _mesa_hash_table_u64_insert(ctx->hash_to_temp,
+ hash + 1, (void *) ((uintptr_t) temp + 1));
+
+ return temp;
+}
+
+/* Reassigns numbering to get rid of gaps in the indices */
+
+static void
+mir_squeeze_index(compiler_context *ctx)
+{
+ /* Reset */
+ ctx->temp_count = 0;
+ /* TODO don't leak old hash_to_temp */
+ ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL);
+
+ mir_foreach_instr_global(ctx, ins) {
+ if (ins->compact_branch) continue;
+
+ ins->ssa_args.dest = find_or_allocate_temp(ctx, ins->ssa_args.dest);
+ ins->ssa_args.src0 = find_or_allocate_temp(ctx, ins->ssa_args.src0);
+
+ if (!ins->ssa_args.inline_constant)
+ ins->ssa_args.src1 = find_or_allocate_temp(ctx, ins->ssa_args.src1);
+
+ }
+}
+
+static midgard_instruction
+v_load_store_scratch(
+ unsigned srcdest,
+ unsigned index,
+ bool is_store,
+ unsigned mask)
{
/* We index by 32-bit vec4s */
unsigned byte = (index * 4 * 4);
midgard_instruction ins = {
.type = TAG_LOAD_STORE_4,
- .mask = 0xF,
+ .mask = mask,
.ssa_args = {
.dest = -1,
.src0 = -1,
@@ -602,10 +654,10 @@ v_load_store_scratch(unsigned srcdest, unsigned index, bool is_store)
}
};
- if (is_store) {
+ if (is_store) {
/* r0 = r26, r1 = r27 */
- assert(srcdest == 26 || srcdest == 27);
- ins.ssa_args.src0 = SSA_FIXED_REGISTER(srcdest - 26);
+ assert(srcdest == SSA_FIXED_REGISTER(26) || srcdest == SSA_FIXED_REGISTER(27));
+ ins.ssa_args.src0 = (srcdest == SSA_FIXED_REGISTER(27)) ? SSA_FIXED_REGISTER(1) : SSA_FIXED_REGISTER(0);
} else {
ins.ssa_args.dest = srcdest;
}
@@ -618,7 +670,10 @@ schedule_program(compiler_context *ctx)
{
struct ra_graph *g = NULL;
bool spilled = false;
- int iter_count = 10; /* max iterations */
+ int iter_count = 1000; /* max iterations */
+
+ /* Number of 128-bit slots in memory we've spilled into */
+ unsigned spill_count = 0;
midgard_promote_uniforms(ctx, 8);
@@ -627,18 +682,104 @@ schedule_program(compiler_context *ctx)
}
do {
+ /* If we spill, find the best spill node and spill it */
+
+ unsigned spill_index = ctx->temp_count;
+ if (g && spilled) {
+ /* All nodes are equal in spill cost, but we can't
+ * spill nodes written to from an unspill */
+
+ for (unsigned i = 0; i < ctx->temp_count; ++i) {
+ ra_set_node_spill_cost(g, i, 1.0);
+ }
+
+ mir_foreach_instr_global(ctx, ins) {
+ if (ins->type != TAG_LOAD_STORE_4) continue;
+ if (ins->load_store.op != midgard_op_ld_int4) continue;
+ if (ins->load_store.unknown != 0x1EEA) continue;
+ ra_set_node_spill_cost(g, ins->ssa_args.dest, -1.0);
+ }
+
+ int spill_node = ra_get_best_spill_node(g);
+
+ if (spill_node < 0)
+ assert(0);
+
+ /* Allocate TLS slot */
+ unsigned spill_slot = spill_count++;
+
+ /* Replace all stores to the spilled node with stores
+ * to TLS */
+
+ mir_foreach_instr_global_safe(ctx, ins) {
+ if (ins->compact_branch) continue;
+ if (ins->ssa_args.dest != spill_node) continue;
+ ins->ssa_args.dest = SSA_FIXED_REGISTER(26);
+
+ midgard_instruction st = v_load_store_scratch(ins->ssa_args.dest, spill_slot, true, ins->mask);
+ mir_insert_instruction_before(mir_next_op(ins), st);
+ }
+
+ /* Insert a load from TLS before the first consecutive
+ * use of the node, rewriting to use spilled indices to
+ * break up the live range */
+
+ mir_foreach_block(ctx, block) {
+
+ bool consecutive_skip = false;
+ unsigned consecutive_index = 0;
+
+ mir_foreach_instr_in_block(block, ins) {
+ if (ins->compact_branch) continue;
+
+ if (!mir_has_arg(ins, spill_node)) {
+ consecutive_skip = false;
+ continue;
+ }
+
+ if (consecutive_skip) {
+ /* Rewrite */
+ mir_rewrite_index_src_single(ins, spill_node, consecutive_index);
+ continue;
+ }
+
+ consecutive_index = ++spill_index;
+ midgard_instruction st = v_load_store_scratch(consecutive_index, spill_slot, false, 0xF);
+ midgard_instruction *before = ins;
+
+ /* For a csel, go back one more not to break up the bundle */
+ if (ins->type == TAG_ALU_4 && OP_IS_CSEL(ins->alu.op))
+ before = mir_prev_op(before);
+
+ mir_insert_instruction_before(before, st);
+ // consecutive_skip = true;
+
+
+ /* Rewrite to use */
+ mir_rewrite_index_src_single(ins, spill_node, consecutive_index);
+ }
+ }
+ }
+
+ mir_squeeze_index(ctx);
+
+ g = NULL;
+ g = allocate_registers(ctx, &spilled);
+ } while(spilled && ((iter_count--) > 0));
+
/* We would like to run RA after scheduling, but spilling can
* complicate this */
mir_foreach_block(ctx, block) {
schedule_block(ctx, block);
}
+#if 0
/* Pipeline registers creation is a prepass before RA */
mir_create_pipeline_registers(ctx);
+#endif
+
- g = allocate_registers(ctx, &spilled);
- } while(spilled && ((iter_count--) > 0));
if (iter_count <= 0) {
fprintf(stderr, "panfrost: Gave up allocating registers, rendering will be incomplete\n");