asahi: Add batch tracking logic

We already have the notion of an agx_batch, which encapsulates a render pass. Extend the logic to allow multiple in-flight batches per context, avoiding a flush in set_framebuffer_state and improving performance for certain applications designed for immediate-mode renderers (IMRs) that ping-pong unnecessarily between framebuffer objects (FBOs). I don't have such an application immediately in mind, but I wanted to get this flag-day out of the way while the driver is still small and flexible.

The driver was written from day 1 with batch tracking in mind, so this is a relatively small change to actually wire it up, but there are lots of little details to get right.

The code itself is mostly a copy/paste of panfrost, which in turn draws inspiration from freedreno and v3d.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19865>
author: Alyssa Rosenzweig <alyssa@rosenzweig.io> 2022-11-17 18:10:11 -0500
committer: Marge Bot <emma+marge@anholt.net> 2022-11-19 15:33:16 +0000
commit: d7511ad784ce0fc6a5060de2d7c969a1300a0fb9 (patch)
tree: a40783e954b8377e9a9de35f51f023683f219083
parent: de1eb9400f1d0d64627630dd28d11073ad1c1a7e (diff)
4 files changed, 315 insertions, 138 deletions
diff --git a/src/gallium/drivers/asahi/agx_batch.c b/src/gallium/drivers/asahi/agx_batch.c
index 549f928c21b..b1aa11ac358 100644
--- a/src/gallium/drivers/asahi/agx_batch.c
+++ b/src/gallium/drivers/asahi/agx_batch.c
@@ -1,16 +1,202 @@
 /*
  * Copyright 2022 Alyssa Rosenzweig
+ * Copyright 2019-2020 Collabora, Ltd.
  * SPDX-License-Identifier: MIT
  */
 
 #include "agx_state.h"
 
+#define foreach_batch(ctx, idx) \
+        BITSET_FOREACH_SET(idx, ctx->batches.active, AGX_MAX_BATCHES)
+
+static unsigned
+agx_batch_idx(struct agx_batch *batch)
+{
+   return batch - batch->ctx->batches.slots;
+}
+
+bool
+agx_batch_is_active(struct agx_batch *batch)
+{
+   return BITSET_TEST(batch->ctx->batches.active, agx_batch_idx(batch));
+}
+
+static void
+agx_batch_init(struct agx_context *ctx,
+               const struct pipe_framebuffer_state *key,
+               struct agx_batch *batch)
+{
+   struct agx_device *dev = agx_device(ctx->base.screen);
+
+   batch->ctx = ctx;
+   util_copy_framebuffer_state(&batch->key, key);
+   batch->seqnum = ++ctx->batches.seqnum;
+
+   agx_pool_init(&batch->pool, dev, AGX_MEMORY_TYPE_FRAMEBUFFER, true);
+   agx_pool_init(&batch->pipeline_pool, dev, AGX_MEMORY_TYPE_SHADER, true);
+
+   /* These allocations happen only once per batch slot. They are not freed at
+    * batch cleanup; they are reset here on reuse. Memory is owned by the context.
+    */
+   if (!batch->bo_list.set) {
+      batch->bo_list.set = rzalloc_array(ctx, BITSET_WORD, 128);
+      batch->bo_list.word_count = 128;
+   } else {
+      memset(batch->bo_list.set, 0, batch->bo_list.word_count * sizeof(BITSET_WORD));
+   }
+
+   if (!batch->encoder) {
+      batch->encoder = agx_bo_create(dev, 0x80000, AGX_MEMORY_TYPE_FRAMEBUFFER);
+      batch->encoder_current = batch->encoder->ptr.cpu;
+      batch->encoder_end = batch->encoder_current + batch->encoder->size;
+   } else {
+      batch->encoder_current = batch->encoder->ptr.cpu;
+      batch->encoder_end = batch->encoder_current + batch->encoder->size;
+   }
+
+   if (!batch->scissor.bo) {
+      batch->scissor.bo = agx_bo_create(dev, 0x80000, AGX_MEMORY_TYPE_FRAMEBUFFER);
+   }
+
+   if (!batch->depth_bias.bo) {
+      batch->depth_bias.bo = agx_bo_create(dev, 0x80000, AGX_MEMORY_TYPE_FRAMEBUFFER);
+   }
+
+   batch->clear = 0;
+   batch->draw = 0;
+   batch->load = 0;
+   batch->clear_depth = 0;
+   batch->clear_stencil = 0;
+   batch->scissor.count = 0;
+   batch->depth_bias.count = 0;
+   batch->varyings = 0;
+
+   /* We need to emit prim state at the start. Max collides with all. */
+   batch->reduced_prim = PIPE_PRIM_MAX;
+
+   if (batch->key.zsbuf) {
+      agx_batch_writes(batch, agx_resource(key->zsbuf->texture));
+   }
+
+   for (unsigned i = 0; i < key->nr_cbufs; ++i) {
+      agx_batch_writes(batch, agx_resource(key->cbufs[i]->texture));
+   }
+
+   unsigned batch_idx = agx_batch_idx(batch);
+   BITSET_SET(ctx->batches.active, batch_idx);
+
+   agx_batch_init_state(batch);
+}
+
 void
-agx_flush_readers(struct agx_context *ctx, struct agx_resource *rsrc, const char *reason)
+agx_batch_cleanup(struct agx_context *ctx, struct agx_batch *batch)
+{
+   struct agx_device *dev = agx_device(ctx->base.screen);
+   assert(batch->ctx == ctx);
+
+   if (ctx->batch == batch)
+      ctx->batch = NULL;
+
+   /* There is no more writer for anything we wrote recorded on this context */
+   hash_table_foreach(ctx->writer, ent) {
+      if (ent->data == batch)
+         _mesa_hash_table_remove(ctx->writer, ent);
+   }
+
+   int handle;
+   AGX_BATCH_FOREACH_BO_HANDLE(batch, handle) {
+      agx_bo_unreference(agx_lookup_bo(dev, handle));
+   }
+
+   agx_pool_cleanup(&batch->pool);
+   agx_pool_cleanup(&batch->pipeline_pool);
+   util_unreference_framebuffer_state(&batch->key);
+
+   unsigned batch_idx = agx_batch_idx(batch);
+   BITSET_CLEAR(ctx->batches.active, batch_idx);
+}
+
+static struct agx_batch *
+agx_get_batch_for_framebuffer(struct agx_context *ctx,
+                              const struct pipe_framebuffer_state *state)
 {
-   /* TODO: Turn into loop when we support multiple batches */
-   if (ctx->batch) {
-      struct agx_batch *batch = ctx->batch;
+   /* Look if we have a matching batch */
+   unsigned i;
+   foreach_batch(ctx, i) {
+      struct agx_batch *candidate = &ctx->batches.slots[i];
+
+      if (util_framebuffer_state_equal(&candidate->key, state)) {
+         /* We found a match, increase the seqnum for the LRU
+          * eviction logic.
+          */
+         candidate->seqnum = ++ctx->batches.seqnum;
+         return candidate;
+      }
+   }
+
+   /* Look if we have a free batch */
+   struct agx_batch *batch = NULL;
+   for (unsigned i = 0; i < AGX_MAX_BATCHES; ++i) {
+      if (!BITSET_TEST(ctx->batches.active, i)) {
+         batch = &ctx->batches.slots[i];
+         break;
+      }
+   }
+
+   /* Else, evict something */
+   if (!batch) {
+      for (unsigned i = 0; i < AGX_MAX_BATCHES; ++i) {
+         struct agx_batch *candidate = &ctx->batches.slots[i];
+
+         if (!batch || batch->seqnum > candidate->seqnum)
+            batch = candidate;
+      }
+
+      agx_flush_batch(ctx, batch);
+   }
+
+   /* Batch is now free */
+   agx_batch_init(ctx, state, batch);
+   return batch;
+}
+
+struct agx_batch *
+agx_get_batch(struct agx_context *ctx)
+{
+   if (!ctx->batch) {
+      ctx->batch = agx_get_batch_for_framebuffer(ctx, &ctx->framebuffer);
+      agx_dirty_all(ctx);
+   }
+
+   assert(util_framebuffer_state_equal(&ctx->framebuffer, &ctx->batch->key));
+   return ctx->batch;
+}
+
+void
+agx_flush_all(struct agx_context *ctx, const char *reason)
+{
+   if (reason)
+      perf_debug_ctx(ctx, "Flushing due to: %s\n", reason);
+
+   unsigned idx;
+   foreach_batch(ctx, idx) {
+      agx_flush_batch(ctx, &ctx->batches.slots[idx]);
+   }
+}
+
+static void
+agx_flush_readers_except(struct agx_context *ctx,
+                         struct agx_resource *rsrc,
+                         struct agx_batch *except,
+                         const char *reason)
+{
+   unsigned idx;
+
+   foreach_batch(ctx, idx) {
+      struct agx_batch *batch = &ctx->batches.slots[idx];
+
+      if (batch == except)
+         continue;
 
       if (agx_batch_uses_bo(batch, rsrc->bo)) {
          perf_debug_ctx(ctx, "Flush reader due to: %s\n", reason);
@@ -19,20 +205,38 @@ agx_flush_readers(struct agx_context *ctx, struct agx_resource *rsrc, const char
    }
 }
 
-void
-agx_flush_writer(struct agx_context *ctx, struct agx_resource *rsrc, const char *reason)
+static void
+agx_flush_writer_except(struct agx_context *ctx,
+                        struct agx_resource *rsrc,
+                        struct agx_batch *except,
+                        const char *reason)
 {
    struct hash_entry *ent = _mesa_hash_table_search(ctx->writer, rsrc);
 
-   if (ent) {
+   if (ent && ent->data != except) {
       perf_debug_ctx(ctx, "Flush writer due to: %s\n", reason);
       agx_flush_batch(ctx, ent->data);
    }
 }
 
 void
+agx_flush_readers(struct agx_context *ctx, struct agx_resource *rsrc, const char *reason)
+{
+   agx_flush_readers_except(ctx, rsrc, NULL, reason);
+}
+
+void
+agx_flush_writer(struct agx_context *ctx, struct agx_resource *rsrc, const char *reason)
+{
+   agx_flush_writer_except(ctx, rsrc, NULL, reason);
+}
+
+void
 agx_batch_reads(struct agx_batch *batch, struct agx_resource *rsrc)
 {
+   /* Hazard: read-after-write */
+   agx_flush_writer_except(batch->ctx, rsrc, batch, "Read from another batch");
+
    agx_batch_add_bo(batch, rsrc->bo);
 
    if (rsrc->separate_stencil)
@@ -45,12 +249,15 @@ agx_batch_writes(struct agx_batch *batch, struct agx_resource *rsrc)
    struct agx_context *ctx = batch->ctx;
    struct hash_entry *ent = _mesa_hash_table_search(ctx->writer, rsrc);
 
+   agx_flush_readers_except(ctx, rsrc, batch, "Write from other batch");
+
    /* Nothing to do if we're already writing */
    if (ent && ent->data == batch)
       return;
 
-   /* Flush the old writer if there is one */
-   agx_flush_writer(ctx, rsrc, "Multiple writers");
+   /* Hazard: write-after-write, write-after-read */
+   if (ent)
+      agx_flush_writer(ctx, rsrc, "Multiple writers");
 
    /* Write is strictly stronger than a read */
    agx_batch_reads(batch, rsrc);
diff --git a/src/gallium/drivers/asahi/agx_pipe.c b/src/gallium/drivers/asahi/agx_pipe.c
index baea9e3d63e..cacc5bdba4c 100644
--- a/src/gallium/drivers/asahi/agx_pipe.c
+++ b/src/gallium/drivers/asahi/agx_pipe.c
@@ -659,7 +659,7 @@ agx_clear(struct pipe_context *pctx, unsigned buffers, const struct pipe_scissor
           const union pipe_color_union *color, double depth, unsigned stencil)
 {
    struct agx_context *ctx = agx_context(pctx);
-   struct agx_batch *batch = ctx->batch;
+   struct agx_batch *batch = agx_get_batch(ctx);
 
    unsigned fastclear = buffers & ~(batch->draw | batch->load);
    unsigned slowclear = buffers & ~fastclear;
@@ -690,11 +690,11 @@ agx_clear(struct pipe_context *pctx, unsigned buffers, const struct pipe_scissor
    assert((batch->draw & slowclear) == slowclear);
 }
 
-
 static void
 agx_flush_resource(struct pipe_context *ctx,
                    struct pipe_resource *resource)
 {
+   agx_flush_writer(agx_context(ctx), agx_resource(resource), "flush_resource");
 }
 
 /*
@@ -710,7 +710,7 @@ agx_flush(struct pipe_context *pctx,
    if (fence)
       *fence = NULL;
 
-   agx_flush_batch(ctx, ctx->batch);
+   agx_flush_all(ctx, "Gallium flush");
 }
 
 void
@@ -718,9 +718,13 @@ agx_flush_batch(struct agx_context *ctx, struct agx_batch *batch)
 {
    struct agx_device *dev = agx_device(ctx->base.screen);
 
+   assert(agx_batch_is_active(batch));
+
    /* Nothing to do */
-   if (!(batch->draw | batch->clear))
+   if (!(batch->draw | batch->clear)) {
+      agx_batch_cleanup(ctx, batch);
       return;
+   }
 
    /* Finalize the encoder */
    uint8_t stop[5 + 64] = { 0x00, 0x00, 0x00, 0xc0, 0x00 };
@@ -761,7 +765,7 @@ agx_flush_batch(struct agx_context *ctx, struct agx_batch *batch)
       pipeline_store =
          agx_build_store_pipeline(batch,
                                   dev->internal.store,
-                                  agx_pool_upload(&batch->pool, ctx->render_target[0], sizeof(ctx->render_target)));
+                                  agx_batch_upload_pbe(batch, 0));
    }
 
    for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) {
@@ -851,37 +855,7 @@ agx_flush_batch(struct agx_context *ctx, struct agx_batch *batch)
       agxdecode_next_frame();
    }
 
-   AGX_BATCH_FOREACH_BO_HANDLE(batch, handle) {
-      agx_bo_unreference(agx_lookup_bo(dev, handle));
-   }
-
-   /* There is no more writer for anything we wrote recorded on this context */
-   hash_table_foreach(ctx->writer, ent) {
-      if (ent->data == batch)
-         _mesa_hash_table_remove(ctx->writer, ent);
-   }
-
-   memset(batch->bo_list.set, 0, batch->bo_list.word_count * sizeof(BITSET_WORD));
-   agx_pool_cleanup(&batch->pool);
-   agx_pool_cleanup(&batch->pipeline_pool);
-   agx_pool_init(&batch->pool, dev, AGX_MEMORY_TYPE_FRAMEBUFFER, true);
-   agx_pool_init(&batch->pipeline_pool, dev, AGX_MEMORY_TYPE_CMDBUF_32, true);
-   batch->clear = 0;
-   batch->draw = 0;
-   batch->load = 0;
-   batch->encoder_current = batch->encoder->ptr.cpu;
-   batch->encoder_end = batch->encoder_current + batch->encoder->size;
-   batch->scissor.count = 0;
-
-   agx_dirty_all(ctx);
-   agx_batch_init_state(batch);
-
-   /* After resetting the batch, rebind the framebuffer so we update resource
-    * tracking logic and the BO lists.
-    *
-    * XXX: This is a hack to workaround lack of proper batch tracking.
-    */
-   ctx->base.set_framebuffer_state(&ctx->base, &ctx->framebuffer);
+   agx_batch_cleanup(ctx, batch);
 }
 
 static void
@@ -919,20 +893,6 @@ agx_create_context(struct pipe_screen *screen,
    pctx->screen = screen;
    pctx->priv = priv;
 
-   ctx->batch = rzalloc(ctx, struct agx_batch);
-   ctx->batch->ctx = ctx;
-   ctx->batch->bo_list.set = rzalloc_array(ctx->batch, BITSET_WORD, 128);
-   ctx->batch->bo_list.word_count = 128;
-   agx_pool_init(&ctx->batch->pool,
-                 agx_device(screen), AGX_MEMORY_TYPE_FRAMEBUFFER, true);
-   agx_pool_init(&ctx->batch->pipeline_pool,
-                 agx_device(screen), AGX_MEMORY_TYPE_SHADER, true);
-   ctx->batch->encoder = agx_bo_create(agx_device(screen), 0x80000, AGX_MEMORY_TYPE_FRAMEBUFFER);
-   ctx->batch->encoder_current = ctx->batch->encoder->ptr.cpu;
-   ctx->batch->encoder_end = ctx->batch->encoder_current + ctx->batch->encoder->size;
-   ctx->batch->scissor.bo = agx_bo_create(agx_device(screen), 0x80000, AGX_MEMORY_TYPE_FRAMEBUFFER);
-   ctx->batch->depth_bias.bo = agx_bo_create(agx_device(screen), 0x80000, AGX_MEMORY_TYPE_FRAMEBUFFER);
-
    ctx->writer = _mesa_pointer_hash_table_create(ctx);
 
    /* Upload fixed shaders (TODO: compile them?) */
diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c
index 2c4b3a74cd4..f5c0cdbb559 100644
--- a/src/gallium/drivers/asahi/agx_state.c
+++ b/src/gallium/drivers/asahi/agx_state.c
@@ -781,63 +781,61 @@ agx_set_framebuffer_state(struct pipe_context *pctx,
    if (!state)
       return;
 
-   /* XXX: eliminate this flush with batch tracking logic */
-   agx_flush_all(ctx, "Framebuffer switch");
-
    util_copy_framebuffer_state(&ctx->framebuffer, state);
-   util_copy_framebuffer_state(&ctx->batch->key, state);
-   ctx->dirty = ~0;
-
-   if (state->zsbuf)
-      agx_batch_writes(ctx->batch, agx_resource(state->zsbuf->texture));
+   ctx->batch = NULL;
+   agx_dirty_all(ctx);
+}
 
+uint64_t
+agx_batch_upload_pbe(struct agx_batch *batch, unsigned rt)
+{
+   struct pipe_surface *surf = batch->key.cbufs[rt];
+   struct agx_resource *tex = agx_resource(surf->texture);
+   const struct util_format_description *desc =
+      util_format_description(surf->format);
+   unsigned level = surf->u.tex.level;
+   unsigned layer = surf->u.tex.first_layer;
 
-   for (unsigned i = 0; i < state->nr_cbufs; ++i) {
-      struct pipe_surface *surf = state->cbufs[i];
-      struct agx_resource *tex = agx_resource(surf->texture);
-      const struct util_format_description *desc =
-         util_format_description(surf->format);
-      unsigned level = surf->u.tex.level;
-      unsigned layer = surf->u.tex.first_layer;
+   assert(surf->u.tex.last_layer == layer);
 
-      agx_batch_writes(ctx->batch, tex);
+   struct agx_ptr T = agx_pool_alloc_aligned(&batch->pool, AGX_RENDER_TARGET_LENGTH, 256);
 
-      assert(surf->u.tex.last_layer == layer);
+   agx_pack(T.cpu, RENDER_TARGET, cfg) {
+      cfg.layout = agx_translate_layout(tex->layout.tiling);
+      cfg.channels = agx_pixel_format[surf->format].channels;
+      cfg.type = agx_pixel_format[surf->format].type;
 
-      agx_pack(ctx->render_target[i], RENDER_TARGET, cfg) {
-         cfg.layout = agx_translate_layout(tex->layout.tiling);
-         cfg.channels = agx_pixel_format[surf->format].channels;
-         cfg.type = agx_pixel_format[surf->format].type;
+      assert(desc->nr_channels >= 1 && desc->nr_channels <= 4);
+      cfg.swizzle_r = agx_channel_from_pipe(desc->swizzle[0]) & 3;
 
-         assert(desc->nr_channels >= 1 && desc->nr_channels <= 4);
-         cfg.swizzle_r = agx_channel_from_pipe(desc->swizzle[0]) & 3;
+      if (desc->nr_channels >= 2)
+         cfg.swizzle_g = agx_channel_from_pipe(desc->swizzle[1]) & 3;
 
-         if (desc->nr_channels >= 2)
-            cfg.swizzle_g = agx_channel_from_pipe(desc->swizzle[1]) & 3;
+      if (desc->nr_channels >= 3)
+         cfg.swizzle_b = agx_channel_from_pipe(desc->swizzle[2]) & 3;
 
-         if (desc->nr_channels >= 3)
-            cfg.swizzle_b = agx_channel_from_pipe(desc->swizzle[2]) & 3;
+      if (desc->nr_channels >= 4)
+         cfg.swizzle_a = agx_channel_from_pipe(desc->swizzle[3]) & 3;
 
-         if (desc->nr_channels >= 4)
-            cfg.swizzle_a = agx_channel_from_pipe(desc->swizzle[3]) & 3;
+      cfg.width = batch->key.width;
+      cfg.height = batch->key.height;
+      cfg.level = surf->u.tex.level;
+      cfg.buffer = agx_map_texture_gpu(tex, layer);
+      cfg.unk_mipmapped = tex->mipmapped;
 
-         cfg.width = state->width;
-         cfg.height = state->height;
-         cfg.level = surf->u.tex.level;
-         cfg.buffer = agx_map_texture_gpu(tex, layer);
-         cfg.unk_mipmapped = tex->mipmapped;
+      if (tex->layout.tiling == AIL_TILING_LINEAR) {
+         cfg.stride = ail_get_linear_stride_B(&tex->layout, level) - 4;
+         cfg.levels = 1;
+      } else {
+         cfg.unk_tiled = true;
+         cfg.levels = tex->base.last_level + 1;
+      }
+   };
 
-         if (tex->layout.tiling == AIL_TILING_LINEAR) {
-            cfg.stride = ail_get_linear_stride_B(&tex->layout, level) - 4;
-            cfg.levels = 1;
-         } else {
-            cfg.unk_tiled = true;
-            cfg.levels = tex->base.last_level + 1;
-         }
-      };
-   }
+   return T.gpu;
 }
 
+
 /* Likewise constant buffers, textures, and samplers are handled in a common
  * per-draw path, with dirty tracking to reduce the costs involved.
  */
@@ -1224,18 +1222,20 @@ agx_update_vs(struct agx_context *ctx)
 }
 
 static bool
-agx_update_fs(struct agx_context *ctx)
+agx_update_fs(struct agx_batch *batch)
 {
+   struct agx_context *ctx = batch->ctx;
+
    struct asahi_shader_key key = {
-      .nr_cbufs = ctx->batch->key.nr_cbufs,
+      .nr_cbufs = batch->key.nr_cbufs,
       .clip_plane_enable = ctx->rast->base.clip_plane_enable,
    };
 
-   if (ctx->batch->reduced_prim == PIPE_PRIM_POINTS)
+   if (batch->reduced_prim == PIPE_PRIM_POINTS)
       key.sprite_coord_enable = ctx->rast->base.sprite_coord_enable;
 
    for (unsigned i = 0; i < key.nr_cbufs; ++i) {
-      struct pipe_surface *surf = ctx->batch->key.cbufs[i];
+      struct pipe_surface *surf = batch->key.cbufs[i];
 
       if (surf) {
          enum pipe_format fmt = surf->format;
@@ -1557,9 +1557,6 @@ agx_batch_init_state(struct agx_batch *batch)
 
    agx_ppp_fini(&out, &ppp);
    batch->encoder_current = out;
-
-   /* We need to emit prim state at the start. Max collides with all. */
-   batch->reduced_prim = PIPE_PRIM_MAX;
 }
 
 static enum agx_object_type
@@ -1586,9 +1583,10 @@ agx_pass_type_for_shader(struct agx_shader_info *info)
 #define MAX_PPP_UPDATES 2
 
 static uint8_t *
-agx_encode_state(struct agx_context *ctx, uint8_t *out,
+agx_encode_state(struct agx_batch *batch, uint8_t *out,
                  bool is_lines, bool is_points)
 {
+   struct agx_context *ctx = batch->ctx;
    struct agx_rasterizer *rast = ctx->rast;
    unsigned ppp_updates = 0;
 
@@ -1613,7 +1611,7 @@ agx_encode_state(struct agx_context *ctx, uint8_t *out,
       out += AGX_VDM_STATE_VERTEX_SHADER_WORD_0_LENGTH;
 
       agx_pack(out, VDM_STATE_VERTEX_SHADER_WORD_1, cfg) {
-         cfg.pipeline = agx_build_pipeline(ctx->batch, ctx->vs, PIPE_SHADER_VERTEX);
+         cfg.pipeline = agx_build_pipeline(batch, ctx->vs, PIPE_SHADER_VERTEX);
       }
       out += AGX_VDM_STATE_VERTEX_SHADER_WORD_1_LENGTH;
 
@@ -1634,17 +1632,17 @@ agx_encode_state(struct agx_context *ctx, uint8_t *out,
       out += 4;
    }
 
-   struct agx_pool *pool = &ctx->batch->pool;
+   struct agx_pool *pool = &batch->pool;
    struct agx_compiled_shader *vs = ctx->vs, *fs = ctx->fs;
    unsigned zbias = 0;
 
    if (ctx->rast->base.offset_tri) {
-      zbias = agx_upload_depth_bias(ctx->batch, &ctx->rast->base);
+      zbias = agx_upload_depth_bias(batch, &ctx->rast->base);
       ctx->dirty |= AGX_DIRTY_SCISSOR_ZBIAS;
    }
 
    if (ctx->dirty & (AGX_DIRTY_VIEWPORT | AGX_DIRTY_SCISSOR_ZBIAS)) {
-      agx_upload_viewport_scissor(pool, ctx->batch, &out, &ctx->viewport,
+      agx_upload_viewport_scissor(pool, batch, &out, &ctx->viewport,
             ctx->rast->base.scissor ? &ctx->scissor : NULL,
             zbias);
    }
@@ -1652,7 +1650,7 @@ agx_encode_state(struct agx_context *ctx, uint8_t *out,
    bool varyings_dirty = false;
 
    if (IS_DIRTY(VS_PROG) || IS_DIRTY(FS_PROG) || IS_DIRTY(RS)) {
-      ctx->batch->varyings = agx_link_varyings_vs_fs(&ctx->batch->pipeline_pool,
+      batch->varyings = agx_link_varyings_vs_fs(&batch->pipeline_pool,
             &ctx->vs->info.varyings.vs,
             &ctx->fs->info.varyings.fs,
             ctx->rast->base.flatshade_first);
@@ -1774,13 +1772,13 @@ agx_encode_state(struct agx_context *ctx, uint8_t *out,
    if (IS_DIRTY(FS) || varyings_dirty) {
       unsigned frag_tex_count = ctx->stage[PIPE_SHADER_FRAGMENT].texture_count;
       agx_ppp_push(&ppp, FRAGMENT_SHADER, cfg) {
-         cfg.pipeline = agx_build_pipeline(ctx->batch, ctx->fs, PIPE_SHADER_FRAGMENT),
+         cfg.pipeline = agx_build_pipeline(batch, ctx->fs, PIPE_SHADER_FRAGMENT),
          cfg.uniform_register_count = ctx->fs->info.push_count;
          cfg.preshader_register_count = ctx->fs->info.nr_preamble_gprs;
          cfg.texture_state_register_count = frag_tex_count;
          cfg.sampler_state_register_count = frag_tex_count;
          cfg.cf_binding_count = ctx->fs->info.varyings.fs.nr_bindings;
-         cfg.cf_bindings = ctx->batch->varyings;
+         cfg.cf_bindings = batch->varyings;
 
          /* XXX: This is probably wrong */
          cfg.unknown_30 = frag_tex_count >= 4;
@@ -1883,18 +1881,12 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
    }
 
    struct agx_context *ctx = agx_context(pctx);
-   struct agx_batch *batch = ctx->batch;
+   struct agx_batch *batch = agx_get_batch(ctx);
 
    if (agx_scissor_culls_everything(ctx))
 	   return;
 
-#ifndef NDEBUG
-   /* For debugging dirty tracking, mark all state as dirty every draw, forcing
-    * everything to be re-emitted fresh.
-    */
-   if (unlikely(agx_device(pctx->screen)->debug & AGX_DBG_DIRTY))
-      agx_dirty_all(ctx);
-#endif
+   agx_dirty_all(ctx);
 
    /* Dirty track the reduced prim: lines vs points vs triangles */
    enum pipe_prim_type reduced_prim = u_reduced_prim(info->mode);
@@ -1902,8 +1894,8 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
    batch->reduced_prim = reduced_prim;
 
    /* TODO: masks */
-   ctx->batch->draw |= ~0;
-   ctx->batch->load |= ~0;
+   batch->draw |= ~0;
+   batch->load |= ~0;
 
    /* TODO: These are expensive calls, consider finer dirty tracking */
    if (agx_update_vs(ctx))
@@ -1911,7 +1903,7 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
    else if (ctx->stage[PIPE_SHADER_VERTEX].dirty)
       ctx->dirty |= AGX_DIRTY_VS;
 
-   if (agx_update_fs(ctx))
+   if (agx_update_fs(batch))
       ctx->dirty |= AGX_DIRTY_FS | AGX_DIRTY_FS_PROG;
    else if (ctx->stage[PIPE_SHADER_FRAGMENT].dirty)
       ctx->dirty |= AGX_DIRTY_FS;
@@ -1939,7 +1931,7 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
                                AGX_INDEX_LIST_START_LENGTH +
                                AGX_INDEX_LIST_BUFFER_SIZE_LENGTH);
 
-   uint8_t *out = agx_encode_state(ctx, batch->encoder_current,
+   uint8_t *out = agx_encode_state(batch, batch->encoder_current,
                                    reduced_prim == PIPE_PRIM_LINES,
                                    reduced_prim == PIPE_PRIM_POINTS);
 
@@ -2008,6 +2000,8 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
    assert(batch->encoder_current <= batch->encoder_end &&
           "Failed to reserve sufficient space in encoder");
    ctx->dirty = 0;
+
+   assert(batch == agx_get_batch(ctx) && "batch should not change under us");
 }
 
 void agx_init_state_functions(struct pipe_context *ctx);
diff --git a/src/gallium/drivers/asahi/agx_state.h b/src/gallium/drivers/asahi/agx_state.h
index a6291264dad..22f9d7b37ce 100644
--- a/src/gallium/drivers/asahi/agx_state.h
+++ b/src/gallium/drivers/asahi/agx_state.h
@@ -95,6 +95,7 @@ struct agx_array {
 struct agx_batch {
    struct agx_context *ctx;
    struct pipe_framebuffer_state key;
+   uint64_t seqnum;
 
    /* PIPE_CLEAR_* bitmask */
    uint32_t clear, draw, load;
@@ -174,11 +175,24 @@ enum agx_dirty {
    AGX_DIRTY_FS_PROG    = BITFIELD_BIT(11),
 };
 
+#define AGX_MAX_BATCHES (2)
+
 struct agx_context {
    struct pipe_context base;
    struct agx_compiled_shader *vs, *fs;
    uint32_t dirty;
 
+   /* Set of batches. When full, the LRU entry (the batch with the smallest
+    * seqnum) is flushed to free a slot.
+    */
+   struct {
+      uint64_t seqnum;
+      struct agx_batch slots[AGX_MAX_BATCHES];
+
+      /** Set of active batches for faster traversal */
+      BITSET_DECLARE(active, AGX_MAX_BATCHES);
+   } batches;
+
    struct agx_batch *batch;
 
    struct pipe_vertex_buffer vertex_buffers[PIPE_MAX_ATTRIBS];
@@ -204,8 +218,6 @@ struct agx_context {
    struct util_debug_callback debug;
    bool is_noop;
 
-   uint8_t render_target[8][AGX_RENDER_TARGET_LENGTH];
-
    struct blitter_context *blitter;
 
    /* Map of agx_resource to agx_batch that writes that resource */
@@ -350,6 +362,9 @@ uint64_t
 agx_push_location(struct agx_batch *batch, struct agx_push push,
                   enum pipe_shader_type stage);
 
+bool
+agx_batch_is_active(struct agx_batch *batch);
+
 uint64_t
 agx_build_clear_pipeline(struct agx_batch *batch, uint32_t code, uint64_t clear_buf);
 
@@ -360,6 +375,9 @@ agx_build_store_pipeline(struct agx_batch *batch, uint32_t code,
 uint64_t
 agx_build_reload_pipeline(struct agx_batch *batch, uint32_t code, struct pipe_surface *surf);
 
+uint64_t
+agx_batch_upload_pbe(struct agx_batch *batch, unsigned rt);
+
 /* Add a BO to a batch. This needs to be amortized O(1) since it's called in
  * hot paths. To achieve this we model BO lists by bit sets */
 
@@ -383,7 +401,7 @@ agx_batch_add_bo(struct agx_batch *batch, struct agx_bo *bo)
 {
    /* Double the size of the BO list if we run out, this is amortized O(1) */
    if (unlikely(bo->handle > agx_batch_bo_list_bits(batch))) {
-      batch->bo_list.set = rerzalloc(batch, batch->bo_list.set, BITSET_WORD,
+      batch->bo_list.set = rerzalloc(batch->ctx, batch->bo_list.set, BITSET_WORD,
                                      batch->bo_list.word_count,
                                      batch->bo_list.word_count * 2);
       batch->bo_list.word_count *= 2;
@@ -408,6 +426,7 @@ agx_batch_num_bo(struct agx_batch *batch)
    BITSET_FOREACH_SET(handle, (batch)->bo_list.set, agx_batch_bo_list_bits(batch))
 
 void agx_flush_batch(struct agx_context *ctx, struct agx_batch *batch);
+void agx_flush_all(struct agx_context *ctx, const char *reason);
 void agx_flush_readers(struct agx_context *ctx, struct agx_resource *rsrc, const char *reason);
 void agx_flush_writer(struct agx_context *ctx, struct agx_resource *rsrc, const char *reason);
 
@@ -415,6 +434,9 @@ void agx_flush_writer(struct agx_context *ctx, struct agx_resource *rsrc, const
 void agx_batch_reads(struct agx_batch *batch, struct agx_resource *rsrc);
 void agx_batch_writes(struct agx_batch *batch, struct agx_resource *rsrc);
 
+struct agx_batch *agx_get_batch(struct agx_context *ctx);
+void agx_batch_cleanup(struct agx_context *ctx, struct agx_batch *batch);
+
 /* Blit shaders */
 void
 agx_blitter_save(struct agx_context *ctx, struct blitter_context *blitter,
@@ -426,12 +448,6 @@ void agx_blit(struct pipe_context *pipe,
 void agx_internal_shaders(struct agx_device *dev);
 
 /* Batch logic */
-static void
-agx_flush_all(struct agx_context *ctx, const char *reason)
-{
-   perf_debug_ctx(ctx, "Flushing due to: %s\n", reason);
-   ctx->base.flush(&ctx->base, NULL, 0);
-}
 
 void
 agx_batch_init_state(struct agx_batch *batch);
author	Alyssa Rosenzweig <alyssa@rosenzweig.io>	2022-11-17 18:10:11 -0500
committer	Marge Bot <emma+marge@anholt.net>	2022-11-19 15:33:16 +0000
commit	d7511ad784ce0fc6a5060de2d7c969a1300a0fb9 (patch)
tree	a40783e954b8377e9a9de35f51f023683f219083
parent	de1eb9400f1d0d64627630dd28d11073ad1c1a7e (diff)