7 files changed, 369 insertions, 256 deletions
diff --git a/src/broadcom/vulkan/v3dv_cl.c b/src/broadcom/vulkan/v3dv_cl.c
index d3494c53f64..e20e6733356 100644
--- a/src/broadcom/vulkan/v3dv_cl.c
+++ b/src/broadcom/vulkan/v3dv_cl.c
@@ -25,20 +25,18 @@
 #include "broadcom/cle/v3dx_pack.h"
 
 void
-v3dv_cl_init(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_cl *cl)
+v3dv_cl_init(struct v3dv_job *job, struct v3dv_cl *cl)
 {
    cl->base = NULL;
    cl->next = cl->base;
    cl->bo = NULL;
    cl->size = 0;
-   cl->cmd_buffer = cmd_buffer;
+   cl->job = job;
 }
 
 void
 v3dv_cl_begin(struct v3dv_cl *cl)
 {
-   assert(!cl->cmd_buffer ||
-          cl->cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_INITIALIZED);
    assert(v3dv_cl_offset(cl) == 0);
 }
 
@@ -48,15 +46,15 @@ v3dv_cl_reset(struct v3dv_cl *cl)
    /* FIXME: consider keeping the BO when the command buffer is reset with
     * flag VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT.
     */
-   v3dv_cl_init(cl->cmd_buffer, cl);
+   v3dv_cl_init(cl->job, cl);
 }
 
 void
 v3dv_cl_destroy(struct v3dv_cl *cl)
 {
    if (cl->bo) {
-      assert(cl->cmd_buffer);
-      v3dv_bo_free(cl->cmd_buffer->device, cl->bo);
+      assert(cl->job);
+      v3dv_bo_free(cl->job->cmd_buffer->device, cl->bo);
    }
 
    /* Leave the CL in a reset state to catch use after destroy instances */
@@ -73,15 +71,15 @@ v3dv_cl_ensure_space(struct v3dv_cl *cl, uint32_t space, uint32_t alignment)
       return offset;
    }
 
-   struct v3dv_bo *bo = v3dv_bo_alloc(cl->cmd_buffer->device, space);
+   struct v3dv_bo *bo = v3dv_bo_alloc(cl->job->cmd_buffer->device, space);
    if (!bo) {
       fprintf(stderr, "failed to allocate memory for command list");
       abort();
    }
 
-   v3dv_cmd_buffer_add_bo(cl->cmd_buffer, bo);
+   v3dv_job_add_bo(cl->job, bo);
 
-   bool ok = v3dv_bo_map(cl->cmd_buffer->device, bo, bo->size);
+   bool ok = v3dv_bo_map(cl->job->cmd_buffer->device, bo, bo->size);
    if (!ok) {
       fprintf(stderr, "failed to map command list buffer");
       abort();
@@ -102,7 +100,7 @@ v3dv_cl_ensure_space_with_branch(struct v3dv_cl *cl, uint32_t space)
    if (v3dv_cl_offset(cl) + space + cl_packet_length(BRANCH) <= cl->size)
       return;
 
-   struct v3dv_bo *bo = v3dv_bo_alloc(cl->cmd_buffer->device, space);
+   struct v3dv_bo *bo = v3dv_bo_alloc(cl->job->cmd_buffer->device, space);
    if (!bo) {
       fprintf(stderr, "failed to allocate memory for command list");
       abort();
@@ -115,9 +113,9 @@ v3dv_cl_ensure_space_with_branch(struct v3dv_cl *cl, uint32_t space)
       }
    }
 
-   v3dv_cmd_buffer_add_bo(cl->cmd_buffer, bo);
+   v3dv_job_add_bo(cl->job, bo);
 
-   bool ok = v3dv_bo_map(cl->cmd_buffer->device, bo, bo->size);
+   bool ok = v3dv_bo_map(cl->job->cmd_buffer->device, bo, bo->size);
    if (!ok) {
       fprintf(stderr, "failed to map command list buffer");
       abort();
diff --git a/src/broadcom/vulkan/v3dv_cl.h b/src/broadcom/vulkan/v3dv_cl.h
index f58b2d5cf99..c95110f48e0 100644
--- a/src/broadcom/vulkan/v3dv_cl.h
+++ b/src/broadcom/vulkan/v3dv_cl.h
@@ -27,10 +27,10 @@
 #include "broadcom/cle/v3d_packet_helpers.h"
 
 struct v3dv_bo;
-struct v3dv_cmd_buffer;
+struct v3dv_job;
 struct v3dv_cl;
 
-void v3dv_cmd_buffer_add_bo(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_bo *bo);
+void v3dv_job_add_bo(struct v3dv_job *job, struct v3dv_bo *bo);
 
 /**
  * Undefined structure, used for typechecking that you're passing the pointers
@@ -46,7 +46,7 @@ struct v3dv_cl_reloc {
 
 struct v3dv_cl {
    void *base;
-   struct v3dv_cmd_buffer *cmd_buffer;
+   struct v3dv_job *job;
    struct v3dv_cl_out *next;
    struct v3dv_bo *bo;
    uint32_t size;
@@ -82,7 +82,7 @@ v3dv_cl_get_address(struct v3dv_cl *cl)
    return (struct v3dv_cl_reloc){ .bo = cl->bo, .offset = v3dv_cl_offset(cl) };
 }
 
-void v3dv_cl_init(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_cl *cl);
+void v3dv_cl_init(struct v3dv_job *job, struct v3dv_cl *cl);
 void v3dv_cl_begin(struct v3dv_cl *cl);
 void v3dv_cl_reset(struct v3dv_cl *cl);
 void v3dv_cl_destroy(struct v3dv_cl *cl);
@@ -167,7 +167,7 @@ static inline void
 cl_pack_emit_reloc(struct v3dv_cl *cl, const struct v3dv_cl_reloc *reloc)
 {
         if (reloc->bo)
-                v3dv_cmd_buffer_add_bo(cl->cmd_buffer, reloc->bo);
+                v3dv_job_add_bo(cl->job, reloc->bo);
 }
 
 #endif /* V3DV_CL_H */
diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c
index ca302db0301..caf2d2ad5be 100644
--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c
@@ -35,16 +35,16 @@ const struct v3dv_dynamic_state default_dynamic_state = {
 };
 
 void
-v3dv_cmd_buffer_add_bo(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_bo *bo)
+v3dv_job_add_bo(struct v3dv_job *job, struct v3dv_bo *bo)
 {
    if (!bo)
       return;
 
-   if (_mesa_set_search(cmd_buffer->bos, bo))
+   if (_mesa_set_search(job->bos, bo))
       return;
 
-   _mesa_set_add(cmd_buffer->bos, bo);
-   cmd_buffer->bo_count++;
+   _mesa_set_add(job->bos, bo);
+   job->bo_count++;
 }
 
 VkResult
@@ -94,13 +94,7 @@ cmd_buffer_create(struct v3dv_device *device,
    cmd_buffer->level = level;
    cmd_buffer->usage_flags = 0;
 
-   cmd_buffer->bos =
-      _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
-   cmd_buffer->bo_count = 0;
-
-   v3dv_cl_init(cmd_buffer, &cmd_buffer->bcl);
-   v3dv_cl_init(cmd_buffer, &cmd_buffer->rcl);
-   v3dv_cl_init(cmd_buffer, &cmd_buffer->indirect);
+   list_inithead(&cmd_buffer->submit_jobs);
 
    cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_NEW;
 
@@ -113,48 +107,114 @@ cmd_buffer_create(struct v3dv_device *device,
 }
 
 static void
-cmd_buffer_destroy(struct v3dv_cmd_buffer *cmd_buffer)
+job_destroy(struct v3dv_job *job)
 {
-   list_del(&cmd_buffer->pool_link);
+   assert(job);
+
+   list_del(&job->list_link);
 
-   v3dv_cl_destroy(&cmd_buffer->bcl);
-   v3dv_cl_destroy(&cmd_buffer->rcl);
-   v3dv_cl_destroy(&cmd_buffer->indirect);
+   v3dv_cl_destroy(&job->bcl);
+   v3dv_cl_destroy(&job->rcl);
+   v3dv_cl_destroy(&job->indirect);
 
    /* Since we don't ref BOs, when we add them to the command buffer, don't
     * unref them here either.
     */
 #if 0
-   set_foreach(cmd_buffer->bos, entry) {
+   set_foreach(job->bos, entry) {
       struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
       v3dv_bo_free(cmd_buffer->device, bo);
    }
 #endif
-   _mesa_set_destroy(cmd_buffer->bos, NULL);
+   _mesa_set_destroy(job->bos, NULL);
+
+   v3dv_bo_free(job->cmd_buffer->device, job->tile_alloc);
+   v3dv_bo_free(job->cmd_buffer->device, job->tile_state);
+}
+
+static void
+cmd_buffer_destroy(struct v3dv_cmd_buffer *cmd_buffer)
+{
+   list_del(&cmd_buffer->pool_link);
 
-   v3dv_bo_free(cmd_buffer->device, cmd_buffer->tile_alloc);
-   v3dv_bo_free(cmd_buffer->device, cmd_buffer->tile_state);
+   list_for_each_entry_safe(struct v3dv_job, job,
+                            &cmd_buffer->submit_jobs, list_link) {
+      job_destroy(job);
+   }
+
+   if (cmd_buffer->state.job)
+      job_destroy(cmd_buffer->state.job);
 
    vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
 }
 
+static void
+emit_binning_flush(struct v3dv_job *job)
+{
+   assert(job);
+   v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(FLUSH));
+   cl_emit(&job->bcl, FLUSH, flush);
+}
+
+void
+v3dv_cmd_buffer_finish_job(struct v3dv_cmd_buffer *cmd_buffer)
+{
+   struct v3dv_job *job = cmd_buffer->state.job;
+   assert(job);
+   assert(v3dv_cl_offset(&job->bcl) != 0);
+
+   list_addtail(&job->list_link, &cmd_buffer->submit_jobs);
+   cmd_buffer->state.job = NULL;
+}
+
+struct v3dv_job *
+v3dv_cmd_buffer_start_job(struct v3dv_cmd_buffer *cmd_buffer)
+{
+   /* Ensure we are not starting a new job without finishing a previous one */
+   if (cmd_buffer->state.job != NULL) {
+      emit_binning_flush(cmd_buffer->state.job);
+      v3dv_cmd_buffer_finish_job(cmd_buffer);
+   }
+
+   assert(cmd_buffer->state.job == NULL);
+   struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->alloc,
+                                    sizeof(struct v3dv_job), 8,
+                                    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   assert(job);
+
+   job->cmd_buffer = cmd_buffer;
+
+   job->bos =
+      _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
+   job->bo_count = 0;
+
+   v3dv_cl_init(job, &job->bcl);
+   v3dv_cl_begin(&job->bcl);
+
+   v3dv_cl_init(job, &job->rcl);
+   v3dv_cl_begin(&job->rcl);
+
+   v3dv_cl_init(job, &job->indirect);
+   v3dv_cl_begin(&job->indirect);
+
+   cmd_buffer->state.job = job;
+   return job;
+}
+
 static VkResult
 cmd_buffer_reset(struct v3dv_cmd_buffer *cmd_buffer)
 {
    if (cmd_buffer->status != V3DV_CMD_BUFFER_STATUS_INITIALIZED) {
-      cmd_buffer->usage_flags = 0;
+      /* FIXME */
+      assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_NEW);
 
-      _mesa_set_clear(cmd_buffer->bos, NULL);
-      cmd_buffer->bo_count = 0;
-
-      v3dv_cl_reset(&cmd_buffer->bcl);
-      v3dv_cl_reset(&cmd_buffer->rcl);
-      v3dv_cl_reset(&cmd_buffer->indirect);
+      cmd_buffer->usage_flags = 0;
 
       struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
       state->pass = NULL;
       state->framebuffer = NULL;
       state->subpass_idx = 0;
+      state->job = NULL;
 
       cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_INITIALIZED;
    }
@@ -248,19 +308,16 @@ v3dv_BeginCommandBuffer(VkCommandBuffer commandBuffer,
 
    cmd_buffer->usage_flags = pBeginInfo->flags;
 
-   v3dv_cl_begin(&cmd_buffer->bcl);
-   v3dv_cl_begin(&cmd_buffer->rcl);
-   v3dv_cl_begin(&cmd_buffer->indirect);
-
    cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_RECORDING;
 
    return VK_SUCCESS;
 }
 
 static void
-emit_clip_window(struct v3dv_cmd_buffer *cmd_buffer, VkRect2D *rect)
+emit_clip_window(struct v3dv_job *job, const VkRect2D *rect)
 {
-   cl_emit(&cmd_buffer->bcl, CLIP_WINDOW, clip) {
+   assert(job);
+   cl_emit(&job->bcl, CLIP_WINDOW, clip) {
       clip.clip_window_left_pixel_coordinate = rect->offset.x;
       clip.clip_window_bottom_pixel_coordinate = rect->offset.y;
       clip.clip_window_width_in_pixels = rect->extent.width;
@@ -349,90 +406,12 @@ v3dv_CmdBeginRenderPass(VkCommandBuffer commandBuffer,
                                      pRenderPassBegin->clearValueCount,
                                      pRenderPassBegin->pClearValues);
 
-   v3dv_cl_ensure_space_with_branch(&cmd_buffer->bcl, 256);
-
-   /* The PTB will request the tile alloc initial size per tile at start
-    * of tile binning.
-    */
-   const uint32_t fb_layers = 1; /* FIXME */
-   uint32_t tile_alloc_size = 64 * MAX2(fb_layers, 1) *
-                              framebuffer->draw_tiles_x *
-                              framebuffer->draw_tiles_y;
-
-   /* The PTB allocates in aligned 4k chunks after the initial setup. */
-   tile_alloc_size = align(tile_alloc_size, 4096);
-
-   /* Include the first two chunk allocations that the PTB does so that
-    * we definitely clear the OOM condition before triggering one (the HW
-    * won't trigger OOM during the first allocations).
-    */
-   tile_alloc_size += 8192;
-
-   /* For performance, allocate some extra initial memory after the PTB's
-    * minimal allocations, so that we hopefully don't have to block the
-    * GPU on the kernel handling an OOM signal.
-    */
-   tile_alloc_size += 512 * 1024;
-
-   cmd_buffer->tile_alloc = v3dv_bo_alloc(cmd_buffer->device, tile_alloc_size);
-   v3dv_cmd_buffer_add_bo(cmd_buffer, cmd_buffer->tile_alloc);
-
-   const uint32_t tsda_per_tile_size = 256;
-   const uint32_t tile_state_size = MAX2(fb_layers, 1) *
-                                    framebuffer->draw_tiles_x *
-                                    framebuffer->draw_tiles_y *
-                                    tsda_per_tile_size;
-   cmd_buffer->tile_state = v3dv_bo_alloc(cmd_buffer->device, tile_state_size);
-   v3dv_cmd_buffer_add_bo(cmd_buffer, cmd_buffer->tile_state);
-
-   /* This must go before the binning mode configuration. It is
-    * required for layered framebuffers to work.
-    */
-   if (fb_layers > 0) {
-      cl_emit(&cmd_buffer->bcl, NUMBER_OF_LAYERS, config) {
-         config.number_of_layers = fb_layers;
-      }
-   }
-
-   cl_emit(&cmd_buffer->bcl, TILE_BINNING_MODE_CFG, config) {
-      config.width_in_pixels = framebuffer->width;
-      config.height_in_pixels = framebuffer->height;
-      config.number_of_render_targets = MAX2(framebuffer->attachment_count, 1);
-      config.multisample_mode_4x = false; /* FIXME */
-      config.maximum_bpp_of_all_render_targets = framebuffer->internal_bpp;
-   }
-
-   /* There's definitely nothing in the VCD cache we want. */
-   cl_emit(&cmd_buffer->bcl, FLUSH_VCD_CACHE, bin);
-
-   /* Disable any leftover OQ state from another job. */
-   cl_emit(&cmd_buffer->bcl, OCCLUSION_QUERY_COUNTER, counter);
-
-   /* "Binning mode lists must have a Start Tile Binning item (6) after
-    *  any prefix state data before the binning list proper starts."
-    */
-   cl_emit(&cmd_buffer->bcl, START_TILE_BINNING, bin);
-
    /* FIXME: probably need to align the render area to tile boundaries since
     *        the tile clears will render full tiles anyway.
     *        See vkGetRenderAreaGranularity().
     */
    state->render_area = pRenderPassBegin->renderArea;
 
-   /* If we don't have a scissor or viewport defined let's just use the render
-    * area as clip_window, as that would be required for a clear in any
-    * case. If we have that, it would be emitted as part of the pipeline
-    * dynamic state flush
-    *
-    * FIXME: this is mostly just needed for clear. radv has dedicated paths
-    * for them, so we could get that idea. In any case, need to revisit if
-    * this is the place to emit the clip window.
-    */
-   if (cmd_buffer->state.dynamic.scissor.count == 0 &&
-       cmd_buffer->state.dynamic.viewport.count == 0) {
-      emit_clip_window(cmd_buffer, &state->render_area);
-   }
-
    /* Setup for first subpass */
    state->subpass_idx = 0;
 }
@@ -627,10 +606,13 @@ emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
 static void
 emit_generic_per_tile_list(struct v3dv_cmd_buffer *cmd_buffer, uint32_t layer)
 {
+   struct v3dv_job *job = cmd_buffer->state.job;
+   assert(job);
+
    /* Emit the generic list in our indirect state -- the rcl will just
     * have pointers into it.
     */
-   struct v3dv_cl *cl = &cmd_buffer->indirect;
+   struct v3dv_cl *cl = &job->indirect;
    v3dv_cl_ensure_space(cl, 200, 1);
    struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);
 
@@ -653,7 +635,7 @@ emit_generic_per_tile_list(struct v3dv_cmd_buffer *cmd_buffer, uint32_t layer)
 
    cl_emit(cl, RETURN_FROM_SUB_LIST, ret);
 
-   cl_emit(&cmd_buffer->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
+   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
       branch.start = tile_list_start;
       branch.end = v3dv_cl_get_address(cl);
    }
@@ -665,7 +647,8 @@ emit_render_layer(struct v3dv_cmd_buffer *cmd_buffer, uint32_t layer)
    const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
    const struct v3dv_framebuffer *framebuffer = state->framebuffer;
 
-   struct v3dv_cl *rcl = &cmd_buffer->rcl;
+   struct v3dv_job *job = cmd_buffer->state.job;
+   struct v3dv_cl *rcl = &job->rcl;
 
    /* If doing multicore binning, we would need to initialize each
     * core's tile list here.
@@ -673,7 +656,7 @@ emit_render_layer(struct v3dv_cmd_buffer *cmd_buffer, uint32_t layer)
    const uint32_t tile_alloc_offset =
       64 * layer * framebuffer->draw_tiles_x * framebuffer->draw_tiles_y;
    cl_emit(rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) {
-      list.address = v3dv_cl_address(cmd_buffer->tile_alloc, tile_alloc_offset);
+      list.address = v3dv_cl_address(job->tile_alloc, tile_alloc_offset);
    }
 
    cl_emit(rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) {
@@ -758,10 +741,13 @@ emit_render_layer(struct v3dv_cmd_buffer *cmd_buffer, uint32_t layer)
 static void
 emit_rcl(struct v3dv_cmd_buffer *cmd_buffer)
 {
+   struct v3dv_job *job = cmd_buffer->state.job;
+   assert(job);
+
    /* FIXME */
    const uint32_t fb_layers = 1;
 
-   v3dv_cl_ensure_space_with_branch(&cmd_buffer->rcl, 200 +
+   v3dv_cl_ensure_space_with_branch(&job->rcl, 200 +
                                     MAX2(fb_layers, 1) * 256 *
                                     cl_packet_length(SUPERTILE_COORDINATES));
 
@@ -772,7 +758,7 @@ emit_rcl(struct v3dv_cmd_buffer *cmd_buffer)
    const struct v3dv_subpass *subpass =
       &state->pass->subpasses[state->subpass_idx];
 
-   struct v3dv_cl *rcl = &cmd_buffer->rcl;
+   struct v3dv_cl *rcl = &job->rcl;
 
    /* Comon config must be the first TILE_RENDERING_MODE_CFG and
     * Z_STENCIL_CLEAR_VALUES must be last. The ones in between are optional
@@ -892,7 +878,7 @@ subpass_start(struct v3dv_cmd_buffer *cmd_buffer)
    for (uint32_t i = 0; i < subpass->color_count; i++) {
       uint32_t rp_attachment_idx = subpass->color_attachments[i].attachment;
       const struct v3dv_render_pass_attachment *attachment =
-         &cmd_buffer->state.pass->attachments[rp_attachment_idx];
+         &state->pass->attachments[rp_attachment_idx];
 
       /* FIXME: if a previous subpass has alredy computed the hw clear color
        *        for this attachment we could skip this. We can just flag this
@@ -904,7 +890,7 @@ subpass_start(struct v3dv_cmd_buffer *cmd_buffer)
 
       const uint32_t sp_attachment_idx = i;
       const struct v3dv_image_view *iview =
-         cmd_buffer->state.framebuffer->attachments[sp_attachment_idx];
+         state->framebuffer->attachments[sp_attachment_idx];
 
       assert((iview->aspects &
               (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) == 0);
@@ -917,20 +903,113 @@ subpass_start(struct v3dv_cmd_buffer *cmd_buffer)
                                                      clear_color);
       }
    }
+
+   /* FIXME: for now, each subpass goes into a separate job. In the future we
+    * might be able to merge subpasses that render to the same render targets
+    * so long as they don't render to more than 4 color attachments and there
+    * aren't other subpass dependencies preveting this.
+    */
+   struct v3dv_job *job = v3dv_cmd_buffer_start_job(cmd_buffer);
+
+   const struct v3dv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
+
+   /* Setup binning for this subpass.
+    *
+    * FIXME: For now we do this at the start each subpass but if we implement
+    * subpass merges in the future we would only want to emit this once per job.
+    */
+   v3dv_cl_ensure_space_with_branch(&job->bcl, 256);
+
+   /* The PTB will request the tile alloc initial size per tile at start
+    * of tile binning.
+    */
+   const uint32_t fb_layers = 1; /* FIXME */
+   uint32_t tile_alloc_size = 64 * MAX2(fb_layers, 1) *
+                              framebuffer->draw_tiles_x *
+                              framebuffer->draw_tiles_y;
+
+   /* The PTB allocates in aligned 4k chunks after the initial setup. */
+   tile_alloc_size = align(tile_alloc_size, 4096);
+
+   /* Include the first two chunk allocations that the PTB does so that
+    * we definitely clear the OOM condition before triggering one (the HW
+    * won't trigger OOM during the first allocations).
+    */
+   tile_alloc_size += 8192;
+
+   /* For performance, allocate some extra initial memory after the PTB's
+    * minimal allocations, so that we hopefully don't have to block the
+    * GPU on the kernel handling an OOM signal.
+    */
+   tile_alloc_size += 512 * 1024;
+
+   job->tile_alloc = v3dv_bo_alloc(cmd_buffer->device, tile_alloc_size);
+   v3dv_job_add_bo(job, job->tile_alloc);
+
+   const uint32_t tsda_per_tile_size = 256;
+   const uint32_t tile_state_size = MAX2(fb_layers, 1) *
+                                    framebuffer->draw_tiles_x *
+                                    framebuffer->draw_tiles_y *
+                                    tsda_per_tile_size;
+   job->tile_state = v3dv_bo_alloc(cmd_buffer->device, tile_state_size);
+   v3dv_job_add_bo(job, job->tile_state);
+
+   /* This must go before the binning mode configuration. It is
+    * required for layered framebuffers to work.
+    */
+   if (fb_layers > 0) {
+      cl_emit(&job->bcl, NUMBER_OF_LAYERS, config) {
+         config.number_of_layers = fb_layers;
+      }
+   }
+
+   cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
+      config.width_in_pixels = framebuffer->width;
+      config.height_in_pixels = framebuffer->height;
+      config.number_of_render_targets = MAX2(framebuffer->attachment_count, 1);
+      config.multisample_mode_4x = false; /* FIXME */
+      config.maximum_bpp_of_all_render_targets = framebuffer->internal_bpp;
+   }
+
+   /* There's definitely nothing in the VCD cache we want. */
+   cl_emit(&job->bcl, FLUSH_VCD_CACHE, bin);
+
+   /* Disable any leftover OQ state from another job. */
+   cl_emit(&job->bcl, OCCLUSION_QUERY_COUNTER, counter);
+
+   /* "Binning mode lists must have a Start Tile Binning item (6) after
+    *  any prefix state data before the binning list proper starts."
+    */
+   cl_emit(&job->bcl, START_TILE_BINNING, bin);
+
+   /* If we don't have a scissor or viewport defined let's just use the render
+    * area as clip_window, as that would be required for a clear in any
+    * case. If we have that, it would be emitted as part of the pipeline
+    * dynamic state flush
+    *
+    * FIXME: this is mostly just needed for clear. radv has dedicated paths
+    * for them, so we could get that idea. In any case, need to revisit if
+    * this is the place to emit the clip window.
+    */
+   if (cmd_buffer->state.dynamic.scissor.count == 0 &&
+       cmd_buffer->state.dynamic.viewport.count == 0) {
+      emit_clip_window(job, &state->render_area);
+   }
 }
 
 static void
 subpass_finish(struct v3dv_cmd_buffer *cmd_buffer)
 {
-   v3dv_cl_ensure_space_with_branch(&cmd_buffer->bcl, cl_packet_length(FLUSH));
+   struct v3dv_job *job = cmd_buffer->state.job;
+   assert(job);
 
-   /* We need to emit a flush between binning jobs, so do this before we start
-    * recording the next subpass.
+   /* This finishes the a binning job.
     *
     * FIXME: if the next subpass draws to the same RTs, we could skip this
     * and the binning setup for the next subpass.
     */
-   cl_emit(&cmd_buffer->bcl, FLUSH, flush);
+   emit_binning_flush(job);
+   v3dv_cmd_buffer_finish_job(cmd_buffer);
 }
 
 static void
@@ -961,11 +1040,18 @@ v3dv_EndCommandBuffer(VkCommandBuffer commandBuffer)
 {
    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
 
-   if (v3dv_cl_offset(&cmd_buffer->bcl) == 0)
-      return VK_SUCCESS; /* FIXME? */
-
    cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_EXECUTABLE;
 
+   struct v3dv_job *job = cmd_buffer->state.job;
+   if (!job)
+      return VK_SUCCESS;
+
+   /* We get here if we recorded commands after the last render pass in the
+    * command buffer. Make sure we finish this last job. */
+   assert(v3dv_cl_offset(&job->bcl) != 0);
+   emit_binning_flush(job);
+   v3dv_cmd_buffer_finish_job(cmd_buffer);
+
    return VK_SUCCESS;
 }
 
@@ -1028,11 +1114,11 @@ v3dv_CmdBindPipeline(VkCommandBuffer commandBuffer,
 
       /* FIXME: is here the best moment to do that? or when drawing? */
       if (pipeline->vs->assembly_bo)
-         v3dv_cmd_buffer_add_bo(cmd_buffer, pipeline->vs->assembly_bo);
+         v3dv_job_add_bo(cmd_buffer->state.job, pipeline->vs->assembly_bo);
       if (pipeline->vs_bin->assembly_bo)
-         v3dv_cmd_buffer_add_bo(cmd_buffer, pipeline->vs_bin->assembly_bo);
+         v3dv_job_add_bo(cmd_buffer->state.job, pipeline->vs_bin->assembly_bo);
       if (pipeline->fs->assembly_bo)
-         v3dv_cmd_buffer_add_bo(cmd_buffer, pipeline->fs->assembly_bo);
+         v3dv_job_add_bo(cmd_buffer->state.job, pipeline->fs->assembly_bo);
 
       cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PIPELINE;
       break;
@@ -1181,7 +1267,7 @@ emit_scissor(struct v3dv_cmd_buffer *cmd_buffer)
    clip_window.extent.width = maxx - minx;
    clip_window.extent.height = maxy - miny;
 
-   emit_clip_window(cmd_buffer, &clip_window);
+   emit_clip_window(cmd_buffer->state.job, &clip_window);
 }
 
 static void
@@ -1194,23 +1280,26 @@ emit_viewport(struct v3dv_cmd_buffer *cmd_buffer)
    float *vptranslate = dynamic->viewport.translate[0];
    float *vpscale = dynamic->viewport.scale[0];
 
-   cl_emit(&cmd_buffer->bcl, CLIPPER_XY_SCALING, clip) {
+   struct v3dv_job *job = cmd_buffer->state.job;
+   assert(job);
+
+   cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
       clip.viewport_half_width_in_1_256th_of_pixel = vpscale[0] * 256.0f;
       clip.viewport_half_height_in_1_256th_of_pixel = vpscale[1] * 256.0f;
    }
 
-   cl_emit(&cmd_buffer->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
+   cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
       clip.viewport_z_offset_zc_to_zs = vptranslate[2];
       clip.viewport_z_scale_zc_to_zs = vpscale[2];
    }
-   cl_emit(&cmd_buffer->bcl, CLIPPER_Z_MIN_MAX_CLIPPING_PLANES, clip) {
+   cl_emit(&job->bcl, CLIPPER_Z_MIN_MAX_CLIPPING_PLANES, clip) {
       float z1 = (vptranslate[2] - vpscale[2]);
       float z2 = (vptranslate[2] + vpscale[2]);
       clip.minimum_zw = MIN2(z1, z2);
       clip.maximum_zw = MAX2(z1, z2);
    }
 
-   cl_emit(&cmd_buffer->bcl, VIEWPORT_OFFSET, vp) {
+   cl_emit(&job->bcl, VIEWPORT_OFFSET, vp) {
       vp.viewport_centre_x_coordinate = vptranslate[0];
       vp.viewport_centre_y_coordinate = vptranslate[1];
    }
@@ -1233,9 +1322,11 @@ struct vpm_config {
 static void
 cmd_buffer_emit_graphics_pipeline(struct v3dv_cmd_buffer *cmd_buffer)
 {
+   struct v3dv_job *job = cmd_buffer->state.job;
+   assert(job);
+
    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
    struct v3dv_pipeline *pipeline = state->pipeline;
-
    assert(pipeline);
 
    /* Upload the uniforms to the indirect CL first */
@@ -1249,9 +1340,9 @@ cmd_buffer_emit_graphics_pipeline(struct v3dv_cmd_buffer *cmd_buffer)
       v3dv_write_uniforms(cmd_buffer, pipeline->vs_bin);
 
    /* Update the cache dirty flag based on the shader progs data */
-   state->tmu_dirty_rcl |= pipeline->vs_bin->prog_data.vs->base.tmu_dirty_rcl;
-   state->tmu_dirty_rcl |= pipeline->vs->prog_data.vs->base.tmu_dirty_rcl;
-   state->tmu_dirty_rcl |= pipeline->fs->prog_data.fs->base.tmu_dirty_rcl;
+   job->tmu_dirty_rcl |= pipeline->vs_bin->prog_data.vs->base.tmu_dirty_rcl;
+   job->tmu_dirty_rcl |= pipeline->vs->prog_data.vs->base.tmu_dirty_rcl;
+   job->tmu_dirty_rcl |= pipeline->fs->prog_data.fs->base.tmu_dirty_rcl;
 
    /* FIXME: fake vtx->num_elements, that is the vertex state that includes
     * data from the buffers used on the vertex. Such info is still not
@@ -1267,7 +1358,7 @@ cmd_buffer_emit_graphics_pipeline(struct v3dv_cmd_buffer *cmd_buffer)
    uint32_t num_elements_to_emit = MAX2(vtx_num_elements, 1);
 
    uint32_t shader_rec_offset =
-      v3dv_cl_ensure_space(&cmd_buffer->indirect,
+      v3dv_cl_ensure_space(&job->indirect,
                            cl_packet_length(GL_SHADER_STATE_RECORD) +
                            num_elements_to_emit *
                            cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD),
@@ -1286,7 +1377,7 @@ cmd_buffer_emit_graphics_pipeline(struct v3dv_cmd_buffer *cmd_buffer)
    vpm_cfg.Ve = 0;
    vpm_cfg.Vc = pipeline->vs->prog_data.vs->vcm_cache_size;
 
-   cl_emit(&cmd_buffer->indirect, GL_SHADER_STATE_RECORD, shader) {
+   cl_emit(&job->indirect, GL_SHADER_STATE_RECORD, shader) {
       shader.enable_clipping = true;
 
       shader.point_size_in_shaded_vertex_data =
@@ -1400,9 +1491,9 @@ cmd_buffer_emit_graphics_pipeline(struct v3dv_cmd_buffer *cmd_buffer)
        * by CS and VS.  If we have no attributes being consumed by
        * the shader, set up a dummy to be loaded into the VPM.
        */
-      cl_emit(&cmd_buffer->indirect, GL_SHADER_STATE_ATTRIBUTE_RECORD, attr) {
+      cl_emit(&job->indirect, GL_SHADER_STATE_ATTRIBUTE_RECORD, attr) {
          /* Valid address of data whose value will be unused. */
-         attr.address = v3dv_cl_address(cmd_buffer->indirect.bo, 0);
+         attr.address = v3dv_cl_address(job->indirect.bo, 0);
 
          attr.type = ATTRIBUTE_FLOAT;
          attr.stride = 0;
@@ -1413,13 +1504,13 @@ cmd_buffer_emit_graphics_pipeline(struct v3dv_cmd_buffer *cmd_buffer)
       }
    }
 
-   cl_emit(&cmd_buffer->bcl, VCM_CACHE_SIZE, vcm) {
+   cl_emit(&job->bcl, VCM_CACHE_SIZE, vcm) {
       vcm.number_of_16_vertex_batches_for_binning = vpm_cfg_bin.Vc;
       vcm.number_of_16_vertex_batches_for_rendering = vpm_cfg.Vc;
    }
 
-   cl_emit(&cmd_buffer->bcl, GL_SHADER_STATE, state) {
-      state.address = v3dv_cl_address(cmd_buffer->indirect.bo,
+   cl_emit(&job->bcl, GL_SHADER_STATE, state) {
+      state.address = v3dv_cl_address(job->indirect.bo,
                                       shader_rec_offset);
       state.number_of_attribute_arrays = num_elements_to_emit;
    }
@@ -1462,6 +1553,9 @@ static void
 cmd_buffer_emit_draw_packets(struct v3dv_cmd_buffer *cmd_buffer,
                              struct v3dv_draw_info *info)
 {
+   struct v3dv_job *job = cmd_buffer->state.job;
+   assert(job);
+
    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
    struct v3dv_pipeline *pipeline = state->pipeline;
 
@@ -1473,7 +1567,7 @@ cmd_buffer_emit_draw_packets(struct v3dv_cmd_buffer *cmd_buffer,
    /* FIXME: using VERTEX_ARRAY_PRIMS always as it fits our test caselist
     * right now. Need to be choosen based on the current case.
     */
-   cl_emit(&cmd_buffer->bcl, VERTEX_ARRAY_PRIMS, prim) {
+   cl_emit(&job->bcl, VERTEX_ARRAY_PRIMS, prim) {
       prim.mode = hw_prim_type | prim_tf_enable;
       prim.length = info->vertex_count;
       prim.index_of_first_vertex = info->first_vertex;
diff --git a/src/broadcom/vulkan/v3dv_meta_copy.c b/src/broadcom/vulkan/v3dv_meta_copy.c
index a7728a22090..fc55be831c0 100644
--- a/src/broadcom/vulkan/v3dv_meta_copy.c
+++ b/src/broadcom/vulkan/v3dv_meta_copy.c
@@ -27,8 +27,7 @@
 #include "vk_format_info.h"
 
 static void
-emit_image_loads(struct v3dv_cmd_buffer *cmd_buffer,
-                 struct v3dv_cl *cl,
+emit_image_loads(struct v3dv_cl *cl,
                  struct v3dv_image *image,
                  uint32_t layer,
                  uint32_t mip_level)
@@ -67,8 +66,7 @@ emit_image_loads(struct v3dv_cmd_buffer *cmd_buffer,
 }
 
 static void
-emit_buffer_stores(struct v3dv_cmd_buffer *cmd_buffer,
-                   struct v3dv_cl *cl,
+emit_buffer_stores(struct v3dv_cl *cl,
                    struct v3dv_buffer *buffer,
                    struct v3dv_image *image,
                    uint32_t buffer_offset,
@@ -92,13 +90,13 @@ emit_buffer_stores(struct v3dv_cmd_buffer *cmd_buffer,
 }
 
 static void
-emit_copy_layer_to_buffer_per_tile_list(struct v3dv_cmd_buffer *cmd_buffer,
+emit_copy_layer_to_buffer_per_tile_list(struct v3dv_job *job,
                                         struct v3dv_buffer *buffer,
                                         struct v3dv_image *image,
                                         uint32_t layer,
                                         const VkBufferImageCopy *region)
 {
-   struct v3dv_cl *cl = &cmd_buffer->indirect;
+   struct v3dv_cl *cl = &job->indirect;
    v3dv_cl_ensure_space(cl, 200, 1);
    struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);
 
@@ -108,8 +106,7 @@ emit_copy_layer_to_buffer_per_tile_list(struct v3dv_cmd_buffer *cmd_buffer,
    assert(layer < imgrsc->layerCount);
 
    /* Load image to TLB */
-   emit_image_loads(cmd_buffer, cl, image,
-                    imgrsc->baseArrayLayer + layer, imgrsc->mipLevel);
+   emit_image_loads(cl, image, imgrsc->baseArrayLayer + layer, imgrsc->mipLevel);
 
    cl_emit(cl, PRIM_LIST_FORMAT, fmt) {
       fmt.primitive_type = LIST_TRIANGLES;
@@ -130,21 +127,20 @@ emit_copy_layer_to_buffer_per_tile_list(struct v3dv_cmd_buffer *cmd_buffer,
    uint32_t buffer_stride = width * image->cpp;
    uint32_t buffer_offset =
       region->bufferOffset + height * buffer_stride * layer;
-   emit_buffer_stores(cmd_buffer, cl, buffer, image,
-                      buffer_offset, buffer_stride);
+   emit_buffer_stores(cl, buffer, image, buffer_offset, buffer_stride);
 
    cl_emit(cl, END_OF_TILE_MARKER, end);
 
    cl_emit(cl, RETURN_FROM_SUB_LIST, ret);
 
-   cl_emit(&cmd_buffer->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
+   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
       branch.start = tile_list_start;
       branch.end = v3dv_cl_get_address(cl);
    }
 }
 
 static void
-emit_copy_layer_to_buffer(struct v3dv_cmd_buffer *cmd_buffer,
+emit_copy_layer_to_buffer(struct v3dv_job *job,
                           uint32_t min_x_supertile,
                           uint32_t min_y_supertile,
                           uint32_t max_x_supertile,
@@ -155,12 +151,12 @@ emit_copy_layer_to_buffer(struct v3dv_cmd_buffer *cmd_buffer,
                           uint32_t layer,
                           const VkBufferImageCopy *region)
 {
-   struct v3dv_cl *rcl = &cmd_buffer->rcl;
+   struct v3dv_cl *rcl = &job->rcl;
 
    const uint32_t tile_alloc_offset =
       64 * layer * framebuffer->draw_tiles_x * framebuffer->draw_tiles_y;
    cl_emit(rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) {
-      list.address = v3dv_cl_address(cmd_buffer->tile_alloc, tile_alloc_offset);
+      list.address = v3dv_cl_address(job->tile_alloc, tile_alloc_offset);
    }
 
    cl_emit(rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) {
@@ -189,8 +185,7 @@ emit_copy_layer_to_buffer(struct v3dv_cmd_buffer *cmd_buffer,
 
    cl_emit(rcl, FLUSH_VCD_CACHE, flush);
 
-   emit_copy_layer_to_buffer_per_tile_list(cmd_buffer, buffer, image,
-                                           layer, region);
+   emit_copy_layer_to_buffer_per_tile_list(job, buffer, image, layer, region);
 
    for (int y = min_y_supertile; y <= max_y_supertile; y++) {
       for (int x = min_x_supertile; x <= max_x_supertile; x++) {
@@ -203,7 +198,7 @@ emit_copy_layer_to_buffer(struct v3dv_cmd_buffer *cmd_buffer,
 }
 
 static void
-emit_copy_image_to_buffer_rcl(struct v3dv_cmd_buffer *cmd_buffer,
+emit_copy_image_to_buffer_rcl(struct v3dv_job *job,
                               struct v3dv_buffer *buffer,
                               struct v3dv_image *image,
                               struct v3dv_framebuffer *framebuffer,
@@ -212,7 +207,7 @@ emit_copy_image_to_buffer_rcl(struct v3dv_cmd_buffer *cmd_buffer,
 {
    const VkImageSubresourceLayers *imgrsc = &region->imageSubresource;
 
-   struct v3dv_cl *rcl = &cmd_buffer->rcl;
+   struct v3dv_cl *rcl = &job->rcl;
    v3dv_cl_ensure_space_with_branch(rcl, 200 +
                                     imgrsc->layerCount * 256 *
                                     cl_packet_length(SUPERTILE_COORDINATES));
@@ -263,7 +258,7 @@ emit_copy_image_to_buffer_rcl(struct v3dv_cmd_buffer *cmd_buffer,
    const uint32_t max_y_supertile = max_render_y / supertile_h_in_pixels;
 
    for (int layer = 0; layer < imgrsc->layerCount; layer++) {
-      emit_copy_layer_to_buffer(cmd_buffer,
+      emit_copy_layer_to_buffer(job,
                                 min_x_supertile, min_y_supertile,
                                 max_x_supertile, max_y_supertile,
                                 buffer, image, framebuffer,
@@ -275,17 +270,17 @@ emit_copy_image_to_buffer_rcl(struct v3dv_cmd_buffer *cmd_buffer,
 }
 
 static void
-emit_copy_image_to_buffer_bcl(struct v3dv_cmd_buffer *cmd_buffer,
+emit_copy_image_to_buffer_bcl(struct v3dv_job *job,
                               struct v3dv_framebuffer *framebuffer,
                               const VkBufferImageCopy *region)
 {
-   v3dv_cl_ensure_space_with_branch(&cmd_buffer->bcl, 256);
+   v3dv_cl_ensure_space_with_branch(&job->bcl, 256);
 
-   cl_emit(&cmd_buffer->bcl, NUMBER_OF_LAYERS, config) {
+   cl_emit(&job->bcl, NUMBER_OF_LAYERS, config) {
       config.number_of_layers = framebuffer->layers;
    }
 
-   cl_emit(&cmd_buffer->bcl, TILE_BINNING_MODE_CFG, config) {
+   cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
       config.width_in_pixels = framebuffer->width;
       config.height_in_pixels = framebuffer->height;
       config.number_of_render_targets = 1;
@@ -293,20 +288,20 @@ emit_copy_image_to_buffer_bcl(struct v3dv_cmd_buffer *cmd_buffer,
       config.maximum_bpp_of_all_render_targets = framebuffer->internal_bpp;
    }
 
-   cl_emit(&cmd_buffer->bcl, FLUSH_VCD_CACHE, bin);
+   cl_emit(&job->bcl, FLUSH_VCD_CACHE, bin);
 
-   cl_emit(&cmd_buffer->bcl, OCCLUSION_QUERY_COUNTER, counter);
+   cl_emit(&job->bcl, OCCLUSION_QUERY_COUNTER, counter);
 
-   cl_emit(&cmd_buffer->bcl, START_TILE_BINNING, bin);
+   cl_emit(&job->bcl, START_TILE_BINNING, bin);
 
-   cl_emit(&cmd_buffer->bcl, CLIP_WINDOW, clip) {
+   cl_emit(&job->bcl, CLIP_WINDOW, clip) {
       clip.clip_window_left_pixel_coordinate = region->imageOffset.x;
       clip.clip_window_bottom_pixel_coordinate = region->imageOffset.y;
       clip.clip_window_width_in_pixels = region->imageExtent.width;
       clip.clip_window_height_in_pixels = region->imageExtent.height;
    }
 
-   cl_emit(&cmd_buffer->bcl, FLUSH, flush);
+   cl_emit(&job->bcl, FLUSH, flush);
 }
 
 /* Sets framebuffer dimensions and computes tile size parameters based on the
@@ -365,35 +360,30 @@ copy_image_to_buffer_tlb(struct v3dv_cmd_buffer *cmd_buffer,
       struct v3dv_framebuffer framebuffer;
       setup_framebuffer_params(&framebuffer, image, num_layers, internal_bpp);
 
-      /* FIXME: here we assume that we have a valid tile alloc/state setup,
-       *        which is usually the case for copy after render scenarios. The
-       *        code below simply checks and asserts this requirement,
-       *        however, a proper implementation should allocate new tile
-       *        alloc/state if we don't have one (for example if we haven't
-       *        recorded a render pass yet) or the one we have isn't large
-       *        enough. We still need to figure out how we want to handle
-       *        varying tile alloc/state requirements in a command buffer.
-       */
+      struct v3dv_job *job = v3dv_cmd_buffer_start_job(cmd_buffer);
+
       uint32_t tile_alloc_size = 64 * num_layers *
                                  framebuffer.draw_tiles_x *
                                  framebuffer.draw_tiles_y;
       tile_alloc_size = align(tile_alloc_size, 4096);
       tile_alloc_size += 8192;
       tile_alloc_size += 512 * 1024;
-      assert(cmd_buffer->tile_alloc &&
-             cmd_buffer->tile_alloc->size >= tile_alloc_size);
+      job->tile_alloc = v3dv_bo_alloc(cmd_buffer->device, tile_alloc_size);
+      v3dv_job_add_bo(job, job->tile_alloc);
 
       const uint32_t tsda_per_tile_size = 256;
       const uint32_t tile_state_size = num_layers *
                                        framebuffer.draw_tiles_x *
                                        framebuffer.draw_tiles_y *
                                        tsda_per_tile_size;
-      assert(cmd_buffer->tile_state &&
-             cmd_buffer->tile_state->size >= tile_state_size);
+      job->tile_state = v3dv_bo_alloc(cmd_buffer->device, tile_state_size);
+      v3dv_job_add_bo(job, job->tile_state);
 
-      emit_copy_image_to_buffer_bcl(cmd_buffer, &framebuffer, region);
-      emit_copy_image_to_buffer_rcl(cmd_buffer, buffer, image,
+      emit_copy_image_to_buffer_bcl(job, &framebuffer, region);
+      emit_copy_image_to_buffer_rcl(job, buffer, image,
                                     &framebuffer, internal_type, region);
+
+      v3dv_cmd_buffer_finish_job(cmd_buffer);
 }
 
 void
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
index 3ef1d14e945..d9c698f1a98 100644
--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@@ -439,6 +439,30 @@ struct v3dv_dynamic_state {
 
 extern const struct v3dv_dynamic_state default_dynamic_state;
 
+struct v3dv_job {
+   struct list_head list_link;
+
+   struct v3dv_cmd_buffer *cmd_buffer;
+
+   struct v3dv_cl bcl;
+   struct v3dv_cl rcl;
+   struct v3dv_cl indirect;
+
+   /* Set of all BOs referenced by the job. This will be used for making
+    * the list of BOs that the kernel will need to have paged in to
+    * execute our job.
+    */
+   struct set *bos;
+   uint32_t bo_count;
+
+   struct v3dv_bo *tile_alloc;
+   struct v3dv_bo *tile_state;
+
+   bool tmu_dirty_rcl;
+};
+
+void v3dv_job_add_bo(struct v3dv_job *job, struct v3dv_bo *bo);
+
 struct v3dv_cmd_buffer_state {
    const struct v3dv_render_pass *pass;
    const struct v3dv_framebuffer *framebuffer;
@@ -456,8 +480,8 @@ struct v3dv_cmd_buffer_state {
    struct v3dv_dynamic_state dynamic;
    uint32_t dirty;
 
-   /* FIXME: here? */
-   bool tmu_dirty_rcl;
+   /* Current job being recorded */
+   struct v3dv_job *job;
 };
 
 struct v3dv_cmd_buffer {
@@ -471,26 +495,16 @@ struct v3dv_cmd_buffer {
    VkCommandBufferUsageFlags usage_flags;
    VkCommandBufferLevel level;
 
-   struct v3dv_cl bcl;
-   struct v3dv_cl rcl;
-   struct v3dv_cl indirect;
-
    enum v3dv_cmd_buffer_status status;
 
    struct v3dv_cmd_buffer_state state;
 
-   /* Set of all BOs referenced by the job. This will be used for making
-    * the list of BOs that the kernel will need to have paged in to
-    * execute our job.
-    */
-   struct set *bos;
-   uint32_t bo_count;
-
-   struct v3dv_bo *tile_alloc;
-   struct v3dv_bo *tile_state;
+   /* List of jobs to submit to the kernel */
+   struct list_head submit_jobs;
 };
 
-void v3dv_cmd_buffer_add_bo(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_bo *bo);
+struct v3dv_job *v3dv_cmd_buffer_start_job(struct v3dv_cmd_buffer *cmd_buffer);
+void v3dv_cmd_buffer_finish_job(struct v3dv_cmd_buffer *cmd_buffer);
 
 struct v3dv_shader_module {
    unsigned char sha1[20];
diff --git a/src/broadcom/vulkan/v3dv_queue.c b/src/broadcom/vulkan/v3dv_queue.c
index 678bfb4d813..186c9f07301 100644
--- a/src/broadcom/vulkan/v3dv_queue.c
+++ b/src/broadcom/vulkan/v3dv_queue.c
@@ -29,23 +29,23 @@
 #include <errno.h>
 
 static void
-v3dv_clif_dump(struct v3dv_queue *queue,
-               struct v3dv_cmd_buffer *cmd_buffer,
+v3dv_clif_dump(struct v3dv_device *device,
+               struct v3dv_job *job,
                struct drm_v3d_submit_cl *submit)
 {
    if (!(V3D_DEBUG & (V3D_DEBUG_CL | V3D_DEBUG_CLIF)))
       return;
 
-   struct clif_dump *clif = clif_dump_init(&queue->device->devinfo,
+   struct clif_dump *clif = clif_dump_init(&device->devinfo,
                                            stderr,
                                            V3D_DEBUG & V3D_DEBUG_CL);
 
-   set_foreach(cmd_buffer->bos, entry) {
+   set_foreach(job->bos, entry) {
       struct v3dv_bo *bo = (void *)entry->key;
       char *name = ralloc_asprintf(NULL, "%s_0x%x",
                                    "" /* bo->name */ , bo->offset);
 
-      v3dv_bo_map(queue->device, bo, bo->size);
+      v3dv_bo_map(device, bo, bo->size);
       clif_dump_add_bo(clif, name, bo->offset, bo->size, bo->map);
 
       ralloc_free(name);
@@ -57,17 +57,9 @@ v3dv_clif_dump(struct v3dv_queue *queue,
 }
 
 static VkResult
-queue_submit(struct v3dv_queue *queue,
-             const VkSubmitInfo *pSubmit,
-             VkFence fence)
+job_submit(struct v3dv_job *job)
 {
-   /* FIXME */
-   assert(fence == 0);
-   assert(pSubmit->waitSemaphoreCount == 0);
-   assert(pSubmit->signalSemaphoreCount == 0);
-   assert(pSubmit->commandBufferCount == 1);
-
-   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, pSubmit->pCommandBuffers[0]);
+   assert(job);
 
    struct drm_v3d_submit_cl submit;
 
@@ -79,36 +71,37 @@ queue_submit(struct v3dv_queue *queue,
    /* Update the sync object for the last rendering by our context. */
    submit.out_sync = 0; /* FIXME */
 
-   submit.bcl_start = cmd_buffer->bcl.bo->offset;
-   submit.bcl_end = cmd_buffer->bcl.bo->offset + v3dv_cl_offset(&cmd_buffer->bcl);
-   submit.rcl_start = cmd_buffer->rcl.bo->offset;
-   submit.rcl_end = cmd_buffer->rcl.bo->offset + v3dv_cl_offset(&cmd_buffer->rcl);
+   submit.bcl_start = job->bcl.bo->offset;
+   submit.bcl_end = job->bcl.bo->offset + v3dv_cl_offset(&job->bcl);
+   submit.rcl_start = job->rcl.bo->offset;
+   submit.rcl_end = job->rcl.bo->offset + v3dv_cl_offset(&job->rcl);
 
    submit.flags = 0;
    /* FIXME: we already know that we support cache flush, as we only support
     * hw that supports that, but would be better to just DRM-ask it
     */
-   if (cmd_buffer->state.tmu_dirty_rcl)
+   if (job->tmu_dirty_rcl)
       submit.flags |= DRM_V3D_SUBMIT_CL_FLUSH_CACHE;
 
-   submit.qma = cmd_buffer->tile_alloc->offset;
-   submit.qms = cmd_buffer->tile_alloc->size;
-   submit.qts = cmd_buffer->tile_state->offset;
+   submit.qma = job->tile_alloc->offset;
+   submit.qms = job->tile_alloc->size;
+   submit.qts = job->tile_state->offset;
 
-   submit.bo_handle_count = cmd_buffer->bo_count;
+   submit.bo_handle_count = job->bo_count;
    uint32_t *bo_handles =
       (uint32_t *) malloc(sizeof(uint32_t) * MAX2(4, submit.bo_handle_count * 2));
    uint32_t bo_idx = 0;
-   set_foreach(cmd_buffer->bos, entry) {
+   set_foreach(job->bos, entry) {
       struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
       bo_handles[bo_idx++] = bo->handle;
    }
    assert(bo_idx == submit.bo_handle_count);
    submit.bo_handles = (uintptr_t)(void *)bo_handles;
 
-   v3dv_clif_dump(queue, cmd_buffer, &submit);
+   struct v3dv_device *device = job->cmd_buffer->device;
+   v3dv_clif_dump(device, job, &submit);
 
-   int ret = v3dv_ioctl(queue->device->fd, DRM_IOCTL_V3D_SUBMIT_CL, &submit);
+   int ret = v3dv_ioctl(device->fd, DRM_IOCTL_V3D_SUBMIT_CL, &submit);
    static bool warned = false;
    if (ret && !warned) {
       fprintf(stderr, "Draw call returned %s. Expect corruption.\n",
@@ -124,6 +117,29 @@ queue_submit(struct v3dv_queue *queue,
    return VK_SUCCESS;
 }
 
+static VkResult
+queue_submit(struct v3dv_queue *queue,
+             const VkSubmitInfo *pSubmit,
+             VkFence fence)
+{
+   /* FIXME */
+   assert(fence == 0);
+   assert(pSubmit->waitSemaphoreCount == 0);
+   assert(pSubmit->signalSemaphoreCount == 0);
+   assert(pSubmit->commandBufferCount == 1);
+
+   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, pSubmit->pCommandBuffers[0]);
+
+   list_for_each_entry_safe(struct v3dv_job, job,
+                            &cmd_buffer->submit_jobs, list_link) {
+      VkResult result = job_submit(job);
+      if (result != VK_SUCCESS)
+         return result;
+   }
+
+   return VK_SUCCESS;
+}
+
 VkResult
 v3dv_QueueSubmit(VkQueue _queue,
                  uint32_t submitCount,
diff --git a/src/broadcom/vulkan/v3dv_uniforms.c b/src/broadcom/vulkan/v3dv_uniforms.c
index 109dbe47158..0652753fb9b 100644
--- a/src/broadcom/vulkan/v3dv_uniforms.c
+++ b/src/broadcom/vulkan/v3dv_uniforms.c
@@ -34,6 +34,9 @@ v3dv_write_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
    struct v3d_uniform_list *uinfo = &p_stage->prog_data.base->uniforms;
    struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
 
+   struct v3dv_job *job = cmd_buffer->state.job;
+   assert(job);
+
    /* The hardware always pre-fetches the next uniform (also when there
     * aren't any), so we always allocate space for an extra slot. This
     * fixes MMU exceptions reported since Linux kernel 5.4 when the
@@ -42,13 +45,11 @@ v3dv_write_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
     * the last uniform it will read beyond the end of the page and trigger
     * the MMU exception.
     */
-   v3dv_cl_ensure_space(&cmd_buffer->indirect, (uinfo->count + 1) * 4, 4);
+   v3dv_cl_ensure_space(&job->indirect, (uinfo->count + 1) * 4, 4);
 
-   struct v3dv_cl_reloc uniform_stream =
-      v3dv_cl_get_address(&cmd_buffer->indirect);
+   struct v3dv_cl_reloc uniform_stream = v3dv_cl_get_address(&job->indirect);
 
-   struct v3dv_cl_out *uniforms =
-      cl_start(&cmd_buffer->indirect);
+   struct v3dv_cl_out *uniforms = cl_start(&job->indirect);
 
    for (int i = 0; i < uinfo->count; i++) {
       uint32_t data = uinfo->data[i];
@@ -79,7 +80,7 @@ v3dv_write_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
       }
    }
 
-   cl_end(&cmd_buffer->indirect, uniforms);
+   cl_end(&job->indirect, uniforms);
 
    return uniform_stream;
 }