author     Alyssa Rosenzweig <alyssa@rosenzweig.io>  2023-03-19 22:12:09 -0400
committer  Alyssa Rosenzweig <alyssa@rosenzweig.io>  2023-05-07 09:10:37 -0400
commit     d72e1418ce4f66c42f20779f50f40091d3d310b0 (patch)
tree       e0bc5e07ceb3f08acffe7dd07e85c9611677bdd2
parent     25646c7772c0bdda9ddf29490130e3a653d4c6aa (diff)
asahi: Implement transform feedback
This code was originally based on the Panfrost implementation, but has
been improved in a number of ways:

1. Transform feedback programs are dispatched generically with Gallium
   calls, rather than emitting something hardware-specific. This is
   cleaner and portable to future GPUs.

2. Transform feedback with indexed draws is now fixed, by lowering to
   an index buffer pull.

3. Transform feedback with buffer overflows is now fixed, by correctly
   bounds checking in transform feedback programs.

4. Transform feedback with strips/fans/loops is now fixed, by correctly
   tessellating to the underlying primitives as required by OpenGL.

5. Transform feedback with QUADS is fixed, by tessellating to triangles
   as required by OpenGL.

That said, the code is still not in its final form:

1. It still does not support indirect draws. This will require a
   substantial overhaul to do tracking on the GPU instead of the CPU.
   Currently we force unroll indirect draws (slow but kosher in GL,
   treif in Vulkan). This isn't hard to solve, but I'm not going to
   duplicate the code until the algorithms are otherwise complete,
   because it's a lot easier to hack on the CPU versions than the GPU
   versions.

2. It still does not support primitive restart. This has especially
   nasty interactions with transform feedback. Again we force unroll to
   non-primitive-restart forms; again slow but kosher in GL, treif in
   Vulkan. This is a lot harder to deal with. I sketched out something
   really nasty in my notebook (hinging on efficient GPU prefix sums),
   but I'm not in a hurry to type this out.

3. There will be interactions with geometry and tessellation shaders,
   and I don't think I can get the core code here future-proofed
   without actually bringing up the new shader stages.

As such, this is a hard fork of the Panfrost code for now; I'm not
trying to share the code (although it *would* clear out almost all of
Panfrost's transform feedback related piglit failures).

Passes dEQP-GLES3.functional.transform_feedback.* and most of the
relevant piglits.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22891>
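For orientation, here is a minimal CPU-side sketch of the triangle-strip
unrolling described in point 4 above. It is illustrative only — the name
strip_source_vertex is made up for this note — but it performs the same
arithmetic the NIR lowering in agx_streamout.c (below) builds out of
bcsel and integer ops:

    #include <stdbool.h>
    #include <stdint.h>

    /* Map an output slot (3 slots per decomposed triangle) back to the
     * source vertex in the strip. Odd triangles swap the two
     * non-provoking vertices so the decomposed triangles keep the
     * strip's winding:
     *
     *   provoking first: (0, 1, 2), (1, 3, 2), (2, 3, 4), ...
     *   provoking last:  (0, 1, 2), (2, 1, 3), (2, 3, 4), ...
     */
    static uint32_t
    strip_source_vertex(uint32_t id, bool flatshade_first)
    {
       uint32_t prim = id / 3;                /* which triangle */
       uint32_t rem = id % 3;                 /* slot within it */
       unsigned pv = flatshade_first ? 0 : 2; /* provoking vertex slot */

       bool even = (prim & 1) == 0;
       bool no_swap = even || (rem == pv);
       uint32_t off = no_swap ? rem : (3 - pv) - rem;

       return prim + off;
    }

Feeding slots 3, 4, 5 through this with flatshade_first = true yields
vertices (1, 3, 2): the second triangle of the strip with its winding
restored, matching the table in the lowering below.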
-rw-r--r--  docs/features.txt                                  |   8
-rw-r--r--  src/gallium/drivers/asahi/agx_blit.c               |   3
-rw-r--r--  src/gallium/drivers/asahi/agx_nir_lower_sysvals.c  |  12
-rw-r--r--  src/gallium/drivers/asahi/agx_pipe.c               |  14
-rw-r--r--  src/gallium/drivers/asahi/agx_query.c              |  30
-rw-r--r--  src/gallium/drivers/asahi/agx_state.c              | 135
-rw-r--r--  src/gallium/drivers/asahi/agx_state.h              |  68
-rw-r--r--  src/gallium/drivers/asahi/agx_streamout.c          | 574
-rw-r--r--  src/gallium/drivers/asahi/agx_uniforms.c           |  10
-rw-r--r--  src/gallium/drivers/asahi/meson.build              |   1

10 files changed, 768 insertions(+), 87 deletions(-)
diff --git a/docs/features.txt b/docs/features.txt
index f8e88faf89d..34709295c87 100644
--- a/docs/features.txt
+++ b/docs/features.txt
@@ -54,7 +54,7 @@ GL 3.0, GLSL 1.30 --- all DONE: freedreno, i965, nv50, nvc0, r600, radeonsi, llv
GL_EXT_draw_buffers2 (Per-buffer blend and masks) DONE (v3d, asahi)
GL_EXT_texture_compression_rgtc DONE (all drivers that support GL_EXT_texture_snorm)
GL_ARB_texture_rg DONE (v3d, lima, asahi)
- GL_EXT_transform_feedback (Transform feedback) DONE (v3d)
+ GL_EXT_transform_feedback (Transform feedback) DONE (v3d, asahi)
GL_ARB_vertex_array_object (Vertex array objects) DONE (v3d, vc4, lima, asahi)
GL_EXT_framebuffer_sRGB (sRGB framebuffer format) DONE (v3d, vc4, lima, asahi)
glClearBuffer commands DONE
@@ -136,8 +136,8 @@ GL 4.0, GLSL 4.00 --- all DONE: freedreno/a6xx, i965/gen7+, nvc0, r600, radeonsi
GL_ARB_texture_cube_map_array DONE (freedreno/a4xx+, i965/gen6+, nv50, softpipe, v3d)
GL_ARB_texture_gather DONE (freedreno, i965/gen6+, nv50, softpipe, v3d, panfrost, asahi)
GL_ARB_texture_query_lod DONE (freedreno, i965, nv50, softpipe, v3d, panfrost)
- GL_ARB_transform_feedback2 DONE (freedreno/a3xx+, i965/gen6+, nv50, softpipe, v3d, panfrost)
- GL_ARB_transform_feedback3 DONE (freedreno/a3xx+, i965/gen7+, softpipe, )
+ GL_ARB_transform_feedback2 DONE (freedreno/a3xx+, i965/gen6+, nv50, softpipe, v3d, panfrost, asahi)
+ GL_ARB_transform_feedback3 DONE (freedreno/a3xx+, i965/gen7+, softpipe, asahi)
GL 4.1, GLSL 4.10 --- all DONE: freedreno/a6xx, i965/gen7+, nvc0, r600, radeonsi, llvmpipe, virgl, zink, d3d12
@@ -156,7 +156,7 @@ GL 4.2, GLSL 4.20 -- all DONE: freedreno/a6xx, i965/gen7+, nvc0, r600, radeonsi,
GL_ARB_compressed_texture_pixel_storage DONE (all drivers)
GL_ARB_shader_atomic_counters DONE (freedreno/a5xx+, i965, softpipe, v3d, panfrost)
GL_ARB_texture_storage DONE (all drivers)
- GL_ARB_transform_feedback_instanced DONE (freedreno, i965, nv50, softpipe, v3d)
+ GL_ARB_transform_feedback_instanced DONE (freedreno, i965, nv50, softpipe, v3d, asahi)
GL_ARB_base_instance DONE (freedreno, i965, nv50, softpipe, v3d)
GL_ARB_shader_image_load_store DONE (freedreno/a5xx+, i965, softpipe, panfrost)
GL_ARB_conservative_depth DONE (all drivers that support GLSL 1.30)
diff --git a/src/gallium/drivers/asahi/agx_blit.c b/src/gallium/drivers/asahi/agx_blit.c
index 873307c4164..5e104ba93ca 100644
--- a/src/gallium/drivers/asahi/agx_blit.c
+++ b/src/gallium/drivers/asahi/agx_blit.c
@@ -27,7 +27,8 @@ agx_blitter_save(struct agx_context *ctx, struct blitter_context *blitter,
util_blitter_save_blend(blitter, ctx->blend);
util_blitter_save_depth_stencil_alpha(blitter, ctx->zs);
util_blitter_save_stencil_ref(blitter, &ctx->stencil_ref);
- util_blitter_save_so_targets(blitter, 0, NULL);
+ util_blitter_save_so_targets(blitter, ctx->streamout.num_targets,
+ ctx->streamout.targets);
util_blitter_save_sample_mask(blitter, ctx->sample_mask, 0);
util_blitter_save_framebuffer(blitter, &ctx->framebuffer);
diff --git a/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c b/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c
index b57d9e10503..3ffe61ac36d 100644
--- a/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c
+++ b/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c
@@ -111,6 +111,18 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *intr)
&u->ssbo_size, intr->src[0].ssa);
case nir_intrinsic_load_num_workgroups:
return load_sysval(b, 3, 32, AGX_SYSVAL_TABLE_GRID, 0);
+ case nir_intrinsic_load_xfb_address:
+ return load_sysval_root(b, 1, 64,
+ &u->vs.xfb.base[nir_intrinsic_base(intr)]);
+ case nir_intrinsic_load_xfb_size:
+ return load_sysval_root(b, 1, 32,
+ &u->vs.xfb.size[nir_intrinsic_base(intr)]);
+ case nir_intrinsic_load_xfb_index_buffer:
+ return load_sysval_root(b, 1, 64, &u->vs.xfb.index_buffer);
+ case nir_intrinsic_load_base_vertex:
+ return load_sysval_root(b, 1, 32, &u->vs.xfb.base_vertex);
+ case nir_intrinsic_load_num_vertices:
+ return load_sysval_root(b, 1, 32, &u->vs.xfb.num_vertices);
default:
return NULL;
}
diff --git a/src/gallium/drivers/asahi/agx_pipe.c b/src/gallium/drivers/asahi/agx_pipe.c
index 351bd8f7b06..6ce37d9396b 100644
--- a/src/gallium/drivers/asahi/agx_pipe.c
+++ b/src/gallium/drivers/asahi/agx_pipe.c
@@ -1397,6 +1397,7 @@ agx_create_context(struct pipe_screen *screen, void *priv, unsigned flags)
agx_init_state_functions(pctx);
agx_init_query_functions(pctx);
+ agx_init_streamout_functions(pctx);
agx_meta_init(&ctx->meta, agx_device(screen));
@@ -1556,15 +1557,15 @@ agx_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
return 0;
case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
- return is_deqp ? PIPE_MAX_SO_BUFFERS : 0;
+ return PIPE_MAX_SO_BUFFERS;
case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
- return is_deqp ? PIPE_MAX_SO_OUTPUTS : 0;
+ return PIPE_MAX_SO_OUTPUTS;
case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS:
- return is_deqp ? 1 : 0;
+ return 1;
case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
return 2048;
@@ -1587,6 +1588,13 @@ agx_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_VERTEX_ATTRIB_ELEMENT_ALIGNED_ONLY:
return 1;
+ /* We run nir_lower_point_size so we need the GLSL linker to copy
+ * the original gl_PointSize when captured by transform feedback. We could
+ * also copy it ourselves but it's easier to set the CAP.
+ */
+ case PIPE_CAP_PSIZ_CLAMPED:
+ return 1;
+
case PIPE_CAP_MAX_TEXTURE_2D_SIZE:
return 16384;
case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
diff --git a/src/gallium/drivers/asahi/agx_query.c b/src/gallium/drivers/asahi/agx_query.c
index 0fe2e3c8ebb..fcea7fb41e7 100644
--- a/src/gallium/drivers/asahi/agx_query.c
+++ b/src/gallium/drivers/asahi/agx_query.c
@@ -1,8 +1,10 @@
/*
* Copyright 2022 Alyssa Rosenzweig
+ * Copyright 2019-2020 Collabora, Ltd.
* SPDX-License-Identifier: MIT
*/
+#include "util/u_prim.h"
#include "agx_state.h"
static struct pipe_query *
@@ -39,12 +41,13 @@ agx_begin_query(struct pipe_context *pctx, struct pipe_query *pquery)
struct agx_context *ctx = agx_context(pctx);
struct agx_query *query = (struct agx_query *)pquery;
+ ctx->dirty |= AGX_DIRTY_QUERY;
+
switch (query->type) {
case PIPE_QUERY_OCCLUSION_COUNTER:
case PIPE_QUERY_OCCLUSION_PREDICATE:
case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
ctx->occlusion_query = query;
- ctx->dirty |= AGX_DIRTY_QUERY;
/* begin_query zeroes, flush so we can do that write. If anything (i.e.
* other than piglit) actually hits this, we could shadow the query to
@@ -60,6 +63,16 @@ agx_begin_query(struct pipe_context *pctx, struct pipe_query *pquery)
query->value = 0;
return true;
+ case PIPE_QUERY_PRIMITIVES_GENERATED:
+ ctx->prims_generated = query;
+ query->value = 0;
+ return true;
+
+ case PIPE_QUERY_PRIMITIVES_EMITTED:
+ ctx->tf_prims_generated = query;
+ query->value = 0;
+ return true;
+
default:
return false;
}
@@ -71,14 +84,20 @@ agx_end_query(struct pipe_context *pctx, struct pipe_query *pquery)
struct agx_context *ctx = agx_context(pctx);
struct agx_query *query = (struct agx_query *)pquery;
+ ctx->dirty |= AGX_DIRTY_QUERY;
+
switch (query->type) {
case PIPE_QUERY_OCCLUSION_COUNTER:
case PIPE_QUERY_OCCLUSION_PREDICATE:
case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
ctx->occlusion_query = NULL;
- ctx->dirty |= AGX_DIRTY_QUERY;
return true;
-
+ case PIPE_QUERY_PRIMITIVES_GENERATED:
+ ctx->prims_generated = NULL;
+ return true;
+ case PIPE_QUERY_PRIMITIVES_EMITTED:
+ ctx->tf_prims_generated = NULL;
+ return true;
default:
return false;
}
@@ -117,6 +136,11 @@ agx_get_query_result(struct pipe_context *pctx, struct pipe_query *pquery,
return true;
+ case PIPE_QUERY_PRIMITIVES_GENERATED:
+ case PIPE_QUERY_PRIMITIVES_EMITTED:
+ vresult->u64 = query->value;
+ return true;
+
default:
unreachable("Other queries not yet supported");
}
diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c
index f71fd9065ad..870f77b13cb 100644
--- a/src/gallium/drivers/asahi/agx_state.c
+++ b/src/gallium/drivers/asahi/agx_state.c
@@ -22,6 +22,7 @@
#include "gallium/auxiliary/util/u_draw.h"
#include "gallium/auxiliary/util/u_framebuffer.h"
#include "gallium/auxiliary/util/u_helpers.h"
+#include "gallium/auxiliary/util/u_prim_restart.h"
#include "gallium/auxiliary/util/u_viewport.h"
#include "pipe/p_context.h"
#include "pipe/p_defines.h"
@@ -37,59 +38,6 @@
#include "util/u_transfer.h"
#include "agx_disk_cache.h"
-static struct pipe_stream_output_target *
-agx_create_stream_output_target(struct pipe_context *pctx,
- struct pipe_resource *prsc,
- unsigned buffer_offset, unsigned buffer_size)
-{
- struct pipe_stream_output_target *target;
-
- target = &rzalloc(pctx, struct agx_streamout_target)->base;
-
- if (!target)
- return NULL;
-
- pipe_reference_init(&target->reference, 1);
- pipe_resource_reference(&target->buffer, prsc);
-
- target->context = pctx;
- target->buffer_offset = buffer_offset;
- target->buffer_size = buffer_size;
-
- return target;
-}
-
-static void
-agx_stream_output_target_destroy(struct pipe_context *pctx,
- struct pipe_stream_output_target *target)
-{
- pipe_resource_reference(&target->buffer, NULL);
- ralloc_free(target);
-}
-
-static void
-agx_set_stream_output_targets(struct pipe_context *pctx, unsigned num_targets,
- struct pipe_stream_output_target **targets,
- const unsigned *offsets)
-{
- struct agx_context *ctx = agx_context(pctx);
- struct agx_streamout *so = &ctx->streamout;
-
- assert(num_targets <= ARRAY_SIZE(so->targets));
-
- for (unsigned i = 0; i < num_targets; i++) {
- if (offsets[i] != -1)
- agx_so_target(targets[i])->offset = offsets[i];
-
- pipe_so_target_reference(&so->targets[i], targets[i]);
- }
-
- for (unsigned i = 0; i < so->num_targets; i++)
- pipe_so_target_reference(&so->targets[i], NULL);
-
- so->num_targets = num_targets;
-}
-
static void
agx_set_shader_images(struct pipe_context *pctx, enum pipe_shader_type shader,
unsigned start_slot, unsigned count,
@@ -1403,6 +1351,9 @@ agx_compile_variant(struct agx_device *dev, struct agx_uncompiled_shader *so,
struct asahi_vs_shader_key *key = &key_->vs;
NIR_PASS_V(nir, agx_nir_lower_vbo, &key->vbuf);
+
+ if (key->xfb.active && nir->xfb_info != NULL)
+ NIR_PASS_V(nir, agx_nir_lower_xfb, &key->xfb);
} else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
struct asahi_fs_shader_key *key = &key_->fs;
@@ -1672,12 +1623,14 @@ agx_update_vs(struct agx_context *ctx)
/* Only proceed if the shader or anything the key depends on changes
*
* vb_mask, attributes, vertex_buffers: VERTEX
+ * streamout.active: XFB
*/
- if (!(ctx->dirty & (AGX_DIRTY_VS_PROG | AGX_DIRTY_VERTEX)))
+ if (!(ctx->dirty & (AGX_DIRTY_VS_PROG | AGX_DIRTY_VERTEX | AGX_DIRTY_XFB)))
return false;
struct asahi_vs_shader_key key = {
.vbuf.count = util_last_bit(ctx->vb_mask),
+ .xfb = ctx->streamout.key,
};
memcpy(key.vbuf.attributes, ctx->attributes,
@@ -2563,7 +2516,46 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
return;
}
+ if (indirect && indirect->count_from_stream_output) {
+ agx_draw_vbo_from_xfb(pctx, info, drawid_offset, indirect);
+ return;
+ }
+
+ const nir_shader *nir_vs = ctx->stage[PIPE_SHADER_VERTEX].shader->nir;
+ bool uses_xfb = nir_vs->xfb_info && ctx->streamout.num_targets;
+ bool uses_prims_generated = ctx->active_queries && ctx->prims_generated;
+
+ if (indirect && (uses_prims_generated || uses_xfb)) {
+ perf_debug_ctx(ctx, "Emulating indirect draw due to XFB");
+ util_draw_indirect(pctx, info, indirect);
+ return;
+ }
+
+ if (uses_xfb && info->primitive_restart) {
+ perf_debug_ctx(ctx, "Emulating primitive restart due to XFB");
+ util_draw_vbo_without_prim_restart(pctx, info, drawid_offset, indirect,
+ draws);
+ return;
+ }
+
+ if (!ctx->streamout.key.active && uses_prims_generated) {
+ agx_primitives_update_direct(ctx, info, draws);
+ }
+
struct agx_batch *batch = agx_get_batch(ctx);
+ unsigned idx_size = info->index_size;
+ uint64_t ib = 0;
+ size_t ib_extent = 0;
+
+ if (idx_size) {
+ if (indirect != NULL)
+ ib = agx_index_buffer_rsrc_ptr(batch, info, &ib_extent);
+ else
+ ib = agx_index_buffer_direct_ptr(batch, draws, info, &ib_extent);
+ }
+
+ if (uses_xfb)
+ agx_launch_so(pctx, info, draws, ib);
#ifndef NDEBUG
if (unlikely(agx_device(pctx->screen)->debug & AGX_DBG_DIRTY))
@@ -2573,8 +2565,10 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
if (agx_scissor_culls_everything(ctx))
return;
- /* We don't support side effects in vertex stages, so this is trivial */
- if (ctx->rast->base.rasterizer_discard)
+ /* We don't support side effects in vertex stages (only used internally for
+ * transform feedback lowering), so this is trivial.
+ */
+ if (ctx->rast->base.rasterizer_discard && !ctx->streamout.key.active)
return;
/* Dirty track the reduced prim: lines vs points vs triangles */
@@ -2631,17 +2625,6 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
reduced_prim == PIPE_PRIM_POINTS);
enum agx_primitive prim = agx_primitive_for_pipe(info->mode);
- unsigned idx_size = info->index_size;
- uint64_t ib = 0;
- size_t ib_extent = 0;
-
- if (idx_size) {
- if (indirect != NULL)
- ib = agx_index_buffer_rsrc_ptr(batch, info, &ib_extent);
- else
- ib = agx_index_buffer_direct_ptr(batch, draws, info, &ib_extent);
- }
-
if (idx_size) {
/* Index sizes are encoded logarithmically */
STATIC_ASSERT(__builtin_ctz(1) == AGX_INDEX_SIZE_U8);
@@ -2729,6 +2712,21 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
out += AGX_INDEX_LIST_BUFFER_SIZE_LENGTH;
}
+ /* Insert a memory barrier after transform feedback so the result may be
+ * consumed by a subsequent vertex shader.
+ */
+ if (ctx->streamout.key.active) {
+ agx_pack(out, VDM_BARRIER, cfg) {
+ cfg.unk_5 = true;
+ cfg.unk_6 = true;
+ cfg.unk_8 = true;
+ cfg.unk_11 = true;
+ cfg.unk_20 = true;
+ }
+
+ out += AGX_VDM_BARRIER_LENGTH;
+ }
+
batch->encoder_current = out;
assert((batch->encoder_current + AGX_VDM_STREAM_LINK_LENGTH) <=
batch->encoder_end &&
@@ -2889,8 +2887,5 @@ agx_init_state_functions(struct pipe_context *ctx)
ctx->surface_destroy = agx_surface_destroy;
ctx->draw_vbo = agx_draw_vbo;
ctx->launch_grid = agx_launch_grid;
- ctx->create_stream_output_target = agx_create_stream_output_target;
- ctx->stream_output_target_destroy = agx_stream_output_target_destroy;
- ctx->set_stream_output_targets = agx_set_stream_output_targets;
ctx->texture_barrier = agx_texture_barrier;
}
diff --git a/src/gallium/drivers/asahi/agx_state.h b/src/gallium/drivers/asahi/agx_state.h
index e44d3010946..ab43824cd62 100644
--- a/src/gallium/drivers/asahi/agx_state.h
+++ b/src/gallium/drivers/asahi/agx_state.h
@@ -39,17 +39,45 @@ struct agx_streamout_target {
uint32_t offset;
};
-struct agx_streamout {
- struct pipe_stream_output_target *targets[PIPE_MAX_SO_BUFFERS];
- unsigned num_targets;
-};
-
static inline struct agx_streamout_target *
agx_so_target(struct pipe_stream_output_target *target)
{
return (struct agx_streamout_target *)target;
}
+struct agx_xfb_key {
+ /* If true, compiles a "transform feedback" program instead of a vertex
+ * shader. This is a kernel that runs on the VDM and writes out the transform
+ * feedback buffers, with no rasterization.
+ */
+ bool active;
+
+ /* The index size (1, 2, 4) or 0 if drawing without an index buffer. */
+ uint8_t index_size;
+
+ /* The primitive mode for unrolling the vertex ID */
+ enum pipe_prim_type mode;
+
+ /* Use first vertex as the provoking vertex for flat shading */
+ bool flatshade_first;
+};
+
+struct agx_xfb_params {
+ uint64_t base[PIPE_MAX_SO_BUFFERS];
+ uint32_t size[PIPE_MAX_SO_BUFFERS];
+ uint64_t index_buffer;
+ uint32_t base_vertex;
+ uint32_t num_vertices;
+};
+
+struct agx_streamout {
+ struct pipe_stream_output_target *targets[PIPE_MAX_SO_BUFFERS];
+ unsigned num_targets;
+
+ struct agx_xfb_key key;
+ struct agx_xfb_params params;
+};
+
/* Shaders can access fixed-function state through system values.
* It is convenient to stash all of this information into a single "root"
* descriptor, then push individual parts as needed.
@@ -87,6 +115,9 @@ struct PACKED agx_draw_uniforms {
struct {
/* Vertex buffer object bases, if present */
uint64_t vbo_base[PIPE_MAX_ATTRIBS];
+
+ /* Transform feedback info for a transform feedback shader */
+ struct agx_xfb_params xfb;
} vs;
struct {
@@ -239,6 +270,7 @@ struct agx_blend {
struct asahi_vs_shader_key {
struct agx_vbufs vbuf;
+ struct agx_xfb_key xfb;
};
struct asahi_fs_shader_key {
@@ -277,6 +309,7 @@ enum agx_dirty {
AGX_DIRTY_BLEND = BITFIELD_BIT(12),
AGX_DIRTY_QUERY = BITFIELD_BIT(13),
+ AGX_DIRTY_XFB = BITFIELD_BIT(14),
};
/* Maximum number of in-progress + under-construction GPU batches.
@@ -336,6 +369,8 @@ struct agx_context {
enum pipe_render_cond_flag cond_mode;
struct agx_query *occlusion_query;
+ struct agx_query *prims_generated;
+ struct agx_query *tf_prims_generated;
bool active_queries;
struct util_debug_callback debug;
@@ -410,6 +445,27 @@ agx_context(struct pipe_context *pctx)
void agx_init_query_functions(struct pipe_context *ctx);
+void
+agx_primitives_update_direct(struct agx_context *ctx,
+ const struct pipe_draw_info *info,
+ const struct pipe_draw_start_count_bias *draw);
+
+void agx_nir_lower_xfb(nir_shader *shader, struct agx_xfb_key *key);
+
+void agx_draw_vbo_from_xfb(struct pipe_context *pctx,
+ const struct pipe_draw_info *info,
+ unsigned drawid_offset,
+ const struct pipe_draw_indirect_info *indirect);
+
+void agx_launch_so(struct pipe_context *pctx, const struct pipe_draw_info *info,
+ const struct pipe_draw_start_count_bias *draws,
+ uint64_t index_buffer);
+
+uint64_t agx_batch_get_so_address(struct agx_batch *batch, unsigned buffer,
+ uint32_t *size);
+
+void agx_init_streamout_functions(struct pipe_context *ctx);
+
static inline void
agx_dirty_all(struct agx_context *ctx)
{
@@ -438,7 +494,7 @@ struct agx_query {
struct agx_batch *writer;
unsigned writer_index;
- /* For occlusion queries, which use some CPU work */
+ /* Accumulator flushed to the CPU */
uint64_t value;
};
diff --git a/src/gallium/drivers/asahi/agx_streamout.c b/src/gallium/drivers/asahi/agx_streamout.c
new file mode 100644
index 00000000000..190211ca7ee
--- /dev/null
+++ b/src/gallium/drivers/asahi/agx_streamout.c
@@ -0,0 +1,574 @@
+/*
+ * Copyright 2023 Alyssa Rosenzweig
+ * Copyright 2022 Collabora Ltd.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "compiler/nir/nir_builder.h"
+#include "compiler/nir/nir_xfb_info.h"
+#include "util/u_draw.h"
+#include "util/u_dump.h"
+#include "util/u_prim.h"
+#include "agx_state.h"
+
+static struct pipe_stream_output_target *
+agx_create_stream_output_target(struct pipe_context *pctx,
+ struct pipe_resource *prsc,
+ unsigned buffer_offset, unsigned buffer_size)
+{
+ struct pipe_stream_output_target *target;
+
+ target = &rzalloc(pctx, struct agx_streamout_target)->base;
+
+ if (!target)
+ return NULL;
+
+ pipe_reference_init(&target->reference, 1);
+ pipe_resource_reference(&target->buffer, prsc);
+
+ target->context = pctx;
+ target->buffer_offset = buffer_offset;
+ target->buffer_size = buffer_size;
+
+ return target;
+}
+
+static void
+agx_stream_output_target_destroy(struct pipe_context *pctx,
+ struct pipe_stream_output_target *target)
+{
+ pipe_resource_reference(&target->buffer, NULL);
+ ralloc_free(target);
+}
+
+static void
+agx_set_stream_output_targets(struct pipe_context *pctx, unsigned num_targets,
+ struct pipe_stream_output_target **targets,
+ const unsigned *offsets)
+{
+ struct agx_context *ctx = agx_context(pctx);
+ struct agx_streamout *so = &ctx->streamout;
+
+ assert(num_targets <= ARRAY_SIZE(so->targets));
+
+ for (unsigned i = 0; i < num_targets; i++) {
+ /* From the Gallium documentation:
+ *
+ * -1 means the buffer should be appended to, and everything else sets
+ * the internal offset.
+ *
+ * We append regardless, so just check for != -1. Yes, using a negative
+ * sentinel value with an unsigned type is bananas. But it's in the
+ * Gallium contract and it will work out fine. Probably should be
+ * redefined to be ~0 instead of -1 but it doesn't really matter.
+ */
+ if (offsets[i] != -1)
+ agx_so_target(targets[i])->offset = offsets[i];
+
+ pipe_so_target_reference(&so->targets[i], targets[i]);
+ }
+
+ for (unsigned i = num_targets; i < so->num_targets; i++)
+ pipe_so_target_reference(&so->targets[i], NULL);
+
+ so->num_targets = num_targets;
+}
+
+static struct pipe_stream_output_target *
+get_target(struct agx_context *ctx, unsigned buffer)
+{
+ if (buffer < ctx->streamout.num_targets)
+ return ctx->streamout.targets[buffer];
+ else
+ return NULL;
+}
+
+/*
+ * Return the address of the indexed streamout buffer. This will be
+ * pushed into the streamout shader.
+ */
+uint64_t
+agx_batch_get_so_address(struct agx_batch *batch, unsigned buffer,
+ uint32_t *size)
+{
+ struct pipe_stream_output_target *target = get_target(batch->ctx, buffer);
+
+ /* If there's no target, don't write anything */
+ if (!target) {
+ *size = 0;
+ return 0;
+ }
+
+ /* Otherwise, write the target */
+ struct pipe_stream_output_info *so =
+ &batch->ctx->stage[PIPE_SHADER_VERTEX].shader->base.stream_output;
+
+ struct agx_resource *rsrc = agx_resource(target->buffer);
+ agx_batch_writes(batch, rsrc);
+
+ /* The amount of space left depends how much we've already consumed */
+ unsigned stride = so->stride[buffer] * 4;
+ uint32_t offset = agx_so_target(target)->offset * stride;
+
+ *size = offset < target->buffer_size ? (target->buffer_size - offset) : 0;
+ return rsrc->bo->ptr.gpu + target->buffer_offset + offset;
+}
+
+void
+agx_draw_vbo_from_xfb(struct pipe_context *pctx,
+ const struct pipe_draw_info *info, unsigned drawid_offset,
+ const struct pipe_draw_indirect_info *indirect)
+{
+ struct pipe_draw_start_count_bias draw = {
+ .start = 0,
+ .count = agx_so_target(indirect->count_from_stream_output)->offset,
+ };
+
+ pctx->draw_vbo(pctx, info, drawid_offset, NULL, &draw, 1);
+}
+
+static uint32_t
+xfb_prims_for_vertices(enum pipe_prim_type mode, unsigned verts)
+{
+ uint32_t prims = u_decomposed_prims_for_vertices(mode, verts);
+
+ /* The GL spec isn't super clear about this, but it implies that quads are
+ * supposed to be tessellated into primitives and piglit
+ * (ext_transform_feedback-tessellation quads) checks this.
+ */
+ if (u_decomposed_prim(mode) == PIPE_PRIM_QUADS)
+ prims *= 2;
+
+ return prims;
+}
+
+/*
+ * Launch a streamout pipeline.
+ */
+void
+agx_launch_so(struct pipe_context *pctx, const struct pipe_draw_info *info,
+ const struct pipe_draw_start_count_bias *draw,
+ uint64_t index_buffer)
+{
+ struct agx_context *ctx = agx_context(pctx);
+
+ /* Break recursion from draw_vbo creating draw calls below: Do not do a
+ * streamout draw for a streamout draw.
+ */
+ if (ctx->streamout.key.active)
+ return;
+
+ /* Configure the below draw to launch streamout rather than a regular draw */
+ ctx->streamout.key.active = true;
+ ctx->dirty |= AGX_DIRTY_XFB;
+
+ ctx->streamout.key.index_size = info->index_size;
+ ctx->streamout.key.mode = info->mode;
+ ctx->streamout.key.flatshade_first = ctx->rast->base.flatshade_first;
+ ctx->streamout.params.index_buffer = index_buffer;
+
+ /* Ignore provoking vertex for modes that don't depend on the provoking
+ * vertex, to reduce shader variants.
+ */
+ if (info->mode != PIPE_PRIM_TRIANGLE_STRIP)
+ ctx->streamout.key.flatshade_first = false;
+
+ /* Determine how many XFB output vertices there will be */
+ unsigned num_outputs =
+ u_stream_outputs_for_vertices(info->mode, draw->count);
+ unsigned count = draw->count;
+ u_trim_pipe_prim(info->mode, &count);
+
+ ctx->streamout.params.base_vertex =
+ info->index_size ? draw->index_bias : draw->start;
+ ctx->streamout.params.num_vertices = count;
+
+ /* Streamout runs as a vertex shader with rasterizer discard */
+ void *saved_rast = ctx->rast;
+ pctx->bind_rasterizer_state(
+ pctx, util_blitter_get_discard_rasterizer_state(ctx->blitter));
+
+ /* Dispatch a grid of points, this is compute-like */
+ util_draw_arrays_instanced(pctx, PIPE_PRIM_POINTS, 0, num_outputs, 0,
+ info->instance_count);
+ pctx->bind_rasterizer_state(pctx, saved_rast);
+
+ /*
+ * Finally, if needed, update the counter of primitives written. The spec
+ * requires:
+ *
+ * If recording the vertices of a primitive to the buffer objects being
+ * used for transform feedback purposes would result in [overflow]...
+ * the counter corresponding to the asynchronous query target
+ * TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN (see section 13.4) is not
+ * incremented.
+ *
+ * So clamp the number of primitives generated to the number of primitives
+ * we actually have space to write.
+ */
+ if (ctx->tf_prims_generated) {
+ uint32_t min_max = ~0;
+
+ for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
+ struct pipe_stream_output_target *target = get_target(ctx, i);
+
+ if (!target)
+ continue;
+
+ struct pipe_stream_output_info *so =
+ &ctx->stage[PIPE_SHADER_VERTEX].shader->base.stream_output;
+ unsigned stride = so->stride[i] * 4;
+
+ /* Ignore spurious targets. I don't see anything in the Gallium
+ * contract specifically forbidding this.
+ */
+ if (stride == 0)
+ continue;
+
+ uint32_t offset = agx_so_target(target)->offset * stride;
+ uint32_t remaining =
+ offset < target->buffer_size ? (target->buffer_size - offset) : 0;
+ uint32_t max_vertices = stride ? (remaining / stride) : ~0;
+
+ min_max = MIN2(min_max, max_vertices);
+ }
+
+ /* We now have the maximum vertices written, round down to primitives */
+ uint32_t max_prims = xfb_prims_for_vertices(info->mode, min_max);
+ uint32_t prims = xfb_prims_for_vertices(info->mode, draw->count);
+
+ ctx->tf_prims_generated->value += MIN2(prims, max_prims);
+ }
+
+ /* Update the offsets into the streamout buffers */
+ for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
+ if (ctx->streamout.targets[i])
+ agx_so_target(ctx->streamout.targets[i])->offset += num_outputs;
+ }
+
+ ctx->dirty |= AGX_DIRTY_XFB;
+ ctx->streamout.key.active = false;
+}
+
+/*
+ * Count generated primitives on the CPU for transform feedback. This only works
+ * in the absence of indirect draws, geometry shaders, or tessellation.
+ */
+void
+agx_primitives_update_direct(struct agx_context *ctx,
+ const struct pipe_draw_info *info,
+ const struct pipe_draw_start_count_bias *draw)
+{
+ assert(ctx->active_queries && ctx->prims_generated && "precondition");
+
+ ctx->prims_generated->value +=
+ xfb_prims_for_vertices(info->mode, draw->count);
+}
+
+/* The OpenGL spec says:
+ *
+ * If recording the vertices of a primitive to the buffer objects being
+ * used for transform feedback purposes would result in either exceeding
+ * the limits of any buffer object’s size, or in exceeding the end
+ * position offset + size − 1, as set by BindBufferRange, then no vertices
+ * of that primitive are recorded in any buffer object.
+ *
+ * This function checks for the absence of overflow.
+ *
+ * The difficulty is that we are processing a single vertex at a time, so we
+ * need to do some arithmetic to figure out the bounds for the whole containing
+ * primitive.
+ *
+ * XXX: How do quads get tessellated?
+ */
+static nir_ssa_def *
+primitive_fits(nir_builder *b, struct agx_xfb_key *key)
+{
+ /* Get the number of vertices per primitive in the current mode, usually just
+ * the base number but quads are tessellated.
+ */
+ uint32_t verts_per_prim = u_vertices_per_prim(key->mode);
+
+ if (u_decomposed_prim(key->mode) == PIPE_PRIM_QUADS)
+ verts_per_prim = 6;
+
+ /* Get the ID for this invocation */
+ nir_ssa_def *id = nir_load_vertex_id_zero_base(b);
+
+ /* Figure out the ID for the first vertex of the next primitive. Since
+ * transform feedback buffers are tightly packed, that's one byte after the
+ * end of this primitive, which will make bounds checking convenient. That
+ * will be:
+ *
+ * (id - (id % prim size)) + prim size
+ */
+ nir_ssa_def *rem = nir_umod_imm(b, id, verts_per_prim);
+ nir_ssa_def *next_id = nir_iadd_imm(b, nir_isub(b, id, rem), verts_per_prim);
+
+ /* Figure out where that vertex will land */
+ nir_ssa_def *index = nir_iadd(
+ b, nir_imul(b, nir_load_instance_id(b), nir_load_num_vertices(b)),
+ next_id);
+
+ /* Now check for overflow in each written buffer */
+ nir_ssa_def *all_fits = nir_imm_true(b);
+
+ u_foreach_bit(buffer, b->shader->xfb_info->buffers_written) {
+ uint16_t stride = b->shader->info.xfb_stride[buffer] * 4;
+ assert(stride != 0);
+
+ /* For this primitive to fit, the next primitive cannot start after the
+ * end of the transform feedback buffer.
+ */
+ nir_ssa_def *end_offset = nir_imul_imm(b, index, stride);
+
+ /* Check whether that will remain in bounds */
+ nir_ssa_def *fits =
+ nir_uge(b, nir_load_xfb_size(b, .base = buffer), end_offset);
+
+ /* Accumulate */
+ all_fits = nir_iand(b, all_fits, fits);
+ }
+
+ return all_fits;
+}
+
+static void
+insert_overflow_check(nir_shader *nir, struct agx_xfb_key *key)
+{
+ nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+
+ /* Extract the current transform feedback shader */
+ nir_cf_list list;
+ nir_cf_extract(&list, nir_before_block(nir_start_block(impl)),
+ nir_after_block(nir_impl_last_block(impl)));
+
+ /* Get a builder for the (now empty) shader */
+ nir_builder b;
+ nir_builder_init(&b, impl);
+ b.cursor = nir_after_block(nir_start_block(impl));
+
+ /* Rebuild the shader as
+ *
+ * if (!overflow) {
+ * shader();
+ * }
+ */
+ nir_push_if(&b, primitive_fits(&b, key));
+ {
+ b.cursor = nir_cf_reinsert(&list, b.cursor);
+ }
+ nir_pop_if(&b, NULL);
+}
+
+static void
+lower_xfb_output(nir_builder *b, nir_intrinsic_instr *intr,
+ unsigned start_component, unsigned num_components,
+ unsigned buffer, unsigned offset_words)
+{
+ assert(buffer < MAX_XFB_BUFFERS);
+ assert(nir_intrinsic_component(intr) == 0); // TODO
+
+ /* Transform feedback info in units of words, convert to bytes. */
+ uint16_t stride = b->shader->info.xfb_stride[buffer] * 4;
+ assert(stride != 0);
+
+ uint16_t offset = offset_words * 4;
+
+ nir_ssa_def *index = nir_iadd(
+ b, nir_imul(b, nir_load_instance_id(b), nir_load_num_vertices(b)),
+ nir_load_vertex_id_zero_base(b));
+
+ nir_ssa_def *xfb_offset =
+ nir_iadd_imm(b, nir_imul_imm(b, index, stride), offset);
+
+ nir_ssa_def *buf = nir_load_xfb_address(b, 64, .base = buffer);
+ nir_ssa_def *addr = nir_iadd(b, buf, nir_u2u64(b, xfb_offset));
+
+ nir_ssa_def *value = nir_channels(
+ b, intr->src[0].ssa, BITFIELD_MASK(num_components) << start_component);
+ nir_store_global(b, addr, 4, value, BITFIELD_MASK(num_components));
+}
+
+static bool
+lower_xfb(nir_builder *b, nir_instr *instr, UNUSED void *data)
+{
+ if (instr->type != nir_instr_type_intrinsic)
+ return false;
+
+ nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+ if (intr->intrinsic != nir_intrinsic_store_output)
+ return false;
+
+ /* Assume the inputs are read */
+ BITSET_SET(b->shader->info.system_values_read,
+ SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
+ BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
+
+ bool progress = false;
+
+ for (unsigned i = 0; i < 2; ++i) {
+ nir_io_xfb xfb =
+ i ? nir_intrinsic_io_xfb2(intr) : nir_intrinsic_io_xfb(intr);
+
+ for (unsigned j = 0; j < 2; ++j) {
+ if (xfb.out[j].num_components > 0) {
+ b->cursor = nir_before_instr(&intr->instr);
+ lower_xfb_output(b, intr, i * 2 + j, xfb.out[j].num_components,
+ xfb.out[j].buffer, xfb.out[j].offset);
+ progress = true;
+ }
+ }
+ }
+
+ nir_instr_remove(instr);
+ return progress;
+}
+
+static bool
+lower_xfb_intrinsics(struct nir_builder *b, nir_instr *instr, void *data)
+{
+ if (instr->type != nir_instr_type_intrinsic)
+ return false;
+
+ nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+ b->cursor = nir_before_instr(instr);
+
+ struct agx_xfb_key *key = data;
+
+ switch (intr->intrinsic) {
+ /* XXX: Rename to "xfb index" to avoid the clash */
+ case nir_intrinsic_load_vertex_id_zero_base: {
+ nir_ssa_def *id = nir_load_vertex_id(b);
+ nir_ssa_def_rewrite_uses(&intr->dest.ssa, id);
+ return true;
+ }
+
+ case nir_intrinsic_load_vertex_id: {
+ /* Get the raw invocation ID */
+ nir_ssa_def *id = nir_load_vertex_id(b);
+
+ /* Tessellate by primitive mode */
+ if (key->mode == PIPE_PRIM_LINE_STRIP ||
+ key->mode == PIPE_PRIM_LINE_LOOP) {
+ /* The last vertex is special for a loop. Check if that's what we're
+ * dealing with.
+ */
+ nir_ssa_def *num_invocations =
+ nir_imul_imm(b, nir_load_num_vertices(b), 2);
+ nir_ssa_def *last_vertex =
+ nir_ieq(b, id, nir_iadd_imm(b, num_invocations, -1));
+
+ /* (0, 1), (1, 2) */
+ id = nir_iadd(b, nir_ushr_imm(b, id, 1), nir_iand_imm(b, id, 1));
+
+ /* (0, 1), (1, 2), (2, 0) */
+ if (key->mode == PIPE_PRIM_LINE_LOOP) {
+ id = nir_bcsel(b, last_vertex, nir_imm_int(b, 0), id);
+ }
+ } else if (key->mode == PIPE_PRIM_TRIANGLE_STRIP) {
+ /* Order depends on the provoking vertex.
+ *
+ * First: (0, 1, 2), (1, 3, 2), (2, 3, 4).
+ * Last: (0, 1, 2), (2, 1, 3), (2, 3, 4).
+ */
+ nir_ssa_def *prim = nir_udiv_imm(b, id, 3);
+ nir_ssa_def *rem = nir_umod_imm(b, id, 3);
+
+ unsigned pv = key->flatshade_first ? 0 : 2;
+
+ /* Swap the two non-provoking vertices in odd triangles */
+ nir_ssa_def *even = nir_ieq_imm(b, nir_iand_imm(b, prim, 1), 0);
+ nir_ssa_def *is_provoking = nir_ieq_imm(b, rem, pv);
+ nir_ssa_def *no_swap = nir_ior(b, is_provoking, even);
+ nir_ssa_def *swapped = nir_isub_imm(b, 3 - pv, rem);
+ nir_ssa_def *off = nir_bcsel(b, no_swap, rem, swapped);
+
+ /* Pull the (maybe swapped) vertex from the corresponding primitive */
+ id = nir_iadd(b, prim, off);
+ } else if (key->mode == PIPE_PRIM_TRIANGLE_FAN) {
+ /* (0, 1, 2), (0, 2, 3) */
+ nir_ssa_def *prim = nir_udiv_imm(b, id, 3);
+ nir_ssa_def *rem = nir_umod_imm(b, id, 3);
+
+ id = nir_bcsel(b, nir_ieq_imm(b, rem, 0), nir_imm_int(b, 0),
+ nir_iadd(b, prim, rem));
+ } else if (key->mode == PIPE_PRIM_QUADS ||
+ key->mode == PIPE_PRIM_QUAD_STRIP) {
+ /* Quads: [(0, 1, 3), (3, 1, 2)], [(4, 5, 7), (7, 5, 6)]
+ * Quad strips: [(0, 1, 3), (0, 2, 3)], [(2, 3, 5), (2, 4, 5)]
+ */
+ bool strips = key->mode == PIPE_PRIM_QUAD_STRIP;
+
+ nir_ssa_def *prim = nir_udiv_imm(b, id, 6);
+ nir_ssa_def *rem = nir_umod_imm(b, id, 6);
+ nir_ssa_def *base = nir_imul_imm(b, prim, strips ? 2 : 4);
+
+ /* Quads: [0, 1, 3, 3, 1, 2]
+ * Quad strips: [0, 1, 3, 0, 2, 3]
+ */
+ uint32_t order_quads = 0x213310;
+ uint32_t order_strips = 0x230310;
+ uint32_t order = strips ? order_strips : order_quads;
+
+ /* Index out of the bitpacked array */
+ nir_ssa_def *offset = nir_iand_imm(
+ b, nir_ushr(b, nir_imm_int(b, order), nir_imul_imm(b, rem, 4)),
+ 0xF);
+
+ id = nir_iadd(b, base, offset);
+ }
+
+ /* Add the "start", either an index bias or a base vertex */
+ id = nir_iadd(b, id, nir_load_base_vertex(b));
+
+ /* If drawing with an index buffer, pull the vertex ID. Otherwise, the
+ * vertex ID is just the index as-is.
+ */
+ if (key->index_size) {
+ nir_ssa_def *index_buffer = nir_load_xfb_index_buffer(b, 64);
+ nir_ssa_def *offset = nir_imul_imm(b, id, key->index_size);
+ nir_ssa_def *address = nir_iadd(b, index_buffer, nir_u2u64(b, offset));
+ nir_ssa_def *index = nir_load_global_constant(
+ b, address, key->index_size, 1, key->index_size * 8);
+
+ id = nir_u2uN(b, index, id->bit_size);
+ }
+
+ nir_ssa_def_rewrite_uses(&intr->dest.ssa, id);
+ return true;
+ }
+
+ default:
+ return false;
+ }
+}
+
+void
+agx_nir_lower_xfb(nir_shader *nir, struct agx_xfb_key *key)
+{
+ assert(nir->info.stage == MESA_SHADER_VERTEX);
+
+ NIR_PASS_V(nir, nir_io_add_const_offset_to_base,
+ nir_var_shader_in | nir_var_shader_out);
+ NIR_PASS_V(nir, nir_io_add_intrinsic_xfb_info);
+
+ NIR_PASS_V(nir, insert_overflow_check, key);
+ NIR_PASS_V(nir, nir_shader_instructions_pass, lower_xfb,
+ nir_metadata_block_index | nir_metadata_dominance, key);
+ NIR_PASS_V(nir, nir_shader_instructions_pass, lower_xfb_intrinsics,
+ nir_metadata_block_index | nir_metadata_dominance, key);
+
+ /* Lowering XFB creates piles of dead code. Eliminate now so we don't
+ * push unnecessary sysvals.
+ */
+ NIR_PASS_V(nir, nir_opt_dce);
+}
+
+void
+agx_init_streamout_functions(struct pipe_context *ctx)
+{
+ ctx->create_stream_output_target = agx_create_stream_output_target;
+ ctx->stream_output_target_destroy = agx_stream_output_target_destroy;
+ ctx->set_stream_output_targets = agx_set_stream_output_targets;
+}
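As an aside, the bounds check that primitive_fits() builds in NIR can be
restated on the CPU for a single buffer and a single instance. A minimal
sketch under those assumptions (the helper name and scalar form are ours,
not the driver's):

    #include <stdbool.h>
    #include <stdint.h>

    static bool
    whole_primitive_fits(uint32_t id, uint32_t verts_per_prim,
                         uint32_t stride_bytes, uint32_t buffer_size)
    {
       /* First vertex of the next primitive:
        * (id - (id % prim size)) + prim size
        */
       uint32_t next_id = (id - (id % verts_per_prim)) + verts_per_prim;

       /* Outputs are tightly packed at stride_bytes per vertex, so the
        * primitive containing `id` fits iff the next primitive would
        * start within the buffer.
        */
       return (uint64_t)next_id * stride_bytes <= buffer_size;
    }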
diff --git a/src/gallium/drivers/asahi/agx_uniforms.c b/src/gallium/drivers/asahi/agx_uniforms.c
index ad946fc144b..5999f69ecba 100644
--- a/src/gallium/drivers/asahi/agx_uniforms.c
+++ b/src/gallium/drivers/asahi/agx_uniforms.c
@@ -87,6 +87,16 @@ agx_upload_uniforms(struct agx_batch *batch, uint64_t textures,
u_foreach_bit(vbo, ctx->vb_mask) {
uniforms.vs.vbo_base[vbo] = agx_vertex_buffer_ptr(batch, vbo);
}
+
+ if (ctx->streamout.key.active) {
+ uniforms.vs.xfb = ctx->streamout.params;
+
+ for (unsigned i = 0; i < batch->ctx->streamout.num_targets; ++i) {
+ uint32_t size = 0;
+ uniforms.vs.xfb.base[i] = agx_batch_get_so_address(batch, i, &size);
+ uniforms.vs.xfb.size[i] = size;
+ }
+ }
} else if (stage == PIPE_SHADER_FRAGMENT) {
memcpy(uniforms.fs.blend_constant, &ctx->blend_color,
sizeof(ctx->blend_color));
diff --git a/src/gallium/drivers/asahi/meson.build b/src/gallium/drivers/asahi/meson.build
index 3a6bc9e6922..6a80e1f803a 100644
--- a/src/gallium/drivers/asahi/meson.build
+++ b/src/gallium/drivers/asahi/meson.build
@@ -10,6 +10,7 @@ files_asahi = files(
'agx_nir_lower_sysvals.c',
'agx_query.c',
'agx_state.c',
+ 'agx_streamout.c',
'agx_uniforms.c',
)
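Finally, a standalone snippet (illustrative, not driver code) for
sanity-checking the bit-packed vertex-order tables that
lower_xfb_intrinsics decodes with nir_ushr / nir_iand_imm:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
       /* Quads: [0, 1, 3, 3, 1, 2] -> triangles (0, 1, 3) and (3, 1, 2) */
       const uint32_t order_quads = 0x213310;

       /* Each output slot pulls a 4-bit vertex offset out of the table */
       for (unsigned rem = 0; rem < 6; ++rem)
          printf("%u ", (unsigned)((order_quads >> (rem * 4)) & 0xF));
       printf("\n"); /* prints: 0 1 3 3 1 2 */

       return 0;
    }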