Diffstat (limited to 'src/broadcom')
141 files changed, 25515 insertions, 13827 deletions
diff --git a/src/broadcom/ci/piglit-vc4-rpi3-fails.txt b/src/broadcom/ci/broadcom-rpi3-fails.txt
index cb9dfaa6eb6..fdcf09f1fef 100644
--- a/src/broadcom/ci/piglit-vc4-rpi3-fails.txt
+++ b/src/broadcom/ci/broadcom-rpi3-fails.txt
@@ -1,35 +1,116 @@
-glx@glx-copy-sub-buffer samples=2,Crash
-glx@glx-copy-sub-buffer samples=4,Crash
-glx@glx-make-current,Crash
-glx@glx-multithread-buffer,Fail
-glx@glx-query-drawable-glx_fbconfig_id-window,Fail
+# Test expects red instead of luminance, contra OES_depth_texture spec.
+# https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/3815
+KHR-GLES2.core.internalformat.texture2d.depth_component_unsigned_int_depth_component16,Fail
+KHR-GLES2.core.internalformat.texture2d.depth_component_unsigned_int_depth_component24,Fail
+KHR-GLES2.core.internalformat.texture2d.depth_component_unsigned_short_depth_component16,Fail
+
+# Creating OpenGL ES 3 context
+# Fail, context: 0x00000000, error: EGL_BAD_MATCH
+# https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/3816
+x11-dEQP-EGL.functional.create_context.no_config,Fail
+wayland-dEQP-EGL.functional.create_context.no_config,Fail
+
+# A wide line outside the viewport is incorrectly clipped out when ES wants
+# it rendered as a quad and clipped appropriately. I think that by expanding
+# CLIPPER_XY_SCALING to have a guard band we might get these to work.
+dEQP-GLES2.functional.clipping.line.wide_line_clip_viewport_center,Fail
+dEQP-GLES2.functional.clipping.line.wide_line_clip_viewport_corner,Fail
+
+dEQP-GLES2.functional.depth_stencil_clear.depth_stencil_masked,Fail
+
+dEQP-GLES2.functional.uniform_api.random.3,Fail
+dEQP-GLES2.functional.uniform_api.random.79,Fail
+
+# Sampling grid slightly off in test 2?
+dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_linear_linear_mirror_rgba8888,Fail
+dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_linear_linear_repeat_rgba8888,Fail
+dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_nearest_linear_mirror_rgba8888,Fail
+dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_nearest_linear_repeat_rgba8888,Fail
+
+# "Warning: High precision not supported in fragment shaders.
+# ERROR: Image verification failed, found 2048 invalid pixels!"
+# One of the magnified pixels is (0xff,0x29,0xd6) instead of (0xff,0x2d,0xd2).
+# We do support highp, so we should fix glGetShaderPrecisionFormat reporting.
+dEQP-GLES2.functional.texture.mipmap.2d.basic.linear_linear_repeat_non_square,Fail
+dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_clamp_non_square,Fail
+dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_mirror_non_square,Fail
+dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_repeat_non_square,Fail
+
+# One of the pixels on the left edge near the bottom is wrong for both min and
+# mag. Also a line of pixels through the image in minification.
+dEQP-GLES2.functional.texture.wrap.clamp_clamp_nearest_npot_etc1,Fail
+
+# Despite exposing GL 2.1, the HW doesn't actually support 3D textures, so we
+# set 0 max levels. These tests fail (or hit assertion failures) as a result.
+spec@!opengl 1.1@max-texture-size,Crash
+spec@!opengl 1.2@copyteximage 3d,Fail
+spec@!opengl 1.2@getteximage-targets 3d,Fail
+spec@!opengl 1.2@tex3d-maxsize,Fail
+spec@!opengl 1.2@tex3d,Fail
+spec@!opengl 1.2@texture-packed-formats,Fail
+spec@!opengl 1.2@texwrap 3d bordercolor,Fail
+spec@!opengl 1.2@texwrap 3d proj bordercolor,Fail
+spec@!opengl 1.2@texwrap 3d proj,Fail
+spec@!opengl 1.2@texwrap 3d proj@GL_RGBA8- NPOT- projected,Fail
+spec@!opengl 1.2@texwrap 3d proj@GL_RGBA8- projected,Fail
+spec@!opengl 1.2@texwrap 3d proj@GL_RGBA8- swizzled- projected,Fail
+spec@!opengl 1.2@texwrap 3d,Fail
+spec@!opengl 1.2@texwrap 3d@GL_RGBA8- NPOT,Fail
+spec@!opengl 1.2@texwrap 3d@GL_RGBA8- swizzled,Fail
+spec@!opengl 1.2@texwrap 3d@GL_RGBA8,Fail
+spec@!opengl 1.3@tex3d-depth1,Fail
+spec@!opengl 2.0@tex3d-npot,Fail
+spec@!opengl 2.1@minmax,Fail
+spec@arb_framebuffer_object@fbo-generatemipmap-3d,Fail
+spec@arb_framebuffer_object@fbo-incomplete,Fail
+spec@arb_framebuffer_object@fbo-incomplete@invalid slice of 3D texture,Fail
+spec@arb_get_texture_sub_image@arb_get_texture_sub_image-get,Fail
+spec@arb_robustness@arb_robustness_client-mem-bounds,Fail
+spec@arb_texture_multisample@arb_texture_multisample-teximage-3d-multisample,Fail
+spec@arb_texture_storage@texture-storage,Crash
+spec@arb_texture_storage@texture-storage@3D mipmapped ,Fail
+spec@arb_texture_storage@texture-storage@3D mipmapped (EXT_dsa),Fail
+spec@arb_texture_storage@texture-storage@3D non-mipmapped ,Fail
+spec@arb_texture_storage@texture-storage@3D non-mipmapped (EXT_dsa),Fail
+spec@ext_direct_state_access@multi-texture@MultiTexImage3DEXT,Fail
+spec@ext_direct_state_access@textures,Crash
+spec@ext_direct_state_access@textures@CopyTextureSubImage3DEXT + display list GL_COMPILE_AND_EXECUTE,Fail
+spec@ext_direct_state_access@textures@CopyTextureSubImage3DEXT + display list GL_COMPILE,Fail
+spec@ext_direct_state_access@textures@CopyTextureSubImage3DEXT,Fail
+spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_3D + glTex* + display list GL_COMPILE_AND_EXECUTE,Fail
+spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_3D + glTex* + display list GL_COMPILE,Fail
+spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_3D + glTex*,Fail
+spec@ext_direct_state_access@textures@TextureImage3DEXT + display list GL_COMPILE_AND_EXECUTE,Fail
+spec@ext_direct_state_access@textures@TextureImage3DEXT + display list GL_COMPILE,Fail
+spec@ext_direct_state_access@textures@TextureImage3DEXT,Fail
+spec@ext_direct_state_access@textures@TextureSubImage3DEXT + display list GL_COMPILE_AND_EXECUTE,Fail
+spec@ext_direct_state_access@textures@TextureSubImage3DEXT + display list GL_COMPILE,Fail
+spec@ext_direct_state_access@textures@TextureSubImage3DEXT,Fail
+spec@ext_framebuffer_object@fbo-3d,Fail
+spec@glsl-1.10@execution@texture3d-computed-coord,Fail
+spec@glsl-1.10@execution@texture3d,Fail
+spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture() 3d,Fail
+spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture(bias) 3d,Fail
+spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj 3d,Fail
+spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj(bias) 3d,Fail
+spec@khr_texture_compression_astc@basic-gl,Fail
+
+glx@glx-make-current,Fail
glx@glx-swap-pixmap-bad,Fail
-glx@glx-visuals-depth -pixmap,Crash
-glx@glx-visuals-depth,Crash
-glx@glx-visuals-stencil -pixmap,Crash
-glx@glx-visuals-stencil,Crash
glx@glx_arb_create_context_es2_profile@invalid opengl es version,Fail
glx@glx_arb_create_context_no_error@no error,Fail
-glx@glx_ext_import_context@free context,Fail
-glx@glx_ext_import_context@get context id,Fail
-glx@glx_ext_import_context@get current display,Fail
-glx@glx_ext_import_context@import context- multi process,Fail
-glx@glx_ext_import_context@import context- single process,Fail
-glx@glx_ext_import_context@imported context has same context id,Fail
-glx@glx_ext_import_context@make current- multi process,Fail
-glx@glx_ext_import_context@make current- single process,Fail
-glx@glx_ext_import_context@query context info,Fail
+
+# piglit: error: Test timed out.
+glx@glx_arb_sync_control@waitformsc,Fail
+
+glslparsertest@glsl2@gst-gl-text-download-i420-yv12.frag,Fail
shaders@glsl-arb-fragment-coord-conventions,Fail
shaders@glsl-bug-110796,Fail
shaders@glsl-max-vertex-attrib,Fail
-shaders@glsl-predication-on-large-array,Fail
-spec@!opengl 1.0@gl-1.0-bitmap-heart-dance,Fail
-spec@!opengl 1.0@gl-1.0-dlist-bitmap,Crash
spec@!opengl 1.0@gl-1.0-drawbuffer-modes,Fail
spec@!opengl 1.0@gl-1.0-edgeflag,Fail
spec@!opengl 1.0@gl-1.0-edgeflag-const,Fail
spec@!opengl 1.0@gl-1.0-edgeflag-quads,Fail
-spec@!opengl 1.0@gl-1.0-logicop,Crash
spec@!opengl 1.0@gl-1.0-no-op-paths,Fail
spec@!opengl 1.0@gl-1.0-scissor-offscreen,Fail
spec@!opengl 1.0@gl-1.0-user-clip-all-planes,Fail
@@ -682,33 +763,53 @@ spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)-
spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail
spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail
spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail
-spec@!opengl 1.1@depthstencil-default_fb-blit samples=2,Crash
-spec@!opengl 1.1@depthstencil-default_fb-blit samples=4,Crash
-spec@!opengl 1.1@depthstencil-default_fb-clear samples=2,Crash
-spec@!opengl 1.1@depthstencil-default_fb-clear samples=4,Crash
+spec@arb_clear_texture@arb_clear_texture-3d,Fail
+spec@arb_clear_texture@arb_clear_texture-sized-formats,Fail
+spec@arb_clear_texture@arb_clear_texture-supported-formats,Fail
+spec@glsl-1.10@execution@glsl-fs-inline-explosion,Crash
+spec@glsl-1.10@execution@glsl-vs-inline-explosion,Crash
+spec@glsl-1.20@compiler@invalid-vec4-array-to-vec3-array-conversion.vert,Fail
+
+# fails on arm64, passes on armhf
+spec@arb_depth_buffer_float@depthstencil-render-miplevels 1024 s=z24_s8_d=z32f,Fail
+
+# Crashes in this group are CMA allocation fails
+spec@!opengl 1.1@depthstencil-default_fb-clear samples=2,Fail
+spec@!opengl 1.1@depthstencil-default_fb-clear samples=4,Fail
spec@!opengl 1.1@depthstencil-default_fb-clear,Fail
-spec@!opengl 1.1@depthstencil-default_fb-copypixels samples=2,Crash
-spec@!opengl 1.1@depthstencil-default_fb-copypixels samples=4,Crash
-spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=2,Crash
-spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=4,Crash
+spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=2,Fail
+spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=4,Fail
spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=2,Crash
spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=4,Crash
-spec@!opengl 1.1@depthstencil-default_fb-readpixels-24_8 samples=2,Crash
-spec@!opengl 1.1@depthstencil-default_fb-readpixels-24_8 samples=4,Crash
-spec@!opengl 1.1@depthstencil-default_fb-readpixels-float-and-ushort samples=2,Crash
-spec@!opengl 1.1@depthstencil-default_fb-readpixels-float-and-ushort samples=4,Crash
-spec@!opengl 1.1@draw-pixels,Fail
-spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 varray gl_line_loop,Fail
+
+# These unsupported primitive draws are converted by Mesa into indexed
+# draws with supported primitives. But these indexed draws require
+# 4-byte indices due to the number of vertices to draw, while our
+# hardware is limited to 2-byte indices at most.
spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 varray gl_polygon,Crash
spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 varray gl_quad_strip,Crash
spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 varray gl_quads,Crash
-spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 varray gl_triangle_fan,Fail
-spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 vbo gl_line_loop,Fail
spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 vbo gl_polygon,Crash
spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 vbo gl_quad_strip,Crash
spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 vbo gl_quads,Crash
+
+# GFXH-515 / SW-5891: the binner uses a 16-bit index for drawarrays, so
+# the draw is split into multiple calls. This is not supported for
+# triangle fans or line loops because the first vertex must always be
+# included, which would require creating a new vertex buffer holding
+# the remaining vertices plus the first one.
+spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 varray gl_line_loop,Fail
+spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 varray gl_triangle_fan,Fail
+spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 vbo gl_line_loop,Fail
spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 vbo gl_triangle_fan,Fail
+
+spec@!opengl 1.1@draw-pixels,Fail
spec@!opengl 1.1@line-flat-clip-color,Fail
+
+# The hardware does not support line/polygon stipple. In fact, this
+# feature was deprecated/removed in newer OpenGL spec versions. It
+# could be emulated using shaders.
+spec@!opengl 1.1@line-smooth-stipple,Fail
spec@!opengl 1.1@linestipple,Fail
spec@!opengl 1.1@linestipple@Baseline,Fail
spec@!opengl 1.1@linestipple@Factor 2x,Fail
@@ -716,6 +817,10 @@ spec@!opengl 1.1@linestipple@Factor 3x,Fail
spec@!opengl 1.1@linestipple@Line loop,Fail
spec@!opengl 1.1@linestipple@Line strip,Fail
spec@!opengl 1.1@linestipple@Restarting lines within a single Begin-End block,Fail
+spec@!opengl 2.1@pbo,Fail
+spec@!opengl 2.1@pbo@test_polygon_stip,Fail
+spec@!opengl 2.1@polygon-stipple-fs,Fail
+
spec@!opengl 1.1@polygon-mode,Fail
spec@!opengl 1.1@polygon-mode-offset,Fail
spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on bottom edge,Fail
@@ -742,11 +847,6 @@ spec@!opengl 1.1@polygon-mode-offset@config 5: Expected white pixel on top edge,
spec@!opengl 1.1@polygon-mode-offset@config 6: Expected blue pixel in center,Fail
spec@!opengl 1.1@polygon-mode-offset@config 6: Expected white pixel on right edge,Fail
spec@!opengl 1.1@polygon-mode-offset@config 6: Expected white pixel on top edge,Fail
-spec@!opengl 1.1@read-front clear-front-first samples=2,Crash
-spec@!opengl 1.1@read-front clear-front-first samples=4,Crash
-spec@!opengl 1.1@read-front samples=2,Crash
-spec@!opengl 1.1@read-front samples=4,Crash
-spec@!opengl 1.1@tex-upside-down-miptree,Fail
spec@!opengl 1.1@texsubimage-unpack,Fail
spec@!opengl 1.1@texwrap 2d proj,Fail
spec@!opengl 1.1@texwrap 2d proj@GL_RGBA8- NPOT- projected,Fail
@@ -787,25 +887,10 @@ spec@!opengl 1.1@texwrap formats@GL_RGBA16- swizzled,Fail
spec@!opengl 1.1@texwrap formats@GL_RGBA8,Fail
spec@!opengl 1.1@texwrap formats@GL_RGBA8- NPOT,Fail
spec@!opengl 1.1@texwrap formats@GL_RGBA8- swizzled,Fail
-spec@!opengl 1.1@windowoverlap,Fail
-spec@!opengl 1.2@copyteximage 3d,Fail
-spec@!opengl 1.2@getteximage-targets 3d,Fail
spec@!opengl 1.2@lodclamp,Fail
spec@!opengl 1.2@lodclamp-between,Fail
spec@!opengl 1.2@lodclamp-between-max,Fail
spec@!opengl 1.2@mipmap-setup,Fail
-spec@!opengl 1.2@tex3d,Fail
-spec@!opengl 1.2@tex3d-maxsize,Fail
-spec@!opengl 1.2@teximage-errors,Fail
-spec@!opengl 1.2@texwrap 3d proj,Fail
-spec@!opengl 1.2@texwrap 3d proj@GL_RGBA8- NPOT- projected,Fail
-spec@!opengl 1.2@texwrap 3d proj@GL_RGBA8- projected,Fail
-spec@!opengl 1.2@texwrap 3d proj@GL_RGBA8- swizzled- projected,Fail
-spec@!opengl 1.2@texwrap 3d,Fail
-spec@!opengl 1.2@texwrap 3d@GL_RGBA8,Fail
-spec@!opengl 1.2@texwrap 3d@GL_RGBA8- NPOT,Fail
-spec@!opengl 1.2@texwrap 3d@GL_RGBA8- swizzled,Fail
-spec@!opengl 1.3@tex3d-depth1,Fail
spec@!opengl 1.4@gl-1.4-polygon-offset,Fail
spec@!opengl 1.4@tex-miplevel-selection,Fail
spec@!opengl 1.4@tex-miplevel-selection-lod,Fail
@@ -814,14 +899,6 @@ spec@!opengl 1.5@depth-tex-compare,Fail
spec@!opengl 2.0@attrib-assignments,Fail
spec@!opengl 2.0@gl-2.0-edgeflag,Fail
spec@!opengl 2.0@gl-2.0-edgeflag-immediate,Fail
-spec@!opengl 2.0@occlusion-query-discard,Fail
-spec@!opengl 2.0@tex3d-npot,Fail
-spec@!opengl 2.1@minmax,Fail
-spec@!opengl 2.1@pbo,Fail
-spec@!opengl 2.1@pbo@test_polygon_stip,Fail
-spec@!opengl 2.1@polygon-stipple-fs,Fail
-spec@!opengl es 2.0@draw_buffers_gles2,Fail
-spec@arb_arrays_of_arrays@execution@glsl-arrays-copy-size-mismatch,Fail
spec@arb_depth_texture@depth-level-clamp,Fail
spec@arb_depth_texture@texwrap formats,Fail
spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT16,Fail
@@ -835,7 +912,6 @@ spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT32- NPOT,Fail
spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT32- swizzled,Fail
spec@arb_draw_elements_base_vertex@arb_draw_elements_base_vertex-negative-index,Crash
spec@arb_draw_elements_base_vertex@arb_draw_elements_base_vertex-negative-index-user_varrays,Crash
-spec@arb_es2_compatibility@arb_es2_compatibility-drawbuffers,Fail
spec@arb_es2_compatibility@texwrap formats,Fail
spec@arb_es2_compatibility@texwrap formats@GL_RGB565,Fail
spec@arb_es2_compatibility@texwrap formats@GL_RGB565- NPOT,Fail
@@ -844,58 +920,24 @@ spec@arb_fragment_coord_conventions@fp-arb-fragment-coord-conventions-integer,Fa
spec@arb_fragment_coord_conventions@fp-arb-fragment-coord-conventions-none,Fail
spec@arb_fragment_program@fp-indirections2,Fail
spec@arb_fragment_program@minmax,Fail
-spec@arb_framebuffer_object@arb_framebuffer_object-depth-stencil-blit stencil gl_depth24_stencil8,Fail
-spec@arb_framebuffer_object@arb_framebuffer_object-depth-stencil-blit stencil gl_stencil_index1,Fail
-spec@arb_framebuffer_object@arb_framebuffer_object-depth-stencil-blit stencil gl_stencil_index16,Fail
-spec@arb_framebuffer_object@arb_framebuffer_object-depth-stencil-blit stencil gl_stencil_index4,Fail
-spec@arb_framebuffer_object@arb_framebuffer_object-depth-stencil-blit stencil gl_stencil_index8,Fail
spec@arb_framebuffer_object@fbo-attachments-blit-scaled-linear,Fail
spec@arb_framebuffer_object@fbo-blit-stretch,Fail
-spec@arb_framebuffer_object@fbo-generatemipmap-3d,Fail
spec@arb_framebuffer_object@fbo-mipmap-copypix,Fail
-spec@arb_framebuffer_object@framebuffer-blit-levels draw stencil,Fail
-spec@arb_framebuffer_object@framebuffer-blit-levels read stencil,Fail
spec@arb_framebuffer_object@mixed-buffer-sizes,Fail
-spec@arb_framebuffer_object@same-attachment-glframebuffertexture2d-gl_depth_stencil_attachment,Fail
+spec@arb_framebuffer_object@same-attachment-tex2d-depth_stencil,Fail
spec@arb_framebuffer_srgb@arb_framebuffer_srgb-srgb_conformance,Fail
-spec@arb_framebuffer_srgb@blit renderbuffer linear_to_srgb downsample disabled clear,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer linear_to_srgb downsample disabled render,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer linear_to_srgb downsample enabled clear,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer linear_to_srgb downsample enabled render,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer linear_to_srgb msaa disabled clear,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer linear_to_srgb msaa disabled render,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer linear_to_srgb msaa enabled clear,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer linear_to_srgb msaa enabled render,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer srgb_to_linear downsample disabled clear,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer srgb_to_linear downsample disabled render,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer srgb_to_linear downsample enabled clear,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer srgb_to_linear downsample enabled render,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer srgb_to_linear msaa disabled clear,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer srgb_to_linear msaa disabled render,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer srgb_to_linear msaa enabled clear,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer srgb_to_linear msaa enabled render,Crash
-spec@arb_framebuffer_srgb@blit texture linear_to_srgb downsample disabled clear,Crash
-spec@arb_framebuffer_srgb@blit texture linear_to_srgb downsample disabled render,Crash
-spec@arb_framebuffer_srgb@blit texture linear_to_srgb downsample enabled clear,Crash
-spec@arb_framebuffer_srgb@blit texture linear_to_srgb downsample enabled render,Crash
-spec@arb_framebuffer_srgb@blit texture linear_to_srgb msaa disabled clear,Crash
-spec@arb_framebuffer_srgb@blit texture linear_to_srgb msaa disabled render,Crash
-spec@arb_framebuffer_srgb@blit texture linear_to_srgb msaa enabled clear,Crash
-spec@arb_framebuffer_srgb@blit texture linear_to_srgb msaa enabled render,Crash
-spec@arb_framebuffer_srgb@blit texture srgb_to_linear downsample disabled clear,Crash
-spec@arb_framebuffer_srgb@blit texture srgb_to_linear downsample disabled render,Crash
-spec@arb_framebuffer_srgb@blit texture srgb_to_linear downsample enabled clear,Crash
-spec@arb_framebuffer_srgb@blit texture srgb_to_linear downsample enabled render,Crash
-spec@arb_framebuffer_srgb@blit texture srgb_to_linear msaa disabled clear,Crash
-spec@arb_framebuffer_srgb@blit texture srgb_to_linear msaa disabled render,Crash
-spec@arb_framebuffer_srgb@blit texture srgb_to_linear msaa enabled clear,Crash
-spec@arb_framebuffer_srgb@blit texture srgb_to_linear msaa enabled render,Crash
spec@arb_internalformat_query2@all internalformat_<x>_size pname checks,Fail
spec@arb_internalformat_query2@all internalformat_<x>_size pname checks@GL_INTERNALFORMAT_ALPHA_SIZE,Fail
spec@arb_internalformat_query2@all internalformat_<x>_size pname checks@GL_INTERNALFORMAT_BLUE_SIZE,Fail
spec@arb_internalformat_query2@all internalformat_<x>_size pname checks@GL_INTERNALFORMAT_DEPTH_SIZE,Fail
spec@arb_internalformat_query2@all internalformat_<x>_size pname checks@GL_INTERNALFORMAT_GREEN_SIZE,Fail
spec@arb_internalformat_query2@all internalformat_<x>_size pname checks@GL_INTERNALFORMAT_RED_SIZE,Fail
+spec@arb_internalformat_query2@all internalformat_<x>_type pname checks,Fail
+spec@arb_internalformat_query2@all internalformat_<x>_type pname checks@GL_INTERNALFORMAT_ALPHA_TYPE,Fail
+spec@arb_internalformat_query2@all internalformat_<x>_type pname checks@GL_INTERNALFORMAT_BLUE_TYPE,Fail
+spec@arb_internalformat_query2@all internalformat_<x>_type pname checks@GL_INTERNALFORMAT_DEPTH_TYPE,Fail
+spec@arb_internalformat_query2@all internalformat_<x>_type pname checks@GL_INTERNALFORMAT_GREEN_TYPE,Fail
+spec@arb_internalformat_query2@all internalformat_<x>_type pname checks@GL_INTERNALFORMAT_RED_TYPE,Fail
spec@arb_internalformat_query2@api error checks,Fail
spec@arb_internalformat_query2@max dimensions related pname checks,Fail
@@ -903,147 +945,47 @@ spec@arb_internalformat_query2@max dimensions related pname checks@GL_MAX_DEPTH,
spec@arb_internalformat_query2@max dimensions related pname checks@GL_MAX_HEIGHT,Fail
spec@arb_internalformat_query2@max dimensions related pname checks@GL_MAX_WIDTH,Fail
spec@arb_occlusion_query2@render,Fail
-spec@arb_occlusion_query@occlusion_query,Fail
spec@arb_occlusion_query@occlusion_query_conform,Fail
-spec@arb_occlusion_query@occlusion_query_meta_fragments,Fail
-spec@arb_occlusion_query@occlusion_query_meta_save,Fail
+spec@arb_occlusion_query@occlusion_query_conform@GetObjivAval_multi2,Fail
spec@arb_pixel_buffer_object@fbo-pbo-readpixels-small,Fail
spec@arb_pixel_buffer_object@pbo-getteximage,Fail
spec@arb_pixel_buffer_object@texsubimage-unpack pbo,Fail
spec@arb_point_sprite@arb_point_sprite-mipmap,Fail
spec@arb_provoking_vertex@arb-provoking-vertex-render,Fail
spec@arb_sampler_objects@sampler-objects,Fail
-spec@arb_shader_texture_lod@execution@glsl-fs-texturelod-01,Fail
-spec@arb_texture_multisample@arb_texture_multisample-teximage-3d-multisample,Fail
spec@arb_texture_rectangle@1-1-linear-texture,Fail
-spec@arb_texture_rectangle@copyteximage rect samples=2,Crash
-spec@arb_texture_rectangle@copyteximage rect samples=4,Crash
spec@arb_texture_rectangle@texrect-many,Crash
-spec@arb_texture_storage@texture-storage,Fail
-spec@arb_texture_storage@texture-storage@3D mipmapped ,Fail
-spec@arb_texture_storage@texture-storage@3D non-mipmapped ,Fail
spec@arb_vertex_program@minmax,Fail
-spec@egl 1.4@egl-copy-buffers,Crash
spec@egl 1.4@eglterminate then unbind context,Fail
spec@egl 1.4@largest possible eglcreatepbuffersurface and then glclear,Fail
-spec@egl_ext_protected_content@conformance,Fail
spec@egl_khr_gl_image@egl_khr_gl_renderbuffer_image-clear-shared-image gl_depth_component24,Fail
spec@egl_khr_gl_image@egl_khr_gl_renderbuffer_image-clear-shared-image gl_rgba,Fail
spec@egl_khr_surfaceless_context@viewport,Fail
spec@egl_mesa_configless_context@basic,Fail
-spec@ext_direct_state_access@indexed-state-queries 12,Fail
-spec@ext_direct_state_access@indexed-state-queries 12@GetIntegerIndexedvEXT,Fail
spec@ext_direct_state_access@multi-texture,Crash
-spec@ext_direct_state_access@multi-texture@MultiTexImage3DEXT,Fail
spec@ext_direct_state_access@multi-texture@MultiTexSubImage1DEXT,Fail
-spec@ext_direct_state_access@textures,Fail
-spec@ext_direct_state_access@textures@CopyTextureSubImage3DEXT + display list GL_COMPILE,Fail
-spec@ext_direct_state_access@textures@CopyTextureSubImage3DEXT + display list GL_COMPILE_AND_EXECUTE,Fail
-spec@ext_direct_state_access@textures@CopyTextureSubImage3DEXT,Fail
spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_1D + glTex* + display list GL_COMPILE,Fail
spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_1D + glTex* + display list GL_COMPILE_AND_EXECUTE,Fail
spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_1D + glTex*,Fail
-spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_3D + glTex* + display list GL_COMPILE,Fail
-spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_3D + glTex* + display list GL_COMPILE_AND_EXECUTE,Fail
-spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_3D + glTex*,Fail
-spec@ext_direct_state_access@textures@TextureImage2DEXT + display list GL_COMPILE,Fail
-spec@ext_direct_state_access@textures@TextureImage2DEXT + display list GL_COMPILE_AND_EXECUTE,Fail
-spec@ext_direct_state_access@textures@TextureImage2DEXT,Fail
-spec@ext_direct_state_access@textures@TextureImage3DEXT + display list GL_COMPILE,Fail
-spec@ext_direct_state_access@textures@TextureImage3DEXT + display list GL_COMPILE_AND_EXECUTE,Fail
-spec@ext_direct_state_access@textures@TextureImage3DEXT,Fail
-spec@ext_direct_state_access@textures@TextureParameterfEXT + display list GL_COMPILE,Fail
-spec@ext_direct_state_access@textures@TextureParameterfEXT + display list GL_COMPILE_AND_EXECUTE,Fail
-spec@ext_direct_state_access@textures@TextureParameterfEXT,Fail
-spec@ext_direct_state_access@textures@TextureParameteriEXT + display list GL_COMPILE,Fail
-spec@ext_direct_state_access@textures@TextureParameteriEXT + display list GL_COMPILE_AND_EXECUTE,Fail
-spec@ext_direct_state_access@textures@TextureParameteriEXT,Fail
-spec@ext_direct_state_access@textures@TextureParameterivEXT + display list GL_COMPILE,Fail
-spec@ext_direct_state_access@textures@TextureParameterivEXT + display list GL_COMPILE_AND_EXECUTE,Fail
-spec@ext_direct_state_access@textures@TextureParameterivEXT,Fail
spec@ext_direct_state_access@textures@TextureSubImage2DEXT + display list GL_COMPILE,Fail
spec@ext_direct_state_access@textures@TextureSubImage2DEXT + display list GL_COMPILE_AND_EXECUTE,Fail
spec@ext_direct_state_access@textures@TextureSubImage2DEXT,Fail
-spec@ext_direct_state_access@textures@TextureSubImage3DEXT + display list GL_COMPILE,Fail
-spec@ext_direct_state_access@textures@TextureSubImage3DEXT + display list GL_COMPILE_AND_EXECUTE,Fail
-spec@ext_direct_state_access@textures@TextureSubImage3DEXT,Fail
spec@ext_framebuffer_blit@fbo-blit-check-limits,Fail
-spec@ext_framebuffer_multisample@blit-flipped 2 x,Crash
-spec@ext_framebuffer_multisample@blit-flipped 2 y,Crash
-spec@ext_framebuffer_multisample@blit-flipped 4 x,Crash
-spec@ext_framebuffer_multisample@blit-flipped 4 y,Crash
+
+# Remaining crashes are CMA allocation failures.
spec@ext_framebuffer_multisample@blit-mismatched-formats,Fail
-spec@ext_framebuffer_multisample@clip-and-scissor-blit 2 downsample,Crash
-spec@ext_framebuffer_multisample@clip-and-scissor-blit 2 msaa,Crash
-spec@ext_framebuffer_multisample@clip-and-scissor-blit 2 upsample,Crash
-spec@ext_framebuffer_multisample@clip-and-scissor-blit 4 downsample,Crash
-spec@ext_framebuffer_multisample@clip-and-scissor-blit 4 msaa,Crash
-spec@ext_framebuffer_multisample@clip-and-scissor-blit 4 upsample,Crash
-spec@ext_framebuffer_multisample@enable-flag,Crash
+spec@ext_framebuffer_multisample@clip-and-scissor-blit 2 msaa,Fail
+spec@ext_framebuffer_multisample@clip-and-scissor-blit 4 msaa,Fail
spec@ext_framebuffer_multisample@interpolation 2 centroid-edges,Fail
spec@ext_framebuffer_multisample@interpolation 4 centroid-edges,Fail
-spec@ext_framebuffer_multisample@line-smooth 2,Crash
-spec@ext_framebuffer_multisample@line-smooth 4,Crash
-spec@ext_framebuffer_multisample@multisample-blit 2 color linear,Crash
-spec@ext_framebuffer_multisample@multisample-blit 2 color,Crash
-spec@ext_framebuffer_multisample@multisample-blit 2 depth,Crash
-spec@ext_framebuffer_multisample@multisample-blit 2 stencil,Crash
-spec@ext_framebuffer_multisample@multisample-blit 4 color linear,Crash
-spec@ext_framebuffer_multisample@multisample-blit 4 color,Crash
-spec@ext_framebuffer_multisample@multisample-blit 4 depth,Crash
-spec@ext_framebuffer_multisample@multisample-blit 4 stencil,Crash
-spec@ext_framebuffer_multisample@no-color 2 depth combined,Crash
-spec@ext_framebuffer_multisample@no-color 2 depth single,Crash
-spec@ext_framebuffer_multisample@no-color 2 depth-computed combined,Crash
-spec@ext_framebuffer_multisample@no-color 2 depth-computed single,Crash
-spec@ext_framebuffer_multisample@no-color 2 stencil combined,Crash
-spec@ext_framebuffer_multisample@no-color 2 stencil single,Crash
-spec@ext_framebuffer_multisample@no-color 4 depth combined,Crash
-spec@ext_framebuffer_multisample@no-color 4 depth single,Crash
-spec@ext_framebuffer_multisample@no-color 4 depth-computed combined,Crash
-spec@ext_framebuffer_multisample@no-color 4 depth-computed single,Crash
-spec@ext_framebuffer_multisample@no-color 4 stencil combined,Crash
-spec@ext_framebuffer_multisample@no-color 4 stencil single,Crash
-spec@ext_framebuffer_multisample@point-smooth 2,Crash
-spec@ext_framebuffer_multisample@point-smooth 4,Crash
-spec@ext_framebuffer_multisample@polygon-smooth 2,Crash
-spec@ext_framebuffer_multisample@polygon-smooth 4,Crash
spec@ext_framebuffer_multisample@sample-alpha-to-coverage 2 color,Fail
-spec@ext_framebuffer_multisample@sample-alpha-to-coverage 2 depth,Crash
spec@ext_framebuffer_multisample@sample-alpha-to-coverage 4 color,Fail
-spec@ext_framebuffer_multisample@sample-alpha-to-coverage 4 depth,Crash
-spec@ext_framebuffer_multisample@sample-coverage 2 inverted,Crash
-spec@ext_framebuffer_multisample@sample-coverage 2 non-inverted,Crash
-spec@ext_framebuffer_multisample@sample-coverage 4 inverted,Crash
-spec@ext_framebuffer_multisample@sample-coverage 4 non-inverted,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 2 color downsample,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 2 color msaa,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 2 color upsample,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 2 depth downsample,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 2 depth msaa,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 2 depth upsample,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 4 color downsample,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 4 color msaa,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 4 color upsample,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 4 depth downsample,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 4 depth msaa,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 4 depth upsample,Crash
-spec@ext_framebuffer_multisample@upsample 2 color linear,Crash
-spec@ext_framebuffer_multisample@upsample 2 color,Crash
-spec@ext_framebuffer_multisample@upsample 2 depth,Crash
-spec@ext_framebuffer_multisample@upsample 2 stencil,Crash
-spec@ext_framebuffer_multisample@upsample 4 color linear,Crash
-spec@ext_framebuffer_multisample@upsample 4 color,Crash
-spec@ext_framebuffer_multisample@upsample 4 depth,Crash
-spec@ext_framebuffer_multisample@upsample 4 stencil,Crash
-spec@ext_framebuffer_multisample_blit_scaled@negative-blit-scaled,Crash
-spec@ext_framebuffer_object@fbo-3d,Fail
-spec@ext_framebuffer_object@fbo-blending-format-quirks,Fail
+spec@ext_framebuffer_multisample@sample-coverage 2 inverted,Fail
+spec@ext_framebuffer_multisample@sample-coverage 2 non-inverted,Fail
+spec@ext_framebuffer_multisample@sample-coverage 4 inverted,Fail
+spec@ext_framebuffer_multisample@sample-coverage 4 non-inverted,Fail
+
spec@ext_framebuffer_object@fbo-depth-sample-compare,Fail
-spec@ext_framebuffer_object@fbo-stencil-gl_stencil_index1-blit,Fail
-spec@ext_framebuffer_object@fbo-stencil-gl_stencil_index16-blit,Fail
-spec@ext_framebuffer_object@fbo-stencil-gl_stencil_index4-blit,Fail
-spec@ext_framebuffer_object@fbo-stencil-gl_stencil_index8-blit,Fail
spec@ext_image_dma_buf_import@ext_image_dma_buf_import-export,Fail
spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_p010,Fail
spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_p012,Fail
@@ -1054,10 +996,8 @@ spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y216,Fail
spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y410,Fail
spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y412,Fail
spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y416,Fail
-spec@ext_occlusion_query_boolean@any-samples,Fail
spec@ext_packed_depth_stencil@depth_stencil texture,Fail
spec@ext_packed_depth_stencil@fbo-depthstencil-gl_depth24_stencil8-clear,Fail
-spec@ext_packed_depth_stencil@fbo-stencil-gl_depth24_stencil8-blit,Fail
spec@ext_packed_depth_stencil@texwrap formats,Fail
spec@ext_packed_depth_stencil@texwrap formats@GL_DEPTH24_STENCIL8,Fail
spec@ext_packed_depth_stencil@texwrap formats@GL_DEPTH24_STENCIL8- NPOT,Fail
@@ -1087,6 +1027,24 @@ spec@ext_texture_srgb@texwrap formats@GL_SRGB8- swizzled,Fail
spec@ext_texture_srgb@texwrap formats@GL_SRGB8_ALPHA8,Fail
spec@ext_texture_srgb@texwrap formats@GL_SRGB8_ALPHA8- NPOT,Fail
spec@ext_texture_srgb@texwrap formats@GL_SRGB8_ALPHA8- swizzled,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled@GL_COMPRESSED_SLUMINANCE- swizzled- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled@GL_COMPRESSED_SLUMINANCE_ALPHA- swizzled- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled@GL_COMPRESSED_SRGB- swizzled- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled@GL_COMPRESSED_SRGB_ALPHA- swizzled- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT- swizzled- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT- swizzled- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT- swizzled- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled@GL_COMPRESSED_SRGB_S3TC_DXT1_EXT- swizzled- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor@GL_COMPRESSED_SLUMINANCE- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor@GL_COMPRESSED_SLUMINANCE_ALPHA- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor@GL_COMPRESSED_SRGB- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor@GL_COMPRESSED_SRGB_ALPHA- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor@GL_COMPRESSED_SRGB_S3TC_DXT1_EXT- border color only,Fail
spec@glsl-1.10@built-in constants,Fail
spec@glsl-1.10@built-in constants@gl_MaxVertexAttribs,Fail
spec@glsl-1.10@execution@built-in-functions@fs-cos-float,Fail
@@ -1153,12 +1111,7 @@ spec@glsl-1.10@execution@built-in-functions@vs-tan-float,Fail
spec@glsl-1.10@execution@built-in-functions@vs-tan-vec2,Fail
spec@glsl-1.10@execution@built-in-functions@vs-tan-vec3,Fail
spec@glsl-1.10@execution@built-in-functions@vs-tan-vec4,Fail
-spec@glsl-1.10@execution@fs-texture-select,Fail
spec@glsl-1.10@execution@glsl-fs-convolution-2,Fail
-spec@glsl-1.10@execution@samplers@glsl-fs-sampler-numbering-2,Fail
-spec@glsl-1.10@execution@samplers@glsl-fs-sampler-numbering-3,Fail
-spec@glsl-1.10@execution@samplers@in-parameter-array,Fail
-spec@glsl-1.10@execution@texture3d,Fail
spec@glsl-1.20@built-in constants,Fail
spec@glsl-1.20@built-in constants@gl_MaxVertexAttribs,Fail
spec@glsl-1.20@execution@fs-nan-builtin-max,Fail
@@ -1167,13 +1120,11 @@ spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture() 1d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture() 1dshadow,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture() 2d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture() 2dshadow,Fail
-spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture() 3d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture() cube,Crash
spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture(bias) 1d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture(bias) 1dshadow,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture(bias) 2d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture(bias) 2dshadow,Fail
-spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture(bias) 3d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture(bias) cube,Crash
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj 1d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj 1d_projvec4,Fail
@@ -1181,19 +1132,15 @@ spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj 1dshadow,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj 2d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj 2d_projvec4,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj 2dshadow,Fail
-spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj 3d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj(bias) 1d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj(bias) 1d_projvec4,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj(bias) 1dshadow,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj(bias) 2d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj(bias) 2d_projvec4,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj(bias) 2dshadow,Fail
-spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj(bias) 3d,Fail
-spec@glsl-1.20@execution@variable-indexing@fs-temp-array-mat4-index-col-row-wr,Fail
-spec@glsl-1.20@execution@variable-indexing@vs-temp-array-mat4-index-col-row-wr,Fail
+
spec@glsl-1.20@execution@vs-nan-builtin-max,Fail
spec@glsl-1.20@execution@vs-nan-builtin-min,Fail
-spec@intel_performance_query@intel_performance_query-issue_2235,Fail
spec@khr_texture_compression_astc@basic-gles,Fail
spec@khr_texture_compression_astc@miptree-gl ldr,Fail
spec@khr_texture_compression_astc@miptree-gl ldr@LDR Profile,Fail
@@ -1208,3 +1155,48 @@ spec@khr_texture_compression_astc@miptree-gles ldr@LDR Profile,Fail
spec@khr_texture_compression_astc@miptree-gles srgb,Fail
spec@khr_texture_compression_astc@miptree-gles srgb-fp,Fail
spec@oes_compressed_etc1_rgb8_texture@miptree,Fail
+spec@!opengl 1.0@depth-clear-precision-check,Fail
+spec@!opengl 1.0@depth-clear-precision-check@depth16,Fail
+spec@!opengl 1.0@depth-clear-precision-check@depth32,Fail
+
+spec@glsl-1.10@execution@variable-indexing@vs-output-array-vec2-index-wr-no-unroll,Fail
+
+spec@ext_framebuffer_multisample@accuracy 2 depth_draw depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy 2 depth_draw small depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy 2 depth_resolve depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy 2 depth_resolve small depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy 4 depth_draw depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy 4 depth_draw small depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy 4 depth_resolve depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy 4 depth_resolve small depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy 4 srgb depthstencil linear,Fail
+spec@ext_framebuffer_multisample@accuracy 4 srgb depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy 4 srgb small depthstencil linear,Fail
+spec@ext_framebuffer_multisample@accuracy 4 srgb small depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy all_samples depth_draw depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy all_samples depth_draw small depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy all_samples depth_resolve depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy all_samples depth_resolve small depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy all_samples srgb depthstencil linear,Fail
+spec@ext_framebuffer_multisample@accuracy all_samples srgb depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy all_samples srgb small depthstencil linear,Fail
+spec@ext_framebuffer_multisample@accuracy all_samples srgb small depthstencil,Fail
+spec@ext_framebuffer_multisample@multisample-blit 2 depth,Fail
+spec@ext_framebuffer_multisample@multisample-blit 4 depth,Fail
+spec@ext_framebuffer_multisample@no-color 2 depth combined,Fail
+spec@ext_framebuffer_multisample@no-color 2 depth-computed combined,Fail
+spec@ext_framebuffer_multisample@no-color 4 depth combined,Fail
+spec@ext_framebuffer_multisample@no-color 4 depth-computed combined,Fail
+spec@ext_framebuffer_multisample@unaligned-blit 2 depth msaa,Fail
+spec@ext_framebuffer_multisample@unaligned-blit 2 stencil msaa,Fail
+spec@ext_framebuffer_multisample@unaligned-blit 4 depth msaa,Fail
+spec@ext_framebuffer_multisample@unaligned-blit 4 stencil msaa,Fail
+
+# https://gitlab.freedesktop.org/mesa/piglit/-/merge_requests/817
+spec@intel_performance_query@intel_performance_query-issue_2235,Fail
+
+# Bisected to 35ae5dce39c ("mesa: don't pass Infs to the shader via gl_Fog.scale")
+spec@glsl-1.10@execution@glsl-1.10-built-in-uniform-state,Fail
+
+# Couldn't reproduce locally
+spec@oes_packed_depth_stencil@depth_stencil texture gles2,Fail
diff --git a/src/broadcom/ci/broadcom-rpi3-flakes.txt b/src/broadcom/ci/broadcom-rpi3-flakes.txt
new file mode 100644
index 00000000000..7e11d7da34e
--- /dev/null
+++ b/src/broadcom/ci/broadcom-rpi3-flakes.txt
@@ -0,0 +1,52 @@
+dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_neg_x_and_pos_x_and_neg_x_neg_y_neg_z
+dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_neg_x_and_pos_x_and_pos_y_pos_z
+dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_neg_x_and_pos_x_neg_y_pos_z_and_neg_x_pos_y_neg_z
+dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_pos_x_and_neg_x_neg_y_pos_z_and_neg_x_pos_y_neg_z
+
+glx@glx-multi-window-single-context
+glx@glx-visuals-stencil
+shaders@glsl-vs-loop
+shaders@glsl-vs-loop-nested
+spec@ext_framebuffer_blit@fbo-sys-blit
+spec@ext_framebuffer_blit@fbo-sys-sub-blit
+spec@egl_chromium_sync_control@conformance
+
+# CMA allocations that may sometimes succeed
+spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=2
+spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=4
+spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=2
+spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=4
+spec@!opengl 1.1@depthstencil-default_fb-clear samples=2
+
+# https://gitlab.freedesktop.org/mesa/mesa/-/issues/7186
+spec@!opengl 1.0@rasterpos
+
+# Sometimes fail when run along with other tests, never when run by themselves
+spec@!opengl 1.1@copypixels-sync
+spec@!opengl 1.1@copypixels-draw-sync
+spec@!opengl 1.1@draw-copypixels-sync
+spec@!opengl 1.1@draw-sync
+
+# flaky on wayland, was stable on x11
+spec@ext_image_dma_buf_import@ext_image_dma_buf_import
+
+# fails on arm64, passes on armhf
+spec@arb_depth_buffer_float@depthstencil-render-miplevels 1024 s=z24_s8_d=z32f
+
+# Sometimes goes into an infinite loop and times out
+spec@arb_depth_buffer_float@depthstencil-render-miplevels 146 s=z24_s8_d=z32f_s8
+
+spec@arb_depth_texture@depthstencil-render-miplevels 273 d=z24
+spec@arb_shader_texture_lod@execution@tex-miplevel-selection *lod 1d
+spec@arb_occlusion_query2@render
+
+# Updated by ci-collate, found in this job run: https://gitlab.freedesktop.org/mesa/mesa/-/jobs/56164970
+glx@glx-multithread-clearbuffer
+
+spec@arb_vertex_buffer_object@vbo-subdata-many drawarrays
+spec@arb_vertex_buffer_object@vbo-subdata-many drawelements
+spec@arb_vertex_buffer_object@vbo-subdata-many drawrangeelements
+
+# Nightly run expectations update
+spec@glsl-1.20@execution@variable-indexing@fs-uniform-mat2-rd
+
diff --git a/src/broadcom/ci/deqp-vc4-rpi3-skips.txt b/src/broadcom/ci/broadcom-rpi3-skips.txt
index 62d4d939d2d..6da79a463a7 100644
--- a/src/broadcom/ci/deqp-vc4-rpi3-skips.txt
+++ b/src/broadcom/ci/broadcom-rpi3-skips.txt
@@ -5,10 +5,6 @@
# This is causing a binning memory overflow problem
dEQP-GLES2.functional.fragment_ops.scissor.outside_render_line

-# These are very slow
-dEQP-GLES2.functional.uniform_api.random.3
-dEQP-GLES2.functional.uniform_api.random.79
-
# Conformance issue: VC4 needs dynamic loops in the VS to cause a
# shader link failure.
#
@@ -20,6 +16,21 @@ dEQP-GLES2.functional.uniform_api.random.79
# list for tracking.
dEQP-GLES2.functional.shaders.loops.*dynamic.*vertex

-# Timeout tests (> 1 minute to run)
-KHR-GLES2.texture_3d.filtering.sizes.3x7x5_linear_mipmap_linear
-KHR-GLES2.texture_3d.filtering.sizes.4x8x8_linear_mipmap_linear
+# Slow tests (> 1 minute to run)
+spec@!opengl 1.1@streaming-texture-leak
+
+# Versions / Extensions not supported
+spec@!opengl 3.*
+spec@!opengl 4.*
+spec@!opengl es 3.*
+spec@arb_gpu_shader5.*
+spec@arb_gpu_shader_fp64.*
+spec@arb_gpu_shader_int64.*
+spec@arb_tessellation_shader.*
+spec@arb_texture_cube_map.*
+spec@glsl-1.30.*
+spec@glsl-1.40.*
+spec@glsl-1.50.*
+spec@glsl-3.*
+spec@glsl-4.*
+spec@glsl-es-3.*
diff --git a/src/broadcom/ci/broadcom-rpi4-fails.txt b/src/broadcom/ci/broadcom-rpi4-fails.txt
new file mode 100644
index 00000000000..bac3d618634
--- /dev/null
+++ b/src/broadcom/ci/broadcom-rpi4-fails.txt
@@ -0,0 +1,602 @@
+glx@glx-make-current,Fail
+glx@glx-multi-window-single-context,Fail
+glx@glx-swap-pixmap-bad,Fail
+glx@glx-visuals-depth -pixmap,Fail
+glx@glx-visuals-stencil -pixmap,Fail
+glx@glx_arb_create_context_es2_profile@invalid opengl es version,Fail
+glx@glx_arb_create_context_no_error@no error,Fail
+shaders@glsl-bug-110796,Fail
+shaders@point-vertex-id divisor,Fail
+shaders@point-vertex-id gl_instanceid divisor,Fail
+shaders@point-vertex-id gl_instanceid,Fail
+shaders@point-vertex-id gl_vertexid divisor,Fail
+shaders@point-vertex-id gl_vertexid gl_instanceid divisor,Fail
+shaders@point-vertex-id gl_vertexid gl_instanceid,Fail
+shaders@point-vertex-id gl_vertexid,Fail
+spec@!opengl 1.0@gl-1.0-edgeflag,Fail
+spec@!opengl 1.0@gl-1.0-edgeflag-quads,Fail
+spec@!opengl 1.0@gl-1.0-no-op-paths,Fail
+spec@!opengl 1.0@gl-1.0-user-clip-all-planes,Fail
+spec@!opengl 1.1@point-line-no-cull,Fail
+spec@!opengl 1.1@teximage-colors gl_alpha16@Exact upload-download of GL_ALPHA16,Fail
+spec@!opengl 1.1@texwrap formats bordercolor,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_INTENSITY12- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_INTENSITY16- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE12- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE12_ALPHA12- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE12_ALPHA4- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE16- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE16_ALPHA16- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_RGB12- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_RGB16- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_RGBA12- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_RGBA16- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_INTENSITY12- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_INTENSITY16- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE12- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE12_ALPHA12- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE12_ALPHA4- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE16- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE16_ALPHA16- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_RGB12- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_RGB16- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_RGBA12- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_RGBA16- border color only,Fail
+spec@!opengl 1.4@gl-1.4-polygon-offset,Fail
+spec@!opengl 2.0@gl-2.0-edgeflag,Fail
+spec@!opengl 2.0@gl-2.0-edgeflag-immediate,Fail
+spec@arb_color_buffer_float@gl_rgba32f-render,Fail
+spec@arb_color_buffer_float@gl_rgba32f-render-fog,Fail
+spec@arb_color_buffer_float@gl_rgba32f-render-sanity,Fail
+spec@arb_color_buffer_float@gl_rgba32f-render-sanity-fog,Fail
+spec@arb_copy_image@arb_copy_image-formats,Fail
+spec@arb_copy_image@arb_copy_image-formats@Source: GL_ALPHA16/Destination: GL_ALPHA16,Fail
+spec@arb_depth_buffer_float@fbo-generatemipmap-formats,Fail
+spec@arb_depth_buffer_float@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT32F NPOT,Fail
+spec@arb_depth_buffer_float@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT32F,Fail
+spec@arb_depth_buffer_float@texwrap formats bordercolor,Fail
+spec@arb_depth_buffer_float@texwrap formats bordercolor-swizzled,Fail
+spec@arb_depth_buffer_float@texwrap formats bordercolor-swizzled@GL_DEPTH32F_STENCIL8- swizzled- border color only,Fail
+spec@arb_depth_buffer_float@texwrap formats bordercolor-swizzled@GL_DEPTH_COMPONENT32F- swizzled- border color only,Fail
+spec@arb_depth_buffer_float@texwrap formats bordercolor@GL_DEPTH32F_STENCIL8- border color only,Fail
+spec@arb_depth_buffer_float@texwrap formats bordercolor@GL_DEPTH_COMPONENT32F- border color only,Fail
+spec@arb_depth_buffer_float@texwrap formats offset,Fail
+spec@arb_depth_buffer_float@texwrap formats offset@GL_DEPTH32F_STENCIL8- NPOT,Fail
+spec@arb_depth_buffer_float@texwrap formats offset@GL_DEPTH_COMPONENT32F- NPOT,Fail
+spec@arb_depth_buffer_float@texwrap formats,Fail
+spec@arb_depth_buffer_float@texwrap formats@GL_DEPTH32F_STENCIL8- NPOT,Fail
+spec@arb_depth_buffer_float@texwrap formats@GL_DEPTH_COMPONENT32F- NPOT,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT NPOT,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT16 NPOT,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT16,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT24 NPOT,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT24,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT32 NPOT,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT32,Fail
+spec@arb_depth_texture@texwrap formats bordercolor,Fail
+spec@arb_depth_texture@texwrap formats bordercolor-swizzled,Fail
+spec@arb_depth_texture@texwrap formats bordercolor-swizzled@GL_DEPTH_COMPONENT16- swizzled- border color only,Fail
+spec@arb_depth_texture@texwrap formats bordercolor-swizzled@GL_DEPTH_COMPONENT24- swizzled- border color only,Fail
+spec@arb_depth_texture@texwrap formats bordercolor-swizzled@GL_DEPTH_COMPONENT32- swizzled- border color only,Fail
+spec@arb_depth_texture@texwrap formats bordercolor@GL_DEPTH_COMPONENT16- border color only,Fail
+spec@arb_depth_texture@texwrap formats bordercolor@GL_DEPTH_COMPONENT24- border color only,Fail
+spec@arb_depth_texture@texwrap formats bordercolor@GL_DEPTH_COMPONENT32- border color only,Fail
+spec@arb_depth_texture@texwrap formats offset,Fail
+spec@arb_depth_texture@texwrap formats offset@GL_DEPTH_COMPONENT16- NPOT,Fail
+spec@arb_depth_texture@texwrap formats offset@GL_DEPTH_COMPONENT24- NPOT,Fail
+spec@arb_depth_texture@texwrap formats offset@GL_DEPTH_COMPONENT32- NPOT,Fail
+spec@arb_depth_texture@texwrap formats,Fail
+spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT16- NPOT,Fail
+spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT24- NPOT,Fail
+spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT32- NPOT,Fail
+spec@arb_direct_state_access@gettextureimage-formats init-by-rendering,Fail
+spec@arb_direct_state_access@gettextureimage-formats,Fail
+spec@arb_framebuffer_object@fbo-blit-scaled-linear,Fail
+spec@arb_point_sprite@arb_point_sprite-checkerboard,Fail
+spec@arb_point_sprite@arb_point_sprite-mipmap,Fail
+spec@arb_shader_storage_buffer_object@compiler@atomicmin-swizzle.vert,Fail
+spec@arb_shader_texture_lod@execution@arb_shader_texture_lod-texgrad,Fail
+spec@arb_shader_texture_lod@execution@arb_shader_texture_lod-texgradcube,Fail
+spec@arb_texture_buffer_object@formats (fs- arb),Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_ALPHA16F_ARB,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_ALPHA32F_ARB,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_INTENSITY16F_ARB,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_INTENSITY16I_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_INTENSITY16UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_INTENSITY32F_ARB,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_INTENSITY32I_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_INTENSITY32UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_INTENSITY8I_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_INTENSITY8UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE16F_ARB,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE16I_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE16UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE32F_ARB,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE32I_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE32UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE8I_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE8UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE_ALPHA16F_ARB,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE_ALPHA16I_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE_ALPHA16UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE_ALPHA32F_ARB,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE_ALPHA32I_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE_ALPHA32UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE_ALPHA8I_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE_ALPHA8UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb),Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_ALPHA16F_ARB,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_ALPHA32F_ARB,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_INTENSITY16F_ARB,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_INTENSITY16I_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_INTENSITY16UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_INTENSITY32F_ARB,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_INTENSITY32I_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_INTENSITY32UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_INTENSITY8I_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_INTENSITY8UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE16F_ARB,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE16I_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE16UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE32F_ARB,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE32I_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE32UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE8I_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE8UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE_ALPHA16F_ARB,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE_ALPHA16I_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE_ALPHA16UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE_ALPHA32F_ARB,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE_ALPHA32I_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE_ALPHA32UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE_ALPHA8I_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE_ALPHA8UI_EXT,Fail
+spec@arb_texture_buffer_object@texture-buffer-size-clamp,Fail
+spec@arb_texture_buffer_object@texture-buffer-size-clamp@r8ui_texture_buffer_size_via_sampler,Fail
+spec@arb_texture_buffer_object@texture-buffer-size-clamp@rg8ui_texture_buffer_size_via_sampler,Fail
+spec@arb_texture_buffer_object@texture-buffer-size-clamp@rgba8ui_texture_buffer_size_via_sampler,Fail +spec@arb_texture_float@fbo-blending-formats,Fail +spec@arb_texture_float@fbo-blending-formats@GL_ALPHA32F_ARB,Fail +spec@arb_texture_float@fbo-blending-formats@GL_INTENSITY32F_ARB,Fail +spec@arb_texture_float@fbo-blending-formats@GL_LUMINANCE32F_ARB,Fail +spec@arb_texture_float@fbo-blending-formats@GL_LUMINANCE_ALPHA32F_ARB,Fail +spec@arb_texture_float@fbo-blending-formats@GL_RGB32F,Fail +spec@arb_texture_float@fbo-blending-formats@GL_RGBA32F,Fail +spec@arb_texture_float@texwrap formats bordercolor,Fail +spec@arb_texture_float@texwrap formats bordercolor-swizzled,Fail +spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_ALPHA32F_ARB- swizzled- border color only,Fail +spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_INTENSITY32F_ARB- swizzled- border color only,Fail +spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_LUMINANCE32F_ARB- swizzled- border color only,Fail +spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_LUMINANCE_ALPHA32F_ARB- swizzled- border color only,Fail +spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_RGB32F- swizzled- border color only,Fail +spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_RGBA32F- swizzled- border color only,Fail +spec@arb_texture_float@texwrap formats bordercolor@GL_ALPHA32F_ARB- border color only,Fail +spec@arb_texture_float@texwrap formats bordercolor@GL_INTENSITY32F_ARB- border color only,Fail +spec@arb_texture_float@texwrap formats bordercolor@GL_LUMINANCE32F_ARB- border color only,Fail +spec@arb_texture_float@texwrap formats bordercolor@GL_LUMINANCE_ALPHA32F_ARB- border color only,Fail +spec@arb_texture_float@texwrap formats bordercolor@GL_RGB32F- border color only,Fail +spec@arb_texture_float@texwrap formats bordercolor@GL_RGBA32F- border color only,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch.*,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_R16I,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_R16_SNORM,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_R32I,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_R8I,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_R8_SNORM,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RG16F,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RG16I,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RG16_SNORM,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RG32F,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RG32I,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RG8I,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RG8_SNORM,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGB10,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGB10_A2,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGB10_A2UI,Fail 
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGB16I,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGB16_SNORM,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGB32I,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGB4,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGB8,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGB8I,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGB9_E5,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGBA16,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGBA16F,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGBA16I,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGBA32F,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGBA32I,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGBA4,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGBA8,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGBA8I,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_SRGB8_ALPHA8,Fail +spec@arb_texture_rectangle@1-1-linear-texture,Fail +spec@arb_texture_rg@fbo-blending-formats-float,Fail +spec@arb_texture_rg@fbo-blending-formats-float@GL_R32F,Fail +spec@arb_texture_rg@fbo-blending-formats-float@GL_RG32F,Fail +spec@arb_texture_rg@texwrap formats bordercolor,Fail +spec@arb_texture_rg@texwrap formats bordercolor-swizzled,Fail +spec@arb_texture_rg@texwrap formats bordercolor-swizzled@GL_R16- swizzled- border color only,Fail +spec@arb_texture_rg@texwrap formats bordercolor-swizzled@GL_RG16- swizzled- border color only,Fail +spec@arb_texture_rg@texwrap formats bordercolor@GL_R16- border color only,Fail +spec@arb_texture_rg@texwrap formats bordercolor@GL_RG16- border color only,Fail +spec@arb_texture_rg@texwrap formats-float bordercolor,Fail +spec@arb_texture_rg@texwrap formats-float bordercolor-swizzled,Fail +spec@arb_texture_rg@texwrap formats-float bordercolor-swizzled@GL_R32F- swizzled- border color only,Fail +spec@arb_texture_rg@texwrap formats-float bordercolor-swizzled@GL_RG32F- swizzled- border color only,Fail +spec@arb_texture_rg@texwrap formats-float bordercolor@GL_R32F- border color only,Fail +spec@arb_texture_rg@texwrap formats-float bordercolor@GL_RG32F- border color only,Fail +spec@arb_texture_rg@texwrap formats-float offset,Fail +spec@arb_texture_rg@texwrap formats-float offset@GL_R32F- NPOT,Fail +spec@arb_texture_rg@texwrap formats-float offset@GL_RG32F- NPOT,Fail +spec@arb_texture_rg@texwrap formats-float,Fail +spec@arb_texture_rg@texwrap formats-float@GL_R32F- NPOT,Fail +spec@arb_texture_rg@texwrap formats-float@GL_RG32F- NPOT,Fail +spec@arb_texture_storage@texture-storage@cube array texture,Fail +spec@egl 1.4@eglterminate then unbind context,Fail +spec@egl_khr_gl_image@egl_khr_gl_renderbuffer_image-clear-shared-image gl_depth_component24,Fail +spec@egl_khr_gl_image@egl_khr_gl_renderbuffer_image-clear-shared-image gl_rgba,Fail +spec@egl_khr_surfaceless_context@viewport,Fail 
+spec@egl_mesa_configless_context@basic,Fail +spec@ext_framebuffer_multisample@blit-mismatched-formats,Fail +spec@ext_framebuffer_multisample@interpolation 2 centroid-edges,Fail +spec@ext_framebuffer_multisample@interpolation 4 centroid-edges,Fail +spec@ext_framebuffer_object@getteximage-formats init-by-clear-and-render,Fail +spec@ext_framebuffer_object@getteximage-formats init-by-rendering,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-export,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-modifiers,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-modifiers@autogen-R16-DRM_FORMAT_MOD_LINEAR-clear_reimport,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-refcount,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_argb8888,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_nv12,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_nv21,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_p010,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_p012,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_p016,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_uyvy,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_vyuy,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_xrgb8888,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y210,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y212,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y216,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y412,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y416,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_yuv420,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_yuyv,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_yvu420,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_yvyu,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-transcode-nv12-as-r8-gr88,Fail +spec@ext_packed_depth_stencil@texwrap formats bordercolor,Fail +spec@ext_packed_depth_stencil@texwrap formats bordercolor-swizzled,Fail +spec@ext_packed_depth_stencil@texwrap formats bordercolor-swizzled@GL_DEPTH24_STENCIL8- swizzled- border color only,Fail +spec@ext_packed_depth_stencil@texwrap formats bordercolor@GL_DEPTH24_STENCIL8- border color only,Fail +spec@ext_packed_depth_stencil@texwrap formats offset,Fail +spec@ext_packed_depth_stencil@texwrap formats offset@GL_DEPTH24_STENCIL8- NPOT,Fail +spec@ext_packed_depth_stencil@texwrap formats,Fail +spec@ext_packed_depth_stencil@texwrap formats@GL_DEPTH24_STENCIL8- NPOT,Fail +spec@ext_packed_float@query-rgba-signed-components,Fail +spec@ext_texture_integer@fbo-blending,Fail +spec@ext_texture_integer@getteximage-clamping gl_arb_texture_rg,Fail +spec@ext_texture_integer@getteximage-clamping,Fail +spec@ext_texture_integer@multisample-formats 2 gl_ext_texture_integer,Fail +spec@ext_texture_integer@multisample-formats 4 gl_ext_texture_integer,Fail +spec@ext_texture_integer@texwrap formats bordercolor,Fail +spec@ext_texture_integer@texwrap formats bordercolor-swizzled,Fail +spec@ext_texture_integer@texwrap formats bordercolor-swizzled@GL_ALPHA16I_EXT- swizzled- border color only,Fail +spec@ext_texture_integer@texwrap formats bordercolor-swizzled@GL_ALPHA16UI_EXT- swizzled- border color only,Fail +spec@ext_texture_integer@texwrap formats 
bordercolor-swizzled@GL_ALPHA32I_EXT- swizzled- border color only,Fail +spec@ext_texture_integer@texwrap formats bordercolor-swizzled@GL_ALPHA32UI_EXT- swizzled- border color only,Fail +spec@ext_texture_integer@texwrap formats bordercolor-swizzled@GL_ALPHA8I_EXT- swizzled- border color only,Fail +spec@ext_texture_integer@texwrap formats bordercolor-swizzled@GL_ALPHA8UI_EXT- swizzled- border color only,Fail +spec@ext_texture_integer@texwrap formats bordercolor@GL_ALPHA16I_EXT- border color only,Fail +spec@ext_texture_integer@texwrap formats bordercolor@GL_ALPHA16UI_EXT- border color only,Fail +spec@ext_texture_integer@texwrap formats bordercolor@GL_ALPHA32I_EXT- border color only,Fail +spec@ext_texture_integer@texwrap formats bordercolor@GL_ALPHA32UI_EXT- border color only,Fail +spec@ext_texture_integer@texwrap formats bordercolor@GL_ALPHA8I_EXT- border color only,Fail +spec@ext_texture_integer@texwrap formats bordercolor@GL_ALPHA8UI_EXT- border color only,Fail +spec@ext_texture_integer@texwrap formats offset,Fail +spec@ext_texture_integer@texwrap formats offset@GL_ALPHA8I_EXT,Fail +spec@ext_texture_integer@texwrap formats offset@GL_ALPHA8I_EXT- NPOT,Fail +spec@ext_texture_integer@texwrap formats offset@GL_ALPHA8I_EXT- swizzled,Fail +spec@ext_texture_integer@texwrap formats,Fail +spec@ext_texture_integer@texwrap formats@GL_ALPHA8I_EXT,Fail +spec@ext_texture_integer@texwrap formats@GL_ALPHA8I_EXT- NPOT,Fail +spec@ext_texture_integer@texwrap formats@GL_ALPHA8I_EXT- swizzled,Fail +spec@ext_texture_lod_bias@lodbias,Fail +spec@ext_texture_snorm@texwrap formats bordercolor,Fail +spec@ext_texture_snorm@texwrap formats bordercolor-swizzled,Fail +spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_ALPHA16_SNORM- swizzled- border color only,Fail +spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_INTENSITY16_SNORM- swizzled- border color only,Fail +spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_LUMINANCE16_ALPHA16_SNORM- swizzled- border color only,Fail +spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_LUMINANCE16_SNORM- swizzled- border color only,Fail +spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_R16_SNORM- swizzled- border color only,Fail +spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_RG16_SNORM- swizzled- border color only,Fail +spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_RGB16_SNORM- swizzled- border color only,Fail +spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_RGBA16_SNORM- swizzled- border color only,Fail +spec@ext_texture_snorm@texwrap formats bordercolor@GL_ALPHA16_SNORM- border color only,Fail +spec@ext_texture_snorm@texwrap formats bordercolor@GL_INTENSITY16_SNORM- border color only,Fail +spec@ext_texture_snorm@texwrap formats bordercolor@GL_LUMINANCE16_ALPHA16_SNORM- border color only,Fail +spec@ext_texture_snorm@texwrap formats bordercolor@GL_LUMINANCE16_SNORM- border color only,Fail +spec@ext_texture_snorm@texwrap formats bordercolor@GL_R16_SNORM- border color only,Fail +spec@ext_texture_snorm@texwrap formats bordercolor@GL_RG16_SNORM- border color only,Fail +spec@ext_texture_snorm@texwrap formats bordercolor@GL_RGB16_SNORM- border color only,Fail +spec@ext_texture_snorm@texwrap formats bordercolor@GL_RGBA16_SNORM- border color only,Fail +spec@ext_transform_feedback@tessellation line_loop flat_first,Fail +spec@ext_transform_feedback@tessellation line_loop flat_last,Fail +spec@ext_transform_feedback@tessellation line_loop monochrome,Fail 
+spec@ext_transform_feedback@tessellation line_loop smooth,Fail +spec@ext_transform_feedback@tessellation triangle_fan flat_first,Fail +spec@ext_transform_feedback@tessellation triangle_strip flat_first,Fail +spec@glsl-1.10@execution@glsl-fs-inline-explosion,Crash +spec@glsl-1.10@execution@glsl-vs-inline-explosion,Crash +spec@glsl-1.20@compiler@invalid-vec4-array-to-vec3-array-conversion.vert,Fail +spec@glsl-1.20@execution@clipping@vs-clip-vertex-primitives,Fail +spec@glsl-1.20@execution@fs-underflow-mul-compare-zero,Fail +spec@khr_texture_compression_astc@miptree-gles srgb-fp,Fail +spec@khr_texture_compression_astc@miptree-gles srgb-fp@sRGB decode full precision,Fail +spec@khr_texture_compression_astc@sliced-3d-miptree-gles srgb-fp,Fail +spec@khr_texture_compression_astc@sliced-3d-miptree-gles srgb-fp@sRGB decode full precision,Fail +spec@nv_copy_image@nv_copy_image-formats,Fail +spec@nv_copy_image@nv_copy_image-formats@Source: GL_ALPHA16/Destination: GL_ALPHA16,Fail +spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-2d-array.frag,Fail +spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-2d-array.vert,Fail +spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-2d.frag,Fail +spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-2d.vert,Fail +spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-3d.frag,Fail +spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-3d.vert,Fail +spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-cube.frag,Fail +spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-cube.vert,Fail +spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-2d-array.frag,Fail +spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-2d-array.vert,Fail +spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-2d.frag,Fail +spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-2d.vert,Fail +spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-3d.frag,Fail +spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-3d.vert,Fail +spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-cube.frag,Fail +spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-cube.vert,Fail +spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-2d-array.frag,Fail +spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-2d-array.vert,Fail +spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-2d.frag,Fail +spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-2d.vert,Fail +spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-3d.frag,Fail +spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-3d.vert,Fail +spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-cube.frag,Fail +spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-cube.vert,Fail +spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-2d-array.frag,Fail +spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-2d-array.vert,Fail +spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-2d.frag,Fail +spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-2d.vert,Fail +spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-3d.frag,Fail +spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-3d.vert,Fail +spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-cube.frag,Fail +spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-cube.vert,Fail 
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-2d-array.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-2d-array.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-2d.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-2d.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-3d.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-3d.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-cube.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-cube.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-2d-array.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-2d-array.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-2d.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-2d.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-3d.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-3d.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-cube.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-cube.vert,Fail
+spec@nv_read_depth@read_depth_gles3,Fail
+spec@oes_point_sprite@arb_point_sprite-checkerboard_gles1,Fail
+spec@oes_shader_io_blocks@compiler@layout-location-aliasing.vert,Fail
+
+# This crashes only when LLVM is not enabled. This is because the Gallium
+# backend uses a TGSI path for some tasks that does not contain a sampler;
+# when LLVM is enabled, it uses LLVM instead, which is complete.
+spec@!opengl 1.0@rasterpos,Crash
+
+# https://gitlab.freedesktop.org/mesa/piglit/-/merge_requests/899
+spec@!opengl 1.0@depth-clear-precision-check,Fail
+
+# There are two problems here. First, the hardware does not support
+# different polygon modes for the front and back faces. By default we
+# choose the mode set for the front face, unless we are culling it; in
+# that case we choose the mode set for the back face. Second, we do not
+# support rendering quads, so Gallium decomposes them into triangles.
+# The drawback is that when the polygon mode is set to lines, we render
+# an extra edge, as the sketch below illustrates.
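+#
+# For illustration (an assumed reproduction, not code from the tests): a
+# quad drawn with
+#
+#   glPolygonMode(GL_FRONT_AND_BACK, GL_LINE);
+#   glBegin(GL_QUADS); /* vertices A, B, C, D */ glEnd();
+#
+# reaches the hardware as the triangles ABC and ACD, so the shared
+# diagonal A-C is rasterized as an extra line.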
+spec@!opengl 1.1@polygon-mode,Fail
+spec@!opengl 1.1@polygon-mode-facing,Fail
+spec@!opengl 1.1@polygon-mode-offset,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on bottom edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on left edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on right edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on top edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 1: Expected blue pixel in center,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 1: Expected white pixel on right edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 1: Expected white pixel on top edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 2: Expected blue pixel in center,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 2: Expected white pixel on right edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 2: Expected white pixel on top edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on bottom edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on left edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on right edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on top edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on bottom edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on left edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on right edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on top edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 5: Expected blue pixel in center,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 5: Expected white pixel on right edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 5: Expected white pixel on top edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 6: Expected blue pixel in center,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 6: Expected white pixel on right edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 6: Expected white pixel on top edge,Fail
+
+# V3D does not support PIPE_FORMAT_{R16,R16G16,R16G16B16A16}_UNORM for
+# rendering
+spec@!opengl 3.0@required-texture-attachment-formats,Fail
+spec@!opengl 3.1@required-texture-attachment-formats,Fail
+spec@arb_texture_view@rendering-formats,Crash
+
+# V3D does not support blending for GL_R{GBA}32F
+spec@!opengl 1.1@getteximage-formats,Fail
+
+# OpenGL 3.x requires 8 render targets (MAX_DRAW_BUFFERS) / color attachments (MAX_COLOR_ATTACHMENTS)
+spec@!opengl 3.0@bindfragdata-link-error,Fail
+spec@!opengl 3.0@bindfragdata-nonexistent-variable,Fail
+spec@!opengl 3.0@clearbuffer-mixed-format,Fail
+spec@!opengl 3.0@getfragdatalocation,Fail
+spec@!opengl 3.0@minmax,Fail
+spec@!opengl 3.1@minmax,Fail
+spec@glsl-1.30@built-in constants,Fail
+spec@glsl-1.30@built-in constants@gl_MaxDrawBuffers,Fail
+spec@glsl-1.40@built-in constants,Fail
+spec@glsl-1.40@built-in constants@gl_MaxDrawBuffers,Fail
+
+# OpenGL 3.x applies non-seamless cubemap texturing, while our
+# driver/GLES uses seamless cubemap texturing.
+spec@!opengl 3.0@sampler-cube-shadow,Fail
+spec@arb_texture_cube_map_array@arb_texture_cube_map_array-sampler-cube-array-shadow,Fail
+
+# Precision differences between the expected and obtained results; these
+# pass when exporting V3D_DEBUG=tmu32, as in the example below.
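+# For example (a hypothetical piglit invocation; only the V3D_DEBUG=tmu32
+# knob comes from the note above):
+#   V3D_DEBUG=tmu32 piglit run quick results/ -t oes_texture_view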
+spec@oes_texture_view@rendering-formats,Fail
+spec@oes_texture_view@rendering-formats@clear GL_R8 as GL_R8I,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RG8 as GL_R16F,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RG8 as GL_R16I,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RG8 as GL_RG8I,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RGBA8 as GL_R32F,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RGBA8 as GL_R32I,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RGBA8 as GL_RG16F,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RGBA8 as GL_RG16I,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RGBA8 as GL_RGBA8I,Fail
+
+# Also related to precision issues
+spec@oes_texture_view@rendering-formats@clear GL_RGB10_A2 as GL_R32F,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RGB10_A2 as GL_R32I,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RGB10_A2 as GL_RG16F,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RGB10_A2 as GL_RG16I,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RGB10_A2 as GL_RGBA8I,Fail
+
+spec@!opengl 1.0@depth-clear-precision-check@depth16,Fail
+spec@!opengl 1.0@depth-clear-precision-check@depth24,Fail
+
+# This fails the GL_ALPHA16 subtest because we don't support a 16-bit unorm format for rendering,
+# so Gallium falls back to an 8-bit unorm format and we lose some precision in the result.
+spec@arb_clear_texture@arb_clear_texture-sized-formats,Fail
+
+# These fail because the shaders use indirect indexing on samplers, which we
+# don't support (the GLSL linker fails to link the shaders because of this).
+# If loop unrolling kicks in for these tests, it removes the indirect indexing
+# and the tests pass, but that would just work around an issue in the
+# tests.
+spec@!opengl 2.0@max-samplers,Fail
+spec@!opengl 2.0@max-samplers border,Fail
+
+# The hardware does not support line/polygon stipple. This feature was
+# deprecated and removed in newer OpenGL spec versions; it could be
+# emulated using shaders, as sketched below.
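+# A minimal sketch of that idea (hypothetical, not what the driver does):
+#   /* fragment shader: 'pattern' and 'factor' mirror the
+#      glLineStipple(factor, pattern) state; 'stipple_pos' is an assumed
+#      counter of fragments along the line */
+#   if ((pattern & (1 << ((stipple_pos / factor) & 15))) == 0)
+#       discard;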
+spec@!opengl 1.1@line-smooth-stipple,Fail
+spec@!opengl 1.1@linestipple,Fail
+spec@!opengl 1.1@linestipple@Factor 2x,Fail
+spec@!opengl 1.1@linestipple@Factor 3x,Fail
+spec@!opengl 1.1@linestipple@Line loop,Fail
+spec@!opengl 1.1@linestipple@Line strip,Fail
+spec@!opengl 1.1@linestipple@Restarting lines within a single Begin-End block,Fail
+spec@!opengl 2.1@pbo,Fail
+spec@!opengl 2.1@pbo@test_polygon_stip,Fail
+spec@!opengl 2.1@polygon-stipple-fs,Fail
+
+# Works when run individually, but fails consistently in the CI
+dEQP-GLES3.functional.texture.specification.teximage2d_pbo.rgba32i_cube,Fail
+
+# https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/4422
+KHR-GL31.texture_size_promotion.functional,Fail
+
+# uprev Piglit in Mesa
+spec@glsl-1.40@uniform_buffer@two-stages,Fail
+
+# The RPi4 only supports 4 RTs, so these tests, which use 8 RTs, will fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 128 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 128 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 128 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 128 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 128 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 128 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 64 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 64 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 64 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 64 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 64 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 64 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 8 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 8 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 8 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 8 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 8 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 8 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 128 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 128 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 128 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 128 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 128 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 128 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 64 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 64 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 64 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 64 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 64 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 64 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 8 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 8 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 8 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 8 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 8 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 8 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 128 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 128 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 128 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 128 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 128 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 128 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 64 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 64 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 64 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 64 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 64 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 64 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 8 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 8 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 8 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 8 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 8 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 8 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 128 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 128 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 128 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 128 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 128 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 128 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 64 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 64 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 64 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 64 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 64 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 64 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 8 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 8 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 8 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 8 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 8 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 8 8,Fail
+
+# This seems to be a Vulkan Loader issue. It can be fixed by compiling the loader from the GitHub repo.
+dEQP-VK.api.get_device_proc_addr.non_enabled,Fail +# This is a bug in CTS: https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/5096 +dEQP-VK.api.command_buffers.many_indirect_draws_on_secondary,Fail + +# New CTS failures in 1.3.8.2 +dEQP-VK.api.info.vulkan1p2_limits_validation.khr_vertex_attribute_divisor,Fail diff --git a/src/broadcom/ci/broadcom-rpi4-flakes.txt b/src/broadcom/ci/broadcom-rpi4-flakes.txt new file mode 100644 index 00000000000..c1a2cd94b04 --- /dev/null +++ b/src/broadcom/ci/broadcom-rpi4-flakes.txt @@ -0,0 +1,48 @@ +KHR-GLES31.core.shader_image_load_store.basic-glsl-earlyFragTests +dEQP-GLES31.functional.ssbo.layout.instance_array_basic_type.std430.ivec4 + +glx@glx_arb_sync_control@waitformsc +spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=2 +spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=4 +spec@!opengl 1.1@depthstencil-default_fb-drawpixels-32f_24_8_rev samples=2 +spec@!opengl 1.1@depthstencil-default_fb-drawpixels-32f_24_8_rev samples=4 +spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=2 +spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=4 +spec@!opengl 1.1@masked-clear +spec@arb_occlusion_query@occlusion_query_order +spec@arb_texture_multisample@large-float-texture +spec@egl_chromium_sync_control@conformance +spec@ext_packed_depth_stencil@depthstencil-render-miplevels 585 ds=z24_s8 + +# Seen this one flake a few times already +spec@egl 1.4@largest possible eglcreatepbuffersurface and then glclear + +# This test works alone, but fails when executing all the tests together +# https://gitlab.freedesktop.org/mesa/mesa/-/issues/8684 +dEQP-GLES3.functional.texture.specification.teximage2d_pbo.rgba32f_cube +dEQP-GLES3.functional.texture.specification.teximage2d_pbo.rgba32i_cube + +# Seem reliable on arm64, but they flake on armhf +dEQP-VK.glsl.builtin.function.integer.findMSB.ivec2_mediump_geometry +dEQP-VK.glsl.builtin.function.integer.findMSB.ivec2_highp_geometry + +# Failed twice one day with two different bad renders, and never since: +# https://gitlab.freedesktop.org/eric/mesa/-/jobs/37556931 +# https://gitlab.freedesktop.org/eric/mesa/-/jobs/37596148 +dEQP-VK.renderpass2.suballocation.load_store_op_none.depthstencil_d24_unorm_s8_uint_load_op_depth_load_stencil_none_store_op_depth_store_stencil_none_stencil_write_off + +# first encounter 01/04/2023 +spec@ext_framebuffer_blit@fbo-sys-blit +spec@ext_framebuffer_blit@fbo-sys-sub-blit + +dEQP-VK.fragment_operations.occlusion_query.precise_test_scissors_depth_write_stencil_clear_stencil_write +dEQP-VK.memory.pipeline_barrier.transfer_dst_uniform_texel_buffer.1024 +dEQP-VK.memory.pipeline_barrier.transfer_dst_uniform_texel_buffer.1048576 +dEQP-VK.memory_model.message_passing.ext.u32.noncoherent.atomic_atomic.atomicrmw.device.payload_local.image.guard_local.image.frag +dEQP-VK.memory_model.message_passing.ext.u32.noncoherent.fence_fence.atomicwrite.device.payload_local.image.guard_local.buffer.frag +dEQP-VK.pipeline.monolithic.image.suballocation.sampling_type.combined.view_type.1d_array.format.r8_unorm.count_1.size.443x1_array_of_6 +dEQP-VK.renderpass.suballocation.load_store_op_none.depthstencil_d24_unorm_s8_uint_load_op_depth_load_stencil_none_store_op_depth_store_stencil_none_stencil_write_off +dEQP-VK.synchronization.basic.timeline_semaphore.one_queue +dEQP-VK.synchronization2.basic.timeline_semaphore.one_queue 
+dEQP-VK.synchronization2.signal_order.shared_binary_semaphore.write_ssbo_compute_indirect_read_ssbo_geometry.buffer_262144_opaque_fd +dEQP-VK.texture.shadow.cube.linear.less_d24_unorm_s8_uint diff --git a/src/broadcom/ci/broadcom-rpi4-skips.txt b/src/broadcom/ci/broadcom-rpi4-skips.txt new file mode 100644 index 00000000000..66d371eaae2 --- /dev/null +++ b/src/broadcom/ci/broadcom-rpi4-skips.txt @@ -0,0 +1,293 @@ +# Slow tests (> 1 minute to run) +spec@!opengl 1.1@streaming-texture-leak +spec@!opengl 1.2@tex3d-maxsize +spec@arb_texture_multisample@texelfetch fs sampler2dms 4 1x130-501x130 +spec@arb_texture_multisample@texelfetch fs sampler2dms 4 1x71-501x71 +spec@arb_texture_multisample@texelfetch fs sampler2dmsarray 4 98x1x9-98x129x9 +spec@glsl-1.30@execution@texelfetch fs sampler2d 1x281-501x281 + +# Versions / Extensions not supported +spec@!opengl 3.2@.* +spec@!opengl 3.3@.* +spec@!opengl 4.2@.* +spec@!opengl 4.3@.* +spec@!opengl 4.4@.* +spec@!opengl 4.5@.* +spec@arb_gpu_shader5.* +spec@arb_gpu_shader_fp64.* +spec@arb_gpu_shader_int64.* +spec@arb_tessellation_shader.* +spec@glsl-1.50.* +spec@glsl-3.* +spec@glsl-4.* +spec@glsl-es-3.20.* + +# Broadcom waivers +dEQP-VK.draw.renderpass.inverted_depth_ranges.nodepthclamp_deltazero +dEQP-VK.rasterization.depth_bias.d32_sfloat + +# Kernel blocks (probably GMP violations) +spec@arb_shading_language_420pack@active sampler conflict +spec@arb_texture_buffer_object@render-no-bo + +# Slow tests (> 1 minute to run) +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.multi.std140.comp +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.multi.std140.frag +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.multi.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.single.std140.comp +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.single.std140.comp_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.single.std140.frag +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.single.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.single.std140.vert_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.multi.std140.comp +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.multi.std140.frag +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.multi.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.single.std140.comp +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.single.std140.comp_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.single.std140.frag +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.single.std140.frag_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.single.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.single.std140.vert_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertuvec2.nostore.multi.std140.vert 
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertuvec2.nostore.single.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertuvec2.nostore.single.std140.vert_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertuvec2.store.multi.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertuvec2.store.single.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertuvec2.store.single.std140.vert_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.load.nostore.multi.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.load.nostore.single.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.load.nostore.single.std140.vert_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.load.store.multi.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.load.store.single.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.load.store.single.std140.vert_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.nostore.multi.std140.comp +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.nostore.multi.std140.frag +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.nostore.multi.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.nostore.single.std140.comp +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.nostore.single.std140.frag +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.nostore.single.std140.frag_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.nostore.single.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.nostore.single.std140.vert_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.store.multi.std140.comp +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.store.multi.std140.frag +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.store.multi.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.store.single.std140.comp +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.store.single.std140.frag +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.store.single.std140.frag_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.store.single.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.store.single.std140.vert_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertuvec2.nostore.multi.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertuvec2.nostore.single.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertuvec2.nostore.single.std140.vert_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertuvec2.store.multi.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertuvec2.store.single.std140.vert 
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertuvec2.store.single.std140.vert_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.load.nostore.multi.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.load.nostore.single.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.load.nostore.single.std140.vert_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.load.store.multi.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.load.store.single.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.load.store.single.std140.vert_offset_nonzero +dEQP-VK.graphicsfuzz.spv-stable-maze-flatten-copy-composite +dEQP-VK.memory.mapping.dedicated_alloc.buffer.full.variable.implicit_unmap +dEQP-VK.memory.mapping.dedicated_alloc.image.full.variable.implicit_unmap +dEQP-VK.memory.mapping.suballocation.full.variable.implicit_unmap +dEQP-VK.spirv_assembly.instruction.graphics.spirv_ids_abuse.lots_ids_geom +dEQP-VK.spirv_assembly.instruction.graphics.spirv_ids_abuse.lots_ids_vert +dEQP-VK.ssbo.layout.3_level_array.std140.column_major_mat4 +dEQP-VK.ssbo.layout.3_level_array.std140.column_major_mat4_comp_access +dEQP-VK.ssbo.layout.3_level_array.std140.column_major_mat4_comp_access_store_cols +dEQP-VK.ssbo.layout.3_level_array.std140.column_major_mat4_store_cols +dEQP-VK.ssbo.layout.3_level_array.std140.mat4 +dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4 +dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4_comp_access +dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4_comp_access_store_cols +dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4_store_cols +dEQP-VK.ssbo.layout.3_level_array.std430.column_major_mat4 +dEQP-VK.ssbo.layout.3_level_array.std430.column_major_mat4_comp_access +dEQP-VK.ssbo.layout.3_level_array.std430.column_major_mat4_comp_access_store_cols +dEQP-VK.ssbo.layout.3_level_array.std430.column_major_mat4_store_cols +dEQP-VK.ssbo.layout.3_level_array.std430.mat4 +dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4 +dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4_comp_access +dEQP-VK.ssbo.layout.3_level_unsized_array.std140.column_major_mat4 +dEQP-VK.ssbo.layout.3_level_unsized_array.std140.mat4 +dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4 +dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4_comp_access +dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4_store_cols +dEQP-VK.ssbo.layout.3_level_unsized_array.std430.column_major_mat4 +dEQP-VK.ssbo.layout.3_level_unsized_array.std430.column_major_mat4_comp_access +dEQP-VK.ssbo.layout.3_level_unsized_array.std430.column_major_mat4_comp_access_store_cols +dEQP-VK.ssbo.layout.3_level_unsized_array.std430.column_major_mat4_store_cols +dEQP-VK.ssbo.layout.3_level_unsized_array.std430.mat4 +dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4 +dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4_comp_access +dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4_store_cols +dEQP-VK.ssbo.layout.random.16bit.all_per_block_buffers.47 +dEQP-VK.ssbo.layout.random.16bit.all_per_block_buffers.5 +dEQP-VK.ssbo.layout.random.8bit.all_per_block_buffers.5 +dEQP-VK.ssbo.layout.random.8bit.all_per_block_buffers.6 +dEQP-VK.ssbo.layout.random.8bit.nested_structs_arrays_instance_arrays.15 +dEQP-VK.ssbo.layout.random.8bit.nested_structs_arrays_instance_arrays.9 
+dEQP-VK.ssbo.layout.random.all_shared_buffer.3 +dEQP-VK.ssbo.layout.random.arrays_of_arrays.13 +dEQP-VK.ssbo.layout.random.nested_structs_arrays.17 +dEQP-VK.ssbo.phys.layout.2_level_array.std140.row_major_mat4 +dEQP-VK.ssbo.phys.layout.2_level_array.std430.row_major_mat4 +dEQP-VK.ssbo.phys.layout.3_level_array.std140.column_major_mat4 +dEQP-VK.ssbo.phys.layout.3_level_array.std140.column_major_mat4_comp_access +dEQP-VK.ssbo.phys.layout.3_level_array.std140.column_major_mat4_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.3_level_array.std140.column_major_mat4_store_cols +dEQP-VK.ssbo.phys.layout.3_level_array.std140.column_major_mat4x3_comp_access +dEQP-VK.ssbo.phys.layout.3_level_array.std140.mat4 +dEQP-VK.ssbo.phys.layout.3_level_array.std140.mat4x3 +dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat2x4 +dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat3 +dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat3x4 +dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat4 +dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat4x2 +dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat4x3 +dEQP-VK.ssbo.phys.layout.3_level_array.std430.column_major_mat4 +dEQP-VK.ssbo.phys.layout.3_level_array.std430.column_major_mat4_comp_access +dEQP-VK.ssbo.phys.layout.3_level_array.std430.column_major_mat4_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.3_level_array.std430.column_major_mat4_store_cols +dEQP-VK.ssbo.phys.layout.3_level_array.std430.column_major_mat4x3 +dEQP-VK.ssbo.phys.layout.3_level_array.std430.column_major_mat4x3_comp_access +dEQP-VK.ssbo.phys.layout.3_level_array.std430.column_major_mat4x3_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.3_level_array.std430.mat4 +dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat2x4 +dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat3 +dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat3x4 +dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat4 +dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat4x2 +dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat4x3 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.column_major_mat4 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.column_major_mat4_comp_access +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.column_major_mat4_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.column_major_mat4_store_cols +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.column_major_mat4x3 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.mat4 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat2x4 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat3 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat3x4 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat4 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat4x2 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat4x3 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.column_major_mat4 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.column_major_mat4_comp_access +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.column_major_mat4_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.column_major_mat4_store_cols +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.column_major_mat4x3_comp_access +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.column_major_mat4x3_comp_access_store_cols 
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.column_major_mat4x3_store_cols +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.mat4 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.mat4x3 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat2x4 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat2x4_comp_access +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat2x4_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat2x4_store_cols +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat3 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat3x4 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat4 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat4x2 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat4x3 +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std140.column_major_mat4 +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std140.column_major_mat4_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std140.mat4 +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std140.row_major_mat3 +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std140.row_major_mat3_store_cols +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std140.row_major_mat3x4 +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std140.row_major_mat4 +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std140.row_major_mat4x3 +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.column_major_mat4 +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.column_major_mat4_comp_access +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.column_major_mat4_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.column_major_mat4_store_cols +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.mat4 +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat3 +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat3_comp_access +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat3_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat3_store_cols +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat3x4 +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat4 +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat4x3 +dEQP-VK.ssbo.phys.layout.random.16bit.all_per_block_buffers.45 +dEQP-VK.ssbo.phys.layout.random.16bit.all_shared_buffer.23 +dEQP-VK.ssbo.phys.layout.random.16bit.all_shared_buffer.36 +dEQP-VK.ssbo.phys.layout.random.16bit.all_shared_buffer.40 +dEQP-VK.ssbo.phys.layout.random.16bit.nested_structs_arrays.23 +dEQP-VK.ssbo.phys.layout.random.8bit.all_per_block_buffers.17 +dEQP-VK.ssbo.phys.layout.random.8bit.all_per_block_buffers.38 +dEQP-VK.ssbo.phys.layout.random.8bit.all_per_block_buffers.46 +dEQP-VK.ssbo.phys.layout.random.8bit.all_per_block_buffers.49 +dEQP-VK.ssbo.phys.layout.random.8bit.all_shared_buffer.19 +dEQP-VK.ssbo.phys.layout.random.8bit.nested_structs_arrays.17 +dEQP-VK.ssbo.phys.layout.random.8bit.nested_structs_arrays.20 +dEQP-VK.ssbo.phys.layout.random.8bit.nested_structs_arrays_instance_arrays.12 +dEQP-VK.ssbo.phys.layout.random.8bit.unsized_arrays.0 +dEQP-VK.ssbo.phys.layout.random.all_per_block_buffers.14 +dEQP-VK.ssbo.phys.layout.random.all_per_block_buffers.18 +dEQP-VK.ssbo.phys.layout.random.all_per_block_buffers.22 +dEQP-VK.ssbo.phys.layout.random.all_per_block_buffers.46 +dEQP-VK.ssbo.phys.layout.random.all_shared_buffer.20 
+dEQP-VK.ssbo.phys.layout.random.all_shared_buffer.3 +dEQP-VK.ssbo.phys.layout.random.all_shared_buffer.8 +dEQP-VK.ssbo.phys.layout.random.nested_structs_arrays.13 +dEQP-VK.ssbo.phys.layout.random.nested_structs_arrays_instance_arrays.23 +dEQP-VK.ssbo.phys.layout.random.nested_structs_arrays_instance_arrays.3 +dEQP-VK.ssbo.phys.layout.single_struct_array.per_block_buffer.std140_instance_array +dEQP-VK.ssbo.phys.layout.single_struct_array.per_block_buffer.std430_instance_array +dEQP-VK.ssbo.phys.layout.single_struct_array.per_block_buffer.std430_instance_array_comp_access +dEQP-VK.ssbo.phys.layout.single_struct_array.per_block_buffer.std430_instance_array_store_cols +dEQP-VK.ssbo.phys.layout.single_struct_array.single_buffer.std140_instance_array +dEQP-VK.ssbo.phys.layout.single_struct_array.single_buffer.std140_instance_array_comp_access +dEQP-VK.ssbo.phys.layout.single_struct_array.single_buffer.std140_instance_array_store_cols +dEQP-VK.ssbo.phys.layout.single_struct_array.single_buffer.std430_instance_array +dEQP-VK.ssbo.phys.layout.single_struct_array.single_buffer.std430_instance_array_comp_access +dEQP-VK.ssbo.phys.layout.single_struct_array.single_buffer.std430_instance_array_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.single_struct_array.single_buffer.std430_instance_array_store_cols +dEQP-VK.ssbo.phys.layout.unsized_struct_array.per_block_buffer.std140_instance_array +dEQP-VK.ssbo.phys.layout.unsized_struct_array.per_block_buffer.std140_instance_array_comp_access +dEQP-VK.ssbo.phys.layout.unsized_struct_array.per_block_buffer.std140_instance_array_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.unsized_struct_array.per_block_buffer.std140_instance_array_store_cols +dEQP-VK.ssbo.phys.layout.unsized_struct_array.per_block_buffer.std430_instance_array +dEQP-VK.ssbo.phys.layout.unsized_struct_array.per_block_buffer.std430_instance_array_comp_access +dEQP-VK.ssbo.phys.layout.unsized_struct_array.per_block_buffer.std430_instance_array_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.unsized_struct_array.per_block_buffer.std430_instance_array_store_cols +dEQP-VK.ssbo.phys.layout.unsized_struct_array.single_buffer.std140_instance_array +dEQP-VK.ssbo.phys.layout.unsized_struct_array.single_buffer.std140_instance_array_comp_access +dEQP-VK.ssbo.phys.layout.unsized_struct_array.single_buffer.std140_instance_array_store_cols +dEQP-VK.ssbo.phys.layout.unsized_struct_array.single_buffer.std430_instance_array +dEQP-VK.ssbo.phys.layout.unsized_struct_array.single_buffer.std430_instance_array_comp_access +dEQP-VK.ssbo.phys.layout.unsized_struct_array.single_buffer.std430_instance_array_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.unsized_struct_array.single_buffer.std430_instance_array_store_cols +dEQP-VK.synchronization.basic.timeline_semaphore.chain +dEQP-VK.synchronization2.basic.timeline_semaphore.chain +dEQP-VK.texture.explicit_lod.2d.sizes.128x128_linear_linear_mipmap_linear_clamp +dEQP-VK.texture.explicit_lod.2d.sizes.128x128_linear_linear_mipmap_linear_repeat +dEQP-VK.texture.explicit_lod.2d.sizes.128x128_nearest_linear_mipmap_linear_clamp +dEQP-VK.texture.explicit_lod.2d.sizes.128x128_nearest_linear_mipmap_linear_repeat + +# WSI tests are too flaky to be useful +dEQP-VK.image.swapchain_mutable.* +dEQP-VK.wsi.* + +# These require VK_KHR_shader_draw_parameters but they don't check for it +# (Seems to be fixed in some later release of CTS 1.3.7). 
+dEQP-VK.draw.dynamic_rendering.primary_cmd_buff.multi_draw.* + +# Skip tests for unsupported features so we can increase the number of tests +# that are actually useful in the limited CI time we have per job. +dEQP-VK.pipeline.monolithic.multisample_with_fragment_shading_rate.* +dEQP-VK.pipeline.monolithic.bind_point.graphics_raytracing.* +dEQP-VK.pipeline.monolithic.bind_point.compute_raytracing.* +dEQP-VK.pipeline.pipeline_library.* +dEQP-VK.pipeline.fast_linked_library.* +dEQP-VK.pipeline.shader_object* +dEQP-VK.protected_memory.* +dEQP-VK.transform_feedback.* +dEQP-VK.ray_tracing_pipeline.* +dEQP-VK.ray_query.* +dEQP-VK.fragment_shading_rate.* +dEQP-VK.mesh_shader.* +dEQP-VK.shader_object.rendering.* diff --git a/src/broadcom/ci/broadcom-rpi5-fails.txt b/src/broadcom/ci/broadcom-rpi5-fails.txt new file mode 100644 index 00000000000..3241bf827dc --- /dev/null +++ b/src/broadcom/ci/broadcom-rpi5-fails.txt @@ -0,0 +1,11 @@ +# New CTS failures in 1.3.8.0 +dEQP-VK.query_pool.performance_query.query_compute,Fail +dEQP-VK.query_pool.performance_query.query_compute_copy,Fail +dEQP-VK.query_pool.performance_query.query_graphic,Fail +dEQP-VK.query_pool.performance_query.query_graphic_copy,Fail +# This seems to be a Vulkan Loader issue. Can be fixed by compiling the loader from the Github repo. +dEQP-VK.api.get_device_proc_addr.non_enabled,Fail +# This is a bug in CTS: https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/5096 +dEQP-VK.api.command_buffers.many_indirect_draws_on_secondary,Fail + +dEQP-VK.api.info.vulkan1p2_limits_validation.khr_vertex_attribute_divisor,Fail diff --git a/src/broadcom/ci/broadcom-rpi5-flakes.txt b/src/broadcom/ci/broadcom-rpi5-flakes.txt new file mode 100644 index 00000000000..35a53c59666 --- /dev/null +++ b/src/broadcom/ci/broadcom-rpi5-flakes.txt @@ -0,0 +1,15 @@ +dEQP-VK.draw.dynamic_rendering.primary_cmd_buff.multiple_interpolation.structured.no_sample_decoration.4_samples +dEQP-VK.draw.dynamic_rendering.primary_cmd_buff.multiple_interpolation.structured.no_sample_decoration.4_samples +dEQP-VK.memory.pipeline_barrier.transfer_dst_uniform_texel_buffer.1024 +dEQP-VK.memory.pipeline_barrier.transfer_dst_uniform_texel_buffer.1024 +dEQP-VK.memory.pipeline_barrier.transfer_dst_uniform_texel_buffer.1048576 +dEQP-VK.memory.pipeline_barrier.transfer_dst_uniform_texel_buffer.8192 +dEQP-VK.memory_model.message_passing.ext.u32.coherent.atomic_fence.atomicwrite.queuefamily.payload_local.image.guard_local.image.frag +dEQP-VK.memory_model.message_passing.ext.u32.coherent.atomic_fence.atomicwrite.subgroup.payload_local.image.guard_local.buffer.frag +dEQP-VK.memory_model.message_passing.ext.u32.coherent.atomic_fence.atomicwrite.subgroup.payload_local.image.guard_local.buffer.frag +dEQP-VK.memory_model.message_passing.ext.u32.noncoherent.fence_fence.atomicwrite.device.payload_local.image.guard_local.buffer.frag +dEQP-VK.memory_model.message_passing.ext.u32.noncoherent.fence_fence.atomicwrite.queuefamily.payload_local.image.guard_local.image.frag +dEQP-VK.pipeline.monolithic.image.suballocation.sampling_type.combined.view_type.1d_array.format.r8_unorm.count_1.size.443x1_array_of_6 +dEQP-VK.spirv_assembly.type.scalar.i8.shift_left_logical_shift16_tesse +dEQP-VK.synchronization2.cross_instance.suballocated.write_blit_image_read_image_tess_eval.image_128x128_r32g32b32a32_sfloat_binary_semaphore_fence_fd +dEQP-VK.texture.shadow.cube.linear.less_d24_unorm_s8_uint diff --git a/src/broadcom/ci/broadcom-rpi5-skips.txt b/src/broadcom/ci/broadcom-rpi5-skips.txt new file mode 100644 index 
00000000000..17110a448da --- /dev/null +++ b/src/broadcom/ci/broadcom-rpi5-skips.txt @@ -0,0 +1,96 @@ +# Slow tests (> 1 minute to run) +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.multi.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.single.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.single.std140.vert_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.multi.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.single.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.nostore.multi.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.nostore.single.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.store.multi.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.store.single.std140.vert +dEQP-VK.spirv_assembly.instruction.graphics.spirv_ids_abuse.lots_ids_geom +dEQP-VK.ssbo.layout.random.8bit.all_per_block_buffers.5 +dEQP-VK.ssbo.layout.random.8bit.all_per_block_buffers.6 +dEQP-VK.ssbo.layout.random.8bit.scalar.78 +dEQP-VK.ssbo.layout.random.nested_structs_arrays.17 +dEQP-VK.ssbo.layout.random.scalar.75 +dEQP-VK.ssbo.phys.layout.3_level_array.scalar.row_major_mat3x4 +dEQP-VK.ssbo.phys.layout.3_level_array.scalar.row_major_mat3x4_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.3_level_array.scalar.row_major_mat4 +dEQP-VK.ssbo.phys.layout.3_level_array.scalar.row_major_mat4_comp_access +dEQP-VK.ssbo.phys.layout.3_level_array.scalar.row_major_mat4x3 +dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat3x4 +dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat3x4_comp_access +dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat4 +dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat4_comp_access +dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat4_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat4x3_comp_access +dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat3x4 +dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat4 +dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat4_comp_access +dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat4_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat4_store_cols +dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat4x3_comp_access +dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat4x3_store_cols +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.scalar.row_major_mat3x4 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.scalar.row_major_mat3x4_comp_access +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.scalar.row_major_mat3x4_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.scalar.row_major_mat3x4_store_cols +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.scalar.row_major_mat4 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.scalar.row_major_mat4_comp_access +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.scalar.row_major_mat4_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.scalar.row_major_mat4_store_cols +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.scalar.row_major_mat4x3 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.scalar.row_major_mat4x3_store_cols 
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat3x4 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat3x4_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat3x4_store_cols +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat4 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat4_comp_access +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat4_store_cols +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat4x3_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat3x4 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat3x4_comp_access +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat4 +dEQP-VK.ssbo.phys.layout.basic_unsized_array.scalar.row_major_mat4 +dEQP-VK.ssbo.phys.layout.basic_unsized_array.scalar.row_major_mat4_comp_access +dEQP-VK.ssbo.phys.layout.basic_unsized_array.scalar.row_major_mat4_store_cols +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std140.row_major_mat4 +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat4 +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat4_comp_access +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat4_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat4_store_cols +dEQP-VK.ssbo.phys.layout.random.16bit.scalar.78 +dEQP-VK.ssbo.phys.layout.random.8bit.all_per_block_buffers.46 +dEQP-VK.ssbo.phys.layout.random.8bit.nested_structs_arrays.17 +dEQP-VK.ssbo.phys.layout.random.8bit.nested_structs_arrays_instance_arrays.12 +dEQP-VK.ssbo.phys.layout.random.8bit.scalar.78 +dEQP-VK.ssbo.phys.layout.random.8bit.scalar.96 +dEQP-VK.ssbo.phys.layout.random.all_per_block_buffers.22 +dEQP-VK.ssbo.phys.layout.random.all_shared_buffer.3 +dEQP-VK.ssbo.phys.layout.random.scalar.3 +dEQP-VK.ssbo.phys.layout.random.scalar.93 + +# WSI tests are too flaky to be useful +dEQP-VK.image.swapchain_mutable.* +dEQP-VK.wsi.* + +# These require VK_KHR_shader_draw_parameters but they don't check for it +# (Seems to be fixed in some later release of CTS 1.3.7). +dEQP-VK.draw.dynamic_rendering.primary_cmd_buff.multi_draw.* + +# Skip tests for unsupported features so we can increase the number of tests +# that are actually useful in the limited CI time we have per job. 
+dEQP-VK.pipeline.monolithic.multisample_with_fragment_shading_rate.* +dEQP-VK.pipeline.monolithic.bind_point.graphics_raytracing.* +dEQP-VK.pipeline.monolithic.bind_point.compute_raytracing.* +dEQP-VK.pipeline.pipeline_library.* +dEQP-VK.pipeline.fast_linked_library.* +dEQP-VK.pipeline.shader_object* +dEQP-VK.protected_memory.* +dEQP-VK.transform_feedback.* +dEQP-VK.ray_tracing_pipeline.* +dEQP-VK.ray_query.* +dEQP-VK.fragment_shading_rate.* +dEQP-VK.mesh_shader.* +dEQP-VK.shader_object.rendering.* diff --git a/src/broadcom/ci/deqp-broadcom-rpi3-piglit-full.toml b/src/broadcom/ci/deqp-broadcom-rpi3-piglit-full.toml new file mode 100644 index 00000000000..a9649cbe516 --- /dev/null +++ b/src/broadcom/ci/deqp-broadcom-rpi3-piglit-full.toml @@ -0,0 +1,6 @@ +[[piglit]] +piglit_folder = "/piglit" +profile = "gpu" +process_isolation = true + [piglit.env] + PIGLIT_PLATFORM = "mixed_glx_egl" diff --git a/src/broadcom/ci/deqp-broadcom-rpi3.toml b/src/broadcom/ci/deqp-broadcom-rpi3.toml new file mode 100644 index 00000000000..1b7293b7c5c --- /dev/null +++ b/src/broadcom/ci/deqp-broadcom-rpi3.toml @@ -0,0 +1,61 @@ +[[deqp]] +deqp = "/deqp/modules/gles2/deqp-gles2" +caselists = ["/deqp/mustpass/gles2-main.txt"] +tests_per_group = 250 +deqp_args = [ + "--deqp-gl-config-name=rgba8888d24s8ms0", + "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", + "--deqp-surface-width=256", + "--deqp-visibility=hidden", +] +version_check = "GL ES 2.0.*git" +renderer_check = "VC4" + +[[deqp]] +deqp = "/deqp/external/openglcts/modules/glcts" +caselists = ["/deqp/mustpass/gles2-khr-main.txt"] +tests_per_group = 250 +deqp_args = [ + "--deqp-gl-config-name=rgba8888d24s8ms0", + "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", + "--deqp-surface-width=256", + "--deqp-visibility=hidden", +] + +# We are getting frequent GPU hangs with piglit, but still haven't identified +# the cause. So let's disable it for now. 
+# [[piglit]] +# piglit_folder = "/piglit" +# profile = "quick_gl" +# process_isolation = true +# [piglit.env] +# PIGLIT_PLATFORM = "mixed_glx_egl" + +[[piglit]] +piglit_folder = "/piglit" +profile = "quick_shader" +process_isolation = true + +# wayland +[[deqp]] +deqp = "/deqp/modules/egl/deqp-egl-wayland" +caselists = ["/deqp/mustpass/egl-main.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] +prefix = "wayland-" + +# x11 +[[deqp]] +deqp = "/deqp/modules/egl/deqp-egl-x11" +caselists = ["/deqp/mustpass/egl-main.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] +prefix = "x11-" diff --git a/src/broadcom/ci/deqp-broadcom-rpi4.toml b/src/broadcom/ci/deqp-broadcom-rpi4.toml new file mode 100644 index 00000000000..930077f31f2 --- /dev/null +++ b/src/broadcom/ci/deqp-broadcom-rpi4.toml @@ -0,0 +1,89 @@ +[[deqp]] +deqp = "/deqp/modules/gles31/deqp-gles31" +caselists = ["/deqp/mustpass/gles31-main.txt"] +deqp_args = [ + "--deqp-gl-config-name=rgba8888d24s8ms0", + "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", + "--deqp-surface-width=256", + "--deqp-visibility=hidden", +] +version_check = "GL ES 3.1.*git" +renderer_check = "V3D" + +[[deqp]] +deqp = "/deqp/modules/gles3/deqp-gles3" +caselists = ["/deqp/mustpass/gles3-main.txt"] +deqp_args = [ + "--deqp-gl-config-name=rgba8888d24s8ms0", + "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", + "--deqp-surface-width=256", + "--deqp-visibility=hidden", +] + +[[deqp]] +deqp = "/deqp/modules/gles2/deqp-gles2" +caselists = ["/deqp/mustpass/gles2-main.txt"] +deqp_args = [ + "--deqp-gl-config-name=rgba8888d24s8ms0", + "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", + "--deqp-surface-width=256", + "--deqp-visibility=hidden", +] + +[[deqp]] +deqp = "/deqp/external/openglcts/modules/glcts" +caselists = [ + "/deqp/mustpass/gles31-khr-main.txt", + "/deqp/mustpass/gles3-khr-main.txt", + "/deqp/mustpass/gles2-khr-main.txt", +] +deqp_args = [ + "--deqp-gl-config-name=rgba8888d24s8ms0", + "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", + "--deqp-surface-width=256", + "--deqp-visibility=hidden", +] + +[[deqp]] +deqp = "/deqp/external/openglcts/modules/glcts" +caselists = ["/deqp/mustpass/gl31-main.txt"] +deqp_args = [ + "--deqp-gl-config-name=rgba8888d24s8ms0", + "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", + "--deqp-surface-width=256", + "--deqp-visibility=hidden", +] + +[[piglit]] +piglit_folder = "/piglit" +profile = "gpu" +process_isolation = true + [piglit.env] + PIGLIT_PLATFORM = "mixed_glx_egl" + +# wayland +[[deqp]] +deqp = "/deqp/modules/egl/deqp-egl-wayland" +caselists = ["/deqp/mustpass/egl-main.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] +prefix = "wayland-" + +# x11 +[[deqp]] +deqp = "/deqp/modules/egl/deqp-egl-x11" +caselists = ["/deqp/mustpass/egl-main.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] +prefix = "x11-" diff --git a/src/broadcom/ci/deqp-v3d-rpi4-fails.txt b/src/broadcom/ci/deqp-v3d-rpi4-fails.txt deleted file mode 100644 
index 10ab688613d..00000000000 --- a/src/broadcom/ci/deqp-v3d-rpi4-fails.txt +++ /dev/null @@ -1,4 +0,0 @@ -dEQP-GLES31.functional.geometry_shading.query.primitives_generated_amplification,Fail -dEQP-GLES31.functional.geometry_shading.query.primitives_generated_instanced,Fail -dEQP-GLES31.functional.geometry_shading.query.primitives_generated_no_amplification,Fail -dEQP-GLES31.functional.geometry_shading.query.primitives_generated_partial_primitives,Fail diff --git a/src/broadcom/ci/deqp-v3d-rpi4-flakes.txt b/src/broadcom/ci/deqp-v3d-rpi4-flakes.txt deleted file mode 100644 index 673cc5b0941..00000000000 --- a/src/broadcom/ci/deqp-v3d-rpi4-flakes.txt +++ /dev/null @@ -1,3 +0,0 @@ -dEQP-GLES31.functional.compute.shared_var.basic_type.ivec3_highp -dEQP-GLES31.functional.ssbo.layout.single_basic_type.packed.highp_mat2 -KHR-GLES31.core.shader_image_load_store.basic-glsl-earlyFragTests diff --git a/src/broadcom/ci/deqp-v3d-rpi4-gles.toml b/src/broadcom/ci/deqp-v3d-rpi4-gles.toml deleted file mode 100644 index 32a569344d2..00000000000 --- a/src/broadcom/ci/deqp-v3d-rpi4-gles.toml +++ /dev/null @@ -1,47 +0,0 @@ -[[deqp]] -deqp = "/deqp/modules/gles31/deqp-gles31" -caselists = [ "/deqp/mustpass/gles31-master.txt" ] -deqp_args = [ - "--deqp-gl-config-name=rgba8888d24s8ms0", - "--deqp-surface-height=256", - "--deqp-surface-type=pbuffer", - "--deqp-surface-width=256", - "--deqp-visibility=hidden", -] - -[[deqp]] -deqp = "/deqp/modules/gles3/deqp-gles3" -caselists = [ "/deqp/mustpass/gles3-master.txt" ] -deqp_args = [ - "--deqp-gl-config-name=rgba8888d24s8ms0", - "--deqp-surface-height=256", - "--deqp-surface-type=pbuffer", - "--deqp-surface-width=256", - "--deqp-visibility=hidden", -] - -[[deqp]] -deqp = "/deqp/modules/gles2/deqp-gles2" -caselists = [ "/deqp/mustpass/gles2-master.txt" ] -deqp_args = [ - "--deqp-gl-config-name=rgba8888d24s8ms0", - "--deqp-surface-height=256", - "--deqp-surface-type=pbuffer", - "--deqp-surface-width=256", - "--deqp-visibility=hidden", -] - -[[deqp]] -deqp = "/deqp/external/openglcts/modules/glcts" -caselists = [ - "/deqp/mustpass/gles31-khr-master.txt", - "/deqp/mustpass/gles3-khr-master.txt", - "/deqp/mustpass/gles2-khr-master.txt", -] -deqp_args = [ - "--deqp-gl-config-name=rgba8888d24s8ms0", - "--deqp-surface-height=256", - "--deqp-surface-type=pbuffer", - "--deqp-surface-width=256", - "--deqp-visibility=hidden", -] diff --git a/src/broadcom/ci/deqp-v3dv-rpi4-fails.txt b/src/broadcom/ci/deqp-v3dv-rpi4-fails.txt deleted file mode 100644 index 7898bc2a2d1..00000000000 --- a/src/broadcom/ci/deqp-v3dv-rpi4-fails.txt +++ /dev/null @@ -1,32 +0,0 @@ -# This seems to fail due to the test error threshold being insufficient -dEQP-VK.geometry.input.basic_primitive.line_strip_adjacency,Fail - -# CTS bug; fix submitted -dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_single_buffer_geom,Fail - -# Multiview doesn't work with points -dEQP-VK.multiview.point_size.15,Fail -dEQP-VK.multiview.point_size.8,Fail -dEQP-VK.multiview.point_size.1_2_4_8,Fail -dEQP-VK.multiview.point_size.15_15_15_15,Fail -dEQP-VK.multiview.point_size.8_1_1_8,Fail -dEQP-VK.multiview.point_size.5_10_5_10,Fail -dEQP-VK.multiview.point_size.1_2_4_8_16_32,Fail -dEQP-VK.multiview.point_size.max_multi_view_view_count,Fail - -dEQP-VK.draw.instanced.draw_vk_primitive_topology_point_list_attrib_divisor_1_multiview,Fail -dEQP-VK.draw.instanced.draw_vk_primitive_topology_point_list_attrib_divisor_2_multiview,Fail 
-dEQP-VK.draw.instanced.draw_vk_primitive_topology_point_list_attrib_divisor_4_multiview,Fail -dEQP-VK.draw.instanced.draw_vk_primitive_topology_point_list_attrib_divisor_20_multiview,Fail -dEQP-VK.draw.instanced.draw_indexed_vk_primitive_topology_point_list_attrib_divisor_1_multiview,Fail -dEQP-VK.draw.instanced.draw_indexed_vk_primitive_topology_point_list_attrib_divisor_2_multiview,Fail -dEQP-VK.draw.instanced.draw_indexed_vk_primitive_topology_point_list_attrib_divisor_4_multiview,Fail -dEQP-VK.draw.instanced.draw_indexed_vk_primitive_topology_point_list_attrib_divisor_20_multiview,Fail -dEQP-VK.draw.instanced.draw_indirect_vk_primitive_topology_point_list_attrib_divisor_1_multiview,Fail -dEQP-VK.draw.instanced.draw_indirect_vk_primitive_topology_point_list_attrib_divisor_2_multiview,Fail -dEQP-VK.draw.instanced.draw_indirect_vk_primitive_topology_point_list_attrib_divisor_4_multiview,Fail -dEQP-VK.draw.instanced.draw_indirect_vk_primitive_topology_point_list_attrib_divisor_20_multiview,Fail -dEQP-VK.draw.instanced.draw_indexed_indirect_vk_primitive_topology_point_list_attrib_divisor_1_multiview,Fail -dEQP-VK.draw.instanced.draw_indexed_indirect_vk_primitive_topology_point_list_attrib_divisor_2_multiview,Fail -dEQP-VK.draw.instanced.draw_indexed_indirect_vk_primitive_topology_point_list_attrib_divisor_4_multiview,Fail -dEQP-VK.draw.instanced.draw_indexed_indirect_vk_primitive_topology_point_list_attrib_divisor_20_multiview,Fail diff --git a/src/broadcom/ci/deqp-v3dv-rpi4-flakes.txt b/src/broadcom/ci/deqp-v3dv-rpi4-flakes.txt deleted file mode 100644 index 0d22f002dbd..00000000000 --- a/src/broadcom/ci/deqp-v3dv-rpi4-flakes.txt +++ /dev/null @@ -1,5 +0,0 @@ -dEQP-VK.api.external.fence.opaque_fd.reset_permanent -dEQP-VK.api.external.fence.opaque_fd.reset_temporary -dEQP-VK.api.external.fence.opaque_fd.signal_export_import_wait_permanent -dEQP-VK.ssbo.layout.instance_array_basic_type.std430.uvec4 -dEQP-VK.wsi.display.get_display_plane_capabilities diff --git a/src/broadcom/ci/deqp-v3dv-rpi4-skips.txt b/src/broadcom/ci/deqp-v3dv-rpi4-skips.txt deleted file mode 100644 index bf6a82c19bf..00000000000 --- a/src/broadcom/ci/deqp-v3dv-rpi4-skips.txt +++ /dev/null @@ -1,21 +0,0 @@ -# Broadcom waivers -dEQP-VK.draw.inverted_depth_ranges.nodepthclamp_deltazero -dEQP-VK.rasterization.depth_bias.d32_sfloat - -# Timeout tests (> 1 minute to run) -dEQP-VK.api.object_management.max_concurrent.query_pool -dEQP-VK.graphicsfuzz.spv-stable-maze-flatten-copy-composite -dEQP-VK.memory.mapping.dedicated_alloc.buffer.full.variable.implicit_unmap -dEQP-VK.memory.mapping.dedicated_alloc.image.full.variable.implicit_unmap -dEQP-VK.memory.mapping.suballocation.full.variable.implicit_unmap -dEQP-VK.spirv_assembly.instruction.graphics.spirv_ids_abuse.lots_ids_geom -dEQP-VK.spirv_assembly.instruction.graphics.spirv_ids_abuse.lots_ids_vert -dEQP-VK.ssbo.layout.random.all_shared_buffer.5 -dEQP-VK.ssbo.layout.random.arrays_of_arrays.13 -dEQP-VK.ssbo.layout.random.nested_structs_arrays.0 -dEQP-VK.texture.explicit_lod.2d.sizes.128x128_linear_linear_mipmap_linear_clamp -dEQP-VK.texture.explicit_lod.2d.sizes.128x128_linear_linear_mipmap_linear_repeat -dEQP-VK.texture.explicit_lod.2d.sizes.128x128_nearest_linear_mipmap_linear_clamp -dEQP-VK.texture.explicit_lod.2d.sizes.128x128_nearest_linear_mipmap_linear_repeat -dEQP-VK.ubo.random.all_out_of_order_offsets.45 -dEQP-VK.ubo.random.all_shared_buffer.48 diff --git a/src/broadcom/ci/deqp-vc4-rpi3-fails.txt b/src/broadcom/ci/deqp-vc4-rpi3-fails.txt deleted file mode 100644 index 
d0722563e60..00000000000 --- a/src/broadcom/ci/deqp-vc4-rpi3-fails.txt +++ /dev/null @@ -1,420 +0,0 @@ -KHR-GLES2.core.internalformat.copy_tex_image.alpha8_oes,Fail -KHR-GLES2.core.internalformat.copy_tex_image.luminance4_alpha4_oes,Fail -KHR-GLES2.core.internalformat.copy_tex_image.luminance8_alpha8_oes,Fail -KHR-GLES2.core.internalformat.copy_tex_image.luminance8_oes,Fail -KHR-GLES2.core.internalformat.copy_tex_image.rgb565,Fail -KHR-GLES2.core.internalformat.copy_tex_image.rgb5_a1,Fail -KHR-GLES2.core.internalformat.copy_tex_image.rgba4,Fail -KHR-GLES2.core.internalformat.texture2d.depth_component_unsigned_int_depth_component16,Fail -KHR-GLES2.core.internalformat.texture2d.depth_component_unsigned_int_depth_component24,Fail -KHR-GLES2.core.internalformat.texture2d.depth_component_unsigned_short_depth_component16,Fail -KHR-GLES2.texture_3d.copy_sub_image.negative,Fail -KHR-GLES2.texture_3d.copy_sub_image.rgba,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_clamp_repeat,Fail 
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_clamp_repeat,Fail 
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_clamp_mirror,Fail 
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_clamp_clamp,Fail 
-KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_mirror_repeat,Fail 
-KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_repeat_clamp,Fail 
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_mirror_repeat,Fail 
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_mirror_mirror,Fail 
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_repeat_mirror,Fail 
-KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.negative,Fail -KHR-GLES2.texture_3d.filtering.formats.rgba8_linear,Fail -KHR-GLES2.texture_3d.filtering.formats.rgba8_linear_mipmap_linear,Fail -KHR-GLES2.texture_3d.filtering.formats.rgba8_linear_mipmap_nearest,Fail -KHR-GLES2.texture_3d.filtering.formats.rgba8_nearest,Fail -KHR-GLES2.texture_3d.filtering.formats.rgba8_nearest_mipmap_linear,Fail -KHR-GLES2.texture_3d.filtering.formats.rgba8_nearest_mipmap_nearest,Fail -KHR-GLES2.texture_3d.filtering.sizes.128x32x64_linear,Fail -KHR-GLES2.texture_3d.filtering.sizes.128x32x64_linear_mipmap_linear,Fail -KHR-GLES2.texture_3d.filtering.sizes.128x32x64_linear_mipmap_nearest,Fail -KHR-GLES2.texture_3d.filtering.sizes.128x32x64_nearest,Fail -KHR-GLES2.texture_3d.filtering.sizes.128x32x64_nearest_mipmap_linear,Fail -KHR-GLES2.texture_3d.filtering.sizes.128x32x64_nearest_mipmap_nearest,Fail -KHR-GLES2.texture_3d.filtering.sizes.32x64x16_linear,Fail -KHR-GLES2.texture_3d.filtering.sizes.32x64x16_linear_mipmap_linear,Fail -KHR-GLES2.texture_3d.filtering.sizes.32x64x16_linear_mipmap_nearest,Fail -KHR-GLES2.texture_3d.filtering.sizes.32x64x16_nearest,Fail -KHR-GLES2.texture_3d.filtering.sizes.32x64x16_nearest_mipmap_linear,Fail -KHR-GLES2.texture_3d.filtering.sizes.32x64x16_nearest_mipmap_nearest,Fail -KHR-GLES2.texture_3d.filtering.sizes.3x7x5_linear,Fail -KHR-GLES2.texture_3d.filtering.sizes.3x7x5_linear_mipmap_nearest,Fail -KHR-GLES2.texture_3d.filtering.sizes.3x7x5_nearest,Fail -KHR-GLES2.texture_3d.filtering.sizes.3x7x5_nearest_mipmap_linear,Fail -KHR-GLES2.texture_3d.filtering.sizes.3x7x5_nearest_mipmap_nearest,Fail -KHR-GLES2.texture_3d.filtering.sizes.4x8x8_linear,Fail -KHR-GLES2.texture_3d.filtering.sizes.4x8x8_linear_mipmap_nearest,Fail -KHR-GLES2.texture_3d.filtering.sizes.4x8x8_nearest,Fail -KHR-GLES2.texture_3d.filtering.sizes.4x8x8_nearest_mipmap_linear,Fail -KHR-GLES2.texture_3d.filtering.sizes.4x8x8_nearest_mipmap_nearest,Fail -KHR-GLES2.texture_3d.filtering.sizes.63x63x63_linear,Fail -KHR-GLES2.texture_3d.filtering.sizes.63x63x63_linear_mipmap_linear,Fail -KHR-GLES2.texture_3d.filtering.sizes.63x63x63_linear_mipmap_nearest,Fail -KHR-GLES2.texture_3d.filtering.sizes.63x63x63_nearest,Fail -KHR-GLES2.texture_3d.filtering.sizes.63x63x63_nearest_mipmap_linear,Fail -KHR-GLES2.texture_3d.filtering.sizes.63x63x63_nearest_mipmap_nearest,Fail -KHR-GLES2.texture_3d.framebuffer_texture.rgba,Fail -KHR-GLES2.texture_3d.sub_image.rgba8,Fail -dEQP-EGL.functional.color_clears.multi_context.gles2.rgb888_pbuffer,Crash -dEQP-EGL.functional.color_clears.multi_context.gles2.rgb888_window,Crash -dEQP-EGL.functional.color_clears.multi_context.gles2.rgba8888_pbuffer,Crash -dEQP-EGL.functional.color_clears.multi_context.gles2.rgba8888_window,Crash -dEQP-EGL.functional.color_clears.multi_thread.gles2.rgb888_pbuffer,Crash -dEQP-EGL.functional.color_clears.multi_thread.gles2.rgb888_window,Crash -dEQP-EGL.functional.color_clears.multi_thread.gles2.rgba8888_pbuffer,Crash -dEQP-EGL.functional.color_clears.multi_thread.gles2.rgba8888_window,Crash -dEQP-EGL.functional.color_clears.single_context.gles2.rgb888_pbuffer,Crash -dEQP-EGL.functional.color_clears.single_context.gles2.rgb888_window,Crash -dEQP-EGL.functional.color_clears.single_context.gles2.rgba8888_pbuffer,Crash -dEQP-EGL.functional.color_clears.single_context.gles2.rgba8888_window,Crash -dEQP-EGL.functional.create_context.no_config,Fail 
-dEQP-EGL.functional.render.multi_context.gles2.rgb888_pbuffer,Crash -dEQP-EGL.functional.render.multi_context.gles2.rgb888_window,Crash -dEQP-EGL.functional.render.multi_context.gles2.rgba8888_pbuffer,Crash -dEQP-EGL.functional.render.multi_context.gles2.rgba8888_window,Crash -dEQP-EGL.functional.render.multi_thread.gles2.rgb888_pbuffer,Crash -dEQP-EGL.functional.render.multi_thread.gles2.rgb888_window,Crash -dEQP-EGL.functional.render.multi_thread.gles2.rgba8888_pbuffer,Crash -dEQP-EGL.functional.render.multi_thread.gles2.rgba8888_window,Crash -dEQP-EGL.functional.render.single_context.gles2.rgb888_pbuffer,Crash -dEQP-EGL.functional.render.single_context.gles2.rgb888_window,Crash -dEQP-EGL.functional.render.single_context.gles2.rgba8888_pbuffer,Crash -dEQP-EGL.functional.render.single_context.gles2.rgba8888_window,Crash -dEQP-GLES2.functional.clipping.line.wide_line_clip_viewport_center,Fail -dEQP-GLES2.functional.clipping.line.wide_line_clip_viewport_corner,Fail -dEQP-GLES2.functional.depth_stencil_clear.depth_stencil_masked,Fail -dEQP-GLES2.functional.draw.draw_arrays.line_loop.multiple_attributes,Fail -dEQP-GLES2.functional.draw.draw_arrays.line_loop.single_attribute,Fail -dEQP-GLES2.functional.fbo.render.texsubimage.after_render_tex2d_rgba,Fail -dEQP-GLES2.functional.fbo.render.texsubimage.between_render_tex2d_rgba,Fail -dEQP-GLES2.functional.negative_api.shader.uniform_matrixfv_invalid_transpose,Fail -dEQP-GLES2.functional.negative_api.texture.generatemipmap_zero_level_array_compressed,Fail -dEQP-GLES2.functional.negative_api.vertex_array.vertex_attrib,Fail -dEQP-GLES2.functional.negative_api.vertex_array.vertex_attribv,Fail -dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_linear_linear_mirror_rgba8888,Fail -dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_linear_linear_repeat_rgba8888,Fail -dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_nearest_linear_mirror_rgba8888,Fail -dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_nearest_linear_repeat_rgba8888,Fail -dEQP-GLES2.functional.texture.mipmap.2d.basic.linear_linear_repeat_non_square,Fail -dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_clamp_non_square,Fail -dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_mirror_non_square,Fail -dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_repeat_non_square,Fail -dEQP-GLES2.functional.texture.specification.basic_copytexsubimage2d.2d_rgba,Fail -dEQP-GLES2.functional.texture.specification.basic_copytexsubimage2d.cube_rgba,Fail -dEQP-GLES2.functional.texture.wrap.clamp_clamp_nearest_npot_etc1,Fail diff --git a/src/broadcom/ci/deqp-vc4-rpi3-flakes.txt b/src/broadcom/ci/deqp-vc4-rpi3-flakes.txt deleted file mode 100644 index 497be959096..00000000000 --- a/src/broadcom/ci/deqp-vc4-rpi3-flakes.txt +++ /dev/null @@ -1,30 +0,0 @@ -dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_neg_x_and_pos_x_and_neg_x_neg_y_neg_z -dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_neg_x_and_pos_x_and_pos_y_pos_z -dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_neg_x_and_pos_x_neg_y_pos_z_and_neg_x_pos_y_neg_z -dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_pos_x_and_neg_x_neg_y_pos_z_and_neg_x_pos_y_neg_z -dEQP-GLES2.functional.draw.random.51 -dEQP-GLES2.functional.fragment_ops.blend.rgb_func_alpha_func.src.one_minus_src_alpha_constant_color -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec4_direct_write_dynamic_loop_subscript_read_vertex 
-dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.basic_mediump_int_vertex -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.conditional_continue_vertex -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.function_call_inout_vertex -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.function_call_return_vertex -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.nested_sequence_vertex -dEQP-GLES2.functional.shaders.loops.while_constant_iterations.select_iteration_count_vertex -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.function_call_return_vertex -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.infinite_with_conditional_break_vertex -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.post_increment_vertex -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.single_iteration_vertex -dEQP-GLES2.functional.shaders.operator.unary_operator.pre_decrement_result.mediump_vec3_fragment -dEQP-GLES2.functional.shaders.random.exponential.fragment.51 -dEQP-GLES2.functional.shaders.random.texture.fragment.129 -dEQP-GLES2.functional.shaders.return.output_write_in_func_never_vertex -dEQP-GLES2.functional.texture.filtering.2d.linear_linear_clamp_rgb888_pot -dEQP-GLES2.functional.texture.filtering.cube.linear_mipmap_linear_nearest_mirror_rgba8888 -dEQP-GLES2.functional.texture.filtering.cube.nearest_linear_mirror_rgba8888_pot -dEQP-GLES2.functional.texture.filtering.cube.nearest_mipmap_linear_linear_clamp_rgba8888 -dEQP-GLES2.functional.texture.filtering.cube.nearest_mipmap_linear_nearest_repeat_l8 -dEQP-GLES2.functional.texture.filtering.cube.nearest_mipmap_nearest_linear_clamp_rgba8888 -dEQP-GLES2.functional.texture.filtering.cube.nearest_mipmap_nearest_linear_mirror_rgba8888 -dEQP-GLES2.functional.texture.mipmap.cube.generate.rgb565_fastest -dEQP-GLES2.functional.texture.size.cube.256x256_rgb888 diff --git a/src/broadcom/ci/deqp-vc4-rpi3-gles.toml b/src/broadcom/ci/deqp-vc4-rpi3-gles.toml deleted file mode 100644 index 4ca3ab03231..00000000000 --- a/src/broadcom/ci/deqp-vc4-rpi3-gles.toml +++ /dev/null @@ -1,23 +0,0 @@ -[[deqp]] -deqp = "/deqp/modules/gles2/deqp-gles2" -caselists = [ "/deqp/mustpass/gles2-master.txt" ] -tests_per_group = 250 -deqp_args = [ - "--deqp-gl-config-name=rgba8888d24s8ms0", - "--deqp-surface-height=256", - "--deqp-surface-type=pbuffer", - "--deqp-surface-width=256", - "--deqp-visibility=hidden", -] - -[[deqp]] -deqp = "/deqp/external/openglcts/modules/glcts" -caselists = [ "/deqp/mustpass/gles2-khr-master.txt" ] -tests_per_group = 250 -deqp_args = [ - "--deqp-gl-config-name=rgba8888d24s8ms0", - "--deqp-surface-height=256", - "--deqp-surface-type=pbuffer", - "--deqp-surface-width=256", - "--deqp-visibility=hidden", -] diff --git a/src/broadcom/ci/gitlab-ci-inc.yml b/src/broadcom/ci/gitlab-ci-inc.yml new file mode 100644 index 00000000000..4a106db4af2 --- /dev/null +++ b/src/broadcom/ci/gitlab-ci-inc.yml @@ -0,0 +1,156 @@ +.broadcom-common-rules: + rules: + - changes: &broadcom_file_list + - src/broadcom/meson.build + - src/broadcom/ci/gitlab-ci.yml + - src/broadcom/ci/gitlab-ci-inc.yml + - src/broadcom/ci/deqp-$DEQP_SUITE.toml + - src/broadcom/ci/$GPU_VERSION-fails.txt + - src/broadcom/ci/$GPU_VERSION-flakes.txt + - src/broadcom/ci/$GPU_VERSION-skips.txt + - src/broadcom/ci/$PIGLIT_TRACES_FILE + - src/broadcom/cle/**/* + - src/broadcom/clif/**/* + - src/broadcom/common/**/* + - src/broadcom/compiler/**/* + - src/broadcom/drm-shim/**/* + - src/broadcom/qpu/**/* + - 
src/broadcom/simulator/**/* + when: on_success + +.broadcom-common-manual-rules: + rules: + - changes: *broadcom_file_list + when: manual + +.vc4-rules: + stage: broadcom + rules: + - if: $FORCE_KERNEL_TAG != null + when: never + - !reference [.test, rules] + - !reference [.igalia-farm-rules, rules] + - !reference [.gl-rules, rules] + - !reference [.broadcom-common-rules, rules] + - changes: &vc4_file_list + - src/gallium/drivers/vc4/**/* + - src/gallium/winsys/vc4/**/* + - src/gallium/auxiliary/renderonly/**/* + - src/gallium/winsys/kmsro/**/* + when: on_success + +.vc4-manual-rules: + stage: broadcom + rules: + - !reference [.test, rules] + - !reference [.igalia-farm-manual-rules, rules] + - !reference [.gl-manual-rules, rules] + - !reference [.broadcom-common-manual-rules, rules] + - changes: *vc4_file_list + when: manual + +.v3d-rules: + stage: broadcom + rules: + - if: $FORCE_KERNEL_TAG != null + when: never + - !reference [.test, rules] + - !reference [.igalia-farm-rules, rules] + - !reference [.gl-rules, rules] + - !reference [.broadcom-common-rules, rules] + - changes: &v3d_file_list + - src/gallium/drivers/v3d/**/* + - src/gallium/winsys/v3d/**/* + - src/gallium/auxiliary/renderonly/**/* + - src/gallium/winsys/kmsro/**/* + when: on_success + +.v3d-manual-rules: + stage: broadcom + retry: !reference [.scheduled_pipeline-rules, retry] + rules: + - !reference [.test, rules] + - !reference [.igalia-farm-manual-rules, rules] + - !reference [.gl-manual-rules, rules] + - !reference [.broadcom-common-manual-rules, rules] + - changes: + *v3d_file_list + when: manual + +.v3dv-rules: + stage: broadcom + rules: + - if: $FORCE_KERNEL_TAG != null + when: never + - !reference [.test, rules] + - !reference [.igalia-farm-rules, rules] + - !reference [.vulkan-rules, rules] + - !reference [.broadcom-common-rules, rules] + - changes: &v3dv_file_list + - src/broadcom/vulkan/**/* + when: on_success + +.v3dv-manual-rules: + stage: broadcom + rules: + - !reference [.test, rules] + - !reference [.igalia-farm-manual-rules, rules] + - !reference [.vulkan-manual-rules, rules] + - !reference [.broadcom-common-manual-rules, rules] + - changes: *v3dv_file_list + when: manual + +# 8 devices (2023-12-18) +.igalia-bcm2837-rpi-3-b:arm64: + variables: + DEVICE_TYPE: rpi3 + GPU_VERSION: broadcom-rpi3 + script: + - ./install/bare-metal/poe-powered.sh + tags: + - igalia-rpi3 + +# 21 devices (2023-12-18) +.igalia-bcm2711-rpi-4:arm64: + variables: + DEVICE_TYPE: rpi4 + GPU_VERSION: broadcom-rpi4 + VK_DRIVER: broadcom + script: + - ./install/bare-metal/poe-powered.sh + tags: + - igalia-rpi4 + +# 1 device (2024-01-02) +.igalia-bcm2712-rpi-5:arm64: + variables: + DEVICE_TYPE: rpi5 + GPU_VERSION: broadcom-rpi5 + VK_DRIVER: broadcom + script: + - ./install/bare-metal/poe-powered.sh + tags: + - igalia-rpi5 + +.broadcom-test: + script: + - ./install/bare-metal/poe-powered.sh + variables: + HWCI_TEST_SCRIPT: "/install/deqp-runner.sh" + FLAKES_CHANNEL: "#videocore-ci" + FARM: igalia + timeout: 20m + +.broadcom-test:arm64: + extends: + - .broadcom-test + - .baremetal-test-arm64 + variables: + BM_BOOTFS: /boot/raspberrypi_arm64 + +.broadcom-test:arm32: + extends: + - .broadcom-test + - .baremetal-test-arm32 + variables: + BM_BOOTFS: /boot/raspberrypi_armhf diff --git a/src/broadcom/ci/gitlab-ci.yml b/src/broadcom/ci/gitlab-ci.yml index 165f9959936..32ef88554fc 100644 --- a/src/broadcom/ci/gitlab-ci.yml +++ b/src/broadcom/ci/gitlab-ci.yml @@ -1,141 +1,113 @@ -.vc4-rpi3-test:armhf: +include: + - local: 
'src/broadcom/ci/gitlab-ci-inc.yml' + +vc4-rpi3-gl:arm32: extends: - - .baremetal-test-armhf + - .igalia-bcm2837-rpi-3-b:arm64 + - .broadcom-test:arm32 - .vc4-rules - - .use-debian/arm_test + parallel: 4 variables: - BM_BOOTFS: /boot/raspberrypi_armhf - BM_ROOTFS: /rootfs-armhf - DEQP_EXPECTED_RENDERER: VC4 - GPU_VERSION: vc4-rpi3 - HWCI_KERNEL_MODULES: vc4 - FLAKES_CHANNEL: "#videocore-ci" - script: - - ./install/bare-metal/poe-powered.sh - needs: - - job: debian/arm_test - artifacts: false - - debian-armhf - tags: - - igalia-rpi3 + DEQP_SUITE: broadcom-rpi3 + HWCI_START_WESTON: 1 -vc4-rpi3-gles:armhf: +vc4-rpi3-gl-piglit-full:arm32: extends: - - .vc4-rpi3-test:armhf - parallel: 2 + - vc4-rpi3-gl:arm32 + - .vc4-manual-rules + tags: + - igalia-rpi3 + - igalia-fullrun variables: - HWCI_TEST_SCRIPT: "/install/deqp-runner.sh" - DEQP_SUITE: vc4-rpi3-gles - DEQP_VER: gles2 + DEQP_SUITE: broadcom-rpi3-piglit-full -vc4-rpi3-egl:armhf: - extends: - - .vc4-rpi3-test:armhf - variables: - HWCI_TEST_SCRIPT: "/install/deqp-runner.sh" - HWCI_START_XORG: 1 - DEQP_RUNNER_OPTIONS: "--tests-per-group 250" - DEQP_VER: egl -.vc4-rpi3-piglit:armhf: +v3d-rpi4-gl:arm64: extends: - - .piglit-test - - .vc4-rpi3-test:armhf - - .test-manual + - .igalia-bcm2711-rpi-4:arm64 + - .broadcom-test:arm64 + - .v3d-rules + parallel: 8 variables: - HWCI_TEST_SCRIPT: "/install/piglit/piglit-runner.sh" - BM_POE_TIMEOUT: 180 - HWCI_START_XORG: 1 - PIGLIT_PLATFORM: mixed_glx_egl + HWCI_START_WESTON: 1 + DEQP_SUITE: broadcom-rpi4 + DEQP_FRACTION: 2 -vc4-rpi3-piglit-quick_gl:armhf: +v3d-rpi4-gl-full:arm64: extends: - - .vc4-rpi3-piglit:armhf - parallel: 4 + - v3d-rpi4-gl:arm64 + - .v3d-manual-rules + tags: + - igalia-rpi4 + - igalia-fullrun + parallel: 6 + timeout: 45m variables: - FDO_CI_CONCURRENT: 1 - PIGLIT_PROFILES: quick_gl + TEST_PHASE_TIMEOUT: 40 + DEQP_FRACTION: 1 -vc4-rpi3-piglit-quick_shader:armhf: - extends: - - .vc4-rpi3-piglit:armhf - parallel: 2 - variables: - FDO_CI_CONCURRENT: 2 - PIGLIT_PROFILES: quick_shader -.v3d-rpi4-test:armhf: +v3d-rpi4-traces:arm64: extends: - - .baremetal-test-armhf + - .igalia-bcm2711-rpi-4:arm64 + - .piglit-traces-test + - .broadcom-test:arm64 - .v3d-rules - - .use-debian/arm_test variables: - HWCI_TEST_SCRIPT: "/install/deqp-runner.sh" - BM_BOOTFS: /boot/raspberrypi_armhf - BM_POE_TIMEOUT: 300 - BM_ROOTFS: /rootfs-armhf - DEQP_EXPECTED_RENDERER: V3D - FLAKES_CHANNEL: "#videocore-ci" - GPU_VERSION: v3d-rpi4 - HWCI_KERNEL_MODULES: v3d,vc4 - script: - - ./install/bare-metal/poe-powered.sh - needs: - - debian/arm_test - - debian-armhf - tags: - - igalia-rpi4 + HWCI_TEST_SCRIPT: "/install/piglit/piglit-traces.sh" + PIGLIT_TRACES_FILE: traces-broadcom.yml + PIGLIT_REPLAY_DEVICE_NAME: "broadcom-rpi4" + PIGLIT_RESULTS: "broadcom-rpi4-replay" -v3d-rpi4-gles:armhf: - extends: - - .v3d-rpi4-test:armhf - parallel: 8 - variables: - DEQP_SUITE: v3d-rpi4-gles - DEQP_VER: gles31 -v3d-rpi4-egl:armhf: +v3dv-rpi4-vk:arm64: extends: - - .v3d-rpi4-test:armhf + - .igalia-bcm2711-rpi-4:arm64 + - .broadcom-test:arm64 + - .v3dv-rules + parallel: 10 variables: - HWCI_START_XORG: 1 - DEQP_VER: egl + HWCI_TEST_SCRIPT: "/install/deqp-runner.sh" + HWCI_START_WESTON: 1 + DEQP_EXPECTED_RENDERER: "V3D.4.2" + DEQP_FRACTION: 3 + DEQP_VER: vk + FLAKES_CHANNEL: "#videocore-ci" -v3d-rpi4-piglit:armhf: +v3dv-rpi4-vk-full:arm64: extends: - - .piglit-test - - .v3d-rpi4-test:armhf - parallel: 4 + - v3dv-rpi4-vk:arm64 + - .v3dv-manual-rules + tags: + - igalia-rpi4 + - igalia-fullrun + parallel: 6 + timeout: 2h variables: - 
HWCI_TEST_SCRIPT: "/install/piglit/piglit-runner.sh" - HWCI_START_XORG: 1 - PIGLIT_PLATFORM: mixed_glx_egl - PIGLIT_PROFILES: all + # Keep 10 minutes for boot + setup + uploading the artifacts at the end + TEST_PHASE_TIMEOUT: 110 + DEQP_FRACTION: 1 -v3dv-rpi4-vk:arm64: + +.v3dv-rpi5-vk:arm64: extends: - - .baremetal-test - - .use-debian/arm_test + - .igalia-bcm2712-rpi-5:arm64 + - .broadcom-test:arm64 - .v3dv-rules - parallel: 8 variables: HWCI_TEST_SCRIPT: "/install/deqp-runner.sh" - BM_BOOTFS: /boot/raspberrypi_arm64 - BM_POE_TIMEOUT: 300 - BM_ROOTFS: /rootfs-arm64 - DEQP_EXPECTED_RENDERER: "V3D 4.2" - DEQP_FRACTION: 5 + HWCI_START_WESTON: 1 + DEQP_EXPECTED_RENDERER: "V3D.7.1" + DEQP_FRACTION: 15 DEQP_VER: vk FLAKES_CHANNEL: "#videocore-ci" - GPU_VERSION: v3dv-rpi4 - HWCI_KERNEL_MODULES: v3d,vc4 - MINIO_ARTIFACT_NAME: mesa-arm64 - VK_DRIVER: broadcom - script: - - ./install/bare-metal/poe-powered.sh - needs: - - debian/arm_test - - job: debian-arm64 - artifacts: false - tags: - - igalia-rpi4 + +v3dv-rpi5-vk-full:arm64: + extends: + - .v3dv-rpi5-vk:arm64 + - .v3dv-manual-rules + timeout: 3h + variables: + # Keep 10 minutes for boot + setup + uploading the artifacts at the end + TEST_PHASE_TIMEOUT: 170 + DEQP_FRACTION: 1 diff --git a/src/broadcom/ci/piglit-v3d-rpi4-fails.txt b/src/broadcom/ci/piglit-v3d-rpi4-fails.txt deleted file mode 100644 index 4557a55562f..00000000000 --- a/src/broadcom/ci/piglit-v3d-rpi4-fails.txt +++ /dev/null @@ -1,337 +0,0 @@ -glx@glx-make-current,Crash -glx@glx-multi-window-single-context,Fail -glx@glx-multithread-buffer,Fail -glx@glx-query-drawable-glx_fbconfig_id-window,Fail -glx@glx-swap-pixmap-bad,Fail -glx@glx-visuals-depth -pixmap,Crash -glx@glx-visuals-stencil -pixmap,Crash -glx@glx_arb_create_context_es2_profile@invalid opengl es version,Fail -glx@glx_arb_create_context_no_error@no error,Fail -glx@glx_ext_import_context@free context,Fail -glx@glx_ext_import_context@get context id,Fail -glx@glx_ext_import_context@get current display,Fail -glx@glx_ext_import_context@import context- multi process,Fail -glx@glx_ext_import_context@import context- single process,Fail -glx@glx_ext_import_context@imported context has same context id,Fail -glx@glx_ext_import_context@make current- multi process,Fail -glx@glx_ext_import_context@make current- single process,Fail -glx@glx_ext_import_context@query context info,Fail -shaders@glsl-bug-110796,Fail -spec@!opengl 1.0@gl-1.0-bitmap-heart-dance,Fail -spec@!opengl 1.0@gl-1.0-dlist-bitmap,Fail -spec@!opengl 1.0@gl-1.0-edgeflag,Fail -spec@!opengl 1.0@gl-1.0-edgeflag-const,Fail -spec@!opengl 1.0@gl-1.0-edgeflag-quads,Fail -spec@!opengl 1.0@gl-1.0-no-op-paths,Fail -spec@!opengl 1.0@gl-1.0-spot-light,Fail -spec@!opengl 1.0@gl-1.0-user-clip-all-planes,Fail -spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=2,Fail -spec@!opengl 1.1@depthstencil-default_fb-drawpixels-32f_24_8_rev samples=2,Fail -spec@!opengl 1.1@depthstencil-default_fb-drawpixels-32f_24_8_rev samples=4,Fail -spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=2,Fail -spec@!opengl 1.1@getteximage-depth,Fail -spec@!opengl 1.1@getteximage-depth@GL_TEXTURE_1D_ARRAY-GL_DEPTH_COMPONENT16,Fail -spec@!opengl 1.1@getteximage-depth@GL_TEXTURE_1D_ARRAY-GL_DEPTH_COMPONENT24,Fail -spec@!opengl 1.1@getteximage-depth@GL_TEXTURE_1D_ARRAY-GL_DEPTH_COMPONENT32,Fail -spec@!opengl 1.1@getteximage-depth@GL_TEXTURE_1D_ARRAY-GL_DEPTH_COMPONENT,Fail -spec@!opengl 1.1@getteximage-formats,Fail -spec@!opengl 1.1@linestipple,Fail -spec@!opengl 
1.1@linestipple@Factor 2x,Fail -spec@!opengl 1.1@linestipple@Factor 3x,Fail -spec@!opengl 1.1@linestipple@Line loop,Fail -spec@!opengl 1.1@linestipple@Line strip,Fail -spec@!opengl 1.1@linestipple@Restarting lines within a single Begin-End block,Fail -spec@!opengl 1.1@point-line-no-cull,Fail -spec@!opengl 1.1@polygon-mode,Fail -spec@!opengl 1.1@polygon-mode-offset,Fail -spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on bottom edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on left edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on right edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on top edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 1: Expected blue pixel in center,Fail -spec@!opengl 1.1@polygon-mode-offset@config 1: Expected white pixel on right edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 1: Expected white pixel on top edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 2: Expected blue pixel in center,Fail -spec@!opengl 1.1@polygon-mode-offset@config 2: Expected white pixel on right edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 2: Expected white pixel on top edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on bottom edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on left edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on right edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on top edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on bottom edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on left edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on right edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on top edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 5: Expected blue pixel in center,Fail -spec@!opengl 1.1@polygon-mode-offset@config 5: Expected white pixel on right edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 5: Expected white pixel on top edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 6: Expected blue pixel in center,Fail -spec@!opengl 1.1@polygon-mode-offset@config 6: Expected white pixel on right edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 6: Expected white pixel on top edge,Fail -spec@!opengl 1.1@texwrap formats bordercolor,Fail -spec@!opengl 1.1@texwrap formats bordercolor@GL_INTENSITY12- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor@GL_INTENSITY16- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE12- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE12_ALPHA12- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE12_ALPHA4- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE16- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE16_ALPHA16- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor@GL_RGB12- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor@GL_RGB16- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor@GL_RGBA12- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor@GL_RGBA16- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor-swizzled,Fail -spec@!opengl 1.1@texwrap formats 
bordercolor-swizzled@GL_INTENSITY12- swizzled- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_INTENSITY16- swizzled- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE12- swizzled- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE12_ALPHA12- swizzled- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE12_ALPHA4- swizzled- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE16- swizzled- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE16_ALPHA16- swizzled- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_RGB12- swizzled- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_RGB16- swizzled- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_RGBA12- swizzled- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_RGBA16- swizzled- border color only,Fail -spec@!opengl 1.1@windowoverlap,Fail -spec@!opengl 1.4@gl-1.4-polygon-offset,Fail -spec@!opengl 2.0@gl-2.0-edgeflag,Fail -spec@!opengl 2.0@gl-2.0-edgeflag-immediate,Fail -spec@!opengl 2.0@max-samplers,Fail -spec@!opengl 2.0@max-samplers border,Fail -spec@!opengl 2.1@pbo,Fail -spec@!opengl 2.1@pbo@test_polygon_stip,Fail -spec@!opengl 2.1@polygon-stipple-fs,Fail -spec@!opengl es 3.0@gles-3.0-transform-feedback-uniform-buffer-object,Fail -spec@arb_color_buffer_float@gl_rgba32f-render,Fail -spec@arb_color_buffer_float@gl_rgba32f-render-fog,Fail -spec@arb_color_buffer_float@gl_rgba32f-render-sanity,Fail -spec@arb_color_buffer_float@gl_rgba32f-render-sanity-fog,Fail -spec@arb_compute_shader@minmax,Fail -spec@arb_copy_buffer@targets,Fail -spec@arb_depth_buffer_float@fbo-generatemipmap-formats,Fail -spec@arb_depth_buffer_float@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT32F,Fail -spec@arb_depth_buffer_float@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT32F NPOT,Fail -spec@arb_depth_buffer_float@texwrap formats bordercolor,Fail -spec@arb_depth_buffer_float@texwrap formats bordercolor@GL_DEPTH32F_STENCIL8- border color only,Fail -spec@arb_depth_buffer_float@texwrap formats bordercolor@GL_DEPTH_COMPONENT32F- border color only,Fail -spec@arb_depth_buffer_float@texwrap formats bordercolor-swizzled,Fail -spec@arb_depth_buffer_float@texwrap formats bordercolor-swizzled@GL_DEPTH32F_STENCIL8- swizzled- border color only,Fail -spec@arb_depth_buffer_float@texwrap formats bordercolor-swizzled@GL_DEPTH_COMPONENT32F- swizzled- border color only,Fail -spec@arb_depth_buffer_float@texwrap formats,Fail -spec@arb_depth_buffer_float@texwrap formats@GL_DEPTH32F_STENCIL8- NPOT,Fail -spec@arb_depth_buffer_float@texwrap formats@GL_DEPTH_COMPONENT32F- NPOT,Fail -spec@arb_depth_texture@fbo-generatemipmap-formats,Fail -spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT16,Fail -spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT16 NPOT,Fail -spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT24,Fail -spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT24 NPOT,Fail -spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT32,Fail -spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT32 NPOT,Fail -spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT,Fail -spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT 
NPOT,Fail -spec@arb_depth_texture@texwrap formats bordercolor,Fail -spec@arb_depth_texture@texwrap formats bordercolor@GL_DEPTH_COMPONENT16- border color only,Fail -spec@arb_depth_texture@texwrap formats bordercolor@GL_DEPTH_COMPONENT24- border color only,Fail -spec@arb_depth_texture@texwrap formats bordercolor@GL_DEPTH_COMPONENT32- border color only,Fail -spec@arb_depth_texture@texwrap formats bordercolor-swizzled,Fail -spec@arb_depth_texture@texwrap formats bordercolor-swizzled@GL_DEPTH_COMPONENT16- swizzled- border color only,Fail -spec@arb_depth_texture@texwrap formats bordercolor-swizzled@GL_DEPTH_COMPONENT24- swizzled- border color only,Fail -spec@arb_depth_texture@texwrap formats bordercolor-swizzled@GL_DEPTH_COMPONENT32- swizzled- border color only,Fail -spec@arb_depth_texture@texwrap formats,Fail -spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT16- NPOT,Fail -spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT24- NPOT,Fail -spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT32- NPOT,Fail -spec@arb_framebuffer_object@fbo-drawbuffers-none use_frag_out,Fail -spec@arb_pixel_buffer_object@pbo-getteximage,Fail -spec@arb_pixel_buffer_object@texsubimage array pbo,Fail -spec@arb_point_sprite@arb_point_sprite-checkerboard,Fail -spec@arb_point_sprite@arb_point_sprite-mipmap,Fail -spec@arb_shader_storage_buffer_object@compiler@atomicmin-swizzle.vert,Fail -spec@arb_shader_texture_lod@execution@arb_shader_texture_lod-texgrad,Fail -spec@arb_shader_texture_lod@execution@arb_shader_texture_lod-texgradcube,Fail -spec@arb_texture_float@fbo-blending-formats,Fail -spec@arb_texture_float@fbo-blending-formats@GL_ALPHA32F_ARB,Fail -spec@arb_texture_float@fbo-blending-formats@GL_INTENSITY16F_ARB,Fail -spec@arb_texture_float@fbo-blending-formats@GL_INTENSITY32F_ARB,Fail -spec@arb_texture_float@fbo-blending-formats@GL_LUMINANCE16F_ARB,Fail -spec@arb_texture_float@fbo-blending-formats@GL_LUMINANCE32F_ARB,Fail -spec@arb_texture_float@fbo-blending-formats@GL_LUMINANCE_ALPHA32F_ARB,Fail -spec@arb_texture_float@fbo-blending-formats@GL_RGB16F,Fail -spec@arb_texture_float@fbo-blending-formats@GL_RGB32F,Fail -spec@arb_texture_float@fbo-blending-formats@GL_RGBA32F,Fail -spec@arb_texture_float@texwrap formats bordercolor,Fail -spec@arb_texture_float@texwrap formats bordercolor@GL_ALPHA32F_ARB- border color only,Fail -spec@arb_texture_float@texwrap formats bordercolor@GL_INTENSITY32F_ARB- border color only,Fail -spec@arb_texture_float@texwrap formats bordercolor@GL_LUMINANCE32F_ARB- border color only,Fail -spec@arb_texture_float@texwrap formats bordercolor@GL_LUMINANCE_ALPHA32F_ARB- border color only,Fail -spec@arb_texture_float@texwrap formats bordercolor@GL_RGB32F- border color only,Fail -spec@arb_texture_float@texwrap formats bordercolor@GL_RGBA32F- border color only,Fail -spec@arb_texture_float@texwrap formats bordercolor-swizzled,Fail -spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_ALPHA32F_ARB- swizzled- border color only,Fail -spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_INTENSITY32F_ARB- swizzled- border color only,Fail -spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_LUMINANCE32F_ARB- swizzled- border color only,Fail -spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_LUMINANCE_ALPHA32F_ARB- swizzled- border color only,Fail -spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_RGB32F- swizzled- border color only,Fail -spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_RGBA32F- swizzled- border color 
only,Fail -spec@arb_texture_rectangle@1-1-linear-texture,Fail -spec@arb_texture_rg@fbo-blending-formats-float,Fail -spec@arb_texture_rg@fbo-blending-formats-float@GL_R32F,Fail -spec@arb_texture_rg@fbo-blending-formats-float@GL_RG32F,Fail -spec@arb_texture_rg@texwrap formats bordercolor,Fail -spec@arb_texture_rg@texwrap formats bordercolor@GL_R16- border color only,Fail -spec@arb_texture_rg@texwrap formats bordercolor@GL_RG16- border color only,Fail -spec@arb_texture_rg@texwrap formats bordercolor-swizzled,Fail -spec@arb_texture_rg@texwrap formats bordercolor-swizzled@GL_R16- swizzled- border color only,Fail -spec@arb_texture_rg@texwrap formats bordercolor-swizzled@GL_RG16- swizzled- border color only,Fail -spec@arb_texture_rg@texwrap formats-float bordercolor,Fail -spec@arb_texture_rg@texwrap formats-float bordercolor@GL_R32F- border color only,Fail -spec@arb_texture_rg@texwrap formats-float bordercolor@GL_RG32F- border color only,Fail -spec@arb_texture_rg@texwrap formats-float bordercolor-swizzled,Fail -spec@arb_texture_rg@texwrap formats-float bordercolor-swizzled@GL_R32F- swizzled- border color only,Fail -spec@arb_texture_rg@texwrap formats-float bordercolor-swizzled@GL_RG32F- swizzled- border color only,Fail -spec@arb_texture_rg@texwrap formats-float,Fail -spec@arb_texture_rg@texwrap formats-float@GL_R32F- NPOT,Fail -spec@arb_texture_rg@texwrap formats-float@GL_RG32F- NPOT,Fail -spec@arb_transform_feedback2@change objects while paused (gles3),Fail -spec@egl 1.4@egl-copy-buffers,Crash -spec@egl 1.4@eglterminate then unbind context,Fail -spec@egl_ext_protected_content@conformance,Fail -spec@egl_khr_gl_image@egl_khr_gl_renderbuffer_image-clear-shared-image gl_depth_component24,Fail -spec@egl_khr_gl_image@egl_khr_gl_renderbuffer_image-clear-shared-image gl_rgba,Fail -spec@egl_khr_surfaceless_context@viewport,Fail -spec@egl_mesa_configless_context@basic,Fail -spec@ext_framebuffer_blit@fbo-blit-check-limits,Fail -spec@ext_framebuffer_multisample@blit-mismatched-formats,Fail -spec@ext_framebuffer_multisample@interpolation 2 centroid-edges,Fail -spec@ext_framebuffer_multisample@interpolation 4 centroid-edges,Fail -spec@ext_framebuffer_object@fbo-blending-format-quirks,Fail -spec@ext_framebuffer_object@fbo-blending-formats,Fail -spec@ext_framebuffer_object@fbo-blending-formats@GL_RGB10,Fail -spec@ext_framebuffer_object@getteximage-formats init-by-clear-and-render,Fail -spec@ext_framebuffer_object@getteximage-formats init-by-rendering,Fail -spec@ext_gpu_shader4@execution@texelfetch@fs-texelfetch-isampler1darray,Fail -spec@ext_gpu_shader4@execution@texelfetch@fs-texelfetch-sampler1darray,Fail -spec@ext_gpu_shader4@execution@texelfetch@fs-texelfetch-usampler1darray,Fail -spec@ext_gpu_shader4@execution@texelfetchoffset@fs-texelfetch-isampler1darray,Fail -spec@ext_gpu_shader4@execution@texelfetchoffset@fs-texelfetch-sampler1darray,Fail -spec@ext_gpu_shader4@execution@texelfetchoffset@fs-texelfetch-usampler1darray,Fail -spec@ext_gpu_shader4@execution@texelfetchoffset@vs-texelfetch-isampler1darray,Fail -spec@ext_gpu_shader4@execution@texelfetchoffset@vs-texelfetch-sampler1darray,Fail -spec@ext_gpu_shader4@execution@texelfetchoffset@vs-texelfetch-usampler1darray,Fail -spec@ext_gpu_shader4@execution@texelfetch@vs-texelfetch-isampler1darray,Fail -spec@ext_gpu_shader4@execution@texelfetch@vs-texelfetch-sampler1darray,Fail -spec@ext_gpu_shader4@execution@texelfetch@vs-texelfetch-usampler1darray,Fail -spec@ext_gpu_shader4@tex-miplevel-selection gpu4texture() 1darray,Fail 
-spec@ext_gpu_shader4@tex-miplevel-selection gpu4texture() 1darrayshadow,Fail -spec@ext_gpu_shader4@tex-miplevel-selection gpu4texture(bias) 1darray,Fail -spec@ext_gpu_shader4@tex-miplevel-selection gpu4texture(bias) 1darrayshadow,Fail -spec@ext_gpu_shader4@tex-miplevel-selection gpu4texture() cubeshadow,Fail -spec@ext_gpu_shader4@tex-miplevel-selection gpu4texturegrad 1darray,Fail -spec@ext_gpu_shader4@tex-miplevel-selection gpu4texturegrad 1darrayshadow,Fail -spec@ext_gpu_shader4@tex-miplevel-selection gpu4texturegradoffset 1darray,Fail -spec@ext_gpu_shader4@tex-miplevel-selection gpu4texturegradoffset 1darrayshadow,Fail -spec@ext_gpu_shader4@tex-miplevel-selection gpu4texturelod 1darray,Fail -spec@ext_gpu_shader4@tex-miplevel-selection gpu4texturelod 1darrayshadow,Fail -spec@ext_gpu_shader4@tex-miplevel-selection gpu4texturelodoffset 1darray,Fail -spec@ext_gpu_shader4@tex-miplevel-selection gpu4texturelodoffset 1darrayshadow,Fail -spec@ext_gpu_shader4@tex-miplevel-selection gpu4textureoffset 1darray,Fail -spec@ext_gpu_shader4@tex-miplevel-selection gpu4textureoffset 1darrayshadow,Fail -spec@ext_packed_depth_stencil@texwrap formats bordercolor,Fail -spec@ext_packed_depth_stencil@texwrap formats bordercolor@GL_DEPTH24_STENCIL8- border color only,Fail -spec@ext_packed_depth_stencil@texwrap formats bordercolor-swizzled,Fail -spec@ext_packed_depth_stencil@texwrap formats bordercolor-swizzled@GL_DEPTH24_STENCIL8- swizzled- border color only,Fail -spec@ext_packed_depth_stencil@texwrap formats,Fail -spec@ext_packed_depth_stencil@texwrap formats@GL_DEPTH24_STENCIL8- NPOT,Fail -spec@ext_packed_float@query-rgba-signed-components,Fail -spec@ext_texture_array@array-texture,Fail -spec@ext_texture_array@fbo-generatemipmap-array rgb9_e5,Fail -spec@ext_texture_array@fbo-generatemipmap-array,Fail -spec@ext_texture_array@texsubimage array,Fail -spec@ext_texture_integer@getteximage-clamping gl_arb_texture_rg,Fail -spec@ext_texture_integer@getteximage-clamping,Fail -spec@ext_texture_lod_bias@lodbias,Fail -spec@ext_texture_snorm@texwrap formats bordercolor,Fail -spec@ext_texture_snorm@texwrap formats bordercolor@GL_ALPHA16_SNORM- border color only,Fail -spec@ext_texture_snorm@texwrap formats bordercolor@GL_INTENSITY16_SNORM- border color only,Fail -spec@ext_texture_snorm@texwrap formats bordercolor@GL_LUMINANCE16_ALPHA16_SNORM- border color only,Fail -spec@ext_texture_snorm@texwrap formats bordercolor@GL_LUMINANCE16_SNORM- border color only,Fail -spec@ext_texture_snorm@texwrap formats bordercolor@GL_R16_SNORM- border color only,Fail -spec@ext_texture_snorm@texwrap formats bordercolor@GL_RG16_SNORM- border color only,Fail -spec@ext_texture_snorm@texwrap formats bordercolor@GL_RGB16_SNORM- border color only,Fail -spec@ext_texture_snorm@texwrap formats bordercolor@GL_RGBA16_SNORM- border color only,Fail -spec@ext_texture_snorm@texwrap formats bordercolor-swizzled,Fail -spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_ALPHA16_SNORM- swizzled- border color only,Fail -spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_INTENSITY16_SNORM- swizzled- border color only,Fail -spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_LUMINANCE16_ALPHA16_SNORM- swizzled- border color only,Fail -spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_LUMINANCE16_SNORM- swizzled- border color only,Fail -spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_R16_SNORM- swizzled- border color only,Fail -spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_RG16_SNORM- 
swizzled- border color only,Fail -spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_RGB16_SNORM- swizzled- border color only,Fail -spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_RGBA16_SNORM- swizzled- border color only,Fail -spec@arb_texture_storage@texture-storage@cube array texture,Fail -spec@glsl-1.10@execution@glsl-fs-inline-explosion,Crash -spec@glsl-1.10@execution@glsl-vs-inline-explosion,Crash -spec@glsl-1.20@compiler@invalid-vec4-array-to-vec3-array-conversion.vert,Fail -spec@glsl-1.20@execution@clipping@vs-clip-vertex-primitives,Fail -spec@glsl-1.20@execution@fs-underflow-mul-compare-zero,Fail -spec@intel_performance_query@intel_performance_query-issue_2235,Fail -spec@khr_texture_compression_astc@miptree-gles srgb-fp,Fail -spec@khr_texture_compression_astc@miptree-gles srgb-fp@sRGB decode full precision,Fail -spec@khr_texture_compression_astc@sliced-3d-miptree-gles srgb-fp,Fail -spec@khr_texture_compression_astc@sliced-3d-miptree-gles srgb-fp@sRGB decode full precision,Fail -spec@nv_copy_depth_to_color@nv_copy_depth_to_color 0 0x223344ff,Crash -spec@nv_copy_depth_to_color@nv_copy_depth_to_color 0 0x76356278,Crash -spec@nv_copy_depth_to_color@nv_copy_depth_to_color 1 0x223344ff,Crash -spec@nv_copy_depth_to_color@nv_copy_depth_to_color 1 0x76356278,Crash -spec@nv_copy_depth_to_color@nv_copy_depth_to_color,Crash -spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-2d-array.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-2d-array.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-2d.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-2d.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-3d.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-3d.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-cube.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-cube.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-2d-array.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-2d-array.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-2d.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-2d.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-3d.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-3d.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-cube.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-cube.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-2d-array.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-2d-array.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-2d.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-2d.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-3d.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-3d.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-cube.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-cube.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-2d-array.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-2d-array.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-2d.frag,Fail 
-spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-2d.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-3d.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-3d.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-cube.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-cube.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-2d-array.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-2d-array.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-2d.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-2d.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-3d.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-3d.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-cube.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-cube.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-2d-array.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-2d-array.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-2d.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-2d.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-3d.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-3d.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-cube.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-cube.vert,Fail -spec@nv_read_depth@read_depth_gles3,Fail -spec@oes_egl_image_external_essl3@oes_egl_image_external_essl3,Crash -spec@oes_shader_io_blocks@compiler@layout-location-aliasing.vert,Fail diff --git a/src/broadcom/ci/piglit-v3d-rpi4-flakes.txt b/src/broadcom/ci/piglit-v3d-rpi4-flakes.txt deleted file mode 100644 index 14d2b9b4fd8..00000000000 --- a/src/broadcom/ci/piglit-v3d-rpi4-flakes.txt +++ /dev/null @@ -1,7 +0,0 @@ -glx@glx_arb_sync_control@swapbuffersmsc-divisor-zero -glx@glx_arb_sync_control@waitformsc -spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=4 -spec@!opengl 1.1@depthstencil-default_fb-drawpixels-32f_24_8_rev samples=2 -spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=4 -spec@arb_occlusion_query@occlusion_query_order -spec@egl_chromium_sync_control@conformance diff --git a/src/broadcom/ci/piglit-v3d-rpi4-skips.txt b/src/broadcom/ci/piglit-v3d-rpi4-skips.txt deleted file mode 100644 index 2c70ff30c3f..00000000000 --- a/src/broadcom/ci/piglit-v3d-rpi4-skips.txt +++ /dev/null @@ -1,20 +0,0 @@ -# Slow tests (> 1 minute to run) -spec@!opengl 1.1@streaming-texture-leak -spec@!opengl 1.2@tex3d-maxsize -spec@ext_texture_env_combine@texture-env-combine -spec@glsl-1.10@execution@loops@glsl-fs-unroll-explosion -spec@glsl-1.10@execution@loops@glsl-vs-unroll-explosion -spec@!opengl 1.0@gl-1.0-blend-func - -# Extensions not supported -spec@arb_gpu_shader_fp64.* -spec@arb_gpu_shader_gpu5.* -spec@arb_gpu_shader_int64.* -spec@arb_tessellation_shader.* -spec@arb_texture_cube_map.* -spec@glsl-1.30.* -spec@glsl-1.40.* -spec@glsl-1.50.* -spec@glsl-3.* -spec@glsl-4.* -spec@glsl-es-3.20.* diff --git a/src/broadcom/ci/piglit-vc4-rpi3-flakes.txt b/src/broadcom/ci/piglit-vc4-rpi3-flakes.txt deleted file mode 100644 index afb7a908c87..00000000000 --- a/src/broadcom/ci/piglit-vc4-rpi3-flakes.txt +++ /dev/null 
@@ -1,8 +0,0 @@ -glx@glx-multi-window-single-context -shaders@glsl-vs-loop -shaders@glsl-vs-loop-nested -spec@arb_framebuffer_srgb@blit renderbuffer srgb single_sampled enabled clear -spec@egl_chromium_sync_control@conformance -spec@ext_packed_depth_stencil@fbo-stencil-gl_depth24_stencil8-readpixels -spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=2 -spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=4 diff --git a/src/broadcom/ci/piglit-vc4-rpi3-skips.txt b/src/broadcom/ci/piglit-vc4-rpi3-skips.txt deleted file mode 100644 index ae25a28bb9a..00000000000 --- a/src/broadcom/ci/piglit-vc4-rpi3-skips.txt +++ /dev/null @@ -1,19 +0,0 @@ -# Slow tests (> 1 minute to run) -spec@ext_framebuffer_multisample@accuracy -glx@glx-multithread-texture -spec@arb_internalformat_query2@all internalformat_<x>_type pname checks -spec@!opengl 1.1@streaming-texture-leak -spec@!opengl 1.0@gl-1.0-blend-func - -# Extensions not supported -spec@arb_gpu_shader_fp64.* -spec@arb_gpu_shader_gpu5.* -spec@arb_gpu_shader_int64.* -spec@arb_tessellation_shader.* -spec@arb_texture_cube_map.* -spec@glsl-1.30.* -spec@glsl-1.40.* -spec@glsl-1.50.* -spec@glsl-3.* -spec@glsl-4.* -spec@glsl-es-3.* diff --git a/src/broadcom/ci/traces-broadcom.yml b/src/broadcom/ci/traces-broadcom.yml new file mode 100644 index 00000000000..d330ad0dcc8 --- /dev/null +++ b/src/broadcom/ci/traces-broadcom.yml @@ -0,0 +1,205 @@ +%YAML 1.2 +--- +traces-db: + download-url: "http://192.168.40.131:8888/cache/?uri=https://s3.freedesktop.org/mesa-tracie-public/" + +traces: + 0ad/0ad-v2.trace: + broadcom-rpi4: + checksum: 8bdca9e63f483ee71970075842f003db + + behdad-glyphy/glyphy-v2.trace: + broadcom-rpi4: + checksum: ea49462ff1545f21506dbd7b5028df45 + + blender/blender-demo-cube_diorama.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 4.5 + + blender/blender-demo-ellie_pose.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 4.5 + + filament/filament-default.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 4.1 + + glxgears/glxgears-2-v2.trace: + broadcom-rpi4: + label: [skip, flakes] + text: "Often fails when running on xwayland, with what looks like an incorrect resolution" + checksum: 2a9c5e35fa5693fd7d3a76f7b9746edb + + godot/godot-thrive.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 3.3 + + godot/godot-tps-gles3-high.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 3.3 + + gputest/furmark-v2.trace: + broadcom-rpi4: + checksum: 800b2be5981d7e1a6570643f7dfd9a33 + + gputest/gimark-v2.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 3.3 + + gputest/pixmark-julia-fp32-v2.trace: + broadcom-rpi4: + label: [skip, flakes] + checksum: be70fc9e3829fff5ad1b6ecfb6fa551c + + gputest/pixmark-julia-fp64-v2.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 4.0 + + gputest/pixmark-volplosion-v2.trace: + broadcom-rpi4: + checksum: 03f6b1c064af4e7eb117b800893cdba6 + + gputest/plot3d-v2.trace: + broadcom-rpi4: + checksum: 1ef33ad22679107a256501c79bfd9e7c + + gputest/tessmark-v2.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 4.0 + + gputest/triangle-v2.trace: + broadcom-rpi4: + checksum: df6df2af5fecfa42b5c2c332b726e93c + + humus/AmbientAperture-v2.trace: + broadcom-rpi4: + checksum: a2d2a0141384a23e91ed30a27ed46bfe + + humus/CelShading-v2.trace: + broadcom-rpi4: + checksum: 1135888a0e8723bbcded5ef9f0925964 + + humus/DynamicBranching3-v2.trace: + broadcom-rpi4: + checksum: 68011c66cfd83aa8a6b568de7c726d49 
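The traces-broadcom.yml entries above and below drive trace replay on the rpi4: a trace is replayed, the final frame is hashed, and the hash is compared against the per-device checksum, while label: entries mark traces to skip, flag as flaky, or record as unsupported. A minimal sketch of that comparison, assuming the 32-hex-digit checksums are MD5 digests of the dumped frame (the commit itself does not say which hash is used):

import hashlib

def trace_passes(frame_path: str, expected_checksum: str) -> bool:
    # Hash the frame dumped by the replay and compare it against the
    # expectation recorded for this device (e.g. broadcom-rpi4).
    with open(frame_path, "rb") as f:
        return hashlib.md5(f.read()).hexdigest() == expected_checksum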
+ + humus/HDR-v2.trace: + broadcom-rpi4: + checksum: de024f342418b578841f98ce697de8b5 + + humus/Portals-v2.trace: + broadcom-rpi4: + checksum: 269b9572113d6991cf58c96a833502bf + + humus/RaytracedShadows-v2.trace: + broadcom-rpi4: + checksum: 6b572f241f4f9ee001ef849d10d03cc5 + + humus/VolumetricFogging2-v2.trace: + broadcom-rpi4: + checksum: d3b89dfaff0277be4b4b2ad2cf055d54 + + jvgs/jvgs-d27fb67-v2.trace: + broadcom-rpi4: + checksum: 831138a408cc9557528ef68381b080f2 + + neverball/neverball-v2.trace: + broadcom-rpi4: + checksum: c8e8ee352bdb303e4ed144b69272575e + + nheko/nheko-colors.trace: + broadcom-rpi4: + checksum: 922597b0203ff18d6e430002bcf32ef4 + + supertuxkart/supertuxkart-mansion-egl-gles-v2.trace: + broadcom-rpi4: + checksum: 93fe17a18ab10d862b5a42b4ea05a658 + + valve/counterstrike-source-v2.trace: + broadcom-rpi4: + label: [skip, timeout] + + valve/counterstrike-v2.trace: + broadcom-rpi4: + checksum: 547f6435bf21458e518bbcb2161962ab + + valve/half-life-2-v2.trace: + broadcom-rpi4: + label: [crash] + text: v3d42_create_texture_shader_state_bo assertion about serial_id + + valve/portal-2-v2.trace: + broadcom-rpi4: + label: [skip, timeout] + + paraview/pv-manyspheres-v2.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 3.3 + + paraview/pv-waveletcontour-v2.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 3.3 + + paraview/pv-waveletvolume-v2.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 3.3 + + pathfinder/canvas_moire-v2.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 3.3 + + pathfinder/canvas_text_v2-v2.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 3.3 + + pathfinder/demo-v2.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 3.3 + + pioneer/pioneer.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 3.2 + + ror/ror-default.trace: + broadcom-rpi4: + label: [skip, flakes] + checksum: 533edca21409981b4983db846de4355e + + thedarkmod/thedarkmod.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 3.3 + + unvanquished/unvanquished-lowest.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 3.2 + + unvanquished/unvanquished-ultra.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 3.2 + + warzone2100/warzone2100-default.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GLSL 1.50 + + xonotic/xonotic-keybench-high-v2.trace: + broadcom-rpi4: + checksum: 3bc4ca2efa5a7b35701a8daad378e565 diff --git a/src/broadcom/cle/gen_pack_header.py b/src/broadcom/cle/gen_pack_header.py index 0090b616d50..1cc2446d0bd 100644 --- a/src/broadcom/cle/gen_pack_header.py +++ b/src/broadcom/cle/gen_pack_header.py @@ -25,9 +25,8 @@ import xml.parsers.expat import re import sys -import copy -license = """/* Generated code, see v3d_packet_v21.xml, v3d_packet_v33.xml and gen_pack_header.py */ +license = """/* Generated code, see vc4_packet.xml, v3d_packet.xml and gen_pack_header.py */ """ pack_header = """%(license)s @@ -113,7 +112,7 @@ class Field(object): self.type = attrs["type"] if self.type == 'bool' and self.start != self.end: - print("#error Field {} has bool type but more than one bit of size".format(self.name)); + print("#error Field {} has bool type but more than one bit of size".format(self.name)) if "prefix" in attrs: self.prefix = safe_name(attrs["prefix"]).upper() @@ -215,7 +214,7 @@ class Group(object): last_byte = field.end // 8 for b in range(first_byte, last_byte + 1): - if not b in bytes: + if b not in bytes: bytes[b] = self.Byte()
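The gen_pack_header.py hunks that follow replace the generator's private __gen_* emit helpers with the shared util_bitpack_* helpers from src/util. A rough Python sketch of the unsigned case, assuming the usual bitpack contract (assert the value fits the field, then shift it into position so the caller can OR the fields of one byte together):

def util_bitpack_uint(value: int, start: int, end: int) -> int:
    # Pack an unsigned value into bits [start, end] of an output word;
    # the generated pack functions OR several of these together per byte.
    assert 0 <= value < (1 << (end - start + 1)), "value does not fit field"
    return value << start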
bytes[b].fields.append(field) @@ -240,7 +239,7 @@ class Group(object): for index in range(self.length): # Handle MBZ bytes - if not index in bytes: + if index not in bytes: print(" cl[%2d] = 0;" % index) continue byte = bytes[index] @@ -276,7 +275,6 @@ class Group(object): byte_start = index * 8 - v = None prefix = " cl[%2d] =" % index field_index = 0 @@ -296,46 +294,46 @@ class Group(object): value = "%s - 1" % value if field.type == "mbo": - s = "__gen_mbo(%d, %d)" % \ + s = "util_bitpack_ones(%d, %d)" % \ (start, end) elif field.type == "address": extra_shift = (31 - (end - start)) // 8 * 8 s = "__gen_address_offset(&values->%s)" % byte.address.name elif field.type == "uint": - s = "__gen_uint(%s, %d, %d)" % \ + s = "util_bitpack_uint(%s, %d, %d)" % \ (value, start, end) elif field.type in self.parser.enums: - s = "__gen_uint(%s, %d, %d)" % \ + s = "util_bitpack_uint(%s, %d, %d)" % \ (value, start, end) elif field.type == "int": - s = "__gen_sint(%s, %d, %d)" % \ + s = "util_bitpack_sint(%s, %d, %d)" % \ (value, start, end) elif field.type == "bool": - s = "__gen_uint(%s, %d, %d)" % \ + s = "util_bitpack_uint(%s, %d, %d)" % \ (value, start, end) elif field.type == "float": s = "#error %s float value mixed in with other fields" % name elif field.type == "f187": - s = "__gen_uint(fui(%s) >> 16, %d, %d)" % \ + s = "util_bitpack_uint(fui(%s) >> 16, %d, %d)" % \ (value, start, end) elif field.type == "offset": s = "__gen_offset(%s, %d, %d)" % \ (value, start, end) elif field.type == 'ufixed': - s = "__gen_ufixed(%s, %d, %d, %d)" % \ + s = "util_bitpack_ufixed(%s, %d, %d, %d)" % \ (value, start, end, field.fractional_size) elif field.type == 'sfixed': - s = "__gen_sfixed(%s, %d, %d, %d)" % \ + s = "util_bitpack_sfixed(%s, %d, %d, %d)" % \ (value, start, end, field.fractional_size) elif field.type in self.parser.structs: - s = "__gen_uint(v%d_%d, %d, %d)" % \ + s = "util_bitpack_uint(v%d_%d, %d, %d)" % \ (index, field_index, start, end) field_index = field_index + 1 else: print("/* unhandled field %s, type %s */\n" % (name, field.type)) s = None - if not s == None: + if s is not None: shift = byte_start - field_byte_start + extra_shift if shift: s = "%s >> %d" % (s, shift) @@ -383,7 +381,6 @@ class Group(object): convert = "__gen_unpack_sfixed" else: print("/* unhandled field %s, type %s */\n" % (field.name, field.type)) - s = None plusone = "" if field.minus_one: @@ -545,9 +542,9 @@ class Parser(object): def emit_header(self, name): default_fields = [] for field in self.group.fields: - if not type(field) is Field: + if type(field) is not Field: continue - if field.default == None: + if field.default is None: continue default_fields.append(" .%-35s = %6d" % (field.name, field.default)) @@ -577,7 +574,7 @@ class Parser(object): return name = self.register - if not self.reg_num == None: + if self.reg_num is not None: print('#define %-33s 0x%04x' % (self.gen_prefix(name + "_num"), self.reg_num)) diff --git a/src/broadcom/cle/meson.build b/src/broadcom/cle/meson.build index 4cab2b38dda..da88cd220a5 100644 --- a/src/broadcom/cle/meson.build +++ b/src/broadcom/cle/meson.build @@ -18,27 +18,25 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
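The meson.build hunk below shrinks the version table from four entries to three and makes each entry name the cle XML file it is generated from: vc4 (v21) keeps its own XML, while v42 and the new v71 share v3d_packet.xml and rely on min_ver/max_ver attributes to gate per-version fields. Roughly what the custom_target loop does for each entry, as a hypothetical standalone Python driver (file names taken from the hunk, invocation from its command line):

import subprocess

# (version, cle XML file) pairs mirroring v3d_versions below
v3d_versions = [(21, "vc4_packet.xml"), (42, "v3d_packet.xml"), (71, "v3d_packet.xml")]

for ver, xml in v3d_versions:
    header = "v3d_packet_v{}_pack.h".format(ver)
    # gen_pack_header.py writes the header to stdout; meson captures it
    # (capture : true), which the redirection here stands in for.
    with open(header, "w") as out:
        subprocess.run(["python3", "gen_pack_header.py", xml, str(ver)],
                       stdout=out, check=True)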
-# [version, cle XML version] +# [version, cle XML file] v3d_versions = [ - [21, 21], - [33, 33], - [41, 33], - [42, 33] + [21, 'vc4_packet.xml'], + [42, 'v3d_packet.xml'], + [71, 'v3d_packet.xml'] ] v3d_xml_files = [] v3d_xml_pack = [] foreach _v : v3d_versions v = _v[0] - xmlver = _v[1] - f = 'v3d_packet_v@0@.xml'.format(xmlver) + xmlfile = _v[1] _name = 'v3d_packet_v@0@_pack.h'.format(v) - if not v3d_xml_files.contains(f) - v3d_xml_files += f + if not v3d_xml_files.contains(xmlfile) + v3d_xml_files += xmlfile endif v3d_xml_pack += custom_target( _name, - input : ['gen_pack_header.py', f], + input : ['gen_pack_header.py', xmlfile], output : _name, command : [prog_python, '@INPUT@', '@0@'.format(v)], capture : true, @@ -47,7 +45,7 @@ endforeach v3d_xml_h = custom_target( 'v3d_xml.h', - input : ['../../intel/genxml/gen_zipped_file.py', v3d_xml_files], + input : ['../../util/gen_zipped_xml_file.py', v3d_xml_files], output : 'v3d_xml.h', command : [prog_python, '@INPUT@'], capture : true, @@ -59,9 +57,9 @@ if dep_expat.found() endif libbroadcom_cle = static_library( - ['broadcom_cle', v3d_xml_h], - 'v3d_decoder.c', - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom], + 'broadcom_cle', + ['v3d_decoder.c', v3d_xml_h], + include_directories : [inc_include, inc_src, inc_broadcom], c_args : [no_override_init_args, expat_args], gnu_symbol_visibility : 'hidden', dependencies : [dep_libdrm, dep_valgrind, dep_expat, dep_zlib], diff --git a/src/broadcom/cle/v3d_decoder.c b/src/broadcom/cle/v3d_decoder.c index 97dd8ce8423..46cd152e599 100644 --- a/src/broadcom/cle/v3d_decoder.c +++ b/src/broadcom/cle/v3d_decoder.c @@ -267,51 +267,6 @@ get_register_offset(const char **atts, uint32_t *offset) return; } -static void -get_start_end_pos(int *start, int *end) -{ - /* start value has to be mod with 32 as we need the relative - * start position in the first DWord. For the end position, add - * the length of the field to the start position to get the - * relative postion in the 64 bit address. 
- */ - if (*end - *start > 32) { - int len = *end - *start; - *start = *start % 32; - *end = *start + len; - } else { - *start = *start % 32; - *end = *end % 32; - } - - return; -} - -static inline uint64_t -mask(int start, int end) -{ - uint64_t v; - - v = ~0ULL >> (63 - end + start); - - return v << start; -} - -static inline uint64_t -field(uint64_t value, int start, int end) -{ - get_start_end_pos(&start, &end); - return (value & mask(start, end)) >> (start); -} - -static inline uint64_t -field_address(uint64_t value, int start, int end) -{ - /* no need to right shift for address/offset */ - get_start_end_pos(&start, &end); - return (value & mask(start, end)); -} - static struct v3d_type string_to_type(struct parser_context *ctx, const char *s) { diff --git a/src/broadcom/cle/v3d_packet_v33.xml b/src/broadcom/cle/v3d_packet.xml index de80a6b64a1..09dde392fac 100644 --- a/src/broadcom/cle/v3d_packet_v33.xml +++ b/src/broadcom/cle/v3d_packet.xml @@ -1,4 +1,4 @@ -<vcxml gen="3.3" min_ver="33" max_ver="42"> +<vcxml gen="3.3" min_ver="42" max_ver="71"> <enum name="Compare Function" prefix="V3D_COMPARE_FUNC"> <value name="NEVER" value="0"/> @@ -69,30 +69,7 @@ <value name="TRIANGLE_FAN_TF" value="22"/> </enum> - <enum name="TMU Filter" prefix="V3D_TMU_FILTER" max_ver="33"> - <!-- Names are mip filter, min filter, mag filter --> - <value name="MIN_LIN_MIP_NONE_MAG_LIN" value="0"/> - <value name="MIN_LIN_MIP_NONE_MAG_NEAR" value="1"/> - <value name="MIN_NEAR_MIP_NONE_MAG_LIN" value="2"/> - <value name="MIN_NEAR_MIP_NONE_MAG_NEAR" value="3"/> - - <value name="MIN_NEAR_MIP_NEAR_MAG_LIN" value="4"/> - <value name="MIN_NEAR_MIP_NEAR_MAG_NEAR" value="5"/> - <value name="MIN_NEAR_MIP_LIN_MAG_LIN" value="6"/> - <value name="MIN_NEAR_MIP_LIN_MAG_NEAR" value="7"/> - - <value name="MIN_LIN_MIP_NEAR_MAG_LIN" value="8"/> - <value name="MIN_LIN_MIP_NEAR_MAG_NEAR" value="9"/> - <value name="MIN_LIN_MIP_LIN_MAG_LIN" value="10"/> - <value name="MIN_LIN_MIP_LIN_MAG_NEAR" value="11"/> - - <value name="ANISOTROPIC_2_1" value="12"/> - <value name="ANISOTROPIC_4_1" value="13"/> - <value name="ANISOTROPIC_8_1" value="14"/> - <value name="ANISOTROPIC_16_1" value="15"/> - </enum> - - <enum name="Border Color Mode" prefix="V3D_BORDER_COLOR" min_ver="41"> + <enum name="Border Color Mode" prefix="V3D_BORDER_COLOR"> <value name="0000" value="0"/> <value name="0001" value="1"/> <value name="1111" value="2"/> @@ -107,7 +84,7 @@ <value name="MIRROR_ONCE" value="4"/> </enum> - <enum name="TMU Op" prefix="V3D_TMU_OP" min_ver="41"> + <enum name="TMU Op" prefix="V3D_TMU_OP"> <value name="Write ADD, Read Prefetch" value="0"/> <value name="Write SUB, Read Clear" value="1"/> <value name="Write XCHG, Read Flush" value="2"/> @@ -167,11 +144,34 @@ <value name="depth_16" value="2"/> </enum> - <enum name="Render Target Clamp" prefix="V3D_RENDER_TARGET_CLAMP" min_ver="41"> + <enum name="Render Target Clamp" prefix="V3D_RENDER_TARGET_CLAMP" max_ver="42"> <value name="none" value="0"/> <!-- no clamping --> <value name="norm" value="1"/> <!-- [0,1] for f16 --> <value name="pos" value="2"/> <!-- [0, for f16 --> - <value name="int" value="3" min_ver="42"/> <!-- clamp to integer RT's range --> + <value name="int" value="3"/> <!-- clamp to integer RT's range --> + </enum> + + <enum name="Render Target Type Clamp" prefix="V3D_RENDER_TARGET_TYPE_CLAMP" min_ver="71"> + <value name="8i" value="0"/> <!-- no clamping --> + <value name="16i" value="1"/> <!-- no clamping --> + <value name="32i" value="2"/> <!-- no clamping --> + <value name="8ui" 
value="4"/> <!-- no clamping --> + <value name="16ui" value="5"/> <!-- no clamping --> + <value name="32ui" value="6"/> <!-- no clamping --> + <value name="8" value="8"/> <!-- no clamping --> + <value name="16f" value="9"/> <!-- no clamping --> + <value name="32f" value="10"/> <!-- no clamping --> + <value name="8i_clamped" value="16"/> <!-- clamp to integer RT's range --> + <value name="16i_clamped" value="17"/> <!-- clamp to integer RT's range --> + <value name="32i_clamped" value="18"/> <!-- clamp to integer RT's range --> + <value name="8ui_clamped" value="20"/> <!-- clamp to integer RT's range --> + <value name="16ui_clamped" value="21"/> <!-- clamp to integer RT's range --> + <value name="32ui_clamped" value="22"/> <!-- clamp to integer RT's range --> + <value name="16f_clamp_norm" value="24"/> <!-- [0,1] for f16 --> + <value name="16f_clamp_pos" value="25"/> <!-- [0, for f16 --> + <value name="16f_clamp_pq" value="26"/> <!-- PQ lin range, colour to [0, 125], alpha to [0, 1] for f16 --> + <value name="16f_clamp_hlg" value="27"/> <!-- HLG lin range, colour to [0, 12], alpha to [0, 1] for f16 --> + <value name="invalid" value="32"/> </enum> <!--- @@ -261,22 +261,27 @@ <value name="rgba8ui" value="34"/> <value name="rg8ui" value="35"/> <value name="r8ui" value="36"/> - <value name="srgbx8" value="37" max_ver="33"/> - <value name="rgbx8" value="38" max_ver="33"/> - <value name="bstc" value="39" min_ver="41"/> - <value name="d32f" value="40" min_ver="41"/> - <value name="d24" value="41" min_ver="41"/> - <value name="d16" value="42" min_ver="41"/> - <value name="d24s8" value="43" min_ver="41"/> - <value name="s8" value="44" min_ver="41"/> - <value name="rgba5551" value="45" min_ver="41"/> - </enum> - - <enum name="Z/S Output Image Format" prefix="V3D_OUTPUT_IMAGE_FORMAT_ZS" max_ver="33"> - <value name="depth_component32f" value="0"/> - <value name="depth_component24" value="1"/> <!-- depth low, pad high --> - <value name="depth_component16" value="2"/> - <value name="depth24_stencil8" value="3"/> <!-- stencil low, depth high --> + <value name="bstc8" value="39"/> + <value name="d32f" value="40"/> + <value name="d24" value="41"/> + <value name="d16" value="42"/> + <value name="d24s8" value="43"/> + <value name="s8" value="44"/> + <value name="rgba5551" value="45"/> + <value name="bstc8_srgb" value="46" min_ver="71"/> + <value name="bstc10" value="47" min_ver="71"/> + <value name="bstc10_srgb" value="48" min_ver="71"/> + <value name="bstc10_pq" value="49" min_ver="71"/> + <value name="rgba10x6" value="50" min_ver="71"/> + <value name="bstc10_hlg" value="55" min_ver="71"/> + <value name="rgba10x6_hlg" value="56" min_ver="71"/> + <value name="rgb10_a2_hlg" value="57" min_ver="71"/> + <value name="bstc10_pq_bt1886" value="58" min_ver="71"/> + <value name="rgba10x6_pq_bt1886" value="59" min_ver="71"/> + <value name="rgb10_a2_pq_bt1886" value="60" min_ver="71"/> + <value name="bstc10_hlg_bt1886" value="61" min_ver="71"/> + <value name="rgba10x6_hlg_bt1886" value="62" min_ver="71"/> + <value name="rgb10_a2_hlg_bt1886" value="63" min_ver="71"/> </enum> <enum name="Dither Mode" prefix="V3D_DITHER_MODE"> @@ -299,7 +304,7 @@ <value name="packed complete patches" value="2"/> </enum> - <enum name="Primitve counters" prefix="V3D_PRIM_COUNTS"> + <enum name="Primitive counters" prefix="V3D_PRIM_COUNTS"> <value name="tf_words_buffer0" value="0"/> <value name="tf_words_buffer1" value="1"/> <value name="tf_words_buffer2" value="2"/> @@ -309,6 +314,17 @@ <value name="tf_overflow" value="6"/> </enum> + <enum 
name="Line Rasterization" prefix="V3D_LINE_RASTERIZATION"> + <value name="diamond exit" value="0"/> + <value name="perp end caps" value="1"/> + </enum> + + <enum name="Z Clip Mode" prefix="V3D_Z_CLIP_MODE"> + <value name="NONE" value="0"/> + <value name="MIN_ONE_TO_ONE" value="1"/> + <value name="ZERO_TO_ONE" value="2"/> + </enum> + <packet code="0" name="Halt"/> <packet code="1" name="NOP"/> <packet code="4" name="Flush"/> @@ -362,57 +378,18 @@ <field name="column number in supertiles" size="8" start="0" type="uint"/> </packet> - <packet code="24" shortname="store_subsample" name="Store Multi-Sample Resolved Tile Color Buffer" cl="R" max_ver="33"/> - - <packet code="25" shortname="store_subsample_ex" name="Store Multi-Sample Resolved Tile Color Buffer (extended)" cl="R" max_ver="33"> - <field name="Disable Color Buffer write" size="8" start="8" type="uint"/> - <field name="Enable Z write" size="1" start="7" type="bool"/> - <field name="Enable Stencil write" size="1" start="6" type="bool"/> - <!-- bit 5 unused --> - <field name="Disable Color buffer(s) clear on write" size="1" start="4" type="bool"/> - <field name="Disable Stencil buffer clear on write" size="1" start="3" type="bool"/> - <field name="Disable Z buffer clear on write" size="1" start="2" type="bool"/> - <field name="Disable fast opportunistic write out in multisample mode" size="1" start="1" type="bool"/> - <field name="Last Tile of Frame" size="1" start="0" type="bool"/> - </packet> - - <packet code="25" shortname="clear" name="Clear Tile Buffers" cl="R" min_ver="41"> + <packet code="25" shortname="clear" name="Clear Tile Buffers" cl="R" max_ver="42"> <field name="Clear Z/Stencil Buffer" size="1" start="1" type="bool"/> <field name="Clear all Render Targets" size="1" start="0" type="bool"/> </packet> - <packet code="26" shortname="load" name="Reload Tile Color Buffer" cl="R" max_ver="33"> - <field name="Disable Color Buffer load" size="8" start="8" type="uint"/> - <field name="Enable Z load" size="1" start="7" type="bool"/> - <field name="Enable Stencil load" size="1" start="6" type="bool"/> - </packet> + <packet code="25" shortname="clear_rt" name="Clear Render Targets" cl="R" min_ver="71"/> - <packet code="26" shortname="end_loads" name="End of Loads" cl="R" min_ver="41"/> + <packet code="26" shortname="end_loads" name="End of Loads" cl="R"/> <packet code="27" shortname="end_tile" name="End of Tile Marker" cl="R"/> - <packet code="29" shortname="store_general" name="Store Tile Buffer General" cl="R" max_ver="33"> - <field name="Address" size="24" start="24" type="address"/> - <field name="Padded height of output image in UIF blocks" size="13" start="11" type="uint"/> - <field name="XOR UIF" size="1" start="10" type="bool"/> - <field name="Last Tile of Frame" size="1" start="8" type="bool"/> - <field name="Disable Color buffer(s) clear on write" size="1" start="7" type="bool"/> - <field name="Disable Stencil buffer clear on write" size="1" start="6" type="bool"/> - <field name="Disable Z buffer clear on write" size="1" start="5" type="bool"/> - <field name="Raw Mode" size="1" start="4" type="bool"/> - <field name="Buffer to Store" size="4" start="0" type="uint"> - <value name="Render target 0" value="0"/> - <value name="Render target 1" value="1"/> - <value name="Render target 2" value="2"/> - <value name="Render target 3" value="3"/> - <value name="None" value="8"/> - <value name="Z" value="9"/> - <value name="Stencil" value="10"/> - <value name="Z+Stencil" value="11"/> - </field> - </packet> - - <packet code="29" 
shortname="store" name="Store Tile Buffer General" cl="R" min_ver="41"> + <packet code="29" shortname="store" name="Store Tile Buffer General" cl="R"> <field name="Address" size="32" start="64" type="address"/> <!-- used for y flip --> @@ -438,6 +415,10 @@ <value name="Render target 1" value="1"/> <value name="Render target 2" value="2"/> <value name="Render target 3" value="3"/> + <value name="Render target 4" value="4" min_ver="71"/> + <value name="Render target 5" value="5" min_ver="71"/> + <value name="Render target 6" value="6" min_ver="71"/> + <value name="Render target 7" value="7" min_ver="71"/> <value name="None" value="8"/> <value name="Z" value="9"/> <value name="Stencil" value="10"/> @@ -445,24 +426,7 @@ </field> </packet> - <packet code="30" shortname="load_general" name="Load Tile Buffer General" cl="R" max_ver="33"> - <field name="Address" size="24" start="24" type="address"/> - <field name="Padded height of output image in UIF blocks" size="13" start="11" type="uint"/> - <field name="XOR UIF" size="1" start="10" type="bool"/> - <field name="Raw Mode" size="1" start="4" type="bool"/> - <field name="Buffer to Load" size="4" start="0" type="uint"> - <value name="Render target 0" value="0"/> - <value name="Render target 1" value="1"/> - <value name="Render target 2" value="2"/> - <value name="Render target 3" value="3"/> - <value name="None" value="8"/> - <value name="Z" value="9"/> - <value name="Stencil" value="10"/> - <value name="Z+Stencil" value="11"/> - </field> - </packet> - - <packet code="30" shortname="load" name="Load Tile Buffer General" cl="R" min_ver="41"> + <packet code="30" shortname="load" name="Load Tile Buffer General" cl="R"> <field name="Address" size="32" start="64" type="address"/> <!-- used for y flip --> @@ -496,23 +460,7 @@ <packet code="31" shortname="tf_draw_flush_and_count" name="Transform Feedback Flush and Count"/> - <packet code="32" name="Indexed Prim List" cl="B" max_ver="33"> - <field name="Minimum index" size="32" start="104" type="uint"/> - <field name="Enable Primitive Restarts" size="1" start="103" type="bool"/> - <field name="Maximum index" size="31" start="72" type="uint"/> - <field name="Address of Indices List" size="32" start="40" type="address"/> - <field name="Length" size="32" start="8" type="uint"/> - - <field name="Index type" size="2" start="6" type="uint"> - <value name="Index type 8-bit" value="0"/> - <value name="Index type 16-bit" value="1"/> - <value name="Index type 32-bit" value="2"/> - </field> - - <field name="mode" size="5" start="0" type="Primitive"/> - </packet> - - <packet code="32" name="Indexed Prim List" cl="B" min_ver="41"> + <packet code="32" name="Indexed Prim List" cl="B"> <field name="Index Offset" size="32" start="40" type="uint"/> <field name="Enable Primitive Restarts" size="1" start="39" type="bool"/> @@ -527,23 +475,7 @@ <field name="mode" size="6" start="0" type="Primitive"/> </packet> - <packet code="33" name="Indirect Indexed Instanced Prim List" cl="B" max_ver="33"> - <field name="Stride in Multiples of 4 Bytes" size="8" start="104" type="uint"/> - <field name="Address of Indices List" size="32" start="72" type="address"/> - <field name="Address" size="32" start="40" type="address"/> - <field name="Enable Primitive Restarts" size="1" start="39" type="bool"/> - <field name="Number of Draw Indirect Indexed Records" size="31" start="8" type="uint"/> - - <field name="Index type" size="2" start="6" type="uint"> - <value name="Index type 8-bit" value="0"/> - <value name="Index type 16-bit" value="1"/> - 
<value name="Index type 32-bit" value="2"/> - </field> - - <field name="mode" size="6" start="0" type="Primitive"/> - </packet> - - <packet code="33" name="Indirect Indexed Instanced Prim List" cl="B" min_ver="41"> + <packet code="33" name="Indirect Indexed Instanced Prim List" cl="B"> <field name="Stride in Multiples of 4 Bytes" size="8" start="72" type="uint"/> <field name="Address" size="32" start="40" type="address"/> <field name="Enable Primitive Restarts" size="1" start="39" type="bool"/> @@ -558,23 +490,7 @@ <field name="mode" size="6" start="0" type="Primitive"/> </packet> - <packet code="34" name="Indexed Instanced Prim List" cl="B" max_ver="33"> - <field name="Enable Primitive Restarts" size="1" start="135" type="bool"/> - <field name="Maximum index" size="31" start="104" type="uint"/> - <field name="Address of Indices List" size="32" start="72" type="address"/> - <field name="Number of Instances" size="32" start="40" type="uint"/> - <field name="Instance Length" size="32" start="8" type="uint"/> - - <field name="Index type" size="2" start="6" type="uint"> - <value name="Index type 8-bit" value="0"/> - <value name="Index type 16-bit" value="1"/> - <value name="Index type 32-bit" value="2"/> - </field> - - <field name="mode" size="5" start="0" type="Primitive"/> - </packet> - - <packet code="34" name="Indexed Instanced Prim List" cl="B" min_ver="41"> + <packet code="34" name="Indexed Instanced Prim List" cl="B"> <field name="Index Offset" size="32" start="72" type="uint"/> <field name="Number of Instances" size="32" start="40" type="uint"/> <field name="Enable Primitive Restarts" size="1" start="39" type="bool"/> @@ -626,16 +542,16 @@ <field name="Base Vertex" size="32" start="0" type="uint"/> </packet> - <packet code="44" name="Index Buffer Setup" cl="B" min_ver="41"> + <packet code="44" name="Index Buffer Setup" cl="B"> <field name="Address" size="32" start="0" type="address"/> <field name="Size" size="32" start="32" type="uint"/> </packet> - <packet code="54" name="Set InstanceID" cl="B" min_ver="41"> + <packet code="54" name="Set InstanceID" cl="B"> <field name="Instance ID" size="32" start="0" type="uint"/> </packet> - <packet code="55" name="Set PrimitiveID" cl="B" min_ver="41"> + <packet code="55" name="Set PrimitiveID" cl="B"> <field name="Primitive ID" size="32" start="0" type="uint"/> </packet> @@ -662,22 +578,22 @@ <field name="number of attribute arrays" size="5" start="0" type="uint"/> </packet> - <packet code="65" shortname="gl_t_shader" name="GL Shader State including TS" min_ver="41"> + <packet code="65" shortname="gl_t_shader" name="GL Shader State including TS"> <field name="address" size="27" start="5" type="address"/> <field name="number of attribute arrays" size="5" start="0" type="uint"/> </packet> - <packet code="66" shortname="gl_g_shader" name="GL Shader State including GS" min_ver="41"> + <packet code="66" shortname="gl_g_shader" name="GL Shader State including GS"> <field name="address" size="27" start="5" type="address"/> <field name="number of attribute arrays" size="5" start="0" type="uint"/> </packet> - <packet code="67" shortname="gl_tg_shader" name="GL Shader State including TS/GS" min_ver="41"> + <packet code="67" shortname="gl_tg_shader" name="GL Shader State including TS/GS"> <field name="address" size="27" start="5" type="address"/> <field name="number of attribute arrays" size="5" start="0" type="uint"/> </packet> - <packet code="71" name="VCM Cache Size" min_ver="41"> + <packet code="71" name="VCM Cache Size"> <field name="Number of 16-vertex 
batches for rendering" size="4" start="4" type="uint"/> <field name="Number of 16-vertex batches for binning" size="4" start="0" type="uint"/> </packet> @@ -706,23 +622,13 @@ </field> </packet> - <packet code="73" name="VCM Cache Size" max_ver="33"> - <field name="Number of 16-vertex batches for rendering" size="4" start="4" type="uint"/> - <field name="Number of 16-vertex batches for binning" size="4" start="0" type="uint"/> - </packet> - - <packet code="73" name="Transform Feedback Buffer" min_ver="41"> + <packet code="73" name="Transform Feedback Buffer"> <field name="Buffer Address" size="32" start="32" type="address"/> <field name="Buffer Size in 32-bit words" size="30" start="2" type="uint"/> <field name="Buffer Number" size="2" start="0" type="uint"/> </packet> - <packet code="74" name="Transform Feedback Enable" max_ver="33"> - <field name="number of 32-bit Output Buffer Address following" size="3" start="8" type="uint"/> - <field name="number of 16-bit Output Data Specs following" size="5" start="11" type="uint"/> - </packet> - - <packet code="74" name="Transform Feedback Specs" min_ver="41"> + <packet code="74" name="Transform Feedback Specs"> <field name="Enable" size="1" start="7" type="bool"/> <field name="Number of 16-bit Output Data Specs following" size="5" start="0" type="uint"/> </packet> @@ -742,13 +648,7 @@ <field name="L2T Flush Start" size="32" start="0" type="address"/> </packet> - <struct name="Transform Feedback Output Data Spec" max_ver="33"> - <field name="First Shaded Vertex Value to output" size="8" start="0" type="uint"/> - <field name="Number of consecutive Vertex Values to output as 32-bit values" size="4" start="8" type="uint" minus_one="true"/> - <field name="Output Buffer to write to" size="2" start="12" type="uint"/> - </struct> - - <struct name="Transform Feedback Output Data Spec" min_ver="41"> + <struct name="Transform Feedback Output Data Spec"> <field name="First Shaded Vertex Value to output" size="8" start="0" type="uint"/> <field name="Number of consecutive Vertex Values to output as 32-bit values" size="4" start="8" type="uint" minus_one="true"/> <field name="Output Buffer to write to" size="2" start="12" type="uint"/> @@ -771,11 +671,12 @@ <field name="Stencil Ref Value" size="8" start="0" type="uint"/> </packet> - <packet code="83" name="Blend Enables" min_ver="41"> + <packet code="83" name="Blend Enables"> <field name="Mask" size="8" start="0" type="uint"/> </packet> - <packet code="84" name="Blend Cfg" max_ver="33"> + <packet code="84" name="Blend Cfg" max_ver="42"> + <field name="Render Target Mask" size="4" start="24" type="uint"/> <field name="Color blend dst factor" size="4" start="20" type="Blend Factor"/> <field name="Color blend src factor" size="4" start="16" type="Blend Factor"/> <field name="Color blend mode" size="4" start="12" type="Blend Mode"/> @@ -784,8 +685,8 @@ <field name="Alpha blend mode" size="4" start="0" type="Blend Mode"/> </packet> - <packet code="84" name="Blend Cfg" min_ver="41"> - <field name="Render Target Mask" size="4" start="24" type="uint"/> + <packet code="84" name="Blend Cfg" min_ver="71"> + <field name="Render Target Mask" size="8" start="24" type="uint"/> <field name="Color blend dst factor" size="4" start="20" type="Blend Factor"/> <field name="Color blend src factor" size="4" start="16" type="Blend Factor"/> <field name="Color blend mode" size="4" start="12" type="Blend Mode"/> @@ -805,16 +706,16 @@ <field name="Mask" size="32" start="0" type="uint"/> </packet> - <packet code="88" name="Zero All 
Centroid Flags" min_ver="41"/> + <packet code="88" name="Zero All Centroid Flags" /> - <packet code="89" name="Centroid Flags" min_ver="41"> + <packet code="89" name="Centroid Flags"> <field name="Centroid Flags for varyings V0*24" size="24" start="8" type="uint"/> <field name="Action for Centroid Flags of higher numbered varyings" size="2" start="6" type="Varying Flags Action"/> <field name="Action for Centroid Flags of lower numbered varyings" size="2" start="4" type="Varying Flags Action"/> <field name="Varying offset V0" size="4" start="0" type="uint"/> </packet> - <packet code="91" name="Sample State" min_ver="41"> + <packet code="91" name="Sample State"> <field name="Coverage" size="16" start="16" type="f187"/> <field name="Mask" size="4" start="0" type="uint"/> </packet> @@ -823,7 +724,12 @@ <field name="address" size="32" start="0" type="address"/> </packet> - <packet code="96" name="Cfg Bits"> + <packet code="93" name="Depth Bounds Test Limits" min_ver="71"> + <field name="Lower Test Limit" size="32" start="0" type="float"/> + <field name="Upper Test Limit" size="32" start="32" type="float"/> + </packet> + + <packet code="96" name="Cfg Bits" max_ver="42"> <field name="Direct3D Provoking Vertex" size="1" start="21" type="bool"/> <field name="Direct3D 'Point-fill' mode" size="1" start="20" type="bool"/> <field name="Blend enable" size="1" start="19" type="bool"/> @@ -834,7 +740,26 @@ <field name="Depth-Test Function" size="3" start="12" type="Compare Function"/> <field name="Direct3D Wireframe triangles mode" size="1" start="11" type="bool"/> <field name="Rasterizer Oversample Mode" size="2" start="6" type="uint"/> - <field name="Line Rasterization" size="2" start="4" type="uint"/> + <field name="Line Rasterization" size="2" start="4" type="Line Rasterization"/> + <field name="Enable Depth Offset" size="1" start="3" type="bool"/> + <field name="Clockwise Primitives" size="1" start="2" type="bool"/> + <field name="Enable Reverse Facing Primitive" size="1" start="1" type="bool"/> + <field name="Enable Forward Facing Primitive" size="1" start="0" type="bool"/> + </packet> + + <packet code="96" name="Cfg Bits" min_ver="71"> + <field name="Z Clipping mode" size="2" start="22" type="Z Clip Mode"/> + <field name="Direct3D Provoking Vertex" size="1" start="21" type="bool"/> + <field name="Direct3D 'Point-fill' mode" size="1" start="20" type="bool"/> + <field name="Blend enable" size="1" start="19" type="bool"/> + <field name="Stencil enable" size="1" start="18" type="bool"/> + <field name="Z updates enable" size="1" start="15" type="bool"/> + <field name="Depth-Test Function" size="3" start="12" type="Compare Function"/> + <field name="Direct3D Wireframe triangles mode" size="1" start="11" type="bool"/> + <field name="Z Clamp Mode" size="1" start="10" type="bool"/> + <field name="Rasterizer Oversample Mode" size="2" start="6" type="uint"/> + <field name="Depth Bounds Test Enable" size="1" start="5" type="bool"/> + <field name="Line Rasterization" size="1" start="4" type="uint"/> <field name="Enable Depth Offset" size="1" start="3" type="bool"/> <field name="Clockwise Primitives" size="1" start="2" type="bool"/> <field name="Enable Reverse Facing Primitive" size="1" start="1" type="bool"/> @@ -850,9 +775,9 @@ <field name="Varying offset V0" size="4" start="0" type="uint"/> </packet> - <packet code="99" shortname="zero_all_noperspective_flags" name="Zero All Non-perspective Flags" min_ver="41"/> + <packet code="99" shortname="zero_all_noperspective_flags" name="Zero All Non-perspective Flags" 
/> - <packet code="100" shortname="noperspective_flags" name="Non-perspective Flags" min_ver="41"> + <packet code="100" shortname="noperspective_flags" name="Non-perspective Flags"> <field name="Non-perspective Flags for varyings V0*24" size="24" start="8" type="uint"/> <field name="Action for Non-perspective Flags of higher numbered varyings" size="2" start="6" type="Varying Flags Action"/> <field name="Action for Non-perspective Flags of lower numbered varyings" size="2" start="4" type="Varying Flags Action"/> @@ -867,12 +792,7 @@ <field name="Line width" size="32" start="0" type="float"/> </packet> - <packet name="Depth Offset" code="106" max_ver="33"> - <field name="Depth Offset Units" size="16" start="16" type="f187"/> - <field name="Depth Offset Factor" size="16" start="0" type="f187"/> - </packet> - - <packet name="Depth Offset" code="106" min_ver="41"> + <packet name="Depth Offset" code="106"> <field name="Limit" size="32" start="32" type="float"/> <field name="Depth Offset Units" size="16" start="16" type="f187"/> <field name="Depth Offset Factor" size="16" start="0" type="f187"/> @@ -885,16 +805,11 @@ <field name="Clip Window Left Pixel Coordinate" size="16" start="0" type="uint"/> </packet> - <packet name="Viewport Offset" code="108" max_ver="33"> - <field name="Viewport Centre Y-coordinate" size="32" start="32" type="s24.8"/> - <field name="Viewport Centre X-coordinate" size="32" start="0" type="s24.8"/> - </packet> - - <packet name="Viewport Offset" code="108" min_ver="41"> - <field name="Coarse Y" size="10" start="54" type="uint"/> - <field name="Viewport Centre Y-coordinate" size="22" start="32" type="s14.8"/> - <field name="Coarse X" size="10" start="22" type="uint"/> - <field name="Viewport Centre X-coordinate" size="22" start="0" type="s14.8"/> + <packet name="Viewport Offset" code="108"> + <field name="Coarse Y" size="10" start="54" type="int"/> + <field name="Fine Y" size="22" start="32" type="u14.8"/> + <field name="Coarse X" size="10" start="22" type="int"/> + <field name="Fine X" size="22" start="0" type="u14.8"/> </packet> <packet shortname="clipz" name="Clipper Z min/max clipping planes" code="109"> @@ -902,31 +817,41 @@ <field name="Minimum Zw" size="32" start="0" type="float"/> </packet> - <packet shortname="clipper_xy" name="Clipper XY Scaling" code="110" cl="B"> + <packet shortname="clipper_xy" name="Clipper XY Scaling" code="110" cl="B" max_ver="42"> <field name="Viewport Half-Height in 1/256th of pixel" size="32" start="32" type="float"/> <field name="Viewport Half-Width in 1/256th of pixel" size="32" start="0" type="float"/> </packet> + <packet shortname="clipper_xy" name="Clipper XY Scaling" code="110" cl="B" min_ver="71"> + <field name="Viewport Half-Height in 1/64th of pixel" size="32" start="32" type="float"/> + <field name="Viewport Half-Width in 1/64th of pixel" size="32" start="0" type="float"/> + </packet> + <packet shortname="clipper_z" name="Clipper Z Scale and Offset" code="111" cl="B"> <field name="Viewport Z Offset (Zc to Zs)" size="32" start="32" type="float"/> <field name="Viewport Z Scale (Zc to Zs)" size="32" start="0" type="float"/> </packet> - <packet name="Number of Layers" code="119" min_ver="41"> + <packet shortname="clipper_z_no_guardband" name="Clipper Z Scale and Offset no guardband" code="112" cl="B" min_ver="71"> + <field name="Viewport Z Offset (Zc to Zs)" size="32" start="32" type="float"/> + <field name="Viewport Z Scale (Zc to Zs)" size="32" start="0" type="float"/> + </packet> + + <packet name="Number of Layers" code="119"> 
<field name="Number of Layers" size="8" start="0" type="uint" minus_one="true"/> </packet> - <packet code="120" name="Tile Binning Mode Cfg (Part1)" max_ver="33"> - <field name="Double-buffer in non-ms mode" size="1" start="63" type="bool"/> - <field name="Multisample Mode (4x)" size="1" start="62" type="bool"/> + <packet code="120" name="Tile Binning Mode Cfg" max_ver="42"> - <field name="Maximum BPP of all render targets" size="2" start="60" type="Internal BPP"/> + <field name="Height (in pixels)" size="16" start="48" type="uint" minus_one="true"/> + <field name="Width (in pixels)" size="16" start="32" type="uint" minus_one="true"/> - <field name="Number of Render Targets" size="4" start="56" type="uint"/> - <field name="Height (in tiles)" size="12" start="44" type="uint"/> - <field name="Width (in tiles)" size="12" start="32" type="uint"/> + <field name="Double-buffer in non-ms mode" size="1" start="15" type="bool"/> + <field name="Multisample Mode (4x)" size="1" start="14" type="bool"/> - <field name="Tile State Data Array Base Address" size="26" start="6" type="address"/> + <field name="Maximum BPP of all render targets" size="2" start="12" type="Internal BPP"/> + + <field name="Number of Render Targets" size="4" start="8" type="uint" minus_one="true"/> <field name="tile allocation block size" size="2" start="4" type="uint"> <value name="tile allocation block size 64b" value="0"/> @@ -938,21 +863,24 @@ <value name="tile allocation initial block size 128b" value="1"/> <value name="tile allocation initial block size 256b" value="2"/> </field> - <field name="auto-initialize tile state data array" size="1" start="1" type="bool" default="1"/> - <field name="sub-id" size="1" start="0" type="uint" default="0"/> </packet> - <packet code="120" name="Tile Binning Mode Cfg" min_ver="41"> - + <packet code="120" name="Tile Binning Mode Cfg" min_ver="71"> <field name="Height (in pixels)" size="16" start="48" type="uint" minus_one="true"/> <field name="Width (in pixels)" size="16" start="32" type="uint" minus_one="true"/> - <field name="Double-buffer in non-ms mode" size="1" start="15" type="bool"/> - <field name="Multisample Mode (4x)" size="1" start="14" type="bool"/> - - <field name="Maximum BPP of all render targets" size="2" start="12" type="Internal BPP"/> - - <field name="Number of Render Targets" size="4" start="8" type="uint" minus_one="true"/> + <field name="Log2 Tile Height" size="3" start="11" type="uint"> + <value name="tile height 8 pixels" value="0"/> + <value name="tile height 16 pixels" value="1"/> + <value name="tile height 32 pixels" value="2"/> + <value name="tile height 64 pixels" value="3"/> + </field> + <field name="Log2 Tile Width" size="3" start="8" type="uint"> + <value name="tile width 8 pixels" value="0"/> + <value name="tile width 16 pixels" value="1"/> + <value name="tile width 32 pixels" value="2"/> + <value name="tile width 64 pixels" value="3"/> + </field> <field name="tile allocation block size" size="2" start="4" type="uint"> <value name="tile allocation block size 64b" value="0"/> @@ -966,17 +894,11 @@ </field> </packet> - <packet code="120" name="Tile Binning Mode Cfg (Part2)" cl="B" max_ver="33"> - <field name="Tile Allocation Memory Address" size="32" start="32" type="address"/> - <field name="Tile Allocation Memory Size" size="32" start="0" type="uint"/> - - <field name="sub-id" size="1" start="0" type="uint" default="1"/> - </packet> + <packet code="121" name="Tile Rendering Mode Cfg (Common)" cl="R" max_ver="42"> + <field name="Pad" size="12" start="52" 
type="uint"/> - <packet code="121" name="Tile Rendering Mode Cfg (Common)" cl="R" max_ver="33"> - <field name="Disable Render Target Stores" size="8" start="56" type="uint"/> - <field name="Enable Z Store" size="1" start="55" type="bool"/> - <field name="Enable Stencil Store" size="1" start="54" type="bool"/> + <field name="Early Depth/Stencil Clear" size="1" start="51" type="bool"/> + <field name="Internal Depth Type" size="4" start="47" type="Internal Depth Type"/> <field name="Early-Z disable" size="1" start="46" type="bool"/> @@ -988,7 +910,11 @@ <field name="Double-buffer in non-ms mode" size="1" start="43" type="bool"/> <field name="Multisample Mode (4x)" size="1" start="42" type="bool"/> - <field name="Maximum BPP of all render targets" size="2" start="40" type="Internal BPP"/> + <field name="Maximum BPP of all render targets" size="2" start="40" type="Internal BPP"> + <value name="Render target maximum 32bpp" value="0"/> + <value name="Render target maximum 64bpp" value="1"/> + <value name="Render target maximum 128bpp" value="2"/> + </field> <field name="Image Height (pixels)" size="16" start="24" type="uint"/> <field name="Image Width (pixels)" size="16" start="8" type="uint"/> @@ -997,8 +923,21 @@ <field name="sub-id" size="4" start="0" type="uint" default="0"/> </packet> - <packet code="121" name="Tile Rendering Mode Cfg (Common)" cl="R" min_ver="41"> - <field name="Pad" size="12" start="52" type="uint"/> + <packet code="121" name="Tile Rendering Mode Cfg (Common)" cl="R" min_ver="71"> + <field name="Pad" size="6" start="58" type="uint"/> + + <field name="Log2 Tile Height" size="3" start="55" type="uint"> + <value name="tile height 8 pixels" value="0"/> + <value name="tile height 16 pixels" value="1"/> + <value name="tile height 32 pixels" value="2"/> + <value name="tile height 64 pixels" value="3"/> + </field> + <field name="Log2 Tile Width" size="3" start="52" type="uint"> + <value name="tile width 8 pixels" value="0"/> + <value name="tile width 16 pixels" value="1"/> + <value name="tile width 32 pixels" value="2"/> + <value name="tile width 64 pixels" value="3"/> + </field> <field name="Early Depth/Stencil Clear" size="1" start="51" type="bool"/> <field name="Internal Depth Type" size="4" start="47" type="Internal Depth Type"/> @@ -1010,40 +949,18 @@ <value name="Early-Z direction GT/GE" value="1"/> </field> + <field name="Depth-buffer disable" size="1" start="44" type="bool"/> <field name="Double-buffer in non-ms mode" size="1" start="43" type="bool"/> <field name="Multisample Mode (4x)" size="1" start="42" type="bool"/> - <field name="Maximum BPP of all render targets" size="2" start="40" type="Internal BPP"/> - <field name="Image Height (pixels)" size="16" start="24" type="uint"/> <field name="Image Width (pixels)" size="16" start="8" type="uint"/> <field name="Number of Render Targets" size="4" start="4" type="uint" minus_one="true"/> - <field name="sub-id" size="4" start="0" type="uint" default="0"/> + <field name="sub-id" size="3" start="0" type="uint" default="0"/> </packet> - <packet code="121" name="Tile Rendering Mode Cfg (Color)" cl="R" max_ver="33"> - <field name="Address" size="32" start="32" type="address"/> - - <field name="Pad" size="4" start="28" type="uint"/> - - <field name="Flip Y" size="1" start="27" type="bool"/> - - <field name="Memory Format" size="3" start="24" type="Memory Format"/> - - <field name="Dither Mode" size="2" start="22" type="Dither Mode"/> - - <field name="Output image format" size="6" start="16" type="Output Image Format"/> - - <field 
name="Decimate mode" size="2" start="14" type="Decimate Mode"/> - - <field name="Internal Type" size="4" start="10" type="Internal Type"/> - <field name="Internal BPP" size="2" start="8" type="Internal BPP"/> - <field name="Render Target Number" size="4" start="4" type="uint"/> - <field name="sub-id" size="4" start="0" type="uint" default="2"/> - </packet> - - <packet code="121" name="Tile Rendering Mode Cfg (Color)" cl="R" min_ver="41"> + <packet code="121" name="Tile Rendering Mode Cfg (Color)" cl="R" max_ver="42"> <field name="Pad" size="28" start="36" type="uint"/> @@ -1066,53 +983,25 @@ <field name="sub-id" size="4" start="0" type="uint" default="1"/> </packet> - <packet code="121" name="Tile Rendering Mode Cfg (Z/Stencil)" cl="R" max_ver="33"> - <field name="Address" size="26" start="38" type="address"/> - - <field name="Padded height of output image in UIF blocks" size="13" start="25" type="uint"/> - - <field name="Memory Format" size="3" start="22" type="Memory Format"/> - - <field name="Output image format" size="6" start="16" type="Z/S Output Image Format"/> - - <field name="Decimate mode" size="2" start="14" type="uint"/> - - <field name="Internal Type" size="4" start="10" type="Internal Depth Type"/> - - <field name="Internal BPP (ignored)" size="2" start="8" type="uint"/> - <!-- selects between Z/Stencil config packet and Separate Stencil packet. --> - <field name="Z/Stencil ID" size="4" start="4" type="uint" default="0"/> - <field name="sub-id" size="4" start="0" type="uint" default="1"/> - </packet> - - <packet code="121" name="Tile Rendering Mode Cfg (ZS Clear Values)" cl="R" max_ver="33"> + <packet code="121" name="Tile Rendering Mode Cfg (ZS Clear Values)" cl="R" max_ver="42"> <field name="unused" size="16" start="48" type="uint"/> <field name="Z Clear Value" size="32" start="16" type="float"/> <field name="Stencil Clear Value" size="8" start="8" type="uint"/> - <field name="sub-id" size="4" start="0" type="uint" default="3"/> + <field name="sub-id" size="4" start="0" type="uint" default="2"/> </packet> - <packet code="121" name="Tile Rendering Mode Cfg (ZS Clear Values)" cl="R" min_ver="41"> + <packet code="121" name="Tile Rendering Mode Cfg (ZS Clear Values)" cl="R" min_ver="71"> <field name="unused" size="16" start="48" type="uint"/> <field name="Z Clear Value" size="32" start="16" type="float"/> <field name="Stencil Clear Value" size="8" start="8" type="uint"/> - <field name="sub-id" size="4" start="0" type="uint" default="2"/> - </packet> - - <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part1)" cl="R" max_ver="33"> - <!-- Express this as a 56-bit field? --> - <field name="Clear Color next 24 bits" size="24" start="40" type="uint"/> - <field name="Clear Color low 32 bits" size="32" start="8" type="uint"/> - - <field name="Render Target number" size="4" start="4" type="uint"/> - <field name="sub-id" size="4" start="0" type="uint" default="4"/> + <field name="sub-id" size="4" start="0" type="uint" default="1"/> </packet> - <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part1)" cl="R" min_ver="41"> + <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part1)" cl="R" max_ver="42"> <!-- Express this as a 56-bit field? 
--> <field name="Clear Color next 24 bits" size="24" start="40" type="uint"/> <field name="Clear Color low 32 bits" size="32" start="8" type="uint"/> @@ -1121,16 +1010,20 @@ <field name="sub-id" size="4" start="0" type="uint" default="3"/> </packet> - <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part2)" cl="R" max_ver="33"> - <!-- Express this as a 56-bit field? --> - <field name="Clear Color mid-high 24 bits" size="24" start="40" type="uint"/> - <field name="Clear Color mid-low 32 bits" size="32" start="8" type="uint"/> + <packet code="121" name="Tile Rendering Mode Cfg (Render Target Part1)" cl="R" min_ver="71"> - <field name="Render Target number" size="4" start="4" type="uint"/> - <field name="sub-id" size="4" start="0" type="uint" default="5"/> + <field name="Clear Color low bits" size="32" start="32" type="uint"/> + <field name="Internal Type and Clamping" size="5" start="27" type="Render Target Type Clamp"/> + <field name="Internal BPP" size="2" start="25" type="Internal BPP"/> + + <field name="Stride" size="7" start="18" type="uint" minus_one="true"/> + <!-- In multiples of 512 bits --> + <field name="Base Address" size="11" start="7" type="uint"/> + <field name="Render Target number" size="3" start="3" type="uint"/> + <field name="sub-id" size="3" start="0" type="uint" default="2"/> </packet> - <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part2)" cl="R" min_ver="41"> + <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part2)" cl="R" max_ver="42"> <!-- Express this as a 56-bit field? --> <field name="Clear Color mid-high 24 bits" size="24" start="40" type="uint"/> <field name="Clear Color mid-low 32 bits" size="32" start="8" type="uint"/> @@ -1139,18 +1032,14 @@ <field name="sub-id" size="4" start="0" type="uint" default="4"/> </packet> - <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part3)" cl="R" max_ver="33"> - <field name="pad" size="11" start="53" type="uint"/> - <field name="UIF padded height in UIF blocks" size="13" start="40" type="uint"/> - <!-- image height is for Y flipping --> - <field name="Raster Row Stride or Image Height in Pixels" size="16" start="24" type="uint"/> - <field name="Clear Color high 16 bits" size="16" start="8" type="uint"/> + <packet code="121" name="Tile Rendering Mode Cfg (Render Target Part2)" cl="R" min_ver="71"> + <field name="Clear Color mid bits" size="40" start="24" type="uint"/> - <field name="Render Target number" size="4" start="4" type="uint"/> - <field name="sub-id" size="4" start="0" type="uint" default="6"/> + <field name="Render Target number" size="3" start="3" type="uint"/> + <field name="sub-id" size="3" start="0" type="uint" default="3"/> </packet> - <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part3)" cl="R" min_ver="41"> + <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part3)" cl="R" max_ver="42"> <field name="pad" size="11" start="53" type="uint"/> <field name="UIF padded height in UIF blocks" size="13" start="40" type="uint"/> <!-- image height is for Y flipping --> @@ -1161,6 +1050,13 @@ <field name="sub-id" size="4" start="0" type="uint" default="5"/> </packet> + <packet code="121" name="Tile Rendering Mode Cfg (Render Target Part3)" cl="R" min_ver="71"> + <field name="Clear Color top bits" size="56" start="8" type="uint"/> + + <field name="Render Target number" size="3" start="3" type="uint"/> + <field name="sub-id" size="3" start="0" type="uint" default="4"/> + </packet> + <packet code="124" shortname="tile_coords" 
name="Tile Coordinates"> <field name="tile row number" size="12" start="12" type="uint"/> <field name="tile column number" size="12" start="0" type="uint"/> @@ -1199,43 +1095,7 @@ </field> </packet> - <struct name="GL Shader State Record" max_ver="33"> - <field name="Point size in shaded vertex data" size="1" start="0" type="bool"/> - <field name="Enable clipping" size="1" start="1" type="bool"/> - <field name="Vertex ID read by coordinate shader" size="1" start="2" type="bool"/> - <field name="Instance ID read by coordinate shader" size="1" start="3" type="bool"/> - <field name="Vertex ID read by vertex shader" size="1" start="4" type="bool"/> - <field name="Instance ID read by vertex shader" size="1" start="5" type="bool"/> - <field name="Fragment shader does Z writes" size="1" start="6" type="bool"/> - <field name="Turn off early-z test" size="1" start="7" type="bool"/> - <field name="Coordinate shader has separate input and output VPM blocks" size="1" start="8" type="bool"/> - <field name="Vertex shader has separate input and output VPM blocks" size="1" start="9" type="bool"/> - <field name="Fragment shader uses real pixel centre W in addition to centroid W2" size="1" start="10" type="bool"/> - - <field name="Number of varyings in Fragment Shader" size="8" start="2b" type="uint"/> - <field name="Coordinate Shader output VPM segment size" size="8" start="4b" type="uint"/> - <field name="Coordinate Shader input VPM segment size" size="8" start="5b" type="uint"/> - <field name="Vertex Shader output VPM segment size" size="8" start="6b" type="uint"/> - <field name="Vertex Shader input VPM segment size" size="8" start="7b" type="uint"/> - <field name="Address of default attribute values" size="32" start="8b" type="address"/> - <field name="Fragment Shader Code Address" size="29" start="99" type="address"/> - <field name="Fragment Shader 2-way threadable" size="1" start="96" type="bool"/> - <field name="Fragment Shader 4-way threadable" size="1" start="97" type="bool"/> - <field name="Fragment Shader Propagate NaNs" size="1" start="98" type="bool"/> - <field name="Fragment Shader Uniforms Address" size="32" start="16b" type="address"/> - <field name="Vertex Shader Code Address" size="32" start="20b" type="address"/> - <field name="Vertex Shader 2-way threadable" size="1" start="160" type="bool"/> - <field name="Vertex Shader 4-way threadable" size="1" start="161" type="bool"/> - <field name="Vertex Shader Propagate NaNs" size="1" start="162" type="bool"/> - <field name="Vertex Shader Uniforms Address" size="32" start="24b" type="address"/> - <field name="Coordinate Shader Code Address" size="32" start="28b" type="address"/> - <field name="Coordinate Shader 2-way threadable" size="1" start="224" type="bool"/> - <field name="Coordinate Shader 4-way threadable" size="1" start="225" type="bool"/> - <field name="Coordinate Shader Propagate NaNs" size="1" start="226" type="bool"/> - <field name="Coordinate Shader Uniforms Address" size="32" start="32b" type="address"/> - </struct> - - <struct name="GL Shader State Record" min_ver="41"> + <struct name="GL Shader State Record" max_ver="42"> <field name="Point size in shaded vertex data" size="1" start="0" type="bool"/> <field name="Enable clipping" size="1" start="1" type="bool"/> @@ -1294,7 +1154,64 @@ <field name="Coordinate Shader Uniforms Address" size="32" start="32b" type="address"/> </struct> - <struct name="Geometry Shader State Record" min_ver="41"> + <struct name="GL Shader State Record" min_ver="71"> + <field name="Point size in shaded 
vertex data" size="1" start="0" type="bool"/> + <field name="Enable clipping" size="1" start="1" type="bool"/> + + <field name="Vertex ID read by coordinate shader" size="1" start="2" type="bool"/> + <field name="Instance ID read by coordinate shader" size="1" start="3" type="bool"/> + <field name="Base Instance ID read by coordinate shader" size="1" start="4" type="bool"/> + <field name="Vertex ID read by vertex shader" size="1" start="5" type="bool"/> + <field name="Instance ID read by vertex shader" size="1" start="6" type="bool"/> + <field name="Base Instance ID read by vertex shader" size="1" start="7" type="bool"/> + + <field name="Fragment shader does Z writes" size="1" start="8" type="bool"/> + <field name="Turn off early-z test" size="1" start="9" type="bool"/> + + <field name="Fragment shader uses real pixel centre W in addition to centroid W2" size="1" start="12" type="bool"/> + <field name="Enable Sample Rate Shading" size="1" start="13" type="bool"/> + <field name="Any shader reads hardware-written Primitive ID" size="1" start="14" type="bool"/> + <field name="Insert Primitive ID as first varying to fragment shader" size="1" start="15" type="bool"/> + <field name="Turn off scoreboard" size="1" start="16" type="bool"/> + <field name="Do scoreboard wait on first thread switch" size="1" start="17" type="bool"/> + <field name="Disable implicit point/line varyings" size="1" start="18" type="bool"/> + <field name="No prim pack" size="1" start="19" type="bool"/> + <field name="Never defer FEP depth writes" size="1" start="20" type="bool"/> + + <field name="Number of varyings in Fragment Shader" size="8" start="3b" type="uint"/> + + <field name="Coordinate Shader output VPM segment size" size="4" start="4b" type="uint"/> + <field name="Min Coord Shader output segments required in play in addition to VCM cache size" size="4" start="36" type="uint"/> + + <field name="Coordinate Shader input VPM segment size" size="4" start="5b" type="uint"/> + <field name="Min Coord Shader input segments required in play" size="4" start="44" type="uint" minus_one="true"/> + + <field name="Vertex Shader output VPM segment size" size="4" start="6b" type="uint"/> + <field name="Min Vertex Shader output segments required in play in addition to VCM cache size" size="4" start="52" type="uint"/> + + <field name="Vertex Shader input VPM segment size" size="4" start="7b" type="uint"/> + <field name="Min Vertex Shader input segments required in play" size="4" start="60" type="uint" minus_one="true"/> + + <field name="Fragment Shader Code Address" size="29" start="67" type="address"/> + <field name="Fragment Shader 4-way threadable" size="1" start="64" type="bool"/> + <field name="Fragment Shader start in final thread section" size="1" start="65" type="bool"/> + <field name="Fragment Shader Propagate NaNs" size="1" start="66" type="bool"/> + <field name="Fragment Shader Uniforms Address" size="32" start="12b" type="address"/> + + <field name="Vertex Shader Code Address" size="29" start="131" type="address"/> + <field name="Vertex Shader 4-way threadable" size="1" start="128" type="bool"/> + <field name="Vertex Shader start in final thread section" size="1" start="129" type="bool"/> + <field name="Vertex Shader Propagate NaNs" size="1" start="130" type="bool"/> + <field name="Vertex Shader Uniforms Address" size="32" start="20b" type="address"/> + + <field name="Coordinate Shader Code Address" size="29" start="195" type="address"/> + <field name="Coordinate Shader 4-way threadable" size="1" start="192" 
type="bool"/> + <field name="Coordinate Shader start in final thread section" size="1" start="193" type="bool"/> + <field name="Coordinate Shader Propagate NaNs" size="1" start="194" type="bool"/> + <field name="Coordinate Shader Uniforms Address" size="32" start="28b" type="address"/> + </struct> + + <struct name="Geometry Shader State Record"> <field name="Geometry Bin Mode Shader Code Address" size="29" start="3" type="address"/> <field name="Geometry Bin Mode Shader 4-way threadable" size="1" start="0" type="bool"/> <field name="Geometry Bin Mode Shader Start in final thread section" size="1" start="1" type="bool"/> @@ -1307,7 +1224,7 @@ <field name="Geometry Render Mode Shader Uniforms Address" size="32" start="12b" type="address"/> </struct> - <struct name="Tessellation Shader State Record" min_ver="41"> + <struct name="Tessellation Shader State Record"> <field name="Tessellation Bin Mode Control Shader Code Address" size="29" start="3" type="address"/> <field name="Tessellation Bin Mode Control Shader 4-way threadable" size="1" start="0" type="bool"/> <field name="Tessellation Bin Mode Control Shader Start in final thread section" size="1" start="1" type="bool"/> @@ -1331,7 +1248,7 @@ <field name="Tessellation Render Mode Evaluation Shader Uniforms Address" size="32" start="28b" type="address"/> </struct> - <struct name="Tessellation/Geometry Common Params" min_ver="41"> + <struct name="Tessellation/Geometry Common Params"> <field name="Tessellation Type" size="2" start="1" type="uint"> <value name="Tessellation Type Triangle" value="0"/> <value name="Tessellation Type Quads" value="1"/> @@ -1391,31 +1308,7 @@ <field name="GBG min GS output segments required in play" size="3" start="59" type="uint" minus_one="true"/> </struct> - <struct name="GL Shader State Attribute Record" max_ver="33"> - <field name="Address" size="32" start="0" type="address"/> - - <field name="Vec size" size="2" start="32" type="uint"/> - <field name="Type" size="3" start="34" type="uint"> - <value name="Attribute half-float" value="1"/> - <value name="Attribute float" value="2"/> - <value name="Attribute fixed" value="3"/> - <value name="Attribute byte" value="4"/> - <value name="Attribute short" value="5"/> - <value name="Attribute int" value="6"/> - <value name="Attribute int2_10_10_10" value="7"/> - </field> - <field name="Signed int type" size="1" start="37" type="bool"/> - <field name="Normalized int type" size="1" start="38" type="bool"/> - <field name="Read as int/uint" size="1" start="39" type="bool"/> - - <field name="Number of values read by Coordinate shader" size="4" start="40" type="uint"/> - <field name="Number of values read by Vertex shader" size="4" start="44" type="uint"/> - - <field name="Instance Divisor" size="16" start="6b" type="uint"/> - <field name="Stride" size="32" start="8b" type="uint"/> - </struct> - - <struct name="GL Shader State Attribute Record" min_ver="41"> + <struct name="GL Shader State Attribute Record"> <field name="Address" size="32" start="0" type="address"/> <field name="Vec size" size="2" start="32" type="uint"/> @@ -1476,55 +1369,19 @@ <field name="addr" size="13" start="0" type="uint"/> </struct> - <struct name="Texture Uniform Parameter 0 CFG_MODE=1" max_ver="33"> - <field name="Per-pixel mask enable" size="1" start="31" type="bool"/> - - <field name="Texel offset for r coordinate" size="4" start="27" type="int"/> - <field name="Texel offset for t coordinate" size="4" start="23" type="int"/> - <field name="Texel offset for s coordinate" size="4" start="19" 
type="int"/> - - <field name="R Wrap Mode" size="3" start="16" type="Wrap Mode"/> - <field name="T Wrap Mode" size="3" start="13" type="Wrap Mode"/> - <field name="S Wrap Mode" size="3" start="10" type="Wrap Mode"/> - - <field name="New configuration mode" size="1" start="9" type="bool" default="1"/> - - <field name="Shadow" size="1" start="8" type="bool"/> - <field name="Coefficient lookup mode" size="1" start="7" type="bool"/> - <field name="Disable AutoLOD, use bias only" size="1" start="6" type="bool"/> - <field name="Bias supplied" size="1" start="5" type="bool"/> - <field name="Gather sample mode" size="1" start="4" type="bool"/> - <field name="Fetch sample mode" size="1" start="3" type="bool"/> - - <field name="Lookup Type" size="3" start="0" type="uint"> - <value name="Texture 2D" value="0"/> - <value name="Texture 2D array" value="1"/> - <value name="Texture 3D" value="2"/> - <value name="Texture Cube Map" value="3"/> - <value name="Texture 1D" value="4"/> - <value name="Texture 1D Array" value="5"/> - <value name="Texture Child Image" value="6"/> - </field> - </struct> - - <struct name="Texture Uniform Parameter 1 CFG_MODE=1" max_ver="33"> - <field name="Texture state record base address" size="28" start="4" type="address"/> - <field name="Return words of texture data" size="4" start="0" type="uint"/> - </struct> - - <struct name="TMU Config Parameter 0" min_ver="41"> + <struct name="TMU Config Parameter 0"> <field name="Texture state address" size="32" start="0" type="address"/> <field name="Return words of texture data" size="4" start="0" type="uint"/> </struct> - <struct name="TMU Config Parameter 1" min_ver="41"> + <struct name="TMU Config Parameter 1"> <field name="Sampler state address" size="32" start="0" type="address"/> <field name="Per-pixel mask enable" size="1" start="2" type="bool"/> <field name="Unnormalized coordinates" size="1" start="1" type="bool"/> <field name="Output Type 32-bit" size="1" start="0" type="bool"/> </struct> - <struct name="TMU Config Parameter 2" min_ver="41" max_ver="41"> + <struct name="TMU Config Parameter 2" max_ver="41"> <field name="Pad" size="8" start="24" type="uint"/> <field name="Op" size="4" start="20" type="TMU Op"/> <field name="Offset R" size="4" start="16" type="int"/> @@ -1538,7 +1395,7 @@ <field name="Offset Format 8" size="1" start="0" type="bool"/> </struct> - <struct name="TMU Config Parameter 2" min_ver="42"> + <struct name="TMU Config Parameter 2" min_ver="42" max_ver="42"> <field name="Pad" size="7" start="25" type="uint"/> <field name="LOD Query" size="1" start="24" type="bool"/> <field name="Op" size="4" start="20" type="TMU Op"/> @@ -1553,30 +1410,34 @@ <field name="Offset Format 8" size="1" start="0" type="bool"/> </struct> - <struct name="Texture Shader State" max_ver="33"> - <field name="UIF XOR disable" size="1" start="255" type="bool"/> - <field name="Level 0 is strictly UIF" size="1" start="254" type="bool"/> - <field name="Level 0 XOR enable" size="1" start="252" type="bool"/> - <field name="Level 0 UB_PAD" size="4" start="248" type="uint"/> - <field name="Output 32-bit" size="1" start="246" type="bool"/> - <field name="Sample Number" size="2" start="244" type="uint"/> - - <field name="Base Level" size="4" start="240" type="uint"/> - <field name="Fixed Bias" size="16" start="224" type="s8.8"/> - <field name="Max Level-of-Detail" size="16" start="208" type="s8.8"/> - <field name="Min Level-of-Detail" size="16" start="192" type="s8.8"/> - - <field name="Border Color alpha" size="16" start="176" type="uint"/> - 
<field name="Border Color blue" size="16" start="160" type="uint"/> - <field name="Border Color green" size="16" start="144" type="uint"/> - <field name="Border Color red" size="16" start="128" type="uint"/> - - <field name="Flip S and T on incoming request" size="1" start="127" type="bool"/> - <field name="Flip ETC Y" size="1" start="126" type="bool" default="1"/> - <field name="Flip texture Y Axis" size="1" start="125" type="bool"/> - <field name="Flip texture X Axis" size="1" start="124" type="bool"/> - - <field name="Swizzle A" size="3" start="121" type="uint"> + <struct name="TMU Config Parameter 2" min_ver="71"> + <field name="Pad" size="5" start="27" type="uint"/> + <field name="Write conversion" size="1" start="26" type="bool"/> + <field name="DIM query" size="1" start="25" type="bool"/> + <field name="LOD Query" size="1" start="24" type="bool"/> + <field name="Op" size="4" start="20" type="TMU Op"/> + <field name="Offset R" size="4" start="16" type="int"/> + <field name="Offset T" size="4" start="12" type="int"/> + <field name="Offset S" size="4" start="8" type="int"/> + <field name="Gather Mode" size="1" start="7" type="bool"/> + <field name="Gather Component" size="2" start="5" type="uint"/> + <field name="Coefficient Mode" size="1" start="4" type="bool"/> + <field name="Sample Number" size="2" start="2" type="uint"/> + <field name="Disable AutoLOD" size="1" start="1" type="bool"/> + <field name="Offset Format 8" size="1" start="0" type="bool"/> + </struct> + + <struct name="Texture Shader State" max_ver="42"> + <field name="Pad" size="56" start="136" type="uint"/> + <field name="UIF XOR disable" size="1" start="135" type="bool"/> + <field name="Level 0 is strictly UIF" size="1" start="134" type="bool"/> + <field name="Level 0 XOR enable" size="1" start="132" type="bool"/> + <field name="Level 0 UB_PAD" size="4" start="128" type="uint"/> + + <field name="Base Level" size="4" start="124" type="uint"/> + <field name="Max Level" size="4" start="120" type="uint"/> + + <field name="Swizzle A" size="3" start="117" type="uint"> <value name="Swizzle Zero" value="0"/> <value name="Swizzle One" value="1"/> <value name="Swizzle Red" value="2"/> @@ -1585,29 +1446,54 @@ <value name="Swizzle Alpha" value="5"/> </field> - <field name="Swizzle B" size="3" start="118" type="uint"/> - <field name="Swizzle G" size="3" start="115" type="uint"/> - <field name="Swizzle R" size="3" start="112" type="uint"/> - - <field name="Depth Compare Function" size="3" start="109" type="Compare Function"/> - - <field name="sRGB" size="1" start="107" type="bool"/> + <field name="Swizzle B" size="3" start="114" type="uint"/> + <field name="Swizzle G" size="3" start="111" type="uint"/> + <field name="Swizzle R" size="3" start="108" type="uint"/> + <field name="Extended" size="1" start="107" type="bool"/> <field name="Texture type" size="7" start="100" type="uint"/> - <field name="Image Depth" size="14" start="86" type="uint"/> <field name="Image Height" size="14" start="72" type="uint"/> <field name="Image Width" size="14" start="58" type="uint"/> <field name="Array Stride (64-byte aligned)" size="26" start="32" type="uint"/> - <field name="Texture base pointer" size="30" start="2" type="address"/> + <field name="Texture base pointer" size="32" start="0" type="address"/> - <field name="Filter" size="4" start="0" type="TMU Filter"/> + <field name="Reverse Standard Border Color" size="1" start="5" type="bool"/> + <field name="AHDR" size="1" start="4" type="bool"/> + <field name="sRGB" size="1" start="3" type="bool"/> + 
<field name="Flip S and T on incoming request" size="1" start="2" type="bool"/> + <field name="Flip texture Y Axis" size="1" start="1" type="bool"/> + <field name="Flip texture X Axis" size="1" start="0" type="bool"/> </struct> - <struct name="Texture Shader State" min_ver="41"> - <field name="Pad" size="56" start="136" type="uint"/> + <struct name="Texture Shader State" min_ver="71"> + <field name="Pad" size="2" start="190" type="uint"/> + <!-- When we use an address type, there is an implicit requirement + that the address is a 32-bit that is encoded starting at a 32-bit + aligned bit offset into the packet. If the address field has less than + 32 bits, it is assumed that the address is aligned. For example, a + 26-bit address field is expected to be 64-byte aligned (6 lsb bits + are 0) and that this will be encoded into a packet starting at bit + offset 6 into a 32-bit dword (since bits 0..5 of the address are + implicitly 0 and don't need to be explicitly encoded). + + Unfortunately, the CB address below doesn't match this requirement: + it starts at bit 138, which is 10 bits into a 32-bit dword, but it + represents a 64-bit aligned address (6 lsb bits are 0), so we cannot + encode it as an address type. To fix this we encode these addresses + as uint types which has two implications: + 1. the driver is responsible for manually addinng the buffer objects + for these addresses to the job BO list. + 2. the driver needs to pass an actual 26-bit address value by manually + shifting the 6 lsb bits (that are implicitly 0). + --> + <field name="texture_base pointer_Cr" size="26" start="164" type="uint"/> + <field name="texture base pointer Cb" size="26" start="138" type="uint"/> + <field name="Chroma offset y" size="1" start="137" type="uint"/> + <field name="Chroma offset x" size="1" start="136" type="uint"/> + <field name="UIF XOR disable" size="1" start="135" type="bool"/> <field name="Level 0 is strictly UIF" size="1" start="134" type="bool"/> <field name="Level 0 XOR enable" size="1" start="132" type="bool"/> @@ -1635,19 +1521,30 @@ <field name="Image Height" size="14" start="72" type="uint"/> <field name="Image Width" size="14" start="58" type="uint"/> - <field name="Array Stride (64-byte aligned)" size="26" start="32" type="uint"/> + <!-- V3D 7.1.2 doesn't have the RB swap bit and has Array Stride starting + at bit 32. However, 7.1.5 included the RB swap bit at bit 32 and has + Array Stride starting at 33, which is backwards incompatible, + We use the definition from 7.1.5. 
+ --> + <field name="Array Stride (64-byte aligned)" size="24" start="33" type="uint"/> + <field name="R/B swap" size="1" start="32" type="bool"/> <field name="Texture base pointer" size="32" start="0" type="address"/> - <field name="Reverse Standard Border Color" size="1" start="5" type="bool"/> - <field name="AHDR" size="1" start="4" type="bool"/> - <field name="sRGB" size="1" start="3" type="bool"/> - <field name="Flip S and T on incoming request" size="1" start="2" type="bool"/> + <field name="Reverse" size="1" start="5" type="bool"/> + <field name="Transfer func" size="3" start="2" type="uint"> + <value name="Transfer Func None" value="0"/> + <value name="Transfer Func sRGB" value="1"/> + <value name="Transfer Func PQ" value="2"/> + <value name="Transfer Func HLG" value="3"/> + <value name="Transfer Func PQ BT1886" value="4"/> + <value name="Transfer Func HLG BT1886" value="5"/> + </field> <field name="Flip texture Y Axis" size="1" start="1" type="bool"/> <field name="Flip texture X Axis" size="1" start="0" type="bool"/> </struct> - <struct name="Sampler State" min_ver="41"> + <struct name="Sampler State"> <field name="Border color word 3" size="32" start="160" type="uint"/> <field name="Border color word 2" size="32" start="128" type="uint"/> <field name="Border color word 1" size="32" start="96" type="uint"/> diff --git a/src/broadcom/cle/v3d_packet_helpers.h b/src/broadcom/cle/v3d_packet_helpers.h index 2b5e32ff215..41054618e3a 100644 --- a/src/broadcom/cle/v3d_packet_helpers.h +++ b/src/broadcom/cle/v3d_packet_helpers.h @@ -24,87 +24,20 @@ #ifndef MESA_V3D_PACKET_HELPERS_H #define MESA_V3D_PACKET_HELPERS_H -#include <stdio.h> -#include <stdint.h> -#include <stdbool.h> -#include <assert.h> -#include <math.h> -#include "util/u_math.h" +#include "util/bitpack_helpers.h" #ifdef HAVE_VALGRIND #include <valgrind.h> #include <memcheck.h> #define VG(x) x -#ifndef NDEBUG -#define __gen_validate_value(x) VALGRIND_CHECK_MEM_IS_DEFINED(&(x), sizeof(x)) -#endif #else #define VG(x) ((void)0) #endif -#ifndef __gen_validate_value -#define __gen_validate_value(x) -#endif -/* -#ifndef __gen_address_type -#error #define __gen_address_type before including this file -#endif - -#ifndef __gen_user_data -#error #define __gen_combine_address before including this file -#endif -*/ -union __gen_value { - float f; - uint32_t dw; -}; - -static inline uint64_t -__gen_mbo(uint32_t start, uint32_t end) -{ - return (~0ull >> (64 - (end - start + 1))) << start; -} - -static inline uint64_t -__gen_uint(uint64_t v, uint32_t start, uint32_t end) -{ - __gen_validate_value(v); - -#ifndef NDEBUG - const int width = end - start + 1; - if (width < 64) { - const uint64_t max = (1ull << width) - 1; - assert(v <= max); - } -#endif - - return v << start; -} - -static inline uint64_t -__gen_sint(int64_t v, uint32_t start, uint32_t end) -{ - const int width = end - start + 1; - - __gen_validate_value(v); - -#ifndef NDEBUG - if (width < 64) { - const int64_t max = (1ll << (width - 1)) - 1; - const int64_t min = -(1ll << (width - 1)); - assert(min <= v && v <= max); - } -#endif - - const uint64_t mask = ~0ull >> (64 - width); - - return (v & mask) << start; -} - static inline uint64_t __gen_offset(uint64_t v, uint32_t start, uint32_t end) { - __gen_validate_value(v); + util_bitpack_validate_value(v); #ifndef NDEBUG uint64_t mask = (~0ull >> (64 - (end - start + 1))) << start; @@ -114,50 +47,6 @@ __gen_offset(uint64_t v, uint32_t start, uint32_t end) return v; } -static inline uint32_t -__gen_float(float v) -{ - 
__gen_validate_value(v); - return ((union __gen_value) { .f = (v) }).dw; -} - -static inline uint64_t -__gen_sfixed(float v, uint32_t start, uint32_t end, uint32_t fract_bits) -{ - __gen_validate_value(v); - - const float factor = (1 << fract_bits); - -#ifndef NDEBUG - const float max = ((1 << (end - start)) - 1) / factor; - const float min = -(1 << (end - start)) / factor; - assert(min <= v && v <= max); -#endif - - const int64_t int_val = llroundf(v * factor); - const uint64_t mask = ~0ull >> (64 - (end - start + 1)); - - return (int_val & mask) << start; -} - -static inline uint64_t -__gen_ufixed(float v, uint32_t start, uint32_t end, uint32_t fract_bits) -{ - __gen_validate_value(v); - - const float factor = (1 << fract_bits); - -#ifndef NDEBUG - const float max = ((1 << (end - start + 1)) - 1) / factor; - const float min = 0.0f; - assert(min <= v && v <= max); -#endif - - const uint64_t uint_val = llroundf(v * factor); - - return uint_val << start; -} - static inline uint64_t __gen_unpack_uint(const uint8_t *restrict cl, uint32_t start, uint32_t end) { diff --git a/src/broadcom/cle/v3dx_pack.h b/src/broadcom/cle/v3dx_pack.h index 5762e5aaa70..0062ddbd516 100644 --- a/src/broadcom/cle/v3dx_pack.h +++ b/src/broadcom/cle/v3dx_pack.h @@ -31,12 +31,10 @@ #if (V3D_VERSION == 21) # include "cle/v3d_packet_v21_pack.h" -#elif (V3D_VERSION == 33) -# include "cle/v3d_packet_v33_pack.h" -#elif (V3D_VERSION == 41) -# include "cle/v3d_packet_v41_pack.h" #elif (V3D_VERSION == 42) # include "cle/v3d_packet_v42_pack.h" +#elif (V3D_VERSION == 71) +# include "cle/v3d_packet_v71_pack.h" #else # error "Need to add a pack header include for this v3d version" #endif diff --git a/src/broadcom/cle/v3d_packet_v21.xml b/src/broadcom/cle/vc4_packet.xml index df838a70845..df838a70845 100644 --- a/src/broadcom/cle/v3d_packet_v21.xml +++ b/src/broadcom/cle/vc4_packet.xml diff --git a/src/broadcom/clif/clif_dump.c b/src/broadcom/clif/clif_dump.c index 0aaa6b6ad8b..db94edba113 100644 --- a/src/broadcom/clif/clif_dump.c +++ b/src/broadcom/clif/clif_dump.c @@ -106,12 +106,16 @@ static bool clif_dump_packet(struct clif_dump *clif, uint32_t offset, const uint8_t *cl, uint32_t *size, bool reloc_mode) { - if (clif->devinfo->ver >= 42) + + switch (clif->devinfo->ver) { + case 42: return v3d42_clif_dump_packet(clif, offset, cl, size, reloc_mode); - else if (clif->devinfo->ver >= 41) - return v3d41_clif_dump_packet(clif, offset, cl, size, reloc_mode); - else - return v3d33_clif_dump_packet(clif, offset, cl, size, reloc_mode); + case 71: + return v3d71_clif_dump_packet(clif, offset, cl, size, reloc_mode); + default: + break; + }; + unreachable("Unknown HW version"); } static uint32_t @@ -160,7 +164,8 @@ clif_dump_cl(struct clif_dump *clif, uint32_t start, uint32_t end, static uint32_t clif_dump_gl_shader_state_record(struct clif_dump *clif, struct reloc_worklist_entry *reloc, - void *vaddr) + void *vaddr, + bool including_gs) { struct v3d_group *state = v3d_spec_find_struct(clif->spec, "GL Shader State Record"); @@ -170,6 +175,16 @@ clif_dump_gl_shader_state_record(struct clif_dump *clif, assert(attr); uint32_t offset = 0; + if (including_gs) { + struct v3d_group *gs_state = v3d_spec_find_struct(clif->spec, + "Geometry Shader State Record"); + assert(gs_state); + out(clif, "@format shadrec_gl_geom\n"); + v3d_print_group(clif, gs_state, 0, vaddr + offset); + offset += v3d_group_get_length(gs_state); + /* Extra pad when geometry/tessellation shader is present */ + offset += 20; + } out(clif, "@format shadrec_gl_main\n"); 
v3d_print_group(clif, state, 0, vaddr + offset); offset += v3d_group_get_length(state); @@ -201,6 +216,7 @@ clif_process_worklist(struct clif_dump *clif) break; case reloc_gl_shader_state: + case reloc_gl_including_gs_shader_state: break; case reloc_generic_tile_list: clif_dump_cl(clif, reloc->addr, @@ -336,10 +352,12 @@ clif_dump_buffers(struct clif_dump *clif) break; case reloc_gl_shader_state: + case reloc_gl_including_gs_shader_state: offset += clif_dump_gl_shader_state_record(clif, reloc, bo->vaddr + - offset); + offset, + reloc->type == reloc_gl_including_gs_shader_state); break; case reloc_generic_tile_list: offset = clif_dump_cl(clif, reloc->addr, diff --git a/src/broadcom/clif/clif_private.h b/src/broadcom/clif/clif_private.h index d96bfd12de9..d4e55e03730 100644 --- a/src/broadcom/clif/clif_private.h +++ b/src/broadcom/clif/clif_private.h @@ -64,6 +64,7 @@ struct clif_dump { enum reloc_worklist_type { reloc_cl, reloc_gl_shader_state, + reloc_gl_including_gs_shader_state, reloc_generic_tile_list, }; @@ -94,12 +95,10 @@ clif_dump_add_address_to_worklist(struct clif_dump *clif, enum reloc_worklist_type type, uint32_t addr); -bool v3d33_clif_dump_packet(struct clif_dump *clif, uint32_t offset, - const uint8_t *cl, uint32_t *size, bool reloc_mode); -bool v3d41_clif_dump_packet(struct clif_dump *clif, uint32_t offset, - const uint8_t *cl, uint32_t *size, bool reloc_mode); bool v3d42_clif_dump_packet(struct clif_dump *clif, uint32_t offset, const uint8_t *cl, uint32_t *size, bool reloc_mode); +bool v3d71_clif_dump_packet(struct clif_dump *clif, uint32_t offset, + const uint8_t *cl, uint32_t *size, bool reloc_mode); static inline void out(struct clif_dump *clif, const char *fmt, ...) diff --git a/src/broadcom/clif/v3dx_dump.c b/src/broadcom/clif/v3dx_dump.c index 9cf59f88920..454478531ff 100644 --- a/src/broadcom/clif/v3dx_dump.c +++ b/src/broadcom/clif/v3dx_dump.c @@ -94,6 +94,25 @@ v3dX(clif_dump_packet)(struct clif_dump *clif, uint32_t offset, return true; } +#if V3D_VERSION >= 41 + case V3DX(GL_SHADER_STATE_INCLUDING_GS_opcode): { + struct V3DX(GL_SHADER_STATE_INCLUDING_GS) values; + V3DX(GL_SHADER_STATE_INCLUDING_GS_unpack)(cl, &values); + + if (reloc_mode) { + struct reloc_worklist_entry *reloc = + clif_dump_add_address_to_worklist(clif, + reloc_gl_including_gs_shader_state, + values.address); + if (reloc) { + reloc->shader_state.num_attrs = + values.number_of_attribute_arrays; + } + } + return true; + } +#endif /* V3D_VERSION >= 41 */ + #if V3D_VERSION < 40 case V3DX(STORE_MULTI_SAMPLE_RESOLVED_TILE_COLOR_BUFFER_EXTENDED_opcode): { struct V3DX(STORE_MULTI_SAMPLE_RESOLVED_TILE_COLOR_BUFFER_EXTENDED) values; diff --git a/src/broadcom/common/v3d_cpu_tiling.h b/src/broadcom/common/v3d_cpu_tiling.h index cb1ee7c96f4..4cfd98f961b 100644 --- a/src/broadcom/common/v3d_cpu_tiling.h +++ b/src/broadcom/common/v3d_cpu_tiling.h @@ -31,7 +31,7 @@ static inline void v3d_load_utile(void *cpu, uint32_t cpu_stride, void *gpu, uint32_t gpu_stride) { -#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM) +#if defined(V3D_BUILD_NEON) && DETECT_ARCH_ARM if (gpu_stride == 8) { __asm__ volatile ( /* Load from the GPU in one shot, no interleave, to @@ -80,7 +80,7 @@ v3d_load_utile(void *cpu, uint32_t cpu_stride, : "q0", "q1", "q2", "q3"); return; } -#elif defined (PIPE_ARCH_AARCH64) +#elif DETECT_ARCH_AARCH64 if (gpu_stride == 8) { __asm__ volatile ( /* Load from the GPU in one shot, no interleave, to @@ -141,7 +141,7 @@ static inline void v3d_store_utile(void *gpu, uint32_t gpu_stride, void *cpu, 
uint32_t cpu_stride) { -#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM) +#if defined(V3D_BUILD_NEON) && DETECT_ARCH_ARM if (gpu_stride == 8) { __asm__ volatile ( /* Load each 8-byte line from cpu-side source, @@ -188,7 +188,7 @@ v3d_store_utile(void *gpu, uint32_t gpu_stride, : "q0", "q1", "q2", "q3"); return; } -#elif defined (PIPE_ARCH_AARCH64) +#elif DETECT_ARCH_AARCH64 if (gpu_stride == 8) { __asm__ volatile ( /* Load each 8-byte line from cpu-side source, diff --git a/src/broadcom/vulkan/v3dv_util.c b/src/broadcom/common/v3d_csd.h index d26369f9f56..dc1bd11efc5 100644 --- a/src/broadcom/vulkan/v3dv_util.c +++ b/src/broadcom/common/v3d_csd.h @@ -1,12 +1,5 @@ /* - * Copyright © 2019 Raspberry Pi - * - * based in part on anv driver which is: - * Copyright © 2015 Intel Corporation - * - * based in part on radv driver which is: - * Copyright © 2016 Red Hat. - * Copyright © 2016 Bas Nieuwenhuizen + * Copyright © 2023 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -28,38 +21,23 @@ * IN THE SOFTWARE. */ -#include <stdarg.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <errno.h> -#include <assert.h> - -#include "vk_enum_to_str.h" -#include "v3dv_private.h" - -VkResult -__vk_errorf(struct v3dv_instance *instance, VkResult error, const char *file, - int line, const char *format, ...) -{ - va_list ap; - char buffer[256]; +#ifndef V3D_CSD_H +#define V3D_CSD_H + +#define V3D_CSD_CFG012_WG_COUNT_SHIFT 16 +#define V3D_CSD_CFG012_WG_OFFSET_SHIFT 0 +/* Allow this dispatch to start while the last one is still running. */ +#define V3D_CSD_CFG3_OVERLAP_WITH_PREV (1 << 26) +/* Maximum supergroup ID. 6 bits. */ +#define V3D_CSD_CFG3_MAX_SG_ID_SHIFT 20 +/* Batches per supergroup minus 1. 8 bits. 
*/ +#define V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT 12 +/* Workgroups per supergroup, 0 means 16 */ +#define V3D_CSD_CFG3_WGS_PER_SG_SHIFT 8 +#define V3D_CSD_CFG3_WG_SIZE_SHIFT 0 + +#define V3D_CSD_CFG5_PROPAGATE_NANS (1 << 2) +#define V3D_CSD_CFG5_SINGLE_SEG (1 << 1) +#define V3D_CSD_CFG5_THREADING (1 << 0) -#ifndef DEBUG - return error; #endif - - const char *error_str = vk_Result_to_str(error); - - if (format) { - va_start(ap, format); - vsnprintf(buffer, sizeof(buffer), format, ap); - va_end(ap); - - fprintf(stderr, "%s:%d: %s (%s)\n", file, line, buffer, error_str); - } else { - fprintf(stderr, "%s:%d: %s\n", file, line, error_str); - } - - return error; -} diff --git a/src/broadcom/common/v3d_debug.c b/src/broadcom/common/v3d_debug.c index 508a2b7c74c..b6b32bc72ad 100644 --- a/src/broadcom/common/v3d_debug.c +++ b/src/broadcom/common/v3d_debug.c @@ -37,13 +37,13 @@ #include "util/u_debug.h" #include "c11/threads.h" -uint32_t V3D_DEBUG = 0; +uint32_t v3d_mesa_debug = 0; static const struct debug_named_value debug_control[] = { { "cl", V3D_DEBUG_CL, "Dump command list during creation" }, { "cl_nobin", V3D_DEBUG_CL_NO_BIN, - "Dump command listduring creation, excluding binary resources" }, + "Dump command list during creation, excluding binary resources" }, { "clif", V3D_DEBUG_CLIF, "Dump command list (CLIF format) during creation", }, { "qpu", V3D_DEBUG_QPU, @@ -53,15 +53,21 @@ static const struct debug_named_value debug_control[] = { { "nir", V3D_DEBUG_NIR, "Dump NIR during program compile" }, { "tgsi", V3D_DEBUG_TGSI, - "Dump TGSI during program compile" }, + "Dump TGSI during program compile (v3d only)" }, + /* `shaderdb` is *not* used by shader-db, but is here so that any other + * game/app can dump its stats in the shader-db format, allowing them + * to be compared using shader-db's report.py tool. 
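+ * A typical workflow (illustrative, not the only option): capture two
+ * runs with something like `V3D_DEBUG=shaderdb app 2> before.txt` (the
+ * stats typically land on stderr), then compare the captures with
+ * shader-db's `report.py before.txt after.txt`.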
+ */ { "shaderdb", V3D_DEBUG_SHADERDB, "Dump program compile information for shader-db analysis" }, { "surface", V3D_DEBUG_SURFACE, - "Print resource layout information" }, + /* FIXME: evaluate to implement it on v3dv */ + "Print resource layout information (v3d only)" }, { "perf", V3D_DEBUG_PERF, - "Print during runtime performance-related events" }, + "Print performance-related events during runtime" }, { "norast", V3D_DEBUG_NORAST, - "Skip actual hardware execution of commands" }, + /* FIXME: evaluate to implement on v3dv*/ + "Skip actual hardware execution of commands (v3d only)" }, { "fs", V3D_DEBUG_FS, "Dump fragment shaders" }, { "gs", V3D_DEBUG_GS, @@ -73,11 +79,11 @@ static const struct debug_named_value debug_control[] = { { "always_flush", V3D_DEBUG_ALWAYS_FLUSH, "Flush after each draw call" }, { "precompile", V3D_DEBUG_PRECOMPILE, - "Precompiles shader variant at shader state creation time" }, + "Precompiles shader variant at shader state creation time (v3d only)" }, { "ra", V3D_DEBUG_RA, "Dump register allocation failures" }, { "dump_spirv", V3D_DEBUG_DUMP_SPIRV, - "Dump SPIR-V code" }, + "Dump SPIR-V code (v3dv only)" }, { "tmu32", V3D_DEBUG_TMU_32BIT, "Force 32-bit precision on all TMU operations" }, /* This can lead to incorrect behavior for applications that do @@ -88,12 +94,25 @@ static const struct debug_named_value debug_control[] = { "Force 16-bit precision on all TMU operations" }, { "noloopunroll", V3D_DEBUG_NO_LOOP_UNROLL, "Disable loop unrolling" }, - { NULL } + { "db", V3D_DEBUG_DOUBLE_BUFFER, + "Enable double buffer for Tile Buffer when MSAA is disabled" }, +#ifdef ENABLE_SHADER_CACHE + { "cache", V3D_DEBUG_CACHE, + "Print on-disk cache events (only with cache enabled)" }, +#endif + { "no_merge_jobs", V3D_DEBUG_NO_MERGE_JOBS, + "Don't try to merge subpasses in the same job even if they share framebuffer configuration (v3dv only)" }, + { "opt_compile_time", V3D_DEBUG_OPT_COMPILE_TIME, + "Don't try to reduce shader spilling, might improve compile times with expensive shaders." }, + /* disable_tfu is v3dv only because v3d has some uses of the TFU without alternative codepaths */ + { "disable_tfu", V3D_DEBUG_DISABLE_TFU, + "Disable TFU (v3dv only)" }, + DEBUG_NAMED_VALUE_END }; DEBUG_GET_ONCE_FLAGS_OPTION(v3d_debug, "V3D_DEBUG", debug_control, 0) -uint32_t +bool v3d_debug_flag_for_shader_stage(gl_shader_stage stage) { uint32_t flags[] = { @@ -105,14 +124,11 @@ v3d_debug_flag_for_shader_stage(gl_shader_stage stage) [MESA_SHADER_COMPUTE] = V3D_DEBUG_CS, }; STATIC_ASSERT(MESA_SHADER_STAGES == 6); - return flags[stage]; + return v3d_mesa_debug & flags[stage]; } void v3d_process_debug_variable(void) { - V3D_DEBUG = debug_get_option_v3d_debug(); - - if (V3D_DEBUG & V3D_DEBUG_SHADERDB) - V3D_DEBUG |= V3D_DEBUG_NORAST; + v3d_mesa_debug = debug_get_option_v3d_debug(); } diff --git a/src/broadcom/common/v3d_debug.h b/src/broadcom/common/v3d_debug.h index b5278c4c759..67112ebf361 100644 --- a/src/broadcom/common/v3d_debug.h +++ b/src/broadcom/common/v3d_debug.h @@ -39,7 +39,9 @@ extern "C" { * list of debugging flags, as well as some macros for handling them. 
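*
* For example, once v3d_process_debug_variable() has parsed V3D_DEBUG,
* code can guard its debug output with the V3D_DBG() macro added below:
* `if (V3D_DBG(CL)) { ... }` expands to a test of v3d_mesa_debug
* against V3D_DEBUG_CL.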
*/ -extern uint32_t V3D_DEBUG; +extern uint32_t v3d_mesa_debug; + +#define V3D_DBG(flag) unlikely(v3d_mesa_debug & V3D_DEBUG_ ## flag) #define V3D_DEBUG_SHADERDB (1 << 0) #define V3D_DEBUG_TGSI (1 << 1) @@ -63,6 +65,11 @@ extern uint32_t V3D_DEBUG; #define V3D_DEBUG_TMU_16BIT (1 << 19) #define V3D_DEBUG_NO_LOOP_UNROLL (1 << 20) #define V3D_DEBUG_CL_NO_BIN (1 << 21) +#define V3D_DEBUG_DOUBLE_BUFFER (1 << 22) +#define V3D_DEBUG_CACHE (1 << 23) +#define V3D_DEBUG_NO_MERGE_JOBS (1 << 24) +#define V3D_DEBUG_OPT_COMPILE_TIME (1 << 25) +#define V3D_DEBUG_DISABLE_TFU (1 << 26) #define V3D_DEBUG_SHADERS (V3D_DEBUG_TGSI | V3D_DEBUG_NIR | \ V3D_DEBUG_VIR | V3D_DEBUG_QPU | \ @@ -85,12 +92,7 @@ extern uint32_t V3D_DEBUG; #define dbg_printf(...) fprintf(stderr, __VA_ARGS__) #endif /* HAVE_ANDROID_PLATFORM */ -#define DBG(flag, ...) do { \ - if (unlikely(V3D_DEBUG & (flag))) \ - dbg_printf(__VA_ARGS__); \ -} while(0) - -extern uint32_t v3d_debug_flag_for_shader_stage(gl_shader_stage stage); +extern bool v3d_debug_flag_for_shader_stage(gl_shader_stage stage); extern void v3d_process_debug_variable(void); diff --git a/src/broadcom/common/v3d_device_info.c b/src/broadcom/common/v3d_device_info.c index 272190eb2e5..fa85a7d5077 100644 --- a/src/broadcom/common/v3d_device_info.c +++ b/src/broadcom/common/v3d_device_info.c @@ -36,6 +36,9 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i struct drm_v3d_get_param ident1 = { .param = DRM_V3D_PARAM_V3D_CORE0_IDENT1, }; + struct drm_v3d_get_param hub_ident3 = { + .param = DRM_V3D_PARAM_V3D_HUB_IDENT3, + }; int ret; ret = drm_ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &ident0); @@ -62,10 +65,11 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i int qups = (ident1.value >> 8) & 0xf; devinfo->qpu_count = nslc * qups; + devinfo->has_accumulators = devinfo->ver < 71; + switch (devinfo->ver) { - case 33: - case 41: case 42: + case 71: break; default: fprintf(stderr, @@ -75,5 +79,14 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i return false; } - return true; + ret = drm_ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &hub_ident3); + if (ret != 0) { + fprintf(stderr, "Couldn't get V3D core HUB IDENT3: %s\n", + strerror(errno)); + return false; + } + + devinfo->rev = (hub_ident3.value >> 8) & 0xff; + + return true; } diff --git a/src/broadcom/common/v3d_device_info.h b/src/broadcom/common/v3d_device_info.h index 97abd9b8d9f..8dfc7858727 100644 --- a/src/broadcom/common/v3d_device_info.h +++ b/src/broadcom/common/v3d_device_info.h @@ -34,11 +34,17 @@ struct v3d_device_info { /** Simple V3D version: major * 10 + minor */ uint8_t ver; + /** V3D revision number */ + uint8_t rev; + /** Size of the VPM, in bytes. */ int vpm_size; /* NSLC * QUPS from the core's IDENT registers. */ int qpu_count; + + /* If the hw has accumulator registers */ + bool has_accumulators; }; typedef int (*v3d_ioctl_fun)(int fd, unsigned long request, void *arg); diff --git a/src/broadcom/common/v3d_limits.h b/src/broadcom/common/v3d_limits.h index 129e53e29a4..354c8784914 100644 --- a/src/broadcom/common/v3d_limits.h +++ b/src/broadcom/common/v3d_limits.h @@ -24,6 +24,8 @@ #ifndef V3D_LIMITS_H #define V3D_LIMITS_H +#define V3D_CL_MAX_INSTR_SIZE 25 + /* Number of channels a QPU thread executes in parallel. Also known as * gl_SubGroupSizeARB. 
*/ @@ -36,32 +38,35 @@ V3D_MAX_GS_INPUTS, \ V3D_MAX_FS_INPUTS) -/* For now we need to maintain a different limits for OpenGL and Vulkan due - * some OpenGL CTS tests hitting register allocation when trying to use all - * the texture available. - * - * FIXME: nir_schedule should be able to handle that. When fixed it would be - * simpler to keep just one limit - */ -#define V3D_VULKAN_MAX_TEXTURE_SAMPLERS 24 -#define V3D_OPENGL_MAX_TEXTURE_SAMPLERS 16 - -/* Not specifically a hardware limit, just coordination between compiler and - * driver. - */ -#define V3D_MAX_TEXTURE_SAMPLERS MAX2(V3D_VULKAN_MAX_TEXTURE_SAMPLERS, \ - V3D_OPENGL_MAX_TEXTURE_SAMPLERS) - -/* The HW can do 16384 (15), but we run into hangs when we expose that. */ -#define V3D_MAX_MIP_LEVELS 13 +#define V3D_MAX_TEXTURE_SAMPLERS 24 #define V3D_MAX_SAMPLES 4 -#define V3D_MAX_DRAW_BUFFERS 4 +#define V3D_MAX_DRAW_BUFFERS 8 +#define V3D_MAX_RENDER_TARGETS(ver) (ver < 71 ? 4 : 8) #define V3D_MAX_POINT_SIZE 512.0f #define V3D_MAX_LINE_WIDTH 32 -#define V3D_MAX_BUFFER_RANGE (1 << 27) +#define V3D_MAX_BUFFER_RANGE (1 << 30) + +/* Sub-pixel precision bits in the rasterizer */ +#define V3D_COORD_SHIFT 6 + +/* Size of a cache line */ +#define V3D_NON_COHERENT_ATOM_SIZE 256 + +/* Minimum alignment for texel buffers */ +#define V3D_TMU_TEXEL_ALIGN 64 + +#define V3D_MAX_IMAGE_DIMENSION 4096 + +/* The HW can do 16384 (15), but we run into hangs when we expose that. Also, + * since we are only exposing images up to 4096 pixels per dimension 13 is + * all we need. + */ +#define V3D_MAX_MIP_LEVELS 13 + +#define V3D_MAX_ARRAY_LAYERS 2048 #endif /* V3D_LIMITS_H */ diff --git a/src/broadcom/common/v3d_macros.h b/src/broadcom/common/v3d_macros.h index fe89398208a..4ab66f647ab 100644 --- a/src/broadcom/common/v3d_macros.h +++ b/src/broadcom/common/v3d_macros.h @@ -32,15 +32,12 @@ #if (V3D_VERSION == 21) # define V3DX(x) V3D21_##x # define v3dX(x) v3d21_##x -#elif (V3D_VERSION == 33) -# define V3DX(x) V3D33_##x -# define v3dX(x) v3d33_##x -#elif (V3D_VERSION == 41) -# define V3DX(x) V3D41_##x -# define v3dX(x) v3d41_##x #elif (V3D_VERSION == 42) # define V3DX(x) V3D42_##x # define v3dX(x) v3d42_##x +#elif (V3D_VERSION == 71) +# define V3DX(x) V3D71_##x +# define v3dX(x) v3d71_##x #else # error "Need to add prefixing macros for this v3d version" #endif diff --git a/src/broadcom/common/v3d_performance_counters.h b/src/broadcom/common/v3d_performance_counters.h new file mode 100644 index 00000000000..33e3e0e78db --- /dev/null +++ b/src/broadcom/common/v3d_performance_counters.h @@ -0,0 +1,229 @@ +/* + * Copyright © 2023 Raspberry Pi Ltd + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef V3D_PERFORMANCE_COUNTERS_H +#define V3D_PERFORMANCE_COUNTERS_H + +#define V3D_PERFCNT_CATEGORY 0 +#define V3D_PERFCNT_NAME 1 +#define V3D_PERFCNT_DESCRIPTION 2 + +#ifndef V3D_VERSION +# error "The V3D_VERSION macro must be defined" +#endif + +#if (V3D_VERSION >= 71) + +static const char *v3d_performance_counters[][3] = { + {"CORE", "cycle-count", "[CORE] Cycle counter"}, + {"CORE", "core-active", "[CORE] Bin/Render/Compute active cycles"}, + {"CLE", "CLE-bin-thread-active-cycles", "[CLE] Bin thread active cycles"}, + {"CLE", "CLE-render-thread-active-cycles", "[CLE] Render thread active cycles"}, + {"CORE", "compute-active-cycles", "[CORE] Compute active cycles"}, + {"FEP", "FEP-valid-primitives-no-rendered-pixels", "[FEP] Valid primitives that result in no rendered pixels, for all rendered tiles"}, + {"FEP", "FEP-valid-primitives-rendered-pixels", "[FEP] Valid primitives for all rendered tiles (primitives may be counted in more than one tile)"}, + {"FEP", "FEP-clipped-quads", "[FEP] Early-Z/Near/Far clipped quads"}, + {"FEP", "FEP-valid-quads", "[FEP] Valid quads"}, + {"TLB", "TLB-quads-not-passing-stencil-test", "[TLB] Quads with no pixels passing the stencil test"}, + {"TLB", "TLB-quads-not-passing-z-and-stencil-test", "[TLB] Quads with no pixels passing the Z and stencil tests"}, + {"TLB", "TLB-quads-passing-z-and-stencil-test", "[TLB] Quads with any pixels passing the Z and stencil tests"}, + {"TLB", "TLB-quads-written-to-color-buffer", "[TLB] Quads with valid pixels written to colour buffer"}, + {"TLB", "TLB-partial-quads-written-to-color-buffer", "[TLB] Partial quads written to the colour buffer"}, + {"PTB", "PTB-primitives-need-clipping", "[PTB] Primitives that need clipping"}, + {"PTB", "PTB-primitives-discarded-outside-viewport", "[PTB] Primitives discarded by being outside the viewport"}, + {"PTB", "PTB-primitives-binned", "[PTB] Total primitives binned"}, + {"PTB", "PTB-primitives-discarded-reversed", "[PTB] Primitives that are discarded because they are reversed"}, + {"QPU", "QPU-total-instr-cache-hit", "[QPU] Total instruction cache hits for all slices"}, + {"QPU", "QPU-total-instr-cache-miss", "[QPU] Total instruction cache misses for all slices"}, + {"QPU", "QPU-total-uniform-cache-hit", "[QPU] Total uniforms cache hits for all slices"}, + {"QPU", "QPU-total-uniform-cache-miss", "[QPU] Total uniforms cache misses for all slices"}, + {"TMU", "TMU-active-cycles", "[TMU] Active cycles"}, + {"TMU", "TMU-stalled-cycles", "[TMU] Stalled cycles"}, + {"TMU", "TMU-total-text-quads-access", "[TMU] Total texture cache accesses"}, + {"TMU", "TMU-cache-x4-active-cycles", "[TMU] Cache active cycles for x4 access"}, + {"TMU", "TMU-cache-x4-stalled-cycles", "[TMU] Cache stalled cycles for x4 access"}, + {"TMU", "TMU-total-text-quads-x4-access", "[TMU] Total texture cache x4 access"}, + {"L2T", "L2T-total-cache-hit", "[L2T] Total Level 2 cache hits"}, + {"L2T", "L2T-total-cache-miss", "[L2T] Total Level 2 cache misses"}, + {"L2T", "L2T-local", "[L2T] Local mode access"}, + {"L2T", "L2T-writeback", "[L2T] Writeback"}, + {"L2T", "L2T-zero", "[L2T] Zero"}, + {"L2T", "L2T-merge", "[L2T] Merge"}, + {"L2T", "L2T-fill", "[L2T] Fill"}, + {"L2T", "L2T-stalls-no-wid", "[L2T] Stalls because no WID available"}, + 
{"L2T", "L2T-stalls-no-rid", "[L2T] Stalls because no RID available"}, + {"L2T", "L2T-stalls-queue-full", "[L2T] Stalls because internal queue full"}, + {"L2T", "L2T-stalls-wrightback", "[L2T] Stalls because writeback in flight"}, + {"L2T", "L2T-stalls-mem", "[L2T] Stalls because AXI blocks read"}, + {"L2T", "L2T-stalls-fill", "[L2T] Stalls because fill pending for victim cache-line"}, + {"L2T", "L2T-hitq", "[L2T] Sent request via hit queue"}, + {"L2T", "L2T-hitq-full", "[L2T] Sent request via main queue because hit queue is full"}, + {"L2T", "L2T-stalls-read-data", "[L2T] Stalls because waiting for data from SDRAM"}, + {"L2T", "L2T-TMU-read-hits", "[L2T] TMU read hits"}, + {"L2T", "L2T-TMU-read-miss", "[L2T] TMU read misses"}, + {"L2T", "L2T-VCD-read-hits", "[L2T] VCD read hits"}, + {"L2T", "L2T-VCD-read-miss", "[L2T] VCD read misses"}, + {"L2T", "L2T-SLC-read-hits", "[L2T] SLC read hits (all slices)"}, + {"L2T", "L2T-SLC-read-miss", "[L2T] SLC read misses (all slices)"}, + {"AXI", "AXI-writes-seen-watch-0", "[AXI] Writes seen by watch 0"}, + {"AXI", "AXI-reads-seen-watch-0", "[AXI] Reads seen by watch 0"}, + {"AXI", "AXI-writes-stalled-seen-watch-0", "[AXI] Write stalls seen by watch 0"}, + {"AXI", "AXI-reads-stalled-seen-watch-0", "[AXI] Read stalls seen by watch 0"}, + {"AXI", "AXI-write-bytes-seen-watch-0", "[AXI] Total bytes written seen by watch 0"}, + {"AXI", "AXI-read-bytes-seen-watch-0", "[AXI] Total bytes read seen by watch 0"}, + {"AXI", "AXI-writes-seen-watch-1", "[AXI] Writes seen by watch 1"}, + {"AXI", "AXI-reads-seen-watch-1", "[AXI] Reads seen by watch 1"}, + {"AXI", "AXI-writes-stalled-seen-watch-1", "[AXI] Write stalls seen by watch 1"}, + {"AXI", "AXI-reads-stalled-seen-watch-1", "[AXI] Read stalls seen by watch 1"}, + {"AXI", "AXI-write-bytes-seen-watch-1", "[AXI] Total bytes written seen by watch 1"}, + {"AXI", "AXI-read-bytes-seen-watch-1", "[AXI] Total bytes read seen by watch 1"}, + {"CORE", "core-memory-writes", "[CORE] Total memory writes"}, + {"L2T", "L2T-memory-writes", "[L2T] Total memory writes"}, + {"PTB", "PTB-memory-writes", "[PTB] Total memory writes"}, + {"TLB", "TLB-memory-writes", "[TLB] Total memory writes"}, + {"CORE", "core-memory-reads", "[CORE] Total memory reads"}, + {"L2T", "L2T-memory-reads", "[L2T] Total memory reads"}, + {"PTB", "PTB-memory-reads", "[PTB] Total memory reads"}, + {"PSE", "PSE-memory-reads", "[PSE] Total memory reads"}, + {"TLB", "TLB-memory-reads", "[TLB] Total memory reads"}, + {"PTB", "PTB-memory-words-writes", "[PTB] Total memory words written"}, + {"TLB", "TLB-memory-words-writes", "[TLB] Total memory words written"}, + {"PSE", "PSE-memory-words-reads", "[PSE] Total memory words read"}, + {"TLB", "TLB-memory-words-reads", "[TLB] Total memory words read"}, + {"AXI", "AXI-read-trans", "[AXI] Read transaction count"}, + {"AXI", "AXI-write-trans", "[AXI] Write transaction count"}, + {"AXI", "AXI-read-wait-cycles", "[AXI] Read total wait cycles"}, + {"AXI", "AXI-write-wait-cycles", "[AXI] Write total wait cycles"}, + {"AXI", "AXI-max-outstanding-reads", "[AXI] Maximium outstanding read transactions"}, + {"AXI", "AXI-max-outstanding-writes", "[AXI] Maximum outstanding write transactions"}, + {"QPU", "QPU-wait-bubble", "[QPU] Pipeline bubble in qcycles due all threads waiting"}, + {"QPU", "QPU-ic-miss-bubble", "[QPU] Pipeline bubble in qcycles due instruction-cache miss"}, + {"QPU", "QPU-active", "[QPU] Executed shader instruction"}, + {"QPU", "QPU-total-active-clk-cycles-fragment-shading", "[QPU] Total active clock cycles for 
all QPUs doing fragment shading (counts only when QPU is not stalled)"}, + {"QPU", "QPU-stalls", "[QPU] Stalled qcycles executing shader instruction"}, + {"QPU", "QPU-total-clk-cycles-waiting-fragment-shading", "[QPU] Total stalled clock cycles for all QPUs doing fragment shading"}, + {"QPU", "QPU-stalls-TMU", "[QPU] Stalled qcycles waiting for TMU"}, + {"QPU", "QPU-stalls-TLB", "[QPU] Stalled qcycles waiting for TLB"}, + {"QPU", "QPU-stalls-VPM", "[QPU] Stalled qcycles waiting for VPM"}, + {"QPU", "QPU-stalls-uniforms", "[QPU] Stalled qcycles waiting for uniforms"}, + {"QPU", "QPU-stalls-SFU", "[QPU] Stalled qcycles waiting for SFU"}, + {"QPU", "QPU-stalls-other", "[QPU] Stalled qcycles waiting for any other reason (vary/W/Z)"}, +}; + +#elif (V3D_VERSION >= 42) + +static const char *v3d_performance_counters[][3] = { + {"FEP", "FEP-valid-primitives-no-rendered-pixels", "[FEP] Valid primitives that result in no rendered pixels, for all rendered tiles"}, + {"FEP", "FEP-valid-primitives-rendered-pixels", "[FEP] Valid primitives for all rendered tiles (primitives may be counted in more than one tile)"}, + {"FEP", "FEP-clipped-quads", "[FEP] Early-Z/Near/Far clipped quads"}, + {"FEP", "FEP-valid-quads", "[FEP] Valid quads"}, + {"TLB", "TLB-quads-not-passing-stencil-test", "[TLB] Quads with no pixels passing the stencil test"}, + {"TLB", "TLB-quads-not-passing-z-and-stencil-test", "[TLB] Quads with no pixels passing the Z and stencil tests"}, + {"TLB", "TLB-quads-passing-z-and-stencil-test", "[TLB] Quads with any pixels passing the Z and stencil tests"}, + {"TLB", "TLB-quads-with-zero-coverage", "[TLB] Quads with all pixels having zero coverage"}, + {"TLB", "TLB-quads-with-non-zero-coverage", "[TLB] Quads with any pixels having non-zero coverage"}, + {"TLB", "TLB-quads-written-to-color-buffer", "[TLB] Quads with valid pixels written to colour buffer"}, + {"PTB", "PTB-primitives-discarded-outside-viewport", "[PTB] Primitives discarded by being outside the viewport"}, + {"PTB", "PTB-primitives-need-clipping", "[PTB] Primitives that need clipping"}, + {"PTB", "PTB-primitives-discarded-reversed", "[PTB] Primitives that are discarded because they are reversed"}, + {"QPU", "QPU-total-idle-clk-cycles", "[QPU] Total idle clock cycles for all QPUs"}, + {"QPU", "QPU-total-active-clk-cycles-vertex-coord-shading", "[QPU] Total active clock cycles for all QPUs doing vertex/coordinate/user shading (counts only when QPU is not stalled)"}, + {"QPU", "QPU-total-active-clk-cycles-fragment-shading", "[QPU] Total active clock cycles for all QPUs doing fragment shading (counts only when QPU is not stalled)"}, + {"QPU", "QPU-total-clk-cycles-executing-valid-instr", "[QPU] Total clock cycles for all QPUs executing valid instructions"}, + {"QPU", "QPU-total-clk-cycles-waiting-TMU", "[QPU] Total clock cycles for all QPUs stalled waiting for TMUs only (counter won't increment if QPU also stalling for another reason)"}, + {"QPU", "QPU-total-clk-cycles-waiting-scoreboard", "[QPU] Total clock cycles for all QPUs stalled waiting for Scoreboard only (counter won't increment if QPU also stalling for another reason)"}, + {"QPU", "QPU-total-clk-cycles-waiting-varyings", "[QPU] Total clock cycles for all QPUs stalled waiting for Varyings only (counter won't increment if QPU also stalling for another reason)"}, + {"QPU", "QPU-total-instr-cache-hit", "[QPU] Total instruction cache hits for all slices"}, + {"QPU", "QPU-total-instr-cache-miss", "[QPU] Total instruction cache misses for all slices"}, + {"QPU", 
"QPU-total-uniform-cache-hit", "[QPU] Total uniforms cache hits for all slices"}, + {"QPU", "QPU-total-uniform-cache-miss", "[QPU] Total uniforms cache misses for all slices"}, + {"TMU", "TMU-total-text-quads-access", "[TMU] Total texture cache accesses"}, + {"TMU", "TMU-total-text-cache-miss", "[TMU] Total texture cache misses (number of fetches from memory/L2cache)"}, + {"VPM", "VPM-total-clk-cycles-VDW-stalled", "[VPM] Total clock cycles VDW is stalled waiting for VPM access"}, + {"VPM", "VPM-total-clk-cycles-VCD-stalled", "[VPM] Total clock cycles VCD is stalled waiting for VPM access"}, + {"CLE", "CLE-bin-thread-active-cycles", "[CLE] Bin thread active cycles"}, + {"CLE", "CLE-render-thread-active-cycles", "[CLE] Render thread active cycles"}, + {"L2T", "L2T-total-cache-hit", "[L2T] Total Level 2 cache hits"}, + {"L2T", "L2T-total-cache-miss", "[L2T] Total Level 2 cache misses"}, + {"CORE", "cycle-count", "[CORE] Cycle counter"}, + {"QPU", "QPU-total-clk-cycles-waiting-vertex-coord-shading", "[QPU] Total stalled clock cycles for all QPUs doing vertex/coordinate/user shading"}, + {"QPU", "QPU-total-clk-cycles-waiting-fragment-shading", "[QPU] Total stalled clock cycles for all QPUs doing fragment shading"}, + {"PTB", "PTB-primitives-binned", "[PTB] Total primitives binned"}, + {"AXI", "AXI-writes-seen-watch-0", "[AXI] Writes seen by watch 0"}, + {"AXI", "AXI-reads-seen-watch-0", "[AXI] Reads seen by watch 0"}, + {"AXI", "AXI-writes-stalled-seen-watch-0", "[AXI] Write stalls seen by watch 0"}, + {"AXI", "AXI-reads-stalled-seen-watch-0", "[AXI] Read stalls seen by watch 0"}, + {"AXI", "AXI-write-bytes-seen-watch-0", "[AXI] Total bytes written seen by watch 0"}, + {"AXI", "AXI-read-bytes-seen-watch-0", "[AXI] Total bytes read seen by watch 0"}, + {"AXI", "AXI-writes-seen-watch-1", "[AXI] Writes seen by watch 1"}, + {"AXI", "AXI-reads-seen-watch-1", "[AXI] Reads seen by watch 1"}, + {"AXI", "AXI-writes-stalled-seen-watch-1", "[AXI] Write stalls seen by watch 1"}, + {"AXI", "AXI-reads-stalled-seen-watch-1", "[AXI] Read stalls seen by watch 1"}, + {"AXI", "AXI-write-bytes-seen-watch-1", "[AXI] Total bytes written seen by watch 1"}, + {"AXI", "AXI-read-bytes-seen-watch-1", "[AXI] Total bytes read seen by watch 1"}, + {"TLB", "TLB-partial-quads-written-to-color-buffer", "[TLB] Partial quads written to the colour buffer"}, + {"TMU", "TMU-total-config-access", "[TMU] Total config accesses"}, + {"L2T", "L2T-no-id-stalled", "[L2T] No ID stall"}, + {"L2T", "L2T-command-queue-stalled", "[L2T] Command queue full stall"}, + {"L2T", "L2T-TMU-writes", "[L2T] TMU write accesses"}, + {"TMU", "TMU-active-cycles", "[TMU] Active cycles"}, + {"TMU", "TMU-stalled-cycles", "[TMU] Stalled cycles"}, + {"CLE", "CLE-thread-active-cycles", "[CLE] Bin or render thread active cycles"}, + {"L2T", "L2T-TMU-reads", "[L2T] TMU read accesses"}, + {"L2T", "L2T-CLE-reads", "[L2T] CLE read accesses"}, + {"L2T", "L2T-VCD-reads", "[L2T] VCD read accesses"}, + {"L2T", "L2T-TMU-config-reads", "[L2T] TMU CFG read accesses"}, + {"L2T", "L2T-SLC0-reads", "[L2T] SLC0 read accesses"}, + {"L2T", "L2T-SLC1-reads", "[L2T] SLC1 read accesses"}, + {"L2T", "L2T-SLC2-reads", "[L2T] SLC2 read accesses"}, + {"L2T", "L2T-TMU-write-miss", "[L2T] TMU write misses"}, + {"L2T", "L2T-TMU-read-miss", "[L2T] TMU read misses"}, + {"L2T", "L2T-CLE-read-miss", "[L2T] CLE read misses"}, + {"L2T", "L2T-VCD-read-miss", "[L2T] VCD read misses"}, + {"L2T", "L2T-TMU-config-read-miss", "[L2T] TMU CFG read misses"}, + {"L2T", "L2T-SLC0-read-miss", "[L2T] SLC0 
read misses"}, + {"L2T", "L2T-SLC1-read-miss", "[L2T] SLC1 read misses"}, + {"L2T", "L2T-SLC2-read-miss", "[L2T] SLC2 read misses"}, + {"CORE", "core-memory-writes", "[CORE] Total memory writes"}, + {"L2T", "L2T-memory-writes", "[L2T] Total memory writes"}, + {"PTB", "PTB-memory-writes", "[PTB] Total memory writes"}, + {"TLB", "TLB-memory-writes", "[TLB] Total memory writes"}, + {"CORE", "core-memory-reads", "[CORE] Total memory reads"}, + {"L2T", "L2T-memory-reads", "[L2T] Total memory reads"}, + {"PTB", "PTB-memory-reads", "[PTB] Total memory reads"}, + {"PSE", "PSE-memory-reads", "[PSE] Total memory reads"}, + {"TLB", "TLB-memory-reads", "[TLB] Total memory reads"}, + {"GMP", "GMP-memory-reads", "[GMP] Total memory reads"}, + {"PTB", "PTB-memory-words-writes", "[PTB] Total memory words written"}, + {"TLB", "TLB-memory-words-writes", "[TLB] Total memory words written"}, + {"PSE", "PSE-memory-words-reads", "[PSE] Total memory words read"}, + {"TLB", "TLB-memory-words-reads", "[TLB] Total memory words read"}, + {"TMU", "TMU-MRU-hits", "[TMU] Total MRU hits"}, + {"CORE", "compute-active-cycles", "[CORE] Compute active cycles"}, +}; + +#else +static const char *v3d_performance_counters[][3] = { }; +#endif + +#endif diff --git a/src/broadcom/common/v3d_tfu.h b/src/broadcom/common/v3d_tfu.h new file mode 100644 index 00000000000..572d0074794 --- /dev/null +++ b/src/broadcom/common/v3d_tfu.h @@ -0,0 +1,74 @@ +/* + * Copyright © 2021 Raspberry Pi Ltd + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef V3D_TFU_H +#define V3D_TFU_H + +/* Disable level 0 write, just write following mipmaps */ +#define V3D33_TFU_IOA_DIMTW (1 << 0) +#define V3D33_TFU_IOA_FORMAT_SHIFT 3 +#define V3D33_TFU_IOA_FORMAT_LINEARTILE 3 +#define V3D33_TFU_IOA_FORMAT_UBLINEAR_1_COLUMN 4 +#define V3D33_TFU_IOA_FORMAT_UBLINEAR_2_COLUMN 5 +#define V3D33_TFU_IOA_FORMAT_UIF_NO_XOR 6 +#define V3D33_TFU_IOA_FORMAT_UIF_XOR 7 + +#define V3D33_TFU_ICFG_NUMMM_SHIFT 5 +#define V3D33_TFU_ICFG_TTYPE_SHIFT 9 + +#define V3D33_TFU_ICFG_OPAD_SHIFT 22 + +#define V3D33_TFU_ICFG_FORMAT_SHIFT 18 +#define V3D33_TFU_ICFG_FORMAT_RASTER 0 +#define V3D33_TFU_ICFG_FORMAT_SAND_128 1 +#define V3D33_TFU_ICFG_FORMAT_SAND_256 2 +#define V3D33_TFU_ICFG_FORMAT_LINEARTILE 11 +#define V3D33_TFU_ICFG_FORMAT_UBLINEAR_1_COLUMN 12 +#define V3D33_TFU_ICFG_FORMAT_UBLINEAR_2_COLUMN 13 +#define V3D33_TFU_ICFG_FORMAT_UIF_NO_XOR 14 +#define V3D33_TFU_ICFG_FORMAT_UIF_XOR 15 + +/* Disable level 0 write, just write following mipmaps */ +#define V3D71_TFU_IOC_DIMTW (1 << 0) +#define V3D71_TFU_IOC_FORMAT_SHIFT 12 +#define V3D71_TFU_IOC_FORMAT_LINEARTILE 3 +#define V3D71_TFU_IOA_FORMAT_UBLINEAR_1_COLUMN 4 +#define V3D71_TFU_IOA_FORMAT_UBLINEAR_2_COLUMN 5 +#define V3D71_TFU_IOA_FORMAT_UIF_NO_XOR 6 +#define V3D71_TFU_IOA_FORMAT_UIF_XOR 7 + +#define V3D71_TFU_IOC_STRIDE_SHIFT 16 +#define V3D71_TFU_IOC_NUMMM_SHIFT 4 + +#define V3D71_TFU_ICFG_OTYPE_SHIFT 16 +#define V3D71_TFU_ICFG_IFORMAT_SHIFT 23 +#define V3D71_TFU_ICFG_FORMAT_RASTER 0 +#define V3D71_TFU_ICFG_FORMAT_SAND_128 1 +#define V3D71_TFU_ICFG_FORMAT_SAND_256 2 +#define V3D71_TFU_ICFG_FORMAT_LINEARTILE 11 +#define V3D71_TFU_ICFG_FORMAT_UBLINEAR_1_COLUMN 12 +#define V3D71_TFU_ICFG_FORMAT_UBLINEAR_2_COLUMN 13 +#define V3D71_TFU_ICFG_FORMAT_UIF_NO_XOR 14 +#define V3D71_TFU_ICFG_FORMAT_UIF_XOR 15 + +#endif diff --git a/src/broadcom/common/v3d_tiling.c b/src/broadcom/common/v3d_tiling.c index 22f84811e19..6e785916578 100644 --- a/src/broadcom/common/v3d_tiling.c +++ b/src/broadcom/common/v3d_tiling.c @@ -28,6 +28,7 @@ */ #include <stdint.h> +#include "util/box.h" #include "v3d_tiling.h" #include "broadcom/common/v3d_cpu_tiling.h" diff --git a/src/broadcom/common/v3d_tiling.h b/src/broadcom/common/v3d_tiling.h index 08ae7cce805..2573c8a5f02 100644 --- a/src/broadcom/common/v3d_tiling.h +++ b/src/broadcom/common/v3d_tiling.h @@ -24,7 +24,7 @@ #ifndef V3D_TILING_H #define V3D_TILING_H -#include "util/u_box.h" +#include "util/format/u_format.h" /* A UIFblock is a 256-byte region of memory that's 256-byte aligned. 
These
* will be grouped in 4x4 blocks (left-to-right, then top-to-bottom) in a 4KB
@@ -63,6 +63,8 @@ enum v3d_tiling_mode {
V3D_TILING_UIF_XOR,
};
+struct pipe_box;
+
uint32_t v3d_utile_width(int cpp) ATTRIBUTE_CONST;
uint32_t v3d_utile_height(int cpp) ATTRIBUTE_CONST;
bool v3d_size_is_lt(uint32_t width, uint32_t height, int cpp) ATTRIBUTE_CONST;
diff --git a/src/broadcom/common/v3d_util.c b/src/broadcom/common/v3d_util.c
index 424656fd8b1..8a50d279985 100644
--- a/src/broadcom/common/v3d_util.c
+++ b/src/broadcom/common/v3d_util.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2021 Raspberry Pi
+ * Copyright © 2021 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -86,3 +86,187 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo,
return best_wgs_per_sg;
}
+
+#define V3D71_TLB_COLOR_SIZE (16 * 1024)
+#define V3D71_TLB_DEPTH_SIZE (16 * 1024)
+#define V3D71_TLB_AUX_DEPTH_SIZE (8 * 1024)
+
+static bool
+tile_size_valid(uint32_t pixel_count, uint32_t color_bpp, uint32_t depth_bpp)
+{
+ /* First, we check if we can fit this tile size by allocating the depth
+ * TLB memory to color.
+ */
+ if (pixel_count * depth_bpp <= V3D71_TLB_AUX_DEPTH_SIZE &&
+ pixel_count * color_bpp <= V3D71_TLB_COLOR_SIZE + V3D71_TLB_DEPTH_SIZE) {
+ return true;
+ }
+
+ /* Otherwise the tile must fit in the main TLB buffers */
+ return pixel_count * depth_bpp <= V3D71_TLB_DEPTH_SIZE &&
+ pixel_count * color_bpp <= V3D71_TLB_COLOR_SIZE;
+}
+
+void
+v3d_choose_tile_size(const struct v3d_device_info *devinfo,
+ uint32_t color_attachment_count,
+ /* V3D 4.x max internal bpp of all RTs */
+ uint32_t max_internal_bpp,
+ /* V3D 7.x accumulated bpp for all RTs (in bytes) */
+ uint32_t total_color_bpp,
+ bool msaa,
+ bool double_buffer,
+ uint32_t *width,
+ uint32_t *height)
+{
+ static const uint8_t tile_sizes[] = {
+ 64, 64,
+ 64, 32,
+ 32, 32,
+ 32, 16,
+ 16, 16,
+ 16, 8,
+ 8, 8
+ };
+
+ uint32_t idx = 0;
+ if (devinfo->ver >= 71) {
+ /* In V3D 7.x, we use the actual bpp used by color attachments to compute
+ * the tile size instead of the maximum bpp. This may allow us to choose a
+ * larger tile size than we would in 4.x in scenarios with multiple RTs
+ * with different bpps.
+ *
+ * Also, the TLB has an auxiliary buffer of 8KB that will be automatically
+ * used for depth instead of the main 16KB depth TLB buffer when the depth
+ * tile fits in the auxiliary buffer, allowing the hardware to allocate
+ * the 16KB from the main depth TLB to the color TLB. If we can do that,
+ * then we are effectively doubling the memory we have for color and we
+ * can also select a larger tile size. This is necessary to support
+ * the most expensive configuration: 8x128bpp RTs + MSAA.
+ *
+ * FIXME: the docs state that, if depth testing is not used, depth TLB
+ * memory can be used for color by setting the 'depth disable' bit in the
+ * rendering configuration. However, this comes with a requirement that
+ * occlusion queries must not be active. We need to clarify if this means
+ * active at the point at which we emit a tile rendering configuration
+ * item, meaning that we have a query spanning a full render pass
+ * (this is something we can tell before we emit the rendering
+ * configuration item) or active in the subpass for which we are enabling
+ * the bit (which we can't tell until later, when we record commands for
+ * the subpass). If it is the latter, then we cannot use this feature.
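+ *
+ * As a worked example of the worst case mentioned above: 8 RTs at
+ * 128bpp internal format give total_color_bpp = 8 * 16 = 128 bytes,
+ * and 4x MSAA makes color_bpp = 512 and depth_bpp = 16. Only the 8x8
+ * tile (64 pixels) fits: 64 * 16 = 1KB of depth fits in the 8KB
+ * auxiliary buffer, which frees 16KB + 16KB = 32KB for exactly the
+ * 64 * 512 = 32KB of color, so tile_size_valid() accepts it.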
+ *
+ * FIXME: handling of double_buffer is still pending.
+ */
+ const uint32_t color_bpp = total_color_bpp * (msaa ? 4 : 1);
+ const uint32_t depth_bpp = 4 * (msaa ? 4 : 1);
+ do {
+ const uint32_t tile_w = tile_sizes[idx * 2];
+ const uint32_t tile_h = tile_sizes[idx * 2 + 1];
+ if (tile_size_valid(tile_w * tile_h, color_bpp, depth_bpp))
+ break;
+ idx++;
+ } while (idx < ARRAY_SIZE(tile_sizes) / 2);
+
+ /* FIXME: handling of double_buffer is still pending */
+ assert(!double_buffer);
+ } else {
+ /* On V3D 4.x the tile size is selected based on the number of RTs, the
+ * maximum bpp across all of them and whether 4x MSAA is used.
+ */
+ if (color_attachment_count > 4)
+ idx += 3;
+ else if (color_attachment_count > 2)
+ idx += 2;
+ else if (color_attachment_count > 1)
+ idx += 1;
+
+ /* MSAA and double-buffer are mutually exclusive */
+ assert(!msaa || !double_buffer);
+ if (msaa)
+ idx += 2;
+ else if (double_buffer)
+ idx += 1;
+
+ idx += max_internal_bpp;
+ }
+
+ assert(idx < ARRAY_SIZE(tile_sizes) / 2);
+
+ *width = tile_sizes[idx * 2];
+ *height = tile_sizes[idx * 2 + 1];
+}
+
+/* Translates a pipe swizzle to the swizzle values used in the
+ * TEXTURE_SHADER_STATE packet.
+ */
+uint32_t
+v3d_translate_pipe_swizzle(enum pipe_swizzle swizzle)
+{
+ switch (swizzle) {
+ case PIPE_SWIZZLE_0:
+ return 0;
+ case PIPE_SWIZZLE_1:
+ return 1;
+ case PIPE_SWIZZLE_X:
+ case PIPE_SWIZZLE_Y:
+ case PIPE_SWIZZLE_Z:
+ case PIPE_SWIZZLE_W:
+ return 2 + swizzle;
+ default:
+ unreachable("unknown swizzle");
+ }
+}
+
+/* Translates a pipe primitive type to a hw value we can use in the various
+ * draw packets.
+ */
+uint32_t
+v3d_hw_prim_type(enum mesa_prim prim_type)
+{
+ switch (prim_type) {
+ case MESA_PRIM_POINTS:
+ case MESA_PRIM_LINES:
+ case MESA_PRIM_LINE_LOOP:
+ case MESA_PRIM_LINE_STRIP:
+ case MESA_PRIM_TRIANGLES:
+ case MESA_PRIM_TRIANGLE_STRIP:
+ case MESA_PRIM_TRIANGLE_FAN:
+ return prim_type;
+
+ case MESA_PRIM_LINES_ADJACENCY:
+ case MESA_PRIM_LINE_STRIP_ADJACENCY:
+ case MESA_PRIM_TRIANGLES_ADJACENCY:
+ case MESA_PRIM_TRIANGLE_STRIP_ADJACENCY:
+ return 8 + (prim_type - MESA_PRIM_LINES_ADJACENCY);
+
+ default:
+ unreachable("Unsupported primitive type");
+ }
+}
+
+uint32_t
+v3d_internal_bpp_words(uint32_t internal_bpp)
+{
+ switch (internal_bpp) {
+ case 0 /* V3D_INTERNAL_BPP_32 */:
+ return 1;
+ case 1 /* V3D_INTERNAL_BPP_64 */:
+ return 2;
+ case 2 /* V3D_INTERNAL_BPP_128 */:
+ return 4;
+ default:
+ unreachable("Unsupported internal BPP");
+ }
+}
+
+uint32_t
+v3d_compute_rt_row_row_stride_128_bits(uint32_t tile_width,
+ uint32_t bpp)
+{
+ /* The stride is in multiples of 128 bits and covers 2 rows. This is
+ * why we divide by 2 instead of 4: we divide the number of 32-bit
+ * words per row by 2.
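+ * For example, a 64-pixel-wide tile at 128 internal bpp (4 words per
+ * pixel, per v3d_internal_bpp_words above) gives (64 * 4) / 2 = 128
+ * units of 128 bits, i.e. 2048 bytes, exactly two 1024-byte rows.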
+ */ + + return (tile_width * bpp) / 2; +} diff --git a/src/broadcom/common/v3d_util.h b/src/broadcom/common/v3d_util.h index b9804f235ae..cc6b57b27b2 100644 --- a/src/broadcom/common/v3d_util.h +++ b/src/broadcom/common/v3d_util.h @@ -1,5 +1,5 @@ /* - * Copyright © 2021 Raspberry Pi + * Copyright © 2021 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -24,7 +24,10 @@ #ifndef V3D_UTIL_H #define V3D_UTIL_H +#include "util/macros.h" #include "common/v3d_device_info.h" +#include "compiler/shader_enums.h" +#include "util/format/u_formats.h" uint32_t v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo, @@ -34,4 +37,46 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo, uint32_t num_wgs, uint32_t wg_size); +void +v3d_choose_tile_size(const struct v3d_device_info *devinfo, + uint32_t color_attachment_count, + uint32_t max_internal_bpp, + uint32_t total_color_bpp, + bool msaa, + bool double_buffer, + uint32_t *width, + uint32_t *height); + +uint32_t +v3d_translate_pipe_swizzle(enum pipe_swizzle swizzle); + +uint32_t +v3d_hw_prim_type(enum mesa_prim prim_type); + +uint32_t +v3d_internal_bpp_words(uint32_t internal_bpp); + +/* Some configuration packets want the size on log2, but starting at 0 for + * size 8. + */ +static inline uint8_t +log2_tile_size(uint32_t size) +{ + switch(size) { + case 8: + return 0; + case 16: + return 1; + case 32: + return 2; + case 64: + return 3; + default: + unreachable("Unsupported tile width/height"); + } +} + +uint32_t +v3d_compute_rt_row_row_stride_128_bits(uint32_t tile_width, + uint32_t bpp); #endif diff --git a/src/broadcom/compiler/meson.build b/src/broadcom/compiler/meson.build index 95156140ad9..d5aafb3879e 100644 --- a/src/broadcom/compiler/meson.build +++ b/src/broadcom/compiler/meson.build @@ -32,23 +32,22 @@ libbroadcom_compiler_files = files( 'vir_to_qpu.c', 'qpu_schedule.c', 'qpu_validate.c', - 'v3d33_tex.c', - 'v3d40_tex.c', - 'v3d33_vpm_setup.c', + 'v3d_tex.c', 'v3d_compiler.h', 'v3d_nir_lower_io.c', 'v3d_nir_lower_image_load_store.c', 'v3d_nir_lower_line_smooth.c', + 'v3d_nir_lower_load_store_bitsize.c', 'v3d_nir_lower_logic_ops.c', - 'v3d_nir_lower_robust_buffer_access.c', 'v3d_nir_lower_scratch.c', 'v3d_nir_lower_txf_ms.c', + 'v3d_packing.c', ) libbroadcom_compiler = static_library( - ['broadcom_compiler', v3d_xml_pack], - libbroadcom_compiler_files, - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom], + 'broadcom_compiler', + [libbroadcom_compiler_files, v3d_xml_pack], + include_directories : [inc_include, inc_src, inc_gallium, inc_gallium_aux, inc_broadcom], c_args : [no_override_init_args], gnu_symbol_visibility : 'hidden', dependencies : [dep_libdrm, dep_valgrind, idep_nir_headers], diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c index d0a89f1a7d4..acc62a092f2 100644 --- a/src/broadcom/compiler/nir_to_vir.c +++ b/src/broadcom/compiler/nir_to_vir.c @@ -38,7 +38,7 @@ #define __gen_address_type uint32_t #define __gen_address_offset(reloc) (*reloc) #define __gen_emit_reloc(cl, reloc) -#include "cle/v3d_packet_v41_pack.h" +#include "cle/v3d_packet_v42_pack.h" #define GENERAL_TMU_LOOKUP_PER_QUAD (0 << 7) #define GENERAL_TMU_LOOKUP_PER_PIXEL (1 << 7) @@ -164,7 +164,7 @@ vir_emit_thrsw(struct v3d_compile *c) c->last_thrsw->qpu.sig.thrsw = true; c->last_thrsw_at_top_level = 
!c->in_control_flow; - /* We need to lock the scoreboard before any tlb acess happens. If this + /* We need to lock the scoreboard before any tlb access happens. If this * thread switch comes after we have emitted a tlb load, then it means * that we can't lock on the last thread switch any more. */ @@ -187,6 +187,28 @@ v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src) } static uint32_t +v3d_general_tmu_op_for_atomic(nir_intrinsic_instr *instr) +{ + nir_atomic_op atomic_op = nir_intrinsic_atomic_op(instr); + switch (atomic_op) { + case nir_atomic_op_iadd: + return instr->intrinsic == nir_intrinsic_ssbo_atomic ? + v3d_get_op_for_atomic_add(instr, 2) : + v3d_get_op_for_atomic_add(instr, 1); + case nir_atomic_op_imin: return V3D_TMU_OP_WRITE_SMIN; + case nir_atomic_op_umin: return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR; + case nir_atomic_op_imax: return V3D_TMU_OP_WRITE_SMAX; + case nir_atomic_op_umax: return V3D_TMU_OP_WRITE_UMAX; + case nir_atomic_op_iand: return V3D_TMU_OP_WRITE_AND_READ_INC; + case nir_atomic_op_ior: return V3D_TMU_OP_WRITE_OR_READ_DEC; + case nir_atomic_op_ixor: return V3D_TMU_OP_WRITE_XOR_READ_NOT; + case nir_atomic_op_xchg: return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH; + case nir_atomic_op_cmpxchg: return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH; + default: unreachable("unknown atomic op"); + } +} + +static uint32_t v3d_general_tmu_op(nir_intrinsic_instr *instr) { switch (instr->intrinsic) { @@ -195,41 +217,21 @@ v3d_general_tmu_op(nir_intrinsic_instr *instr) case nir_intrinsic_load_uniform: case nir_intrinsic_load_shared: case nir_intrinsic_load_scratch: + case nir_intrinsic_load_global_2x32: case nir_intrinsic_store_ssbo: case nir_intrinsic_store_shared: case nir_intrinsic_store_scratch: + case nir_intrinsic_store_global_2x32: return V3D_TMU_OP_REGULAR; - case nir_intrinsic_ssbo_atomic_add: - return v3d_get_op_for_atomic_add(instr, 2); - case nir_intrinsic_shared_atomic_add: - return v3d_get_op_for_atomic_add(instr, 1); - case nir_intrinsic_ssbo_atomic_imin: - case nir_intrinsic_shared_atomic_imin: - return V3D_TMU_OP_WRITE_SMIN; - case nir_intrinsic_ssbo_atomic_umin: - case nir_intrinsic_shared_atomic_umin: - return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR; - case nir_intrinsic_ssbo_atomic_imax: - case nir_intrinsic_shared_atomic_imax: - return V3D_TMU_OP_WRITE_SMAX; - case nir_intrinsic_ssbo_atomic_umax: - case nir_intrinsic_shared_atomic_umax: - return V3D_TMU_OP_WRITE_UMAX; - case nir_intrinsic_ssbo_atomic_and: - case nir_intrinsic_shared_atomic_and: - return V3D_TMU_OP_WRITE_AND_READ_INC; - case nir_intrinsic_ssbo_atomic_or: - case nir_intrinsic_shared_atomic_or: - return V3D_TMU_OP_WRITE_OR_READ_DEC; - case nir_intrinsic_ssbo_atomic_xor: - case nir_intrinsic_shared_atomic_xor: - return V3D_TMU_OP_WRITE_XOR_READ_NOT; - case nir_intrinsic_ssbo_atomic_exchange: - case nir_intrinsic_shared_atomic_exchange: - return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH; - case nir_intrinsic_ssbo_atomic_comp_swap: - case nir_intrinsic_shared_atomic_comp_swap: - return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH; + + case nir_intrinsic_ssbo_atomic: + case nir_intrinsic_ssbo_atomic_swap: + case nir_intrinsic_shared_atomic: + case nir_intrinsic_shared_atomic_swap: + case nir_intrinsic_global_atomic_2x32: + case nir_intrinsic_global_atomic_swap_2x32: + return v3d_general_tmu_op_for_atomic(instr); + default: unreachable("unknown intrinsic op"); } @@ -270,13 +272,13 @@ ntq_flush_tmu(struct v3d_compile *c) bool emitted_tmuwt = false; for (int i = 0; i < c->tmu.flush_count; i++) { if 
(c->tmu.flush[i].component_mask > 0) { - nir_dest *dest = c->tmu.flush[i].dest; - assert(dest); + nir_def *def = c->tmu.flush[i].def; + assert(def); for (int j = 0; j < 4; j++) { if (c->tmu.flush[i].component_mask & (1 << j)) { - ntq_store_dest(c, dest, j, - vir_MOV(c, vir_LDTMU(c))); + ntq_store_def(c, def, j, + vir_MOV(c, vir_LDTMU(c))); } } } else if (!emitted_tmuwt) { @@ -292,12 +294,12 @@ ntq_flush_tmu(struct v3d_compile *c) /** * Queues a pending thread switch + LDTMU/TMUWT for a TMU operation. The caller - * is reponsible for ensuring that doing this doesn't overflow the TMU fifos, + * is responsible for ensuring that doing this doesn't overflow the TMU fifos, * and more specifically, the output fifo, since that can't stall. */ void ntq_add_pending_tmu_flush(struct v3d_compile *c, - nir_dest *dest, + nir_def *def, uint32_t component_mask) { const uint32_t num_components = util_bitcount(component_mask); @@ -305,13 +307,18 @@ ntq_add_pending_tmu_flush(struct v3d_compile *c, if (num_components > 0) { c->tmu.output_fifo_size += num_components; - if (!dest->is_ssa) - _mesa_set_add(c->tmu.outstanding_regs, dest->reg.reg); + + nir_intrinsic_instr *store = nir_store_reg_for_def(def); + if (store != NULL) { + nir_def *reg = store->src[1].ssa; + _mesa_set_add(c->tmu.outstanding_regs, reg); + } } - c->tmu.flush[c->tmu.flush_count].dest = dest; + c->tmu.flush[c->tmu.flush_count].def = def; c->tmu.flush[c->tmu.flush_count].component_mask = component_mask; c->tmu.flush_count++; + c->tmu.total_count++; if (c->disable_tmu_pipelining) ntq_flush_tmu(c); @@ -342,6 +349,7 @@ emit_tmu_general_store_writes(struct v3d_compile *c, uint32_t base_const_offset, uint32_t *writemask, uint32_t *const_offset, + uint32_t *type_size, uint32_t *tmu_writes) { struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD); @@ -371,7 +379,9 @@ emit_tmu_general_store_writes(struct v3d_compile *c, /* Update the offset for the TMU write based on the * the first component we are writing. */ - *const_offset = base_const_offset + first_component * 4; + *type_size = nir_src_bit_size(instr->src[0]) / 8; + *const_offset = + base_const_offset + first_component * (*type_size); /* Clear these components from the writemask */ uint32_t written_mask = @@ -433,6 +443,7 @@ emit_tmu_general_address_write(struct v3d_compile *c, int offset_src, struct qreg base_offset, uint32_t const_offset, + uint32_t dest_components, uint32_t *tmu_writes) { if (mode == MODE_COUNT) { @@ -478,6 +489,8 @@ emit_tmu_general_address_write(struct v3d_compile *c, if (vir_in_nonuniform_control_flow(c)) vir_set_cond(tmu, V3D_QPU_COND_IFA); + + tmu->ldtmu_count = dest_components; } /** @@ -486,7 +499,7 @@ emit_tmu_general_address_write(struct v3d_compile *c, */ static void ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, - bool is_shared_or_scratch) + bool is_shared_or_scratch, bool is_global) { uint32_t tmu_op = v3d_general_tmu_op(instr); @@ -495,25 +508,32 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, * amount to add/sub, as that is implicit. 
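* For example, an SSBO atomic iadd of constant +1 is replaced with
* V3D_TMU_OP_WRITE_AND_READ_INC (and -1 with
* V3D_TMU_OP_WRITE_OR_READ_DEC), so the source carrying the addend is
* not emitted.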
*/
bool atomic_add_replaced =
- ((instr->intrinsic == nir_intrinsic_ssbo_atomic_add ||
- instr->intrinsic == nir_intrinsic_shared_atomic_add) &&
+ (instr->intrinsic == nir_intrinsic_ssbo_atomic ||
+ instr->intrinsic == nir_intrinsic_shared_atomic ||
+ instr->intrinsic == nir_intrinsic_global_atomic_2x32) &&
+ nir_intrinsic_atomic_op(instr) == nir_atomic_op_iadd &&
(tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC ||
- tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC));
+ tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC);
bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
instr->intrinsic == nir_intrinsic_store_scratch ||
- instr->intrinsic == nir_intrinsic_store_shared);
+ instr->intrinsic == nir_intrinsic_store_shared ||
+ instr->intrinsic == nir_intrinsic_store_global_2x32);
bool is_load = (instr->intrinsic == nir_intrinsic_load_uniform ||
instr->intrinsic == nir_intrinsic_load_ubo ||
instr->intrinsic == nir_intrinsic_load_ssbo ||
instr->intrinsic == nir_intrinsic_load_scratch ||
- instr->intrinsic == nir_intrinsic_load_shared);
+ instr->intrinsic == nir_intrinsic_load_shared ||
+ instr->intrinsic == nir_intrinsic_load_global_2x32);
if (!is_load)
c->tmu_dirty_rcl = true;
- bool has_index = !is_shared_or_scratch;
+ if (is_global)
+ c->has_global_address = true;
+
+ bool has_index = !is_shared_or_scratch && !is_global;
int offset_src;
if (instr->intrinsic == nir_intrinsic_load_uniform) {
@@ -522,6 +542,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
instr->intrinsic == nir_intrinsic_load_ubo ||
instr->intrinsic == nir_intrinsic_load_scratch ||
instr->intrinsic == nir_intrinsic_load_shared ||
+ instr->intrinsic == nir_intrinsic_load_global_2x32 ||
atomic_add_replaced) {
offset_src = 0 + has_index;
} else if (is_store) {
@@ -542,13 +563,11 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
v3d_unit_data_create(0, const_offset));
const_offset = 0;
} else if (instr->intrinsic == nir_intrinsic_load_ubo) {
- uint32_t index = nir_src_as_uint(instr->src[0]);
- /* On OpenGL QUNIFORM_UBO_ADDR takes a UBO index
- * shifted up by 1 (0 is gallium's constant buffer 0).
+ /* QUNIFORM_UBO_ADDR takes a UBO index shifted up by 1 (0
+ * is gallium's constant buffer 0 in GL and push constants
+ * in Vulkan).
*/
- if (c->key->environment == V3D_ENVIRONMENT_OPENGL)
- index++;
-
+ uint32_t index = nir_src_as_uint(instr->src[0]) + 1;
base_offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
v3d_unit_data_create(index, const_offset));
@@ -565,10 +584,16 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
base_offset = c->cs_shared_offset;
const_offset += nir_intrinsic_base(instr);
}
+ } else if (is_global) {
+ /* Global load/store intrinsics use global addresses, so the
+ * offset is the target address and we don't need to add it
+ * to a base offset.
+ */
+ base_offset = vir_uniform_ui(c, 0);
} else {
+ uint32_t idx = is_store ? 1 : 0;
base_offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
- nir_src_as_uint(instr->src[is_store ?
- 1 : 0])); + nir_src_comp_as_uint(instr->src[idx], 0)); } /* We are ready to emit TMU register writes now, but before we actually @@ -588,16 +613,21 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, for (enum emit_mode mode = MODE_COUNT; mode != MODE_LAST; mode++) { assert(mode == MODE_COUNT || tmu_writes > 0); + uint32_t type_size = 4; + if (is_store) { emit_tmu_general_store_writes(c, mode, instr, base_const_offset, &writemask, &const_offset, + &type_size, &tmu_writes); } else if (!is_load && !atomic_add_replaced) { - emit_tmu_general_atomic_writes(c, mode, instr, - tmu_op, has_index, - &tmu_writes); + emit_tmu_general_atomic_writes(c, mode, instr, + tmu_op, has_index, + &tmu_writes); + } else if (is_load) { + type_size = instr->def.bit_size / 8; } /* For atomics we use 32bit except for CMPXCHG, that we need @@ -618,17 +648,40 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, v3d_tmu_get_type_from_op(tmu_op, !is_load) == V3D_TMU_OP_TYPE_ATOMIC; + /* Only load per-quad if we can be certain that all + * lanes in the quad are active. Notice that demoted + * invocations, unlike terminated ones, are still + * active: we want to skip memory writes for them but + * loads should still work. + */ uint32_t perquad = - is_load && !vir_in_nonuniform_control_flow(c) - ? GENERAL_TMU_LOOKUP_PER_QUAD - : GENERAL_TMU_LOOKUP_PER_PIXEL; + is_load && !vir_in_nonuniform_control_flow(c) && + ((c->s->info.stage == MESA_SHADER_FRAGMENT && + c->s->info.fs.needs_quad_helper_invocations && + !c->emitted_discard) || + c->s->info.uses_wide_subgroup_intrinsics) ? + GENERAL_TMU_LOOKUP_PER_QUAD : + GENERAL_TMU_LOOKUP_PER_PIXEL; config = 0xffffff00 | tmu_op << 3 | perquad; if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) { config |= GENERAL_TMU_LOOKUP_TYPE_VEC2; } else if (is_atomic || num_components == 1) { - config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI; + switch (type_size) { + case 4: + config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI; + break; + case 2: + config |= GENERAL_TMU_LOOKUP_TYPE_16BIT_UI; + break; + case 1: + config |= GENERAL_TMU_LOOKUP_TYPE_8BIT_UI; + break; + default: + unreachable("Unsupported bitsize"); + } } else { + assert(type_size == 4); config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 + num_components - 2; } @@ -637,7 +690,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, emit_tmu_general_address_write(c, mode, instr, config, dynamic_src, offset_src, base_offset, const_offset, - &tmu_writes); + dest_components, &tmu_writes); assert(tmu_writes > 0); if (mode == MODE_COUNT) { @@ -660,7 +713,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, */ const uint32_t component_mask = (1 << dest_components) - 1; - ntq_add_pending_tmu_flush(c, &instr->dest, + ntq_add_pending_tmu_flush(c, &instr->def, component_mask); } } @@ -673,7 +726,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, } static struct qreg * -ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def) +ntq_init_ssa_def(struct v3d_compile *c, nir_def *def) { struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, def->num_components); @@ -717,8 +770,8 @@ is_ldunif_signal(const struct v3d_qpu_sig *sig) * its destination to be the NIR reg's destination */ void -ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan, - struct qreg result) +ntq_store_def(struct v3d_compile *c, nir_def *def, int chan, + struct qreg result) { struct qinst *last_inst = NULL; if (!list_is_empty(&c->cur_block->instructions)) @@ -731,23 +784,25 @@ 
ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan, assert(result.file == QFILE_TEMP && last_inst && (last_inst == c->defs[result.index] || is_reused_uniform)); - if (dest->is_ssa) { - assert(chan < dest->ssa.num_components); + nir_intrinsic_instr *store = nir_store_reg_for_def(def); + if (store == NULL) { + assert(chan < def->num_components); struct qreg *qregs; struct hash_entry *entry = - _mesa_hash_table_search(c->def_ht, &dest->ssa); + _mesa_hash_table_search(c->def_ht, def); if (entry) qregs = entry->data; else - qregs = ntq_init_ssa_def(c, &dest->ssa); + qregs = ntq_init_ssa_def(c, def); qregs[chan] = result; } else { - nir_register *reg = dest->reg.reg; - assert(dest->reg.base_offset == 0); - assert(reg->num_array_elems == 0); + nir_def *reg = store->src[1].ssa; + ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg); + assert(nir_intrinsic_base(store) == 0); + assert(nir_intrinsic_num_array_elems(decl) == 0); struct hash_entry *entry = _mesa_hash_table_search(c->def_ht, reg); struct qreg *qregs = entry->data; @@ -802,7 +857,9 @@ struct qreg ntq_get_src(struct v3d_compile *c, nir_src src, int i) { struct hash_entry *entry; - if (src.is_ssa) { + + nir_intrinsic_instr *load = nir_load_reg_for_def(src.ssa); + if (load == NULL) { assert(i < src.ssa->num_components); entry = _mesa_hash_table_search(c->def_ht, src.ssa); @@ -811,10 +868,11 @@ ntq_get_src(struct v3d_compile *c, nir_src src, int i) entry = _mesa_hash_table_search(c->def_ht, src.ssa); } } else { - nir_register *reg = src.reg.reg; - assert(reg->num_array_elems == 0); - assert(src.reg.base_offset == 0); - assert(i < reg->num_components); + nir_def *reg = load->src[0].ssa; + ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg); + assert(nir_intrinsic_base(load) == 0); + assert(nir_intrinsic_num_array_elems(decl) == 0); + assert(i < nir_intrinsic_num_components(decl)); if (_mesa_set_search(c->tmu.outstanding_regs, reg)) ntq_flush_tmu(c); @@ -830,13 +888,8 @@ static struct qreg ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr, unsigned src) { - assert(util_is_power_of_two_or_zero(instr->dest.write_mask)); - unsigned chan = ffs(instr->dest.write_mask) - 1; struct qreg r = ntq_get_src(c, instr->src[src].src, - instr->src[src].swizzle[chan]); - - assert(!instr->src[src].abs); - assert(!instr->src[src].negate); + instr->src[src].swizzle[0]); return r; }; @@ -876,6 +929,7 @@ ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr) case GLSL_SAMPLER_DIM_3D: case GLSL_SAMPLER_DIM_CUBE: case GLSL_SAMPLER_DIM_BUF: + case GLSL_SAMPLER_DIM_EXTERNAL: /* Don't minify the array size. 
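ntq_minify(), used for the non-array dimensions just below, is the usual GL mip-chain halving. Its body is not shown in this hunk, so this is a scalar model under that assumption (MAX2 as in Mesa's util/macros.h):

    static uint32_t
    minify(uint32_t size, uint32_t lod)
    {
            /* each successive LOD halves the dimension, clamped to 1 */
            return MAX2(size >> lod, 1);
    }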
*/ if (!(instr->is_array && i == dest_size - 1)) { size = ntq_minify(c, size, lod); @@ -890,7 +944,7 @@ ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr) unreachable("Bad sampler type"); } - ntq_store_dest(c, &instr->dest, i, size); + ntq_store_def(c, &instr->def, i, size); } } @@ -905,12 +959,12 @@ ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) */ switch (instr->op) { case nir_texop_query_levels: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit)); return; case nir_texop_texture_samples: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_TEXTURE_SAMPLES, unit)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_TEXTURE_SAMPLES, unit)); return; case nir_texop_txs: ntq_emit_txs(c, instr); @@ -919,10 +973,7 @@ ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) break; } - if (c->devinfo->ver >= 40) - v3d40_vir_emit_tex(c, instr); - else - v3d33_vir_emit_tex(c, instr); + v3d_vir_emit_tex(c, instr); } static struct qreg @@ -963,44 +1014,43 @@ emit_fragcoord_input(struct v3d_compile *c, int attr) static struct qreg emit_smooth_varying(struct v3d_compile *c, - struct qreg vary, struct qreg w, struct qreg r5) + struct qreg vary, struct qreg w, struct qreg c_reg) { - return vir_FADD(c, vir_FMUL(c, vary, w), r5); + return vir_FADD(c, vir_FMUL(c, vary, w), c_reg); } static struct qreg emit_noperspective_varying(struct v3d_compile *c, - struct qreg vary, struct qreg r5) + struct qreg vary, struct qreg c_reg) { - return vir_FADD(c, vir_MOV(c, vary), r5); + return vir_FADD(c, vir_MOV(c, vary), c_reg); } static struct qreg emit_flat_varying(struct v3d_compile *c, - struct qreg vary, struct qreg r5) + struct qreg vary, struct qreg c_reg) { vir_MOV_dest(c, c->undef, vary); - return vir_MOV(c, r5); + return vir_MOV(c, c_reg); } static struct qreg emit_fragment_varying(struct v3d_compile *c, nir_variable *var, int8_t input_idx, uint8_t swizzle, int array_index) { - struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3); - struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5); + struct qreg c_reg; /* C coefficient */ + + if (c->devinfo->has_accumulators) + c_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5); + else + c_reg = vir_reg(QFILE_REG, 0); struct qinst *ldvary = NULL; struct qreg vary; - if (c->devinfo->ver >= 41) { - ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef, - c->undef, c->undef); - ldvary->qpu.sig.ldvary = true; - vary = vir_emit_def(c, ldvary); - } else { - vir_NOP(c)->qpu.sig.ldvary = true; - vary = r3; - } + ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef, + c->undef, c->undef); + ldvary->qpu.sig.ldvary = true; + vary = vir_emit_def(c, ldvary); /* Store the input value before interpolation so we can implement * GLSL's interpolateAt functions if the shader uses them. 
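The three helpers above encode the varying equations the ldvary pipeline expects: the hardware hands the QPU an A coefficient ("vary") and leaves the C coefficient in r5 (or a register-file slot on parts without accumulators). A scalar model of the three modes (illustrative; the real INTERP_MODE_* enum has more members):

    static float
    interpolate_varying(enum glsl_interp_mode mode, float vary, float w, float c)
    {
            switch (mode) {
            case INTERP_MODE_SMOOTH:        return vary * w + c; /* perspective-correct */
            case INTERP_MODE_NOPERSPECTIVE: return vary + c;
            case INTERP_MODE_FLAT:          return c;            /* constant per primitive */
            default: unreachable("bad interpolation mode");
            }
    }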
@@ -1008,7 +1058,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, if (input_idx >= 0) { assert(var); c->interp[input_idx].vp = vary; - c->interp[input_idx].C = vir_MOV(c, r5); + c->interp[input_idx].C = vir_MOV(c, c_reg); c->interp[input_idx].mode = var->data.interpolation; } @@ -1018,7 +1068,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, */ if (!var) { assert(input_idx < 0); - return emit_smooth_varying(c, vary, c->payload_w, r5); + return emit_smooth_varying(c, vary, c->payload_w, c_reg); } int i = c->num_inputs++; @@ -1033,20 +1083,20 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, if (var->data.centroid) { BITSET_SET(c->centroid_flags, i); result = emit_smooth_varying(c, vary, - c->payload_w_centroid, r5); + c->payload_w_centroid, c_reg); } else { - result = emit_smooth_varying(c, vary, c->payload_w, r5); + result = emit_smooth_varying(c, vary, c->payload_w, c_reg); } break; case INTERP_MODE_NOPERSPECTIVE: BITSET_SET(c->noperspective_flags, i); - result = emit_noperspective_varying(c, vary, r5); + result = emit_noperspective_varying(c, vary, c_reg); break; case INTERP_MODE_FLAT: BITSET_SET(c->flat_shade_flags, i); - result = emit_flat_varying(c, vary, r5); + result = emit_flat_varying(c, vary, c_reg); break; default: @@ -1163,16 +1213,6 @@ ntq_emit_comparison(struct v3d_compile *c, vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC); break; - case nir_op_i2b32: - vir_set_pf(c, vir_MOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ); - cond_invert = true; - break; - - case nir_op_f2b32: - vir_set_pf(c, vir_FMOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ); - cond_invert = true; - break; - default: return false; } @@ -1188,7 +1228,7 @@ ntq_emit_comparison(struct v3d_compile *c, static struct nir_alu_instr * ntq_get_alu_parent(nir_src src) { - if (!src.is_ssa || src.ssa->parent_instr->type != nir_instr_type_alu) + if (src.ssa->parent_instr->type != nir_instr_type_alu) return NULL; nir_alu_instr *instr = nir_instr_as_alu(src.ssa->parent_instr); if (!instr) @@ -1199,7 +1239,7 @@ ntq_get_alu_parent(nir_src src) * src. */ for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { - if (!instr->src[i].src.is_ssa) + if (nir_load_reg_for_def(instr->src[i].src.ssa)) return NULL; } @@ -1242,12 +1282,78 @@ ntq_emit_cond_to_bool(struct v3d_compile *c, enum v3d_qpu_cond cond) return result; } +static struct qreg +ntq_emit_cond_to_int(struct v3d_compile *c, enum v3d_qpu_cond cond) +{ + struct qreg result = + vir_MOV(c, vir_SEL(c, cond, + vir_uniform_ui(c, 1), + vir_uniform_ui(c, 0))); + c->flags_temp = result.index; + c->flags_cond = cond; + return result; +} + +static struct qreg +f2f16_rtz(struct v3d_compile *c, struct qreg f32) +{ + /* The GPU doesn't provide a mechanism to modify the f32->f16 rounding + * method and seems to be using RTE by default, so we need to implement + * RTZ rounding in software. 
+ */ + struct qreg rf16 = vir_FMOV(c, f32); + vir_set_pack(c->defs[rf16.index], V3D_QPU_PACK_L); + + struct qreg rf32 = vir_FMOV(c, rf16); + vir_set_unpack(c->defs[rf32.index], 0, V3D_QPU_UNPACK_L); + + struct qreg f32_abs = vir_FMOV(c, f32); + vir_set_unpack(c->defs[f32_abs.index], 0, V3D_QPU_UNPACK_ABS); + + struct qreg rf32_abs = vir_FMOV(c, rf32); + vir_set_unpack(c->defs[rf32_abs.index], 0, V3D_QPU_UNPACK_ABS); + + vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), f32_abs, rf32_abs), + V3D_QPU_PF_PUSHN); + return vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA, + vir_SUB(c, rf16, vir_uniform_ui(c, 1)), rf16)); +} + +/** + * Takes the result value of a signed integer width conversion from a smaller + * type to a larger type and if needed, it applies sign extension to it. + */ +static struct qreg +sign_extend(struct v3d_compile *c, + struct qreg value, + uint32_t src_bit_size, + uint32_t dst_bit_size) +{ + assert(src_bit_size < dst_bit_size); + + struct qreg tmp = vir_MOV(c, value); + + /* Do we need to sign-extend? */ + uint32_t sign_mask = 1 << (src_bit_size - 1); + struct qinst *sign_check = + vir_AND_dest(c, vir_nop_reg(), + tmp, vir_uniform_ui(c, sign_mask)); + vir_set_pf(c, sign_check, V3D_QPU_PF_PUSHZ); + + /* If so, fill in leading sign bits */ + uint32_t extend_bits = ~(((1 << src_bit_size) - 1)) & + ((1ull << dst_bit_size) - 1); + struct qinst *extend_inst = + vir_OR_dest(c, tmp, tmp, + vir_uniform_ui(c, extend_bits)); + vir_set_cond(extend_inst, V3D_QPU_COND_IFNA); + + return tmp; +} + static void ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) { - /* This should always be lowered to ALU operations for V3D. */ - assert(!instr->dest.saturate); - /* Vectors are special in that they have non-scalarized writemasks, * and just take the first swizzle channel for each argument in order * into each writemask channel. @@ -1260,8 +1366,8 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) srcs[i] = ntq_get_src(c, instr->src[i].src, instr->src[i].swizzle[0]); for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) - ntq_store_dest(c, &instr->dest.dest, i, - vir_MOV(c, srcs[i])); + ntq_store_def(c, &instr->def, i, + vir_MOV(c, srcs[i])); return; } @@ -1327,6 +1433,94 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) result = vir_AND(c, src[0], vir_uniform_ui(c, 1)); break; + case nir_op_f2f16: + case nir_op_f2f16_rtne: + assert(nir_src_bit_size(instr->src[0].src) == 32); + result = vir_FMOV(c, src[0]); + vir_set_pack(c->defs[result.index], V3D_QPU_PACK_L); + break; + + case nir_op_f2f16_rtz: + assert(nir_src_bit_size(instr->src[0].src) == 32); + result = f2f16_rtz(c, src[0]); + break; + + case nir_op_f2f32: + assert(nir_src_bit_size(instr->src[0].src) == 16); + result = vir_FMOV(c, src[0]); + vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L); + break; + + case nir_op_i2i16: { + uint32_t bit_size = nir_src_bit_size(instr->src[0].src); + assert(bit_size == 32 || bit_size == 8); + if (bit_size == 32) { + /* We don't have integer pack/unpack methods for + * converting between 16-bit and 32-bit, so we implement + * the conversion manually by truncating the src. 
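Returning to f2f16_rtz() above: the same fixup, modeled in scalar C. This sketch assumes the compiler's _Float16 conversion rounds to nearest-even exactly like the QPU's f32->f16 pack does (GCC/Clang extension type):

    #include <math.h>
    #include <stdint.h>

    static uint16_t
    f2f16_rtz_model(float f)
    {
            union { _Float16 h; uint16_t bits; } r = { .h = (_Float16)f };

            /* If RTE rounded away from zero, the round-tripped magnitude
             * exceeds the source: step one ULP back towards zero. The sign
             * bit is safe because a zero magnitude is never rounded up. */
            if (fabsf((float)r.h) > fabsf(f))
                    r.bits--;

            return r.bits;
    }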
+ */ + result = vir_AND(c, src[0], vir_uniform_ui(c, 0xffff)); + } else { + struct qreg tmp = vir_AND(c, src[0], + vir_uniform_ui(c, 0xff)); + result = vir_MOV(c, sign_extend(c, tmp, bit_size, 16)); + } + break; + } + + case nir_op_u2u16: { + uint32_t bit_size = nir_src_bit_size(instr->src[0].src); + assert(bit_size == 32 || bit_size == 8); + + /* We don't have integer pack/unpack methods for converting + * between 16-bit and 32-bit, so we implement the conversion + * manually by truncating the src. For the 8-bit case, we + * want to make sure we don't copy garbage from any of the + * 24 MSB bits. + */ + if (bit_size == 32) + result = vir_AND(c, src[0], vir_uniform_ui(c, 0xffff)); + else + result = vir_AND(c, src[0], vir_uniform_ui(c, 0xff)); + break; + } + + case nir_op_i2i8: + case nir_op_u2u8: + assert(nir_src_bit_size(instr->src[0].src) == 32 || + nir_src_bit_size(instr->src[0].src) == 16); + /* We don't have integer pack/unpack methods for converting + * between 8-bit and 32-bit, so we implement the conversion + * manually by truncating the src. + */ + result = vir_AND(c, src[0], vir_uniform_ui(c, 0xff)); + break; + + case nir_op_u2u32: { + uint32_t bit_size = nir_src_bit_size(instr->src[0].src); + assert(bit_size == 16 || bit_size == 8); + + /* We don't have a native 8-bit/16-bit MOV, so we copy all 32 bits + * from the src but make sure to clear any garbage bits that + * may be present in the invalid src bits. + */ + uint32_t mask = (1 << bit_size) - 1; + result = vir_AND(c, src[0], vir_uniform_ui(c, mask)); + break; + } + + case nir_op_i2i32: { + uint32_t bit_size = nir_src_bit_size(instr->src[0].src); + assert(bit_size == 16 || bit_size == 8); + + uint32_t mask = (1 << bit_size) - 1; + struct qreg tmp = vir_AND(c, src[0], + vir_uniform_ui(c, mask)); + + result = vir_MOV(c, sign_extend(c, tmp, bit_size, 32)); + break; + } + case nir_op_iadd: result = vir_ADD(c, src[0], src[1]); break; @@ -1390,8 +1584,6 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) break; } - case nir_op_i2b32: - case nir_op_f2b32: case nir_op_feq32: case nir_op_fneu32: case nir_op_fge32: @@ -1485,13 +1677,35 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) case nir_op_uadd_carry: vir_set_pf(c, vir_ADD_dest(c, vir_nop_reg(), src[0], src[1]), V3D_QPU_PF_PUSHC); - result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA); + result = ntq_emit_cond_to_int(c, V3D_QPU_COND_IFA); + break; + + case nir_op_usub_borrow: + vir_set_pf(c, vir_SUB_dest(c, vir_nop_reg(), src[0], src[1]), + V3D_QPU_PF_PUSHC); + result = ntq_emit_cond_to_int(c, V3D_QPU_COND_IFA); break; case nir_op_pack_half_2x16_split: result = vir_VFPACK(c, src[0], src[1]); break; + case nir_op_pack_2x32_to_2x16_v3d: + result = vir_VPACK(c, src[0], src[1]); + break; + + case nir_op_pack_32_to_r11g11b10_v3d: + result = vir_V11FPACK(c, src[0], src[1]); + break; + + case nir_op_pack_uint_32_to_r10g10b10a2_v3d: + result = vir_V10PACK(c, src[0], src[1]); + break; + + case nir_op_pack_4x16_to_4x8_v3d: + result = vir_V8PACK(c, src[0], src[1]); + break; + case nir_op_unpack_half_2x16_split_x: result = vir_FMOV(c, src[0]); vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L); @@ -1502,26 +1716,29 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_H); break; - case nir_op_fquantize2f16: { - /* F32 -> F16 -> F32 conversion */ - struct qreg tmp = vir_FMOV(c, src[0]); - vir_set_pack(c->defs[tmp.index], V3D_QPU_PACK_L); - tmp = vir_FMOV(c, tmp); - vir_set_unpack(c->defs[tmp.index], 0, 
V3D_QPU_UNPACK_L); + case nir_op_pack_2x16_to_unorm_2x8_v3d: + result = vir_VFTOUNORM8(c, src[0]); + break; - /* Check for denorm */ - struct qreg abs_src = vir_FMOV(c, src[0]); - vir_set_unpack(c->defs[abs_src.index], 0, V3D_QPU_UNPACK_ABS); - struct qreg threshold = vir_uniform_f(c, ldexpf(1.0, -14)); - vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), abs_src, threshold), - V3D_QPU_PF_PUSHC); + case nir_op_pack_2x16_to_snorm_2x8_v3d: + result = vir_VFTOSNORM8(c, src[0]); + break; - /* Return +/-0 for denorms */ - struct qreg zero = - vir_AND(c, src[0], vir_uniform_ui(c, 0x80000000)); - result = vir_FMOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, tmp, zero)); + case nir_op_pack_2x16_to_unorm_2x10_v3d: + result = vir_VFTOUNORM10LO(c, src[0]); + break; + + case nir_op_pack_2x16_to_unorm_10_2_v3d: + result = vir_VFTOUNORM10HI(c, src[0]); + break; + + case nir_op_f2unorm_16_v3d: + result = vir_FTOUNORM16(c, src[0]); + break; + + case nir_op_f2snorm_16_v3d: + result = vir_FTOSNORM16(c, src[0]); break; - } default: fprintf(stderr, "unknown NIR ALU inst: "); @@ -1530,17 +1747,12 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) abort(); } - /* We have a scalar result, so the instruction should only have a - * single channel written to. - */ - assert(util_is_power_of_two_or_zero(instr->dest.write_mask)); - ntq_store_dest(c, &instr->dest.dest, - ffs(instr->dest.write_mask) - 1, result); + ntq_store_def(c, &instr->def, 0, result); } /* Each TLB read/write setup (a render target or depth buffer) takes an 8-bit * specifier. They come from a register that's preloaded with 0xffffffff - * (0xff gets you normal vec4 f16 RT0 writes), and when one is neaded the low + * (0xff gets you normal vec4 f16 RT0 writes), and when one is needed the low * 8 bits are shifted off the bottom and 0xff shifted in from the top. */ #define TLB_TYPE_F16_COLOR (3 << 6) @@ -1670,15 +1882,6 @@ vir_emit_tlb_color_write(struct v3d_compile *c, unsigned rt) static void emit_frag_end(struct v3d_compile *c) { - /* If the shader has no non-TLB side effects and doesn't write Z - * we can promote it to enabling early_fragment_tests even - * if the user didn't. - */ - if (c->output_position_index == -1 && - !(c->s->info.num_images || c->s->info.num_ssbos)) { - c->s->info.fs.early_fragment_tests = true; - } - if (c->output_sample_mask_index != -1) { vir_SETMSF_dest(c, vir_nop_reg(), vir_AND(c, @@ -1703,55 +1906,75 @@ emit_frag_end(struct v3d_compile *c) } struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU); - if (c->output_position_index != -1 && - !c->s->info.fs.early_fragment_tests) { - struct qinst *inst = vir_MOV_dest(c, tlbu_reg, - c->outputs[c->output_position_index]); - uint8_t tlb_specifier = TLB_TYPE_DEPTH; - if (c->devinfo->ver >= 42) { - tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL | - TLB_SAMPLE_MODE_PER_PIXEL); - } else - tlb_specifier |= TLB_DEPTH_TYPE_PER_PIXEL; + /* If the shader has no non-TLB side effects and doesn't write Z + * we can promote it to enabling early_fragment_tests even + * if the user didn't. 
+ */ + if (c->output_position_index == -1 && + !(c->s->info.num_images || c->s->info.num_ssbos) && + !c->s->info.fs.uses_discard && + !c->s->info.fs.uses_demote && + !c->fs_key->sample_alpha_to_coverage && + c->output_sample_mask_index == -1 && + has_any_tlb_color_write) { + c->s->info.fs.early_fragment_tests = true; + } - inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, - tlb_specifier | - 0xffffff00); + /* By default, Z buffer writes are implicit using the Z values produced + * from FEP (Z value produced from rasterization). When this is not + * desirable (shader writes Z explicitly, has discards, etc.) we need + * to let the hardware know by setting c->writes_z to true, in which + * case we always need to write a Z value from the QPU, even if it is + * just the passthrough Z value produced from FEP. + * + * Also, from the V3D 4.2 spec: + * + * "If a shader performs a Z read the “Fragment shader does Z writes” + * bit in the shader record must be enabled to ensure deterministic + * results" + * + * So if c->reads_z is set we always need to write Z, even if it is + * a passthrough from the Z value produced from FEP. + */ + if (!c->s->info.fs.early_fragment_tests || c->reads_z) { c->writes_z = true; - } else if (c->s->info.fs.uses_discard || - !c->s->info.fs.early_fragment_tests || - c->fs_key->sample_alpha_to_coverage || - !has_any_tlb_color_write) { - /* Emit passthrough Z if it needed to be delayed until shader - * end due to potential discards. - * - * Since (single-threaded) fragment shaders always need a TLB - * write, emit passthrouh Z if we didn't have any color - * buffers and flag us as potentially discarding, so that we - * can use Z as the TLB write. - */ - c->s->info.fs.uses_discard = true; - - struct qinst *inst = vir_MOV_dest(c, tlbu_reg, - vir_nop_reg()); uint8_t tlb_specifier = TLB_TYPE_DEPTH; + struct qinst *inst; + + if (c->output_position_index != -1) { + /* Shader writes to gl_FragDepth, use that */ + inst = vir_MOV_dest(c, tlbu_reg, + c->outputs[c->output_position_index]); + + tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL | + TLB_SAMPLE_MODE_PER_PIXEL); + } else { + /* Shader doesn't write to gl_FragDepth, take Z from + * FEP. + */ + c->writes_z_from_fep = true; + inst = vir_MOV_dest(c, tlbu_reg, vir_nop_reg()); - if (c->devinfo->ver >= 42) { /* The spec says the PER_PIXEL flag is ignored for * invariant writes, but the simulator demands it. */ tlb_specifier |= (TLB_V42_DEPTH_TYPE_INVARIANT | TLB_SAMPLE_MODE_PER_PIXEL); - } else { - tlb_specifier |= TLB_DEPTH_TYPE_INVARIANT; + + /* Since (single-threaded) fragment shaders always need + * a TLB write, if we don't have any we emit a + * passthrough Z and flag us as potentially discarding, + * so that we can use Z as the required TLB write. 
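Condensed, the Z-write decision implemented in this hunk looks like the following sketch (names here are illustrative, not the driver's API): the QPU writes Z whenever early-Z is off or the shader reads Z, and the value comes from FEP unless the shader writes gl_FragDepth itself.

    struct z_write_policy {
            bool writes_z;    /* QPU emits a Z TLB write at all */
            bool z_from_fep;  /* ...and it is the passthrough FEP Z */
    };

    static struct z_write_policy
    pick_z_write(bool early_fragment_tests, bool reads_z, bool writes_frag_depth)
    {
            struct z_write_policy p = { false, false };
            if (!early_fragment_tests || reads_z) {
                    p.writes_z = true;
                    p.z_from_fep = !writes_frag_depth;
            }
            return p;
    }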
+ */ + if (!has_any_tlb_color_write) + c->s->info.fs.uses_discard = true; } - inst->uniform = vir_get_uniform_index(c, - QUNIFORM_CONSTANT, + inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, tlb_specifier | 0xffffff00); - c->writes_z = true; + inst->is_tlb_z_write = true; } /* XXX: Performance improvement: Merge Z write and color writes TLB @@ -1767,7 +1990,6 @@ vir_VPM_WRITE_indirect(struct v3d_compile *c, struct qreg vpm_index, bool uniform_vpm_index) { - assert(c->devinfo->ver >= 40); if (uniform_vpm_index) vir_STVPMV(c, vpm_index, val); else @@ -1777,13 +1999,8 @@ vir_VPM_WRITE_indirect(struct v3d_compile *c, static void vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index) { - if (c->devinfo->ver >= 40) { - vir_VPM_WRITE_indirect(c, val, - vir_uniform_ui(c, vpm_index), true); - } else { - /* XXX: v3d33_vir_vpm_write_setup(c); */ - vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val); - } + vir_VPM_WRITE_indirect(c, val, + vir_uniform_ui(c, vpm_index), true); } static void @@ -1791,7 +2008,7 @@ emit_vert_end(struct v3d_compile *c) { /* GFXH-1684: VPM writes need to be complete by the end of the shader. */ - if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42) + if (c->devinfo->ver == 42) vir_VPMWT(c); } @@ -1800,7 +2017,7 @@ emit_geom_end(struct v3d_compile *c) { /* GFXH-1684: VPM writes need to be complete by the end of the shader. */ - if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42) + if (c->devinfo->ver == 42) vir_VPMWT(c); } @@ -1812,8 +2029,11 @@ mem_vectorize_callback(unsigned align_mul, unsigned align_offset, nir_intrinsic_instr *high, void *data) { - /* Our backend is 32-bit only at present */ - if (bit_size != 32) + /* TMU general access only supports 32-bit vectors */ + if (bit_size > 32) + return false; + + if ((bit_size == 8 || bit_size == 16) && num_components > 1) return false; if (align_mul % 4 != 0 || align_offset % 4 != 0) @@ -1843,7 +2063,29 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s) do { progress = false; - NIR_PASS_V(s, nir_lower_vars_to_ssa); + NIR_PASS(progress, s, nir_split_array_vars, nir_var_function_temp); + NIR_PASS(progress, s, nir_shrink_vec_array_vars, nir_var_function_temp); + NIR_PASS(progress, s, nir_opt_deref); + + NIR_PASS(progress, s, nir_lower_vars_to_ssa); + if (!s->info.var_copies_lowered) { + /* Only run this pass if nir_lower_var_copies was not called + * yet. That would lower away any copy_deref instructions and we + * don't want to introduce any more. 
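For reference, the constraints mem_vectorize_callback() (earlier in this hunk) imposes, restated as a standalone predicate; the real callback applies further checks that fall outside the visible diff context:

    static bool
    vectorize_ok(unsigned bit_size, unsigned num_components,
                 unsigned align_mul, unsigned align_offset)
    {
            if (bit_size > 32)
                    return false; /* TMU general vectors are 32-bit */
            if ((bit_size == 8 || bit_size == 16) && num_components > 1)
                    return false; /* sub-dword types only as scalars */
            if (align_mul % 4 != 0 || align_offset % 4 != 0)
                    return false; /* combined access must stay 4-byte aligned */
            return true;
    }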
+ */ + NIR_PASS(progress, s, nir_opt_find_array_copies); + } + + NIR_PASS(progress, s, nir_opt_copy_prop_vars); + NIR_PASS(progress, s, nir_opt_dead_write_vars); + NIR_PASS(progress, s, nir_opt_combine_stores, nir_var_all); + + NIR_PASS(progress, s, nir_remove_dead_variables, + (nir_variable_mode)(nir_var_function_temp | + nir_var_shader_temp | + nir_var_mem_shared), + NULL); + NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL); NIR_PASS(progress, s, nir_lower_phis_to_scalar, false); NIR_PASS(progress, s, nir_copy_prop); @@ -1851,10 +2093,39 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s) NIR_PASS(progress, s, nir_opt_dce); NIR_PASS(progress, s, nir_opt_dead_cf); NIR_PASS(progress, s, nir_opt_cse); - NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true); + NIR_PASS(progress, s, nir_opt_peephole_select, 0, false, false); + NIR_PASS(progress, s, nir_opt_peephole_select, 24, true, true); NIR_PASS(progress, s, nir_opt_algebraic); NIR_PASS(progress, s, nir_opt_constant_folding); + NIR_PASS(progress, s, nir_opt_intrinsics); + NIR_PASS(progress, s, nir_opt_idiv_const, 32); + NIR_PASS(progress, s, nir_lower_alu); + + if (nir_opt_loop(s)) { + progress = true; + NIR_PASS(progress, s, nir_copy_prop); + NIR_PASS(progress, s, nir_opt_dce); + } + + NIR_PASS(progress, s, nir_opt_conditional_discard); + + NIR_PASS(progress, s, nir_opt_remove_phis); + NIR_PASS(progress, s, nir_opt_if, false); + if (c && !c->disable_gcm) { + bool local_progress = false; + NIR_PASS(local_progress, s, nir_opt_gcm, false); + c->gcm_progress |= local_progress; + progress |= local_progress; + } + + /* Note that vectorization may undo the load/store scalarization + * pass we run for non 32-bit TMU general load/store by + * converting, for example, 2 consecutive 16-bit loads into a + * single 32-bit load. This is fine (and desirable) as long as + * the resulting 32-bit load meets 32-bit alignment requirements, + * which mem_vectorize_callback() should be enforcing. + */ nir_load_store_vectorize_options vectorize_opts = { .modes = nir_var_mem_ssbo | nir_var_mem_ubo | nir_var_mem_push_const | nir_var_mem_shared | @@ -1862,7 +2133,24 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s) .callback = mem_vectorize_callback, .robust_modes = 0, }; - NIR_PASS(progress, s, nir_opt_load_store_vectorize, &vectorize_opts); + bool vectorize_progress = false; + + + /* This requires that we have called + * nir_lower_vars_to_explicit_types / nir_lower_explicit_io + * first, which we may not have done yet if we call here too + * early during NIR pre-processing. 
We can detect this because + * in that case we won't have a compile object + */ + if (c) { + NIR_PASS(vectorize_progress, s, nir_opt_load_store_vectorize, + &vectorize_opts); + if (vectorize_progress) { + NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL); + NIR_PASS(progress, s, nir_lower_pack); + progress = true; + } + } if (lower_flrp != 0) { bool lower_flrp_progress = false; @@ -1895,10 +2183,8 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s) nir_move_options sink_opts = nir_move_const_undef | nir_move_comparisons | nir_move_copies | - nir_move_load_ubo; + nir_move_load_ubo | nir_move_load_ssbo | nir_move_load_uniform; NIR_PASS(progress, s, nir_opt_sink, sink_opts); - - NIR_PASS(progress, s, nir_opt_move, nir_move_load_ubo); } static int @@ -1915,27 +2201,9 @@ ntq_emit_vpm_read(struct v3d_compile *c, uint32_t *remaining, uint32_t vpm_index) { - struct qreg vpm = vir_reg(QFILE_VPM, vpm_index); - - if (c->devinfo->ver >= 40 ) { - return vir_LDVPMV_IN(c, - vir_uniform_ui(c, - (*num_components_queued)++)); - } - - if (*num_components_queued != 0) { - (*num_components_queued)--; - return vir_MOV(c, vpm); - } - - uint32_t num_components = MIN2(*remaining, 32); - - v3d33_vir_vpm_read_setup(c, num_components); - - *num_components_queued = num_components - 1; - *remaining -= num_components; - - return vir_MOV(c, vpm); + return vir_LDVPMV_IN(c, + vir_uniform_ui(c, + (*num_components_queued)++)); } static void @@ -2005,31 +2273,8 @@ ntq_setup_vs_inputs(struct v3d_compile *c) } /* The actual loads will happen directly in nir_intrinsic_load_input - * on newer versions. */ - if (c->devinfo->ver >= 40) - return; - - for (int loc = 0; loc < ARRAY_SIZE(c->vattr_sizes); loc++) { - resize_qreg_array(c, &c->inputs, &c->inputs_array_size, - (loc + 1) * 4); - - for (int i = 0; i < c->vattr_sizes[loc]; i++) { - c->inputs[loc * 4 + i] = - ntq_emit_vpm_read(c, - &vpm_components_queued, - &num_components, - loc * 4 + i); - - } - } - - if (c->devinfo->ver >= 40) { - assert(vpm_components_queued == num_components); - } else { - assert(vpm_components_queued == 0); - assert(num_components == 0); - } + return; } static bool @@ -2058,14 +2303,14 @@ ntq_setup_gs_inputs(struct v3d_compile *c) */ assert(glsl_type_is_array(var->type)); const struct glsl_type *type = glsl_get_array_element(var->type); - unsigned array_len = MAX2(glsl_get_length(type), 1); + unsigned var_len = glsl_count_vec4_slots(type, false, false); unsigned loc = var->data.driver_location; resize_qreg_array(c, &c->inputs, &c->inputs_array_size, - (loc + array_len) * 4); + (loc + var_len) * 4); if (var->data.compact) { - for (unsigned j = 0; j < array_len; j++) { + for (unsigned j = 0; j < var_len; j++) { unsigned input_idx = c->num_inputs++; unsigned loc_frac = var->data.location_frac + j; unsigned loc = var->data.location + loc_frac / 4; @@ -2076,8 +2321,10 @@ ntq_setup_gs_inputs(struct v3d_compile *c) continue; } - for (unsigned j = 0; j < array_len; j++) { - unsigned num_elements = glsl_get_vector_elements(type); + for (unsigned j = 0; j < var_len; j++) { + unsigned num_elements = + glsl_type_is_struct(glsl_without_array(type)) ? 
+ 4 : glsl_get_vector_elements(type); for (unsigned k = 0; k < num_elements; k++) { unsigned chan = var->data.location_frac + k; unsigned input_idx = c->num_inputs++; @@ -2124,7 +2371,7 @@ ntq_setup_fs_inputs(struct v3d_compile *c) } else if (var->data.compact) { for (int j = 0; j < var_len; j++) emit_compact_fragment_input(c, loc, var, j); - } else if (glsl_type_is_struct(var->type)) { + } else if (glsl_type_is_struct(glsl_without_array(var->type))) { for (int j = 0; j < var_len; j++) { emit_fragment_input(c, loc, var, j, 4); } @@ -2143,12 +2390,9 @@ ntq_setup_outputs(struct v3d_compile *c) return; nir_foreach_shader_out_variable(var, c->s) { - unsigned array_len = MAX2(glsl_get_length(var->type), 1); + assert(glsl_type_is_vector_or_scalar(var->type)); unsigned loc = var->data.driver_location * 4; - assert(array_len == 1); - (void)array_len; - for (int i = 0; i < 4 - var->data.location_frac; i++) { add_output(c, loc + var->data.location_frac + i, var->data.location, @@ -2157,15 +2401,17 @@ ntq_setup_outputs(struct v3d_compile *c) switch (var->data.location) { case FRAG_RESULT_COLOR: - c->output_color_var[0] = var; - c->output_color_var[1] = var; - c->output_color_var[2] = var; - c->output_color_var[3] = var; + for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) + c->output_color_var[i] = var; break; case FRAG_RESULT_DATA0: case FRAG_RESULT_DATA1: case FRAG_RESULT_DATA2: case FRAG_RESULT_DATA3: + case FRAG_RESULT_DATA4: + case FRAG_RESULT_DATA5: + case FRAG_RESULT_DATA6: + case FRAG_RESULT_DATA7: c->output_color_var[var->data.location - FRAG_RESULT_DATA0] = var; break; @@ -2185,17 +2431,19 @@ ntq_setup_outputs(struct v3d_compile *c) * Each nir_register gets a struct qreg per 32-bit component being stored. */ static void -ntq_setup_registers(struct v3d_compile *c, struct exec_list *list) +ntq_setup_registers(struct v3d_compile *c, nir_function_impl *impl) { - foreach_list_typed(nir_register, nir_reg, node, list) { - unsigned array_len = MAX2(nir_reg->num_array_elems, 1); + nir_foreach_reg_decl(decl, impl) { + unsigned num_components = nir_intrinsic_num_components(decl); + unsigned array_len = nir_intrinsic_num_array_elems(decl); + array_len = MAX2(array_len, 1); struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, - array_len * - nir_reg->num_components); + array_len * num_components); + nir_def *nir_reg = &decl->def; _mesa_hash_table_insert(c->def_ht, nir_reg, qregs); - for (int i = 0; i < array_len * nir_reg->num_components; i++) + for (int i = 0; i < array_len * num_components; i++) qregs[i] = vir_get_temp(c); } } @@ -2222,23 +2470,23 @@ ntq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr) assert(nir_src_as_uint(instr->src[1]) == 0); - ntq_store_dest(c, &instr->dest, 0, + ntq_store_def(c, &instr->def, 0, vir_uniform(c, QUNIFORM_IMAGE_WIDTH, image_index)); if (instr->num_components > 1) { - ntq_store_dest(c, &instr->dest, 1, - vir_uniform(c, - instr->num_components == 2 && is_array ? - QUNIFORM_IMAGE_ARRAY_SIZE : - QUNIFORM_IMAGE_HEIGHT, - image_index)); + ntq_store_def(c, &instr->def, 1, + vir_uniform(c, + instr->num_components == 2 && is_array ? + QUNIFORM_IMAGE_ARRAY_SIZE : + QUNIFORM_IMAGE_HEIGHT, + image_index)); } if (instr->num_components > 2) { - ntq_store_dest(c, &instr->dest, 2, - vir_uniform(c, - is_array ? - QUNIFORM_IMAGE_ARRAY_SIZE : - QUNIFORM_IMAGE_DEPTH, - image_index)); + ntq_store_def(c, &instr->def, 2, + vir_uniform(c, + is_array ? 
+ QUNIFORM_IMAGE_ARRAY_SIZE : + QUNIFORM_IMAGE_DEPTH, + image_index)); } } @@ -2263,16 +2511,14 @@ vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr) * * To fix that, we make sure we always emit a thread switch before the * first tlb color read. If that happens to be the last thread switch - * we emit, then everything is fine, but otherwsie, if any code after + * we emit, then everything is fine, but otherwise, if any code after * this point needs to emit additional thread switches, then we will * switch the strategy to locking the scoreboard on the first thread * switch instead -- see vir_emit_thrsw(). */ if (!c->emitted_tlb_load) { - if (!c->last_thrsw_at_top_level) { - assert(c->devinfo->ver >= 41); + if (!c->last_thrsw_at_top_level) vir_emit_thrsw(c); - } c->emitted_tlb_load = true; } @@ -2371,27 +2617,96 @@ vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr) } assert(color_reads_for_sample[component].file != QFILE_NULL); - ntq_store_dest(c, &instr->dest, 0, - vir_MOV(c, color_reads_for_sample[component])); + ntq_store_def(c, &instr->def, 0, + vir_MOV(c, color_reads_for_sample[component])); +} + +static bool +ntq_emit_load_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr); + +static bool +try_emit_uniform(struct v3d_compile *c, + int offset, + int num_components, + nir_def *def, + enum quniform_contents contents) +{ + /* Even though ldunif is strictly 32-bit we can still use it + * to load scalar 8-bit/16-bit uniforms so long as their offset + * is 32-bit aligned. In this case, ldunif would still load + * 32-bit into the destination with the 8-bit/16-bit uniform + * data in the LSB and garbage in the MSB, but that is fine + * because we should only be accessing the valid bits of the + * destination. + * + * FIXME: if in the future we improve our register allocator to + * pack 2 16-bit variables in the MSB and LSB of the same + * register then this optimization would not be valid as is, + * since the load clobbers the MSB. + */ + if (offset % 4 != 0) + return false; + + /* We need dwords */ + offset = offset / 4; + + for (int i = 0; i < num_components; i++) { + ntq_store_def(c, def, i, vir_uniform(c, contents, offset + i)); + } + + return true; } static void ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr) { + /* We scalarize general TMU access for anything that is not 32-bit. 
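A worked example of the rule in try_emit_uniform() above: a 16-bit scalar uniform at byte offset 8 sits in the low bits of dword 2 and can be served by ldunif; at byte offset 6 it is not 32-bit aligned and must take the general TMU path instead. As a predicate (illustrative helper, not driver API):

    static bool
    ldunif_can_load(int byte_offset, int *dword_index)
    {
            if (byte_offset % 4 != 0)
                    return false; /* unaligned: fall back to the TMU */
            *dword_index = byte_offset / 4; /* valid data lands in the LSBs */
            return true;
    }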
+ */ + assert(instr->def.bit_size == 32 || + instr->num_components == 1); + + /* Try to emit ldunif if possible, otherwise fall back to general TMU */ if (nir_src_is_const(instr->src[0])) { int offset = (nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0])); - assert(offset % 4 == 0); - /* We need dwords */ - offset = offset / 4; - for (int i = 0; i < instr->num_components; i++) { - ntq_store_dest(c, &instr->dest, i, - vir_uniform(c, QUNIFORM_UNIFORM, - offset + i)); + + if (try_emit_uniform(c, offset, instr->num_components, + &instr->def, QUNIFORM_UNIFORM)) { + return; + } + } + + if (!ntq_emit_load_unifa(c, instr)) { + ntq_emit_tmu_general(c, instr, false, false); + c->has_general_tmu_load = true; + } +} + +static bool +ntq_emit_inline_ubo_load(struct v3d_compile *c, nir_intrinsic_instr *instr) +{ + if (c->compiler->max_inline_uniform_buffers <= 0) + return false; + + /* Regular UBOs start after inline UBOs */ + uint32_t index = nir_src_as_uint(instr->src[0]); + if (index >= c->compiler->max_inline_uniform_buffers) + return false; + + /* We scalarize general TMU access for anything that is not 32-bit */ + assert(instr->def.bit_size == 32 || + instr->num_components == 1); + + if (nir_src_is_const(instr->src[1])) { + int offset = nir_src_as_uint(instr->src[1]); + if (try_emit_uniform(c, offset, instr->num_components, + &instr->def, + QUNIFORM_INLINE_UBO_0 + index)) { + return true; } - } else { - ntq_emit_tmu_general(c, instr, false); } + + /* Fall back to regular UBO load */ + return false; } static void @@ -2411,7 +2726,7 @@ ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr) unsigned offset = nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0]); - if (c->s->info.stage != MESA_SHADER_FRAGMENT && c->devinfo->ver >= 40) { + if (c->s->info.stage != MESA_SHADER_FRAGMENT) { /* Emit the LDVPM directly now, rather than at the top * of the shader like we did for V3D 3.x (which needs * vpmsetup when not just taking the next offset). @@ -2433,19 +2748,38 @@ ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr) SYSTEM_VALUE_VERTEX_ID)) { index++; } - for (int i = 0; i < offset; i++) - index += c->vattr_sizes[i]; + + for (int i = 0; i < offset; i++) { + /* GFXH-1602: if any builtins (vid, iid, etc) are read then + * attribute 0 must be active (size > 0). When we hit this, + * the driver is expected to program attribute 0 to have a + * size of 1, so here we need to add that. 
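A sketch of the GFXH-1602 index walk implemented right below, with hypothetical numbers: with vattr_sizes = {0, 3} in a coord shader that reads a builtin, attribute 0 is inactive in the key but the driver programs it with size 1, so location 1 starts one VPM slot later than the sizes alone suggest.

    static uint32_t
    vpm_index_for_attr(const uint8_t *vattr_sizes, bool is_coord,
                       uint32_t index /* slots used by builtins */, uint32_t attr)
    {
            for (uint32_t i = 0; i < attr; i++) {
                    if (i == 0 && is_coord && vattr_sizes[0] == 0 && index > 0)
                            index++; /* count the dummy size-1 attribute 0 */
                    else
                            index += vattr_sizes[i];
            }
            return index;
    }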
+ */ + if (i == 0 && c->vs_key->is_coord && + c->vattr_sizes[i] == 0 && index > 0) { + index++; + } else { + index += c->vattr_sizes[i]; + } + } + index += nir_intrinsic_component(instr); for (int i = 0; i < instr->num_components; i++) { struct qreg vpm_offset = vir_uniform_ui(c, index++); - ntq_store_dest(c, &instr->dest, i, - vir_LDVPMV_IN(c, vpm_offset)); + ntq_store_def(c, &instr->def, i, + vir_LDVPMV_IN(c, vpm_offset)); } } else { for (int i = 0; i < instr->num_components; i++) { int comp = nir_intrinsic_component(instr) + i; - ntq_store_dest(c, &instr->dest, i, - vir_MOV(c, c->inputs[offset * 4 + comp])); + struct qreg input = c->inputs[offset * 4 + comp]; + ntq_store_def(c, &instr->def, i, vir_MOV(c, input)); + + if (c->s->info.stage == MESA_SHADER_FRAGMENT && + input.file == c->payload_z.file && + input.index == c->payload_z.index) { + c->reads_z = true; + } } } } @@ -2610,18 +2944,18 @@ ntq_get_barycentric_centroid(struct v3d_compile *c, /* sN = TRUE if sample N enabled in sample mask, FALSE otherwise */ struct qreg F = vir_uniform_ui(c, 0); struct qreg T = vir_uniform_ui(c, ~0); - struct qreg s0 = vir_XOR(c, vir_AND(c, sample_mask, i1), i1); + struct qreg s0 = vir_AND(c, sample_mask, i1); vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ); - s0 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); - struct qreg s1 = vir_XOR(c, vir_AND(c, sample_mask, i2), i2); + s0 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F); + struct qreg s1 = vir_AND(c, sample_mask, i2); vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ); - s1 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); - struct qreg s2 = vir_XOR(c, vir_AND(c, sample_mask, i4), i4); + s1 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F); + struct qreg s2 = vir_AND(c, sample_mask, i4); vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ); - s2 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); - struct qreg s3 = vir_XOR(c, vir_AND(c, sample_mask, i8), i8); + s2 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F); + struct qreg s3 = vir_AND(c, sample_mask, i8); vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s3), V3D_QPU_PF_PUSHZ); - s3 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); + s3 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F); /* sample_idx = s0 ? 0 : s2 ? 2 : s1 ? 1 : 3 */ struct qreg sample_idx = i3; @@ -2708,28 +3042,142 @@ emit_ldunifa(struct v3d_compile *c, struct qreg *result) c->current_unifa_offset += 4; } -static void -ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr) +/* Checks if the value of a nir src is derived from a nir register */ +static bool +nir_src_derived_from_reg(nir_src src) +{ + nir_def *def = src.ssa; + if (nir_load_reg_for_def(def)) + return true; + + nir_instr *parent = def->parent_instr; + switch (parent->type) { + case nir_instr_type_alu: { + nir_alu_instr *alu = nir_instr_as_alu(parent); + int num_srcs = nir_op_infos[alu->op].num_inputs; + for (int i = 0; i < num_srcs; i++) { + if (nir_src_derived_from_reg(alu->src[i].src)) + return true; + } + return false; + } + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent); + int num_srcs = nir_intrinsic_infos[intr->intrinsic].num_srcs; + for (int i = 0; i < num_srcs; i++) { + if (nir_src_derived_from_reg(intr->src[i])) + return true; + } + return false; + } + case nir_instr_type_load_const: + case nir_instr_type_undef: + return false; + default: + /* By default we assume it may come from a register; the above + * cases should be able to handle the majority of situations, + * though. 
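Back in ntq_get_barycentric_centroid above: after the switch from XOR/IFA to AND/IFNA, the sN temporaries are TRUE exactly when sample N is enabled, and the nested SELs implement the documented priority. A scalar model of that pick:

    static int
    pick_centroid_sample(uint8_t sample_mask)
    {
            /* sample_idx = s0 ? 0 : s2 ? 2 : s1 ? 1 : 3 */
            if (sample_mask & 1) return 0;
            if (sample_mask & 4) return 2;
            if (sample_mask & 2) return 1;
            return 3;
    }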
+ */ + return true; + }; +} + +static bool +ntq_emit_load_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr) { + assert(instr->intrinsic == nir_intrinsic_load_ubo || + instr->intrinsic == nir_intrinsic_load_ssbo || + instr->intrinsic == nir_intrinsic_load_uniform); + + bool is_uniform = instr->intrinsic == nir_intrinsic_load_uniform; + bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo; + bool is_ssbo = instr->intrinsic == nir_intrinsic_load_ssbo; + /* Every ldunifa auto-increments the unifa address by 4 bytes, so our * current unifa offset is 4 bytes ahead of the offset of the last load. */ static const int32_t max_unifa_skip_dist = MAX_UNIFA_SKIP_DISTANCE - 4; - bool dynamic_src = !nir_src_is_const(instr->src[1]); - uint32_t const_offset = - dynamic_src ? 0 : nir_src_as_uint(instr->src[1]); + /* We can only use unifa if the offset is uniform */ + nir_src offset = is_uniform ? instr->src[0] : instr->src[1]; + if (nir_src_is_divergent(offset)) + return false; - /* On OpenGL QUNIFORM_UBO_ADDR takes a UBO index - * shifted up by 1 (0 is gallium's constant buffer 0). + /* Emitting loads from unifa may not be safe under non-uniform control + * flow. It seems the address that is used to write to the unifa + * register is taken from the first lane and if that lane is disabled + * by control flow then the value we read may be bogus and lead to + * invalid memory accesses on follow-up ldunifa instructions. However, + * ntq_store_def only emits conditional writes for nir registers, so as + * long as we can be certain that the offset isn't derived from a + * load_reg we should be fine. + * + * The following CTS test can be used to trigger the problem, which + * causes GMP violations in the sim without this check: + * dEQP-VK.subgroups.ballot_broadcast.graphics.subgroupbroadcastfirst_int */ - uint32_t index = nir_src_as_uint(instr->src[0]); - if (c->key->environment == V3D_ENVIRONMENT_OPENGL) + if (vir_in_nonuniform_control_flow(c) && + nir_src_derived_from_reg(offset)) { + return false; + } + + /* We can only use unifa with SSBOs if they are read-only. Otherwise + * ldunifa won't see the shader writes to that address (possibly + * because ldunifa doesn't read from the L2T cache). + */ + if (is_ssbo && !(nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE)) + return false; + + /* Just as with SSBOs, we can't use ldunifa to read indirect uniforms + * that may have been written to scratch using the TMU. + */ + bool dynamic_src = !nir_src_is_const(offset); + if (is_uniform && dynamic_src && c->s->scratch_size > 0) + return false; + + uint32_t const_offset = dynamic_src ? 0 : nir_src_as_uint(offset); + if (is_uniform) + const_offset += nir_intrinsic_base(instr); + + /* ldunifa is a 32-bit load instruction so we can only use it with + * 32-bit aligned addresses. We always produce 32-bit aligned addresses + * except for types smaller than 32-bit, so in these cases we can only + * use ldunifa if we can verify alignment, which we can only do for + * loads with a constant offset. + */ + uint32_t bit_size = instr->def.bit_size; + uint32_t value_skips = 0; + if (bit_size < 32) { + if (dynamic_src) { + return false; + } else if (const_offset % 4 != 0) { + /* If we are loading from an unaligned offset, fix + * alignment and skip over unused elements in result. 
+ */ + value_skips = (const_offset % 4) / (bit_size / 8); + const_offset &= ~0x3; + } + } + + assert((bit_size == 32 && value_skips == 0) || + (bit_size == 16 && value_skips <= 1) || + (bit_size == 8 && value_skips <= 3)); + + /* Both Vulkan and OpenGL reserve index 0 for uniforms / push + * constants. + */ + uint32_t index = is_uniform ? 0 : nir_src_as_uint(instr->src[0]); + + /* QUNIFORM_UBO_ADDR takes a UBO index shifted up by 1 since we use + * index 0 for Gallium's constant buffer (GL) or push constants + * (Vulkan). + */ + if (is_ubo) index++; /* We can only keep track of the last unifa address we used with - * constant offset loads. If the new load targets the same UBO and + * constant offset loads. If the new load targets the same buffer and * is close enough to the previous load, we can skip the unifa register * write by emitting dummy ldunifa instructions to update the unifa * address. @@ -2739,6 +3187,7 @@ ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr) if (dynamic_src) { c->current_unifa_block = NULL; } else if (c->cur_block == c->current_unifa_block && + c->current_unifa_is_ubo == !is_ssbo && c->current_unifa_index == index && c->current_unifa_offset <= const_offset && c->current_unifa_offset + max_unifa_skip_dist >= const_offset) { @@ -2746,32 +3195,98 @@ ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr) ldunifa_skips = (const_offset - c->current_unifa_offset) / 4; } else { c->current_unifa_block = c->cur_block; + c->current_unifa_is_ubo = !is_ssbo; c->current_unifa_index = index; c->current_unifa_offset = const_offset; } if (!skip_unifa) { - struct qreg base_offset = + struct qreg base_offset = !is_ssbo ? vir_uniform(c, QUNIFORM_UBO_ADDR, - v3d_unit_data_create(index, const_offset)); + v3d_unit_data_create(index, const_offset)) : + vir_uniform(c, QUNIFORM_SSBO_OFFSET, index); struct qreg unifa = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA); if (!dynamic_src) { - vir_MOV_dest(c, unifa, base_offset); + if (!is_ssbo) { + /* Avoid the extra MOV to UNIFA by making + * ldunif load directly into it. We can't + * do this if we have not actually emitted + * ldunif and are instead reusing a previous + * one. + */ + struct qinst *inst = + (struct qinst *)c->cur_block->instructions.prev; + if (inst == c->defs[base_offset.index]) { + inst->dst = unifa; + c->defs[base_offset.index] = NULL; + } else { + vir_MOV_dest(c, unifa, base_offset); + } + } else { + vir_ADD_dest(c, unifa, base_offset, + vir_uniform_ui(c, const_offset)); + } } else { vir_ADD_dest(c, unifa, base_offset, - ntq_get_src(c, instr->src[1], 0)); + ntq_get_src(c, offset, 0)); } } else { for (int i = 0; i < ldunifa_skips; i++) emit_ldunifa(c, NULL); } - for (uint32_t i = 0; i < nir_intrinsic_dest_components(instr); i++) { + uint32_t num_components = nir_intrinsic_dest_components(instr); + for (uint32_t i = 0; i < num_components; ) { struct qreg data; emit_ldunifa(c, &data); - ntq_store_dest(c, &instr->dest, i, vir_MOV(c, data)); + + if (bit_size == 32) { + assert(value_skips == 0); + ntq_store_def(c, &instr->def, i, vir_MOV(c, data)); + i++; + } else { + assert((bit_size == 16 && value_skips <= 1) || + (bit_size == 8 && value_skips <= 3)); + + /* If we have any values to skip, shift to the first + * valid value in the ldunifa result. + */ + if (value_skips > 0) { + data = vir_SHR(c, data, + vir_uniform_ui(c, bit_size * + value_skips)); + } + + /* Check how many valid components we have discounting + * read components to skip. 
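A scalar model of the component loop that follows, for the sub-32-bit case: components are pulled out of consecutive 32-bit ldunifa results, with value_skips components of the first word discarded to compensate for the aligned-down constant offset (bit_size is 8 or 16 here, as the asserts below enforce).

    static void
    unpack_ldunifa_words(const uint32_t *words, unsigned bit_size,
                         unsigned value_skips, uint32_t *out, unsigned count)
    {
            const uint32_t mask = (1u << bit_size) - 1;
            const unsigned per_word = 32 / bit_size;
            unsigned word = 0, slot = value_skips;

            for (unsigned i = 0; i < count; i++) {
                    out[i] = (words[word] >> (slot * bit_size)) & mask;
                    if (++slot == per_word) {
                            slot = 0;
                            word++;
                    }
            }
    }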
+ */ + uint32_t valid_count = (32 / bit_size) - value_skips; + assert((bit_size == 16 && valid_count <= 2) || + (bit_size == 8 && valid_count <= 4)); + assert(valid_count > 0); + + /* Process the valid components */ + do { + struct qreg tmp; + uint32_t mask = (1 << bit_size) - 1; + tmp = vir_AND(c, vir_MOV(c, data), + vir_uniform_ui(c, mask)); + ntq_store_def(c, &instr->def, i, + vir_MOV(c, tmp)); + i++; + valid_count--; + + /* Shift to next component */ + if (i < num_components && valid_count > 0) { + data = vir_SHR(c, data, + vir_uniform_ui(c, bit_size)); + } + } while (i < num_components && valid_count > 0); + } } + + return true; } static inline struct qreg @@ -2781,187 +3296,273 @@ emit_load_local_invocation_index(struct v3d_compile *c) vir_uniform_ui(c, 32 - c->local_invocation_index_bits)); } -/* Various subgroup operations rely on the A flags, so this helper ensures that - * A flags represents currently active lanes in the subgroup. +/* For the purposes of reduction operations (ballot, alleq, allfeq, bcastf) in + * fragment shaders a lane is considered active if any sample flags are set + * for *any* lane in the same quad; however, we still need to ensure that + * terminated lanes (OpTerminate) are not included. Further, we also need to + * exclude lanes that may be disabled because of non-uniform control + * flow. */ -static void -set_a_flags_for_subgroup(struct v3d_compile *c) +static enum v3d_qpu_cond +setup_subgroup_control_flow_condition(struct v3d_compile *c) { - /* MSF returns 0 for disabled lanes in compute shaders so - * PUSHZ will set A=1 for disabled lanes. We want the inverse - * of this but we don't have any means to negate the A flags - * directly, but we can do it by repeating the same operation - * with NORZ (A = ~A & ~Z). + assert(c->s->info.stage == MESA_SHADER_FRAGMENT || + c->s->info.stage == MESA_SHADER_COMPUTE); + + enum v3d_qpu_cond cond = V3D_QPU_COND_NONE; + + /* We need to make sure that terminated lanes in fragment shaders are + * not included. We can identify these lanes by comparing the initial + * sample mask with the current. This fixes: + * dEQP-VK.spirv_assembly.instruction.terminate_invocation.terminate.subgroup_* */ - assert(c->s->info.stage == MESA_SHADER_COMPUTE); - vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ); - vir_set_uf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_UF_NORZ); + if (c->s->info.stage == MESA_SHADER_FRAGMENT && c->emitted_discard) { + vir_set_pf(c, vir_AND_dest(c, vir_nop_reg(), c->start_msf, + vir_NOT(c, vir_XOR(c, c->start_msf, + vir_MSF(c)))), + V3D_QPU_PF_PUSHZ); + cond = V3D_QPU_COND_IFNA; + } - /* If we are under non-uniform control flow we also need to - * AND the A flags with the current execute mask. + /* If we are in non-uniform control flow, update the condition to + * also limit lanes to those in the current execution mask. */ if (vir_in_nonuniform_control_flow(c)) { - const uint32_t bidx = c->cur_block->index; - vir_set_uf(c, vir_XOR_dest(c, vir_nop_reg(), - c->execute, - vir_uniform_ui(c, bidx)), - V3D_QPU_UF_ANDZ); + if (cond == V3D_QPU_COND_IFNA) { + vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_UF_NORNZ); + } else { + assert(cond == V3D_QPU_COND_NONE); + vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_PF_PUSHZ); + } + cond = V3D_QPU_COND_IFA; } + + return cond; +} + +static void +emit_compute_barrier(struct v3d_compile *c) +{ + /* Ensure we flag the use of the control barrier. 
NIR's + * gather info pass usually takes care of this, but that + * requires that we call that pass after any other pass + * that may emit a control barrier, so this is safer. + */ + c->s->info.uses_control_barrier = true; + + /* Emit a TSY op to get all invocations in the workgroup + * (actually supergroup) to block until the last + * invocation reaches the TSY op. + */ + vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_SYNCB)); +} + +static void +emit_barrier(struct v3d_compile *c) +{ + struct qreg eidx = vir_EIDX(c); + + /* The config for the TSY op should be set up like this: + * - Lane 0: Quorum + * - Lane 2: TSO id + * - Lane 3: TSY opcode + */ + + /* Lane 0: we want to synchronize across one subgroup. Here we write to + * all lanes unconditionally and will overwrite other lanes below. + */ + struct qreg tsy_conf = vir_uniform_ui(c, 1); + + /* Lane 2: TSO id. We choose a general purpose TSO (id=0..64) using the + * current QPU index and thread index to ensure we get a unique one for + * this group of invocations in this core. + */ + struct qreg tso_id = + vir_AND(c, vir_TIDX(c), vir_uniform_ui(c, 0x0000003f)); + vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), eidx, vir_uniform_ui(c, 2)), + V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFA, tsy_conf, tso_id); + + /* Lane 3: TSY opcode (set_quorum_wait_inc_check) */ + struct qreg tsy_op = vir_uniform_ui(c, 16); + vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), eidx, vir_uniform_ui(c, 3)), + V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFA, tsy_conf, tsy_op); + + /* Emit TSY sync */ + vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_SYNCB), tsy_conf); } static void ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) { switch (instr->intrinsic) { + case nir_intrinsic_decl_reg: + case nir_intrinsic_load_reg: + case nir_intrinsic_store_reg: + break; /* Ignore these */ + case nir_intrinsic_load_uniform: ntq_emit_load_uniform(c, instr); break; + case nir_intrinsic_load_global_2x32: + ntq_emit_tmu_general(c, instr, false, true); + c->has_general_tmu_load = true; + break; + case nir_intrinsic_load_ubo: - if (!nir_src_is_divergent(instr->src[1])) - ntq_emit_load_ubo_unifa(c, instr); - else - ntq_emit_tmu_general(c, instr, false); - break; - - case nir_intrinsic_ssbo_atomic_add: - case nir_intrinsic_ssbo_atomic_imin: - case nir_intrinsic_ssbo_atomic_umin: - case nir_intrinsic_ssbo_atomic_imax: - case nir_intrinsic_ssbo_atomic_umax: - case nir_intrinsic_ssbo_atomic_and: - case nir_intrinsic_ssbo_atomic_or: - case nir_intrinsic_ssbo_atomic_xor: - case nir_intrinsic_ssbo_atomic_exchange: - case nir_intrinsic_ssbo_atomic_comp_swap: + if (ntq_emit_inline_ubo_load(c, instr)) + break; + FALLTHROUGH; case nir_intrinsic_load_ssbo: + if (!ntq_emit_load_unifa(c, instr)) { + ntq_emit_tmu_general(c, instr, false, false); + c->has_general_tmu_load = true; + } + break; + case nir_intrinsic_store_ssbo: - ntq_emit_tmu_general(c, instr, false); - break; - - case nir_intrinsic_shared_atomic_add: - case nir_intrinsic_shared_atomic_imin: - case nir_intrinsic_shared_atomic_umin: - case nir_intrinsic_shared_atomic_imax: - case nir_intrinsic_shared_atomic_umax: - case nir_intrinsic_shared_atomic_and: - case nir_intrinsic_shared_atomic_or: - case nir_intrinsic_shared_atomic_xor: - case nir_intrinsic_shared_atomic_exchange: - case nir_intrinsic_shared_atomic_comp_swap: - case nir_intrinsic_load_shared: + case nir_intrinsic_ssbo_atomic: + case nir_intrinsic_ssbo_atomic_swap: + ntq_emit_tmu_general(c, instr, false, false); + break; + + case 
nir_intrinsic_store_global_2x32: + case nir_intrinsic_global_atomic_2x32: + case nir_intrinsic_global_atomic_swap_2x32: + ntq_emit_tmu_general(c, instr, false, true); + break; + + case nir_intrinsic_shared_atomic: + case nir_intrinsic_shared_atomic_swap: case nir_intrinsic_store_shared: - case nir_intrinsic_load_scratch: case nir_intrinsic_store_scratch: - ntq_emit_tmu_general(c, instr, true); + ntq_emit_tmu_general(c, instr, true, false); + break; + + case nir_intrinsic_load_scratch: + case nir_intrinsic_load_shared: + ntq_emit_tmu_general(c, instr, true, false); + c->has_general_tmu_load = true; break; - case nir_intrinsic_image_load: case nir_intrinsic_image_store: - case nir_intrinsic_image_atomic_add: - case nir_intrinsic_image_atomic_imin: - case nir_intrinsic_image_atomic_umin: - case nir_intrinsic_image_atomic_imax: - case nir_intrinsic_image_atomic_umax: - case nir_intrinsic_image_atomic_and: - case nir_intrinsic_image_atomic_or: - case nir_intrinsic_image_atomic_xor: - case nir_intrinsic_image_atomic_exchange: - case nir_intrinsic_image_atomic_comp_swap: - v3d40_vir_emit_image_load_store(c, instr); + case nir_intrinsic_image_atomic: + case nir_intrinsic_image_atomic_swap: + v3d_vir_emit_image_load_store(c, instr); + break; + + case nir_intrinsic_image_load: + v3d_vir_emit_image_load_store(c, instr); + /* Not really a general TMU load, but we only use this flag + * for NIR scheduling and we do schedule these under the same + * policy as general TMU. + */ + c->has_general_tmu_load = true; break; case nir_intrinsic_get_ssbo_size: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_GET_SSBO_SIZE, - nir_src_comp_as_uint(instr->src[0], 0))); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_GET_SSBO_SIZE, + nir_src_comp_as_uint(instr->src[0], 0))); break; case nir_intrinsic_get_ubo_size: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_GET_UBO_SIZE, - nir_src_comp_as_uint(instr->src[0], 0))); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_GET_UBO_SIZE, + nir_src_comp_as_uint(instr->src[0], 0))); break; case nir_intrinsic_load_user_clip_plane: for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) { - ntq_store_dest(c, &instr->dest, i, - vir_uniform(c, QUNIFORM_USER_CLIP_PLANE, - nir_intrinsic_ucp_id(instr) * - 4 + i)); + ntq_store_def(c, &instr->def, i, + vir_uniform(c, QUNIFORM_USER_CLIP_PLANE, + nir_intrinsic_ucp_id(instr) * + 4 + i)); } break; case nir_intrinsic_load_viewport_x_scale: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0)); break; case nir_intrinsic_load_viewport_y_scale: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0)); break; case nir_intrinsic_load_viewport_z_scale: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0)); break; case nir_intrinsic_load_viewport_z_offset: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0)); break; case nir_intrinsic_load_line_coord: - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->line_x)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->line_x)); break; case nir_intrinsic_load_line_width: - ntq_store_dest(c, 
&instr->dest, 0, - vir_uniform(c, QUNIFORM_LINE_WIDTH, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_LINE_WIDTH, 0)); break; case nir_intrinsic_load_aa_line_width: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0)); break; case nir_intrinsic_load_sample_mask_in: - ntq_store_dest(c, &instr->dest, 0, vir_MSF(c)); + ntq_store_def(c, &instr->def, 0, vir_MSF(c)); break; case nir_intrinsic_load_helper_invocation: vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ); struct qreg qdest = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA); - ntq_store_dest(c, &instr->dest, 0, qdest); + ntq_store_def(c, &instr->def, 0, qdest); break; case nir_intrinsic_load_front_face: /* The register contains 0 (front) or 1 (back), and we need to * turn it into a NIR bool where true means front. */ - ntq_store_dest(c, &instr->dest, 0, - vir_ADD(c, - vir_uniform_ui(c, -1), - vir_REVF(c))); + ntq_store_def(c, &instr->def, 0, + vir_ADD(c, + vir_uniform_ui(c, -1), + vir_REVF(c))); break; case nir_intrinsic_load_base_instance: - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->biid)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->biid)); break; case nir_intrinsic_load_instance_id: - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->iid)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->iid)); break; case nir_intrinsic_load_vertex_id: - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->vid)); + break; + + case nir_intrinsic_load_draw_id: + ntq_store_def(c, &instr->def, 0, vir_uniform(c, QUNIFORM_DRAW_ID, 0)); break; case nir_intrinsic_load_tlb_color_v3d: vir_emit_tlb_color_read(c, instr); break; + case nir_intrinsic_load_fep_w_v3d: + ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->payload_w)); + break; + case nir_intrinsic_load_input: ntq_emit_load_input(c, instr); break; @@ -2978,7 +3579,19 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) ntq_emit_image_size(c, instr); break; + /* FIXME: the Vulkan and SPIR-V specs specify that OpTerminate (which + * is intended to match the semantics of GLSL's discard) should + * terminate the invocation immediately. Our implementation doesn't + * do that. What we do is actually a demote by removing the invocations + * from the sample mask. Maybe we could be more strict and force an + * early termination by emitting a (maybe conditional) jump to the + * end section of the fragment shader for affected invocations. 
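+         * In practice the difference is only observable by code that can
+         * still see the demoted lanes, such as the subgroup reductions
+         * handled by setup_subgroup_control_flow_condition above, which is
+         * why terminated lanes are masked out there explicitly.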
+ */ case nir_intrinsic_discard: + case nir_intrinsic_terminate: + c->emitted_discard = true; + FALLTHROUGH; + case nir_intrinsic_demote: ntq_flush_tmu(c); if (vir_in_nonuniform_control_flow(c)) { @@ -2993,7 +3606,11 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) } break; - case nir_intrinsic_discard_if: { + case nir_intrinsic_discard_if: + case nir_intrinsic_terminate_if: + c->emitted_discard = true; + FALLTHROUGH; + case nir_intrinsic_demote_if: { ntq_flush_tmu(c); enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, instr->src[0]); @@ -3011,102 +3628,79 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(), vir_uniform_ui(c, 0)), cond); - break; } - case nir_intrinsic_memory_barrier: - case nir_intrinsic_memory_barrier_buffer: - case nir_intrinsic_memory_barrier_image: - case nir_intrinsic_memory_barrier_shared: - case nir_intrinsic_memory_barrier_tcs_patch: - case nir_intrinsic_group_memory_barrier: - /* We don't do any instruction scheduling of these NIR - * instructions between each other, so we just need to make - * sure that the TMU operations before the barrier are flushed + case nir_intrinsic_barrier: + /* Ensure that the TMU operations before the barrier are flushed * before the ones after the barrier. */ ntq_flush_tmu(c); - break; - - case nir_intrinsic_control_barrier: - /* Emit a TSY op to get all invocations in the workgroup - * (actually supergroup) to block until the last invocation - * reaches the TSY op. - */ - ntq_flush_tmu(c); - if (c->devinfo->ver >= 42) { - vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC, - V3D_QPU_WADDR_SYNCB)); - } else { - struct qinst *sync = - vir_BARRIERID_dest(c, - vir_reg(QFILE_MAGIC, - V3D_QPU_WADDR_SYNCU)); - sync->uniform = - vir_get_uniform_index(c, QUNIFORM_CONSTANT, - 0xffffff00 | - V3D_TSY_WAIT_INC_CHECK); + if (nir_intrinsic_execution_scope(instr) != SCOPE_NONE) { + if (c->s->info.stage == MESA_SHADER_COMPUTE) + emit_compute_barrier(c); + else + emit_barrier(c); + /* The blocking of a TSY op only happens at the next + * thread switch. No texturing may be outstanding at the + * time of a TSY blocking operation. + */ + vir_emit_thrsw(c); } - - /* The blocking of a TSY op only happens at the next thread - * switch. No texturing may be outstanding at the time of a - * TSY blocking operation. 
- */ - vir_emit_thrsw(c); break; case nir_intrinsic_load_num_workgroups: for (int i = 0; i < 3; i++) { - ntq_store_dest(c, &instr->dest, i, - vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS, - i)); + ntq_store_def(c, &instr->def, i, + vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS, + i)); } break; case nir_intrinsic_load_workgroup_id: { struct qreg x = vir_AND(c, c->cs_payload[0], vir_uniform_ui(c, 0xffff)); + ntq_store_def(c, &instr->def, 0, x); struct qreg y = vir_SHR(c, c->cs_payload[0], vir_uniform_ui(c, 16)); + ntq_store_def(c, &instr->def, 1, y); struct qreg z = vir_AND(c, c->cs_payload[1], vir_uniform_ui(c, 0xffff)); + ntq_store_def(c, &instr->def, 2, z); + break; + } - /* We only support dispatch base in Vulkan */ - if (c->key->environment == V3D_ENVIRONMENT_VULKAN) { - x = vir_ADD(c, x, - vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 0)); - y = vir_ADD(c, y, - vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 1)); - z = vir_ADD(c, z, - vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 2)); - } + case nir_intrinsic_load_base_workgroup_id: { + struct qreg x = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 0); + ntq_store_def(c, &instr->def, 0, x); - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, x)); - ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, y)); - ntq_store_dest(c, &instr->dest, 2, vir_MOV(c, z)); + struct qreg y = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 1); + ntq_store_def(c, &instr->def, 1, y); + + struct qreg z = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 2); + ntq_store_def(c, &instr->def, 2, z); break; } case nir_intrinsic_load_local_invocation_index: - ntq_store_dest(c, &instr->dest, 0, - emit_load_local_invocation_index(c)); + ntq_store_def(c, &instr->def, 0, + emit_load_local_invocation_index(c)); break; case nir_intrinsic_load_subgroup_id: { /* This is basically the batch index, which is the Local * Invocation Index divided by the SIMD width). */ - STATIC_ASSERT(util_is_power_of_two_nonzero(V3D_CHANNELS)); + STATIC_ASSERT(IS_POT(V3D_CHANNELS) && V3D_CHANNELS > 0); const uint32_t divide_shift = ffs(V3D_CHANNELS) - 1; struct qreg lii = emit_load_local_invocation_index(c); - ntq_store_dest(c, &instr->dest, 0, - vir_SHR(c, lii, - vir_uniform_ui(c, divide_shift))); + ntq_store_def(c, &instr->def, 0, + vir_SHR(c, lii, + vir_uniform_ui(c, divide_shift))); break; } @@ -3143,8 +3737,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) struct qreg col = ntq_get_src(c, instr->src[0], 0); for (int i = 0; i < instr->num_components; i++) { struct qreg row = vir_uniform_ui(c, row_idx++); - ntq_store_dest(c, &instr->dest, i, - vir_LDVPMG_IN(c, row, col)); + ntq_store_def(c, &instr->def, i, + vir_LDVPMG_IN(c, row, col)); } break; } @@ -3160,47 +3754,47 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) * using ldvpm(v,d)_in (See Table 71). 
*/ assert(c->s->info.stage == MESA_SHADER_GEOMETRY); - ntq_store_dest(c, &instr->dest, 0, - vir_LDVPMV_IN(c, vir_uniform_ui(c, 0))); + ntq_store_def(c, &instr->def, 0, + vir_LDVPMV_IN(c, vir_uniform_ui(c, 0))); break; } case nir_intrinsic_load_invocation_id: - ntq_store_dest(c, &instr->dest, 0, vir_IID(c)); + ntq_store_def(c, &instr->def, 0, vir_IID(c)); break; case nir_intrinsic_load_fb_layers_v3d: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_FB_LAYERS, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_FB_LAYERS, 0)); break; case nir_intrinsic_load_sample_id: - ntq_store_dest(c, &instr->dest, 0, vir_SAMPID(c)); + ntq_store_def(c, &instr->def, 0, vir_SAMPID(c)); break; case nir_intrinsic_load_sample_pos: - ntq_store_dest(c, &instr->dest, 0, - vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)))); - ntq_store_dest(c, &instr->dest, 1, - vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)))); + ntq_store_def(c, &instr->def, 0, + vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)))); + ntq_store_def(c, &instr->def, 1, + vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)))); break; case nir_intrinsic_load_barycentric_at_offset: - ntq_store_dest(c, &instr->dest, 0, - vir_MOV(c, ntq_get_src(c, instr->src[0], 0))); - ntq_store_dest(c, &instr->dest, 1, - vir_MOV(c, ntq_get_src(c, instr->src[0], 1))); + ntq_store_def(c, &instr->def, 0, + vir_MOV(c, ntq_get_src(c, instr->src[0], 0))); + ntq_store_def(c, &instr->def, 1, + vir_MOV(c, ntq_get_src(c, instr->src[0], 1))); break; case nir_intrinsic_load_barycentric_pixel: - ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f)); - ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f)); + ntq_store_def(c, &instr->def, 0, vir_uniform_f(c, 0.0f)); + ntq_store_def(c, &instr->def, 1, vir_uniform_f(c, 0.0f)); break; case nir_intrinsic_load_barycentric_at_sample: { if (!c->fs_key->msaa) { - ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f)); - ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f)); + ntq_store_def(c, &instr->def, 0, vir_uniform_f(c, 0.0f)); + ntq_store_def(c, &instr->def, 1, vir_uniform_f(c, 0.0f)); return; } @@ -3208,8 +3802,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) struct qreg sample_idx = ntq_get_src(c, instr->src[0], 0); ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y); - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x)); - ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, offset_x)); + ntq_store_def(c, &instr->def, 1, vir_MOV(c, offset_y)); break; } @@ -3219,18 +3813,18 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) struct qreg offset_y = vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c))); - ntq_store_dest(c, &instr->dest, 0, - vir_FSUB(c, offset_x, vir_uniform_f(c, 0.5f))); - ntq_store_dest(c, &instr->dest, 1, - vir_FSUB(c, offset_y, vir_uniform_f(c, 0.5f))); + ntq_store_def(c, &instr->def, 0, + vir_FSUB(c, offset_x, vir_uniform_f(c, 0.5f))); + ntq_store_def(c, &instr->def, 1, + vir_FSUB(c, offset_y, vir_uniform_f(c, 0.5f))); break; } case nir_intrinsic_load_barycentric_centroid: { struct qreg offset_x, offset_y; ntq_get_barycentric_centroid(c, &offset_x, &offset_y); - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x)); - ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, offset_x)); + ntq_store_def(c, &instr->def, 1, vir_MOV(c, offset_y)); break; } @@ -3249,8 +3843,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, 
nir_intrinsic_instr *instr) */ if (!c->fs_key->msaa || c->interp[input_idx].vp.file == QFILE_NULL) { - ntq_store_dest(c, &instr->dest, i, - vir_MOV(c, c->inputs[input_idx])); + ntq_store_def(c, &instr->def, i, + vir_MOV(c, c->inputs[input_idx])); continue; } @@ -3268,30 +3862,150 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) ntq_emit_load_interpolated_input(c, p, C, offset_x, offset_y, interp_mode); - ntq_store_dest(c, &instr->dest, i, result); + ntq_store_def(c, &instr->def, i, result); } break; } case nir_intrinsic_load_subgroup_size: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform_ui(c, V3D_CHANNELS)); + ntq_store_def(c, &instr->def, 0, + vir_uniform_ui(c, V3D_CHANNELS)); break; case nir_intrinsic_load_subgroup_invocation: - ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c)); + ntq_store_def(c, &instr->def, 0, vir_EIDX(c)); break; case nir_intrinsic_elect: { - set_a_flags_for_subgroup(c); - struct qreg first = vir_FLAFIRST(c); + struct qreg first; + if (vir_in_nonuniform_control_flow(c)) { + /* Sets A=1 for lanes enabled in the execution mask */ + vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_PF_PUSHZ); + /* Updates A ANDing with lanes enabled in MSF */ + vir_set_uf(c, vir_MSF_dest(c, vir_nop_reg()), + V3D_QPU_UF_ANDNZ); + first = vir_FLAFIRST(c); + } else { + /* Sets A=1 for inactive lanes */ + vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), + V3D_QPU_PF_PUSHZ); + first = vir_FLNAFIRST(c); + } - /* Produce a boolean result from Flafirst */ + /* Produce a boolean result */ vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), first, vir_uniform_ui(c, 1)), V3D_QPU_PF_PUSHZ); struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA); - ntq_store_dest(c, &instr->dest, 0, result); + ntq_store_def(c, &instr->def, 0, result); + break; + } + + case nir_intrinsic_ballot: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c); + struct qreg res = vir_get_temp(c); + vir_set_cond(vir_BALLOT_dest(c, res, value), cond); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, res)); + break; + } + + case nir_intrinsic_read_invocation: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + struct qreg index = ntq_get_src(c, instr->src[1], 0); + struct qreg res = vir_SHUFFLE(c, value, index); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, res)); + break; + } + + case nir_intrinsic_read_first_invocation: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c); + struct qreg res = vir_get_temp(c); + vir_set_cond(vir_BCASTF_dest(c, res, value), cond); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, res)); + break; + } + + case nir_intrinsic_shuffle: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + struct qreg indices = ntq_get_src(c, instr->src[1], 0); + struct qreg res = vir_SHUFFLE(c, value, indices); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, res)); + break; + } + + case nir_intrinsic_vote_feq: + case nir_intrinsic_vote_ieq: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c); + struct qreg res = vir_get_temp(c); + vir_set_cond(instr->intrinsic == nir_intrinsic_vote_ieq ? 
+                             vir_ALLEQ_dest(c, res, value) :
+                             vir_ALLFEQ_dest(c, res, value),
+                             cond);
+
+                /* Produce boolean result */
+                vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
+                           V3D_QPU_PF_PUSHZ);
+                struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFNA);
+                ntq_store_def(c, &instr->def, 0, result);
+                break;
+        }
+
+        case nir_intrinsic_vote_all: {
+                assert(c->devinfo->ver >= 71);
+                struct qreg value = ntq_get_src(c, instr->src[0], 0);
+                enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
+                struct qreg res = vir_get_temp(c);
+                vir_set_cond(vir_ALLEQ_dest(c, res, value), cond);
+
+                /* We want to check if 'all lanes are equal (alleq != 0) and
+                 * their value is True (value != 0)'.
+                 *
+                 * The first MOV.pushz generates predicate for 'alleq == 0'.
+                 * The second MOV.NORZ generates predicate for:
+                 * '!(alleq == 0) & !(value == 0)'.
+                 */
+                vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
+                           V3D_QPU_PF_PUSHZ);
+                vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), value),
+                           V3D_QPU_UF_NORZ);
+                struct qreg result =
+                        ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
+                ntq_store_def(c, &instr->def, 0, result);
+                break;
+        }
+
+        case nir_intrinsic_vote_any: {
+                assert(c->devinfo->ver >= 71);
+                struct qreg value = ntq_get_src(c, instr->src[0], 0);
+                enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
+                struct qreg res = vir_get_temp(c);
+                vir_set_cond(vir_ALLEQ_dest(c, res, value), cond);
+
+                /* We want to check 'not (all lanes are equal (alleq != 0)
+                 * and their value is False (value == 0))'.
+                 *
+                 * The first MOV.pushz generates predicate for 'alleq == 0'.
+                 * The second MOV.NORNZ generates predicate for:
+                 * '!(alleq == 0) & (value == 0)'.
+                 * The IFNA condition negates the predicate when evaluated:
+                 * '!(!(alleq == 0) & (value == 0))'.
+                 */
+                vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
+                           V3D_QPU_PF_PUSHZ);
+                vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), value),
+                           V3D_QPU_UF_NORNZ);
+                struct qreg result =
+                        ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFNA);
+                ntq_store_def(c, &instr->def, 0, result);
                break;
        }
 
@@ -3300,8 +4014,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                break;
 
        case nir_intrinsic_load_view_index:
-                ntq_store_dest(c, &instr->dest, 0,
-                               vir_uniform(c, QUNIFORM_VIEW_INDEX, 0));
+                ntq_store_def(c, &instr->def, 0,
+                              vir_uniform(c, QUNIFORM_VIEW_INDEX, 0));
                break;
 
        default:
@@ -3329,6 +4043,36 @@ ntq_activate_execute_for_block(struct v3d_compile *c)
                     vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
 }
 
+static bool
+is_cheap_block(nir_block *block)
+{
+        int32_t cost = 3;
+        nir_foreach_instr(instr, block) {
+                switch (instr->type) {
+                case nir_instr_type_alu:
+                case nir_instr_type_undef:
+                case nir_instr_type_load_const:
+                        if (--cost <= 0)
+                                return false;
+                        break;
+                case nir_instr_type_intrinsic: {
+                        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+                        switch (intr->intrinsic) {
+                        case nir_intrinsic_decl_reg:
+                        case nir_intrinsic_load_reg:
+                        case nir_intrinsic_store_reg:
+                                continue;
+                        default:
+                                return false;
+                        }
+                }
+                default:
+                        return false;
+                }
+        }
+        return true;
+}
+
 static void
 ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt)
 {
@@ -3473,15 +4217,27 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
                              c->execute,
                              vir_uniform_ui(c, else_block->index));
 
-        /* Jump to ELSE if nothing is active for THEN, otherwise fall
-         * through.
+ /* Set the flags for taking the THEN block */ + vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_PF_PUSHZ); + + /* Jump to ELSE if nothing is active for THEN (unless THEN block is + * so small it won't pay off), otherwise fall through. */ - vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ); - vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA); - vir_link_blocks(c->cur_block, else_block); + bool is_cheap = exec_list_is_singular(&if_stmt->then_list) && + is_cheap_block(nir_if_first_then_block(if_stmt)); + if (!is_cheap) { + vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA); + vir_link_blocks(c->cur_block, else_block); + } vir_link_blocks(c->cur_block, then_block); - /* Process the THEN block. */ + /* Process the THEN block. + * + * Notice we don't call ntq_activate_execute_for_block here on purpose: + * c->execute is already set up to be 0 for lanes that must take the + * THEN block. + */ vir_set_emit_block(c, then_block); ntq_emit_cf_list(c, &if_stmt->then_list); @@ -3495,13 +4251,19 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt) vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, after_block->index)); - /* If everything points at ENDIF, then jump there immediately. */ - vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), - c->execute, - vir_uniform_ui(c, after_block->index)), - V3D_QPU_PF_PUSHZ); - vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA); - vir_link_blocks(c->cur_block, after_block); + /* If everything points at ENDIF, then jump there immediately + * (unless ELSE block is so small it won't pay off). + */ + bool is_cheap = exec_list_is_singular(&if_stmt->else_list) && + is_cheap_block(nir_else_block); + if (!is_cheap) { + vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), + c->execute, + vir_uniform_ui(c, after_block->index)), + V3D_QPU_PF_PUSHZ); + vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA); + vir_link_blocks(c->cur_block, after_block); + } vir_link_blocks(c->cur_block, else_block); vir_set_emit_block(c, else_block); @@ -3605,7 +4367,7 @@ ntq_emit_instr(struct v3d_compile *c, nir_instr *instr) ntq_emit_load_const(c, nir_instr_as_load_const(instr)); break; - case nir_instr_type_ssa_undef: + case nir_instr_type_undef: unreachable("Should've been lowered by nir_lower_undef_to_zero"); break; @@ -3699,7 +4461,6 @@ ntq_emit_nonuniform_loop(struct v3d_compile *c, nir_loop *loop) static void ntq_emit_uniform_loop(struct v3d_compile *c, nir_loop *loop) { - c->loop_cont_block = vir_new_block(c); c->loop_break_block = vir_new_block(c); @@ -3719,6 +4480,25 @@ ntq_emit_uniform_loop(struct v3d_compile *c, nir_loop *loop) static void ntq_emit_loop(struct v3d_compile *c, nir_loop *loop) { + assert(!nir_loop_has_continue_construct(loop)); + + /* Disable flags optimization for loop conditions. The problem here is + * that we can have code like this: + * + * // block_0 + * vec1 32 con ssa_9 = ine32 ssa_8, ssa_2 + * loop { + * // block_1 + * if ssa_9 { + * + * In this example we emit flags to compute ssa_9 and the optimization + * will skip regenerating them again for the loop condition in the + * loop continue block (block_1). However, this is not safe after the + * first iteration because the loop body can stomp the flags if it has + * any conditionals. 
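+         * Resetting c->flags_temp below simply forces the flags for the
+         * condition to be re-emitted at their next use, which is always
+         * correct at the cost of regenerating them once per iteration.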
+ */ + c->flags_temp = -1; + bool was_in_control_flow = c->in_control_flow; c->in_control_flow = true; @@ -3777,7 +4557,7 @@ ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list) static void ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl) { - ntq_setup_registers(c, &impl->registers); + ntq_setup_registers(c, impl); ntq_emit_cf_list(c, &impl->body); } @@ -3786,7 +4566,12 @@ nir_to_vir(struct v3d_compile *c) { switch (c->s->info.stage) { case MESA_SHADER_FRAGMENT: - c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0)); + c->start_msf = vir_MSF(c); + if (c->devinfo->ver < 71) + c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0)); + else + c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 3)); + c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1)); c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2)); @@ -3799,25 +4584,16 @@ nir_to_vir(struct v3d_compile *c) emit_fragment_varying(c, NULL, -1, 0, 0); } - if (c->fs_key->is_points && - (c->devinfo->ver < 40 || program_reads_point_coord(c))) { + if (c->fs_key->is_points && program_reads_point_coord(c)) { c->point_x = emit_fragment_varying(c, NULL, -1, 0, 0); c->point_y = emit_fragment_varying(c, NULL, -1, 0, 0); c->uses_implicit_point_line_varyings = true; } else if (c->fs_key->is_lines && - (c->devinfo->ver < 40 || - BITSET_TEST(c->s->info.system_values_read, + (BITSET_TEST(c->s->info.system_values_read, SYSTEM_VALUE_LINE_COORD))) { c->line_x = emit_fragment_varying(c, NULL, -1, 0, 0); c->uses_implicit_point_line_varyings = true; } - - c->force_per_sample_msaa = - c->s->info.fs.uses_sample_qualifier || - BITSET_TEST(c->s->info.system_values_read, - SYSTEM_VALUE_SAMPLE_ID) || - BITSET_TEST(c->s->info.system_values_read, - SYSTEM_VALUE_SAMPLE_POS); break; case MESA_SHADER_COMPUTE: /* Set up the TSO for barriers, assuming we do some. */ @@ -3826,8 +4602,13 @@ nir_to_vir(struct v3d_compile *c) V3D_QPU_WADDR_SYNC)); } - c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0)); - c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); + if (c->devinfo->ver == 42) { + c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0)); + c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); + } else if (c->devinfo->ver >= 71) { + c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 3)); + c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); + } /* Set up the division between gl_LocalInvocationIndex and * wg_in_mem in the payload reg. @@ -3889,7 +4670,7 @@ nir_to_vir(struct v3d_compile *c) /* Find the main function and emit the body. */ nir_foreach_function(function, c->s) { - assert(strcmp(function->name, "main") == 0); + assert(function->is_entrypoint); assert(function->impl); ntq_emit_impl(c, function->impl); } @@ -3932,25 +4713,12 @@ vir_emit_last_thrsw(struct v3d_compile *c, { *restore_last_thrsw = c->last_thrsw; - /* On V3D before 4.1, we need a TMU op to be outstanding when thread - * switching, so disable threads if we didn't do any TMU ops (each of - * which would have emitted a THRSW). - */ - if (!c->last_thrsw_at_top_level && c->devinfo->ver < 41) { - c->threads = 1; - if (c->last_thrsw) - vir_remove_thrsw(c); - *restore_last_thrsw = NULL; - } - /* If we're threaded and the last THRSW was in conditional code, then * we need to emit another one so that we can flag it as the last * thrsw. 
*/ - if (c->last_thrsw && !c->last_thrsw_at_top_level) { - assert(c->devinfo->ver >= 41); + if (c->last_thrsw && !c->last_thrsw_at_top_level) vir_emit_thrsw(c); - } /* If we're threaded, then we need to mark the last THRSW instruction * so we can emit a pair of them at QPU emit time. @@ -3958,10 +4726,8 @@ vir_emit_last_thrsw(struct v3d_compile *c, * For V3D 4.x, we can spawn the non-fragment shaders already in the * post-last-THRSW state, so we can skip this. */ - if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) { - assert(c->devinfo->ver >= 41); + if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) vir_emit_thrsw(c); - } /* If we have not inserted a last thread switch yet, do it now to ensure * any potential spilling we do happens before this. If we don't spill @@ -4006,8 +4772,8 @@ vir_check_payload_w(struct v3d_compile *c) vir_for_each_inst_inorder(inst, c) { for (int i = 0; i < vir_get_nsrc(inst); i++) { - if (inst->src[i].file == QFILE_REG && - inst->src[i].index == 0) { + if (inst->src[i].file == c->payload_w.file && + inst->src[i].index == c->payload_w.index) { c->uses_center_w = true; return; } @@ -4018,8 +4784,8 @@ vir_check_payload_w(struct v3d_compile *c) void v3d_nir_to_vir(struct v3d_compile *c) { - if (V3D_DEBUG & (V3D_DEBUG_NIR | - v3d_debug_flag_for_shader_stage(c->s->info.stage))) { + if (V3D_DBG(NIR) || + v3d_debug_flag_for_shader_stage(c->s->info.stage)) { fprintf(stderr, "%s prog %d/%d NIR:\n", vir_get_stage_name(c), c->program_id, c->variant_id); @@ -4053,8 +4819,8 @@ v3d_nir_to_vir(struct v3d_compile *c) unreachable("bad stage"); } - if (V3D_DEBUG & (V3D_DEBUG_VIR | - v3d_debug_flag_for_shader_stage(c->s->info.stage))) { + if (V3D_DBG(VIR) || + v3d_debug_flag_for_shader_stage(c->s->info.stage)) { fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n", vir_get_stage_name(c), c->program_id, c->variant_id); @@ -4075,8 +4841,8 @@ v3d_nir_to_vir(struct v3d_compile *c) * instructions until the results are needed. */ - if (V3D_DEBUG & (V3D_DEBUG_VIR | - v3d_debug_flag_for_shader_stage(c->s->info.stage))) { + if (V3D_DBG(VIR) || + v3d_debug_flag_for_shader_stage(c->s->info.stage)) { fprintf(stderr, "%s prog %d/%d VIR:\n", vir_get_stage_name(c), c->program_id, c->variant_id); @@ -4087,19 +4853,17 @@ v3d_nir_to_vir(struct v3d_compile *c) /* Attempt to allocate registers for the temporaries. If we fail, * reduce thread count and try again. */ - int min_threads = (c->devinfo->ver >= 41) ? 2 : 1; + int min_threads = 2; struct qpu_reg *temp_registers; while (true) { - bool spilled; - temp_registers = v3d_register_allocate(c, &spilled); - if (spilled) - continue; - - if (temp_registers) + temp_registers = v3d_register_allocate(c); + if (temp_registers) { + assert(c->spills + c->fills <= c->max_tmu_spills); break; + } if (c->threads == min_threads && - (V3D_DEBUG & V3D_DEBUG_RA)) { + V3D_DBG(RA)) { fprintf(stderr, "Failed to register allocate using %s\n", c->fallback_scheduler ? 
"the fallback scheduler:" : @@ -4116,18 +4880,20 @@ v3d_nir_to_vir(struct v3d_compile *c) } if (c->threads <= MAX2(c->min_threads_for_reg_alloc, min_threads)) { - if (V3D_DEBUG & V3D_DEBUG_PERF) { + if (V3D_DBG(PERF)) { fprintf(stderr, - "Failed to register allocate %s at " - "%d threads.\n", vir_get_stage_name(c), - c->threads); + "Failed to register allocate %s " + "prog %d/%d at %d threads.\n", + vir_get_stage_name(c), + c->program_id, c->variant_id, c->threads); } c->compilation_result = V3D_COMPILATION_FAILED_REGISTER_ALLOCATION; return; } - c->spill_count = 0; + c->spills = 0; + c->fills = 0; c->threads /= 2; if (c->threads == 1) @@ -4141,8 +4907,8 @@ v3d_nir_to_vir(struct v3d_compile *c) vir_restore_last_thrsw(c, restore_last_thrsw, restore_scoreboard_lock); if (c->spills && - (V3D_DEBUG & (V3D_DEBUG_VIR | - v3d_debug_flag_for_shader_stage(c->s->info.stage)))) { + (V3D_DBG(VIR) || + v3d_debug_flag_for_shader_stage(c->s->info.stage))) { fprintf(stderr, "%s prog %d/%d spilled VIR:\n", vir_get_stage_name(c), c->program_id, c->variant_id); diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c index c559814b9ea..ba76ac87e1e 100644 --- a/src/broadcom/compiler/qpu_schedule.c +++ b/src/broadcom/compiler/qpu_schedule.c @@ -85,6 +85,7 @@ struct schedule_state { struct schedule_node *last_unif; struct schedule_node *last_rtop; struct schedule_node *last_unifa; + struct schedule_node *last_setmsf; enum direction dir; /* Estimated cycle when the current instruction would start. */ uint32_t time; @@ -97,7 +98,7 @@ add_dep(struct schedule_state *state, bool write) { bool write_after_read = !write && state->dir == R; - void *edge_data = (void *)(uintptr_t)write_after_read; + uintptr_t edge_data = write_after_read; if (!before || !after) return; @@ -136,12 +137,14 @@ qpu_inst_is_tlb(const struct v3d_qpu_instr *inst) if (inst->type != V3D_QPU_INSTR_TYPE_ALU) return false; - if (inst->alu.add.magic_write && + if (inst->alu.add.op != V3D_QPU_A_NOP && + inst->alu.add.magic_write && (inst->alu.add.waddr == V3D_QPU_WADDR_TLB || inst->alu.add.waddr == V3D_QPU_WADDR_TLBU)) return true; - if (inst->alu.mul.magic_write && + if (inst->alu.mul.op != V3D_QPU_M_NOP && + inst->alu.mul.magic_write && (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB || inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU)) return true; @@ -153,12 +156,13 @@ static void process_mux_deps(struct schedule_state *state, struct schedule_node *n, enum v3d_qpu_mux mux) { + assert(state->devinfo->ver < 71); switch (mux) { case V3D_QPU_MUX_A: add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n); break; case V3D_QPU_MUX_B: - if (!n->inst->qpu.sig.small_imm) { + if (!n->inst->qpu.sig.small_imm_b) { add_read_dep(state, state->last_rf[n->inst->qpu.raddr_b], n); } @@ -169,6 +173,17 @@ process_mux_deps(struct schedule_state *state, struct schedule_node *n, } } + +static void +process_raddr_deps(struct schedule_state *state, struct schedule_node *n, + uint8_t raddr, bool is_small_imm) +{ + assert(state->devinfo->ver >= 71); + + if (!is_small_imm) + add_read_dep(state, state->last_rf[raddr], n); +} + static bool tmu_write_is_sequence_terminator(uint32_t waddr) { @@ -188,9 +203,6 @@ tmu_write_is_sequence_terminator(uint32_t waddr) static bool can_reorder_tmu_write(const struct v3d_device_info *devinfo, uint32_t waddr) { - if (devinfo->ver < 40) - return false; - if (tmu_write_is_sequence_terminator(waddr)) return false; @@ -253,8 +265,7 @@ process_waddr_deps(struct schedule_state *state, struct schedule_node *n, break; case 
V3D_QPU_WADDR_UNIFA: - if (state->devinfo->ver >= 40) - add_write_dep(state, &state->last_unifa, n); + add_write_dep(state, &state->last_unifa, n); break; case V3D_QPU_WADDR_NOP: @@ -283,6 +294,10 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) /* If the input and output segments are shared, then all VPM reads to * a location need to happen before all writes. We handle this by * serializing all VPM operations for now. + * + * FIXME: we are assuming that the segments are shared. That is + * correct right now as we are only using shared, but technically you + * can choose. */ bool separate_vpm_segment = false; @@ -303,15 +318,39 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) /* XXX: LOAD_IMM */ - if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) - process_mux_deps(state, n, inst->alu.add.a); - if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) - process_mux_deps(state, n, inst->alu.add.b); + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) { + if (devinfo->ver < 71) { + process_mux_deps(state, n, inst->alu.add.a.mux); + } else { + process_raddr_deps(state, n, inst->alu.add.a.raddr, + inst->sig.small_imm_a); + } + } + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) { + if (devinfo->ver < 71) { + process_mux_deps(state, n, inst->alu.add.b.mux); + } else { + process_raddr_deps(state, n, inst->alu.add.b.raddr, + inst->sig.small_imm_b); + } + } - if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) - process_mux_deps(state, n, inst->alu.mul.a); - if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) - process_mux_deps(state, n, inst->alu.mul.b); + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) { + if (devinfo->ver < 71) { + process_mux_deps(state, n, inst->alu.mul.a.mux); + } else { + process_raddr_deps(state, n, inst->alu.mul.a.raddr, + inst->sig.small_imm_c); + } + } + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) { + if (devinfo->ver < 71) { + process_mux_deps(state, n, inst->alu.mul.b.mux); + } else { + process_raddr_deps(state, n, inst->alu.mul.b.raddr, + inst->sig.small_imm_d); + } + } switch (inst->alu.add.op) { case V3D_QPU_A_VPMSETUP: @@ -340,13 +379,24 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) case V3D_QPU_A_MSF: add_read_dep(state, state->last_tlb, n); + add_read_dep(state, state->last_setmsf, n); break; case V3D_QPU_A_SETMSF: + add_write_dep(state, &state->last_setmsf, n); + add_write_dep(state, &state->last_tmu_write, n); + FALLTHROUGH; case V3D_QPU_A_SETREVF: add_write_dep(state, &state->last_tlb, n); break; + case V3D_QPU_A_BALLOT: + case V3D_QPU_A_BCASTF: + case V3D_QPU_A_ALLEQ: + case V3D_QPU_A_ALLFEQ: + add_read_dep(state, state->last_setmsf, n); + break; + default: break; } @@ -384,6 +434,8 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) add_write_dep(state, &state->last_r[4], n); if (v3d_qpu_writes_r5(devinfo, inst)) add_write_dep(state, &state->last_r[5], n); + if (v3d_qpu_writes_rf0_implicitly(devinfo, inst)) + add_write_dep(state, &state->last_rf[0], n); /* If we add any more dependencies here we should consider whether we * also need to update qpu_inst_after_thrsw_valid_in_delay_slot. 
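The dependency edges added in calculate_deps above all follow the same
"last writer" pattern. A condensed sketch of that pattern, with illustrative
names rather than the driver's exact helpers:

    /* Readers depend on the last writer of a resource; a new writer
     * depends on it too and then takes its place. */
    static void
    track_read(struct schedule_state *state,
               struct schedule_node **slot, struct schedule_node *n)
    {
            add_dep(state, *slot, n, false);   /* read after write */
    }

    static void
    track_write(struct schedule_state *state,
                struct schedule_node **slot, struct schedule_node *n)
    {
            add_dep(state, *slot, n, true);    /* write after write */
            *slot = n;
    }

This is why MSF reads above take a read dependency on last_setmsf while
SETMSF takes a write dependency and replaces it: MSF-consuming ops (including
the new BALLOT/BCASTF/ALLEQ/ALLFEQ) must not be reordered across a SETMSF.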
@@ -492,9 +544,16 @@ struct choose_scoreboard { int last_thrsw_tick; int last_branch_tick; int last_setmsf_tick; - bool tlb_locked; + bool first_thrsw_emitted; + bool last_thrsw_emitted; bool fixup_ldvary; int ldvary_count; + int pending_ldtmu_count; + bool first_ldtmu_after_thrsw; + + /* V3D 7.x */ + int last_implicit_rf0_write_tick; + bool has_rf0_flops_conflict; }; static bool @@ -519,7 +578,24 @@ mux_reads_too_soon(struct choose_scoreboard *scoreboard, } static bool -reads_too_soon_after_write(struct choose_scoreboard *scoreboard, +reads_too_soon(struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst, uint8_t raddr) +{ + switch (raddr) { + case 0: /* ldvary delayed write of C coefficient to rf0 */ + if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1) + return true; + break; + default: + break; + } + + return false; +} + +static bool +reads_too_soon_after_write(const struct v3d_device_info *devinfo, + struct choose_scoreboard *scoreboard, struct qinst *qinst) { const struct v3d_qpu_instr *inst = &qinst->qpu; @@ -531,24 +607,44 @@ reads_too_soon_after_write(struct choose_scoreboard *scoreboard, assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); if (inst->alu.add.op != V3D_QPU_A_NOP) { - if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 && - mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) { - return true; + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) { + if (devinfo->ver < 71) { + if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.a.mux)) + return true; + } else { + if (reads_too_soon(scoreboard, inst, inst->alu.add.a.raddr)) + return true; + } } - if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 && - mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) { - return true; + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) { + if (devinfo->ver < 71) { + if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.b.mux)) + return true; + } else { + if (reads_too_soon(scoreboard, inst, inst->alu.add.b.raddr)) + return true; + } } } if (inst->alu.mul.op != V3D_QPU_M_NOP) { - if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 && - mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) { - return true; + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) { + if (devinfo->ver < 71) { + if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a.mux)) + return true; + } else { + if (reads_too_soon(scoreboard, inst, inst->alu.mul.a.raddr)) + return true; + } } - if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 && - mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) { - return true; + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) { + if (devinfo->ver < 71) { + if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b.mux)) + return true; + } else { + if (reads_too_soon(scoreboard, inst, inst->alu.mul.b.raddr)) + return true; + } } } @@ -572,45 +668,83 @@ writes_too_soon_after_write(const struct v3d_device_info *devinfo, v3d_qpu_writes_r4(devinfo, inst)) return true; + if (devinfo->ver == 42) + return false; + + /* Don't schedule anything that writes rf0 right after ldvary, since + * that would clash with the ldvary's delayed rf0 write (the exception + * is another ldvary, since its implicit rf0 write would also have + * one cycle of delay and would not clash). 
+ */ + if (scoreboard->last_ldvary_tick + 1 == scoreboard->tick && + (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) || + (v3d_qpu_writes_rf0_implicitly(devinfo, inst) && + !inst->sig.ldvary))) { + return true; + } + return false; } static bool -pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard, +scoreboard_is_locked(struct choose_scoreboard *scoreboard, + bool lock_scoreboard_on_first_thrsw) +{ + if (lock_scoreboard_on_first_thrsw) { + return scoreboard->first_thrsw_emitted && + scoreboard->tick - scoreboard->last_thrsw_tick >= 3; + } + + return scoreboard->last_thrsw_emitted && + scoreboard->tick - scoreboard->last_thrsw_tick >= 3; +} + +static bool +pixel_scoreboard_too_soon(struct v3d_compile *c, + struct choose_scoreboard *scoreboard, const struct v3d_qpu_instr *inst) { - return (scoreboard->tick == 0 && qpu_inst_is_tlb(inst)); + return qpu_inst_is_tlb(inst) && + !scoreboard_is_locked(scoreboard, + c->lock_scoreboard_on_first_thrsw); } static bool -qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst, +qpu_instruction_uses_rf(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst, uint32_t waddr) { if (inst->type != V3D_QPU_INSTR_TYPE_ALU) return false; - if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) && - inst->raddr_a == waddr) - return true; + if (devinfo->ver < 71) { + if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) && + inst->raddr_a == waddr) + return true; - if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) && - !inst->sig.small_imm && (inst->raddr_b == waddr)) - return true; + if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) && + !inst->sig.small_imm_b && (inst->raddr_b == waddr)) + return true; + } else { + if (v3d71_qpu_reads_raddr(inst, waddr)) + return true; + } return false; } static bool -mux_read_stalls(struct choose_scoreboard *scoreboard, - const struct v3d_qpu_instr *inst) +read_stalls(const struct v3d_device_info *devinfo, + struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst) { return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 && - qpu_instruction_uses_rf(inst, + qpu_instruction_uses_rf(devinfo, inst, scoreboard->last_stallable_sfu_reg); } /* We define a max schedule priority to allow negative priorities as result of - * substracting this max when an instruction stalls. So instructions that + * subtracting this max when an instruction stalls. So instructions that * stall have lower priority than regular instructions. */ #define MAX_SCHEDULE_PRIORITY 16 @@ -628,19 +762,32 @@ get_instruction_priority(const struct v3d_device_info *devinfo, return next_score; next_score++; + /* Empirical testing shows that using priorities to hide latency of + * TMU operations when scheduling QPU leads to slightly worse + * performance, even at 2 threads. We think this is because the thread + * switching is already quite effective at hiding latency and NIR + * scheduling (and possibly TMU pipelining too) are sufficient to hide + * TMU latency, so piling up on that here doesn't provide any benefits + * and instead may cause us to postpone critical paths that depend on + * the TMU results. + */ +#if 0 /* Schedule texture read results collection late to hide latency. */ if (v3d_qpu_waits_on_tmu(inst)) return next_score; next_score++; +#endif /* Default score for things that aren't otherwise special. */ baseline_score = next_score; next_score++; +#if 0 /* Schedule texture read setup early to hide their latency better. 
*/ if (v3d_qpu_writes_tmu(devinfo, inst)) return next_score; next_score++; +#endif /* We should increase the maximum if we assert here */ assert(next_score < MAX_SCHEDULE_PRIORITY); @@ -648,48 +795,59 @@ get_instruction_priority(const struct v3d_device_info *devinfo, return baseline_score; } -static bool -qpu_magic_waddr_is_periph(const struct v3d_device_info *devinfo, - enum v3d_qpu_waddr waddr) -{ - return (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) || - v3d_qpu_magic_waddr_is_sfu(waddr) || - v3d_qpu_magic_waddr_is_tlb(waddr) || - v3d_qpu_magic_waddr_is_vpm(waddr) || - v3d_qpu_magic_waddr_is_tsy(waddr)); -} +enum { + V3D_PERIPHERAL_VPM_READ = (1 << 0), + V3D_PERIPHERAL_VPM_WRITE = (1 << 1), + V3D_PERIPHERAL_VPM_WAIT = (1 << 2), + V3D_PERIPHERAL_SFU = (1 << 3), + V3D_PERIPHERAL_TMU_WRITE = (1 << 4), + V3D_PERIPHERAL_TMU_READ = (1 << 5), + V3D_PERIPHERAL_TMU_WAIT = (1 << 6), + V3D_PERIPHERAL_TMU_WRTMUC_SIG = (1 << 7), + V3D_PERIPHERAL_TSY = (1 << 8), + V3D_PERIPHERAL_TLB_READ = (1 << 9), + V3D_PERIPHERAL_TLB_WRITE = (1 << 10), +}; -static bool -qpu_accesses_peripheral(const struct v3d_device_info *devinfo, - const struct v3d_qpu_instr *inst) +static uint32_t +qpu_peripherals(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst) { - if (v3d_qpu_uses_vpm(inst)) - return true; + uint32_t result = 0; + if (v3d_qpu_reads_vpm(inst)) + result |= V3D_PERIPHERAL_VPM_READ; + if (v3d_qpu_writes_vpm(inst)) + result |= V3D_PERIPHERAL_VPM_WRITE; + if (v3d_qpu_waits_vpm(inst)) + result |= V3D_PERIPHERAL_VPM_WAIT; + + if (v3d_qpu_writes_tmu(devinfo, inst)) + result |= V3D_PERIPHERAL_TMU_WRITE; + if (inst->sig.ldtmu) + result |= V3D_PERIPHERAL_TMU_READ; + if (inst->sig.wrtmuc) + result |= V3D_PERIPHERAL_TMU_WRTMUC_SIG; + if (v3d_qpu_uses_sfu(inst)) - return true; + result |= V3D_PERIPHERAL_SFU; + + if (v3d_qpu_reads_tlb(inst)) + result |= V3D_PERIPHERAL_TLB_READ; + if (v3d_qpu_writes_tlb(inst)) + result |= V3D_PERIPHERAL_TLB_WRITE; if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { if (inst->alu.add.op != V3D_QPU_A_NOP && inst->alu.add.magic_write && - qpu_magic_waddr_is_periph(devinfo, inst->alu.add.waddr)) { - return true; + v3d_qpu_magic_waddr_is_tsy(inst->alu.add.waddr)) { + result |= V3D_PERIPHERAL_TSY; } if (inst->alu.add.op == V3D_QPU_A_TMUWT) - return true; - - if (inst->alu.mul.op != V3D_QPU_M_NOP && - inst->alu.mul.magic_write && - qpu_magic_waddr_is_periph(devinfo, inst->alu.mul.waddr)) { - return true; - } + result |= V3D_PERIPHERAL_TMU_WAIT; } - return (inst->sig.ldvpm || - inst->sig.ldtmu || - inst->sig.ldtlb || - inst->sig.ldtlbu || - inst->sig.wrtmuc); + return result; } static bool @@ -697,30 +855,82 @@ qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *a, const struct v3d_qpu_instr *b) { - const bool a_uses_peripheral = qpu_accesses_peripheral(devinfo, a); - const bool b_uses_peripheral = qpu_accesses_peripheral(devinfo, b); + const uint32_t a_peripherals = qpu_peripherals(devinfo, a); + const uint32_t b_peripherals = qpu_peripherals(devinfo, b); /* We can always do one peripheral access per instruction. */ - if (!a_uses_peripheral || !b_uses_peripheral) + if (util_bitcount(a_peripherals) + util_bitcount(b_peripherals) <= 1) return true; - if (devinfo->ver < 41) + /* V3D 4.x can't do more than one peripheral access except in a + * few cases: + */ + if (devinfo->ver == 42) { + /* WRTMUC signal with TMU register write (other than tmuc). 
*/ + if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG && + b_peripherals == V3D_PERIPHERAL_TMU_WRITE) { + return v3d_qpu_writes_tmu_not_tmuc(devinfo, b); + } + if (b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG && + a_peripherals == V3D_PERIPHERAL_TMU_WRITE) { + return v3d_qpu_writes_tmu_not_tmuc(devinfo, a); + } + + /* TMU read with VPM read/write. */ + if (a_peripherals == V3D_PERIPHERAL_TMU_READ && + (b_peripherals == V3D_PERIPHERAL_VPM_READ || + b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) { + return true; + } + if (b_peripherals == V3D_PERIPHERAL_TMU_READ && + (a_peripherals == V3D_PERIPHERAL_VPM_READ || + a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) { + return true; + } + return false; + } - /* V3D 4.1 and later allow TMU read along with a VPM read or write, and - * WRTMUC with a TMU magic register write (other than tmuc). - */ - if ((a->sig.ldtmu && v3d_qpu_reads_or_writes_vpm(b)) || - (b->sig.ldtmu && v3d_qpu_reads_or_writes_vpm(a))) { - return true; + /* V3D 7.x can't have more than one of these restricted peripherals */ + const uint32_t restricted = V3D_PERIPHERAL_TMU_WRITE | + V3D_PERIPHERAL_TMU_WRTMUC_SIG | + V3D_PERIPHERAL_TSY | + V3D_PERIPHERAL_TLB_READ | + V3D_PERIPHERAL_SFU | + V3D_PERIPHERAL_VPM_READ | + V3D_PERIPHERAL_VPM_WRITE; + + const uint32_t a_restricted = a_peripherals & restricted; + const uint32_t b_restricted = b_peripherals & restricted; + if (a_restricted && b_restricted) { + /* WRTMUC signal with TMU register write (other than tmuc) is + * allowed though. + */ + if (!((a_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG && + b_restricted == V3D_PERIPHERAL_TMU_WRITE && + v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) || + (b_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG && + a_restricted == V3D_PERIPHERAL_TMU_WRITE && + v3d_qpu_writes_tmu_not_tmuc(devinfo, a)))) { + return false; + } } - if ((a->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) || - (b->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(devinfo, a))) { - return true; + /* Only one TMU read per instruction */ + if ((a_peripherals & V3D_PERIPHERAL_TMU_READ) && + (b_peripherals & V3D_PERIPHERAL_TMU_READ)) { + return false; } - return false; + /* Only one TLB access per instruction */ + if ((a_peripherals & (V3D_PERIPHERAL_TLB_WRITE | + V3D_PERIPHERAL_TLB_READ)) && + (b_peripherals & (V3D_PERIPHERAL_TLB_WRITE | + V3D_PERIPHERAL_TLB_READ))) { + return false; + } + + return true; } /* Compute a bitmask of which rf registers are used between @@ -736,42 +946,67 @@ qpu_raddrs_used(const struct v3d_qpu_instr *a, uint64_t raddrs_used = 0; if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A)) raddrs_used |= (1ll << a->raddr_a); - if (!a->sig.small_imm && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B)) + if (!a->sig.small_imm_b && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B)) raddrs_used |= (1ll << a->raddr_b); if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A)) raddrs_used |= (1ll << b->raddr_a); - if (!b->sig.small_imm && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) + if (!b->sig.small_imm_b && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) raddrs_used |= (1ll << b->raddr_b); return raddrs_used; } -/* Take two instructions and attempt to merge their raddr fields - * into one merged instruction. Returns false if the two instructions - * access more than two different rf registers between them, or more - * than one rf register and one small immediate. +/* Takes two instructions and attempts to merge their raddr fields (including + * small immediates) into one merged instruction. 
For V3D 4.x, returns false + * if the two instructions access more than two different rf registers between + * them, or more than one rf register and one small immediate. For 7.x returns + * false if both instructions use small immediates. */ static bool qpu_merge_raddrs(struct v3d_qpu_instr *result, const struct v3d_qpu_instr *add_instr, - const struct v3d_qpu_instr *mul_instr) + const struct v3d_qpu_instr *mul_instr, + const struct v3d_device_info *devinfo) { + if (devinfo->ver >= 71) { + assert(add_instr->sig.small_imm_a + + add_instr->sig.small_imm_b <= 1); + assert(add_instr->sig.small_imm_c + + add_instr->sig.small_imm_d == 0); + assert(mul_instr->sig.small_imm_a + + mul_instr->sig.small_imm_b == 0); + assert(mul_instr->sig.small_imm_c + + mul_instr->sig.small_imm_d <= 1); + + result->sig.small_imm_a = add_instr->sig.small_imm_a; + result->sig.small_imm_b = add_instr->sig.small_imm_b; + result->sig.small_imm_c = mul_instr->sig.small_imm_c; + result->sig.small_imm_d = mul_instr->sig.small_imm_d; + + return (result->sig.small_imm_a + + result->sig.small_imm_b + + result->sig.small_imm_c + + result->sig.small_imm_d) <= 1; + } + + assert(devinfo->ver == 42); + uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr); int naddrs = util_bitcount64(raddrs_used); if (naddrs > 2) return false; - if ((add_instr->sig.small_imm || mul_instr->sig.small_imm)) { + if ((add_instr->sig.small_imm_b || mul_instr->sig.small_imm_b)) { if (naddrs > 1) return false; - if (add_instr->sig.small_imm && mul_instr->sig.small_imm) + if (add_instr->sig.small_imm_b && mul_instr->sig.small_imm_b) if (add_instr->raddr_b != mul_instr->raddr_b) return false; - result->sig.small_imm = true; - result->raddr_b = add_instr->sig.small_imm ? + result->sig.small_imm_b = true; + result->raddr_b = add_instr->sig.small_imm_b ? 
add_instr->raddr_b : mul_instr->raddr_b; } @@ -782,23 +1017,23 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result, raddrs_used &= ~(1ll << raddr_a); result->raddr_a = raddr_a; - if (!result->sig.small_imm) { + if (!result->sig.small_imm_b) { if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) && raddr_a == add_instr->raddr_b) { - if (add_instr->alu.add.a == V3D_QPU_MUX_B) - result->alu.add.a = V3D_QPU_MUX_A; - if (add_instr->alu.add.b == V3D_QPU_MUX_B && + if (add_instr->alu.add.a.mux == V3D_QPU_MUX_B) + result->alu.add.a.mux = V3D_QPU_MUX_A; + if (add_instr->alu.add.b.mux == V3D_QPU_MUX_B && v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) { - result->alu.add.b = V3D_QPU_MUX_A; + result->alu.add.b.mux = V3D_QPU_MUX_A; } } if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) && raddr_a == mul_instr->raddr_b) { - if (mul_instr->alu.mul.a == V3D_QPU_MUX_B) - result->alu.mul.a = V3D_QPU_MUX_A; - if (mul_instr->alu.mul.b == V3D_QPU_MUX_B && + if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_B) + result->alu.mul.a.mux = V3D_QPU_MUX_A; + if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_B && v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) { - result->alu.mul.b = V3D_QPU_MUX_A; + result->alu.mul.b.mux = V3D_QPU_MUX_A; } } } @@ -809,20 +1044,20 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result, result->raddr_b = raddr_b; if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) && raddr_b == add_instr->raddr_a) { - if (add_instr->alu.add.a == V3D_QPU_MUX_A) - result->alu.add.a = V3D_QPU_MUX_B; - if (add_instr->alu.add.b == V3D_QPU_MUX_A && + if (add_instr->alu.add.a.mux == V3D_QPU_MUX_A) + result->alu.add.a.mux = V3D_QPU_MUX_B; + if (add_instr->alu.add.b.mux == V3D_QPU_MUX_A && v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) { - result->alu.add.b = V3D_QPU_MUX_B; + result->alu.add.b.mux = V3D_QPU_MUX_B; } } if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) && raddr_b == mul_instr->raddr_a) { - if (mul_instr->alu.mul.a == V3D_QPU_MUX_A) - result->alu.mul.a = V3D_QPU_MUX_B; - if (mul_instr->alu.mul.b == V3D_QPU_MUX_A && + if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_A) + result->alu.mul.a.mux = V3D_QPU_MUX_B; + if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_A && v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) { - result->alu.mul.b = V3D_QPU_MUX_B; + result->alu.mul.b.mux = V3D_QPU_MUX_B; } } @@ -855,7 +1090,8 @@ add_op_as_mul_op(enum v3d_qpu_add_op op) } static void -qpu_convert_add_to_mul(struct v3d_qpu_instr *inst) +qpu_convert_add_to_mul(const struct v3d_device_info *devinfo, + struct v3d_qpu_instr *inst) { STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add)); assert(inst->alu.add.op != V3D_QPU_A_NOP); @@ -871,6 +1107,87 @@ qpu_convert_add_to_mul(struct v3d_qpu_instr *inst) inst->flags.ac = V3D_QPU_COND_NONE; inst->flags.apf = V3D_QPU_PF_NONE; inst->flags.auf = V3D_QPU_UF_NONE; + + inst->alu.mul.output_pack = inst->alu.add.output_pack; + + inst->alu.mul.a.unpack = inst->alu.add.a.unpack; + inst->alu.mul.b.unpack = inst->alu.add.b.unpack; + inst->alu.add.output_pack = V3D_QPU_PACK_NONE; + inst->alu.add.a.unpack = V3D_QPU_UNPACK_NONE; + inst->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; + + if (devinfo->ver >= 71) { + assert(!inst->sig.small_imm_c && !inst->sig.small_imm_d); + assert(inst->sig.small_imm_a + inst->sig.small_imm_b <= 1); + if (inst->sig.small_imm_a) { + inst->sig.small_imm_c = true; + inst->sig.small_imm_a = false; + } else if (inst->sig.small_imm_b) { + inst->sig.small_imm_d = true; + inst->sig.small_imm_b = false; + } + } +} + +static bool +can_do_mul_as_add(const struct v3d_device_info *devinfo, enum 
v3d_qpu_mul_op op) +{ + switch (op) { + case V3D_QPU_M_MOV: + case V3D_QPU_M_FMOV: + return devinfo->ver >= 71; + default: + return false; + } +} + +static enum v3d_qpu_mul_op +mul_op_as_add_op(enum v3d_qpu_mul_op op) +{ + switch (op) { + case V3D_QPU_M_MOV: + return V3D_QPU_A_MOV; + case V3D_QPU_M_FMOV: + return V3D_QPU_A_FMOV; + default: + unreachable("unexpected mov opcode"); + } +} + +static void +qpu_convert_mul_to_add(struct v3d_qpu_instr *inst) +{ + STATIC_ASSERT(sizeof(inst->alu.add) == sizeof(inst->alu.mul)); + assert(inst->alu.mul.op != V3D_QPU_M_NOP); + assert(inst->alu.add.op == V3D_QPU_A_NOP); + + memcpy(&inst->alu.add, &inst->alu.mul, sizeof(inst->alu.add)); + inst->alu.add.op = mul_op_as_add_op(inst->alu.mul.op); + inst->alu.mul.op = V3D_QPU_M_NOP; + + inst->flags.ac = inst->flags.mc; + inst->flags.apf = inst->flags.mpf; + inst->flags.auf = inst->flags.muf; + inst->flags.mc = V3D_QPU_COND_NONE; + inst->flags.mpf = V3D_QPU_PF_NONE; + inst->flags.muf = V3D_QPU_UF_NONE; + + inst->alu.add.output_pack = inst->alu.mul.output_pack; + inst->alu.add.a.unpack = inst->alu.mul.a.unpack; + inst->alu.add.b.unpack = inst->alu.mul.b.unpack; + inst->alu.mul.output_pack = V3D_QPU_PACK_NONE; + inst->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE; + inst->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; + + assert(!inst->sig.small_imm_a && !inst->sig.small_imm_b); + assert(inst->sig.small_imm_c + inst->sig.small_imm_d <= 1); + if (inst->sig.small_imm_c) { + inst->sig.small_imm_a = true; + inst->sig.small_imm_c = false; + } else if (inst->sig.small_imm_d) { + inst->sig.small_imm_b = true; + inst->sig.small_imm_d = false; + } } static bool @@ -909,20 +1226,20 @@ qpu_merge_inst(const struct v3d_device_info *devinfo, else if (a->alu.mul.op == V3D_QPU_M_NOP && can_do_add_as_mul(b->alu.add.op)) { mul_inst = *b; - qpu_convert_add_to_mul(&mul_inst); + qpu_convert_add_to_mul(devinfo, &mul_inst); merge.alu.mul = mul_inst.alu.mul; - merge.flags.mc = b->flags.ac; - merge.flags.mpf = b->flags.apf; - merge.flags.muf = b->flags.auf; + merge.flags.mc = mul_inst.flags.mc; + merge.flags.mpf = mul_inst.flags.mpf; + merge.flags.muf = mul_inst.flags.muf; add_instr = a; mul_instr = &mul_inst; } else if (a->alu.mul.op == V3D_QPU_M_NOP && can_do_add_as_mul(a->alu.add.op)) { mul_inst = *a; - qpu_convert_add_to_mul(&mul_inst); + qpu_convert_add_to_mul(devinfo, &mul_inst); merge = mul_inst; merge.alu.add = b->alu.add; @@ -938,22 +1255,62 @@ qpu_merge_inst(const struct v3d_device_info *devinfo, } } + struct v3d_qpu_instr add_inst; if (b->alu.mul.op != V3D_QPU_M_NOP) { - if (a->alu.mul.op != V3D_QPU_M_NOP) - return false; - merge.alu.mul = b->alu.mul; + if (a->alu.mul.op == V3D_QPU_M_NOP) { + merge.alu.mul = b->alu.mul; - merge.flags.mc = b->flags.mc; - merge.flags.mpf = b->flags.mpf; - merge.flags.muf = b->flags.muf; + merge.flags.mc = b->flags.mc; + merge.flags.mpf = b->flags.mpf; + merge.flags.muf = b->flags.muf; - mul_instr = b; - add_instr = a; + mul_instr = b; + add_instr = a; + } + /* If a's mul op is used but its add op is not, then see if we + * can convert either a's mul op or b's mul op to an add op + * so we can merge. 
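+         * (Only MOV and FMOV can migrate from the mul ALU to the add ALU
+         * like this, and only on V3D 7.1+; see can_do_mul_as_add and
+         * mul_op_as_add_op above.)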
+                 */
+                else if (a->alu.add.op == V3D_QPU_A_NOP &&
+                         can_do_mul_as_add(devinfo, b->alu.mul.op)) {
+                        add_inst = *b;
+                        qpu_convert_mul_to_add(&add_inst);
+
+                        merge.alu.add = add_inst.alu.add;
+
+                        merge.flags.ac = add_inst.flags.ac;
+                        merge.flags.apf = add_inst.flags.apf;
+                        merge.flags.auf = add_inst.flags.auf;
+
+                        mul_instr = a;
+                        add_instr = &add_inst;
+                } else if (a->alu.add.op == V3D_QPU_A_NOP &&
+                           can_do_mul_as_add(devinfo, a->alu.mul.op)) {
+                        add_inst = *a;
+                        qpu_convert_mul_to_add(&add_inst);
+
+                        merge = add_inst;
+                        merge.alu.mul = b->alu.mul;
+
+                        merge.flags.mc = b->flags.mc;
+                        merge.flags.mpf = b->flags.mpf;
+                        merge.flags.muf = b->flags.muf;
+
+                        mul_instr = b;
+                        add_instr = &add_inst;
+                } else {
+                        return false;
+                }
         }
 
+        /* V3D 4.x and earlier use muxes to select the inputs for the ALUs and
+         * they have restrictions on the number of raddrs that can be addressed
+         * in a single instruction. In V3D 7.x, we don't have that restriction,
+         * but we are still limited to a single small immediate per instruction.
+         */
         if (add_instr && mul_instr &&
-            !qpu_merge_raddrs(&merge, add_instr, mul_instr)) {
-                return false;
+            !qpu_merge_raddrs(&merge, add_instr, mul_instr, devinfo)) {
+                return false;
         }
 
         merge.sig.thrsw |= b->sig.thrsw;
@@ -964,7 +1321,6 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
         merge.sig.ldtmu |= b->sig.ldtmu;
         merge.sig.ldvary |= b->sig.ldvary;
         merge.sig.ldvpm |= b->sig.ldvpm;
-        merge.sig.small_imm |= b->sig.small_imm;
         merge.sig.ldtlb |= b->sig.ldtlb;
         merge.sig.ldtlbu |= b->sig.ldtlbu;
         merge.sig.ucb |= b->sig.ucb;
@@ -1047,24 +1403,25 @@ retry:
                  * regfile A or B that was written to by the previous
                  * instruction."
                  */
-                if (reads_too_soon_after_write(scoreboard, n->inst))
+                if (reads_too_soon_after_write(c->devinfo, scoreboard, n->inst))
                         continue;
 
                 if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst))
                         continue;
 
-                /* "A scoreboard wait must not occur in the first two
-                 * instructions of a fragment shader. This is either the
-                 * explicit Wait for Scoreboard signal or an implicit wait
-                 * with the first tile-buffer read or write instruction."
+                /* "Before doing a TLB access a scoreboard wait must have been
+                 * done. This happens either on the first or last thread
+                 * switch, depending on a setting (scb_wait_on_first_thrsw) in
+                 * the shader state."
                  */
-                if (pixel_scoreboard_too_soon(scoreboard, inst))
+                if (pixel_scoreboard_too_soon(c, scoreboard, inst))
                         continue;
 
-                /* ldunif and ldvary both write r5, but ldunif does so a tick
-                 * sooner. If the ldvary's r5 wasn't used, then ldunif might
+                /* ldunif and ldvary both write the same register (r5 for v42
+                 * and below, rf0 for v71), but ldunif does so a tick sooner.
+                 * If the ldvary's register wasn't used, then ldunif might
                  * otherwise get scheduled so ldunif and ldvary try to update
-                 * r5 in the same tick.
+                 * the register in the same tick.
                  */
                 if ((inst->sig.ldunif || inst->sig.ldunifa) &&
                     scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
@@ -1131,24 +1488,54 @@ retry:
                         continue;
                 }
 
-                        /* Don't merge in something that will lock the TLB.
-                         * Hopwefully what we have in inst will release some
-                         * other instructions, allowing us to delay the
-                         * TLB-locking instruction until later.
+                        /* Don't merge TLB instructions before we have acquired
+                         * the scoreboard lock.
                         */
-                        if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst))
+                        if (pixel_scoreboard_too_soon(c, scoreboard, inst))
                                 continue;
 
-                        /* When we succesfully pair up an ldvary we then try
+                        /* When we successfully pair up an ldvary we then try
                          * to merge it into the previous instruction if
                          * possible to improve pipelining. Don't pick up the
                          * ldvary now if the follow-up fixup would place
                          * it in the delay slots of a thrsw, which is not
                          * allowed and would prevent the fixup from being
-                         * successul.
+                         * successful. In V3D 7.x we can allow this to happen
+                         * as long as it is not the last delay slot.
                          */
-                        if (inst->sig.ldvary &&
-                            scoreboard->last_thrsw_tick + 2 >= scoreboard->tick - 1) {
+                        if (inst->sig.ldvary) {
+                                if (c->devinfo->ver == 42 &&
+                                    scoreboard->last_thrsw_tick + 2 >=
+                                    scoreboard->tick - 1) {
+                                        continue;
+                                }
+                                if (c->devinfo->ver >= 71 &&
+                                    scoreboard->last_thrsw_tick + 2 ==
+                                    scoreboard->tick - 1) {
+                                        continue;
+                                }
+                        }
+
+                        /* We can emit a new tmu lookup with a previous ldtmu
+                         * if doing this would free just enough space in the
+                         * TMU output fifo so we don't overflow, however, this
+                         * is only safe if the ldtmu cannot stall.
+                         *
+                         * A ldtmu can stall if it is not the first following a
+                         * thread switch and corresponds to the first word of a
+                         * read request.
+                         *
+                         * FIXME: For now we forbid pairing up a new lookup
+                         * with a previous ldtmu that is not the first after a
+                         * thrsw if that could overflow the TMU output fifo
+                         * regardless of whether the ldtmu is reading the first
+                         * word of a TMU result or not, since we don't track
+                         * this aspect in the compiler yet.
+                         */
+                        if (prev_inst->inst->qpu.sig.ldtmu &&
+                            !scoreboard->first_ldtmu_after_thrsw &&
+                            (scoreboard->pending_ldtmu_count +
+                             n->inst->ldtmu_count > 16 / c->threads)) {
                                 continue;
                         }
 
@@ -1161,7 +1548,7 @@ retry:
 
                 int prio = get_instruction_priority(c->devinfo, inst);
 
-                if (mux_read_stalls(scoreboard, inst)) {
+                if (read_stalls(c->devinfo, scoreboard, inst)) {
                         /* Don't merge an instruction that stalls */
                         if (prev_inst)
                                 continue;
@@ -1225,7 +1612,7 @@ update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
 {
         if (v3d_qpu_magic_waddr_is_sfu(waddr))
                 scoreboard->last_magic_sfu_write_tick = scoreboard->tick;
-        else if (devinfo->ver >= 40 && waddr == V3D_QPU_WADDR_UNIFA)
+        else if (waddr == V3D_QPU_WADDR_UNIFA)
                 scoreboard->last_unifa_write_tick = scoreboard->tick;
 }
 
@@ -1240,10 +1627,87 @@ update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard,
 }
 
 static void
+update_scoreboard_tmu_tracking(struct choose_scoreboard *scoreboard,
+                               const struct qinst *inst)
+{
+        /* Track if we have seen any ldtmu after the last thread switch */
+        if (scoreboard->tick == scoreboard->last_thrsw_tick + 2)
+                scoreboard->first_ldtmu_after_thrsw = true;
+
+        /* Track the number of pending ldtmu instructions for outstanding
+         * TMU lookups.
+ */ + scoreboard->pending_ldtmu_count += inst->ldtmu_count; + if (inst->qpu.sig.ldtmu) { + assert(scoreboard->pending_ldtmu_count > 0); + scoreboard->pending_ldtmu_count--; + scoreboard->first_ldtmu_after_thrsw = false; + } +} + +static void +set_has_rf0_flops_conflict(struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst, + const struct v3d_device_info *devinfo) +{ + if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick && + v3d_qpu_sig_writes_address(devinfo, &inst->sig) && + !inst->sig_magic) { + scoreboard->has_rf0_flops_conflict = true; + } +} + +static void +update_scoreboard_for_rf0_flops(struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst, + const struct v3d_device_info *devinfo) +{ + if (devinfo->ver < 71) + return; + + /* Thread switch restrictions: + * + * At the point of a thread switch or thread end (when the actual + * thread switch or thread end happens, not when the signalling + * instruction is processed): + * + * - If the most recent write to rf0 was from a ldunif, ldunifa, or + * ldvary instruction in which another signal also wrote to the + * register file, and the final instruction of the thread section + * contained a signal which wrote to the register file, then the + * value of rf0 is undefined at the start of the new section + * + * Here we use the scoreboard to track if our last rf0 implicit write + * happens at the same time that another signal writes the register + * file (has_rf0_flops_conflict). We will use that information when + * scheduling thrsw instructions to avoid putting anything in their + * last delay slot which has a signal that writes to the register file. + */ + + /* Reset tracking if we have an explicit rf0 write or we are starting + * a new thread section. + */ + if (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) || + scoreboard->tick - scoreboard->last_thrsw_tick == 3) { + scoreboard->last_implicit_rf0_write_tick = -10; + scoreboard->has_rf0_flops_conflict = false; + } + + if (v3d_qpu_writes_rf0_implicitly(devinfo, inst)) { + scoreboard->last_implicit_rf0_write_tick = inst->sig.ldvary ? 
+ scoreboard->tick + 1 : scoreboard->tick; + } + + set_has_rf0_flops_conflict(scoreboard, inst, devinfo); +} + +static void update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, - const struct v3d_qpu_instr *inst, + const struct qinst *qinst, const struct v3d_device_info *devinfo) { + const struct v3d_qpu_instr *inst = &qinst->qpu; + if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) return; @@ -1271,11 +1735,18 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, } } + if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && inst->sig_magic) { + update_scoreboard_for_magic_waddr(scoreboard, + inst->sig_addr, + devinfo); + } + if (inst->sig.ldvary) scoreboard->last_ldvary_tick = scoreboard->tick; - if (qpu_inst_is_tlb(inst)) - scoreboard->tlb_locked = true; + update_scoreboard_for_rf0_flops(scoreboard, inst, devinfo); + + update_scoreboard_tmu_tracking(scoreboard, qinst); } static void @@ -1352,23 +1823,25 @@ instruction_latency(const struct v3d_device_info *devinfo, after_inst->type != V3D_QPU_INSTR_TYPE_ALU) return latency; - if (before_inst->alu.add.magic_write) { + if (v3d_qpu_instr_is_sfu(before_inst)) + return 2; + + if (before_inst->alu.add.op != V3D_QPU_A_NOP && + before_inst->alu.add.magic_write) { latency = MAX2(latency, magic_waddr_latency(devinfo, before_inst->alu.add.waddr, after_inst)); } - if (before_inst->alu.mul.magic_write) { + if (before_inst->alu.mul.op != V3D_QPU_M_NOP && + before_inst->alu.mul.magic_write) { latency = MAX2(latency, magic_waddr_latency(devinfo, before_inst->alu.mul.waddr, after_inst)); } - if (v3d_qpu_instr_is_sfu(before_inst)) - return 2; - return latency; } @@ -1437,7 +1910,7 @@ insert_scheduled_instruction(struct v3d_compile *c, { list_addtail(&inst->link, &block->instructions); - update_scoreboard_for_chosen(scoreboard, &inst->qpu, c->devinfo); + update_scoreboard_for_chosen(scoreboard, inst, c->devinfo); c->qpu_inst_count++; scoreboard->tick++; } @@ -1464,16 +1937,13 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, { const struct v3d_qpu_instr *inst = &qinst->qpu; - /* Only TLB Z writes are prohibited in the last slot, but we don't - * have those flagged so prohibit all TLB ops for now. - */ - if (slot == 2 && qpu_inst_is_tlb(inst)) + if (slot == 2 && qinst->is_tlb_z_write) return false; if (slot > 0 && qinst->uniform != ~0) return false; - if (v3d_qpu_uses_vpm(inst)) + if (c->devinfo->ver == 42 && v3d_qpu_waits_vpm(inst)) return false; if (inst->sig.ldvary) @@ -1481,36 +1951,64 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { /* GFXH-1625: TMUWT not allowed in the final instruction. */ - if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT) + if (c->devinfo->ver == 42 && slot == 2 && + inst->alu.add.op == V3D_QPU_A_TMUWT) { return false; + } - /* No writing physical registers at the end. */ - if (!inst->alu.add.magic_write || - !inst->alu.mul.magic_write) { - return false; + if (c->devinfo->ver == 42) { + /* No writing physical registers at the end. 
*/ + bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP; + bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP; + if ((!add_is_nop && !inst->alu.add.magic_write) || + (!mul_is_nop && !inst->alu.mul.magic_write)) { + return false; + } + + if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) && + !inst->sig_magic) { + return false; + } } - if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF) - return false; + if (c->devinfo->ver >= 71) { + /* The thread end instruction must not write to the + * register file via the add/mul ALUs. + */ + if (slot == 0 && + (!inst->alu.add.magic_write || + !inst->alu.mul.magic_write)) { + return false; + } + } - /* RF0-2 might be overwritten during the delay slots by - * fragment shader setup. - */ - if (inst->raddr_a < 3 && - (inst->alu.add.a == V3D_QPU_MUX_A || - inst->alu.add.b == V3D_QPU_MUX_A || - inst->alu.mul.a == V3D_QPU_MUX_A || - inst->alu.mul.b == V3D_QPU_MUX_A)) { - return false; + if (c->devinfo->ver == 42) { + /* RF0-2 might be overwritten during the delay slots by + * fragment shader setup. + */ + if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A)) + return false; + + if (inst->raddr_b < 3 && + !inst->sig.small_imm_b && + v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) { + return false; + } } - if (inst->raddr_b < 3 && - !inst->sig.small_imm && - (inst->alu.add.a == V3D_QPU_MUX_B || - inst->alu.add.b == V3D_QPU_MUX_B || - inst->alu.mul.a == V3D_QPU_MUX_B || - inst->alu.mul.b == V3D_QPU_MUX_B)) { - return false; + if (c->devinfo->ver >= 71) { + /* RF2-3 might be overwritten during the delay slots by + * fragment shader setup. + */ + if (v3d71_qpu_reads_raddr(inst, 2) || + v3d71_qpu_reads_raddr(inst, 3)) { + return false; + } + + if (v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 2) || + v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 3)) { + return false; + } } } @@ -1526,6 +2024,7 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, */ static bool qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c, + struct choose_scoreboard *scoreboard, const struct qinst *qinst, uint32_t slot) { @@ -1533,15 +2032,19 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c, * thread. The simulator complains for safety, though it * would only occur for dead code in our case. */ - if (slot > 0 && - qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && - (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) || - v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) { - return false; + if (slot > 0) { + if (c->devinfo->ver == 42 && v3d_qpu_instr_is_legacy_sfu(&qinst->qpu)) + return false; + if (c->devinfo->ver >= 71 && v3d_qpu_instr_is_sfu(&qinst->qpu)) + return false; } - if (slot > 0 && qinst->qpu.sig.ldvary) - return false; + if (qinst->qpu.sig.ldvary) { + if (c->devinfo->ver == 42 && slot > 0) + return false; + if (c->devinfo->ver >= 71 && slot == 2) + return false; + } /* unifa and the following 3 instructions can't overlap a * thread switch/end. 
The docs further clarify that this means @@ -1560,6 +2063,17 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c, if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu)) return false; + /* See comment when we set has_rf0_flops_conflict for details */ + if (c->devinfo->ver >= 71 && + slot == 2 && + v3d_qpu_sig_writes_address(c->devinfo, &qinst->qpu.sig) && + !qinst->qpu.sig_magic) { + if (scoreboard->has_rf0_flops_conflict) + return false; + if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick) + return false; + } + return true; } @@ -1579,7 +2093,7 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c, assert(slot <= 2); /* We merge thrsw instructions back into the instruction stream - * manually, so any instructions scheduled after a thrsw shold be + * manually, so any instructions scheduled after a thrsw should be * in the actual delay slots and not in the same slot as the thrsw. */ assert(slot >= 1); @@ -1592,7 +2106,7 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c, * also apply to instructions scheduled after the thrsw that we want * to place in its delay slots. */ - if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot)) + if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard, qinst, slot)) return false; /* TLB access is disallowed until scoreboard wait is executed, which @@ -1648,6 +2162,14 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c, if (v3d_qpu_writes_flags(&qinst->qpu)) return false; + /* TSY sync ops materialize at the point of the next thread switch, + * therefore, if we have a TSY sync right after a thread switch, we + * cannot place it in its delay slots, or we would be moving the sync + * to the thrsw before it instead. + */ + if (qinst->qpu.alu.add.op == V3D_QPU_A_BARRIERID) + return false; + return true; } @@ -1656,15 +2178,11 @@ valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard struct qinst *qinst, int instructions_in_sequence, bool is_thrend) { - /* No emitting our thrsw while the previous thrsw hasn't happened yet. */ - if (scoreboard->last_thrsw_tick + 3 > - scoreboard->tick - instructions_in_sequence) { - return false; - } - for (int slot = 0; slot < instructions_in_sequence; slot++) { - if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot)) + if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard, + qinst, slot)) { return false; + } if (is_thrend && !qpu_inst_valid_in_thrend_slot(c, qinst, slot)) { @@ -1714,26 +2232,77 @@ emit_thrsw(struct v3d_compile *c, /* Find how far back into previous instructions we can put the THRSW. */ int slots_filled = 0; + int invalid_sig_count = 0; + int invalid_seq_count = 0; + bool last_thrsw_after_invalid_ok = false; struct qinst *merge_inst = NULL; vir_for_each_inst_rev(prev_inst, block) { - struct v3d_qpu_sig sig = prev_inst->qpu.sig; - sig.thrsw = true; - uint32_t packed_sig; - - if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) + /* No emitting our thrsw while the previous thrsw hasn't + * happened yet. + */ + if (scoreboard->last_thrsw_tick + 3 > + scoreboard->tick - (slots_filled + 1)) { break; + } + if (!valid_thrsw_sequence(c, scoreboard, prev_inst, slots_filled + 1, is_thrend)) { - break; + /* Even if the current sequence isn't valid, we may + * be able to get a valid sequence by trying to move the + * thrsw earlier, so keep going. 
+ */ + invalid_seq_count++; + goto cont_block; + } + + struct v3d_qpu_sig sig = prev_inst->qpu.sig; + sig.thrsw = true; + uint32_t packed_sig; + if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) { + /* If we can't merge the thrsw here because of signal + * incompatibility, keep going, we might be able to + * merge it in an earlier instruction. + */ + invalid_sig_count++; + goto cont_block; } + /* For last thrsw we need 2 consecutive slots that are + * thrsw compatible, so if we have previously jumped over + * an incompatible signal, flag that we have found the first + * valid slot here and keep going. + */ + if (inst->is_last_thrsw && invalid_sig_count > 0 && + !last_thrsw_after_invalid_ok) { + last_thrsw_after_invalid_ok = true; + invalid_sig_count++; + goto cont_block; + } + + /* We can merge the thrsw in this instruction */ + last_thrsw_after_invalid_ok = false; + invalid_sig_count = 0; + invalid_seq_count = 0; merge_inst = prev_inst; + +cont_block: if (++slots_filled == 3) break; } + /* If we jumped over a signal incompatibility and did not manage to + * merge the thrsw in the end, we need to adjust slots filled to match + * the last valid merge point. + */ + assert((invalid_sig_count == 0 && invalid_seq_count == 0) || + slots_filled >= invalid_sig_count + invalid_seq_count); + if (invalid_sig_count > 0) + slots_filled -= invalid_sig_count; + if (invalid_seq_count > 0) + slots_filled -= invalid_seq_count; + bool needs_free = false; if (merge_inst) { merge_inst->qpu.sig.thrsw = true; @@ -1747,6 +2316,8 @@ emit_thrsw(struct v3d_compile *c, merge_inst = inst; } + scoreboard->first_thrsw_emitted = true; + /* If we're emitting the last THRSW (other than program end), then * signal that to the HW by emitting two THRSWs in a row. */ @@ -1758,6 +2329,7 @@ emit_thrsw(struct v3d_compile *c, struct qinst *second_inst = (struct qinst *)merge_inst->link.next; second_inst->qpu.sig.thrsw = true; + scoreboard->last_thrsw_emitted = true; } /* Make sure the thread end executes within the program lifespan */ @@ -1811,10 +2383,11 @@ emit_branch(struct v3d_compile *c, assert(scoreboard->last_branch_tick + 3 < branch_tick); assert(scoreboard->last_unifa_write_tick + 3 < branch_tick); - /* Can't place a branch with msfign != 0 and cond != 0,2,3 after + /* V3D 4.x can't place a branch with msfign != 0 and cond != 0,2,3 after * setmsf. */ bool is_safe_msf_branch = + c->devinfo->ver >= 71 || inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE || inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS || inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 || @@ -1851,6 +2424,14 @@ emit_branch(struct v3d_compile *c, break; } + /* Do not move up a branch if it can disrupt an ldvary sequence + * as that can cause stomping of the r5 register. + */ + if (scoreboard->last_ldvary_tick + 2 >= + branch_tick - slots_filled) { + break; + } + /* Can't move a conditional branch before the instruction * that writes the flags for its condition. 
*/ @@ -1890,46 +2471,72 @@ emit_branch(struct v3d_compile *c, } static bool -alu_reads_register(struct v3d_qpu_instr *inst, +alu_reads_register(const struct v3d_device_info *devinfo, + struct v3d_qpu_instr *inst, bool add, bool magic, uint32_t index) { uint32_t num_src; - enum v3d_qpu_mux mux_a, mux_b; - - if (add) { + if (add) num_src = v3d_qpu_add_op_num_src(inst->alu.add.op); - mux_a = inst->alu.add.a; - mux_b = inst->alu.add.b; - } else { + else num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op); - mux_a = inst->alu.mul.a; - mux_b = inst->alu.mul.b; - } - for (int i = 0; i < num_src; i++) { - if (magic) { - if (i == 0 && mux_a == index) - return true; - if (i == 1 && mux_b == index) - return true; + if (devinfo->ver == 42) { + enum v3d_qpu_mux mux_a, mux_b; + if (add) { + mux_a = inst->alu.add.a.mux; + mux_b = inst->alu.add.b.mux; } else { - if (i == 0 && mux_a == V3D_QPU_MUX_A && - inst->raddr_a == index) { - return true; - } - if (i == 0 && mux_a == V3D_QPU_MUX_B && - inst->raddr_b == index) { - return true; - } - if (i == 1 && mux_b == V3D_QPU_MUX_A && - inst->raddr_a == index) { - return true; - } - if (i == 1 && mux_b == V3D_QPU_MUX_B && - inst->raddr_b == index) { - return true; + mux_a = inst->alu.mul.a.mux; + mux_b = inst->alu.mul.b.mux; + } + + for (int i = 0; i < num_src; i++) { + if (magic) { + if (i == 0 && mux_a == index) + return true; + if (i == 1 && mux_b == index) + return true; + } else { + if (i == 0 && mux_a == V3D_QPU_MUX_A && + inst->raddr_a == index) { + return true; + } + if (i == 0 && mux_a == V3D_QPU_MUX_B && + inst->raddr_b == index) { + return true; + } + if (i == 1 && mux_b == V3D_QPU_MUX_A && + inst->raddr_a == index) { + return true; + } + if (i == 1 && mux_b == V3D_QPU_MUX_B && + inst->raddr_b == index) { + return true; + } } } + + return false; + } + + assert(devinfo->ver >= 71); + assert(!magic); + + uint32_t raddr_a, raddr_b; + if (add) { + raddr_a = inst->alu.add.a.raddr; + raddr_b = inst->alu.add.b.raddr; + } else { + raddr_a = inst->alu.mul.a.raddr; + raddr_b = inst->alu.mul.b.raddr; + } + + for (int i = 0; i < num_src; i++) { + if (i == 0 && raddr_a == index) + return true; + if (i == 1 && raddr_b == index) + return true; } return false; @@ -1964,7 +2571,9 @@ fixup_pipelined_ldvary(struct v3d_compile *c, struct qblock *block, struct v3d_qpu_instr *inst) { - /* We only call this if we have successfuly merged an ldvary into a + const struct v3d_device_info *devinfo = c->devinfo; + + /* We only call this if we have successfully merged an ldvary into a * previous instruction. */ assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); @@ -1976,9 +2585,20 @@ fixup_pipelined_ldvary(struct v3d_compile *c, * the ldvary destination, if it does, then moving the ldvary before * it would overwrite it. */ - if (alu_reads_register(inst, true, ldvary_magic, ldvary_index)) + if (alu_reads_register(devinfo, inst, true, ldvary_magic, ldvary_index)) return false; - if (alu_reads_register(inst, false, ldvary_magic, ldvary_index)) + if (alu_reads_register(devinfo, inst, false, ldvary_magic, ldvary_index)) + return false; + + /* The implicit ldvary destination may not be written to by a signal + * in the instruction following ldvary. Since we are planning to move + * ldvary to the previous instruction, this means we need to check if + * the current instruction has any other signal that could create this + * conflict. The only other signal that can write to the implicit + * ldvary destination that is compatible with ldvary in the same + * instruction is ldunif. 
+         */
+        if (inst->sig.ldunif)
                 return false;
 
         /* The previous instruction can't write to the same destination as the
@@ -2003,7 +2623,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
         }
 
         /* The previous instruction cannot have a conflicting signal */
-        if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig))
+        if (v3d_qpu_sig_writes_address(devinfo, &prev->qpu.sig))
+                return false;
+
+        uint32_t sig;
+        struct v3d_qpu_sig new_sig = prev->qpu.sig;
+        new_sig.ldvary = true;
+        if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig))
                 return false;
 
         /* The previous instruction cannot use flags since ldvary uses the
@@ -2016,9 +2642,13 @@
 
         /* We can't put an ldvary in the delay slots of a thrsw. We should've
          * prevented this when pairing up the ldvary with another instruction
-         * and flagging it for a fixup.
+         * and flagging it for a fixup. In V3D 7.x this is limited only to the
+         * second delay slot.
          */
-        assert(scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1);
+        assert((devinfo->ver == 42 &&
+                scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1) ||
+               (devinfo->ver >= 71 &&
+                scoreboard->last_thrsw_tick + 2 != scoreboard->tick - 1));
 
         /* Move the ldvary to the previous instruction and remove it from the
          * current one.
@@ -2032,14 +2662,25 @@
         inst->sig_magic = false;
         inst->sig_addr = 0;
 
-        /* By moving ldvary to the previous instruction we make it update
-         * r5 in the current one, so nothing else in it should write r5.
+        /* Update rf0 flops tracking for new ldvary delayed rf0 write tick */
+        if (devinfo->ver >= 71) {
+                scoreboard->last_implicit_rf0_write_tick = scoreboard->tick;
+                set_has_rf0_flops_conflict(scoreboard, inst, devinfo);
+        }
+
+        /* By moving ldvary to the previous instruction we make it update r5
+         * (rf0 for ver >= 71) in the current one, so nothing else in it
+         * should write this register.
+         *
          * This should've been prevented by our dependency tracking, which
         * would not allow ldvary to be paired up with an instruction that
-         * writes r5 (since our dependency tracking doesn't know that the
-         * ldvary write r5 happens in the next instruction).
+         * writes r5/rf0 (since our dependency tracking doesn't know that the
+         * ldvary write to r5/rf0 happens in the next instruction).
*/ - assert(!v3d_qpu_writes_r5(c->devinfo, inst)); + assert(!v3d_qpu_writes_r5(devinfo, inst)); + assert(devinfo->ver == 42 || + (!v3d_qpu_writes_rf0_implicitly(devinfo, inst) && + !v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0))); return true; } @@ -2102,6 +2743,9 @@ schedule_instructions(struct v3d_compile *c, merge->inst->uniform; } + chosen->inst->ldtmu_count += + merge->inst->ldtmu_count; + if (debug) { fprintf(stderr, "t=%4d: merging: ", time); @@ -2127,7 +2771,7 @@ schedule_instructions(struct v3d_compile *c, } } } - if (mux_read_stalls(scoreboard, inst)) + if (read_stalls(c->devinfo, scoreboard, inst)) c->qpu_inst_stalled_count++; } @@ -2351,6 +2995,8 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c) scoreboard.last_branch_tick = -10; scoreboard.last_setmsf_tick = -10; scoreboard.last_stallable_sfu_tick = -10; + scoreboard.first_ldtmu_after_thrsw = true; + scoreboard.last_implicit_rf0_write_tick = - 10; if (debug) { fprintf(stderr, "Pre-schedule instructions\n"); diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c index ec9ed66650c..538b247e3e0 100644 --- a/src/broadcom/compiler/qpu_validate.c +++ b/src/broadcom/compiler/qpu_validate.c @@ -41,6 +41,7 @@ struct v3d_qpu_validate_state { int last_sfu_write; int last_branch_ip; int last_thrsw_ip; + int first_tlb_z_write; /* Set when we've found the last-THRSW signal, or if we were started * in single-segment mode. @@ -110,11 +111,58 @@ static void qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) { const struct v3d_device_info *devinfo = state->c->devinfo; + + if (qinst->is_tlb_z_write && state->ip < state->first_tlb_z_write) + state->first_tlb_z_write = state->ip; + const struct v3d_qpu_instr *inst = &qinst->qpu; + if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH && + state->first_tlb_z_write >= 0 && + state->ip > state->first_tlb_z_write && + inst->branch.msfign != V3D_QPU_MSFIGN_NONE && + inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS && + inst->branch.cond != V3D_QPU_BRANCH_COND_A0 && + inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) { + fail_instr(state, "Implicit branch MSF read after TLB Z write"); + } + if (inst->type != V3D_QPU_INSTR_TYPE_ALU) return; + if (inst->alu.add.op == V3D_QPU_A_SETMSF && + state->first_tlb_z_write >= 0 && + state->ip > state->first_tlb_z_write) { + fail_instr(state, "SETMSF after TLB Z write"); + } + + if (state->first_tlb_z_write >= 0 && + state->ip > state->first_tlb_z_write && + inst->alu.add.op == V3D_QPU_A_MSF) { + fail_instr(state, "MSF read after TLB Z write"); + } + + if (devinfo->ver < 71) { + if (inst->sig.small_imm_a || inst->sig.small_imm_c || + inst->sig.small_imm_d) { + fail_instr(state, "small imm a/c/d added after V3D 7.1"); + } + } else { + if ((inst->sig.small_imm_a || inst->sig.small_imm_b) && + !vir_is_add(qinst)) { + fail_instr(state, "small imm a/b used but no ADD inst"); + } + if ((inst->sig.small_imm_c || inst->sig.small_imm_d) && + !vir_is_mul(qinst)) { + fail_instr(state, "small imm c/d used but no MUL inst"); + } + if (inst->sig.small_imm_a + inst->sig.small_imm_b + + inst->sig.small_imm_c + inst->sig.small_imm_d > 1) { + fail_instr(state, "only one small immediate can be " + "enabled per instruction"); + } + } + /* LDVARY writes r5 two instructions later and LDUNIF writes * r5 one instruction later, which is illegal to have * together. 
@@ -128,7 +176,7 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) * * FIXME: This would not check correctly for V3D 4.2 versions lower * than V3D 4.2.14, but that is not a real issue because the simulator - * will still catch this, and we are not really targetting any such + * will still catch this, and we are not really targeting any such * versions anyway. */ if (state->c->devinfo->ver < 42) { @@ -194,8 +242,14 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) "SFU write started during THRSW delay slots "); } - if (inst->sig.ldvary) - fail_instr(state, "LDVARY during THRSW delay slots"); + if (inst->sig.ldvary) { + if (devinfo->ver == 42) + fail_instr(state, "LDVARY during THRSW delay slots"); + if (devinfo->ver >= 71 && + state->ip - state->last_thrsw_ip == 2) { + fail_instr(state, "LDVARY in 2nd THRSW delay slot"); + } + } } (void)qpu_magic_waddr_matches; /* XXX */ @@ -222,7 +276,7 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) vpm_writes + tlb_writes + tsy_writes + - inst->sig.ldtmu + + (devinfo->ver == 42 ? inst->sig.ldtmu : 0) + inst->sig.ldtlb + inst->sig.ldvpm + inst->sig.ldtlbu > 1) { @@ -262,17 +316,48 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) inst->type == V3D_QPU_INSTR_TYPE_ALU) { if ((inst->alu.add.op != V3D_QPU_A_NOP && !inst->alu.add.magic_write)) { - fail_instr(state, "RF write after THREND"); + if (devinfo->ver == 42) { + fail_instr(state, "RF write after THREND"); + } else if (devinfo->ver >= 71) { + if (state->last_thrsw_ip - state->ip == 0) { + fail_instr(state, + "ADD RF write at THREND"); + } + if (inst->alu.add.waddr == 2 || + inst->alu.add.waddr == 3) { + fail_instr(state, + "RF2-3 write after THREND"); + } + } } if ((inst->alu.mul.op != V3D_QPU_M_NOP && !inst->alu.mul.magic_write)) { - fail_instr(state, "RF write after THREND"); + if (devinfo->ver == 42) { + fail_instr(state, "RF write after THREND"); + } else if (devinfo->ver >= 71) { + if (state->last_thrsw_ip - state->ip == 0) { + fail_instr(state, + "MUL RF write at THREND"); + } + + if (inst->alu.mul.waddr == 2 || + inst->alu.mul.waddr == 3) { + fail_instr(state, + "RF2-3 write after THREND"); + } + } } if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && !inst->sig_magic) { - fail_instr(state, "RF write after THREND"); + if (devinfo->ver == 42) { + fail_instr(state, "RF write after THREND"); + } else if (devinfo->ver >= 71 && + (inst->sig_addr == 2 || + inst->sig_addr == 3)) { + fail_instr(state, "RF2-3 write after THREND"); + } } /* GFXH-1625: No TMUWT in the last instruction */ @@ -312,7 +397,7 @@ qpu_validate(struct v3d_compile *c) * keep compiling the validation code to make sure it doesn't get * broken. 
*/ -#ifndef DEBUG +#if !MESA_DEBUG return; #endif @@ -321,6 +406,7 @@ qpu_validate(struct v3d_compile *c) .last_sfu_write = -10, .last_thrsw_ip = -10, .last_branch_ip = -10, + .first_tlb_z_write = INT_MAX, .ip = 0, .last_thrsw_found = !c->last_thrsw, diff --git a/src/broadcom/compiler/v3d33_tex.c b/src/broadcom/compiler/v3d33_tex.c deleted file mode 100644 index b933635f6fe..00000000000 --- a/src/broadcom/compiler/v3d33_tex.c +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright © 2016-2018 Broadcom - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#include "v3d_compiler.h" - -/* We don't do any address packing. */ -#define __gen_user_data void -#define __gen_address_type uint32_t -#define __gen_address_offset(reloc) (*reloc) -#define __gen_emit_reloc(cl, reloc) -#include "cle/v3d_packet_v33_pack.h" - -void -v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) -{ - /* FIXME: We don't bother implementing pipelining for texture reads - * for any pre 4.x hardware. It should be straight forward to do but - * we are not really testing or even targetting this hardware at - * present. 
- */ - ntq_flush_tmu(c); - - unsigned unit = instr->texture_index; - - struct V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1 p0_unpacked = { - V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1_header, - - .fetch_sample_mode = instr->op == nir_texop_txf, - }; - - struct V3D33_TEXTURE_UNIFORM_PARAMETER_1_CFG_MODE1 p1_unpacked = { - }; - - switch (instr->sampler_dim) { - case GLSL_SAMPLER_DIM_1D: - if (instr->is_array) - p0_unpacked.lookup_type = TEXTURE_1D_ARRAY; - else - p0_unpacked.lookup_type = TEXTURE_1D; - break; - case GLSL_SAMPLER_DIM_2D: - case GLSL_SAMPLER_DIM_RECT: - if (instr->is_array) - p0_unpacked.lookup_type = TEXTURE_2D_ARRAY; - else - p0_unpacked.lookup_type = TEXTURE_2D; - break; - case GLSL_SAMPLER_DIM_3D: - p0_unpacked.lookup_type = TEXTURE_3D; - break; - case GLSL_SAMPLER_DIM_CUBE: - p0_unpacked.lookup_type = TEXTURE_CUBE_MAP; - break; - default: - unreachable("Bad sampler type"); - } - - struct qreg coords[5]; - int next_coord = 0; - for (unsigned i = 0; i < instr->num_srcs; i++) { - switch (instr->src[i].src_type) { - case nir_tex_src_coord: - for (int j = 0; j < instr->coord_components; j++) { - coords[next_coord++] = - ntq_get_src(c, instr->src[i].src, j); - } - if (instr->coord_components < 2) - coords[next_coord++] = vir_uniform_f(c, 0.5); - break; - case nir_tex_src_bias: - coords[next_coord++] = - ntq_get_src(c, instr->src[i].src, 0); - - p0_unpacked.bias_supplied = true; - break; - case nir_tex_src_lod: - coords[next_coord++] = - vir_FADD(c, - ntq_get_src(c, instr->src[i].src, 0), - vir_uniform(c, QUNIFORM_TEXTURE_FIRST_LEVEL, - unit)); - - if (instr->op != nir_texop_txf && - instr->op != nir_texop_tg4) { - p0_unpacked.disable_autolod_use_bias_only = true; - } - break; - case nir_tex_src_comparator: - coords[next_coord++] = - ntq_get_src(c, instr->src[i].src, 0); - - p0_unpacked.shadow = true; - break; - - case nir_tex_src_offset: { - p0_unpacked.texel_offset_for_s_coordinate = - nir_src_comp_as_int(instr->src[i].src, 0); - - if (instr->coord_components >= 2) - p0_unpacked.texel_offset_for_t_coordinate = - nir_src_comp_as_int(instr->src[i].src, 1); - - if (instr->coord_components >= 3) - p0_unpacked.texel_offset_for_r_coordinate = - nir_src_comp_as_int(instr->src[i].src, 2); - break; - } - - default: - unreachable("unknown texture source"); - } - } - - /* Limit the number of channels returned to both how many the NIR - * instruction writes and how many the instruction could produce. - */ - p1_unpacked.return_words_of_texture_data = - instr->dest.is_ssa ? - nir_ssa_def_components_read(&instr->dest.ssa) : - (1 << instr->dest.reg.reg->num_components) - 1; - - uint32_t p0_packed; - V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1_pack(NULL, - (uint8_t *)&p0_packed, - &p0_unpacked); - - uint32_t p1_packed; - V3D33_TEXTURE_UNIFORM_PARAMETER_1_CFG_MODE1_pack(NULL, - (uint8_t *)&p1_packed, - &p1_unpacked); - /* Load unit number into the address field, which will be be used by - * the driver to decide which texture to put in the actual address - * field. - */ - p1_packed |= unit << 5; - - /* There is no native support for GL texture rectangle coordinates, so - * we have to rescale from ([0, width], [0, height]) to ([0, 1], [0, - * 1]). 
- */ - if (instr->sampler_dim == GLSL_SAMPLER_DIM_RECT) { - coords[0] = vir_FMUL(c, coords[0], - vir_uniform(c, QUNIFORM_TEXRECT_SCALE_X, - unit)); - coords[1] = vir_FMUL(c, coords[1], - vir_uniform(c, QUNIFORM_TEXRECT_SCALE_Y, - unit)); - } - - int texture_u[] = { - vir_get_uniform_index(c, QUNIFORM_TEXTURE_CONFIG_P0_0 + unit, p0_packed), - vir_get_uniform_index(c, QUNIFORM_TEXTURE_CONFIG_P1, p1_packed), - }; - - for (int i = 0; i < next_coord; i++) { - struct qreg dst; - - if (i == next_coord - 1) - dst = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUL); - else - dst = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMU); - - struct qinst *tmu = vir_MOV_dest(c, dst, coords[i]); - - if (i < 2) - tmu->uniform = texture_u[i]; - } - - vir_emit_thrsw(c); - - for (int i = 0; i < 4; i++) { - if (p1_unpacked.return_words_of_texture_data & (1 << i)) - ntq_store_dest(c, &instr->dest, i, vir_LDTMU(c)); - } -} diff --git a/src/broadcom/compiler/v3d33_vpm_setup.c b/src/broadcom/compiler/v3d33_vpm_setup.c deleted file mode 100644 index 8bce67dfae9..00000000000 --- a/src/broadcom/compiler/v3d33_vpm_setup.c +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright © 2016-2018 Broadcom - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#include "v3d_compiler.h" - -/* We don't do any address packing. */ -#define __gen_user_data void -#define __gen_address_type uint32_t -#define __gen_address_offset(reloc) (*reloc) -#define __gen_emit_reloc(cl, reloc) -#include "broadcom/cle/v3d_packet_v33_pack.h" - -void -v3d33_vir_vpm_read_setup(struct v3d_compile *c, int num_components) -{ - struct V3D33_VPM_GENERIC_BLOCK_READ_SETUP unpacked = { - V3D33_VPM_GENERIC_BLOCK_READ_SETUP_header, - - .horiz = true, - .laned = false, - /* If the field is 0, that means a read count of 32. 
 */
-                .num = num_components & 31,
-                .segs = true,
-                .stride = 1,
-                .size = VPM_SETUP_SIZE_32_BIT,
-                .addr = c->num_inputs,
-        };
-
-        uint32_t packed;
-        V3D33_VPM_GENERIC_BLOCK_READ_SETUP_pack(NULL,
-                                                (uint8_t *)&packed,
-                                                &unpacked);
-        vir_VPMSETUP(c, vir_uniform_ui(c, packed));
-}
-
-void
-v3d33_vir_vpm_write_setup(struct v3d_compile *c)
-{
-        uint32_t packed;
-        struct V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP unpacked = {
-                V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP_header,
-
-                .horiz = true,
-                .laned = false,
-                .segs = true,
-                .stride = 1,
-                .size = VPM_SETUP_SIZE_32_BIT,
-                .addr = 0,
-        };
-
-        V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP_pack(NULL,
-                                                 (uint8_t *)&packed,
-                                                 &unpacked);
-        vir_VPMSETUP(c, vir_uniform_ui(c, packed));
-}
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index 0c1419661d3..12aaacdc14a 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -31,6 +31,7 @@
 #include <stdint.h>
 #include <string.h>
 
+#include "util/blend.h"
 #include "util/macros.h"
 #include "common/v3d_debug.h"
 #include "common/v3d_device_info.h"
@@ -40,7 +41,6 @@
 #include "util/u_math.h"
 
 #include "qpu/qpu_instr.h"
-#include "pipe/p_state.h"
 
 /**
  * Maximum number of outstanding TMU operations we can queue for execution.
@@ -87,7 +87,7 @@ enum qfile {
         /** A physical register, such as the W coordinate payload. */
         QFILE_REG,
 
-        /** One of the regsiters for fixed function interactions. */
+        /** One of the registers for fixed function interactions. */
         QFILE_MAGIC,
 
         /**
@@ -97,12 +97,6 @@ enum qfile {
         QFILE_TEMP,
 
         /**
-         * VPM reads use this with an index value to say what part of the VPM
-         * is being read.
-         */
-        QFILE_VPM,
-
-        /**
          * Stores an immediate value in the index field that will be used
          * directly by qpu_load_imm().
          */
@@ -169,6 +163,19 @@ struct qinst {
          * otherwise.
          */
         int uniform;
+
+        /* If this is a TLB Z write */
+        bool is_tlb_z_write;
+
+        /* If this is a retiring TMU instruction (the last in a lookup sequence),
+         * how many ldtmu instructions are required to read the results.
+         */
+        uint32_t ldtmu_count;
+
+        /* Position of this instruction in the program. Filled in during
+         * register allocation.
+         */
+        int32_t ip;
 };
 
 enum quniform_contents {
@@ -330,6 +337,19 @@ enum quniform_contents {
          * Current value of gl_ViewIndex for Multiview rendering.
          */
         QUNIFORM_VIEW_INDEX,
+
+        /**
+         * Inline uniform buffers
+         */
+        QUNIFORM_INLINE_UBO_0,
+        QUNIFORM_INLINE_UBO_1,
+        QUNIFORM_INLINE_UBO_2,
+        QUNIFORM_INLINE_UBO_3,
+
+        /**
+         * Current value of DrawIndex for Multidraw
+         */
+        QUNIFORM_DRAW_ID,
 };
 
 static inline uint32_t v3d_unit_data_create(uint32_t unit, uint32_t value)
@@ -369,13 +389,7 @@ static inline uint8_t v3d_slot_get_component(struct v3d_varying_slot slot)
         return slot.slot_and_component & 3;
 }
 
-enum v3d_execution_environment {
-        V3D_ENVIRONMENT_OPENGL = 0,
-        V3D_ENVIRONMENT_VULKAN,
-};
-
 struct v3d_key {
-        void *shader_state;
         struct {
                 uint8_t swizzle[4];
         } tex[V3D_MAX_TEXTURE_SAMPLERS];
@@ -388,9 +402,9 @@ struct v3d_key {
         uint8_t num_samplers_used;
         uint8_t ucp_enables;
         bool is_last_geometry_stage;
-        bool robust_buffer_access;
-
-        enum v3d_execution_environment environment;
+        bool robust_uniform_access;
+        bool robust_storage_access;
+        bool robust_image_access;
 };
 
 struct v3d_fs_key {
@@ -400,7 +414,6 @@ struct v3d_fs_key {
         bool line_smoothing;
         bool point_coord_upper_left;
         bool msaa;
-        bool sample_coverage;
         bool sample_alpha_to_coverage;
         bool sample_alpha_to_one;
         /* Mask of which color render targets are present.
*/ @@ -419,14 +432,12 @@ struct v3d_fs_key { */ struct { enum pipe_format format; - const uint8_t *swizzle; + uint8_t swizzle[4]; } color_fmt[V3D_MAX_DRAW_BUFFERS]; - uint8_t logicop_func; + enum pipe_logicop logicop_func; uint32_t point_sprite_mask; - struct pipe_rt_blend_state blend; - /* If the fragment shader reads gl_PrimitiveID then we have 2 scenarios: * * - If there is a geometry shader, then gl_PrimitiveID must be written @@ -468,7 +479,7 @@ struct v3d_vs_key { bool clamp_color; }; -/** A basic block of VIR intructions. */ +/** A basic block of VIR instructions. */ struct qblock { struct list_head link; @@ -566,6 +577,7 @@ enum v3d_compilation_result { */ struct v3d_compiler { const struct v3d_device_info *devinfo; + uint32_t max_inline_uniform_buffers; struct ra_regs *regs; struct ra_class *reg_class_any[3]; struct ra_class *reg_class_r5[3]; @@ -584,6 +596,19 @@ struct v3d_interp_input { unsigned mode; /* interpolation mode */ }; +struct v3d_ra_node_info { + struct { + uint32_t priority; + uint8_t class_bits; + bool is_program_end; + bool unused; + + /* V3D 7.x */ + bool is_ldunif_dst; + } *info; + uint32_t alloc_count; +}; + struct v3d_compile { const struct v3d_device_info *devinfo; nir_shader *s; @@ -596,7 +621,7 @@ struct v3d_compile { void *debug_output_data; /** - * Mapping from nir_register * or nir_ssa_def * to array of struct + * Mapping from nir_register * or nir_def * to array of struct * qreg for the values. */ struct hash_table *def_ht; @@ -615,11 +640,12 @@ struct v3d_compile { uint32_t output_fifo_size; struct { - nir_dest *dest; + nir_def *def; uint8_t num_components; uint8_t component_mask; } flush[MAX_TMU_QUEUE_SIZE]; uint32_t flush_count; + uint32_t total_count; } tmu; /** @@ -652,16 +678,13 @@ struct v3d_compile { bool uses_center_w; bool writes_z; + bool writes_z_from_fep; + bool reads_z; bool uses_implicit_point_line_varyings; /* True if a fragment shader reads gl_PrimitiveID */ bool fs_uses_primitive_id; - /* If the fragment shader does anything that requires to force - * per-sample MSAA, such as reading gl_SampleID. - */ - bool force_per_sample_msaa; - /* Whether we are using the fallback scheduler. This will be set after * register allocation has failed once. */ @@ -681,6 +704,11 @@ struct v3d_compile { bool disable_constant_ubo_load_sorting; bool sorted_any_ubo_loads; + /* Moves UBO/SSBO loads right before their first user (nir_opt_move). + * This can reduce register pressure. + */ + bool move_buffer_loads; + /* Emits ldunif for each new uniform, even if the uniform was already * emitted in the same block. Useful to compile shaders with high * register pressure or to disable the optimization during uniform @@ -692,6 +720,19 @@ struct v3d_compile { bool disable_loop_unrolling; bool unrolled_any_loops; + /* Disables nir_opt_gcm to reduce register pressure. */ + bool disable_gcm; + + /* If calling nir_opt_gcm made any progress. Used to skip new rebuilds + * if possible + */ + bool gcm_progress; + + /* Disables scheduling of general TMU loads (and unfiltered image load). + */ + bool disable_general_tmu_sched; + bool has_general_tmu_load; + /* Minimum number of threads we are willing to use to register allocate * a shader with the current compilation strategy. This only prevents * us from lowering the thread count to register allocate successfully, @@ -705,7 +746,9 @@ struct v3d_compile { * strategies that can reduce register pressure and hopefully reduce or * eliminate TMU spills in the shader. 
*/ - bool tmu_spilling_allowed; + uint32_t max_tmu_spills; + + uint32_t compile_strategy_idx; /* The UBO index and block used with the last unifa load, as well as the * current unifa offset *after* emitting that load. This is used to skip @@ -715,6 +758,7 @@ struct v3d_compile { struct qblock *current_unifa_block; int32_t current_unifa_index; uint32_t current_unifa_offset; + bool current_unifa_is_ubo; /* State for whether we're executing on each channel currently. 0 if * yes, otherwise a block number + 1 that the channel jumped to. @@ -749,6 +793,11 @@ struct v3d_compile { struct qreg cs_shared_offset; int local_invocation_index_bits; + /* Starting value of the sample mask in a fragment shader. We use + * this to identify lanes that have been terminated/discarded. + */ + struct qreg start_msf; + /* If the shader uses subgroup functionality */ bool has_subgroups; @@ -761,14 +810,27 @@ struct v3d_compile { uint32_t spill_size; /* Shader-db stats */ uint32_t spills, fills, loops; + + /* Whether we are in the process of spilling registers for + * register allocation + */ + bool spilling; + /** * Register spilling's per-thread base address, shared between each - * spill/fill's addressing calculations. + * spill/fill's addressing calculations (also used for scratch + * access). */ struct qreg spill_base; + /* Bit vector of which temps may be spilled */ BITSET_WORD *spillable; + /* Used during register allocation */ + int thread_index; + struct v3d_ra_node_info nodes; + struct ra_graph *g; + /** * Array of the VARYING_SLOT_* of all FS QFILE_VARY reads. * @@ -799,11 +861,16 @@ struct v3d_compile { uint32_t uniform_array_size; uint32_t num_uniforms; uint32_t output_position_index; - nir_variable *output_color_var[4]; + nir_variable *output_color_var[V3D_MAX_DRAW_BUFFERS]; uint32_t output_sample_mask_index; struct qreg undef; uint32_t num_temps; + /* Number of temps in the program right before we spill a new temp. We + * use this to know which temps existed before a spill and which were + * added with the spill itself. + */ + uint32_t spill_start_num_temps; struct vir_cursor cursor; struct list_head blocks; @@ -848,12 +915,16 @@ struct v3d_compile { bool emitted_tlb_load; bool lock_scoreboard_on_first_thrsw; - /* Total number of spilled registers in the program */ - uint32_t spill_count; - enum v3d_compilation_result compilation_result; bool tmu_dirty_rcl; + bool has_global_address; + + /* If we have processed a discard/terminate instruction. This may + * cause some lanes to be inactive even during uniform control + * flow. + */ + bool emitted_discard; }; struct v3d_uniform_list { @@ -866,6 +937,13 @@ struct v3d_prog_data { struct v3d_uniform_list uniforms; uint32_t spill_size; + uint32_t tmu_spills; + uint32_t tmu_fills; + uint32_t tmu_count; + + uint32_t qpu_read_stalls; + + uint8_t compile_strategy_idx; uint8_t threads; @@ -877,6 +955,8 @@ struct v3d_prog_data { bool tmu_dirty_rcl; bool has_control_barrier; + + bool has_global_address; }; struct v3d_vs_prog_data { @@ -964,10 +1044,15 @@ struct v3d_fs_prog_data { uint8_t num_inputs; bool writes_z; + bool writes_z_from_fep; bool disable_ez; bool uses_center_w; bool uses_implicit_point_line_varyings; bool lock_scoreboard_on_first_thrsw; + + /* If the fragment shader does anything that requires to force + * per-sample MSAA, such as reading gl_SampleID. 
+ */ bool force_per_sample_msaa; }; @@ -998,6 +1083,10 @@ v3d_compute_vpm_config(struct v3d_device_info *devinfo, struct v3d_gs_prog_data *gs, struct vpm_config *vpm_cfg_bin, struct vpm_config *vpm_cfg); +void +v3d_pack_unnormalized_coordinates(struct v3d_device_info *devinfo, + uint32_t *p1_packed, + bool unnormalized_coordinates); static inline bool vir_has_uniform(struct qinst *inst) @@ -1005,7 +1094,8 @@ vir_has_uniform(struct qinst *inst) return inst->uniform != ~0; } -const struct v3d_compiler *v3d_compiler_init(const struct v3d_device_info *devinfo); +const struct v3d_compiler *v3d_compiler_init(const struct v3d_device_info *devinfo, + uint32_t max_inline_uniform_buffers); void v3d_compiler_free(const struct v3d_compiler *compiler); void v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s); @@ -1066,15 +1156,14 @@ bool vir_is_raw_mov(struct qinst *inst); bool vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst); bool vir_is_add(struct qinst *inst); bool vir_is_mul(struct qinst *inst); -bool vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst); -bool vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst); +bool vir_writes_r4_implicitly(const struct v3d_device_info *devinfo, struct qinst *inst); struct qreg vir_follow_movs(struct v3d_compile *c, struct qreg reg); uint8_t vir_channels_written(struct qinst *inst); struct qreg ntq_get_src(struct v3d_compile *c, nir_src src, int i); -void ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan, - struct qreg result); +void ntq_store_def(struct v3d_compile *c, nir_def *def, int chan, + struct qreg result); bool ntq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components); -void ntq_add_pending_tmu_flush(struct v3d_compile *c, nir_dest *dest, +void ntq_add_pending_tmu_flush(struct v3d_compile *c, nir_def *def, uint32_t component_mask); void ntq_flush_tmu(struct v3d_compile *c); void vir_emit_thrsw(struct v3d_compile *c); @@ -1095,32 +1184,27 @@ bool vir_opt_redundant_flags(struct v3d_compile *c); bool vir_opt_small_immediates(struct v3d_compile *c); bool vir_opt_vpm(struct v3d_compile *c); bool vir_opt_constant_alu(struct v3d_compile *c); -void v3d_nir_lower_blend(nir_shader *s, struct v3d_compile *c); -void v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c); -void v3d_nir_lower_line_smooth(nir_shader *shader); -void v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c); -void v3d_nir_lower_robust_buffer_access(nir_shader *shader, struct v3d_compile *c); -void v3d_nir_lower_scratch(nir_shader *s); -void v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c); -void v3d_nir_lower_image_load_store(nir_shader *s); -void vir_lower_uniforms(struct v3d_compile *c); - -void v3d33_vir_vpm_read_setup(struct v3d_compile *c, int num_components); -void v3d33_vir_vpm_write_setup(struct v3d_compile *c); -void v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr); -void v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr); -void v3d40_vir_emit_image_load_store(struct v3d_compile *c, - nir_intrinsic_instr *instr); +bool v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c); +bool v3d_nir_lower_line_smooth(nir_shader *shader); +bool v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c); +bool v3d_nir_lower_scratch(nir_shader *s); +bool v3d_nir_lower_txf_ms(nir_shader *s); +bool v3d_nir_lower_image_load_store(nir_shader *s, struct v3d_compile *c); +bool v3d_nir_lower_load_store_bitsize(nir_shader *s); + +void v3d_vir_emit_tex(struct 
v3d_compile *c, nir_tex_instr *instr); +void v3d_vir_emit_image_load_store(struct v3d_compile *c, + nir_intrinsic_instr *instr); void v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers); uint32_t v3d_qpu_schedule_instructions(struct v3d_compile *c); void qpu_validate(struct v3d_compile *c); -struct qpu_reg *v3d_register_allocate(struct v3d_compile *c, bool *spilled); +struct qpu_reg *v3d_register_allocate(struct v3d_compile *c); bool vir_init_reg_sets(struct v3d_compiler *compiler); int v3d_shaderdb_dump(struct v3d_compile *c, char **shaderdb_str); -bool v3d_gl_format_is_return_32(GLenum format); +bool v3d_gl_format_is_return_32(enum pipe_format format); uint32_t v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src); @@ -1220,28 +1304,35 @@ vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b) \ #define VIR_SFU(name) \ static inline struct qreg \ vir_##name(struct v3d_compile *c, struct qreg a) \ -{ \ - if (c->devinfo->ver >= 41) { \ - return vir_emit_def(c, vir_add_inst(V3D_QPU_A_##name, \ - c->undef, \ - a, c->undef)); \ - } else { \ - vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_##name), a); \ - return vir_FMOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); \ - } \ +{ \ + return vir_emit_def(c, vir_add_inst(V3D_QPU_A_##name, \ + c->undef, \ + a, c->undef)); \ } \ static inline struct qinst * \ vir_##name##_dest(struct v3d_compile *c, struct qreg dest, \ struct qreg a) \ { \ - if (c->devinfo->ver >= 41) { \ - return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_##name, \ - dest, \ - a, c->undef)); \ - } else { \ - vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_##name), a); \ - return vir_FMOV_dest(c, dest, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); \ - } \ + return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_##name, \ + dest, \ + a, c->undef)); \ +} + +#define VIR_SFU2(name) \ +static inline struct qreg \ +vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b) \ +{ \ + return vir_emit_def(c, vir_add_inst(V3D_QPU_A_##name, \ + c->undef, \ + a, b)); \ +} \ +static inline struct qinst * \ +vir_##name##_dest(struct v3d_compile *c, struct qreg dest, \ + struct qreg a, struct qreg b) \ +{ \ + return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_##name, \ + dest, \ + a, b)); \ } #define VIR_A_ALU2(name) VIR_ALU2(name, vir_add_inst, V3D_QPU_A_##name) @@ -1343,6 +1434,28 @@ VIR_SFU(LOG) VIR_SFU(SIN) VIR_SFU(RSQRT2) +VIR_SFU(BALLOT) +VIR_SFU(BCASTF) +VIR_SFU(ALLEQ) +VIR_SFU(ALLFEQ) +VIR_SFU2(ROTQ) +VIR_SFU2(ROT) +VIR_SFU2(SHUFFLE) + +VIR_A_ALU2(VPACK) +VIR_A_ALU2(V8PACK) +VIR_A_ALU2(V10PACK) +VIR_A_ALU2(V11FPACK) + +VIR_M_ALU1(FTOUNORM16) +VIR_M_ALU1(FTOSNORM16) + +VIR_M_ALU1(VFTOUNORM8) +VIR_M_ALU1(VFTOSNORM8) + +VIR_M_ALU1(VFTOUNORM10LO) +VIR_M_ALU1(VFTOUNORM10HI) + static inline struct qinst * vir_MOV_cond(struct v3d_compile *c, enum v3d_qpu_cond cond, struct qreg dest, struct qreg src) @@ -1372,16 +1485,11 @@ vir_NOP(struct v3d_compile *c) static inline struct qreg vir_LDTMU(struct v3d_compile *c) { - if (c->devinfo->ver >= 41) { - struct qinst *ldtmu = vir_add_inst(V3D_QPU_A_NOP, c->undef, - c->undef, c->undef); - ldtmu->qpu.sig.ldtmu = true; - - return vir_emit_def(c, ldtmu); - } else { - vir_NOP(c)->qpu.sig.ldtmu = true; - return vir_MOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); - } + struct qinst *ldtmu = vir_add_inst(V3D_QPU_A_NOP, c->undef, + c->undef, c->undef); + ldtmu->qpu.sig.ldtmu = true; + + return vir_emit_def(c, ldtmu); } static inline struct qreg @@ -1394,7 +1502,6 @@ vir_UMUL(struct v3d_compile *c, struct qreg src0, struct qreg 
 static inline struct qreg
@@ -1394,7 +1502,6 @@ vir_UMUL(struct v3d_compile *c, struct qreg src0, struct qreg src1)
 static inline struct qreg
 vir_TLBU_COLOR_READ(struct v3d_compile *c, uint32_t config)
 {
-        assert(c->devinfo->ver >= 41); /* XXX */
         assert((config & 0xffffff00) == 0xffffff00);
 
         struct qinst *ldtlb = vir_add_inst(V3D_QPU_A_NOP, c->undef,
@@ -1407,38 +1514,12 @@ vir_TLBU_COLOR_READ(struct v3d_compile *c, uint32_t config)
 static inline struct qreg
 vir_TLB_COLOR_READ(struct v3d_compile *c)
 {
-        assert(c->devinfo->ver >= 41); /* XXX */
-
         struct qinst *ldtlb = vir_add_inst(V3D_QPU_A_NOP, c->undef,
                                            c->undef, c->undef);
         ldtlb->qpu.sig.ldtlb = true;
         return vir_emit_def(c, ldtlb);
 }
 
-/*
-static inline struct qreg
-vir_LOAD_IMM(struct v3d_compile *c, uint32_t val)
-{
-        return vir_emit_def(c, vir_inst(QOP_LOAD_IMM, c->undef,
-                                        vir_reg(QFILE_LOAD_IMM, val), c->undef));
-}
-
-static inline struct qreg
-vir_LOAD_IMM_U2(struct v3d_compile *c, uint32_t val)
-{
-        return vir_emit_def(c, vir_inst(QOP_LOAD_IMM_U2, c->undef,
-                                        vir_reg(QFILE_LOAD_IMM, val),
-                                        c->undef));
-}
-
-static inline struct qreg
-vir_LOAD_IMM_I2(struct v3d_compile *c, uint32_t val)
-{
-        return vir_emit_def(c, vir_inst(QOP_LOAD_IMM_I2, c->undef,
-                                        vir_reg(QFILE_LOAD_IMM, val),
-                                        c->undef));
-}
-*/
-
 static inline struct qinst *
 vir_BRANCH(struct v3d_compile *c, enum v3d_qpu_branch_cond cond)
 {
diff --git a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
index 2706432d5ef..9a651bfc6a7 100644
--- a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
+++ b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
@@ -40,9 +40,20 @@
  * calculations and load/store using the TMU general memory access path.
  */
 
+static const unsigned bits_8[4] = {8, 8, 8, 8};
+static const unsigned bits_16[4] = {16, 16, 16, 16};
+static const unsigned bits_1010102[4] = {10, 10, 10, 2};
+
 bool
 v3d_gl_format_is_return_32(enum pipe_format format)
 {
+        /* We can get a NONE format in Vulkan because we support the
+         * shaderStorageImageReadWithoutFormat feature. We consider these to
+         * always use 32-bit precision.
+         */
+        if (format == PIPE_FORMAT_NONE)
+                return true;
+
         const struct util_format_description *desc =
                 util_format_description(format);
         const struct util_format_channel_description *chan = &desc->channel[0];
@@ -52,15 +63,17 @@ v3d_gl_format_is_return_32(enum pipe_format format)
 
 /* Packs a 32-bit vector of colors in the range [0, (1 << bits[i]) - 1] to a
  * 32-bit SSA value, with as many channels as necessary to store all the bits
+ *
+ * This is the generic helper, using only common nir operations.
  */
-static nir_ssa_def *
-pack_bits(nir_builder *b, nir_ssa_def *color, const unsigned *bits,
+static nir_def *
+pack_bits(nir_builder *b, nir_def *color, const unsigned *bits,
           int num_components, bool mask)
 {
-        nir_ssa_def *results[4];
+        nir_def *results[4];
         int offset = 0;
         for (int i = 0; i < num_components; i++) {
-                nir_ssa_def *chan = nir_channel(b, color, i);
+                nir_def *chan = nir_channel(b, color, i);
 
                 /* Channels being stored shouldn't cross a 32-bit boundary.
                  */
                 assert((offset & ~31) == ((offset + bits[i] - 1) & ~31));
@@ -84,10 +97,187 @@ pack_bits(nir_builder *b, nir_ssa_def *color, const unsigned *bits,
         return nir_vec(b, results, DIV_ROUND_UP(offset, 32));
 }
 
-static void
-v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
+/* Utility wrapper as half_2x16_split is mapped to vfpack, and sometimes it
+ * is just easier to read vfpack in the code, especially when using the PRM
+ * as a reference.
+ */
+static inline nir_def *
+nir_vfpack(nir_builder *b, nir_def *p1, nir_def *p2)
+{
+        return nir_pack_half_2x16_split(b, p1, p2);
+}
+
+static inline nir_def *
+pack_11f11f10f(nir_builder *b, nir_def *color)
+{
+        nir_def *p1 = nir_vfpack(b, nir_channel(b, color, 0),
+                                 nir_channel(b, color, 1));
+        nir_def *undef = nir_undef(b, 1, color->bit_size);
+        nir_def *p2 = nir_vfpack(b, nir_channel(b, color, 2), undef);
+
+        return nir_pack_32_to_r11g11b10_v3d(b, p1, p2);
+}
+
+static inline nir_def *
+pack_r10g10b10a2_uint(nir_builder *b, nir_def *color)
+{
+        nir_def *p1 = nir_pack_2x32_to_2x16_v3d(b, nir_channel(b, color, 0),
+                                                nir_channel(b, color, 1));
+        nir_def *p2 = nir_pack_2x32_to_2x16_v3d(b, nir_channel(b, color, 2),
+                                                nir_channel(b, color, 3));
+
+        return nir_pack_uint_32_to_r10g10b10a2_v3d(b, p1, p2);
+}
+
+static inline nir_def *
+pack_r10g10b10a2_unorm(nir_builder *b, nir_def *color)
+{
+        nir_def *p1 = nir_vfpack(b, nir_channel(b, color, 0),
+                                 nir_channel(b, color, 1));
+        p1 = nir_pack_2x16_to_unorm_2x10_v3d(b, p1);
+
+        nir_def *p2 = nir_vfpack(b, nir_channel(b, color, 2),
+                                 nir_channel(b, color, 3));
+        p2 = nir_pack_2x16_to_unorm_10_2_v3d(b, p2);
+
+        return nir_pack_uint_32_to_r10g10b10a2_v3d(b, p1, p2);
+}
+
+enum hw_conversion {
+        NONE,
+        TO_SNORM,
+        TO_UNORM
+};
+
+static inline nir_def *
+pack_8bit(nir_builder *b, nir_def *color,
+          unsigned num_components,
+          enum hw_conversion conversion)
+{
+        /* Note that usually you should not use this method (which relies on
+         * custom packing) for 1 component if we are not doing any
+         * conversion. But we also support that case, and let the caller
+         * decide which method to use.
+         */
+        nir_def *p1;
+        nir_def *p2;
+
+        if (conversion == NONE) {
+                p1 = nir_pack_2x32_to_2x16_v3d(b, nir_channel(b, color, 0),
+                                               nir_channel(b, color, num_components == 1 ? 0 : 1));
+        } else {
+                p1 = nir_vfpack(b, nir_channel(b, color, 0),
+                                nir_channel(b, color, num_components == 1 ? 0 : 1));
+                p1 = (conversion == TO_UNORM) ?
+                        nir_pack_2x16_to_unorm_2x8_v3d(b, p1) :
+                        nir_pack_2x16_to_snorm_2x8_v3d(b, p1);
+        }
+        if (num_components == 4) {
+                if (conversion == NONE) {
+                        p2 = nir_pack_2x32_to_2x16_v3d(b, nir_channel(b, color, 2),
+                                                       nir_channel(b, color, 3));
+                } else {
+                        p2 = nir_vfpack(b, nir_channel(b, color, 2),
+                                        nir_channel(b, color, 3));
+                        p2 = (conversion == TO_UNORM) ?
+                                nir_pack_2x16_to_unorm_2x8_v3d(b, p2) :
+                                nir_pack_2x16_to_snorm_2x8_v3d(b, p2);
+                }
+        } else {
+                /* Using an undef here would be more correct. But for this
+                 * case we are getting worse shader-db values with some CTS
+                 * tests, so we just reuse the first packing.
+                 */
+                p2 = p1;
+        }
+
+        return nir_pack_4x16_to_4x8_v3d(b, p1, p2);
+}
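As a plain-C sketch of the word layout these helpers produce for the common RGBA8 case (bits_8 = {8, 8, 8, 8}): the generic pack_bits() path and the custom pack_8bit() path both end up with each 8-bit channel at bit offset 8 * i of a single 32-bit word. The function below is illustrative only and is not part of the patch:

#include <stdint.h>

static uint32_t
pack_rgba8_example(uint8_t r, uint8_t g, uint8_t b, uint8_t a)
{
        /* Channel i lands at bit offset 8 * i, exactly the layout that
         * pack_bits(b, color, bits_8, 4, false) computes with shifts
         * and ORs. */
        return (uint32_t)r |
               ((uint32_t)g << 8) |
               ((uint32_t)b << 16) |
               ((uint32_t)a << 24);
}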
+
+static inline nir_def *
+pack_16bit(nir_builder *b, nir_def *color,
+           unsigned num_components,
+           enum hw_conversion conversion)
+{
+        nir_def *results[2] = {0};
+        nir_def *channels[4] = {0};
+
+        for (unsigned i = 0; i < num_components; i++) {
+                channels[i] = nir_channel(b, color, i);
+                switch (conversion) {
+                case TO_SNORM:
+                        channels[i] = nir_f2snorm_16_v3d(b, channels[i]);
+                        break;
+                case TO_UNORM:
+                        channels[i] = nir_f2unorm_16_v3d(b, channels[i]);
+                        break;
+                default:
+                        /* Note that usually you should not use this method
+                         * (which relies on custom packing) if we are not
+                         * doing any conversion. But we also support that
+                         * case, and let the caller decide which method to
+                         * use.
+                         */
+                        break;
+                }
+        }
+
+        switch (num_components) {
+        case 1:
+                results[0] = channels[0];
+                break;
+        case 4:
+                results[1] = nir_pack_2x32_to_2x16_v3d(b, channels[2], channels[3]);
+                FALLTHROUGH;
+        case 2:
+                results[0] = nir_pack_2x32_to_2x16_v3d(b, channels[0], channels[1]);
+                break;
+        default:
+                unreachable("Invalid number of components");
+        }
+
+        return nir_vec(b, results, DIV_ROUND_UP(num_components, 2));
+}
+
+static inline nir_def *
+pack_xbit(nir_builder *b, nir_def *color,
+          unsigned num_components,
+          const struct util_format_channel_description *r_chan)
+{
+        bool pack_mask = (r_chan->type == UTIL_FORMAT_TYPE_SIGNED);
+        enum hw_conversion conversion = NONE;
+        if (r_chan->normalized) {
+                conversion =
+                        (r_chan->type == UTIL_FORMAT_TYPE_UNSIGNED) ? TO_UNORM : TO_SNORM;
+        }
+
+        switch (r_chan->size) {
+        case 8:
+                if (conversion == NONE && num_components < 2)
+                        return pack_bits(b, color, bits_8, num_components, pack_mask);
+                else
+                        return pack_8bit(b, color, num_components, conversion);
+                break;
+        case 16:
+                /* pack_mask implies that the generic packing method would
+                 * need to include extra operations to handle negative values,
+                 * so in that case, even without a conversion, it is better to
+                 * use the packing based on custom hw operations.
+ */ + if (conversion == NONE && !pack_mask) + return pack_bits(b, color, bits_16, num_components, pack_mask); + else + return pack_16bit(b, color, num_components, conversion); + break; + default: + unreachable("unrecognized bits"); + } +} + +static bool +v3d_nir_lower_image_store_v42(nir_builder *b, nir_intrinsic_instr *instr) { enum pipe_format format = nir_intrinsic_format(instr); + assert(format != PIPE_FORMAT_NONE); const struct util_format_description *desc = util_format_description(format); const struct util_format_channel_description *r_chan = &desc->channel[0]; @@ -95,10 +285,10 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr) b->cursor = nir_before_instr(&instr->instr); - nir_ssa_def *color = nir_channels(b, - nir_ssa_for_src(b, instr->src[3], 4), - (1 << num_components) - 1); - nir_ssa_def *formatted = NULL; + nir_def *color = nir_trim_vector(b, + instr->src[3].ssa, + num_components); + nir_def *formatted = NULL; if (format == PIPE_FORMAT_R11G11B10_FLOAT) { formatted = nir_format_pack_11f11f10f(b, color); @@ -110,9 +300,6 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr) */ formatted = color; } else { - static const unsigned bits_8[4] = {8, 8, 8, 8}; - static const unsigned bits_16[4] = {16, 16, 16, 16}; - static const unsigned bits_1010102[4] = {10, 10, 10, 2}; const unsigned *bits; switch (r_chan->size) { @@ -132,11 +319,13 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr) bool pack_mask = false; if (r_chan->pure_integer && r_chan->type == UTIL_FORMAT_TYPE_SIGNED) { - formatted = nir_format_clamp_sint(b, color, bits); + /* We don't need to do any conversion or clamping in this case */ + formatted = color; pack_mask = true; } else if (r_chan->pure_integer && r_chan->type == UTIL_FORMAT_TYPE_UNSIGNED) { - formatted = nir_format_clamp_uint(b, color, bits); + /* We don't need to do any conversion or clamping in this case */ + formatted = color; } else if (r_chan->normalized && r_chan->type == UTIL_FORMAT_TYPE_SIGNED) { formatted = nir_format_float_to_snorm(b, color, bits); @@ -154,75 +343,116 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr) pack_mask); } - nir_instr_rewrite_src(&instr->instr, &instr->src[3], - nir_src_for_ssa(formatted)); + nir_src_rewrite(&instr->src[3], formatted); instr->num_components = formatted->num_components; + + return true; } -static void + +static bool +v3d_nir_lower_image_store_v71(nir_builder *b, nir_intrinsic_instr *instr) +{ + enum pipe_format format = nir_intrinsic_format(instr); + assert(format != PIPE_FORMAT_NONE); + const struct util_format_description *desc = + util_format_description(format); + const struct util_format_channel_description *r_chan = &desc->channel[0]; + unsigned num_components = util_format_get_nr_components(format); + b->cursor = nir_before_instr(&instr->instr); + + nir_def *color = + nir_trim_vector(b, instr->src[3].ssa, num_components); + nir_def *formatted = NULL; + if (format == PIPE_FORMAT_R9G9B9E5_FLOAT) { + formatted = nir_format_pack_r9g9b9e5(b, color); + } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) { + formatted = pack_11f11f10f(b, color); + } else if (format == PIPE_FORMAT_R10G10B10A2_UINT) { + formatted = pack_r10g10b10a2_uint(b, color); + } else if (format == PIPE_FORMAT_R10G10B10A2_UNORM) { + formatted = pack_r10g10b10a2_unorm(b, color); + } else if (r_chan->size == 32) { + /* For 32-bit formats, we just have to move the vector + * across (possibly reducing the number of channels). 
+ */ + formatted = color; + } else if (r_chan->type == UTIL_FORMAT_TYPE_FLOAT) { + assert(r_chan->size == 16); + formatted = nir_format_float_to_half(b, color); + formatted = pack_bits(b, formatted, bits_16, num_components, + false); + } else { + assert(r_chan->size == 8 || r_chan->size == 16); + formatted = pack_xbit(b, color, num_components, r_chan); + } + + nir_src_rewrite(&instr->src[3], formatted); + instr->num_components = formatted->num_components; + + return true; +} + +static bool v3d_nir_lower_image_load(nir_builder *b, nir_intrinsic_instr *instr) { static const unsigned bits16[] = {16, 16, 16, 16}; enum pipe_format format = nir_intrinsic_format(instr); if (v3d_gl_format_is_return_32(format)) - return; + return false; b->cursor = nir_after_instr(&instr->instr); - assert(instr->dest.is_ssa); - nir_ssa_def *result = &instr->dest.ssa; + nir_def *result = &instr->def; if (util_format_is_pure_uint(format)) { result = nir_format_unpack_uint(b, result, bits16, 4); } else if (util_format_is_pure_sint(format)) { result = nir_format_unpack_sint(b, result, bits16, 4); } else { - nir_ssa_def *rg = nir_channel(b, result, 0); - nir_ssa_def *ba = nir_channel(b, result, 1); - result = nir_vec4(b, - nir_unpack_half_2x16_split_x(b, rg), - nir_unpack_half_2x16_split_y(b, rg), - nir_unpack_half_2x16_split_x(b, ba), - nir_unpack_half_2x16_split_y(b, ba)); + nir_def *rg = nir_channel(b, result, 0); + nir_def *ba = nir_channel(b, result, 1); + result = nir_vec4(b, + nir_unpack_half_2x16_split_x(b, rg), + nir_unpack_half_2x16_split_y(b, rg), + nir_unpack_half_2x16_split_x(b, ba), + nir_unpack_half_2x16_split_y(b, ba)); } - nir_ssa_def_rewrite_uses_after(&instr->dest.ssa, result, + nir_def_rewrite_uses_after(&instr->def, result, result->parent_instr); + + return true; } -void -v3d_nir_lower_image_load_store(nir_shader *s) +static bool +v3d_nir_lower_image_load_store_cb(nir_builder *b, + nir_intrinsic_instr *intr, + void *_state) { - nir_foreach_function(function, s) { - if (!function->impl) - continue; - - nir_builder b; - nir_builder_init(&b, function->impl); - - nir_foreach_block(block, function->impl) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intr = - nir_instr_as_intrinsic(instr); - - switch (intr->intrinsic) { - case nir_intrinsic_image_load: - v3d_nir_lower_image_load(&b, intr); - break; - case nir_intrinsic_image_store: - v3d_nir_lower_image_store(&b, intr); - break; - default: - break; - } - } - } + struct v3d_compile *c = (struct v3d_compile *) _state; - nir_metadata_preserve(function->impl, - nir_metadata_block_index | - nir_metadata_dominance); + switch (intr->intrinsic) { + case nir_intrinsic_image_load: + return v3d_nir_lower_image_load(b, intr); + case nir_intrinsic_image_store: + if (c->devinfo->ver >= 71) + return v3d_nir_lower_image_store_v71(b, intr); + else + return v3d_nir_lower_image_store_v42(b, intr); + break; + default: + return false; } + + return false; +} + +bool +v3d_nir_lower_image_load_store(nir_shader *s, struct v3d_compile *c) +{ + return nir_shader_intrinsics_pass(s, + v3d_nir_lower_image_load_store_cb, + nir_metadata_block_index | + nir_metadata_dominance, c); } diff --git a/src/broadcom/compiler/v3d_nir_lower_io.c b/src/broadcom/compiler/v3d_nir_lower_io.c index 895b1a39163..55e2e4f2e11 100644 --- a/src/broadcom/compiler/v3d_nir_lower_io.c +++ b/src/broadcom/compiler/v3d_nir_lower_io.c @@ -24,8 +24,6 @@ #include "compiler/v3d_compiler.h" #include "compiler/nir/nir_builder.h" -#include 
"util/u_helpers.h" - /** * Walks the NIR generated by TGSI-to-NIR or GLSL-to-NIR to lower its io * intrinsics into something amenable to the V3D architecture. @@ -64,7 +62,7 @@ struct v3d_nir_lower_io_state { BITSET_WORD varyings_stored[BITSET_WORDS(V3D_MAX_ANY_STAGE_INPUTS)]; - nir_ssa_def *pos[4]; + nir_def *pos[4]; }; static void @@ -72,8 +70,8 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b, struct v3d_nir_lower_io_state *state); static void -v3d_nir_store_output(nir_builder *b, int base, nir_ssa_def *offset, - nir_ssa_def *chan) +v3d_nir_store_output(nir_builder *b, int base, nir_def *offset, + nir_def *chan) { if (offset) { /* When generating the VIR instruction, the base and the offset @@ -90,29 +88,6 @@ v3d_nir_store_output(nir_builder *b, int base, nir_ssa_def *offset, nir_store_output(b, chan, offset, .base = base, .write_mask = 0x1, .component = 0); } -/* Convert the uniform offset to bytes. If it happens to be a constant, - * constant-folding will clean up the shift for us. - */ -static void -v3d_nir_lower_uniform(struct v3d_compile *c, nir_builder *b, - nir_intrinsic_instr *intr) -{ - /* On SPIR-V/Vulkan we are already getting our offsets in - * bytes. - */ - if (c->key->environment == V3D_ENVIRONMENT_VULKAN) - return; - - b->cursor = nir_before_instr(&intr->instr); - - nir_intrinsic_set_base(intr, nir_intrinsic_base(intr) * 16); - - nir_instr_rewrite_src(&intr->instr, - &intr->src[0], - nir_src_for_ssa(nir_ishl(b, intr->src[0].ssa, - nir_imm_int(b, 4)))); -} - static int v3d_varying_slot_vpm_offset(struct v3d_compile *c, unsigned location, unsigned component) { @@ -159,14 +134,13 @@ v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b, /* If this is a geometry shader we need to emit our outputs * to the current vertex offset in the VPM. */ - nir_ssa_def *offset_reg = + nir_def *offset_reg = c->s->info.stage == MESA_SHADER_GEOMETRY ? nir_load_var(b, state->gs.output_offset_var) : NULL; int start_comp = nir_intrinsic_component(intr); unsigned location = nir_intrinsic_io_semantics(intr).location; - nir_ssa_def *src = nir_ssa_for_src(b, intr->src[0], - intr->num_components); + nir_def *src = intr->src[0].ssa; /* Save off the components of the position for the setup of VPM inputs * read by fixed function HW. */ @@ -184,8 +158,8 @@ v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b, if (location == VARYING_SLOT_LAYER) { assert(c->s->info.stage == MESA_SHADER_GEOMETRY); - nir_ssa_def *header = nir_load_var(b, state->gs.header_var); - header = nir_iand(b, header, nir_imm_int(b, 0xff00ffff)); + nir_def *header = nir_load_var(b, state->gs.header_var); + header = nir_iand_imm(b, header, 0xff00ffff); /* From the GLES 3.2 spec: * @@ -205,24 +179,26 @@ v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b, * to 0 in that case (we always allocate tile state for at * least one layer). */ - nir_ssa_def *fb_layers = nir_load_fb_layers_v3d(b, 32); - nir_ssa_def *cond = nir_ige(b, src, fb_layers); - nir_ssa_def *layer_id = + nir_def *fb_layers = nir_load_fb_layers_v3d(b, 32); + nir_def *cond = nir_ige(b, src, fb_layers); + nir_def *layer_id = nir_bcsel(b, cond, nir_imm_int(b, 0), - nir_ishl(b, src, nir_imm_int(b, 16))); + nir_ishl_imm(b, src, 16)); header = nir_ior(b, header, layer_id); nir_store_var(b, state->gs.header_var, header, 0x1); } /* Scalarize outputs if it hasn't happened already, since we want to - * schedule each VPM write individually. We can skip any outut + * schedule each VPM write individually. 
We can skip any output * components not read by the FS. */ for (int i = 0; i < intr->num_components; i++) { int vpm_offset = v3d_varying_slot_vpm_offset(c, location, start_comp + i); + if (!(nir_intrinsic_write_mask(intr) & (1 << i))) + continue; if (vpm_offset == -1) continue; @@ -261,9 +237,9 @@ v3d_nir_lower_emit_vertex(struct v3d_compile *c, nir_builder *b, { b->cursor = nir_before_instr(&instr->instr); - nir_ssa_def *header = nir_load_var(b, state->gs.header_var); - nir_ssa_def *header_offset = nir_load_var(b, state->gs.header_offset_var); - nir_ssa_def *output_offset = nir_load_var(b, state->gs.output_offset_var); + nir_def *header = nir_load_var(b, state->gs.header_var); + nir_def *header_offset = nir_load_var(b, state->gs.header_offset_var); + nir_def *output_offset = nir_load_var(b, state->gs.output_offset_var); /* Emit fixed function outputs */ v3d_nir_emit_ff_vpm_outputs(c, b, state); @@ -273,13 +249,13 @@ v3d_nir_lower_emit_vertex(struct v3d_compile *c, nir_builder *b, /* Update VPM offset for next vertex output data and header */ output_offset = - nir_iadd(b, output_offset, - nir_imm_int(b, state->gs.output_vertex_data_size)); + nir_iadd_imm(b, output_offset, + state->gs.output_vertex_data_size); - header_offset = nir_iadd(b, header_offset, nir_imm_int(b, 1)); + header_offset = nir_iadd_imm(b, header_offset, 1); /* Reset the New Primitive bit */ - header = nir_iand(b, header, nir_imm_int(b, 0xfffffffe)); + header = nir_iand_imm(b, header, 0xfffffffe); nir_store_var(b, state->gs.output_offset_var, output_offset, 0x1); nir_store_var(b, state->gs.header_offset_var, header_offset, 0x1); @@ -304,7 +280,7 @@ v3d_nir_lower_end_primitive(struct v3d_compile *c, nir_builder *b, * doesn't provide means to do that, so we need to apply the swizzle in the * vertex shader. * - * This is required at least in Vulkan to support madatory vertex attribute + * This is required at least in Vulkan to support mandatory vertex attribute * format VK_FORMAT_B8G8R8A8_UNORM. */ static void @@ -327,59 +303,6 @@ v3d_nir_lower_vertex_input(struct v3d_compile *c, nir_builder *b, nir_intrinsic_set_component(instr, (comp + 2) % 4); } -/* Sometimes the origin of gl_PointCoord is in the upper left rather than the - * lower left so we need to flip it. - * - * This is needed for Vulkan, Gallium uses lower_wpos_pntc. 
- */ -static void -v3d_nir_lower_fragment_input(struct v3d_compile *c, nir_builder *b, - nir_intrinsic_instr *intr) -{ - assert(c->s->info.stage == MESA_SHADER_FRAGMENT); - - /* Gallium uses lower_wpos_pntc */ - if (c->key->environment == V3D_ENVIRONMENT_OPENGL) - return; - - b->cursor = nir_after_instr(&intr->instr); - - int comp = nir_intrinsic_component(intr); - - nir_variable *input_var = - nir_find_variable_with_driver_location(c->s, - nir_var_shader_in, - nir_intrinsic_base(intr)); - - if (input_var && util_varying_is_point_coord(input_var->data.location, - c->fs_key->point_sprite_mask)) { - assert(intr->num_components == 1); - - nir_ssa_def *result = &intr->dest.ssa; - - switch (comp) { - case 0: - case 1: - if (!c->fs_key->is_points) - result = nir_imm_float(b, 0.0); - break; - case 2: - result = nir_imm_float(b, 0.0); - break; - case 3: - result = nir_imm_float(b, 1.0); - break; - } - if (c->fs_key->point_coord_upper_left && comp == 1) - result = nir_fsub(b, nir_imm_float(b, 1.0), result); - if (result != &intr->dest.ssa) { - nir_ssa_def_rewrite_uses_after(&intr->dest.ssa, - result, - result->parent_instr); - } - } -} - static void v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b, struct nir_instr *instr, @@ -393,12 +316,6 @@ v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b, case nir_intrinsic_load_input: if (c->s->info.stage == MESA_SHADER_VERTEX) v3d_nir_lower_vertex_input(c, b, intr); - else if (c->s->info.stage == MESA_SHADER_FRAGMENT) - v3d_nir_lower_fragment_input(c, b, intr); - break; - - case nir_intrinsic_load_uniform: - v3d_nir_lower_uniform(c, b, intr); break; case nir_intrinsic_store_output: @@ -558,16 +475,16 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b, /* If this is a geometry shader we need to emit our fixed function * outputs to the current vertex offset in the VPM. */ - nir_ssa_def *offset_reg = + nir_def *offset_reg = c->s->info.stage == MESA_SHADER_GEOMETRY ? nir_load_var(b, state->gs.output_offset_var) : NULL; for (int i = 0; i < 4; i++) { if (!state->pos[i]) - state->pos[i] = nir_ssa_undef(b, 1, 32); + state->pos[i] = nir_undef(b, 1, 32); } - nir_ssa_def *rcp_wc = nir_frcp(b, state->pos[3]); + nir_def *rcp_wc = nir_frcp(b, state->pos[3]); if (state->pos_vpm_offset != -1) { for (int i = 0; i < 4; i++) { @@ -578,8 +495,8 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b, if (state->vp_vpm_offset != -1) { for (int i = 0; i < 2; i++) { - nir_ssa_def *pos; - nir_ssa_def *scale; + nir_def *pos; + nir_def *scale; pos = state->pos[i]; if (i == 0) scale = nir_load_viewport_x_scale(b); @@ -598,14 +515,18 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b, * The correct fix for this as recommended by Broadcom * is to convert to .8 fixed-point with ffloor(). 
                          */
-                        pos = nir_f2i32(b, nir_ffloor(b, pos));
-                        v3d_nir_store_output(b, state->vp_vpm_offset + i,
-                                             offset_reg, pos);
+                        if (c->devinfo->ver == 42)
+                                pos = nir_f2i32(b, nir_ffloor(b, pos));
+                        else
+                                pos = nir_f2i32(b, nir_fround_even(b, pos));
+
+                        v3d_nir_store_output(b, state->vp_vpm_offset + i,
+                                             offset_reg, pos);
                 }
         }
 
         if (state->zs_vpm_offset != -1) {
-                nir_ssa_def *z = state->pos[2];
+                nir_def *z = state->pos[2];
                 z = nir_fmul(b, z, nir_load_viewport_z_scale(b));
                 z = nir_fmul(b, z, rcp_wc);
                 z = nir_fadd(b, z, nir_load_viewport_z_offset(b));
@@ -679,21 +600,22 @@ emit_gs_vpm_output_header_prolog(struct v3d_compile *c, nir_builder *b,
          * have a variable just to keep track of the number of vertices we
          * emitted and instead we can just compute it here from the header
          * offset variable by removing the one generic header slot that always
-         * goes at the begining of out header.
+         * goes at the beginning of our header.
          */
-        nir_ssa_def *header_offset =
+        nir_def *header_offset =
                 nir_load_var(b, state->gs.header_offset_var);
-        nir_ssa_def *vertex_count =
-                nir_isub(b, header_offset, nir_imm_int(b, 1));
-        nir_ssa_def *header =
-                nir_ior(b, nir_imm_int(b, state->gs.output_header_size),
-                        nir_ishl(b, vertex_count,
-                                 nir_imm_int(b, VERTEX_COUNT_OFFSET)));
+        nir_def *vertex_count =
+                nir_iadd_imm(b, header_offset, -1);
+        nir_def *header =
+                nir_ior_imm(b,
+                            nir_ishl_imm(b, vertex_count,
+                                         VERTEX_COUNT_OFFSET),
+                            state->gs.output_header_size);
 
         v3d_nir_store_output(b, 0, NULL, header);
 }
 
-void
+bool
 v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c)
 {
         struct v3d_nir_lower_io_state state = { 0 };
@@ -713,36 +635,39 @@ v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c)
                 unreachable("Unsupported shader stage");
         }
 
-        nir_foreach_function(function, s) {
-                if (function->impl) {
-                        nir_builder b;
-                        nir_builder_init(&b, function->impl);
-
-                        if (c->s->info.stage == MESA_SHADER_GEOMETRY)
-                                emit_gs_prolog(c, &b, function->impl, &state);
-
-                        nir_foreach_block(block, function->impl) {
-                                nir_foreach_instr_safe(instr, block)
-                                        v3d_nir_lower_io_instr(c, &b, instr,
-                                                               &state);
-                        }
-
-                        nir_block *last = nir_impl_last_block(function->impl);
-                        b.cursor = nir_after_block(last);
-                        if (s->info.stage == MESA_SHADER_VERTEX) {
-                                v3d_nir_emit_ff_vpm_outputs(c, &b, &state);
-                        } else if (s->info.stage == MESA_SHADER_GEOMETRY) {
-                                emit_gs_vpm_output_header_prolog(c, &b, &state);
-                        }
-
-                        nir_metadata_preserve(function->impl,
-                                              nir_metadata_block_index |
-                                              nir_metadata_dominance);
+        nir_foreach_function_impl(impl, s) {
+                nir_builder b = nir_builder_create(impl);
+
+                if (c->s->info.stage == MESA_SHADER_GEOMETRY)
+                        emit_gs_prolog(c, &b, impl, &state);
+
+                nir_foreach_block(block, impl) {
+                        nir_foreach_instr_safe(instr, block)
+                                v3d_nir_lower_io_instr(c, &b, instr,
+                                                       &state);
                 }
+
+                nir_block *last = nir_impl_last_block(impl);
+                b.cursor = nir_after_block(last);
+                if (s->info.stage == MESA_SHADER_VERTEX) {
+                        v3d_nir_emit_ff_vpm_outputs(c, &b, &state);
+                } else if (s->info.stage == MESA_SHADER_GEOMETRY) {
+                        emit_gs_vpm_output_header_prolog(c, &b, &state);
+                }
+
+                nir_metadata_preserve(impl,
+                                      nir_metadata_block_index |
+                                      nir_metadata_dominance);
         }
 
         if (s->info.stage == MESA_SHADER_VERTEX ||
             s->info.stage == MESA_SHADER_GEOMETRY) {
                 v3d_nir_lower_io_update_output_var_base(c, &state);
         }
+
+        /* It is really unlikely that we don't get progress here, and fully
+         * filtering when not would make code more complex, but we are still
+         * interested in getting this lowering going through NIR_PASS.
+         */
+        return true;
 }
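To make the GS header layout built right above concrete, here is a hypothetical scalar equivalent of what emit_gs_vpm_output_header_prolog() now emits with nir_iadd_imm/nir_ishl_imm/nir_ior_imm (function and parameter names invented for the illustration):

static uint32_t
gs_prolog_header_example(uint32_t header_offset,
                         uint32_t output_header_size,
                         unsigned vertex_count_offset)
{
        /* One generic slot always precedes the per-vertex headers, so the
         * number of emitted vertices is header_offset - 1. */
        uint32_t vertex_count = header_offset - 1;

        /* The vertex count is packed above the header-size field. */
        return (vertex_count << vertex_count_offset) | output_header_size;
}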
diff --git a/src/broadcom/compiler/v3d_nir_lower_line_smooth.c b/src/broadcom/compiler/v3d_nir_lower_line_smooth.c
index 8f6e7d4e648..05b5224bc52 100644
--- a/src/broadcom/compiler/v3d_nir_lower_line_smooth.c
+++ b/src/broadcom/compiler/v3d_nir_lower_line_smooth.c
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2020 Raspberry Pi
+ * Copyright © 2020 Raspberry Pi Ltd
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -42,25 +42,23 @@ lower_line_smooth_intrinsic(struct lower_line_smooth_state *state,
 {
         b->cursor = nir_before_instr(&intr->instr);
 
-        nir_ssa_def *one = nir_imm_float(b, 1.0f);
+        nir_def *one = nir_imm_float(b, 1.0f);
 
-        nir_ssa_def *coverage = nir_load_var(b, state->coverage);
+        nir_def *coverage = nir_load_var(b, state->coverage);
 
-        nir_ssa_def *new_val = nir_fmul(b, nir_vec4(b, one, one, one, coverage),
+        nir_def *new_val = nir_fmul(b, nir_vec4(b, one, one, one, coverage),
                                         intr->src[0].ssa);
 
-        nir_instr_rewrite_src(&intr->instr,
-                              &intr->src[0],
-                              nir_src_for_ssa(new_val));
+        nir_src_rewrite(&intr->src[0], new_val);
 }
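In scalar terms, the rewrite performed by lower_line_smooth_intrinsic() above multiplies the stored color by (1, 1, 1, coverage), i.e. only alpha is attenuated. A hypothetical C model of that, not driver code:

struct vec4_example { float r, g, b, a; };

static struct vec4_example
apply_line_coverage_example(struct vec4_example color, float coverage)
{
        /* Matches nir_fmul(b, nir_vec4(b, one, one, one, coverage), src). */
        color.a *= coverage;
        return color;
}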
 
-static void
+static bool
 lower_line_smooth_func(struct lower_line_smooth_state *state,
                        nir_function_impl *impl)
 {
-        nir_builder b;
+        bool progress = false;
 
-        nir_builder_init(&b, impl);
+        nir_builder b = nir_builder_create(impl);
 
         nir_foreach_block(block, impl) {
                 nir_foreach_instr_safe(instr, block) {
@@ -72,58 +70,66 @@ lower_line_smooth_func(struct lower_line_smooth_state *state,
 
                         if (intr->intrinsic != nir_intrinsic_store_output ||
                             nir_intrinsic_base(intr) != 0 ||
-                            intr->num_components != 4 ||
-                            !intr->src[0].is_ssa)
+                            intr->num_components != 4)
                                 continue;
 
                         lower_line_smooth_intrinsic(state, &b, intr);
+                        progress = true;
                 }
         }
+
+        return progress;
 }
 
 static void
 initialise_coverage_var(struct lower_line_smooth_state *state,
                         nir_function_impl *impl)
 {
-        nir_builder b;
-
-        nir_builder_init(&b, impl);
+        nir_builder b = nir_builder_at(nir_before_impl(impl));
 
-        b.cursor = nir_before_block(nir_start_block(impl));
+        nir_def *line_width = nir_load_line_width(&b);
 
-        nir_ssa_def *line_width = nir_load_line_width(&b);
+        nir_def *real_line_width = nir_load_aa_line_width(&b);
 
-        nir_ssa_def *real_line_width = nir_load_aa_line_width(&b);
-
-        /* The line coord varies from 0.0 to 1.0 across the width of the line */
-        nir_ssa_def *line_coord = nir_load_line_coord(&b);
+        /* According to the PRM, the line coord varies from 0.0 to 1.0 across
+         * the width of the line. But actually, when a perspective projection
+         * is used, it is also applied to the line coords, so the values end
+         * up being between [min_coord, 1], based on the Wc coordinate. We
+         * need to re-map the values to be between [0.0, 1.0].
+         */
+        nir_def *line_coord = nir_load_line_coord(&b);
+        nir_def *wc = nir_load_fep_w_v3d(&b, 32);
+        nir_def *min_coord_val = nir_fsub(&b, nir_imm_float(&b, 1.0f), wc);
+        nir_def *normalized_line_coord = nir_fdiv(&b,
+                nir_fsub(&b, line_coord, min_coord_val),
+                nir_fsub_imm(&b, 1.0, min_coord_val));
 
         /* fabs(line_coord - 0.5) * real_line_width */
-        nir_ssa_def *pixels_from_center =
+        nir_def *pixels_from_center =
                 nir_fmul(&b, real_line_width,
-                         nir_fabs(&b, nir_fsub(&b, line_coord,
+                         nir_fabs(&b, nir_fsub(&b, normalized_line_coord,
                                                nir_imm_float(&b, 0.5f))));
 
         /* 0.5 - 1/√2 * (pixels_from_center - line_width * 0.5) */
-        nir_ssa_def *coverage =
+        nir_def *coverage =
                 nir_fsub(&b,
                          nir_imm_float(&b, 0.5f),
                          nir_fmul(&b,
                                   nir_imm_float(&b, 1.0f / M_SQRT2),
                                   nir_fsub(&b, pixels_from_center,
-                                           nir_fmul(&b,
-                                                    line_width,
-                                                    nir_imm_float(&b, 0.5f)))));
+                                           nir_fmul_imm(&b,
+                                                        line_width,
+                                                        0.5f))));
 
         /* Discard fragments that aren’t covered at all by the line */
-        nir_ssa_def *outside = nir_fge(&b, nir_imm_float(&b, 0.0f), coverage);
+        nir_def *outside = nir_fle_imm(&b, coverage, 0.0f);
         nir_discard_if(&b, outside);
 
         /* Clamp to at most 1.0. If it was less than 0.0 then the fragment will
          * be discarded so we don’t need to handle that.
         */
-        nir_ssa_def *clamped = nir_fmin(&b, coverage, nir_imm_float(&b, 1.0f));
+        nir_def *clamped = nir_fmin(&b, coverage, nir_imm_float(&b, 1.0f));
 
         nir_store_var(&b, state->coverage, clamped, 0x1 /* writemask */);
 }
@@ -140,9 +146,11 @@ make_coverage_var(nir_shader *s)
         return var;
 }
 
-void
+bool
 v3d_nir_lower_line_smooth(nir_shader *s)
 {
+        bool progress = false;
+
         assert(s->info.stage == MESA_SHADER_FRAGMENT);
 
         struct lower_line_smooth_state state = {
@@ -150,10 +158,20 @@ v3d_nir_lower_line_smooth(nir_shader *s)
                 .coverage = make_coverage_var(s),
         };
 
-        nir_foreach_function(function, s) {
+        nir_foreach_function_with_impl(function, impl, s) {
                 if (function->is_entrypoint)
-                        initialise_coverage_var(&state, function->impl);
+                        initialise_coverage_var(&state, impl);
+
+                progress |= lower_line_smooth_func(&state, impl);
 
-                lower_line_smooth_func(&state, function->impl);
+                if (progress) {
+                        nir_metadata_preserve(impl,
+                                              nir_metadata_block_index |
+                                              nir_metadata_dominance);
+                } else {
+                        nir_metadata_preserve(impl, nir_metadata_all);
+                }
         }
+
+        return progress;
+}
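A hypothetical scalar model of the coverage computation assembled in initialise_coverage_var() above; the NIR version additionally discards the fragment when the result is not positive:

#include <math.h>

static float
line_coverage_example(float line_coord, float wc,
                      float line_width, float real_line_width)
{
        /* Re-map the perspective-projected line coord from
         * [1 - wc, 1] back to [0, 1]. */
        float min_coord = 1.0f - wc;
        float t = (line_coord - min_coord) / (1.0f - min_coord);

        float pixels_from_center = real_line_width * fabsf(t - 0.5f);

        /* 0.5 - 1/sqrt(2) * (pixels_from_center - line_width * 0.5) */
        float coverage = 0.5f - (float)M_SQRT1_2 *
                (pixels_from_center - line_width * 0.5f);

        /* coverage <= 0 would be discarded; clamp the rest to 1.0. */
        return coverage > 1.0f ? 1.0f : coverage;
}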
diff --git a/src/broadcom/compiler/v3d_nir_lower_load_store_bitsize.c b/src/broadcom/compiler/v3d_nir_lower_load_store_bitsize.c
new file mode 100644
index 00000000000..0caf5dbc92c
--- /dev/null
+++ b/src/broadcom/compiler/v3d_nir_lower_load_store_bitsize.c
@@ -0,0 +1,260 @@
+/*
+ * Copyright © 2021 Raspberry Pi Ltd
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "compiler/v3d_compiler.h"
+#include "compiler/nir/nir_builder.h"
+
+/**
+ * The V3D TMU unit can only do 32-bit general vector access, so for anything
+ * else we need to split vector load/store instructions into scalar ones.
+ *
+ * Note that a vectorization pass after this lowering may be able to
+ * re-vectorize some of these using 32-bit load/store instructions instead,
+ * which we do support.
+ */
+
+static int
+value_src(nir_intrinsic_op intrinsic)
+{
+        switch (intrinsic) {
+        case nir_intrinsic_store_ssbo:
+        case nir_intrinsic_store_scratch:
+        case nir_intrinsic_store_global_2x32:
+                return 0;
+        default:
+                unreachable("Unsupported intrinsic");
+        }
+}
+
+static int
+offset_src(nir_intrinsic_op intrinsic)
+{
+        switch (intrinsic) {
+        case nir_intrinsic_load_uniform:
+        case nir_intrinsic_load_shared:
+        case nir_intrinsic_load_scratch:
+        case nir_intrinsic_load_global_2x32:
+                return 0;
+        case nir_intrinsic_load_ubo:
+        case nir_intrinsic_load_ssbo:
+        case nir_intrinsic_store_scratch:
+        case nir_intrinsic_store_global_2x32:
+                return 1;
+        case nir_intrinsic_store_ssbo:
+                return 2;
+        default:
+                unreachable("Unsupported intrinsic");
+        }
+}
+
+static nir_intrinsic_instr *
+init_scalar_intrinsic(nir_builder *b,
+                      nir_intrinsic_instr *intr,
+                      uint32_t component,
+                      nir_def *offset,
+                      uint32_t bit_size,
+                      nir_def **scalar_offset)
+{
+        nir_intrinsic_instr *new_intr =
+                nir_intrinsic_instr_create(b->shader, intr->intrinsic);
+
+        nir_intrinsic_copy_const_indices(new_intr, intr);
+
+        const int offset_units = bit_size / 8;
+        assert(offset_units >= 1);
+
+        if (nir_intrinsic_has_align_mul(intr)) {
+                assert(nir_intrinsic_has_align_offset(intr));
+                unsigned align_mul = nir_intrinsic_align_mul(intr);
+                unsigned align_off = nir_intrinsic_align_offset(intr);
+
+                align_off += offset_units * component;
+                align_off = align_off % align_mul;
+
+                nir_intrinsic_set_align(new_intr, align_mul, align_off);
+        }
+
+        *scalar_offset = offset;
+        unsigned offset_adj = offset_units * component;
+        if (nir_intrinsic_has_base(intr)) {
+                nir_intrinsic_set_base(
+                        new_intr, nir_intrinsic_base(intr) + offset_adj);
+        } else {
+                *scalar_offset =
+                        nir_iadd(b, offset,
+                                 nir_imm_intN_t(b, offset_adj,
+                                                offset->bit_size));
+        }
+
+        new_intr->num_components = 1;
+
+        return new_intr;
+}
+
+static bool
+lower_load_bitsize(nir_builder *b,
+                   nir_intrinsic_instr *intr)
+{
+        uint32_t bit_size = intr->def.bit_size;
+        if (bit_size == 32)
+                return false;
+
+        /* No need to split if it is already scalar */
+        int num_comp = nir_intrinsic_dest_components(intr);
+        if (num_comp <= 1)
+                return false;
+
+        b->cursor = nir_before_instr(&intr->instr);
+
+        /* For global 2x32 we ignore the Y component because it must be zero */
+        unsigned offset_idx = offset_src(intr->intrinsic);
+        nir_def *offset = nir_trim_vector(b, intr->src[offset_idx].ssa, 1);
+
+        /* Split the vector load into multiple scalar loads */
+        nir_def *dest_components[4] = { NULL };
+        const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
+        for (int component = 0; component < num_comp; component++) {
+                nir_def *scalar_offset;
+                nir_intrinsic_instr *new_intr =
+                        init_scalar_intrinsic(b, intr, component, offset,
+                                              bit_size, &scalar_offset);
+
+                for (unsigned i = 0; i < info->num_srcs; i++) {
+                        if (i ==
offset_idx) { + nir_def *final_offset; + final_offset = intr->intrinsic != nir_intrinsic_load_global_2x32 ? + scalar_offset : + nir_vec2(b, scalar_offset, + nir_imm_int(b, 0)); + new_intr->src[i] = nir_src_for_ssa(final_offset); + } else { + new_intr->src[i] = intr->src[i]; + } + } + + nir_def_init(&new_intr->instr, &new_intr->def, 1, + bit_size); + dest_components[component] = &new_intr->def; + + nir_builder_instr_insert(b, &new_intr->instr); + } + + nir_def *new_dst = nir_vec(b, dest_components, num_comp); + nir_def_rewrite_uses(&intr->def, new_dst); + + nir_instr_remove(&intr->instr); + return true; +} + +static bool +lower_store_bitsize(nir_builder *b, + nir_intrinsic_instr *intr) +{ + /* No need to split if it is already scalar */ + int value_idx = value_src(intr->intrinsic); + int num_comp = nir_intrinsic_src_components(intr, value_idx); + if (num_comp <= 1) + return false; + + /* No need to split if it is 32-bit */ + if (nir_src_bit_size(intr->src[value_idx]) == 32) + return false; + + nir_def *value = intr->src[value_idx].ssa; + + b->cursor = nir_before_instr(&intr->instr); + + /* For global 2x32 we ignore Y component because it must be zero */ + unsigned offset_idx = offset_src(intr->intrinsic); + nir_def *offset = nir_trim_vector(b, intr->src[offset_idx].ssa, 1); + + /* Split vector store to multiple scalar stores */ + const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic]; + unsigned wrmask = nir_intrinsic_write_mask(intr); + while (wrmask) { + unsigned component = ffs(wrmask) - 1; + + nir_def *scalar_offset; + nir_intrinsic_instr *new_intr = + init_scalar_intrinsic(b, intr, component, offset, + value->bit_size, &scalar_offset); + + nir_intrinsic_set_write_mask(new_intr, 0x1); + + for (unsigned i = 0; i < info->num_srcs; i++) { + if (i == value_idx) { + nir_def *scalar_value = + nir_channels(b, value, 1 << component); + new_intr->src[i] = nir_src_for_ssa(scalar_value); + } else if (i == offset_idx) { + nir_def *final_offset; + final_offset = intr->intrinsic != nir_intrinsic_store_global_2x32 ? 
+ scalar_offset : + nir_vec2(b, scalar_offset, + nir_imm_int(b, 0)); + new_intr->src[i] = nir_src_for_ssa(final_offset); + } else { + new_intr->src[i] = intr->src[i]; + } + } + + nir_builder_instr_insert(b, &new_intr->instr); + + wrmask &= ~(1 << component); + } + + nir_instr_remove(&intr->instr); + return true; +} + +static bool +lower_load_store_bitsize(nir_builder *b, nir_intrinsic_instr *intr, + void *data) +{ + switch (intr->intrinsic) { + case nir_intrinsic_load_ssbo: + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_uniform: + case nir_intrinsic_load_scratch: + case nir_intrinsic_load_global_2x32: + return lower_load_bitsize(b, intr); + + case nir_intrinsic_store_ssbo: + case nir_intrinsic_store_scratch: + case nir_intrinsic_store_global_2x32: + return lower_store_bitsize(b, intr); + + default: + return false; + } +} + +bool +v3d_nir_lower_load_store_bitsize(nir_shader *s) +{ + return nir_shader_intrinsics_pass(s, lower_load_store_bitsize, + nir_metadata_block_index | + nir_metadata_dominance, + NULL); +} diff --git a/src/broadcom/compiler/v3d_nir_lower_logic_ops.c b/src/broadcom/compiler/v3d_nir_lower_logic_ops.c index 11782c7348f..4affb79a7e2 100644 --- a/src/broadcom/compiler/v3d_nir_lower_logic_ops.c +++ b/src/broadcom/compiler/v3d_nir_lower_logic_ops.c @@ -36,8 +36,8 @@ #include "v3d_compiler.h" -typedef nir_ssa_def *(*nir_pack_func)(nir_builder *b, nir_ssa_def *c); -typedef nir_ssa_def *(*nir_unpack_func)(nir_builder *b, nir_ssa_def *c); +typedef nir_def *(*nir_pack_func)(nir_builder *b, nir_def *c); +typedef nir_def *(*nir_unpack_func)(nir_builder *b, nir_def *c); static bool logicop_depends_on_dst_color(int logicop_func) @@ -53,9 +53,9 @@ logicop_depends_on_dst_color(int logicop_func) } } -static nir_ssa_def * +static nir_def * v3d_logicop(nir_builder *b, int logicop_func, - nir_ssa_def *src, nir_ssa_def *dst) + nir_def *src, nir_def *dst) { switch (logicop_func) { case PIPE_LOGICOP_CLEAR: @@ -96,8 +96,8 @@ v3d_logicop(nir_builder *b, int logicop_func, } } -static nir_ssa_def * -v3d_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz) +static nir_def * +v3d_nir_get_swizzled_channel(nir_builder *b, nir_def **srcs, int swiz) { switch (swiz) { default: @@ -116,57 +116,57 @@ v3d_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz) } } -static nir_ssa_def * -v3d_nir_swizzle_and_pack(nir_builder *b, nir_ssa_def **chans, +static nir_def * +v3d_nir_swizzle_and_pack(nir_builder *b, nir_def **chans, const uint8_t *swiz, nir_pack_func pack_func) { - nir_ssa_def *c[4]; + nir_def *c[4]; for (int i = 0; i < 4; i++) c[i] = v3d_nir_get_swizzled_channel(b, chans, swiz[i]); return pack_func(b, nir_vec4(b, c[0], c[1], c[2], c[3])); } -static nir_ssa_def * -v3d_nir_unpack_and_swizzle(nir_builder *b, nir_ssa_def *packed, +static nir_def * +v3d_nir_unpack_and_swizzle(nir_builder *b, nir_def *packed, const uint8_t *swiz, nir_unpack_func unpack_func) { - nir_ssa_def *unpacked = unpack_func(b, packed); + nir_def *unpacked = unpack_func(b, packed); - nir_ssa_def *unpacked_chans[4]; + nir_def *unpacked_chans[4]; for (int i = 0; i < 4; i++) unpacked_chans[i] = nir_channel(b, unpacked, i); - nir_ssa_def *c[4]; + nir_def *c[4]; for (int i = 0; i < 4; i++) c[i] = v3d_nir_get_swizzled_channel(b, unpacked_chans, swiz[i]); return nir_vec4(b, c[0], c[1], c[2], c[3]); } -static nir_ssa_def * -pack_unorm_rgb10a2(nir_builder *b, nir_ssa_def *c) +static nir_def * +pack_unorm_rgb10a2(nir_builder *b, nir_def *c) { static const unsigned bits[4] = { 10, 10, 10, 2 }; - 
nir_ssa_def *unorm = nir_format_float_to_unorm(b, c, bits); + nir_def *unorm = nir_format_float_to_unorm(b, c, bits); - nir_ssa_def *chans[4]; + nir_def *chans[4]; for (int i = 0; i < 4; i++) chans[i] = nir_channel(b, unorm, i); - nir_ssa_def *result = nir_mov(b, chans[0]); + nir_def *result = nir_mov(b, chans[0]); int offset = bits[0]; for (int i = 1; i < 4; i++) { - nir_ssa_def *shifted_chan = - nir_ishl(b, chans[i], nir_imm_int(b, offset)); + nir_def *shifted_chan = + nir_ishl_imm(b, chans[i], offset); result = nir_ior(b, result, shifted_chan); offset += bits[i]; } return result; } -static nir_ssa_def * -unpack_unorm_rgb10a2(nir_builder *b, nir_ssa_def *c) +static nir_def * +unpack_unorm_rgb10a2(nir_builder *b, nir_def *c) { static const unsigned bits[4] = { 10, 10, 10, 2 }; const unsigned masks[4] = { BITFIELD_MASK(bits[0]), @@ -174,11 +174,11 @@ unpack_unorm_rgb10a2(nir_builder *b, nir_ssa_def *c) BITFIELD_MASK(bits[2]), BITFIELD_MASK(bits[3]) }; - nir_ssa_def *chans[4]; + nir_def *chans[4]; for (int i = 0; i < 4; i++) { - nir_ssa_def *unorm = nir_iand(b, c, nir_imm_int(b, masks[i])); + nir_def *unorm = nir_iand_imm(b, c, masks[i]); chans[i] = nir_format_unorm_to_float(b, unorm, &bits[i]); - c = nir_ushr(b, c, nir_imm_int(b, bits[i])); + c = nir_ushr_imm(b, c, bits[i]); } return nir_vec4(b, chans[0], chans[1], chans[2], chans[3]); @@ -201,13 +201,13 @@ v3d_get_format_swizzle_for_rt(struct v3d_compile *c, int rt) } } -static nir_ssa_def * +static nir_def * v3d_nir_get_tlb_color(nir_builder *b, struct v3d_compile *c, int rt, int sample) { uint32_t num_components = util_format_get_nr_components(c->fs_key->color_fmt[rt].format); - nir_ssa_def *color[4]; + nir_def *color[4]; for (int i = 0; i < 4; i++) { if (i < num_components) { color[i] = @@ -222,71 +222,68 @@ v3d_nir_get_tlb_color(nir_builder *b, struct v3d_compile *c, int rt, int sample) return nir_vec4(b, color[0], color[1], color[2], color[3]); } -static nir_ssa_def * +static nir_def * v3d_emit_logic_op_raw(struct v3d_compile *c, nir_builder *b, - nir_ssa_def **src_chans, nir_ssa_def **dst_chans, + nir_def **src_chans, nir_def **dst_chans, int rt, int sample) { const uint8_t *fmt_swz = v3d_get_format_swizzle_for_rt(c, rt); - nir_ssa_def *op_res[4]; + nir_def *op_res[4]; for (int i = 0; i < 4; i++) { - nir_ssa_def *src = src_chans[i]; - nir_ssa_def *dst = + nir_def *src = src_chans[i]; + nir_def *dst = v3d_nir_get_swizzled_channel(b, dst_chans, fmt_swz[i]); op_res[i] = v3d_logicop(b, c->fs_key->logicop_func, src, dst); - /* In Vulkan we configure our integer RTs to clamp, so we need - * to ignore result bits that don't fit in the destination RT - * component size. + /* We configure our integer RTs to clamp, so we need to ignore + * result bits that don't fit in the destination RT component + * size. 
*/ - if (c->key->environment == V3D_ENVIRONMENT_VULKAN) { - uint32_t bits = - util_format_get_component_bits( - c->fs_key->color_fmt[rt].format, - UTIL_FORMAT_COLORSPACE_RGB, i); - if (bits > 0 && bits < 32) { - nir_ssa_def *mask = - nir_imm_int(b, (1u << bits) - 1); - op_res[i] = nir_iand(b, op_res[i], mask); - } + uint32_t bits = + util_format_get_component_bits( + c->fs_key->color_fmt[rt].format, + UTIL_FORMAT_COLORSPACE_RGB, i); + if (bits > 0 && bits < 32) { + op_res[i] = + nir_iand_imm(b, op_res[i], (1u << bits) - 1); } } - nir_ssa_def *r[4]; + nir_def *r[4]; for (int i = 0; i < 4; i++) r[i] = v3d_nir_get_swizzled_channel(b, op_res, fmt_swz[i]); return nir_vec4(b, r[0], r[1], r[2], r[3]); } -static nir_ssa_def * +static nir_def * v3d_emit_logic_op_unorm(struct v3d_compile *c, nir_builder *b, - nir_ssa_def **src_chans, nir_ssa_def **dst_chans, + nir_def **src_chans, nir_def **dst_chans, int rt, int sample, nir_pack_func pack_func, nir_unpack_func unpack_func) { static const uint8_t src_swz[4] = { 0, 1, 2, 3 }; - nir_ssa_def *packed_src = + nir_def *packed_src = v3d_nir_swizzle_and_pack(b, src_chans, src_swz, pack_func); const uint8_t *fmt_swz = v3d_get_format_swizzle_for_rt(c, rt); - nir_ssa_def *packed_dst = + nir_def *packed_dst = v3d_nir_swizzle_and_pack(b, dst_chans, fmt_swz, pack_func); - nir_ssa_def *packed_result = + nir_def *packed_result = v3d_logicop(b, c->fs_key->logicop_func, packed_src, packed_dst); return v3d_nir_unpack_and_swizzle(b, packed_result, fmt_swz, unpack_func); } -static nir_ssa_def * +static nir_def * v3d_nir_emit_logic_op(struct v3d_compile *c, nir_builder *b, - nir_ssa_def *src, int rt, int sample) + nir_def *src, int rt, int sample) { - nir_ssa_def *dst = v3d_nir_get_tlb_color(b, c, rt, sample); + nir_def *dst = v3d_nir_get_tlb_color(b, c, rt, sample); - nir_ssa_def *src_chans[4], *dst_chans[4]; + nir_def *src_chans[4], *dst_chans[4]; for (unsigned i = 0; i < 4; i++) { src_chans[i] = nir_channel(b, src, i); dst_chans[i] = nir_channel(b, dst, i); @@ -309,7 +306,7 @@ v3d_nir_emit_logic_op(struct v3d_compile *c, nir_builder *b, static void v3d_emit_ms_output(nir_builder *b, - nir_ssa_def *color, nir_src *offset, + nir_def *color, nir_src *offset, nir_alu_type type, int rt, int sample) { nir_store_tlb_sample_color_v3d(b, color, nir_imm_int(b, rt), .base = sample, .component = 0, .src_type = type); @@ -321,7 +318,7 @@ v3d_nir_lower_logic_op_instr(struct v3d_compile *c, nir_intrinsic_instr *intr, int rt) { - nir_ssa_def *frag_color = intr->src[0].ssa; + nir_def *frag_color = intr->src[0].ssa; const int logic_op = c->fs_key->logicop_func; @@ -331,7 +328,7 @@ v3d_nir_lower_logic_op_instr(struct v3d_compile *c, nir_src *offset = &intr->src[1]; nir_alu_type type = nir_intrinsic_src_type(intr); for (int i = 0; i < V3D_MAX_SAMPLES; i++) { - nir_ssa_def *sample = + nir_def *sample = v3d_nir_emit_logic_op(c, b, frag_color, rt, i); v3d_emit_ms_output(b, sample, offset, type, rt, i); @@ -339,11 +336,10 @@ v3d_nir_lower_logic_op_instr(struct v3d_compile *c, nir_instr_remove(&intr->instr); } else { - nir_ssa_def *result = + nir_def *result = v3d_nir_emit_logic_op(c, b, frag_color, rt, 0); - nir_instr_rewrite_src(&intr->instr, &intr->src[0], - nir_src_for_ssa(result)); + nir_src_rewrite(&intr->src[0], result); intr->num_components = result->num_components; } } @@ -351,6 +347,8 @@ v3d_nir_lower_logic_op_instr(struct v3d_compile *c, static bool v3d_nir_lower_logic_ops_block(nir_block *block, struct v3d_compile *c) { + bool progress = false; + nir_foreach_instr_safe(instr, 
block) { if (instr->type != nir_instr_type_intrinsic) continue; @@ -384,35 +382,40 @@ v3d_nir_lower_logic_ops_block(nir_block *block, struct v3d_compile *c) continue; } - nir_function_impl *impl = - nir_cf_node_get_function(&block->cf_node); - nir_builder b; - nir_builder_init(&b, impl); - b.cursor = nir_before_instr(&intr->instr); + nir_builder b = nir_builder_at(nir_before_instr(&intr->instr)); v3d_nir_lower_logic_op_instr(c, &b, intr, rt); + + progress = true; } } - return true; + return progress; } -void +bool v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c) { + bool progress = false; + /* Nothing to do if logic op is 'copy src to dst' or if logic ops are * disabled (we set the logic op to copy in that case). */ if (c->fs_key->logicop_func == PIPE_LOGICOP_COPY) - return; + return false; - nir_foreach_function(function, s) { - if (function->impl) { - nir_foreach_block(block, function->impl) - v3d_nir_lower_logic_ops_block(block, c); + nir_foreach_function_impl(impl, s) { + nir_foreach_block(block, impl) + progress |= v3d_nir_lower_logic_ops_block(block, c); - nir_metadata_preserve(function->impl, + if (progress) { + nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance); + } else { + nir_metadata_preserve(impl, + nir_metadata_all); } } + + return progress; } diff --git a/src/broadcom/compiler/v3d_nir_lower_robust_buffer_access.c b/src/broadcom/compiler/v3d_nir_lower_robust_buffer_access.c deleted file mode 100644 index 40f1cc23b1a..00000000000 --- a/src/broadcom/compiler/v3d_nir_lower_robust_buffer_access.c +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Copyright © 2020 Raspberry Pi - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- */ - -#include "compiler/v3d_compiler.h" -#include "compiler/nir/nir_builder.h" - -static void -rewrite_offset(nir_builder *b, - nir_intrinsic_instr *instr, - uint32_t buffer_idx, - uint32_t offset_src, - nir_intrinsic_op buffer_size_op) -{ - b->cursor = nir_before_instr(&instr->instr); - - /* Get size of the buffer */ - nir_intrinsic_instr *size = - nir_intrinsic_instr_create(b->shader, buffer_size_op); - size->src[0] = nir_src_for_ssa(nir_imm_int(b, buffer_idx)); - nir_ssa_dest_init(&size->instr, &size->dest, 1, 32, NULL); - nir_builder_instr_insert(b, &size->instr); - - /* All out TMU accesses are 32-bit aligned */ - nir_ssa_def *aligned_buffer_size = - nir_iand(b, &size->dest.ssa, nir_imm_int(b, 0xfffffffc)); - - /* Rewrite offset */ - nir_ssa_def *offset = - nir_umin(b, instr->src[offset_src].ssa, aligned_buffer_size); - nir_instr_rewrite_src(&instr->instr, &instr->src[offset_src], - nir_src_for_ssa(offset)); -} - -static void -lower_load(struct v3d_compile *c, - nir_builder *b, - nir_intrinsic_instr *instr) -{ - uint32_t index = nir_src_comp_as_uint(instr->src[0], 0); - - nir_intrinsic_op op; - if (instr->intrinsic == nir_intrinsic_load_ubo) { - op = nir_intrinsic_get_ubo_size; - if (c->key->environment == V3D_ENVIRONMENT_VULKAN) - index--; - } else { - op = nir_intrinsic_get_ssbo_size; - } - - rewrite_offset(b, instr, index, 1, op); -} - -static void -lower_store(struct v3d_compile *c, - nir_builder *b, - nir_intrinsic_instr *instr) -{ - uint32_t index = nir_src_comp_as_uint(instr->src[1], 0); - rewrite_offset(b, instr, index, 2, nir_intrinsic_get_ssbo_size); -} - -static void -lower_atomic(struct v3d_compile *c, - nir_builder *b, - nir_intrinsic_instr *instr) -{ - uint32_t index = nir_src_comp_as_uint(instr->src[0], 0); - rewrite_offset(b, instr, index, 1, nir_intrinsic_get_ssbo_size); -} - -static void -lower_shared(struct v3d_compile *c, - nir_builder *b, - nir_intrinsic_instr *instr) -{ - b->cursor = nir_before_instr(&instr->instr); - nir_ssa_def *aligned_size = - nir_imm_int(b, c->s->info.shared_size & 0xfffffffc); - nir_ssa_def *offset = nir_umin(b, instr->src[0].ssa, aligned_size); - nir_instr_rewrite_src(&instr->instr, &instr->src[0], - nir_src_for_ssa(offset)); -} - -static void -lower_instr(struct v3d_compile *c, nir_builder *b, struct nir_instr *instr) -{ - if (instr->type != nir_instr_type_intrinsic) - return; - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - - switch (intr->intrinsic) { - case nir_intrinsic_load_ubo: - case nir_intrinsic_load_ssbo: - lower_load(c, b, intr); - break; - case nir_intrinsic_store_ssbo: - lower_store(c, b, intr); - break; - case nir_intrinsic_ssbo_atomic_add: - case nir_intrinsic_ssbo_atomic_imin: - case nir_intrinsic_ssbo_atomic_umin: - case nir_intrinsic_ssbo_atomic_imax: - case nir_intrinsic_ssbo_atomic_umax: - case nir_intrinsic_ssbo_atomic_and: - case nir_intrinsic_ssbo_atomic_or: - case nir_intrinsic_ssbo_atomic_xor: - case nir_intrinsic_ssbo_atomic_exchange: - case nir_intrinsic_ssbo_atomic_comp_swap: - lower_atomic(c, b, intr); - break; - case nir_intrinsic_load_shared: - case nir_intrinsic_shared_atomic_add: - case nir_intrinsic_shared_atomic_imin: - case nir_intrinsic_shared_atomic_umin: - case nir_intrinsic_shared_atomic_imax: - case nir_intrinsic_shared_atomic_umax: - case nir_intrinsic_shared_atomic_and: - case nir_intrinsic_shared_atomic_or: - case nir_intrinsic_shared_atomic_xor: - case nir_intrinsic_shared_atomic_exchange: - case nir_intrinsic_shared_atomic_comp_swap: - lower_shared(c, b, intr); - break; - 
default: - break; - } -} - -void -v3d_nir_lower_robust_buffer_access(nir_shader *s, struct v3d_compile *c) -{ - nir_foreach_function(function, s) { - if (function->impl) { - nir_builder b; - nir_builder_init(&b, function->impl); - - nir_foreach_block(block, function->impl) { - nir_foreach_instr_safe(instr, block) - lower_instr(c, &b, instr); - } - - nir_metadata_preserve(function->impl, - nir_metadata_block_index | - nir_metadata_dominance); - } - } -} diff --git a/src/broadcom/compiler/v3d_nir_lower_scratch.c b/src/broadcom/compiler/v3d_nir_lower_scratch.c index 893b6f6ae28..93ed1bb6e26 100644 --- a/src/broadcom/compiler/v3d_nir_lower_scratch.c +++ b/src/broadcom/compiler/v3d_nir_lower_scratch.c @@ -34,11 +34,11 @@ * writemasks in the process. */ -static nir_ssa_def * +static nir_def * v3d_nir_scratch_offset(nir_builder *b, nir_intrinsic_instr *instr) { bool is_store = instr->intrinsic == nir_intrinsic_store_scratch; - nir_ssa_def *offset = nir_ssa_for_src(b, instr->src[is_store ? 1 : 0], 1); + nir_def *offset = instr->src[is_store ? 1 : 0].ssa; assert(nir_intrinsic_align_mul(instr) >= 4); assert(nir_intrinsic_align_offset(instr) == 0); @@ -55,18 +55,18 @@ v3d_nir_lower_load_scratch(nir_builder *b, nir_intrinsic_instr *instr) { b->cursor = nir_before_instr(&instr->instr); - nir_ssa_def *offset = v3d_nir_scratch_offset(b,instr); + nir_def *offset = v3d_nir_scratch_offset(b,instr); - nir_ssa_def *chans[NIR_MAX_VEC_COMPONENTS]; + nir_def *chans[NIR_MAX_VEC_COMPONENTS]; for (int i = 0; i < instr->num_components; i++) { - nir_ssa_def *chan_offset = + nir_def *chan_offset = nir_iadd_imm(b, offset, V3D_CHANNELS * i * 4); nir_intrinsic_instr *chan_instr = nir_intrinsic_instr_create(b->shader, instr->intrinsic); chan_instr->num_components = 1; - nir_ssa_dest_init(&chan_instr->instr, &chan_instr->dest, 1, - instr->dest.ssa.bit_size, NULL); + nir_def_init(&chan_instr->instr, &chan_instr->def, 1, + instr->def.bit_size); chan_instr->src[0] = nir_src_for_ssa(chan_offset); @@ -74,11 +74,11 @@ v3d_nir_lower_load_scratch(nir_builder *b, nir_intrinsic_instr *instr) nir_builder_instr_insert(b, &chan_instr->instr); - chans[i] = &chan_instr->dest.ssa; + chans[i] = &chan_instr->def; } - nir_ssa_def *result = nir_vec(b, chans, instr->num_components); - nir_ssa_def_rewrite_uses(&instr->dest.ssa, result); + nir_def *result = nir_vec(b, chans, instr->num_components); + nir_def_rewrite_uses(&instr->def, result); nir_instr_remove(&instr->instr); } @@ -87,15 +87,14 @@ v3d_nir_lower_store_scratch(nir_builder *b, nir_intrinsic_instr *instr) { b->cursor = nir_before_instr(&instr->instr); - nir_ssa_def *offset = v3d_nir_scratch_offset(b, instr); - nir_ssa_def *value = nir_ssa_for_src(b, instr->src[0], - instr->num_components); + nir_def *offset = v3d_nir_scratch_offset(b, instr); + nir_def *value = instr->src[0].ssa; for (int i = 0; i < instr->num_components; i++) { if (!(nir_intrinsic_write_mask(instr) & (1 << i))) continue; - nir_ssa_def *chan_offset = + nir_def *chan_offset = nir_iadd_imm(b, offset, V3D_CHANNELS * i * 4); nir_intrinsic_instr *chan_instr = @@ -115,39 +114,29 @@ v3d_nir_lower_store_scratch(nir_builder *b, nir_intrinsic_instr *instr) nir_instr_remove(&instr->instr); } -void -v3d_nir_lower_scratch(nir_shader *s) +static bool +v3d_nir_lower_scratch_cb(nir_builder *b, + nir_intrinsic_instr *intr, + void *_state) { - nir_foreach_function(function, s) { - if (!function->impl) - continue; - - nir_builder b; - nir_builder_init(&b, function->impl); - - nir_foreach_block(block, function->impl) { - 
nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intr = - nir_instr_as_intrinsic(instr); - - switch (intr->intrinsic) { - case nir_intrinsic_load_scratch: - v3d_nir_lower_load_scratch(&b, intr); - break; - case nir_intrinsic_store_scratch: - v3d_nir_lower_store_scratch(&b, intr); - break; - default: - break; - } - } - } - - nir_metadata_preserve(function->impl, - nir_metadata_block_index | - nir_metadata_dominance); + switch (intr->intrinsic) { + case nir_intrinsic_load_scratch: + v3d_nir_lower_load_scratch(b, intr); + return true; + case nir_intrinsic_store_scratch: + v3d_nir_lower_store_scratch(b, intr); + return true; + default: + return false; } + + return false; +} + +bool +v3d_nir_lower_scratch(nir_shader *s) +{ + return nir_shader_intrinsics_pass(s, v3d_nir_lower_scratch_cb, + nir_metadata_block_index | + nir_metadata_dominance, NULL); } diff --git a/src/broadcom/compiler/v3d_nir_lower_txf_ms.c b/src/broadcom/compiler/v3d_nir_lower_txf_ms.c index d79969374d5..e78c3cb9e3e 100644 --- a/src/broadcom/compiler/v3d_nir_lower_txf_ms.c +++ b/src/broadcom/compiler/v3d_nir_lower_txf_ms.c @@ -32,25 +32,21 @@ * 2x2 quad. */ -#define V3D_MAX_SAMPLES 4 - -static nir_ssa_def * +static nir_def * v3d_nir_lower_txf_ms_instr(nir_builder *b, nir_instr *in_instr, void *data) { nir_tex_instr *instr = nir_instr_as_tex(in_instr); b->cursor = nir_before_instr(&instr->instr); - int coord_index = nir_tex_instr_src_index(instr, nir_tex_src_coord); - int sample_index = nir_tex_instr_src_index(instr, nir_tex_src_ms_index); - nir_ssa_def *coord = instr->src[coord_index].src.ssa; - nir_ssa_def *sample = instr->src[sample_index].src.ssa; + nir_def *coord = nir_steal_tex_src(instr, nir_tex_src_coord); + nir_def *sample = nir_steal_tex_src(instr, nir_tex_src_ms_index); - nir_ssa_def *one = nir_imm_int(b, 1); - nir_ssa_def *x = nir_iadd(b, + nir_def *one = nir_imm_int(b, 1); + nir_def *x = nir_iadd(b, nir_ishl(b, nir_channel(b, coord, 0), one), nir_iand(b, sample, one)); - nir_ssa_def *y = nir_iadd(b, + nir_def *y = nir_iadd(b, nir_ishl(b, nir_channel(b, coord, 1), one), nir_iand(b, nir_ushr(b, sample, one), one)); if (instr->is_array) @@ -58,10 +54,7 @@ v3d_nir_lower_txf_ms_instr(nir_builder *b, nir_instr *in_instr, void *data) else coord = nir_vec2(b, x, y); - nir_instr_rewrite_src(&instr->instr, - &instr->src[nir_tex_src_coord].src, - nir_src_for_ssa(coord)); - nir_tex_instr_remove_src(instr, sample_index); + nir_tex_instr_add_src(instr, nir_tex_src_coord, coord); instr->op = nir_texop_txf; instr->sampler_dim = GLSL_SAMPLER_DIM_2D; @@ -75,11 +68,11 @@ v3d_nir_lower_txf_ms_filter(const nir_instr *instr, const void *data) nir_instr_as_tex(instr)->op == nir_texop_txf_ms); } -void -v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c) +bool +v3d_nir_lower_txf_ms(nir_shader *s) { - nir_shader_lower_instructions(s, - v3d_nir_lower_txf_ms_filter, - v3d_nir_lower_txf_ms_instr, - NULL); + return nir_shader_lower_instructions(s, + v3d_nir_lower_txf_ms_filter, + v3d_nir_lower_txf_ms_instr, + NULL); } diff --git a/src/broadcom/compiler/v3d_packing.c b/src/broadcom/compiler/v3d_packing.c new file mode 100644 index 00000000000..46643edd5e6 --- /dev/null +++ b/src/broadcom/compiler/v3d_packing.c @@ -0,0 +1,50 @@ +/* + * Copyright © 2023 Raspberry Pi Ltd + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without 
restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "v3d_compiler.h" + +#define __gen_user_data void +#define __gen_address_type uint32_t +#define __gen_address_offset(reloc) (*reloc) +#define __gen_emit_reloc(cl, reloc) +#define __gen_unpack_address(cl, s, e) (__gen_unpack_uint(cl, s, e) << (31 - (e - s))) +#include "cle/v3d_packet_v42_pack.h" + + +/* Typically, this method would wrap calling a version-specific variant of this + * method, but as TMU_CONFIG_PARAMETER_1 doesn't change between v42 and v71, + * we can assume that p1_packed is the same struct, and use the same method. + */ +void +v3d_pack_unnormalized_coordinates(struct v3d_device_info *devinfo, + uint32_t *p1_packed, + bool unnormalized_coordinates) +{ + assert(devinfo->ver == 71 || devinfo->ver == 42); + + struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked; + V3D42_TMU_CONFIG_PARAMETER_1_unpack((uint8_t *)p1_packed, &p1_unpacked); + p1_unpacked.unnormalized_coordinates = unnormalized_coordinates; + V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL, (uint8_t *)p1_packed, + &p1_unpacked); +} diff --git a/src/broadcom/compiler/v3d40_tex.c b/src/broadcom/compiler/v3d_tex.c index 7bebfe95552..643c73c4e58 100644 --- a/src/broadcom/compiler/v3d40_tex.c +++ b/src/broadcom/compiler/v3d_tex.c @@ -28,27 +28,29 @@ #define __gen_address_type uint32_t #define __gen_address_offset(reloc) (*reloc) #define __gen_emit_reloc(cl, reloc) -#include "cle/v3d_packet_v41_pack.h" +#include "cle/v3d_packet_v42_pack.h" -static inline void +static inline struct qinst * vir_TMU_WRITE(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val) { /* XXX perf: We should figure out how to merge ALU operations * producing the val with this MOV, when possible. 
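 *
 * An illustrative sketch of the idea (simplified VIR, not the compiler's
 * actual output today): instead of emitting
 *
 *    fadd t3, t1, t2   ; compute the value into a temp
 *    mov tmud, t3      ; extra MOV added by vir_TMU_WRITE
 *
 * the ALU instruction could target the magic TMU register directly:
 *
 *    fadd tmud, t1, t2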
*/ - vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val); + return vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val); } -static inline void +static inline struct qinst * vir_TMU_WRITE_or_count(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val, uint32_t *tmu_writes) { - if (tmu_writes) + if (tmu_writes) { (*tmu_writes)++; - else - vir_TMU_WRITE(c, waddr, val); + return NULL; + } else { + return vir_TMU_WRITE(c, waddr, val); + } } static void @@ -59,11 +61,11 @@ vir_WRTMUC(struct v3d_compile *c, enum quniform_contents contents, uint32_t data inst->uniform = vir_get_uniform_index(c, contents, data); } -static const struct V3D41_TMU_CONFIG_PARAMETER_1 p1_unpacked_default = { +static const struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked_default = { .per_pixel_mask_enable = true, }; -static const struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked_default = { +static const struct V3D42_TMU_CONFIG_PARAMETER_2 p2_unpacked_default = { .op = V3D_TMU_OP_REGULAR, }; @@ -84,7 +86,7 @@ handle_tex_src(struct v3d_compile *c, nir_tex_instr *instr, unsigned src_idx, unsigned non_array_components, - struct V3D41_TMU_CONFIG_PARAMETER_2 *p2_unpacked, + struct V3D42_TMU_CONFIG_PARAMETER_2 *p2_unpacked, struct qreg *s_out, unsigned *tmu_writes) { @@ -199,7 +201,7 @@ handle_tex_src(struct v3d_compile *c, static void vir_tex_handle_srcs(struct v3d_compile *c, nir_tex_instr *instr, - struct V3D41_TMU_CONFIG_PARAMETER_2 *p2_unpacked, + struct V3D42_TMU_CONFIG_PARAMETER_2 *p2_unpacked, struct qreg *s, unsigned *tmu_writes) { @@ -222,31 +224,62 @@ get_required_tex_tmu_writes(struct v3d_compile *c, nir_tex_instr *instr) } void -v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) +v3d_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) { - assert(instr->op != nir_texop_lod || c->devinfo->ver >= 42); - unsigned texture_idx = instr->texture_index; - unsigned sampler_idx = instr->sampler_index; - struct V3D41_TMU_CONFIG_PARAMETER_0 p0_unpacked = { + /* For instructions that don't have a sampler (i.e. txf) we bind + * default sampler state via the backend_flags to handle precision. + */ + unsigned sampler_idx = nir_tex_instr_need_sampler(instr) ? + instr->sampler_index : instr->backend_flags; + + /* Even if the texture operation doesn't need a sampler by + * itself, we still need to add the sampler configuration + * parameter if the output is 32 bit + */ + assert(sampler_idx < c->key->num_samplers_used); + bool output_type_32_bit = + c->key->sampler[sampler_idx].return_size == 32; + + struct V3D42_TMU_CONFIG_PARAMETER_0 p0_unpacked = { }; /* Limit the number of channels returned to both how many the NIR * instruction writes and how many the instruction could produce. */ - p0_unpacked.return_words_of_texture_data = - instr->dest.is_ssa ? - nir_ssa_def_components_read(&instr->dest.ssa) : - (1 << instr->dest.reg.reg->num_components) - 1; + nir_intrinsic_instr *store = nir_store_reg_for_def(&instr->def); + if (store == NULL) { + p0_unpacked.return_words_of_texture_data = + nir_def_components_read(&instr->def); + } else { + nir_def *reg = store->src[1].ssa; + nir_intrinsic_instr *decl = nir_reg_get_decl(reg); + unsigned reg_num_components = + nir_intrinsic_num_components(decl); + + /* For the non-ssa case we don't have a full equivalent to + * nir_def_components_read. This is a problem for the 16 + * bit case. nir_lower_tex will not change the destination as + * nir_tex_instr_dest_size will still return 4. 
The driver is + * just expected to not store on other channels, so we + * manually ensure that here. + */ + uint32_t num_components = output_type_32_bit ? + MIN2(reg_num_components, 4) : + MIN2(reg_num_components, 2); + + p0_unpacked.return_words_of_texture_data = (1 << num_components) - 1; + } assert(p0_unpacked.return_words_of_texture_data != 0); - struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked = { + struct V3D42_TMU_CONFIG_PARAMETER_2 p2_unpacked = { .op = V3D_TMU_OP_REGULAR, .gather_mode = instr->op == nir_texop_tg4, .gather_component = instr->component, .coefficient_mode = instr->op == nir_texop_txd, - .disable_autolod = instr->op == nir_texop_tg4 + .disable_autolod = instr->op == nir_texop_tg4, + .lod_query = instr->op == nir_texop_lod, }; const unsigned tmu_writes = get_required_tex_tmu_writes(c, instr); @@ -270,22 +303,15 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) vir_tex_handle_srcs(c, instr, &p2_unpacked, &s, NULL); uint32_t p0_packed; - V3D41_TMU_CONFIG_PARAMETER_0_pack(NULL, + V3D42_TMU_CONFIG_PARAMETER_0_pack(NULL, (uint8_t *)&p0_packed, &p0_unpacked); uint32_t p2_packed; - V3D41_TMU_CONFIG_PARAMETER_2_pack(NULL, + V3D42_TMU_CONFIG_PARAMETER_2_pack(NULL, (uint8_t *)&p2_packed, &p2_unpacked); - /* We manually set the LOD Query bit (see - * V3D42_TMU_CONFIG_PARAMETER_2) as right now is the only V42 specific - * feature over V41 we are using - */ - if (instr->op == nir_texop_lod) - p2_packed |= 1UL << 24; - /* Load texture_idx number into the high bits of the texture address field, * which will be used by the driver to decide which texture to put * in the actual address field. @@ -294,14 +320,6 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) vir_WRTMUC(c, QUNIFORM_TMU_CONFIG_P0, p0_packed); - /* Even if the texture operation doesn't need a sampler by - * itself, we still need to add the sampler configuration - * parameter if the output is 32 bit - */ - bool output_type_32_bit = - c->key->sampler[sampler_idx].return_size == 32 && - !instr->is_shadow; - /* p1 is optional, but we can skip it only if p2 can be skipped too */ bool needs_p2_config = (instr->op == nir_texop_lod || @@ -313,7 +331,7 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) output_type_32_bit; if (non_default_p1_config) { - struct V3D41_TMU_CONFIG_PARAMETER_1 p1_unpacked = { + struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked = { .output_type_32_bit = output_type_32_bit, .unnormalized_coordinates = (instr->sampler_dim == @@ -330,7 +348,7 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) p0_unpacked.return_words_of_texture_data < (1 << 2)); uint32_t p1_packed; - V3D41_TMU_CONFIG_PARAMETER_1_pack(NULL, + V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL, (uint8_t *)&p1_packed, &p1_unpacked); @@ -358,7 +376,7 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) * address */ uint32_t p1_packed_default; - V3D41_TMU_CONFIG_PARAMETER_1_pack(NULL, + V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL, (uint8_t *)&p1_packed_default, &p1_unpacked_default); vir_WRTMUC(c, QUNIFORM_CONSTANT, p1_packed_default); @@ -368,48 +386,54 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed); /* Emit retiring TMU write */ + struct qinst *retiring; if (instr->op == nir_texop_txf) { assert(instr->sampler_dim != GLSL_SAMPLER_DIM_CUBE); - vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s); + retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s); } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { - vir_TMU_WRITE(c, 
V3D_QPU_WADDR_TMUSCM, s); + retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s); } else if (instr->op == nir_texop_txl) { - vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s); + retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s); } else { - vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s); + retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s); } - ntq_add_pending_tmu_flush(c, &instr->dest, + retiring->ldtmu_count = p0_unpacked.return_words_of_texture_data; + ntq_add_pending_tmu_flush(c, &instr->def, p0_unpacked.return_words_of_texture_data); } static uint32_t -v3d40_image_load_store_tmu_op(nir_intrinsic_instr *instr) +v3d_image_atomic_tmu_op(nir_intrinsic_instr *instr) +{ + nir_atomic_op atomic_op = nir_intrinsic_atomic_op(instr); + switch (atomic_op) { + case nir_atomic_op_iadd: return v3d_get_op_for_atomic_add(instr, 3); + case nir_atomic_op_imin: return V3D_TMU_OP_WRITE_SMIN; + case nir_atomic_op_umin: return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR; + case nir_atomic_op_imax: return V3D_TMU_OP_WRITE_SMAX; + case nir_atomic_op_umax: return V3D_TMU_OP_WRITE_UMAX; + case nir_atomic_op_iand: return V3D_TMU_OP_WRITE_AND_READ_INC; + case nir_atomic_op_ior: return V3D_TMU_OP_WRITE_OR_READ_DEC; + case nir_atomic_op_ixor: return V3D_TMU_OP_WRITE_XOR_READ_NOT; + case nir_atomic_op_xchg: return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH; + case nir_atomic_op_cmpxchg: return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH; + default: unreachable("unknown atomic op"); + } +} + +static uint32_t +v3d_image_load_store_tmu_op(nir_intrinsic_instr *instr) { switch (instr->intrinsic) { case nir_intrinsic_image_load: case nir_intrinsic_image_store: return V3D_TMU_OP_REGULAR; - case nir_intrinsic_image_atomic_add: - return v3d_get_op_for_atomic_add(instr, 3); - case nir_intrinsic_image_atomic_imin: - return V3D_TMU_OP_WRITE_SMIN; - case nir_intrinsic_image_atomic_umin: - return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR; - case nir_intrinsic_image_atomic_imax: - return V3D_TMU_OP_WRITE_SMAX; - case nir_intrinsic_image_atomic_umax: - return V3D_TMU_OP_WRITE_UMAX; - case nir_intrinsic_image_atomic_and: - return V3D_TMU_OP_WRITE_AND_READ_INC; - case nir_intrinsic_image_atomic_or: - return V3D_TMU_OP_WRITE_OR_READ_DEC; - case nir_intrinsic_image_atomic_xor: - return V3D_TMU_OP_WRITE_XOR_READ_NOT; - case nir_intrinsic_image_atomic_exchange: - return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH; - case nir_intrinsic_image_atomic_comp_swap: - return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH; + + case nir_intrinsic_image_atomic: + case nir_intrinsic_image_atomic_swap: + return v3d_image_atomic_tmu_op(instr); + default: unreachable("unknown image intrinsic"); }; @@ -427,7 +451,7 @@ v3d40_image_load_store_tmu_op(nir_intrinsic_instr *instr) * which is why we always call ntq_get_src() even if we are only interested in * register write counts. 
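 *
 * A condensed sketch of the two-pass pattern this enables (same helpers
 * as in this file, control flow simplified for illustration):
 *
 *    unsigned tmu_writes = 0;
 *    vir_image_emit_register_writes(c, instr, atomic_add_replaced,
 *                                   &tmu_writes);    // pass 1: just count
 *    // ...derive the TMU configuration from tmu_writes...
 *    vir_image_emit_register_writes(c, instr, atomic_add_replaced,
 *                                   NULL);           // pass 2: really emit
 *
 * vir_TMU_WRITE_or_count() increments the counter in the first pass and
 * emits the MOV to the TMU register in the second.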
*/ -static void +static struct qinst * vir_image_emit_register_writes(struct v3d_compile *c, nir_intrinsic_instr *instr, bool atomic_add_replaced, @@ -480,7 +504,8 @@ vir_image_emit_register_writes, } /* Second atomic argument */ - if (instr->intrinsic == nir_intrinsic_image_atomic_comp_swap) { + if (instr->intrinsic == nir_intrinsic_image_atomic_swap && + nir_intrinsic_atomic_op(instr) == nir_atomic_op_cmpxchg) { struct qreg src_4_0 = ntq_get_src(c, instr->src[4], 0); vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUD, src_4_0, tmu_writes); @@ -494,7 +519,8 @@ vir_image_emit_register_writes(struct v3d_compile *c, V3D_QPU_PF_PUSHZ); } - vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUSF, src_1_0, tmu_writes); + struct qinst *retiring = + vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUSF, src_1_0, tmu_writes); if (!tmu_writes && vir_in_nonuniform_control_flow(c) && instr->intrinsic != nir_intrinsic_image_load) { @@ -502,6 +528,8 @@ vir_image_emit_register_writes(struct v3d_compile *c, (struct qinst *)c->cur_block->instructions.prev; vir_set_cond(last_inst, V3D_QPU_COND_IFA); } + + return retiring; } static unsigned @@ -516,21 +544,21 @@ get_required_image_tmu_writes(struct v3d_compile *c, } void -v3d40_vir_emit_image_load_store(struct v3d_compile *c, - nir_intrinsic_instr *instr) +v3d_vir_emit_image_load_store(struct v3d_compile *c, + nir_intrinsic_instr *instr) { unsigned format = nir_intrinsic_format(instr); unsigned unit = nir_src_as_uint(instr->src[0]); - struct V3D41_TMU_CONFIG_PARAMETER_0 p0_unpacked = { + struct V3D42_TMU_CONFIG_PARAMETER_0 p0_unpacked = { }; - struct V3D41_TMU_CONFIG_PARAMETER_1 p1_unpacked = { + struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked = { .per_pixel_mask_enable = true, .output_type_32_bit = v3d_gl_format_is_return_32(format), }; - struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked = { 0 }; + struct V3D42_TMU_CONFIG_PARAMETER_2 p2_unpacked = { 0 }; /* Limit the number of channels returned to both how many the NIR * instruction writes and how many the instruction could produce. @@ -542,19 +570,20 @@ v3d40_vir_emit_image_load_store, p0_unpacked.return_words_of_texture_data = (1 << instr_return_channels) - 1; - p2_unpacked.op = v3d40_image_load_store_tmu_op(instr); + p2_unpacked.op = v3d_image_load_store_tmu_op(instr); /* If we were able to replace atomic_add with an inc/dec, then we * need/can do things slightly differently, like not loading the * amount to add/sub, as that is implicit. 
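 *
 * For example (a hedged sketch, NIR spelling approximate): an
 * image_atomic with nir_atomic_op_iadd and constant data +1 maps to
 * V3D_TMU_OP_WRITE_AND_READ_INC, and -1 to V3D_TMU_OP_WRITE_OR_READ_DEC,
 * so no TMUD data write is needed for the operand in those cases.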
*/ bool atomic_add_replaced = - (instr->intrinsic == nir_intrinsic_image_atomic_add && - (p2_unpacked.op == V3D_TMU_OP_WRITE_AND_READ_INC || - p2_unpacked.op == V3D_TMU_OP_WRITE_OR_READ_DEC)); + instr->intrinsic == nir_intrinsic_image_atomic && + nir_intrinsic_atomic_op(instr) == nir_atomic_op_iadd && + (p2_unpacked.op == V3D_TMU_OP_WRITE_AND_READ_INC || + p2_unpacked.op == V3D_TMU_OP_WRITE_OR_READ_DEC); uint32_t p0_packed; - V3D41_TMU_CONFIG_PARAMETER_0_pack(NULL, + V3D42_TMU_CONFIG_PARAMETER_0_pack(NULL, (uint8_t *)&p0_packed, &p0_unpacked); @@ -565,12 +594,12 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c, p0_packed |= unit << 24; uint32_t p1_packed; - V3D41_TMU_CONFIG_PARAMETER_1_pack(NULL, + V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL, (uint8_t *)&p1_packed, &p1_unpacked); uint32_t p2_packed; - V3D41_TMU_CONFIG_PARAMETER_2_pack(NULL, + V3D42_TMU_CONFIG_PARAMETER_2_pack(NULL, (uint8_t *)&p2_packed, &p2_unpacked); @@ -599,8 +628,9 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c, if (memcmp(&p2_unpacked, &p2_unpacked_default, sizeof(p2_unpacked))) vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed); - vir_image_emit_register_writes(c, instr, atomic_add_replaced, NULL); - - ntq_add_pending_tmu_flush(c, &instr->dest, + struct qinst *retiring = + vir_image_emit_register_writes(c, instr, atomic_add_replaced, NULL); + retiring->ldtmu_count = p0_unpacked.return_words_of_texture_data; + ntq_add_pending_tmu_flush(c, &instr->def, p0_unpacked.return_words_of_texture_data); } diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c index 27869a35a3b..c59a8aac434 100644 --- a/src/broadcom/compiler/vir.c +++ b/src/broadcom/compiler/vir.c @@ -23,7 +23,6 @@ #include "broadcom/common/v3d_device_info.h" #include "v3d_compiler.h" -#include "util/u_prim.h" #include "compiler/nir/nir_schedule.h" #include "compiler/nir/nir_builder.h" @@ -89,7 +88,7 @@ vir_has_side_effects(struct v3d_compile *c, struct qinst *inst) * pointer, so each read has a side effect (we don't care for ldunif * because we reconstruct the uniform stream buffer after compiling * with the surviving uniforms), so allowing DCE to remove - * one would break follow-up loads. We could fix this by emiting a + * one would break follow-up loads. We could fix this by emitting a * unifa for each ldunifa, but each unifa requires 3 delay slots * before a ldunifa, so that would be quite expensive. 
*/ @@ -113,10 +112,10 @@ vir_is_raw_mov(struct qinst *inst) return false; } - if (inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE || - inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE || - inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE || - inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) { + if (inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE || + inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE || + inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE || + inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) { return false; } @@ -156,30 +155,12 @@ vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst) } bool -vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst) +vir_writes_r4_implicitly(const struct v3d_device_info *devinfo, + struct qinst *inst) { - for (int i = 0; i < vir_get_nsrc(inst); i++) { - switch (inst->src[i].file) { - case QFILE_VPM: - return true; - default: - break; - } - } - - if (devinfo->ver < 41 && (inst->qpu.sig.ldvary || - inst->qpu.sig.ldtlb || - inst->qpu.sig.ldtlbu || - inst->qpu.sig.ldvpm)) { - return true; - } - - return false; -} + if (!devinfo->has_accumulators) + return false; -bool -vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst) -{ switch (inst->dst.file) { case QFILE_MAGIC: switch (inst->dst.index) { @@ -195,9 +176,6 @@ vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst) break; } - if (devinfo->ver < 41 && inst->qpu.sig.ldtmu) - return true; - return false; } @@ -209,15 +187,15 @@ vir_set_unpack(struct qinst *inst, int src, if (vir_is_add(inst)) { if (src == 0) - inst->qpu.alu.add.a_unpack = unpack; + inst->qpu.alu.add.a.unpack = unpack; else - inst->qpu.alu.add.b_unpack = unpack; + inst->qpu.alu.add.b.unpack = unpack; } else { assert(vir_is_mul(inst)); if (src == 0) - inst->qpu.alu.mul.a_unpack = unpack; + inst->qpu.alu.mul.a.unpack = unpack; else - inst->qpu.alu.mul.b_unpack = unpack; + inst->qpu.alu.mul.b.unpack = unpack; } } @@ -369,6 +347,8 @@ vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst, struct qreg src0, struct q inst->src[1] = src1; inst->uniform = ~0; + inst->ip = -1; + return inst; } @@ -385,6 +365,8 @@ vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct q inst->src[1] = src1; inst->uniform = ~0; + inst->ip = -1; + return inst; } @@ -404,12 +386,16 @@ vir_branch_inst(struct v3d_compile *c, enum v3d_qpu_branch_cond cond) inst->dst = vir_nop_reg(); inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, 0); + inst->ip = -1; + return inst; } static void vir_emit(struct v3d_compile *c, struct qinst *inst) { + inst->ip = -1; + switch (c->cursor.mode) { case vir_cursor_add: list_add(&inst->link, c->cursor.link); @@ -509,13 +495,15 @@ vir_link_blocks(struct qblock *predecessor, struct qblock *successor) } const struct v3d_compiler * -v3d_compiler_init(const struct v3d_device_info *devinfo) +v3d_compiler_init(const struct v3d_device_info *devinfo, + uint32_t max_inline_uniform_buffers) { struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler); if (!compiler) return NULL; compiler->devinfo = devinfo; + compiler->max_inline_uniform_buffers = max_inline_uniform_buffers; if (!vir_init_reg_sets(compiler)) { ralloc_free(compiler); @@ -531,6 +519,19 @@ v3d_compiler_free(const struct v3d_compiler *compiler) ralloc_free((void *)compiler); } +struct v3d_compiler_strategy { + const char *name; + uint32_t max_threads; + uint32_t min_threads; + bool disable_general_tmu_sched; + bool disable_gcm; + bool disable_loop_unrolling; + bool 
disable_ubo_load_sorting; + bool move_buffer_loads; + bool disable_tmu_pipelining; + uint32_t max_tmu_spills; +}; + static struct v3d_compile * vir_compile_init(const struct v3d_compiler *compiler, struct v3d_key *key, @@ -539,12 +540,8 @@ vir_compile_init(const struct v3d_compiler *compiler, void *debug_output_data), void *debug_output_data, int program_id, int variant_id, - uint32_t max_threads, - uint32_t min_threads_for_reg_alloc, - bool tmu_spilling_allowed, - bool disable_loop_unrolling, - bool disable_constant_ubo_load_sorting, - bool disable_tmu_pipelining, + uint32_t compile_strategy_idx, + const struct v3d_compiler_strategy *strategy, bool fallback_scheduler) { struct v3d_compile *c = rzalloc(NULL, struct v3d_compile); @@ -554,17 +551,22 @@ vir_compile_init(const struct v3d_compiler *compiler, c->key = key; c->program_id = program_id; c->variant_id = variant_id; - c->threads = max_threads; + c->compile_strategy_idx = compile_strategy_idx; + c->threads = strategy->max_threads; c->debug_output = debug_output; c->debug_output_data = debug_output_data; c->compilation_result = V3D_COMPILATION_SUCCEEDED; - c->min_threads_for_reg_alloc = min_threads_for_reg_alloc; - c->tmu_spilling_allowed = tmu_spilling_allowed; + c->min_threads_for_reg_alloc = strategy->min_threads; + c->max_tmu_spills = strategy->max_tmu_spills; c->fallback_scheduler = fallback_scheduler; - c->disable_tmu_pipelining = disable_tmu_pipelining; - c->disable_constant_ubo_load_sorting = disable_constant_ubo_load_sorting; - c->disable_loop_unrolling = V3D_DEBUG & V3D_DEBUG_NO_LOOP_UNROLL - ? true : disable_loop_unrolling; + c->disable_general_tmu_sched = strategy->disable_general_tmu_sched; + c->disable_tmu_pipelining = strategy->disable_tmu_pipelining; + c->disable_constant_ubo_load_sorting = strategy->disable_ubo_load_sorting; + c->move_buffer_loads = strategy->move_buffer_loads; + c->disable_gcm = strategy->disable_gcm; + c->disable_loop_unrolling = V3D_DBG(NO_LOOP_UNROLL) + ? true : strategy->disable_loop_unrolling; + s = nir_shader_clone(c, s); c->s = s; @@ -590,17 +592,107 @@ type_size_vec4(const struct glsl_type *type, bool bindless) return glsl_count_attribute_slots(type, false); } +static enum nir_lower_tex_packing +lower_tex_packing_cb(const nir_tex_instr *tex, const void *data) +{ + struct v3d_compile *c = (struct v3d_compile *) data; + + int sampler_index = nir_tex_instr_need_sampler(tex) ? + tex->sampler_index : tex->backend_flags; + + assert(sampler_index < c->key->num_samplers_used); + return c->key->sampler[sampler_index].return_size == 16 ? 
+ nir_lower_tex_packing_16 : nir_lower_tex_packing_none; +} + +static bool +v3d_nir_lower_null_pointers_cb(nir_builder *b, + nir_intrinsic_instr *intr, + void *_state) +{ + uint32_t buffer_src_idx; + + switch (intr->intrinsic) { + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_ssbo: + buffer_src_idx = 0; + break; + case nir_intrinsic_store_ssbo: + buffer_src_idx = 1; + break; + default: + return false; + } + + /* If the index is constant we are good */ + nir_src *src = &intr->src[buffer_src_idx]; + if (nir_src_is_const(*src)) + return false; + + /* Otherwise, see if it comes from a bcsel including a null pointer */ + if (src->ssa->parent_instr->type != nir_instr_type_alu) + return false; + + nir_alu_instr *alu = nir_instr_as_alu(src->ssa->parent_instr); + if (alu->op != nir_op_bcsel) + return false; + + /* A null pointer is specified using block index 0xffffffff */ + int32_t null_src_idx = -1; + for (int i = 1; i < 3; i++) { + /* FIXME: since we are running this before optimization maybe + * we need to also handle the case where we may have a bcsel + * chain that we need to recurse into? + */ + if (!nir_src_is_const(alu->src[i].src)) + continue; + if (nir_src_comp_as_uint(alu->src[i].src, 0) != 0xffffffff) + continue; + + /* One of the bcsel srcs is a null pointer reference */ + null_src_idx = i; + break; + } + + if (null_src_idx < 0) + return false; + + assert(null_src_idx == 1 || null_src_idx == 2); + int32_t copy_src_idx = null_src_idx == 1 ? 2 : 1; + + /* Rewrite the null pointer reference so we use the same buffer index + * as the other bcsel branch. This will allow optimization to remove + * the bcsel and we should then end up with a constant buffer index + * like we need. + */ + b->cursor = nir_before_instr(&alu->instr); + nir_def *copy = nir_mov(b, alu->src[copy_src_idx].src.ssa); + nir_src_rewrite(&alu->src[null_src_idx].src, copy); + + return true; +} + +static bool +v3d_nir_lower_null_pointers(nir_shader *s) +{ + return nir_shader_intrinsics_pass(s, v3d_nir_lower_null_pointers_cb, + nir_metadata_block_index | + nir_metadata_dominance, NULL); +} + static void v3d_lower_nir(struct v3d_compile *c) { struct nir_lower_tex_options tex_options = { .lower_txd = true, + .lower_tg4_offsets = true, .lower_tg4_broadcom_swizzle = true, .lower_rect = false, /* XXX: Use this on V3D 3.x */ .lower_txp = ~0, /* Apply swizzles to all samplers. */ .swizzle_result = ~0, + .lower_invalid_implicit_lod = true, }; /* Lower the format swizzle and (for 32-bit returns) @@ -612,38 +704,35 @@ v3d_lower_nir(struct v3d_compile *c) tex_options.swizzles[i][j] = c->key->tex[i].swizzle[j]; } - assert(c->key->num_samplers_used <= ARRAY_SIZE(c->key->sampler)); - for (int i = 0; i < c->key->num_samplers_used; i++) { - if (c->key->sampler[i].return_size == 16) { - tex_options.lower_tex_packing[i] = - nir_lower_tex_packing_16; - } - } - - /* CS textures may not have return_size reflecting the shadow state. 
*/ - nir_foreach_uniform_variable(var, c->s) { - const struct glsl_type *type = glsl_without_array(var->type); - unsigned array_len = MAX2(glsl_get_length(var->type), 1); + tex_options.lower_tex_packing_cb = lower_tex_packing_cb; + tex_options.lower_tex_packing_data = c; - if (!glsl_type_is_sampler(type) || - !glsl_sampler_type_is_shadow(type)) - continue; + NIR_PASS(_, c->s, nir_lower_tex, &tex_options); + NIR_PASS(_, c->s, nir_lower_system_values); - for (int i = 0; i < array_len; i++) { - tex_options.lower_tex_packing[var->data.binding + i] = - nir_lower_tex_packing_16; - } + if (c->s->info.zero_initialize_shared_memory && + c->s->info.shared_size > 0) { + /* All our BOs allocate full pages, so the underlying allocation + * for shared memory will always be a multiple of 4KB. This + * ensures that we can do an exact number of full chunk_size + * writes to initialize the memory independently of the actual + * shared_size used by the shader, which is a requirement of + * the initialization pass. + */ + const unsigned chunk_size = 16; /* max single store size */ + NIR_PASS(_, c->s, nir_zero_initialize_shared_memory, + align(c->s->info.shared_size, chunk_size), chunk_size); } - NIR_PASS_V(c->s, nir_lower_tex, &tex_options); - NIR_PASS_V(c->s, nir_lower_system_values); - NIR_PASS_V(c->s, nir_lower_compute_system_values, NULL); + NIR_PASS(_, c->s, nir_lower_compute_system_values, NULL); - NIR_PASS_V(c->s, nir_lower_vars_to_scratch, - nir_var_function_temp, - 0, - glsl_get_natural_size_align_bytes); - NIR_PASS_V(c->s, v3d_nir_lower_scratch); + NIR_PASS(_, c->s, nir_lower_vars_to_scratch, + nir_var_function_temp, + 0, + glsl_get_natural_size_align_bytes); + NIR_PASS(_, c->s, nir_lower_is_helper_invocation); + NIR_PASS(_, c->s, v3d_nir_lower_scratch); + NIR_PASS(_, c->s, v3d_nir_lower_null_pointers); } static void @@ -711,6 +800,10 @@ v3d_vs_set_prog_data(struct v3d_compile *c, /* Set us up for shared input/output segments. This is apparently * necessary for our VCM setup to avoid varying corruption. + * + * FIXME: initial testing on V3D 7.1 seems to work fine when using + * separate segments. So we could try to reevaluate in the future, if + * there is any advantage of using separate segments. 
*/ prog_data->separate_segments = false; prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size, @@ -807,13 +900,14 @@ v3d_fs_set_prog_data(struct v3d_compile *c, { v3d_set_fs_prog_data_inputs(c, prog_data); prog_data->writes_z = c->writes_z; + prog_data->writes_z_from_fep = c->writes_z_from_fep; prog_data->disable_ez = !c->s->info.fs.early_fragment_tests; prog_data->uses_center_w = c->uses_center_w; prog_data->uses_implicit_point_line_varyings = c->uses_implicit_point_line_varyings; prog_data->lock_scoreboard_on_first_thrsw = c->lock_scoreboard_on_first_thrsw; - prog_data->force_per_sample_msaa = c->force_per_sample_msaa; + prog_data->force_per_sample_msaa = c->s->info.fs.uses_sample_shading; prog_data->uses_pid = c->fs_uses_primitive_id; } @@ -837,8 +931,14 @@ v3d_set_prog_data(struct v3d_compile *c, prog_data->threads = c->threads; prog_data->single_seg = !c->last_thrsw; prog_data->spill_size = c->spill_size; + prog_data->tmu_spills = c->spills; + prog_data->tmu_fills = c->fills; + prog_data->tmu_count = c->tmu.total_count; + prog_data->qpu_read_stalls = c->qpu_inst_stalled_count; + prog_data->compile_strategy_idx = c->compile_strategy_idx; prog_data->tmu_dirty_rcl = c->tmu_dirty_rcl; prog_data->has_control_barrier = c->s->info.uses_control_barrier; + prog_data->has_global_address = c->has_global_address; v3d_set_prog_data_uniforms(c, prog_data); @@ -882,32 +982,32 @@ v3d_nir_lower_vs_early(struct v3d_compile *c) /* Split our I/O vars and dead code eliminate the unused * components. */ - NIR_PASS_V(c->s, nir_lower_io_to_scalar_early, - nir_var_shader_in | nir_var_shader_out); + NIR_PASS(_, c->s, nir_lower_io_to_scalar_early, + nir_var_shader_in | nir_var_shader_out); uint64_t used_outputs[4] = {0}; for (int i = 0; i < c->vs_key->num_used_outputs; i++) { int slot = v3d_slot_get_slot(c->vs_key->used_outputs[i]); int comp = v3d_slot_get_component(c->vs_key->used_outputs[i]); used_outputs[comp] |= 1ull << slot; } - NIR_PASS_V(c->s, nir_remove_unused_io_vars, - nir_var_shader_out, used_outputs, NULL); /* demotes to globals */ - NIR_PASS_V(c->s, nir_lower_global_vars_to_local); + NIR_PASS(_, c->s, nir_remove_unused_io_vars, + nir_var_shader_out, used_outputs, NULL); /* demotes to globals */ + NIR_PASS(_, c->s, nir_lower_global_vars_to_local); v3d_optimize_nir(c, c->s); - NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in, NULL); + NIR_PASS(_, c->s, nir_remove_dead_variables, nir_var_shader_in, NULL); /* This must go before nir_lower_io */ if (c->vs_key->per_vertex_point_size) - NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f); + NIR_PASS(_, c->s, nir_lower_point_size, 1.0f, 0.0f); - NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out, - type_size_vec4, - (nir_lower_io_options)0); + NIR_PASS(_, c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out, + type_size_vec4, + (nir_lower_io_options)0); /* clean up nir_lower_io's deref_var remains and do a constant folding pass * on the code it generated. */ - NIR_PASS_V(c->s, nir_opt_dce); - NIR_PASS_V(c->s, nir_opt_constant_folding); + NIR_PASS(_, c->s, nir_opt_dce); + NIR_PASS(_, c->s, nir_opt_constant_folding); } static void @@ -916,29 +1016,32 @@ v3d_nir_lower_gs_early(struct v3d_compile *c) /* Split our I/O vars and dead code eliminate the unused * components. 
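 * (As the loop below illustrates: a used output at slot S, component C
 * sets bit S of used_outputs[C], and nir_remove_unused_io_vars then
 * demotes and eliminates every output variable left unmarked.)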
*/ - NIR_PASS_V(c->s, nir_lower_io_to_scalar_early, - nir_var_shader_in | nir_var_shader_out); + NIR_PASS(_, c->s, nir_lower_io_to_scalar_early, + nir_var_shader_in | nir_var_shader_out); uint64_t used_outputs[4] = {0}; for (int i = 0; i < c->gs_key->num_used_outputs; i++) { int slot = v3d_slot_get_slot(c->gs_key->used_outputs[i]); int comp = v3d_slot_get_component(c->gs_key->used_outputs[i]); used_outputs[comp] |= 1ull << slot; } - NIR_PASS_V(c->s, nir_remove_unused_io_vars, - nir_var_shader_out, used_outputs, NULL); /* demotes to globals */ - NIR_PASS_V(c->s, nir_lower_global_vars_to_local); + NIR_PASS(_, c->s, nir_remove_unused_io_vars, + nir_var_shader_out, used_outputs, NULL); /* demotes to globals */ + NIR_PASS(_, c->s, nir_lower_global_vars_to_local); v3d_optimize_nir(c, c->s); - NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in, NULL); + NIR_PASS(_, c->s, nir_remove_dead_variables, nir_var_shader_in, NULL); /* This must go before nir_lower_io */ if (c->gs_key->per_vertex_point_size) - NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f); + NIR_PASS(_, c->s, nir_lower_point_size, 1.0f, 0.0f); - NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out, - type_size_vec4, - (nir_lower_io_options)0); - /* clean up nir_lower_io's deref_var remains */ - NIR_PASS_V(c->s, nir_opt_dce); + NIR_PASS(_, c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out, + type_size_vec4, + (nir_lower_io_options)0); + /* clean up nir_lower_io's deref_var remains and do a constant folding pass + * on the code it generated. + */ + NIR_PASS(_, c->s, nir_opt_dce); + NIR_PASS(_, c->s, nir_opt_constant_folding); } static void @@ -977,11 +1080,11 @@ v3d_nir_lower_fs_early(struct v3d_compile *c) if (c->fs_key->int_color_rb || c->fs_key->uint_color_rb) v3d_fixup_fs_output_types(c); - NIR_PASS_V(c->s, v3d_nir_lower_logic_ops, c); + NIR_PASS(_, c->s, v3d_nir_lower_logic_ops, c); if (c->fs_key->line_smoothing) { - v3d_nir_lower_line_smooth(c->s); - NIR_PASS_V(c->s, nir_lower_global_vars_to_local); + NIR_PASS(_, c->s, v3d_nir_lower_line_smooth); + NIR_PASS(_, c->s, nir_lower_global_vars_to_local); /* The lowering pass can introduce new sysval reads */ nir_shader_gather_info(c->s, nir_shader_get_entrypoint(c->s)); } @@ -991,26 +1094,26 @@ static void v3d_nir_lower_gs_late(struct v3d_compile *c) { if (c->key->ucp_enables) { - NIR_PASS_V(c->s, nir_lower_clip_gs, c->key->ucp_enables, - false, NULL); + NIR_PASS(_, c->s, nir_lower_clip_gs, c->key->ucp_enables, + true, NULL); } /* Note: GS output scalarizing must happen after nir_lower_clip_gs. */ - NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out); + NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL); } static void v3d_nir_lower_vs_late(struct v3d_compile *c) { if (c->key->ucp_enables) { - NIR_PASS_V(c->s, nir_lower_clip_vs, c->key->ucp_enables, - false, false, NULL); + NIR_PASS(_, c->s, nir_lower_clip_vs, c->key->ucp_enables, + false, true, NULL); NIR_PASS_V(c->s, nir_lower_io_to_scalar, - nir_var_shader_out); + nir_var_shader_out, NULL, NULL); } /* Note: VS output scalarizing must happen after nir_lower_clip_vs. */ - NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out); + NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL); } static void @@ -1024,9 +1127,9 @@ v3d_nir_lower_fs_late(struct v3d_compile *c) * are using. 
*/ if (c->key->ucp_enables) - NIR_PASS_V(c->s, nir_lower_clip_fs, c->key->ucp_enables, true); + NIR_PASS(_, c->s, nir_lower_clip_fs, c->key->ucp_enables, true); - NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in); + NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in, NULL, NULL); } static uint32_t @@ -1107,6 +1210,69 @@ v3d_intrinsic_dependency_cb(nir_intrinsic_instr *intr, return false; } +static unsigned +v3d_instr_delay_cb(nir_instr *instr, void *data) +{ + struct v3d_compile *c = (struct v3d_compile *) data; + + switch (instr->type) { + case nir_instr_type_undef: + case nir_instr_type_load_const: + case nir_instr_type_alu: + case nir_instr_type_deref: + case nir_instr_type_jump: + case nir_instr_type_parallel_copy: + case nir_instr_type_call: + case nir_instr_type_phi: + return 1; + + /* We should not use very large delays for TMU instructions. Typically, + * thread switches will be sufficient to hide all or most of the latency, + * so we typically only need a little bit of extra room. If we over-estimate + * the latency here we may end up unnecessarily delaying the critical path in + * the shader, which would have a negative effect in performance, so here + * we are trying to strike a balance based on empirical testing. + */ + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (!c->disable_general_tmu_sched) { + switch (intr->intrinsic) { + case nir_intrinsic_decl_reg: + case nir_intrinsic_load_reg: + case nir_intrinsic_store_reg: + return 0; + case nir_intrinsic_load_ssbo: + case nir_intrinsic_load_scratch: + case nir_intrinsic_load_shared: + case nir_intrinsic_image_load: + return 3; + case nir_intrinsic_load_ubo: + if (nir_src_is_divergent(intr->src[1])) + return 3; + FALLTHROUGH; + default: + return 1; + } + } else { + switch (intr->intrinsic) { + case nir_intrinsic_decl_reg: + case nir_intrinsic_load_reg: + case nir_intrinsic_store_reg: + return 0; + default: + return 1; + } + } + break; + } + + case nir_instr_type_tex: + return 5; + } + + return 0; +} + static bool should_split_wrmask(const nir_instr *instr, const void *data) { @@ -1197,7 +1363,7 @@ v3d_nir_sort_constant_ubo_load(nir_block *block, nir_intrinsic_instr *ref) * reference offset, since otherwise we would not be able to * skip the unifa write for them. See ntq_emit_load_ubo_unifa. 
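 * For instance (offsets hypothetical): once unifa has been pointed at
 * offset 0 of a UBO, a constant load at offset 8 that is within
 * MAX_UNIFA_SKIP_DISTANCE can be serviced by issuing extra ldunifa reads
 * (each advances the stream by 4 bytes) instead of paying for another
 * unifa write and its delay slots.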
*/ - if (abs(ref_offset - offset) > MAX_UNIFA_SKIP_DISTANCE) + if (abs((int)(ref_offset - offset)) > MAX_UNIFA_SKIP_DISTANCE) continue; /* We will move this load if its offset is smaller than ref's @@ -1349,16 +1515,14 @@ v3d_nir_sort_constant_ubo_loads_block(struct v3d_compile *c, static bool v3d_nir_sort_constant_ubo_loads(nir_shader *s, struct v3d_compile *c) { - nir_foreach_function(function, s) { - if (function->impl) { - nir_foreach_block(block, function->impl) { - c->sorted_any_ubo_loads |= - v3d_nir_sort_constant_ubo_loads_block(c, block); - } - nir_metadata_preserve(function->impl, - nir_metadata_block_index | - nir_metadata_dominance); + nir_foreach_function_impl(impl, s) { + nir_foreach_block(block, impl) { + c->sorted_any_ubo_loads |= + v3d_nir_sort_constant_ubo_loads_block(c, block); } + nir_metadata_preserve(impl, + nir_metadata_block_index | + nir_metadata_dominance); } return c->sorted_any_ubo_loads; } @@ -1376,8 +1540,8 @@ lower_load_num_subgroups(struct v3d_compile *c, DIV_ROUND_UP(c->s->info.workgroup_size[0] * c->s->info.workgroup_size[1] * c->s->info.workgroup_size[2], V3D_CHANNELS); - nir_ssa_def *result = nir_imm_int(b, num_subgroups); - nir_ssa_def_rewrite_uses(&intr->dest.ssa, result); + nir_def *result = nir_imm_int(b, num_subgroups); + nir_def_rewrite_uses(&intr->def, result); nir_instr_remove(&intr->instr); } @@ -1404,6 +1568,36 @@ lower_subgroup_intrinsics(struct v3d_compile *c, case nir_intrinsic_load_subgroup_size: case nir_intrinsic_load_subgroup_invocation: case nir_intrinsic_elect: + case nir_intrinsic_ballot: + case nir_intrinsic_inverse_ballot: + case nir_intrinsic_ballot_bitfield_extract: + case nir_intrinsic_ballot_bit_count_reduce: + case nir_intrinsic_ballot_find_lsb: + case nir_intrinsic_ballot_find_msb: + case nir_intrinsic_ballot_bit_count_exclusive: + case nir_intrinsic_ballot_bit_count_inclusive: + case nir_intrinsic_reduce: + case nir_intrinsic_inclusive_scan: + case nir_intrinsic_exclusive_scan: + case nir_intrinsic_read_invocation: + case nir_intrinsic_read_first_invocation: + case nir_intrinsic_load_subgroup_eq_mask: + case nir_intrinsic_load_subgroup_ge_mask: + case nir_intrinsic_load_subgroup_gt_mask: + case nir_intrinsic_load_subgroup_le_mask: + case nir_intrinsic_load_subgroup_lt_mask: + case nir_intrinsic_shuffle: + case nir_intrinsic_shuffle_xor: + case nir_intrinsic_shuffle_up: + case nir_intrinsic_shuffle_down: + case nir_intrinsic_vote_all: + case nir_intrinsic_vote_any: + case nir_intrinsic_vote_feq: + case nir_intrinsic_vote_ieq: + case nir_intrinsic_quad_broadcast: + case nir_intrinsic_quad_swap_horizontal: + case nir_intrinsic_quad_swap_vertical: + case nir_intrinsic_quad_swap_diagonal: c->has_subgroups = true; break; default: @@ -1418,18 +1612,15 @@ static bool v3d_nir_lower_subgroup_intrinsics(nir_shader *s, struct v3d_compile *c) { bool progress = false; - nir_foreach_function(function, s) { - if (function->impl) { - nir_builder b; - nir_builder_init(&b, function->impl); + nir_foreach_function_impl(impl, s) { + nir_builder b = nir_builder_create(impl); - nir_foreach_block(block, function->impl) - progress |= lower_subgroup_intrinsics(c, block, &b); + nir_foreach_block(block, impl) + progress |= lower_subgroup_intrinsics(c, block, &b); - nir_metadata_preserve(function->impl, - nir_metadata_block_index | - nir_metadata_dominance); - } + nir_metadata_preserve(impl, + nir_metadata_block_index | + nir_metadata_dominance); } return progress; } @@ -1483,30 +1674,54 @@ v3d_attempt_compile(struct v3d_compile *c) break; } - 
NIR_PASS_V(c->s, v3d_nir_lower_io, c); - NIR_PASS_V(c->s, v3d_nir_lower_txf_ms, c); - NIR_PASS_V(c->s, v3d_nir_lower_image_load_store); + NIR_PASS(_, c->s, v3d_nir_lower_io, c); + NIR_PASS(_, c->s, v3d_nir_lower_txf_ms); + NIR_PASS(_, c->s, v3d_nir_lower_image_load_store, c); + + NIR_PASS(_, c->s, nir_opt_idiv_const, 8); nir_lower_idiv_options idiv_options = { - .imprecise_32bit_lowering = true, .allow_fp16 = true, }; - NIR_PASS_V(c->s, nir_lower_idiv, &idiv_options); - - if (c->key->robust_buffer_access) { - /* v3d_nir_lower_robust_buffer_access assumes constant buffer - * indices on ubo/ssbo intrinsics so run copy propagation and - * constant folding passes before we run the lowering to warrant - * this. We also want to run the lowering before v3d_optimize to - * clean-up redundant get_buffer_size calls produced in the pass. - */ - NIR_PASS_V(c->s, nir_copy_prop); - NIR_PASS_V(c->s, nir_opt_constant_folding); - NIR_PASS_V(c->s, v3d_nir_lower_robust_buffer_access, c); + NIR_PASS(_, c->s, nir_lower_idiv, &idiv_options); + NIR_PASS(_, c->s, nir_lower_alu); + + if (c->key->robust_uniform_access || c->key->robust_storage_access || + c->key->robust_image_access) { + /* nir_lower_robust_access assumes constant buffer + * indices on ubo/ssbo intrinsics so run copy propagation and + * constant folding passes before we run the lowering to warrant + * this. We also want to run the lowering before v3d_optimize to + * clean-up redundant get_buffer_size calls produced in the pass. + */ + NIR_PASS(_, c->s, nir_copy_prop); + NIR_PASS(_, c->s, nir_opt_constant_folding); + + nir_lower_robust_access_options opts = { + .lower_image = c->key->robust_image_access, + .lower_ssbo = c->key->robust_storage_access, + .lower_ubo = c->key->robust_uniform_access, + }; + + NIR_PASS(_, c->s, nir_lower_robust_access, &opts); } - NIR_PASS_V(c->s, nir_lower_wrmasks, should_split_wrmask, c->s); + NIR_PASS(_, c->s, nir_lower_wrmasks, should_split_wrmask, c->s); - NIR_PASS_V(c->s, v3d_nir_lower_subgroup_intrinsics, c); + NIR_PASS(_, c->s, v3d_nir_lower_load_store_bitsize); + + NIR_PASS(_, c->s, v3d_nir_lower_subgroup_intrinsics, c); + + const nir_lower_subgroups_options subgroup_opts = { + .subgroup_size = V3D_CHANNELS, + .ballot_components = 1, + .ballot_bit_size = 32, + .lower_to_scalar = true, + .lower_inverse_ballot = true, + .lower_subgroup_masks = true, + .lower_relative_shuffle = true, + .lower_quad = true, + }; + NIR_PASS(_, c->s, nir_lower_subgroups, &subgroup_opts); v3d_optimize_nir(c, c->s); @@ -1519,25 +1734,25 @@ v3d_attempt_compile(struct v3d_compile *c) while (more_late_algebraic) { more_late_algebraic = false; NIR_PASS(more_late_algebraic, c->s, nir_opt_algebraic_late); - NIR_PASS_V(c->s, nir_opt_constant_folding); - NIR_PASS_V(c->s, nir_copy_prop); - NIR_PASS_V(c->s, nir_opt_dce); - NIR_PASS_V(c->s, nir_opt_cse); + NIR_PASS(_, c->s, nir_opt_constant_folding); + NIR_PASS(_, c->s, nir_copy_prop); + NIR_PASS(_, c->s, nir_opt_dce); + NIR_PASS(_, c->s, nir_opt_cse); } - NIR_PASS_V(c->s, nir_lower_bool_to_int32); - nir_convert_to_lcssa(c->s, true, true); + NIR_PASS(_, c->s, nir_lower_bool_to_int32); + NIR_PASS(_, c->s, nir_convert_to_lcssa, true, true); NIR_PASS_V(c->s, nir_divergence_analysis); - NIR_PASS_V(c->s, nir_convert_from_ssa, true); + NIR_PASS(_, c->s, nir_convert_from_ssa, true); struct nir_schedule_options schedule_options = { /* Schedule for about half our register space, to enable more * shaders to hit 4 threads. */ - .threshold = 24, + .threshold = c->threads == 4 ? 
24 : 48, /* Vertex shaders share the same memory for inputs and outputs, - * fragement and geometry shaders do not. + * fragment and geometry shaders do not. */ .stages_with_shared_io_memory = (((1 << MESA_ALL_SHADER_STAGES) - 1) & @@ -1548,11 +1763,22 @@ v3d_attempt_compile(struct v3d_compile *c) .intrinsic_cb = v3d_intrinsic_dependency_cb, .intrinsic_cb_data = c, + + .instr_delay_cb = v3d_instr_delay_cb, + .instr_delay_cb_data = c, }; NIR_PASS_V(c->s, nir_schedule, &schedule_options); if (!c->disable_constant_ubo_load_sorting) - NIR_PASS_V(c->s, v3d_nir_sort_constant_ubo_loads, c); + NIR_PASS(_, c->s, v3d_nir_sort_constant_ubo_loads, c); + + const nir_move_options buffer_opts = c->move_buffer_loads ? + (nir_move_load_ubo | nir_move_load_ssbo) : 0; + NIR_PASS(_, c->s, nir_opt_move, nir_move_load_uniform | + nir_move_const_undef | + buffer_opts); + + NIR_PASS_V(c->s, nir_trivialize_registers); v3d_nir_to_vir(c); } @@ -1611,32 +1837,28 @@ int v3d_shaderdb_dump(struct v3d_compile *c, * register allocation to any particular thread count). This is fine * because v3d_nir_to_vir will cap this to the actual minimum. */ -struct v3d_compiler_strategy { - const char *name; - uint32_t max_threads; - uint32_t min_threads; - bool disable_loop_unrolling; - bool disable_ubo_load_sorting; - bool disable_tmu_pipelining; - bool tmu_spilling_allowed; -} static const strategies[] = { - /*0*/ { "default", 4, 4, false, false, false, false }, - /*1*/ { "disable loop unrolling", 4, 4, true, false, false, false }, - /*2*/ { "disable UBO load sorting", 4, 4, true, true, false, false }, - /*3*/ { "disable TMU pipelining", 4, 4, true, true, true, false }, - /*4*/ { "lower thread count", 2, 1, false, false, false, false }, - /*5*/ { "disable loop unrolling (ltc)", 2, 1, true, false, false, false }, - /*6*/ { "disable UBO load sorting (ltc)", 2, 1, true, true, false, false }, - /*7*/ { "disable TMU pipelining (ltc)", 2, 1, true, true, true, true }, - /*8*/ { "fallback scheduler", 2, 1, true, true, true, true } +static const struct v3d_compiler_strategy strategies[] = { + /*0*/ { "default", 4, 4, false, false, false, false, false, false, 0 }, + /*1*/ { "disable general TMU sched", 4, 4, true, false, false, false, false, false, 0 }, + /*2*/ { "disable gcm", 4, 4, true, true, false, false, false, false, 0 }, + /*3*/ { "disable loop unrolling", 4, 4, true, true, true, false, false, false, 0 }, + /*4*/ { "disable UBO load sorting", 4, 4, true, true, true, true, false, false, 0 }, + /*5*/ { "disable TMU pipelining", 4, 4, true, true, true, true, false, true, 0 }, + /*6*/ { "lower thread count", 2, 1, false, false, false, false, false, false, -1 }, + /*7*/ { "disable general TMU sched (2t)", 2, 1, true, false, false, false, false, false, -1 }, + /*8*/ { "disable gcm (2t)", 2, 1, true, true, false, false, false, false, -1 }, + /*9*/ { "disable loop unrolling (2t)", 2, 1, true, true, true, false, false, false, -1 }, + /*10*/ { "Move buffer loads (2t)", 2, 1, true, true, true, true, true, false, -1 }, + /*11*/ { "disable TMU pipelining (2t)", 2, 1, true, true, true, true, true, true, -1 }, + /*12*/ { "fallback scheduler", 2, 1, true, true, true, true, true, true, -1 } }; /** * If a particular optimization didn't make any progress during a compile - * attempt disabling it alone won't allow us to compile the shader successfuly, + * attempt disabling it alone won't allow us to compile the shader successfully, * since we'll end up with the same code. Detect these scenarios so we can * avoid wasting time with useless compiles. 
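 * For example, if the default strategy didn't manage to unroll any loop,
 * a fallback whose only difference is disabling loop unrolling would
 * produce identical code, so it is pointless to try it (see the per-case
 * checks below).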
We should also consider if the * strategy changes other aspects of the compilation process though, like * spilling, and not skip it in that case. */ static bool @@ -1649,31 +1871,55 @@ skip_compile_strategy(struct v3d_compile *c, uint32_t idx) assert(idx > 0); /* Don't skip a strategy that changes spilling behavior */ - if (strategies[idx].tmu_spilling_allowed != - strategies[idx - 1].tmu_spilling_allowed) { + if (strategies[idx].max_tmu_spills != + strategies[idx - 1].max_tmu_spills) { return false; } switch (idx) { - /* Loop unrolling: skip if we didn't unroll any loops */ + /* General TMU sched.: skip if we didn't emit any TMU loads */ case 1: - case 5: + case 7: + return !c->has_general_tmu_load; + /* Global code motion: skip if nir_opt_gcm didn't make any progress */ + case 2: + case 8: + return !c->gcm_progress; + /* Loop unrolling: skip if we didn't unroll any loops */ + case 3: + case 9: return !c->unrolled_any_loops; /* UBO load sorting: skip if we didn't sort any loads */ - case 2: - case 6: + case 4: return !c->sorted_any_ubo_loads; + /* Move buffer loads: we assume any shader with difficult RA + * most likely has UBO / SSBO loads so we never try to skip. + * For now, we only try this for 2-thread compiles since it + * is expected to impact instruction counts and latency. + */ + case 10: + assert(c->threads < 4); + return false; /* TMU pipelining: skip if we didn't pipeline any TMU ops */ - case 3: - case 7: + case 5: + case 11: return !c->pipelined_any_tmu; /* Lower thread count: skip if we already tried less than 4 threads */ - case 4: + case 6: return c->threads < 4; default: return false; }; } + +static inline void +set_best_compile(struct v3d_compile **best, struct v3d_compile *c) +{ + if (*best) + vir_compile_destroy(*best); + *best = c; +} + uint64_t *v3d_compile(const struct v3d_compiler *compiler, struct v3d_key *key, struct v3d_prog_data **out_prog_data, @@ -1685,58 +1931,106 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler, uint32_t *final_assembly_size) { struct v3d_compile *c = NULL; - for (int i = 0; i < ARRAY_SIZE(strategies); i++) { + + uint32_t best_spill_fill_count = UINT32_MAX; + struct v3d_compile *best_c = NULL; + for (int32_t strat = 0; strat < ARRAY_SIZE(strategies); strat++) { /* Fallback strategy */ - if (i > 0) { + if (strat > 0) { assert(c); - if (skip_compile_strategy(c, i)) + if (skip_compile_strategy(c, strat)) continue; char *debug_msg; int ret = asprintf(&debug_msg, - "Falling back to strategy '%s' for %s", - strategies[i].name, - vir_get_stage_name(c)); + "Falling back to strategy '%s' " + "for %s prog %d/%d", + strategies[strat].name, + vir_get_stage_name(c), + c->program_id, c->variant_id); if (ret >= 0) { - if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF)) + if (V3D_DBG(PERF)) fprintf(stderr, "%s\n", debug_msg); c->debug_output(debug_msg, c->debug_output_data); free(debug_msg); } - vir_compile_destroy(c); + if (c != best_c) + vir_compile_destroy(c); } c = vir_compile_init(compiler, key, s, debug_output, debug_output_data, program_id, variant_id, - strategies[i].max_threads, - strategies[i].min_threads, - strategies[i].tmu_spilling_allowed, - strategies[i].disable_loop_unrolling, - strategies[i].disable_ubo_load_sorting, - strategies[i].disable_tmu_pipelining, - i == ARRAY_SIZE(strategies) - 1); + strat, &strategies[strat], + strat == ARRAY_SIZE(strategies) - 1); v3d_attempt_compile(c); - if (i >= ARRAY_SIZE(strategies) - 1 || - c->compilation_result != - 
V3D_COMPILATION_FAILED_REGISTER_ALLOCATION) { + /* Broken shader or driver bug */ + if (c->compilation_result == V3D_COMPILATION_FAILED) break; + + /* If we compiled without spills, choose this. + * Otherwise if this is a 4-thread compile, choose this (these + * have a very low cap on the allowed TMU spills so we assume + * it will be better than a 2-thread compile without spills). + * Otherwise, keep going while tracking the strategy with the + * lowest spill count. + */ + if (c->compilation_result == V3D_COMPILATION_SUCCEEDED) { + if (c->spills == 0 || + strategies[strat].min_threads == 4 || + V3D_DBG(OPT_COMPILE_TIME)) { + set_best_compile(&best_c, c); + break; + } else if (c->spills + c->fills < + best_spill_fill_count) { + set_best_compile(&best_c, c); + best_spill_fill_count = c->spills + c->fills; + } + + if (V3D_DBG(PERF)) { + char *debug_msg; + int ret = asprintf(&debug_msg, + "Compiled %s prog %d/%d with %d " + "spills and %d fills. Will try " + "more strategies.", + vir_get_stage_name(c), + c->program_id, c->variant_id, + c->spills, c->fills); + if (ret >= 0) { + fprintf(stderr, "%s\n", debug_msg); + c->debug_output(debug_msg, c->debug_output_data); + free(debug_msg); + } + } } + + /* Only try next strategy if we failed to register allocate + * or we had to spill. + */ + assert(c->compilation_result == + V3D_COMPILATION_FAILED_REGISTER_ALLOCATION || + c->spills > 0); } - if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF) && + /* If the best strategy was not the last, choose that */ + if (best_c && c != best_c) + set_best_compile(&c, best_c); + + if (V3D_DBG(PERF) && c->compilation_result != V3D_COMPILATION_FAILED_REGISTER_ALLOCATION && c->spills > 0) { char *debug_msg; int ret = asprintf(&debug_msg, - "Compiled %s with %d spills and %d fills", + "Compiled %s prog %d/%d with %d " + "spills and %d fills", vir_get_stage_name(c), + c->program_id, c->variant_id, c->spills, c->fills); fprintf(stderr, "%s\n", debug_msg); @@ -1747,8 +2041,12 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler, } if (c->compilation_result != V3D_COMPILATION_SUCCEEDED) { - fprintf(stderr, "Failed to compile %s with any strategy.\n", - vir_get_stage_name(c)); + fprintf(stderr, "Failed to compile %s prog %d/%d " + "with any strategy.\n", + vir_get_stage_name(c), c->program_id, c->variant_id); + + vir_compile_destroy(c); + return NULL; } struct v3d_prog_data *prog_data; @@ -1762,8 +2060,8 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler, char *shaderdb; int ret = v3d_shaderdb_dump(c, &shaderdb); if (ret >= 0) { - if (V3D_DEBUG & V3D_DEBUG_SHADERDB) - fprintf(stderr, "SHADER-DB: %s\n", shaderdb); + if (V3D_DBG(SHADERDB)) + fprintf(stderr, "SHADER-DB-%s - %s\n", s->info.name, shaderdb); c->debug_output(shaderdb, c->debug_output_data); free(shaderdb); @@ -1872,8 +2170,11 @@ try_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif) struct qinst *prev_inst = NULL; assert(c->cur_block); -#ifdef DEBUG - /* Check if the current instruction is part of the current block */ +#if MESA_DEBUG + /* We can only reuse a uniform if it was emitted in the same block, + * so callers must make sure the current instruction is being emitted + * in the current block. 
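+ *
+ * For illustration (VIR sketch, temp names invented): given
+ *
+ *    ldunif.idx t5
+ *    ...
+ *    ldunif.idx t9   <- about to be emitted
+ *
+ * the second load of the same uniform index can be dropped and t5
+ * reused, provided t5 is a temp written in this block and nothing
+ * overwrites it in between (the checks below).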
+ */ bool found = false; vir_for_each_inst(inst, c->cur_block) { if (&inst->link == c->cursor.link) { @@ -1882,7 +2183,7 @@ try_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif) } } - assert(found || list_is_empty(&c->cur_block->instructions)); + assert(found || &c->cur_block->instructions == c->cursor.link); #endif list_for_each_entry_from_rev(struct qinst, inst, c->cursor.link->prev, @@ -1900,6 +2201,12 @@ try_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif) if (!prev_inst) return false; + /* Only reuse the ldunif result if it was written to a temp register, + * otherwise there may be special restrictions (for example, ldunif + * may write directly to unifa, which is a write-only register). + */ + if (prev_inst->dst.file != QFILE_TEMP) + return false; list_for_each_entry_from(struct qinst, inst, prev_inst->link.next, &c->cur_block->instructions, link) { diff --git a/src/broadcom/compiler/vir_dump.c b/src/broadcom/compiler/vir_dump.c index 5c47bbdc1b0..631eeee52ab 100644 --- a/src/broadcom/compiler/vir_dump.c +++ b/src/broadcom/compiler/vir_dump.c @@ -182,11 +182,6 @@ vir_print_reg(struct v3d_compile *c, const struct qinst *inst, break; } - case QFILE_VPM: - fprintf(stderr, "vpm%d.%d", - reg.index / 4, reg.index % 4); - break; - case QFILE_TEMP: fprintf(stderr, "t%d", reg.index); break; @@ -197,9 +192,6 @@ static void vir_dump_sig_addr(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *instr) { - if (devinfo->ver < 41) - return; - if (!instr->sig_magic) fprintf(stderr, ".rf%d", instr->sig_addr); else { @@ -270,8 +262,8 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst) vir_print_reg(c, inst, inst->dst); fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.add.output_pack)); - unpack[0] = instr->alu.add.a_unpack; - unpack[1] = instr->alu.add.b_unpack; + unpack[0] = instr->alu.add.a.unpack; + unpack[1] = instr->alu.add.b.unpack; } else { fprintf(stderr, "%s", v3d_qpu_mul_op_name(instr->alu.mul.op)); fprintf(stderr, "%s", v3d_qpu_cond_name(instr->flags.mc)); @@ -282,8 +274,8 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst) vir_print_reg(c, inst, inst->dst); fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.mul.output_pack)); - unpack[0] = instr->alu.mul.a_unpack; - unpack[1] = instr->alu.mul.b_unpack; + unpack[0] = instr->alu.mul.a.unpack; + unpack[1] = instr->alu.mul.b.unpack; } for (int i = 0; i < nsrc; i++) { diff --git a/src/broadcom/compiler/vir_live_variables.c b/src/broadcom/compiler/vir_live_variables.c index 2fd6430a0f4..d1f44aa9cf7 100644 --- a/src/broadcom/compiler/vir_live_variables.c +++ b/src/broadcom/compiler/vir_live_variables.c @@ -179,17 +179,22 @@ vir_setup_def_use(struct v3d_compile *c) flags_inst = NULL; } - /* Payload registers: r0/1/2 contain W, centroid W, - * and Z at program start. Register allocation will - * force their nodes to R0/1/2. + /* Payload registers: for fragment shaders, W, + * centroid W, and Z will be initialized in r0/1/2 + * until v42, or r1/r2/r3 since v71. + * + * For compute shaders, payload is in r0/r2 up to v42, + * r2/r3 since v71. + * + * Register allocation will force their nodes to those + * registers. */ if (inst->src[0].file == QFILE_REG) { - switch (inst->src[0].index) { - case 0: - case 1: - case 2: + uint32_t min_payload_r = c->devinfo->ver >= 71 ? 1 : 0; + uint32_t max_payload_r = c->devinfo->ver >= 71 ? 
3 : 2;
+ if (inst->src[0].index >= min_payload_r &&
+ inst->src[0].index <= max_payload_r) {
 c->temp_start[inst->dst.index] = 0;
- break;
 }
 }
@@ -306,6 +311,8 @@ vir_calculate_live_intervals(struct v3d_compile *c)
 vir_for_each_block(block, c) {
 ralloc_free(block->def);
+ ralloc_free(block->defin);
+ ralloc_free(block->defout);
 ralloc_free(block->use);
 ralloc_free(block->live_in);
 ralloc_free(block->live_out);
diff --git a/src/broadcom/compiler/vir_opt_constant_alu.c b/src/broadcom/compiler/vir_opt_constant_alu.c
index 483646f882e..dc4c8a65026 100644
--- a/src/broadcom/compiler/vir_opt_constant_alu.c
+++ b/src/broadcom/compiler/vir_opt_constant_alu.c
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2021 Raspberry Pi
+ * Copyright © 2021 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -155,6 +155,7 @@ vir_opt_constant_alu(struct v3d_compile *c) {
 bool progress = false;
 vir_for_each_block(block, c) {
+ c->cur_block = block;
 vir_for_each_inst_safe(inst, block) {
 progress = try_opt_constant_alu(c, inst) || progress;
 }
diff --git a/src/broadcom/compiler/vir_opt_copy_propagate.c b/src/broadcom/compiler/vir_opt_copy_propagate.c
index c5bb6112173..611c4693ed3 100644
--- a/src/broadcom/compiler/vir_opt_copy_propagate.c
+++ b/src/broadcom/compiler/vir_opt_copy_propagate.c
@@ -35,7 +35,7 @@
 #include "v3d_compiler.h"
 static bool
-is_copy_mov(struct qinst *inst)
+is_copy_mov(const struct v3d_device_info *devinfo, struct qinst *inst)
 {
 if (!inst)
 return false;
@@ -62,36 +62,65 @@ is_copy_mov(struct qinst *inst)
 return false;
 }
- switch (inst->src[0].file) {
- case QFILE_MAGIC:
- /* No copy propagating from R3/R4/R5 -- the MOVs from those
- * are there to register allocate values produced into R3/4/5
- * to other regs (though hopefully r3/4/5).
- */
- switch (inst->src[0].index) {
- case V3D_QPU_WADDR_R3:
- case V3D_QPU_WADDR_R4:
- case V3D_QPU_WADDR_R5:
- return false;
+ if (devinfo->ver == 42) {
+ switch (inst->src[0].file) {
+ case QFILE_MAGIC:
+ /* No copy propagating from R3/R4/R5 -- the MOVs from
+ * those are there to register allocate values produced
+ * into R3/4/5 to other regs (though hopefully r3/4/5).
+ */
+ switch (inst->src[0].index) {
+ case V3D_QPU_WADDR_R3:
+ case V3D_QPU_WADDR_R4:
+ case V3D_QPU_WADDR_R5:
+ return false;
+ default:
+ break;
+ }
+ break;
+
+ case QFILE_REG:
+ switch (inst->src[0].index) {
+ case 0:
+ case 1:
+ case 2:
+ /* MOVs from rf0/1/2 are only to track the live
+ * intervals for W/centroid W/Z.
+ */
+ return false;
+ }
+ break;
+
 default:
 break;
 }
- break;
-
- case QFILE_REG:
- switch (inst->src[0].index) {
- case 0:
- case 1:
- case 2:
- /* MOVs from rf0/1/2 are only to track the live
+ } else {
+ assert(devinfo->ver >= 71);
+ switch (inst->src[0].file) {
+ case QFILE_REG:
+ switch (inst->src[0].index) {
+ /* MOVs from rf1/2/3 are only to track the live
 * intervals for W/centroid W/Z.
+ *
+ * Note: rf0 can be implicitly written by ldvary
+ * (no temp involved), so it is not an SSA value and
+ * could clash with writes to other temps that are
+ * also allocated to rf0. In theory, that would mean
+ * that we can't copy propagate from it, but we handle
+ * this at register allocation time, preventing temps
+ * from being allocated to rf0 while the rf0 value from
+ * ldvary is still live.
*/ - return false; - } - break; + case 1: + case 2: + case 3: + return false; + } + break; - default: - break; + default: + break; + } } return true; @@ -104,14 +133,14 @@ vir_has_unpack(struct qinst *inst, int chan) if (vir_is_add(inst)) { if (chan == 0) - return inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE; + return inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE; else - return inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE; + return inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE; } else { if (chan == 0) - return inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE; + return inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE; else - return inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE; + return inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE; } } @@ -135,7 +164,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs) */ struct qinst *mov = movs[inst->src[i].index]; if (!mov) { - if (!is_copy_mov(c->defs[inst->src[i].index])) + if (!is_copy_mov(c->devinfo, c->defs[inst->src[i].index])) continue; mov = c->defs[inst->src[i].index]; @@ -161,7 +190,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs) continue; /* these ops can't represent abs. */ - if (mov->qpu.alu.mul.a_unpack == V3D_QPU_UNPACK_ABS) { + if (mov->qpu.alu.mul.a.unpack == V3D_QPU_UNPACK_ABS) { switch (inst->qpu.alu.add.op) { case V3D_QPU_A_VFPACK: case V3D_QPU_A_FROUND: @@ -189,7 +218,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs) inst->src[i] = mov->src[0]; if (vir_has_unpack(mov, 0)) { - enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a_unpack; + enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a.unpack; vir_set_unpack(inst, i, unpack); } @@ -238,12 +267,14 @@ vir_opt_copy_propagate(struct v3d_compile *c) */ memset(movs, 0, sizeof(struct qinst *) * c->num_temps); + c->cur_block = block; vir_for_each_inst(inst, block) { + progress = try_copy_prop(c, inst, movs) || progress; apply_kills(c, movs, inst); - if (is_copy_mov(inst)) + if (is_copy_mov(c->devinfo, inst)) movs[inst->dst.index] = inst; } } diff --git a/src/broadcom/compiler/vir_opt_dead_code.c b/src/broadcom/compiler/vir_opt_dead_code.c index 64c762c88db..fd1af944427 100644 --- a/src/broadcom/compiler/vir_opt_dead_code.c +++ b/src/broadcom/compiler/vir_opt_dead_code.c @@ -52,21 +52,10 @@ dce(struct v3d_compile *c, struct qinst *inst) } static bool -has_nonremovable_reads(struct v3d_compile *c, struct qinst *inst) -{ - for (int i = 0; i < vir_get_nsrc(inst); i++) { - if (inst->src[i].file == QFILE_VPM) - return true; - } - - return false; -} - -static bool can_write_to_null(struct v3d_compile *c, struct qinst *inst) { /* The SFU instructions must write to a physical register. 
*/ - if (c->devinfo->ver >= 41 && v3d_qpu_uses_sfu(&inst->qpu)) + if (v3d_qpu_uses_sfu(&inst->qpu)) return false; return true; @@ -149,30 +138,25 @@ check_first_ldunifa(struct v3d_compile *c, } static bool -increment_unifa_address(struct v3d_compile *c, struct qblock *block, struct qinst *unifa) +increment_unifa_address(struct v3d_compile *c, struct qinst *unifa) { - struct qblock *current_block = c->cur_block; if (unifa->qpu.type == V3D_QPU_INSTR_TYPE_ALU && unifa->qpu.alu.mul.op == V3D_QPU_M_MOV) { c->cursor = vir_after_inst(unifa); - c->cur_block = block; struct qreg unifa_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA); vir_ADD_dest(c, unifa_reg, unifa->src[0], vir_uniform_ui(c, 4u)); vir_remove_instruction(c, unifa); - c->cur_block = current_block; return true; } if (unifa->qpu.type == V3D_QPU_INSTR_TYPE_ALU && unifa->qpu.alu.add.op == V3D_QPU_A_ADD) { c->cursor = vir_after_inst(unifa); - c->cur_block = block; struct qreg unifa_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA); struct qreg tmp = vir_ADD(c, unifa->src[1], vir_uniform_ui(c, 4u)); vir_ADD_dest(c, unifa_reg, unifa->src[0], tmp); vir_remove_instruction(c, unifa); - c->cur_block = current_block; return true; } @@ -200,7 +184,7 @@ vir_opt_dead_code(struct v3d_compile *c) vir_for_each_block(block, c) { struct qinst *last_flags_write = NULL; - + c->cur_block = block; vir_for_each_inst_safe(inst, block) { /* If this instruction reads the flags, we can't * remove the flags generation for it. @@ -246,7 +230,6 @@ vir_opt_dead_code(struct v3d_compile *c) } if (v3d_qpu_writes_flags(&inst->qpu) || - has_nonremovable_reads(c, inst) || (is_ldunifa && !is_first_ldunifa && !is_last_ldunifa)) { /* If we can't remove the instruction, but we * don't need its destination value, just @@ -276,7 +259,7 @@ vir_opt_dead_code(struct v3d_compile *c) */ if (is_first_ldunifa) { assert(unifa); - if (!increment_unifa_address(c, block, unifa)) + if (!increment_unifa_address(c, unifa)) continue; } diff --git a/src/broadcom/compiler/vir_opt_redundant_flags.c b/src/broadcom/compiler/vir_opt_redundant_flags.c index 4609ef9c361..6b61ed6a39a 100644 --- a/src/broadcom/compiler/vir_opt_redundant_flags.c +++ b/src/broadcom/compiler/vir_opt_redundant_flags.c @@ -81,11 +81,11 @@ vir_instr_flags_op_equal(struct qinst *a, struct qinst *b) a->qpu.flags.mpf != b->qpu.flags.mpf || a->qpu.alu.add.op != b->qpu.alu.add.op || a->qpu.alu.mul.op != b->qpu.alu.mul.op || - a->qpu.alu.add.a_unpack != b->qpu.alu.add.a_unpack || - a->qpu.alu.add.b_unpack != b->qpu.alu.add.b_unpack || + a->qpu.alu.add.a.unpack != b->qpu.alu.add.a.unpack || + a->qpu.alu.add.b.unpack != b->qpu.alu.add.b.unpack || a->qpu.alu.add.output_pack != b->qpu.alu.add.output_pack || - a->qpu.alu.mul.a_unpack != b->qpu.alu.mul.a_unpack || - a->qpu.alu.mul.b_unpack != b->qpu.alu.mul.b_unpack || + a->qpu.alu.mul.a.unpack != b->qpu.alu.mul.a.unpack || + a->qpu.alu.mul.b.unpack != b->qpu.alu.mul.b.unpack || a->qpu.alu.mul.output_pack != b->qpu.alu.mul.output_pack) { return false; } @@ -99,6 +99,7 @@ vir_opt_redundant_flags_block(struct v3d_compile *c, struct qblock *block) struct qinst *last_flags = NULL; bool progress = false; + c->cur_block = block; vir_for_each_inst(inst, block) { if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU || inst->qpu.flags.auf != V3D_QPU_UF_NONE || diff --git a/src/broadcom/compiler/vir_opt_small_immediates.c b/src/broadcom/compiler/vir_opt_small_immediates.c index 47d7722968d..56f0bf20706 100644 --- a/src/broadcom/compiler/vir_opt_small_immediates.c +++ 
b/src/broadcom/compiler/vir_opt_small_immediates.c @@ -44,7 +44,9 @@ vir_opt_small_immediates(struct v3d_compile *c) /* The small immediate value sits in the raddr B field, so we * can't have 2 small immediates in one instruction (unless * they're the same value, but that should be optimized away - * elsewhere). + * elsewhere). Since 7.x we can encode small immediates in + * any raddr field, but each instruction can still only use + * one. */ bool uses_small_imm = false; for (int i = 0; i < vir_get_nsrc(inst); i++) { @@ -80,7 +82,22 @@ vir_opt_small_immediates(struct v3d_compile *c) */ struct v3d_qpu_sig new_sig = inst->qpu.sig; uint32_t sig_packed; - new_sig.small_imm = true; + if (c->devinfo->ver == 42) { + new_sig.small_imm_b = true; + } else { + if (vir_is_add(inst)) { + if (i == 0) + new_sig.small_imm_a = true; + else + new_sig.small_imm_b = true; + } else { + if (i == 0) + new_sig.small_imm_c = true; + else + new_sig.small_imm_d = true; + } + } + if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig_packed)) continue; @@ -89,7 +106,10 @@ vir_opt_small_immediates(struct v3d_compile *c) vir_dump_inst(c, inst); fprintf(stderr, "\n"); } - inst->qpu.sig.small_imm = true; + inst->qpu.sig.small_imm_a = new_sig.small_imm_a; + inst->qpu.sig.small_imm_b = new_sig.small_imm_b; + inst->qpu.sig.small_imm_c = new_sig.small_imm_c; + inst->qpu.sig.small_imm_d = new_sig.small_imm_d; inst->qpu.raddr_b = packed; inst->src[i].file = QFILE_SMALL_IMM; diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c index 08698b4ece1..53e84840899 100644 --- a/src/broadcom/compiler/vir_register_allocate.c +++ b/src/broadcom/compiler/vir_register_allocate.c @@ -26,12 +26,100 @@ #include "common/v3d_device_info.h" #include "v3d_compiler.h" -#define QPU_R(i) { .magic = false, .index = i } - #define ACC_INDEX 0 #define ACC_COUNT 6 -#define PHYS_INDEX (ACC_INDEX + ACC_COUNT) -#define PHYS_COUNT 64 + +/* RA nodes used to track RF registers with implicit writes */ +#define IMPLICIT_RF_COUNT 1 + +#define PHYS_COUNT 64 + +static uint8_t +get_phys_index(const struct v3d_device_info *devinfo) +{ + if (devinfo->has_accumulators) + return ACC_INDEX + ACC_COUNT; + else + return 0; +} + +/* ACC as accumulator */ +#define CLASS_BITS_PHYS (1 << 0) +#define CLASS_BITS_ACC (1 << 1) +#define CLASS_BITS_R5 (1 << 4) + +static uint8_t +get_class_bit_any(const struct v3d_device_info *devinfo) +{ + if (devinfo->has_accumulators) + return (CLASS_BITS_PHYS | CLASS_BITS_ACC | CLASS_BITS_R5); + else + return CLASS_BITS_PHYS; +} + +static uint8_t +filter_class_bits(const struct v3d_device_info *devinfo, uint8_t class_bits) +{ + if (!devinfo->has_accumulators) { + assert(class_bits & CLASS_BITS_PHYS); + class_bits = CLASS_BITS_PHYS; + } + return class_bits; +} + +static inline uint32_t +temp_to_node(struct v3d_compile *c, uint32_t temp) +{ + return temp + (c->devinfo->has_accumulators ? ACC_COUNT : + IMPLICIT_RF_COUNT); +} + +static inline uint32_t +node_to_temp(struct v3d_compile *c, uint32_t node) +{ + assert((c->devinfo->has_accumulators && node >= ACC_COUNT) || + (!c->devinfo->has_accumulators && node >= IMPLICIT_RF_COUNT)); + return node - (c->devinfo->has_accumulators ? 
ACC_COUNT : + IMPLICIT_RF_COUNT); +} + +static inline uint8_t +get_temp_class_bits(struct v3d_compile *c, + uint32_t temp) +{ + return c->nodes.info[temp_to_node(c, temp)].class_bits; +} + +static inline void +set_temp_class_bits(struct v3d_compile *c, + uint32_t temp, uint8_t class_bits) +{ + c->nodes.info[temp_to_node(c, temp)].class_bits = class_bits; +} + +static struct ra_class * +choose_reg_class(struct v3d_compile *c, uint8_t class_bits) +{ + if (class_bits == CLASS_BITS_PHYS) { + return c->compiler->reg_class_phys[c->thread_index]; + } else if (class_bits == (CLASS_BITS_R5)) { + assert(c->devinfo->has_accumulators); + return c->compiler->reg_class_r5[c->thread_index]; + } else if (class_bits == (CLASS_BITS_PHYS | CLASS_BITS_ACC)) { + assert(c->devinfo->has_accumulators); + return c->compiler->reg_class_phys_or_acc[c->thread_index]; + } else { + assert(class_bits == get_class_bit_any(c->devinfo)); + return c->compiler->reg_class_any[c->thread_index]; + } +} + +static inline struct ra_class * +choose_reg_class_for_temp(struct v3d_compile *c, uint32_t temp) +{ + assert(temp < c->num_temps && temp < c->nodes.alloc_count); + return choose_reg_class(c, get_temp_class_bits(c, temp)); +} static inline bool qinst_writes_tmu(const struct v3d_device_info *devinfo, @@ -46,23 +134,22 @@ static bool is_end_of_tmu_sequence(const struct v3d_device_info *devinfo, struct qinst *inst, struct qblock *block) { - if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && - inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) { - return true; - } - - if (!inst->qpu.sig.ldtmu) + /* Only tmuwt and ldtmu can finish TMU sequences */ + bool is_tmuwt = inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && + inst->qpu.alu.add.op == V3D_QPU_A_TMUWT; + bool is_ldtmu = inst->qpu.sig.ldtmu; + if (!is_tmuwt && !is_ldtmu) return false; + /* Check if this is the last tmuwt or ldtmu in the sequence */ list_for_each_entry_from(struct qinst, scan_inst, inst->link.next, &block->instructions, link) { - if (scan_inst->qpu.sig.ldtmu) - return false; + is_tmuwt = scan_inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && + scan_inst->qpu.alu.add.op == V3D_QPU_A_TMUWT; + is_ldtmu = scan_inst->qpu.sig.ldtmu; - if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && - inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) { - return true; - } + if (is_tmuwt || is_ldtmu) + return false; if (qinst_writes_tmu(devinfo, scan_inst)) return true; @@ -79,11 +166,101 @@ vir_is_mov_uniform(struct v3d_compile *c, int temp) return def && def->qpu.sig.ldunif; } +static bool +can_reconstruct_inst(struct qinst *inst) +{ + assert(inst); + + if (vir_is_add(inst)) { + switch (inst->qpu.alu.add.op) { + case V3D_QPU_A_FXCD: + case V3D_QPU_A_FYCD: + case V3D_QPU_A_XCD: + case V3D_QPU_A_YCD: + case V3D_QPU_A_IID: + case V3D_QPU_A_EIDX: + case V3D_QPU_A_TIDX: + case V3D_QPU_A_SAMPID: + /* No need to check input unpacks because none of these + * opcodes read sources. FXCD,FYCD have pack variants. 
+ */ + return inst->qpu.flags.ac == V3D_QPU_COND_NONE && + inst->qpu.flags.auf == V3D_QPU_UF_NONE && + inst->qpu.flags.apf == V3D_QPU_PF_NONE && + inst->qpu.alu.add.output_pack == V3D_QPU_PACK_NONE; + default: + return false; + } + } + + return false; +} + +static bool +can_reconstruct_temp(struct v3d_compile *c, int temp) +{ + struct qinst *def = c->defs[temp]; + return def && can_reconstruct_inst(def); +} + +static struct qreg +reconstruct_temp(struct v3d_compile *c, enum v3d_qpu_add_op op) +{ + struct qreg dest; + switch (op) { + case V3D_QPU_A_FXCD: + dest = vir_FXCD(c); + break; + case V3D_QPU_A_FYCD: + dest = vir_FYCD(c); + break; + case V3D_QPU_A_XCD: + dest = vir_XCD(c); + break; + case V3D_QPU_A_YCD: + dest = vir_YCD(c); + break; + case V3D_QPU_A_IID: + dest = vir_IID(c); + break; + case V3D_QPU_A_EIDX: + dest = vir_EIDX(c); + break; + case V3D_QPU_A_TIDX: + dest = vir_TIDX(c); + break; + case V3D_QPU_A_SAMPID: + dest = vir_SAMPID(c); + break; + default: + unreachable("Unexpected opcode for reconstruction"); + } + + return dest; +} + +enum temp_spill_type { + SPILL_TYPE_UNIFORM, + SPILL_TYPE_RECONSTRUCT, + SPILL_TYPE_TMU +}; + +static enum temp_spill_type +get_spill_type_for_temp(struct v3d_compile *c, int temp) +{ + if (vir_is_mov_uniform(c, temp)) + return SPILL_TYPE_UNIFORM; + + if (can_reconstruct_temp(c, temp)) + return SPILL_TYPE_RECONSTRUCT; + + return SPILL_TYPE_TMU; +} + static int -v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, - uint32_t *temp_to_node) +v3d_choose_spill_node(struct v3d_compile *c) { - const float tmu_scale = 5; + const float tmu_scale = 10; float block_scale = 1.0; float spill_costs[c->num_temps]; bool in_tmu_operation = false; @@ -99,7 +276,8 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, * starting output writes. */ bool no_spilling = - c->threads > 1 && started_last_seg; + (c->threads > 1 && started_last_seg) || + (c->max_tmu_spills == 0); /* Discourage spilling of TMU operations */ for (int i = 0; i < vir_get_nsrc(inst); i++) { @@ -107,7 +285,10 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, continue; int temp = inst->src[i].index; - if (vir_is_mov_uniform(c, temp)) { + enum temp_spill_type spill_type = + get_spill_type_for_temp(c, temp); + + if (spill_type != SPILL_TYPE_TMU) { spill_costs[temp] += block_scale; } else if (!no_spilling) { float tmu_op_scale = in_tmu_operation ? @@ -122,11 +303,11 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, if (inst->dst.file == QFILE_TEMP) { int temp = inst->dst.index; + enum temp_spill_type spill_type = + get_spill_type_for_temp(c, temp); - if (vir_is_mov_uniform(c, temp)) { - /* We just rematerialize the unform - * later. - */ + if (spill_type != SPILL_TYPE_TMU) { + /* We just rematerialize it later */ } else if (!no_spilling) { spill_costs[temp] += (block_scale * tmu_scale); @@ -147,10 +328,6 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, if (inst->is_last_thrsw) started_last_seg = true; - if (v3d_qpu_writes_vpm(&inst->qpu) || - v3d_qpu_uses_tlb(&inst->qpu)) - started_last_seg = true; - /* Track when we're in between a TMU setup and the * final LDTMU or TMUWT from that TMU setup. We * penalize spills during that time. @@ -163,12 +340,53 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, } } + /* We always emit a "last thrsw" to ensure all our spilling occurs + * before the last thread section. See vir_emit_last_thrsw. 
+ */
+ assert(started_last_seg);
+
 for (unsigned i = 0; i < c->num_temps; i++) {
- if (BITSET_TEST(c->spillable, i))
- ra_set_node_spill_cost(g, temp_to_node[i], spill_costs[i]);
+ if (BITSET_TEST(c->spillable, i)) {
+ ra_set_node_spill_cost(c->g, temp_to_node(c, i),
+ spill_costs[i]);
+ }
 }
- return ra_get_best_spill_node(g);
+ return ra_get_best_spill_node(c->g);
+}
+
+static void
+ensure_nodes(struct v3d_compile *c)
+{
+ if (c->num_temps < c->nodes.alloc_count)
+ return;
+
+ c->nodes.alloc_count *= 2;
+ c->nodes.info = reralloc_array_size(c,
+ c->nodes.info,
+ sizeof(c->nodes.info[0]),
+ c->nodes.alloc_count +
+ MAX2(ACC_COUNT, IMPLICIT_RF_COUNT));
+}
+
+/* Creates the interference node for a new temp. We use this to keep the node
+ * list updated during the spilling process, which generates new temps/nodes.
+ */
+static void
+add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits)
+{
+ ensure_nodes(c);
+
+ int node = ra_add_node(c->g, choose_reg_class(c, class_bits));
+ assert(c->devinfo->has_accumulators ? node == temp + ACC_COUNT :
+ node == temp + IMPLICIT_RF_COUNT);
+
+ /* We fill the node priority after we are done inserting spills */
+ c->nodes.info[node].class_bits = class_bits;
+ c->nodes.info[node].priority = 0;
+ c->nodes.info[node].is_ldunif_dst = false;
+ c->nodes.info[node].is_program_end = false;
+ c->nodes.info[node].unused = false;
}
 /* The spill offset for this thread takes a bit of setup, so do it once at
@@ -206,79 +424,224 @@ v3d_setup_spill_base(struct v3d_compile *c)
 vir_uniform(c, QUNIFORM_SPILL_OFFSET, 0));
 /* Make sure that we don't spill the spilling setup instructions. */
- for (int i = start_num_temps; i < c->num_temps; i++)
+ for (int i = start_num_temps; i < c->num_temps; i++) {
 BITSET_CLEAR(c->spillable, i);
+ /* If we are spilling, update the RA map with the temps added
+ * by the spill setup. Our spill_base register can never be an
+ * accumulator because it is used for TMU spill/fill and thus
+ * needs to persist across thread switches.
+ */
+ if (c->spilling) {
+ int temp_class = CLASS_BITS_PHYS;
+ if (c->devinfo->has_accumulators &&
+ i != c->spill_base.index) {
+ temp_class |= CLASS_BITS_ACC;
+ }
+ add_node(c, i, temp_class);
+ }
+ }
+
 /* Restore the current block. */
 c->cur_block = current_block;
 c->cursor = vir_after_block(c->cur_block);
}
-static struct qinst *
-v3d_emit_spill_tmua(struct v3d_compile *c, uint32_t spill_offset)
+/**
+ * Computes the address for a spill/fill sequence and completes the spill/fill
+ * sequence by emitting the following code:
+ *
+ * ldunif.spill_offset
+ * add tmua spill_base spill_offset
+ * thrsw
+ *
+ * If the sequence is for a spill, then it will emit a tmuwt after the thrsw,
+ * otherwise it will emit an ldtmu to load the fill result into 'fill_dst'.
+ *
+ * The parameter 'ip' represents the ip at which the spill/fill is happening.
+ * This is used to disallow accumulators on temps that cross this ip boundary
+ * due to the new thrsw introduced in the sequence above.
+ */
+static void
+v3d_emit_spill_tmua(struct v3d_compile *c,
+ uint32_t spill_offset,
+ enum v3d_qpu_cond cond,
+ int32_t ip,
+ struct qreg *fill_dst)
 {
- return vir_ADD_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA),
- c->spill_base, vir_uniform_ui(c, spill_offset));
-}
+ assert(ip >= 0);
+
+ /* Load a uniform with the spill offset and add it to the spill base
+ * to obtain the TMUA address. It can be of class ANY because we know
+ * we are consuming it immediately without thrsw in between.
+ */ + assert(c->disable_ldunif_opt); + struct qreg offset = vir_uniform_ui(c, spill_offset); + add_node(c, offset.index, get_class_bit_any(c->devinfo)); + /* We always enable per-quad on spills/fills to ensure we spill + * any channels involved with helper invocations. + */ + struct qreg tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU); + struct qinst *inst = vir_ADD_dest(c, tmua, c->spill_base, offset); + inst->qpu.flags.ac = cond; + inst->ldtmu_count = 1; + inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, + 0xffffff7f); /* per-quad */ + + vir_emit_thrsw(c); + + /* If this is for a spill, emit a TMUWT otherwise a LDTMU to load the + * result of the fill. The TMUWT temp is not really read, the ldtmu + * temp will be used immediately so just like the uniform above we + * can allow accumulators. + */ + int temp_class = + filter_class_bits(c->devinfo, CLASS_BITS_PHYS | CLASS_BITS_ACC); + if (!fill_dst) { + struct qreg dst = vir_TMUWT(c); + assert(dst.file == QFILE_TEMP); + add_node(c, dst.index, temp_class); + } else { + *fill_dst = vir_LDTMU(c); + assert(fill_dst->file == QFILE_TEMP); + add_node(c, fill_dst->index, temp_class); + } + + /* Temps across the thread switch we injected can't be assigned to + * accumulators. + * + * Fills inject code before ip, so anything that starts at ip or later + * is not affected by the thrsw. Something that ends at ip will be + * affected though. + * + * Spills inject code after ip, so anything that starts strictly later + * than ip is not affected (the temp starting at ip is usually the + * spilled temp except for postponed spills). Something that ends at ip + * won't be affected either. + */ + for (int i = 0; i < c->spill_start_num_temps; i++) { + bool thrsw_cross = fill_dst ? + c->temp_start[i] < ip && c->temp_end[i] >= ip : + c->temp_start[i] <= ip && c->temp_end[i] > ip; + if (thrsw_cross) { + ra_set_node_class(c->g, temp_to_node(c, i), + choose_reg_class(c, CLASS_BITS_PHYS)); + } + } +} static void -v3d_emit_tmu_spill(struct v3d_compile *c, struct qinst *inst, - struct qinst *position, uint32_t spill_offset) +v3d_emit_tmu_spill(struct v3d_compile *c, + struct qinst *inst, + struct qreg spill_temp, + struct qinst *position, + uint32_t ip, + uint32_t spill_offset) { assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU); + assert(inst->dst.file == QFILE_TEMP); c->cursor = vir_after_inst(position); - inst->dst = vir_get_temp(c); + enum v3d_qpu_cond cond = vir_get_cond(inst); + + /* If inst and position don't match, this is a postponed spill, + * in which case we have already allocated the temp for the spill + * and we should use that, otherwise create a new temp with the + * same register class bits as the original. + */ + if (inst == position) { + uint8_t class_bits = get_temp_class_bits(c, inst->dst.index); + inst->dst = vir_get_temp(c); + add_node(c, inst->dst.index, class_bits); + } else { + inst->dst = spill_temp; + + /* If this is a postponed spill the register being spilled may + * have been written more than once including conditional + * writes, so ignore predication on the spill instruction and + * always spill the full register. 
+ */ + cond = V3D_QPU_COND_NONE; + } + struct qinst *tmp = vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD), inst->dst); tmp->qpu.flags.mc = cond; - tmp = v3d_emit_spill_tmua(c, spill_offset); - tmp->qpu.flags.ac = cond; - vir_emit_thrsw(c); - vir_TMUWT(c); + + v3d_emit_spill_tmua(c, spill_offset, cond, ip, NULL); + c->spills++; c->tmu_dirty_rcl = true; } +static inline bool +interferes(int32_t t0_start, int32_t t0_end, int32_t t1_start, int32_t t1_end) +{ + return !(t0_start >= t1_end || t1_start >= t0_end); +} + static void -v3d_spill_reg(struct v3d_compile *c, int spill_temp) +v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int *implicit_rf_nodes, + int spill_temp) { - c->spill_count++; + c->spill_start_num_temps = c->num_temps; + c->spilling = true; - bool is_uniform = vir_is_mov_uniform(c, spill_temp); + enum temp_spill_type spill_type = get_spill_type_for_temp(c, spill_temp); uint32_t spill_offset = 0; - - if (!is_uniform) { + if (spill_type == SPILL_TYPE_TMU) { spill_offset = c->spill_size; c->spill_size += V3D_CHANNELS * sizeof(uint32_t); - if (spill_offset == 0) + if (spill_offset == 0) { v3d_setup_spill_base(c); + + /* Don't allocate our spill base to rf0 to avoid + * conflicts with instructions doing implicit writes + * to that register. + */ + if (!c->devinfo->has_accumulators) { + ra_add_node_interference( + c->g, + temp_to_node(c, c->spill_base.index), + implicit_rf_nodes[0]); + } + } } struct qinst *last_thrsw = c->last_thrsw; assert(last_thrsw && last_thrsw->is_last_thrsw); - int start_num_temps = c->num_temps; - int uniform_index = ~0; - if (is_uniform) { + if (spill_type == SPILL_TYPE_UNIFORM) { struct qinst *orig_unif = c->defs[spill_temp]; uniform_index = orig_unif->uniform; } + enum v3d_qpu_add_op reconstruct_op = V3D_QPU_A_NOP; + if (spill_type == SPILL_TYPE_RECONSTRUCT) { + struct qinst *orig_def = c->defs[spill_temp]; + assert(vir_is_add(orig_def)); + reconstruct_op = orig_def->qpu.alu.add.op; + } + + uint32_t spill_node = temp_to_node(c, spill_temp); + /* We must disable the ldunif optimization if we are spilling uniforms */ bool had_disable_ldunif_opt = c->disable_ldunif_opt; c->disable_ldunif_opt = true; struct qinst *start_of_tmu_sequence = NULL; struct qinst *postponed_spill = NULL; + struct qreg postponed_spill_temp = { 0 }; vir_for_each_block(block, c) { vir_for_each_inst_safe(inst, block) { + int32_t ip = inst->ip; + /* Track when we're in between a TMU setup and the final * LDTMU or TMUWT from that TMU setup. We can't spill/fill any * temps during that time, because that involves inserting a @@ -289,7 +652,8 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp) if (is_end_of_tmu_sequence(c->devinfo, inst, block)) { if (postponed_spill) { v3d_emit_tmu_spill(c, postponed_spill, - inst, spill_offset); + postponed_spill_temp, + inst, ip, spill_offset); } start_of_tmu_sequence = NULL; @@ -302,49 +666,103 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp) } /* fills */ + int filled_src = -1; for (int i = 0; i < vir_get_nsrc(inst); i++) { if (inst->src[i].file != QFILE_TEMP || inst->src[i].index != spill_temp) { continue; } + if (filled_src >= 0) { + inst->src[i] = inst->src[filled_src]; + continue; + } + c->cursor = vir_before_inst(inst); - if (is_uniform) { + if (spill_type == SPILL_TYPE_UNIFORM) { struct qreg unif = vir_uniform(c, c->uniform_contents[uniform_index], c->uniform_data[uniform_index]); inst->src[i] = unif; + /* We are using the uniform in the + * instruction immediately after, so + * we can use any register class for it. 
+ */
+ add_node(c, unif.index,
+ get_class_bit_any(c->devinfo));
+ } else if (spill_type == SPILL_TYPE_RECONSTRUCT) {
+ struct qreg temp =
+ reconstruct_temp(c, reconstruct_op);
+ inst->src[i] = temp;
+ /* We are using the temp in the
+ * instruction immediately after so we
+ * can use ACC.
+ */
+ int temp_class =
+ filter_class_bits(c->devinfo, CLASS_BITS_PHYS |
+ CLASS_BITS_ACC);
+ add_node(c, temp.index, temp_class);
 } else {
- /* If we have a postponed spill, we don't need
- * a fill as the temp would not have been
- * spilled yet.
+ /* If we have a postponed spill, we
+ * don't need a fill as the temp would
+ * not have been spilled yet, however,
+ * we need to update the temp index.
 */
- if (postponed_spill)
- continue;
- if (start_of_tmu_sequence)
- c->cursor = vir_before_inst(start_of_tmu_sequence);
-
- v3d_emit_spill_tmua(c, spill_offset);
- vir_emit_thrsw(c);
- inst->src[i] = vir_LDTMU(c);
- c->fills++;
+ if (postponed_spill) {
+ inst->src[i] =
+ postponed_spill_temp;
+ } else {
+ int32_t fill_ip = ip;
+ if (start_of_tmu_sequence) {
+ c->cursor = vir_before_inst(start_of_tmu_sequence);
+ fill_ip = start_of_tmu_sequence->ip;
+ }
+
+ v3d_emit_spill_tmua(c, spill_offset,
+ V3D_QPU_COND_NONE,
+ fill_ip, &inst->src[i]);
+ c->fills++;
+ }
 }
+
+ filled_src = i;
 }
 /* spills */
 if (inst->dst.file == QFILE_TEMP &&
 inst->dst.index == spill_temp) {
- if (is_uniform) {
+ if (spill_type != SPILL_TYPE_TMU) {
 c->cursor.link = NULL;
 vir_remove_instruction(c, inst);
 } else {
- if (start_of_tmu_sequence)
+ /* If we are in the middle of a TMU
+ * sequence, we postpone the actual
+ * spill until we have finished it. We
+ * still need to replace the spill temp
+ * with a new temp though.
+ */
+ if (start_of_tmu_sequence) {
+ if (postponed_spill) {
+ postponed_spill->dst =
+ postponed_spill_temp;
+ }
+ if (!postponed_spill ||
+ vir_get_cond(inst) == V3D_QPU_COND_NONE) {
+ postponed_spill_temp =
+ vir_get_temp(c);
+ add_node(c,
+ postponed_spill_temp.index,
+ c->nodes.info[spill_node].class_bits);
+ }
 postponed_spill = inst;
- else
- v3d_emit_tmu_spill(c, inst, inst,
+ } else {
+ v3d_emit_tmu_spill(c, inst,
+ postponed_spill_temp,
+ inst, ip,
 spill_offset);
+ }
 }
 }
 }
@@ -358,21 +776,64 @@
 /* Don't allow spilling of our spilling instructions. There's no way
 * they can help get things colored.
 */
- for (int i = start_num_temps; i < c->num_temps; i++)
+ for (int i = c->spill_start_num_temps; i < c->num_temps; i++)
 BITSET_CLEAR(c->spillable, i);
+ /* Reset interference for spilled node */
+ ra_set_node_spill_cost(c->g, spill_node, 0);
+ ra_reset_node_interference(c->g, spill_node);
+ BITSET_CLEAR(c->spillable, spill_temp);
+
+ /* Rebuild program ips */
+ int32_t ip = 0;
+ vir_for_each_inst_inorder(inst, c)
+ inst->ip = ip++;
+
+ /* Rebuild liveness */
+ vir_calculate_live_intervals(c);
+
+ /* Add interferences for the new spilled temps and update interferences
+ * for c->spill_base (since we may have modified its liveness). Also,
+ * update node priorities based on new liveness data.
+ */
+ uint32_t sb_temp = c->spill_base.index;
+ uint32_t sb_node = temp_to_node(c, sb_temp);
+ for (uint32_t i = 0; i < c->num_temps; i++) {
+ if (c->temp_end[i] == -1)
+ continue;
+
+ uint32_t node_i = temp_to_node(c, i);
+ c->nodes.info[node_i].priority =
+ c->temp_end[i] - c->temp_start[i];
+
+ for (uint32_t j = MAX2(i + 1, c->spill_start_num_temps);
+ j < c->num_temps; j++) {
+ if (interferes(c->temp_start[i], c->temp_end[i],
+ c->temp_start[j], c->temp_end[j])) {
+ uint32_t node_j = temp_to_node(c, j);
+ ra_add_node_interference(c->g, node_i, node_j);
+ }
+ }
+
+ if (spill_type == SPILL_TYPE_TMU) {
+ if (i != sb_temp &&
+ interferes(c->temp_start[i], c->temp_end[i],
+ c->temp_start[sb_temp], c->temp_end[sb_temp])) {
+ ra_add_node_interference(c->g, node_i, sb_node);
+ }
+ }
+ }
+
 c->disable_ldunif_opt = had_disable_ldunif_opt;
+ c->spilling = false;
}
-struct node_to_temp_map {
- uint32_t temp;
- uint32_t priority;
-};
-
 struct v3d_ra_select_callback_data {
+ uint32_t phys_index;
 uint32_t next_acc;
 uint32_t next_phys;
- struct node_to_temp_map *map;
+ struct v3d_ra_node_info *nodes;
+ const struct v3d_device_info *devinfo;
};
 /* Choosing accumulators improves chances of merging QPU instructions
@@ -384,6 +845,9 @@ v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra,
 BITSET_WORD *regs,
 int priority)
 {
+ if (!v3d_ra->devinfo->has_accumulators)
+ return false;
+
 /* Favor accumulators if we have less than this number of physical
 * registers. Accumulators have more restrictions (like being
 * invalidated through thrsw), so running out of physical registers
 * even if we have accumulators available can be problematic.
 */
 static const int available_rf_threshold = 5;
 int available_rf = 0;
 for (int i = 0; i < PHYS_COUNT; i++) {
- if (BITSET_TEST(regs, PHYS_INDEX + i))
+ if (BITSET_TEST(regs, v3d_ra->phys_index + i))
 available_rf++;
 if (available_rf >= available_rf_threshold)
 break;
@@ -419,6 +883,19 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra,
 BITSET_WORD *regs,
 unsigned int *out)
 {
+ if (!v3d_ra->devinfo->has_accumulators)
+ return false;
+
+ /* Choose r5 for our ldunifs if possible (nobody else can load to that
+ * reg, and it keeps the QPU cond field free from being occupied by
+ * ldunifrf).
+ */
+ int r5 = ACC_INDEX + 5;
+ if (BITSET_TEST(regs, r5)) {
+ *out = r5;
+ return true;
+ }
+
 /* Round-robin through our accumulators to give post-RA instruction
 * selection more options.
 */
@@ -438,12 +915,47 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra,
 static bool
 v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
+ unsigned int node,
 BITSET_WORD *regs,
 unsigned int *out)
 {
+ /* If this node is for an unused temp, ignore. */
+ if (v3d_ra->nodes->info[node].unused) {
+ *out = 0;
+ return true;
+ }
+
+ /* In V3D 7.x, try to assign rf0 to temps used as ldunif's dst
+ * so we can avoid turning them into ldunifrf (which uses the
+ * cond field to encode the dst and would prevent merge with
+ * instructions that use cond flags).
+ */
+ if (v3d_ra->nodes->info[node].is_ldunif_dst &&
+ BITSET_TEST(regs, v3d_ra->phys_index)) {
+ assert(v3d_ra->devinfo->ver >= 71);
+ *out = v3d_ra->phys_index;
+ return true;
+ }
+
+ /* The last 3 instructions in a shader can't use some specific registers
+ * (usually early rf registers, depends on v3d version) so try to
+ * avoid allocating these to registers used by the last instructions
+ * in the shader.
+ */
+ const uint32_t safe_rf_start = v3d_ra->devinfo->ver == 42 ?
3 : 4; + if (v3d_ra->nodes->info[node].is_program_end && + v3d_ra->next_phys < safe_rf_start) { + v3d_ra->next_phys = safe_rf_start; + } + for (int i = 0; i < PHYS_COUNT; i++) { int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT; - int phys = PHYS_INDEX + phys_off; + + /* Try to keep rf0 available for ldunif in 7.x (see above). */ + if (v3d_ra->devinfo->ver >= 71 && phys_off == 0) + continue; + + int phys = v3d_ra->phys_index + phys_off; if (BITSET_TEST(regs, phys)) { v3d_ra->next_phys = phys_off + 1; @@ -452,6 +964,14 @@ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra, } } + /* If we couldn't allocate, do try to assign rf0 if it is available. */ + if (v3d_ra->devinfo->ver >= 71 && + BITSET_TEST(regs, v3d_ra->phys_index)) { + v3d_ra->next_phys = 1; + *out = v3d_ra->phys_index; + return true; + } + return false; } @@ -459,22 +979,14 @@ static unsigned int v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data) { struct v3d_ra_select_callback_data *v3d_ra = data; - int r5 = ACC_INDEX + 5; - - /* Choose r5 for our ldunifs if possible (nobody else can load to that - * reg, and it keeps the QPU cond field free from being occupied by - * ldunifrf). - */ - if (BITSET_TEST(regs, r5)) - return r5; unsigned int reg; - if (v3d_ra_favor_accum(v3d_ra, regs, v3d_ra->map[n].priority) && + if (v3d_ra_favor_accum(v3d_ra, regs, v3d_ra->nodes->info[n].priority) && v3d_ra_select_accum(v3d_ra, regs, ®)) { return reg; } - if (v3d_ra_select_rf(v3d_ra, regs, ®)) + if (v3d_ra_select_rf(v3d_ra, n, regs, ®)) return reg; /* If we ran out of physical registers try to assign an accumulator @@ -492,9 +1004,10 @@ vir_init_reg_sets(struct v3d_compiler *compiler) /* Allocate up to 3 regfile classes, for the ways the physical * register file can be divided up for fragment shader threading. */ - int max_thread_index = (compiler->devinfo->ver >= 40 ? 
2 : 3); + int max_thread_index = 2; + uint8_t phys_index = get_phys_index(compiler->devinfo); - compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT, + compiler->regs = ra_alloc_reg_set(compiler, phys_index + PHYS_COUNT, false); if (!compiler->regs) return false; @@ -502,31 +1015,38 @@ vir_init_reg_sets(struct v3d_compiler *compiler) for (int threads = 0; threads < max_thread_index; threads++) { compiler->reg_class_any[threads] = ra_alloc_contig_reg_class(compiler->regs, 1); - compiler->reg_class_r5[threads] = - ra_alloc_contig_reg_class(compiler->regs, 1); - compiler->reg_class_phys_or_acc[threads] = - ra_alloc_contig_reg_class(compiler->regs, 1); + if (compiler->devinfo->has_accumulators) { + compiler->reg_class_r5[threads] = + ra_alloc_contig_reg_class(compiler->regs, 1); + compiler->reg_class_phys_or_acc[threads] = + ra_alloc_contig_reg_class(compiler->regs, 1); + } compiler->reg_class_phys[threads] = ra_alloc_contig_reg_class(compiler->regs, 1); - for (int i = PHYS_INDEX; - i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) { - ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); + /* Init physical regs */ + for (int i = phys_index; + i < phys_index + (PHYS_COUNT >> threads); i++) { + if (compiler->devinfo->has_accumulators) + ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); ra_class_add_reg(compiler->reg_class_phys[threads], i); ra_class_add_reg(compiler->reg_class_any[threads], i); } - for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) { - ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); - ra_class_add_reg(compiler->reg_class_any[threads], i); + /* Init accumulator regs */ + if (compiler->devinfo->has_accumulators) { + for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) { + ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); + ra_class_add_reg(compiler->reg_class_any[threads], i); + } + /* r5 can only store a single 32-bit value, so not much can + * use it. + */ + ra_class_add_reg(compiler->reg_class_r5[threads], + ACC_INDEX + 5); + ra_class_add_reg(compiler->reg_class_any[threads], + ACC_INDEX + 5); } - /* r5 can only store a single 32-bit value, so not much can - * use it. - */ - ra_class_add_reg(compiler->reg_class_r5[threads], - ACC_INDEX + 5); - ra_class_add_reg(compiler->reg_class_any[threads], - ACC_INDEX + 5); } ra_set_finalize(compiler->regs, NULL); @@ -534,52 +1054,220 @@ vir_init_reg_sets(struct v3d_compiler *compiler) return true; } -static int -node_to_temp_priority(const void *in_a, const void *in_b) +static inline bool +tmu_spilling_allowed(struct v3d_compile *c) { - const struct node_to_temp_map *a = in_a; - const struct node_to_temp_map *b = in_b; - - return a->priority - b->priority; + return c->spills + c->fills < c->max_tmu_spills; } -/** - * Computes the number of registers to spill in a batch after a register - * allocation failure. - */ -static uint32_t -get_spill_batch_size(struct v3d_compile *c) -{ - /* Allow up to 10 spills in batches of 1 in any case to avoid any chance of - * over-spilling if the program requires few spills to compile. - */ - if (c->spill_count < 10) - return 1; - - /* If we have to spill more than that we assume performance is not going to - * be great and we shift focus to batching spills to cut down compile - * time at the expense of over-spilling. - */ - return 20; -} - -/* Don't emit spills using the TMU until we've dropped thread count first. 
We, - * may also disable spilling when certain optimizations that are known to - * increase register pressure are active so we favor recompiling with - * optimizations disabled instead of spilling. - */ -static inline bool -tmu_spilling_allowed(struct v3d_compile *c, int thread_index) +static void +update_graph_and_reg_classes_for_inst(struct v3d_compile *c, + int *acc_nodes, + int *implicit_rf_nodes, + int last_ldvary_ip, + struct qinst *inst) { - return thread_index == 0 && c->tmu_spilling_allowed; + int32_t ip = inst->ip; + assert(ip >= 0); + + /* If the instruction writes r4 (and optionally moves its + * result to a temp), nothing else can be stored in r4 across + * it. + */ + if (vir_writes_r4_implicitly(c->devinfo, inst)) { + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && c->temp_end[i] > ip) { + ra_add_node_interference(c->g, + temp_to_node(c, i), + acc_nodes[4]); + } + } + } + + /* If any instruction writes to a physical register implicitly + * nothing else can write the same register across it. + */ + if (v3d_qpu_writes_rf0_implicitly(c->devinfo, &inst->qpu)) { + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && c->temp_end[i] > ip) { + ra_add_node_interference(c->g, + temp_to_node(c, i), + implicit_rf_nodes[0]); + } + } + } + + if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) { + switch (inst->qpu.alu.add.op) { + case V3D_QPU_A_LDVPMV_IN: + case V3D_QPU_A_LDVPMV_OUT: + case V3D_QPU_A_LDVPMD_IN: + case V3D_QPU_A_LDVPMD_OUT: + case V3D_QPU_A_LDVPMP: + case V3D_QPU_A_LDVPMG_IN: + case V3D_QPU_A_LDVPMG_OUT: { + /* LDVPMs only store to temps (the MA flag + * decides whether the LDVPM is in or out) + */ + assert(inst->dst.file == QFILE_TEMP); + set_temp_class_bits(c, inst->dst.index, + CLASS_BITS_PHYS); + break; + } + + case V3D_QPU_A_RECIP: + case V3D_QPU_A_RSQRT: + case V3D_QPU_A_EXP: + case V3D_QPU_A_LOG: + case V3D_QPU_A_SIN: + case V3D_QPU_A_RSQRT2: { + /* The SFU instructions write directly to the + * phys regfile. + */ + assert(inst->dst.file == QFILE_TEMP); + set_temp_class_bits(c, inst->dst.index, + CLASS_BITS_PHYS); + break; + } + + default: + break; + } + } + + if (inst->src[0].file == QFILE_REG) { + switch (inst->src[0].index) { + case 0: + /* V3D 7.x doesn't use rf0 for thread payload */ + if (c->devinfo->ver >= 71) + break; + else + FALLTHROUGH; + case 1: + case 2: + case 3: { + /* Payload setup instructions: Force allocate + * the dst to the given register (so the MOV + * will disappear). + */ + assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV); + assert(inst->dst.file == QFILE_TEMP); + uint32_t node = temp_to_node(c, inst->dst.index); + ra_set_node_reg(c->g, node, + get_phys_index(c->devinfo) + + inst->src[0].index); + break; + } + } + } + + /* Don't allocate rf0 to temps that cross ranges where we have + * live implicit rf0 writes from ldvary. We can identify these + * by tracking the last ldvary instruction and explicit reads + * of rf0. + */ + if (c->devinfo->ver >= 71 && + ((inst->src[0].file == QFILE_REG && inst->src[0].index == 0) || + (vir_get_nsrc(inst) > 1 && + inst->src[1].file == QFILE_REG && inst->src[1].index == 0))) { + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && + c->temp_end[i] > last_ldvary_ip) { + ra_add_node_interference(c->g, + temp_to_node(c, i), + implicit_rf_nodes[0]); + } + } + } + + if (inst->dst.file == QFILE_TEMP) { + /* Only a ldunif gets to write to R5, which only has a + * single 32-bit channel of storage. 
+ *
+ * NOTE: ldunifa is subject to the same, however, going by
+ * shader-db it is best to keep r5 exclusive to ldunif, probably
+ * because ldunif usually has a shorter lifespan, allowing for
+ * more accumulator reuse and QPU merges.
+ */
+ if (c->devinfo->has_accumulators) {
+ if (!inst->qpu.sig.ldunif) {
+ uint8_t class_bits =
+ get_temp_class_bits(c, inst->dst.index) &
+ ~CLASS_BITS_R5;
+ set_temp_class_bits(c, inst->dst.index,
+ class_bits);
+
+ }
+ } else {
+ /* Make sure we don't allocate the ldvary's
+ * destination to rf0, since it would clash
+ * with its implicit write to that register.
+ */
+ if (inst->qpu.sig.ldvary) {
+ ra_add_node_interference(c->g,
+ temp_to_node(c, inst->dst.index),
+ implicit_rf_nodes[0]);
+ }
+ /* Flag dst temps from ldunif(a) instructions
+ * so we can try to assign rf0 to them and avoid
+ * converting these to ldunif(a)rf.
+ */
+ if (inst->qpu.sig.ldunif || inst->qpu.sig.ldunifa) {
+ const uint32_t dst_n =
+ temp_to_node(c, inst->dst.index);
+ c->nodes.info[dst_n].is_ldunif_dst = true;
+ }
+ }
+ }
+
+ /* All accumulators are invalidated across a thread switch. */
+ if (inst->qpu.sig.thrsw && c->devinfo->has_accumulators) {
+ for (int i = 0; i < c->num_temps; i++) {
+ if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
+ set_temp_class_bits(c, i,
+ CLASS_BITS_PHYS);
+ }
+ }
+ }
}
-#define CLASS_BIT_PHYS (1 << 0)
-#define CLASS_BIT_ACC (1 << 1)
-#define CLASS_BIT_R5 (1 << 4)
-#define CLASS_BITS_ANY (CLASS_BIT_PHYS | \
- CLASS_BIT_ACC | \
- CLASS_BIT_R5)
+static void
+flag_program_end_nodes(struct v3d_compile *c)
+{
+ /* Only look for registers used in this many instructions */
+ uint32_t last_set_count = 6;
+
+ struct qblock *last_block = vir_exit_block(c);
+ list_for_each_entry_rev(struct qinst, inst, &last_block->instructions, link) {
+ if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
+ continue;
+
+ int num_src = v3d_qpu_add_op_num_src(inst->qpu.alu.add.op);
+ for (int i = 0; i < num_src; i++) {
+ if (inst->src[i].file == QFILE_TEMP) {
+ int node = temp_to_node(c, inst->src[i].index);
+ c->nodes.info[node].is_program_end = true;
+ }
+ }
+
+ num_src = v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op);
+ for (int i = 0; i < num_src; i++) {
+ if (inst->src[i].file == QFILE_TEMP) {
+ int node = temp_to_node(c, inst->src[i].index);
+ c->nodes.info[node].is_program_end = true;
+
+ }
+ }
+
+ if (inst->dst.file == QFILE_TEMP) {
+ int node = temp_to_node(c, inst->dst.index);
+ c->nodes.info[node].is_program_end = true;
+ }
+
+ if (--last_set_count == 0)
+ break;
+ }
+}
 /**
 * Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
 *
 * The return value should be freed by the caller.
*/ struct qpu_reg * -v3d_register_allocate(struct v3d_compile *c, bool *spilled) +v3d_register_allocate(struct v3d_compile *c) { - uint32_t UNUSED start_num_temps = c->num_temps; - struct node_to_temp_map map[c->num_temps]; - uint32_t temp_to_node[c->num_temps]; - uint8_t class_bits[c->num_temps]; int acc_nodes[ACC_COUNT]; + int implicit_rf_nodes[IMPLICIT_RF_COUNT]; + + unsigned num_ra_nodes = c->num_temps; + if (c->devinfo->has_accumulators) + num_ra_nodes += ARRAY_SIZE(acc_nodes); + else + num_ra_nodes += ARRAY_SIZE(implicit_rf_nodes); + + c->nodes = (struct v3d_ra_node_info) { + .alloc_count = c->num_temps, + .info = ralloc_array_size(c, sizeof(c->nodes.info[0]), + num_ra_nodes), + }; + + uint32_t phys_index = get_phys_index(c->devinfo); + struct v3d_ra_select_callback_data callback_data = { + .phys_index = phys_index, .next_acc = 0, /* Start at RF3, to try to keep the TLB writes from using - * RF0-2. + * RF0-2. Start at RF4 in 7.x to prevent TLB writes from + * using RF2-3. */ - .next_phys = 3, - .map = map, + .next_phys = c->devinfo->ver == 42 ? 3 : 4, + .nodes = &c->nodes, + .devinfo = c->devinfo, }; - *spilled = false; - vir_calculate_live_intervals(c); /* Convert 1, 2, 4 threads to 0, 1, 2 index. @@ -612,257 +1313,163 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled) * V3D 4.x has double the physical register space, so 64 physical regs * are available at both 1x and 2x threading, and 4x has 32. */ - int thread_index = ffs(c->threads) - 1; - if (c->devinfo->ver >= 40) { - if (thread_index >= 1) - thread_index--; - } + c->thread_index = ffs(c->threads) - 1; + if (c->thread_index >= 1) + c->thread_index--; - struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs, - c->num_temps + - ARRAY_SIZE(acc_nodes)); - ra_set_select_reg_callback(g, v3d_ra_select_callback, &callback_data); + c->g = ra_alloc_interference_graph(c->compiler->regs, num_ra_nodes); + ra_set_select_reg_callback(c->g, v3d_ra_select_callback, &callback_data); /* Make some fixed nodes for the accumulators, which we will need to * interfere with when ops have implied r3/r4 writes or for the thread * switches. We could represent these as classes for the nodes to * live in, but the classes take up a lot of memory to set up, so we - * don't want to make too many. + * don't want to make too many. We use the same mechanism on platforms + * without accumulators that can have implicit writes to phys regs. 
*/ - for (int i = 0; i < ARRAY_SIZE(acc_nodes); i++) { - acc_nodes[i] = c->num_temps + i; - ra_set_node_reg(g, acc_nodes[i], ACC_INDEX + i); - } - - for (uint32_t i = 0; i < c->num_temps; i++) { - map[i].temp = i; - map[i].priority = c->temp_end[i] - c->temp_start[i]; - } - qsort(map, c->num_temps, sizeof(map[0]), node_to_temp_priority); - for (uint32_t i = 0; i < c->num_temps; i++) { - temp_to_node[map[i].temp] = i; + for (uint32_t i = 0; i < num_ra_nodes; i++) { + c->nodes.info[i].is_ldunif_dst = false; + c->nodes.info[i].is_program_end = false; + c->nodes.info[i].unused = false; + c->nodes.info[i].priority = 0; + c->nodes.info[i].class_bits = 0; + if (c->devinfo->has_accumulators && i < ACC_COUNT) { + acc_nodes[i] = i; + ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i); + } else if (!c->devinfo->has_accumulators && + i < ARRAY_SIZE(implicit_rf_nodes)) { + implicit_rf_nodes[i] = i; + ra_set_node_reg(c->g, implicit_rf_nodes[i], phys_index + i); + } else { + uint32_t t = node_to_temp(c, i); + c->nodes.info[i].priority = + c->temp_end[t] - c->temp_start[t]; + c->nodes.info[i].class_bits = + get_class_bit_any(c->devinfo); + } } - /* Figure out our register classes and preallocated registers. We - * start with any temp being able to be in any file, then instructions - * incrementally remove bits that the temp definitely can't be in. + /* Walk the instructions adding register class restrictions and + * interferences. */ - memset(class_bits, CLASS_BITS_ANY, sizeof(class_bits)); - int ip = 0; + int last_ldvary_ip = -1; vir_for_each_inst_inorder(inst, c) { - /* If the instruction writes r3/r4 (and optionally moves its - * result to a temp), nothing else can be stored in r3/r4 across - * it. + inst->ip = ip++; + + /* ldunif(a) always write to a temporary, so we have + * liveness info available to decide if rf0 is + * available for them, however, ldvary is different: + * it always writes to rf0 directly so we don't have + * liveness information for its implicit rf0 write. + * + * That means the allocator may assign rf0 to a temp + * that is defined while an implicit rf0 write from + * ldvary is still live. We fix that by manually + * tracking rf0 live ranges from ldvary instructions. */ - if (vir_writes_r3(c->devinfo, inst)) { - for (int i = 0; i < c->num_temps; i++) { - if (c->temp_start[i] < ip && - c->temp_end[i] > ip) { - ra_add_node_interference(g, - temp_to_node[i], - acc_nodes[3]); - } - } - } - if (vir_writes_r4(c->devinfo, inst)) { - for (int i = 0; i < c->num_temps; i++) { - if (c->temp_start[i] < ip && - c->temp_end[i] > ip) { - ra_add_node_interference(g, - temp_to_node[i], - acc_nodes[4]); - } - } - } - - if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) { - switch (inst->qpu.alu.add.op) { - case V3D_QPU_A_LDVPMV_IN: - case V3D_QPU_A_LDVPMV_OUT: - case V3D_QPU_A_LDVPMD_IN: - case V3D_QPU_A_LDVPMD_OUT: - case V3D_QPU_A_LDVPMP: - case V3D_QPU_A_LDVPMG_IN: - case V3D_QPU_A_LDVPMG_OUT: - /* LDVPMs only store to temps (the MA flag - * decides whether the LDVPM is in or out) - */ - assert(inst->dst.file == QFILE_TEMP); - class_bits[inst->dst.index] &= CLASS_BIT_PHYS; - break; - - case V3D_QPU_A_RECIP: - case V3D_QPU_A_RSQRT: - case V3D_QPU_A_EXP: - case V3D_QPU_A_LOG: - case V3D_QPU_A_SIN: - case V3D_QPU_A_RSQRT2: - /* The SFU instructions write directly to the - * phys regfile. 
- */ - assert(inst->dst.file == QFILE_TEMP); - class_bits[inst->dst.index] &= CLASS_BIT_PHYS; - break; - - default: - break; - } - } + if (inst->qpu.sig.ldvary) + last_ldvary_ip = ip; - if (inst->src[0].file == QFILE_REG) { - switch (inst->src[0].index) { - case 0: - case 1: - case 2: - case 3: - /* Payload setup instructions: Force allocate - * the dst to the given register (so the MOV - * will disappear). - */ - assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV); - assert(inst->dst.file == QFILE_TEMP); - ra_set_node_reg(g, - temp_to_node[inst->dst.index], - PHYS_INDEX + - inst->src[0].index); - break; - } - } - - if (inst->dst.file == QFILE_TEMP) { - /* Only a ldunif gets to write to R5, which only has a - * single 32-bit channel of storage. - */ - if (!inst->qpu.sig.ldunif) { - class_bits[inst->dst.index] &= ~CLASS_BIT_R5; - } else { - /* Until V3D 4.x, we could only load a uniform - * to r5, so we'll need to spill if uniform - * loads interfere with each other. - */ - if (c->devinfo->ver < 40) { - class_bits[inst->dst.index] &= - CLASS_BIT_R5; - } - } - } - - if (inst->qpu.sig.thrsw) { - /* All accumulators are invalidated across a thread - * switch. - */ - for (int i = 0; i < c->num_temps; i++) { - if (c->temp_start[i] < ip && c->temp_end[i] > ip) - class_bits[i] &= CLASS_BIT_PHYS; - } - } - - ip++; + update_graph_and_reg_classes_for_inst(c, acc_nodes, + implicit_rf_nodes, + last_ldvary_ip, inst); } + /* Flag the nodes that are used in the last instructions of the program + * (there are some registers that cannot be used in the last 3 + * instructions). We only do this for fragment shaders, because the idea + * is that by avoiding this conflict we may be able to emit the last + * thread switch earlier in some cases, however, in non-fragment shaders + * this won't happen because the last instructions are always VPM stores + * with a small immediate, which conflicts with other signals, + * preventing us from ever moving the thrsw earlier. + */ + if (c->s->info.stage == MESA_SHADER_FRAGMENT) + flag_program_end_nodes(c); + + /* Set the register classes for all our temporaries in the graph */ for (uint32_t i = 0; i < c->num_temps; i++) { - if (class_bits[i] == CLASS_BIT_PHYS) { - ra_set_node_class(g, temp_to_node[i], - c->compiler->reg_class_phys[thread_index]); - } else if (class_bits[i] == (CLASS_BIT_R5)) { - ra_set_node_class(g, temp_to_node[i], - c->compiler->reg_class_r5[thread_index]); - } else if (class_bits[i] == (CLASS_BIT_PHYS | CLASS_BIT_ACC)) { - ra_set_node_class(g, temp_to_node[i], - c->compiler->reg_class_phys_or_acc[thread_index]); - } else { - assert(class_bits[i] == CLASS_BITS_ANY); - ra_set_node_class(g, temp_to_node[i], - c->compiler->reg_class_any[thread_index]); - } + ra_set_node_class(c->g, temp_to_node(c, i), + choose_reg_class_for_temp(c, i)); } + /* Add register interferences based on liveness data */ for (uint32_t i = 0; i < c->num_temps; i++) { + /* And while we are here, let's also flag nodes for + * unused temps. 
+ */ + if (c->temp_start[i] > c->temp_end[i]) + c->nodes.info[temp_to_node(c, i)].unused = true; + for (uint32_t j = i + 1; j < c->num_temps; j++) { - if (!(c->temp_start[i] >= c->temp_end[j] || - c->temp_start[j] >= c->temp_end[i])) { - ra_add_node_interference(g, - temp_to_node[i], - temp_to_node[j]); + if (interferes(c->temp_start[i], c->temp_end[i], + c->temp_start[j], c->temp_end[j])) { + ra_add_node_interference(c->g, + temp_to_node(c, i), + temp_to_node(c, j)); } } } - /* Debug code to force a bit of register spilling, for running across - * conformance tests to make sure that spilling works. + /* Debug option to force a bit of TMU spilling, for running + * across conformance tests to make sure that spilling works. */ - int force_register_spills = 0; - if (c->spill_size < - V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) { - int node = v3d_choose_spill_node(c, g, temp_to_node); - if (node != -1) { - v3d_spill_reg(c, map[node].temp); - ralloc_free(g); - *spilled = true; - return NULL; + const int force_register_spills = 0; + if (force_register_spills > 0) + c->max_tmu_spills = UINT32_MAX; + + struct qpu_reg *temp_registers = NULL; + while (true) { + if (c->spill_size < + V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) { + int node = v3d_choose_spill_node(c); + uint32_t temp = node_to_temp(c, node); + if (node != -1) { + v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp); + continue; + } } - } - - bool ok = ra_allocate(g); - if (!ok) { - const uint32_t spill_batch_size = get_spill_batch_size(c); - - for (uint32_t i = 0; i < spill_batch_size; i++) { - int node = v3d_choose_spill_node(c, g, temp_to_node); - if (node == -1) - break; - - /* TMU spills inject thrsw signals that invalidate - * accumulators, so we can't batch them. - */ - bool is_uniform = vir_is_mov_uniform(c, map[node].temp); - if (i > 0 && !is_uniform) - break; - if (is_uniform || tmu_spilling_allowed(c, thread_index)) { - v3d_spill_reg(c, map[node].temp); - - /* Ask the outer loop to call back in. */ - *spilled = true; + if (ra_allocate(c->g)) + break; - /* See comment above about batching TMU spills. - */ - if (!is_uniform) { - assert(i == 0); - break; - } - } else { - break; - } + /* Failed allocation, try to spill */ + int node = v3d_choose_spill_node(c); + if (node == -1) + goto spill_fail; + + uint32_t temp = node_to_temp(c, node); + enum temp_spill_type spill_type = + get_spill_type_for_temp(c, temp); + if (spill_type != SPILL_TYPE_TMU || tmu_spilling_allowed(c)) { + v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp); + if (c->spills + c->fills > c->max_tmu_spills) + goto spill_fail; + } else { + goto spill_fail; } - - ralloc_free(g); - return NULL; } - /* Ensure that we are not accessing temp_to_node out of bounds. We - * should never trigger this assertion because `c->num_temps` only - * grows when we spill, in which case we return early and don't get - * here. 
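/*
 * interferes() used in the loop above is the usual live-range overlap
 * test; it is exactly the negation of the disjointness condition in the
 * removed code:
 */
static inline bool
interferes(int a_start, int a_end, int b_start, int b_end)
{
        return !(a_start >= b_end || b_start >= a_end);
}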
- */ - assert(start_num_temps == c->num_temps); - struct qpu_reg *temp_registers = calloc(c->num_temps, - sizeof(*temp_registers)); - + /* Allocation was successful, build the 'temp -> reg' map */ + temp_registers = calloc(c->num_temps, sizeof(*temp_registers)); for (uint32_t i = 0; i < c->num_temps; i++) { - int ra_reg = ra_get_node_reg(g, temp_to_node[i]); - if (ra_reg < PHYS_INDEX) { + int ra_reg = ra_get_node_reg(c->g, temp_to_node(c, i)); + if (ra_reg < phys_index) { temp_registers[i].magic = true; temp_registers[i].index = (V3D_QPU_WADDR_R0 + ra_reg - ACC_INDEX); } else { temp_registers[i].magic = false; - temp_registers[i].index = ra_reg - PHYS_INDEX; + temp_registers[i].index = ra_reg - phys_index; } } - ralloc_free(g); - +spill_fail: + ralloc_free(c->nodes.info); + c->nodes.info = NULL; + c->nodes.alloc_count = 0; + ralloc_free(c->g); + c->g = NULL; return temp_registers; } diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c index aa33545420e..605c3e4c7d5 100644 --- a/src/broadcom/compiler/vir_to_qpu.c +++ b/src/broadcom/compiler/vir_to_qpu.c @@ -45,12 +45,6 @@ qpu_magic(enum v3d_qpu_waddr waddr) return reg; } -static inline struct qpu_reg -qpu_acc(int acc) -{ - return qpu_magic(V3D_QPU_WADDR_R0 + acc); -} - struct v3d_qpu_instr v3d_qpu_nop(void) { @@ -92,15 +86,32 @@ new_qpu_nop_before(struct qinst *inst) return q; } +static void +v3d71_set_src(struct v3d_qpu_instr *instr, uint8_t *raddr, struct qpu_reg src) +{ + /* If we have a small immediate move it from inst->raddr_b to the + * corresponding raddr. + */ + if (src.smimm) { + assert(instr->sig.small_imm_a || instr->sig.small_imm_b || + instr->sig.small_imm_c || instr->sig.small_imm_d); + *raddr = instr->raddr_b; + return; + } + + assert(!src.magic); + *raddr = src.index; +} + /** * Allocates the src register (accumulator or register file) into the RADDR * fields of the instruction. */ static void -set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) +v3d42_set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) { if (src.smimm) { - assert(instr->sig.small_imm); + assert(instr->sig.small_imm_b); *mux = V3D_QPU_MUX_B; return; } @@ -112,20 +123,20 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) return; } - if (instr->alu.add.a != V3D_QPU_MUX_A && - instr->alu.add.b != V3D_QPU_MUX_A && - instr->alu.mul.a != V3D_QPU_MUX_A && - instr->alu.mul.b != V3D_QPU_MUX_A) { + if (instr->alu.add.a.mux != V3D_QPU_MUX_A && + instr->alu.add.b.mux != V3D_QPU_MUX_A && + instr->alu.mul.a.mux != V3D_QPU_MUX_A && + instr->alu.mul.b.mux != V3D_QPU_MUX_A) { instr->raddr_a = src.index; *mux = V3D_QPU_MUX_A; } else { if (instr->raddr_a == src.index) { *mux = V3D_QPU_MUX_A; } else { - assert(!(instr->alu.add.a == V3D_QPU_MUX_B && - instr->alu.add.b == V3D_QPU_MUX_B && - instr->alu.mul.a == V3D_QPU_MUX_B && - instr->alu.mul.b == V3D_QPU_MUX_B) || + assert(!(instr->alu.add.a.mux == V3D_QPU_MUX_B && + instr->alu.add.b.mux == V3D_QPU_MUX_B && + instr->alu.mul.a.mux == V3D_QPU_MUX_B && + instr->alu.mul.b.mux == V3D_QPU_MUX_B) || src.index == instr->raddr_b); instr->raddr_b = src.index; @@ -134,33 +145,40 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) } } -static bool -is_no_op_mov(struct qinst *qinst) +/* + * The main purpose of the following wrapper is to make calling set_src + * cleaner. This is the reason it receives both mux and raddr pointers. Those + * will be filled or not based on the device version. 
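/*
 * Decoding an entry of the 'temp -> reg' map built above: 'magic'
 * entries hold an accumulator waddr (V3D_QPU_WADDR_R0 + n), while plain
 * entries hold a register file index. A hypothetical debug helper,
 * assuming <stdio.h> and qpu_instr.h are available:
 */
static void
sketch_dump_temp_register(uint32_t temp, struct qpu_reg reg)
{
        if (reg.magic)
                fprintf(stderr, "t%u -> r%d\n", temp,
                        reg.index - V3D_QPU_WADDR_R0);
        else
                fprintf(stderr, "t%u -> rf%d\n", temp, reg.index);
}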
+ */ +static void +set_src(struct v3d_qpu_instr *instr, + enum v3d_qpu_mux *mux, + uint8_t *raddr, + struct qpu_reg src, + const struct v3d_device_info *devinfo) { - static const struct v3d_qpu_sig no_sig = {0}; - - /* Make sure it's just a lone MOV. */ - if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU || - qinst->qpu.alu.mul.op != V3D_QPU_M_MOV || - qinst->qpu.alu.add.op != V3D_QPU_A_NOP || - memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) { - return false; - } + if (devinfo->ver < 71) + return v3d42_set_src(instr, mux, src); + else + return v3d71_set_src(instr, raddr, src); +} - /* Check if it's a MOV from a register to itself. */ +static bool +v3d42_mov_src_and_dst_equal(struct qinst *qinst) +{ enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr; if (qinst->qpu.alu.mul.magic_write) { if (waddr < V3D_QPU_WADDR_R0 || waddr > V3D_QPU_WADDR_R4) return false; - if (qinst->qpu.alu.mul.a != + if (qinst->qpu.alu.mul.a.mux != V3D_QPU_MUX_R0 + (waddr - V3D_QPU_WADDR_R0)) { return false; } } else { int raddr; - switch (qinst->qpu.alu.mul.a) { + switch (qinst->qpu.alu.mul.a.mux) { case V3D_QPU_MUX_A: raddr = qinst->qpu.raddr_a; break; @@ -174,10 +192,61 @@ is_no_op_mov(struct qinst *qinst) return false; } + return true; +} + +static bool +v3d71_mov_src_and_dst_equal(struct qinst *qinst) +{ + if (qinst->qpu.alu.mul.magic_write) + return false; + + enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr; + int raddr; + + raddr = qinst->qpu.alu.mul.a.raddr; + if (raddr != waddr) + return false; + + return true; +} + +static bool +mov_src_and_dst_equal(struct qinst *qinst, + const struct v3d_device_info *devinfo) +{ + if (devinfo->ver < 71) + return v3d42_mov_src_and_dst_equal(qinst); + else + return v3d71_mov_src_and_dst_equal(qinst); +} + + +static bool +is_no_op_mov(struct qinst *qinst, + const struct v3d_device_info *devinfo) +{ + static const struct v3d_qpu_sig no_sig = {0}; + + /* Make sure it's just a lone MOV. We only check for M_MOV. Although + * for V3D 7.x there is also A_MOV, we don't need to check for it as + * we always emit MOVs using M_MOV. We could use A_MOV later in the + * scheduler to improve performance. + */ + if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU || + qinst->qpu.alu.mul.op != V3D_QPU_M_MOV || + qinst->qpu.alu.add.op != V3D_QPU_A_NOP || + memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) { + return false; + } + + if (!mov_src_and_dst_equal(qinst, devinfo)) + return false; + /* No packing or flags updates, or we need to execute the * instruction. */ - if (qinst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE || + if (qinst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE || qinst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE || qinst->qpu.flags.mc != V3D_QPU_COND_NONE || qinst->qpu.flags.mpf != V3D_QPU_PF_NONE || @@ -193,8 +262,6 @@ v3d_generate_code_block(struct v3d_compile *c, struct qblock *block, struct qpu_reg *temp_registers) { - int last_vpm_read_index = -1; - vir_for_each_inst_safe(qinst, block) { #if 0 fprintf(stderr, "translating qinst to qpu: "); @@ -202,8 +269,6 @@ v3d_generate_code_block(struct v3d_compile *c, fprintf(stderr, "\n"); #endif - struct qinst *temp; - if (vir_has_uniform(qinst)) c->num_uniforms++; @@ -219,8 +284,14 @@ v3d_generate_code_block(struct v3d_compile *c, src[i] = qpu_magic(qinst->src[i].index); break; case QFILE_NULL: + /* QFILE_NULL is an undef, so we can load + * anything. We use a reg that doesn't have + * scheduling restrictions.
+ */ + src[i] = qpu_reg(5); + break; case QFILE_LOAD_IMM: - src[i] = qpu_acc(0); + assert(!"not reached"); break; case QFILE_TEMP: src[i] = temp_registers[index]; @@ -228,18 +299,6 @@ v3d_generate_code_block(struct v3d_compile *c, case QFILE_SMALL_IMM: src[i].smimm = true; break; - - case QFILE_VPM: - assert((int)qinst->src[i].index >= - last_vpm_read_index); - (void)last_vpm_read_index; - last_vpm_read_index = qinst->src[i].index; - - temp = new_qpu_nop_before(qinst); - temp->qpu.sig.ldvpm = true; - - src[i] = qpu_acc(3); - break; } } @@ -261,10 +320,6 @@ v3d_generate_code_block(struct v3d_compile *c, dst = temp_registers[qinst->dst.index]; break; - case QFILE_VPM: - dst = qpu_magic(V3D_QPU_WADDR_VPM); - break; - case QFILE_SMALL_IMM: case QFILE_LOAD_IMM: assert(!"not reached"); @@ -276,10 +331,15 @@ v3d_generate_code_block(struct v3d_compile *c, assert(qinst->qpu.alu.add.op == V3D_QPU_A_NOP); assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP); - if (!dst.magic || - dst.index != V3D_QPU_WADDR_R5) { - assert(c->devinfo->ver >= 40); + bool use_rf; + if (c->devinfo->has_accumulators) { + use_rf = !dst.magic || + dst.index != V3D_QPU_WADDR_R5; + } else { + use_rf = dst.magic || dst.index != 0; + } + if (use_rf) { if (qinst->qpu.sig.ldunif) { qinst->qpu.sig.ldunif = false; qinst->qpu.sig.ldunifrf = true; @@ -299,13 +359,18 @@ v3d_generate_code_block(struct v3d_compile *c, qinst->qpu.sig_magic = dst.magic; } else if (qinst->qpu.alu.add.op != V3D_QPU_A_NOP) { assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP); + if (nsrc >= 1) { set_src(&qinst->qpu, - &qinst->qpu.alu.add.a, src[0]); + &qinst->qpu.alu.add.a.mux, + &qinst->qpu.alu.add.a.raddr, + src[0], c->devinfo); } if (nsrc >= 2) { set_src(&qinst->qpu, - &qinst->qpu.alu.add.b, src[1]); + &qinst->qpu.alu.add.b.mux, + &qinst->qpu.alu.add.b.raddr, + src[1], c->devinfo); } qinst->qpu.alu.add.waddr = dst.index; @@ -313,17 +378,21 @@ v3d_generate_code_block(struct v3d_compile *c, } else { if (nsrc >= 1) { set_src(&qinst->qpu, - &qinst->qpu.alu.mul.a, src[0]); + &qinst->qpu.alu.mul.a.mux, + &qinst->qpu.alu.mul.a.raddr, + src[0], c->devinfo); } if (nsrc >= 2) { set_src(&qinst->qpu, - &qinst->qpu.alu.mul.b, src[1]); + &qinst->qpu.alu.mul.b.mux, + &qinst->qpu.alu.mul.b.raddr, + src[1], c->devinfo); } qinst->qpu.alu.mul.waddr = dst.index; qinst->qpu.alu.mul.magic_write = dst.magic; - if (is_no_op_mov(qinst)) { + if (is_no_op_mov(qinst, c->devinfo)) { vir_remove_instruction(c, qinst); continue; } @@ -378,11 +447,7 @@ v3d_dump_qpu(struct v3d_compile *c) const char *str = v3d_qpu_disasm(c->devinfo, c->qpu_insts[i]); fprintf(stderr, "0x%016"PRIx64" %s", c->qpu_insts[i], str); - /* We can only do this on 4.x, because we're not tracking TMU - * implicit uniforms here on 3.x. - */ - if (c->devinfo->ver >= 40 && - reads_uniform(c->devinfo, c->qpu_insts[i])) { + if (reads_uniform(c->devinfo, c->qpu_insts[i])) { fprintf(stderr, " ("); vir_dump_uniform(c->uniform_contents[next_uniform], c->uniform_data[next_uniform]); @@ -394,8 +459,7 @@ v3d_dump_qpu(struct v3d_compile *c) } /* Make sure our dumping lined up. 
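/*
 * The use_rf test above can be read as a predicate: the bare
 * ldunif/ldunifa signals implicitly write r5 on parts with accumulators
 * (V3D 4.x) and rf0 on V3D 7.x, so any other destination needs the
 * explicitly-addressed ldunifrf/ldunifarf variants. The helper name is
 * an assumption:
 */
static bool
sketch_ldunif_needs_rf_variant(const struct v3d_device_info *devinfo,
                               struct qpu_reg dst)
{
        if (devinfo->has_accumulators)
                return !dst.magic || dst.index != V3D_QPU_WADDR_R5;
        return dst.magic || dst.index != 0;
}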
*/ - if (c->devinfo->ver >= 40) - assert(next_uniform == c->num_uniforms); + assert(next_uniform == c->num_uniforms); fprintf(stderr, "\n"); } @@ -431,8 +495,8 @@ v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers) } assert(i == c->qpu_inst_count); - if (V3D_DEBUG & (V3D_DEBUG_QPU | - v3d_debug_flag_for_shader_stage(c->s->info.stage))) { + if (V3D_DBG(QPU) || + v3d_debug_flag_for_shader_stage(c->s->info.stage)) { v3d_dump_qpu(c); } diff --git a/src/broadcom/drm-shim/README.md b/src/broadcom/drm-shim/README.md index 16cbff75825..614cc8304bf 100644 --- a/src/broadcom/drm-shim/README.md +++ b/src/broadcom/drm-shim/README.md @@ -1,12 +1,3 @@ -### v3d backend - -This implements some of v3d using the closed source v3dv3 tree's -C/C++-based simulator. All execution is synchronous. - -Export: `MESA_LOADER_DRIVER_OVERRIDE=v3d -LD_PRELOAD=$prefix/lib/libv3d_drm_shim.so`. The v3dv3 version exposed -will depend on the v3dv3 build -- 3.3, 4.1, and 4.2 are supported. - ### v3d_noop backend This implements the minimum of v3d in order to make shader-db work. diff --git a/src/broadcom/drm-shim/meson.build b/src/broadcom/drm-shim/meson.build index b44b6c15d18..212c0287aa8 100644 --- a/src/broadcom/drm-shim/meson.build +++ b/src/broadcom/drm-shim/meson.build @@ -19,55 +19,19 @@ # SOFTWARE. libvc4_noop_drm_shim = shared_library( - ['vc4_noop_drm_shim'], + 'vc4_noop_drm_shim', 'vc4_noop.c', - include_directories: [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux], + include_directories: [inc_include, inc_src], dependencies: dep_drm_shim, gnu_symbol_visibility : 'hidden', install : true, ) libv3d_noop_drm_shim = shared_library( - ['v3d_noop_drm_shim'], + 'v3d_noop_drm_shim', 'v3d_noop.c', - include_directories: [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux], + include_directories: [inc_include, inc_src], dependencies: dep_drm_shim, gnu_symbol_visibility : 'hidden', install : true, ) - -dep_v3dv3 = dependency('v3dv3', required: false) -if dep_v3dv3.found() - v3dv3_c_args = '-DUSE_V3D_SIMULATOR' - - inc_gallium_v3d = include_directories('../../gallium/drivers/v3d') - - per_version_libs = [] - foreach ver : v3d_versions - per_version_libs += static_library( - 'libv3d_drm_shim-v' + ver, - [ - 'v3dx.c', - v3d_xml_pack - ], - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom, inc_gallium_v3d, inc_simulator], - c_args : [no_override_init_args, '-DV3D_VERSION=' + ver, v3dv3_c_args], - gnu_symbol_visibility : 'hidden', - dependencies: [dep_valgrind, dep_thread, dep_v3dv3], - ) - endforeach - - libv3d_drm_shim = shared_library( - ['v3d_drm_shim'], - [ - 'v3d.c', - '../simulator/v3d_simulator_wrapper.cpp', - ], - dependencies: [idep_mesautil, dep_dl, dep_drm_shim, dep_v3dv3], - link_with: per_version_libs, - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom, inc_gallium_v3d, inc_simulator], - c_args : [no_override_init_args, '-std=gnu99', v3dv3_c_args], - gnu_symbol_visibility : 'hidden', - cpp_args : [v3dv3_c_args] - ) -endif diff --git a/src/broadcom/drm-shim/v3d.c b/src/broadcom/drm-shim/v3d.c deleted file mode 100644 index f4d5bd31323..00000000000 --- a/src/broadcom/drm-shim/v3d.c +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright © 2018 Broadcom - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without 
restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - */ - -#include <stdio.h> -#include <sys/ioctl.h> -#include "drm-uapi/v3d_drm.h" -#include "drm-shim/drm_shim.h" -#include "v3d.h" -#include "v3d_simulator_wrapper.h" - -bool drm_shim_driver_prefers_first_render_node = false; - -static struct v3d_device_info devinfo; -struct v3d_shim_device v3d = { - .devinfo = &devinfo -}; - -struct v3d_bo *v3d_bo_lookup(struct shim_fd *shim_fd, int handle) -{ - return v3d_bo(drm_shim_bo_lookup(shim_fd, handle)); -} - -int -v3d_ioctl_wait_bo(int fd, unsigned long request, void *arg) -{ - /* No need to wait on anything yet, given that we submit - * synchronously. - */ - return 0; -} - -int -v3d_ioctl_mmap_bo(int fd, unsigned long request, void *arg) -{ - struct shim_fd *shim_fd = drm_shim_fd_lookup(fd); - struct drm_v3d_mmap_bo *map = arg; - struct shim_bo *bo = drm_shim_bo_lookup(shim_fd, map->handle); - - map->offset = drm_shim_bo_get_mmap_offset(shim_fd, bo); - - drm_shim_bo_put(bo); - - return 0; -} - -int -v3d_ioctl_get_bo_offset(int fd, unsigned long request, void *arg) -{ - struct shim_fd *shim_fd = drm_shim_fd_lookup(fd); - struct drm_v3d_get_bo_offset *get = arg; - struct v3d_bo *bo = v3d_bo_lookup(shim_fd, get->handle); - - get->offset = bo->offset; - - drm_shim_bo_put(&bo->base); - - return 0; -} - -void -drm_shim_driver_init(void) -{ - shim_device.bus_type = DRM_BUS_PLATFORM; - shim_device.driver_name = "v3d"; - - drm_shim_override_file("OF_FULLNAME=/rdb/v3d\n" - "OF_COMPATIBLE_N=1\n" - "OF_COMPATIBLE_0=brcm,7278-v3d\n", - "/sys/dev/char/%d:%d/device/uevent", - DRM_MAJOR, render_node_minor); - - v3d.hw = v3d_hw_auto_new(NULL); - v3d.devinfo->ver = v3d_hw_get_version(v3d.hw); - - if (v3d.devinfo->ver >= 42) - v3d42_drm_shim_driver_init(); - else if (v3d.devinfo->ver >= 41) - v3d41_drm_shim_driver_init(); - else - v3d33_drm_shim_driver_init(); -} diff --git a/src/broadcom/drm-shim/v3d.h b/src/broadcom/drm-shim/v3d.h deleted file mode 100644 index 0712b8b3f24..00000000000 --- a/src/broadcom/drm-shim/v3d.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright © 2018 Broadcom - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be 
included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#ifndef DRM_SHIM_V3D_H -#define DRM_SHIM_V3D_H - -#include "broadcom/common/v3d_device_info.h" -#include "util/vma.h" - -struct drm_shim_fd; - -struct v3d_shim_device { - struct v3d_hw *hw; - struct v3d_device_info *devinfo; - - /* Base virtual address of the heap. */ - void *mem; - /* Base hardware address of the heap. */ - uint32_t mem_base; - /* Size of the heap. */ - size_t mem_size; - - /* Allocator for the GPU virtual addresses. */ - struct util_vma_heap heap; -}; -extern struct v3d_shim_device v3d; - -struct v3d_bo { - struct shim_bo base; - uint64_t offset; - void *sim_vaddr; - void *gem_vaddr; -}; - -static inline struct v3d_bo * -v3d_bo(struct shim_bo *bo) -{ - return (struct v3d_bo *)bo; -} - -struct v3d_bo *v3d_bo_lookup(struct shim_fd *shim_fd, int handle); -int v3d_ioctl_wait_bo(int fd, unsigned long request, void *arg); -int v3d_ioctl_mmap_bo(int fd, unsigned long request, void *arg); -int v3d_ioctl_get_bo_offset(int fd, unsigned long request, void *arg); - -void v3d33_drm_shim_driver_init(void); -void v3d41_drm_shim_driver_init(void); -void v3d42_drm_shim_driver_init(void); - -#endif /* DRM_SHIM_V3D_H */ diff --git a/src/broadcom/drm-shim/v3d_noop.c b/src/broadcom/drm-shim/v3d_noop.c index fd92e8859c5..8a27052441b 100644 --- a/src/broadcom/drm-shim/v3d_noop.c +++ b/src/broadcom/drm-shim/v3d_noop.c @@ -122,6 +122,15 @@ v3d_ioctl_get_param(int fd, unsigned long request, void *arg) case DRM_V3D_PARAM_SUPPORTS_TFU: gp->value = 1; return 0; + case DRM_V3D_PARAM_SUPPORTS_CSD: + gp->value = 1; + return 0; + case DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH: + gp->value = 1; + return 0; + case DRM_V3D_PARAM_SUPPORTS_PERFMON: + gp->value = 1; + return 0; default: break; } diff --git a/src/broadcom/drm-shim/v3dx.c b/src/broadcom/drm-shim/v3dx.c deleted file mode 100644 index a22550a03a5..00000000000 --- a/src/broadcom/drm-shim/v3dx.c +++ /dev/null @@ -1,370 +0,0 @@ -/* - * Copyright © 2014-2017 Broadcom - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -/* @file - * - * v3d driver code interacting v3dv3 simulator/fpga library. - * - * This is compiled per V3D version we support, since the register definitions - * conflict. - */ - -#include <errno.h> -#include <stdbool.h> -#include <stdio.h> -#include <string.h> -#include <sys/mman.h> -#include "util/macros.h" -#include "util/u_mm.h" -#include "broadcom/common/v3d_macros.h" -#include "v3d_simulator_wrapper.h" -#include "drm-shim/drm_shim.h" -#include "drm-uapi/v3d_drm.h" -#include "v3d.h" - -#define HW_REGISTER_RO(x) (x) -#define HW_REGISTER_RW(x) (x) -#if V3D_VERSION >= 41 -#include "libs/core/v3d/registers/4.1.34.0/v3d.h" -#else -#include "libs/core/v3d/registers/3.3.0.0/v3d.h" -#endif - -#define V3D_WRITE(reg, val) v3d_hw_write_reg(v3d.hw, reg, val) -#define V3D_READ(reg) v3d_hw_read_reg(v3d.hw, reg) - -static void -v3d_flush_l3() -{ - if (!v3d_hw_has_gca(v3d.hw)) - return; - -#if V3D_VERSION < 40 - uint32_t gca_ctrl = V3D_READ(V3D_GCA_CACHE_CTRL); - - V3D_WRITE(V3D_GCA_CACHE_CTRL, gca_ctrl | V3D_GCA_CACHE_CTRL_FLUSH_SET); - V3D_WRITE(V3D_GCA_CACHE_CTRL, gca_ctrl & ~V3D_GCA_CACHE_CTRL_FLUSH_SET); -#endif -} - -/* Invalidates the L2 cache. This is a read-only cache. */ -static void -v3d_flush_l2(void) -{ - V3D_WRITE(V3D_CTL_0_L2CACTL, - V3D_CTL_0_L2CACTL_L2CCLR_SET | - V3D_CTL_0_L2CACTL_L2CENA_SET); -} - -/* Invalidates texture L2 cachelines */ -static void -v3d_flush_l2t(void) -{ - V3D_WRITE(V3D_CTL_0_L2TFLSTA, 0); - V3D_WRITE(V3D_CTL_0_L2TFLEND, ~0); - V3D_WRITE(V3D_CTL_0_L2TCACTL, - V3D_CTL_0_L2TCACTL_L2TFLS_SET | - (0 << V3D_CTL_0_L2TCACTL_L2TFLM_LSB)); -} - -/* Invalidates the slice caches. These are read-only caches. */ -static void -v3d_flush_slices(void) -{ - V3D_WRITE(V3D_CTL_0_SLCACTL, ~0); -} - -static void -v3d_flush_caches(void) -{ - v3d_flush_l3(); - v3d_flush_l2(); - v3d_flush_l2t(); - v3d_flush_slices(); -} - -static void -v3d_simulator_copy_in_handle(struct shim_fd *shim_fd, int handle) -{ - if (!handle) - return; - - struct v3d_bo *bo = v3d_bo_lookup(shim_fd, handle); - - memcpy(bo->sim_vaddr, bo->gem_vaddr, bo->base.size); -} - -static void -v3d_simulator_copy_out_handle(struct shim_fd *shim_fd, int handle) -{ - if (!handle) - return; - - struct v3d_bo *bo = v3d_bo_lookup(shim_fd, handle); - - memcpy(bo->gem_vaddr, bo->sim_vaddr, bo->base.size); -} - -static int -v3dX(v3d_ioctl_submit_cl)(int fd, unsigned long request, void *arg) -{ - struct shim_fd *shim_fd = drm_shim_fd_lookup(fd); - struct drm_v3d_submit_cl *submit = arg; - uint32_t *bo_handles = (uint32_t *)(uintptr_t)submit->bo_handles; - - for (int i = 0; i < submit->bo_handle_count; i++) - v3d_simulator_copy_in_handle(shim_fd, bo_handles[i]); - - v3d_flush_caches(); - - if (submit->qma) { - V3D_WRITE(V3D_CLE_0_CT0QMA, submit->qma); - V3D_WRITE(V3D_CLE_0_CT0QMS, submit->qms); - } -#if V3D_VERSION >= 41 - if (submit->qts) { - V3D_WRITE(V3D_CLE_0_CT0QTS, - V3D_CLE_0_CT0QTS_CTQTSEN_SET | - submit->qts); - } -#endif - - fprintf(stderr, "submit %x..%x!\n", submit->bcl_start, submit->bcl_end); - - V3D_WRITE(V3D_CLE_0_CT0QBA, submit->bcl_start); - V3D_WRITE(V3D_CLE_0_CT0QEA, submit->bcl_end); - - /* Wait for bin to complete before firing render, as it seems the - * simulator doesn't implement the semaphores. 
- */ - while (V3D_READ(V3D_CLE_0_CT0CA) != - V3D_READ(V3D_CLE_0_CT0EA)) { - v3d_hw_tick(v3d.hw); - } - - fprintf(stderr, "submit %x..%x!\n", submit->rcl_start, submit->rcl_end); - - v3d_flush_caches(); - - V3D_WRITE(V3D_CLE_0_CT1QBA, submit->rcl_start); - V3D_WRITE(V3D_CLE_0_CT1QEA, submit->rcl_end); - - while (V3D_READ(V3D_CLE_0_CT1CA) != - V3D_READ(V3D_CLE_0_CT1EA)) { - v3d_hw_tick(v3d.hw); - } - - for (int i = 0; i < submit->bo_handle_count; i++) - v3d_simulator_copy_out_handle(shim_fd, bo_handles[i]); - - return 0; -} - -static int -v3dX(v3d_ioctl_submit_tfu)(int fd, unsigned long request, void *arg) -{ - struct shim_fd *shim_fd = drm_shim_fd_lookup(fd); - struct drm_v3d_submit_tfu *submit = arg; - - v3d_simulator_copy_in_handle(shim_fd, submit->bo_handles[0]); - v3d_simulator_copy_in_handle(shim_fd, submit->bo_handles[1]); - v3d_simulator_copy_in_handle(shim_fd, submit->bo_handles[2]); - v3d_simulator_copy_in_handle(shim_fd, submit->bo_handles[3]); - - int last_vtct = V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET; - - V3D_WRITE(V3D_TFU_IIA, submit->iia); - V3D_WRITE(V3D_TFU_IIS, submit->iis); - V3D_WRITE(V3D_TFU_ICA, submit->ica); - V3D_WRITE(V3D_TFU_IUA, submit->iua); - V3D_WRITE(V3D_TFU_IOA, submit->ioa); - V3D_WRITE(V3D_TFU_IOS, submit->ios); - V3D_WRITE(V3D_TFU_COEF0, submit->coef[0]); - V3D_WRITE(V3D_TFU_COEF1, submit->coef[1]); - V3D_WRITE(V3D_TFU_COEF2, submit->coef[2]); - V3D_WRITE(V3D_TFU_COEF3, submit->coef[3]); - - V3D_WRITE(V3D_TFU_ICFG, submit->icfg); - - while ((V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET) == last_vtct) { - v3d_hw_tick(v3d.hw); - } - - v3d_simulator_copy_out_handle(shim_fd, submit->bo_handles[0]); - - return 0; -} - -static int -v3dX(v3d_ioctl_create_bo)(int fd, unsigned long request, void *arg) -{ - struct shim_fd *shim_fd = drm_shim_fd_lookup(fd); - struct drm_v3d_create_bo *create = arg; - struct v3d_bo *bo = calloc(1, sizeof(*bo)); - - drm_shim_bo_init(&bo->base, create->size); - bo->offset = util_vma_heap_alloc(&v3d.heap, create->size, 4096); - if (bo->offset == 0) - return -ENOMEM; - - bo->sim_vaddr = v3d.mem + bo->offset - v3d.mem_base; -#if 0 - /* Place a mapping of the BO inside of the simulator's address space - * for V3D memory. This lets us avoid copy in/out for simpenrose, but - * I'm betting we'll need something else for FPGA. - */ - void *sim_addr = v3d.mem + bo->block->ofs; - void *mmap_ret = mmap(sim_addr, create->size, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, bo->base.fd, 0); - assert(mmap_ret == sim_addr); -#else - /* Make a simulator-private mapping of the shim GEM object. 
*/ - bo->gem_vaddr = mmap(NULL, bo->base.size, - PROT_READ | PROT_WRITE, - MAP_SHARED, - bo->base.fd, 0); - if (bo->gem_vaddr == MAP_FAILED) { - fprintf(stderr, "v3d: mmap of shim bo failed\n"); - abort(); - } -#endif - - create->offset = bo->offset; - create->handle = drm_shim_bo_get_handle(shim_fd, &bo->base); - - drm_shim_bo_put(&bo->base); - - return 0; -} - -static int -v3dX(v3d_ioctl_get_param)(int fd, unsigned long request, void *arg) -{ - struct drm_v3d_get_param *gp = arg; - static const uint32_t reg_map[] = { - [DRM_V3D_PARAM_V3D_UIFCFG] = V3D_HUB_CTL_UIFCFG, - [DRM_V3D_PARAM_V3D_HUB_IDENT1] = V3D_HUB_CTL_IDENT1, - [DRM_V3D_PARAM_V3D_HUB_IDENT2] = V3D_HUB_CTL_IDENT2, - [DRM_V3D_PARAM_V3D_HUB_IDENT3] = V3D_HUB_CTL_IDENT3, - [DRM_V3D_PARAM_V3D_CORE0_IDENT0] = V3D_CTL_0_IDENT0, - [DRM_V3D_PARAM_V3D_CORE0_IDENT1] = V3D_CTL_0_IDENT1, - [DRM_V3D_PARAM_V3D_CORE0_IDENT2] = V3D_CTL_0_IDENT2, - }; - - switch (gp->param) { - case DRM_V3D_PARAM_SUPPORTS_TFU: - gp->value = 1; - return 0; - } - - if (gp->param < ARRAY_SIZE(reg_map) && reg_map[gp->param]) { - gp->value = V3D_READ(reg_map[gp->param]); - return 0; - } - - fprintf(stderr, "Unknown DRM_IOCTL_V3D_GET_PARAM %d\n", gp->param); - return -1; -} - -static ioctl_fn_t driver_ioctls[] = { - [DRM_V3D_SUBMIT_CL] = v3dX(v3d_ioctl_submit_cl), - [DRM_V3D_SUBMIT_TFU] = v3dX(v3d_ioctl_submit_tfu), - [DRM_V3D_WAIT_BO] = v3d_ioctl_wait_bo, - [DRM_V3D_CREATE_BO] = v3dX(v3d_ioctl_create_bo), - [DRM_V3D_GET_PARAM] = v3dX(v3d_ioctl_get_param), - [DRM_V3D_MMAP_BO] = v3d_ioctl_mmap_bo, - [DRM_V3D_GET_BO_OFFSET] = v3d_ioctl_get_bo_offset, -}; - -static void -v3d_isr(uint32_t hub_status) -{ - /* Check the per-core bits */ - if (hub_status & (1 << 0)) { - uint32_t core_status = V3D_READ(V3D_CTL_0_INT_STS); - - if (core_status & V3D_CTL_0_INT_STS_INT_GMPV_SET) { - fprintf(stderr, "GMP violation at 0x%08x\n", - V3D_READ(V3D_GMP_0_VIO_ADDR)); - abort(); - } else { - fprintf(stderr, - "Unexpected ISR with core status 0x%08x\n", - core_status); - } - abort(); - } - - return; -} - -static void -v3dX(simulator_init_regs)(void) -{ -#if V3D_VERSION == 33 - /* Set OVRTMUOUT to match kernel behavior. - * - * This means that the texture sampler uniform configuration's tmu - * output type field is used, instead of using the hardware default - * behavior based on the texture type. If you want the default - * behavior, you can still put "2" in the indirect texture state's - * output_type field. - */ - V3D_WRITE(V3D_CTL_0_MISCCFG, V3D_CTL_1_MISCCFG_OVRTMUOUT_SET); -#endif - - uint32_t core_interrupts = V3D_CTL_0_INT_STS_INT_GMPV_SET; - V3D_WRITE(V3D_CTL_0_INT_MSK_SET, ~core_interrupts); - V3D_WRITE(V3D_CTL_0_INT_MSK_CLR, core_interrupts); - - v3d_hw_set_isr(v3d.hw, v3d_isr); -} - -static void -v3d_bo_free(struct shim_bo *shim_bo) -{ - struct v3d_bo *bo = v3d_bo(shim_bo); - - if (bo->gem_vaddr) - munmap(bo->gem_vaddr, shim_bo->size); - - util_vma_heap_free(&v3d.heap, bo->offset, bo->base.size); -} - -void -v3dX(drm_shim_driver_init)(void) -{ - shim_device.driver_ioctls = driver_ioctls; - shim_device.driver_ioctl_count = ARRAY_SIZE(driver_ioctls); - - shim_device.driver_bo_free = v3d_bo_free; - - /* Allocate a gig of memory to play in. 
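/*
 * For context, userspace reaches the GET_PARAM handlers above through
 * the DRM_IOCTL_V3D_GET_PARAM ioctl from drm-uapi/v3d_drm.h. A sketch of
 * probing the CSD support that the noop shim now advertises:
 */
#include <stdbool.h>
#include <sys/ioctl.h>
#include "drm-uapi/v3d_drm.h"

static bool
sketch_v3d_supports_csd(int fd)
{
        struct drm_v3d_get_param gp = {
                .param = DRM_V3D_PARAM_SUPPORTS_CSD,
        };

        if (ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &gp) != 0)
                return false;

        return gp.value != 0;
}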
*/ - v3d_hw_alloc_mem(v3d.hw, 1024 * 1024 * 1024); - v3d.mem_base = - v3d_hw_get_mem(v3d.hw, &v3d.mem_size, - &v3d.mem); - util_vma_heap_init(&v3d.heap, 4096, v3d.mem_size - 4096); - - v3dX(simulator_init_regs)(); -} diff --git a/src/broadcom/drm-shim/vc4_noop.c b/src/broadcom/drm-shim/vc4_noop.c index 3f85158e6df..b9c83db8313 100644 --- a/src/broadcom/drm-shim/vc4_noop.c +++ b/src/broadcom/drm-shim/vc4_noop.c @@ -51,6 +51,20 @@ vc4_ioctl_create_bo(int fd, unsigned long request, void *arg) } static int +vc4_ioctl_create_shader_bo(int fd, unsigned long request, void *arg) +{ + struct shim_fd *shim_fd = drm_shim_fd_lookup(fd); + struct drm_vc4_create_shader_bo *create = arg; + struct shim_bo *bo = calloc(1, sizeof(*bo)); + + drm_shim_bo_init(bo, create->size); + create->handle = drm_shim_bo_get_handle(shim_fd, bo); + drm_shim_bo_put(bo); + + return 0; +} + +static int vc4_ioctl_mmap_bo(int fd, unsigned long request, void *arg) { struct shim_fd *shim_fd = drm_shim_fd_lookup(fd); @@ -101,6 +115,7 @@ vc4_ioctl_get_param(int fd, unsigned long request, void *arg) static ioctl_fn_t driver_ioctls[] = { [DRM_VC4_CREATE_BO] = vc4_ioctl_create_bo, + [DRM_VC4_CREATE_SHADER_BO] = vc4_ioctl_create_shader_bo, [DRM_VC4_MMAP_BO] = vc4_ioctl_mmap_bo, [DRM_VC4_GET_PARAM] = vc4_ioctl_get_param, [DRM_VC4_GET_TILING] = vc4_ioctl_noop, diff --git a/src/broadcom/meson.build b/src/broadcom/meson.build index 2e1145dd0c0..f8e93526300 100644 --- a/src/broadcom/meson.build +++ b/src/broadcom/meson.build @@ -22,7 +22,7 @@ inc_broadcom = include_directories('.', 'cle') subdir('cle') -v3d_versions = ['33', '41', '42'] +v3d_versions = ['42', '71'] v3d_libs = [] if with_gallium_v3d or with_broadcom_vk @@ -38,12 +38,12 @@ endif per_version_libs = [] foreach ver : v3d_versions per_version_libs += static_library( - 'libbroadcom-v' + ver, + 'broadcom-v' + ver, [ files('clif/v3dx_dump.c'), v3d_xml_pack ], - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom], + include_directories : [inc_include, inc_src, inc_broadcom], c_args : [no_override_init_args, '-DV3D_VERSION=' + ver], gnu_symbol_visibility : 'hidden', dependencies: [dep_valgrind, dep_thread], @@ -61,7 +61,7 @@ libv3d_neon = static_library( 'v3d_neon', 'common/v3d_tiling.c', include_directories : [ - inc_src, inc_include, inc_gallium, inc_gallium_aux, inc_broadcom, + inc_src, inc_include, inc_broadcom, ], c_args : [v3d_args, v3d_neon_c_args], gnu_symbol_visibility : 'hidden', @@ -69,12 +69,12 @@ libv3d_neon = static_library( ) libbroadcom_v3d = static_library( - 'libbroadcom_v3d', + 'broadcom_v3d', [ files('common/v3d_debug.c', 'common/v3d_device_info.c', 'clif/clif_dump.c', 'common/v3d_util.c'), v3d_xml_pack, ], - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom], + include_directories : [inc_include, inc_src, inc_broadcom], c_args : [no_override_init_args], gnu_symbol_visibility : 'hidden', link_whole : v3d_libs + per_version_libs, diff --git a/src/broadcom/qpu/meson.build b/src/broadcom/qpu/meson.build index eea1f9bb058..fefc6a5cc56 100644 --- a/src/broadcom/qpu/meson.build +++ b/src/broadcom/qpu/meson.build @@ -25,9 +25,9 @@ libbroadcom_qpu_files = files( ) libbroadcom_qpu = static_library( - ['broadcom_qpu', v3d_xml_pack], - libbroadcom_qpu_files, - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom], + 'broadcom_qpu', + [libbroadcom_qpu_files, v3d_xml_pack], + include_directories : [inc_include, 
inc_src, inc_broadcom], c_args : [no_override_init_args], gnu_symbol_visibility : 'hidden', dependencies : [dep_libdrm, dep_valgrind], @@ -42,7 +42,7 @@ test( 'qpu_disasm', 'tests/qpu_disasm.c', link_with: libbroadcom_qpu, dependencies : idep_mesautil, - include_directories: [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux] + include_directories: [inc_include, inc_src] ), suite : ['broadcom'], ) diff --git a/src/broadcom/qpu/qpu_disasm.c b/src/broadcom/qpu/qpu_disasm.c index b5648bd76e2..c1590a760de 100644 --- a/src/broadcom/qpu/qpu_disasm.c +++ b/src/broadcom/qpu/qpu_disasm.c @@ -56,13 +56,14 @@ pad_to(struct disasm_state *disasm, int n) static void -v3d_qpu_disasm_raddr(struct disasm_state *disasm, - const struct v3d_qpu_instr *instr, uint8_t mux) +v3d33_qpu_disasm_raddr(struct disasm_state *disasm, + const struct v3d_qpu_instr *instr, + enum v3d_qpu_mux mux) { if (mux == V3D_QPU_MUX_A) { append(disasm, "rf%d", instr->raddr_a); } else if (mux == V3D_QPU_MUX_B) { - if (instr->sig.small_imm) { + if (instr->sig.small_imm_b) { uint32_t val; ASSERTED bool ok = v3d_qpu_small_imm_unpack(disasm->devinfo, @@ -82,6 +83,64 @@ v3d_qpu_disasm_raddr(struct disasm_state *disasm, } } +enum v3d_qpu_input_class { + V3D_QPU_ADD_A, + V3D_QPU_ADD_B, + V3D_QPU_MUL_A, + V3D_QPU_MUL_B +}; + +static void +v3d71_qpu_disasm_raddr(struct disasm_state *disasm, + const struct v3d_qpu_instr *instr, + uint8_t raddr, + enum v3d_qpu_input_class input_class) +{ + bool is_small_imm = false; + switch(input_class) { + case V3D_QPU_ADD_A: + is_small_imm = instr->sig.small_imm_a; + break; + case V3D_QPU_ADD_B: + is_small_imm = instr->sig.small_imm_b; + break; + case V3D_QPU_MUL_A: + is_small_imm = instr->sig.small_imm_c; + break; + case V3D_QPU_MUL_B: + is_small_imm = instr->sig.small_imm_d; + break; + } + + if (is_small_imm) { + uint32_t val; + ASSERTED bool ok = + v3d_qpu_small_imm_unpack(disasm->devinfo, + raddr, + &val); + + if ((int)val >= -16 && (int)val <= 15) + append(disasm, "%d", val); + else + append(disasm, "0x%08x", val); + assert(ok); + } else { + append(disasm, "rf%d", raddr); + } +} + +static void +v3d_qpu_disasm_raddr(struct disasm_state *disasm, + const struct v3d_qpu_instr *instr, + const struct v3d_qpu_input *input, + enum v3d_qpu_input_class input_class) +{ + if (disasm->devinfo->ver < 71) + v3d33_qpu_disasm_raddr(disasm, instr, input->mux); + else + v3d71_qpu_disasm_raddr(disasm, instr, input->raddr, input_class); +} + static void v3d_qpu_disasm_waddr(struct disasm_state *disasm, uint32_t waddr, bool magic) { @@ -110,7 +169,7 @@ v3d_qpu_disasm_add(struct disasm_state *disasm, append(disasm, "%s", v3d_qpu_pf_name(instr->flags.apf)); append(disasm, "%s", v3d_qpu_uf_name(instr->flags.auf)); - append(disasm, " "); + append(disasm, " "); if (has_dst) { v3d_qpu_disasm_waddr(disasm, instr->alu.add.waddr, @@ -121,16 +180,16 @@ v3d_qpu_disasm_add(struct disasm_state *disasm, if (num_src >= 1) { if (has_dst) append(disasm, ", "); - v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.a); + v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.add.a, V3D_QPU_ADD_A); append(disasm, "%s", - v3d_qpu_unpack_name(instr->alu.add.a_unpack)); + v3d_qpu_unpack_name(instr->alu.add.a.unpack)); } if (num_src >= 2) { append(disasm, ", "); - v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.b); + v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.add.b, V3D_QPU_ADD_B); append(disasm, "%s", - v3d_qpu_unpack_name(instr->alu.add.b_unpack)); + v3d_qpu_unpack_name(instr->alu.add.b.unpack)); } } @@ -141,7 +200,7 @@ 
v3d_qpu_disasm_mul(struct disasm_state *disasm, bool has_dst = v3d_qpu_mul_op_has_dst(instr->alu.mul.op); int num_src = v3d_qpu_mul_op_num_src(instr->alu.mul.op); - pad_to(disasm, 21); + pad_to(disasm, 30); append(disasm, "; "); append(disasm, "%s", v3d_qpu_mul_op_name(instr->alu.mul.op)); @@ -153,7 +212,7 @@ v3d_qpu_disasm_mul(struct disasm_state *disasm, if (instr->alu.mul.op == V3D_QPU_M_NOP) return; - append(disasm, " "); + append(disasm, " "); if (has_dst) { v3d_qpu_disasm_waddr(disasm, instr->alu.mul.waddr, @@ -164,16 +223,16 @@ v3d_qpu_disasm_mul(struct disasm_state *disasm, if (num_src >= 1) { if (has_dst) append(disasm, ", "); - v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.a); + v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.mul.a, V3D_QPU_MUL_A); append(disasm, "%s", - v3d_qpu_unpack_name(instr->alu.mul.a_unpack)); + v3d_qpu_unpack_name(instr->alu.mul.a.unpack)); } if (num_src >= 2) { append(disasm, ", "); - v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.b); + v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.mul.b, V3D_QPU_MUL_B); append(disasm, "%s", - v3d_qpu_unpack_name(instr->alu.mul.b_unpack)); + v3d_qpu_unpack_name(instr->alu.mul.b.unpack)); } } @@ -217,7 +276,7 @@ v3d_qpu_disasm_sig(struct disasm_state *disasm, return; } - pad_to(disasm, 41); + pad_to(disasm, 60); if (sig->thrsw) append(disasm, "; thrsw"); diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c index 569c5fc4074..9a6434d94dd 100644 --- a/src/broadcom/qpu/qpu_instr.c +++ b/src/broadcom/qpu/qpu_instr.c @@ -35,6 +35,14 @@ v3d_qpu_magic_waddr_name(const struct v3d_device_info *devinfo, if (devinfo->ver < 40 && waddr == V3D_QPU_WADDR_TMU) return "tmu"; + /* On V3D 7.x, QUAD and REP alias R5 and R5REP in the table below. + */ + if (devinfo->ver >= 71 && waddr == V3D_QPU_WADDR_QUAD) + return "quad"; + + if (devinfo->ver >= 71 && waddr == V3D_QPU_WADDR_REP) + return "rep"; + static const char *waddr_magic[] = { [V3D_QPU_WADDR_R0] = "r0", [V3D_QPU_WADDR_R1] = "r1", @@ -169,6 +177,19 @@ v3d_qpu_add_op_name(enum v3d_qpu_add_op op) [V3D_QPU_A_ITOF] = "itof", [V3D_QPU_A_CLZ] = "clz", [V3D_QPU_A_UTOF] = "utof", + [V3D_QPU_A_MOV] = "mov", + [V3D_QPU_A_FMOV] = "fmov", + [V3D_QPU_A_VPACK] = "vpack", + [V3D_QPU_A_V8PACK] = "v8pack", + [V3D_QPU_A_V10PACK] = "v10pack", + [V3D_QPU_A_V11FPACK] = "v11fpack", + [V3D_QPU_A_BALLOT] = "ballot", + [V3D_QPU_A_BCASTF] = "bcastf", + [V3D_QPU_A_ALLEQ] = "alleq", + [V3D_QPU_A_ALLFEQ] = "allfeq", + [V3D_QPU_A_ROTQ] = "rotq", + [V3D_QPU_A_ROT] = "rot", + [V3D_QPU_A_SHUFFLE] = "shuffle", }; if (op >= ARRAY_SIZE(op_names)) @@ -191,6 +212,12 @@ v3d_qpu_mul_op_name(enum v3d_qpu_mul_op op) [V3D_QPU_M_MOV] = "mov", [V3D_QPU_M_NOP] = "nop", [V3D_QPU_M_FMUL] = "fmul", + [V3D_QPU_M_FTOUNORM16] = "ftounorm16", + [V3D_QPU_M_FTOSNORM16] = "ftosnorm16", + [V3D_QPU_M_VFTOUNORM8] = "vftounorm8", + [V3D_QPU_M_VFTOSNORM8] = "vftosnorm8", + [V3D_QPU_M_VFTOUNORM10LO] = "vftounorm10lo", + [V3D_QPU_M_VFTOUNORM10HI] = "vftounorm10hi", }; if (op >= ARRAY_SIZE(op_names)) @@ -450,6 +477,21 @@ static const uint8_t add_op_args[] = { [V3D_QPU_A_ITOF] = D | A, [V3D_QPU_A_CLZ] = D | A, [V3D_QPU_A_UTOF] = D | A, + + [V3D_QPU_A_MOV] = D | A, + [V3D_QPU_A_FMOV] = D | A, + [V3D_QPU_A_VPACK] = D | A | B, + [V3D_QPU_A_V8PACK] = D | A | B, + [V3D_QPU_A_V10PACK] = D | A | B, + [V3D_QPU_A_V11FPACK] = D | A | B, + + [V3D_QPU_A_BALLOT] = D | A, + [V3D_QPU_A_BCASTF] = D | A, + [V3D_QPU_A_ALLEQ] = D | A, + [V3D_QPU_A_ALLFEQ] = D | A, + [V3D_QPU_A_ROTQ] = D | A | B, + [V3D_QPU_A_ROT] = D | A | B, +
[V3D_QPU_A_SHUFFLE] = D | A | B, }; static const uint8_t mul_op_args[] = { @@ -463,6 +505,12 @@ static const uint8_t mul_op_args[] = { [V3D_QPU_M_NOP] = 0, [V3D_QPU_M_MOV] = D | A, [V3D_QPU_M_FMUL] = D | A | B, + [V3D_QPU_M_FTOUNORM16] = D | A, + [V3D_QPU_M_FTOSNORM16] = D | A, + [V3D_QPU_M_VFTOUNORM8] = D | A, + [V3D_QPU_M_VFTOSNORM8] = D | A, + [V3D_QPU_M_VFTOUNORM10LO] = D | A, + [V3D_QPU_M_VFTOUNORM10HI] = D | A, }; bool @@ -636,19 +684,23 @@ v3d_qpu_add_op_writes_vpm(enum v3d_qpu_add_op op) } bool -v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) +v3d_qpu_reads_tlb(const struct v3d_qpu_instr *inst) { - if (inst->sig.ldtlb || - inst->sig.ldtlbu) - return true; + return inst->sig.ldtlb || inst->sig.ldtlbu; +} +bool +v3d_qpu_writes_tlb(const struct v3d_qpu_instr *inst) +{ if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { - if (inst->alu.add.magic_write && + if (inst->alu.add.op != V3D_QPU_A_NOP && + inst->alu.add.magic_write && v3d_qpu_magic_waddr_is_tlb(inst->alu.add.waddr)) { return true; } - if (inst->alu.mul.magic_write && + if (inst->alu.mul.op != V3D_QPU_M_NOP && + inst->alu.mul.magic_write && v3d_qpu_magic_waddr_is_tlb(inst->alu.mul.waddr)) { return true; } @@ -658,18 +710,32 @@ v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) } bool +v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) +{ + return v3d_qpu_writes_tlb(inst) || v3d_qpu_reads_tlb(inst); +} + +bool v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst) { - if (v3d_qpu_instr_is_sfu(inst)) - return true; + return v3d_qpu_instr_is_sfu(inst) || v3d_qpu_instr_is_legacy_sfu(inst); +} +/* Checks whether the instruction implements an SFU operation by writing + * to specific magic register addresses instead of using SFU ALU opcodes. + */ +bool +v3d_qpu_instr_is_legacy_sfu(const struct v3d_qpu_instr *inst) +{ if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { - if (inst->alu.add.magic_write && + if (inst->alu.add.op != V3D_QPU_A_NOP && + inst->alu.add.magic_write && v3d_qpu_magic_waddr_is_sfu(inst->alu.add.waddr)) { return true; } - if (inst->alu.mul.magic_write && + if (inst->alu.mul.op != V3D_QPU_M_NOP && + inst->alu.mul.magic_write && v3d_qpu_magic_waddr_is_sfu(inst->alu.mul.waddr)) { return true; } @@ -689,6 +755,13 @@ v3d_qpu_instr_is_sfu(const struct v3d_qpu_instr *inst) case V3D_QPU_A_LOG: case V3D_QPU_A_SIN: case V3D_QPU_A_RSQRT2: + case V3D_QPU_A_BALLOT: + case V3D_QPU_A_BCASTF: + case V3D_QPU_A_ALLEQ: + case V3D_QPU_A_ALLFEQ: + case V3D_QPU_A_ROTQ: + case V3D_QPU_A_ROT: + case V3D_QPU_A_SHUFFLE: return true; default: return false; @@ -702,9 +775,11 @@ v3d_qpu_writes_tmu(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *inst) { return (inst->type == V3D_QPU_INSTR_TYPE_ALU && - ((inst->alu.add.magic_write && + ((inst->alu.add.op != V3D_QPU_A_NOP && + inst->alu.add.magic_write && v3d_qpu_magic_waddr_is_tmu(devinfo, inst->alu.add.waddr)) || - (inst->alu.mul.magic_write && + (inst->alu.mul.op != V3D_QPU_M_NOP && + inst->alu.mul.magic_write && v3d_qpu_magic_waddr_is_tmu(devinfo, inst->alu.mul.waddr)))); } @@ -740,12 +815,14 @@ v3d_qpu_writes_vpm(const struct v3d_qpu_instr *inst) if (v3d_qpu_add_op_writes_vpm(inst->alu.add.op)) return true; - if (inst->alu.add.magic_write && + if (inst->alu.add.op != V3D_QPU_A_NOP && + inst->alu.add.magic_write && v3d_qpu_magic_waddr_is_vpm(inst->alu.add.waddr)) { return true; } - if (inst->alu.mul.magic_write && + if (inst->alu.mul.op != V3D_QPU_M_NOP && + inst->alu.mul.magic_write && v3d_qpu_magic_waddr_is_vpm(inst->alu.mul.waddr)) { return true; } @@ -773,12 +850,18 @@
v3d_qpu_writes_unifa(const struct v3d_device_info *devinfo, inst->alu.mul.waddr == V3D_QPU_WADDR_UNIFA) { return true; } + + if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && + inst->sig_magic && + inst->sig_addr == V3D_QPU_WADDR_UNIFA) { + return true; + } } return false; } -static bool +bool v3d_qpu_waits_vpm(const struct v3d_qpu_instr *inst) { return inst->type == V3D_QPU_INSTR_TYPE_ALU && @@ -805,10 +888,12 @@ qpu_writes_magic_waddr_explicitly(const struct v3d_device_info *devinfo, uint32_t waddr) { if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { - if (inst->alu.add.magic_write && inst->alu.add.waddr == waddr) + if (inst->alu.add.op != V3D_QPU_A_NOP && + inst->alu.add.magic_write && inst->alu.add.waddr == waddr) return true; - if (inst->alu.mul.magic_write && inst->alu.mul.waddr == waddr) + if (inst->alu.mul.op != V3D_QPU_M_NOP && + inst->alu.mul.magic_write && inst->alu.mul.waddr == waddr) return true; } @@ -824,6 +909,9 @@ bool v3d_qpu_writes_r3(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *inst) { + if(!devinfo->has_accumulators) + return false; + if (qpu_writes_magic_waddr_explicitly(devinfo, inst, V3D_QPU_WADDR_R3)) return true; @@ -834,14 +922,19 @@ bool v3d_qpu_writes_r4(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *inst) { + if (!devinfo->has_accumulators) + return false; + if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { - if (inst->alu.add.magic_write && + if (inst->alu.add.op != V3D_QPU_A_NOP && + inst->alu.add.magic_write && (inst->alu.add.waddr == V3D_QPU_WADDR_R4 || v3d_qpu_magic_waddr_is_sfu(inst->alu.add.waddr))) { return true; } - if (inst->alu.mul.magic_write && + if (inst->alu.mul.op != V3D_QPU_M_NOP && + inst->alu.mul.magic_write && (inst->alu.mul.waddr == V3D_QPU_WADDR_R4 || v3d_qpu_magic_waddr_is_sfu(inst->alu.mul.waddr))) { return true; @@ -862,6 +955,9 @@ bool v3d_qpu_writes_r5(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *inst) { + if (!devinfo->has_accumulators) + return false; + if (qpu_writes_magic_waddr_explicitly(devinfo, inst, V3D_QPU_WADDR_R5)) return true; @@ -872,6 +968,9 @@ bool v3d_qpu_writes_accum(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *inst) { + if (!devinfo->has_accumulators) + return false; + if (v3d_qpu_writes_r5(devinfo, inst)) return true; if (v3d_qpu_writes_r4(devinfo, inst)) @@ -889,15 +988,67 @@ v3d_qpu_writes_accum(const struct v3d_device_info *devinfo, } bool +v3d_qpu_writes_rf0_implicitly(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst) +{ + if (devinfo->ver >= 71 && + (inst->sig.ldvary || inst->sig.ldunif || inst->sig.ldunifa)) { + return true; + } + + return false; +} + +bool v3d_qpu_uses_mux(const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux) { int add_nsrc = v3d_qpu_add_op_num_src(inst->alu.add.op); int mul_nsrc = v3d_qpu_mul_op_num_src(inst->alu.mul.op); - return ((add_nsrc > 0 && inst->alu.add.a == mux) || - (add_nsrc > 1 && inst->alu.add.b == mux) || - (mul_nsrc > 0 && inst->alu.mul.a == mux) || - (mul_nsrc > 1 && inst->alu.mul.b == mux)); + return ((add_nsrc > 0 && inst->alu.add.a.mux == mux) || + (add_nsrc > 1 && inst->alu.add.b.mux == mux) || + (mul_nsrc > 0 && inst->alu.mul.a.mux == mux) || + (mul_nsrc > 1 && inst->alu.mul.b.mux == mux)); +} + +bool +v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr) +{ + int add_nsrc = v3d_qpu_add_op_num_src(inst->alu.add.op); + int mul_nsrc = v3d_qpu_mul_op_num_src(inst->alu.mul.op); + + return (add_nsrc > 0 && !inst->sig.small_imm_a && inst->alu.add.a.raddr 
== raddr) || + (add_nsrc > 1 && !inst->sig.small_imm_b && inst->alu.add.b.raddr == raddr) || + (mul_nsrc > 0 && !inst->sig.small_imm_c && inst->alu.mul.a.raddr == raddr) || + (mul_nsrc > 1 && !inst->sig.small_imm_d && inst->alu.mul.b.raddr == raddr); +} + +bool +v3d71_qpu_writes_waddr_explicitly(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst, + uint8_t waddr) +{ + if (inst->type != V3D_QPU_INSTR_TYPE_ALU) + return false; + + if (v3d_qpu_add_op_has_dst(inst->alu.add.op) && + !inst->alu.add.magic_write && + inst->alu.add.waddr == waddr) { + return true; + } + + if (v3d_qpu_mul_op_has_dst(inst->alu.mul.op) && + !inst->alu.mul.magic_write && + inst->alu.mul.waddr == waddr) { + return true; + } + + if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && + !inst->sig_magic && inst->sig_addr == waddr) { + return true; + } + + return false; } bool diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h index 4f165e93914..fe9b5d3a00f 100644 --- a/src/broadcom/qpu/qpu_instr.h +++ b/src/broadcom/qpu/qpu_instr.h @@ -50,10 +50,13 @@ struct v3d_qpu_sig { bool ldvpm:1; bool ldtlb:1; bool ldtlbu:1; - bool small_imm:1; bool ucb:1; bool rotate:1; bool wrtmuc:1; + bool small_imm_a:1; /* raddr_a (add a), since V3D 7.x */ + bool small_imm_b:1; /* raddr_b (add b) */ + bool small_imm_c:1; /* raddr_c (mul a), since V3D 7.x */ + bool small_imm_d:1; /* raddr_d (mul b), since V3D 7.x */ }; enum v3d_qpu_cond { @@ -88,12 +91,13 @@ enum v3d_qpu_uf { }; enum v3d_qpu_waddr { - V3D_QPU_WADDR_R0 = 0, - V3D_QPU_WADDR_R1 = 1, - V3D_QPU_WADDR_R2 = 2, - V3D_QPU_WADDR_R3 = 3, - V3D_QPU_WADDR_R4 = 4, - V3D_QPU_WADDR_R5 = 5, + V3D_QPU_WADDR_R0 = 0, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_R1 = 1, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_R2 = 2, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_R3 = 3, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_R4 = 4, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_R5 = 5, /* V3D 4.x */ + V3D_QPU_WADDR_QUAD = 5, /* V3D 7.x */ V3D_QPU_WADDR_NOP = 6, V3D_QPU_WADDR_TLB = 7, V3D_QPU_WADDR_TLBU = 8, @@ -108,12 +112,12 @@ enum v3d_qpu_waddr { V3D_QPU_WADDR_SYNC = 16, V3D_QPU_WADDR_SYNCU = 17, V3D_QPU_WADDR_SYNCB = 18, - V3D_QPU_WADDR_RECIP = 19, - V3D_QPU_WADDR_RSQRT = 20, - V3D_QPU_WADDR_EXP = 21, - V3D_QPU_WADDR_LOG = 22, - V3D_QPU_WADDR_SIN = 23, - V3D_QPU_WADDR_RSQRT2 = 24, + V3D_QPU_WADDR_RECIP = 19, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_RSQRT = 20, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_EXP = 21, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_LOG = 22, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_SIN = 23, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_RSQRT2 = 24, /* Reserved on V3D 7.x */ V3D_QPU_WADDR_TMUC = 32, V3D_QPU_WADDR_TMUS = 33, V3D_QPU_WADDR_TMUT = 34, @@ -129,7 +133,8 @@ enum v3d_qpu_waddr { V3D_QPU_WADDR_TMUHSCM = 44, V3D_QPU_WADDR_TMUHSF = 45, V3D_QPU_WADDR_TMUHSLOD = 46, - V3D_QPU_WADDR_R5REP = 55, + V3D_QPU_WADDR_R5REP = 55, /* V3D 4.x */ + V3D_QPU_WADDR_REP = 55, /* V3D 7.x */ }; struct v3d_qpu_flags { @@ -222,6 +227,21 @@ enum v3d_qpu_add_op { V3D_QPU_A_ITOF, V3D_QPU_A_CLZ, V3D_QPU_A_UTOF, + + /* V3D 7.x */ + V3D_QPU_A_FMOV, + V3D_QPU_A_MOV, + V3D_QPU_A_VPACK, + V3D_QPU_A_V8PACK, + V3D_QPU_A_V10PACK, + V3D_QPU_A_V11FPACK, + V3D_QPU_A_BALLOT, + V3D_QPU_A_BCASTF, + V3D_QPU_A_ALLEQ, + V3D_QPU_A_ALLFEQ, + V3D_QPU_A_ROTQ, + V3D_QPU_A_ROT, + V3D_QPU_A_SHUFFLE, }; enum v3d_qpu_mul_op { @@ -235,6 +255,14 @@ enum v3d_qpu_mul_op { V3D_QPU_M_MOV, V3D_QPU_M_NOP, V3D_QPU_M_FMUL, + + /* V3D 7.x */ + V3D_QPU_M_FTOUNORM16, + V3D_QPU_M_FTOSNORM16, + V3D_QPU_M_VFTOUNORM8, 
+ V3D_QPU_M_VFTOSNORM8, + V3D_QPU_M_VFTOUNORM10LO, + V3D_QPU_M_VFTOUNORM10HI, }; enum v3d_qpu_output_pack { @@ -276,6 +304,15 @@ enum v3d_qpu_input_unpack { /** Swap high and low 16 bits */ V3D_QPU_UNPACK_SWAP_16, + + /** Convert low 16 bits from 16-bit integer to unsigned 32-bit int */ + V3D_QPU_UNPACK_UL, + /** Convert high 16 bits from 16-bit integer to unsigned 32-bit int */ + V3D_QPU_UNPACK_UH, + /** Convert low 16 bits from 16-bit integer to signed 32-bit int */ + V3D_QPU_UNPACK_IL, + /** Convert high 16 bits from 16-bit integer to signed 32-bit int */ + V3D_QPU_UNPACK_IH, }; enum v3d_qpu_mux { @@ -289,25 +326,29 @@ enum v3d_qpu_mux { V3D_QPU_MUX_B, }; +struct v3d_qpu_input { + union { + enum v3d_qpu_mux mux; /* V3D 4.x */ + uint8_t raddr; /* V3D 7.x */ + }; + enum v3d_qpu_input_unpack unpack; +}; + struct v3d_qpu_alu_instr { struct { enum v3d_qpu_add_op op; - enum v3d_qpu_mux a, b; + struct v3d_qpu_input a, b; uint8_t waddr; bool magic_write; enum v3d_qpu_output_pack output_pack; - enum v3d_qpu_input_unpack a_unpack; - enum v3d_qpu_input_unpack b_unpack; } add; struct { enum v3d_qpu_mul_op op; - enum v3d_qpu_mux a, b; + struct v3d_qpu_input a, b; uint8_t waddr; bool magic_write; enum v3d_qpu_output_pack output_pack; - enum v3d_qpu_input_unpack a_unpack; - enum v3d_qpu_input_unpack b_unpack; } mul; }; @@ -379,8 +420,8 @@ struct v3d_qpu_instr { struct v3d_qpu_sig sig; uint8_t sig_addr; bool sig_magic; /* If the signal writes to a magic address */ - uint8_t raddr_a; - uint8_t raddr_b; + uint8_t raddr_a; /* V3D 4.x */ + uint8_t raddr_b; /* V3D 4.x (holds packed small immediate in 7.x too) */ struct v3d_qpu_flags flags; union { @@ -450,8 +491,11 @@ bool v3d_qpu_magic_waddr_is_tlb(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; bool v3d_qpu_magic_waddr_is_vpm(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; bool v3d_qpu_magic_waddr_is_tsy(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; bool v3d_qpu_magic_waddr_loads_unif(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; +bool v3d_qpu_reads_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; +bool v3d_qpu_writes_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_instr_is_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; +bool v3d_qpu_instr_is_legacy_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_writes_tmu(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; @@ -463,11 +507,14 @@ bool v3d_qpu_writes_r4(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST; bool v3d_qpu_writes_r5(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST; +bool v3d_qpu_writes_rf0_implicitly(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST; bool v3d_qpu_writes_accum(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_waits_on_tmu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_uses_mux(const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux); bool v3d_qpu_uses_vpm(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; +bool v3d_qpu_waits_vpm(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_reads_vpm(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_writes_vpm(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_reads_or_writes_vpm(const struct 
v3d_qpu_instr *inst) ATTRIBUTE_CONST; @@ -481,4 +528,9 @@ bool v3d_qpu_unpacks_f32(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_unpacks_f16(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_is_nop(struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; + +bool v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr); +bool v3d71_qpu_writes_waddr_explicitly(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst, + uint8_t waddr); #endif diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c index eee1e9f95a5..c4added7344 100644 --- a/src/broadcom/qpu/qpu_pack.c +++ b/src/broadcom/qpu/qpu_pack.c @@ -84,6 +84,9 @@ #define V3D_QPU_MUL_A_SHIFT 18 #define V3D_QPU_MUL_A_MASK QPU_MASK(20, 18) +#define V3D_QPU_RADDR_C_SHIFT 18 +#define V3D_QPU_RADDR_C_MASK QPU_MASK(23, 18) + #define V3D_QPU_ADD_B_SHIFT 15 #define V3D_QPU_ADD_B_MASK QPU_MASK(17, 15) @@ -98,6 +101,9 @@ #define V3D_QPU_BRANCH_BDI_SHIFT 12 #define V3D_QPU_BRANCH_BDI_MASK QPU_MASK(13, 12) +#define V3D_QPU_RADDR_D_SHIFT 12 +#define V3D_QPU_RADDR_D_MASK QPU_MASK(17, 12) + #define V3D_QPU_RADDR_A_SHIFT 6 #define V3D_QPU_RADDR_A_MASK QPU_MASK(11, 6) @@ -112,12 +118,15 @@ #define LDTMU .ldtmu = true #define LDVARY .ldvary = true #define LDVPM .ldvpm = true -#define SMIMM .small_imm = true #define LDTLB .ldtlb = true #define LDTLBU .ldtlbu = true #define UCB .ucb = true #define ROT .rotate = true #define WRTMUC .wrtmuc = true +#define SMIMM_A .small_imm_a = true +#define SMIMM_B .small_imm_b = true +#define SMIMM_C .small_imm_c = true +#define SMIMM_D .small_imm_d = true static const struct v3d_qpu_sig v33_sig_map[] = { /* MISC R3 R4 R5 */ @@ -135,8 +144,8 @@ static const struct v3d_qpu_sig v33_sig_map[] = { [11] = { THRSW, LDVARY, LDUNIF }, [12] = { LDVARY, LDTMU, }, [13] = { THRSW, LDVARY, LDTMU, }, - [14] = { SMIMM, LDVARY, }, - [15] = { SMIMM, }, + [14] = { SMIMM_B, LDVARY, }, + [15] = { SMIMM_B, }, [16] = { LDTLB, }, [17] = { LDTLBU, }, /* 18-21 reserved */ @@ -148,8 +157,8 @@ static const struct v3d_qpu_sig v33_sig_map[] = { [27] = { THRSW, LDVPM, LDUNIF }, [28] = { LDVPM, LDTMU, }, [29] = { THRSW, LDVPM, LDTMU, }, - [30] = { SMIMM, LDVPM, }, - [31] = { SMIMM, }, + [30] = { SMIMM_B, LDVPM, }, + [31] = { SMIMM_B, }, }; static const struct v3d_qpu_sig v40_sig_map[] = { @@ -167,8 +176,8 @@ static const struct v3d_qpu_sig v40_sig_map[] = { [10] = { LDVARY, LDUNIF }, [11] = { THRSW, LDVARY, LDUNIF }, /* 12-13 reserved */ - [14] = { SMIMM, LDVARY, }, - [15] = { SMIMM, }, + [14] = { SMIMM_B, LDVARY, }, + [15] = { SMIMM_B, }, [16] = { LDTLB, }, [17] = { LDTLBU, }, [18] = { WRTMUC }, @@ -178,7 +187,7 @@ static const struct v3d_qpu_sig v40_sig_map[] = { [22] = { UCB, }, [23] = { ROT, }, /* 24-30 reserved */ - [31] = { SMIMM, LDTMU, }, + [31] = { SMIMM_B, LDTMU, }, }; static const struct v3d_qpu_sig v41_sig_map[] = { @@ -197,8 +206,8 @@ static const struct v3d_qpu_sig v41_sig_map[] = { [11] = { THRSW, LDVARY, LDUNIF }, [12] = { LDUNIFRF }, [13] = { THRSW, LDUNIFRF }, - [14] = { SMIMM, LDVARY, }, - [15] = { SMIMM, }, + [14] = { SMIMM_B, LDVARY }, + [15] = { SMIMM_B, }, [16] = { LDTLB, }, [17] = { LDTLBU, }, [18] = { WRTMUC }, @@ -210,7 +219,41 @@ static const struct v3d_qpu_sig v41_sig_map[] = { [24] = { LDUNIFA}, [25] = { LDUNIFARF }, /* 26-30 reserved */ - [31] = { SMIMM, LDTMU, }, + [31] = { SMIMM_B, LDTMU, }, +}; + + +static const struct v3d_qpu_sig v71_sig_map[] = { + /* MISC phys RF0 */ + [0] = { }, + [1] = { THRSW, }, + [2] = { LDUNIF }, + [3] = { THRSW, LDUNIF }, + [4] = 
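/* With the small-immediate signal split four ways, packing a sig struct is
 * still just a scan of this table. For instance (hypothetical usage; index
 * taken from the v71 entries below, where [30] is SMIMM_C), given a devinfo
 * with ver == 71:
 */
struct v3d_qpu_sig sig = { .small_imm_c = true };
uint32_t packed_sig;
bool ok = v3d_qpu_sig_pack(devinfo, &sig, &packed_sig);
/* Succeeds with packed_sig == 30, per the v71 map below. */
assert(ok && packed_sig == 30);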
{ LDTMU, }, + [5] = { THRSW, LDTMU, }, + [6] = { LDTMU, LDUNIF }, + [7] = { THRSW, LDTMU, LDUNIF }, + [8] = { LDVARY, }, + [9] = { THRSW, LDVARY, }, + [10] = { LDVARY, LDUNIF }, + [11] = { THRSW, LDVARY, LDUNIF }, + [12] = { LDUNIFRF }, + [13] = { THRSW, LDUNIFRF }, + [14] = { SMIMM_A, }, + [15] = { SMIMM_B, }, + [16] = { LDTLB, }, + [17] = { LDTLBU, }, + [18] = { WRTMUC }, + [19] = { THRSW, WRTMUC }, + [20] = { LDVARY, WRTMUC }, + [21] = { THRSW, LDVARY, WRTMUC }, + [22] = { UCB, }, + /* 23 reserved */ + [24] = { LDUNIFA}, + [25] = { LDUNIFARF }, + /* 26-29 reserved */ + [30] = { SMIMM_C, }, + [31] = { SMIMM_D, }, }; bool @@ -221,7 +264,9 @@ v3d_qpu_sig_unpack(const struct v3d_device_info *devinfo, if (packed_sig >= ARRAY_SIZE(v33_sig_map)) return false; - if (devinfo->ver >= 41) + if (devinfo->ver >= 71) + *sig = v71_sig_map[packed_sig]; + else if (devinfo->ver >= 41) *sig = v41_sig_map[packed_sig]; else if (devinfo->ver == 40) *sig = v40_sig_map[packed_sig]; @@ -240,7 +285,9 @@ v3d_qpu_sig_pack(const struct v3d_device_info *devinfo, { static const struct v3d_qpu_sig *map; - if (devinfo->ver >= 41) + if (devinfo->ver >= 71) + map = v71_sig_map; + else if (devinfo->ver >= 41) map = v41_sig_map; else if (devinfo->ver == 40) map = v40_sig_map; @@ -256,13 +303,6 @@ v3d_qpu_sig_pack(const struct v3d_device_info *devinfo, return false; } -static inline unsigned -fui( float f ) -{ - union {float f; unsigned ui;} fi; - fi.f = f; - return fi.ui; -} static const uint32_t small_immediates[] = { 0, 1, 2, 3, @@ -425,8 +465,13 @@ v3d_qpu_flags_pack(const struct v3d_device_info *devinfo, if (flags_present & MUF) *packed_cond |= cond->muf - V3D_QPU_UF_ANDZ + 4; - if (flags_present & AC) - *packed_cond |= (cond->ac - V3D_QPU_COND_IFA) << 2; + if (flags_present & AC) { + if (*packed_cond & (1 << 6)) + *packed_cond |= cond->ac - V3D_QPU_COND_IFA; + else + *packed_cond |= (cond->ac - + V3D_QPU_COND_IFA) << 2; + } if (flags_present & MC) { if (*packed_cond & (1 << 6)) @@ -445,16 +490,26 @@ v3d_qpu_flags_pack(const struct v3d_device_info *devinfo, /* Make a mapping of the table of opcodes in the spec. The opcode is * determined by a combination of the opcode field, and in the case of 0 or - * 1-arg opcodes, the mux_b field as well. + * 1-arg opcodes, the mux (version <= 42) or raddr (version >= 71) field as + * well. */ -#define MUX_MASK(bot, top) (((1 << (top + 1)) - 1) - ((1 << (bot)) - 1)) -#define ANYMUX MUX_MASK(0, 7) +#define OP_MASK(val) BITFIELD64_BIT(val) +#define OP_RANGE(bot, top) BITFIELD64_RANGE(bot, top - bot + 1) +#define ANYMUX OP_RANGE(0, 7) +#define ANYOPMASK OP_RANGE(0, 63) struct opcode_desc { uint8_t opcode_first; uint8_t opcode_last; - uint8_t mux_b_mask; - uint8_t mux_a_mask; + + union { + struct { + uint8_t b_mask; + uint8_t a_mask; + } mux; + uint64_t raddr_mask; + }; + uint8_t op; /* first_ver == 0 if it's the same across all V3D versions. @@ -467,122 +522,329 @@ struct opcode_desc { uint8_t last_ver; }; -static const struct opcode_desc add_ops[] = { +static const struct opcode_desc add_ops_v33[] = { /* FADD is FADDNF depending on the order of the mux_a/mux_b. 
*/ - { 0, 47, ANYMUX, ANYMUX, V3D_QPU_A_FADD }, - { 0, 47, ANYMUX, ANYMUX, V3D_QPU_A_FADDNF }, - { 53, 55, ANYMUX, ANYMUX, V3D_QPU_A_VFPACK }, - { 56, 56, ANYMUX, ANYMUX, V3D_QPU_A_ADD }, - { 57, 59, ANYMUX, ANYMUX, V3D_QPU_A_VFPACK }, - { 60, 60, ANYMUX, ANYMUX, V3D_QPU_A_SUB }, - { 61, 63, ANYMUX, ANYMUX, V3D_QPU_A_VFPACK }, - { 64, 111, ANYMUX, ANYMUX, V3D_QPU_A_FSUB }, - { 120, 120, ANYMUX, ANYMUX, V3D_QPU_A_MIN }, - { 121, 121, ANYMUX, ANYMUX, V3D_QPU_A_MAX }, - { 122, 122, ANYMUX, ANYMUX, V3D_QPU_A_UMIN }, - { 123, 123, ANYMUX, ANYMUX, V3D_QPU_A_UMAX }, - { 124, 124, ANYMUX, ANYMUX, V3D_QPU_A_SHL }, - { 125, 125, ANYMUX, ANYMUX, V3D_QPU_A_SHR }, - { 126, 126, ANYMUX, ANYMUX, V3D_QPU_A_ASR }, - { 127, 127, ANYMUX, ANYMUX, V3D_QPU_A_ROR }, + { 0, 47, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FADD }, + { 0, 47, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FADDNF }, + { 53, 55, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK }, + { 56, 56, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ADD }, + { 57, 59, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK }, + { 60, 60, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SUB }, + { 61, 63, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK }, + { 64, 111, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FSUB }, + { 120, 120, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_MIN }, + { 121, 121, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_MAX }, + { 122, 122, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_UMIN }, + { 123, 123, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_UMAX }, + { 124, 124, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SHL }, + { 125, 125, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SHR }, + { 126, 126, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ASR }, + { 127, 127, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ROR }, /* FMIN is instead FMAX depending on the order of the mux_a/mux_b. 
*/ - { 128, 175, ANYMUX, ANYMUX, V3D_QPU_A_FMIN }, - { 128, 175, ANYMUX, ANYMUX, V3D_QPU_A_FMAX }, - { 176, 180, ANYMUX, ANYMUX, V3D_QPU_A_VFMIN }, - - { 181, 181, ANYMUX, ANYMUX, V3D_QPU_A_AND }, - { 182, 182, ANYMUX, ANYMUX, V3D_QPU_A_OR }, - { 183, 183, ANYMUX, ANYMUX, V3D_QPU_A_XOR }, - - { 184, 184, ANYMUX, ANYMUX, V3D_QPU_A_VADD }, - { 185, 185, ANYMUX, ANYMUX, V3D_QPU_A_VSUB }, - { 186, 186, 1 << 0, ANYMUX, V3D_QPU_A_NOT }, - { 186, 186, 1 << 1, ANYMUX, V3D_QPU_A_NEG }, - { 186, 186, 1 << 2, ANYMUX, V3D_QPU_A_FLAPUSH }, - { 186, 186, 1 << 3, ANYMUX, V3D_QPU_A_FLBPUSH }, - { 186, 186, 1 << 4, ANYMUX, V3D_QPU_A_FLPOP }, - { 186, 186, 1 << 5, ANYMUX, V3D_QPU_A_RECIP }, - { 186, 186, 1 << 6, ANYMUX, V3D_QPU_A_SETMSF }, - { 186, 186, 1 << 7, ANYMUX, V3D_QPU_A_SETREVF }, - { 187, 187, 1 << 0, 1 << 0, V3D_QPU_A_NOP, 0 }, - { 187, 187, 1 << 0, 1 << 1, V3D_QPU_A_TIDX }, - { 187, 187, 1 << 0, 1 << 2, V3D_QPU_A_EIDX }, - { 187, 187, 1 << 0, 1 << 3, V3D_QPU_A_LR }, - { 187, 187, 1 << 0, 1 << 4, V3D_QPU_A_VFLA }, - { 187, 187, 1 << 0, 1 << 5, V3D_QPU_A_VFLNA }, - { 187, 187, 1 << 0, 1 << 6, V3D_QPU_A_VFLB }, - { 187, 187, 1 << 0, 1 << 7, V3D_QPU_A_VFLNB }, - - { 187, 187, 1 << 1, MUX_MASK(0, 2), V3D_QPU_A_FXCD }, - { 187, 187, 1 << 1, 1 << 3, V3D_QPU_A_XCD }, - { 187, 187, 1 << 1, MUX_MASK(4, 6), V3D_QPU_A_FYCD }, - { 187, 187, 1 << 1, 1 << 7, V3D_QPU_A_YCD }, - - { 187, 187, 1 << 2, 1 << 0, V3D_QPU_A_MSF }, - { 187, 187, 1 << 2, 1 << 1, V3D_QPU_A_REVF }, - { 187, 187, 1 << 2, 1 << 2, V3D_QPU_A_VDWWT, 33 }, - { 187, 187, 1 << 2, 1 << 2, V3D_QPU_A_IID, 40 }, - { 187, 187, 1 << 2, 1 << 3, V3D_QPU_A_SAMPID, 40 }, - { 187, 187, 1 << 2, 1 << 4, V3D_QPU_A_BARRIERID, 40 }, - { 187, 187, 1 << 2, 1 << 5, V3D_QPU_A_TMUWT }, - { 187, 187, 1 << 2, 1 << 6, V3D_QPU_A_VPMWT }, - { 187, 187, 1 << 2, 1 << 7, V3D_QPU_A_FLAFIRST, 41 }, - { 187, 187, 1 << 3, 1 << 0, V3D_QPU_A_FLNAFIRST, 41 }, - { 187, 187, 1 << 3, ANYMUX, V3D_QPU_A_VPMSETUP, 33 }, - - { 188, 188, 1 << 0, ANYMUX, V3D_QPU_A_LDVPMV_IN, 40 }, - { 188, 188, 1 << 0, ANYMUX, V3D_QPU_A_LDVPMV_OUT, 40 }, - { 188, 188, 1 << 1, ANYMUX, V3D_QPU_A_LDVPMD_IN, 40 }, - { 188, 188, 1 << 1, ANYMUX, V3D_QPU_A_LDVPMD_OUT, 40 }, - { 188, 188, 1 << 2, ANYMUX, V3D_QPU_A_LDVPMP, 40 }, - { 188, 188, 1 << 3, ANYMUX, V3D_QPU_A_RSQRT, 41 }, - { 188, 188, 1 << 4, ANYMUX, V3D_QPU_A_EXP, 41 }, - { 188, 188, 1 << 5, ANYMUX, V3D_QPU_A_LOG, 41 }, - { 188, 188, 1 << 6, ANYMUX, V3D_QPU_A_SIN, 41 }, - { 188, 188, 1 << 7, ANYMUX, V3D_QPU_A_RSQRT2, 41 }, - { 189, 189, ANYMUX, ANYMUX, V3D_QPU_A_LDVPMG_IN, 40 }, - { 189, 189, ANYMUX, ANYMUX, V3D_QPU_A_LDVPMG_OUT, 40 }, + { 128, 175, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FMIN }, + { 128, 175, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FMAX }, + { 176, 180, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMIN }, + + { 181, 181, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_AND }, + { 182, 182, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_OR }, + { 183, 183, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_XOR }, + + { 184, 184, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VADD }, + { 185, 185, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VSUB }, + { 186, 186, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_NOT }, + { 186, 186, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_NEG }, + { 186, 186, .mux.b_mask = OP_MASK(2), .mux.a_mask = ANYMUX, V3D_QPU_A_FLAPUSH }, + { 186, 186, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FLBPUSH }, + { 
186, 186, .mux.b_mask = OP_MASK(4), .mux.a_mask = ANYMUX, V3D_QPU_A_FLPOP }, + { 186, 186, .mux.b_mask = OP_MASK(5), .mux.a_mask = ANYMUX, V3D_QPU_A_RECIP }, + { 186, 186, .mux.b_mask = OP_MASK(6), .mux.a_mask = ANYMUX, V3D_QPU_A_SETMSF }, + { 186, 186, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_SETREVF }, + { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(0), V3D_QPU_A_NOP, 0 }, + { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(1), V3D_QPU_A_TIDX }, + { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(2), V3D_QPU_A_EIDX }, + { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(3), V3D_QPU_A_LR }, + { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(4), V3D_QPU_A_VFLA }, + { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(5), V3D_QPU_A_VFLNA }, + { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(6), V3D_QPU_A_VFLB }, + { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(7), V3D_QPU_A_VFLNB }, + + { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_RANGE(0, 2), V3D_QPU_A_FXCD }, + { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_MASK(3), V3D_QPU_A_XCD }, + { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_RANGE(4, 6), V3D_QPU_A_FYCD }, + { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_MASK(7), V3D_QPU_A_YCD }, + + { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(0), V3D_QPU_A_MSF }, + { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(1), V3D_QPU_A_REVF }, + { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(2), V3D_QPU_A_VDWWT, 33 }, + { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(2), V3D_QPU_A_IID, 40 }, + { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(3), V3D_QPU_A_SAMPID, 40 }, + { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(4), V3D_QPU_A_BARRIERID, 40 }, + { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(5), V3D_QPU_A_TMUWT }, + { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(6), V3D_QPU_A_VPMWT }, + { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(7), V3D_QPU_A_FLAFIRST, 41 }, + { 187, 187, .mux.b_mask = OP_MASK(3), .mux.a_mask = OP_MASK(0), V3D_QPU_A_FLNAFIRST, 41 }, + { 187, 187, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_VPMSETUP, 33 }, + + { 188, 188, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMV_IN, 40 }, + { 188, 188, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMV_OUT, 40 }, + { 188, 188, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMD_IN, 40 }, + { 188, 188, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMD_OUT, 40 }, + { 188, 188, .mux.b_mask = OP_MASK(2), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMP, 40 }, + { 188, 188, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_RSQRT, 41 }, + { 188, 188, .mux.b_mask = OP_MASK(4), .mux.a_mask = ANYMUX, V3D_QPU_A_EXP, 41 }, + { 188, 188, .mux.b_mask = OP_MASK(5), .mux.a_mask = ANYMUX, V3D_QPU_A_LOG, 41 }, + { 188, 188, .mux.b_mask = OP_MASK(6), .mux.a_mask = ANYMUX, V3D_QPU_A_SIN, 41 }, + { 188, 188, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_RSQRT2, 41 }, + { 189, 189, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMG_IN, 40 }, + { 189, 189, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMG_OUT, 40 }, /* FIXME: MORE COMPLICATED */ - /* { 190, 191, ANYMUX, ANYMUX, V3D_QPU_A_VFMOVABSNEGNAB }, */ + /* { 190, 191, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMOVABSNEGNAB }, */ - { 192, 239, 
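/* A worked example of the mask scheme on 4.x: the one-source ops crowded
 * onto opcode 187 are told apart by their mux values, so a packed add op of
 * 187 with mux_b == 2 and mux_a == 5 matches only the row whose b_mask
 * contains bit 2 and whose a_mask contains bit 5, i.e. V3D_QPU_A_TMUWT.
 * The match test mirrors lookup_opcode_from_packed() further down:
 */
bool matches = (desc->mux.b_mask & (1 << mux_b)) &&
               (desc->mux.a_mask & (1 << mux_a));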
ANYMUX, ANYMUX, V3D_QPU_A_FCMP }, - { 240, 244, ANYMUX, ANYMUX, V3D_QPU_A_VFMAX }, + { 192, 239, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FCMP }, + { 240, 244, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMAX }, - { 245, 245, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FROUND }, - { 245, 245, 1 << 3, ANYMUX, V3D_QPU_A_FTOIN }, - { 245, 245, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FTRUNC }, - { 245, 245, 1 << 7, ANYMUX, V3D_QPU_A_FTOIZ }, - { 246, 246, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FFLOOR }, - { 246, 246, 1 << 3, ANYMUX, V3D_QPU_A_FTOUZ }, - { 246, 246, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FCEIL }, - { 246, 246, 1 << 7, ANYMUX, V3D_QPU_A_FTOC }, + { 245, 245, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FROUND }, + { 245, 245, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOIN }, + { 245, 245, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FTRUNC }, + { 245, 245, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOIZ }, + { 246, 246, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FFLOOR }, + { 246, 246, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOUZ }, + { 246, 246, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FCEIL }, + { 246, 246, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOC }, - { 247, 247, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FDX }, - { 247, 247, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FDY }, + { 247, 247, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FDX }, + { 247, 247, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FDY }, /* The stvpms are distinguished by the waddr field. */ - { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMV }, - { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMD }, - { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMP }, + { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMV }, + { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMD }, + { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMP }, + + { 252, 252, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_ITOF }, + { 252, 252, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_CLZ }, + { 252, 252, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_UTOF }, +}; + +static const struct opcode_desc mul_ops_v33[] = { + { 1, 1, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_ADD }, + { 2, 2, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_SUB }, + { 3, 3, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_UMUL24 }, + { 4, 8, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_VFMUL }, + { 9, 9, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_SMUL24 }, + { 10, 10, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_MULTOP }, + { 14, 14, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_FMOV, 33, 42 }, + { 15, 15, .mux.b_mask = OP_RANGE(0, 3), ANYMUX, V3D_QPU_M_FMOV, 33, 42}, + { 15, 15, .mux.b_mask = OP_MASK(4), .mux.a_mask = OP_MASK(0), V3D_QPU_M_NOP, 33, 42 }, + { 15, 15, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_M_MOV, 33, 42 }, + + { 16, 63, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_FMUL }, +}; - { 252, 252, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_ITOF }, - { 252, 252, 1 << 3, ANYMUX, V3D_QPU_A_CLZ }, - { 252, 252, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_UTOF }, +/* Note that it would have been possible to define all the add/mul opcodes in + * just one table, using the first_ver/last_ver. 
But given how much changed + * for v71, keeping separate tables is tidier. Also, right + * now we do a linear search on those tables, so this keeps the + * tables smaller. + * + * In case we ever merge the tables, we define first_ver as 71 for the + * opcodes that changed on v71. + */ +static const struct opcode_desc add_ops_v71[] = { + /* FADD is FADDNF depending on the order of the raddr_a/raddr_b. */ + { 0, 47, .raddr_mask = ANYOPMASK, V3D_QPU_A_FADD }, + { 0, 47, .raddr_mask = ANYOPMASK, V3D_QPU_A_FADDNF }, + { 53, 55, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK }, + { 56, 56, .raddr_mask = ANYOPMASK, V3D_QPU_A_ADD }, + { 57, 59, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK }, + { 60, 60, .raddr_mask = ANYOPMASK, V3D_QPU_A_SUB }, + { 61, 63, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK }, + { 64, 111, .raddr_mask = ANYOPMASK, V3D_QPU_A_FSUB }, + { 120, 120, .raddr_mask = ANYOPMASK, V3D_QPU_A_MIN }, + { 121, 121, .raddr_mask = ANYOPMASK, V3D_QPU_A_MAX }, + { 122, 122, .raddr_mask = ANYOPMASK, V3D_QPU_A_UMIN }, + { 123, 123, .raddr_mask = ANYOPMASK, V3D_QPU_A_UMAX }, + { 124, 124, .raddr_mask = ANYOPMASK, V3D_QPU_A_SHL }, + { 125, 125, .raddr_mask = ANYOPMASK, V3D_QPU_A_SHR }, + { 126, 126, .raddr_mask = ANYOPMASK, V3D_QPU_A_ASR }, + { 127, 127, .raddr_mask = ANYOPMASK, V3D_QPU_A_ROR }, + /* FMIN is instead FMAX depending on the raddr_a/b order. */ + { 128, 175, .raddr_mask = ANYOPMASK, V3D_QPU_A_FMIN }, + { 128, 175, .raddr_mask = ANYOPMASK, V3D_QPU_A_FMAX }, + { 176, 180, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFMIN }, + + { 181, 181, .raddr_mask = ANYOPMASK, V3D_QPU_A_AND }, + { 182, 182, .raddr_mask = ANYOPMASK, V3D_QPU_A_OR }, + { 183, 183, .raddr_mask = ANYOPMASK, V3D_QPU_A_XOR }, + { 184, 184, .raddr_mask = ANYOPMASK, V3D_QPU_A_VADD }, + { 185, 185, .raddr_mask = ANYOPMASK, V3D_QPU_A_VSUB }, + + { 186, 186, .raddr_mask = OP_MASK(0), V3D_QPU_A_NOT }, + { 186, 186, .raddr_mask = OP_MASK(1), V3D_QPU_A_NEG }, + { 186, 186, .raddr_mask = OP_MASK(2), V3D_QPU_A_FLAPUSH }, + { 186, 186, .raddr_mask = OP_MASK(3), V3D_QPU_A_FLBPUSH }, + { 186, 186, .raddr_mask = OP_MASK(4), V3D_QPU_A_FLPOP }, + { 186, 186, .raddr_mask = OP_MASK(5), V3D_QPU_A_CLZ }, + { 186, 186, .raddr_mask = OP_MASK(6), V3D_QPU_A_SETMSF }, + { 186, 186, .raddr_mask = OP_MASK(7), V3D_QPU_A_SETREVF }, + + { 187, 187, .raddr_mask = OP_MASK(0), V3D_QPU_A_NOP, 0 }, + { 187, 187, .raddr_mask = OP_MASK(1), V3D_QPU_A_TIDX }, + { 187, 187, .raddr_mask = OP_MASK(2), V3D_QPU_A_EIDX }, + { 187, 187, .raddr_mask = OP_MASK(3), V3D_QPU_A_LR }, + { 187, 187, .raddr_mask = OP_MASK(4), V3D_QPU_A_VFLA }, + { 187, 187, .raddr_mask = OP_MASK(5), V3D_QPU_A_VFLNA }, + { 187, 187, .raddr_mask = OP_MASK(6), V3D_QPU_A_VFLB }, + { 187, 187, .raddr_mask = OP_MASK(7), V3D_QPU_A_VFLNB }, + { 187, 187, .raddr_mask = OP_MASK(8), V3D_QPU_A_XCD }, + { 187, 187, .raddr_mask = OP_MASK(9), V3D_QPU_A_YCD }, + { 187, 187, .raddr_mask = OP_MASK(10), V3D_QPU_A_MSF }, + { 187, 187, .raddr_mask = OP_MASK(11), V3D_QPU_A_REVF }, + { 187, 187, .raddr_mask = OP_MASK(12), V3D_QPU_A_IID }, + { 187, 187, .raddr_mask = OP_MASK(13), V3D_QPU_A_SAMPID }, + { 187, 187, .raddr_mask = OP_MASK(14), V3D_QPU_A_BARRIERID }, + { 187, 187, .raddr_mask = OP_MASK(15), V3D_QPU_A_TMUWT }, + { 187, 187, .raddr_mask = OP_MASK(16), V3D_QPU_A_VPMWT }, + { 187, 187, .raddr_mask = OP_MASK(17), V3D_QPU_A_FLAFIRST }, + { 187, 187, .raddr_mask = OP_MASK(18), V3D_QPU_A_FLNAFIRST }, + + { 187, 187, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_FXCD }, + { 187, 187, .raddr_mask = 
OP_RANGE(36, 38), V3D_QPU_A_FYCD }, + + { 188, 188, .raddr_mask = OP_MASK(0), V3D_QPU_A_LDVPMV_IN, 71 }, + { 188, 188, .raddr_mask = OP_MASK(1), V3D_QPU_A_LDVPMD_IN, 71 }, + { 188, 188, .raddr_mask = OP_MASK(2), V3D_QPU_A_LDVPMP, 71 }, + + { 188, 188, .raddr_mask = OP_MASK(32), V3D_QPU_A_RECIP, 71 }, + { 188, 188, .raddr_mask = OP_MASK(33), V3D_QPU_A_RSQRT, 71 }, + { 188, 188, .raddr_mask = OP_MASK(34), V3D_QPU_A_EXP, 71 }, + { 188, 188, .raddr_mask = OP_MASK(35), V3D_QPU_A_LOG, 71 }, + { 188, 188, .raddr_mask = OP_MASK(36), V3D_QPU_A_SIN, 71 }, + { 188, 188, .raddr_mask = OP_MASK(37), V3D_QPU_A_RSQRT2, 71 }, + { 188, 188, .raddr_mask = OP_MASK(38), V3D_QPU_A_BALLOT, 71 }, + { 188, 188, .raddr_mask = OP_MASK(39), V3D_QPU_A_BCASTF, 71 }, + { 188, 188, .raddr_mask = OP_MASK(40), V3D_QPU_A_ALLEQ, 71 }, + { 188, 188, .raddr_mask = OP_MASK(41), V3D_QPU_A_ALLFEQ, 71 }, + + { 189, 189, .raddr_mask = ANYOPMASK, V3D_QPU_A_LDVPMG_IN, 71 }, + + /* The stvpms are distinguished by the waddr field. */ + { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMV, 71}, + { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMD, 71}, + { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMP, 71}, + + { 192, 207, .raddr_mask = ANYOPMASK, V3D_QPU_A_FCMP, 71 }, + + { 245, 245, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FROUND, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FROUND, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FROUND, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FROUND, 71 }, + + { 245, 245, .raddr_mask = OP_MASK(3), V3D_QPU_A_FTOIN, 71 }, + { 245, 245, .raddr_mask = OP_MASK(7), V3D_QPU_A_FTOIN, 71 }, + { 245, 245, .raddr_mask = OP_MASK(11), V3D_QPU_A_FTOIN, 71 }, + { 245, 245, .raddr_mask = OP_MASK(15), V3D_QPU_A_FTOIN, 71 }, + + { 245, 245, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FTRUNC, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FTRUNC, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FTRUNC, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(28, 30), V3D_QPU_A_FTRUNC, 71 }, + + { 245, 245, .raddr_mask = OP_MASK(19), V3D_QPU_A_FTOIZ, 71 }, + { 245, 245, .raddr_mask = OP_MASK(23), V3D_QPU_A_FTOIZ, 71 }, + { 245, 245, .raddr_mask = OP_MASK(27), V3D_QPU_A_FTOIZ, 71 }, + { 245, 245, .raddr_mask = OP_MASK(31), V3D_QPU_A_FTOIZ, 71 }, + + { 245, 245, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_FFLOOR, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_FFLOOR, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(40, 42), V3D_QPU_A_FFLOOR, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(44, 46), V3D_QPU_A_FFLOOR, 71 }, + + { 245, 245, .raddr_mask = OP_MASK(35), V3D_QPU_A_FTOUZ, 71 }, + { 245, 245, .raddr_mask = OP_MASK(39), V3D_QPU_A_FTOUZ, 71 }, + { 245, 245, .raddr_mask = OP_MASK(43), V3D_QPU_A_FTOUZ, 71 }, + { 245, 245, .raddr_mask = OP_MASK(47), V3D_QPU_A_FTOUZ, 71 }, + + { 245, 245, .raddr_mask = OP_RANGE(48, 50), V3D_QPU_A_FCEIL, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(52, 54), V3D_QPU_A_FCEIL, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(56, 58), V3D_QPU_A_FCEIL, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(60, 62), V3D_QPU_A_FCEIL, 71 }, + + { 245, 245, .raddr_mask = OP_MASK(51), V3D_QPU_A_FTOC }, + { 245, 245, .raddr_mask = OP_MASK(55), V3D_QPU_A_FTOC }, + { 245, 245, .raddr_mask = OP_MASK(59), V3D_QPU_A_FTOC }, + { 245, 245, .raddr_mask = OP_MASK(63), V3D_QPU_A_FTOC }, + + { 246, 246, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FDX, 71 }, + { 246, 246, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FDX, 71 }, + { 246, 246, .raddr_mask = OP_RANGE(8, 
10), V3D_QPU_A_FDX, 71 }, + { 246, 246, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FDX, 71 }, + { 246, 246, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FDY, 71 }, + { 246, 246, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FDY, 71 }, + { 246, 246, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FDY, 71 }, + { 246, 246, .raddr_mask = OP_RANGE(28, 30), V3D_QPU_A_FDY, 71 }, + + { 246, 246, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_ITOF, 71 }, + { 246, 246, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_UTOF, 71 }, + + { 247, 247, .raddr_mask = ANYOPMASK, V3D_QPU_A_VPACK, 71 }, + { 248, 248, .raddr_mask = ANYOPMASK, V3D_QPU_A_V8PACK, 71 }, + + { 249, 249, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FMOV, 71 }, + { 249, 249, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FMOV, 71 }, + { 249, 249, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FMOV, 71 }, + { 249, 249, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FMOV, 71 }, + { 249, 249, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FMOV, 71 }, + { 249, 249, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FMOV, 71 }, + { 249, 249, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FMOV, 71 }, + + { 249, 249, .raddr_mask = OP_MASK(3), V3D_QPU_A_MOV, 71 }, + { 249, 249, .raddr_mask = OP_MASK(7), V3D_QPU_A_MOV, 71 }, + { 249, 249, .raddr_mask = OP_MASK(11), V3D_QPU_A_MOV, 71 }, + { 249, 249, .raddr_mask = OP_MASK(15), V3D_QPU_A_MOV, 71 }, + { 249, 249, .raddr_mask = OP_MASK(19), V3D_QPU_A_MOV, 71 }, + + { 250, 250, .raddr_mask = ANYOPMASK, V3D_QPU_A_V10PACK, 71 }, + { 251, 251, .raddr_mask = ANYOPMASK, V3D_QPU_A_V11FPACK, 71 }, + + { 252, 252, .raddr_mask = ANYOPMASK, V3D_QPU_A_ROTQ, 71 }, + { 253, 253, .raddr_mask = ANYOPMASK, V3D_QPU_A_ROT, 71 }, + { 254, 254, .raddr_mask = ANYOPMASK, V3D_QPU_A_SHUFFLE, 71 }, }; -static const struct opcode_desc mul_ops[] = { - { 1, 1, ANYMUX, ANYMUX, V3D_QPU_M_ADD }, - { 2, 2, ANYMUX, ANYMUX, V3D_QPU_M_SUB }, - { 3, 3, ANYMUX, ANYMUX, V3D_QPU_M_UMUL24 }, - { 4, 8, ANYMUX, ANYMUX, V3D_QPU_M_VFMUL }, - { 9, 9, ANYMUX, ANYMUX, V3D_QPU_M_SMUL24 }, - { 10, 10, ANYMUX, ANYMUX, V3D_QPU_M_MULTOP }, - { 14, 14, ANYMUX, ANYMUX, V3D_QPU_M_FMOV }, - { 15, 15, MUX_MASK(0, 3), ANYMUX, V3D_QPU_M_FMOV }, - { 15, 15, 1 << 4, 1 << 0, V3D_QPU_M_NOP, 0 }, - { 15, 15, 1 << 7, ANYMUX, V3D_QPU_M_MOV }, - { 16, 63, ANYMUX, ANYMUX, V3D_QPU_M_FMUL }, +static const struct opcode_desc mul_ops_v71[] = { + /* For V3D 7.1, the second (mux) mask field is ignored */ + { 1, 1, .raddr_mask = ANYOPMASK, V3D_QPU_M_ADD, 71 }, + { 2, 2, .raddr_mask = ANYOPMASK, V3D_QPU_M_SUB, 71 }, + { 3, 3, .raddr_mask = ANYOPMASK, V3D_QPU_M_UMUL24, 71 }, + { 4, 8, .raddr_mask = ANYOPMASK, V3D_QPU_M_VFMUL, 71 }, + { 9, 9, .raddr_mask = ANYOPMASK, V3D_QPU_M_SMUL24, 71 }, + { 10, 10, .raddr_mask = ANYOPMASK, V3D_QPU_M_MULTOP, 71 }, + + { 14, 14, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_M_FMOV, 71 }, + { 14, 14, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_M_FMOV, 71 }, + { 14, 14, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_M_FMOV, 71 }, + { 14, 14, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_M_FMOV, 71 }, + { 14, 14, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_M_FMOV, 71 }, + { 14, 14, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_M_FMOV, 71 }, + + { 14, 14, .raddr_mask = OP_MASK(3), V3D_QPU_M_MOV, 71 }, + { 14, 14, .raddr_mask = OP_MASK(7), V3D_QPU_M_MOV, 71 }, + { 14, 14, .raddr_mask = OP_MASK(11), V3D_QPU_M_MOV, 71 }, + { 14, 14, .raddr_mask = OP_MASK(15), V3D_QPU_M_MOV, 71 }, + { 14, 14, .raddr_mask = OP_MASK(19), V3D_QPU_M_MOV, 71 }, + + { 14, 14, .raddr_mask = OP_MASK(32), 
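/* The FMOV/MOV rows above encode the modifiers of the mul op directly in
 * raddr_d: bits 0-1 hold the fmov output pack (the value 3 there selects
 * mov instead) and the bits above hold the input unpack. A worked example,
 * using the integer-unpack encoding added below (UL packs to 1):
 *
 *     mov with a .ul input unpack  =>  raddr_d = (1 << 2) | 3 = 7
 *
 * which is exactly why the OP_MASK(7) row above decodes to V3D_QPU_M_MOV.
 */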
V3D_QPU_M_FTOUNORM16, 71 }, + { 14, 14, .raddr_mask = OP_MASK(33), V3D_QPU_M_FTOSNORM16, 71 }, + { 14, 14, .raddr_mask = OP_MASK(34), V3D_QPU_M_VFTOUNORM8, 71 }, + { 14, 14, .raddr_mask = OP_MASK(35), V3D_QPU_M_VFTOSNORM8, 71 }, + { 14, 14, .raddr_mask = OP_MASK(48), V3D_QPU_M_VFTOUNORM10LO, 71 }, + { 14, 14, .raddr_mask = OP_MASK(49), V3D_QPU_M_VFTOUNORM10HI, 71 }, + + { 14, 14, .raddr_mask = OP_MASK(63), V3D_QPU_M_NOP, 71 }, + + { 16, 63, .raddr_mask = ANYOPMASK, V3D_QPU_M_FMUL }, }; /* Returns true if op_desc should be filtered out based on devinfo->ver @@ -591,17 +853,23 @@ static const struct opcode_desc mul_ops[] = { */ static bool opcode_invalid_in_version(const struct v3d_device_info *devinfo, - const struct opcode_desc *op_desc) + const uint8_t first_ver, + const uint8_t last_ver) { - return (op_desc->first_ver != 0 && devinfo->ver < op_desc->first_ver) || - (op_desc->last_ver != 0 && devinfo->ver > op_desc->last_ver); + return (first_ver != 0 && devinfo->ver < first_ver) || + (last_ver != 0 && devinfo->ver > last_ver); } +/* Note that we pass mux_a, mux_b and raddr as parameters even though, + * depending on devinfo->ver, some of them are ignored. We do it this way + * to avoid having two nearly identical lookup_opcode methods. + */ static const struct opcode_desc * lookup_opcode_from_packed(const struct v3d_device_info *devinfo, const struct opcode_desc *opcodes, size_t num_opcodes, uint32_t opcode, - uint32_t mux_a, uint32_t mux_b) + uint32_t mux_a, uint32_t mux_b, + uint32_t raddr) { for (int i = 0; i < num_opcodes; i++) { const struct opcode_desc *op_desc = &opcodes[i]; @@ -610,14 +878,19 @@ lookup_opcode_from_packed(const struct v3d_device_info *devinfo, opcode > op_desc->opcode_last) continue; - if (opcode_invalid_in_version(devinfo, op_desc)) + if (opcode_invalid_in_version(devinfo, op_desc->first_ver, op_desc->last_ver)) continue; - if (!(op_desc->mux_b_mask & (1 << mux_b))) - continue; + if (devinfo->ver < 71) { + if (!(op_desc->mux.b_mask & (1 << mux_b))) + continue; - if (!(op_desc->mux_a_mask & (1 << mux_a))) - continue; + if (!(op_desc->mux.a_mask & (1 << mux_a))) + continue; + } else { + if (!(op_desc->raddr_mask & ((uint64_t) 1 << raddr))) + continue; + } return op_desc; } @@ -670,6 +943,56 @@ v3d_qpu_float32_unpack_pack(enum v3d_qpu_input_unpack unpacked, } static bool +v3d_qpu_int32_unpack_unpack(uint32_t packed, + enum v3d_qpu_input_unpack *unpacked) +{ + switch (packed) { + case 0: + *unpacked = V3D_QPU_UNPACK_NONE; + return true; + case 1: + *unpacked = V3D_QPU_UNPACK_UL; + return true; + case 2: + *unpacked = V3D_QPU_UNPACK_UH; + return true; + case 3: + *unpacked = V3D_QPU_UNPACK_IL; + return true; + case 4: + *unpacked = V3D_QPU_UNPACK_IH; + return true; + default: + return false; + } +} + +static bool +v3d_qpu_int32_unpack_pack(enum v3d_qpu_input_unpack unpacked, + uint32_t *packed) +{ + switch (unpacked) { + case V3D_QPU_UNPACK_NONE: + *packed = 0; + return true; + case V3D_QPU_UNPACK_UL: + *packed = 1; + return true; + case V3D_QPU_UNPACK_UH: + *packed = 2; + return true; + case V3D_QPU_UNPACK_IL: + *packed = 3; + return true; + case V3D_QPU_UNPACK_IH: + *packed = 4; + return true; + default: + return false; + } +} + +static bool v3d_qpu_float16_unpack_unpack(uint32_t packed, enum v3d_qpu_input_unpack *unpacked) { @@ -720,10 +1043,10 @@ v3d_qpu_float16_unpack_pack(enum v3d_qpu_input_unpack unpacked, } static bool -v3d_qpu_float32_pack_pack(enum v3d_qpu_input_unpack unpacked, +v3d_qpu_float32_pack_pack(enum v3d_qpu_output_pack pack, uint32_t *packed) { - 
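/* The two integer-unpack helpers above are exact inverses of each other;
 * a quick self-check one could write (hypothetical test code, not part of
 * this patch):
 */
for (uint32_t p = 0; p <= 4; p++) {
        enum v3d_qpu_input_unpack unpacked;
        uint32_t repacked;

        assert(v3d_qpu_int32_unpack_unpack(p, &unpacked));
        assert(v3d_qpu_int32_unpack_pack(unpacked, &repacked));
        assert(repacked == p);
}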
switch (unpacked) { + switch (pack) { case V3D_QPU_PACK_NONE: *packed = 0; return true; @@ -739,8 +1062,8 @@ v3d_qpu_float32_pack_pack(enum v3d_qpu_input_unpack unpacked, } static bool -v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, - struct v3d_qpu_instr *instr) +v3d33_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + struct v3d_qpu_instr *instr) { uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_ADD); uint32_t mux_a = QPU_GET_FIELD(packed_inst, V3D_QPU_ADD_A); @@ -757,8 +1080,9 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, map_op = (map_op - 253 + 245); const struct opcode_desc *desc = - lookup_opcode_from_packed(devinfo, add_ops, ARRAY_SIZE(add_ops), - map_op, mux_a, mux_b); + lookup_opcode_from_packed(devinfo, add_ops_v33, + ARRAY_SIZE(add_ops_v33), + map_op, mux_a, mux_b, 0); if (!desc) return false; @@ -814,12 +1138,12 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, instr->alu.add.output_pack = V3D_QPU_PACK_NONE; if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, - &instr->alu.add.a_unpack)) { + &instr->alu.add.a.unpack)) { return false; } if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3, - &instr->alu.add.b_unpack)) { + &instr->alu.add.b.unpack)) { return false; } break; @@ -833,7 +1157,7 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, instr->alu.add.output_pack = mux_b & 0x3; if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, - &instr->alu.add.a_unpack)) { + &instr->alu.add.a.unpack)) { return false; } break; @@ -845,7 +1169,7 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, instr->alu.add.output_pack = V3D_QPU_PACK_NONE; if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, - &instr->alu.add.a_unpack)) { + &instr->alu.add.a.unpack)) { return false; } break; @@ -853,23 +1177,23 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, case V3D_QPU_A_VFMIN: case V3D_QPU_A_VFMAX: if (!v3d_qpu_float16_unpack_unpack(op & 0x7, - &instr->alu.add.a_unpack)) { + &instr->alu.add.a.unpack)) { return false; } instr->alu.add.output_pack = V3D_QPU_PACK_NONE; - instr->alu.add.b_unpack = V3D_QPU_UNPACK_NONE; + instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; break; default: instr->alu.add.output_pack = V3D_QPU_PACK_NONE; - instr->alu.add.a_unpack = V3D_QPU_UNPACK_NONE; - instr->alu.add.b_unpack = V3D_QPU_UNPACK_NONE; + instr->alu.add.a.unpack = V3D_QPU_UNPACK_NONE; + instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; break; } - instr->alu.add.a = mux_a; - instr->alu.add.b = mux_b; + instr->alu.add.a.mux = mux_a; + instr->alu.add.b.mux = mux_b; instr->alu.add.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A); instr->alu.add.magic_write = false; @@ -894,18 +1218,205 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, } static bool -v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, +v3d71_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + struct v3d_qpu_instr *instr) +{ + uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_ADD); + uint32_t raddr_a = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_A); + uint32_t raddr_b = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_B); + uint32_t waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A); + uint32_t map_op = op; + + const struct opcode_desc *desc = + lookup_opcode_from_packed(devinfo, + add_ops_v71, + ARRAY_SIZE(add_ops_v71), + map_op, 0, 0, + raddr_b); + 
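/* On 7.x the same lookup helper is reused with the mux arguments zeroed
 * and raddr_b acting as the disambiguator. For example (row taken from
 * add_ops_v71 above; illustrative only, assuming devinfo->ver >= 71):
 */
const struct opcode_desc *clz_desc =
        lookup_opcode_from_packed(devinfo, add_ops_v71,
                                  ARRAY_SIZE(add_ops_v71),
                                  186, 0, 0, 5 /* raddr_b */);
/* clz_desc->op == V3D_QPU_A_CLZ, per the OP_MASK(5) row at opcode 186. */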
if (!desc) + return false; + + instr->alu.add.op = desc->op; + + /* FADD/FADDNF and FMIN/FMAX are determined by the order of the + * operands. + */ + if (instr->sig.small_imm_a * 256 + ((op >> 2) & 3) * 64 + raddr_a > + instr->sig.small_imm_b * 256 + (op & 3) * 64 + raddr_b) { + if (instr->alu.add.op == V3D_QPU_A_FMIN) + instr->alu.add.op = V3D_QPU_A_FMAX; + if (instr->alu.add.op == V3D_QPU_A_FADD) + instr->alu.add.op = V3D_QPU_A_FADDNF; + } + + /* Some QPU ops require a bit more than just basic opcode and mux a/b + * comparisons to distinguish them. + */ + switch (instr->alu.add.op) { + case V3D_QPU_A_STVPMV: + case V3D_QPU_A_STVPMD: + case V3D_QPU_A_STVPMP: + switch (waddr) { + case 0: + instr->alu.add.op = V3D_QPU_A_STVPMV; + break; + case 1: + instr->alu.add.op = V3D_QPU_A_STVPMD; + break; + case 2: + instr->alu.add.op = V3D_QPU_A_STVPMP; + break; + default: + return false; + } + break; + default: + break; + } + + switch (instr->alu.add.op) { + case V3D_QPU_A_FADD: + case V3D_QPU_A_FADDNF: + case V3D_QPU_A_FSUB: + case V3D_QPU_A_FMIN: + case V3D_QPU_A_FMAX: + case V3D_QPU_A_FCMP: + case V3D_QPU_A_VFPACK: + if (instr->alu.add.op != V3D_QPU_A_VFPACK && + instr->alu.add.op != V3D_QPU_A_FCMP) { + instr->alu.add.output_pack = (op >> 4) & 0x3; + } else { + instr->alu.add.output_pack = V3D_QPU_PACK_NONE; + } + + if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, + &instr->alu.add.a.unpack)) { + return false; + } + + if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3, + &instr->alu.add.b.unpack)) { + return false; + } + break; + + case V3D_QPU_A_FFLOOR: + case V3D_QPU_A_FROUND: + case V3D_QPU_A_FTRUNC: + case V3D_QPU_A_FCEIL: + case V3D_QPU_A_FDX: + case V3D_QPU_A_FDY: + instr->alu.add.output_pack = raddr_b & 0x3; + + if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, + &instr->alu.add.a.unpack)) { + return false; + } + break; + + case V3D_QPU_A_FTOIN: + case V3D_QPU_A_FTOIZ: + case V3D_QPU_A_FTOUZ: + case V3D_QPU_A_FTOC: + instr->alu.add.output_pack = V3D_QPU_PACK_NONE; + + if (!v3d_qpu_float32_unpack_unpack((raddr_b >> 2) & 0x3, + &instr->alu.add.a.unpack)) { + return false; + } + break; + + case V3D_QPU_A_VFMIN: + case V3D_QPU_A_VFMAX: + unreachable("pending v71 update"); + if (!v3d_qpu_float16_unpack_unpack(op & 0x7, + &instr->alu.add.a.unpack)) { + return false; + } + + instr->alu.add.output_pack = V3D_QPU_PACK_NONE; + instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; + break; + + case V3D_QPU_A_MOV: + instr->alu.add.output_pack = V3D_QPU_PACK_NONE; + + if (!v3d_qpu_int32_unpack_unpack((raddr_b >> 2) & 0x7, + &instr->alu.add.a.unpack)) { + return false; + } + break; + + case V3D_QPU_A_FMOV: + instr->alu.add.output_pack = raddr_b & 0x3; + + /* Mul alu FMOV has one additional variant */ + int32_t unpack = (raddr_b >> 2) & 0x7; + if (unpack == 7) + return false; + + if (!v3d_qpu_float32_unpack_unpack(unpack, + &instr->alu.add.a.unpack)) { + return false; + } + break; + + default: + instr->alu.add.output_pack = V3D_QPU_PACK_NONE; + instr->alu.add.a.unpack = V3D_QPU_UNPACK_NONE; + instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; + break; + } + + instr->alu.add.a.raddr = raddr_a; + instr->alu.add.b.raddr = raddr_b; + instr->alu.add.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A); + + instr->alu.add.magic_write = false; + if (packed_inst & V3D_QPU_MA) { + switch (instr->alu.add.op) { + case V3D_QPU_A_LDVPMV_IN: + instr->alu.add.op = V3D_QPU_A_LDVPMV_OUT; + break; + case V3D_QPU_A_LDVPMD_IN: + instr->alu.add.op = V3D_QPU_A_LDVPMD_OUT; + break; + case V3D_QPU_A_LDVPMG_IN: + instr->alu.add.op = 
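/* To make the ordering rule above concrete: each operand is ranked by the
 * key small_imm * 256 + unpack * 64 + raddr. E.g. with no small immediates
 * and both unpacks NONE, a packed fmin whose raddr_a > raddr_b decodes as
 * fmax, and a packed fadd decodes as faddnf (worked example, not code from
 * the patch):
 *
 *     key_a = 0 * 256 + 0 * 64 + raddr_a
 *     key_b = 0 * 256 + 0 * 64 + raddr_b
 *     key_a > key_b  =>  FMIN -> FMAX, FADD -> FADDNF
 */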
V3D_QPU_A_LDVPMG_OUT; + break; + default: + instr->alu.add.magic_write = true; + break; + } + } + + return true; +} + +static bool +v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, struct v3d_qpu_instr *instr) { + if (devinfo->ver < 71) + return v3d33_qpu_add_unpack(devinfo, packed_inst, instr); + else + return v3d71_qpu_add_unpack(devinfo, packed_inst, instr); +} + +static bool +v3d33_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + struct v3d_qpu_instr *instr) +{ uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_MUL); uint32_t mux_a = QPU_GET_FIELD(packed_inst, V3D_QPU_MUL_A); uint32_t mux_b = QPU_GET_FIELD(packed_inst, V3D_QPU_MUL_B); { const struct opcode_desc *desc = - lookup_opcode_from_packed(devinfo, mul_ops, - ARRAY_SIZE(mul_ops), - op, mux_a, mux_b); + lookup_opcode_from_packed(devinfo, + mul_ops_v33, + ARRAY_SIZE(mul_ops_v33), + op, mux_a, mux_b, 0); if (!desc) return false; @@ -917,12 +1428,12 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, instr->alu.mul.output_pack = ((op >> 4) & 0x3) - 1; if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, - &instr->alu.mul.a_unpack)) { + &instr->alu.mul.a.unpack)) { return false; } if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3, - &instr->alu.mul.b_unpack)) { + &instr->alu.mul.b.unpack)) { return false; } @@ -933,7 +1444,7 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, ((mux_b >> 2) & 1)); if (!v3d_qpu_float32_unpack_unpack(mux_b & 0x3, - &instr->alu.mul.a_unpack)) { + &instr->alu.mul.a.unpack)) { return false; } @@ -943,29 +1454,123 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, instr->alu.mul.output_pack = V3D_QPU_PACK_NONE; if (!v3d_qpu_float16_unpack_unpack(((op & 0x7) - 4) & 7, - &instr->alu.mul.a_unpack)) { + &instr->alu.mul.a.unpack)) { return false; } - instr->alu.mul.b_unpack = V3D_QPU_UNPACK_NONE; + instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; break; default: instr->alu.mul.output_pack = V3D_QPU_PACK_NONE; - instr->alu.mul.a_unpack = V3D_QPU_UNPACK_NONE; - instr->alu.mul.b_unpack = V3D_QPU_UNPACK_NONE; + instr->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE; + instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; break; } - instr->alu.mul.a = mux_a; - instr->alu.mul.b = mux_b; + instr->alu.mul.a.mux = mux_a; + instr->alu.mul.b.mux = mux_b; instr->alu.mul.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_M); instr->alu.mul.magic_write = packed_inst & V3D_QPU_MM; return true; } +static bool +v3d71_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + struct v3d_qpu_instr *instr) +{ + uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_MUL); + uint32_t raddr_c = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_C); + uint32_t raddr_d = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_D); + + { + const struct opcode_desc *desc = + lookup_opcode_from_packed(devinfo, + mul_ops_v71, + ARRAY_SIZE(mul_ops_v71), + op, 0, 0, + raddr_d); + if (!desc) + return false; + + instr->alu.mul.op = desc->op; + } + + switch (instr->alu.mul.op) { + case V3D_QPU_M_FMUL: + instr->alu.mul.output_pack = ((op >> 4) & 0x3) - 1; + + if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, + &instr->alu.mul.a.unpack)) { + return false; + } + + if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3, + &instr->alu.mul.b.unpack)) { + return false; + } + + break; + + case V3D_QPU_M_FMOV: + instr->alu.mul.output_pack = raddr_d & 0x3; + + if (!v3d_qpu_float32_unpack_unpack((raddr_d >> 2) & 0x7, + 
&instr->alu.mul.a.unpack)) { + return false; + } + + break; + + case V3D_QPU_M_VFMUL: + unreachable("pending v71 update"); + instr->alu.mul.output_pack = V3D_QPU_PACK_NONE; + + if (!v3d_qpu_float16_unpack_unpack(((op & 0x7) - 4) & 7, + &instr->alu.mul.a.unpack)) { + return false; + } + + instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; + + break; + + case V3D_QPU_M_MOV: + instr->alu.mul.output_pack = V3D_QPU_PACK_NONE; + + if (!v3d_qpu_int32_unpack_unpack((raddr_d >> 2) & 0x7, + &instr->alu.mul.a.unpack)) { + return false; + } + break; + + default: + instr->alu.mul.output_pack = V3D_QPU_PACK_NONE; + instr->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE; + instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; + break; + } + + instr->alu.mul.a.raddr = raddr_c; + instr->alu.mul.b.raddr = raddr_d; + instr->alu.mul.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_M); + instr->alu.mul.magic_write = packed_inst & V3D_QPU_MM; + + return true; +} + +static bool +v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + struct v3d_qpu_instr *instr) +{ + if (devinfo->ver < 71) + return v3d33_qpu_mul_unpack(devinfo, packed_inst, instr); + else + return v3d71_qpu_mul_unpack(devinfo, packed_inst, instr); +} + static const struct opcode_desc * lookup_opcode_from_instr(const struct v3d_device_info *devinfo, const struct opcode_desc *opcodes, size_t num_opcodes, @@ -977,7 +1582,7 @@ lookup_opcode_from_instr(const struct v3d_device_info *devinfo, if (op_desc->op != op) continue; - if (opcode_invalid_in_version(devinfo, op_desc)) + if (opcode_invalid_in_version(devinfo, op_desc->first_ver, op_desc->last_ver)) continue; return op_desc; @@ -987,15 +1592,16 @@ lookup_opcode_from_instr(const struct v3d_device_info *devinfo, } static bool -v3d_qpu_add_pack(const struct v3d_device_info *devinfo, - const struct v3d_qpu_instr *instr, uint64_t *packed_instr) +v3d33_qpu_add_pack(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *instr, uint64_t *packed_instr) { uint32_t waddr = instr->alu.add.waddr; - uint32_t mux_a = instr->alu.add.a; - uint32_t mux_b = instr->alu.add.b; + uint32_t mux_a = instr->alu.add.a.mux; + uint32_t mux_b = instr->alu.add.b.mux; int nsrc = v3d_qpu_add_op_num_src(instr->alu.add.op); const struct opcode_desc *desc = - lookup_opcode_from_instr(devinfo, add_ops, ARRAY_SIZE(add_ops), + lookup_opcode_from_instr(devinfo, add_ops_v33, + ARRAY_SIZE(add_ops_v33), instr->alu.add.op); if (!desc) @@ -1007,10 +1613,10 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, * identify the operation type. 
*/ if (nsrc < 2) - mux_b = ffs(desc->mux_b_mask) - 1; + mux_b = ffs(desc->mux.b_mask) - 1; if (nsrc < 1) - mux_a = ffs(desc->mux_a_mask) - 1; + mux_a = ffs(desc->mux.a_mask) - 1; bool no_magic_write = false; @@ -1063,12 +1669,12 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, } opcode |= output_pack << 4; - if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack, + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, &a_unpack)) { return false; } - if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b_unpack, + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack, &b_unpack)) { return false; } @@ -1102,23 +1708,23 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, uint32_t a_unpack; uint32_t b_unpack; - if (instr->alu.add.a_unpack == V3D_QPU_UNPACK_ABS || - instr->alu.add.b_unpack == V3D_QPU_UNPACK_ABS) { + if (instr->alu.add.a.unpack == V3D_QPU_UNPACK_ABS || + instr->alu.add.b.unpack == V3D_QPU_UNPACK_ABS) { return false; } - if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack, + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, &a_unpack)) { return false; } - if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b_unpack, + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack, &b_unpack)) { return false; } - opcode = (opcode & ~(1 << 2)) | (a_unpack << 2); - opcode = (opcode & ~(1 << 0)) | (b_unpack << 0); + opcode = (opcode & ~(0x3 << 2)) | (a_unpack << 2); + opcode = (opcode & ~(0x3 << 0)) | (b_unpack << 0); break; } @@ -1137,13 +1743,13 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, } mux_b |= packed; - if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack, + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, &packed)) { return false; } if (packed == 0) return false; - opcode = (opcode & ~(1 << 2)) | packed << 2; + opcode = (opcode & ~(0x3 << 2)) | packed << 2; break; } @@ -1155,7 +1761,7 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, return false; uint32_t packed; - if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack, + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, &packed)) { return false; } @@ -1168,11 +1774,11 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, case V3D_QPU_A_VFMIN: case V3D_QPU_A_VFMAX: if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE || - instr->alu.add.b_unpack != V3D_QPU_UNPACK_NONE) { + instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE) { return false; } - if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a_unpack, + if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a.unpack, &packed)) { return false; } @@ -1182,8 +1788,8 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, default: if (instr->alu.add.op != V3D_QPU_A_NOP && (instr->alu.add.output_pack != V3D_QPU_PACK_NONE || - instr->alu.add.a_unpack != V3D_QPU_UNPACK_NONE || - instr->alu.add.b_unpack != V3D_QPU_UNPACK_NONE)) { + instr->alu.add.a.unpack != V3D_QPU_UNPACK_NONE || + instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE)) { return false; } break; @@ -1200,15 +1806,280 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, } static bool -v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, - const struct v3d_qpu_instr *instr, uint64_t *packed_instr) +v3d71_qpu_add_pack(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *instr, uint64_t *packed_instr) { - uint32_t mux_a = instr->alu.mul.a; - uint32_t mux_b = instr->alu.mul.b; + uint32_t waddr = instr->alu.add.waddr; + uint32_t raddr_a = instr->alu.add.a.raddr; + uint32_t raddr_b = instr->alu.add.b.raddr; + + int nsrc = 
v3d_qpu_add_op_num_src(instr->alu.add.op); + const struct opcode_desc *desc = + lookup_opcode_from_instr(devinfo, add_ops_v71, + ARRAY_SIZE(add_ops_v71), + instr->alu.add.op); + if (!desc) + return false; + + uint32_t opcode = desc->opcode_first; + + /* If an operation doesn't use an arg, its raddr values may be used to + * identify the operation type. + */ + if (nsrc < 2) + raddr_b = ffsll(desc->raddr_mask) - 1; + + bool no_magic_write = false; + + switch (instr->alu.add.op) { + case V3D_QPU_A_STVPMV: + waddr = 0; + no_magic_write = true; + break; + case V3D_QPU_A_STVPMD: + waddr = 1; + no_magic_write = true; + break; + case V3D_QPU_A_STVPMP: + waddr = 2; + no_magic_write = true; + break; + + case V3D_QPU_A_LDVPMV_IN: + case V3D_QPU_A_LDVPMD_IN: + case V3D_QPU_A_LDVPMP: + case V3D_QPU_A_LDVPMG_IN: + assert(!instr->alu.add.magic_write); + break; + + case V3D_QPU_A_LDVPMV_OUT: + case V3D_QPU_A_LDVPMD_OUT: + case V3D_QPU_A_LDVPMG_OUT: + assert(!instr->alu.add.magic_write); + *packed_instr |= V3D_QPU_MA; + break; + + default: + break; + } + + switch (instr->alu.add.op) { + case V3D_QPU_A_FADD: + case V3D_QPU_A_FADDNF: + case V3D_QPU_A_FSUB: + case V3D_QPU_A_FMIN: + case V3D_QPU_A_FMAX: + case V3D_QPU_A_FCMP: { + uint32_t output_pack; + uint32_t a_unpack; + uint32_t b_unpack; + + if (instr->alu.add.op != V3D_QPU_A_FCMP) { + if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack, + &output_pack)) { + return false; + } + opcode |= output_pack << 4; + } + + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, + &a_unpack)) { + return false; + } + + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack, + &b_unpack)) { + return false; + } + + /* These operations with commutative operands are + * distinguished by the order in which the operands come in. + */ + bool ordering = + instr->sig.small_imm_a * 256 + a_unpack * 64 + raddr_a > + instr->sig.small_imm_b * 256 + b_unpack * 64 + raddr_b; + if (((instr->alu.add.op == V3D_QPU_A_FMIN || + instr->alu.add.op == V3D_QPU_A_FADD) && ordering) || + ((instr->alu.add.op == V3D_QPU_A_FMAX || + instr->alu.add.op == V3D_QPU_A_FADDNF) && !ordering)) { + uint32_t temp; + + temp = a_unpack; + a_unpack = b_unpack; + b_unpack = temp; + + temp = raddr_a; + raddr_a = raddr_b; + raddr_b = temp; + + /* If we are swapping raddr_a/b we also need to swap + * small_imm_a/b. 
+ */ + if (instr->sig.small_imm_a || instr->sig.small_imm_b) { + assert(instr->sig.small_imm_a != + instr->sig.small_imm_b); + struct v3d_qpu_sig new_sig = instr->sig; + new_sig.small_imm_a = !instr->sig.small_imm_a; + new_sig.small_imm_b = !instr->sig.small_imm_b; + uint32_t sig; + if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig)) + return false; + *packed_instr &= ~V3D_QPU_SIG_MASK; + *packed_instr |= QPU_SET_FIELD(sig, V3D_QPU_SIG); + } + } + + opcode |= a_unpack << 2; + opcode |= b_unpack << 0; + + break; + } + + case V3D_QPU_A_VFPACK: { + uint32_t a_unpack; + uint32_t b_unpack; + + if (instr->alu.add.a.unpack == V3D_QPU_UNPACK_ABS || + instr->alu.add.b.unpack == V3D_QPU_UNPACK_ABS) { + return false; + } + + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, + &a_unpack)) { + return false; + } + + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack, + &b_unpack)) { + return false; + } + + opcode = (opcode & ~(0x3 << 2)) | (a_unpack << 2); + opcode = (opcode & ~(0x3 << 0)) | (b_unpack << 0); + + break; + } + + case V3D_QPU_A_FFLOOR: + case V3D_QPU_A_FROUND: + case V3D_QPU_A_FTRUNC: + case V3D_QPU_A_FCEIL: + case V3D_QPU_A_FDX: + case V3D_QPU_A_FDY: { + uint32_t packed; + + if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack, + &packed)) { + return false; + } + raddr_b |= packed; + + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, + &packed)) { + return false; + } + if (packed == 0) + return false; + raddr_b = (raddr_b & ~(0x3 << 2)) | packed << 2; + break; + } + + case V3D_QPU_A_FTOIN: + case V3D_QPU_A_FTOIZ: + case V3D_QPU_A_FTOUZ: + case V3D_QPU_A_FTOC: + if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE) + return false; + + uint32_t packed; + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, + &packed)) { + return false; + } + if (packed == 0) + return false; + + raddr_b |= (raddr_b & ~(0x3 << 2)) | packed << 2; + + break; + + case V3D_QPU_A_VFMIN: + case V3D_QPU_A_VFMAX: + if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE || + instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE) { + return false; + } + + if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a.unpack, + &packed)) { + return false; + } + opcode |= packed; + break; + + case V3D_QPU_A_MOV: { + uint32_t packed; + + if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE) + return false; + + if (!v3d_qpu_int32_unpack_pack(instr->alu.add.a.unpack, + &packed)) { + return false; + } + + raddr_b |= packed << 2; + break; + } + + case V3D_QPU_A_FMOV: { + uint32_t packed; + + if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack, + &packed)) { + return false; + } + raddr_b = packed; + + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, + &packed)) { + return false; + } + raddr_b |= packed << 2; + break; + } + + default: + if (instr->alu.add.op != V3D_QPU_A_NOP && + (instr->alu.add.output_pack != V3D_QPU_PACK_NONE || + instr->alu.add.a.unpack != V3D_QPU_UNPACK_NONE || + instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE)) { + return false; + } + break; + } + + *packed_instr |= QPU_SET_FIELD(raddr_a, V3D_QPU_RADDR_A); + *packed_instr |= QPU_SET_FIELD(raddr_b, V3D_QPU_RADDR_B); + *packed_instr |= QPU_SET_FIELD(opcode, V3D_QPU_OP_ADD); + *packed_instr |= QPU_SET_FIELD(waddr, V3D_QPU_WADDR_A); + if (instr->alu.add.magic_write && !no_magic_write) + *packed_instr |= V3D_QPU_MA; + + return true; +} + +static bool +v3d33_qpu_mul_pack(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *instr, uint64_t *packed_instr) +{ + uint32_t mux_a = instr->alu.mul.a.mux; + uint32_t mux_b = 
instr->alu.mul.b.mux; int nsrc = v3d_qpu_mul_op_num_src(instr->alu.mul.op); const struct opcode_desc *desc = - lookup_opcode_from_instr(devinfo, mul_ops, ARRAY_SIZE(mul_ops), + lookup_opcode_from_instr(devinfo, mul_ops_v33, + ARRAY_SIZE(mul_ops_v33), instr->alu.mul.op); if (!desc) @@ -1220,10 +2091,10 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, * that here. If mux a/b determine packing, it will be set below. */ if (nsrc < 2) - mux_b = ffs(desc->mux_b_mask) - 1; + mux_b = ffs(desc->mux.b_mask) - 1; if (nsrc < 1) - mux_a = ffs(desc->mux_a_mask) - 1; + mux_a = ffs(desc->mux.a_mask) - 1; switch (instr->alu.mul.op) { case V3D_QPU_M_FMUL: { @@ -1238,13 +2109,13 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, */ opcode += packed << 4; - if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a_unpack, + if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack, &packed)) { return false; } opcode |= packed << 2; - if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b_unpack, + if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b.unpack, &packed)) { return false; } @@ -1262,7 +2133,7 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, opcode |= (packed >> 1) & 1; mux_b = (packed & 1) << 2; - if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a_unpack, + if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack, &packed)) { return false; } @@ -1276,22 +2147,28 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE) return false; - if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a_unpack, + if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a.unpack, &packed)) { return false; } - if (instr->alu.mul.a_unpack == V3D_QPU_UNPACK_SWAP_16) + if (instr->alu.mul.a.unpack == V3D_QPU_UNPACK_SWAP_16) opcode = 8; else opcode |= (packed + 4) & 7; - if (instr->alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) + if (instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) return false; break; } default: + if (instr->alu.mul.op != V3D_QPU_M_NOP && + (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE || + instr->alu.mul.a.unpack != V3D_QPU_UNPACK_NONE || + instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)) { + return false; + } break; } @@ -1307,6 +2184,150 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, } static bool +v3d71_qpu_mul_pack(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *instr, uint64_t *packed_instr) +{ + uint32_t raddr_c = instr->alu.mul.a.raddr; + uint32_t raddr_d = instr->alu.mul.b.raddr; + int nsrc = v3d_qpu_mul_op_num_src(instr->alu.mul.op); + + const struct opcode_desc *desc = + lookup_opcode_from_instr(devinfo, mul_ops_v71, + ARRAY_SIZE(mul_ops_v71), + instr->alu.mul.op); + if (!desc) + return false; + + uint32_t opcode = desc->opcode_first; + + /* Some opcodes have a single valid value for their raddr_d, so set + * that here. If raddr_d determines packing, it will be set below. + */ + if (nsrc < 2) + raddr_d = ffsll(desc->raddr_mask) - 1; + + switch (instr->alu.mul.op) { + case V3D_QPU_M_FMUL: { + uint32_t packed; + + if (!v3d_qpu_float32_pack_pack(instr->alu.mul.output_pack, + &packed)) { + return false; + } + /* No need for a +1 because desc->opcode_first has a 1 in this + * field. 
+ */ + opcode += packed << 4; + + if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack, + &packed)) { + return false; + } + opcode |= packed << 2; + + if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b.unpack, + &packed)) { + return false; + } + opcode |= packed << 0; + break; + } + + case V3D_QPU_M_FMOV: { + uint32_t packed; + + if (!v3d_qpu_float32_pack_pack(instr->alu.mul.output_pack, + &packed)) { + return false; + } + raddr_d |= packed; + + if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack, + &packed)) { + return false; + } + raddr_d |= packed << 2; + break; + } + + case V3D_QPU_M_VFMUL: { + unreachable("pending v71 update"); + uint32_t packed; + + if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE) + return false; + + if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a.unpack, + &packed)) { + return false; + } + if (instr->alu.mul.a.unpack == V3D_QPU_UNPACK_SWAP_16) + opcode = 8; + else + opcode |= (packed + 4) & 7; + + if (instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) + return false; + + break; + } + + case V3D_QPU_M_MOV: { + uint32_t packed; + + if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE) + return false; + + if (!v3d_qpu_int32_unpack_pack(instr->alu.mul.a.unpack, + &packed)) { + return false; + } + + raddr_d |= packed << 2; + break; + } + + default: + if (instr->alu.mul.op != V3D_QPU_M_NOP && + (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE || + instr->alu.mul.a.unpack != V3D_QPU_UNPACK_NONE || + instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)) { + return false; + } + break; + } + + *packed_instr |= QPU_SET_FIELD(raddr_c, V3D_QPU_RADDR_C); + *packed_instr |= QPU_SET_FIELD(raddr_d, V3D_QPU_RADDR_D); + *packed_instr |= QPU_SET_FIELD(opcode, V3D_QPU_OP_MUL); + *packed_instr |= QPU_SET_FIELD(instr->alu.mul.waddr, V3D_QPU_WADDR_M); + if (instr->alu.mul.magic_write) + *packed_instr |= V3D_QPU_MM; + + return true; +} + +static bool +v3d_qpu_add_pack(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *instr, uint64_t *packed_instr) +{ + if (devinfo->ver < 71) + return v3d33_qpu_add_pack(devinfo, instr, packed_instr); + else + return v3d71_qpu_add_pack(devinfo, instr, packed_instr); +} + +static bool +v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *instr, uint64_t *packed_instr) +{ + if (devinfo->ver < 71) + return v3d33_qpu_mul_pack(devinfo, instr, packed_instr); + else + return v3d71_qpu_mul_pack(devinfo, instr, packed_instr); +} + +static bool v3d_qpu_instr_unpack_alu(const struct v3d_device_info *devinfo, uint64_t packed_instr, struct v3d_qpu_instr *instr) @@ -1334,8 +2355,14 @@ v3d_qpu_instr_unpack_alu(const struct v3d_device_info *devinfo, return false; } - instr->raddr_a = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_A); - instr->raddr_b = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_B); + if (devinfo->ver <= 71) { + /* + * For v71 this will be set on add/mul unpack, as raddr are now + * part of v3d_qpu_input + */ + instr->raddr_a = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_A); + instr->raddr_b = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_B); + } if (!v3d_qpu_add_unpack(devinfo, packed_instr, instr)) return false; @@ -1421,8 +2448,14 @@ v3d_qpu_instr_pack_alu(const struct v3d_device_info *devinfo, *packed_instr |= QPU_SET_FIELD(sig, V3D_QPU_SIG); if (instr->type == V3D_QPU_INSTR_TYPE_ALU) { - *packed_instr |= QPU_SET_FIELD(instr->raddr_a, V3D_QPU_RADDR_A); - *packed_instr |= QPU_SET_FIELD(instr->raddr_b, V3D_QPU_RADDR_B); + if (devinfo->ver < 71) { + /* + * For v71 this will be set on add/mul unpack, as raddr are 
now + * part of v3d_qpu_input + */ + *packed_instr |= QPU_SET_FIELD(instr->raddr_a, V3D_QPU_RADDR_A); + *packed_instr |= QPU_SET_FIELD(instr->raddr_b, V3D_QPU_RADDR_B); + } if (!v3d_qpu_add_pack(devinfo, instr, packed_instr)) return false; diff --git a/src/broadcom/qpu/tests/qpu_disasm.c b/src/broadcom/qpu/tests/qpu_disasm.c index e6b1918b8f0..be7b78d5ef0 100644 --- a/src/broadcom/qpu/tests/qpu_disasm.c +++ b/src/broadcom/qpu/tests/qpu_disasm.c @@ -34,29 +34,29 @@ static const struct { uint64_t inst; const char *expected; } tests[] = { - { 33, 0x3d003186bb800000ull, "nop ; nop ; ldvary" }, - { 33, 0x3c20318105829000ull, "fadd r1, r1, r5 ; nop ; thrsw" }, - { 33, 0x3c403186bb81d000ull, "vpmsetup -, r5 ; nop ; ldunif" }, - { 33, 0x3f003186bb800000ull, "nop ; nop ; ldvpm" }, - { 33, 0x3c002380b6edb000ull, "or rf0, r3, r3 ; mov vpm, r3" }, - { 33, 0x57403006bbb80000ull, "nop ; fmul r0, rf0, r5 ; ldvpm; ldunif" }, - { 33, 0x9c094adef634b000ull, "ffloor.ifb rf30.l, r3; fmul.pushz rf43.l, r5, r1.h" }, - { 33, 0xb0044c56ba326840ull, "flpop rf22, rf33 ; fmul.pushz rf49.l, r4.h, r1.abs" }, + { 33, 0x3d003186bb800000ull, "nop ; nop ; ldvary" }, + { 33, 0x3c20318105829000ull, "fadd r1, r1, r5 ; nop ; thrsw" }, + { 33, 0x3c403186bb81d000ull, "vpmsetup -, r5 ; nop ; ldunif" }, + { 33, 0x3f003186bb800000ull, "nop ; nop ; ldvpm" }, + { 33, 0x3c002380b6edb000ull, "or rf0, r3, r3 ; mov vpm, r3" }, + { 33, 0x57403006bbb80000ull, "nop ; fmul r0, rf0, r5 ; ldvpm; ldunif" }, + { 33, 0x9c094adef634b000ull, "ffloor.ifb rf30.l, r3 ; fmul.pushz rf43.l, r5, r1.h" }, + { 33, 0xb0044c56ba326840ull, "flpop rf22, rf33 ; fmul.pushz rf49.l, r4.h, r1.abs" }, /* vfmul input packing */ - { 33, 0x101e8b6e8aad4000ull, "fmax.nornn rf46, r4.l, r2.l; vfmul.ifnb rf45, r3, r5" }, - { 33, 0x1857d3c219825000ull, "faddnf.norc r2.l, r5.l, r4; vfmul.ifb rf15, r0.ll, r4; ldunif" }, - { 33, 0x1c0a0dfde2294000ull, "fcmp.ifna rf61.h, r4.abs, r2.l; vfmul rf55, r2.hh, r1" }, - { 33, 0x2011c89b402cc000ull, "fsub.norz rf27, r4.abs, r1.abs; vfmul.ifa rf34, r3.swp, r1" }, + { 33, 0x101e8b6e8aad4000ull, "fmax.nornn rf46, r4.l, r2.l ; vfmul.ifnb rf45, r3, r5" }, + { 33, 0x1857d3c219825000ull, "faddnf.norc r2.l, r5.l, r4 ; vfmul.ifb rf15, r0.ll, r4 ; ldunif" }, + { 33, 0x1c0a0dfde2294000ull, "fcmp.ifna rf61.h, r4.abs, r2.l; vfmul rf55, r2.hh, r1" }, + { 33, 0x2011c89b402cc000ull, "fsub.norz rf27, r4.abs, r1.abs; vfmul.ifa rf34, r3.swp, r1" }, - { 33, 0xe01b42ab3bb063c0ull, "vfpack.andnc rf43, rf15.l, r0.h; fmul.ifna rf10.h, r4.l, r5.abs" }, - { 33, 0x600b8b87fb4d1000ull, "fdx.ifnb rf7.h, r1.l; fmul.pushn rf46, r3.l, r2.abs" }, + { 33, 0xe01b42ab3bb063c0ull, "vfpack.andnc rf43, rf15.l, r0.h; fmul.ifna rf10.h, r4.l, r5.abs" }, + { 33, 0x600b8b87fb4d1000ull, "fdx.ifnb rf7.h, r1.l ; fmul.pushn rf46, r3.l, r2.abs" }, /* small immediates */ - { 33, 0x5de24398bbdc6218ull, "vflb.andnn rf24 ; fmul rf14, -8, rf8.h" }, - { 33, 0x25ef83d8b166f00full, "vfmin.pushn rf24, 15.ff, r5; smul24.ifnb rf15, r1, r3" }, - { 33, 0xadedcdf70839f990ull, "faddnf.pushc rf55, -16.l, r3.abs; fmul.ifb rf55.l, rf38.l, r1.h" }, - { 33, 0x7dff89fa6a01f020ull, "fsub.nornc rf58.h, 0x3b800000.l, r3.l; fmul.ifnb rf39, r0.h, r0.h" }, + { 33, 0x5de24398bbdc6218ull, "vflb.andnn rf24 ; fmul rf14, -8, rf8.h" }, + { 33, 0x25ef83d8b166f00full, "vfmin.pushn rf24, 15.ff, r5 ; smul24.ifnb rf15, r1, r3" }, + { 33, 0xadedcdf70839f990ull, "faddnf.pushc rf55, -16.l, r3.abs; fmul.ifb rf55.l, rf38.l, r1.h" }, + { 33, 0x7dff89fa6a01f020ull, "fsub.nornc rf58.h, 0x3b800000.l, r3.l; fmul.ifnb rf39, r0.h, 
r0.h" }, /* branch conditions */ { 33, 0x02000006002034c0ull, "b.anyap rf19" }, @@ -68,36 +68,36 @@ static const struct { { 33, 0x0200000300006000ull, "bu.na0 lri, a:unif" }, /* Special waddr names */ - { 33, 0x3c00318735808000ull, "vfpack tlb, r0, r1 ; nop" }, - { 33, 0xe0571c938e8d5000ull, "fmax.andc recip, r5.h, r2.l; fmul.ifb rf50.h, r3.l, r4.abs; ldunif" }, - { 33, 0xc04098d4382c9000ull, "add.pushn rsqrt, r1, r1; fmul rf35.h, r3.abs, r1.abs; ldunif" }, - { 33, 0x481edcd6b3184500ull, "vfmin.norn log, r4.hh, r0; fmul.ifnb rf51, rf20.abs, r0.l" }, - { 33, 0x041618d57c453000ull, "shl.andn exp, r3, r2; add.ifb rf35, r1, r2" }, - { 33, 0x7048e5da49272800ull, "fsub.ifa rf26, r2.l, rf32; fmul.pushc sin, r1.h, r1.abs; ldunif" }, + { 33, 0x3c00318735808000ull, "vfpack tlb, r0, r1 ; nop" }, + { 33, 0xe0571c938e8d5000ull, "fmax.andc recip, r5.h, r2.l ; fmul.ifb rf50.h, r3.l, r4.abs; ldunif" }, + { 33, 0xc04098d4382c9000ull, "add.pushn rsqrt, r1, r1 ; fmul rf35.h, r3.abs, r1.abs ; ldunif" }, + { 33, 0x481edcd6b3184500ull, "vfmin.norn log, r4.hh, r0 ; fmul.ifnb rf51, rf20.abs, r0.l" }, + { 33, 0x041618d57c453000ull, "shl.andn exp, r3, r2 ; add.ifb rf35, r1, r2" }, + { 33, 0x7048e5da49272800ull, "fsub.ifa rf26, r2.l, rf32 ; fmul.pushc sin, r1.h, r1.abs; ldunif" }, /* v4.1 signals */ - { 41, 0x1f010520cf60a000ull, "fcmp.andz rf32, r2.h, r1.h; vfmul rf20, r0.hh, r3; ldunifa" }, - { 41, 0x932045e6c16ea000ull, "fcmp rf38, r2.abs, r5; fmul rf23.l, r3, r3.abs; ldunifarf.rf1" }, - { 41, 0xd72f0434e43ae5c0ull, "fcmp rf52.h, rf23, r5.abs; fmul rf16.h, rf23, r1; ldunifarf.rf60" }, - { 41, 0xdb3048eb9d533780ull, "fmax rf43.l, r3.h, rf30; fmul rf35.h, r4, r2.l; ldunifarf.r1" }, - { 41, 0x733620471e6ce700ull, "faddnf rf7.l, rf28.h, r1.l; fmul r1, r3.h, r3.abs; ldunifarf.rsqrt2" }, - { 41, 0x9c094adef634b000ull, "ffloor.ifb rf30.l, r3; fmul.pushz rf43.l, r5, r1.h" }, + { 41, 0x1f010520cf60a000ull, "fcmp.andz rf32, r2.h, r1.h ; vfmul rf20, r0.hh, r3 ; ldunifa" }, + { 41, 0x932045e6c16ea000ull, "fcmp rf38, r2.abs, r5 ; fmul rf23.l, r3, r3.abs ; ldunifarf.rf1" }, + { 41, 0xd72f0434e43ae5c0ull, "fcmp rf52.h, rf23, r5.abs ; fmul rf16.h, rf23, r1 ; ldunifarf.rf60" }, + { 41, 0xdb3048eb9d533780ull, "fmax rf43.l, r3.h, rf30 ; fmul rf35.h, r4, r2.l ; ldunifarf.r1" }, + { 41, 0x733620471e6ce700ull, "faddnf rf7.l, rf28.h, r1.l ; fmul r1, r3.h, r3.abs ; ldunifarf.rsqrt2" }, + { 41, 0x9c094adef634b000ull, "ffloor.ifb rf30.l, r3 ; fmul.pushz rf43.l, r5, r1.h" }, /* v4.1 opcodes */ - { 41, 0x3de020c7bdfd200dull, "ldvpmg_in rf7, r2, r2; mov r3, 13" }, - { 41, 0x3de02040f8ff7201ull, "stvpmv 1, rf8 ; mov r1, 1" }, - { 41, 0xd8000e50bb2d3000ull, "sampid rf16 ; fmul rf57.h, r3, r1.l" }, + { 41, 0x3de020c7bdfd200dull, "ldvpmg_in rf7, r2, r2 ; mov r3, 13" }, + { 41, 0x3de02040f8ff7201ull, "stvpmv 1, rf8 ; mov r1, 1" }, + { 41, 0xd8000e50bb2d3000ull, "sampid rf16 ; fmul rf57.h, r3, r1.l" }, /* v4.1 SFU instructions. 
*/ - { 41, 0xe98d60c1ba2aef80ull, "recip rf1, rf62 ; fmul r3.h, r2.l, r1.l; ldunifrf.rf53" }, - { 41, 0x7d87c2debc51c000ull, "rsqrt rf30, r4 ; fmul rf11, r4.h, r2.h; ldunifrf.rf31" }, - { 41, 0xb182475abc2bb000ull, "rsqrt2 rf26, r3 ; fmul rf29.l, r2.h, r1.abs; ldunifrf.rf9" }, - { 41, 0x79880808bc0b6900ull, "sin rf8, rf36 ; fmul rf32, r2.h, r0.l; ldunifrf.rf32" }, - { 41, 0x04092094bc5a28c0ull, "exp.ifb rf20, r2 ; add r2, rf35, r2" }, - { 41, 0xe00648bfbc32a000ull, "log rf63, r2 ; fmul.andnn rf34.h, r4.l, r1.abs" }, + { 41, 0xe98d60c1ba2aef80ull, "recip rf1, rf62 ; fmul r3.h, r2.l, r1.l ; ldunifrf.rf53" }, + { 41, 0x7d87c2debc51c000ull, "rsqrt rf30, r4 ; fmul rf11, r4.h, r2.h ; ldunifrf.rf31" }, + { 41, 0xb182475abc2bb000ull, "rsqrt2 rf26, r3 ; fmul rf29.l, r2.h, r1.abs ; ldunifrf.rf9" }, + { 41, 0x79880808bc0b6900ull, "sin rf8, rf36 ; fmul rf32, r2.h, r0.l ; ldunifrf.rf32" }, + { 41, 0x04092094bc5a28c0ull, "exp.ifb rf20, r2 ; add r2, rf35, r2" }, + { 41, 0xe00648bfbc32a000ull, "log rf63, r2 ; fmul.andnn rf34.h, r4.l, r1.abs" }, /* v4.2 changes */ - { 42, 0x3c203192bb814000ull, "barrierid syncb ; nop ; thrsw" }, + { 42, 0x3c203192bb814000ull, "barrierid syncb ; nop ; thrsw" }, }; static void @@ -133,6 +133,8 @@ main(int argc, char **argv) const char *disasm_output = v3d_qpu_disasm(&devinfo, tests[i].inst); + printf("%s\n", disasm_output); + if (strcmp(disasm_output, tests[i].expected) != 0) { printf("FAIL\n"); printf(" Expected: \"%s\"\n", tests[i].expected); @@ -158,10 +160,10 @@ main(int argc, char **argv) /* Swap the operands to be sure that we test * how the QPUs distinguish between these ops. */ - swap_mux(&instr.alu.add.a, - &instr.alu.add.b); - swap_pack(&instr.alu.add.a_unpack, - &instr.alu.add.b_unpack); + swap_mux(&instr.alu.add.a.mux, + &instr.alu.add.b.mux); + swap_pack(&instr.alu.add.a.unpack, + &instr.alu.add.b.unpack); break; default: break; diff --git a/src/broadcom/simulator/meson.build b/src/broadcom/simulator/meson.build index 51f311bb094..0432fa0e52c 100644 --- a/src/broadcom/simulator/meson.build +++ b/src/broadcom/simulator/meson.build @@ -1,4 +1,4 @@ -# Copyright © 2019 Raspberry Pi +# Copyright © 2019 Raspberry Pi Ltd # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -29,8 +29,8 @@ files_per_version = files( ) v3d_args = [] -dep_v3dv3 = dependency('v3dv3', required: false) -if dep_v3dv3.found() +dep_v3d_hw = dependency('v3d_hw', required: false) +if dep_v3d_hw.found() v3d_args += '-DUSE_V3D_SIMULATOR' endif @@ -40,22 +40,22 @@ foreach ver : v3d_versions 'v3d-simulator-v' + ver, [files_per_version, v3d_xml_pack], include_directories : [ - inc_src, inc_include, inc_gallium_aux, inc_broadcom, + inc_src, inc_include, inc_broadcom, ], c_args : [v3d_args, '-DV3D_VERSION=' + ver], gnu_symbol_visibility: 'hidden', - dependencies : [dep_v3dv3, dep_libdrm, dep_valgrind], + dependencies : [dep_v3d_hw, dep_libdrm, dep_valgrind], ) endforeach libbroadcom_simulator = static_library( 'broadcom_simulator', [libbroadcom_simulator_files], - include_directories : [inc_src, inc_include, inc_gallium, inc_gallium_aux], + include_directories : [inc_src, inc_include], c_args : [v3d_args, no_override_init_args], cpp_args : [v3d_args], gnu_symbol_visibility : 'hidden', - dependencies : [dep_v3dv3, dep_libdrm, dep_valgrind], + dependencies : [dep_v3d_hw, dep_libdrm, dep_valgrind], link_with : [per_version_libs], build_by_default : false, ) diff --git 
a/src/broadcom/simulator/v3d_simulator.c b/src/broadcom/simulator/v3d_simulator.c index 494e5bb4475..1d78d7205f1 100644 --- a/src/broadcom/simulator/v3d_simulator.c +++ b/src/broadcom/simulator/v3d_simulator.c @@ -54,32 +54,32 @@ #include "util/hash_table.h" #include "util/ralloc.h" #include "util/set.h" +#include "util/simple_mtx.h" #include "util/u_dynarray.h" #include "util/u_memory.h" #include "util/u_mm.h" #include "util/u_math.h" #include <xf86drm.h> +#include "drm-uapi/amdgpu_drm.h" #include "drm-uapi/i915_drm.h" #include "drm-uapi/v3d_drm.h" #include "v3d_simulator.h" #include "v3d_simulator_wrapper.h" +#include "broadcom/common/v3d_csd.h" + /** Global (across GEM fds) state for the simulator */ static struct v3d_simulator_state { - mtx_t mutex; + simple_mtx_t mutex; mtx_t submit_lock; struct v3d_hw *v3d; int ver; - /* Base virtual address of the heap. */ - void *mem; - /* Base hardware address of the heap. */ - uint32_t mem_base; /* Size of the heap. */ - uint32_t mem_size; + uint64_t mem_size; struct mem_block *heap; struct mem_block *overflow; @@ -90,10 +90,19 @@ static struct v3d_simulator_state { /** Last performance monitor ID. */ uint32_t last_perfid; + /** Total performance counters */ + uint32_t perfcnt_total; + struct util_dynarray bin_oom; int refcount; } sim_state = { - .mutex = _MTX_INITIALIZER_NP, + .mutex = SIMPLE_MTX_INITIALIZER, +}; + +enum gem_type { + GEM_I915, + GEM_AMDGPU, + GEM_DUMB }; /** Per-GEM-fd state for the simulator. */ @@ -109,10 +118,10 @@ struct v3d_simulator_file { uint32_t active_perfid; struct mem_block *gmp; - void *gmp_vaddr; + uint64_t gmp_addr; - /** Actual GEM fd is i915, so we should use their create ioctl. */ - bool is_i915; + /** For specific gpus, use their create ioctl. Otherwise use dumb bo. */ + enum gem_type gem_type; }; /** Wrapper for drm_v3d_bo tracking the simulator-specific state. */ @@ -123,7 +132,7 @@ struct v3d_simulator_bo { struct mem_block *block; uint32_t size; uint64_t mmap_offset; - void *sim_vaddr; + uint64_t sim_addr; void *gem_vaddr; int handle; @@ -184,7 +193,8 @@ set_gmp_flags(struct v3d_simulator_file *file, assert((offset & ((1 << GMP_ALIGN2) - 1)) == 0); int gmp_offset = offset >> GMP_ALIGN2; int gmp_count = align(size, 1 << GMP_ALIGN2) >> GMP_ALIGN2; - uint32_t *gmp = file->gmp_vaddr; + uint32_t *gmp = malloc((gmp_count + gmp_offset)*sizeof(uint32_t)); + v3d_hw_read_mem(sim_state.v3d, gmp, file->gmp_addr, (gmp_offset + gmp_count)*sizeof(uint32_t)); assert(flag <= 0x3); @@ -193,6 +203,9 @@ set_gmp_flags(struct v3d_simulator_file *file, gmp[i / 16] &= ~(0x3 << bitshift); gmp[i / 16] |= flag << bitshift; } + + v3d_hw_write_mem(sim_state.v3d, file->gmp_addr, gmp, (gmp_offset + gmp_count)*sizeof(uint32_t)); + free(gmp); } /** @@ -203,26 +216,25 @@ static struct v3d_simulator_bo * v3d_create_simulator_bo(int fd, unsigned size) { struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd); + + simple_mtx_lock(&sim_state.mutex); struct v3d_simulator_bo *sim_bo = rzalloc(file, struct v3d_simulator_bo); - size = align(size, 4096); - - sim_bo->file = file; - - mtx_lock(&sim_state.mutex); sim_bo->block = u_mmAllocMem(sim_state.heap, size + 4, GMP_ALIGN2, 0); - mtx_unlock(&sim_state.mutex); + simple_mtx_unlock(&sim_state.mutex); assert(sim_bo->block); - + size = align(size, 4096); + sim_bo->file = file; set_gmp_flags(file, sim_bo->block->ofs, size, 0x3); sim_bo->size = size; /* Allocate space for the buffer in simulator memory. 
*/ - sim_bo->sim_vaddr = sim_state.mem + sim_bo->block->ofs - sim_state.mem_base; - memset(sim_bo->sim_vaddr, 0xd0, size); + sim_bo->sim_addr = sim_bo->block->ofs; + v3d_hw_set_mem(sim_state.v3d, sim_bo->sim_addr, 0xd0, size); - *(uint32_t *)(sim_bo->sim_vaddr + sim_bo->size) = BO_SENTINEL; + uint32_t sentinel = BO_SENTINEL; + v3d_hw_write_mem(sim_state.v3d, sim_bo->sim_addr + sim_bo->size, &sentinel, sizeof(sentinel)); return sim_bo; } @@ -241,7 +253,9 @@ v3d_create_simulator_bo_for_gem(int fd, int handle, unsigned size) * one. */ int ret; - if (file->is_i915) { + switch (file->gem_type) { + case GEM_I915: + { struct drm_i915_gem_mmap_gtt map = { .handle = handle, }; @@ -252,14 +266,26 @@ v3d_create_simulator_bo_for_gem(int fd, int handle, unsigned size) */ ret = drmIoctl(fd, DRM_IOCTL_I915_GEM_MMAP_GTT, &map); sim_bo->mmap_offset = map.offset; - } else { + break; + } + case GEM_AMDGPU: + { + union drm_amdgpu_gem_mmap map = { 0 }; + map.in.handle = handle; + + ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_MMAP, &map); + sim_bo->mmap_offset = map.out.addr_ptr; + break; + } + default: + { struct drm_mode_map_dumb map = { .handle = handle, }; - ret = drmIoctl(fd, DRM_IOCTL_MODE_MAP_DUMB, &map); sim_bo->mmap_offset = map.offset; } + } if (ret) { fprintf(stderr, "Failed to get MMAP offset: %d\n", ret); abort(); @@ -278,10 +304,10 @@ v3d_create_simulator_bo_for_gem(int fd, int handle, unsigned size) * don't need to go in the lookup table. */ if (handle != 0) { - mtx_lock(&sim_state.mutex); + simple_mtx_lock(&sim_state.mutex); _mesa_hash_table_insert(file->bo_map, int_to_key(handle), sim_bo); - mtx_unlock(&sim_state.mutex); + simple_mtx_unlock(&sim_state.mutex); } return sim_bo; @@ -311,14 +337,14 @@ v3d_free_simulator_bo(struct v3d_simulator_bo *sim_bo) if (sim_bo->gem_vaddr) munmap(sim_bo->gem_vaddr, sim_bo->size); - mtx_lock(&sim_state.mutex); + simple_mtx_lock(&sim_state.mutex); u_mmFreeMem(sim_bo->block); if (sim_bo->handle) { _mesa_hash_table_remove_key(sim_file->bo_map, int_to_key(sim_bo->handle)); } - mtx_unlock(&sim_state.mutex); ralloc_free(sim_bo); + simple_mtx_unlock(&sim_state.mutex); } static struct v3d_simulator_bo * @@ -327,10 +353,10 @@ v3d_get_simulator_bo(struct v3d_simulator_file *file, int gem_handle) if (gem_handle == 0) return NULL; - mtx_lock(&sim_state.mutex); + simple_mtx_lock(&sim_state.mutex); struct hash_entry *entry = _mesa_hash_table_search(file->bo_map, int_to_key(gem_handle)); - mtx_unlock(&sim_state.mutex); + simple_mtx_unlock(&sim_state.mutex); return entry ? 
entry->data : NULL;
 }
 
@@ -343,7 +369,7 @@ v3d_simulator_copy_in_handle(struct v3d_simulator_file *file, int handle)
         if (!sim_bo)
                 return;
 
-        memcpy(sim_bo->sim_vaddr, sim_bo->gem_vaddr, sim_bo->size);
+        v3d_hw_write_mem(sim_state.v3d, sim_bo->sim_addr, sim_bo->gem_vaddr, sim_bo->size);
 }
 
 static void
@@ -354,10 +380,11 @@ v3d_simulator_copy_out_handle(struct v3d_simulator_file *file, int handle)
         if (!sim_bo)
                 return;
 
-        memcpy(sim_bo->gem_vaddr, sim_bo->sim_vaddr, sim_bo->size);
+        v3d_hw_read_mem(sim_state.v3d, sim_bo->gem_vaddr, sim_bo->sim_addr, sim_bo->size);
 
-        if (*(uint32_t *)(sim_bo->sim_vaddr +
-                          sim_bo->size) != BO_SENTINEL) {
+        uint32_t sentinel;
+        v3d_hw_read_mem(sim_state.v3d, &sentinel, sim_bo->sim_addr + sim_bo->size, sizeof(sentinel));
+        if (sentinel != BO_SENTINEL) {
                 fprintf(stderr, "Buffer overflow in handle %d\n",
                         handle);
         }
@@ -395,10 +422,10 @@ v3d_get_simulator_perfmon(int fd, uint32_t perfid)
 
         struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd);
 
-        mtx_lock(&sim_state.mutex);
+        simple_mtx_lock(&sim_state.mutex);
         assert(perfid <= file->perfmons_size);
         struct v3d_simulator_perfmon *perfmon = file->perfmons[perfid - 1];
-        mtx_unlock(&sim_state.mutex);
+        simple_mtx_unlock(&sim_state.mutex);
 
         return perfmon;
 }
@@ -414,20 +441,46 @@ v3d_simulator_perfmon_switch(int fd, uint32_t perfid)
 
         perfmon = v3d_get_simulator_perfmon(fd, file->active_perfid);
         if (perfmon)
-                v3d41_simulator_perfmon_stop(sim_state.v3d,
-                                             perfmon->ncounters,
-                                             perfmon->values);
+                v3d_X_simulator(perfmon_stop)(sim_state.v3d,
+                                              perfmon->ncounters,
+                                              perfmon->values);
 
         perfmon = v3d_get_simulator_perfmon(fd, perfid);
         if (perfmon)
-                v3d41_simulator_perfmon_start(sim_state.v3d,
-                                              perfmon->ncounters,
-                                              perfmon->counters);
+                v3d_X_simulator(perfmon_start)(sim_state.v3d,
+                                               perfmon->ncounters,
+                                               perfmon->counters);
 
         file->active_perfid = perfid;
 }
 
 static int
+v3d_simulator_signal_syncobjs(int fd, struct drm_v3d_multi_sync *ms)
+{
+        struct drm_v3d_sem *out_syncs = (void *)(uintptr_t)ms->out_syncs;
+        int n_syncobjs = ms->out_sync_count;
+        uint32_t syncobjs[n_syncobjs];
+
+        for (int i = 0; i < n_syncobjs; i++)
+                syncobjs[i] = out_syncs[i].handle;
+        return drmSyncobjSignal(fd, (uint32_t *) &syncobjs, n_syncobjs);
+}
+
+static int
+v3d_simulator_process_post_deps(int fd, struct drm_v3d_extension *ext)
+{
+        int ret = 0;
+        while (ext && ext->id != DRM_V3D_EXT_ID_MULTI_SYNC)
+                ext = (void *)(uintptr_t) ext->next;
+
+        if (ext) {
+                struct drm_v3d_multi_sync *ms = (struct drm_v3d_multi_sync *) ext;
+                ret = v3d_simulator_signal_syncobjs(fd, ms);
+        }
+        return ret;
+}
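The post-dependency handling just added works by walking the submit's extension chain, which the v3d UAPI threads through user-space pointers stored as u64s. A compact, self-contained sketch of that walk, using stand-in types (the real struct drm_v3d_extension lives in drm-uapi/v3d_drm.h and has more fields):

    #include <stdint.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Stand-in for drm_v3d_extension: extensions form a singly linked
     * list threaded through u64 user pointers; 0 terminates the list. */
    struct demo_ext {
            uint64_t next;
            uint32_t id;
    };

    #define DEMO_EXT_ID_MULTI_SYNC 0x01

    static struct demo_ext *
    demo_find_ext(struct demo_ext *ext, uint32_t id)
    {
            /* Each hop casts the u64 back to a host pointer, the same
             * way the simulator does with (void *)(uintptr_t)next. */
            while (ext && ext->id != id)
                    ext = (struct demo_ext *)(uintptr_t)ext->next;
            return ext;
    }

    int main(void)
    {
            struct demo_ext multisync = { .next = 0,
                                          .id = DEMO_EXT_ID_MULTI_SYNC };
            struct demo_ext first = { .next = (uintptr_t)&multisync,
                                      .id = 0x7f };

            printf("found multisync: %d\n",
                   demo_find_ext(&first, DEMO_EXT_ID_MULTI_SYNC) != NULL);
            return 0;
    }

Since the simulator serializes job execution, only the out-syncs need work here: once the job has run, the multisync extension's syncobjs can simply be signaled, which is what v3d_simulator_signal_syncobjs() does via drmSyncobjSignal().

+
+static int
 v3d_simulator_submit_cl_ioctl(int fd, struct drm_v3d_submit_cl *submit)
 {
         struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd);
@@ -441,11 +494,7 @@ v3d_simulator_submit_cl_ioctl(int fd, struct drm_v3d_submit_cl *submit)
         bin_fd = fd;
 
         v3d_simulator_perfmon_switch(fd, submit->perfmon_id);
-
-        if (sim_state.ver >= 41)
-                v3d41_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs);
-        else
-                v3d33_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs);
+        v3d_X_simulator(submit_cl_ioctl)(sim_state.v3d, submit, file->gmp->ofs);
 
         util_dynarray_foreach(&sim_state.bin_oom,
                               struct v3d_simulator_bo *, sim_bo) {
@@ -459,7 +508,12 @@ v3d_simulator_submit_cl_ioctl(int fd, struct drm_v3d_submit_cl *submit)
         if (ret)
                 return ret;
 
-        return 0;
+        if (submit->flags & DRM_V3D_SUBMIT_EXTENSION) {
+                struct drm_v3d_extension *ext = (void *)(uintptr_t)submit->extensions;
+                ret = v3d_simulator_process_post_deps(fd, ext);
+        }
+
+        return ret;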
} /** @@ -488,14 +542,30 @@ v3d_simulator_create_bo_ioctl(int fd, struct drm_v3d_create_bo *args) * native ioctl in case we're on a render node. */ int ret; - if (file->is_i915) { + switch (file->gem_type) { + case GEM_I915: + { struct drm_i915_gem_create create = { .size = args->size, }; + ret = drmIoctl(fd, DRM_IOCTL_I915_GEM_CREATE, &create); args->handle = create.handle; - } else { + break; + } + case GEM_AMDGPU: + { + union drm_amdgpu_gem_create create = { 0 }; + create.in.bo_size = args->size; + + ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_CREATE, &create); + + args->handle = create.out.handle; + break; + } + default: + { struct drm_mode_create_dumb create = { .width = 128, .bpp = 8, @@ -507,7 +577,7 @@ v3d_simulator_create_bo_ioctl(int fd, struct drm_v3d_create_bo *args) args->handle = create.handle; } - + } if (ret == 0) { struct v3d_simulator_bo *sim_bo = v3d_create_simulator_bo_for_gem(fd, args->handle, @@ -564,15 +634,6 @@ v3d_simulator_gem_close_ioctl(int fd, struct drm_gem_close *args) } static int -v3d_simulator_get_param_ioctl(int fd, struct drm_v3d_get_param *args) -{ - if (sim_state.ver >= 41) - return v3d41_simulator_get_param_ioctl(sim_state.v3d, args); - else - return v3d33_simulator_get_param_ioctl(sim_state.v3d, args); -} - -static int v3d_simulator_submit_tfu_ioctl(int fd, struct drm_v3d_submit_tfu *args) { struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd); @@ -583,13 +644,18 @@ v3d_simulator_submit_tfu_ioctl(int fd, struct drm_v3d_submit_tfu *args) v3d_simulator_copy_in_handle(file, args->bo_handles[2]); v3d_simulator_copy_in_handle(file, args->bo_handles[3]); - if (sim_state.ver >= 41) - ret = v3d41_simulator_submit_tfu_ioctl(sim_state.v3d, args); - else - ret = v3d33_simulator_submit_tfu_ioctl(sim_state.v3d, args); + ret = v3d_X_simulator(submit_tfu_ioctl)(sim_state.v3d, args); v3d_simulator_copy_out_handle(file, args->bo_handles[0]); + if (ret) + return ret; + + if (args->flags & DRM_V3D_SUBMIT_EXTENSION) { + struct drm_v3d_extension *ext = (void *)(uintptr_t)args->extensions; + ret = v3d_simulator_process_post_deps(fd, ext); + } + return ret; } @@ -605,15 +671,311 @@ v3d_simulator_submit_csd_ioctl(int fd, struct drm_v3d_submit_csd *args) v3d_simulator_perfmon_switch(fd, args->perfmon_id); - if (sim_state.ver >= 41) - ret = v3d41_simulator_submit_csd_ioctl(sim_state.v3d, args, - file->gmp->ofs); - else - ret = -1; + ret = v3d_X_simulator(submit_csd_ioctl)(sim_state.v3d, args, + file->gmp->ofs); for (int i = 0; i < args->bo_handle_count; i++) v3d_simulator_copy_out_handle(file, bo_handles[i]); + if (ret < 0) + return ret; + + if (args->flags & DRM_V3D_SUBMIT_EXTENSION) { + struct drm_v3d_extension *ext = (void *)(uintptr_t)args->extensions; + ret = v3d_simulator_process_post_deps(fd, ext); + } + + return ret; +} + +static void +v3d_rewrite_csd_job_wg_counts_from_indirect(int fd, + struct drm_v3d_extension *ext, + struct drm_v3d_submit_cpu *args) +{ + struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd); + struct drm_v3d_indirect_csd *indirect_csd = (struct drm_v3d_indirect_csd *) ext; + uint32_t *bo_handles = (uint32_t *)(uintptr_t)args->bo_handles; + + assert(args->bo_handle_count == 1); + struct v3d_simulator_bo *bo = v3d_get_simulator_bo(file, bo_handles[0]); + struct v3d_simulator_bo *indirect = v3d_get_simulator_bo(file, indirect_csd->indirect); + struct drm_v3d_submit_csd *submit = &indirect_csd->submit; + + uint32_t *wg_counts = (uint32_t *) (bo->gem_vaddr + indirect_csd->offset); + + if (wg_counts[0] == 0 || wg_counts[1] == 0 
|| wg_counts[2] == 0) + return; + + submit->cfg[0] = wg_counts[0] << V3D_CSD_CFG012_WG_COUNT_SHIFT; + submit->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT; + submit->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT; + submit->cfg[4] = DIV_ROUND_UP(indirect_csd->wg_size, 16) * + (wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1; + + for (int i = 0; i < 3; i++) { + /* 0xffffffff indicates that the uniform rewrite is not needed */ + if (indirect_csd->wg_uniform_offsets[i] != 0xffffffff) { + uint32_t uniform_idx = indirect_csd->wg_uniform_offsets[i]; + ((uint32_t *) indirect->gem_vaddr)[uniform_idx] = wg_counts[i]; + } + } + + v3d_simulator_submit_csd_ioctl(fd, submit); +} + +static void +v3d_timestamp_query(int fd, + struct drm_v3d_extension *ext, + struct drm_v3d_submit_cpu *args) +{ + struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd); + struct drm_v3d_timestamp_query *timestamp_query = (struct drm_v3d_timestamp_query *) ext; + uint32_t *bo_handles = (uint32_t *)(uintptr_t)args->bo_handles; + struct v3d_simulator_bo *bo = v3d_get_simulator_bo(file, bo_handles[0]); + uint32_t *offsets = (void *)(uintptr_t) timestamp_query->offsets; + uint32_t *syncs = (void *)(uintptr_t) timestamp_query->syncs; + + struct timespec t; + clock_gettime(CLOCK_MONOTONIC, &t); + + for (uint32_t i = 0; i < timestamp_query->count; i++) { + uint64_t value = (i == 0) ? t.tv_sec * 1000000000ull + t.tv_nsec : 0ull; + v3d_hw_write_mem(sim_state.v3d, bo->sim_addr + offsets[i], &value, sizeof(value)); + } + + drmSyncobjSignal(fd, syncs, timestamp_query->count); +} + +static void +v3d_reset_timestamp_queries(int fd, + struct drm_v3d_extension *ext, + struct drm_v3d_submit_cpu *args) +{ + struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd); + struct drm_v3d_reset_timestamp_query *reset = (struct drm_v3d_reset_timestamp_query *) ext; + uint32_t *bo_handles = (uint32_t *)(uintptr_t)args->bo_handles; + struct v3d_simulator_bo *bo = v3d_get_simulator_bo(file, bo_handles[0]); + uint32_t *syncs = (void *)(uintptr_t) reset->syncs; + + v3d_hw_set_mem(sim_state.v3d, bo->sim_addr + reset->offset, 0, reset->count); + + drmSyncobjReset(fd, syncs, reset->count); +} + +static void +write_to_buffer(void *dst, uint32_t idx, bool do_64bit, uint64_t value) +{ + if (do_64bit) { + uint64_t *dst64 = (uint64_t *) dst; + dst64[idx] = value; + } else { + uint32_t *dst32 = (uint32_t *) dst; + dst32[idx] = (uint32_t) value; + } +} + +static void +v3d_copy_query_results(int fd, + struct drm_v3d_extension *ext, + struct drm_v3d_submit_cpu *args) +{ + struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd); + struct drm_v3d_copy_timestamp_query *copy = (struct drm_v3d_copy_timestamp_query *) ext; + uint32_t *bo_handles = (uint32_t *)(uintptr_t)args->bo_handles; + struct v3d_simulator_bo *bo = v3d_get_simulator_bo(file, bo_handles[0]); + struct v3d_simulator_bo *timestamp = v3d_get_simulator_bo(file, bo_handles[1]); + uint32_t *offsets = (void *)(uintptr_t) copy->offsets; + uint32_t *syncs = (void *)(uintptr_t) copy->syncs; + bool available, write_result; + uint8_t *data = malloc(copy->count * copy->stride); + uint64_t query_val; + + uint8_t *p = data; + for (uint32_t i = 0; i < copy->count; i++) { + available = (drmSyncobjWait(fd, &syncs[i], 1, 0, 0, NULL) == 0); + + write_result = available || copy->do_partial; + if (write_result) { + v3d_hw_read_mem(sim_state.v3d, &query_val, timestamp->sim_addr + offsets[i], sizeof(uint64_t)); + write_to_buffer(p, 0, copy->do_64bit, query_val); + } + + if 
(copy->availability_bit) + write_to_buffer(p, 1, copy->do_64bit, available ? 1u : 0u); + + p += copy->stride; + } + + v3d_hw_write_mem(sim_state.v3d, bo->sim_addr + copy->offset, data, copy->count * copy->stride); + free(data); +} + +static void +v3d_reset_performance_queries(int fd, + struct drm_v3d_extension *ext, + struct drm_v3d_submit_cpu *args) +{ + struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd); + struct drm_v3d_reset_performance_query *reset = (struct drm_v3d_reset_performance_query *) ext; + uint64_t *kperfmon_ids = (void *)(uintptr_t) reset->kperfmon_ids; + uint32_t *syncs = (void *)(uintptr_t) reset->syncs; + struct v3d_simulator_perfmon *perfmon; + + for (uint32_t i = 0; i < reset->count; i++) { + uint32_t *ids = (void *)(uintptr_t) kperfmon_ids[i]; + + for (uint32_t j = 0; j < reset->nperfmons; j++) { + mtx_lock(&sim_state.submit_lock); + + /* Stop the perfmon if it is still active */ + if (ids[j] == file->active_perfid) + v3d_simulator_perfmon_switch(fd, 0); + + mtx_unlock(&sim_state.submit_lock); + + perfmon = v3d_get_simulator_perfmon(fd, ids[j]); + + if (!perfmon) + return; + + memset(perfmon->values, 0, perfmon->ncounters * sizeof(uint64_t)); + } + } + + drmSyncobjReset(fd, syncs, reset->count); +} + +static void +v3d_write_performance_query_result(int fd, + struct drm_v3d_copy_performance_query *copy, + uint32_t *kperfmon_ids, + void *data) +{ + struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd); + struct v3d_simulator_perfmon *perfmon; + uint64_t counter_values[sim_state.perfcnt_total]; + + for (uint32_t i = 0; i < copy->nperfmons; i++) { + mtx_lock(&sim_state.submit_lock); + + /* Stop the perfmon if it is still active */ + if (kperfmon_ids[i] == file->active_perfid) + v3d_simulator_perfmon_switch(fd, 0); + + mtx_unlock(&sim_state.submit_lock); + + perfmon = v3d_get_simulator_perfmon(fd, kperfmon_ids[i]); + + if (!perfmon) + return; + + memcpy(&counter_values[i * DRM_V3D_MAX_PERF_COUNTERS], perfmon->values, + perfmon->ncounters * sizeof(uint64_t)); + } + + for (uint32_t i = 0; i < copy->ncounters; i++) + write_to_buffer(data, i, copy->do_64bit, counter_values[i]); +} + +static void +v3d_copy_performance_query(int fd, + struct drm_v3d_extension *ext, + struct drm_v3d_submit_cpu *args) +{ + struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd); + struct drm_v3d_copy_performance_query *copy = (struct drm_v3d_copy_performance_query *) ext; + uint32_t *bo_handles = (uint32_t *)(uintptr_t)args->bo_handles; + struct v3d_simulator_bo *bo = v3d_get_simulator_bo(file, bo_handles[0]); + uint64_t *kperfmon_ids = (void *)(uintptr_t) copy->kperfmon_ids; + uint32_t *syncs = (void *)(uintptr_t) copy->syncs; + bool available, write_result; + uint8_t *data = malloc(copy->count * copy->stride); + + uint8_t *p = data; + for (uint32_t i = 0; i < copy->count; i++) { + /* Although we don't have in_syncs implemented in the simulator, + * we don't need to wait for the availability of the syncobjs, + * as they are signaled by CL and CSD jobs, which are serialized + * by the simulator. + */ + available = (drmSyncobjWait(fd, &syncs[i], 1, 0, 0, NULL) == 0); + + write_result = available || copy->do_partial; + if (write_result) { + v3d_write_performance_query_result(fd, copy, + (void *)(uintptr_t) kperfmon_ids[i], + p); + } + + if (copy->availability_bit) { + write_to_buffer(p, copy->ncounters, copy->do_64bit, + available ? 
1u : 0u);
+                }
+
+                p += copy->stride;
+        }
+
+        v3d_hw_write_mem(sim_state.v3d, bo->sim_addr + copy->offset, data, copy->count * copy->stride);
+        free(data);
+}
+
+static int
+v3d_simulator_submit_cpu_ioctl(int fd, struct drm_v3d_submit_cpu *args)
+{
+        struct drm_v3d_extension *ext = (void *)(uintptr_t)args->extensions;
+        struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd);
+        uint32_t *bo_handles = (uint32_t *)(uintptr_t)args->bo_handles;
+        int ret = 0;
+
+        for (int i = 0; i < args->bo_handle_count; i++)
+                v3d_simulator_copy_in_handle(file, bo_handles[i]);
+
+        while (ext) {
+                switch (ext->id) {
+                case DRM_V3D_EXT_ID_MULTI_SYNC:
+                        /* As the simulator serializes the jobs, we don't need
+                         * to handle the in_syncs here. The out_syncs are handled
+                         * at the end of the ioctl in v3d_simulator_process_post_deps().
+                         */
+                        break;
+                case DRM_V3D_EXT_ID_CPU_INDIRECT_CSD:
+                        v3d_rewrite_csd_job_wg_counts_from_indirect(fd, ext, args);
+                        break;
+                case DRM_V3D_EXT_ID_CPU_TIMESTAMP_QUERY:
+                        v3d_timestamp_query(fd, ext, args);
+                        break;
+                case DRM_V3D_EXT_ID_CPU_RESET_TIMESTAMP_QUERY:
+                        v3d_reset_timestamp_queries(fd, ext, args);
+                        break;
+                case DRM_V3D_EXT_ID_CPU_COPY_TIMESTAMP_QUERY:
+                        v3d_copy_query_results(fd, ext, args);
+                        break;
+                case DRM_V3D_EXT_ID_CPU_RESET_PERFORMANCE_QUERY:
+                        v3d_reset_performance_queries(fd, ext, args);
+                        break;
+                case DRM_V3D_EXT_ID_CPU_COPY_PERFORMANCE_QUERY:
+                        v3d_copy_performance_query(fd, ext, args);
+                        break;
+                default:
+                        fprintf(stderr, "Unknown CPU job 0x%08x\n", (int)ext->id);
+                        break;
+                }
+
+                ext = (void *)(uintptr_t) ext->next;
+        }
+
+        for (int i = 0; i < args->bo_handle_count; i++)
+                v3d_simulator_copy_out_handle(file, bo_handles[i]);
+
+        if (ret < 0)
+                return ret;
+
+        if (args->flags & DRM_V3D_SUBMIT_EXTENSION) {
+                ext = (void *)(uintptr_t)args->extensions;
+                ret = v3d_simulator_process_post_deps(fd, ext);
+        }
+
         return ret;
 }
 
@@ -631,7 +993,7 @@ v3d_simulator_perfmon_create_ioctl(int fd, struct drm_v3d_perfmon_create *args)
         perfmon->ncounters = args->ncounters;
 
         for (int i = 0; i < args->ncounters; i++) {
-                if (args->counters[i] >= V3D_PERFCNT_NUM) {
+                if (args->counters[i] >= sim_state.perfcnt_total) {
                         ralloc_free(perfmon);
                         return -EINVAL;
                 } else {
@@ -639,10 +1001,10 @@ v3d_simulator_perfmon_create_ioctl(int fd, struct drm_v3d_perfmon_create *args)
                 }
         }
 
-        mtx_lock(&sim_state.mutex);
+        simple_mtx_lock(&sim_state.mutex);
         args->id = perfmons_next_id(file);
         file->perfmons[args->id - 1] = perfmon;
-        mtx_unlock(&sim_state.mutex);
+        simple_mtx_unlock(&sim_state.mutex);
 
         return 0;
 }
@@ -657,9 +1019,9 @@ v3d_simulator_perfmon_destroy_ioctl(int fd, struct drm_v3d_perfmon_destroy *args
 
         if (!perfmon)
                 return -EINVAL;
 
-        mtx_lock(&sim_state.mutex);
+        simple_mtx_lock(&sim_state.mutex);
         file->perfmons[args->id - 1] = NULL;
-        mtx_unlock(&sim_state.mutex);
+        simple_mtx_unlock(&sim_state.mutex);
 
         ralloc_free(perfmon);
 
@@ -712,7 +1074,7 @@ v3d_simulator_ioctl(int fd, unsigned long request, void *args)
                 return 0;
 
         case DRM_IOCTL_V3D_GET_PARAM:
-                return v3d_simulator_get_param_ioctl(fd, args);
+                return v3d_X_simulator(get_param_ioctl)(sim_state.v3d, args);
 
         case DRM_IOCTL_GEM_CLOSE:
                 return v3d_simulator_gem_close_ioctl(fd, args);
@@ -723,6 +1085,9 @@ v3d_simulator_ioctl(int fd, unsigned long request, void *args)
         case DRM_IOCTL_V3D_SUBMIT_CSD:
                 return v3d_simulator_submit_csd_ioctl(fd, args);
 
+        case DRM_IOCTL_V3D_SUBMIT_CPU:
+                return v3d_simulator_submit_cpu_ioctl(fd, args);
+
         case DRM_IOCTL_V3D_PERFMON_CREATE:
                 return v3d_simulator_perfmon_create_ioctl(fd, args);
 
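Both query-copy helpers share one result layout: count slots of stride bytes each, with the 64-bit query value first and an optional availability word behind it, so the staging buffer and the final write back to the BO must both span count * stride bytes. A sketch of that layout with invented helper names (not driver API):

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>
    #include <stdio.h>

    /* Pack `count` query results into a staging buffer of count * stride
     * bytes, mirroring the malloc/stride walk in the copy helpers above. */
    static uint8_t *
    demo_pack_results(const uint64_t *values, const uint8_t *available,
                      uint32_t count, uint32_t stride, int availability_bit)
    {
            uint8_t *data = malloc((size_t)count * stride);
            uint8_t *p = data;

            if (!data)
                    return NULL;

            for (uint32_t i = 0; i < count; i++) {
                    /* Value first, optional availability word after it. */
                    memcpy(p, &values[i], sizeof(values[i]));
                    if (availability_bit) {
                            uint64_t avail = available[i];
                            memcpy(p + sizeof(values[i]), &avail, sizeof(avail));
                    }
                    p += stride;
            }
            return data;
    }

    int main(void)
    {
            uint64_t values[2] = { 123, 456 };
            uint8_t available[2] = { 1, 0 };
            uint8_t *data = demo_pack_results(values, available, 2, 16, 1);
            uint64_t first;

            memcpy(&first, data, sizeof(first));
            printf("first value: %llu\n", (unsigned long long)first);
            free(data);
            return 0;
    }

The stride walk is what lets the same helper serve 32- and 64-bit results and partial copies: only the per-slot writes change, never the slot spacing.

@@ -747,20 +1112,28 @@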
v3d_simulator_get_mem_size(void) return sim_state.mem_size; } +uint32_t +v3d_simulator_get_mem_free(void) +{ + uint32_t total_free = 0; + struct mem_block *p; + for (p = sim_state.heap->next_free; p != sim_state.heap; p = p->next_free) + total_free += p->size; + return total_free; +} + static void v3d_simulator_init_global() { - mtx_lock(&sim_state.mutex); + simple_mtx_lock(&sim_state.mutex); if (sim_state.refcount++) { - mtx_unlock(&sim_state.mutex); + simple_mtx_unlock(&sim_state.mutex); return; } sim_state.v3d = v3d_hw_auto_new(NULL); v3d_hw_alloc_mem(sim_state.v3d, 1024 * 1024 * 1024); - sim_state.mem_base = - v3d_hw_get_mem(sim_state.v3d, &sim_state.mem_size, - &sim_state.mem); + v3d_hw_get_mem(sim_state.v3d, &sim_state.mem_size); /* Allocate from anywhere from 4096 up. We don't allocate at 0, * because for OQs and some other addresses in the HW, 0 means @@ -772,11 +1145,11 @@ v3d_simulator_init_global() * and land there. */ struct mem_block *b = u_mmAllocMem(sim_state.heap, 4096, GMP_ALIGN2, 0); - memset(sim_state.mem + b->ofs - sim_state.mem_base, 0xd0, 4096); + v3d_hw_set_mem(sim_state.v3d, b->ofs, 0xd0, 4096); sim_state.ver = v3d_hw_get_version(sim_state.v3d); - mtx_unlock(&sim_state.mutex); + simple_mtx_unlock(&sim_state.mutex); sim_state.fd_map = _mesa_hash_table_create(NULL, @@ -785,10 +1158,8 @@ v3d_simulator_init_global() util_dynarray_init(&sim_state.bin_oom, NULL); - if (sim_state.ver >= 41) - v3d41_simulator_init_regs(sim_state.v3d); - else - v3d33_simulator_init_regs(sim_state.v3d); + v3d_X_simulator(init_regs)(sim_state.v3d); + v3d_X_simulator(get_perfcnt_total)(&sim_state.perfcnt_total); } struct v3d_simulator_file * @@ -800,7 +1171,11 @@ v3d_simulator_init(int fd) drmVersionPtr version = drmGetVersion(fd); if (version && strncmp(version->name, "i915", version->name_len) == 0) - sim_file->is_i915 = true; + sim_file->gem_type = GEM_I915; + else if (version && strncmp(version->name, "amdgpu", version->name_len) == 0) + sim_file->gem_type = GEM_AMDGPU; + else + sim_file->gem_type = GEM_DUMB; drmFreeVersion(version); sim_file->bo_map = @@ -808,15 +1183,14 @@ v3d_simulator_init(int fd) _mesa_hash_pointer, _mesa_key_pointer_equal); - mtx_lock(&sim_state.mutex); + simple_mtx_lock(&sim_state.mutex); _mesa_hash_table_insert(sim_state.fd_map, int_to_key(fd + 1), sim_file); - mtx_unlock(&sim_state.mutex); + simple_mtx_unlock(&sim_state.mutex); sim_file->gmp = u_mmAllocMem(sim_state.heap, 8096, GMP_ALIGN2, 0); - sim_file->gmp_vaddr = (sim_state.mem + sim_file->gmp->ofs - - sim_state.mem_base); - memset(sim_file->gmp_vaddr, 0, 8096); + sim_file->gmp_addr = sim_file->gmp->ofs; + v3d_hw_set_mem(sim_state.v3d, sim_file->gmp_addr, 0, 8096); return sim_file; } @@ -824,16 +1198,16 @@ v3d_simulator_init(int fd) void v3d_simulator_destroy(struct v3d_simulator_file *sim_file) { - mtx_lock(&sim_state.mutex); + simple_mtx_lock(&sim_state.mutex); if (!--sim_state.refcount) { _mesa_hash_table_destroy(sim_state.fd_map, NULL); util_dynarray_fini(&sim_state.bin_oom); u_mmDestroy(sim_state.heap); - /* No memsetting the struct, because it contains the mutex. */ - sim_state.mem = NULL; + /* No memsetting the sim_state struct, because it contains the + * mutex. 
+         */
         }
-        mtx_unlock(&sim_state.mutex);
         ralloc_free(sim_file);
+        simple_mtx_unlock(&sim_state.mutex);
 }
 
 #endif /* USE_V3D_SIMULATOR */
diff --git a/src/broadcom/simulator/v3d_simulator.h b/src/broadcom/simulator/v3d_simulator.h
index ef6bf44f19f..03575ae8951 100644
--- a/src/broadcom/simulator/v3d_simulator.h
+++ b/src/broadcom/simulator/v3d_simulator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2019 Raspberry Pi
+ * Copyright © 2019 Raspberry Pi Ltd
  * Copyright © 2014-2017 Broadcom
  * Copyright (C) 2012 Rob Clark <robclark@freedesktop.org>
  *
@@ -40,17 +40,35 @@ uint32_t v3d_simulator_get_spill(uint32_t spill_size);
 int v3d_simulator_ioctl(int fd, unsigned long request, void *arg);
 void v3d_simulator_open_from_handle(int fd, int handle, uint32_t size);
 uint32_t v3d_simulator_get_mem_size(void);
+uint32_t v3d_simulator_get_mem_free(void);
 
 #ifdef v3dX
 # include "v3dx_simulator.h"
 #else
-# define v3dX(x) v3d33_##x
+# define v3dX(x) v3d42_##x
 # include "v3dx_simulator.h"
 # undef v3dX
 
-# define v3dX(x) v3d41_##x
+# define v3dX(x) v3d71_##x
 # include "v3dx_simulator.h"
 # undef v3dX
+
 #endif
 
+/* Helper to call simulator ver specific functions */
+#define v3d_X_simulator(thing) ({ \
+        __typeof(&v3d42_simulator_##thing) v3d_X_sim_thing;\
+        switch (sim_state.ver) { \
+        case 42: \
+                v3d_X_sim_thing = &v3d42_simulator_##thing; \
+                break; \
+        case 71: \
+                v3d_X_sim_thing = &v3d71_simulator_##thing; \
+                break; \
+        default: \
+                unreachable("Unsupported hardware generation"); \
+        } \
+        v3d_X_sim_thing; \
+})
+
 #endif
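The v3d_X_simulator() helper above resolves a per-version function at run time using a GNU C statement expression plus __typeof, so every call site stays type-checked against the v3d42_ variant's signature. A self-contained sketch of the same pattern with made-up names (the real macro dispatches on sim_state.ver):

    #include <stdio.h>

    /* Stand-ins for the per-version symbols produced by compiling the
     * same source once per V3D_VERSION. */
    static int demo42_get_ver(void) { return 42; }
    static int demo71_get_ver(void) { return 71; }

    #define demo_X(ver, thing) ({                            \
            __typeof(&demo42_##thing) demo_X_fn;             \
            switch (ver) {                                   \
            case 42: demo_X_fn = &demo42_##thing; break;     \
            default: demo_X_fn = &demo71_##thing; break;     \
            }                                                \
            demo_X_fn;                                       \
    })

    int main(void)
    {
            printf("%d\n", demo_X(42, get_ver)()); /* prints 42 */
            printf("%d\n", demo_X(71, get_ver)()); /* prints 71 */
            return 0;
    }

The statement expression evaluates to the chosen function pointer, so the macro invocation can be called directly, exactly as v3d_X_simulator(get_param_ioctl)(...) is used in v3d_simulator_ioctl().

diff --git a/src/broadcom/simulator/v3d_simulator_wrapper.cpp b/src/broadcom/simulator/v3d_simulator_wrapper.cpp
index 88e439255d3..ef9bec492ee 100644
--- a/src/broadcom/simulator/v3d_simulator_wrapper.cpp
+++ b/src/broadcom/simulator/v3d_simulator_wrapper.cpp
@@ -30,12 +30,6 @@
 #ifdef USE_V3D_SIMULATOR
 
 #include "v3d_simulator_wrapper.h"
-
-#define V3D_TECH_VERSION 3
-#define V3D_REVISION 3
-#define V3D_SUB_REV 0
-#define V3D_HIDDEN_REV 0
-#define V3D_COMPAT_REV 0
 #include "v3d_hw_auto.h"
 
 extern "C" {
@@ -45,13 +39,29 @@ struct v3d_hw *v3d_hw_auto_new(void *in_params)
         return v3d_hw_auto_make_unique().release();
 }
 
+uint64_t v3d_hw_get_mem(const struct v3d_hw *hw, uint64_t *size)
+{
+        uint64_t addr;
+        assert(hw->get_mem(&addr, size));
+        return addr;
+}
+
+void v3d_hw_set_mem(struct v3d_hw *hw, uint64_t addr, uint8_t value, uint64_t size)
+{
+        hw->set_mem(addr, value, size);
+}
+
+void v3d_hw_write_mem(struct v3d_hw *hw, uint64_t addr, const void *p, uint64_t size)
+{
+        hw->write_mem(addr, p, size);
+}
 
-uint32_t v3d_hw_get_mem(const struct v3d_hw *hw, uint32_t *size, void **p)
+void v3d_hw_read_mem(struct v3d_hw *hw, void *p, uint64_t addr, uint64_t size)
 {
-        return hw->get_mem(size, p);
+        hw->read_mem(p, addr, size);
 }
 
-bool v3d_hw_alloc_mem(struct v3d_hw *hw, size_t min_size)
+bool v3d_hw_alloc_mem(struct v3d_hw *hw, uint64_t min_size)
 {
         return hw->alloc_mem(min_size) == V3D_HW_ALLOC_SUCCESS;
 }
 
diff --git a/src/broadcom/simulator/v3d_simulator_wrapper.h b/src/broadcom/simulator/v3d_simulator_wrapper.h
index 05b2a3361ac..7f2be57a3be 100644
--- a/src/broadcom/simulator/v3d_simulator_wrapper.h
+++ b/src/broadcom/simulator/v3d_simulator_wrapper.h
@@ -31,8 +31,11 @@ extern "C" {
 #endif
 
 struct v3d_hw *v3d_hw_auto_new(void *params);
-uint32_t v3d_hw_get_mem(const struct v3d_hw *hw, uint32_t *size, void **p);
-bool v3d_hw_alloc_mem(struct v3d_hw *hw, size_t min_size);
+uint64_t v3d_hw_get_mem(const struct v3d_hw *hw, uint64_t *size);
+void v3d_hw_set_mem(struct v3d_hw *hw, uint64_t addr, uint8_t value, uint64_t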
size); +void v3d_hw_write_mem(struct v3d_hw *hw, uint64_t add, const void *p, uint64_t size); +void v3d_hw_read_mem(struct v3d_hw *hw, void *p, uint64_t addr, uint64_t size); +bool v3d_hw_alloc_mem(struct v3d_hw *hw, uint64_t min_size); uint32_t v3d_hw_read_reg(struct v3d_hw *hw, uint32_t reg); void v3d_hw_write_reg(struct v3d_hw *hw, uint32_t reg, uint32_t val); void v3d_hw_tick(struct v3d_hw *hw); diff --git a/src/broadcom/simulator/v3dx_simulator.c b/src/broadcom/simulator/v3dx_simulator.c index 07bbbe2f8c9..ea682955dca 100644 --- a/src/broadcom/simulator/v3dx_simulator.c +++ b/src/broadcom/simulator/v3dx_simulator.c @@ -40,32 +40,25 @@ #include "v3d_simulator.h" #include "v3d_simulator_wrapper.h" +#include "common/v3d_performance_counters.h" + #include "util/macros.h" #include "util/bitscan.h" #include "drm-uapi/v3d_drm.h" #define HW_REGISTER_RO(x) (x) #define HW_REGISTER_RW(x) (x) -#if V3D_VERSION >= 41 -#include "libs/core/v3d/registers/4.1.35.0/v3d.h" +#if V3D_VERSION == 71 +#include "libs/core/v3d/registers/7.1.7.0/v3d.h" #else -#include "libs/core/v3d/registers/3.3.0.0/v3d.h" +#if V3D_VERSION == 42 +#include "libs/core/v3d/registers/4.2.14.0/v3d.h" +#endif #endif #define V3D_WRITE(reg, val) v3d_hw_write_reg(v3d, reg, val) #define V3D_READ(reg) v3d_hw_read_reg(v3d, reg) -static void -v3d_invalidate_l3(struct v3d_hw *v3d) -{ -#if V3D_VERSION < 40 - uint32_t gca_ctrl = V3D_READ(V3D_GCA_CACHE_CTRL); - - V3D_WRITE(V3D_GCA_CACHE_CTRL, gca_ctrl | V3D_GCA_CACHE_CTRL_FLUSH_SET); - V3D_WRITE(V3D_GCA_CACHE_CTRL, gca_ctrl & ~V3D_GCA_CACHE_CTRL_FLUSH_SET); -#endif -} - /* Invalidates the L2C cache. This is a read-only cache for uniforms and instructions. */ static void v3d_invalidate_l2c(struct v3d_hw *v3d) @@ -150,7 +143,6 @@ v3d_invalidate_slices(struct v3d_hw *v3d) static void v3d_invalidate_caches(struct v3d_hw *v3d) { - v3d_invalidate_l3(v3d); v3d_invalidate_l2c(v3d); v3d_invalidate_l2t(v3d); v3d_invalidate_slices(v3d); @@ -178,38 +170,48 @@ v3d_flush_caches(struct v3d_hw *v3d) v3d_flush_l2t(v3d); } +#if V3D_VERSION < 71 +#define TFU_REG(NAME) V3D_TFU_ ## NAME +#else +#define TFU_REG(NAME) V3D_IFC_ ## NAME +#endif + + int v3dX(simulator_submit_tfu_ioctl)(struct v3d_hw *v3d, struct drm_v3d_submit_tfu *args) { - int last_vtct = V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET; - - V3D_WRITE(V3D_TFU_IIA, args->iia); - V3D_WRITE(V3D_TFU_IIS, args->iis); - V3D_WRITE(V3D_TFU_ICA, args->ica); - V3D_WRITE(V3D_TFU_IUA, args->iua); - V3D_WRITE(V3D_TFU_IOA, args->ioa); - V3D_WRITE(V3D_TFU_IOS, args->ios); - V3D_WRITE(V3D_TFU_COEF0, args->coef[0]); - V3D_WRITE(V3D_TFU_COEF1, args->coef[1]); - V3D_WRITE(V3D_TFU_COEF2, args->coef[2]); - V3D_WRITE(V3D_TFU_COEF3, args->coef[3]); - - V3D_WRITE(V3D_TFU_ICFG, args->icfg); - - while ((V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET) == last_vtct) { + int last_vtct = V3D_READ(TFU_REG(CS)) & TFU_REG(CS_CVTCT_SET); + + V3D_WRITE(TFU_REG(IIA), args->iia); + V3D_WRITE(TFU_REG(IIS), args->iis); + V3D_WRITE(TFU_REG(ICA), args->ica); + V3D_WRITE(TFU_REG(IUA), args->iua); + V3D_WRITE(TFU_REG(IOA), args->ioa); +#if V3D_VERSION >= 71 + V3D_WRITE(TFU_REG(IOC), args->v71.ioc); +#endif + V3D_WRITE(TFU_REG(IOS), args->ios); + V3D_WRITE(TFU_REG(COEF0), args->coef[0]); + V3D_WRITE(TFU_REG(COEF1), args->coef[1]); + V3D_WRITE(TFU_REG(COEF2), args->coef[2]); + V3D_WRITE(TFU_REG(COEF3), args->coef[3]); + + V3D_WRITE(TFU_REG(ICFG), args->icfg); + + while ((V3D_READ(TFU_REG(CS)) & TFU_REG(CS_CVTCT_SET)) == last_vtct) { v3d_hw_tick(v3d); } return 0; } -#if V3D_VERSION >= 41 int 
v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d, struct drm_v3d_submit_csd *args, uint32_t gmp_ofs) { +#if V3D_VERSION >= 42 int last_completed_jobs = (V3D_READ(V3D_CSD_0_STATUS) & V3D_CSD_0_STATUS_NUM_COMPLETED_JOBS_SET); g_gmp_ofs = gmp_ofs; @@ -223,6 +225,9 @@ v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d, V3D_WRITE(V3D_CSD_0_QUEUED_CFG4, args->cfg[4]); V3D_WRITE(V3D_CSD_0_QUEUED_CFG5, args->cfg[5]); V3D_WRITE(V3D_CSD_0_QUEUED_CFG6, args->cfg[6]); +#if V3D_VERSION >= 71 + V3D_WRITE(V3D_CSD_0_QUEUED_CFG7, 0); +#endif /* CFG0 kicks off the job */ V3D_WRITE(V3D_CSD_0_QUEUED_CFG0, args->cfg[0]); @@ -239,15 +244,21 @@ v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d, v3d_flush_caches(v3d); return 0; -} +#else + return -1; #endif +} int v3dX(simulator_get_param_ioctl)(struct v3d_hw *v3d, struct drm_v3d_get_param *args) { static const uint32_t reg_map[] = { +#if V3D_VERSION >= 71 + [DRM_V3D_PARAM_V3D_UIFCFG] = V3D_HUB_CTL_IDENT0, +#else [DRM_V3D_PARAM_V3D_UIFCFG] = V3D_HUB_CTL_UIFCFG, +#endif [DRM_V3D_PARAM_V3D_HUB_IDENT1] = V3D_HUB_CTL_IDENT1, [DRM_V3D_PARAM_V3D_HUB_IDENT2] = V3D_HUB_CTL_IDENT2, [DRM_V3D_PARAM_V3D_HUB_IDENT3] = V3D_HUB_CTL_IDENT3, @@ -261,14 +272,20 @@ v3dX(simulator_get_param_ioctl)(struct v3d_hw *v3d, args->value = 1; return 0; case DRM_V3D_PARAM_SUPPORTS_CSD: - args->value = V3D_VERSION >= 41; + args->value = V3D_VERSION >= 42; return 0; case DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH: args->value = 1; return 0; case DRM_V3D_PARAM_SUPPORTS_PERFMON: - args->value = V3D_VERSION >= 41; + args->value = V3D_VERSION >= 42; return 0; + case DRM_V3D_PARAM_SUPPORTS_MULTISYNC_EXT: + args->value = 1; + return 0; + case DRM_V3D_PARAM_SUPPORTS_CPU_QUEUE: + args->value = 1; + return 0; } if (args->param < ARRAY_SIZE(reg_map) && reg_map[args->param]) { @@ -307,16 +324,17 @@ v3d_isr_core(struct v3d_hw *v3d, return; } +#if V3D_VERSION <= 42 if (core_status & V3D_CTL_0_INT_STS_INT_GMPV_SET) { fprintf(stderr, "GMP violation at 0x%08x\n", V3D_READ(V3D_GMP_VIO_ADDR)); - abort(); } else { fprintf(stderr, "Unexpected ISR with core status 0x%08x\n", core_status); } abort(); +#endif } static void @@ -331,11 +349,10 @@ handle_mmu_interruptions(struct v3d_hw *v3d, return; const char *client = "?"; - uint32_t axi_id = V3D_READ(V3D_MMU_VIO_ID); + uint32_t axi_id = V3D_READ(V3D_MMU0_VIO_ID); uint32_t va_width = 30; -#if V3D_VERSION >= 41 - static const char *const v3d41_axi_ids[] = { + static const char *const v3d42_axi_ids[] = { "L2T", "PTB", "PSE", @@ -347,21 +364,21 @@ handle_mmu_interruptions(struct v3d_hw *v3d, }; axi_id = axi_id >> 5; - if (axi_id < ARRAY_SIZE(v3d41_axi_ids)) - client = v3d41_axi_ids[axi_id]; + if (axi_id < ARRAY_SIZE(v3d42_axi_ids)) + client = v3d42_axi_ids[axi_id]; - uint32_t mmu_debug = V3D_READ(V3D_MMU_DEBUG_INFO); + uint32_t mmu_debug = V3D_READ(V3D_MMU0_DEBUG_INFO); + + va_width += ((mmu_debug & V3D_MMU0_DEBUG_INFO_VA_WIDTH_SET) + >> V3D_MMU0_DEBUG_INFO_VA_WIDTH_LSB); - va_width += ((mmu_debug & V3D_MMU_DEBUG_INFO_VA_WIDTH_SET) - >> V3D_MMU_DEBUG_INFO_VA_WIDTH_LSB); -#endif /* Only the top bits (final number depends on the gen) of the virtual * address are reported in the MMU VIO_ADDR register. 
*/ - uint64_t vio_addr = ((uint64_t)V3D_READ(V3D_MMU_VIO_ADDR) << + uint64_t vio_addr = ((uint64_t)V3D_READ(V3D_MMU0_VIO_ADDR) << (va_width - 32)); - /* Difference with the kernal: here were are going to abort after + /* Difference with the kernel: here were are going to abort after * logging, so we don't bother with some stuff that the kernel does, * like restoring the MMU ctrl bits */ @@ -393,6 +410,18 @@ v3d_isr_hub(struct v3d_hw *v3d) } handle_mmu_interruptions(v3d, hub_status); + +#if V3D_VERSION == 71 + if (hub_status & V3D_HUB_CTL_INT_STS_INT_GMPV_SET) { + fprintf(stderr, "GMP violation at 0x%08x\n", + V3D_READ(V3D_GMP_VIO_ADDR)); + } else { + fprintf(stderr, + "Unexpected ISR with status 0x%08x\n", + hub_status); + } + abort(); +#endif } static void @@ -417,24 +446,15 @@ v3d_isr(uint32_t hub_status) void v3dX(simulator_init_regs)(struct v3d_hw *v3d) { -#if V3D_VERSION == 33 - /* Set OVRTMUOUT to match kernel behavior. - * - * This means that the texture sampler uniform configuration's tmu - * output type field is used, instead of using the hardware default - * behavior based on the texture type. If you want the default - * behavior, you can still put "2" in the indirect texture state's - * output_type field. - */ - V3D_WRITE(V3D_CTL_0_MISCCFG, V3D_CTL_1_MISCCFG_OVRTMUOUT_SET); -#endif - /* FIXME: the kernel captures some additional core interrupts here, * for tracing. Perhaps we should evaluate to do the same here and add * some debug options. */ - uint32_t core_interrupts = (V3D_CTL_0_INT_STS_INT_GMPV_SET | - V3D_CTL_0_INT_STS_INT_OUTOMEM_SET); + uint32_t core_interrupts = V3D_CTL_0_INT_STS_INT_OUTOMEM_SET; +#if V3D_VERSION <= 42 + core_interrupts |= V3D_CTL_0_INT_STS_INT_GMPV_SET; +#endif + V3D_WRITE(V3D_CTL_0_INT_MSK_SET, ~core_interrupts); V3D_WRITE(V3D_CTL_0_INT_MSK_CLR, core_interrupts); @@ -444,6 +464,9 @@ v3dX(simulator_init_regs)(struct v3d_hw *v3d) V3D_HUB_CTL_INT_STS_INT_MMU_CAP_SET | /* CAP exceeded */ V3D_HUB_CTL_INT_STS_INT_TFUC_SET); /* TFU conversion */ +#if V3D_VERSION == 71 + hub_interrupts |= V3D_HUB_CTL_INT_STS_INT_GMPV_SET; +#endif V3D_WRITE(V3D_HUB_CTL_INT_MSK_SET, ~hub_interrupts); V3D_WRITE(V3D_HUB_CTL_INT_MSK_CLR, hub_interrupts); @@ -471,13 +494,11 @@ v3dX(simulator_submit_cl_ioctl)(struct v3d_hw *v3d, V3D_WRITE(V3D_CLE_0_CT0QMA, submit->qma); V3D_WRITE(V3D_CLE_0_CT0QMS, submit->qms); } -#if V3D_VERSION >= 41 if (submit->qts) { V3D_WRITE(V3D_CLE_0_CT0QTS, V3D_CLE_0_CT0QTS_CTQTSEN_SET | submit->qts); } -#endif V3D_WRITE(V3D_CLE_0_CT0QBA, submit->bcl_start); V3D_WRITE(V3D_CLE_0_CT0QEA, submit->bcl_end); @@ -501,20 +522,18 @@ v3dX(simulator_submit_cl_ioctl)(struct v3d_hw *v3d, } } -#if V3D_VERSION >= 41 #define V3D_PCTR_0_PCTR_N(x) (V3D_PCTR_0_PCTR0 + 4 * (x)) #define V3D_PCTR_0_SRC_N(x) (V3D_PCTR_0_SRC_0_3 + 4 * (x)) #define V3D_PCTR_0_SRC_N_SHIFT(x) ((x) * 8) #define V3D_PCTR_0_SRC_N_MASK(x) (BITFIELD_RANGE(V3D_PCTR_0_SRC_N_SHIFT(x), \ - V3D_PCTR_0_SRC_N_SHIFT(x) + 6)) -#endif + V3D_PCTR_0_SRC_N_SHIFT(x) + \ + V3D_PCTR_0_SRC_0_3_PCTRS0_MSB)) void v3dX(simulator_perfmon_start)(struct v3d_hw *v3d, uint32_t ncounters, uint8_t *events) { -#if V3D_VERSION >= 41 int i, j; uint32_t source; uint32_t mask = BITFIELD_RANGE(0, ncounters); @@ -529,21 +548,23 @@ v3dX(simulator_perfmon_start)(struct v3d_hw *v3d, V3D_WRITE(V3D_PCTR_0_CLR, mask); V3D_WRITE(V3D_PCTR_0_OVERFLOW, mask); V3D_WRITE(V3D_PCTR_0_EN, mask); -#endif } void v3dX(simulator_perfmon_stop)(struct v3d_hw *v3d, uint32_t ncounters, uint64_t *values) { -#if V3D_VERSION >= 41 int i; for (i = 0; i < ncounters; 
i++) values[i] += V3D_READ(V3D_PCTR_0_PCTR_N(i)); V3D_WRITE(V3D_PCTR_0_EN, 0); -#endif +} + +void v3dX(simulator_get_perfcnt_total)(uint32_t *count) +{ + *count = ARRAY_SIZE(v3d_performance_counters); } #endif /* USE_V3D_SIMULATOR */ diff --git a/src/broadcom/simulator/v3dx_simulator.h b/src/broadcom/simulator/v3dx_simulator.h index 145ae59c21e..51fc2409d3e 100644 --- a/src/broadcom/simulator/v3dx_simulator.h +++ b/src/broadcom/simulator/v3dx_simulator.h @@ -1,5 +1,5 @@ /* - * Copyright © 2019 Raspberry Pi + * Copyright © 2019 Raspberry Pi Ltd * Copyright © 2014-2017 Broadcom * Copyright (C) 2012 Rob Clark <robclark@freedesktop.org> * @@ -50,3 +50,4 @@ void v3dX(simulator_perfmon_start)(struct v3d_hw *v3d, void v3dX(simulator_perfmon_stop)(struct v3d_hw *v3d, uint32_t ncounters, uint64_t *values); +void v3dX(simulator_get_perfcnt_total)(uint32_t *count); diff --git a/src/broadcom/vulkan/meson.build b/src/broadcom/vulkan/meson.build index 9d2593cf6d2..3f04a4162dc 100644 --- a/src/broadcom/vulkan/meson.build +++ b/src/broadcom/vulkan/meson.build @@ -1,4 +1,4 @@ -# Copyright © 2019 Raspberry Pi +# Copyright © 2019 Raspberry Pi Ltd # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -25,7 +25,9 @@ v3dv_entrypoints = custom_target( command : [ prog_python, '@INPUT0@', '--xml', '@INPUT1@', '--proto', '--weak', '--out-h', '@OUTPUT0@', '--out-c', '@OUTPUT1@', '--prefix', 'v3dv', + '--beta', with_vulkan_beta.to_string(), '--device-prefix', 'ver42', + '--device-prefix', 'ver71', ], depend_files : vk_entrypoints_gen_depend_files, ) @@ -38,6 +40,7 @@ libv3dv_files = files( 'v3dv_debug.h', 'v3dv_descriptor_set.c', 'v3dv_device.c', + 'v3dv_event.c', 'v3dv_formats.c', 'v3dv_image.c', 'v3dv_limits.h', @@ -50,9 +53,8 @@ libv3dv_files = files( 'v3dv_query.c', 'v3dv_queue.c', 'v3dv_uniforms.c', - 'v3dv_util.c', 'v3dv_wsi.c', -) +) + [v3d_xml_pack] files_per_version = files( 'v3dvx_cmd_buffer.c', @@ -63,18 +65,16 @@ files_per_version = files( 'v3dvx_pipeline.c', 'v3dvx_meta_common.c', 'v3dvx_pipeline.c', + 'v3dvx_query.c', 'v3dvx_queue.c', ) -# The vulkan driver only supports version >= 42, which is the version present in -# Rpi4. We need to explicitly set it as we are reusing pieces from the GL v3d -# driver. 
-v3d_versions = ['42'] +v3d_versions = ['42', '71'] v3dv_flags = [] -dep_v3dv3 = dependency('v3dv3', required : false) -if dep_v3dv3.found() +dep_v3d_hw = dependency('v3d_hw', required : false) +if dep_v3d_hw.found() v3dv_flags += '-DUSE_V3D_SIMULATOR' endif @@ -82,31 +82,27 @@ v3dv_deps = [ dep_dl, dep_libdrm, dep_valgrind, - dep_v3dv3, + dep_v3d_hw, idep_nir, idep_nir_headers, idep_vulkan_util, + idep_vulkan_runtime, + idep_vulkan_wsi, ] if with_platform_x11 v3dv_deps += dep_xcb_dri3 - v3dv_flags += [ - '-DVK_USE_PLATFORM_XCB_KHR', - '-DVK_USE_PLATFORM_XLIB_KHR', - ] - libv3dv_files += files('v3dv_wsi_x11.c') endif if with_platform_wayland - v3dv_deps += [dep_wayland_client, dep_wl_protocols] - v3dv_flags += '-DVK_USE_PLATFORM_WAYLAND_KHR' - libv3dv_files += files('v3dv_wsi_wayland.c') + v3dv_deps += dep_wayland_client libv3dv_files += [wayland_drm_client_protocol_h, wayland_drm_protocol_c] endif -if system_has_kms_drm and not with_platform_android - v3dv_flags += '-DVK_USE_PLATFORM_DISPLAY_KHR' - libv3dv_files += files('v3dv_wsi_display.c') +if with_platform_android + v3dv_deps += [dep_android, idep_u_gralloc] + v3dv_flags += '-DVK_USE_PLATFORM_ANDROID_KHR' + libv3dv_files += files('v3dv_android.c') endif per_version_libs = [] @@ -115,8 +111,8 @@ foreach ver : v3d_versions 'v3dv-v' + ver, [files_per_version, v3d_xml_pack, v3dv_entrypoints[0]], include_directories : [ - inc_src, inc_include, inc_gallium, inc_gallium_aux, inc_broadcom, - inc_compiler, inc_util, inc_vulkan_wsi, + inc_src, inc_include, inc_broadcom, + inc_util, ], c_args : [v3dv_flags, '-DV3D_VERSION=' + ver], gnu_symbol_visibility : 'hidden', @@ -128,17 +124,17 @@ libvulkan_broadcom = shared_library( 'vulkan_broadcom', [libv3dv_files, v3dv_entrypoints, sha1_h], include_directories : [ - inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom, inc_compiler, inc_util, inc_vulkan_wsi, + inc_include, inc_src, inc_broadcom, inc_util, ], link_with : [ libbroadcom_cle, libbroadcom_v3d, - libvulkan_wsi, per_version_libs, ], dependencies : v3dv_deps, c_args : v3dv_flags, - link_args : ['-Wl,--build-id=sha1', ld_args_bsymbolic, ld_args_gc_sections], + link_args : [vulkan_icd_link_args, '-Wl,--build-id=sha1', ld_args_bsymbolic, ld_args_gc_sections], + link_depends : vulkan_icd_link_depends, gnu_symbol_visibility : 'hidden', install : true, ) @@ -162,12 +158,31 @@ broadcom_icd = custom_target( output : 'broadcom_icd.@0@.json'.format(host_machine.cpu()), command : [ prog_python, '@INPUT0@', - '--api-version', '1.0', '--xml', '@INPUT1@', + '--api-version', '1.2', '--xml', '@INPUT1@', '--lib-path', join_paths(get_option('prefix'), get_option('libdir'), 'libvulkan_broadcom.so'), '--out', '@OUTPUT@', ], build_by_default : true, install_dir : with_vulkan_icd_dir, + install_tag : 'runtime', install : true, ) + +_dev_icdname = 'broadcom_devenv_icd.@0@.json'.format(host_machine.cpu()) +_dev_icd = custom_target( + 'broadcom_devenv_icd', + input : [vk_icd_gen, vk_api_xml], + output : _dev_icdname, + command : [ + prog_python, '@INPUT0@', + '--api-version', '1.3', '--xml', '@INPUT1@', + '--lib-path', meson.current_build_dir() / 'libvulkan_broadcom.so', + '--out', '@OUTPUT@', + ], + build_by_default : true, +) + +devenv.append('VK_DRIVER_FILES', _dev_icd.full_path()) +# Deprecated: replaced by VK_DRIVER_FILES above +devenv.append('VK_ICD_FILENAMES', _dev_icd.full_path()) diff --git a/src/broadcom/vulkan/v3dv_android.c b/src/broadcom/vulkan/v3dv_android.c new file mode 100644 index 00000000000..afb691e55d0 --- /dev/null 
+++ b/src/broadcom/vulkan/v3dv_android.c @@ -0,0 +1,544 @@ +/* + * Copyright © 2017, Google Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include "v3dv_private.h" +#include <hardware/gralloc.h> + +#if ANDROID_API_LEVEL >= 26 +#include <hardware/gralloc1.h> +#endif + +#include "drm-uapi/drm_fourcc.h" +#include <hardware/hardware.h> +#include <hardware/hwvulkan.h> + +#include <vulkan/vk_android_native_buffer.h> +#include <vulkan/vk_icd.h> + +#include "vk_android.h" +#include "vk_enum_defines.h" + +#include "util/libsync.h" +#include "util/log.h" +#include "util/os_file.h" + +static int +v3dv_hal_open(const struct hw_module_t *mod, + const char *id, + struct hw_device_t **dev); +static int +v3dv_hal_close(struct hw_device_t *dev); + +static_assert(HWVULKAN_DISPATCH_MAGIC == ICD_LOADER_MAGIC, ""); + +PUBLIC struct hwvulkan_module_t HAL_MODULE_INFO_SYM = { + .common = + { + .tag = HARDWARE_MODULE_TAG, + .module_api_version = HWVULKAN_MODULE_API_VERSION_0_1, + .hal_api_version = HARDWARE_MAKE_API_VERSION(1, 0), + .id = HWVULKAN_HARDWARE_MODULE_ID, + .name = "Broadcom Vulkan HAL", + .author = "Mesa3D", + .methods = + &(hw_module_methods_t) { + .open = v3dv_hal_open, + }, + }, +}; + +/* If any bits in test_mask are set, then unset them and return true. */ +static inline bool +unmask32(uint32_t *inout_mask, uint32_t test_mask) +{ + uint32_t orig_mask = *inout_mask; + *inout_mask &= ~test_mask; + return *inout_mask != orig_mask; +} + +static int +v3dv_hal_open(const struct hw_module_t *mod, + const char *id, + struct hw_device_t **dev) +{ + assert(mod == &HAL_MODULE_INFO_SYM.common); + assert(strcmp(id, HWVULKAN_DEVICE_0) == 0); + + hwvulkan_device_t *hal_dev = malloc(sizeof(*hal_dev)); + if (!hal_dev) + return -1; + + *hal_dev = (hwvulkan_device_t){ + .common = + { + .tag = HARDWARE_DEVICE_TAG, + .version = HWVULKAN_DEVICE_API_VERSION_0_1, + .module = &HAL_MODULE_INFO_SYM.common, + .close = v3dv_hal_close, + }, + .EnumerateInstanceExtensionProperties = + v3dv_EnumerateInstanceExtensionProperties, + .CreateInstance = v3dv_CreateInstance, + .GetInstanceProcAddr = v3dv_GetInstanceProcAddr, + }; + + mesa_logi("v3dv: Warning: Android Vulkan implementation is experimental"); + + *dev = &hal_dev->common; + return 0; +} + +static int +v3dv_hal_close(struct hw_device_t *dev) +{ + /* hwvulkan.h claims that hw_device_t::close() is never called. 
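+    * The Android loader is expected to keep the HAL device open for the
+    * lifetime of the process, so in practice this error return should never
+    * be observed.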
*/ + return -1; +} + +VkResult +v3dv_gralloc_to_drm_explicit_layout(struct u_gralloc *gralloc, + struct u_gralloc_buffer_handle *in_hnd, + VkImageDrmFormatModifierExplicitCreateInfoEXT *out, + VkSubresourceLayout *out_layouts, + int max_planes) +{ + struct u_gralloc_buffer_basic_info info; + + if (u_gralloc_get_buffer_basic_info(gralloc, in_hnd, &info) != 0) + return VK_ERROR_INVALID_EXTERNAL_HANDLE; + + if (info.num_planes > max_planes) + return VK_ERROR_INVALID_EXTERNAL_HANDLE; + + bool is_disjoint = false; + for (int i = 1; i < info.num_planes; i++) { + if (info.offsets[i] == 0) { + is_disjoint = true; + break; + } + } + + if (is_disjoint) { + /* We don't support disjoint planes yet */ + return VK_ERROR_INVALID_EXTERNAL_HANDLE; + } + + memset(out_layouts, 0, sizeof(*out_layouts) * info.num_planes); + memset(out, 0, sizeof(*out)); + + out->sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT; + out->pPlaneLayouts = out_layouts; + + out->drmFormatModifier = info.modifier; + out->drmFormatModifierPlaneCount = info.num_planes; + for (int i = 0; i < info.num_planes; i++) { + out_layouts[i].offset = info.offsets[i]; + out_layouts[i].rowPitch = info.strides[i]; + } + + if (info.drm_fourcc == DRM_FORMAT_YVU420) { + /* Swap the U and V planes to match the VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM */ + VkSubresourceLayout tmp = out_layouts[1]; + out_layouts[1] = out_layouts[2]; + out_layouts[2] = tmp; + } + + return VK_SUCCESS; +} + +VkResult +v3dv_import_native_buffer_fd(VkDevice device_h, + int native_buffer_fd, + const VkAllocationCallbacks *alloc, + VkImage image_h) +{ + VkResult result; + + VkDeviceMemory memory_h; + + const VkMemoryDedicatedAllocateInfo ded_alloc = { + .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO, + .pNext = NULL, + .buffer = VK_NULL_HANDLE, + .image = image_h + }; + + const VkImportMemoryFdInfoKHR import_info = { + .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR, + .pNext = &ded_alloc, + .handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT, + .fd = os_dupfd_cloexec(native_buffer_fd), + }; + + result = + v3dv_AllocateMemory(device_h, + &(VkMemoryAllocateInfo) { + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .pNext = &import_info, + .allocationSize = lseek(native_buffer_fd, 0, SEEK_END), + .memoryTypeIndex = 0, + }, + alloc, &memory_h); + + if (result != VK_SUCCESS) + goto fail_create_image; + + VkBindImageMemoryInfo bind_info = { + .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO, + .image = image_h, + .memory = memory_h, + .memoryOffset = 0, + }; + v3dv_BindImageMemory2(device_h, 1, &bind_info); + + return VK_SUCCESS; + +fail_create_image: + close(import_info.fd); + + return result; +} + +static VkResult +format_supported_with_usage(VkDevice device_h, + VkFormat format, + VkImageUsageFlags imageUsage) +{ + V3DV_FROM_HANDLE(v3dv_device, device, device_h); + struct v3dv_physical_device *phys_dev = device->pdevice; + VkPhysicalDevice phys_dev_h = v3dv_physical_device_to_handle(phys_dev); + VkResult result; + + const VkPhysicalDeviceImageFormatInfo2 image_format_info = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2, + .format = format, + .type = VK_IMAGE_TYPE_2D, + .tiling = VK_IMAGE_TILING_OPTIMAL, + .usage = imageUsage, + }; + + VkImageFormatProperties2 image_format_props = { + .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2, + }; + + /* Check that requested format and usage are supported. 
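+    * We check with VK_IMAGE_TYPE_2D and VK_IMAGE_TILING_OPTIMAL because that
+    * is how gralloc swapchain images will be created; if the driver rejects
+    * that combination we can fail the gralloc usage queries early.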
    */
+   result = v3dv_GetPhysicalDeviceImageFormatProperties2(
+      phys_dev_h, &image_format_info, &image_format_props);
+   if (result != VK_SUCCESS) {
+      return vk_errorf(device, result,
+                       "v3dv_GetPhysicalDeviceImageFormatProperties2 failed "
+                       "inside %s",
+                       __func__);
+   }
+
+   return VK_SUCCESS;
+}
+
+static VkResult
+setup_gralloc0_usage(struct v3dv_device *device,
+                     VkFormat format,
+                     VkImageUsageFlags imageUsage,
+                     int *grallocUsage)
+{
+   if (unmask32(&imageUsage, VK_IMAGE_USAGE_TRANSFER_DST_BIT |
+                             VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT))
+      *grallocUsage |= GRALLOC_USAGE_HW_RENDER;
+
+   if (unmask32(&imageUsage, VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
+                             VK_IMAGE_USAGE_SAMPLED_BIT |
+                             VK_IMAGE_USAGE_STORAGE_BIT |
+                             VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT))
+      *grallocUsage |= GRALLOC_USAGE_HW_TEXTURE;
+
+   /* All VkImageUsageFlags not explicitly checked here are unsupported for
+    * gralloc swapchains.
+    */
+   if (imageUsage != 0) {
+      return vk_errorf(device, VK_ERROR_FORMAT_NOT_SUPPORTED,
+                       "unsupported VkImageUsageFlags(0x%x) for gralloc "
+                       "swapchain",
+                       imageUsage);
+   }
+
+   /* Swapchains assume direct display, so enable the COMPOSER flag. If the
+    * format is not supported by the display controller, gralloc will drop
+    * this flag and still allocate the buffer in VRAM.
+    */
+   *grallocUsage |= GRALLOC_USAGE_HW_COMPOSER;
+
+   if (*grallocUsage == 0)
+      return VK_ERROR_FORMAT_NOT_SUPPORTED;
+
+   return VK_SUCCESS;
+}
+
+VKAPI_ATTR VkResult VKAPI_CALL
+v3dv_GetSwapchainGrallocUsageANDROID(VkDevice device_h,
+                                     VkFormat format,
+                                     VkImageUsageFlags imageUsage,
+                                     int *grallocUsage)
+{
+   V3DV_FROM_HANDLE(v3dv_device, device, device_h);
+   VkResult result;
+
+   result = format_supported_with_usage(device_h, format, imageUsage);
+   if (result != VK_SUCCESS)
+      return result;
+
+   *grallocUsage = 0;
+   return setup_gralloc0_usage(device, format, imageUsage, grallocUsage);
+}
+
+#if ANDROID_API_LEVEL >= 26
+VKAPI_ATTR VkResult VKAPI_CALL
+v3dv_GetSwapchainGrallocUsage2ANDROID(
+   VkDevice device_h,
+   VkFormat format,
+   VkImageUsageFlags imageUsage,
+   VkSwapchainImageUsageFlagsANDROID swapchainImageUsage,
+   uint64_t *grallocConsumerUsage,
+   uint64_t *grallocProducerUsage)
+{
+   V3DV_FROM_HANDLE(v3dv_device, device, device_h);
+   VkResult result;
+
+   *grallocConsumerUsage = 0;
+   *grallocProducerUsage = 0;
+   mesa_logd("%s: format=%d, usage=0x%x", __func__, format, imageUsage);
+
+   result = format_supported_with_usage(device_h, format, imageUsage);
+   if (result != VK_SUCCESS)
+      return result;
+
+   int32_t grallocUsage = 0;
+   result = setup_gralloc0_usage(device, format, imageUsage, &grallocUsage);
+   if (result != VK_SUCCESS)
+      return result;
+
+   /* Set up gralloc1 usage flags from the gralloc0 flags.
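+    * Note the mapping below is one-to-many: HW_RENDER maps to a producer
+    * usage while HW_TEXTURE and HW_COMPOSER map to consumer usages, with
+    * HW_COMPOSER covering both GPU and hardware composition.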
*/ + + if (grallocUsage & GRALLOC_USAGE_HW_RENDER) { + *grallocProducerUsage |= GRALLOC1_PRODUCER_USAGE_GPU_RENDER_TARGET; + } + + if (grallocUsage & GRALLOC_USAGE_HW_TEXTURE) { + *grallocConsumerUsage |= GRALLOC1_CONSUMER_USAGE_GPU_TEXTURE; + } + + if (grallocUsage & GRALLOC_USAGE_HW_COMPOSER) { + /* GPU composing case */ + *grallocConsumerUsage |= GRALLOC1_CONSUMER_USAGE_GPU_TEXTURE; + /* Hardware composing case */ + *grallocConsumerUsage |= GRALLOC1_CONSUMER_USAGE_HWCOMPOSER; + } + + if (swapchainImageUsage & VK_SWAPCHAIN_IMAGE_USAGE_SHARED_BIT_ANDROID) { + uint64_t front_rendering_usage = 0; + u_gralloc_get_front_rendering_usage(device->gralloc, &front_rendering_usage); + *grallocProducerUsage |= front_rendering_usage; + } + + return VK_SUCCESS; +} +#endif + +/* ----------------------------- AHardwareBuffer --------------------------- */ + +static VkResult +get_ahb_buffer_format_properties2(VkDevice device_h, const struct AHardwareBuffer *buffer, + VkAndroidHardwareBufferFormatProperties2ANDROID *pProperties) +{ + V3DV_FROM_HANDLE(v3dv_device, device, device_h); + + /* Get a description of buffer contents . */ + AHardwareBuffer_Desc desc; + AHardwareBuffer_describe(buffer, &desc); + + /* Verify description. */ + const uint64_t gpu_usage = AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE | + AHARDWAREBUFFER_USAGE_GPU_COLOR_OUTPUT | + AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER; + + /* "Buffer must be a valid Android hardware buffer object with at least + * one of the AHARDWAREBUFFER_USAGE_GPU_* usage flags." + */ + if (!(desc.usage & (gpu_usage))) + return VK_ERROR_INVALID_EXTERNAL_HANDLE; + + /* Fill properties fields based on description. */ + VkAndroidHardwareBufferFormatProperties2ANDROID *p = pProperties; + + p->samplerYcbcrConversionComponents.r = VK_COMPONENT_SWIZZLE_IDENTITY; + p->samplerYcbcrConversionComponents.g = VK_COMPONENT_SWIZZLE_IDENTITY; + p->samplerYcbcrConversionComponents.b = VK_COMPONENT_SWIZZLE_IDENTITY; + p->samplerYcbcrConversionComponents.a = VK_COMPONENT_SWIZZLE_IDENTITY; + + p->suggestedYcbcrModel = VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_601; + p->suggestedYcbcrRange = VK_SAMPLER_YCBCR_RANGE_ITU_FULL; + + p->suggestedXChromaOffset = VK_CHROMA_LOCATION_MIDPOINT; + p->suggestedYChromaOffset = VK_CHROMA_LOCATION_MIDPOINT; + + VkFormatProperties2 format_properties = {.sType = VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2}; + + p->format = vk_ahb_format_to_image_format(desc.format); + + VkFormat external_format = p->format; + + if (p->format != VK_FORMAT_UNDEFINED) + goto finish; + + /* External format only case + * + * From vkGetAndroidHardwareBufferPropertiesANDROID spec: + * "If the Android hardware buffer has one of the formats listed in the Format + * Equivalence table (see spec.), then format must have the equivalent Vulkan + * format listed in the table. Otherwise, format may be VK_FORMAT_UNDEFINED, + * indicating the Android hardware buffer can only be used with an external format." + * + * From SKIA source code analysis: p->format MUST be VK_FORMAT_UNDEFINED, if the + * format is not in the Equivalence table. 
+    */
+
+   struct u_gralloc_buffer_handle gr_handle = {
+      .handle = AHardwareBuffer_getNativeHandle(buffer),
+      .pixel_stride = desc.stride,
+      .hal_format = desc.format,
+   };
+
+   struct u_gralloc_buffer_basic_info info;
+
+   if (u_gralloc_get_buffer_basic_info(device->gralloc, &gr_handle, &info) != 0)
+      return VK_ERROR_INVALID_EXTERNAL_HANDLE;
+
+   switch (info.drm_fourcc) {
+   case DRM_FORMAT_YVU420:
+      /* Assuming that U and V planes are swapped earlier */
+      external_format = VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM;
+      break;
+   case DRM_FORMAT_NV12:
+      external_format = VK_FORMAT_G8_B8R8_2PLANE_420_UNORM;
+      break;
+   default:
+      mesa_loge("Unsupported external DRM format: %d", info.drm_fourcc);
+      return VK_ERROR_INVALID_EXTERNAL_HANDLE;
+   }
+
+   struct u_gralloc_buffer_color_info color_info;
+   if (u_gralloc_get_buffer_color_info(device->gralloc, &gr_handle, &color_info) == 0) {
+      switch (color_info.yuv_color_space) {
+      case __DRI_YUV_COLOR_SPACE_ITU_REC601:
+         p->suggestedYcbcrModel = VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_601;
+         break;
+      case __DRI_YUV_COLOR_SPACE_ITU_REC709:
+         p->suggestedYcbcrModel = VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_709;
+         break;
+      case __DRI_YUV_COLOR_SPACE_ITU_REC2020:
+         p->suggestedYcbcrModel = VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_2020;
+         break;
+      default:
+         break;
+      }
+
+      p->suggestedYcbcrRange = (color_info.sample_range == __DRI_YUV_NARROW_RANGE) ?
+         VK_SAMPLER_YCBCR_RANGE_ITU_NARROW : VK_SAMPLER_YCBCR_RANGE_ITU_FULL;
+      p->suggestedXChromaOffset = (color_info.horizontal_siting == __DRI_YUV_CHROMA_SITING_0_5) ?
+         VK_CHROMA_LOCATION_MIDPOINT : VK_CHROMA_LOCATION_COSITED_EVEN;
+      p->suggestedYChromaOffset = (color_info.vertical_siting == __DRI_YUV_CHROMA_SITING_0_5) ?
+         VK_CHROMA_LOCATION_MIDPOINT : VK_CHROMA_LOCATION_COSITED_EVEN;
+   }
+
+finish:
+
+   v3dv_GetPhysicalDeviceFormatProperties2(v3dv_physical_device_to_handle(device->pdevice),
+                                           external_format, &format_properties);
+
+   /* v3dv doesn't support direct sampling from linear images but has logic to
+    * copy from linear to tiled images implicitly before sampling. Therefore
+    * expose optimal features for both linear and optimal tiling.
+    */
+   p->formatFeatures = format_properties.formatProperties.optimalTilingFeatures;
+   p->externalFormat = external_format;
+
+   /* From the vkGetAndroidHardwareBufferPropertiesANDROID spec:
+    * "The formatFeatures member *must* include
+    * VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT and at least one of
+    * VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT or
+    * VK_FORMAT_FEATURE_2_COSITED_CHROMA_SAMPLES_BIT"
+    */
+   p->formatFeatures |= VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT;
+
+   return VK_SUCCESS;
+}
+
+VkResult
+v3dv_GetAndroidHardwareBufferPropertiesANDROID(VkDevice device_h,
+                                               const struct AHardwareBuffer *buffer,
+                                               VkAndroidHardwareBufferPropertiesANDROID *pProperties)
+{
+   V3DV_FROM_HANDLE(v3dv_device, dev, device_h);
+   struct v3dv_physical_device *pdevice = dev->pdevice;
+
+   VkResult result;
+
+   VkAndroidHardwareBufferFormatPropertiesANDROID *format_prop =
+      vk_find_struct(pProperties->pNext, ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_ANDROID);
+
+   /* Fill format properties of an Android hardware buffer.
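+    * The v1 struct is filled from the result of the v2 query below so that
+    * both pNext variants share the same logic.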
+    */
+   if (format_prop) {
+      VkAndroidHardwareBufferFormatProperties2ANDROID format_prop2 = {
+         .sType = VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_2_ANDROID,
+      };
+      result = get_ahb_buffer_format_properties2(device_h, buffer, &format_prop2);
+      if (result != VK_SUCCESS)
+         return result;
+
+      format_prop->format = format_prop2.format;
+      format_prop->externalFormat = format_prop2.externalFormat;
+      format_prop->formatFeatures =
+         vk_format_features2_to_features(format_prop2.formatFeatures);
+      format_prop->samplerYcbcrConversionComponents =
+         format_prop2.samplerYcbcrConversionComponents;
+      format_prop->suggestedYcbcrModel = format_prop2.suggestedYcbcrModel;
+      format_prop->suggestedYcbcrRange = format_prop2.suggestedYcbcrRange;
+      format_prop->suggestedXChromaOffset = format_prop2.suggestedXChromaOffset;
+      format_prop->suggestedYChromaOffset = format_prop2.suggestedYChromaOffset;
+   }
+
+   VkAndroidHardwareBufferFormatProperties2ANDROID *format_prop2 =
+      vk_find_struct(pProperties->pNext, ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_2_ANDROID);
+   if (format_prop2) {
+      result = get_ahb_buffer_format_properties2(device_h, buffer, format_prop2);
+      if (result != VK_SUCCESS)
+         return result;
+   }
+
+   const native_handle_t *handle = AHardwareBuffer_getNativeHandle(buffer);
+   assert(handle && handle->numFds > 0);
+   pProperties->allocationSize = lseek(handle->data[0], 0, SEEK_END);
+
+   /* All memory types. */
+   pProperties->memoryTypeBits = (1u << pdevice->memory.memoryTypeCount) - 1;
+
+   return VK_SUCCESS;
+}
diff --git a/src/broadcom/vulkan/v3dv_bo.c b/src/broadcom/vulkan/v3dv_bo.c
index 71679ceec27..1b26abec325 100644
--- a/src/broadcom/vulkan/v3dv_bo.c
+++ b/src/broadcom/vulkan/v3dv_bo.c
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2019 Raspberry Pi
+ * Copyright © 2019 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -31,11 +31,12 @@
 /* Default max size of the bo cache, in MB.
 *
- * FIXME: we got this value when testing some apps using the rpi4 with 4GB,
- * but it should depend on the total amount of RAM. But for that we would need
- * to test on real hw with different amount of RAM. Using this value for now.
+ * This value comes from testing different Vulkan applications. Greater values
+ * didn't get any further performance benefit. This looks somewhat small, but
+ * from testing those applications, the main consumers of the bo cache are
+ * the BOs used for the CLs, which are usually small.
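+ * For reference: cl_alloc_bo in v3dv_cl.c rounds CL allocations up to 4 KB
+ * multiples, so a 64 MB cache has room for roughly 16K page-sized CL BOs.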
*/ -#define DEFAULT_MAX_BO_CACHE_SIZE 512 +#define DEFAULT_MAX_BO_CACHE_SIZE 64 /* Discarded to use a V3D_DEBUG for this, as it would mean adding a run-time * check for most of the calls @@ -67,8 +68,8 @@ bo_dump_stats(struct v3dv_device *device) struct timespec time; clock_gettime(CLOCK_MONOTONIC, &time); - fprintf(stderr, " now: %ld\n", - time.tv_sec); + fprintf(stderr, " now: %lld\n", + (long long)time.tv_sec); } if (cache->size_list_size) { @@ -117,8 +118,8 @@ bo_from_cache(struct v3dv_device *device, uint32_t size, const char *name) } bo_remove_from_cache(cache, bo); - bo->name = name; + p_atomic_set(&bo->refcnt, 1); } mtx_unlock(&cache->lock); return bo; @@ -131,28 +132,39 @@ bo_free(struct v3dv_device *device, if (!bo) return true; - if (bo->map) - v3dv_bo_unmap(device, bo); + assert(p_atomic_read(&bo->refcnt) == 0); + assert(bo->map == NULL); + + if (!bo->is_import) { + device->bo_count--; + device->bo_size -= bo->size; + + if (dump_stats) { + fprintf(stderr, "Freed %s%s%dkb:\n", + bo->name ? bo->name : "", + bo->name ? " " : "", + bo->size / 1024); + bo_dump_stats(device); + } + } + + uint32_t handle = bo->handle; + /* Our BO structs are stored in a sparse array in the physical device, + * so we don't want to free the BO pointer, instead we want to reset it + * to 0, to signal that array entry as being free. + * + * We must do the reset before we actually free the BO in the kernel, since + * otherwise there is a chance the application creates another BO in a + * different thread and gets the same array entry, causing a race. + */ + memset(bo, 0, sizeof(*bo)); struct drm_gem_close c; memset(&c, 0, sizeof(c)); - c.handle = bo->handle; + c.handle = handle; int ret = v3dv_ioctl(device->pdevice->render_fd, DRM_IOCTL_GEM_CLOSE, &c); if (ret != 0) - fprintf(stderr, "close object %d: %s\n", bo->handle, strerror(errno)); - - device->bo_count--; - device->bo_size -= bo->size; - - if (dump_stats) { - fprintf(stderr, "Freed %s%s%dkb:\n", - bo->name ? bo->name : "", - bo->name ? 
" " : "", - bo->size / 1024); - bo_dump_stats(device); - } - - vk_free(&device->vk.alloc, bo); + fprintf(stderr, "close object %d: %s\n", handle, strerror(errno)); return ret == 0; } @@ -183,6 +195,7 @@ v3dv_bo_init(struct v3dv_bo *bo, const char *name, bool private) { + p_atomic_set(&bo->refcnt, 1); bo->handle = handle; bo->handle_bit = 1ull << (handle % 64); bo->size = size; @@ -192,9 +205,22 @@ v3dv_bo_init(struct v3dv_bo *bo, bo->name = name; bo->private = private; bo->dumb_handle = -1; + bo->is_import = false; + bo->cl_branch_offset = 0xffffffff; list_inithead(&bo->list_link); } +void +v3dv_bo_init_import(struct v3dv_bo *bo, + uint32_t handle, + uint32_t size, + uint32_t offset, + bool private) +{ + v3dv_bo_init(bo, handle, size, offset, "import", private); + bo->is_import = true; +} + struct v3dv_bo * v3dv_bo_alloc(struct v3dv_device *device, uint32_t size, @@ -218,14 +244,6 @@ v3dv_bo_alloc(struct v3dv_device *device, } } - bo = vk_alloc(&device->vk.alloc, sizeof(struct v3dv_bo), 8, - VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); - - if (!bo) { - fprintf(stderr, "Failed to allocate host memory for BO\n"); - return NULL; - } - retry: ; @@ -244,7 +262,6 @@ v3dv_bo_alloc(struct v3dv_device *device, goto retry; } - vk_free(&device->vk.alloc, bo); fprintf(stderr, "Failed to allocate device memory for BO\n"); return NULL; } @@ -252,6 +269,9 @@ v3dv_bo_alloc(struct v3dv_device *device, assert(create.offset % page_align == 0); assert((create.offset & 0xffffffff) == create.offset); + bo = v3dv_device_lookup_bo(device->pdevice, create.handle); + assert(bo && bo->handle == 0); + v3dv_bo_init(bo, create.handle, size, create.offset, name, private); device->bo_count++; @@ -320,7 +340,7 @@ v3dv_bo_map(struct v3dv_device *device, struct v3dv_bo *bo, uint32_t size) if (!ok) return false; - ok = v3dv_bo_wait(device, bo, PIPE_TIMEOUT_INFINITE); + ok = v3dv_bo_wait(device, bo, OS_TIMEOUT_INFINITE); if (!ok) { fprintf(stderr, "memory wait for map failed\n"); return false; @@ -340,7 +360,7 @@ v3dv_bo_unmap(struct v3dv_device *device, struct v3dv_bo *bo) bo->map_size = 0; } -static boolean +static bool reallocate_size_list(struct v3dv_bo_cache *cache, struct v3dv_device *device, uint32_t size) @@ -400,9 +420,11 @@ v3dv_bo_cache_init(struct v3dv_device *device) fprintf(stderr, "MAX BO CACHE SIZE: %iMB\n", device->bo_cache.max_cache_size); } + mtx_lock(&device->bo_cache.lock); device->bo_cache.max_cache_size *= 1024 * 1024; device->bo_cache.cache_count = 0; device->bo_cache.cache_size = 0; + mtx_unlock(&device->bo_cache.lock); } void @@ -455,6 +477,12 @@ v3dv_bo_free(struct v3dv_device *device, if (!bo) return true; + if (!p_atomic_dec_zero(&bo->refcnt)) + return true; + + if (bo->map) + v3dv_bo_unmap(device, bo); + struct timespec time; struct v3dv_bo_cache *cache = &device->bo_cache; uint32_t page_index = bo->size / 4096 - 1; diff --git a/src/broadcom/vulkan/v3dv_bo.h b/src/broadcom/vulkan/v3dv_bo.h index ab2b8c7356d..5e382817b37 100644 --- a/src/broadcom/vulkan/v3dv_bo.h +++ b/src/broadcom/vulkan/v3dv_bo.h @@ -1,5 +1,5 @@ /* - * Copyright © 2019 Raspberry Pi + * Copyright © 2019 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -39,6 +39,11 @@ struct v3dv_bo { const char *name; + /* In a CL where a BRANCH has been emitted, the offset of the BRANCH + * instruction in the BO. + */ + uint32_t cl_branch_offset; + /** Entry in the linked list of buffers freed, by age. 
 */
 struct list_head time_list;
 /** Entry in the per-page-count linked list of buffers freed (by age). */
 struct list_head size_list;
@@ -52,14 +57,20 @@ struct v3dv_bo {
 */
 bool private;
+ /** If this BO has been imported */
+ bool is_import;
+
 /**
 * If this BO was allocated for a swapchain on the display device, the
 * handle of the dumb BO on that device.
 */
 int32_t dumb_handle;
+
+ int32_t refcnt;
 };
 void v3dv_bo_init(struct v3dv_bo *bo, uint32_t handle, uint32_t size, uint32_t offset, const char *name, bool private);
+void v3dv_bo_init_import(struct v3dv_bo *bo, uint32_t handle, uint32_t size, uint32_t offset, bool private);
 struct v3dv_bo *v3dv_bo_alloc(struct v3dv_device *device, uint32_t size, const char *name, bool private);
diff --git a/src/broadcom/vulkan/v3dv_cl.c b/src/broadcom/vulkan/v3dv_cl.c
index ed11f53c4bb..7d414999e9b 100644
--- a/src/broadcom/vulkan/v3dv_cl.c
+++ b/src/broadcom/vulkan/v3dv_cl.c
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2019 Raspberry Pi
+ * Copyright © 2019 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -27,7 +27,7 @@
 * versions, so we just explicitly set the V3D_VERSION and include v3dx_pack
 * here
 */
-#define V3D_VERSION 33
+#define V3D_VERSION 42
 #include "broadcom/common/v3d_macros.h"
 #include "broadcom/cle/v3dx_pack.h"
@@ -58,6 +58,14 @@ v3dv_cl_destroy(struct v3dv_cl *cl)
 static bool
 cl_alloc_bo(struct v3dv_cl *cl, uint32_t space, bool use_branch)
 {
+   /* If we are growing, double the BO allocation size to reduce the number
+    * of allocations with large command buffers. This has a very significant
+    * impact on the number of draw calls per second reported by vkoverhead.
+    */
+   space = align(space, 4096);
+   if (cl->bo)
+      space = MAX2(cl->bo->size * 2, space);
+
    struct v3dv_bo *bo = v3dv_bo_alloc(cl->job->device, space, "CL", true);
    if (!bo) {
       fprintf(stderr, "failed to allocate memory for command list\n");
@@ -76,6 +84,7 @@ cl_alloc_bo(struct v3dv_cl *cl, uint32_t space, bool use_branch)
    /* Chain to the new BO from the old one if requested */
    if (use_branch && cl->bo) {
+      cl->bo->cl_branch_offset = v3dv_cl_offset(cl);
       cl_emit(cl, BRANCH, branch) {
          branch.address = v3dv_cl_address(bo, 0);
       }
@@ -114,14 +123,18 @@ v3dv_cl_ensure_space_with_branch(struct v3dv_cl *cl, uint32_t space)
    * end with a 'return from sub list' command.
    */
    bool needs_return_from_sub_list = false;
-   if (cl->job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY) {
-      if (cl->size > 0) {
+   if (cl->job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE && cl->size > 0)
       needs_return_from_sub_list = true;
-         space += cl_packet_length(RETURN_FROM_SUB_LIST);
-      }
-   } else {
-      space += cl_packet_length(BRANCH);
-   }
+
+   /*
+    * The CLE processor in the simulator tries to read V3D_CL_MAX_INSTR_SIZE
+    * bytes from the CL for each new instruction. If the last instruction in our
+    * CL is smaller than that, and there are not at least V3D_CL_MAX_INSTR_SIZE
+    * bytes until the end of the BO, it will read out of bounds and possibly
+    * cause a GMP violation interrupt to trigger. Ensure we always have at
+    * least that many bytes available to read with the last instruction.
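+    * (V3D_CL_MAX_INSTR_SIZE is the worst-case encoded size of a single CL
+    * instruction, so padding by that amount keeps the prefetch window of the
+    * final instruction inside the BO.)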
+ */ + space += V3D_CL_MAX_INSTR_SIZE; if (v3dv_cl_offset(cl) + space <= cl->size) return; diff --git a/src/broadcom/vulkan/v3dv_cl.h b/src/broadcom/vulkan/v3dv_cl.h index 68d5acd455b..7e17ac395c4 100644 --- a/src/broadcom/vulkan/v3dv_cl.h +++ b/src/broadcom/vulkan/v3dv_cl.h @@ -1,5 +1,5 @@ /* - * Copyright © 2019 Raspberry Pi + * Copyright © 2019 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -26,7 +26,8 @@ #include "broadcom/cle/v3d_packet_helpers.h" -#include "list.h" +#include "util/list.h" +#include "util/macros.h" struct v3dv_bo; struct v3dv_job; @@ -118,6 +119,13 @@ cl_advance(struct v3dv_cl_out **cl, uint32_t n) } static inline void +cl_advance_and_end(struct v3dv_cl *cl, uint32_t n) +{ + cl->next = (struct v3dv_cl_out *)((char *)(cl->next) + n); + assert(v3dv_cl_offset(cl) <= cl->size); +} + +static inline void cl_aligned_u32(struct v3dv_cl_out **cl, uint32_t n) { *(uint32_t *)(*cl) = n; @@ -143,15 +151,9 @@ cl_aligned_reloc(struct v3dv_cl *cl, uint32_t v3dv_cl_ensure_space(struct v3dv_cl *cl, uint32_t space, uint32_t alignment); void v3dv_cl_ensure_space_with_branch(struct v3dv_cl *cl, uint32_t space); -/* We redefine ALIGN as a macro as we want to use cl_aligned_packet_length for - * struct fields - */ -#define ALIGN(value, alignment) \ - (((value) + (alignment) - 1) & ~((alignment) - 1)) - #define cl_packet_header(packet) V3DX(packet ## _header) #define cl_packet_length(packet) V3DX(packet ## _length) -#define cl_aligned_packet_length(packet, alignment) ALIGN(cl_packet_length(packet), alignment) +#define cl_aligned_packet_length(packet, alignment) ALIGN_POT(cl_packet_length(packet), alignment) #define cl_packet_pack(packet) V3DX(packet ## _pack) #define cl_packet_struct(packet) V3DX(packet) @@ -178,8 +180,7 @@ void v3dv_cl_ensure_space_with_branch(struct v3dv_cl *cl, uint32_t space); ({ \ struct v3dv_cl_out *cl_out = cl_start(cl); \ cl_packet_pack(packet)(cl, (uint8_t *)cl_out, &name); \ - cl_advance(&cl_out, cl_packet_length(packet)); \ - cl_end(cl, cl_out); \ + cl_advance_and_end(cl, cl_packet_length(packet)); \ _loop_terminate = NULL; \ })) \ @@ -195,8 +196,7 @@ void v3dv_cl_ensure_space_with_branch(struct v3dv_cl *cl, uint32_t space); cl_packet_pack(packet)(cl, packed, &name); \ for (int _i = 0; _i < cl_packet_length(packet); _i++) \ ((uint8_t *)cl_out)[_i] = packed[_i] | (prepacked)[_i]; \ - cl_advance(&cl_out, cl_packet_length(packet)); \ - cl_end(cl, cl_out); \ + cl_advance_and_end(cl, cl_packet_length(packet)); \ _loop_terminate = NULL; \ })) \ diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c index 0d6c393ee6e..96e83c657e6 100644 --- a/src/broadcom/vulkan/v3dv_cmd_buffer.c +++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c @@ -1,5 +1,5 @@ /* - * Copyright © 2019 Raspberry Pi + * Copyright © 2019 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -21,42 +21,26 @@ * IN THE SOFTWARE. 
*/ +#include "broadcom/common/v3d_csd.h" #include "v3dv_private.h" #include "util/u_pack_color.h" -#include "vk_format_info.h" +#include "vk_common_entrypoints.h" #include "vk_util.h" -const struct v3dv_dynamic_state default_dynamic_state = { - .viewport = { - .count = 0, - }, - .scissor = { - .count = 0, - }, - .stencil_compare_mask = - { - .front = ~0u, - .back = ~0u, - }, - .stencil_write_mask = - { - .front = ~0u, - .back = ~0u, - }, - .stencil_reference = - { - .front = 0u, - .back = 0u, - }, - .blend_constants = { 0.0f, 0.0f, 0.0f, 0.0f }, - .depth_bias = { - .constant_factor = 0.0f, - .depth_bias_clamp = 0.0f, - .slope_factor = 0.0f, - }, - .line_width = 1.0f, - .color_write_enable = (1ull << (4 * V3D_MAX_DRAW_BUFFERS)) - 1, -}; +float +v3dv_get_aa_line_width(struct v3dv_pipeline *pipeline, + struct v3dv_cmd_buffer *buffer) +{ + float width = buffer->vk.dynamic_graphics_state.rs.line.width; + + /* If line smoothing is enabled then we want to add some extra pixels to + * the width in order to have some semi-transparent edges. + */ + if (pipeline->line_smooth) + width = floorf(M_SQRT2 * width) + 3; + + return width; +} void v3dv_job_add_bo(struct v3dv_job *job, struct v3dv_bo *bo) @@ -83,59 +67,22 @@ v3dv_job_add_bo_unchecked(struct v3dv_job *job, struct v3dv_bo *bo) job->bo_handle_mask |= bo->handle_bit; } -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_CreateCommandPool(VkDevice _device, - const VkCommandPoolCreateInfo *pCreateInfo, - const VkAllocationCallbacks *pAllocator, - VkCommandPool *pCmdPool) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - struct v3dv_cmd_pool *pool; - - /* We only support one queue */ - assert(pCreateInfo->queueFamilyIndex == 0); - - pool = vk_object_zalloc(&device->vk, pAllocator, sizeof(*pool), - VK_OBJECT_TYPE_COMMAND_POOL); - if (pool == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - - if (pAllocator) - pool->alloc = *pAllocator; - else - pool->alloc = device->vk.alloc; - - list_inithead(&pool->cmd_buffers); - - *pCmdPool = v3dv_cmd_pool_to_handle(pool); - - return VK_SUCCESS; -} - static void cmd_buffer_init(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_device *device, - struct v3dv_cmd_pool *pool, - VkCommandBufferLevel level) + struct v3dv_device *device) { /* Do not reset the base object! 
If we are calling this from a command * buffer reset that would reset the loader's dispatch table for the * command buffer, and any other relevant info from vk_object_base */ - const uint32_t base_size = sizeof(struct vk_object_base); + const uint32_t base_size = sizeof(struct vk_command_buffer); uint8_t *cmd_buffer_driver_start = ((uint8_t *) cmd_buffer) + base_size; memset(cmd_buffer_driver_start, 0, sizeof(*cmd_buffer) - base_size); cmd_buffer->device = device; - cmd_buffer->pool = pool; - cmd_buffer->level = level; list_inithead(&cmd_buffer->private_objs); list_inithead(&cmd_buffer->jobs); - list_inithead(&cmd_buffer->list_link); - - assert(pool); - list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers); cmd_buffer->state.subpass_idx = -1; cmd_buffer->state.meta.subpass_idx = -1; @@ -144,22 +91,35 @@ cmd_buffer_init(struct v3dv_cmd_buffer *cmd_buffer, } static VkResult -cmd_buffer_create(struct v3dv_device *device, - struct v3dv_cmd_pool *pool, - VkCommandBufferLevel level, - VkCommandBuffer *pCommandBuffer) +cmd_buffer_create(struct vk_command_pool *pool, VkCommandBufferLevel level, + struct vk_command_buffer **cmd_buffer_out) { + struct v3dv_device *device = + container_of(pool->base.device, struct v3dv_device, vk); + struct v3dv_cmd_buffer *cmd_buffer; - cmd_buffer = vk_object_zalloc(&device->vk, - &pool->alloc, - sizeof(*cmd_buffer), - VK_OBJECT_TYPE_COMMAND_BUFFER); + cmd_buffer = vk_zalloc(&pool->alloc, + sizeof(*cmd_buffer), + 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (cmd_buffer == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + /* Here we pass 0 as level because this callback hook doesn't have the level + * info, but that's fine, vk_common_AllocateCommandBuffers will fix it up + * after creation. + */ + VkResult result; + result = vk_command_buffer_init(pool, &cmd_buffer->vk, + &v3dv_cmd_buffer_ops, level); + if (result != VK_SUCCESS) { + vk_free(&pool->alloc, cmd_buffer); + return result; + } - cmd_buffer_init(cmd_buffer, device, pool, level); + cmd_buffer_init(cmd_buffer, device); - *pCommandBuffer = v3dv_cmd_buffer_to_handle(cmd_buffer); + *cmd_buffer_out = &cmd_buffer->vk; return VK_SUCCESS; } @@ -168,7 +128,7 @@ static void job_destroy_gpu_cl_resources(struct v3dv_job *job) { assert(job->type == V3DV_JOB_TYPE_GPU_CL || - job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY); + job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE); v3dv_cl_destroy(&job->bcl); v3dv_cl_destroy(&job->rcl); @@ -189,9 +149,21 @@ job_destroy_cloned_gpu_cl_resources(struct v3dv_job *job) { assert(job->type == V3DV_JOB_TYPE_GPU_CL); - list_for_each_entry_safe(struct v3dv_bo, bo, &job->bcl.bo_list, list_link) { - list_del(&bo->list_link); - vk_free(&job->device->vk.alloc, bo); + struct v3dv_cmd_buffer *cmd_buffer = job->cmd_buffer; + if (job->clone_owns_bcl) { + /* For suspending jobs in command buffers with the simultaneous use flag + * we allocate a real copy of the BCL. 
+ */ + assert(job->suspending && + cmd_buffer && + (cmd_buffer->usage_flags & + VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)); + v3dv_cl_destroy(&job->bcl); + } else { + list_for_each_entry_safe(struct v3dv_bo, bo, &job->bcl.bo_list, list_link) { + list_del(&bo->list_link); + vk_free(&job->device->vk.alloc, bo); + } } list_for_each_entry_safe(struct v3dv_bo, bo, &job->rcl.bo_list, list_link) { @@ -219,22 +191,6 @@ job_destroy_gpu_csd_resources(struct v3dv_job *job) v3dv_bo_free(job->device, job->csd.shared_memory); } -static void -job_destroy_cpu_wait_events_resources(struct v3dv_job *job) -{ - assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS); - assert(job->cmd_buffer); - vk_free(&job->cmd_buffer->device->vk.alloc, job->cpu.event_wait.events); -} - -static void -job_destroy_cpu_csd_indirect_resources(struct v3dv_job *job) -{ - assert(job->type == V3DV_JOB_TYPE_CPU_CSD_INDIRECT); - assert(job->cmd_buffer); - v3dv_job_destroy(job->cpu.csd_indirect.csd_job); -} - void v3dv_job_destroy(struct v3dv_job *job) { @@ -249,18 +205,12 @@ v3dv_job_destroy(struct v3dv_job *job) if (!job->is_clone) { switch (job->type) { case V3DV_JOB_TYPE_GPU_CL: - case V3DV_JOB_TYPE_GPU_CL_SECONDARY: + case V3DV_JOB_TYPE_GPU_CL_INCOMPLETE: job_destroy_gpu_cl_resources(job); break; case V3DV_JOB_TYPE_GPU_CSD: job_destroy_gpu_csd_resources(job); break; - case V3DV_JOB_TYPE_CPU_WAIT_EVENTS: - job_destroy_cpu_wait_events_resources(job); - break; - case V3DV_JOB_TYPE_CPU_CSD_INDIRECT: - job_destroy_cpu_csd_indirect_resources(job); - break; default: break; } @@ -316,7 +266,7 @@ cmd_buffer_free_resources(struct v3dv_cmd_buffer *cmd_buffer) v3dv_job_destroy(cmd_buffer->state.job); if (cmd_buffer->state.attachments) - vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments); + vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer->state.attachments); if (cmd_buffer->state.query.end.alloc_count > 0) vk_free(&cmd_buffer->device->vk.alloc, cmd_buffer->state.query.end.states); @@ -333,38 +283,22 @@ cmd_buffer_free_resources(struct v3dv_cmd_buffer *cmd_buffer) assert(cmd_buffer->state.meta.attachment_alloc_count > 0); vk_free(&cmd_buffer->device->vk.alloc, cmd_buffer->state.meta.attachments); } + + v3dv_destroy_dynamic_framebuffer(cmd_buffer); } static void -cmd_buffer_destroy(struct v3dv_cmd_buffer *cmd_buffer) +cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer) { - list_del(&cmd_buffer->pool_link); + struct v3dv_cmd_buffer *cmd_buffer = + container_of(vk_cmd_buffer, struct v3dv_cmd_buffer, vk); + cmd_buffer_free_resources(cmd_buffer); - vk_object_free(&cmd_buffer->device->vk, &cmd_buffer->pool->alloc, cmd_buffer); + vk_command_buffer_finish(&cmd_buffer->vk); + vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer); } static bool -attachment_list_is_subset(struct v3dv_subpass_attachment *l1, uint32_t l1_count, - struct v3dv_subpass_attachment *l2, uint32_t l2_count) -{ - for (uint32_t i = 0; i < l1_count; i++) { - uint32_t attachment_idx = l1[i].attachment; - if (attachment_idx == VK_ATTACHMENT_UNUSED) - continue; - - uint32_t j; - for (j = 0; j < l2_count; j++) { - if (l2[j].attachment == attachment_idx) - break; - } - if (j == l2_count) - return false; - } - - return true; - } - -static bool cmd_buffer_can_merge_subpass(struct v3dv_cmd_buffer *cmd_buffer, uint32_t subpass_idx) { @@ -372,9 +306,9 @@ cmd_buffer_can_merge_subpass(struct v3dv_cmd_buffer *cmd_buffer, assert(state->pass); const struct v3dv_physical_device *physical_device = - &cmd_buffer->device->instance->physicalDevice; + cmd_buffer->device->pdevice; - if 
(cmd_buffer->level != VK_COMMAND_BUFFER_LEVEL_PRIMARY) + if (cmd_buffer->vk.level != VK_COMMAND_BUFFER_LEVEL_PRIMARY) return false; if (!cmd_buffer->state.job) @@ -399,44 +333,37 @@ cmd_buffer_can_merge_subpass(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_subpass *prev_subpass = &state->pass->subpasses[state->subpass_idx]; struct v3dv_subpass *subpass = &state->pass->subpasses[subpass_idx]; - /* Don't merge if the subpasses have different view masks, since in that - * case the framebuffer setup is different and we need to emit different - * RCLs. - */ - if (subpass->view_mask != prev_subpass->view_mask) + if (subpass->ds_attachment.attachment != + prev_subpass->ds_attachment.attachment) return false; - /* Because the list of subpass attachments can include VK_ATTACHMENT_UNUSED, - * we need to check that for each subpass all its used attachments are - * used by the other subpass. - */ - bool compatible = - attachment_list_is_subset(prev_subpass->color_attachments, - prev_subpass->color_count, - subpass->color_attachments, - subpass->color_count); - if (!compatible) + if (subpass->color_count != prev_subpass->color_count) return false; - compatible = - attachment_list_is_subset(subpass->color_attachments, - subpass->color_count, - prev_subpass->color_attachments, - prev_subpass->color_count); - if (!compatible) - return false; + for (uint32_t i = 0; i < subpass->color_count; i++) { + if (subpass->color_attachments[i].attachment != + prev_subpass->color_attachments[i].attachment) { + return false; + } + } - if (subpass->ds_attachment.attachment != - prev_subpass->ds_attachment.attachment) + /* Don't merge if the subpasses have different view masks, since in that + * case the framebuffer setup is different and we need to emit different + * RCLs. + */ + if (subpass->view_mask != prev_subpass->view_mask) return false; /* FIXME: Since some attachment formats can't be resolved using the TLB we * need to emit separate resolve jobs for them and that would not be * compatible with subpass merges. We could fix that by testing if any of - * the attachments to resolve doesn't suppotr TLB resolves. + * the attachments to resolve doesn't support TLB resolves. 
*/ - if (prev_subpass->resolve_attachments || subpass->resolve_attachments) + if (prev_subpass->resolve_attachments || subpass->resolve_attachments || + prev_subpass->resolve_depth || prev_subpass->resolve_stencil || + subpass->resolve_depth || subpass->resolve_stencil) { return false; + } return true; } @@ -452,18 +379,10 @@ job_compute_frame_tiling(struct v3dv_job *job, uint32_t layers, uint32_t render_target_count, uint8_t max_internal_bpp, - bool msaa) -{ - static const uint8_t tile_sizes[] = { - 64, 64, - 64, 32, - 32, 32, - 32, 16, - 16, 16, - 16, 8, - 8, 8 - }; - + uint8_t total_color_bpp, + bool msaa, + bool double_buffer) +{ assert(job); struct v3dv_frame_tiling *tiling = &job->frame_tiling; @@ -472,23 +391,18 @@ job_compute_frame_tiling(struct v3dv_job *job, tiling->layers = layers; tiling->render_target_count = render_target_count; tiling->msaa = msaa; - - uint32_t tile_size_index = 0; - - if (render_target_count > 2) - tile_size_index += 2; - else if (render_target_count > 1) - tile_size_index += 1; - - if (msaa) - tile_size_index += 2; - tiling->internal_bpp = max_internal_bpp; - tile_size_index += tiling->internal_bpp; - assert(tile_size_index < ARRAY_SIZE(tile_sizes) / 2); + tiling->total_color_bpp = total_color_bpp; + tiling->double_buffer = double_buffer; - tiling->tile_width = tile_sizes[tile_size_index * 2]; - tiling->tile_height = tile_sizes[tile_size_index * 2 + 1]; + /* Double-buffer is incompatible with MSAA */ + assert(!tiling->msaa || !tiling->double_buffer); + + v3d_choose_tile_size(&job->device->devinfo, + render_target_count, + max_internal_bpp, total_color_bpp, msaa, + tiling->double_buffer, + &tiling->tile_width, &tiling->tile_height); tiling->draw_tiles_x = DIV_ROUND_UP(width, tiling->tile_width); tiling->draw_tiles_y = DIV_ROUND_UP(height, tiling->tile_height); @@ -516,41 +430,17 @@ job_compute_frame_tiling(struct v3dv_job *job, return tiling; } -void -v3dv_job_start_frame(struct v3dv_job *job, - uint32_t width, - uint32_t height, - uint32_t layers, - bool allocate_tile_state_for_all_layers, - uint32_t render_target_count, - uint8_t max_internal_bpp, - bool msaa) +bool +v3dv_job_allocate_tile_state(struct v3dv_job *job) { - assert(job); - - /* Start by computing frame tiling spec for this job */ - const struct v3dv_frame_tiling *tiling = - job_compute_frame_tiling(job, - width, height, layers, - render_target_count, max_internal_bpp, msaa); - - v3dv_cl_ensure_space_with_branch(&job->bcl, 256); - v3dv_return_if_oom(NULL, job); - - /* We only need to allocate tile state for all layers if the binner - * writes primitives to layers other than the first. This can only be - * done using layered rendering (writing gl_Layer from a geometry shader), - * so for other cases of multilayered framebuffers (typically with - * meta copy/clear operations) that won't use layered rendering, we only - * need one layer worth of of tile state for the binner. - */ - if (!allocate_tile_state_for_all_layers) - layers = 1; + struct v3dv_frame_tiling *tiling = &job->frame_tiling; + const uint32_t layers = + job->allocate_tile_state_for_all_layers ? tiling->layers : 1; /* The PTB will request the tile alloc initial size per tile at start * of tile binning. 
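+    * For example (illustrative numbers): a single-layer 1920x1080 framebuffer
+    * with 64x64 tiles needs 30x17 tiles, so the initial allocation below is
+    * 64 * 1 * 30 * 17 = 32640 bytes, i.e. about 32 KB.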
*/ - uint32_t tile_alloc_size = 64 * tiling->layers * + uint32_t tile_alloc_size = 64 * layers * tiling->draw_tiles_x * tiling->draw_tiles_y; @@ -573,47 +463,127 @@ v3dv_job_start_frame(struct v3dv_job *job, "tile_alloc", true); if (!job->tile_alloc) { v3dv_flag_oom(NULL, job); - return; + return false; } v3dv_job_add_bo_unchecked(job, job->tile_alloc); const uint32_t tsda_per_tile_size = 256; - const uint32_t tile_state_size = tiling->layers * + const uint32_t tile_state_size = layers * tiling->draw_tiles_x * tiling->draw_tiles_y * tsda_per_tile_size; job->tile_state = v3dv_bo_alloc(job->device, tile_state_size, "TSDA", true); if (!job->tile_state) { v3dv_flag_oom(NULL, job); - return; + return false; } v3dv_job_add_bo_unchecked(job, job->tile_state); + return true; +} + +void +v3dv_job_start_frame(struct v3dv_job *job, + uint32_t width, + uint32_t height, + uint32_t layers, + bool allocate_tile_state_for_all_layers, + bool allocate_tile_state_now, + uint32_t render_target_count, + uint8_t max_internal_bpp, + uint8_t total_color_bpp, + bool msaa) +{ + assert(job); + + /* Start by computing frame tiling spec for this job assuming that + * double-buffer mode is disabled. + */ + const struct v3dv_frame_tiling *tiling = + job_compute_frame_tiling(job, width, height, layers, + render_target_count, max_internal_bpp, + total_color_bpp, msaa, false); + + v3dv_cl_ensure_space_with_branch(&job->bcl, 256); + v3dv_return_if_oom(NULL, job); - v3dv_X(job->device, job_emit_binning_prolog)(job, tiling, layers); + job->allocate_tile_state_for_all_layers = allocate_tile_state_for_all_layers; + + /* For subpass jobs we postpone tile state allocation until we are finishing + * the job and have made a decision about double-buffer. + */ + if (allocate_tile_state_now) { + if (!v3dv_job_allocate_tile_state(job)) + return; + } + + v3dv_X(job->device, job_emit_binning_prolog)(job, tiling, + allocate_tile_state_for_all_layers ? tiling->layers : 1); job->ez_state = V3D_EZ_UNDECIDED; job->first_ez_state = V3D_EZ_UNDECIDED; } +static bool +job_should_enable_double_buffer(struct v3dv_job *job) +{ + /* Incompatibility with double-buffer */ + if (!job->can_use_double_buffer) + return false; + + /* Too much geometry processing */ + if (job->double_buffer_score.geom > 2000000) + return false; + + /* Too little rendering to make up for tile store latency */ + if (job->double_buffer_score.render < 100000) + return false; + + return true; +} + static void cmd_buffer_end_render_pass_frame(struct v3dv_cmd_buffer *cmd_buffer) { - assert(cmd_buffer->state.job); + struct v3dv_job *job = cmd_buffer->state.job; + assert(job); + + /* For subpass jobs we always emit the RCL here */ + assert(v3dv_cl_offset(&job->rcl) == 0); + + /* Only emit RCL for the first job in a suspend/resume chain */ + if (!job->resuming) { + /* Decide if we want to enable double-buffer for this job. If we do, then + * we need to rewrite the TILE_BINNING_MODE_CFG packet in the BCL. 
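+    * Double-buffer halves the tile size so the hardware can overlap rendering
+    * of one tile with the store of the previous one; whether that trade-off
+    * is likely to pay off is estimated by job_should_enable_double_buffer
+    * above.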
+ */ + if (job_should_enable_double_buffer(job)) { + assert(!job->frame_tiling.double_buffer); + job_compute_frame_tiling(job, + job->frame_tiling.width, + job->frame_tiling.height, + job->frame_tiling.layers, + job->frame_tiling.render_target_count, + job->frame_tiling.internal_bpp, + job->frame_tiling.total_color_bpp, + job->frame_tiling.msaa, + true); + + v3dv_X(job->device, job_emit_enable_double_buffer)(job); + } + + /* At this point we have decided whether we want to use double-buffer or + * not and the job's frame tiling represents that decision so we can + * allocate the tile state, which we need to do before we emit the RCL. + */ + v3dv_job_allocate_tile_state(job); - /* Typically, we have a single job for each subpass and we emit the job's RCL - * here when we are ending the frame for the subpass. However, some commands - * such as vkCmdClearAttachments need to run in their own separate job and - * they emit their own RCL even if they execute inside a subpass. In this - * scenario, we don't want to emit subpass RCL when we end the frame for - * those jobs, so we only emit the subpass RCL if the job has not recorded - * any RCL commands of its own. - */ - if (v3dv_cl_offset(&cmd_buffer->state.job->rcl) == 0) v3dv_X(cmd_buffer->device, cmd_buffer_emit_render_pass_rcl)(cmd_buffer); + } - v3dv_X(cmd_buffer->device, job_emit_binning_flush)(cmd_buffer->state.job); + /* Only emit the binning flush for the last job in resume/suspend chain */ + if (!job->suspending) + v3dv_X(cmd_buffer->device, job_emit_binning_flush)(job); } struct v3dv_job * @@ -635,24 +605,47 @@ v3dv_cmd_buffer_create_cpu_job(struct v3dv_device *device, } static void -cmd_buffer_add_cpu_jobs_for_pending_state(struct v3dv_cmd_buffer *cmd_buffer) +cmd_buffer_emit_end_query_cpu(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_query_pool *pool, + uint32_t query, uint32_t count) { - struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + assert(pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR); - if (state->query.end.used_count > 0) { - const uint32_t query_count = state->query.end.used_count; - for (uint32_t i = 0; i < query_count; i++) { - assert(i < state->query.end.used_count); - struct v3dv_job *job = - v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, - V3DV_JOB_TYPE_CPU_END_QUERY, - cmd_buffer, -1); - v3dv_return_if_oom(cmd_buffer, NULL); - - job->cpu.query_end = state->query.end.states[i]; - list_addtail(&job->list_link, &cmd_buffer->jobs); + struct v3dv_job *job = + v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, + V3DV_JOB_TYPE_CPU_END_QUERY, + cmd_buffer, -1); + v3dv_return_if_oom(cmd_buffer, NULL); + + job->cpu.query_end.pool = pool; + job->cpu.query_end.query = query; + job->cpu.query_end.count = count; + list_addtail(&job->list_link, &cmd_buffer->jobs); +} + +static inline bool +cmd_buffer_has_pending_jobs(struct v3dv_cmd_buffer *cmd_buffer) +{ + return cmd_buffer->state.query.end.used_count > 0; +} + +static void +cmd_buffer_add_pending_jobs(struct v3dv_cmd_buffer *cmd_buffer) +{ + struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + const uint32_t count = state->query.end.used_count; + for (uint32_t i = 0; i < count; i++) { + assert(i < state->query.end.used_count); + struct v3dv_end_query_info *info = &state->query.end.states[i]; + if (info->pool->query_type == VK_QUERY_TYPE_OCCLUSION) { + v3dv_cmd_buffer_emit_set_query_availability(cmd_buffer, info->pool, + info->query, info->count, 1); + } else { + cmd_buffer_emit_end_query_cpu(cmd_buffer, info->pool, + info->query, info->count); } 
   }
+   state->query.end.used_count = 0;
 }
 
 void
@@ -673,8 +666,17 @@ v3dv_cmd_buffer_finish_job(struct v3dv_cmd_buffer *cmd_buffer)
    * should at least have the start frame commands, otherwise, it should have
    * a transfer command. The only exception are secondary command buffers
    * inside a render pass.
+   *
+   * With dynamic rendering there is also the possibility that we resume a
+   * suspended pass with an empty job. In that case, we need to ensure the
+   * empty job is still a valid command list, which we will ensure when we
+   * add the binning flush right below, which only happens if this is the
+   * last job in the resume/suspend chain. If it is not the last then we know
+   * it must at least have the BRANCH instruction to link with a follow-up
+   * resume job.
    */
-   assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY ||
+   assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY ||
+          (job->resuming && !job->suspending) ||
           v3dv_cl_offset(&job->bcl) > 0);
 
    /* When we merge multiple subpasses into the same job we must only emit one
@@ -684,6 +686,11 @@ v3dv_cmd_buffer_finish_job(struct v3dv_cmd_buffer *cmd_buffer)
    */
    assert(v3dv_cl_offset(&job->rcl) != 0 || cmd_buffer->state.pass);
 
+   if (!(cmd_buffer->state.barrier.dst_mask & V3DV_BARRIER_GRAPHICS_BIT)) {
+      cmd_buffer->state.barrier.bcl_buffer_access = 0;
+      cmd_buffer->state.barrier.bcl_image_access = 0;
+   }
+
    /* If we are finishing a job inside a render pass we have two scenarios:
    *
    * 1. It is a regular CL, in which case we will submit the job to the GPU,
@@ -699,32 +706,36 @@ v3dv_cmd_buffer_finish_job(struct v3dv_cmd_buffer *cmd_buffer)
       if (job->type == V3DV_JOB_TYPE_GPU_CL) {
          cmd_buffer_end_render_pass_frame(cmd_buffer);
       } else {
-         assert(job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY);
+         assert(job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE);
         v3dv_X(cmd_buffer->device, cmd_buffer_end_render_pass_secondary)(cmd_buffer);
       }
    }
 
+   bool suspending = job->suspending;
    list_addtail(&job->list_link, &cmd_buffer->jobs);
    cmd_buffer->state.job = NULL;
 
    /* If we have recorded any state with this last GPU job that requires to
-    * emit CPU jobs after the job is completed, add them now. The only
-    * exception is secondary command buffers inside a render pass, because in
+    * emit jobs after the job is completed, add them now. The only exception
+    * is secondary command buffers inside a render pass, because in
    * that case we want to defer this until we finish recording the primary
    * job into which we execute the secondary.
    */
-   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY ||
-       !cmd_buffer->state.pass) {
-      cmd_buffer_add_cpu_jobs_for_pending_state(cmd_buffer);
+   if (!suspending) {
+      if (cmd_buffer_has_pending_jobs(cmd_buffer) &&
+          (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY ||
+           !cmd_buffer->state.pass)) {
+         cmd_buffer_add_pending_jobs(cmd_buffer);
+      }
+   }
 }
 
-static bool
-job_type_is_gpu(struct v3dv_job *job)
+bool
+v3dv_job_type_is_gpu(struct v3dv_job *job)
 {
    switch (job->type) {
    case V3DV_JOB_TYPE_GPU_CL:
-   case V3DV_JOB_TYPE_GPU_CL_SECONDARY:
+   case V3DV_JOB_TYPE_GPU_CL_INCOMPLETE:
    case V3DV_JOB_TYPE_GPU_TFU:
    case V3DV_JOB_TYPE_GPU_CSD:
       return true;
@@ -739,24 +750,40 @@ cmd_buffer_serialize_job_if_needed(struct v3dv_cmd_buffer *cmd_buffer,
 {
    assert(cmd_buffer && job);
 
-   if (!cmd_buffer->state.has_barrier)
-      return;
-
    /* Serialization only affects GPU jobs, CPU jobs are always automatically
    * serialized.
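+    * For example, a barrier whose destination stages are compute-only sets
+    * V3DV_BARRIER_COMPUTE_BIT in dst_mask, so a later CL job is not
+    * serialized against it; only the next CSD job consumes the compute
+    * src_mask below.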
*/ - if (!job_type_is_gpu(job)) + if (!v3dv_job_type_is_gpu(job)) return; - job->serialize = true; - if (cmd_buffer->state.has_bcl_barrier && - (job->type == V3DV_JOB_TYPE_GPU_CL || - job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY)) { - job->needs_bcl_sync = true; + uint8_t barrier_mask = cmd_buffer->state.barrier.dst_mask; + if (barrier_mask == 0) + return; + + uint8_t bit = 0; + uint8_t *src_mask; + if (job->type == V3DV_JOB_TYPE_GPU_CSD) { + assert(!job->is_transfer); + bit = V3DV_BARRIER_COMPUTE_BIT; + src_mask = &cmd_buffer->state.barrier.src_mask_compute; + } else if (job->is_transfer) { + assert(job->type == V3DV_JOB_TYPE_GPU_CL || + job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE || + job->type == V3DV_JOB_TYPE_GPU_TFU); + bit = V3DV_BARRIER_TRANSFER_BIT; + src_mask = &cmd_buffer->state.barrier.src_mask_transfer; + } else { + assert(job->type == V3DV_JOB_TYPE_GPU_CL || + job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE); + bit = V3DV_BARRIER_GRAPHICS_BIT; + src_mask = &cmd_buffer->state.barrier.src_mask_graphics; } - cmd_buffer->state.has_barrier = false; - cmd_buffer->state.has_bcl_barrier = false; + if (barrier_mask & bit) { + job->serialize = *src_mask; + *src_mask = 0; + cmd_buffer->state.barrier.dst_mask &= ~bit; + } } void @@ -779,7 +806,7 @@ v3dv_job_init(struct v3dv_job *job, list_inithead(&job->list_link); if (type == V3DV_JOB_TYPE_GPU_CL || - type == V3DV_JOB_TYPE_GPU_CL_SECONDARY || + type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE || type == V3DV_JOB_TYPE_GPU_CSD) { job->bos = _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); @@ -787,12 +814,12 @@ v3dv_job_init(struct v3dv_job *job, v3dv_cl_init(job, &job->indirect); - if (V3D_DEBUG & V3D_DEBUG_ALWAYS_FLUSH) + if (V3D_DBG(ALWAYS_FLUSH)) job->always_flush = true; } if (type == V3DV_JOB_TYPE_GPU_CL || - type == V3DV_JOB_TYPE_GPU_CL_SECONDARY) { + type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE) { v3dv_cl_init(job, &job->bcl); v3dv_cl_init(job, &job->rcl); } @@ -806,9 +833,10 @@ v3dv_job_init(struct v3dv_job *job, */ cmd_buffer->state.dirty = ~0; cmd_buffer->state.dirty_descriptor_stages = ~0; + vk_dynamic_graphics_state_dirty_all(&cmd_buffer->vk.dynamic_graphics_state); - /* Honor inheritance of occlussion queries in secondaries if requested */ - if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY && + /* Honor inheritance of occlusion queries in secondaries if requested */ + if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY && cmd_buffer->state.inheritance.occlusion_query_enable) { cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_OCCLUSION_QUERY; } @@ -820,7 +848,11 @@ v3dv_job_init(struct v3dv_job *job, if (cmd_buffer->state.pass) job->first_subpass = subpass_idx; + job->is_transfer = cmd_buffer->state.is_transfer; + cmd_buffer_serialize_job_if_needed(cmd_buffer, job); + + job->perf = cmd_buffer->state.query.active_query.perf; } } @@ -860,19 +892,16 @@ v3dv_cmd_buffer_start_job(struct v3dv_cmd_buffer *cmd_buffer, return job; } -static VkResult -cmd_buffer_reset(struct v3dv_cmd_buffer *cmd_buffer, +static void +cmd_buffer_reset(struct vk_command_buffer *vk_cmd_buffer, VkCommandBufferResetFlags flags) { + struct v3dv_cmd_buffer *cmd_buffer = + container_of(vk_cmd_buffer, struct v3dv_cmd_buffer, vk); + + vk_command_buffer_reset(&cmd_buffer->vk); if (cmd_buffer->status != V3DV_CMD_BUFFER_STATUS_INITIALIZED) { struct v3dv_device *device = cmd_buffer->device; - struct v3dv_cmd_pool *pool = cmd_buffer->pool; - VkCommandBufferLevel level = cmd_buffer->level; - - /* cmd_buffer_init below will re-add the command buffer to 
the pool - * so remove it here so we don't end up adding it again. - */ - list_del(&cmd_buffer->pool_link); /* FIXME: For now we always free all resources as if * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT was set. @@ -880,87 +909,61 @@ cmd_buffer_reset(struct v3dv_cmd_buffer *cmd_buffer, if (cmd_buffer->status != V3DV_CMD_BUFFER_STATUS_NEW) cmd_buffer_free_resources(cmd_buffer); - cmd_buffer_init(cmd_buffer, device, pool, level); + cmd_buffer_init(cmd_buffer, device); } assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_INITIALIZED); - return VK_SUCCESS; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_AllocateCommandBuffers(VkDevice _device, - const VkCommandBufferAllocateInfo *pAllocateInfo, - VkCommandBuffer *pCommandBuffers) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_cmd_pool, pool, pAllocateInfo->commandPool); - - VkResult result = VK_SUCCESS; - uint32_t i; - - for (i = 0; i < pAllocateInfo->commandBufferCount; i++) { - result = cmd_buffer_create(device, pool, pAllocateInfo->level, - &pCommandBuffers[i]); - if (result != VK_SUCCESS) - break; - } - - if (result != VK_SUCCESS) { - v3dv_FreeCommandBuffers(_device, pAllocateInfo->commandPool, - i, pCommandBuffers); - for (i = 0; i < pAllocateInfo->commandBufferCount; i++) - pCommandBuffers[i] = VK_NULL_HANDLE; - } - - return result; } -VKAPI_ATTR void VKAPI_CALL -v3dv_FreeCommandBuffers(VkDevice device, - VkCommandPool commandPool, - uint32_t commandBufferCount, - const VkCommandBuffer *pCommandBuffers) -{ - for (uint32_t i = 0; i < commandBufferCount; i++) { - V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, pCommandBuffers[i]); - - if (!cmd_buffer) - continue; - - cmd_buffer_destroy(cmd_buffer); - } -} -VKAPI_ATTR void VKAPI_CALL -v3dv_DestroyCommandPool(VkDevice _device, - VkCommandPool commandPool, - const VkAllocationCallbacks *pAllocator) +static void +cmd_buffer_emit_resolve(struct v3dv_cmd_buffer *cmd_buffer, + uint32_t dst_attachment_idx, + uint32_t src_attachment_idx, + VkImageAspectFlagBits aspect) { - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_cmd_pool, pool, commandPool); - - if (!pool) - return; - - list_for_each_entry_safe(struct v3dv_cmd_buffer, cmd_buffer, - &pool->cmd_buffers, pool_link) { - cmd_buffer_destroy(cmd_buffer); - } + struct v3dv_image_view *src_iview = + cmd_buffer->state.attachments[src_attachment_idx].image_view; + struct v3dv_image_view *dst_iview = + cmd_buffer->state.attachments[dst_attachment_idx].image_view; + + const VkRect2D *ra = &cmd_buffer->state.render_area; + + VkImageResolve2 region = { + .sType = VK_STRUCTURE_TYPE_IMAGE_RESOLVE_2, + .srcSubresource = { + aspect, + src_iview->vk.base_mip_level, + src_iview->vk.base_array_layer, + src_iview->vk.layer_count, + }, + .srcOffset = { ra->offset.x, ra->offset.y, 0 }, + .dstSubresource = { + aspect, + dst_iview->vk.base_mip_level, + dst_iview->vk.base_array_layer, + dst_iview->vk.layer_count, + }, + .dstOffset = { ra->offset.x, ra->offset.y, 0 }, + .extent = { ra->extent.width, ra->extent.height, 1 }, + }; - vk_object_free(&device->vk, pAllocator, pool); -} + struct v3dv_image *src_image = (struct v3dv_image *) src_iview->vk.image; + struct v3dv_image *dst_image = (struct v3dv_image *) dst_iview->vk.image; + VkResolveImageInfo2 resolve_info = { + .sType = VK_STRUCTURE_TYPE_RESOLVE_IMAGE_INFO_2, + .srcImage = v3dv_image_to_handle(src_image), + .srcImageLayout = VK_IMAGE_LAYOUT_GENERAL, + .dstImage = v3dv_image_to_handle(dst_image), + .dstImageLayout = VK_IMAGE_LAYOUT_GENERAL, + .regionCount = 1, 
+ .pRegions = &region, + }; -VKAPI_ATTR void VKAPI_CALL -v3dv_TrimCommandPool(VkDevice device, - VkCommandPool commandPool, - VkCommandPoolTrimFlags flags) -{ - /* We don't need to do anything here, our command pools never hold on to - * any resources from command buffers that are freed or reset. - */ + VkCommandBuffer cmd_buffer_handle = v3dv_cmd_buffer_to_handle(cmd_buffer); + v3dv_CmdResolveImage2(cmd_buffer_handle, &resolve_info); } - static void cmd_buffer_subpass_handle_pending_resolves(struct v3dv_cmd_buffer *cmd_buffer) { @@ -972,8 +975,6 @@ cmd_buffer_subpass_handle_pending_resolves(struct v3dv_cmd_buffer *cmd_buffer) if (!subpass->resolve_attachments) return; - struct v3dv_framebuffer *fb = cmd_buffer->state.framebuffer; - /* At this point we have already ended the current subpass and now we are * about to emit vkCmdResolveImage calls to get the resolves we can't * handle in the subpass RCL. @@ -993,55 +994,42 @@ cmd_buffer_subpass_handle_pending_resolves(struct v3dv_cmd_buffer *cmd_buffer) cmd_buffer->state.pass = NULL; cmd_buffer->state.subpass_idx = -1; - VkCommandBuffer cmd_buffer_handle = v3dv_cmd_buffer_to_handle(cmd_buffer); for (uint32_t i = 0; i < subpass->color_count; i++) { const uint32_t src_attachment_idx = subpass->color_attachments[i].attachment; if (src_attachment_idx == VK_ATTACHMENT_UNUSED) continue; - if (pass->attachments[src_attachment_idx].use_tlb_resolve) + /* Skip if this attachment doesn't have a resolve or if it was already + * implemented as a TLB resolve. + */ + if (!cmd_buffer->state.attachments[src_attachment_idx].has_resolve || + cmd_buffer->state.attachments[src_attachment_idx].use_tlb_resolve) { continue; + } const uint32_t dst_attachment_idx = subpass->resolve_attachments[i].attachment; - if (dst_attachment_idx == VK_ATTACHMENT_UNUSED) - continue; + assert(dst_attachment_idx != VK_ATTACHMENT_UNUSED); - struct v3dv_image_view *src_iview = fb->attachments[src_attachment_idx]; - struct v3dv_image_view *dst_iview = fb->attachments[dst_attachment_idx]; - - VkImageResolve2KHR region = { - .sType = VK_STRUCTURE_TYPE_IMAGE_RESOLVE_2_KHR, - .srcSubresource = { - VK_IMAGE_ASPECT_COLOR_BIT, - src_iview->vk.base_mip_level, - src_iview->vk.base_array_layer, - src_iview->vk.layer_count, - }, - .srcOffset = { 0, 0, 0 }, - .dstSubresource = { - VK_IMAGE_ASPECT_COLOR_BIT, - dst_iview->vk.base_mip_level, - dst_iview->vk.base_array_layer, - dst_iview->vk.layer_count, - }, - .dstOffset = { 0, 0, 0 }, - .extent = src_iview->vk.image->extent, - }; + cmd_buffer_emit_resolve(cmd_buffer, dst_attachment_idx, src_attachment_idx, + VK_IMAGE_ASPECT_COLOR_BIT); + } - struct v3dv_image *src_image = (struct v3dv_image *) src_iview->vk.image; - struct v3dv_image *dst_image = (struct v3dv_image *) dst_iview->vk.image; - VkResolveImageInfo2KHR resolve_info = { - .sType = VK_STRUCTURE_TYPE_RESOLVE_IMAGE_INFO_2_KHR, - .srcImage = v3dv_image_to_handle(src_image), - .srcImageLayout = VK_IMAGE_LAYOUT_GENERAL, - .dstImage = v3dv_image_to_handle(dst_image), - .dstImageLayout = VK_IMAGE_LAYOUT_GENERAL, - .regionCount = 1, - .pRegions = &region, - }; - v3dv_CmdResolveImage2KHR(cmd_buffer_handle, &resolve_info); + const uint32_t ds_src_attachment_idx = + subpass->ds_attachment.attachment; + if (ds_src_attachment_idx != VK_ATTACHMENT_UNUSED && + cmd_buffer->state.attachments[ds_src_attachment_idx].has_resolve && + !cmd_buffer->state.attachments[ds_src_attachment_idx].use_tlb_resolve) { + assert(subpass->resolve_depth || subpass->resolve_stencil); + const VkImageAspectFlags ds_aspects = 
(subpass->resolve_depth ? VK_IMAGE_ASPECT_DEPTH_BIT : 0) | + (subpass->resolve_stencil ? VK_IMAGE_ASPECT_STENCIL_BIT : 0); + const uint32_t ds_dst_attachment_idx = + subpass->ds_resolve_attachment.attachment; + assert(ds_dst_attachment_idx != VK_ATTACHMENT_UNUSED); + cmd_buffer_emit_resolve(cmd_buffer, ds_dst_attachment_idx, + ds_src_attachment_idx, ds_aspects); } cmd_buffer->state.framebuffer = restore_fb; @@ -1054,19 +1042,30 @@ cmd_buffer_begin_render_pass_secondary( struct v3dv_cmd_buffer *cmd_buffer, const VkCommandBufferInheritanceInfo *inheritance_info) { - assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); + assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); assert(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT); assert(inheritance_info); - cmd_buffer->state.pass = - v3dv_render_pass_from_handle(inheritance_info->renderPass); - assert(cmd_buffer->state.pass); + const VkCommandBufferInheritanceRenderingInfo *rendering_info = NULL; + if (inheritance_info->renderPass == VK_NULL_HANDLE) { + rendering_info = vk_find_struct_const(inheritance_info, + COMMAND_BUFFER_INHERITANCE_RENDERING_INFO); + assert(rendering_info); + v3dv_setup_dynamic_render_pass_inheritance(cmd_buffer, rendering_info); + cmd_buffer->state.pass = &cmd_buffer->state.dynamic_pass; + cmd_buffer->state.subpass_idx = 0; + cmd_buffer->state.framebuffer = NULL; + } else { + cmd_buffer->state.pass = + v3dv_render_pass_from_handle(inheritance_info->renderPass); - cmd_buffer->state.framebuffer = - v3dv_framebuffer_from_handle(inheritance_info->framebuffer); + assert(inheritance_info->subpass < cmd_buffer->state.pass->subpass_count); + cmd_buffer->state.subpass_idx = inheritance_info->subpass; - assert(inheritance_info->subpass < cmd_buffer->state.pass->subpass_count); - cmd_buffer->state.subpass_idx = inheritance_info->subpass; + cmd_buffer->state.framebuffer = + v3dv_framebuffer_from_handle(inheritance_info->framebuffer); + } + assert(cmd_buffer->state.pass); cmd_buffer->state.inheritance.occlusion_query_enable = inheritance_info->occlusionQueryEnable; @@ -1075,8 +1074,8 @@ cmd_buffer_begin_render_pass_secondary( * so we want to create a job for them here. */ struct v3dv_job *job = - v3dv_cmd_buffer_start_job(cmd_buffer, inheritance_info->subpass, - V3DV_JOB_TYPE_GPU_CL_SECONDARY); + v3dv_cmd_buffer_start_job(cmd_buffer, cmd_buffer->state.subpass_idx, + V3DV_JOB_TYPE_GPU_CL_INCOMPLETE); if (!job) { v3dv_flag_oom(cmd_buffer, NULL); return VK_ERROR_OUT_OF_HOST_MEMORY; @@ -1089,21 +1088,31 @@ cmd_buffer_begin_render_pass_secondary( * * "The application must ensure (using scissor if necessary) that all * rendering is contained within the render area." - * - * FIXME: setup constants for the max framebuffer dimensions and use them - * here and when filling in VkPhysicalDeviceLimits. */ const struct v3dv_framebuffer *framebuffer = cmd_buffer->state.framebuffer; cmd_buffer->state.render_area.offset.x = 0; cmd_buffer->state.render_area.offset.y = 0; cmd_buffer->state.render_area.extent.width = - framebuffer ? framebuffer->width : 4096; + framebuffer ? framebuffer->width : V3D_MAX_IMAGE_DIMENSION; cmd_buffer->state.render_area.extent.height = - framebuffer ? framebuffer->height : 4096; + framebuffer ? 
framebuffer->height : V3D_MAX_IMAGE_DIMENSION; + + /* We only really execute double-buffer mode in primary jobs, so allow this + * mode in render pass secondaries only to keep track of the double-buffer + * score while they are recorded; the primary job's score is then updated + * accordingly when the secondaries are executed into it. + */ + job->can_use_double_buffer = true; return VK_SUCCESS; } +const struct vk_command_buffer_ops v3dv_cmd_buffer_ops = { + .create = cmd_buffer_create, + .reset = cmd_buffer_reset, + .destroy = cmd_buffer_destroy, +}; + VKAPI_ATTR VkResult VKAPI_CALL v3dv_BeginCommandBuffer(VkCommandBuffer commandBuffer, const VkCommandBufferBeginInfo *pBeginInfo) @@ -1114,17 +1123,15 @@ v3dv_BeginCommandBuffer(VkCommandBuffer commandBuffer, * command buffer's state. Otherwise, we must reset its state. In both * cases we reset it. */ - VkResult result = cmd_buffer_reset(cmd_buffer, 0); - if (result != VK_SUCCESS) - return result; + cmd_buffer_reset(&cmd_buffer->vk, 0); assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_INITIALIZED); cmd_buffer->usage_flags = pBeginInfo->flags; - if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) { + if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) { if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) { - result = + VkResult result = cmd_buffer_begin_render_pass_secondary(cmd_buffer, pBeginInfo->pInheritanceInfo); if (result != VK_SUCCESS) @@ -1137,32 +1144,6 @@ v3dv_BeginCommandBuffer(VkCommandBuffer commandBuffer, return VK_SUCCESS; } -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_ResetCommandBuffer(VkCommandBuffer commandBuffer, - VkCommandBufferResetFlags flags) -{ - V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - return cmd_buffer_reset(cmd_buffer, flags); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_ResetCommandPool(VkDevice device, - VkCommandPool commandPool, - VkCommandPoolResetFlags flags) -{ - V3DV_FROM_HANDLE(v3dv_cmd_pool, pool, commandPool); - - VkCommandBufferResetFlags reset_flags = 0; - if (flags & VK_COMMAND_POOL_RESET_RELEASE_RESOURCES_BIT) - reset_flags = VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT; - list_for_each_entry_safe(struct v3dv_cmd_buffer, cmd_buffer, - &pool->cmd_buffers, pool_link) { - cmd_buffer_reset(cmd_buffer, reset_flags); - } - - return VK_SUCCESS; -} - static void cmd_buffer_update_tile_alignment(struct v3dv_cmd_buffer *cmd_buffer) { @@ -1191,21 +1172,64 @@ } static void +cmd_buffer_update_attachment_resolve_state(struct v3dv_cmd_buffer *cmd_buffer) +{ + /* NOTE: This should be called after cmd_buffer_update_tile_alignment() + * since it relies on up-to-date information about subpass tile alignment.
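+ *
+ * The expected call order, as done in v3dv_cmd_buffer_subpass_start() below:
+ *
+ *    cmd_buffer_update_tile_alignment(cmd_buffer);
+ *    cmd_buffer_subpass_check_double_buffer_mode(cmd_buffer, job->frame_tiling.msaa);
+ *    cmd_buffer_update_attachment_resolve_state(cmd_buffer);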
+ */ + const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + const struct v3dv_render_pass *pass = state->pass; + const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx]; + + for (uint32_t i = 0; i < subpass->color_count; i++) { + const uint32_t attachment_idx = subpass->color_attachments[i].attachment; + if (attachment_idx == VK_ATTACHMENT_UNUSED) + continue; + + state->attachments[attachment_idx].has_resolve = + subpass->resolve_attachments && + subpass->resolve_attachments[i].attachment != VK_ATTACHMENT_UNUSED; + + state->attachments[attachment_idx].use_tlb_resolve = + state->attachments[attachment_idx].has_resolve && + state->tile_aligned_render_area && + pass->attachments[attachment_idx].try_tlb_resolve; + } + + uint32_t ds_attachment_idx = subpass->ds_attachment.attachment; + if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) { + uint32_t ds_resolve_attachment_idx = + subpass->ds_resolve_attachment.attachment; + state->attachments[ds_attachment_idx].has_resolve = + ds_resolve_attachment_idx != VK_ATTACHMENT_UNUSED; + + assert(!state->attachments[ds_attachment_idx].has_resolve || + (subpass->resolve_depth || subpass->resolve_stencil)); + + state->attachments[ds_attachment_idx].use_tlb_resolve = + state->attachments[ds_attachment_idx].has_resolve && + state->tile_aligned_render_area && + pass->attachments[ds_attachment_idx].try_tlb_resolve; + } +} + +static void cmd_buffer_state_set_attachment_clear_color(struct v3dv_cmd_buffer *cmd_buffer, uint32_t attachment_idx, const VkClearColorValue *color) { assert(attachment_idx < cmd_buffer->state.pass->attachment_count); - const struct v3dv_render_pass_attachment *attachment = &cmd_buffer->state.pass->attachments[attachment_idx]; uint32_t internal_type, internal_bpp; const struct v3dv_format *format = v3dv_X(cmd_buffer->device, get_format)(attachment->desc.format); + /* We don't allow multi-planar formats for render pass attachments */ + assert(format->plane_count == 1); v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_output_format) - (format->rt_type, &internal_type, &internal_bpp); + (format->planes[0].rt_type, &internal_type, &internal_bpp); uint32_t internal_size = 4 << internal_bpp; @@ -1273,12 +1297,39 @@ cmd_buffer_state_set_clear_values(struct v3dv_cmd_buffer *cmd_buffer, } static void +cmd_buffer_state_set_attachments(struct v3dv_cmd_buffer *cmd_buffer, + const VkRenderPassBeginInfo *pRenderPassBegin) +{ + V3DV_FROM_HANDLE(v3dv_render_pass, pass, pRenderPassBegin->renderPass); + V3DV_FROM_HANDLE(v3dv_framebuffer, framebuffer, pRenderPassBegin->framebuffer); + + const VkRenderPassAttachmentBeginInfo *attach_begin = + vk_find_struct_const(pRenderPassBegin, RENDER_PASS_ATTACHMENT_BEGIN_INFO); + + struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + + for (uint32_t i = 0; i < pass->attachment_count; i++) { + if (attach_begin && attach_begin->attachmentCount != 0) { + state->attachments[i].image_view = + v3dv_image_view_from_handle(attach_begin->pAttachments[i]); + } else if (framebuffer) { + state->attachments[i].image_view = framebuffer->attachments[i]; + } else { + assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); + state->attachments[i].image_view = NULL; + } + } +} + +static void cmd_buffer_init_render_pass_attachment_state(struct v3dv_cmd_buffer *cmd_buffer, const VkRenderPassBeginInfo *pRenderPassBegin) { cmd_buffer_state_set_clear_values(cmd_buffer, pRenderPassBegin->clearValueCount, pRenderPassBegin->pClearValues); + + cmd_buffer_state_set_attachments(cmd_buffer, 
pRenderPassBegin); } static void @@ -1307,10 +1358,33 @@ cmd_buffer_ensure_render_pass_attachment_state(struct v3dv_cmd_buffe assert(state->attachment_alloc_count >= pass->attachment_count); } +/* If our render area is smaller than the current clip window we will have + * to emit a new clip window to constrain it to the render area. + */ +static void +constraint_clip_window_to_render_area(struct v3dv_cmd_buffer *cmd_buffer) +{ + struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state; + + uint32_t min_render_x = state->render_area.offset.x; + uint32_t min_render_y = state->render_area.offset.y; + uint32_t max_render_x = min_render_x + state->render_area.extent.width - 1; + uint32_t max_render_y = min_render_y + state->render_area.extent.height - 1; + uint32_t min_clip_x = state->clip_window.offset.x; + uint32_t min_clip_y = state->clip_window.offset.y; + uint32_t max_clip_x = min_clip_x + state->clip_window.extent.width - 1; + uint32_t max_clip_y = min_clip_y + state->clip_window.extent.height - 1; + if (min_render_x > min_clip_x || min_render_y > min_clip_y || + max_render_x < max_clip_x || max_render_y < max_clip_y) { + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS); + } +} + VKAPI_ATTR void VKAPI_CALL -v3dv_CmdBeginRenderPass(VkCommandBuffer commandBuffer, - const VkRenderPassBeginInfo *pRenderPassBegin, - VkSubpassContents contents) +v3dv_CmdBeginRenderPass2(VkCommandBuffer commandBuffer, - const VkRenderPassBeginInfo *pRenderPassBegin, + const VkRenderPassBeginInfo *pRenderPassBegin, + const VkSubpassBeginInfo *pSubpassBeginInfo) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); V3DV_FROM_HANDLE(v3dv_render_pass, pass, pRenderPassBegin->renderPass); @@ -1326,29 +1400,16 @@ v3dv_CmdBeginRenderPass(VkCommandBuffer commandBuffer, cmd_buffer_init_render_pass_attachment_state(cmd_buffer, pRenderPassBegin); state->render_area = pRenderPassBegin->renderArea; - - /* If our render area is smaller than the current clip window we will have - * to emit a new clip window to constraint it to the render area.
- */ - uint32_t min_render_x = state->render_area.offset.x; - uint32_t min_render_y = state->render_area.offset.y; - uint32_t max_render_x = min_render_x + state->render_area.extent.width - 1; - uint32_t max_render_y = min_render_y + state->render_area.extent.height - 1; - uint32_t min_clip_x = state->clip_window.offset.x; - uint32_t min_clip_y = state->clip_window.offset.y; - uint32_t max_clip_x = min_clip_x + state->clip_window.extent.width - 1; - uint32_t max_clip_y = min_clip_y + state->clip_window.extent.height - 1; - if (min_render_x > min_clip_x || min_render_y > min_clip_y || - max_render_x < max_clip_x || max_render_y < max_clip_y) { - state->dirty |= V3DV_CMD_DIRTY_SCISSOR; - } + constraint_clip_window_to_render_area(cmd_buffer); /* Setup for first subpass */ v3dv_cmd_buffer_subpass_start(cmd_buffer, 0); } VKAPI_ATTR void VKAPI_CALL -v3dv_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents) +v3dv_CmdNextSubpass2(VkCommandBuffer commandBuffer, + const VkSubpassBeginInfo *pSubpassBeginInfo, + const VkSubpassEndInfo *pSubpassEndInfo) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); @@ -1366,10 +1427,9 @@ v3dv_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents) static void cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer) { - assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY); - assert(cmd_buffer->state.pass); assert(cmd_buffer->state.subpass_idx < cmd_buffer->state.pass->subpass_count); + assert(!cmd_buffer->state.resuming); const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; const struct v3dv_render_pass *pass = state->pass; const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx]; @@ -1384,7 +1444,7 @@ cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer) } uint32_t att_count = 0; - VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* 4 color + D/S */ + VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* +1 for D/S */ /* We only need to emit subpass clears as draw calls for color attachments * if the render area is not aligned to tile boundaries. @@ -1444,7 +1504,7 @@ cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer) "VK_ATTACHMENT_LOAD_OP_CLEAR.\n"); } else if (subpass->do_depth_clear_with_draw || subpass->do_stencil_clear_with_draw) { - perf_debug("Subpass clears DEPTH but loads STENCIL (or viceversa), " + perf_debug("Subpass clears DEPTH but loads STENCIL (or vice versa), " "falling back to vkCmdClearAttachments for " "VK_ATTACHMENT_LOAD_OP_CLEAR.\n"); } @@ -1458,23 +1518,212 @@ cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer) * So the clear is only constrained by the render area and not by pipeline * state such as scissor or viewport, these are the semantics of * vkCmdClearAttachments as well. + * + * Also: + * + * "If the render pass instance this is recorded in uses multiview, then + * baseArrayLayer must be zero and layerCount must be one." */ + assert(state->framebuffer); + uint32_t layer_count = cmd_buffer->state.pass->multiview_enabled ? 
+ 1 : state->framebuffer->layers; VkCommandBuffer _cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer); VkClearRect rect = { .rect = state->render_area, .baseArrayLayer = 0, - .layerCount = 1, + .layerCount = layer_count, }; v3dv_CmdClearAttachments(_cmd_buffer, att_count, atts, 1, &rect); } +bool +v3dv_cmd_buffer_check_needs_load(const struct v3dv_cmd_buffer_state *state, + VkImageAspectFlags aspect, + uint32_t first_subpass_idx, + VkAttachmentLoadOp load_op, + uint32_t last_subpass_idx, + VkAttachmentStoreOp store_op) +{ + /* We call this with image->vk.aspects & aspect, so 0 means the aspect we are + * testing does not exist in the image. + */ + if (!aspect) + return false; + + /* Attachment (or view) load operations apply on the first subpass that + * uses the attachment (or view); otherwise we always need to load. + */ + if (state->job->first_subpass > first_subpass_idx) + return true; + + /* If the job is continuing a subpass started in another job, we always + * need to load. + */ + if (state->job->is_subpass_continue) + return true; + + /* If the area is not aligned to tile boundaries and we are going to store, + * then we need to load to preserve contents outside the render area. + */ + if (!state->tile_aligned_render_area && + v3dv_cmd_buffer_check_needs_store(state, aspect, last_subpass_idx, + store_op)) { + return true; + } + + /* The attachment load operation must be LOAD */ + return load_op == VK_ATTACHMENT_LOAD_OP_LOAD; +} + +bool +v3dv_cmd_buffer_check_needs_store(const struct v3dv_cmd_buffer_state *state, + VkImageAspectFlags aspect, + uint32_t last_subpass_idx, + VkAttachmentStoreOp store_op) +{ + /* We call this with image->vk.aspects & aspect, so 0 means the aspect we are + * testing does not exist in the image. + */ + if (!aspect) + return false; + + /* Attachment (or view) store operations only apply on the last subpass + * where the attachment (or view) is used; in other subpasses we always + * need to store. + */ + if (state->subpass_idx < last_subpass_idx) + return true; + + /* Attachment store operations only apply on the last job we emit on the + * last subpass where the attachment is used, otherwise we always need to + * store. + */ + if (!state->job->is_subpass_finish) + return true; + + /* The attachment store operation must be STORE */ + return store_op == VK_ATTACHMENT_STORE_OP_STORE; +} + +static void +cmd_buffer_subpass_check_double_buffer_mode(struct v3dv_cmd_buffer *cmd_buffer, + bool msaa) +{ + const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + struct v3dv_job *job = cmd_buffer->state.job; + assert(job); + + job->can_use_double_buffer = false; + + /* Double-buffer can only be used if requested via V3D_DEBUG */ + if (!V3D_DBG(DOUBLE_BUFFER)) + return; + + /* Double-buffer cannot be enabled for MSAA jobs */ + if (msaa) + return; + + const struct v3dv_render_pass *pass = state->pass; + const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx]; + + /* FIXME: For now we discard multiview jobs (which have an implicit geometry + * shader) for this optimization. If we want to enable this with multiview + * we would need to check if any view (layer) in any attachment used by the + * job has loads and/or stores as we do below for regular attachments. Also, + * we would want to have a heuristic that doesn't automatically disable + * double-buffer in the presence of geometry shaders.
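+ *
+ * A hypothetical shape for such a per-view check (view_count and the
+ * per-view helpers named here do not exist in this code):
+ *
+ *    for (uint32_t view = 0; view < view_count; view++) {
+ *       if (view_needs_load(state, attachment, view))
+ *          return;
+ *       has_stores |= view_needs_store(state, attachment, view);
+ *    }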
+ */ + if (state->pass->multiview_enabled) + return; + + /* Tile loads are serialized against stores, in which case we don't get + * any benefits from enabling double-buffer and would just pay the price + * of a smaller tile size instead. Similarly, we only benefit from + * double-buffer if we have tile stores, as the point of this mode is + * to execute rendering of a new tile while we store the previous one to + * hide latency on the tile store operation. + */ + bool has_stores = false; + for (uint32_t i = 0; i < subpass->color_count; i++) { + uint32_t attachment_idx = subpass->color_attachments[i].attachment; + if (attachment_idx == VK_ATTACHMENT_UNUSED) + continue; + + const struct v3dv_render_pass_attachment *attachment = + &state->pass->attachments[attachment_idx]; + + /* FIXME: This will check 'tile_aligned_render_area' but that was + * computed with a tile size without double-buffer. That is okay + * because if the larger tile size is aligned then we know the smaller + * tile size for double-buffer will be as well. However, we might + * still benefit from doing this check with the smaller tile size + * because it can happen that the smaller size is aligned and the + * larger size is not. + */ + if (v3dv_cmd_buffer_check_needs_load(state, + VK_IMAGE_ASPECT_COLOR_BIT, + attachment->first_subpass, + attachment->desc.loadOp, + attachment->last_subpass, + attachment->desc.storeOp)) { + return; + } + + if (v3dv_cmd_buffer_check_needs_store(state, + VK_IMAGE_ASPECT_COLOR_BIT, + attachment->last_subpass, + attachment->desc.storeOp)) { + has_stores = true; + } + } + + if (subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) { + uint32_t ds_attachment_idx = subpass->ds_attachment.attachment; + const struct v3dv_render_pass_attachment *ds_attachment = + &state->pass->attachments[ds_attachment_idx]; + + const VkImageAspectFlags ds_aspects = + vk_format_aspects(ds_attachment->desc.format); + + if (v3dv_cmd_buffer_check_needs_load(state, + ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + ds_attachment->first_subpass, + ds_attachment->desc.loadOp, + ds_attachment->last_subpass, + ds_attachment->desc.storeOp)) { + return; + } + + if (v3dv_cmd_buffer_check_needs_load(state, + ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT, + ds_attachment->first_subpass, + ds_attachment->desc.stencilLoadOp, + ds_attachment->last_subpass, + ds_attachment->desc.stencilStoreOp)) { + return; + } + + has_stores |= v3dv_cmd_buffer_check_needs_store(state, + ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + ds_attachment->last_subpass, + ds_attachment->desc.storeOp); + has_stores |= v3dv_cmd_buffer_check_needs_store(state, + ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT, + ds_attachment->last_subpass, + ds_attachment->desc.stencilStoreOp); + } + + job->can_use_double_buffer = has_stores; +} + static struct v3dv_job * cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer, uint32_t subpass_idx, - enum v3dv_job_type type) + enum v3dv_job_type type, + bool is_subpass_start) { assert(type == V3DV_JOB_TYPE_GPU_CL || - type == V3DV_JOB_TYPE_GPU_CL_SECONDARY); + type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE); struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; assert(subpass_idx < state->pass->subpass_count); @@ -1488,24 +1737,33 @@ cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer, if (!job) return NULL; + if (is_subpass_start && cmd_buffer->state.resuming) { + assert(subpass_idx == 0); + job->resuming = true; + } + state->subpass_idx = subpass_idx; /* If we are starting a new job we need to setup binning. 
We only do this - * for V3DV_JOB_TYPE_GPU_CL jobs because V3DV_JOB_TYPE_GPU_CL_SECONDARY + * for V3DV_JOB_TYPE_GPU_CL jobs because V3DV_JOB_TYPE_GPU_CL_INCOMPLETE * jobs are not submitted to the GPU directly, and are instead meant to be - * branched to from other V3DV_JOB_TYPE_GPU_CL jobs. + * branched to from other V3DV_JOB_TYPE_GPU_CL jobs. With dynamic rendering, + * all resuming jobs work similarly to secondary command buffers, so we + * apply the same approach. */ if (type == V3DV_JOB_TYPE_GPU_CL && - job->first_subpass == state->subpass_idx) { + job->first_subpass == state->subpass_idx && + !job->resuming) { const struct v3dv_subpass *subpass = &state->pass->subpasses[state->subpass_idx]; const struct v3dv_framebuffer *framebuffer = state->framebuffer; - uint8_t internal_bpp; + uint8_t max_internal_bpp, total_color_bpp; bool msaa; v3dv_X(job->device, framebuffer_compute_internal_bpp_msaa) - (framebuffer, subpass, &internal_bpp, &msaa); + (framebuffer, state->attachments, subpass, + &max_internal_bpp, &total_color_bpp, &msaa); /* From the Vulkan spec: * @@ -1527,9 +1785,10 @@ cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer, framebuffer->width, framebuffer->height, layers, - true, + true, false, subpass->color_count, - internal_bpp, + max_internal_bpp, + total_color_bpp, msaa); } @@ -1545,28 +1804,29 @@ v3dv_cmd_buffer_subpass_start(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_job *job = cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx, - V3DV_JOB_TYPE_GPU_CL); + V3DV_JOB_TYPE_GPU_CL, true); if (!job) return NULL; + /* FIXME: do we need all this below for resuming jobs? */ + /* Check if our render area is aligned to tile boundaries. We have to do * this in each subpass because the subset of attachments used can change * and with that the tile size selected by the hardware can change too. */ cmd_buffer_update_tile_alignment(cmd_buffer); + /* Decide if we can use double-buffer for this subpass job */ + cmd_buffer_subpass_check_double_buffer_mode(cmd_buffer, job->frame_tiling.msaa); + + cmd_buffer_update_attachment_resolve_state(cmd_buffer); + /* If we can't use TLB clears then we need to emit draw clears for any * LOAD_OP_CLEAR attachments in this subpass now. We might also need to emit - * Depth/Stencil clears if we hit GFXH-1461. - * - * Secondary command buffers don't start subpasses (and may not even have - * framebuffer state), so we only care about this in primaries. The only - * exception could be a secondary runnning inside a subpass that needs to - * record a meta operation (with its own render pass) that relies on - * attachment load clears, but we don't have any instances of that right - * now. - */ - if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) + * Depth/Stencil clears if we hit GFXH-1461. With dynamic render passes this + * should only be called when starting the render pass, not when resuming.
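+ * A resumed job must not replay the LOAD_OP_CLEAR draws: load operations
+ * apply only once, at the start of the suspend/resume chain.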
+ */ + if (!cmd_buffer->state.resuming) cmd_buffer_emit_subpass_clears(cmd_buffer); return job; @@ -1580,13 +1840,13 @@ v3dv_cmd_buffer_subpass_resume(struct v3dv_cmd_buffer *cmd_buffer, assert(subpass_idx < cmd_buffer->state.pass->subpass_count); struct v3dv_job *job; - if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) { + if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) { job = cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx, - V3DV_JOB_TYPE_GPU_CL); + V3DV_JOB_TYPE_GPU_CL, false); } else { - assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); + assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); job = cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx, - V3DV_JOB_TYPE_GPU_CL_SECONDARY); + V3DV_JOB_TYPE_GPU_CL_INCOMPLETE, false); } if (!job) @@ -1611,7 +1871,8 @@ v3dv_cmd_buffer_subpass_finish(struct v3dv_cmd_buffer *cmd_buffer) } VKAPI_ATTR void VKAPI_CALL -v3dv_CmdEndRenderPass(VkCommandBuffer commandBuffer) +v3dv_CmdEndRenderPass2(VkCommandBuffer commandBuffer, + const VkSubpassEndInfo *pSubpassEndInfo) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); @@ -1645,7 +1906,7 @@ v3dv_EndCommandBuffer(VkCommandBuffer commandBuffer) * inside a render pass. */ if (cmd_buffer->state.job) { - assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY && + assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY && cmd_buffer->state.pass); v3dv_cmd_buffer_finish_job(cmd_buffer); } @@ -1655,26 +1916,73 @@ v3dv_EndCommandBuffer(VkCommandBuffer commandBuffer) return VK_SUCCESS; } -static void -clone_bo_list(struct v3dv_cmd_buffer *cmd_buffer, +static bool +clone_bo_list(struct v3dv_device *device, struct list_head *dst, struct list_head *src) { - assert(cmd_buffer); + assert(device); list_inithead(dst); list_for_each_entry(struct v3dv_bo, bo, src, list_link) { struct v3dv_bo *clone_bo = - vk_alloc(&cmd_buffer->device->vk.alloc, sizeof(struct v3dv_bo), 8, + vk_alloc(&device->vk.alloc, sizeof(struct v3dv_bo), 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (!clone_bo) { - v3dv_flag_oom(cmd_buffer, NULL); - return; - } + if (!clone_bo) + return false; *clone_bo = *bo; list_addtail(&clone_bo->list_link, dst); } + + return true; +} + +struct v3dv_job * +v3dv_job_clone(struct v3dv_job *job, bool skip_bcl) +{ + struct v3dv_job *clone = vk_alloc(&job->device->vk.alloc, + sizeof(struct v3dv_job), 8, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + if (!clone) + return NULL; + + /* Cloned jobs don't duplicate resources, they share their CLs with the + * original job, since they are typically read-only. The exception to this + * is dynamic rendering suspension paired with + * VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT, since in that case we need + * to patch the BCL with the resume address and for that we need to create a + * copy of the job so we avoid rewriting the resume address for another copy + * of the same job that may be running in the GPU. When we create a job for + * this use case skip_bcl is set to true and the caller will be responsible + * for creating the BCL. + */ + *clone = *job; + clone->is_clone = true; + clone->cmd_buffer = NULL; + + /* We need to regen the BO lists so that they point to the BO list in the + * cloned job. Otherwise functions like list_length() will loop forever.
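+ *
+ * Concretely, after the "*clone = *job" struct copy above, the clone's
+ * list heads still point at nodes whose links terminate at
+ * &job->bcl.bo_list (and friends), so iterating the clone's lists would
+ * never reach the clone's own sentinel node.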
+ */ + if (job->type == V3DV_JOB_TYPE_GPU_CL) { + assert(job->cmd_buffer); + struct v3dv_device *device = job->cmd_buffer->device; + + clone->bcl.job = clone; + clone->rcl.job = clone; + clone->indirect.job = clone; + + if (!skip_bcl && + !clone_bo_list(device, &clone->bcl.bo_list, &job->bcl.bo_list)) { + return NULL; + } + if (!clone_bo_list(device, &clone->rcl.bo_list, &job->rcl.bo_list)) + return NULL; + if (!clone_bo_list(device, &clone->indirect.bo_list, &job->indirect.bo_list)) + return NULL; + } + + return clone; } /* Clones a job for inclusion in the given command buffer. Note that this @@ -1687,31 +1995,29 @@ struct v3dv_job * v3dv_job_clone_in_cmd_buffer(struct v3dv_job *job, struct v3dv_cmd_buffer *cmd_buffer) { - struct v3dv_job *clone_job = vk_alloc(&job->device->vk.alloc, - sizeof(struct v3dv_job), 8, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (!clone_job) { + struct v3dv_job *clone = v3dv_job_clone(job, false); + if (!clone) { v3dv_flag_oom(cmd_buffer, NULL); return NULL; } - /* Cloned jobs don't duplicate resources! */ - *clone_job = *job; - clone_job->is_clone = true; - clone_job->cmd_buffer = cmd_buffer; - list_addtail(&clone_job->list_link, &cmd_buffer->jobs); + clone->cmd_buffer = cmd_buffer; + list_addtail(&clone->list_link, &cmd_buffer->jobs); + return clone; +} - /* We need to regen the BO lists so that they point to the BO list in the - * cloned job. Otherwise functions like list_length() will loop forever. - */ - if (job->type == V3DV_JOB_TYPE_GPU_CL) { - clone_bo_list(cmd_buffer, &clone_job->bcl.bo_list, &job->bcl.bo_list); - clone_bo_list(cmd_buffer, &clone_job->rcl.bo_list, &job->rcl.bo_list); - clone_bo_list(cmd_buffer, &clone_job->indirect.bo_list, - &job->indirect.bo_list); - } +void +v3dv_cmd_buffer_merge_barrier_state(struct v3dv_barrier_state *dst, + struct v3dv_barrier_state *src) +{ + dst->dst_mask |= src->dst_mask; - return clone_job; + dst->src_mask_graphics |= src->src_mask_graphics; + dst->src_mask_compute |= src->src_mask_compute; + dst->src_mask_transfer |= src->src_mask_transfer; + + dst->bcl_buffer_access |= src->bcl_buffer_access; + dst->bcl_image_access |= src->bcl_image_access; } static void @@ -1719,8 +2025,7 @@ cmd_buffer_execute_outside_pass(struct v3dv_cmd_buffer *primary, uint32_t cmd_buffer_count, const VkCommandBuffer *cmd_buffers) { - bool pending_barrier = false; - bool pending_bcl_barrier = false; + struct v3dv_barrier_state pending_barrier = { 0 }; for (uint32_t i = 0; i < cmd_buffer_count; i++) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, secondary, cmd_buffers[i]); @@ -1743,17 +2048,23 @@ cmd_buffer_execute_outside_pass(struct v3dv_cmd_buffer *primary, list_for_each_entry(struct v3dv_job, secondary_job, &secondary->jobs, list_link) { /* These can only happen inside a render pass */ - assert(secondary_job->type != V3DV_JOB_TYPE_GPU_CL_SECONDARY); + assert(secondary_job->type != V3DV_JOB_TYPE_GPU_CL_INCOMPLETE); struct v3dv_job *job = v3dv_job_clone_in_cmd_buffer(secondary_job, primary); if (!job) return; - if (pending_barrier) { - job->serialize = true; - if (pending_bcl_barrier) + if (pending_barrier.dst_mask) { + /* FIXME: do the same as we do for primaries and only choose the + * relevant src masks.
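+ *
+ * I.e., roughly what cmd_buffer_serialize_job_if_needed() does:
+ *
+ *    if (job->type == V3DV_JOB_TYPE_GPU_CSD)
+ *       job->serialize = pending_barrier.src_mask_compute;
+ *    else if (job->is_transfer)
+ *       job->serialize = pending_barrier.src_mask_transfer;
+ *    else
+ *       job->serialize = pending_barrier.src_mask_graphics;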
+ */ + job->serialize = pending_barrier.src_mask_graphics | + pending_barrier.src_mask_transfer | + pending_barrier.src_mask_compute; + if (pending_barrier.bcl_buffer_access || + pending_barrier.bcl_image_access) { job->needs_bcl_sync = true; - pending_barrier = false; - pending_bcl_barrier = false; + } + memset(&pending_barrier, 0, sizeof(pending_barrier)); } } @@ -1761,14 +2072,15 @@ cmd_buffer_execute_outside_pass(struct v3dv_cmd_buffer *primary, * barrier state consumed with whatever comes after it (first job in * the next secondary or the primary, if this was the last secondary). */ - assert(secondary->state.has_barrier || !secondary->state.has_bcl_barrier); - pending_barrier = secondary->state.has_barrier; - pending_bcl_barrier = secondary->state.has_bcl_barrier; + assert(secondary->state.barrier.dst_mask || + (!secondary->state.barrier.bcl_buffer_access && + !secondary->state.barrier.bcl_image_access)); + pending_barrier = secondary->state.barrier; } - if (pending_barrier) { - primary->state.has_barrier = true; - primary->state.has_bcl_barrier |= pending_bcl_barrier; + if (pending_barrier.dst_mask) { + v3dv_cmd_buffer_merge_barrier_state(&primary->state.barrier, + &pending_barrier); } } @@ -1788,100 +2100,36 @@ v3dv_CmdExecuteCommands(VkCommandBuffer commandBuffer, } } -/* This goes though the list of possible dynamic states in the pipeline and, - * for those that are not configured as dynamic, copies relevant state into - * the command buffer. - */ static void -cmd_buffer_bind_pipeline_static_state(struct v3dv_cmd_buffer *cmd_buffer, - const struct v3dv_dynamic_state *src) -{ - struct v3dv_dynamic_state *dest = &cmd_buffer->state.dynamic; - uint32_t dynamic_mask = src->mask; - uint32_t dirty = 0; - - if (!(dynamic_mask & V3DV_DYNAMIC_VIEWPORT)) { - dest->viewport.count = src->viewport.count; - if (memcmp(&dest->viewport.viewports, &src->viewport.viewports, - src->viewport.count * sizeof(VkViewport))) { - typed_memcpy(dest->viewport.viewports, - src->viewport.viewports, - src->viewport.count); - typed_memcpy(dest->viewport.scale, src->viewport.scale, - src->viewport.count); - typed_memcpy(dest->viewport.translate, src->viewport.translate, - src->viewport.count); - dirty |= V3DV_CMD_DIRTY_VIEWPORT; - } - } - - if (!(dynamic_mask & V3DV_DYNAMIC_SCISSOR)) { - dest->scissor.count = src->scissor.count; - if (memcmp(&dest->scissor.scissors, &src->scissor.scissors, - src->scissor.count * sizeof(VkRect2D))) { - typed_memcpy(dest->scissor.scissors, - src->scissor.scissors, src->scissor.count); - dirty |= V3DV_CMD_DIRTY_SCISSOR; - } - } - - if (!(dynamic_mask & V3DV_DYNAMIC_STENCIL_COMPARE_MASK)) { - if (memcmp(&dest->stencil_compare_mask, &src->stencil_compare_mask, - sizeof(src->stencil_compare_mask))) { - dest->stencil_compare_mask = src->stencil_compare_mask; - dirty |= V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK; - } - } - - if (!(dynamic_mask & V3DV_DYNAMIC_STENCIL_WRITE_MASK)) { - if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask, - sizeof(src->stencil_write_mask))) { - dest->stencil_write_mask = src->stencil_write_mask; - dirty |= V3DV_CMD_DIRTY_STENCIL_WRITE_MASK; - } - } - - if (!(dynamic_mask & V3DV_DYNAMIC_STENCIL_REFERENCE)) { - if (memcmp(&dest->stencil_reference, &src->stencil_reference, - sizeof(src->stencil_reference))) { - dest->stencil_reference = src->stencil_reference; - dirty |= V3DV_CMD_DIRTY_STENCIL_REFERENCE; - } - } - - if (!(dynamic_mask & V3DV_DYNAMIC_BLEND_CONSTANTS)) { - if (memcmp(dest->blend_constants, src->blend_constants, - sizeof(src->blend_constants))) 
{ - memcpy(dest->blend_constants, src->blend_constants, - sizeof(src->blend_constants)); - dirty |= V3DV_CMD_DIRTY_BLEND_CONSTANTS; - } - } - - if (!(dynamic_mask & V3DV_DYNAMIC_DEPTH_BIAS)) { - if (memcmp(&dest->depth_bias, &src->depth_bias, - sizeof(src->depth_bias))) { - memcpy(&dest->depth_bias, &src->depth_bias, sizeof(src->depth_bias)); - dirty |= V3DV_CMD_DIRTY_DEPTH_BIAS; - } - } - - if (!(dynamic_mask & V3DV_DYNAMIC_LINE_WIDTH)) { - if (dest->line_width != src->line_width) { - dest->line_width = src->line_width; - dirty |= V3DV_CMD_DIRTY_LINE_WIDTH; - } +cmd_buffer_copy_private_dynamic_state(struct v3dv_dynamic_state *dst, + struct v3dv_dynamic_state *src, + struct vk_dynamic_graphics_state *src_dyn) +{ + if (BITSET_TEST(src_dyn->set, MESA_VK_DYNAMIC_VP_VIEWPORTS)) { + typed_memcpy(dst->viewport.scale, src->viewport.scale, + MAX_VIEWPORTS); + typed_memcpy(dst->viewport.translate, src->viewport.translate, + MAX_VIEWPORTS); } + if (BITSET_TEST(src_dyn->set, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES)) + dst->color_write_enable = src->color_write_enable; +} - if (!(dynamic_mask & V3DV_DYNAMIC_COLOR_WRITE_ENABLE)) { - if (dest->color_write_enable != src->color_write_enable) { - dest->color_write_enable = src->color_write_enable; - dirty |= V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE; - } - } +/* This function copies relevant static state from the pipeline to the command + * buffer state. + * + * Notice the Vulkan runtime uses the term 'dynamic' to refer to all state + * that *could* be dynamic, even if it is not dynamic for a particular + * pipeline, so the terminology used in the runtime may be a bit misleading. + */ +static void +cmd_buffer_bind_pipeline_static_state(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_pipeline *pipeline) +{ + vk_cmd_set_dynamic_graphics_state(&cmd_buffer->vk, &pipeline->dynamic_graphics_state); + cmd_buffer_copy_private_dynamic_state(&cmd_buffer->state.dynamic, &pipeline->dynamic, + &pipeline->dynamic_graphics_state); - cmd_buffer->state.dynamic.mask = dynamic_mask; - cmd_buffer->state.dirty |= dirty; } static void @@ -1889,13 +2137,17 @@ bind_graphics_pipeline(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_pipeline *pipeline) { assert(pipeline && !(pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT)); + + /* We need to unconditionally bind the pipeline static state, as the state + * could have changed (through calls to vkCmdSetXXX) between bindings of + * the same pipeline. + */ + cmd_buffer_bind_pipeline_static_state(cmd_buffer, pipeline); + if (cmd_buffer->state.gfx.pipeline == pipeline) return; cmd_buffer->state.gfx.pipeline = pipeline; - - cmd_buffer_bind_pipeline_static_state(cmd_buffer, &pipeline->dynamic_state); - cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PIPELINE; } @@ -1935,39 +2187,66 @@ v3dv_CmdBindPipeline(VkCommandBuffer commandBuffer, } } -/* FIXME: C&P from radv. tu has similar code. Perhaps common place? */ +/* Considers the pipeline's negative_one_to_one state and applies it to the + * current viewport transform if needed to produce the resulting Z translate + * and scale parameters. 
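+ *
+ * With the default zero-to-one convention the cached transform gives
+ * z_fb = n + (f - n) * z_ndc for z_ndc in [0, 1]. For a [-1, 1] clip
+ * volume we instead need z_fb = (n + f) / 2 + ((f - n) / 2) * z_ndc so
+ * that -1 still maps to minDepth and 1 to maxDepth; hence the halving of
+ * the scale and the re-centering of the translate below.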
+ */ void -v3dv_viewport_compute_xform(const VkViewport *viewport, - float scale[3], - float translate[3]) -{ - float x = viewport->x; - float y = viewport->y; - float half_width = 0.5f * viewport->width; - float half_height = 0.5f * viewport->height; - double n = viewport->minDepth; - double f = viewport->maxDepth; - - scale[0] = half_width; - translate[0] = half_width + x; - scale[1] = half_height; - translate[1] = half_height + y; - - scale[2] = (f - n); - translate[2] = n; - - /* It seems that if the scale is small enough the hardware won't clip - * correctly so we work around this my choosing the smallest scale that - * seems to work. - * - * This case is exercised by CTS: - * dEQP-VK.draw.inverted_depth_ranges.nodepthclamp_deltazero +v3dv_cmd_buffer_state_get_viewport_z_xform(struct v3dv_cmd_buffer *cmd_buffer, + uint32_t vp_idx, + float *translate_z, float *scale_z) +{ + const struct v3dv_viewport_state *vp_state = &cmd_buffer->state.dynamic.viewport; + const struct vk_viewport_state *vk_vp_state = &cmd_buffer->vk.dynamic_graphics_state.vp; + + float t = vp_state->translate[vp_idx][2]; + float s = vp_state->scale[vp_idx][2]; + + assert(cmd_buffer->state.gfx.pipeline); + if (cmd_buffer->state.gfx.pipeline->negative_one_to_one) { + t = (t + vk_vp_state->viewports[vp_idx].maxDepth) * 0.5f; + s *= 0.5f; + } + + if (translate_z) + *translate_z = t; + + if (scale_z) + *scale_z = s; +} + +VKAPI_ATTR void VKAPI_CALL +v3dv_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer, + uint32_t attachmentCount, + const VkBool32 *pColorWriteEnables) +{ + V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); + struct v3dv_dynamic_state *v3dv_dyn = &cmd_buffer->state.dynamic; + struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state; + uint32_t color_write_enable = 0; + + /* The Vulkan runtime computes color_write_enable as an 8-bit bitset, setting a + * bit per attachment. But when emitting, it is combined with the + * color_write_mask, which is stored as a 32-bit mask (one bit per channel, + * per attachment). So we store the color_write_enable as a 32-bit mask + * ourselves. */ - const float min_abs_scale = 0.000009f; - if (fabs(scale[2]) < min_abs_scale) - scale[2] = min_abs_scale * (scale[2] < 0 ? -1.0f : 1.0f); + for (uint32_t i = 0; i < attachmentCount; i++) + color_write_enable |= pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0; + + if (v3dv_dyn->color_write_enable == color_write_enable) + return; + + v3dv_dyn->color_write_enable = color_write_enable; + BITSET_SET(dyn->set, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES); + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES); } +/* We keep a custom CmdSetViewport because we want to cache the outcome of + * viewport_compute_xform, and because we need to set the viewport count. This + * is especially relevant to our case because we are pushing/popping the + * dynamic state as part of the meta operations.
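+ *
+ * As an example of what gets cached: with the zero-to-one depth convention,
+ * a viewport of (x=0, y=0, w=800, h=600, minDepth=0, maxDepth=1) yields
+ * scale = (400, 300, 1) and translate = (400, 300, 0), which emit_scissor()
+ * and the Z-transform helper above consume without recomputing them.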
+ */ VKAPI_ATTR void VKAPI_CALL v3dv_CmdSetViewport(VkCommandBuffer commandBuffer, uint32_t firstViewport, @@ -1975,63 +2254,55 @@ v3dv_CmdSetViewport(VkCommandBuffer commandBuffer, const VkViewport *pViewports) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; - const uint32_t total_count = firstViewport + viewportCount; + struct v3dv_dynamic_state *v3dv_dyn = &cmd_buffer->state.dynamic; + struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state; + const uint32_t total_count = firstViewport + viewportCount; assert(firstViewport < MAX_VIEWPORTS); assert(total_count >= 1 && total_count <= MAX_VIEWPORTS); - if (state->dynamic.viewport.count < total_count) - state->dynamic.viewport.count = total_count; - - if (!memcmp(state->dynamic.viewport.viewports + firstViewport, - pViewports, viewportCount * sizeof(*pViewports))) { - return; - } - - memcpy(state->dynamic.viewport.viewports + firstViewport, pViewports, - viewportCount * sizeof(*pViewports)); + vk_common_CmdSetViewportWithCount(commandBuffer, + total_count, + pViewports); for (uint32_t i = firstViewport; i < total_count; i++) { - v3dv_viewport_compute_xform(&state->dynamic.viewport.viewports[i], - state->dynamic.viewport.scale[i], - state->dynamic.viewport.translate[i]); + v3dv_X(cmd_buffer->device, viewport_compute_xform) + (&dyn->vp.viewports[i], v3dv_dyn->viewport.scale[i], + v3dv_dyn->viewport.translate[i]); } +} - cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEWPORT; +VKAPI_ATTR void VKAPI_CALL +v3dv_CmdSetViewportWithCount(VkCommandBuffer commandBuffer, + uint32_t viewportCount, + const VkViewport *pViewports) +{ + v3dv_CmdSetViewport(commandBuffer, 0, viewportCount, pViewports); } +/* We keep a custom CmdSetScissor because we need to set the scissor + * count. This is especially relevant to our case because we are + * pushing/popping the dynamic state as part of the meta operations.
+ */ VKAPI_ATTR void VKAPI_CALL v3dv_CmdSetScissor(VkCommandBuffer commandBuffer, uint32_t firstScissor, uint32_t scissorCount, const VkRect2D *pScissors) { - V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; - assert(firstScissor < MAX_SCISSORS); assert(firstScissor + scissorCount >= 1 && firstScissor + scissorCount <= MAX_SCISSORS); - if (state->dynamic.scissor.count < firstScissor + scissorCount) - state->dynamic.scissor.count = firstScissor + scissorCount; - - if (!memcmp(state->dynamic.scissor.scissors + firstScissor, - pScissors, scissorCount * sizeof(*pScissors))) { - return; - } - - memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors, - scissorCount * sizeof(*pScissors)); - - cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_SCISSOR; + vk_common_CmdSetScissorWithCount(commandBuffer, + firstScissor + scissorCount, + pScissors); } static void emit_scissor(struct v3dv_cmd_buffer *cmd_buffer) { - if (cmd_buffer->state.dynamic.viewport.count == 0) + if (cmd_buffer->vk.dynamic_graphics_state.vp.viewport_count == 0) return; struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic; @@ -2041,11 +2312,14 @@ emit_scissor(struct v3dv_cmd_buffer *cmd_buffer) */ float *vptranslate = dynamic->viewport.translate[0]; float *vpscale = dynamic->viewport.scale[0]; + assert(vpscale[0] >= 0); - float vp_minx = -fabsf(vpscale[0]) + vptranslate[0]; - float vp_maxx = fabsf(vpscale[0]) + vptranslate[0]; - float vp_miny = -fabsf(vpscale[1]) + vptranslate[1]; - float vp_maxy = fabsf(vpscale[1]) + vptranslate[1]; + float vp_minx = vptranslate[0] - vpscale[0]; + float vp_maxx = vptranslate[0] + vpscale[0]; + + /* With KHR_maintenance1 viewport may have negative Y */ + float vp_miny = vptranslate[1] - fabsf(vpscale[1]); + float vp_maxy = vptranslate[1] + fabsf(vpscale[1]); /* Quoting from v3dx_emit: * "Clip to the scissor if it's enabled, but still clip to the @@ -2074,18 +2348,15 @@ emit_scissor(struct v3dv_cmd_buffer *cmd_buffer) maxy = MIN2(vp_maxy, cmd_buffer->state.render_area.offset.y + cmd_buffer->state.render_area.extent.height); - minx = vp_minx; - miny = vp_miny; - maxx = vp_maxx; - maxy = vp_maxy; - /* Clip against user provided scissor if needed. * * FIXME: right now we only allow one scissor. Below would need to be * updated if we support more */ - if (dynamic->scissor.count > 0) { - VkRect2D *scissor = &dynamic->scissor.scissors[0]; + struct vk_dynamic_graphics_state *vk_dyn = + &cmd_buffer->vk.dynamic_graphics_state; + if (vk_dyn->vp.scissor_count > 0) { + VkRect2D *scissor = &vk_dyn->vp.scissors[0]; minx = MAX2(minx, scissor->offset.x); miny = MAX2(miny, scissor->offset.y); maxx = MIN2(maxx, scissor->offset.x + scissor->extent.width); @@ -2108,12 +2379,11 @@ emit_scissor(struct v3dv_cmd_buffer *cmd_buffer) v3dv_X(cmd_buffer->device, job_emit_clip_window) (cmd_buffer->state.job, &cmd_buffer->state.clip_window); - cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_SCISSOR; + BITSET_CLEAR(vk_dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS); } -static void -update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer, - uint32_t dirty_uniform_state) +static bool +update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer) { /* We need to update uniform streams if any piece of state that is passed * to the shader as a uniform may have changed. 
@@ -2121,15 +2391,29 @@ update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer, * If only descriptor sets are dirty then we can safely ignore updates * for shader stages that don't access descriptors. */ - struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; assert(pipeline); + uint32_t dirty = cmd_buffer->state.dirty; + struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + + const bool dirty_uniform_state = + (dirty & (V3DV_CMD_DIRTY_PIPELINE | + V3DV_CMD_DIRTY_PUSH_CONSTANTS | + V3DV_CMD_DIRTY_DESCRIPTOR_SETS | + V3DV_CMD_DIRTY_VIEW_INDEX | + V3DV_CMD_DIRTY_DRAW_ID)) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS); - const bool has_new_pipeline = dirty_uniform_state & V3DV_CMD_DIRTY_PIPELINE; - const bool has_new_viewport = dirty_uniform_state & V3DV_CMD_DIRTY_VIEWPORT; - const bool has_new_push_constants = dirty_uniform_state & V3DV_CMD_DIRTY_PUSH_CONSTANTS; - const bool has_new_descriptors = dirty_uniform_state & V3DV_CMD_DIRTY_DESCRIPTOR_SETS; - const bool has_new_view_index = dirty_uniform_state & V3DV_CMD_DIRTY_VIEW_INDEX; + if (!dirty_uniform_state) + return false; + + const bool has_new_pipeline = dirty & V3DV_CMD_DIRTY_PIPELINE; + const bool has_new_viewport = BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS); + const bool has_new_push_constants = dirty & V3DV_CMD_DIRTY_PUSH_CONSTANTS; + const bool has_new_descriptors = dirty & V3DV_CMD_DIRTY_DESCRIPTOR_SETS; + const bool has_new_view_index = dirty & V3DV_CMD_DIRTY_VIEW_INDEX; + const bool has_new_draw_id = dirty & V3DV_CMD_DIRTY_DRAW_ID; /* VK_SHADER_STAGE_FRAGMENT_BIT */ const bool has_new_descriptors_fs = @@ -2143,8 +2427,7 @@ update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer, const bool needs_fs_update = has_new_pipeline || has_new_view_index || has_new_push_constants_fs || - has_new_descriptors_fs || - has_new_view_index; + has_new_descriptors_fs; if (needs_fs_update) { struct v3dv_shader_variant *fs_variant = @@ -2198,6 +2481,7 @@ update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer, const bool needs_vs_update = has_new_viewport || has_new_view_index || + has_new_draw_id || has_new_pipeline || has_new_push_constants_vs || has_new_descriptors_vs; @@ -2217,6 +2501,9 @@ update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer, } cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_VIEW_INDEX; + cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DRAW_ID; + + return true; } /* This stores command buffer state that we might be about to stomp for @@ -2228,32 +2515,43 @@ v3dv_cmd_buffer_meta_state_push(struct v3dv_cmd_buffer *cmd_buffer, { struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + /* Attachment state. + * + * We store this state even if we are not currently in a subpass + * (subpass_idx != -1) because we may get here to implement subpass + * resolves via vkCmdResolveImage from + * cmd_buffer_subpass_handle_pending_resolves. In that scenario we pretend + * we are no longer in a subpass because Vulkan disallows image resolves + * via vkCmdResolveImage during subpasses, but we still need to preserve + * attachment state because we may have more subpasses to go through + * after processing resolves in the current subpass.
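+ *
+ * The usual shape of a meta operation around this (a sketch):
+ *
+ *    v3dv_cmd_buffer_meta_state_push(cmd_buffer, ...);
+ *    ... record the meta job(s) ...
+ *    v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false /* needs_subpass_resume */);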
+ */ + const uint32_t attachment_state_item_size = + sizeof(struct v3dv_cmd_buffer_attachment_state); + const uint32_t attachment_state_total_size = + attachment_state_item_size * state->attachment_alloc_count; + if (state->meta.attachment_alloc_count < state->attachment_alloc_count) { + if (state->meta.attachment_alloc_count > 0) + vk_free(&cmd_buffer->device->vk.alloc, state->meta.attachments); + + state->meta.attachments = vk_zalloc(&cmd_buffer->device->vk.alloc, + attachment_state_total_size, 8, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + if (!state->meta.attachments) { + v3dv_flag_oom(cmd_buffer, NULL); + return; + } + state->meta.attachment_alloc_count = state->attachment_alloc_count; + } + state->meta.attachment_count = state->attachment_alloc_count; + memcpy(state->meta.attachments, state->attachments, + attachment_state_total_size); + if (state->subpass_idx != -1) { state->meta.subpass_idx = state->subpass_idx; state->meta.framebuffer = v3dv_framebuffer_to_handle(state->framebuffer); state->meta.pass = v3dv_render_pass_to_handle(state->pass); - const uint32_t attachment_state_item_size = - sizeof(struct v3dv_cmd_buffer_attachment_state); - const uint32_t attachment_state_total_size = - attachment_state_item_size * state->attachment_alloc_count; - if (state->meta.attachment_alloc_count < state->attachment_alloc_count) { - if (state->meta.attachment_alloc_count > 0) - vk_free(&cmd_buffer->device->vk.alloc, state->meta.attachments); - - state->meta.attachments = vk_zalloc(&cmd_buffer->device->vk.alloc, - attachment_state_total_size, 8, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (!state->meta.attachments) { - v3dv_flag_oom(cmd_buffer, NULL); - return; - } - state->meta.attachment_alloc_count = state->attachment_alloc_count; - } - state->meta.attachment_count = state->attachment_alloc_count; - memcpy(state->meta.attachments, state->attachments, - attachment_state_total_size); - state->meta.tile_aligned_render_area = state->tile_aligned_render_area; memcpy(&state->meta.render_area, &state->render_area, sizeof(VkRect2D)); } @@ -2262,6 +2560,8 @@ v3dv_cmd_buffer_meta_state_push(struct v3dv_cmd_buffer *cmd_buffer, * account the graphics pipeline, and the graphics state */ state->meta.gfx.pipeline = state->gfx.pipeline; + vk_dynamic_graphics_state_copy(&state->meta.dynamic_graphics_state, + &cmd_buffer->vk.dynamic_graphics_state); memcpy(&state->meta.dynamic, &state->dynamic, sizeof(state->dynamic)); struct v3dv_descriptor_state *gfx_descriptor_state = @@ -2277,35 +2577,35 @@ v3dv_cmd_buffer_meta_state_push(struct v3dv_cmd_buffer *cmd_buffer, state->meta.has_descriptor_state = false; } - /* FIXME: if we keep track of wether we have bound any push constant state - * at all we could restruct this only to cases where it is actually - * necessary. 
- */ - memcpy(state->meta.push_constants, cmd_buffer->push_constants_data, - sizeof(state->meta.push_constants)); + if (cmd_buffer->state.push_constants_size > 0) { + state->meta.push_constants_size = cmd_buffer->state.push_constants_size; + memcpy(state->meta.push_constants, cmd_buffer->state.push_constants_data, + cmd_buffer->state.push_constants_size); + cmd_buffer->state.push_constants_size = 0; + } } /* This restores command buffer state after a meta operation */ void v3dv_cmd_buffer_meta_state_pop(struct v3dv_cmd_buffer *cmd_buffer, - uint32_t dirty_dynamic_state, bool needs_subpass_resume) { struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + /* Attachment state */ + assert(state->meta.attachment_count <= state->attachment_alloc_count); + const uint32_t attachment_state_item_size = + sizeof(struct v3dv_cmd_buffer_attachment_state); + const uint32_t attachment_state_total_size = + attachment_state_item_size * state->meta.attachment_count; + memcpy(state->attachments, state->meta.attachments, + attachment_state_total_size); + if (state->meta.subpass_idx != -1) { state->pass = v3dv_render_pass_from_handle(state->meta.pass); state->framebuffer = v3dv_framebuffer_from_handle(state->meta.framebuffer); - assert(state->meta.attachment_count <= state->attachment_alloc_count); - const uint32_t attachment_state_item_size = - sizeof(struct v3dv_cmd_buffer_attachment_state); - const uint32_t attachment_state_total_size = - attachment_state_item_size * state->meta.attachment_count; - memcpy(state->attachments, state->meta.attachments, - attachment_state_total_size); - state->tile_aligned_render_area = state->meta.tile_aligned_render_area; memcpy(&state->render_area, &state->meta.render_area, sizeof(VkRect2D)); @@ -2331,10 +2631,11 @@ v3dv_cmd_buffer_meta_state_pop(struct v3dv_cmd_buffer *cmd_buffer, state->gfx.pipeline = NULL; } - if (dirty_dynamic_state) { - memcpy(&state->dynamic, &state->meta.dynamic, sizeof(state->dynamic)); - state->dirty |= dirty_dynamic_state; - } + /* Restore dynamic state */ + vk_dynamic_graphics_state_copy(&cmd_buffer->vk.dynamic_graphics_state, + &state->meta.dynamic_graphics_state); + memcpy(&state->dynamic, &state->meta.dynamic, sizeof(state->dynamic)); + state->dirty = ~0; if (state->meta.has_descriptor_state) { if (state->meta.gfx.descriptor_state.valid != 0) { @@ -2345,14 +2646,23 @@ v3dv_cmd_buffer_meta_state_pop(struct v3dv_cmd_buffer *cmd_buffer, } } - memcpy(cmd_buffer->push_constants_data, state->meta.push_constants, - sizeof(state->meta.push_constants)); + /* We only need to restore push constant data if we had any data in the + * original command buffer and the meta operation wrote new push constant + * data. + */ + if (state->meta.push_constants_size > 0 && + cmd_buffer->state.push_constants_size > 0) { + memcpy(cmd_buffer->state.push_constants_data, state->meta.push_constants, + state->meta.push_constants_size); + } + cmd_buffer->state.push_constants_size = state->meta.push_constants_size; state->meta.gfx.pipeline = NULL; state->meta.framebuffer = VK_NULL_HANDLE; state->meta.pass = VK_NULL_HANDLE; state->meta.subpass_idx = -1; state->meta.has_descriptor_state = false; + state->meta.push_constants_size = 0; } static struct v3dv_job * @@ -2399,7 +2709,7 @@ cmd_buffer_pre_draw_split_job(struct v3dv_cmd_buffer *cmd_buffer) * in rasterization." * * We need to enable MSAA in the TILE_BINNING_MODE_CFG packet, which we - * emit when we start a new frame at the begining of a subpass. 
At that point, + * emit when we start a new frame at the beginning of a subpass. At that point, * if the framebuffer doesn't have any attachments we won't enable MSAA and * the job won't be valid in the scenario described by the spec. * @@ -2434,7 +2744,7 @@ cmd_buffer_restart_job_for_msaa_if_needed(struct v3dv_cmd_buffer *cmd_buffer) * draw calls in them, and then using that info to decide if we need to * restart the primary job into which they are being recorded. */ - if (cmd_buffer->level != VK_COMMAND_BUFFER_LEVEL_PRIMARY) + if (cmd_buffer->vk.level != VK_COMMAND_BUFFER_LEVEL_PRIMARY) return; /* Drop the current job and restart it with MSAA enabled */ @@ -2457,16 +2767,185 @@ cmd_buffer_restart_job_for_msaa_if_needed(struct v3dv_cmd_buffer *cmd_buffer) old_job->frame_tiling.width, old_job->frame_tiling.height, old_job->frame_tiling.layers, - true, + true, false, old_job->frame_tiling.render_target_count, old_job->frame_tiling.internal_bpp, + old_job->frame_tiling.total_color_bpp, true /* msaa */); v3dv_job_destroy(old_job); } +static bool +cmd_buffer_binning_sync_required(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_pipeline *pipeline, + bool indexed, bool indirect) +{ + const struct v3dv_descriptor_maps *vs_bin_maps = + pipeline->shared_data->maps[BROADCOM_SHADER_VERTEX_BIN]; + + const struct v3dv_descriptor_maps *gs_bin_maps = + pipeline->shared_data->maps[BROADCOM_SHADER_GEOMETRY_BIN]; + + VkAccessFlags buffer_access = + cmd_buffer->state.barrier.bcl_buffer_access; + if (buffer_access) { + /* Index buffer read */ + if (indexed && (buffer_access & (VK_ACCESS_2_INDEX_READ_BIT | + VK_ACCESS_2_MEMORY_READ_BIT))) { + return true; + } + + /* Indirect buffer read */ + if (indirect && (buffer_access & (VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT | + VK_ACCESS_2_MEMORY_READ_BIT))) { + return true; + } + + /* Attribute read */ + if (buffer_access & (VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT | + VK_ACCESS_2_MEMORY_READ_BIT)) { + const struct v3d_vs_prog_data *prog_data = + pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]->prog_data.vs; + + for (int i = 0; i < ARRAY_SIZE(prog_data->vattr_sizes); i++) { + if (prog_data->vattr_sizes[i] > 0) + return true; + } + } + + /* UBO / SSBO read */ + if (buffer_access & (VK_ACCESS_2_UNIFORM_READ_BIT | + VK_ACCESS_2_SHADER_READ_BIT | + VK_ACCESS_2_MEMORY_READ_BIT | + VK_ACCESS_2_SHADER_STORAGE_READ_BIT)) { + + if (vs_bin_maps->ubo_map.num_desc > 0 || + vs_bin_maps->ssbo_map.num_desc > 0) { + return true; + } + + if (gs_bin_maps && (gs_bin_maps->ubo_map.num_desc > 0 || + gs_bin_maps->ssbo_map.num_desc > 0)) { + return true; + } + } + + /* SSBO write */ + if (buffer_access & (VK_ACCESS_2_SHADER_WRITE_BIT | + VK_ACCESS_2_MEMORY_WRITE_BIT | + VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT)) { + if (vs_bin_maps->ssbo_map.num_desc > 0) + return true; + + if (gs_bin_maps && gs_bin_maps->ssbo_map.num_desc > 0) + return true; + } + + /* Texel Buffer read */ + if (buffer_access & (VK_ACCESS_2_SHADER_SAMPLED_READ_BIT | + VK_ACCESS_2_MEMORY_READ_BIT)) { + if (vs_bin_maps->texture_map.num_desc > 0) + return true; + + if (gs_bin_maps && gs_bin_maps->texture_map.num_desc > 0) + return true; + } + } + + VkAccessFlags image_access = + cmd_buffer->state.barrier.bcl_image_access; + if (image_access) { + /* Image load / store */ + if (image_access & (VK_ACCESS_2_SHADER_READ_BIT | + VK_ACCESS_2_SHADER_WRITE_BIT | + VK_ACCESS_2_SHADER_SAMPLED_READ_BIT | + VK_ACCESS_2_SHADER_STORAGE_READ_BIT | + VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT | + VK_ACCESS_2_MEMORY_READ_BIT | + 
VK_ACCESS_2_MEMORY_WRITE_BIT)) { + if (vs_bin_maps->texture_map.num_desc > 0 || + vs_bin_maps->sampler_map.num_desc > 0) { + return true; + } + + if (gs_bin_maps && (gs_bin_maps->texture_map.num_desc > 0 || + gs_bin_maps->sampler_map.num_desc > 0)) { + return true; + } + } + } + + return false; +} + void -v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer) +v3dv_cmd_buffer_consume_bcl_sync(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_job *job) +{ + job->needs_bcl_sync = true; + cmd_buffer->state.barrier.bcl_buffer_access = 0; + cmd_buffer->state.barrier.bcl_image_access = 0; +} + +static inline uint32_t +compute_prog_score(struct v3dv_shader_variant *vs) +{ + const uint32_t inst_count = vs->qpu_insts_size / sizeof(uint64_t); + const uint32_t tmu_count = vs->prog_data.base->tmu_count + + vs->prog_data.base->tmu_spills + + vs->prog_data.base->tmu_fills; + return inst_count + 4 * tmu_count; +} + +static void +job_update_double_buffer_score(struct v3dv_job *job, + struct v3dv_pipeline *pipeline, + uint32_t vertex_count, + VkExtent2D *render_area) +{ + /* FIXME: assume anything with GS workloads is too expensive */ + struct v3dv_shader_variant *gs_bin = + pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN]; + if (gs_bin) { + job->can_use_double_buffer = false; + return; + } + + /* Keep track of vertex processing: too much geometry processing would not + * be good for double-buffer. + */ + struct v3dv_shader_variant *vs_bin = + pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]; + assert(vs_bin); + uint32_t geom_score = vertex_count * compute_prog_score(vs_bin); + + struct v3dv_shader_variant *vs = + pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]; + assert(vs); + uint32_t vs_score = vertex_count * compute_prog_score(vs); + geom_score += vs_score; + + job->double_buffer_score.geom += geom_score; + + /* Compute pixel rendering cost. + * + * We estimate that on average a draw would render 0.2% of the pixels in + * the render area. That would be a 64x64 region in a 1920x1080 area. + */ + struct v3dv_shader_variant *fs = + pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]; + assert(fs); + uint32_t pixel_count = 0.002f * render_area->width * render_area->height; + uint32_t render_score = vs_score + pixel_count * compute_prog_score(fs); + + job->double_buffer_score.render += render_score; +} + +void +v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer, + bool indexed, bool indirect, + uint32_t vertex_count) { assert(cmd_buffer->state.gfx.pipeline); assert(!(cmd_buffer->state.gfx.pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT)); @@ -2489,6 +2968,23 @@ v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer) struct v3dv_job *job = cmd_buffer_pre_draw_split_job(cmd_buffer); job->draw_count++; + /* Track VK_KHR_buffer_device_address usage in the job */ + struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + job->uses_buffer_device_address |= pipeline->uses_buffer_device_address; + + /* If this job is serialized (has consumed a barrier) then check if we need + * to sync at the binning stage by testing if the binning shaders involved + * with the draw call require access to external resources. 
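
cmd_buffer_binning_sync_required() above reduces to one question: does the pending barrier's access mask cover any resource class that the binning-mode vertex or geometry shaders of this pipeline can actually reach? A simplified sketch of that decision, with hypothetical stand-in flags instead of the real VK_ACCESS_2_* masks and descriptor maps:

   #include <stdbool.h>
   #include <stdint.h>

   /* Hypothetical resource classes a barrier may protect. */
   #define ACCESS_INDEX      (1u << 0)
   #define ACCESS_INDIRECT   (1u << 1)
   #define ACCESS_ATTRIBUTE  (1u << 2)
   #define ACCESS_UBO_SSBO   (1u << 3)

   struct bin_shader_usage {
      bool reads_attributes;
      bool has_ubo_or_ssbo;
   };

   /* A serialized draw only needs a binning-stage (BCL) sync when the
    * barrier covers a class the binning shaders actually consume. */
   static bool
   binning_sync_required(uint32_t barrier_access,
                         const struct bin_shader_usage *u,
                         bool indexed, bool indirect)
   {
      if (indexed && (barrier_access & ACCESS_INDEX))
         return true;
      if (indirect && (barrier_access & ACCESS_INDIRECT))
         return true;
      if (u->reads_attributes && (barrier_access & ACCESS_ATTRIBUTE))
         return true;
      if (u->has_ubo_or_ssbo && (barrier_access & ACCESS_UBO_SSBO))
         return true;
      return false;
   }

When it returns false, the job can stay serialized only at the render stage, which is the cheaper option.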
+ */ + if (job->serialize && (cmd_buffer->state.barrier.bcl_buffer_access || + cmd_buffer->state.barrier.bcl_image_access)) { + assert(!job->needs_bcl_sync); + if (cmd_buffer_binning_sync_required(cmd_buffer, pipeline, + indexed, indirect)) { + v3dv_cmd_buffer_consume_bcl_sync(cmd_buffer, job); + } + } + /* GL shader state binds shaders, uniform and vertex attribute state. The * compiler injects uniforms to handle some descriptor types (such as * textures), so we need to regen that when descriptor state changes. @@ -2497,62 +2993,84 @@ v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer) * that will require that we new uniform state for QUNIFORM_VIEWPORT_*. */ uint32_t *dirty = &cmd_buffer->state.dirty; + struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state; - const uint32_t dirty_uniform_state = - *dirty & (V3DV_CMD_DIRTY_PIPELINE | - V3DV_CMD_DIRTY_PUSH_CONSTANTS | - V3DV_CMD_DIRTY_DESCRIPTOR_SETS | - V3DV_CMD_DIRTY_VIEWPORT | - V3DV_CMD_DIRTY_VIEW_INDEX); - - if (dirty_uniform_state) - update_gfx_uniform_state(cmd_buffer, dirty_uniform_state); + const bool dirty_uniform_state = + update_gfx_uniform_state(cmd_buffer); struct v3dv_device *device = cmd_buffer->device; if (dirty_uniform_state || (*dirty & V3DV_CMD_DIRTY_VERTEX_BUFFER)) v3dv_X(device, cmd_buffer_emit_gl_shader_state)(cmd_buffer); - if (*dirty & (V3DV_CMD_DIRTY_PIPELINE)) { + if (*dirty & (V3DV_CMD_DIRTY_PIPELINE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE)) { v3dv_X(device, cmd_buffer_emit_configuration_bits)(cmd_buffer); + } + + if (*dirty & (V3DV_CMD_DIRTY_PIPELINE)) { v3dv_X(device, cmd_buffer_emit_varyings_state)(cmd_buffer); } - if (*dirty & (V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR)) { + if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS)) { emit_scissor(cmd_buffer); } - if (*dirty & V3DV_CMD_DIRTY_VIEWPORT) { + if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS)) v3dv_X(device, cmd_buffer_emit_viewport)(cmd_buffer); - } if (*dirty & V3DV_CMD_DIRTY_INDEX_BUFFER) v3dv_X(device, cmd_buffer_emit_index_buffer)(cmd_buffer); - const uint32_t dynamic_stencil_dirty_flags = - V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK | - V3DV_CMD_DIRTY_STENCIL_WRITE_MASK | - V3DV_CMD_DIRTY_STENCIL_REFERENCE; - if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | dynamic_stencil_dirty_flags)) + bool any_dynamic_stencil_dirty = + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP); + + if (*dirty & V3DV_CMD_DIRTY_PIPELINE || any_dynamic_stencil_dirty) v3dv_X(device, cmd_buffer_emit_stencil)(cmd_buffer); - if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_DEPTH_BIAS)) + if (*dirty & V3DV_CMD_DIRTY_PIPELINE || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS)) { v3dv_X(device, cmd_buffer_emit_depth_bias)(cmd_buffer); + } - if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_BLEND_CONSTANTS)) + if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS)) + v3dv_X(device, cmd_buffer_emit_depth_bounds)(cmd_buffer); + + if (*dirty & V3DV_CMD_DIRTY_PIPELINE || + BITSET_TEST(dyn->dirty, 
MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS)) { v3dv_X(device, cmd_buffer_emit_blend)(cmd_buffer); + } if (*dirty & V3DV_CMD_DIRTY_OCCLUSION_QUERY) v3dv_X(device, cmd_buffer_emit_occlusion_query)(cmd_buffer); - if (*dirty & V3DV_CMD_DIRTY_LINE_WIDTH) + if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH)) v3dv_X(device, cmd_buffer_emit_line_width)(cmd_buffer); if (*dirty & V3DV_CMD_DIRTY_PIPELINE) v3dv_X(device, cmd_buffer_emit_sample_state)(cmd_buffer); - if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE)) + if (*dirty & V3DV_CMD_DIRTY_PIPELINE || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES)) { v3dv_X(device, cmd_buffer_emit_color_write_mask)(cmd_buffer); + } + + /* We disable double-buffer mode if indirect draws are used because in that + * case we don't know the vertex count. + */ + if (indirect) { + job->can_use_double_buffer = false; + } else if (job->can_use_double_buffer) { + job_update_double_buffer_score(job, pipeline, vertex_count, + &cmd_buffer->state.render_area.extent); + } cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_PIPELINE; } @@ -2561,18 +3079,23 @@ static inline void cmd_buffer_set_view_index(struct v3dv_cmd_buffer *cmd_buffer, uint32_t view_index) { - cmd_buffer->state.view_index = view_index; - cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEW_INDEX; + if (view_index != cmd_buffer->state.view_index) { + cmd_buffer->state.view_index = view_index; + cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEW_INDEX; + } } static void cmd_buffer_draw(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_draw_info *info) { + uint32_t vertex_count = + info->vertex_count * info->instance_count; struct v3dv_render_pass *pass = cmd_buffer->state.pass; if (likely(!pass->multiview_enabled)) { - v3dv_cmd_buffer_emit_pre_draw(cmd_buffer); + cmd_buffer_set_view_index(cmd_buffer, 0); + v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, false, vertex_count); v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw)(cmd_buffer, info); return; } @@ -2580,7 +3103,7 @@ cmd_buffer_draw(struct v3dv_cmd_buffer *cmd_buffer, uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask; while (view_mask) { cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask)); - v3dv_cmd_buffer_emit_pre_draw(cmd_buffer); + v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, false, vertex_count); v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw)(cmd_buffer, info); } } @@ -2606,6 +3129,35 @@ v3dv_CmdDraw(VkCommandBuffer commandBuffer, } VKAPI_ATTR void VKAPI_CALL +v3dv_CmdDrawMultiEXT(VkCommandBuffer commandBuffer, + uint32_t drawCount, + const VkMultiDrawInfoEXT *pVertexInfo, + uint32_t instanceCount, + uint32_t firstInstance, + uint32_t stride) + +{ + if (drawCount == 0 || instanceCount == 0) + return; + + V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); + + uint32_t i = 0; + vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) { + cmd_buffer->state.draw_id = i; + cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DRAW_ID; + + struct v3dv_draw_info info = {}; + info.vertex_count = draw->vertexCount; + info.instance_count = instanceCount; + info.first_instance = firstInstance; + info.first_vertex = draw->firstVertex; + + cmd_buffer_draw(cmd_buffer, &info); + } +} + +VKAPI_ATTR void VKAPI_CALL v3dv_CmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount, uint32_t instanceCount, @@ -2618,9 +3170,12 @@ v3dv_CmdDrawIndexed(VkCommandBuffer commandBuffer, V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); + uint32_t vertex_count = indexCount * 
instanceCount; + struct v3dv_render_pass *pass = cmd_buffer->state.pass; if (likely(!pass->multiview_enabled)) { - v3dv_cmd_buffer_emit_pre_draw(cmd_buffer); + cmd_buffer_set_view_index(cmd_buffer, 0); + v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, false, vertex_count); v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed) (cmd_buffer, indexCount, instanceCount, firstIndex, vertexOffset, firstInstance); @@ -2630,7 +3185,7 @@ v3dv_CmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask; while (view_mask) { cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask)); - v3dv_cmd_buffer_emit_pre_draw(cmd_buffer); + v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, false, vertex_count); v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed) (cmd_buffer, indexCount, instanceCount, firstIndex, vertexOffset, firstInstance); @@ -2638,6 +3193,48 @@ v3dv_CmdDrawIndexed(VkCommandBuffer commandBuffer, } VKAPI_ATTR void VKAPI_CALL +v3dv_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer, + uint32_t drawCount, + const VkMultiDrawIndexedInfoEXT *pIndexInfo, + uint32_t instanceCount, + uint32_t firstInstance, + uint32_t stride, + const int32_t *pVertexOffset) +{ + if (drawCount == 0 || instanceCount == 0) + return; + + V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); + + uint32_t i = 0; + vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) { + uint32_t vertex_count = draw->indexCount * instanceCount; + int32_t vertexOffset = pVertexOffset ? *pVertexOffset : draw->vertexOffset; + + cmd_buffer->state.draw_id = i; + cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DRAW_ID; + + struct v3dv_render_pass *pass = cmd_buffer->state.pass; + if (likely(!pass->multiview_enabled)) { + cmd_buffer_set_view_index(cmd_buffer, 0); + v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, false, vertex_count); + v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed) + (cmd_buffer, draw->indexCount, instanceCount, + draw->firstIndex, vertexOffset, firstInstance); + continue; + } + uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask; + while (view_mask) { + cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask)); + v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, false, vertex_count); + v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed) + (cmd_buffer, draw->indexCount, instanceCount, + draw->firstIndex, vertexOffset, firstInstance); + } + } +} + +VKAPI_ATTR void VKAPI_CALL v3dv_CmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, @@ -2653,7 +3250,8 @@ v3dv_CmdDrawIndirect(VkCommandBuffer commandBuffer, struct v3dv_render_pass *pass = cmd_buffer->state.pass; if (likely(!pass->multiview_enabled)) { - v3dv_cmd_buffer_emit_pre_draw(cmd_buffer); + cmd_buffer_set_view_index(cmd_buffer, 0); + v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, true, 0); v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indirect) (cmd_buffer, buffer, offset, drawCount, stride); return; @@ -2662,7 +3260,7 @@ v3dv_CmdDrawIndirect(VkCommandBuffer commandBuffer, uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask; while (view_mask) { cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask)); - v3dv_cmd_buffer_emit_pre_draw(cmd_buffer); + v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, true, 0); v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indirect) (cmd_buffer, buffer, offset, drawCount, stride); } @@ -2684,7 +3282,8 @@ v3dv_CmdDrawIndexedIndirect(VkCommandBuffer 
commandBuffer, struct v3dv_render_pass *pass = cmd_buffer->state.pass; if (likely(!pass->multiview_enabled)) { - v3dv_cmd_buffer_emit_pre_draw(cmd_buffer); + cmd_buffer_set_view_index(cmd_buffer, 0); + v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, true, 0); v3dv_X(cmd_buffer->device, cmd_buffer_emit_indexed_indirect) (cmd_buffer, buffer, offset, drawCount, stride); return; @@ -2693,64 +3292,173 @@ v3dv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask; while (view_mask) { cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask)); - v3dv_cmd_buffer_emit_pre_draw(cmd_buffer); + v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, true, 0); v3dv_X(cmd_buffer->device, cmd_buffer_emit_indexed_indirect) (cmd_buffer, buffer, offset, drawCount, stride); } } -VKAPI_ATTR void VKAPI_CALL -v3dv_CmdPipelineBarrier(VkCommandBuffer commandBuffer, - VkPipelineStageFlags srcStageMask, - VkPipelineStageFlags dstStageMask, - VkDependencyFlags dependencyFlags, - uint32_t memoryBarrierCount, - const VkMemoryBarrier *pMemoryBarriers, - uint32_t bufferBarrierCount, - const VkBufferMemoryBarrier *pBufferBarriers, - uint32_t imageBarrierCount, - const VkImageMemoryBarrier *pImageBarriers) +static void +handle_barrier(VkPipelineStageFlags2 srcStageMask, VkAccessFlags2 srcAccessMask, + VkPipelineStageFlags2 dstStageMask, VkAccessFlags2 dstAccessMask, + bool is_image_barrier, bool is_buffer_barrier, + struct v3dv_barrier_state *state) { - V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - /* We only care about barriers between GPU jobs */ - if (srcStageMask == VK_PIPELINE_STAGE_HOST_BIT || - dstStageMask == VK_PIPELINE_STAGE_HOST_BIT) { + if (srcStageMask == VK_PIPELINE_STAGE_2_HOST_BIT || + dstStageMask == VK_PIPELINE_STAGE_2_HOST_BIT) { return; } + /* Track source of the barrier */ + uint8_t src_mask = 0; + + const VkPipelineStageFlags2 compute_mask = + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT; + if (srcStageMask & (compute_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) + src_mask |= V3DV_BARRIER_COMPUTE_BIT; + + const VkPipelineStageFlags2 transfer_mask = + VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT | + VK_PIPELINE_STAGE_2_COPY_BIT | + VK_PIPELINE_STAGE_2_BLIT_BIT | + VK_PIPELINE_STAGE_2_CLEAR_BIT; + if (srcStageMask & (transfer_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) + src_mask |= V3DV_BARRIER_TRANSFER_BIT; + + const VkPipelineStageFlags2 graphics_mask = ~(compute_mask | transfer_mask); + if (srcStageMask & (graphics_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) + src_mask |= V3DV_BARRIER_GRAPHICS_BIT; + + /* Track consumer of the barrier */ + if (dstStageMask & (compute_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) { + state->dst_mask |= V3DV_BARRIER_COMPUTE_BIT; + state->src_mask_compute |= src_mask; + } + + if (dstStageMask & (transfer_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) { + state->dst_mask |= V3DV_BARRIER_TRANSFER_BIT; + state->src_mask_transfer |= src_mask; + } + + if (dstStageMask & (graphics_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) { + state->dst_mask |= V3DV_BARRIER_GRAPHICS_BIT; + state->src_mask_graphics |= src_mask; + + if (dstStageMask & (VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT | + VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT | + VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT | + VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT | + VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT | + VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT | + VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT | + 
VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT | + VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | + VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT | + VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT | + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) { + if (is_image_barrier) + state->bcl_image_access |= dstAccessMask; + + if (is_buffer_barrier) + state->bcl_buffer_access |= dstAccessMask; + } + } +} + +void +v3dv_cmd_buffer_emit_pipeline_barrier(struct v3dv_cmd_buffer *cmd_buffer, + const VkDependencyInfo *info) +{ + uint32_t imageBarrierCount = info->imageMemoryBarrierCount; + const VkImageMemoryBarrier2 *pImageBarriers = info->pImageMemoryBarriers; + + uint32_t bufferBarrierCount = info->bufferMemoryBarrierCount; + const VkBufferMemoryBarrier2 *pBufferBarriers = info->pBufferMemoryBarriers; + + uint32_t memoryBarrierCount = info->memoryBarrierCount; + const VkMemoryBarrier2 *pMemoryBarriers = info->pMemoryBarriers; + + struct v3dv_barrier_state state = { 0 }; + for (uint32_t i = 0; i < imageBarrierCount; i++) { + /* We can safely skip barriers for image layout transitions from UNDEFINED + * layout. + * + * Notice that KHR_synchronization2 allows specifying barriers that don't + * involve a layout transition by making oldLayout and newLayout the same, + * including UNDEFINED. + */ + if (pImageBarriers[i].oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && + pImageBarriers[i].oldLayout != pImageBarriers[i].newLayout) { + continue; + } + + handle_barrier(pImageBarriers[i].srcStageMask, + pImageBarriers[i].srcAccessMask, + pImageBarriers[i].dstStageMask, + pImageBarriers[i].dstAccessMask, + true, false, &state); + } + + for (uint32_t i = 0; i < bufferBarrierCount; i++) { + handle_barrier(pBufferBarriers[i].srcStageMask, + pBufferBarriers[i].srcAccessMask, + pBufferBarriers[i].dstStageMask, + pBufferBarriers[i].dstAccessMask, + false, true, &state); + } + + for (uint32_t i = 0; i < memoryBarrierCount; i++) { + handle_barrier(pMemoryBarriers[i].srcStageMask, + pMemoryBarriers[i].srcAccessMask, + pMemoryBarriers[i].dstStageMask, + pMemoryBarriers[i].dstAccessMask, + true, true, &state); + } + + /* Bail if we don't have any relevant barriers */ + if (!state.dst_mask) + return; + /* If we have a recording job, finish it here */ - struct v3dv_job *job = cmd_buffer->state.job; - if (job) + if (cmd_buffer->state.job) v3dv_cmd_buffer_finish_job(cmd_buffer); - cmd_buffer->state.has_barrier = true; - if (dstStageMask & (VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | - VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | - VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT | - VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT | - VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT | - VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT)) { - cmd_buffer->state.has_bcl_barrier = true; - } + /* Update barrier state in the command buffer */ + v3dv_cmd_buffer_merge_barrier_state(&cmd_buffer->state.barrier, &state); } VKAPI_ATTR void VKAPI_CALL -v3dv_CmdBindVertexBuffers(VkCommandBuffer commandBuffer, - uint32_t firstBinding, - uint32_t bindingCount, - const VkBuffer *pBuffers, - const VkDeviceSize *pOffsets) +v3dv_CmdPipelineBarrier2(VkCommandBuffer commandBuffer, + const VkDependencyInfo *pDependencyInfo) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - struct v3dv_vertex_binding *vb = cmd_buffer->state.vertex_bindings; + v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, pDependencyInfo); +} - /* We have to defer setting up vertex buffer since we need the buffer - * stride from the pipeline.
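
handle_barrier() above maps both sides of each barrier onto three coarse job classes (graphics, compute, transfer) and records, for every destination class, which source classes it must wait on; v3dv_cmd_buffer_merge_barrier_state() then folds the result into the command buffer. A minimal sketch of the accumulation step, with hypothetical simplified types:

   #include <stdint.h>

   /* Coarse job classes used to serialize queue submissions. */
   #define BARRIER_GRAPHICS (1u << 0)
   #define BARRIER_COMPUTE  (1u << 1)
   #define BARRIER_TRANSFER (1u << 2)

   struct barrier_state {
      uint8_t dst_mask;    /* job classes that must wait */
      uint8_t src_mask[3]; /* per destination class: classes waited on */
   };

   /* Fold one classified src/dst pair into the accumulated state: for
    * each destination class named by the barrier, remember every source
    * class it has to wait for. */
   static void
   accumulate_barrier(struct barrier_state *s,
                      uint8_t src_classes, uint8_t dst_classes)
   {
      for (int i = 0; i < 3; i++) {
         if (dst_classes & (1u << i)) {
            s->dst_mask |= (1u << i);
            s->src_mask[i] |= src_classes;
         }
      }
   }

Collapsing the fine-grained Vulkan stage masks early keeps the per-job serialization test to a few bit operations.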
- */ +VKAPI_ATTR void VKAPI_CALL +v3dv_CmdBindVertexBuffers2(VkCommandBuffer commandBuffer, + uint32_t firstBinding, + uint32_t bindingCount, + const VkBuffer *pBuffers, + const VkDeviceSize *pOffsets, + const VkDeviceSize *pSizes, + const VkDeviceSize *pStrides) +{ + V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); + struct v3dv_vertex_binding *vb = cmd_buffer->state.vertex_bindings; assert(firstBinding + bindingCount <= MAX_VBS); bool vb_state_changed = false; + if (pStrides) { + vk_cmd_set_vertex_binding_strides(&cmd_buffer->vk, + firstBinding, bindingCount, + pStrides); + struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state; + if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDING_STRIDES)) + vb_state_changed = true; + } + /* FIXME: at this moment we don't do anything with pSizes. */ for (uint32_t i = 0; i < bindingCount; i++) { if (vb[firstBinding + i].buffer != v3dv_buffer_from_handle(pBuffers[i])) { vb[firstBinding + i].buffer = v3dv_buffer_from_handle(pBuffers[i]); @@ -2766,24 +3474,6 @@ v3dv_CmdBindVertexBuffers(VkCommandBuffer commandBuffer, cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VERTEX_BUFFER; } -static uint32_t -get_index_size(VkIndexType index_type) -{ - switch (index_type) { - case VK_INDEX_TYPE_UINT8_EXT: - return 1; - break; - case VK_INDEX_TYPE_UINT16: - return 2; - break; - case VK_INDEX_TYPE_UINT32: - return 4; - break; - default: - unreachable("Unsupported index type"); - } -} - VKAPI_ATTR void VKAPI_CALL v3dv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer, VkBuffer buffer, @@ -2792,7 +3482,7 @@ v3dv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer, { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - const uint32_t index_size = get_index_size(indexType); + const uint32_t index_size = vk_index_type_to_bytes(indexType); if (buffer == cmd_buffer->state.index_buffer.buffer && offset == cmd_buffer->state.index_buffer.offset && index_size == cmd_buffer->state.index_buffer.index_size) { @@ -2806,82 +3496,309 @@ v3dv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer, } VKAPI_ATTR void VKAPI_CALL -v3dv_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer, - VkStencilFaceFlags faceMask, - uint32_t compareMask) +v3dv_CmdSetLineStippleEXT(VkCommandBuffer commandBuffer, + uint32_t lineStippleFactor, + uint16_t lineStipplePattern) { - V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - - if (faceMask & VK_STENCIL_FACE_FRONT_BIT) - cmd_buffer->state.dynamic.stencil_compare_mask.front = compareMask & 0xff; - if (faceMask & VK_STENCIL_FACE_BACK_BIT) - cmd_buffer->state.dynamic.stencil_compare_mask.back = compareMask & 0xff; - - cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK; + /* We do not support stippled line rasterization so we just ignore this. */ } -VKAPI_ATTR void VKAPI_CALL -v3dv_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer, - VkStencilFaceFlags faceMask, - uint32_t writeMask) +/** + * This checks a descriptor set to see if we are binding any descriptors that + * would involve sampling from a linear image (the hardware only supports this + * for 1D images), and if so, attempts to create a tiled copy of the linear + * image and rewrite the descriptor set to use that instead. + * + * This was added to support a scenario with Android where some part of the UI + * wanted to show previews of linear swapchain images.
For more details: + * https://gitlab.freedesktop.org/mesa/mesa/-/issues/9712 + * + * Currently this only supports a linear sampling from a simple 2D image, but + * it could be extended to support more cases if necessary. + */ +static void +handle_sample_from_linear_image(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_descriptor_set *set, + bool is_compute) { - V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); + for (int32_t i = 0; i < set->layout->binding_count; i++) { + const struct v3dv_descriptor_set_binding_layout *blayout = + &set->layout->binding[i]; + if (blayout->type != VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE && + blayout->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) + continue; - if (faceMask & VK_STENCIL_FACE_FRONT_BIT) - cmd_buffer->state.dynamic.stencil_write_mask.front = writeMask & 0xff; - if (faceMask & VK_STENCIL_FACE_BACK_BIT) - cmd_buffer->state.dynamic.stencil_write_mask.back = writeMask & 0xff; + struct v3dv_descriptor *desc = &set->descriptors[blayout->descriptor_index]; + if (!desc->image_view) + continue; - cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_WRITE_MASK; -} + struct v3dv_image *image = (struct v3dv_image *) desc->image_view->vk.image; + struct v3dv_image_view *view = (struct v3dv_image_view *) desc->image_view; + if (image->tiled || view->vk.view_type == VK_IMAGE_VIEW_TYPE_1D || + view->vk.view_type == VK_IMAGE_VIEW_TYPE_1D_ARRAY) { + continue; + } -VKAPI_ATTR void VKAPI_CALL -v3dv_CmdSetStencilReference(VkCommandBuffer commandBuffer, - VkStencilFaceFlags faceMask, - uint32_t reference) -{ - V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); + /* FIXME: we can probably handle most of these restrictions too with + * a bit of extra effort. + */ + if (view->vk.view_type != VK_IMAGE_VIEW_TYPE_2D || + view->vk.level_count != 1 || view->vk.layer_count != 1 || + blayout->array_size != 1) { + fprintf(stderr, "Sampling from linear image is not supported. " + "Expect corruption.\n"); + continue; + } - if (faceMask & VK_STENCIL_FACE_FRONT_BIT) - cmd_buffer->state.dynamic.stencil_reference.front = reference & 0xff; - if (faceMask & VK_STENCIL_FACE_BACK_BIT) - cmd_buffer->state.dynamic.stencil_reference.back = reference & 0xff; + /* We are sampling from a linear image. V3D doesn't support this + * so we create a tiled copy of the image and rewrite the descriptor + * to read from it instead. + */ + perf_debug("Sampling from linear image is not supported natively and " + "requires a copy.\n"); - cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_REFERENCE; -} + struct v3dv_device *device = cmd_buffer->device; + VkDevice vk_device = v3dv_device_to_handle(device); -VKAPI_ATTR void VKAPI_CALL -v3dv_CmdSetDepthBias(VkCommandBuffer commandBuffer, - float depthBiasConstantFactor, - float depthBiasClamp, - float depthBiasSlopeFactor) -{ - V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); + /* Allocate shadow tiled image if needed, we only do this once for + * each image, on the first sampling attempt. We need to take a lock + * since we may be trying to do the same in another command buffer in + * a separate thread. 
+ */ + mtx_lock(&device->meta.mtx); + VkResult result; + VkImage tiled_image; + if (image->shadow) { + tiled_image = v3dv_image_to_handle(image->shadow); + } else { + VkImageCreateInfo image_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + .flags = image->vk.create_flags, + .imageType = image->vk.image_type, + .format = image->vk.format, + .extent = { + image->vk.extent.width, + image->vk.extent.height, + image->vk.extent.depth, + }, + .mipLevels = image->vk.mip_levels, + .arrayLayers = image->vk.array_layers, + .samples = image->vk.samples, + .tiling = VK_IMAGE_TILING_OPTIMAL, + .usage = image->vk.usage, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .initialLayout = VK_IMAGE_LAYOUT_GENERAL, + }; + result = v3dv_CreateImage(vk_device, &image_info, + &device->vk.alloc, &tiled_image); + if (result != VK_SUCCESS) { + fprintf(stderr, "Failed to copy linear 2D image for sampling." + "Expect corruption.\n"); + mtx_unlock(&device->meta.mtx); + continue; + } - cmd_buffer->state.dynamic.depth_bias.constant_factor = depthBiasConstantFactor; - cmd_buffer->state.dynamic.depth_bias.depth_bias_clamp = depthBiasClamp; - cmd_buffer->state.dynamic.depth_bias.slope_factor = depthBiasSlopeFactor; - cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DEPTH_BIAS; -} + bool disjoint = image->vk.create_flags & VK_IMAGE_CREATE_DISJOINT_BIT; + VkImageMemoryRequirementsInfo2 reqs_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2, + .image = tiled_image, + }; + + assert(image->plane_count <= V3DV_MAX_PLANE_COUNT); + for (int p = 0; p < (disjoint ? image->plane_count : 1); p++) { + VkImageAspectFlagBits plane_aspect = VK_IMAGE_ASPECT_PLANE_0_BIT << p; + VkImagePlaneMemoryRequirementsInfo plane_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_PLANE_MEMORY_REQUIREMENTS_INFO, + .planeAspect = plane_aspect, + }; + if (disjoint) + reqs_info.pNext = &plane_info; + + VkMemoryRequirements2 reqs = { + .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2, + }; + v3dv_GetImageMemoryRequirements2(vk_device, &reqs_info, &reqs); + + VkDeviceMemory mem; + VkMemoryAllocateInfo alloc_info = { + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .allocationSize = reqs.memoryRequirements.size, + .memoryTypeIndex = 0, + }; + result = v3dv_AllocateMemory(vk_device, &alloc_info, + &device->vk.alloc, &mem); + if (result != VK_SUCCESS) { + fprintf(stderr, "Failed to copy linear 2D image for sampling." + "Expect corruption.\n"); + v3dv_DestroyImage(vk_device, tiled_image, &device->vk.alloc); + mtx_unlock(&device->meta.mtx); + continue; + } + + VkBindImageMemoryInfo bind_info = { + .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO, + .image = tiled_image, + .memory = mem, + .memoryOffset = 0, + }; + VkBindImagePlaneMemoryInfo plane_bind_info = { + .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO, + .planeAspect = plane_aspect, + }; + if (disjoint) + bind_info.pNext = &plane_bind_info; + result = v3dv_BindImageMemory2(vk_device, 1, &bind_info); + if (result != VK_SUCCESS) { + fprintf(stderr, "Failed to copy linear 2D image for sampling." + "Expect corruption.\n"); + v3dv_DestroyImage(vk_device, tiled_image, &device->vk.alloc); + v3dv_FreeMemory(vk_device, mem, &device->vk.alloc); + mtx_unlock(&device->meta.mtx); + continue; + } + } -VKAPI_ATTR void VKAPI_CALL -v3dv_CmdSetDepthBounds(VkCommandBuffer commandBuffer, - float minDepthBounds, - float maxDepthBounds) -{ - /* We do not support depth bounds testing so we just ingore this. 
We are - * already asserting that pipelines don't enable the feature anyway. - */ -} + image->shadow = v3dv_image_from_handle(tiled_image); + } -VKAPI_ATTR void VKAPI_CALL -v3dv_CmdSetLineWidth(VkCommandBuffer commandBuffer, - float lineWidth) -{ - V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); + /* Create a shadow view that refers to the tiled image if needed */ + VkImageView tiled_view; + if (view->shadow) { + tiled_view = v3dv_image_view_to_handle(view->shadow); + } else { + VkImageViewCreateInfo view_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .flags = view->vk.create_flags, + .image = tiled_image, + .viewType = view->vk.view_type, + .format = view->vk.format, + .components = view->vk.swizzle, + .subresourceRange = { + .aspectMask = view->vk.aspects, + .baseMipLevel = view->vk.base_mip_level, + .levelCount = view->vk.level_count, + .baseArrayLayer = view->vk.base_array_layer, + .layerCount = view->vk.layer_count, + }, + }; + result = v3dv_create_image_view(device, &view_info, &tiled_view); + if (result != VK_SUCCESS) { + fprintf(stderr, "Failed to copy linear 2D image for sampling." + "Expect corruption.\n"); + mtx_unlock(&device->meta.mtx); + continue; + } + } + + view->shadow = v3dv_image_view_from_handle(tiled_view); + + mtx_unlock(&device->meta.mtx); + + /* Rewrite the descriptor to use the shadow view */ + VkDescriptorImageInfo desc_image_info = { + .sampler = v3dv_sampler_to_handle(desc->sampler), + .imageView = tiled_view, + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + }; + VkWriteDescriptorSet write = { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = v3dv_descriptor_set_to_handle(set), + .dstBinding = i, + .dstArrayElement = 0, /* Assumes array_size is 1 */ + .descriptorCount = 1, + .descriptorType = desc->type, + .pImageInfo = &desc_image_info, + }; + v3dv_UpdateDescriptorSets(vk_device, 1, &write, 0, NULL); + + /* Now we need to actually copy the pixel data from the linear image + * into the tiled image storage to ensure it is up-to-date. + * + * FIXME: ideally we would track if the linear image is dirty and skip + * this step otherwise, but that would be a bit of a pain. + * + * Note that we need to place the copy job *before* the current job in + * the command buffer state so we have the tiled image ready to process + * an upcoming draw call in the current job that samples from it. + * + * Also, we need to use the TFU path for this copy, as any other path + * will use the tile buffer and would require a new framebuffer setup, + * thus requiring extra work to stop and resume any in-flight render + * pass. Since we are converting a full 2D texture here the TFU should + * be able to handle this. + */ + for (int p = 0; p < image->plane_count; p++) { + VkImageAspectFlagBits plane_aspect = VK_IMAGE_ASPECT_PLANE_0_BIT << p; + struct VkImageCopy2 copy_region = { + .sType = VK_STRUCTURE_TYPE_IMAGE_COPY_2, + .srcSubresource = { + .aspectMask = image->plane_count == 1 ? + view->vk.aspects : (view->vk.aspects & plane_aspect), + .mipLevel = view->vk.base_mip_level, + .baseArrayLayer = view->vk.base_array_layer, + .layerCount = view->vk.layer_count, + }, + .srcOffset = {0, 0, 0 }, + .dstSubresource = { + .aspectMask = image->plane_count == 1 ? 
+ view->vk.aspects : (view->vk.aspects & plane_aspect), + .mipLevel = view->vk.base_mip_level, + .baseArrayLayer = view->vk.base_array_layer, + .layerCount = view->vk.layer_count, + }, + .dstOffset = { 0, 0, 0}, + .extent = { + image->planes[p].width, + image->planes[p].height, + 1, + }, + }; + struct v3dv_image *copy_src = image; + struct v3dv_image *copy_dst = v3dv_image_from_handle(tiled_image); + bool ok = v3dv_cmd_buffer_copy_image_tfu(cmd_buffer, copy_dst, copy_src, + ©_region); + if (ok) { + /* This will emit the TFU job right before the current in-flight + * job (if any), since in-fight jobs are only added to the list + * when finished. + */ + struct v3dv_job *tfu_job = + list_last_entry(&cmd_buffer->jobs, struct v3dv_job, list_link); + assert(tfu_job->type == V3DV_JOB_TYPE_GPU_TFU); + /* Serialize the copy since we don't know who is producing the linear + * image and we need the image to be ready by the time the copy + * executes. + */ + tfu_job->serialize = V3DV_BARRIER_ALL; - cmd_buffer->state.dynamic.line_width = lineWidth; - cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_LINE_WIDTH; + /* Also, we need to ensure the TFU copy job completes before anyhing + * else coming after that may be using the tiled shadow copy. + */ + if (cmd_buffer->state.job) { + /* If we already had an in-flight job (i.e. we are in a render + * pass) make sure the job waits for the TFU copy. + */ + cmd_buffer->state.job->serialize |= V3DV_BARRIER_TRANSFER_BIT; + } else { + /* Otherwise, make the the follow-up job syncs with the TFU + * job we just added when it is created by adding the + * corresponding barrier state. + */ + if (!is_compute) { + cmd_buffer->state.barrier.dst_mask |= V3DV_BARRIER_GRAPHICS_BIT; + cmd_buffer->state.barrier.src_mask_graphics |= V3DV_BARRIER_TRANSFER_BIT; + } else { + cmd_buffer->state.barrier.dst_mask |= V3DV_BARRIER_COMPUTE_BIT; + cmd_buffer->state.barrier.src_mask_compute |= V3DV_BARRIER_TRANSFER_BIT; + } + } + } else { + fprintf(stderr, "Failed to copy linear 2D image for sampling." + "TFU doesn't support copy. Expect corruption.\n"); + } + } + } } VKAPI_ATTR void VKAPI_CALL @@ -2917,6 +3834,15 @@ v3dv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, descriptor_state->descriptor_sets[index] = set; dirty_stages |= set->layout->shader_stages; descriptor_state_changed = true; + + /* Check if we are sampling from a linear 2D image. This is not + * supported in hardware, but may be required for some applications + * so we will transparently convert to tiled at the expense of + * performance. 
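
The serialization logic in this hunk splits into two cases: an already in-flight job simply gains a transfer-stage dependency, while otherwise the dependency is parked in the command buffer's pending barrier state so the next graphics or compute job inherits it on creation. A minimal sketch with hypothetical stand-in types:

   #include <stdbool.h>
   #include <stdint.h>

   #define BARRIER_GRAPHICS (1u << 0)
   #define BARRIER_COMPUTE  (1u << 1)
   #define BARRIER_TRANSFER (1u << 2)

   struct job { uint8_t serialize; };
   struct barrier_state { uint8_t dst_mask, src_graphics, src_compute; };

   /* After emitting the tiled-copy TFU job, make whoever consumes the
    * shadow image wait for it: an in-flight job gets a direct transfer
    * dependency, otherwise the dependency is queued as pending barrier
    * state for the next job of the consuming class. */
   static void
   serialize_after_tfu(struct job *in_flight, struct barrier_state *pending,
                       bool is_compute)
   {
      if (in_flight) {
         in_flight->serialize |= BARRIER_TRANSFER;
      } else if (!is_compute) {
         pending->dst_mask |= BARRIER_GRAPHICS;
         pending->src_graphics |= BARRIER_TRANSFER;
      } else {
         pending->dst_mask |= BARRIER_COMPUTE;
         pending->src_compute |= BARRIER_TRANSFER;
      }
   }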
+ */ + handle_sample_from_linear_image(cmd_buffer, set, + pipelineBindPoint == + VK_PIPELINE_BIND_POINT_COMPUTE); } for (uint32_t j = 0; j < set->layout->dynamic_offset_count; j++, dyn_index++) { @@ -2951,79 +3877,19 @@ v3dv_CmdPushConstants(VkCommandBuffer commandBuffer, { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - if (!memcmp((uint8_t *) cmd_buffer->push_constants_data + offset, pValues, size)) - return; - - memcpy((uint8_t *) cmd_buffer->push_constants_data + offset, pValues, size); - - cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PUSH_CONSTANTS; - cmd_buffer->state.dirty_push_constants_stages |= stageFlags; -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_CmdSetBlendConstants(VkCommandBuffer commandBuffer, - const float blendConstants[4]) -{ - V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; - - if (!memcmp(state->dynamic.blend_constants, blendConstants, - sizeof(state->dynamic.blend_constants))) { + if (!memcmp((uint8_t *) cmd_buffer->state.push_constants_data + offset, + pValues, size)) { return; } - memcpy(state->dynamic.blend_constants, blendConstants, - sizeof(state->dynamic.blend_constants)); - - cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_BLEND_CONSTANTS; -} + memcpy((uint8_t *) cmd_buffer->state.push_constants_data + offset, + pValues, size); + cmd_buffer->state.push_constants_size = + MAX2(offset + size, cmd_buffer->state.push_constants_size); -VKAPI_ATTR void VKAPI_CALL -v3dv_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer, - uint32_t attachmentCount, - const VkBool32 *pColorWriteEnables) -{ - V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; - uint32_t color_write_enable = 0; - - for (uint32_t i = 0; i < attachmentCount; i++) - color_write_enable |= pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0; - - if (state->dynamic.color_write_enable == color_write_enable) - return; - - state->dynamic.color_write_enable = color_write_enable; - - state->dirty |= V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE; -} - -void -v3dv_cmd_buffer_reset_queries(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_query_pool *pool, - uint32_t first, - uint32_t count) -{ - /* Resets can only happen outside a render pass instance so we should not - * be in the middle of job recording. 
- */ - assert(cmd_buffer->state.pass == NULL); - assert(cmd_buffer->state.job == NULL); - - assert(first < pool->query_count); - assert(first + count <= pool->query_count); - - struct v3dv_job *job = - v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, - V3DV_JOB_TYPE_CPU_RESET_QUERIES, - cmd_buffer, -1); - v3dv_return_if_oom(cmd_buffer, NULL); - - job->cpu.query_reset.pool = pool; - job->cpu.query_reset.first = first; - job->cpu.query_reset.count = count; - - list_addtail(&job->list_link, &cmd_buffer->jobs); + cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PUSH_CONSTANTS | + V3DV_CMD_DIRTY_PUSH_CONSTANTS_UBO; + cmd_buffer->state.dirty_push_constants_stages |= stageFlags; } void @@ -3059,37 +3925,87 @@ v3dv_cmd_buffer_begin_query(struct v3dv_cmd_buffer *cmd_buffer, uint32_t query, VkQueryControlFlags flags) { - /* FIXME: we only support one active query for now */ - assert(cmd_buffer->state.query.active_query.bo == NULL); assert(query < pool->query_count); + switch (pool->query_type) { + case VK_QUERY_TYPE_OCCLUSION: + /* FIXME: we only support one active occlusion query for now */ + assert(cmd_buffer->state.query.active_query.bo == NULL); + + cmd_buffer->state.query.active_query.bo = pool->occlusion.bo; + cmd_buffer->state.query.active_query.offset = + pool->queries[query].occlusion.offset; + cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY; + break; + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { + assert(cmd_buffer->state.query.active_query.perf == NULL); + if (cmd_buffer->state.pass) + v3dv_cmd_buffer_subpass_finish(cmd_buffer); - cmd_buffer->state.query.active_query.bo = pool->queries[query].bo; - cmd_buffer->state.query.active_query.offset = pool->queries[query].offset; - cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY; + cmd_buffer->state.query.active_query.perf = + &pool->queries[query].perf; + + if (cmd_buffer->state.pass) { + v3dv_cmd_buffer_subpass_resume(cmd_buffer, + cmd_buffer->state.subpass_idx); + } + break; + } + default: + unreachable("Unsupported query type"); + } +} + +void +v3dv_cmd_buffer_pause_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer) +{ + struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + struct v3dv_bo *occlusion_query_bo = state->query.active_query.bo; + if (occlusion_query_bo) { + assert(!state->query.active_query.paused_bo); + state->query.active_query.paused_bo = occlusion_query_bo; + state->query.active_query.bo = NULL; + state->dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY; + } } void -v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_query_pool *pool, - uint32_t query) +v3dv_cmd_buffer_resume_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer) +{ + struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + struct v3dv_bo *occlusion_query_bo = state->query.active_query.paused_bo; + if (occlusion_query_bo) { + assert(!state->query.active_query.bo); + state->query.active_query.bo = occlusion_query_bo; + state->query.active_query.paused_bo = NULL; + state->dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY; + } +} + +static void +v3dv_cmd_buffer_schedule_end_query(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_query_pool *pool, + uint32_t query) { assert(query < pool->query_count); - assert(cmd_buffer->state.query.active_query.bo != NULL); + assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION || + pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR); - if (cmd_buffer->state.pass) { - /* Queue the EndQuery in the command buffer state, we will create a CPU - * job to flag all of these queries as possibly available 
right after the - * render pass job in which they have been recorded. - */ + /* For occlusion queries in the middle of a render pass we don't want to + * split the current job at the EndQuery just to emit query availability, + * instead we queue this state in the command buffer and we emit it when + * we finish the current job. + */ + if (cmd_buffer->state.pass && + pool->query_type == VK_QUERY_TYPE_OCCLUSION) { struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; v3dv_cmd_buffer_ensure_array_state(cmd_buffer, - sizeof(struct v3dv_end_query_cpu_job_info), + sizeof(struct v3dv_end_query_info), state->query.end.used_count, &state->query.end.alloc_count, (void **) &state->query.end.states); v3dv_return_if_oom(cmd_buffer, NULL); - struct v3dv_end_query_cpu_job_info *info = + struct v3dv_end_query_info *info = &state->query.end.states[state->query.end.used_count++]; info->pool = pool; @@ -3106,7 +4022,7 @@ v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer, * * In our case, only the first query is used but this means we still need * to flag the other queries as available so we don't emit errors when - * the applications attempt to retrive values from them. + * the applications attempt to retrieve values from them. */ struct v3dv_render_pass *pass = cmd_buffer->state.pass; if (!pass->multiview_enabled) { @@ -3116,60 +4032,65 @@ v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer, info->count = util_bitcount(subpass->view_mask); } } else { - /* Otherwise, schedule the CPU job immediately */ - struct v3dv_job *job = - v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, - V3DV_JOB_TYPE_CPU_END_QUERY, - cmd_buffer, -1); - v3dv_return_if_oom(cmd_buffer, NULL); - - job->cpu.query_end.pool = pool; - job->cpu.query_end.query = query; + /* Otherwise, schedule the end query job immediately. + * + * Multiview queries cannot cross subpass boundaries, so query count is + * always 1. + */ + if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) + v3dv_cmd_buffer_emit_set_query_availability(cmd_buffer, pool, query, 1, 1); + else + cmd_buffer_emit_end_query_cpu(cmd_buffer, pool, query, 1); + } +} - /* Multiview queries cannot cross subpass boundaries */ - job->cpu.query_end.count = 1; +static void +v3dv_cmd_buffer_end_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_query_pool *pool, + uint32_t query) +{ + assert(query < pool->query_count); + assert(cmd_buffer->state.query.active_query.bo != NULL); - list_addtail(&job->list_link, &cmd_buffer->jobs); - } + v3dv_cmd_buffer_schedule_end_query(cmd_buffer, pool, query); cmd_buffer->state.query.active_query.bo = NULL; cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY; } -void -v3dv_cmd_buffer_copy_query_results(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_query_pool *pool, - uint32_t first, - uint32_t count, - struct v3dv_buffer *dst, - uint32_t offset, - uint32_t stride, - VkQueryResultFlags flags) +static void +v3dv_cmd_buffer_end_performance_query(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_query_pool *pool, + uint32_t query) { - /* Copies can only happen outside a render pass instance so we should not - * be in the middle of job recording. 
- */ - assert(cmd_buffer->state.pass == NULL); - assert(cmd_buffer->state.job == NULL); + assert(query < pool->query_count); + assert(cmd_buffer->state.query.active_query.perf != NULL); - assert(first < pool->query_count); - assert(first + count <= pool->query_count); + if (cmd_buffer->state.pass) + v3dv_cmd_buffer_subpass_finish(cmd_buffer); - struct v3dv_job *job = - v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, - V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS, - cmd_buffer, -1); - v3dv_return_if_oom(cmd_buffer, NULL); + v3dv_cmd_buffer_schedule_end_query(cmd_buffer, pool, query); - job->cpu.query_copy_results.pool = pool; - job->cpu.query_copy_results.first = first; - job->cpu.query_copy_results.count = count; - job->cpu.query_copy_results.dst = dst; - job->cpu.query_copy_results.offset = offset; - job->cpu.query_copy_results.stride = stride; - job->cpu.query_copy_results.flags = flags; + cmd_buffer->state.query.active_query.perf = NULL; - list_addtail(&job->list_link, &cmd_buffer->jobs); + if (cmd_buffer->state.pass) + v3dv_cmd_buffer_subpass_resume(cmd_buffer, cmd_buffer->state.subpass_idx); +} + +void v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_query_pool *pool, + uint32_t query) +{ + switch (pool->query_type) { + case VK_QUERY_TYPE_OCCLUSION: + v3dv_cmd_buffer_end_occlusion_query(cmd_buffer, pool, query); + break; + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: + v3dv_cmd_buffer_end_performance_query(cmd_buffer, pool, query); + break; + default: + unreachable("Unsupported query type"); + } } void @@ -3191,115 +4112,10 @@ v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer, } VKAPI_ATTR void VKAPI_CALL -v3dv_CmdSetEvent(VkCommandBuffer commandBuffer, - VkEvent _event, - VkPipelineStageFlags stageMask) -{ - V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - V3DV_FROM_HANDLE(v3dv_event, event, _event); - - /* Event (re)sets can only happen outside a render pass instance so we - * should not be in the middle of job recording. - */ - assert(cmd_buffer->state.pass == NULL); - assert(cmd_buffer->state.job == NULL); - - struct v3dv_job *job = - v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, - V3DV_JOB_TYPE_CPU_SET_EVENT, - cmd_buffer, -1); - v3dv_return_if_oom(cmd_buffer, NULL); - - job->cpu.event_set.event = event; - job->cpu.event_set.state = 1; - - list_addtail(&job->list_link, &cmd_buffer->jobs); -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_CmdResetEvent(VkCommandBuffer commandBuffer, - VkEvent _event, - VkPipelineStageFlags stageMask) -{ - V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - V3DV_FROM_HANDLE(v3dv_event, event, _event); - - /* Event (re)sets can only happen outside a render pass instance so we - * should not be in the middle of job recording. 
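
The occlusion query path above defers availability writes: instead of splitting the current job at vkCmdEndQuery, the driver appends the query to a growable per-command-buffer array and emits all the availability updates once the job finishes. A simplified standalone sketch of that queueing (hypothetical types; the real code uses v3dv_cmd_buffer_ensure_array_state and reports OOM through the command buffer):

   #include <stdint.h>
   #include <stdlib.h>

   /* One deferred vkCmdEndQuery recorded inside a render pass. */
   struct end_query_info { uint32_t query; uint32_t count; };

   struct end_query_list {
      struct end_query_info *states;
      uint32_t used, alloc;
   };

   /* Queue an end-query; availability is emitted when the current job
    * finishes instead of splitting the job here. Returns -1 on OOM. */
   static int
   queue_end_query(struct end_query_list *l, uint32_t query, uint32_t count)
   {
      if (l->used == l->alloc) { /* grow the array on demand */
         uint32_t n = l->alloc ? l->alloc * 2 : 8;
         void *p = realloc(l->states, n * sizeof(*l->states));
         if (!p)
            return -1;
         l->states = p;
         l->alloc = n;
      }
      l->states[l->used++] = (struct end_query_info){ query, count };
      return 0;
   }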
- */ - assert(cmd_buffer->state.pass == NULL); - assert(cmd_buffer->state.job == NULL); - - struct v3dv_job *job = - v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, - V3DV_JOB_TYPE_CPU_SET_EVENT, - cmd_buffer, -1); - v3dv_return_if_oom(cmd_buffer, NULL); - - job->cpu.event_set.event = event; - job->cpu.event_set.state = 0; - - list_addtail(&job->list_link, &cmd_buffer->jobs); -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_CmdWaitEvents(VkCommandBuffer commandBuffer, - uint32_t eventCount, - const VkEvent *pEvents, - VkPipelineStageFlags srcStageMask, - VkPipelineStageFlags dstStageMask, - uint32_t memoryBarrierCount, - const VkMemoryBarrier *pMemoryBarriers, - uint32_t bufferMemoryBarrierCount, - const VkBufferMemoryBarrier *pBufferMemoryBarriers, - uint32_t imageMemoryBarrierCount, - const VkImageMemoryBarrier *pImageMemoryBarriers) -{ - V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - - assert(eventCount > 0); - - struct v3dv_job *job = - v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, - V3DV_JOB_TYPE_CPU_WAIT_EVENTS, - cmd_buffer, -1); - v3dv_return_if_oom(cmd_buffer, NULL); - - const uint32_t event_list_size = sizeof(struct v3dv_event *) * eventCount; - - job->cpu.event_wait.events = - vk_alloc(&cmd_buffer->device->vk.alloc, event_list_size, 8, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (!job->cpu.event_wait.events) { - v3dv_flag_oom(cmd_buffer, NULL); - return; - } - job->cpu.event_wait.event_count = eventCount; - - for (uint32_t i = 0; i < eventCount; i++) - job->cpu.event_wait.events[i] = v3dv_event_from_handle(pEvents[i]); - - /* vkCmdWaitEvents can be recorded inside a render pass, so we might have - * an active job. - * - * If we are inside a render pass, because we vkCmd(Re)SetEvent can't happen - * inside a render pass, it is safe to move the wait job so it happens right - * before the current job we are currently recording for the subpass, if any - * (it would actually be safe to move it all the way back to right before - * the start of the render pass). - * - * If we are outside a render pass then we should not have any on-going job - * and we are free to just add the wait job without restrictions. - */ - assert(cmd_buffer->state.pass || !cmd_buffer->state.job); - list_addtail(&job->list_link, &cmd_buffer->jobs); -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_CmdWriteTimestamp(VkCommandBuffer commandBuffer, - VkPipelineStageFlagBits pipelineStage, - VkQueryPool queryPool, - uint32_t query) +v3dv_CmdWriteTimestamp2(VkCommandBuffer commandBuffer, + VkPipelineStageFlags2 stage, + VkQueryPool queryPool, + uint32_t query) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); V3DV_FROM_HANDLE(v3dv_query_pool, query_pool, queryPool); @@ -3349,24 +4165,9 @@ cmd_buffer_emit_pre_dispatch(struct v3dv_cmd_buffer *cmd_buffer) cmd_buffer->state.dirty_push_constants_stages &= ~VK_SHADER_STAGE_COMPUTE_BIT; } -#define V3D_CSD_CFG012_WG_COUNT_SHIFT 16 -#define V3D_CSD_CFG012_WG_OFFSET_SHIFT 0 -/* Allow this dispatch to start while the last one is still running. */ -#define V3D_CSD_CFG3_OVERLAP_WITH_PREV (1 << 26) -/* Maximum supergroup ID. 6 bits. */ -#define V3D_CSD_CFG3_MAX_SG_ID_SHIFT 20 -/* Batches per supergroup minus 1. 8 bits. 
*/ -#define V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT 12 -/* Workgroups per supergroup, 0 means 16 */ -#define V3D_CSD_CFG3_WGS_PER_SG_SHIFT 8 -#define V3D_CSD_CFG3_WG_SIZE_SHIFT 0 - -#define V3D_CSD_CFG5_PROPAGATE_NANS (1 << 2) -#define V3D_CSD_CFG5_SINGLE_SEG (1 << 1) -#define V3D_CSD_CFG5_THREADING (1 << 0) - void v3dv_cmd_buffer_rewrite_indirect_csd_job( + struct v3dv_device *device, struct v3dv_csd_indirect_cpu_job_info *info, const uint32_t *wg_counts) { @@ -3386,15 +4187,22 @@ v3dv_cmd_buffer_rewrite_indirect_csd_job( submit->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT; submit->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT; - submit->cfg[4] = DIV_ROUND_UP(info->wg_size, 16) * - (wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1; + uint32_t num_batches = DIV_ROUND_UP(info->wg_size, 16) * + (wg_counts[0] * wg_counts[1] * wg_counts[2]); + /* V3D 7.1.6 and later don't subtract 1 from the number of batches */ + if (device->devinfo.ver < 71 || + (device->devinfo.ver == 71 && device->devinfo.rev < 6)) { + submit->cfg[4] = num_batches - 1; + } else { + submit->cfg[4] = num_batches; + } assert(submit->cfg[4] != ~0); if (info->needs_wg_uniform_rewrite) { /* Make sure the GPU is not currently accessing the indirect CL for this * job, since we are about to overwrite some of the uniform data. */ - v3dv_bo_wait(job->device, job->indirect.bo, PIPE_TIMEOUT_INFINITE); + v3dv_bo_wait(job->device, job->indirect.bo, OS_TIMEOUT_INFINITE); for (uint32_t i = 0; i < 3; i++) { if (info->wg_uniform_offsets[i]) { @@ -3420,6 +4228,7 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, uint32_t **wg_uniform_offsets_out, uint32_t *wg_size_out) { + struct v3dv_device *device = cmd_buffer->device; struct v3dv_pipeline *pipeline = cmd_buffer->state.compute.pipeline; assert(pipeline && pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]); struct v3dv_shader_variant *cs_variant = @@ -3478,23 +4287,31 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, if (wg_size_out) *wg_size_out = wg_size; - submit->cfg[4] = num_batches - 1; + /* V3D 7.1.6 and later don't subtract 1 from the number of batches */ + if (device->devinfo.ver < 71 || + (device->devinfo.ver == 71 && device->devinfo.rev < 6)) { + submit->cfg[4] = num_batches - 1; + } else { + submit->cfg[4] = num_batches; + } assert(submit->cfg[4] != ~0); assert(pipeline->shared_data->assembly_bo); struct v3dv_bo *cs_assembly_bo = pipeline->shared_data->assembly_bo; submit->cfg[5] = cs_assembly_bo->offset + cs_variant->assembly_offset; - submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS; if (cs_variant->prog_data.base->single_seg) submit->cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG; if (cs_variant->prog_data.base->threads == 4) submit->cfg[5] |= V3D_CSD_CFG5_THREADING; + /* V3D 7.x has made the PROPAGATE_NANS bit in CFG5 reserved */ + if (device->devinfo.ver < 71) + submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS; if (cs_variant->prog_data.cs->shared_size > 0) { job->csd.shared_memory = v3dv_bo_alloc(cmd_buffer->device, - cs_variant->prog_data.cs->shared_size * wgs_per_sg, + cs_variant->prog_data.cs->shared_size * num_wgs, "shared_vars", true); if (!job->csd.shared_memory) { v3dv_flag_oom(cmd_buffer, NULL); @@ -3509,6 +4326,10 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, wg_uniform_offsets_out); submit->cfg[6] = uniforms.bo->offset + uniforms.offset; + + /* Track VK_KHR_buffer_device_address usage in the job */ + job->uses_buffer_device_address |= pipeline->uses_buffer_device_address; + v3dv_job_add_bo(job, uniforms.bo); 
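(The cfg[4] encoding above now appears twice: once in v3dv_cmd_buffer_rewrite_indirect_csd_job and once in cmd_buffer_create_csd_job. A small helper along the following lines is one way to keep the two paths in sync. This is a sketch, not part of the change itself; the helper name is hypothetical, and it assumes only the devinfo ver/rev fields the diff already uses.)

   static inline uint32_t
   csd_cfg4_batch_count(const struct v3d_device_info *devinfo,
                        uint32_t num_batches)
   {
      /* V3D 7.1.6 and later program the raw batch count in cfg[4];
       * earlier versions program the count minus one.
       */
      if (devinfo->ver < 71 || (devinfo->ver == 71 && devinfo->rev < 6))
         return num_batches - 1;
      return num_batches;
   }

(Each caller would then simply do: submit->cfg[4] = csd_cfg4_batch_count(&device->devinfo, num_batches);)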
 return job;
@@ -3541,19 +4362,6 @@ cmd_buffer_dispatch(struct v3dv_cmd_buffer *cmd_buffer,
 }
 
 VKAPI_ATTR void VKAPI_CALL
-v3dv_CmdDispatch(VkCommandBuffer commandBuffer,
- uint32_t groupCountX,
- uint32_t groupCountY,
- uint32_t groupCountZ)
-{
- V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer_emit_pre_dispatch(cmd_buffer);
- cmd_buffer_dispatch(cmd_buffer, 0, 0, 0,
- groupCountX, groupCountY, groupCountZ);
-}
-
-VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdDispatchBase(VkCommandBuffer commandBuffer,
 uint32_t baseGroupX,
 uint32_t baseGroupY,
@@ -3615,6 +4423,16 @@ cmd_buffer_dispatch_indirect(struct v3dv_cmd_buffer *cmd_buffer,
 job->cpu.csd_indirect.wg_uniform_offsets[2];
 list_addtail(&job->list_link, &cmd_buffer->jobs);
+
+ /* If we have a CPU queue we submit the CPU job directly to that queue
+ * and the CSD job will be dispatched from within the kernel. Otherwise,
+ * we have to dispatch the CSD job manually right after the CPU job, by
+ * adding it to the command buffer's list of jobs.
+ */
+ if (!cmd_buffer->device->pdevice->caps.cpu_queue)
+ list_addtail(&csd_job->list_link, &cmd_buffer->jobs);
+
 cmd_buffer->state.job = NULL;
 }
 
@@ -3633,8 +4451,144 @@ v3dv_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
 }
 
 VKAPI_ATTR void VKAPI_CALL
-v3dv_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask)
+v3dv_CmdBeginRenderingKHR(VkCommandBuffer commandBuffer,
+ const VkRenderingInfoKHR *info)
 {
- /* Nothing to do here since we only support a single device */
- assert(deviceMask == 0x1);
+ V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+
+ cmd_buffer->state.suspending = info->flags & VK_RENDERING_SUSPENDING_BIT;
+ cmd_buffer->state.resuming = info->flags & VK_RENDERING_RESUMING_BIT;
+
+ /* FIXME: for resuming passes we might not need all of this setup, since
+ * we are mostly just recording draw calls, as in secondaries.
+ */
+
+ v3dv_setup_dynamic_render_pass(cmd_buffer, info);
+ v3dv_return_if_oom(cmd_buffer, NULL);
+
+ v3dv_setup_dynamic_framebuffer(cmd_buffer, info);
+ v3dv_return_if_oom(cmd_buffer, NULL);
+
+ struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
+ state->pass = &state->dynamic_pass;
+ state->framebuffer = state->dynamic_framebuffer;
+
+ VkRenderPassBeginInfo begin_info = {
+ .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
+ .pNext = NULL,
+ .renderPass = v3dv_render_pass_to_handle(state->pass),
+ .framebuffer = v3dv_framebuffer_to_handle(state->framebuffer),
+ .renderArea = info->renderArea,
+ };
+
+ VkClearValue *clear_values = NULL;
+ if (state->pass->attachment_count > 0) {
+ clear_values =
+ vk_alloc(&cmd_buffer->device->vk.alloc,
+ state->pass->attachment_count * sizeof(VkClearValue), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
+ if (!clear_values) {
+ v3dv_flag_oom(cmd_buffer, NULL);
+ return;
+ }
+ }
+
+ for (int i = 0; i < info->colorAttachmentCount; i++) {
+ if (!info->pColorAttachments[i].imageView)
+ continue;
+
+ uint32_t a = cmd_buffer->state.dynamic_subpass.color_attachments[i].attachment;
+ assert(a < state->pass->attachment_count);
+ clear_values[a] = info->pColorAttachments[i].clearValue;
+ }
+
+ if (info->pDepthAttachment &&
+ info->pDepthAttachment->imageView != VK_NULL_HANDLE) {
+ uint32_t a = cmd_buffer->state.dynamic_subpass.ds_attachment.attachment;
+ assert(a < state->pass->attachment_count);
+ clear_values[a].depthStencil.depth =
+ info->pDepthAttachment->clearValue.depthStencil.depth;
+ }
+
+ if (info->pStencilAttachment &&
+ info->pStencilAttachment->imageView != VK_NULL_HANDLE) {
+ uint32_t a = cmd_buffer->state.dynamic_subpass.ds_attachment.attachment;
+ assert(a < state->pass->attachment_count);
+ clear_values[a].depthStencil.stencil =
+ info->pStencilAttachment->clearValue.depthStencil.stencil;
+ }
+
+ begin_info.clearValueCount = state->pass->attachment_count;
+ begin_info.pClearValues = clear_values;
+
+ cmd_buffer_ensure_render_pass_attachment_state(cmd_buffer);
+ v3dv_return_if_oom(cmd_buffer, NULL);
+ cmd_buffer_init_render_pass_attachment_state(cmd_buffer, &begin_info);
+
+ if (clear_values)
+ vk_free(&cmd_buffer->vk.pool->alloc, clear_values);
+
+ state->render_area = info->renderArea;
+ constraint_clip_window_to_render_area(cmd_buffer);
+ v3dv_cmd_buffer_subpass_start(cmd_buffer, 0);
+}
+
+VKAPI_ATTR void VKAPI_CALL
+v3dv_CmdEndRenderingKHR(VkCommandBuffer commandBuffer)
+{
+ V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+
+ v3dv_return_if_oom(cmd_buffer, NULL);
+
+ struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
+ assert(state->subpass_idx == state->pass->subpass_count - 1);
+
+ /* If we have any pending jobs that were waiting for the current job
+ * to finish and we are suspending the pass here, we need to finish the
+ * job completely and ensure we emit the pending jobs immediately.
+ *
+ * FIXME: this is not optimal, but since the resuming command buffer won't
+ * have the pending state, we can't do it after the resuming chain completes
+ * without some extra work: we would have to generate the pending jobs
+ * now but not add them to this command buffer's job list; instead, they
+ * should be added to a separate list of "pending jobs" and at submit time
+ * we would accumulate these jobs during the suspend/resume chain and emit
+ * them all after the last job in the chain.
+ */
+ if (state->suspending && cmd_buffer_has_pending_jobs(cmd_buffer))
+ v3dv_cmd_buffer_finish_job(cmd_buffer);
+
+ /* If we don't have a job and we are suspending we will need to create one
+ * so we can link to a follow-up resume job. Because we would be starting a
+ * new job, we should ensure the command buffer state is not flagged as
+ * resuming from a previous suspend. The new job will consume any pending
+ * barrier state if necessary.
+ */
+ struct v3dv_job *job = cmd_buffer->state.job;
+ if (!job && state->suspending) {
+ state->resuming = false;
+ job = v3dv_cmd_buffer_subpass_resume(cmd_buffer, state->subpass_idx);
+ if (!job)
+ return;
+ }
+
+ /* If this job is suspending, it means it will continue execution in another
+ * job (with the same RCL spec). We implement this by branching the BCL and
+ * we will patch the branch address when we know the resuming job.
+ */
+ if (state->suspending)
+ v3dv_X(cmd_buffer->device, cmd_buffer_suspend)(cmd_buffer);
+
+ v3dv_cmd_buffer_subpass_finish(cmd_buffer);
+ v3dv_cmd_buffer_finish_job(cmd_buffer);
+
+ /* This must be done after the resume/suspend chain has completed. */
+ if (!state->suspending)
+ cmd_buffer_subpass_handle_pending_resolves(cmd_buffer);
+
+ state->framebuffer = NULL;
+ state->pass = NULL;
+ state->subpass_idx = -1;
+ state->suspending = false;
+ state->resuming = false;
 }
diff --git a/src/broadcom/vulkan/v3dv_debug.c b/src/broadcom/vulkan/v3dv_debug.c
index 055300d05c9..065e8f66026 100644
--- a/src/broadcom/vulkan/v3dv_debug.c
+++ b/src/broadcom/vulkan/v3dv_debug.c
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2019 Raspberry Pi
+ * Copyright © 2019 Raspberry Pi Ltd
 *
 * based in part on radv_debug.h which is:
 * Copyright © 2017 Google.
diff --git a/src/broadcom/vulkan/v3dv_debug.h b/src/broadcom/vulkan/v3dv_debug.h
index 75f253700ed..bab21eef2b8 100644
--- a/src/broadcom/vulkan/v3dv_debug.h
+++ b/src/broadcom/vulkan/v3dv_debug.h
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2019 Raspberry Pi
+ * Copyright © 2019 Raspberry Pi Ltd
 *
 * based in part on radv_debug.h which is:
 * Copyright © 2017 Google.
diff --git a/src/broadcom/vulkan/v3dv_descriptor_set.c b/src/broadcom/vulkan/v3dv_descriptor_set.c
index fd9ec935611..1d777ba08d4 100644
--- a/src/broadcom/vulkan/v3dv_descriptor_set.c
+++ b/src/broadcom/vulkan/v3dv_descriptor_set.c
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2019 Raspberry Pi
+ * Copyright © 2019 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -28,19 +28,26 @@
 /*
 * For a given descriptor defined by the descriptor_set it belongs, its
- * binding layout, and array_index, it returns the map region assigned to it
- * from the descriptor pool bo.
+ * binding layout, array_index, and plane, it returns the map region assigned
+ * to it from the descriptor pool bo.
 */
-static void*
+static void *
 descriptor_bo_map(struct v3dv_device *device,
 struct v3dv_descriptor_set *set,
 const struct v3dv_descriptor_set_binding_layout *binding_layout,
 uint32_t array_index)
 {
- assert(v3dv_X(device, descriptor_bo_size)(binding_layout->type) > 0);
+ /* Inline uniform blocks use BO memory to store UBO contents, not
+ * descriptor data, so their descriptor BO size is 0 even though they
+ * do use BO memory.
+ */ + uint32_t bo_size = v3dv_X(device, descriptor_bo_size)(binding_layout->type); + assert(bo_size > 0 || + binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK); + return set->pool->bo->map + set->base_offset + binding_layout->descriptor_offset + - array_index * v3dv_X(device, descriptor_bo_size)(binding_layout->type); + array_index * binding_layout->plane_stride * bo_size; } static bool @@ -102,7 +109,7 @@ v3dv_descriptor_map_get_descriptor(struct v3dv_descriptor_state *descriptor_stat * It also returns the descriptor type, so the caller could do extra * validation or adding extra offsets if the bo contains more that one field. */ -static struct v3dv_cl_reloc +struct v3dv_cl_reloc v3dv_descriptor_map_get_descriptor_bo(struct v3dv_device *device, struct v3dv_descriptor_state *descriptor_state, struct v3dv_descriptor_map *map, @@ -125,8 +132,13 @@ v3dv_descriptor_map_get_descriptor_bo(struct v3dv_device *device, const struct v3dv_descriptor_set_binding_layout *binding_layout = &set->layout->binding[binding_number]; - assert(v3dv_X(device, descriptor_bo_size)(binding_layout->type) > 0); - *out_type = binding_layout->type; + + uint32_t bo_size = v3dv_X(device, descriptor_bo_size)(binding_layout->type); + + assert(binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK || + bo_size > 0); + if (out_type) + *out_type = binding_layout->type; uint32_t array_index = map->array_index[index]; assert(array_index < binding_layout->array_size); @@ -134,7 +146,7 @@ v3dv_descriptor_map_get_descriptor_bo(struct v3dv_device *device, struct v3dv_cl_reloc reloc = { .bo = set->pool->bo, .offset = set->base_offset + binding_layout->descriptor_offset + - array_index * v3dv_X(device, descriptor_bo_size)(binding_layout->type), + array_index * binding_layout->plane_stride * bo_size, }; return reloc; @@ -213,40 +225,11 @@ v3dv_descriptor_map_get_sampler_state(struct v3dv_device *device, type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); if (type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) - reloc.offset += v3dv_X(device, combined_image_sampler_sampler_state_offset)(); + reloc.offset += v3dv_X(device, combined_image_sampler_sampler_state_offset)(map->plane[index]); return reloc; } -const struct v3dv_format* -v3dv_descriptor_map_get_texture_format(struct v3dv_descriptor_state *descriptor_state, - struct v3dv_descriptor_map *map, - struct v3dv_pipeline_layout *pipeline_layout, - uint32_t index, - VkFormat *out_vk_format) -{ - struct v3dv_descriptor *descriptor = - v3dv_descriptor_map_get_descriptor(descriptor_state, map, - pipeline_layout, index, NULL); - - switch (descriptor->type) { - case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: - case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: - assert(descriptor->buffer_view); - *out_vk_format = descriptor->buffer_view->vk_format; - return descriptor->buffer_view->format; - case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: - case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: - case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: - case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: - assert(descriptor->image_view); - *out_vk_format = descriptor->image_view->vk.format; - return descriptor->image_view->format; - default: - unreachable("descriptor type doesn't has a texture format"); - } -} - struct v3dv_bo* v3dv_descriptor_map_get_texture_bo(struct v3dv_descriptor_state *descriptor_state, struct v3dv_descriptor_map *map, @@ -270,7 +253,8 @@ v3dv_descriptor_map_get_texture_bo(struct v3dv_descriptor_state *descriptor_stat assert(descriptor->image_view); struct v3dv_image *image = (struct v3dv_image 
*) descriptor->image_view->vk.image; - return image->mem->bo; + assert(map->plane[index] < image->plane_count); + return image->planes[map->plane[index]].mem->bo; } default: unreachable("descriptor type doesn't has a texture bo"); @@ -299,11 +283,66 @@ v3dv_descriptor_map_get_texture_shader_state(struct v3dv_device *device, type == VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER); if (type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) - reloc.offset += v3dv_X(device, combined_image_sampler_texture_state_offset)(); + reloc.offset += v3dv_X(device, combined_image_sampler_texture_state_offset)(map->plane[index]); return reloc; } +#define SHA1_UPDATE_VALUE(ctx, x) _mesa_sha1_update(ctx, &(x), sizeof(x)); + +static void +sha1_update_ycbcr_conversion(struct mesa_sha1 *ctx, + const struct vk_ycbcr_conversion_state *conversion) +{ + SHA1_UPDATE_VALUE(ctx, conversion->format); + SHA1_UPDATE_VALUE(ctx, conversion->ycbcr_model); + SHA1_UPDATE_VALUE(ctx, conversion->ycbcr_range); + SHA1_UPDATE_VALUE(ctx, conversion->mapping); + SHA1_UPDATE_VALUE(ctx, conversion->chroma_offsets); + SHA1_UPDATE_VALUE(ctx, conversion->chroma_reconstruction); +} + +static void +sha1_update_descriptor_set_binding_layout(struct mesa_sha1 *ctx, + const struct v3dv_descriptor_set_binding_layout *layout, + const struct v3dv_descriptor_set_layout *set_layout) +{ + SHA1_UPDATE_VALUE(ctx, layout->type); + SHA1_UPDATE_VALUE(ctx, layout->array_size); + SHA1_UPDATE_VALUE(ctx, layout->descriptor_index); + SHA1_UPDATE_VALUE(ctx, layout->dynamic_offset_count); + SHA1_UPDATE_VALUE(ctx, layout->dynamic_offset_index); + SHA1_UPDATE_VALUE(ctx, layout->descriptor_offset); + SHA1_UPDATE_VALUE(ctx, layout->immutable_samplers_offset); + SHA1_UPDATE_VALUE(ctx, layout->plane_stride); + + if (layout->immutable_samplers_offset) { + const struct v3dv_sampler *immutable_samplers = + v3dv_immutable_samplers(set_layout, layout); + + for (unsigned i = 0; i < layout->array_size; i++) { + const struct v3dv_sampler *sampler = &immutable_samplers[i]; + if (sampler->conversion) + sha1_update_ycbcr_conversion(ctx, &sampler->conversion->state); + } + } +} + +static void +sha1_update_descriptor_set_layout(struct mesa_sha1 *ctx, + const struct v3dv_descriptor_set_layout *layout) +{ + SHA1_UPDATE_VALUE(ctx, layout->flags); + SHA1_UPDATE_VALUE(ctx, layout->binding_count); + SHA1_UPDATE_VALUE(ctx, layout->shader_stages); + SHA1_UPDATE_VALUE(ctx, layout->descriptor_count); + SHA1_UPDATE_VALUE(ctx, layout->dynamic_offset_count); + + for (uint16_t i = 0; i < layout->binding_count; i++) + sha1_update_descriptor_set_binding_layout(ctx, &layout->binding[i], layout); +} + + /* * As anv and tu already points: * @@ -326,16 +365,17 @@ v3dv_CreatePipelineLayout(VkDevice _device, layout = vk_object_zalloc(&device->vk, pAllocator, sizeof(*layout), VK_OBJECT_TYPE_PIPELINE_LAYOUT); if (layout == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); layout->num_sets = pCreateInfo->setLayoutCount; + layout->ref_cnt = 1; uint32_t dynamic_offset_count = 0; for (uint32_t set = 0; set < pCreateInfo->setLayoutCount; set++) { V3DV_FROM_HANDLE(v3dv_descriptor_set_layout, set_layout, pCreateInfo->pSetLayouts[set]); + v3dv_descriptor_set_layout_ref(set_layout); layout->set[set].layout = set_layout; - layout->set[set].dynamic_offset_start = dynamic_offset_count; for (uint32_t b = 0; b < set_layout->binding_count; b++) { dynamic_offset_count += set_layout->binding[b].array_size * @@ -356,11 +396,34 @@ 
v3dv_CreatePipelineLayout(VkDevice _device, layout->dynamic_offset_count = dynamic_offset_count; + struct mesa_sha1 ctx; + _mesa_sha1_init(&ctx); + for (unsigned s = 0; s < layout->num_sets; s++) { + sha1_update_descriptor_set_layout(&ctx, layout->set[s].layout); + _mesa_sha1_update(&ctx, &layout->set[s].dynamic_offset_start, + sizeof(layout->set[s].dynamic_offset_start)); + } + _mesa_sha1_update(&ctx, &layout->num_sets, sizeof(layout->num_sets)); + _mesa_sha1_final(&ctx, layout->sha1); + *pPipelineLayout = v3dv_pipeline_layout_to_handle(layout); return VK_SUCCESS; } +void +v3dv_pipeline_layout_destroy(struct v3dv_device *device, + struct v3dv_pipeline_layout *layout, + const VkAllocationCallbacks *alloc) +{ + assert(layout); + + for (uint32_t i = 0; i < layout->num_sets; i++) + v3dv_descriptor_set_layout_unref(device, layout->set[i].layout); + + vk_object_free(&device->vk, alloc, layout); +} + VKAPI_ATTR void VKAPI_CALL v3dv_DestroyPipelineLayout(VkDevice _device, VkPipelineLayout _pipelineLayout, @@ -371,7 +434,8 @@ v3dv_DestroyPipelineLayout(VkDevice _device, if (!pipeline_layout) return; - vk_object_free(&device->vk, pAllocator, pipeline_layout); + + v3dv_pipeline_layout_unref(device, pipeline_layout, pAllocator); } VKAPI_ATTR VkResult VKAPI_CALL @@ -393,7 +457,10 @@ v3dv_CreateDescriptorPool(VkDevice _device, uint32_t bo_size = 0; uint32_t descriptor_count = 0; - assert(pCreateInfo->poolSizeCount > 0); + const VkDescriptorPoolInlineUniformBlockCreateInfo *inline_info = + vk_find_struct_const(pCreateInfo->pNext, + DESCRIPTOR_POOL_INLINE_UNIFORM_BLOCK_CREATE_INFO); + for (unsigned i = 0; i < pCreateInfo->poolSizeCount; ++i) { /* Verify supported descriptor type */ switch(pCreateInfo->pPoolSizes[i].type) { @@ -408,6 +475,7 @@ v3dv_CreateDescriptorPool(VkDevice _device, case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: break; default: unreachable("Unimplemented descriptor type"); @@ -415,9 +483,28 @@ v3dv_CreateDescriptorPool(VkDevice _device, } assert(pCreateInfo->pPoolSizes[i].descriptorCount > 0); - descriptor_count += pCreateInfo->pPoolSizes[i].descriptorCount; - bo_size += v3dv_X(device, descriptor_bo_size)(pCreateInfo->pPoolSizes[i].type) * - pCreateInfo->pPoolSizes[i].descriptorCount; + if (pCreateInfo->pPoolSizes[i].type == + VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { + /* Inline uniform blocks are specified to use the descriptor array + * size as the size in bytes of the block. + */ + assert(inline_info); + descriptor_count += inline_info->maxInlineUniformBlockBindings; + bo_size += pCreateInfo->pPoolSizes[i].descriptorCount; + } else { + descriptor_count += pCreateInfo->pPoolSizes[i].descriptorCount; + bo_size += v3dv_X(device, descriptor_bo_size)(pCreateInfo->pPoolSizes[i].type) * + pCreateInfo->pPoolSizes[i].descriptorCount; + } + } + + /* We align all our buffers to V3D_NON_COHERENT_ATOM_SIZE, make sure we + * allocate enough memory to honor that requirement for all our inline + * buffers too. 
+ */ + if (inline_info) { + bo_size += V3D_NON_COHERENT_ATOM_SIZE * + inline_info->maxInlineUniformBlockBindings; } if (!(pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT)) { @@ -433,7 +520,7 @@ v3dv_CreateDescriptorPool(VkDevice _device, VK_OBJECT_TYPE_DESCRIPTOR_POOL); if (!pool) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); if (!(pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT)) { pool->host_memory_base = (uint8_t*)pool + sizeof(struct v3dv_descriptor_pool); @@ -457,13 +544,15 @@ v3dv_CreateDescriptorPool(VkDevice _device, pool->bo = NULL; } + list_inithead(&pool->set_list); + *pDescriptorPool = v3dv_descriptor_pool_to_handle(pool); return VK_SUCCESS; out_of_device_memory: vk_object_free(&device->vk, pAllocator, pool); - return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); } static void @@ -498,6 +587,11 @@ v3dv_DestroyDescriptorPool(VkDevice _device, if (!pool) return; + list_for_each_entry_safe(struct v3dv_descriptor_set, set, + &pool->set_list, pool_link) { + v3dv_descriptor_set_layout_unref(device, set->layout); + } + if (!pool->host_memory_base) { for(int i = 0; i < pool->entry_count; ++i) { descriptor_set_destroy(device, pool, pool->entries[i].set, false); @@ -520,6 +614,12 @@ v3dv_ResetDescriptorPool(VkDevice _device, V3DV_FROM_HANDLE(v3dv_device, device, _device); V3DV_FROM_HANDLE(v3dv_descriptor_pool, pool, descriptorPool); + list_for_each_entry_safe(struct v3dv_descriptor_set, set, + &pool->set_list, pool_link) { + v3dv_descriptor_set_layout_unref(device, set->layout); + } + list_inithead(&pool->set_list); + if (!pool->host_memory_base) { for(int i = 0; i < pool->entry_count; ++i) { descriptor_set_destroy(device, pool, pool->entries[i].set, false); @@ -539,6 +639,15 @@ v3dv_ResetDescriptorPool(VkDevice _device, return VK_SUCCESS; } +void +v3dv_descriptor_set_layout_destroy(struct v3dv_device *device, + struct v3dv_descriptor_set_layout *set_layout) +{ + assert(set_layout->ref_cnt == 0); + vk_object_base_finish(&set_layout->base); + vk_free2(&device->vk.alloc, NULL, set_layout); +} + VKAPI_ATTR VkResult VKAPI_CALL v3dv_CreateDescriptorSetLayout(VkDevice _device, const VkDescriptorSetLayoutCreateInfo *pCreateInfo, @@ -552,6 +661,13 @@ v3dv_CreateDescriptorSetLayout(VkDevice _device, uint32_t num_bindings = 0; uint32_t immutable_sampler_count = 0; + + /* for immutable descriptors, the plane stride is the largest plane + * count of all combined image samplers. For mutable descriptors + * this is always 1 since multiplanar images are restricted to + * immutable combined image samplers. 
+ */
+ uint8_t plane_stride = 1;
 for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) {
 num_bindings = MAX2(num_bindings, pCreateInfo->pBindings[j].binding + 1);
@@ -570,22 +686,40 @@ v3dv_CreateDescriptorSetLayout(VkDevice _device,
 if ((desc_type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ||
 desc_type == VK_DESCRIPTOR_TYPE_SAMPLER) &&
 pCreateInfo->pBindings[j].pImmutableSamplers) {
- immutable_sampler_count += pCreateInfo->pBindings[j].descriptorCount;
+ uint32_t descriptor_count = pCreateInfo->pBindings[j].descriptorCount;
+ immutable_sampler_count += descriptor_count;
+
+ for (uint32_t i = 0; i < descriptor_count; i++) {
+ const VkSampler vk_sampler =
+ pCreateInfo->pBindings[j].pImmutableSamplers[i];
+ VK_FROM_HANDLE(v3dv_sampler, sampler, vk_sampler);
+ plane_stride = MAX2(plane_stride, sampler->plane_count);
+ }
 }
 }
 
- uint32_t samplers_offset = sizeof(struct v3dv_descriptor_set_layout) +
- num_bindings * sizeof(set_layout->binding[0]);
+ /* We place immutable samplers after the binding data. We want to use
+ * offsetof instead of any sizeof(struct v3dv_descriptor_set_layout)
+ * because the latter may include padding at the end of the struct.
+ */
+ uint32_t samplers_offset =
+ offsetof(struct v3dv_descriptor_set_layout, binding[num_bindings]);
+
 uint32_t size = samplers_offset +
 immutable_sampler_count * sizeof(struct v3dv_sampler);
 
- set_layout = vk_object_zalloc(&device->vk, pAllocator, size,
- VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT);
-
+ /* Descriptor set layouts are reference counted and therefore can survive
+ * vkDestroyDescriptorSetLayout, so they need to be allocated with a device
+ * scope.
+ */
+ set_layout =
+ vk_zalloc(&device->vk.alloc, size, 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
 if (!set_layout)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ vk_object_base_init(&device->vk, &set_layout->base,
+ VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT);
 
- /* We just allocate all the immutable samplers at the end of the struct */
 struct v3dv_sampler *samplers = (void*) &set_layout->binding[num_bindings];
 
 assert(pCreateInfo->bindingCount == 0 || num_bindings > 0);
@@ -594,17 +728,15 @@ v3dv_CreateDescriptorSetLayout(VkDevice _device,
 VkResult result = vk_create_sorted_bindings(pCreateInfo->pBindings,
 pCreateInfo->bindingCount, &bindings);
 if (result != VK_SUCCESS) {
- vk_object_free(&device->vk, pAllocator, set_layout);
- return vk_error(device->instance, result);
+ v3dv_descriptor_set_layout_destroy(device, set_layout);
+ return vk_error(device, result);
 }
 
- memset(set_layout->binding, 0,
- size - sizeof(struct v3dv_descriptor_set_layout));
-
 set_layout->binding_count = num_bindings;
 set_layout->flags = pCreateInfo->flags;
 set_layout->shader_stages = 0;
 set_layout->bo_size = 0;
+ set_layout->ref_cnt = 1;
 
 uint32_t descriptor_count = 0;
 uint32_t dynamic_offset_count = 0;
@@ -628,6 +760,7 @@ v3dv_CreateDescriptorSetLayout(VkDevice _device,
 case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
 case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
 case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
+ case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
 /* Nothing here, just to keep the descriptor type filtering below */
 break;
 default:
@@ -639,6 +772,7 @@ v3dv_CreateDescriptorSetLayout(VkDevice _device,
 set_layout->binding[binding_number].array_size = binding->descriptorCount;
 set_layout->binding[binding_number].descriptor_index = descriptor_count;
 set_layout->binding[binding_number].dynamic_offset_index = dynamic_offset_count;
+ set_layout->binding[binding_number].plane_stride = plane_stride;
 
 if ((binding->descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ||
 binding->descriptorType == VK_DESCRIPTOR_TYPE_SAMPLER) &&
@@ -651,18 +785,40 @@ v3dv_CreateDescriptorSetLayout(VkDevice _device,
 samplers += binding->descriptorCount;
 samplers_offset += sizeof(struct v3dv_sampler) * binding->descriptorCount;
- }
 
- descriptor_count += binding->descriptorCount;
- dynamic_offset_count += binding->descriptorCount *
- set_layout->binding[binding_number].dynamic_offset_count;
+ set_layout->binding[binding_number].plane_stride = plane_stride;
+ }
 
 set_layout->shader_stages |= binding->stageFlags;
 
- set_layout->binding[binding_number].descriptor_offset = set_layout->bo_size;
- set_layout->bo_size +=
- v3dv_X(device, descriptor_bo_size)(set_layout->binding[binding_number].type) *
- binding->descriptorCount;
+ if (binding->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
+ dynamic_offset_count += binding->descriptorCount *
+ set_layout->binding[binding_number].dynamic_offset_count;
+
+ descriptor_count += binding->descriptorCount;
+
+ set_layout->binding[binding_number].descriptor_offset =
+ set_layout->bo_size;
+ set_layout->bo_size +=
+ v3dv_X(device, descriptor_bo_size)(set_layout->binding[binding_number].type) *
+ binding->descriptorCount * set_layout->binding[binding_number].plane_stride;
+ } else {
+ /* We align all our buffers, inline buffers too. We made sure to take
+ * this into account when calculating total BO size requirements at
+ * pool creation time.
+ */
+ set_layout->bo_size = align(set_layout->bo_size,
+ V3D_NON_COHERENT_ATOM_SIZE);
+
+ set_layout->binding[binding_number].descriptor_offset =
+ set_layout->bo_size;
+
+ /* Inline uniform blocks are not arrayed; instead, descriptorCount
+ * specifies the size of the buffer in bytes.
+ */
+ set_layout->bo_size += binding->descriptorCount;
+ descriptor_count++;
+ }
 }
 
 free(bindings);
@@ -686,7 +842,7 @@ v3dv_DestroyDescriptorSetLayout(VkDevice _device,
 if (!set_layout)
 return;
 
- vk_object_free(&device->vk, pAllocator, set_layout);
+ v3dv_descriptor_set_layout_unref(device, set_layout);
 }
 
 static inline VkResult
@@ -697,7 +853,7 @@ out_of_pool_memory(const struct v3dv_device *device,
 * by allocating a new pool, so they don't point to real issues.
 */
 if (!pool->is_driver_internal)
- return vk_error(device->instance, VK_ERROR_OUT_OF_POOL_MEMORY)
+ return vk_error(device, VK_ERROR_OUT_OF_POOL_MEMORY);
 else
 return VK_ERROR_OUT_OF_POOL_MEMORY;
 }
@@ -705,7 +861,7 @@ out_of_pool_memory(const struct v3dv_device *device,
 static VkResult
 descriptor_set_create(struct v3dv_device *device,
 struct v3dv_descriptor_pool *pool,
- const struct v3dv_descriptor_set_layout *layout,
+ struct v3dv_descriptor_set_layout *layout,
 struct v3dv_descriptor_set **out_set)
 {
 struct v3dv_descriptor_set *set;
@@ -726,7 +882,7 @@ descriptor_set_create(struct v3dv_device *device,
 VK_OBJECT_TYPE_DESCRIPTOR_SET);
 
 if (!set)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
 }
 
 set->pool = pool;
@@ -797,19 +953,24 @@ descriptor_set_create(struct v3dv_device *device,
 layout->binding[b].immutable_samplers_offset);
 
 for (uint32_t i = 0; i < layout->binding[b].array_size; i++) {
- uint32_t combined_offset =
- layout->binding[b].type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ?
- v3dv_X(device, combined_image_sampler_sampler_state_offset)() : 0; - - void *desc_map = descriptor_bo_map(device, set, &layout->binding[b], i); - desc_map += combined_offset; - - memcpy(desc_map, - samplers[i].sampler_state, - sizeof(samplers[i].sampler_state)); + assert(samplers[i].plane_count <= V3DV_MAX_PLANE_COUNT); + for (uint8_t plane = 0; plane < samplers[i].plane_count; plane++) { + uint32_t combined_offset = + layout->binding[b].type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ? + v3dv_X(device, combined_image_sampler_sampler_state_offset)(plane) : 0; + void *desc_map = + descriptor_bo_map(device, set, &layout->binding[b], i); + desc_map += combined_offset; + + memcpy(desc_map, samplers[i].sampler_state, + sizeof(samplers[i].sampler_state)); + } } } + v3dv_descriptor_set_layout_ref(layout); + list_addtail(&set->pool_link, &pool->set_list); + *out_set = set; return VK_SUCCESS; @@ -860,8 +1021,13 @@ v3dv_FreeDescriptorSets(VkDevice _device, for (uint32_t i = 0; i < count; i++) { V3DV_FROM_HANDLE(v3dv_descriptor_set, set, pDescriptorSets[i]); - if (set && !pool->host_memory_base) - descriptor_set_destroy(device, pool, set, true); + + if (set) { + v3dv_descriptor_set_layout_unref(device, set->layout); + list_del(&set->pool_link); + if (!pool->host_memory_base) + descriptor_set_destroy(device, pool, set, true); + } } return VK_SUCCESS; @@ -877,11 +1043,16 @@ descriptor_bo_copy(struct v3dv_device *device, uint32_t src_array_index) { assert(dst_binding_layout->type == src_binding_layout->type); + assert(src_binding_layout->plane_stride == dst_binding_layout->plane_stride); - void *dst_map = descriptor_bo_map(device, dst_set, dst_binding_layout, dst_array_index); - void *src_map = descriptor_bo_map(device, src_set, src_binding_layout, src_array_index); + void *dst_map = descriptor_bo_map(device, dst_set, dst_binding_layout, + dst_array_index); + void *src_map = descriptor_bo_map(device, src_set, src_binding_layout, + src_array_index); - memcpy(dst_map, src_map, v3dv_X(device, descriptor_bo_size)(src_binding_layout->type)); + memcpy(dst_map, src_map, + v3dv_X(device, descriptor_bo_size)(src_binding_layout->type) * + src_binding_layout->plane_stride); } static void @@ -916,26 +1087,39 @@ write_image_descriptor(struct v3dv_device *device, descriptor->sampler = sampler; descriptor->image_view = iview; + assert(iview || sampler); + uint8_t plane_count = iview ? iview->plane_count : sampler->plane_count; + void *desc_map = descriptor_bo_map(device, set, binding_layout, array_index); - if (iview) { - const uint32_t tex_state_index = - iview->vk.view_type != VK_IMAGE_VIEW_TYPE_CUBE_ARRAY || - desc_type != VK_DESCRIPTOR_TYPE_STORAGE_IMAGE ? 0 : 1; - memcpy(desc_map, - iview->texture_shader_state[tex_state_index], - sizeof(iview->texture_shader_state[0])); - desc_map += v3dv_X(device, combined_image_sampler_sampler_state_offset)(); - } + for (uint8_t plane = 0; plane < plane_count; plane++) { + if (iview) { + uint32_t offset = desc_type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ? 
+ v3dv_X(device, combined_image_sampler_texture_state_offset)(plane) : 0;
- if (sampler && !binding_layout->immutable_samplers_offset) {
- /* For immutable samplers this was already done as part of the
- * descriptor set create, as that info can't change later
- */
- memcpy(desc_map,
- sampler->sampler_state,
- sizeof(sampler->sampler_state));
+ void *plane_desc_map = desc_map + offset;
+
+ const uint32_t tex_state_index =
+ iview->vk.view_type != VK_IMAGE_VIEW_TYPE_CUBE_ARRAY ||
+ desc_type != VK_DESCRIPTOR_TYPE_STORAGE_IMAGE ? 0 : 1;
+ memcpy(plane_desc_map,
+ iview->planes[plane].texture_shader_state[tex_state_index],
+ sizeof(iview->planes[plane].texture_shader_state[0]));
+ }
+
+ if (sampler && !binding_layout->immutable_samplers_offset) {
+ uint32_t offset = desc_type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ?
+ v3dv_X(device, combined_image_sampler_sampler_state_offset)(plane) : 0;
+
+ void *plane_desc_map = desc_map + offset;
+ /* For immutable samplers this was already done as part of the
+ * descriptor set create, as that info can't change later
+ */
+ memcpy(plane_desc_map,
+ sampler->sampler_state,
+ sizeof(sampler->sampler_state));
+ }
 }
 }
 
@@ -960,6 +1144,31 @@ write_buffer_view_descriptor(struct v3dv_device *device,
 sizeof(bview->texture_shader_state));
 }
 
+static void
+write_inline_uniform_descriptor(struct v3dv_device *device,
+ struct v3dv_descriptor *descriptor,
+ struct v3dv_descriptor_set *set,
+ const struct v3dv_descriptor_set_binding_layout *binding_layout,
+ const void *data,
+ size_t offset,
+ size_t size)
+{
+ assert(binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK);
+ descriptor->type = VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK;
+ descriptor->buffer = NULL;
+
+ void *desc_map = descriptor_bo_map(device, set, binding_layout, 0);
+ memcpy(desc_map + offset, data, size);
+
+ /* The pool allocates BO space up front for all the inline buffers it may
+ * hold, and this space is assigned to individual descriptors as they are
+ * written, so we define the range of an inline buffer as the largest
+ * range of data that the client has written to it.
+ */
+ descriptor->offset = 0;
+ descriptor->range = MAX2(descriptor->range, offset + size);
+}
+
 VKAPI_ATTR void VKAPI_CALL
 v3dv_UpdateDescriptorSets(VkDevice _device,
 uint32_t descriptorWriteCount,
@@ -978,9 +1187,20 @@ v3dv_UpdateDescriptorSets(VkDevice _device,
 struct v3dv_descriptor *descriptor = set->descriptors;
 
 descriptor += binding_layout->descriptor_index;
- descriptor += writeset->dstArrayElement;
 
- for (uint32_t j = 0; j < writeset->descriptorCount; ++j) {
+ /* Inline uniform blocks are not arrayed; instead, they use dstArrayElement
+ * to specify the byte offset of the uniform update and descriptorCount
+ * to specify the size (in bytes) of the update.
+ */
+ uint32_t descriptor_count;
+ if (writeset->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
+ descriptor += writeset->dstArrayElement;
+ descriptor_count = writeset->descriptorCount;
+ } else {
+ descriptor_count = 1;
+ }
+
+ for (uint32_t j = 0; j < descriptor_count; ++j) {
 switch(writeset->descriptorType) {
 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
@@ -993,12 +1213,11 @@ v3dv_UpdateDescriptorSets(VkDevice _device,
 break;
 }
 case VK_DESCRIPTOR_TYPE_SAMPLER: {
- /* If we are here we shouldn't be modifying a immutable sampler,
- * so we don't ensure that would work or not crash. But let the
- * validation layers check that
- */
+ /* If we are here we shouldn't be modifying an immutable sampler */
+ assert(!binding_layout->immutable_samplers_offset);
 const VkDescriptorImageInfo *image_info = writeset->pImageInfo + j;
 V3DV_FROM_HANDLE(v3dv_sampler, sampler, image_info->sampler);
+
 write_image_descriptor(device, descriptor, writeset->descriptorType,
 set, binding_layout, NULL, sampler,
 writeset->dstArrayElement + j);
@@ -1010,6 +1229,7 @@ v3dv_UpdateDescriptorSets(VkDevice _device,
 case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: {
 const VkDescriptorImageInfo *image_info = writeset->pImageInfo + j;
 V3DV_FROM_HANDLE(v3dv_image_view, iview, image_info->imageView);
+
 write_image_descriptor(device, descriptor, writeset->descriptorType,
 set, binding_layout, iview, NULL,
 writeset->dstArrayElement + j);
@@ -1019,7 +1239,17 @@ v3dv_UpdateDescriptorSets(VkDevice _device,
 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: {
 const VkDescriptorImageInfo *image_info = writeset->pImageInfo + j;
 V3DV_FROM_HANDLE(v3dv_image_view, iview, image_info->imageView);
- V3DV_FROM_HANDLE(v3dv_sampler, sampler, image_info->sampler);
+ struct v3dv_sampler *sampler = NULL;
+ if (!binding_layout->immutable_samplers_offset) {
+ /* In general we ignore the sampler when updating a combined
+ * image sampler, but for YCbCr we know that we must use
+ * immutable combined image samplers
+ */
+ assert(iview->plane_count == 1);
+ V3DV_FROM_HANDLE(v3dv_sampler, _sampler, image_info->sampler);
+ sampler = _sampler;
+ }
+
 write_image_descriptor(device, descriptor, writeset->descriptorType,
 set, binding_layout, iview, sampler,
 writeset->dstArrayElement + j);
@@ -1035,6 +1265,18 @@ v3dv_UpdateDescriptorSets(VkDevice _device,
 writeset->dstArrayElement + j);
 break;
 }
+ case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: {
+ const VkWriteDescriptorSetInlineUniformBlock *inline_write =
+ vk_find_struct_const(writeset->pNext,
+ WRITE_DESCRIPTOR_SET_INLINE_UNIFORM_BLOCK);
+ assert(inline_write->dataSize == writeset->descriptorCount);
+ write_inline_uniform_descriptor(device, descriptor, set,
+ binding_layout,
+ inline_write->pData,
+ writeset->dstArrayElement, /* offset */
+ inline_write->dataSize);
+ break;
+ }
 default:
 unreachable("unimplemented descriptor type");
 break;
@@ -1061,9 +1303,25 @@ v3dv_UpdateDescriptorSets(VkDevice _device,
 struct v3dv_descriptor *dst_descriptor = dst_set->descriptors;
 
 src_descriptor += src_binding_layout->descriptor_index;
- src_descriptor += copyset->srcArrayElement;
- dst_descriptor += dst_binding_layout->descriptor_index;
+ dst_descriptor += dst_binding_layout->descriptor_index;
+
+ if (src_binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
+ /* {src,dst}ArrayElement specifies src/dst start offset and
+ * descriptorCount specifies size (in bytes) to copy.
+ */ + const void *src_data = src_set->pool->bo->map + + src_set->base_offset + + src_binding_layout->descriptor_offset + + copyset->srcArrayElement; + write_inline_uniform_descriptor(device, dst_descriptor, dst_set, + dst_binding_layout, + src_data, + copyset->dstArrayElement, + copyset->descriptorCount); + continue; + } + + src_descriptor += copyset->srcArrayElement; dst_descriptor += copyset->dstArrayElement; for (uint32_t j = 0; j < copyset->descriptorCount; j++) { @@ -1127,66 +1385,6 @@ v3dv_GetDescriptorSetLayoutSupport( pSupport->supported = supported; } -VkResult -v3dv_CreateDescriptorUpdateTemplate( - VkDevice _device, - const VkDescriptorUpdateTemplateCreateInfo *pCreateInfo, - const VkAllocationCallbacks *pAllocator, - VkDescriptorUpdateTemplate *pDescriptorUpdateTemplate) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - struct v3dv_descriptor_update_template *template; - - size_t size = sizeof(*template) + - pCreateInfo->descriptorUpdateEntryCount * sizeof(template->entries[0]); - template = vk_object_alloc(&device->vk, pAllocator, size, - VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE); - if (template == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - - template->bind_point = pCreateInfo->pipelineBindPoint; - - assert(pCreateInfo->templateType == - VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET); - template->set = pCreateInfo->set; - - template->entry_count = pCreateInfo->descriptorUpdateEntryCount; - for (uint32_t i = 0; i < template->entry_count; i++) { - const VkDescriptorUpdateTemplateEntry *pEntry = - &pCreateInfo->pDescriptorUpdateEntries[i]; - - template->entries[i] = (struct v3dv_descriptor_template_entry) { - .type = pEntry->descriptorType, - .binding = pEntry->dstBinding, - .array_element = pEntry->dstArrayElement, - .array_count = pEntry->descriptorCount, - .offset = pEntry->offset, - .stride = pEntry->stride, - }; - } - - *pDescriptorUpdateTemplate = - v3dv_descriptor_update_template_to_handle(template); - - return VK_SUCCESS; -} - -void -v3dv_DestroyDescriptorUpdateTemplate( - VkDevice _device, - VkDescriptorUpdateTemplate descriptorUpdateTemplate, - const VkAllocationCallbacks *pAllocator) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_descriptor_update_template, template, - descriptorUpdateTemplate); - - if (!template) - return; - - vk_object_free(&device->vk, pAllocator, template); -} - void v3dv_UpdateDescriptorSetWithTemplate( VkDevice _device, @@ -1196,11 +1394,11 @@ v3dv_UpdateDescriptorSetWithTemplate( { V3DV_FROM_HANDLE(v3dv_device, device, _device); V3DV_FROM_HANDLE(v3dv_descriptor_set, set, descriptorSet); - V3DV_FROM_HANDLE(v3dv_descriptor_update_template, template, + V3DV_FROM_HANDLE(vk_descriptor_update_template, template, descriptorUpdateTemplate); for (int i = 0; i < template->entry_count; i++) { - const struct v3dv_descriptor_template_entry *entry = + const struct vk_descriptor_template_entry *entry = &template->entries[i]; const struct v3dv_descriptor_set_binding_layout *binding_layout = @@ -1208,8 +1406,7 @@ v3dv_UpdateDescriptorSetWithTemplate( struct v3dv_descriptor *descriptor = set->descriptors + - binding_layout->descriptor_index + - entry->array_element; + binding_layout->descriptor_index; switch (entry->type) { case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: @@ -1219,7 +1416,8 @@ v3dv_UpdateDescriptorSetWithTemplate( for (uint32_t j = 0; j < entry->array_count; j++) { const VkDescriptorBufferInfo *info = pData + entry->offset + j * entry->stride; - write_buffer_descriptor(descriptor + j, 
entry->type, info); + write_buffer_descriptor(descriptor + entry->array_element + j, + entry->type, info); } break; @@ -1233,9 +1431,9 @@ v3dv_UpdateDescriptorSetWithTemplate( pData + entry->offset + j * entry->stride; V3DV_FROM_HANDLE(v3dv_image_view, iview, info->imageView); V3DV_FROM_HANDLE(v3dv_sampler, sampler, info->sampler); - write_image_descriptor(device, descriptor + j, entry->type, - set, binding_layout, iview, sampler, - entry->array_element + j); + write_image_descriptor(device, descriptor + entry->array_element + j, + entry->type, set, binding_layout, iview, + sampler, entry->array_element + j); } break; @@ -1245,34 +1443,24 @@ v3dv_UpdateDescriptorSetWithTemplate( const VkBufferView *_bview = pData + entry->offset + j * entry->stride; V3DV_FROM_HANDLE(v3dv_buffer_view, bview, *_bview); - write_buffer_view_descriptor(device, descriptor + j, entry->type, - set, binding_layout, bview, + write_buffer_view_descriptor(device, + descriptor + entry->array_element + j, + entry->type, set, binding_layout, bview, entry->array_element + j); } break; + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: { + write_inline_uniform_descriptor(device, descriptor, set, + binding_layout, + pData + entry->offset, + entry->array_element, /* offset */ + entry->array_count); /* size */ + break; + } + default: unreachable("Unsupported descriptor type"); } } } - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_CreateSamplerYcbcrConversion( - VkDevice _device, - const VkSamplerYcbcrConversionCreateInfo *pCreateInfo, - const VkAllocationCallbacks *pAllocator, - VkSamplerYcbcrConversion *pYcbcrConversion) -{ - unreachable("Ycbcr sampler conversion is not supported"); - return VK_SUCCESS; -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_DestroySamplerYcbcrConversion( - VkDevice _device, - VkSamplerYcbcrConversion YcbcrConversion, - const VkAllocationCallbacks *pAllocator) -{ - unreachable("Ycbcr sampler conversion is not supported"); -} diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c index fec53ec38c5..7992cab59ff 100644 --- a/src/broadcom/vulkan/v3dv_device.c +++ b/src/broadcom/vulkan/v3dv_device.c @@ -1,5 +1,5 @@ /* - * Copyright © 2019 Raspberry Pi + * Copyright © 2019 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -44,12 +44,18 @@ #include "compiler/v3d_compiler.h" #include "drm-uapi/v3d_drm.h" -#include "format/u_format.h" +#include "vk_drm_syncobj.h" #include "vk_util.h" +#include "git_sha1.h" #include "util/build_id.h" -#include "util/debug.h" -#include "util/u_cpu_detect.h" +#include "util/os_file.h" +#include "util/u_debug.h" +#include "util/format/u_format.h" + +#if DETECT_OS_ANDROID +#include "vk_android.h" +#endif #ifdef VK_USE_PLATFORM_XCB_KHR #include <xcb/xcb.h> @@ -62,11 +68,15 @@ #include "wayland-drm-client-protocol.h" #endif -#ifdef USE_V3D_SIMULATOR -#include "drm-uapi/i915_drm.h" -#endif +#define V3DV_API_VERSION VK_MAKE_VERSION(1, 2, VK_HEADER_VERSION) -#define V3DV_API_VERSION VK_MAKE_VERSION(1, 0, VK_HEADER_VERSION) +#ifdef ANDROID_STRICT +#if ANDROID_API_LEVEL <= 32 +/* Android 12.1 and lower support only Vulkan API v1.1 */ +#undef V3DV_API_VERSION +#define V3DV_API_VERSION VK_MAKE_VERSION(1, 1, VK_HEADER_VERSION) +#endif +#endif VKAPI_ATTR VkResult VKAPI_CALL v3dv_EnumerateInstanceVersion(uint32_t *pApiVersion) @@ -75,25 +85,32 @@ v3dv_EnumerateInstanceVersion(uint32_t *pApiVersion) return VK_SUCCESS; } -#define V3DV_HAS_SURFACE 
(VK_USE_PLATFORM_WIN32_KHR || \ - VK_USE_PLATFORM_WAYLAND_KHR || \ - VK_USE_PLATFORM_XCB_KHR || \ - VK_USE_PLATFORM_XLIB_KHR || \ - VK_USE_PLATFORM_DISPLAY_KHR) +#if defined(VK_USE_PLATFORM_WIN32_KHR) || \ + defined(VK_USE_PLATFORM_WAYLAND_KHR) || \ + defined(VK_USE_PLATFORM_XCB_KHR) || \ + defined(VK_USE_PLATFORM_XLIB_KHR) || \ + defined(VK_USE_PLATFORM_DISPLAY_KHR) +#define V3DV_USE_WSI_PLATFORM +#endif static const struct vk_instance_extension_table instance_extensions = { .KHR_device_group_creation = true, #ifdef VK_USE_PLATFORM_DISPLAY_KHR .KHR_display = true, + .KHR_get_display_properties2 = true, + .EXT_direct_mode_display = true, + .EXT_acquire_drm_display = true, #endif .KHR_external_fence_capabilities = true, .KHR_external_memory_capabilities = true, .KHR_external_semaphore_capabilities = true, - .KHR_get_display_properties2 = true, .KHR_get_physical_device_properties2 = true, -#ifdef V3DV_HAS_SURFACE +#ifdef V3DV_USE_WSI_PLATFORM .KHR_get_surface_capabilities2 = true, .KHR_surface = true, + .KHR_surface_protected_capabilities = true, + .EXT_surface_maintenance1 = true, + .EXT_swapchain_colorspace = true, #endif #ifdef VK_USE_PLATFORM_WAYLAND_KHR .KHR_wayland_surface = true, @@ -104,7 +121,14 @@ static const struct vk_instance_extension_table instance_extensions = { #ifdef VK_USE_PLATFORM_XLIB_KHR .KHR_xlib_surface = true, #endif +#ifdef VK_USE_PLATFORM_XLIB_XRANDR_EXT + .EXT_acquire_xlib_display = true, +#endif +#ifndef VK_USE_PLATFORM_WIN32_KHR + .EXT_headless_surface = true, +#endif .EXT_debug_report = true, + .EXT_debug_utils = true, }; static void @@ -112,43 +136,354 @@ get_device_extensions(const struct v3dv_physical_device *device, struct vk_device_extension_table *ext) { *ext = (struct vk_device_extension_table) { - .KHR_bind_memory2 = true, - .KHR_copy_commands2 = true, - .KHR_dedicated_allocation = true, - .KHR_device_group = true, - .KHR_descriptor_update_template = true, - .KHR_external_fence = true, - .KHR_external_fence_fd = true, - .KHR_external_memory = true, - .KHR_external_memory_fd = true, - .KHR_external_semaphore = true, - .KHR_external_semaphore_fd = true, - .KHR_get_memory_requirements2 = true, - .KHR_image_format_list = true, - .KHR_relaxed_block_layout = true, - .KHR_maintenance1 = true, - .KHR_maintenance2 = true, - .KHR_maintenance3 = true, - .KHR_multiview = true, - .KHR_shader_non_semantic_info = true, - .KHR_sampler_mirror_clamp_to_edge = true, - .KHR_storage_buffer_storage_class = true, - .KHR_uniform_buffer_standard_layout = true, -#ifdef V3DV_HAS_SURFACE - .KHR_swapchain = true, - .KHR_incremental_present = true, + .KHR_8bit_storage = true, + .KHR_16bit_storage = true, + .KHR_bind_memory2 = true, + .KHR_buffer_device_address = true, + .KHR_copy_commands2 = true, + .KHR_create_renderpass2 = true, + .KHR_dedicated_allocation = true, + .KHR_device_group = true, + .KHR_driver_properties = true, + .KHR_descriptor_update_template = true, + .KHR_depth_stencil_resolve = true, + .KHR_dynamic_rendering = true, + .KHR_external_fence = true, + .KHR_external_fence_fd = true, + .KHR_external_memory = true, + .KHR_external_memory_fd = true, + .KHR_external_semaphore = true, + .KHR_external_semaphore_fd = true, + .KHR_format_feature_flags2 = true, + .KHR_get_memory_requirements2 = true, + .KHR_image_format_list = true, + .KHR_imageless_framebuffer = true, + .KHR_index_type_uint8 = true, + .KHR_line_rasterization = true, + .KHR_load_store_op_none = true, + .KHR_performance_query = device->caps.perfmon, + .KHR_relaxed_block_layout = true, + .KHR_maintenance1 = 
true,
+ .KHR_maintenance2 = true,
+ .KHR_maintenance3 = true,
+ .KHR_maintenance4 = true,
+ .KHR_multiview = true,
+ .KHR_pipeline_executable_properties = true,
+ .KHR_separate_depth_stencil_layouts = true,
+ .KHR_shader_expect_assume = true,
+ .KHR_shader_float_controls = true,
+ .KHR_shader_non_semantic_info = true,
+ .KHR_sampler_mirror_clamp_to_edge = true,
+ .KHR_sampler_ycbcr_conversion = true,
+ .KHR_spirv_1_4 = true,
+ .KHR_storage_buffer_storage_class = true,
+ .KHR_timeline_semaphore = true,
+ .KHR_uniform_buffer_standard_layout = true,
+ .KHR_shader_integer_dot_product = true,
+ .KHR_shader_terminate_invocation = true,
+ .KHR_synchronization2 = true,
+ .KHR_workgroup_memory_explicit_layout = true,
+#ifdef V3DV_USE_WSI_PLATFORM
+ .KHR_swapchain = true,
+ .KHR_swapchain_mutable_format = true,
+ .KHR_incremental_present = true,
+#endif
+ .KHR_variable_pointers = true,
+ .KHR_vertex_attribute_divisor = true,
+ .KHR_vulkan_memory_model = true,
+ .KHR_zero_initialize_workgroup_memory = true,
+ .EXT_4444_formats = true,
+ .EXT_attachment_feedback_loop_layout = true,
+ .EXT_border_color_swizzle = true,
+ .EXT_color_write_enable = true,
+ .EXT_custom_border_color = true,
+ .EXT_depth_clip_control = true,
+ .EXT_depth_clip_enable = device->devinfo.ver >= 71,
+ .EXT_load_store_op_none = true,
+ .EXT_inline_uniform_block = true,
+ .EXT_extended_dynamic_state = true,
+ .EXT_external_memory_dma_buf = true,
+ .EXT_host_query_reset = true,
+ .EXT_image_drm_format_modifier = true,
+ .EXT_image_robustness = true,
+ .EXT_index_type_uint8 = true,
+ .EXT_line_rasterization = true,
+ .EXT_memory_budget = true,
+ .EXT_multi_draw = true,
+ .EXT_physical_device_drm = true,
+ .EXT_pipeline_creation_cache_control = true,
+ .EXT_pipeline_creation_feedback = true,
+ .EXT_pipeline_robustness = true,
+ .EXT_primitive_topology_list_restart = true,
+ .EXT_private_data = true,
+ .EXT_provoking_vertex = true,
+ .EXT_separate_stencil_usage = true,
+ .EXT_shader_demote_to_helper_invocation = true,
+ .EXT_shader_module_identifier = true,
+ .EXT_subgroup_size_control = true,
+#ifdef V3DV_USE_WSI_PLATFORM
+ .EXT_swapchain_maintenance1 = true,
+#endif
+ .EXT_texel_buffer_alignment = true,
+ .EXT_tooling_info = true,
+ .EXT_vertex_attribute_divisor = true,
+#if DETECT_OS_ANDROID
+ .ANDROID_external_memory_android_hardware_buffer = true,
+ .ANDROID_native_buffer = true,
+ .EXT_queue_family_foreign = true,
+#endif
+ };
+}
+
+static void
+get_features(const struct v3dv_physical_device *physical_device,
+ struct vk_features *features)
+{
+ *features = (struct vk_features) {
+ /* Vulkan 1.0 */
+ .robustBufferAccess = true, /* This feature is mandatory */
+ .fullDrawIndexUint32 = physical_device->devinfo.ver >= 71,
+ .imageCubeArray = true,
+ .independentBlend = true,
+ .geometryShader = true,
+ .tessellationShader = false,
+ .sampleRateShading = true,
+ .dualSrcBlend = false,
+ .logicOp = true,
+ .multiDrawIndirect = false,
+ .drawIndirectFirstInstance = true,
+ .depthClamp = physical_device->devinfo.ver >= 71,
+ .depthBiasClamp = true,
+ .fillModeNonSolid = true,
+ .depthBounds = physical_device->devinfo.ver >= 71,
+ .wideLines = true,
+ .largePoints = true,
+ .alphaToOne = true,
+ .multiViewport = false,
+ .samplerAnisotropy = true,
+ .textureCompressionETC2 = true,
+ .textureCompressionASTC_LDR = true,
+ /* Note that textureCompressionBC requires that the driver support all
+ * the BC formats. V3D 4.2 only supports BC1-3, so we can't claim
+ * that we support it.
+ */
+ .textureCompressionBC = false,
+ .occlusionQueryPrecise = true,
+ .pipelineStatisticsQuery = false,
+ .vertexPipelineStoresAndAtomics = true,
+ .fragmentStoresAndAtomics = true,
+ .shaderTessellationAndGeometryPointSize = true,
+ .shaderImageGatherExtended = true,
+ .shaderStorageImageExtendedFormats = true,
+ .shaderStorageImageMultisample = false,
+ .shaderStorageImageReadWithoutFormat = true,
+ .shaderStorageImageWriteWithoutFormat = false,
+ .shaderUniformBufferArrayDynamicIndexing = false,
+ .shaderSampledImageArrayDynamicIndexing = false,
+ .shaderStorageBufferArrayDynamicIndexing = false,
+ .shaderStorageImageArrayDynamicIndexing = false,
+ .shaderClipDistance = true,
+ .shaderCullDistance = false,
+ .shaderFloat64 = false,
+ .shaderInt64 = false,
+ .shaderInt16 = false,
+ .shaderResourceResidency = false,
+ .shaderResourceMinLod = false,
+ .sparseBinding = false,
+ .sparseResidencyBuffer = false,
+ .sparseResidencyImage2D = false,
+ .sparseResidencyImage3D = false,
+ .sparseResidency2Samples = false,
+ .sparseResidency4Samples = false,
+ .sparseResidency8Samples = false,
+ .sparseResidency16Samples = false,
+ .sparseResidencyAliased = false,
+ .variableMultisampleRate = false,
+ .inheritedQueries = true,
+
+ /* Vulkan 1.1 */
+ .storageBuffer16BitAccess = true,
+ .uniformAndStorageBuffer16BitAccess = true,
+ .storagePushConstant16 = true,
+ .storageInputOutput16 = false,
+ .multiview = true,
+ .multiviewGeometryShader = false,
+ .multiviewTessellationShader = false,
+ .variablePointersStorageBuffer = true,
+ /* FIXME: this needs support for non-constant index on UBO/SSBO */
+ .variablePointers = false,
+ .protectedMemory = false,
+ .samplerYcbcrConversion = true,
+ .shaderDrawParameters = false,
+
+ /* Vulkan 1.2 */
+ .hostQueryReset = true,
+ .uniformAndStorageBuffer8BitAccess = true,
+ .uniformBufferStandardLayout = true,
+ /* V3D 4.2 wraps TMU vector accesses to 16-byte boundaries, so loads and
+ * stores of vectors that cross these boundaries would not work correctly
+ * with scalarBlockLayout and would need to be split into smaller vectors
+ * (and/or scalars) that don't cross these boundaries. For load/stores
+ * with dynamic offsets where we can't identify if the offset is
+ * problematic, we would always have to scalarize. Overall, this would
+ * not lead to the best performance, so let's just not support it.
+ */
+ .scalarBlockLayout = physical_device->devinfo.ver >= 71,
+ /* This tells applications two things:
+ *
+ * 1. If they can select just one aspect for barriers. For us barriers
+ * decide if we need to split a job and we don't care if it is only
+ * for one of the aspects of the image or both, so we don't really
+ * benefit from seeing barriers that select just one aspect.
+ *
+ * 2. If they can program different layouts for each aspect. We
+ * generally don't care about layouts, so again, we don't get any
+ * benefits from this to limit the scope of image layout transitions.
+ *
+ * Still, Vulkan 1.2 requires this feature to be supported so we
+ * advertise it even though we don't really take advantage of it.
+ */
+ .separateDepthStencilLayouts = true,
+ .storageBuffer8BitAccess = true,
+ .storagePushConstant8 = true,
+ .imagelessFramebuffer = true,
+ .timelineSemaphore = true,
+
+ .samplerMirrorClampToEdge = true,
+
+ /* Extended subgroup types support is mandated by Vulkan 1.2; however, it
+ * only takes effect if the implementation supports non-32-bit types,
+ * which we don't, so in practice setting it to true doesn't have any
+ * implications for us.
+      .shaderSubgroupExtendedTypes = true,
+      .subgroupBroadcastDynamicId = true,
+
+      .vulkanMemoryModel = true,
+      .vulkanMemoryModelDeviceScope = true,
+      .vulkanMemoryModelAvailabilityVisibilityChains = true,
+
+      .bufferDeviceAddress = true,
+      .bufferDeviceAddressCaptureReplay = false,
+      .bufferDeviceAddressMultiDevice = false,
+
+      /* Vulkan 1.3 */
+      .inlineUniformBlock = true,
+      /* Inline buffers work like push constants, so after they are bound,
+       * some of their contents may be copied into the uniform stream as soon
+       * as the next draw/dispatch is recorded in the command buffer. This
+       * means that if the client updates the buffer contents after binding
+       * it to a command buffer, the next queue submit of that command buffer
+       * may not use the latest update to the buffer contents, but the data
+       * that was present in the buffer at the time it was bound to the
+       * command buffer.
+       */
+      .descriptorBindingInlineUniformBlockUpdateAfterBind = false,
+      .pipelineCreationCacheControl = true,
+      .privateData = true,
+      .maintenance4 = true,
+      .shaderZeroInitializeWorkgroupMemory = true,
+      .synchronization2 = true,
+      .robustImageAccess = true,
+      .shaderIntegerDotProduct = true,
+
+      /* VK_EXT_4444_formats */
+      .formatA4R4G4B4 = true,
+      .formatA4B4G4R4 = true,
+
+      /* VK_EXT_custom_border_color */
+      .customBorderColors = true,
+      .customBorderColorWithoutFormat = false,
+
+      /* VK_EXT_index_type_uint8 */
+      .indexTypeUint8 = true,
+
+      /* VK_EXT_line_rasterization */
+      .rectangularLines = true,
+      .bresenhamLines = true,
+      .smoothLines = true,
+      .stippledRectangularLines = false,
+      .stippledBresenhamLines = false,
+      .stippledSmoothLines = false,
+
+      /* VK_EXT_color_write_enable */
+      .colorWriteEnable = true,
+
+      /* VK_EXT_extended_dynamic_state */
+      .extendedDynamicState = true,
+
+      /* VK_KHR_pipeline_executable_properties */
+      .pipelineExecutableInfo = true,
+
+      /* VK_EXT_provoking_vertex */
+      .provokingVertexLast = true,
+      /* FIXME: update when supporting EXT_transform_feedback */
+      .transformFeedbackPreservesProvokingVertex = false,
+
+      /* VK_EXT_vertex_attribute_divisor */
+      .vertexAttributeInstanceRateDivisor = true,
+      .vertexAttributeInstanceRateZeroDivisor = false,
+
+      /* VK_KHR_performance_query */
+      .performanceCounterQueryPools = physical_device->caps.perfmon,
+      .performanceCounterMultipleQueryPools = false,
+
+      /* VK_EXT_texel_buffer_alignment */
+      .texelBufferAlignment = true,
+
+      /* VK_KHR_workgroup_memory_explicit_layout */
+      .workgroupMemoryExplicitLayout = true,
+      .workgroupMemoryExplicitLayoutScalarBlockLayout = false,
+      .workgroupMemoryExplicitLayout8BitAccess = true,
+      .workgroupMemoryExplicitLayout16BitAccess = true,
+
+      /* VK_EXT_border_color_swizzle */
+      .borderColorSwizzle = true,
+      .borderColorSwizzleFromImage = true,
+
+      /* VK_EXT_shader_module_identifier */
+      .shaderModuleIdentifier = true,
+
+      /* VK_EXT_depth_clip_control */
+      .depthClipControl = true,
+
+      /* VK_EXT_depth_clip_enable */
+      .depthClipEnable = physical_device->devinfo.ver >= 71,
+
+      /* VK_EXT_attachment_feedback_loop_layout */
+      .attachmentFeedbackLoopLayout = true,
+
+      /* VK_EXT_primitive_topology_list_restart */
+      .primitiveTopologyListRestart = true,
+      /* FIXME: we don't support tessellation shaders yet */
+      .primitiveTopologyPatchListRestart = false,
+
+      /* VK_EXT_pipeline_robustness */
+      .pipelineRobustness = true,
+
+      /* VK_EXT_multi_draw */
+      .multiDraw = true,
+
+      /* VK_KHR_shader_terminate_invocation */
+      .shaderTerminateInvocation = true,
+
+      /* VK_EXT_shader_demote_to_helper_invocation */
+      .shaderDemoteToHelperInvocation = true,
+
+      /* VK_EXT_subgroup_size_control */
+      .subgroupSizeControl = true,
+      .computeFullSubgroups = true,
+
+      /* VK_KHR_shader_expect_assume */
+      .shaderExpectAssume = true,
+
+      /* VK_KHR_dynamic_rendering */
+      .dynamicRendering = true,
+
+#ifdef V3DV_USE_WSI_PLATFORM
+      /* VK_EXT_swapchain_maintenance1 */
+      .swapchainMaintenance1 = true,
 #endif
-      .KHR_variable_pointers = true,
-      .EXT_color_write_enable = true,
-      .EXT_custom_border_color = true,
-      .EXT_external_memory_dma_buf = true,
-      .EXT_index_type_uint8 = true,
-      .EXT_physical_device_drm = true,
-      .EXT_pipeline_creation_cache_control = true,
-      .EXT_pipeline_creation_feedback = true,
-      .EXT_private_data = true,
-      .EXT_provoking_vertex = true,
-      .EXT_vertex_attribute_divisor = true,
    };
 }
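get_features() above replaces the hand-rolled vkGetPhysicalDeviceFeatures/Features2 entry points that are deleted later in this patch: the driver now fills a single vk_features table once and lets the common runtime answer the per-structure queries. As a short sketch of how an application would observe one of the version-gated bits, say scalarBlockLayout (core Vulkan 1.2 API; only the helper name is ours):

#include <stdbool.h>
#include <vulkan/vulkan.h>

/* Query the Vulkan 1.2 feature struct; per the table above, on v3dv
 * scalarBlockLayout reads back as VK_TRUE only on V3D 7.1+ hardware. */
static bool
supports_scalar_block_layout(VkPhysicalDevice pdev)
{
   VkPhysicalDeviceVulkan12Features vk12 = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES,
   };
   VkPhysicalDeviceFeatures2 features = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
      .pNext = &vk12,
   };
   vkGetPhysicalDeviceFeatures2(pdev, &features);
   return vk12.scalarBlockLayout == VK_TRUE;
}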
@@ -165,6 +500,10 @@ v3dv_EnumerateInstanceExtensionProperties(const char *pLayerName,
                                              &instance_extensions,
                                              pPropertyCount, pProperties);
 }
 
+static VkResult enumerate_devices(struct vk_instance *vk_instance);
+
+static void destroy_physical_device(struct vk_physical_device *device);
+
 VKAPI_ATTR VkResult VKAPI_CALL
 v3dv_CreateInstance(const VkInstanceCreateInfo *pCreateInfo,
                     const VkAllocationCallbacks *pAllocator,
@@ -186,6 +525,8 @@ v3dv_CreateInstance(const VkInstanceCreateInfo *pCreateInfo,
    struct vk_instance_dispatch_table dispatch_table;
    vk_instance_dispatch_table_from_entrypoints(
       &dispatch_table, &v3dv_instance_entrypoints, true);
+   vk_instance_dispatch_table_from_entrypoints(
+      &dispatch_table, &wsi_instance_entrypoints, false);
 
    result = vk_instance_init(&instance->vk,
                              &instance_extensions,
@@ -194,12 +535,13 @@ v3dv_CreateInstance(const VkInstanceCreateInfo *pCreateInfo,
 
    if (result != VK_SUCCESS) {
       vk_free(pAllocator, instance);
-      return vk_error(instance, result);
+      return vk_error(NULL, result);
    }
 
    v3d_process_debug_variable();
 
-   instance->physicalDeviceCount = -1;
+   instance->vk.physical_devices.enumerate = enumerate_devices;
+   instance->vk.physical_devices.destroy = destroy_physical_device;
 
    /* We start with the default values for the pipeline_cache envvars */
    instance->pipeline_cache_enabled = true;
@@ -229,8 +571,6 @@ v3dv_CreateInstance(const VkInstanceCreateInfo *pCreateInfo,
       }
    }
 
-   util_cpu_detect();
-
    VG(VALGRIND_CREATE_MEMPOOL(instance, 0, false));
 
    *pInstance = v3dv_instance_to_handle(instance);
@@ -256,11 +596,11 @@ physical_device_finish(struct v3dv_physical_device *device)
    v3dv_physical_device_free_disk_cache(device);
    v3d_compiler_free(device->compiler);
 
+   util_sparse_array_finish(&device->bo_map);
+
    close(device->render_fd);
    if (device->display_fd >= 0)
       close(device->display_fd);
-   if (device->master_fd >= 0)
-      close(device->master_fd);
 
    free(device->name);
 
@@ -272,6 +612,13 @@ physical_device_finish(struct v3dv_physical_device *device)
    mtx_destroy(&device->mutex);
 }
 
+static void
+destroy_physical_device(struct vk_physical_device *device)
+{
+   physical_device_finish((struct v3dv_physical_device *)device);
+   vk_free(&device->instance->alloc, device);
+}
+
 VKAPI_ATTR void VKAPI_CALL
 v3dv_DestroyInstance(VkInstance _instance,
                      const VkAllocationCallbacks *pAllocator)
@@ -281,12 +628,6 @@ v3dv_DestroyInstance(VkInstance _instance,
    if (!instance)
       return;
 
-   if (instance->physicalDeviceCount > 0) {
-      /* We support at most one physical device.
*/ - assert(instance->physicalDeviceCount == 1); - physical_device_finish(&instance->physicalDevice); - } - VG(VALGRIND_DESTROY_MEMPOOL(instance)); vk_instance_finish(&instance->vk); @@ -306,286 +647,39 @@ compute_heap_size() uint64_t total_ram = (uint64_t) v3d_simulator_get_mem_size(); #endif - /* We don't want to burn too much ram with the GPU. If the user has 4GiB - * or less, we use at most half. If they have more than 4GiB, we use 3/4. + /* We don't want to burn too much ram with the GPU. If the user has 4GB + * or less, we use at most half. If they have more than 4GB we limit it + * to 3/4 with a max. of 4GB since the GPU cannot address more than that. */ - uint64_t available_ram; - if (total_ram <= 4ull * 1024ull * 1024ull * 1024ull) - available_ram = total_ram / 2; + const uint64_t MAX_HEAP_SIZE = 4ull * 1024ull * 1024ull * 1024ull; + uint64_t available; + if (total_ram <= MAX_HEAP_SIZE) + available = total_ram / 2; else - available_ram = total_ram * 3 / 4; - - return available_ram; -} - -#if !using_v3d_simulator -#ifdef VK_USE_PLATFORM_XCB_KHR -static int -create_display_fd_xcb(VkIcdSurfaceBase *surface) -{ - int fd = -1; - - xcb_connection_t *conn; - xcb_dri3_open_reply_t *reply = NULL; - if (surface) { - if (surface->platform == VK_ICD_WSI_PLATFORM_XLIB) - conn = XGetXCBConnection(((VkIcdSurfaceXlib *)surface)->dpy); - else - conn = ((VkIcdSurfaceXcb *)surface)->connection; - } else { - conn = xcb_connect(NULL, NULL); - } - - if (xcb_connection_has_error(conn)) - goto finish; - - const xcb_setup_t *setup = xcb_get_setup(conn); - xcb_screen_iterator_t iter = xcb_setup_roots_iterator(setup); - xcb_screen_t *screen = iter.data; - - xcb_dri3_open_cookie_t cookie; - cookie = xcb_dri3_open(conn, screen->root, None); - reply = xcb_dri3_open_reply(conn, cookie, NULL); - if (!reply) - goto finish; - - if (reply->nfd != 1) - goto finish; - - fd = xcb_dri3_open_reply_fds(conn, reply)[0]; - fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC); - -finish: - if (!surface) - xcb_disconnect(conn); - if (reply) - free(reply); - - return fd; -} -#endif - -#ifdef VK_USE_PLATFORM_WAYLAND_KHR -struct v3dv_wayland_info { - struct wl_drm *wl_drm; - int fd; - bool is_set; - bool authenticated; -}; - -static void -v3dv_drm_handle_device(void *data, struct wl_drm *drm, const char *device) -{ - struct v3dv_wayland_info *info = data; - info->fd = open(device, O_RDWR | O_CLOEXEC); - info->is_set = info->fd != -1; - if (!info->is_set) { - fprintf(stderr, "v3dv_drm_handle_device: could not open %s (%s)\n", - device, strerror(errno)); - return; - } - - drm_magic_t magic; - if (drmGetMagic(info->fd, &magic)) { - fprintf(stderr, "v3dv_drm_handle_device: drmGetMagic failed\n"); - close(info->fd); - info->fd = -1; - info->is_set = false; - return; - } - wl_drm_authenticate(info->wl_drm, magic); -} - -static void -v3dv_drm_handle_format(void *data, struct wl_drm *drm, uint32_t format) -{ -} - -static void -v3dv_drm_handle_authenticated(void *data, struct wl_drm *drm) -{ - struct v3dv_wayland_info *info = data; - info->authenticated = true; -} - -static void -v3dv_drm_handle_capabilities(void *data, struct wl_drm *drm, uint32_t value) -{ -} - -struct wl_drm_listener v3dv_drm_listener = { - .device = v3dv_drm_handle_device, - .format = v3dv_drm_handle_format, - .authenticated = v3dv_drm_handle_authenticated, - .capabilities = v3dv_drm_handle_capabilities -}; - -static void -v3dv_registry_global(void *data, - struct wl_registry *registry, - uint32_t name, - const char *interface, - uint32_t version) -{ - struct 
v3dv_wayland_info *info = data; - if (strcmp(interface, "wl_drm") == 0) { - info->wl_drm = wl_registry_bind(registry, name, &wl_drm_interface, - MIN2(version, 2)); - wl_drm_add_listener(info->wl_drm, &v3dv_drm_listener, data); - }; -} - -static void -v3dv_registry_global_remove_cb(void *data, - struct wl_registry *registry, - uint32_t name) -{ -} - -static int -create_display_fd_wayland(VkIcdSurfaceBase *surface) -{ - struct wl_display *display; - struct wl_registry *registry = NULL; - - struct v3dv_wayland_info info = { - .wl_drm = NULL, - .fd = -1, - .is_set = false, - .authenticated = false - }; - - if (surface) - display = ((VkIcdSurfaceWayland *) surface)->display; - else - display = wl_display_connect(NULL); - - if (!display) - return -1; - - registry = wl_display_get_registry(display); - if (!registry) { - if (!surface) - wl_display_disconnect(display); - return -1; - } - - static const struct wl_registry_listener registry_listener = { - v3dv_registry_global, - v3dv_registry_global_remove_cb - }; - wl_registry_add_listener(registry, ®istry_listener, &info); - - wl_display_roundtrip(display); /* For the registry advertisement */ - wl_display_roundtrip(display); /* For the DRM device event */ - wl_display_roundtrip(display); /* For the authentication event */ - - wl_drm_destroy(info.wl_drm); - wl_registry_destroy(registry); - - if (!surface) - wl_display_disconnect(display); - - if (!info.is_set) - return -1; - - if (!info.authenticated) - return -1; - - return info.fd; -} -#endif - -/* Acquire an authenticated display fd without a surface reference. This is the - * case where the application is making WSI allocations outside the Vulkan - * swapchain context (only Zink, for now). Since we lack information about the - * underlying surface we just try our best to figure out the correct display - * and platform to use. It should work in most cases. - */ -static void -acquire_display_device_no_surface(struct v3dv_instance *instance, - struct v3dv_physical_device *pdevice) -{ -#ifdef VK_USE_PLATFORM_WAYLAND_KHR - pdevice->display_fd = create_display_fd_wayland(NULL); -#endif - -#ifdef VK_USE_PLATFORM_XCB_KHR - if (pdevice->display_fd == -1) - pdevice->display_fd = create_display_fd_xcb(NULL); -#endif - -#ifdef VK_USE_PLATFORM_DISPLAY_KHR - if (pdevice->display_fd == - 1 && pdevice->master_fd >= 0) - pdevice->display_fd = dup(pdevice->master_fd); -#endif -} + available = MIN2(MAX_HEAP_SIZE, total_ram * 3 / 4); -/* Acquire an authenticated display fd from the surface. This is the regular - * case where the application is using swapchains to create WSI allocations. - * In this case we use the surface information to figure out the correct - * display and platform combination. - */ -static void -acquire_display_device_surface(struct v3dv_instance *instance, - struct v3dv_physical_device *pdevice, - VkIcdSurfaceBase *surface) -{ - /* Mesa will set both of VK_USE_PLATFORM_{XCB,XLIB} when building with - * platform X11, so only check for XCB and rely on XCB to get an - * authenticated device also for Xlib. 
- */ -#ifdef VK_USE_PLATFORM_XCB_KHR - if (surface->platform == VK_ICD_WSI_PLATFORM_XCB || - surface->platform == VK_ICD_WSI_PLATFORM_XLIB) { - pdevice->display_fd = create_display_fd_xcb(surface); - } -#endif - -#ifdef VK_USE_PLATFORM_WAYLAND_KHR - if (surface->platform == VK_ICD_WSI_PLATFORM_WAYLAND) - pdevice->display_fd = create_display_fd_wayland(surface); -#endif - -#ifdef VK_USE_PLATFORM_DISPLAY_KHR - if (surface->platform == VK_ICD_WSI_PLATFORM_DISPLAY && - pdevice->master_fd >= 0) { - pdevice->display_fd = dup(pdevice->master_fd); - } -#endif + return available; } -#endif /* !using_v3d_simulator */ -/* Attempts to get an authenticated display fd from the display server that - * we can use to allocate BOs for presentable images. - */ -VkResult -v3dv_physical_device_acquire_display(struct v3dv_instance *instance, - struct v3dv_physical_device *pdevice, - VkIcdSurfaceBase *surface) +static uint64_t +compute_memory_budget(struct v3dv_physical_device *device) { - VkResult result = VK_SUCCESS; - mtx_lock(&pdevice->mutex); - - if (pdevice->display_fd != -1) - goto done; - - /* When running on the simulator we do everything on a single render node so - * we don't need to get an authenticated display fd from the display server. - */ + uint64_t heap_size = device->memory.memoryHeaps[0].size; + uint64_t heap_used = device->heap_used; + uint64_t sys_available; #if !using_v3d_simulator - if (surface) - acquire_display_device_surface(instance, pdevice, surface); - else - acquire_display_device_no_surface(instance, pdevice); - - if (pdevice->display_fd == -1) - result = VK_ERROR_INITIALIZATION_FAILED; + ASSERTED bool has_available_memory = + os_get_available_system_memory(&sys_available); + assert(has_available_memory); +#else + sys_available = (uint64_t) v3d_simulator_get_mem_free(); #endif -done: - mtx_unlock(&pdevice->mutex); - return result; + /* Let's not incite the app to starve the system: report at most 90% of + * available system memory. + */ + uint64_t heap_available = sys_available * 9 / 10; + return MIN2(heap_size, heap_used + heap_available); } static bool @@ -604,7 +698,8 @@ device_has_expected_features(struct v3dv_physical_device *device) { return v3d_has_feature(device, DRM_V3D_PARAM_SUPPORTS_TFU) && v3d_has_feature(device, DRM_V3D_PARAM_SUPPORTS_CSD) && - v3d_has_feature(device, DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH); + v3d_has_feature(device, DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH) && + device->caps.multisync; } @@ -614,14 +709,14 @@ init_uuids(struct v3dv_physical_device *device) const struct build_id_note *note = build_id_find_nhdr_for_addr(init_uuids); if (!note) { - return vk_errorf((struct v3dv_instance*) device->vk.instance, + return vk_errorf(device->vk.instance, VK_ERROR_INITIALIZATION_FAILED, "Failed to find build-id"); } unsigned build_id_len = build_id_length(note); if (build_id_len < 20) { - return vk_errorf((struct v3dv_instance*) device->vk.instance, + return vk_errorf(device->vk.instance, VK_ERROR_INITIALIZATION_FAILED, "build-id too short. 
It needs to be a SHA"); } @@ -672,38 +767,46 @@ v3dv_physical_device_init_disk_cache(struct v3dv_physical_device *device) _mesa_sha1_format(timestamp, device->driver_build_sha1); assert(device->name); - device->disk_cache = disk_cache_create(device->name, timestamp, 0); + device->disk_cache = disk_cache_create(device->name, timestamp, v3d_mesa_debug); #else device->disk_cache = NULL; #endif } static VkResult -physical_device_init(struct v3dv_physical_device *device, - struct v3dv_instance *instance, - drmDevicePtr drm_render_device, - drmDevicePtr drm_primary_device) +create_physical_device(struct v3dv_instance *instance, + drmDevicePtr gpu_device, + drmDevicePtr display_device) { VkResult result = VK_SUCCESS; - int32_t master_fd = -1; + int32_t display_fd = -1; int32_t render_fd = -1; + struct v3dv_physical_device *device = + vk_zalloc(&instance->vk.alloc, sizeof(*device), 8, + VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + + if (!device) + return vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY); + struct vk_physical_device_dispatch_table dispatch_table; vk_physical_device_dispatch_table_from_entrypoints (&dispatch_table, &v3dv_physical_device_entrypoints, true); + vk_physical_device_dispatch_table_from_entrypoints( + &dispatch_table, &wsi_physical_device_entrypoints, false); - result = vk_physical_device_init(&device->vk, &instance->vk, NULL, - &dispatch_table); + result = vk_physical_device_init(&device->vk, &instance->vk, NULL, NULL, + NULL, &dispatch_table); if (result != VK_SUCCESS) goto fail; - assert(drm_render_device); - const char *path = drm_render_device->nodes[DRM_NODE_RENDER]; + assert(gpu_device); + const char *path = gpu_device->nodes[DRM_NODE_RENDER]; render_fd = open(path, O_RDWR | O_CLOEXEC); if (render_fd < 0) { fprintf(stderr, "Opening %s failed: %s\n", path, strerror(errno)); - result = VK_ERROR_INCOMPATIBLE_DRIVER; + result = VK_ERROR_INITIALIZATION_FAILED; goto fail; } @@ -714,12 +817,12 @@ physical_device_init(struct v3dv_physical_device *device, const char *primary_path; #if !using_v3d_simulator - if (drm_primary_device) - primary_path = drm_primary_device->nodes[DRM_NODE_PRIMARY]; + if (display_device) + primary_path = display_device->nodes[DRM_NODE_PRIMARY]; else primary_path = NULL; #else - primary_path = drm_render_device->nodes[DRM_NODE_PRIMARY]; + primary_path = gpu_device->nodes[DRM_NODE_PRIMARY]; #endif struct stat primary_stat = {0}, render_stat = {0}; @@ -727,8 +830,7 @@ physical_device_init(struct v3dv_physical_device *device, device->has_primary = primary_path; if (device->has_primary) { if (stat(primary_path, &primary_stat) != 0) { - result = vk_errorf(instance, - VK_ERROR_INITIALIZATION_FAILED, + result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, "failed to stat DRM primary node %s", primary_path); goto fail; @@ -738,8 +840,7 @@ physical_device_init(struct v3dv_physical_device *device, } if (fstat(render_fd, &render_stat) != 0) { - result = vk_errorf(instance, - VK_ERROR_INITIALIZATION_FAILED, + result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, "failed to stat DRM render node %s", path); goto fail; @@ -747,16 +848,24 @@ physical_device_init(struct v3dv_physical_device *device, device->has_render = true; device->render_devid = render_stat.st_rdev; - if (instance->vk.enabled_extensions.KHR_display) { +#if using_v3d_simulator + device->device_id = gpu_device->deviceinfo.pci->device_id; +#endif + + if (instance->vk.enabled_extensions.KHR_display || + instance->vk.enabled_extensions.KHR_xcb_surface || + 
instance->vk.enabled_extensions.KHR_xlib_surface || + instance->vk.enabled_extensions.KHR_wayland_surface || + instance->vk.enabled_extensions.EXT_acquire_drm_display) { #if !using_v3d_simulator /* Open the primary node on the vc4 display device */ - assert(drm_primary_device); - master_fd = open(primary_path, O_RDWR | O_CLOEXEC); + assert(display_device); + display_fd = open(primary_path, O_RDWR | O_CLOEXEC); #else /* There is only one device with primary and render nodes. * Open its primary node. */ - master_fd = open(primary_path, O_RDWR | O_CLOEXEC); + display_fd = open(primary_path, O_RDWR | O_CLOEXEC); #endif } @@ -765,21 +874,32 @@ physical_device_init(struct v3dv_physical_device *device, #endif device->render_fd = render_fd; /* The v3d render node */ - device->display_fd = -1; /* Authenticated vc4 primary node */ - device->master_fd = master_fd; /* Master vc4 primary node */ + device->display_fd = display_fd; /* Master vc4 primary node */ if (!v3d_get_device_info(device->render_fd, &device->devinfo, &v3dv_ioctl)) { - result = VK_ERROR_INCOMPATIBLE_DRIVER; + result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, + "Failed to get info from device."); goto fail; } if (device->devinfo.ver < 42) { - result = VK_ERROR_INCOMPATIBLE_DRIVER; + result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, + "Device version < 42."); goto fail; } + device->caps.cpu_queue = + v3d_has_feature(device, DRM_V3D_PARAM_SUPPORTS_CPU_QUEUE); + + device->caps.multisync = + v3d_has_feature(device, DRM_V3D_PARAM_SUPPORTS_MULTISYNC_EXT); + + device->caps.perfmon = + v3d_has_feature(device, DRM_V3D_PARAM_SUPPORTS_PERFMON); + if (!device_has_expected_features(device)) { - result = VK_ERROR_INCOMPATIBLE_DRIVER; + result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, + "Kernel driver doesn't have required features."); goto fail; } @@ -787,12 +907,15 @@ physical_device_init(struct v3dv_physical_device *device, if (result != VK_SUCCESS) goto fail; - device->compiler = v3d_compiler_init(&device->devinfo); + device->compiler = v3d_compiler_init(&device->devinfo, + MAX_INLINE_UNIFORM_BUFFERS); device->next_program_id = 0; ASSERTED int len = - asprintf(&device->name, "V3D %d.%d", - device->devinfo.ver / 10, device->devinfo.ver % 10); + asprintf(&device->name, "V3D %d.%d.%d", + device->devinfo.ver / 10, + device->devinfo.ver % 10, + device->devinfo.rev); assert(len != -1); v3dv_physical_device_init_disk_cache(device); @@ -811,7 +934,31 @@ physical_device_init(struct v3dv_physical_device *device, VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; mem->memoryTypes[0].heapIndex = 0; - device->options.merge_jobs = getenv("V3DV_NO_MERGE_JOBS") == NULL; + /* Initialize sparse array for refcounting imported BOs */ + util_sparse_array_init(&device->bo_map, sizeof(struct v3dv_bo), 512); + + device->options.merge_jobs = !V3D_DBG(NO_MERGE_JOBS); + + device->drm_syncobj_type = vk_drm_syncobj_get_type(device->render_fd); + + /* We don't support timelines in the uAPI yet and we don't want it getting + * suddenly turned on by vk_drm_syncobj_get_type() without us adding v3dv + * code for it first. + */ + device->drm_syncobj_type.features &= ~VK_SYNC_FEATURE_TIMELINE; + + /* Multiwait is required for emulated timeline semaphores and is supported + * by the v3d kernel interface. 
+    */
+   device->drm_syncobj_type.features |= VK_SYNC_FEATURE_GPU_MULTI_WAIT;
+
+   device->sync_timeline_type =
+      vk_sync_timeline_get_type(&device->drm_syncobj_type);
+
+   device->sync_types[0] = &device->drm_syncobj_type;
+   device->sync_types[1] = &device->sync_timeline_type.sync;
+   device->sync_types[2] = NULL;
+   device->vk.supported_sync_types = device->sync_types;
 
    result = v3dv_wsi_init(device);
    if (result != VK_SUCCESS) {
@@ -820,35 +967,46 @@ physical_device_init(struct v3dv_physical_device *device,
    }
 
    get_device_extensions(device, &device->vk.supported_extensions);
+   get_features(device, &device->vk.supported_features);
+
+   mtx_init(&device->mutex, mtx_plain);
 
-   pthread_mutex_init(&device->mutex, NULL);
+   list_addtail(&device->vk.link, &instance->vk.physical_devices.list);
 
    return VK_SUCCESS;
 
 fail:
    vk_physical_device_finish(&device->vk);
+   vk_free(&instance->vk.alloc, device);
 
    if (render_fd >= 0)
      close(render_fd);
-   if (master_fd >= 0)
-      close(master_fd);
+   if (display_fd >= 0)
+      close(display_fd);
 
    return result;
 }
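The syncobj setup in create_physical_device() above is what makes the emulated timeline semaphores work: vk_sync_timeline builds timelines out of plain binary syncobjs, which requires being able to wait on several of them at once (hence forcing VK_SYNC_FEATURE_GPU_MULTI_WAIT on). The kernel-side primitive is simply drmSyncobjWait() over multiple handles; a minimal sketch with libdrm follows (the helper and its parameters are illustrative, not driver code):

#include <stdint.h>
#include <xf86drm.h>

/* Illustrative only: block until ALL of the given DRM syncobjs signal.
 * 'fd' is an open render node and the timeout is an absolute
 * CLOCK_MONOTONIC value in nanoseconds. WAIT_FOR_SUBMIT also waits for
 * fences that have not yet been attached to the syncobjs. */
static int
wait_all_syncobjs(int fd, uint32_t *handles, unsigned count,
                  int64_t timeout_abs_ns)
{
   uint32_t first_signaled;
   return drmSyncobjWait(fd, handles, count, timeout_abs_ns,
                         DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL |
                         DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT,
                         &first_signaled);
}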
 
+/* This driver hook is expected to return VK_SUCCESS (unless a memory
+ * allocation error happened) if no compatible device is found. If a
+ * compatible device is found, it may return an error code if device
+ * initialization failed.
+ */
 static VkResult
-enumerate_devices(struct v3dv_instance *instance)
+enumerate_devices(struct vk_instance *vk_instance)
 {
-   /* TODO: Check for more devices? */
+   struct v3dv_instance *instance =
+      container_of(vk_instance, struct v3dv_instance, vk);
+
+   /* FIXME: Check for more devices? */
    drmDevicePtr devices[8];
-   VkResult result = VK_ERROR_INCOMPATIBLE_DRIVER;
    int max_devices;
 
-   instance->physicalDeviceCount = 0;
-
    max_devices = drmGetDevices2(0, devices, ARRAY_SIZE(devices));
    if (max_devices < 1)
-      return VK_ERROR_INCOMPATIBLE_DRIVER;
+      return VK_SUCCESS;
+
+   VkResult result = VK_SUCCESS;
 
 #if !using_v3d_simulator
    int32_t v3d_idx = -1;
@@ -856,25 +1014,24 @@ enumerate_devices(struct v3dv_instance *instance)
 #endif
    for (unsigned i = 0; i < (unsigned)max_devices; i++) {
 #if using_v3d_simulator
-      /* In the simulator, we look for an Intel render node */
+      /* In the simulator, we look for an Intel/AMD render node */
       const int required_nodes = (1 << DRM_NODE_RENDER) | (1 << DRM_NODE_PRIMARY);
       if ((devices[i]->available_nodes & required_nodes) == required_nodes &&
           devices[i]->bustype == DRM_BUS_PCI &&
-          devices[i]->deviceinfo.pci->vendor_id == 0x8086) {
-         result = physical_device_init(&instance->physicalDevice, instance,
-                                       devices[i], NULL);
-         if (result != VK_ERROR_INCOMPATIBLE_DRIVER)
+          (devices[i]->deviceinfo.pci->vendor_id == 0x8086 ||
+           devices[i]->deviceinfo.pci->vendor_id == 0x1002)) {
+         result = create_physical_device(instance, devices[i], NULL);
+         if (result == VK_SUCCESS)
            break;
       }
 #else
-      /* On actual hardware, we should have a render node (v3d)
-       * and a primary node (vc4). We will need to use the primary
-       * to allocate WSI buffers and share them with the render node
-       * via prime, but that is a privileged operation so we need the
-       * primary node to be authenticated, and for that we need the
-       * display server to provide the device fd (with DRI3), so we
-       * here we only check that the device is present but we don't
-       * try to open it.
+      /* On actual hardware, we should have a gpu device (v3d) and a display
+       * device (vc4). We will need to use the display device to allocate WSI
+       * buffers and share them with the render node via prime, but that is a
+       * privileged operation, so we need to have an authenticated display fd,
+       * and for that we need the display server to provide it (with DRI3),
+       * so here we only check that the device is present but we don't try to
+       * open it.
        */
       if (devices[i]->bustype != DRM_BUS_PLATFORM)
         continue;
@@ -882,7 +1039,8 @@ enumerate_devices(struct v3dv_instance *instance)
      if (devices[i]->available_nodes & 1 << DRM_NODE_RENDER) {
         char **compat = devices[i]->deviceinfo.platform->compatible;
         while (*compat) {
-           if (strncmp(*compat, "brcm,2711-v3d", 13) == 0) {
+           if (strncmp(*compat, "brcm,2711-v3d", 13) == 0 ||
+               strncmp(*compat, "brcm,2712-v3d", 13) == 0) {
               v3d_idx = i;
              break;
           }
@@ -891,8 +1049,9 @@ enumerate_devices(struct v3dv_instance *instance)
      } else if (devices[i]->available_nodes & 1 << DRM_NODE_PRIMARY) {
         char **compat = devices[i]->deviceinfo.platform->compatible;
         while (*compat) {
-           if (strncmp(*compat, "brcm,bcm2711-vc5", 16) == 0 ||
-               strncmp(*compat, "brcm,bcm2835-vc4", 16) == 0 ) {
+           if (strncmp(*compat, "brcm,bcm2712-vc6", 16) == 0 ||
+               strncmp(*compat, "brcm,bcm2711-vc5", 16) == 0 ||
+               strncmp(*compat, "brcm,bcm2835-vc4", 16) == 0) {
               vc4_idx = i;
               break;
            }
@@ -903,345 +1062,35 @@ enumerate_devices(struct v3dv_instance *instance)
    }
 
 #if !using_v3d_simulator
-   if (v3d_idx == -1 || vc4_idx == -1)
-      result = VK_ERROR_INCOMPATIBLE_DRIVER;
-   else
-      result = physical_device_init(&instance->physicalDevice, instance,
-                                    devices[v3d_idx], devices[vc4_idx]);
+   if (v3d_idx != -1) {
+      drmDevicePtr v3d_device = devices[v3d_idx];
+      drmDevicePtr vc4_device = vc4_idx != -1 ? devices[vc4_idx] : NULL;
+      result = create_physical_device(instance, v3d_device, vc4_device);
+   }
 #endif
 
    drmFreeDevices(devices, max_devices);
 
-   if (result == VK_SUCCESS)
-      instance->physicalDeviceCount = 1;
-
    return result;
 }
 
-static VkResult
-instance_ensure_physical_device(struct v3dv_instance *instance)
-{
-   if (instance->physicalDeviceCount < 0) {
-      VkResult result = enumerate_devices(instance);
-      if (result != VK_SUCCESS &&
-          result != VK_ERROR_INCOMPATIBLE_DRIVER)
-         return result;
-   }
-
-   return VK_SUCCESS;
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_EnumeratePhysicalDevices(VkInstance _instance,
-                              uint32_t *pPhysicalDeviceCount,
-                              VkPhysicalDevice *pPhysicalDevices)
-{
-   V3DV_FROM_HANDLE(v3dv_instance, instance, _instance);
-   VK_OUTARRAY_MAKE(out, pPhysicalDevices, pPhysicalDeviceCount);
-
-   VkResult result = instance_ensure_physical_device(instance);
-   if (result != VK_SUCCESS)
-      return result;
-
-   if (instance->physicalDeviceCount == 0)
-      return VK_SUCCESS;
-
-   assert(instance->physicalDeviceCount == 1);
-   vk_outarray_append(&out, i) {
-      *i = v3dv_physical_device_to_handle(&instance->physicalDevice);
-   }
-
-   return vk_outarray_status(&out);
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_EnumeratePhysicalDeviceGroups(
-   VkInstance _instance,
-   uint32_t *pPhysicalDeviceGroupCount,
-   VkPhysicalDeviceGroupProperties *pPhysicalDeviceGroupProperties)
-{
-   V3DV_FROM_HANDLE(v3dv_instance, instance, _instance);
-   VK_OUTARRAY_MAKE(out, pPhysicalDeviceGroupProperties,
-                    pPhysicalDeviceGroupCount);
-
-   VkResult result = instance_ensure_physical_device(instance);
-   if (result != VK_SUCCESS)
-      return result;
-
-   assert(instance->physicalDeviceCount == 1);
-
-   vk_outarray_append(&out, p) {
-      p->physicalDeviceCount = 1;
-      memset(p->physicalDevices, 0, sizeof(p->physicalDevices));
-      p->physicalDevices[0] =
-
v3dv_physical_device_to_handle(&instance->physicalDevice); - p->subsetAllocation = false; - - vk_foreach_struct(ext, p->pNext) - v3dv_debug_ignored_stype(ext->sType); - } - - return vk_outarray_status(&out); -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_GetPhysicalDeviceFeatures(VkPhysicalDevice physicalDevice, - VkPhysicalDeviceFeatures *pFeatures) -{ - memset(pFeatures, 0, sizeof(*pFeatures)); - - *pFeatures = (VkPhysicalDeviceFeatures) { - .robustBufferAccess = true, /* This feature is mandatory */ - .fullDrawIndexUint32 = false, /* Only available since V3D 4.4.9.1 */ - .imageCubeArray = true, - .independentBlend = true, - .geometryShader = true, - .tessellationShader = false, - .sampleRateShading = true, - .dualSrcBlend = false, - .logicOp = true, - .multiDrawIndirect = false, - .drawIndirectFirstInstance = true, - .depthClamp = false, - .depthBiasClamp = true, - .fillModeNonSolid = true, - .depthBounds = false, /* Only available since V3D 4.3.16.2 */ - .wideLines = true, - .largePoints = true, - .alphaToOne = true, - .multiViewport = false, - .samplerAnisotropy = true, - .textureCompressionETC2 = true, - .textureCompressionASTC_LDR = true, - /* Note that textureCompressionBC requires that the driver support all - * the BC formats. V3D 4.2 only support the BC1-3, so we can't claim - * that we support it. - */ - .textureCompressionBC = false, - .occlusionQueryPrecise = true, - .pipelineStatisticsQuery = false, - .vertexPipelineStoresAndAtomics = true, - .fragmentStoresAndAtomics = true, - .shaderTessellationAndGeometryPointSize = true, - .shaderImageGatherExtended = false, - .shaderStorageImageExtendedFormats = true, - .shaderStorageImageMultisample = false, - .shaderStorageImageReadWithoutFormat = false, - .shaderStorageImageWriteWithoutFormat = false, - .shaderUniformBufferArrayDynamicIndexing = false, - .shaderSampledImageArrayDynamicIndexing = false, - .shaderStorageBufferArrayDynamicIndexing = false, - .shaderStorageImageArrayDynamicIndexing = false, - .shaderClipDistance = true, - .shaderCullDistance = false, - .shaderFloat64 = false, - .shaderInt64 = false, - .shaderInt16 = false, - .shaderResourceResidency = false, - .shaderResourceMinLod = false, - .sparseBinding = false, - .sparseResidencyBuffer = false, - .sparseResidencyImage2D = false, - .sparseResidencyImage3D = false, - .sparseResidency2Samples = false, - .sparseResidency4Samples = false, - .sparseResidency8Samples = false, - .sparseResidency16Samples = false, - .sparseResidencyAliased = false, - .variableMultisampleRate = false, - .inheritedQueries = true, - }; -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice, - VkPhysicalDeviceFeatures2 *pFeatures) -{ - v3dv_GetPhysicalDeviceFeatures(physicalDevice, &pFeatures->features); - - VkPhysicalDeviceVulkan11Features vk11 = { - .storageBuffer16BitAccess = false, - .uniformAndStorageBuffer16BitAccess = false, - .storagePushConstant16 = false, - .storageInputOutput16 = false, - .multiview = true, - .multiviewGeometryShader = false, - .multiviewTessellationShader = false, - .variablePointersStorageBuffer = true, - /* FIXME: this needs support for non-constant index on UBO/SSBO */ - .variablePointers = false, - .protectedMemory = false, - .samplerYcbcrConversion = false, - .shaderDrawParameters = false, - }; - - vk_foreach_struct(ext, pFeatures->pNext) { - switch (ext->sType) { - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CUSTOM_BORDER_COLOR_FEATURES_EXT: { - VkPhysicalDeviceCustomBorderColorFeaturesEXT *features = - 
(VkPhysicalDeviceCustomBorderColorFeaturesEXT *)ext; - features->customBorderColors = true; - features->customBorderColorWithoutFormat = false; - break; - } - - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_UNIFORM_BUFFER_STANDARD_LAYOUT_FEATURES_KHR: { - VkPhysicalDeviceUniformBufferStandardLayoutFeaturesKHR *features = - (VkPhysicalDeviceUniformBufferStandardLayoutFeaturesKHR *)ext; - features->uniformBufferStandardLayout = true; - break; - } - - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRIVATE_DATA_FEATURES_EXT: { - VkPhysicalDevicePrivateDataFeaturesEXT *features = - (VkPhysicalDevicePrivateDataFeaturesEXT *)ext; - features->privateData = true; - break; - } - - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INDEX_TYPE_UINT8_FEATURES_EXT: { - VkPhysicalDeviceIndexTypeUint8FeaturesEXT *features = - (VkPhysicalDeviceIndexTypeUint8FeaturesEXT *)ext; - features->indexTypeUint8 = true; - break; - } - - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COLOR_WRITE_ENABLE_FEATURES_EXT: { - VkPhysicalDeviceColorWriteEnableFeaturesEXT *features = (void *) ext; - features->colorWriteEnable = true; - break; - } - - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_CREATION_CACHE_CONTROL_FEATURES_EXT: { - VkPhysicalDevicePipelineCreationCacheControlFeaturesEXT *features = (void *) ext; - features->pipelineCreationCacheControl = true; - break; - } - - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROVOKING_VERTEX_FEATURES_EXT: { - VkPhysicalDeviceProvokingVertexFeaturesEXT *features = (void *) ext; - features->provokingVertexLast = true; - /* FIXME: update when supporting EXT_transform_feedback */ - features->transformFeedbackPreservesProvokingVertex = false; - break; - } - - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_FEATURES_EXT: { - VkPhysicalDeviceVertexAttributeDivisorFeaturesEXT *features = - (void *) ext; - features->vertexAttributeInstanceRateDivisor = true; - features->vertexAttributeInstanceRateZeroDivisor = false; - break; - } - - /* Vulkan 1.1 */ - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES: { - VkPhysicalDeviceVulkan11Features *features = - (VkPhysicalDeviceVulkan11Features *)ext; - memcpy(features, &vk11, sizeof(VkPhysicalDeviceVulkan11Features)); - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES: { - VkPhysicalDevice16BitStorageFeatures *features = (void *) ext; - features->storageBuffer16BitAccess = vk11.storageBuffer16BitAccess; - features->uniformAndStorageBuffer16BitAccess = - vk11.uniformAndStorageBuffer16BitAccess; - features->storagePushConstant16 = vk11.storagePushConstant16; - features->storageInputOutput16 = vk11.storageInputOutput16; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_FEATURES: { - VkPhysicalDeviceMultiviewFeatures *features = (void *) ext; - features->multiview = vk11.multiview; - features->multiviewGeometryShader = vk11.multiviewGeometryShader; - features->multiviewTessellationShader = vk11.multiviewTessellationShader; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROTECTED_MEMORY_FEATURES: { - VkPhysicalDeviceProtectedMemoryFeatures *features = (void *) ext; - features->protectedMemory = vk11.protectedMemory; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES: { - VkPhysicalDeviceSamplerYcbcrConversionFeatures *features = (void *) ext; - features->samplerYcbcrConversion = vk11.samplerYcbcrConversion; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DRAW_PARAMETERS_FEATURES: { - VkPhysicalDeviceShaderDrawParametersFeatures *features = (void *) ext; - 
features->shaderDrawParameters = vk11.shaderDrawParameters; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTERS_FEATURES: { - VkPhysicalDeviceVariablePointersFeatures *features = (void *) ext; - features->variablePointersStorageBuffer = - vk11.variablePointersStorageBuffer; - features->variablePointers = vk11.variablePointers; - break; - } - - default: - v3dv_debug_ignored_stype(ext->sType); - break; - } - } -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_GetDeviceGroupPeerMemoryFeatures(VkDevice device, - uint32_t heapIndex, - uint32_t localDeviceIndex, - uint32_t remoteDeviceIndex, - VkPeerMemoryFeatureFlags *pPeerMemoryFeatures) -{ - assert(localDeviceIndex == 0 && remoteDeviceIndex == 0); - *pPeerMemoryFeatures = VK_PEER_MEMORY_FEATURE_COPY_SRC_BIT | - VK_PEER_MEMORY_FEATURE_COPY_DST_BIT | - VK_PEER_MEMORY_FEATURE_GENERIC_SRC_BIT | - VK_PEER_MEMORY_FEATURE_GENERIC_DST_BIT; -} - uint32_t v3dv_physical_device_vendor_id(struct v3dv_physical_device *dev) { return 0x14E4; /* Broadcom */ } - -#if using_v3d_simulator -static bool -get_i915_param(int fd, uint32_t param, int *value) -{ - int tmp; - - struct drm_i915_getparam gp = { - .param = param, - .value = &tmp, - }; - - int ret = drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp); - if (ret != 0) - return false; - - *value = tmp; - return true; -} -#endif - uint32_t v3dv_physical_device_device_id(struct v3dv_physical_device *dev) { #if using_v3d_simulator - int devid = 0; - - if (!get_i915_param(dev->render_fd, I915_PARAM_CHIPSET_ID, &devid)) - fprintf(stderr, "Error getting device_id\n"); - - return devid; + return dev->device_id; #else switch (dev->devinfo.ver) { case 42: return 0xBE485FD3; /* Broadcom deviceID for 2711 */ + case 71: + return 0x55701C33; /* Broadcom deviceID for 2712 */ default: unreachable("Unsupported V3D version"); } @@ -1260,18 +1109,18 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, STATIC_ASSERT(MAX_STORAGE_BUFFERS >= MAX_DYNAMIC_STORAGE_BUFFERS); const uint32_t page_size = 4096; - const uint32_t mem_size = compute_heap_size(); + const uint64_t mem_size = compute_heap_size(); const uint32_t max_varying_components = 16 * 4; - const uint32_t v3d_coord_shift = 6; - - const float v3d_point_line_granularity = 2.0f / (1 << v3d_coord_shift); - const uint32_t max_fb_size = 4096; + const float v3d_point_line_granularity = 2.0f / (1 << V3D_COORD_SHIFT); + const uint32_t max_fb_size = V3D_MAX_IMAGE_DIMENSION; const VkSampleCountFlags supported_sample_counts = VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_4_BIT; + const uint8_t max_rts = V3D_MAX_RENDER_TARGETS(pdevice->devinfo.ver); + struct timespec clock_res; clock_getres(CLOCK_MONOTONIC, &clock_res); const float timestamp_period = @@ -1279,18 +1128,18 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, /* FIXME: this will probably require an in-depth review */ VkPhysicalDeviceLimits limits = { - .maxImageDimension1D = 4096, - .maxImageDimension2D = 4096, - .maxImageDimension3D = 4096, - .maxImageDimensionCube = 4096, - .maxImageArrayLayers = 2048, + .maxImageDimension1D = V3D_MAX_IMAGE_DIMENSION, + .maxImageDimension2D = V3D_MAX_IMAGE_DIMENSION, + .maxImageDimension3D = V3D_MAX_IMAGE_DIMENSION, + .maxImageDimensionCube = V3D_MAX_IMAGE_DIMENSION, + .maxImageArrayLayers = V3D_MAX_ARRAY_LAYERS, .maxTexelBufferElements = (1ul << 28), .maxUniformBufferRange = V3D_MAX_BUFFER_RANGE, .maxStorageBufferRange = V3D_MAX_BUFFER_RANGE, .maxPushConstantsSize = MAX_PUSH_CONSTANTS_SIZE, .maxMemoryAllocationCount = mem_size / page_size, 
.maxSamplerAllocationCount = 64 * 1024, - .bufferImageGranularity = 256, /* A cache line */ + .bufferImageGranularity = V3D_NON_COHERENT_ATOM_SIZE, .sparseAddressSpaceSize = 0, .maxBoundDescriptorSets = MAX_SETS, .maxPerStageDescriptorSamplers = V3D_MAX_TEXTURE_SAMPLERS, @@ -1342,7 +1191,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, .maxFragmentInputComponents = max_varying_components, .maxFragmentOutputAttachments = 4, .maxFragmentDualSrcAttachments = 0, - .maxFragmentCombinedOutputResources = MAX_RENDER_TARGETS + + .maxFragmentCombinedOutputResources = max_rts + MAX_STORAGE_BUFFERS + MAX_STORAGE_IMAGES, @@ -1352,10 +1201,11 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, .maxComputeWorkGroupInvocations = 256, .maxComputeWorkGroupSize = { 256, 256, 256 }, - .subPixelPrecisionBits = v3d_coord_shift, + .subPixelPrecisionBits = V3D_COORD_SHIFT, .subTexelPrecisionBits = 8, .mipmapPrecisionBits = 8, - .maxDrawIndexedIndexValue = 0x00ffffff, + .maxDrawIndexedIndexValue = pdevice->devinfo.ver >= 71 ? + 0xffffffff : 0x00ffffff, .maxDrawIndirectCount = 0x7fffffff, .maxSamplerLodBias = 14.0f, .maxSamplerAnisotropy = 16.0f, @@ -1365,7 +1215,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, 2.0 * max_fb_size - 1 }, .viewportSubPixelBits = 0, .minMemoryMapAlignment = page_size, - .minTexelBufferOffsetAlignment = V3D_UIFBLOCK_SIZE, + .minTexelBufferOffsetAlignment = V3D_TMU_TEXEL_ALIGN, .minUniformBufferOffsetAlignment = 32, .minStorageBufferOffsetAlignment = 32, .minTexelOffset = -8, @@ -1374,7 +1224,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, .maxTexelGatherOffset = 7, .minInterpolationOffset = -0.5, .maxInterpolationOffset = 0.5, - .subPixelInterpolationOffsetBits = v3d_coord_shift, + .subPixelInterpolationOffsetBits = V3D_COORD_SHIFT, .maxFramebufferWidth = max_fb_size, .maxFramebufferHeight = max_fb_size, .maxFramebufferLayers = 256, @@ -1382,7 +1232,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, .framebufferDepthSampleCounts = supported_sample_counts, .framebufferStencilSampleCounts = supported_sample_counts, .framebufferNoAttachmentsSampleCounts = supported_sample_counts, - .maxColorAttachments = MAX_RENDER_TARGETS, + .maxColorAttachments = max_rts, .sampledImageColorSampleCounts = supported_sample_counts, .sampledImageIntegerSampleCounts = supported_sample_counts, .sampledImageDepthSampleCounts = supported_sample_counts, @@ -1404,7 +1254,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, .standardSampleLocations = false, .optimalBufferCopyOffsetAlignment = 32, .optimalBufferCopyRowPitchAlignment = 32, - .nonCoherentAtomSize = 256, + .nonCoherentAtomSize = V3D_NON_COHERENT_ATOM_SIZE, }; *pProperties = (VkPhysicalDeviceProperties) { @@ -1431,7 +1281,166 @@ v3dv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, v3dv_GetPhysicalDeviceProperties(physicalDevice, &pProperties->properties); + /* We don't really have special restrictions for the maximum + * descriptors per set, other than maybe not exceeding the limits + * of addressable memory in a single allocation on either the host + * or the GPU. This will be a much larger limit than any of the + * per-stage limits already available in Vulkan though, so in practice, + * it is not expected to limit anything beyond what is already + * constrained through per-stage limits. 
+ */ + const uint32_t max_host_descriptors = + (UINT32_MAX - sizeof(struct v3dv_descriptor_set)) / + sizeof(struct v3dv_descriptor); + const uint32_t max_gpu_descriptors = + (UINT32_MAX / v3dv_X(pdevice, max_descriptor_bo_size)()); + + VkPhysicalDeviceVulkan13Properties vk13 = { + .maxInlineUniformBlockSize = 4096, + .maxPerStageDescriptorInlineUniformBlocks = MAX_INLINE_UNIFORM_BUFFERS, + .maxDescriptorSetInlineUniformBlocks = MAX_INLINE_UNIFORM_BUFFERS, + .maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks = + MAX_INLINE_UNIFORM_BUFFERS, + .maxDescriptorSetUpdateAfterBindInlineUniformBlocks = + MAX_INLINE_UNIFORM_BUFFERS, + .maxBufferSize = V3D_MAX_BUFFER_RANGE, + .storageTexelBufferOffsetAlignmentBytes = V3D_TMU_TEXEL_ALIGN, + .storageTexelBufferOffsetSingleTexelAlignment = false, + .uniformTexelBufferOffsetAlignmentBytes = V3D_TMU_TEXEL_ALIGN, + .uniformTexelBufferOffsetSingleTexelAlignment = false, + /* No native acceleration for integer dot product. We use NIR lowering. */ + .integerDotProduct8BitUnsignedAccelerated = false, + .integerDotProduct8BitMixedSignednessAccelerated = false, + .integerDotProduct4x8BitPackedUnsignedAccelerated = false, + .integerDotProduct4x8BitPackedSignedAccelerated = false, + .integerDotProduct4x8BitPackedMixedSignednessAccelerated = false, + .integerDotProduct16BitUnsignedAccelerated = false, + .integerDotProduct16BitSignedAccelerated = false, + .integerDotProduct16BitMixedSignednessAccelerated = false, + .integerDotProduct32BitUnsignedAccelerated = false, + .integerDotProduct32BitSignedAccelerated = false, + .integerDotProduct32BitMixedSignednessAccelerated = false, + .integerDotProduct64BitUnsignedAccelerated = false, + .integerDotProduct64BitSignedAccelerated = false, + .integerDotProduct64BitMixedSignednessAccelerated = false, + .integerDotProductAccumulatingSaturating8BitUnsignedAccelerated = false, + .integerDotProductAccumulatingSaturating8BitSignedAccelerated = false, + .integerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated = false, + .integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated = false, + .integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated = false, + .integerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated = false, + .integerDotProductAccumulatingSaturating16BitUnsignedAccelerated = false, + .integerDotProductAccumulatingSaturating16BitSignedAccelerated = false, + .integerDotProductAccumulatingSaturating16BitMixedSignednessAccelerated = false, + .integerDotProductAccumulatingSaturating32BitUnsignedAccelerated = false, + .integerDotProductAccumulatingSaturating32BitSignedAccelerated = false, + .integerDotProductAccumulatingSaturating32BitMixedSignednessAccelerated = false, + .integerDotProductAccumulatingSaturating64BitUnsignedAccelerated = false, + .integerDotProductAccumulatingSaturating64BitSignedAccelerated = false, + .integerDotProductAccumulatingSaturating64BitMixedSignednessAccelerated = false, + /* VK_EXT_subgroup_size_control */ + .minSubgroupSize = V3D_CHANNELS, + .maxSubgroupSize = V3D_CHANNELS, + .maxComputeWorkgroupSubgroups = 16, /* 256 / 16 */ + .requiredSubgroupSizeStages = VK_SHADER_STAGE_COMPUTE_BIT, + }; + + VkPhysicalDeviceVulkan12Properties vk12 = { + .driverID = VK_DRIVER_ID_MESA_V3DV, + .conformanceVersion = { + .major = 1, + .minor = 3, + .subminor = 6, + .patch = 1, + }, + .supportedDepthResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT, + .supportedStencilResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT, + /* FIXME: if we want to support 
independentResolveNone then we would + * need to honor attachment load operations on resolve attachments, + * which we currently ignore because the resolve makes them irrelevant, + * as it unconditionally writes all pixels in the render area. However, + * with independentResolveNone, it is possible to have one aspect of a + * D/S resolve attachment stay unresolved, in which case the attachment + * load operation is relevant. + * + * NOTE: implementing attachment load for resolve attachments isn't + * immediately trivial because these attachments are not part of the + * framebuffer and therefore we can't use the same mechanism we use + * for framebuffer attachments. Instead, we should probably have to + * emit a meta operation for that right at the start of the render + * pass (or subpass). + */ + .independentResolveNone = false, + .independentResolve = false, + .maxTimelineSemaphoreValueDifference = UINT64_MAX, + + .denormBehaviorIndependence = VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL, + .roundingModeIndependence = VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL, + .shaderSignedZeroInfNanPreserveFloat16 = true, + .shaderSignedZeroInfNanPreserveFloat32 = true, + .shaderSignedZeroInfNanPreserveFloat64 = false, + .shaderDenormPreserveFloat16 = true, + .shaderDenormPreserveFloat32 = true, + .shaderDenormPreserveFloat64 = false, + .shaderDenormFlushToZeroFloat16 = false, + .shaderDenormFlushToZeroFloat32 = false, + .shaderDenormFlushToZeroFloat64 = false, + .shaderRoundingModeRTEFloat16 = true, + .shaderRoundingModeRTEFloat32 = true, + .shaderRoundingModeRTEFloat64 = false, + .shaderRoundingModeRTZFloat16 = false, + .shaderRoundingModeRTZFloat32 = false, + .shaderRoundingModeRTZFloat64 = false, + + /* V3D doesn't support min/max filtering */ + .filterMinmaxSingleComponentFormats = false, + .filterMinmaxImageComponentMapping = false, + + .framebufferIntegerColorSampleCounts = + VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_4_BIT, + }; + memset(vk12.driverName, 0, VK_MAX_DRIVER_NAME_SIZE); + snprintf(vk12.driverName, VK_MAX_DRIVER_NAME_SIZE, "V3DV Mesa"); + memset(vk12.driverInfo, 0, VK_MAX_DRIVER_INFO_SIZE); + snprintf(vk12.driverInfo, VK_MAX_DRIVER_INFO_SIZE, + "Mesa " PACKAGE_VERSION MESA_GIT_SHA1); + + VkSubgroupFeatureFlags subgroup_ops = VK_SUBGROUP_FEATURE_BASIC_BIT; + if (pdevice->devinfo.ver >= 71) { + subgroup_ops |= VK_SUBGROUP_FEATURE_BALLOT_BIT | + VK_SUBGROUP_FEATURE_SHUFFLE_BIT | + VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT | + VK_SUBGROUP_FEATURE_VOTE_BIT | + VK_SUBGROUP_FEATURE_QUAD_BIT; + } + + VkPhysicalDeviceVulkan11Properties vk11 = { + .deviceLUIDValid = false, + .subgroupSize = V3D_CHANNELS, + .subgroupSupportedStages = VK_SHADER_STAGE_COMPUTE_BIT | + VK_SHADER_STAGE_FRAGMENT_BIT, + .subgroupSupportedOperations = subgroup_ops, + .subgroupQuadOperationsInAllStages = false, + .pointClippingBehavior = VK_POINT_CLIPPING_BEHAVIOR_ALL_CLIP_PLANES, + .maxMultiviewViewCount = MAX_MULTIVIEW_VIEW_COUNT, + .maxMultiviewInstanceIndex = UINT32_MAX - 1, + .protectedNoFault = false, + .maxPerSetDescriptors = MIN2(max_host_descriptors, max_gpu_descriptors), + /* Minimum required by the spec */ + .maxMemoryAllocationSize = MAX_MEMORY_ALLOCATION_SIZE, + }; + memcpy(vk11.deviceUUID, pdevice->device_uuid, VK_UUID_SIZE); + memcpy(vk11.driverUUID, pdevice->driver_uuid, VK_UUID_SIZE); + + vk_foreach_struct(ext, pProperties->pNext) { + if (vk_get_physical_device_core_1_1_property_ext(ext, &vk11)) + continue; + if (vk_get_physical_device_core_1_2_property_ext(ext, &vk12)) + continue; + if 
(vk_get_physical_device_core_1_3_property_ext(ext, &vk13)) + continue; + switch (ext->sType) { case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CUSTOM_BORDER_COLOR_PROPERTIES_EXT: { VkPhysicalDeviceCustomBorderColorPropertiesEXT *props = @@ -1453,15 +1462,31 @@ v3dv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, props->maxVertexAttribDivisor = 0xffff; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES: { - VkPhysicalDeviceIDProperties *id_props = - (VkPhysicalDeviceIDProperties *)ext; - memcpy(id_props->deviceUUID, pdevice->device_uuid, VK_UUID_SIZE); - memcpy(id_props->driverUUID, pdevice->driver_uuid, VK_UUID_SIZE); - /* The LUID is for Windows. */ - id_props->deviceLUIDValid = false; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_PROPERTIES_KHR : { + VkPhysicalDevicePerformanceQueryPropertiesKHR *props = + (VkPhysicalDevicePerformanceQueryPropertiesKHR *)ext; + + props->allowCommandBufferQueryCopies = true; + break; + } +#if DETECT_OS_ANDROID +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wswitch" + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRESENTATION_PROPERTIES_ANDROID: { + VkPhysicalDevicePresentationPropertiesANDROID *props = + (VkPhysicalDevicePresentationPropertiesANDROID *)ext; + uint64_t front_rendering_usage = 0; + struct u_gralloc *gralloc = u_gralloc_create(U_GRALLOC_TYPE_AUTO); + if (gralloc != NULL) { + u_gralloc_get_front_rendering_usage(gralloc, &front_rendering_usage); + u_gralloc_destroy(&gralloc); + } + props->sharedImage = front_rendering_usage ? VK_TRUE + : VK_FALSE; break; } +#pragma GCC diagnostic pop +#endif case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRM_PROPERTIES_EXT: { VkPhysicalDeviceDrmPropertiesEXT *props = (VkPhysicalDeviceDrmPropertiesEXT *)ext; @@ -1477,34 +1502,10 @@ v3dv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, } break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_3_PROPERTIES: { - VkPhysicalDeviceMaintenance3Properties *props = - (VkPhysicalDeviceMaintenance3Properties *)ext; - /* We don't really have special restrictions for the maximum - * descriptors per set, other than maybe not exceeding the limits - * of addressable memory in a single allocation on either the host - * or the GPU. This will be a much larger limit than any of the - * per-stage limits already available in Vulkan though, so in practice, - * it is not expected to limit anything beyond what is already - * constrained through per-stage limits. 
- */ - uint32_t max_host_descriptors = - (UINT32_MAX - sizeof(struct v3dv_descriptor_set)) / - sizeof(struct v3dv_descriptor); - uint32_t max_gpu_descriptors = - (UINT32_MAX / v3dv_X(pdevice, max_descriptor_bo_size)()); - props->maxPerSetDescriptors = - MIN2(max_host_descriptors, max_gpu_descriptors); - - /* Minimum required by the spec */ - props->maxMemoryAllocationSize = MAX_MEMORY_ALLOCATION_SIZE; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_PROPERTIES: { - VkPhysicalDeviceMultiviewProperties *props = - (VkPhysicalDeviceMultiviewProperties *)ext; - props->maxMultiviewViewCount = MAX_MULTIVIEW_VIEW_COUNT; - props->maxMultiviewInstanceIndex = UINT32_MAX - 1; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_LINE_RASTERIZATION_PROPERTIES_EXT: { + VkPhysicalDeviceLineRasterizationPropertiesEXT *props = + (VkPhysicalDeviceLineRasterizationPropertiesEXT *)ext; + props->lineSubPixelPrecisionBits = V3D_COORD_SHIFT; break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PCI_BUS_INFO_PROPERTIES_EXT: @@ -1512,26 +1513,33 @@ v3dv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, * never provide this extension. */ break; - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_POINT_CLIPPING_PROPERTIES: { - VkPhysicalDevicePointClippingProperties *props = - (VkPhysicalDevicePointClippingProperties *)ext; - props->pointClippingBehavior = - VK_POINT_CLIPPING_BEHAVIOR_ALL_CLIP_PLANES; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_MODULE_IDENTIFIER_PROPERTIES_EXT: { + VkPhysicalDeviceShaderModuleIdentifierPropertiesEXT *props = + (VkPhysicalDeviceShaderModuleIdentifierPropertiesEXT *)ext; + STATIC_ASSERT(sizeof(vk_shaderModuleIdentifierAlgorithmUUID) == + sizeof(props->shaderModuleIdentifierAlgorithmUUID)); + memcpy(props->shaderModuleIdentifierAlgorithmUUID, + vk_shaderModuleIdentifierAlgorithmUUID, + sizeof(props->shaderModuleIdentifierAlgorithmUUID)); break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROTECTED_MEMORY_PROPERTIES: { - VkPhysicalDeviceProtectedMemoryProperties *props = - (VkPhysicalDeviceProtectedMemoryProperties *)ext; - props->protectedNoFault = false; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_ROBUSTNESS_PROPERTIES_EXT: { + VkPhysicalDevicePipelineRobustnessPropertiesEXT *props = + (VkPhysicalDevicePipelineRobustnessPropertiesEXT *)ext; + props->defaultRobustnessStorageBuffers = + VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DEVICE_DEFAULT_EXT; + props->defaultRobustnessUniformBuffers = + VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DEVICE_DEFAULT_EXT; + props->defaultRobustnessVertexInputs = + VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DEVICE_DEFAULT_EXT; + props->defaultRobustnessImages = + VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_DEVICE_DEFAULT_EXT; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES: { - VkPhysicalDeviceSubgroupProperties *props = - (VkPhysicalDeviceSubgroupProperties *)ext; - props->subgroupSize = V3D_CHANNELS; - props->supportedStages = VK_SHADER_STAGE_COMPUTE_BIT; - props->supportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT; - props->quadOperationsInAllStages = false; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTI_DRAW_PROPERTIES_EXT: { + VkPhysicalDeviceMultiDrawPropertiesEXT *properties = + (VkPhysicalDeviceMultiDrawPropertiesEXT *)ext; + properties->maxMultiDrawCount = 2048; break; } default: @@ -1553,25 +1561,14 @@ v3dv_queue_family_properties = { }; VKAPI_ATTR void VKAPI_CALL -v3dv_GetPhysicalDeviceQueueFamilyProperties(VkPhysicalDevice physicalDevice, - uint32_t *pCount, - VkQueueFamilyProperties *pQueueFamilyProperties) -{ - 
VK_OUTARRAY_MAKE(out, pQueueFamilyProperties, pCount); - - vk_outarray_append(&out, p) { - *p = v3dv_queue_family_properties; - } -} - -VKAPI_ATTR void VKAPI_CALL v3dv_GetPhysicalDeviceQueueFamilyProperties2(VkPhysicalDevice physicalDevice, uint32_t *pQueueFamilyPropertyCount, VkQueueFamilyProperties2 *pQueueFamilyProperties) { - VK_OUTARRAY_MAKE(out, pQueueFamilyProperties, pQueueFamilyPropertyCount); + VK_OUTARRAY_MAKE_TYPED(VkQueueFamilyProperties2, out, + pQueueFamilyProperties, pQueueFamilyPropertyCount); - vk_outarray_append(&out, p) { + vk_outarray_append_typed(VkQueueFamilyProperties2, &out, p) { p->queueFamilyProperties = v3dv_queue_family_properties; vk_foreach_struct(s, p->pNext) { @@ -1592,11 +1589,28 @@ VKAPI_ATTR void VKAPI_CALL v3dv_GetPhysicalDeviceMemoryProperties2(VkPhysicalDevice physicalDevice, VkPhysicalDeviceMemoryProperties2 *pMemoryProperties) { + V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice); + v3dv_GetPhysicalDeviceMemoryProperties(physicalDevice, &pMemoryProperties->memoryProperties); vk_foreach_struct(ext, pMemoryProperties->pNext) { switch (ext->sType) { + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT: { + VkPhysicalDeviceMemoryBudgetPropertiesEXT *p = + (VkPhysicalDeviceMemoryBudgetPropertiesEXT *) ext; + p->heapUsage[0] = device->heap_used; + p->heapBudget[0] = compute_memory_budget(device); + + /* The heapBudget and heapUsage values must be zero for array elements + * greater than or equal to VkPhysicalDeviceMemoryProperties::memoryHeapCount + */ + for (unsigned i = 1; i < VK_MAX_MEMORY_HEAPS; i++) { + p->heapBudget[i] = 0u; + p->heapUsage[i] = 0u; + } + break; + } default: v3dv_debug_ignored_stype(ext->sType); break; @@ -1618,11 +1632,6 @@ v3dv_GetInstanceProcAddr(VkInstance _instance, * vk_icdGetInstanceProcAddr to work around certain LD_PRELOAD issues seen in apps. 
*/ PUBLIC -VKAPI_ATTR PFN_vkVoidFunction -VKAPI_CALL vk_icdGetInstanceProcAddr(VkInstance instance, - const char *pName); - -PUBLIC VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vk_icdGetInstanceProcAddr(VkInstance instance, const char* pName) @@ -1630,23 +1639,6 @@ vk_icdGetInstanceProcAddr(VkInstance instance, return v3dv_GetInstanceProcAddr(instance, pName); } -/* With version 4+ of the loader interface the ICD should expose - * vk_icdGetPhysicalDeviceProcAddr() - */ -PUBLIC -VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL -vk_icdGetPhysicalDeviceProcAddr(VkInstance _instance, - const char* pName); - -PFN_vkVoidFunction -vk_icdGetPhysicalDeviceProcAddr(VkInstance _instance, - const char* pName) -{ - V3DV_FROM_HANDLE(v3dv_instance, instance, _instance); - - return vk_instance_get_physical_device_proc_addr(&instance->vk, pName); -} - VKAPI_ATTR VkResult VKAPI_CALL v3dv_EnumerateInstanceLayerProperties(uint32_t *pPropertyCount, VkLayerProperties *pProperties) @@ -1671,30 +1663,66 @@ v3dv_EnumerateDeviceLayerProperties(VkPhysicalDevice physicalDevice, return VK_SUCCESS; } - return vk_error((struct v3dv_instance*) physical_device->vk.instance, - VK_ERROR_LAYER_NOT_PRESENT); + return vk_error(physical_device, VK_ERROR_LAYER_NOT_PRESENT); +} + +static void +destroy_queue_syncs(struct v3dv_queue *queue) +{ + for (int i = 0; i < V3DV_QUEUE_COUNT; i++) { + if (queue->last_job_syncs.syncs[i]) { + drmSyncobjDestroy(queue->device->pdevice->render_fd, + queue->last_job_syncs.syncs[i]); + } + } } static VkResult -queue_init(struct v3dv_device *device, struct v3dv_queue *queue) +queue_init(struct v3dv_device *device, struct v3dv_queue *queue, + const VkDeviceQueueCreateInfo *create_info, + uint32_t index_in_family) { - vk_object_base_init(&device->vk, &queue->base, VK_OBJECT_TYPE_QUEUE); + VkResult result = vk_queue_init(&queue->vk, &device->vk, create_info, + index_in_family); + if (result != VK_SUCCESS) + return result; + + result = vk_queue_enable_submit_thread(&queue->vk); + if (result != VK_SUCCESS) + goto fail_submit_thread; + queue->device = device; - queue->flags = 0; + queue->vk.driver_submit = v3dv_queue_driver_submit; + + for (int i = 0; i < V3DV_QUEUE_COUNT; i++) { + queue->last_job_syncs.first[i] = true; + int ret = drmSyncobjCreate(device->pdevice->render_fd, + DRM_SYNCOBJ_CREATE_SIGNALED, + &queue->last_job_syncs.syncs[i]); + if (ret) { + result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED, + "syncobj create failed: %m"); + goto fail_last_job_syncs; + } + } + queue->noop_job = NULL; - list_inithead(&queue->submit_wait_list); - pthread_mutex_init(&queue->mutex, NULL); return VK_SUCCESS; + +fail_last_job_syncs: + destroy_queue_syncs(queue); +fail_submit_thread: + vk_queue_finish(&queue->vk); + return result; } static void queue_finish(struct v3dv_queue *queue) { - vk_object_base_finish(&queue->base); - assert(list_is_empty(&queue->submit_wait_list)); if (queue->noop_job) v3dv_job_destroy(queue->noop_job); - pthread_mutex_destroy(&queue->mutex); + destroy_queue_syncs(queue); + vk_queue_finish(&queue->vk); } static void @@ -1728,19 +1756,6 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice, assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO); - /* Check enabled features */ - if (pCreateInfo->pEnabledFeatures) { - VkPhysicalDeviceFeatures supported_features; - v3dv_GetPhysicalDeviceFeatures(physicalDevice, &supported_features); - VkBool32 *supported_feature = (VkBool32 *)&supported_features; - VkBool32 *enabled_feature = (VkBool32 *)pCreateInfo->pEnabledFeatures; - unsigned 
num_features = sizeof(VkPhysicalDeviceFeatures) / sizeof(VkBool32); - for (uint32_t i = 0; i < num_features; i++) { - if (enabled_feature[i] && !supported_feature[i]) - return vk_error(instance, VK_ERROR_FEATURE_NOT_PRESENT); - } - } - /* Check requested queues (we only expose one queue ) */ assert(pCreateInfo->queueCreateInfoCount == 1); for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) { @@ -1759,56 +1774,46 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice, struct vk_device_dispatch_table dispatch_table; vk_device_dispatch_table_from_entrypoints(&dispatch_table, &v3dv_device_entrypoints, true); + vk_device_dispatch_table_from_entrypoints(&dispatch_table, + &wsi_device_entrypoints, false); result = vk_device_init(&device->vk, &physical_device->vk, &dispatch_table, pCreateInfo, pAllocator); if (result != VK_SUCCESS) { vk_free(&device->vk.alloc, device); - return vk_error(instance, result); + return vk_error(NULL, result); } +#if DETECT_OS_ANDROID + device->gralloc = u_gralloc_create(U_GRALLOC_TYPE_AUTO); + assert(device->gralloc); +#endif + device->instance = instance; device->pdevice = physical_device; - if (pAllocator) - device->vk.alloc = *pAllocator; - else - device->vk.alloc = physical_device->vk.instance->alloc; + mtx_init(&device->query_mutex, mtx_plain); + cnd_init(&device->query_ended); + + device->vk.command_buffer_ops = &v3dv_cmd_buffer_ops; - pthread_mutex_init(&device->mutex, NULL); + vk_device_set_drm_fd(&device->vk, physical_device->render_fd); + vk_device_enable_threaded_submit(&device->vk); - result = queue_init(device, &device->queue); + result = queue_init(device, &device->queue, + pCreateInfo->pQueueCreateInfos, 0); if (result != VK_SUCCESS) goto fail; device->devinfo = physical_device->devinfo; - /* Vulkan 1.1 and VK_KHR_get_physical_device_properties2 added - * VkPhysicalDeviceFeatures2 which can be used in the pNext chain of - * vkDeviceCreateInfo, in which case it should be used instead of - * pEnabledFeatures. 
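 *
 * The precedence rule, in effect (a sketch):
 *
 *    features = features2 ? features2->features
 *                         : *pCreateInfo->pEnabledFeatures;
 *
 * The common Vulkan runtime applies this during vk_device_init() and
 * exposes the result as device->vk.enabled_features, which is what the
 * driver reads below.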
- */ - const VkPhysicalDeviceFeatures2 *features2 = - vk_find_struct_const(pCreateInfo->pNext, PHYSICAL_DEVICE_FEATURES_2); - if (features2) { - memcpy(&device->features, &features2->features, - sizeof(device->features)); - } else if (pCreateInfo->pEnabledFeatures) { - memcpy(&device->features, pCreateInfo->pEnabledFeatures, - sizeof(device->features)); - } - - if (device->features.robustBufferAccess) + if (device->vk.enabled_features.robustBufferAccess) perf_debug("Device created with Robust Buffer Access enabled.\n"); - int ret = drmSyncobjCreate(physical_device->render_fd, - DRM_SYNCOBJ_CREATE_SIGNALED, - &device->last_job_sync); - if (ret) { - result = VK_ERROR_INITIALIZATION_FAILED; - goto fail; - } + if (device->vk.enabled_features.robustImageAccess) + perf_debug("Device created with Robust Image Access enabled.\n"); -#ifdef DEBUG + +#if MESA_DEBUG v3dv_X(device, device_check_prepacked_sizes)(); #endif init_device_meta(device); @@ -1816,14 +1821,42 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice, v3dv_pipeline_cache_init(&device->default_pipeline_cache, device, 0, device->instance->default_pipeline_cache_enabled); device->default_attribute_float = - v3dv_pipeline_create_default_attribute_values(device, NULL); + v3dv_X(device, create_default_attribute_values)(device, NULL); + + device->device_address_mem_ctx = ralloc_context(NULL); + util_dynarray_init(&device->device_address_bo_list, + device->device_address_mem_ctx); + + mtx_init(&device->events.lock, mtx_plain); + result = v3dv_event_allocate_resources(device); + if (result != VK_SUCCESS) + goto fail; + + if (list_is_empty(&device->events.free_list)) { + result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); + goto fail; + } + + result = v3dv_query_allocate_resources(device); + if (result != VK_SUCCESS) + goto fail; *pDevice = v3dv_device_to_handle(device); return VK_SUCCESS; fail: + cnd_destroy(&device->query_ended); + mtx_destroy(&device->query_mutex); + queue_finish(&device->queue); + destroy_device_meta(device); + v3dv_pipeline_cache_finish(&device->default_pipeline_cache); + v3dv_event_free_resources(device); + v3dv_query_free_resources(device); vk_device_finish(&device->vk); +#if DETECT_OS_ANDROID + u_gralloc_destroy(&device->gralloc); +#endif vk_free(&device->vk.alloc, device); return result; @@ -1835,10 +1868,14 @@ v3dv_DestroyDevice(VkDevice _device, { V3DV_FROM_HANDLE(v3dv_device, device, _device); - v3dv_DeviceWaitIdle(_device); + device->vk.dispatch_table.DeviceWaitIdle(_device); queue_finish(&device->queue); - pthread_mutex_destroy(&device->mutex); - drmSyncobjDestroy(device->pdevice->render_fd, device->last_job_sync); + + v3dv_event_free_resources(device); + mtx_destroy(&device->events.lock); + + v3dv_query_free_resources(device); + destroy_device_meta(device); v3dv_pipeline_cache_finish(&device->default_pipeline_cache); @@ -1847,36 +1884,23 @@ v3dv_DestroyDevice(VkDevice _device, device->default_attribute_float = NULL; } + ralloc_free(device->device_address_mem_ctx); + /* Bo cache should be removed the last, as any other object could be * freeing their private bos */ v3dv_bo_cache_destroy(device); + cnd_destroy(&device->query_ended); + mtx_destroy(&device->query_mutex); + vk_device_finish(&device->vk); +#if DETECT_OS_ANDROID + u_gralloc_destroy(&device->gralloc); +#endif vk_free2(&device->vk.alloc, pAllocator, device); } -VKAPI_ATTR void VKAPI_CALL -v3dv_GetDeviceQueue(VkDevice _device, - uint32_t queueFamilyIndex, - uint32_t queueIndex, - VkQueue *pQueue) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); 
- - assert(queueIndex == 0); - assert(queueFamilyIndex == 0); - - *pQueue = v3dv_queue_to_handle(&device->queue); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_DeviceWaitIdle(VkDevice _device) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - return v3dv_QueueWaitIdle(v3dv_queue_to_handle(&device->queue)); -} - static VkResult device_alloc(struct v3dv_device *device, struct v3dv_device_memory *mem, @@ -1914,15 +1938,12 @@ device_free(struct v3dv_device *device, struct v3dv_device_memory *mem) * display device to free the allocated dumb BO. */ if (mem->is_for_wsi) { - assert(mem->has_bo_ownership); - device_free_wsi_dumb(device->instance->physicalDevice.display_fd, - mem->bo->dumb_handle); + device_free_wsi_dumb(device->pdevice->display_fd, mem->bo->dumb_handle); } - if (mem->has_bo_ownership) - v3dv_bo_free(device, mem->bo); - else if (mem->bo) - vk_free(&device->vk.alloc, mem->bo); + p_atomic_add(&device->pdevice->heap_used, -((int64_t)mem->bo->size)); + + v3dv_bo_free(device, mem->bo); } static void @@ -1967,21 +1988,12 @@ device_import_bo(struct v3dv_device *device, int fd, uint64_t size, struct v3dv_bo **bo) { - VkResult result; - - *bo = vk_alloc2(&device->vk.alloc, pAllocator, sizeof(struct v3dv_bo), 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - if (*bo == NULL) { - result = VK_ERROR_OUT_OF_HOST_MEMORY; - goto fail; - } + *bo = NULL; off_t real_size = lseek(fd, 0, SEEK_END); lseek(fd, 0, SEEK_SET); - if (real_size < 0 || (uint64_t) real_size < size) { - result = VK_ERROR_INVALID_EXTERNAL_HANDLE; - goto fail; - } + if (real_size < 0 || (uint64_t) real_size < size) + return VK_ERROR_INVALID_EXTERNAL_HANDLE; int render_fd = device->pdevice->render_fd; assert(render_fd >= 0); @@ -1989,31 +2001,26 @@ device_import_bo(struct v3dv_device *device, int ret; uint32_t handle; ret = drmPrimeFDToHandle(render_fd, fd, &handle); - if (ret) { - result = VK_ERROR_INVALID_EXTERNAL_HANDLE; - goto fail; - } + if (ret) + return VK_ERROR_INVALID_EXTERNAL_HANDLE; struct drm_v3d_get_bo_offset get_offset = { .handle = handle, }; ret = v3dv_ioctl(render_fd, DRM_IOCTL_V3D_GET_BO_OFFSET, &get_offset); - if (ret) { - result = VK_ERROR_INVALID_EXTERNAL_HANDLE; - goto fail; - } + if (ret) + return VK_ERROR_INVALID_EXTERNAL_HANDLE; assert(get_offset.offset != 0); - v3dv_bo_init(*bo, handle, size, get_offset.offset, "import", false); + *bo = v3dv_device_lookup_bo(device->pdevice, handle); + assert(*bo); - return VK_SUCCESS; + if ((*bo)->refcnt == 0) + v3dv_bo_init_import(*bo, handle, size, get_offset.offset, false); + else + p_atomic_inc(&(*bo)->refcnt); -fail: - if (*bo) { - vk_free2(&device->vk.alloc, pAllocator, *bo); - *bo = NULL; - } - return result; + return VK_SUCCESS; } static VkResult @@ -2030,19 +2037,8 @@ device_alloc_for_wsi(struct v3dv_device *device, #if using_v3d_simulator return device_alloc(device, mem, size); #else - /* If we are allocating for WSI we should have a swapchain and thus, - * we should've initialized the display device. However, Zink doesn't - * use swapchains, so in that case we can get here without acquiring the - * display device and we need to do it now. 
- */ VkResult result; - struct v3dv_instance *instance = device->instance; - struct v3dv_physical_device *pdevice = &device->instance->physicalDevice; - if (unlikely(pdevice->display_fd < 0)) { - result = v3dv_physical_device_acquire_display(instance, pdevice, NULL); - if (result != VK_SUCCESS) - return result; - } + struct v3dv_physical_device *pdevice = device->pdevice; assert(pdevice->display_fd != -1); mem->is_for_wsi = true; @@ -2082,6 +2078,53 @@ fail_create: #endif } +static void +device_add_device_address_bo(struct v3dv_device *device, + struct v3dv_bo *bo) +{ + util_dynarray_append(&device->device_address_bo_list, + struct v3dv_bo *, + bo); +} + +static void +device_remove_device_address_bo(struct v3dv_device *device, + struct v3dv_bo *bo) +{ + util_dynarray_delete_unordered(&device->device_address_bo_list, + struct v3dv_bo *, + bo); +} + +static void +free_memory(struct v3dv_device *device, + struct v3dv_device_memory *mem, + const VkAllocationCallbacks *pAllocator) +{ + if (mem == NULL) + return; + + if (mem->bo->map) + device_unmap(device, mem); + + if (mem->is_for_device_address) + device_remove_device_address_bo(device, mem->bo); + + device_free(device, mem); + + vk_device_memory_destroy(&device->vk, pAllocator, &mem->vk); +} + +VKAPI_ATTR void VKAPI_CALL +v3dv_FreeMemory(VkDevice _device, + VkDeviceMemory _mem, + const VkAllocationCallbacks *pAllocator) +{ + V3DV_FROM_HANDLE(v3dv_device, device, _device); + V3DV_FROM_HANDLE(v3dv_device_memory, mem, _mem); + free_memory(device, mem, pAllocator); +} + VKAPI_ATTR VkResult VKAPI_CALL v3dv_AllocateMemory(VkDevice _device, const VkMemoryAllocateInfo *pAllocateInfo, @@ -2090,25 +2133,34 @@ v3dv_AllocateMemory(VkDevice _device, { V3DV_FROM_HANDLE(v3dv_device, device, _device); struct v3dv_device_memory *mem; - struct v3dv_physical_device *pdevice = &device->instance->physicalDevice; + struct v3dv_physical_device *pdevice = device->pdevice; assert(pAllocateInfo->sType == VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO); - /* The Vulkan 1.0.33 spec says "allocationSize must be greater than 0". */ - assert(pAllocateInfo->allocationSize > 0); + /* We always allocate device memory in multiples of a page, so round up + * requested size to that. 
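 * For the power-of-two page size used here this is the usual
 * round-up-and-mask, e.g. align64(10, 4096) = 4096 and
 * align64(8192, 4096) = 8192; equivalently, (size + 4095) & ~4095.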
+ */ + const VkDeviceSize alloc_size = align64(pAllocateInfo->allocationSize, 4096); + + if (unlikely(alloc_size > MAX_MEMORY_ALLOCATION_SIZE)) + return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); + + uint64_t heap_used = p_atomic_read(&pdevice->heap_used); + if (unlikely(heap_used + alloc_size > pdevice->memory.memoryHeaps[0].size)) + return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); - mem = vk_object_zalloc(&device->vk, pAllocator, sizeof(*mem), - VK_OBJECT_TYPE_DEVICE_MEMORY); + mem = vk_device_memory_create(&device->vk, pAllocateInfo, + pAllocator, sizeof(*mem)); if (mem == NULL) return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY); assert(pAllocateInfo->memoryTypeIndex < pdevice->memory.memoryTypeCount); mem->type = &pdevice->memory.memoryTypes[pAllocateInfo->memoryTypeIndex]; - mem->has_bo_ownership = true; mem->is_for_wsi = false; const struct wsi_memory_allocate_info *wsi_info = NULL; const VkImportMemoryFdInfoKHR *fd_info = NULL; + const VkMemoryAllocateFlagsInfo *flags_info = NULL; vk_foreach_struct_const(ext, pAllocateInfo->pNext) { switch ((unsigned)ext->sType) { case VK_STRUCTURE_TYPE_WSI_MEMORY_ALLOCATE_INFO_MESA: @@ -2118,16 +2170,14 @@ v3dv_AllocateMemory(VkDevice _device, fd_info = (void *)ext; break; case VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO: - /* We don't support VK_KHR_buffer_device_address or multiple - * devices per device group, so we can ignore this. - */ + flags_info = (void *)ext; break; - case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR: + case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO: /* We don't have particular optimizations associated with memory * allocations that won't be suballocated to multiple resources. */ break; - case VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR: + case VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO: /* The mask of handle types specified here must be supported * according to VkExternalImageFormatProperties, so it must be * fd or dmabuf, which don't have special requirements for us. @@ -2139,57 +2189,58 @@ v3dv_AllocateMemory(VkDevice _device, } } - VkResult result = VK_SUCCESS; - - /* We always allocate device memory in multiples of a page, so round up - * requested size to that. 
- */ - VkDeviceSize alloc_size = ALIGN(pAllocateInfo->allocationSize, 4096); + VkResult result; - if (unlikely(alloc_size > MAX_MEMORY_ALLOCATION_SIZE)) { - result = VK_ERROR_OUT_OF_DEVICE_MEMORY; + if (wsi_info) { + result = device_alloc_for_wsi(device, pAllocator, mem, alloc_size); + } else if (fd_info && fd_info->handleType) { + assert(fd_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT || + fd_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT); + result = device_import_bo(device, pAllocator, + fd_info->fd, alloc_size, &mem->bo); + if (result == VK_SUCCESS) + close(fd_info->fd); + } else if (mem->vk.ahardware_buffer) { +#if DETECT_OS_ANDROID + const native_handle_t *handle = AHardwareBuffer_getNativeHandle(mem->vk.ahardware_buffer); + assert(handle->numFds > 0); + size_t size = lseek(handle->data[0], 0, SEEK_END); + result = device_import_bo(device, pAllocator, + handle->data[0], size, &mem->bo); +#else + result = VK_ERROR_FEATURE_NOT_PRESENT; +#endif } else { - if (wsi_info) { - result = device_alloc_for_wsi(device, pAllocator, mem, alloc_size); - } else if (fd_info && fd_info->handleType) { - assert(fd_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT || - fd_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT); - result = device_import_bo(device, pAllocator, - fd_info->fd, alloc_size, &mem->bo); - mem->has_bo_ownership = false; - if (result == VK_SUCCESS) - close(fd_info->fd); - } else { - result = device_alloc(device, mem, alloc_size); - } + result = device_alloc(device, mem, alloc_size); } if (result != VK_SUCCESS) { - vk_object_free(&device->vk, pAllocator, mem); - return vk_error(device->instance, result); + vk_device_memory_destroy(&device->vk, pAllocator, &mem->vk); + return vk_error(device, result); } - *pMem = v3dv_device_memory_to_handle(mem); - return result; -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_FreeMemory(VkDevice _device, - VkDeviceMemory _mem, - const VkAllocationCallbacks *pAllocator) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_device_memory, mem, _mem); - - if (mem == NULL) - return; - - if (mem->bo->map) - v3dv_UnmapMemory(_device, _mem); + heap_used = p_atomic_add_return(&pdevice->heap_used, mem->bo->size); + if (heap_used > pdevice->memory.memoryHeaps[0].size) { + free_memory(device, mem, pAllocator); + return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); + } - device_free(device, mem); + /* If this memory can be used via VK_KHR_buffer_device_address then we + * will need to manually add the BO to any job submit that makes use of + * VK_KHR_buffer_device_address, since such jobs may produce buffer + * load/store operations that may access any buffer memory allocated with + * this flag and we don't have any means to tell which buffers will be + * accessed through this mechanism since they don't even have to be bound + * through descriptor state. 
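 *
 * A minimal sketch of the application-side allocation that takes this
 * path (standard Vulkan API; the size and type index are illustrative):
 *
 *    VkMemoryAllocateFlagsInfo flags_info = {
 *       .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO,
 *       .flags = VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT,
 *    };
 *    VkMemoryAllocateInfo alloc_info = {
 *       .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
 *       .pNext = &flags_info,
 *       .allocationSize = size,
 *       .memoryTypeIndex = type_index,
 *    };
 *    vkAllocateMemory(device, &alloc_info, NULL, &mem);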
+ */ + if (flags_info && + (flags_info->flags & VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT)) { + mem->is_for_device_address = true; + device_add_device_address_bo(device, mem->bo); + } - vk_object_free(&device->vk, pAllocator, mem); + *pMem = v3dv_device_memory_to_handle(mem); + return result; } VKAPI_ATTR VkResult VKAPI_CALL @@ -2217,7 +2268,7 @@ v3dv_MapMemory(VkDevice _device, */ VkResult result = device_map(device, mem); if (result != VK_SUCCESS) - return vk_error(device->instance, result); + return vk_error(device, result); *ppData = ((uint8_t *) mem->bo->map) + offset; return VK_SUCCESS; @@ -2252,19 +2303,30 @@ v3dv_InvalidateMappedMemoryRanges(VkDevice _device, return VK_SUCCESS; } -VKAPI_ATTR void VKAPI_CALL -v3dv_GetImageMemoryRequirements2(VkDevice device, - const VkImageMemoryRequirementsInfo2 *pInfo, - VkMemoryRequirements2 *pMemoryRequirements) +static void +get_image_memory_requirements(struct v3dv_image *image, + VkImageAspectFlagBits planeAspect, + VkMemoryRequirements2 *pMemoryRequirements) { - V3DV_FROM_HANDLE(v3dv_image, image, pInfo->image); - pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) { .memoryTypeBits = 0x1, - .alignment = image->alignment, - .size = image->size + .alignment = image->planes[0].alignment, + .size = image->non_disjoint_size }; + if (planeAspect != VK_IMAGE_ASPECT_NONE) { + assert(image->format->plane_count > 1); + /* Disjoint images should have a 0 non_disjoint_size */ + assert(!pMemoryRequirements->memoryRequirements.size); + + uint8_t plane = v3dv_image_aspect_to_plane(image, planeAspect); + + VkMemoryRequirements *mem_reqs = + &pMemoryRequirements->memoryRequirements; + mem_reqs->alignment = image->planes[plane].alignment; + mem_reqs->size = image->planes[plane].size; + } + vk_foreach_struct(ext, pMemoryRequirements->pNext) { switch (ext->sType) { case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: { @@ -2281,6 +2343,65 @@ v3dv_GetImageMemoryRequirements2(VkDevice device, } } +VKAPI_ATTR void VKAPI_CALL +v3dv_GetImageMemoryRequirements2(VkDevice device, + const VkImageMemoryRequirementsInfo2 *pInfo, + VkMemoryRequirements2 *pMemoryRequirements) +{ + V3DV_FROM_HANDLE(v3dv_image, image, pInfo->image); + + VkImageAspectFlagBits planeAspect = VK_IMAGE_ASPECT_NONE; + vk_foreach_struct_const(ext, pInfo->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_IMAGE_PLANE_MEMORY_REQUIREMENTS_INFO: { + VkImagePlaneMemoryRequirementsInfo *req = + (VkImagePlaneMemoryRequirementsInfo *) ext; + planeAspect = req->planeAspect; + break; + } + default: + v3dv_debug_ignored_stype(ext->sType); + break; + } + } + + get_image_memory_requirements(image, planeAspect, pMemoryRequirements); +} + +VKAPI_ATTR void VKAPI_CALL +v3dv_GetDeviceImageMemoryRequirements( + VkDevice _device, + const VkDeviceImageMemoryRequirements *pInfo, + VkMemoryRequirements2 *pMemoryRequirements) +{ + V3DV_FROM_HANDLE(v3dv_device, device, _device); + + struct v3dv_image image = { 0 }; + vk_image_init(&device->vk, &image.vk, pInfo->pCreateInfo); + + ASSERTED VkResult result = + v3dv_image_init(device, pInfo->pCreateInfo, NULL, &image); + assert(result == VK_SUCCESS); + + /* From VkDeviceImageMemoryRequirements spec: + * + * " planeAspect is a VkImageAspectFlagBits value specifying the aspect + * corresponding to the image plane to query. 
This parameter is ignored + * unless pCreateInfo::tiling is + * VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT, or pCreateInfo::flags has + * VK_IMAGE_CREATE_DISJOINT_BIT set" + * + * We need to explicitly ignore that flag, or following asserts could be + * triggered. + */ + VkImageAspectFlagBits planeAspect = + pInfo->pCreateInfo->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT || + pInfo->pCreateInfo->flags & VK_IMAGE_CREATE_DISJOINT_BIT ? + pInfo->planeAspect : 0; + + get_image_memory_requirements(&image, planeAspect, pMemoryRequirements); +} + static void bind_image_memory(const VkBindImageMemoryInfo *info) { @@ -2293,11 +2414,43 @@ bind_image_memory(const VkBindImageMemoryInfo *info) * the VkMemoryRequirements structure returned from a call to * vkGetImageMemoryRequirements with image" */ - assert(info->memoryOffset % image->alignment == 0); assert(info->memoryOffset < mem->bo->size); - image->mem = mem; - image->mem_offset = info->memoryOffset; + uint64_t offset = info->memoryOffset; + if (image->non_disjoint_size) { + /* We only check for plane 0 as it is the only one that actually starts + * at that offset + */ + assert(offset % image->planes[0].alignment == 0); + for (uint8_t plane = 0; plane < image->plane_count; plane++) { + image->planes[plane].mem = mem; + image->planes[plane].mem_offset = offset; + } + } else { + const VkBindImagePlaneMemoryInfo *plane_mem_info = + vk_find_struct_const(info->pNext, BIND_IMAGE_PLANE_MEMORY_INFO); + assert(plane_mem_info); + + /* + * From VkBindImagePlaneMemoryInfo spec: + * + * "If the image’s tiling is VK_IMAGE_TILING_LINEAR or + * VK_IMAGE_TILING_OPTIMAL, then planeAspect must be a single valid + * format plane for the image" + * + * <skip> + * + * "If the image’s tiling is VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT, + * then planeAspect must be a single valid memory plane for the + * image" + * + * So planeAspect should only refer to one plane. 
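 *
 * A sketch of the application-side bind this handles, for plane 1 of an
 * image created with VK_IMAGE_CREATE_DISJOINT_BIT (handles illustrative):
 *
 *    VkBindImagePlaneMemoryInfo plane_info = {
 *       .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO,
 *       .planeAspect = VK_IMAGE_ASPECT_PLANE_1_BIT,
 *    };
 *    VkBindImageMemoryInfo bind_info = {
 *       .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO,
 *       .pNext = &plane_info,
 *       .image = image,
 *       .memory = plane1_mem,
 *       .memoryOffset = 0,
 *    };
 *    vkBindImageMemory2(device, 1, &bind_info);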
+ */ + uint8_t plane = v3dv_plane_from_aspect(plane_mem_info->planeAspect); + assert(offset % image->planes[plane].alignment == 0); + image->planes[plane].mem = mem; + image->planes[plane].mem_offset = offset; + } } VKAPI_ATTR VkResult VKAPI_CALL @@ -2306,21 +2459,59 @@ v3dv_BindImageMemory2(VkDevice _device, const VkBindImageMemoryInfo *pBindInfos) { for (uint32_t i = 0; i < bindInfoCount; i++) { +#if DETECT_OS_ANDROID + V3DV_FROM_HANDLE(v3dv_device_memory, mem, pBindInfos[i].memory); + V3DV_FROM_HANDLE(v3dv_device, device, _device); + if (mem != NULL && mem->vk.ahardware_buffer) { + AHardwareBuffer_Desc description; + const native_handle_t *handle = AHardwareBuffer_getNativeHandle(mem->vk.ahardware_buffer); + + V3DV_FROM_HANDLE(v3dv_image, image, pBindInfos[i].image); + AHardwareBuffer_describe(mem->vk.ahardware_buffer, &description); + + struct u_gralloc_buffer_handle gr_handle = { + .handle = handle, + .pixel_stride = description.stride, + .hal_format = description.format, + }; + + VkResult result = v3dv_gralloc_to_drm_explicit_layout( + device->gralloc, + &gr_handle, + image->android_explicit_layout, + image->android_plane_layouts, + V3DV_MAX_PLANE_COUNT); + if (result != VK_SUCCESS) + return result; + + result = v3dv_update_image_layout( + device, image, image->android_explicit_layout->drmFormatModifier, + /* disjoint = */ false, image->android_explicit_layout); + if (result != VK_SUCCESS) + return result; + } +#endif + const VkBindImageMemorySwapchainInfoKHR *swapchain_info = vk_find_struct_const(pBindInfos->pNext, BIND_IMAGE_MEMORY_SWAPCHAIN_INFO_KHR); if (swapchain_info && swapchain_info->swapchain) { +#if !DETECT_OS_ANDROID struct v3dv_image *swapchain_image = v3dv_wsi_get_image_from_swapchain(swapchain_info->swapchain, swapchain_info->imageIndex); + /* Making the assumption that swapchain images are a single plane */ + assert(swapchain_image->plane_count == 1); VkBindImageMemoryInfo swapchain_bind = { .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO, .image = pBindInfos[i].image, - .memory = v3dv_device_memory_to_handle(swapchain_image->mem), - .memoryOffset = swapchain_image->mem_offset, + .memory = v3dv_device_memory_to_handle(swapchain_image->planes[0].mem), + .memoryOffset = swapchain_image->planes[0].mem_offset, }; bind_image_memory(&swapchain_bind); - } else { +#endif + } else + { bind_image_memory(&pBindInfos[i]); } } @@ -2328,19 +2519,39 @@ v3dv_BindImageMemory2(VkDevice _device, return VK_SUCCESS; } -VKAPI_ATTR void VKAPI_CALL -v3dv_GetBufferMemoryRequirements2(VkDevice device, - const VkBufferMemoryRequirementsInfo2 *pInfo, - VkMemoryRequirements2 *pMemoryRequirements) +void +v3dv_buffer_init(struct v3dv_device *device, + const VkBufferCreateInfo *pCreateInfo, + struct v3dv_buffer *buffer, + uint32_t alignment) { - V3DV_FROM_HANDLE(v3dv_buffer, buffer, pInfo->buffer); + buffer->size = pCreateInfo->size; + buffer->usage = pCreateInfo->usage; + buffer->alignment = alignment; +} +static void +get_buffer_memory_requirements(struct v3dv_buffer *buffer, + VkMemoryRequirements2 *pMemoryRequirements) +{ pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) { .memoryTypeBits = 0x1, .alignment = buffer->alignment, .size = align64(buffer->size, buffer->alignment), }; + /* UBO and SSBO may be read using ldunifa, which prefetches the next + * 4 bytes after a read. 
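 *    (For scale: with 4096-byte pages, an ldunifa read of the last word
 *    of an exactly page-sized buffer, at offset 4092, prefetches bytes
 *    4096..4099, the first word past the end of the page.)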
If the buffer's size is exactly a multiple + * of a page size and the shader reads the last 4 bytes with ldunifa + * the prefetching would read out of bounds and cause an MMU error, + * so we allocate extra space to avoid kernel error spamming. + */ + bool can_ldunifa = buffer->usage & + (VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT); + if (can_ldunifa && (buffer->size % 4096 == 0)) + pMemoryRequirements->memoryRequirements.size += buffer->alignment; + vk_foreach_struct(ext, pMemoryRequirements->pNext) { switch (ext->sType) { case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: { @@ -2357,8 +2568,30 @@ v3dv_GetBufferMemoryRequirements2(VkDevice device, } } -static void -bind_buffer_memory(const VkBindBufferMemoryInfo *info) +VKAPI_ATTR void VKAPI_CALL +v3dv_GetBufferMemoryRequirements2(VkDevice device, + const VkBufferMemoryRequirementsInfo2 *pInfo, + VkMemoryRequirements2 *pMemoryRequirements) +{ + V3DV_FROM_HANDLE(v3dv_buffer, buffer, pInfo->buffer); + get_buffer_memory_requirements(buffer, pMemoryRequirements); +} + +VKAPI_ATTR void VKAPI_CALL +v3dv_GetDeviceBufferMemoryRequirements( + VkDevice _device, + const VkDeviceBufferMemoryRequirements *pInfo, + VkMemoryRequirements2 *pMemoryRequirements) +{ + V3DV_FROM_HANDLE(v3dv_device, device, _device); + + struct v3dv_buffer buffer = { 0 }; + v3dv_buffer_init(device, pInfo->pCreateInfo, &buffer, V3D_NON_COHERENT_ATOM_SIZE); + get_buffer_memory_requirements(&buffer, pMemoryRequirements); +} + +void +v3dv_buffer_bind_memory(const VkBindBufferMemoryInfo *info) { V3DV_FROM_HANDLE(v3dv_buffer, buffer, info->buffer); V3DV_FROM_HANDLE(v3dv_device_memory, mem, info->memory); @@ -2383,7 +2616,7 @@ v3dv_BindBufferMemory2(VkDevice device, const VkBindBufferMemoryInfo *pBindInfos) { for (uint32_t i = 0; i < bindInfoCount; i++) - bind_buffer_memory(&pBindInfos[i]); + v3dv_buffer_bind_memory(&pBindInfos[i]); return VK_SUCCESS; } @@ -2406,16 +2639,16 @@ v3dv_CreateBuffer(VkDevice _device, buffer = vk_object_zalloc(&device->vk, pAllocator, sizeof(*buffer), VK_OBJECT_TYPE_BUFFER); if (buffer == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - buffer->size = pCreateInfo->size; - buffer->usage = pCreateInfo->usage; - buffer->alignment = 256; /* nonCoherentAtomSize */ + v3dv_buffer_init(device, pCreateInfo, buffer, V3D_NON_COHERENT_ATOM_SIZE); /* Limit allocations to 32-bit */ const VkDeviceSize aligned_size = align64(buffer->size, buffer->alignment); - if (aligned_size > UINT32_MAX || aligned_size < buffer->size) + if (aligned_size > UINT32_MAX || aligned_size < buffer->size) { + vk_free(&device->vk.alloc, buffer); return VK_ERROR_OUT_OF_DEVICE_MEMORY; + } *pBuffer = v3dv_buffer_to_handle(buffer); @@ -2452,20 +2685,32 @@ v3dv_CreateFramebuffer(VkDevice _device, framebuffer = vk_object_zalloc(&device->vk, pAllocator, size, VK_OBJECT_TYPE_FRAMEBUFFER); if (framebuffer == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); framebuffer->width = pCreateInfo->width; framebuffer->height = pCreateInfo->height; framebuffer->layers = pCreateInfo->layers; framebuffer->has_edge_padding = true; + const VkFramebufferAttachmentsCreateInfo *imageless = + vk_find_struct_const(pCreateInfo->pNext, + FRAMEBUFFER_ATTACHMENTS_CREATE_INFO); + framebuffer->attachment_count = pCreateInfo->attachmentCount; framebuffer->color_attachment_count = 0; - for (uint32_t i = 0; i < 
pCreateInfo->attachmentCount; i++) { - framebuffer->attachments[i] = - v3dv_image_view_from_handle(pCreateInfo->pAttachments[i]); - if (framebuffer->attachments[i]->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) - framebuffer->color_attachment_count++; + for (uint32_t i = 0; i < framebuffer->attachment_count; i++) { + if (!imageless) { + framebuffer->attachments[i] = + v3dv_image_view_from_handle(pCreateInfo->pAttachments[i]); + if (framebuffer->attachments[i]->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) + framebuffer->color_attachment_count++; + } else { + assert(i < imageless->attachmentImageInfoCount); + if (imageless->pAttachmentImageInfos[i].usage & + VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) { + framebuffer->color_attachment_count++; + } + } } *pFramebuffer = v3dv_framebuffer_to_handle(framebuffer); @@ -2487,6 +2732,105 @@ v3dv_DestroyFramebuffer(VkDevice _device, vk_object_free(&device->vk, pAllocator, fb); } +void +v3dv_setup_dynamic_framebuffer(struct v3dv_cmd_buffer *cmd_buffer, + const VkRenderingInfoKHR *info) +{ + struct v3dv_device *device = cmd_buffer->device; + + /* Max framebuffer attachments is max_color_RTs + D/S multiplied by two for + * MSAA resolves. + */ + const uint32_t max_attachments = + 2 * (V3D_MAX_RENDER_TARGETS(device->devinfo.ver) + 1); + const uint32_t attachments_alloc_size = + sizeof(struct v3dv_image_view *) * max_attachments; + + /* Only allocate the dynamic framebuffer once and will stay valid + * for the duration of the command buffer. + */ + struct v3dv_framebuffer *fb = cmd_buffer->state.dynamic_framebuffer; + if (!fb) { + uint32_t alloc_size = sizeof(struct v3dv_framebuffer) + + attachments_alloc_size; + fb = vk_object_zalloc(&cmd_buffer->device->vk, NULL, alloc_size, + VK_OBJECT_TYPE_FRAMEBUFFER); + if (fb == NULL) { + v3dv_flag_oom(cmd_buffer, NULL); + return; + } + cmd_buffer->state.dynamic_framebuffer = fb; + } else { + memset(fb->attachments, 0, attachments_alloc_size); + } + + fb->width = info->renderArea.offset.x + info->renderArea.extent.width; + fb->height = info->renderArea.offset.y + info->renderArea.extent.height; + + /* From the Vulkan spec for VkFramebufferCreateInfo: + * + * "If the render pass uses multiview, then layers must be one (...)" + */ + fb->layers = info->viewMask == 0 ? info->layerCount : 1; + + struct v3dv_render_pass *pass = &cmd_buffer->state.dynamic_pass; + assert(pass->subpass_count == 1 && pass->subpasses); + assert(pass->subpasses[0].color_count == info->colorAttachmentCount); + fb->color_attachment_count = info->colorAttachmentCount; + + uint32_t a = 0; + for (int i = 0; i < info->colorAttachmentCount; i++) { + if (info->pColorAttachments[i].imageView == VK_NULL_HANDLE) + continue; + fb->attachments[a++] = + v3dv_image_view_from_handle(info->pColorAttachments[i].imageView); + if (info->pColorAttachments[i].resolveMode != VK_RESOLVE_MODE_NONE) { + fb->attachments[a++] = + v3dv_image_view_from_handle(info->pColorAttachments[i].resolveImageView); + } + } + + if ((info->pDepthAttachment && info->pDepthAttachment->imageView) || + (info->pStencilAttachment && info->pStencilAttachment->imageView)) { + const struct VkRenderingAttachmentInfo *common_ds_info = + (info->pDepthAttachment && + info->pDepthAttachment->imageView != VK_NULL_HANDLE) ? 
+ info->pDepthAttachment : + info->pStencilAttachment; + + fb->attachments[a++] = + v3dv_image_view_from_handle(common_ds_info->imageView); + + if (common_ds_info->resolveMode != VK_RESOLVE_MODE_NONE) { + fb->attachments[a++] = + v3dv_image_view_from_handle(common_ds_info->resolveImageView); + } + } + + assert(a == pass->attachment_count); + fb->attachment_count = a; + + /* Dynamic rendering doesn't provide the size of the underlying framebuffer + * so we estimate its size from the render area. This means it is possible + * the underlying attachments are larger and thus we cannot assume we have + * edge padding. + */ + fb->has_edge_padding = false; +} + +void +v3dv_destroy_dynamic_framebuffer(struct v3dv_cmd_buffer *cmd_buffer) +{ + if (!cmd_buffer->state.dynamic_framebuffer) + return; + + VkDevice vk_device = v3dv_device_to_handle(cmd_buffer->device); + VkFramebuffer vk_dynamic_fb = + v3dv_framebuffer_to_handle(cmd_buffer->state.dynamic_framebuffer); + v3dv_DestroyFramebuffer(vk_device, vk_dynamic_fb, NULL); + cmd_buffer->state.dynamic_framebuffer = NULL; +} + VKAPI_ATTR VkResult VKAPI_CALL v3dv_GetMemoryFdPropertiesKHR(VkDevice _device, VkExternalMemoryHandleTypeFlagBits handleType, @@ -2494,7 +2838,7 @@ v3dv_GetMemoryFdPropertiesKHR(VkDevice _device, VkMemoryFdPropertiesKHR *pMemoryFdProperties) { V3DV_FROM_HANDLE(v3dv_device, device, _device); - struct v3dv_physical_device *pdevice = &device->instance->physicalDevice; + struct v3dv_physical_device *pdevice = device->pdevice; switch (handleType) { case VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT: @@ -2502,7 +2846,7 @@ v3dv_GetMemoryFdPropertiesKHR(VkDevice _device, (1 << pdevice->memory.memoryTypeCount) - 1; return VK_SUCCESS; default: - return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE); + return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); } } @@ -2523,7 +2867,7 @@ v3dv_GetMemoryFdKHR(VkDevice _device, mem->bo->handle, DRM_CLOEXEC, &fd); if (ret) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); *pFd = fd; @@ -2531,63 +2875,6 @@ v3dv_GetMemoryFdKHR(VkDevice _device, } VKAPI_ATTR VkResult VKAPI_CALL -v3dv_CreateEvent(VkDevice _device, - const VkEventCreateInfo *pCreateInfo, - const VkAllocationCallbacks *pAllocator, - VkEvent *pEvent) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - struct v3dv_event *event = - vk_object_zalloc(&device->vk, pAllocator, sizeof(*event), - VK_OBJECT_TYPE_EVENT); - if (!event) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - - /* Events are created in the unsignaled state */ - event->state = false; - *pEvent = v3dv_event_to_handle(event); - - return VK_SUCCESS; -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_DestroyEvent(VkDevice _device, - VkEvent _event, - const VkAllocationCallbacks *pAllocator) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_event, event, _event); - - if (!event) - return; - - vk_object_free(&device->vk, pAllocator, event); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetEventStatus(VkDevice _device, VkEvent _event) -{ - V3DV_FROM_HANDLE(v3dv_event, event, _event); - return p_atomic_read(&event->state) ? 
VK_EVENT_SET : VK_EVENT_RESET; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_SetEvent(VkDevice _device, VkEvent _event) -{ - V3DV_FROM_HANDLE(v3dv_event, event, _event); - p_atomic_set(&event->state, 1); - return VK_SUCCESS; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_ResetEvent(VkDevice _device, VkEvent _event) -{ - V3DV_FROM_HANDLE(v3dv_event, event, _event); - p_atomic_set(&event->state, 0); - return VK_SUCCESS; -} - -VKAPI_ATTR VkResult VKAPI_CALL v3dv_CreateSampler(VkDevice _device, const VkSamplerCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, @@ -2601,7 +2888,9 @@ v3dv_CreateSampler(VkDevice _device, sampler = vk_object_zalloc(&device->vk, pAllocator, sizeof(*sampler), VK_OBJECT_TYPE_SAMPLER); if (!sampler) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + sampler->plane_count = 1; sampler->compare_enable = pCreateInfo->compareEnable; sampler->unnormalized_coordinates = pCreateInfo->unnormalizedCoordinates; @@ -2610,7 +2899,21 @@ v3dv_CreateSampler(VkDevice _device, vk_find_struct_const(pCreateInfo->pNext, SAMPLER_CUSTOM_BORDER_COLOR_CREATE_INFO_EXT); - v3dv_X(device, pack_sampler_state)(sampler, pCreateInfo, bc_info); + const VkSamplerYcbcrConversionInfo *ycbcr_conv_info = + vk_find_struct_const(pCreateInfo->pNext, SAMPLER_YCBCR_CONVERSION_INFO); + + const struct vk_format_ycbcr_info *ycbcr_info = NULL; + + if (ycbcr_conv_info) { + VK_FROM_HANDLE(vk_ycbcr_conversion, conversion, ycbcr_conv_info->conversion); + ycbcr_info = vk_format_get_ycbcr_info(conversion->state.format); + if (ycbcr_info) { + sampler->plane_count = ycbcr_info->n_planes; + sampler->conversion = conversion; + } + } + + v3dv_X(device, pack_sampler_state)(device, sampler, pCreateInfo, bc_info); *pSampler = v3dv_sampler_to_handle(sampler); @@ -2659,49 +2962,65 @@ v3dv_GetImageSparseMemoryRequirements2( *pSparseMemoryRequirementCount = 0; } -/* vk_icd.h does not declare this function, so we declare it here to - * suppress Wmissing-prototypes. - */ -PUBLIC VKAPI_ATTR VkResult VKAPI_CALL -vk_icdNegotiateLoaderICDInterfaceVersion(uint32_t* pSupportedVersion); +VKAPI_ATTR void VKAPI_CALL +v3dv_GetDeviceImageSparseMemoryRequirements( + VkDevice device, + const VkDeviceImageMemoryRequirements *pInfo, + uint32_t *pSparseMemoryRequirementCount, + VkSparseImageMemoryRequirements2 *pSparseMemoryRequirements) +{ + *pSparseMemoryRequirementCount = 0; +} -PUBLIC VKAPI_ATTR VkResult VKAPI_CALL -vk_icdNegotiateLoaderICDInterfaceVersion(uint32_t* pSupportedVersion) +VkDeviceAddress +v3dv_GetBufferDeviceAddress(VkDevice device, + const VkBufferDeviceAddressInfo *pInfo) { - /* For the full details on loader interface versioning, see - * <https://github.com/KhronosGroup/Vulkan-LoaderAndValidationLayers/blob/master/loader/LoaderAndLayerInterface.md>. - * What follows is a condensed summary, to help you navigate the large and - * confusing official doc. - * - * - Loader interface v0 is incompatible with later versions. We don't - * support it. - * - * - In loader interface v1: - * - The first ICD entrypoint called by the loader is - * vk_icdGetInstanceProcAddr(). The ICD must statically expose this - * entrypoint. - * - The ICD must statically expose no other Vulkan symbol unless it is - * linked with -Bsymbolic. - * - Each dispatchable Vulkan handle created by the ICD must be - * a pointer to a struct whose first member is VK_LOADER_DATA. The - * ICD must initialize VK_LOADER_DATA.loadMagic to ICD_LOADER_MAGIC. 
- * - The loader implements vkCreate{PLATFORM}SurfaceKHR() and - * vkDestroySurfaceKHR(). The ICD must be capable of working with - * such loader-managed surfaces. - * - * - Loader interface v2 differs from v1 in: - * - The first ICD entrypoint called by the loader is - * vk_icdNegotiateLoaderICDInterfaceVersion(). The ICD must - * statically expose this entrypoint. - * - * - Loader interface v3 differs from v2 in: - * - The ICD must implement vkCreate{PLATFORM}SurfaceKHR(), - * vkDestroySurfaceKHR(), and other API which uses VKSurfaceKHR, - * because the loader no longer does so. - * - * - Loader interface v4 differs from v3 in: - * - The ICD must implement vk_icdGetPhysicalDeviceProcAddr(). - */ - *pSupportedVersion = MIN2(*pSupportedVersion, 3u); - return VK_SUCCESS; + V3DV_FROM_HANDLE(v3dv_buffer, buffer, pInfo->buffer); + return buffer->mem_offset + buffer->mem->bo->offset; +} + +uint64_t +v3dv_GetBufferOpaqueCaptureAddress(VkDevice device, + const VkBufferDeviceAddressInfo *pInfo) +{ + /* Not implemented */ + return 0; +} + +uint64_t +v3dv_GetDeviceMemoryOpaqueCaptureAddress( + VkDevice device, + const VkDeviceMemoryOpaqueCaptureAddressInfo *pInfo) +{ + /* Not implemented */ + return 0; +} + +VkResult +v3dv_create_compute_pipeline_from_nir(struct v3dv_device *device, + nir_shader *nir, + VkPipelineLayout pipeline_layout, + VkPipeline *pipeline) +{ + struct vk_shader_module cs_m = vk_shader_module_from_nir(nir); + + VkPipelineShaderStageCreateInfo set_event_cs_stage = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .module = vk_shader_module_to_handle(&cs_m), + .pName = "main", + }; + + VkComputePipelineCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .stage = set_event_cs_stage, + .layout = pipeline_layout, + }; + + VkResult result = + v3dv_CreateComputePipelines(v3dv_device_to_handle(device), VK_NULL_HANDLE, + 1, &info, &device->vk.alloc, pipeline); + + return result; } diff --git a/src/broadcom/vulkan/v3dv_event.c b/src/broadcom/vulkan/v3dv_event.c new file mode 100644 index 00000000000..a3aad37d9c7 --- /dev/null +++ b/src/broadcom/vulkan/v3dv_event.c @@ -0,0 +1,712 @@ +/* + * Copyright © 2022 Raspberry Pi Ltd + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "v3dv_private.h" +#include "compiler/nir/nir_builder.h" + +#include "vk_common_entrypoints.h" + +static nir_shader * +get_set_event_cs() +{ + const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options(); + nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, + "set event cs"); + + nir_def *buf = + nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0), + .desc_set = 0, + .binding = 0, + .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); + + nir_def *offset = + nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4); + + nir_def *value = + nir_load_push_constant(&b, 1, 8, nir_imm_int(&b, 0), .base = 4, .range = 4); + + nir_store_ssbo(&b, value, buf, offset, + .access = 0, .write_mask = 0x1, .align_mul = 4); + + return b.shader; +} + +static nir_shader * +get_wait_event_cs() +{ + const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options(); + nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, + "wait event cs"); + + nir_def *buf = + nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0), + .desc_set = 0, + .binding = 0, + .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); + + nir_def *offset = + nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4); + + nir_loop *loop = nir_push_loop(&b); + nir_def *load = + nir_load_ssbo(&b, 1, 8, buf, offset, .access = 0, .align_mul = 4); + nir_def *value = nir_i2i32(&b, load); + + nir_if *if_stmt = nir_push_if(&b, nir_ieq_imm(&b, value, 1)); + nir_jump(&b, nir_jump_break); + nir_pop_if(&b, if_stmt); + nir_pop_loop(&b, loop); + + return b.shader; +} + +static bool +create_event_pipelines(struct v3dv_device *device) +{ + VkResult result; + + if (!device->events.descriptor_set_layout) { + /* Pipeline layout: + * - 1 storage buffer for the BO with the events state. + * - 2 push constants: + * 0B: offset of the event in the buffer (4 bytes). + * 4B: value for the event (1 byte), only used with the set_event_pipeline. 
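 *
 * Functionally, the two shaders built above in get_set_event_cs() and
 * get_wait_event_cs() reduce to (a C-style sketch over the event state
 * buffer, with offset/value taken from the push constants):
 *
 *    set_event:  state[offset] = value;
 *    wait_event: while (state[offset] != 1) { }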
+ */ + VkDescriptorSetLayoutBinding descriptor_set_layout_binding = { + .binding = 0, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + }; + + VkDescriptorSetLayoutCreateInfo descriptor_set_layout_info = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .bindingCount = 1, + .pBindings = &descriptor_set_layout_binding, + }; + + result = + v3dv_CreateDescriptorSetLayout(v3dv_device_to_handle(device), + &descriptor_set_layout_info, + &device->vk.alloc, + &device->events.descriptor_set_layout); + + if (result != VK_SUCCESS) + return false; + } + + if (!device->events.pipeline_layout) { + VkPipelineLayoutCreateInfo pipeline_layout_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .setLayoutCount = 1, + .pSetLayouts = &device->events.descriptor_set_layout, + .pushConstantRangeCount = 1, + .pPushConstantRanges = + &(VkPushConstantRange) { VK_SHADER_STAGE_COMPUTE_BIT, 0, 5 }, + }; + + result = + v3dv_CreatePipelineLayout(v3dv_device_to_handle(device), + &pipeline_layout_info, + &device->vk.alloc, + &device->events.pipeline_layout); + + if (result != VK_SUCCESS) + return false; + } + + VkPipeline pipeline; + + if (!device->events.set_event_pipeline) { + nir_shader *set_event_cs_nir = get_set_event_cs(); + result = v3dv_create_compute_pipeline_from_nir(device, + set_event_cs_nir, + device->events.pipeline_layout, + &pipeline); + ralloc_free(set_event_cs_nir); + if (result != VK_SUCCESS) + return false; + + device->events.set_event_pipeline = pipeline; + } + + if (!device->events.wait_event_pipeline) { + nir_shader *wait_event_cs_nir = get_wait_event_cs(); + result = v3dv_create_compute_pipeline_from_nir(device, + wait_event_cs_nir, + device->events.pipeline_layout, + &pipeline); + ralloc_free(wait_event_cs_nir); + if (result != VK_SUCCESS) + return false; + + device->events.wait_event_pipeline = pipeline; + } + + return true; +} + +static void +destroy_event_pipelines(struct v3dv_device *device) +{ + VkDevice _device = v3dv_device_to_handle(device); + + v3dv_DestroyPipeline(_device, device->events.set_event_pipeline, + &device->vk.alloc); + device->events.set_event_pipeline = VK_NULL_HANDLE; + + v3dv_DestroyPipeline(_device, device->events.wait_event_pipeline, + &device->vk.alloc); + device->events.wait_event_pipeline = VK_NULL_HANDLE; + + v3dv_DestroyPipelineLayout(_device, device->events.pipeline_layout, + &device->vk.alloc); + device->events.pipeline_layout = VK_NULL_HANDLE; + + v3dv_DestroyDescriptorSetLayout(_device, + device->events.descriptor_set_layout, + &device->vk.alloc); + device->events.descriptor_set_layout = VK_NULL_HANDLE; +} + +static void +init_event(struct v3dv_device *device, struct v3dv_event *event, uint32_t index) +{ + vk_object_base_init(&device->vk, &event->base, VK_OBJECT_TYPE_EVENT); + event->index = index; + list_addtail(&event->link, &device->events.free_list); +} + +VkResult +v3dv_event_allocate_resources(struct v3dv_device *device) +{ + VkResult result = VK_SUCCESS; + VkDevice _device = v3dv_device_to_handle(device); + + /* BO with event states. Make sure we always align to a page size (4096) + * to ensure we use all the memory the kernel will allocate for the BO. + * + * CTS has tests that require over 8192 active events (yes, really) so + * let's make sure we allow for that. 
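 *
 * At one byte of state per event, the allocation below works out to
 * 3 * 4096 = 12288 pre-allocated events, comfortably above the 8192 the
 * CTS exercises while keeping the BO a whole number of pages.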
+ */ + const uint32_t bo_size = 3 * 4096; + struct v3dv_bo *bo = v3dv_bo_alloc(device, bo_size, "events", true); + if (!bo) { + result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); + goto fail; + } + + device->events.bo = bo; + + if (!v3dv_bo_map(device, bo, bo_size)) { + result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail; + } + + /* Pre-allocate our events, each event requires 1 byte of BO storage */ + device->events.event_count = bo_size; + device->events.events = + vk_zalloc2(&device->vk.alloc, NULL, + device->events.event_count * sizeof(struct v3dv_event), 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + if (!device->events.events) { + result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail; + } + + list_inithead(&device->events.free_list); + for (int i = 0; i < device->events.event_count; i++) + init_event(device, &device->events.events[i], i); + + /* Vulkan buffer for the event state BO */ + VkBufferCreateInfo buf_info = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .size = bo->size, + .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + }; + result = v3dv_CreateBuffer(_device, &buf_info, NULL, + &device->events.buffer); + if (result != VK_SUCCESS) + goto fail; + + struct v3dv_device_memory *mem = + vk_object_zalloc(&device->vk, NULL, sizeof(*mem), + VK_OBJECT_TYPE_DEVICE_MEMORY); + if (!mem) { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + goto fail; + } + + mem->bo = bo; + mem->type = &device->pdevice->memory.memoryTypes[0]; + + device->events.mem = v3dv_device_memory_to_handle(mem); + VkBindBufferMemoryInfo bind_info = { + .sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO, + .buffer = device->events.buffer, + .memory = device->events.mem, + .memoryOffset = 0, + }; + v3dv_BindBufferMemory2(_device, 1, &bind_info); + + /* Pipelines */ + if (!create_event_pipelines(device)) { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + goto fail; + } + + /* Descriptor pool & set to access the buffer */ + VkDescriptorPoolSize pool_size = { + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + }; + VkDescriptorPoolCreateInfo pool_info = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, + .flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, + .maxSets = 1, + .poolSizeCount = 1, + .pPoolSizes = &pool_size, + }; + result = + v3dv_CreateDescriptorPool(_device, &pool_info, NULL, + &device->events.descriptor_pool); + + if (result != VK_SUCCESS) + goto fail; + + VkDescriptorSetAllocateInfo alloc_info = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, + .descriptorPool = device->events.descriptor_pool, + .descriptorSetCount = 1, + .pSetLayouts = &device->events.descriptor_set_layout, + }; + result = v3dv_AllocateDescriptorSets(_device, &alloc_info, + &device->events.descriptor_set); + if (result != VK_SUCCESS) + goto fail; + + VkDescriptorBufferInfo desc_buf_info = { + .buffer = device->events.buffer, + .offset = 0, + .range = VK_WHOLE_SIZE, + }; + + VkWriteDescriptorSet write = { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = device->events.descriptor_set, + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .pBufferInfo = &desc_buf_info, + }; + v3dv_UpdateDescriptorSets(_device, 1, &write, 0, NULL); + + return VK_SUCCESS; + +fail: + v3dv_event_free_resources(device); + return result; +} + +void +v3dv_event_free_resources(struct v3dv_device *device) +{ + if (device->events.bo) { + v3dv_bo_free(device, device->events.bo); + device->events.bo = NULL; + } + + 
if (device->events.events) { + vk_free2(&device->vk.alloc, NULL, device->events.events); + device->events.events = NULL; + } + + if (device->events.mem) { + vk_object_free(&device->vk, NULL, + v3dv_device_memory_from_handle(device->events.mem)); + device->events.mem = VK_NULL_HANDLE; + } + + v3dv_DestroyBuffer(v3dv_device_to_handle(device), + device->events.buffer, NULL); + device->events.buffer = VK_NULL_HANDLE; + + v3dv_FreeDescriptorSets(v3dv_device_to_handle(device), + device->events.descriptor_pool, + 1, &device->events.descriptor_set); + device->events.descriptor_set = VK_NULL_HANDLE; + + v3dv_DestroyDescriptorPool(v3dv_device_to_handle(device), + device->events.descriptor_pool, + NULL); + device->events.descriptor_pool = VK_NULL_HANDLE; + + destroy_event_pipelines(device); +} + +static struct v3dv_event * +allocate_event(struct v3dv_device *device) +{ + mtx_lock(&device->events.lock); + if (list_is_empty(&device->events.free_list)) { + mtx_unlock(&device->events.lock); + return NULL; + } + + struct v3dv_event *event = + list_first_entry(&device->events.free_list, struct v3dv_event, link); + list_del(&event->link); + mtx_unlock(&device->events.lock); + + return event; +} + +static void +free_event(struct v3dv_device *device, uint32_t index) +{ + assert(index < device->events.event_count); + mtx_lock(&device->events.lock); + list_addtail(&device->events.events[index].link, &device->events.free_list); + mtx_unlock(&device->events.lock); +} + +static void +event_set_value(struct v3dv_device *device, + struct v3dv_event *event, + uint8_t value) +{ + assert(value == 0 || value == 1); + uint8_t *data = (uint8_t *) device->events.bo->map; + data[event->index] = value; +} + +static uint8_t +event_get_value(struct v3dv_device *device, struct v3dv_event *event) +{ + uint8_t *data = (uint8_t *) device->events.bo->map; + return data[event->index]; +} + +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_CreateEvent(VkDevice _device, + const VkEventCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkEvent *pEvent) +{ + V3DV_FROM_HANDLE(v3dv_device, device, _device); + VkResult result = VK_SUCCESS; + + struct v3dv_event *event = allocate_event(device); + if (!event) { + result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); + goto fail; + } + + event_set_value(device, event, 0); + *pEvent = v3dv_event_to_handle(event); + return VK_SUCCESS; + +fail: + return result; +} + +VKAPI_ATTR void VKAPI_CALL +v3dv_DestroyEvent(VkDevice _device, + VkEvent _event, + const VkAllocationCallbacks *pAllocator) +{ + V3DV_FROM_HANDLE(v3dv_device, device, _device); + V3DV_FROM_HANDLE(v3dv_event, event, _event); + + if (!event) + return; + + free_event(device, event->index); +} + +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_GetEventStatus(VkDevice _device, VkEvent _event) +{ + V3DV_FROM_HANDLE(v3dv_device, device, _device); + V3DV_FROM_HANDLE(v3dv_event, event, _event); + return event_get_value(device, event) ? 
VK_EVENT_SET : VK_EVENT_RESET; +} + +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_SetEvent(VkDevice _device, VkEvent _event) +{ + V3DV_FROM_HANDLE(v3dv_device, device, _device); + V3DV_FROM_HANDLE(v3dv_event, event, _event); + event_set_value(device, event, 1); + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_ResetEvent(VkDevice _device, VkEvent _event) +{ + V3DV_FROM_HANDLE(v3dv_device, device, _device); + V3DV_FROM_HANDLE(v3dv_event, event, _event); + event_set_value(device, event, 0); + return VK_SUCCESS; +} + +static void +cmd_buffer_emit_set_event(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_event *event, + uint8_t value) +{ + assert(value == 0 || value == 1); + + struct v3dv_device *device = cmd_buffer->device; + VkCommandBuffer commandBuffer = v3dv_cmd_buffer_to_handle(cmd_buffer); + + v3dv_cmd_buffer_meta_state_push(cmd_buffer, true); + + v3dv_CmdBindPipeline(commandBuffer, + VK_PIPELINE_BIND_POINT_COMPUTE, + device->events.set_event_pipeline); + + v3dv_CmdBindDescriptorSets(commandBuffer, + VK_PIPELINE_BIND_POINT_COMPUTE, + device->events.pipeline_layout, + 0, 1, &device->events.descriptor_set, 0, NULL); + + assert(event->index < device->events.event_count); + uint32_t offset = event->index; + v3dv_CmdPushConstants(commandBuffer, + device->events.pipeline_layout, + VK_SHADER_STAGE_COMPUTE_BIT, + 0, 4, &offset); + + v3dv_CmdPushConstants(commandBuffer, + device->events.pipeline_layout, + VK_SHADER_STAGE_COMPUTE_BIT, + 4, 1, &value); + + vk_common_CmdDispatch(commandBuffer, 1, 1, 1); + + v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false); +} + +static void +cmd_buffer_emit_wait_event(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_event *event) +{ + struct v3dv_device *device = cmd_buffer->device; + VkCommandBuffer commandBuffer = v3dv_cmd_buffer_to_handle(cmd_buffer); + + v3dv_cmd_buffer_meta_state_push(cmd_buffer, true); + + v3dv_CmdBindPipeline(commandBuffer, + VK_PIPELINE_BIND_POINT_COMPUTE, + device->events.wait_event_pipeline); + + v3dv_CmdBindDescriptorSets(commandBuffer, + VK_PIPELINE_BIND_POINT_COMPUTE, + device->events.pipeline_layout, + 0, 1, &device->events.descriptor_set, 0, NULL); + + assert(event->index < device->events.event_count); + uint32_t offset = event->index; + v3dv_CmdPushConstants(commandBuffer, + device->events.pipeline_layout, + VK_SHADER_STAGE_COMPUTE_BIT, + 0, 4, &offset); + + vk_common_CmdDispatch(commandBuffer, 1, 1, 1); + + v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false); +} + +VKAPI_ATTR void VKAPI_CALL +v3dv_CmdSetEvent2(VkCommandBuffer commandBuffer, + VkEvent _event, + const VkDependencyInfo *pDependencyInfo) +{ + V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); + V3DV_FROM_HANDLE(v3dv_event, event, _event); + + /* Event (re)sets can only happen outside a render pass instance so we + * should not be in the middle of job recording. + */ + assert(cmd_buffer->state.pass == NULL); + assert(cmd_buffer->state.job == NULL); + + /* We need to add the compute stage to the dstStageMask of all dependencies, + * so let's go ahead and patch the dependency info we receive. + */ + struct v3dv_device *device = cmd_buffer->device; + + uint32_t memory_barrier_count = pDependencyInfo->memoryBarrierCount; + VkMemoryBarrier2 *memory_barriers = memory_barrier_count ? 
+      vk_alloc2(&device->vk.alloc, NULL,
+                memory_barrier_count * sizeof(memory_barriers[0]), 8,
+                VK_SYSTEM_ALLOCATION_SCOPE_COMMAND) : NULL;
+   for (int i = 0; i < memory_barrier_count; i++) {
+      memory_barriers[i] = pDependencyInfo->pMemoryBarriers[i];
+      memory_barriers[i].dstStageMask |= VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT;
+   }
+
+   uint32_t buffer_barrier_count = pDependencyInfo->bufferMemoryBarrierCount;
+   VkBufferMemoryBarrier2 *buffer_barriers = buffer_barrier_count ?
+      vk_alloc2(&device->vk.alloc, NULL,
+                buffer_barrier_count * sizeof(buffer_barriers[0]), 8,
+                VK_SYSTEM_ALLOCATION_SCOPE_COMMAND) : NULL;
+   for (int i = 0; i < buffer_barrier_count; i++) {
+      buffer_barriers[i] = pDependencyInfo->pBufferMemoryBarriers[i];
+      buffer_barriers[i].dstStageMask |= VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT;
+   }
+
+   uint32_t image_barrier_count = pDependencyInfo->imageMemoryBarrierCount;
+   VkImageMemoryBarrier2 *image_barriers = image_barrier_count ?
+      vk_alloc2(&device->vk.alloc, NULL,
+                image_barrier_count * sizeof(image_barriers[0]), 8,
+                VK_SYSTEM_ALLOCATION_SCOPE_COMMAND) : NULL;
+   for (int i = 0; i < image_barrier_count; i++) {
+      image_barriers[i] = pDependencyInfo->pImageMemoryBarriers[i];
+      image_barriers[i].dstStageMask |= VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT;
+   }
+
+   VkDependencyInfo info = {
+      .sType = pDependencyInfo->sType,
+      .dependencyFlags = pDependencyInfo->dependencyFlags,
+      .memoryBarrierCount = memory_barrier_count,
+      .pMemoryBarriers = memory_barriers,
+      .bufferMemoryBarrierCount = buffer_barrier_count,
+      .pBufferMemoryBarriers = buffer_barriers,
+      .imageMemoryBarrierCount = image_barrier_count,
+      .pImageMemoryBarriers = image_barriers,
+   };
+
+   v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &info);
+
+   cmd_buffer_emit_set_event(cmd_buffer, event, 1);
+
+   if (memory_barriers)
+      vk_free2(&device->vk.alloc, NULL, memory_barriers);
+   if (buffer_barriers)
+      vk_free2(&device->vk.alloc, NULL, buffer_barriers);
+   if (image_barriers)
+      vk_free2(&device->vk.alloc, NULL, image_barriers);
+}
+
+VKAPI_ATTR void VKAPI_CALL
+v3dv_CmdResetEvent2(VkCommandBuffer commandBuffer,
+                    VkEvent _event,
+                    VkPipelineStageFlags2 stageMask)
+{
+   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+   V3DV_FROM_HANDLE(v3dv_event, event, _event);
+
+   /* Event (re)sets can only happen outside a render pass instance so we
+    * should not be in the middle of job recording.
+    */
+   assert(cmd_buffer->state.pass == NULL);
+   assert(cmd_buffer->state.job == NULL);
+
+   VkMemoryBarrier2 barrier = {
+      .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
+      .srcStageMask = stageMask,
+      .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+   };
+   VkDependencyInfo info = {
+      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+      .memoryBarrierCount = 1,
+      .pMemoryBarriers = &barrier,
+   };
+   v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &info);
+
+   cmd_buffer_emit_set_event(cmd_buffer, event, 0);
+}
+
+VKAPI_ATTR void VKAPI_CALL
+v3dv_CmdWaitEvents2(VkCommandBuffer commandBuffer,
+                    uint32_t eventCount,
+                    const VkEvent *pEvents,
+                    const VkDependencyInfo *pDependencyInfo)
+{
+   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+   for (uint32_t i = 0; i < eventCount; i++) {
+      struct v3dv_event *event = v3dv_event_from_handle(pEvents[i]);
+      cmd_buffer_emit_wait_event(cmd_buffer, event);
+   }
+
+   /* We need to add the compute stage to the srcStageMask of all dependencies,
+    * so let's go ahead and patch the dependency info we receive.
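+    *
+    * A minimal sketch of the patching for a single memory barrier (the
+    * loops below do the same for buffer and image barriers):
+    *
+    *    VkMemoryBarrier2 b = info->pMemoryBarriers[i];
+    *    b.srcStageMask |= VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT;
+    *
+    * so that the barriers also order against the compute dispatches we
+    * emitted above to wait on the events.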
+ */ + struct v3dv_device *device = cmd_buffer->device; + for (int e = 0; e < eventCount; e++) { + const VkDependencyInfo *info = &pDependencyInfo[e]; + + uint32_t memory_barrier_count = info->memoryBarrierCount; + VkMemoryBarrier2 *memory_barriers = memory_barrier_count ? + vk_alloc2(&device->vk.alloc, NULL, + memory_barrier_count * sizeof(memory_barriers[0]), 8, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND) : NULL; + for (int i = 0; i < memory_barrier_count; i++) { + memory_barriers[i] = info->pMemoryBarriers[i]; + memory_barriers[i].srcStageMask |= VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT; + } + + uint32_t buffer_barrier_count = info->bufferMemoryBarrierCount; + VkBufferMemoryBarrier2 *buffer_barriers = buffer_barrier_count ? + vk_alloc2(&device->vk.alloc, NULL, + buffer_barrier_count * sizeof(buffer_barriers[0]), 8, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND) : NULL; + for (int i = 0; i < buffer_barrier_count; i++) { + buffer_barriers[i] = info->pBufferMemoryBarriers[i]; + buffer_barriers[i].srcStageMask |= VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT; + } + + uint32_t image_barrier_count = info->imageMemoryBarrierCount; + VkImageMemoryBarrier2 *image_barriers = image_barrier_count ? + vk_alloc2(&device->vk.alloc, NULL, + image_barrier_count * sizeof(image_barriers[0]), 8, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND) : NULL; + for (int i = 0; i < image_barrier_count; i++) { + image_barriers[i] = info->pImageMemoryBarriers[i]; + image_barriers[i].srcStageMask |= VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT; + } + + VkDependencyInfo new_info = { + .sType = info->sType, + .dependencyFlags = info->dependencyFlags, + .memoryBarrierCount = memory_barrier_count, + .pMemoryBarriers = memory_barriers, + .bufferMemoryBarrierCount = buffer_barrier_count, + .pBufferMemoryBarriers = buffer_barriers, + .imageMemoryBarrierCount = image_barrier_count, + .pImageMemoryBarriers = image_barriers, + }; + + v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &new_info); + + if (memory_barriers) + vk_free2(&device->vk.alloc, NULL, memory_barriers); + if (buffer_barriers) + vk_free2(&device->vk.alloc, NULL, buffer_barriers); + if (image_barriers) + vk_free2(&device->vk.alloc, NULL, image_barriers); + } +} diff --git a/src/broadcom/vulkan/v3dv_formats.c b/src/broadcom/vulkan/v3dv_formats.c index 6e32d341a25..4d8f648d26a 100644 --- a/src/broadcom/vulkan/v3dv_formats.c +++ b/src/broadcom/vulkan/v3dv_formats.c @@ -1,5 +1,5 @@ /* - * Copyright © 2019 Raspberry Pi + * Copyright © 2019 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -22,15 +22,20 @@ */ #include "v3dv_private.h" +#if DETECT_OS_ANDROID +#include "vk_android.h" +#endif +#include "vk_enum_defines.h" #include "vk_util.h" -#include "vk_format_info.h" #include "drm-uapi/drm_fourcc.h" #include "util/format/u_format.h" #include "vulkan/wsi/wsi_common.h" +#include <vulkan/vulkan_android.h> + const uint8_t * -v3dv_get_format_swizzle(struct v3dv_device *device, VkFormat f) +v3dv_get_format_swizzle(struct v3dv_device *device, VkFormat f, uint8_t plane) { const struct v3dv_format *vf = v3dv_X(device, get_format)(f); static const uint8_t fallback[] = {0, 1, 2, 3}; @@ -38,23 +43,43 @@ v3dv_get_format_swizzle(struct v3dv_device *device, VkFormat f) if (!vf) return fallback; - return vf->swizzle; + return vf->planes[plane].swizzle; } -uint8_t -v3dv_get_tex_return_size(const struct v3dv_format *vf, - bool compare_enable) +bool +v3dv_format_swizzle_needs_rb_swap(const uint8_t 
*swizzle) { - if (unlikely(V3D_DEBUG & V3D_DEBUG_TMU_16BIT)) - return 16; + /* Normal case */ + if (swizzle[0] == PIPE_SWIZZLE_Z) + return swizzle[2] == PIPE_SWIZZLE_X; + + /* Format uses reverse flag */ + if (swizzle[0] == PIPE_SWIZZLE_Y) + return swizzle[2] == PIPE_SWIZZLE_W; - if (unlikely(V3D_DEBUG & V3D_DEBUG_TMU_32BIT)) - return 32; + return false; +} + +bool +v3dv_format_swizzle_needs_reverse(const uint8_t *swizzle) +{ + /* Normal case */ + if (swizzle[0] == PIPE_SWIZZLE_W && + swizzle[1] == PIPE_SWIZZLE_Z && + swizzle[2] == PIPE_SWIZZLE_Y && + swizzle[3] == PIPE_SWIZZLE_X) { + return true; + } - if (compare_enable) - return 16; + /* Format uses RB swap flag */ + if (swizzle[0] == PIPE_SWIZZLE_Y && + swizzle[1] == PIPE_SWIZZLE_Z && + swizzle[2] == PIPE_SWIZZLE_W && + swizzle[3] == PIPE_SWIZZLE_X) { + return true; + } - return vf->return_size; + return false; } /* Some cases of transfer operations are raw data copies that don't depend @@ -62,6 +87,9 @@ v3dv_get_tex_return_size(const struct v3dv_format *vf, * involved). In these cases, it is safe to choose any format supported by * the TFU so long as it has the same texel size, which allows us to use the * TFU paths with formats that are not TFU supported otherwise. + * + * Even when copying multi-plane images, we are copying per-plane, so the + * compatible TFU format will be single-plane. */ const struct v3dv_format * v3dv_get_compatible_tfu_format(struct v3dv_device *device, @@ -82,20 +110,18 @@ v3dv_get_compatible_tfu_format(struct v3dv_device *device, *out_vk_format = vk_format; const struct v3dv_format *format = v3dv_X(device, get_format)(vk_format); - assert(v3dv_X(device, tfu_supports_tex_format)(format->tex_type)); + assert(format->plane_count == 1); + assert(v3dv_X(device, tfu_supports_tex_format)(format->planes[0].tex_type)); return format; } -static VkFormatFeatureFlags -image_format_features(struct v3dv_physical_device *pdevice, - VkFormat vk_format, - const struct v3dv_format *v3dv_format, - VkImageTiling tiling) +static VkFormatFeatureFlags2 +image_format_plane_features(struct v3dv_physical_device *pdevice, + VkFormat vk_format, + const struct v3dv_format_plane *v3dv_format, + VkImageTiling tiling) { - if (!v3dv_format || !v3dv_format->supported) - return 0; - const VkImageAspectFlags aspects = vk_format_aspects(vk_format); const VkImageAspectFlags zs_aspects = VK_IMAGE_ASPECT_DEPTH_BIT | @@ -114,7 +140,7 @@ image_format_features(struct v3dv_physical_device *pdevice, return 0; } - VkFormatFeatureFlags flags = 0; + VkFormatFeatureFlags2 flags = 0; /* Raster format is only supported for 1D textures, so let's just * always require optimal tiling for anything that requires sampling. 
@@ -123,55 +149,127 @@ image_format_features(struct v3dv_physical_device *pdevice, */ if (v3dv_format->tex_type != TEXTURE_DATA_FORMAT_NO && tiling == VK_IMAGE_TILING_OPTIMAL) { - flags |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT | - VK_FORMAT_FEATURE_BLIT_SRC_BIT; + flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT | + VK_FORMAT_FEATURE_2_BLIT_SRC_BIT; - if (v3dv_format->supports_filtering) - flags |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT; } if (v3dv_format->rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO) { if (aspects & VK_IMAGE_ASPECT_COLOR_BIT) { - flags |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT | - VK_FORMAT_FEATURE_BLIT_DST_BIT; - if (v3dv_X(pdevice, format_supports_blending)(v3dv_format)) - flags |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT; + flags |= VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT | + VK_FORMAT_FEATURE_2_BLIT_DST_BIT; } else if (aspects & zs_aspects) { - flags |= VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT | - VK_FORMAT_FEATURE_BLIT_DST_BIT; + flags |= VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT | + VK_FORMAT_FEATURE_2_BLIT_DST_BIT; } } const struct util_format_description *desc = vk_format_description(vk_format); - assert(desc); - if (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && desc->is_array) { - flags |= VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT; - if (desc->nr_channels == 1 && vk_format_is_int(vk_format)) - flags |= VK_FORMAT_FEATURE_STORAGE_IMAGE_ATOMIC_BIT; - } else if (vk_format == VK_FORMAT_A2B10G10R10_UNORM_PACK32 || - vk_format == VK_FORMAT_A2B10G10R10_UINT_PACK32 || - vk_format == VK_FORMAT_B10G11R11_UFLOAT_PACK32) { - /* To comply with shaderStorageImageExtendedFormats */ - flags |= VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT; + if (tiling != VK_IMAGE_TILING_LINEAR) { + if (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && desc->is_array) { + flags |= VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT | + VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT; + if (desc->nr_channels == 1 && vk_format_is_int(vk_format)) + flags |= VK_FORMAT_FEATURE_2_STORAGE_IMAGE_ATOMIC_BIT; + } else if (vk_format == VK_FORMAT_A2B10G10R10_UNORM_PACK32 || + vk_format == VK_FORMAT_A2R10G10B10_UNORM_PACK32 || + vk_format == VK_FORMAT_A2B10G10R10_UINT_PACK32 || + vk_format == VK_FORMAT_B10G11R11_UFLOAT_PACK32) { + /* To comply with shaderStorageImageExtendedFormats */ + flags |= VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT | + VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT; + } + } + + /* All our depth formats support shadow comparisons. 
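+    * (i.e. they can be sampled through a VkSampler with compareEnable set,
+    * which is what GLSL shadow samplers such as sampler2DShadow use).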
*/ + if (vk_format_has_depth(vk_format) && + (flags & VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT)) { + flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_DEPTH_COMPARISON_BIT; } if (flags) { - flags |= VK_FORMAT_FEATURE_TRANSFER_SRC_BIT | - VK_FORMAT_FEATURE_TRANSFER_DST_BIT; + flags |= VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT | + VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT; } return flags; } -static VkFormatFeatureFlags +static VkFormatFeatureFlags2 +image_format_features(struct v3dv_physical_device *pdevice, + VkFormat vk_format, + const struct v3dv_format *v3dv_format, + VkImageTiling tiling) +{ + if (!v3dv_format || !v3dv_format->plane_count) + return 0; + + VkFormatFeatureFlags2 flags = ~0ull; + for (uint8_t plane = 0; + flags && plane < v3dv_format->plane_count; + plane++) { + VkFormat plane_format = vk_format_get_plane_format(vk_format, plane); + + flags &= image_format_plane_features(pdevice, + plane_format, + &v3dv_format->planes[plane], + tiling); + } + + const struct vk_format_ycbcr_info *ycbcr_info = + vk_format_get_ycbcr_info(vk_format); + + if (ycbcr_info) { + assert(v3dv_format->plane_count == ycbcr_info->n_planes); + + flags |= VK_FORMAT_FEATURE_2_DISJOINT_BIT; + + if (flags & VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT) { + flags |= VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT; + for (unsigned p = 0; p < ycbcr_info->n_planes; p++) { + if (ycbcr_info->planes[p].denominator_scales[0] > 1 || + ycbcr_info->planes[p].denominator_scales[1] > 1) { + flags |= VK_FORMAT_FEATURE_COSITED_CHROMA_SAMPLES_BIT; + break; + } + } + } + + /* FIXME: in the future we should be able to support BLIT_SRC via the + * blit_shader path + */ + const VkFormatFeatureFlags2 disallowed_ycbcr_image_features = + VK_FORMAT_FEATURE_2_BLIT_SRC_BIT | + VK_FORMAT_FEATURE_2_BLIT_DST_BIT | + VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT | + VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BLEND_BIT | + VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT; + + flags &= ~disallowed_ycbcr_image_features; + } + + if (flags & VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT && + v3dv_format->supports_filtering) { + flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT; + } + + if (flags & VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT && + v3dv_X(pdevice, format_supports_blending)(v3dv_format)) { + flags |= VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BLEND_BIT; + } + + return flags; +} + +static VkFormatFeatureFlags2 buffer_format_features(VkFormat vk_format, const struct v3dv_format *v3dv_format) { - if (!v3dv_format || !v3dv_format->supported) + if (!v3dv_format) return 0; - if (!v3dv_format->supported) + if (v3dv_format->plane_count != 1) return 0; /* We probably only want to support buffer formats that have a @@ -182,32 +280,39 @@ buffer_format_features(VkFormat vk_format, const struct v3dv_format *v3dv_format const struct util_format_description *desc = vk_format_description(vk_format); - assert(desc); - VkFormatFeatureFlags flags = 0; + VkFormatFeatureFlags2 flags = 0; if (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB && desc->is_array) { - flags |= VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT; - if (v3dv_format->tex_type != TEXTURE_DATA_FORMAT_NO) { - flags |= VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT | - VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_BIT; + flags |= VK_FORMAT_FEATURE_2_VERTEX_BUFFER_BIT; + if (v3dv_format->planes[0].tex_type != TEXTURE_DATA_FORMAT_NO) { + /* STORAGE_READ_WITHOUT_FORMAT can also be applied for buffers. 
From spec: + * "VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT specifies + * that image views or buffer views created with this format can + * be used as storage images for read operations without + * specifying a format." + */ + flags |= VK_FORMAT_FEATURE_2_UNIFORM_TEXEL_BUFFER_BIT | + VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_BIT | + VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT; } - } else if (vk_format == VK_FORMAT_A2B10G10R10_UNORM_PACK32) { - flags |= VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT | - VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT | - VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_BIT; + } else if (vk_format == VK_FORMAT_A2B10G10R10_UNORM_PACK32 || + vk_format == VK_FORMAT_A2R10G10B10_UNORM_PACK32) { + flags |= VK_FORMAT_FEATURE_2_VERTEX_BUFFER_BIT | + VK_FORMAT_FEATURE_2_UNIFORM_TEXEL_BUFFER_BIT | + VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_BIT; } else if (vk_format == VK_FORMAT_A2B10G10R10_UINT_PACK32 || vk_format == VK_FORMAT_B10G11R11_UFLOAT_PACK32) { - flags |= VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT | - VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_BIT; + flags |= VK_FORMAT_FEATURE_2_UNIFORM_TEXEL_BUFFER_BIT | + VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_BIT; } if (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && desc->is_array && desc->nr_channels == 1 && vk_format_is_int(vk_format)) { - flags |= VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_ATOMIC_BIT; + flags |= VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_ATOMIC_BIT; } return flags; @@ -216,48 +321,44 @@ buffer_format_features(VkFormat vk_format, const struct v3dv_format *v3dv_format bool v3dv_buffer_format_supports_features(struct v3dv_device *device, VkFormat vk_format, - VkFormatFeatureFlags features) + VkFormatFeatureFlags2 features) { const struct v3dv_format *v3dv_format = v3dv_X(device, get_format)(vk_format); - const VkFormatFeatureFlags supported = + const VkFormatFeatureFlags2 supported = buffer_format_features(vk_format, v3dv_format); return (supported & features) == features; } VKAPI_ATTR void VKAPI_CALL -v3dv_GetPhysicalDeviceFormatProperties(VkPhysicalDevice physicalDevice, - VkFormat format, - VkFormatProperties* pFormatProperties) +v3dv_GetPhysicalDeviceFormatProperties2(VkPhysicalDevice physicalDevice, + VkFormat format, + VkFormatProperties2 *pFormatProperties) { V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physicalDevice); const struct v3dv_format *v3dv_format = v3dv_X(pdevice, get_format)(format); - *pFormatProperties = (VkFormatProperties) { - .linearTilingFeatures = - image_format_features(pdevice, format, v3dv_format, VK_IMAGE_TILING_LINEAR), - .optimalTilingFeatures = - image_format_features(pdevice, format, v3dv_format, VK_IMAGE_TILING_OPTIMAL), - .bufferFeatures = - buffer_format_features(format, v3dv_format), + VkFormatFeatureFlags2 linear2, optimal2, buffer2; + linear2 = image_format_features(pdevice, format, v3dv_format, + VK_IMAGE_TILING_LINEAR); + optimal2 = image_format_features(pdevice, format, v3dv_format, + VK_IMAGE_TILING_OPTIMAL); + buffer2 = buffer_format_features(format, v3dv_format); + pFormatProperties->formatProperties = (VkFormatProperties) { + .linearTilingFeatures = vk_format_features2_to_features(linear2), + .optimalTilingFeatures = vk_format_features2_to_features(optimal2), + .bufferFeatures = vk_format_features2_to_features(buffer2), }; -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_GetPhysicalDeviceFormatProperties2(VkPhysicalDevice physicalDevice, - VkFormat format, - VkFormatProperties2 *pFormatProperties) -{ - v3dv_GetPhysicalDeviceFormatProperties(physicalDevice, format, - 
&pFormatProperties->formatProperties); vk_foreach_struct(ext, pFormatProperties->pNext) { switch ((unsigned)ext->sType) { case VK_STRUCTURE_TYPE_DRM_FORMAT_MODIFIER_PROPERTIES_LIST_EXT: { struct VkDrmFormatModifierPropertiesListEXT *list = (void *)ext; - VK_OUTARRAY_MAKE(out, list->pDrmFormatModifierProperties, - &list->drmFormatModifierCount); + VK_OUTARRAY_MAKE_TYPED(VkDrmFormatModifierPropertiesEXT, out, + list->pDrmFormatModifierProperties, + &list->drmFormatModifierCount); if (pFormatProperties->formatProperties.linearTilingFeatures) { - vk_outarray_append(&out, mod_props) { + vk_outarray_append_typed(VkDrmFormatModifierPropertiesEXT, + &out, mod_props) { mod_props->drmFormatModifier = DRM_FORMAT_MOD_LINEAR; mod_props->drmFormatModifierPlaneCount = 1; mod_props->drmFormatModifierTilingFeatures = @@ -265,7 +366,8 @@ v3dv_GetPhysicalDeviceFormatProperties2(VkPhysicalDevice physicalDevice, } } if (pFormatProperties->formatProperties.optimalTilingFeatures) { - vk_outarray_append(&out, mod_props) { + vk_outarray_append_typed(VkDrmFormatModifierPropertiesEXT, + &out, mod_props) { mod_props->drmFormatModifier = DRM_FORMAT_MOD_BROADCOM_UIF; mod_props->drmFormatModifierPlaneCount = 1; mod_props->drmFormatModifierTilingFeatures = @@ -274,6 +376,36 @@ v3dv_GetPhysicalDeviceFormatProperties2(VkPhysicalDevice physicalDevice, } break; } + case VK_STRUCTURE_TYPE_DRM_FORMAT_MODIFIER_PROPERTIES_LIST_2_EXT: { + struct VkDrmFormatModifierPropertiesList2EXT *list = (void *)ext; + VK_OUTARRAY_MAKE_TYPED(VkDrmFormatModifierProperties2EXT, out, + list->pDrmFormatModifierProperties, + &list->drmFormatModifierCount); + if (linear2) { + vk_outarray_append_typed(VkDrmFormatModifierProperties2EXT, + &out, mod_props) { + mod_props->drmFormatModifier = DRM_FORMAT_MOD_LINEAR; + mod_props->drmFormatModifierPlaneCount = 1; + mod_props->drmFormatModifierTilingFeatures = linear2; + } + } + if (optimal2) { + vk_outarray_append_typed(VkDrmFormatModifierProperties2EXT, + &out, mod_props) { + mod_props->drmFormatModifier = DRM_FORMAT_MOD_BROADCOM_UIF; + mod_props->drmFormatModifierPlaneCount = 1; + mod_props->drmFormatModifierTilingFeatures = optimal2; + } + } + break; + } + case VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_3: { + VkFormatProperties3 *props = (VkFormatProperties3 *)ext; + props->linearTilingFeatures = linear2; + props->optimalTilingFeatures = optimal2; + props->bufferFeatures = buffer2; + break; + } default: v3dv_debug_ignored_stype(ext->sType); break; @@ -290,7 +422,7 @@ get_image_format_properties( VkSamplerYcbcrConversionImageFormatProperties *pYcbcrImageFormatProperties) { const struct v3dv_format *v3dv_format = v3dv_X(physical_device, get_format)(info->format); - VkFormatFeatureFlags format_feature_flags = + VkFormatFeatureFlags2 format_feature_flags = image_format_features(physical_device, info->format, v3dv_format, tiling); if (!format_feature_flags) goto unsupported; @@ -307,8 +439,24 @@ get_image_format_properties( if (info->flags & VK_IMAGE_CREATE_BLOCK_TEXEL_VIEW_COMPATIBLE_BIT) goto unsupported; - if (info->usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT) { - if (!(format_feature_flags & VK_FORMAT_FEATURE_TRANSFER_SRC_BIT)) { + const VkImageStencilUsageCreateInfo *stencil_usage_info = + vk_find_struct_const(info->pNext, IMAGE_STENCIL_USAGE_CREATE_INFO); + + VkImageUsageFlags image_usage = + info->usage | (stencil_usage_info ? 
      stencil_usage_info->stencilUsage : 0);
+
+   /* If VK_IMAGE_CREATE_EXTENDED_USAGE_BIT is set it means the usage flags may
+    * not be supported for the image format but are supported for at least
+    * one compatible format from which an image view can be created for the
+    * image. This means we should not report the format as unsupported based
+    * on the usage flags when usage refers to how an image view may be used
+    * (i.e. as a framebuffer attachment, for sampling, etc).
+    */
+   VkImageUsageFlags view_usage =
+      info->flags & VK_IMAGE_CREATE_EXTENDED_USAGE_BIT ? 0 : image_usage;
+
+   if (image_usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT) {
+      if (!(format_feature_flags & VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT)) {
         goto unsupported;
      }
@@ -323,16 +471,16 @@ get_image_format_properties(
      }
   }

-   if (info->usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT) {
-      if (!(format_feature_flags & VK_FORMAT_FEATURE_TRANSFER_DST_BIT)) {
+   if (image_usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT) {
+      if (!(format_feature_flags & VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT)) {
         goto unsupported;
      }
   }

-   if (info->usage & VK_IMAGE_USAGE_SAMPLED_BIT) {
-      if (!(format_feature_flags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT)) {
+   if (view_usage & (VK_IMAGE_USAGE_SAMPLED_BIT |
+                     VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)) {
+      if (!(format_feature_flags & VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT))
         goto unsupported;
-      }

      /* Sampling of raster depth/stencil images is not supported. Since 1D
       * images are always raster, even if the user requested optimal tiling,
@@ -344,50 +492,47 @@ get_image_format_properties(
      }
   }

-   if (info->usage & VK_IMAGE_USAGE_STORAGE_BIT) {
-      if (!(format_feature_flags & VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT)) {
+   if (view_usage & VK_IMAGE_USAGE_STORAGE_BIT) {
+      if (!(format_feature_flags & VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT)) {
         goto unsupported;
      }
   }

-   if (info->usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) {
-      if (!(format_feature_flags & VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT)) {
+   if (view_usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) {
+      if (!(format_feature_flags & VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT)) {
         goto unsupported;
      }
   }

-   if (info->usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) {
+   if (view_usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) {
      if (!(format_feature_flags &
-           VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT)) {
+           VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT)) {
         goto unsupported;
      }
   }

-   /* FIXME: these are taken from VkPhysicalDeviceLimits, we should just put
-    * these limits available in the physical device and read them from there
-    * wherever we need them.
- */
   switch (info->type) {
   case VK_IMAGE_TYPE_1D:
-      pImageFormatProperties->maxExtent.width = 4096;
+      pImageFormatProperties->maxExtent.width = V3D_MAX_IMAGE_DIMENSION;
      pImageFormatProperties->maxExtent.height = 1;
      pImageFormatProperties->maxExtent.depth = 1;
-      pImageFormatProperties->maxArrayLayers = 2048;
-      pImageFormatProperties->maxMipLevels = 13; /* log2(maxWidth) + 1 */
+      pImageFormatProperties->maxArrayLayers = V3D_MAX_ARRAY_LAYERS;
+      pImageFormatProperties->maxMipLevels = V3D_MAX_MIP_LEVELS;
      break;
   case VK_IMAGE_TYPE_2D:
-      pImageFormatProperties->maxExtent.width = 4096;
-      pImageFormatProperties->maxExtent.height = 4096;
+      pImageFormatProperties->maxExtent.width = V3D_MAX_IMAGE_DIMENSION;
+      pImageFormatProperties->maxExtent.height = V3D_MAX_IMAGE_DIMENSION;
      pImageFormatProperties->maxExtent.depth = 1;
-      pImageFormatProperties->maxArrayLayers = 2048;
-      pImageFormatProperties->maxMipLevels = 13; /* log2(maxWidth) + 1 */
+      pImageFormatProperties->maxArrayLayers =
+         v3dv_format->plane_count == 1 ? V3D_MAX_ARRAY_LAYERS : 1;
+      pImageFormatProperties->maxMipLevels = V3D_MAX_MIP_LEVELS;
      break;
   case VK_IMAGE_TYPE_3D:
-      pImageFormatProperties->maxExtent.width = 4096;
-      pImageFormatProperties->maxExtent.height = 4096;
-      pImageFormatProperties->maxExtent.depth = 4096;
+      pImageFormatProperties->maxExtent.width = V3D_MAX_IMAGE_DIMENSION;
+      pImageFormatProperties->maxExtent.height = V3D_MAX_IMAGE_DIMENSION;
+      pImageFormatProperties->maxExtent.depth = V3D_MAX_IMAGE_DIMENSION;
      pImageFormatProperties->maxArrayLayers = 1;
-      pImageFormatProperties->maxMipLevels = 13; /* log2(maxWidth) + 1 */
+      pImageFormatProperties->maxMipLevels = V3D_MAX_MIP_LEVELS;
      break;
   default:
      unreachable("bad VkImageType");
@@ -416,16 +561,50 @@ get_image_format_properties(
   if (tiling != VK_IMAGE_TILING_LINEAR &&
       info->type == VK_IMAGE_TYPE_2D &&
       !(info->flags & VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT) &&
-      (format_feature_flags & VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT ||
-       format_feature_flags & VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT)) {
+      (format_feature_flags & VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT ||
+       format_feature_flags & VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT)) {
      pImageFormatProperties->sampleCounts |= VK_SAMPLE_COUNT_4_BIT;
   }

   if (tiling == VK_IMAGE_TILING_LINEAR)
      pImageFormatProperties->maxMipLevels = 1;

+   /* From the Vulkan 1.2 spec, section 12.3. Images, VkImageCreateInfo structure:
+    *
+    *   "Images created with one of the formats that require a sampler Y′CBCR
+    *    conversion, have further restrictions on their limits and
+    *    capabilities compared to images created with other formats. Creation
+    *    of images with a format requiring Y′CBCR conversion may not be
+    *    supported unless other parameters meet all of the constraints:
+    *
+    *    * imageType is VK_IMAGE_TYPE_2D
+    *    * mipLevels is 1
+    *    * arrayLayers is 1, unless the ycbcrImageArrays feature is enabled, or
+    *      otherwise indicated by VkImageFormatProperties::maxArrayLayers, as
+    *      returned by vkGetPhysicalDeviceImageFormatProperties
+    *    * samples is VK_SAMPLE_COUNT_1_BIT
+    *
+    *    Implementations may support additional limits and capabilities beyond
+    *    those listed above."
+    *
+    * We don't support any additional limits or capabilities beyond those
+    * listed, so we apply the restrictions above, or just return unsupported.
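+    *
+    * In practice this reduces, for any multi-planar format, to:
+    *
+    *    maxMipLevels   = 1;
+    *    maxArrayLayers = 1;
+    *    sampleCounts   = VK_SAMPLE_COUNT_1_BIT;
+    *
+    * with image types other than VK_IMAGE_TYPE_2D rejected outright.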
+ */ + if (vk_format_get_plane_count(info->format) > 1) { + if (info->type != VK_IMAGE_TYPE_2D) + goto unsupported; + pImageFormatProperties->maxMipLevels = 1; + pImageFormatProperties->maxArrayLayers = 1; + pImageFormatProperties->sampleCounts = VK_SAMPLE_COUNT_1_BIT; + } + pImageFormatProperties->maxResourceSize = 0xffffffff; /* 32-bit allocation */ + if (pYcbcrImageFormatProperties) { + pYcbcrImageFormatProperties->combinedImageSamplerDescriptorCount = + vk_format_get_plane_count(info->format); + } + return VK_SUCCESS; unsupported: @@ -486,6 +665,8 @@ v3dv_GetPhysicalDeviceImageFormatProperties2(VkPhysicalDevice physicalDevice, const VkPhysicalDeviceExternalImageFormatInfo *external_info = NULL; const VkPhysicalDeviceImageDrmFormatModifierInfoEXT *drm_format_mod_info = NULL; VkExternalImageFormatProperties *external_props = NULL; + UNUSED VkAndroidHardwareBufferUsageANDROID *android_usage = NULL; + VkSamplerYcbcrConversionImageFormatProperties *ycbcr_props = NULL; VkImageTiling tiling = base_info->tiling; /* Extract input structs */ @@ -494,6 +675,9 @@ v3dv_GetPhysicalDeviceImageFormatProperties2(VkPhysicalDevice physicalDevice, case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO: external_info = (const void *) s; break; + case VK_STRUCTURE_TYPE_IMAGE_STENCIL_USAGE_CREATE_INFO: + /* Do nothing, get_image_format_properties() below will handle it */; + break; case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT: drm_format_mod_info = (const void *) s; switch (drm_format_mod_info->drmFormatModifier) { @@ -522,6 +706,12 @@ v3dv_GetPhysicalDeviceImageFormatProperties2(VkPhysicalDevice physicalDevice, case VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES: external_props = (void *) s; break; + case VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_USAGE_ANDROID: + android_usage = (void *)s; + break; + case VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_IMAGE_FORMAT_PROPERTIES: + ycbcr_props = (void *) s; + break; default: v3dv_debug_ignored_stype(s->sType); break; @@ -530,7 +720,8 @@ v3dv_GetPhysicalDeviceImageFormatProperties2(VkPhysicalDevice physicalDevice, VkResult result = get_image_format_properties(physical_device, base_info, tiling, - &base_props->imageFormatProperties, NULL); + &base_props->imageFormatProperties, + ycbcr_props); if (result != VK_SUCCESS) goto done; @@ -541,12 +732,28 @@ v3dv_GetPhysicalDeviceImageFormatProperties2(VkPhysicalDevice physicalDevice, if (external_props) external_props->externalMemoryProperties = prime_fd_props; break; +#if DETECT_OS_ANDROID + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID: + if (external_props) { + external_props->externalMemoryProperties.exportFromImportedHandleTypes = 0; + external_props->externalMemoryProperties.compatibleHandleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID; + external_props->externalMemoryProperties.externalMemoryFeatures = VK_EXTERNAL_MEMORY_FEATURE_DEDICATED_ONLY_BIT | VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT | VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT; + } + break; +#endif default: result = VK_ERROR_FORMAT_NOT_SUPPORTED; break; } } + if (android_usage) { +#if DETECT_OS_ANDROID + android_usage->androidHardwareBufferUsage = + vk_image_usage_to_ahb_usage(base_info->flags, base_info->usage); +#endif + } + done: return result; } diff --git a/src/broadcom/vulkan/v3dv_image.c b/src/broadcom/vulkan/v3dv_image.c index c7ae05c4c22..358c03c555f 100644 --- a/src/broadcom/vulkan/v3dv_image.c +++ b/src/broadcom/vulkan/v3dv_image.c @@ -1,5 +1,5 @@ /* - 
* Copyright © 2019 Raspberry Pi + * Copyright © 2019 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -26,9 +26,11 @@ #include "drm-uapi/drm_fourcc.h" #include "util/format/u_format.h" #include "util/u_math.h" -#include "vk_format_info.h" #include "vk_util.h" #include "vulkan/wsi/wsi_common.h" +#if DETECT_OS_ANDROID +#include "vk_android.h" +#endif /** * Computes the HW's UIFblock padding for a given height/cpp. @@ -71,32 +73,61 @@ v3d_get_ub_pad(uint32_t cpp, uint32_t height) return 0; } -static void -v3d_setup_slices(struct v3dv_image *image) +/** + * Computes the dimension with required padding for mip levels. + * + * This padding is required for width and height dimensions when the mip + * level is greater than 1, and for the depth dimension when the mip level + * is greater than 0. This function expects to be passed a mip level >= 1. + * + * Note: Hardware documentation seems to suggest that the third argument + * should be the utile dimensions, but through testing it was found that + * the block dimension should be used instead. + */ +static uint32_t +v3d_get_dimension_mpad(uint32_t dimension, uint32_t level, uint32_t block_dimension) { - assert(image->cpp > 0); + assert(level >= 1); + uint32_t pot_dim = u_minify(dimension, 1); + pot_dim = util_next_power_of_two(DIV_ROUND_UP(pot_dim, block_dimension)); + uint32_t padded_dim = block_dimension * pot_dim; + return u_minify(padded_dim, level - 1); +} - uint32_t width = image->vk.extent.width; - uint32_t height = image->vk.extent.height; - uint32_t depth = image->vk.extent.depth; +static bool +v3d_setup_plane_slices(struct v3dv_image *image, uint8_t plane, + uint32_t plane_offset, + const VkSubresourceLayout *plane_layouts) +{ + assert(image->planes[plane].cpp > 0); - /* Note that power-of-two padding is based on level 1. These are not - * equivalent to just util_next_power_of_two(dimension), because at a - * level 0 dimension of 9, the level 1 power-of-two padded value is 4, - * not 8. - */ - uint32_t pot_width = 2 * util_next_power_of_two(u_minify(width, 1)); - uint32_t pot_height = 2 * util_next_power_of_two(u_minify(height, 1)); - uint32_t pot_depth = 2 * util_next_power_of_two(u_minify(depth, 1)); + uint32_t width = image->planes[plane].width; + uint32_t height = image->planes[plane].height; + uint32_t depth = image->vk.extent.depth; - uint32_t utile_w = v3d_utile_width(image->cpp); - uint32_t utile_h = v3d_utile_height(image->cpp); + uint32_t utile_w = v3d_utile_width(image->planes[plane].cpp); + uint32_t utile_h = v3d_utile_height(image->planes[plane].cpp); uint32_t uif_block_w = utile_w * 2; uint32_t uif_block_h = utile_h * 2; uint32_t block_width = vk_format_get_blockwidth(image->vk.format); uint32_t block_height = vk_format_get_blockheight(image->vk.format); + /* Note that power-of-two padding is based on level 1. These are not + * equivalent to just util_next_power_of_two(dimension), because at a + * level 0 dimension of 9, the level 1 power-of-two padded value is 4, + * not 8. Additionally the pot padding is based on the block size. 
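+ *
+ * Worked example, assuming a block dimension of 1: for a level 0 width of
+ * 9, u_minify(9, 1) = 4 and util_next_power_of_two(4) = 4, so the level 1
+ * padded width is 4. Callers then use 2 * 4 = 8 as the level 0 basis,
+ * which minifies back to 4 at level 1, whereas a naive
+ * util_next_power_of_two(9) = 16 basis would have given 8 instead.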
+ */ + uint32_t pot_width = 2 * v3d_get_dimension_mpad(width, + 1, + block_width); + uint32_t pot_height = 2 * v3d_get_dimension_mpad(height, + 1, + block_height); + uint32_t pot_depth = 2 * v3d_get_dimension_mpad(depth, + 1, + 1); + assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT || image->vk.samples == VK_SAMPLE_COUNT_4_BIT); bool msaa = image->vk.samples != VK_SAMPLE_COUNT_1_BIT; @@ -107,14 +138,30 @@ v3d_setup_slices(struct v3dv_image *image) assert(depth > 0); assert(image->vk.mip_levels >= 1); - uint32_t offset = 0; + /* Texture Base Address needs to be 64-byte aligned. If we have an explicit + * plane layout we will return false to fail image creation with appropriate + * error code. + */ + uint32_t offset; + if (plane_layouts) { + offset = plane_layouts[plane].offset; + if (offset % 64 != 0) + return false; + } else { + offset = plane_offset; + } + assert(plane_offset % 64 == 0); + for (int32_t i = image->vk.mip_levels - 1; i >= 0; i--) { - struct v3d_resource_slice *slice = &image->slices[i]; + struct v3d_resource_slice *slice = &image->planes[plane].slices[i]; + + slice->width = u_minify(width, i); + slice->height = u_minify(height, i); uint32_t level_width, level_height, level_depth; if (i < 2) { - level_width = u_minify(width, i); - level_height = u_minify(height, i); + level_width = slice->width; + level_height = slice->height; } else { level_width = u_minify(pot_width, i); level_height = u_minify(pot_height, i); @@ -136,7 +183,7 @@ v3d_setup_slices(struct v3dv_image *image) if (!image->tiled) { slice->tiling = V3D_TILING_RASTER; if (image->vk.image_type == VK_IMAGE_TYPE_1D) - level_width = align(level_width, 64 / image->cpp); + level_width = align(level_width, 64 / image->planes[plane].cpp); } else { if ((i != 0 || !uif_top) && (level_width <= utile_w || level_height <= utile_h)) { @@ -158,7 +205,8 @@ v3d_setup_slices(struct v3dv_image *image) level_width = align(level_width, 4 * uif_block_w); level_height = align(level_height, uif_block_h); - slice->ub_pad = v3d_get_ub_pad(image->cpp, level_height); + slice->ub_pad = v3d_get_ub_pad(image->planes[plane].cpp, + level_height); level_height += slice->ub_pad * uif_block_h; /* If the padding set us to to be aligned to the page cache size, @@ -175,12 +223,25 @@ v3d_setup_slices(struct v3dv_image *image) } slice->offset = offset; - slice->stride = level_width * image->cpp; + slice->stride = level_width * image->planes[plane].cpp; + + /* We assume that rowPitch in the plane layout refers to level 0 */ + if (plane_layouts && i == 0) { + if (plane_layouts[plane].rowPitch < slice->stride) + return false; + if (plane_layouts[plane].rowPitch % image->planes[plane].cpp) + return false; + if (image->tiled && (plane_layouts[plane].rowPitch % (4 * uif_block_w))) + return false; + slice->stride = plane_layouts[plane].rowPitch; + } + slice->padded_height = level_height; if (slice->tiling == V3D_TILING_UIF_NO_XOR || slice->tiling == V3D_TILING_UIF_XOR) { slice->padded_height_of_output_image_in_uif_blocks = - slice->padded_height / (2 * v3d_utile_height(image->cpp)); + slice->padded_height / + (2 * v3d_utile_height(image->planes[plane].cpp)); } slice->size = level_height * slice->stride; @@ -188,7 +249,7 @@ v3d_setup_slices(struct v3dv_image *image) /* The HW aligns level 1's base to a page if any of level 1 or * below could be UIF XOR. The lower levels then inherit the - * alignment for as long as necesary, thanks to being power of + * alignment for as long as necessary, thanks to being power of * two aligned. 
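+    * (once level 1 starts on a page boundary, the smaller levels below it
+    * keep a compatible alignment for free, since their power-of-two padded
+    * sizes only ever shift offsets by aligned amounts).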
*/ if (i == 1 && @@ -200,7 +261,7 @@ v3d_setup_slices(struct v3dv_image *image) offset += slice_total_size; } - image->size = offset; + image->planes[plane].size = offset - plane_offset; /* UIF/UBLINEAR levels need to be aligned to UIF-blocks, and LT only * needs to be aligned to utile boundaries. Since tiles are laid out @@ -209,14 +270,27 @@ v3d_setup_slices(struct v3dv_image *image) * slices. * * We additionally align to 4k, which improves UIF XOR performance. + * + * Finally, because the Texture Base Address field must be 64-byte aligned, + * we also need to align linear images to 64 if the image is going to be + * used for transfer. */ - image->alignment = image->tiled ? 4096 : image->cpp; + if (image->tiled) { + image->planes[plane].alignment = 4096; + } else { + image->planes[plane].alignment = + (image->vk.usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT) ? + 64 : image->planes[plane].cpp; + } + uint32_t align_offset = - align(image->slices[0].offset, image->alignment) - image->slices[0].offset; + align(image->planes[plane].slices[0].offset, + image->planes[plane].alignment) - + image->planes[plane].slices[0].offset; if (align_offset) { - image->size += align_offset; + image->planes[plane].size += align_offset; for (int i = 0; i < image->vk.mip_levels; i++) - image->slices[i].offset += align_offset; + image->planes[plane].slices[i].offset += align_offset; } /* Arrays and cube textures have a stride which is the distance from @@ -224,41 +298,112 @@ v3d_setup_slices(struct v3dv_image *image) * we need to program the stride between slices of miplevel 0. */ if (image->vk.image_type != VK_IMAGE_TYPE_3D) { - image->cube_map_stride = - align(image->slices[0].offset + image->slices[0].size, 64); - image->size += image->cube_map_stride * (image->vk.array_layers - 1); + image->planes[plane].cube_map_stride = + align(image->planes[plane].slices[0].offset + + image->planes[plane].slices[0].size, 64); + + if (plane_layouts && image->vk.array_layers > 1) { + if (plane_layouts[plane].arrayPitch % 64 != 0) + return false; + if (plane_layouts[plane].arrayPitch < + image->planes[plane].cube_map_stride) { + return false; + } + image->planes[plane].cube_map_stride = plane_layouts[plane].arrayPitch; + } + + image->planes[plane].size += image->planes[plane].cube_map_stride * + (image->vk.array_layers - 1); } else { - image->cube_map_stride = image->slices[0].size; + image->planes[plane].cube_map_stride = image->planes[plane].slices[0].size; + if (plane_layouts) { + /* We assume that depthPitch in the plane layout refers to level 0 */ + if (plane_layouts[plane].depthPitch != + image->planes[plane].slices[0].size) { + return false; + } + } + } + + return true; +} + +static VkResult +v3d_setup_slices(struct v3dv_image *image, bool disjoint, + const VkSubresourceLayout *plane_layouts) +{ + if (disjoint && image->plane_count == 1) + disjoint = false; + + uint64_t offset = 0; + for (uint8_t plane = 0; plane < image->plane_count; plane++) { + offset = disjoint ? 0 : offset; + if (!v3d_setup_plane_slices(image, plane, offset, plane_layouts)) { + assert(plane_layouts); + return VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT; + } + offset += align64(image->planes[plane].size, 64); } + + /* From the Vulkan spec: + * + * "If the size of the resultant image would exceed maxResourceSize, then + * vkCreateImage must fail and return VK_ERROR_OUT_OF_DEVICE_MEMORY. This + * failure may occur even when all image creation parameters satisfy their + * valid usage requirements." 
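+    *
+    * Our maxResourceSize is 0xffffffff (32-bit allocations), which is
+    * exactly what the offset check below enforces.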
+ */ + if (offset > 0xffffffff) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + + image->non_disjoint_size = disjoint ? 0 : offset; + return VK_SUCCESS; } uint32_t -v3dv_layer_offset(const struct v3dv_image *image, uint32_t level, uint32_t layer) +v3dv_layer_offset(const struct v3dv_image *image, uint32_t level, uint32_t layer, + uint8_t plane) { - const struct v3d_resource_slice *slice = &image->slices[level]; + const struct v3d_resource_slice *slice = &image->planes[plane].slices[level]; if (image->vk.image_type == VK_IMAGE_TYPE_3D) - return image->mem_offset + slice->offset + layer * slice->size; + return image->planes[plane].mem_offset + slice->offset + layer * slice->size; else - return image->mem_offset + slice->offset + layer * image->cube_map_stride; + return image->planes[plane].mem_offset + slice->offset + + layer * image->planes[plane].cube_map_stride; } -static VkResult -create_image(struct v3dv_device *device, - const VkImageCreateInfo *pCreateInfo, - const VkAllocationCallbacks *pAllocator, - VkImage *pImage) +VkResult +v3dv_update_image_layout(struct v3dv_device *device, + struct v3dv_image *image, + uint64_t modifier, + bool disjoint, + const VkImageDrmFormatModifierExplicitCreateInfoEXT *explicit_mod_info) { - struct v3dv_image *image = NULL; + assert(!explicit_mod_info || + image->plane_count == explicit_mod_info->drmFormatModifierPlaneCount); - image = vk_image_create(&device->vk, pCreateInfo, pAllocator, sizeof(*image)); - if (image == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + assert(!explicit_mod_info || + modifier == explicit_mod_info->drmFormatModifier); + + image->tiled = modifier != DRM_FORMAT_MOD_LINEAR; + + image->vk.drm_format_mod = modifier; + + return v3d_setup_slices(image, disjoint, + explicit_mod_info ? explicit_mod_info->pPlaneLayouts : + NULL); +} +VkResult +v3dv_image_init(struct v3dv_device *device, + const VkImageCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + struct v3dv_image *image) +{ /* When using the simulator the WSI common code will see that our * driver wsi device doesn't match the display device and because of that * it will not attempt to present directly from the swapchain images, - * instead it will use the prime blit path (use_prime_blit flag in + * instead it will use the prime blit path (use_buffer_blit flag in * struct wsi_swapchain), where it copies the contents of the swapchain * images to a linear buffer with appropriate row stride for presentation. 
* As a result, on that path, swapchain images do not have any special @@ -266,11 +411,20 @@ create_image(struct v3dv_device *device, */ VkImageTiling tiling = pCreateInfo->tiling; uint64_t modifier = DRM_FORMAT_MOD_INVALID; + const VkImageDrmFormatModifierListCreateInfoEXT *mod_info = NULL; + const VkImageDrmFormatModifierExplicitCreateInfoEXT *explicit_mod_info = NULL; +#if DETECT_OS_ANDROID + if (image->is_native_buffer_memory) { + assert(image->android_explicit_layout); + explicit_mod_info = image->android_explicit_layout; + modifier = explicit_mod_info->drmFormatModifier; + } +#endif if (tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) { - const VkImageDrmFormatModifierListCreateInfoEXT *mod_info = + mod_info = vk_find_struct_const(pCreateInfo->pNext, IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT); - const VkImageDrmFormatModifierExplicitCreateInfoEXT *explicit_mod_info = + explicit_mod_info = vk_find_struct_const(pCreateInfo->pNext, IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT); assert(mod_info || explicit_mod_info); @@ -297,21 +451,42 @@ create_image(struct v3dv_device *device, tiling = VK_IMAGE_TILING_LINEAR; } + if (modifier == DRM_FORMAT_MOD_INVALID) + modifier = (tiling == VK_IMAGE_TILING_OPTIMAL) ? DRM_FORMAT_MOD_BROADCOM_UIF + : DRM_FORMAT_MOD_LINEAR; + const struct v3dv_format *format = - v3dv_X(device, get_format)(pCreateInfo->format); - v3dv_assert(format != NULL && format->supported); + v3dv_X(device, get_format)(image->vk.format); + v3dv_assert(format != NULL && format->plane_count); assert(pCreateInfo->samples == VK_SAMPLE_COUNT_1_BIT || pCreateInfo->samples == VK_SAMPLE_COUNT_4_BIT); image->format = format; - image->cpp = vk_format_get_blocksize(image->vk.format); - image->tiled = tiling == VK_IMAGE_TILING_OPTIMAL || - (tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT && - modifier != DRM_FORMAT_MOD_LINEAR); - image->vk.tiling = tiling; - image->vk.drm_format_mod = modifier; + image->plane_count = vk_format_get_plane_count(image->vk.format); + + const struct vk_format_ycbcr_info *ycbcr_info = + vk_format_get_ycbcr_info(image->vk.format); + + for (uint8_t plane = 0; plane < image->plane_count; plane++) { + VkFormat plane_format = + vk_format_get_plane_format(image->vk.format, plane); + image->planes[plane].cpp = + vk_format_get_blocksize(plane_format); + image->planes[plane].vk_format = plane_format; + + image->planes[plane].width = image->vk.extent.width; + image->planes[plane].height = image->vk.extent.height; + + if (ycbcr_info) { + image->planes[plane].width /= + ycbcr_info->planes[plane].denominator_scales[0]; + + image->planes[plane].height /= + ycbcr_info->planes[plane].denominator_scales[1]; + } + } /* Our meta paths can create image views with compatible formats for any * image, so always set this flag to keep the common Vulkan image code @@ -319,11 +494,112 @@ create_image(struct v3dv_device *device, */ image->vk.create_flags |= VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT; - v3d_setup_slices(image); +#if DETECT_OS_ANDROID + /* At this time, an AHB handle is not yet provided. 
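+    * (the AHB is only known once the application imports it through
+    * VkImportAndroidHardwareBufferInfoANDROID at memory allocation time).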
+ * Image layout will be filled up during vkBindImageMemory2 + */ + if (image->is_ahb) + return VK_SUCCESS; +#endif + + bool disjoint = image->vk.create_flags & VK_IMAGE_CREATE_DISJOINT_BIT; + + return v3dv_update_image_layout(device, image, modifier, disjoint, + explicit_mod_info); +} + +static VkResult +create_image(struct v3dv_device *device, + const VkImageCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkImage *pImage) +{ + VkResult result; + struct v3dv_image *image = NULL; + + image = vk_image_create(&device->vk, pCreateInfo, pAllocator, sizeof(*image)); + if (image == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + +#if DETECT_OS_ANDROID + const VkExternalMemoryImageCreateInfo *external_info = + vk_find_struct_const(pCreateInfo->pNext, EXTERNAL_MEMORY_IMAGE_CREATE_INFO); + + const VkNativeBufferANDROID *native_buffer = + vk_find_struct_const(pCreateInfo->pNext, NATIVE_BUFFER_ANDROID); + + if (native_buffer != NULL) + image->is_native_buffer_memory = true; + + image->is_ahb = external_info && (external_info->handleTypes & + VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID); + + assert(!(image->is_ahb && image->is_native_buffer_memory)); + + if (image->is_ahb || image->is_native_buffer_memory) { + image->android_explicit_layout = vk_alloc2(&device->vk.alloc, pAllocator, + sizeof(VkImageDrmFormatModifierExplicitCreateInfoEXT), + 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!image->android_explicit_layout) { + result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail; + } + + image->android_plane_layouts = vk_alloc2(&device->vk.alloc, pAllocator, + sizeof(VkSubresourceLayout) * V3DV_MAX_PLANE_COUNT, + 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!image->android_plane_layouts) { + result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail; + } + } + + if (image->is_native_buffer_memory) { + struct u_gralloc_buffer_handle gr_handle = { + .handle = native_buffer->handle, + .hal_format = native_buffer->format, + .pixel_stride = native_buffer->stride, + }; + + result = v3dv_gralloc_to_drm_explicit_layout(device->gralloc, + &gr_handle, + image->android_explicit_layout, + image->android_plane_layouts, + V3DV_MAX_PLANE_COUNT); + if (result != VK_SUCCESS) + goto fail; + } +#endif + + result = v3dv_image_init(device, pCreateInfo, pAllocator, image); + if (result != VK_SUCCESS) + goto fail; + +#if DETECT_OS_ANDROID + if (image->is_native_buffer_memory) { + result = v3dv_import_native_buffer_fd(v3dv_device_to_handle(device), + native_buffer->handle->data[0], pAllocator, + v3dv_image_to_handle(image)); + if (result != VK_SUCCESS) + goto fail; + } +#endif *pImage = v3dv_image_to_handle(image); return VK_SUCCESS; + +fail: +#if DETECT_OS_ANDROID + if (image->android_explicit_layout) + vk_free2(&device->vk.alloc, pAllocator, image->android_explicit_layout); + if (image->android_plane_layouts) + vk_free2(&device->vk.alloc, pAllocator, image->android_plane_layouts); +#endif + + vk_image_destroy(&device->vk, pAllocator, &image->vk); + return result; } static VkResult @@ -381,8 +657,14 @@ v3dv_CreateImage(VkDevice _device, { V3DV_FROM_HANDLE(v3dv_device, device, _device); +#if DETECT_OS_ANDROID + /* VkImageSwapchainCreateInfoKHR is not useful at all */ + const VkImageSwapchainCreateInfoKHR *swapchain_info = NULL; +#else const VkImageSwapchainCreateInfoKHR *swapchain_info = vk_find_struct_const(pCreateInfo->pNext, IMAGE_SWAPCHAIN_CREATE_INFO_KHR); +#endif + if (swapchain_info && swapchain_info->swapchain != VK_NULL_HANDLE) return 
create_image_from_swapchain(device, pCreateInfo, swapchain_info, pAllocator, pImage); @@ -398,13 +680,30 @@ v3dv_GetImageSubresourceLayout(VkDevice device, { V3DV_FROM_HANDLE(v3dv_image, image, _image); + uint8_t plane = v3dv_plane_from_aspect(subresource->aspectMask); const struct v3d_resource_slice *slice = - &image->slices[subresource->mipLevel]; + &image->planes[plane].slices[subresource->mipLevel]; + + /* About why the offset below works for both disjoint and non-disjoint + * cases, from the Vulkan spec: + * + * "If the image is disjoint, then the offset is relative to the base + * address of the plane." + * + * "If the image is non-disjoint, then the offset is relative to the base + * address of the image." + * + * In our case, the per-plane mem_offset for non-disjoint images is the + * same for all planes and matches the base address of the image. + */ layout->offset = - v3dv_layer_offset(image, subresource->mipLevel, subresource->arrayLayer); + v3dv_layer_offset(image, subresource->mipLevel, subresource->arrayLayer, + plane) - image->planes[plane].mem_offset; layout->rowPitch = slice->stride; - layout->depthPitch = image->cube_map_stride; - layout->arrayPitch = image->cube_map_stride; + layout->depthPitch = image->vk.image_type == VK_IMAGE_TYPE_3D ? + image->planes[plane].cube_map_stride : 0; + layout->arrayPitch = image->vk.array_layers > 1 ? + image->planes[plane].cube_map_stride : 0; if (image->vk.image_type != VK_IMAGE_TYPE_3D) { layout->size = slice->size; @@ -419,7 +718,7 @@ v3dv_GetImageSubresourceLayout(VkDevice device, layout->size = slice->size * image->vk.extent.depth; } else { const struct v3d_resource_slice *prev_slice = - &image->slices[subresource->mipLevel - 1]; + &image->planes[plane].slices[subresource->mipLevel - 1]; layout->size = prev_slice->offset - slice->offset; } } @@ -436,6 +735,35 @@ v3dv_DestroyImage(VkDevice _device, if (image == NULL) return; + /* If we have created a shadow tiled image for this image we must also free + * it (along with its memory allocation). + */ + if (image->shadow) { + bool disjoint = image->vk.create_flags & VK_IMAGE_CREATE_DISJOINT_BIT; + for (int i = 0; i < (disjoint ? 
image->plane_count : 1); i++) { + if (image->shadow->planes[i].mem) { + v3dv_FreeMemory(_device, + v3dv_device_memory_to_handle(image->shadow->planes[i].mem), + pAllocator); + } + } + v3dv_DestroyImage(_device, v3dv_image_to_handle(image->shadow), + pAllocator); + image->shadow = NULL; + } + +#if DETECT_OS_ANDROID + if (image->is_native_buffer_memory) + v3dv_FreeMemory(_device, + v3dv_device_memory_to_handle(image->planes[0].mem), + pAllocator); + + if (image->android_explicit_layout) + vk_free2(&device->vk.alloc, pAllocator, image->android_explicit_layout); + if (image->android_plane_layouts) + vk_free2(&device->vk.alloc, pAllocator, image->android_plane_layouts); +#endif + vk_image_destroy(&device->vk, pAllocator, &image->vk); } @@ -451,96 +779,102 @@ v3dv_image_type_to_view_type(VkImageType type) } } -static enum pipe_swizzle -vk_component_mapping_to_pipe_swizzle(VkComponentSwizzle swz) -{ - assert(swz != VK_COMPONENT_SWIZZLE_IDENTITY); - - switch (swz) { - case VK_COMPONENT_SWIZZLE_ZERO: - return PIPE_SWIZZLE_0; - case VK_COMPONENT_SWIZZLE_ONE: - return PIPE_SWIZZLE_1; - case VK_COMPONENT_SWIZZLE_R: - return PIPE_SWIZZLE_X; - case VK_COMPONENT_SWIZZLE_G: - return PIPE_SWIZZLE_Y; - case VK_COMPONENT_SWIZZLE_B: - return PIPE_SWIZZLE_Z; - case VK_COMPONENT_SWIZZLE_A: - return PIPE_SWIZZLE_W; - default: - unreachable("Unknown VkComponentSwizzle"); - }; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_CreateImageView(VkDevice _device, - const VkImageViewCreateInfo *pCreateInfo, - const VkAllocationCallbacks *pAllocator, - VkImageView *pView) +static VkResult +create_image_view(struct v3dv_device *device, + bool driver_internal, + const VkImageViewCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkImageView *pView) { - V3DV_FROM_HANDLE(v3dv_device, device, _device); V3DV_FROM_HANDLE(v3dv_image, image, pCreateInfo->image); struct v3dv_image_view *iview; - iview = vk_image_view_create(&device->vk, pCreateInfo, pAllocator, - sizeof(*iview)); + iview = vk_image_view_create(&device->vk, driver_internal, pCreateInfo, + pAllocator, sizeof(*iview)); if (iview == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + const VkImageAspectFlagBits any_plane_aspect = + VK_IMAGE_ASPECT_PLANE_0_BIT | + VK_IMAGE_ASPECT_PLANE_1_BIT | + VK_IMAGE_ASPECT_PLANE_2_BIT; + + if (image->vk.aspects & any_plane_aspect) { + assert((image->vk.aspects & ~any_plane_aspect) == 0); + iview->plane_count = 0; + static const VkImageAspectFlagBits plane_aspects[]= { + VK_IMAGE_ASPECT_PLANE_0_BIT, + VK_IMAGE_ASPECT_PLANE_1_BIT, + VK_IMAGE_ASPECT_PLANE_2_BIT + }; + for (uint8_t plane = 0; plane < V3DV_MAX_PLANE_COUNT; plane++) { + if (iview->vk.aspects & plane_aspects[plane]) + iview->planes[iview->plane_count++].image_plane = plane; + } + } else { + iview->plane_count = 1; + iview->planes[0].image_plane = 0; + } + /* At this point we should have at least one plane */ + assert(iview->plane_count > 0); const VkImageSubresourceRange *range = &pCreateInfo->subresourceRange; - iview->offset = v3dv_layer_offset(image, iview->vk.base_mip_level, - iview->vk.base_array_layer); - /* If we have D24S8 format but the view only selects the stencil aspect * we want to re-interpret the format as RGBA8_UINT, then map our stencil * data reads to the R component and ignore the GBA channels that contain * the depth aspect data. 
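+    *
+    * For example, with an identity component mapping the composed swizzle
+    * computed below is (X, 0, 0, 1): stencil reads land in R, while G/B
+    * return zero and A returns one.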
+ * + * FIXME: the code below calls vk_component_mapping_to_pipe_swizzle + * only so it can then call util_format_compose_swizzles later. Maybe it + * makes sense to implement swizzle composition using VkSwizzle directly. */ VkFormat format; - uint8_t image_view_swizzle[4]; - if (pCreateInfo->format == VK_FORMAT_D24_UNORM_S8_UINT && + if (image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT && range->aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) { format = VK_FORMAT_R8G8B8A8_UINT; - image_view_swizzle[0] = PIPE_SWIZZLE_X; - image_view_swizzle[1] = PIPE_SWIZZLE_0; - image_view_swizzle[2] = PIPE_SWIZZLE_0; - image_view_swizzle[3] = PIPE_SWIZZLE_1; + uint8_t stencil_aspect_swizzle[4] = { + PIPE_SWIZZLE_X, PIPE_SWIZZLE_0, PIPE_SWIZZLE_0, PIPE_SWIZZLE_1, + }; + uint8_t view_swizzle[4]; + vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle, view_swizzle); + + util_format_compose_swizzles(stencil_aspect_swizzle, view_swizzle, + iview->view_swizzle); } else { - format = pCreateInfo->format; - - /* FIXME: we are doing this vk to pipe swizzle mapping just to call - * util_format_compose_swizzles. Would be good to check if it would be - * better to reimplement the latter using vk component - */ - image_view_swizzle[0] = - vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle.r); - image_view_swizzle[1] = - vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle.g); - image_view_swizzle[2] = - vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle.b); - image_view_swizzle[3] = - vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle.a); + format = iview->vk.format; + vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle, + iview->view_swizzle); } - iview->vk.format = format; + iview->vk.view_format = format; iview->format = v3dv_X(device, get_format)(format); - assert(iview->format && iview->format->supported); + assert(iview->format && iview->format->plane_count); - if (vk_format_is_depth_or_stencil(iview->vk.format)) { - iview->internal_type = - v3dv_X(device, get_internal_depth_type)(iview->vk.format); - } else { - v3dv_X(device, get_internal_type_bpp_for_output_format) - (iview->format->rt_type, &iview->internal_type, &iview->internal_bpp); - } + for (uint8_t plane = 0; plane < iview->plane_count; plane++) { + iview->planes[plane].offset = v3dv_layer_offset(image, + iview->vk.base_mip_level, + iview->vk.base_array_layer, + plane); + + if (vk_format_is_depth_or_stencil(iview->vk.view_format)) { + iview->planes[plane].internal_type = + v3dv_X(device, get_internal_depth_type)(iview->vk.view_format); + } else { + v3dv_X(device, get_internal_type_bpp_for_output_format) + (iview->format->planes[plane].rt_type, + &iview->planes[plane].internal_type, + &iview->planes[plane].internal_bpp); + } - const uint8_t *format_swizzle = v3dv_get_format_swizzle(device, format); - util_format_compose_swizzles(format_swizzle, image_view_swizzle, - iview->swizzle); - iview->swap_rb = iview->swizzle[0] == PIPE_SWIZZLE_Z; + const uint8_t *format_swizzle = + v3dv_get_format_swizzle(device, format, plane); + util_format_compose_swizzles(format_swizzle, iview->view_swizzle, + iview->planes[plane].swizzle); + + iview->planes[plane].swap_rb = v3dv_format_swizzle_needs_rb_swap(format_swizzle); + iview->planes[plane].channel_reverse = v3dv_format_swizzle_needs_reverse(format_swizzle); + } v3dv_X(device, pack_texture_shader_state)(device, iview); @@ -549,6 +883,25 @@ v3dv_CreateImageView(VkDevice _device, return VK_SUCCESS; } +VkResult +v3dv_create_image_view(struct v3dv_device *device, + const VkImageViewCreateInfo *pCreateInfo, + 
VkImageView *pView) +{ + return create_image_view(device, true, pCreateInfo, NULL, pView); +} + +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_CreateImageView(VkDevice _device, + const VkImageViewCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkImageView *pView) +{ + V3DV_FROM_HANDLE(v3dv_device, device, _device); + + return create_image_view(device, false, pCreateInfo, pAllocator, pView); +} + VKAPI_ATTR void VKAPI_CALL v3dv_DestroyImageView(VkDevice _device, VkImageView imageView, @@ -560,6 +913,13 @@ v3dv_DestroyImageView(VkDevice _device, if (image_view == NULL) return; + if (image_view->shadow) { + v3dv_DestroyImageView(_device, + v3dv_image_view_to_handle(image_view->shadow), + pAllocator); + image_view->shadow = NULL; + } + vk_image_view_destroy(&device->vk, pAllocator, &image_view->vk); } @@ -578,7 +938,7 @@ v3dv_CreateBufferView(VkDevice _device, vk_object_zalloc(&device->vk, pAllocator, sizeof(*view), VK_OBJECT_TYPE_BUFFER_VIEW); if (!view) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); uint32_t range; if (pCreateInfo->range == VK_WHOLE_SIZE) @@ -596,8 +956,10 @@ v3dv_CreateBufferView(VkDevice _device, view->vk_format = pCreateInfo->format; view->format = v3dv_X(device, get_format)(view->vk_format); + /* We don't support multi-plane formats for buffer views */ + assert(view->format->plane_count == 1); v3dv_X(device, get_internal_type_bpp_for_output_format) - (view->format->rt_type, &view->internal_type, &view->internal_bpp); + (view->format->planes[0].rt_type, &view->internal_type, &view->internal_bpp); if (buffer->usage & VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT || buffer->usage & VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT) diff --git a/src/broadcom/vulkan/v3dv_limits.h b/src/broadcom/vulkan/v3dv_limits.h index aaab1ce03ac..4df172e6bf3 100644 --- a/src/broadcom/vulkan/v3dv_limits.h +++ b/src/broadcom/vulkan/v3dv_limits.h @@ -1,5 +1,5 @@ /* - * Copyright © 2020 Raspberry Pi + * Copyright © 2020 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -23,8 +23,6 @@ #ifndef V3DV_LIMITS_H #define V3DV_LIMITS_H -#define NSEC_PER_SEC 1000000000ull - /* From vulkan spec "If the multiple viewports feature is not enabled, * scissorCount must be 1", ditto for viewportCount. For now we don't support * that feature. 
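/* Editor's aside on the limits hunk below: the bump of MAX_UNIFORM_BUFFERS
 * from 12 to 16 together with the new MAX_INLINE_UNIFORM_BUFFERS (4)
 * suggests that each inline uniform block consumes a regular UBO slot,
 * i.e. 12 application UBOs plus 4 inline blocks. This reading is inferred
 * from the numbers, not stated by the patch.
 */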
@@ -43,7 +41,8 @@ #define MAX_STORAGE_IMAGES 4 #define MAX_INPUT_ATTACHMENTS 4 -#define MAX_UNIFORM_BUFFERS 12 +#define MAX_UNIFORM_BUFFERS 16 +#define MAX_INLINE_UNIFORM_BUFFERS 4 #define MAX_STORAGE_BUFFERS 8 #define MAX_DYNAMIC_UNIFORM_BUFFERS 8 @@ -51,8 +50,6 @@ #define MAX_DYNAMIC_BUFFERS (MAX_DYNAMIC_UNIFORM_BUFFERS + \ MAX_DYNAMIC_STORAGE_BUFFERS) -#define MAX_RENDER_TARGETS 4 - #define MAX_MULTIVIEW_VIEW_COUNT 16 /* These are tunable parameters in the HW design, but all the V3D diff --git a/src/broadcom/vulkan/v3dv_meta_clear.c b/src/broadcom/vulkan/v3dv_meta_clear.c index 5555c690bb3..d8868142329 100644 --- a/src/broadcom/vulkan/v3dv_meta_clear.c +++ b/src/broadcom/vulkan/v3dv_meta_clear.c @@ -1,5 +1,5 @@ /* - * Copyright © 2020 Raspberry Pi + * Copyright © 2020 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -25,8 +25,8 @@ #include "v3dv_meta_common.h" #include "compiler/nir/nir_builder.h" -#include "vk_format_info.h" #include "util/u_pack_color.h" +#include "vk_common_entrypoints.h" static void get_hw_clear_color(struct v3dv_device *device, @@ -68,7 +68,13 @@ clear_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, { const VkOffset3D origin = { 0, 0, 0 }; VkFormat fb_format; - if (!v3dv_meta_can_use_tlb(image, &origin, &fb_format)) + + /* From vkCmdClearColorImage spec: + * "image must not use any of the formats that require a sampler YCBCR + * conversion" + */ + assert(image->plane_count == 1); + if (!v3dv_meta_can_use_tlb(image, 0, 0, &origin, NULL, &fb_format)) return false; uint32_t internal_type, internal_bpp; @@ -120,8 +126,9 @@ clear_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, if (!job) return true; - v3dv_job_start_frame(job, width, height, max_layer, false, - 1, internal_bpp, + v3dv_job_start_frame(job, width, height, max_layer, + false, true, 1, internal_bpp, + 4 * v3d_internal_bpp_words(internal_bpp), image->vk.samples > VK_SAMPLE_COUNT_1_BIT); struct v3dv_meta_framebuffer framebuffer; @@ -161,11 +168,15 @@ v3dv_CmdClearColorImage(VkCommandBuffer commandBuffer, .color = *pColor, }; + cmd_buffer->state.is_transfer = true; + for (uint32_t i = 0; i < rangeCount; i++) { if (clear_image_tlb(cmd_buffer, image, &clear_value, &pRanges[i])) continue; unreachable("Unsupported color clear."); } + + cmd_buffer->state.is_transfer = false; } VKAPI_ATTR void VKAPI_CALL @@ -183,11 +194,15 @@ v3dv_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer, .depthStencil = *pDepthStencil, }; + cmd_buffer->state.is_transfer = true; + for (uint32_t i = 0; i < rangeCount; i++) { if (clear_image_tlb(cmd_buffer, image, &clear_value, &pRanges[i])) continue; unreachable("Unsupported depth/stencil clear."); } + + cmd_buffer->state.is_transfer = false; } static void @@ -304,39 +319,6 @@ v3dv_meta_clear_finish(struct v3dv_device *device) } } -static nir_ssa_def * -gen_rect_vertices(nir_builder *b) -{ - nir_ssa_def *vertex_id = nir_load_vertex_id(b); - - /* vertex 0: -1.0, -1.0 - * vertex 1: -1.0, 1.0 - * vertex 2: 1.0, -1.0 - * vertex 3: 1.0, 1.0 - * - * so: - * - * channel 0 is vertex_id < 2 ? -1.0 : 1.0 - * channel 1 is vertex id & 1 ? 
1.0 : -1.0 - */ - - nir_ssa_def *one = nir_imm_int(b, 1); - nir_ssa_def *c0cmp = nir_ilt(b, vertex_id, nir_imm_int(b, 2)); - nir_ssa_def *c1cmp = nir_ieq(b, nir_iand(b, vertex_id, one), one); - - nir_ssa_def *comp[4]; - comp[0] = nir_bcsel(b, c0cmp, - nir_imm_float(b, -1.0f), - nir_imm_float(b, 1.0f)); - - comp[1] = nir_bcsel(b, c1cmp, - nir_imm_float(b, 1.0f), - nir_imm_float(b, -1.0f)); - comp[2] = nir_imm_float(b, 0.0f); - comp[3] = nir_imm_float(b, 1.0f); - return nir_vec(b, comp, 4); -} - static nir_shader * get_clear_rect_vs() { @@ -349,7 +331,7 @@ get_clear_rect_vs() nir_variable_create(b.shader, nir_var_shader_out, vec4, "gl_Position"); vs_out_pos->data.location = VARYING_SLOT_POS; - nir_ssa_def *pos = gen_rect_vertices(&b); + nir_def *pos = nir_gen_rect_vertices(&b, NULL, NULL); nir_store_var(&b, vs_out_pos, pos, 0xf); return b.shader; @@ -372,8 +354,8 @@ get_clear_rect_gs(uint32_t push_constant_layer_base) nir->info.inputs_read = 1ull << VARYING_SLOT_POS; nir->info.outputs_written = (1ull << VARYING_SLOT_POS) | (1ull << VARYING_SLOT_LAYER); - nir->info.gs.input_primitive = GL_TRIANGLES; - nir->info.gs.output_primitive = GL_TRIANGLE_STRIP; + nir->info.gs.input_primitive = MESA_PRIM_TRIANGLES; + nir->info.gs.output_primitive = MESA_PRIM_TRIANGLE_STRIP; nir->info.gs.vertices_in = 3; nir->info.gs.vertices_out = 3; nir->info.gs.invocations = 1; @@ -406,7 +388,7 @@ get_clear_rect_gs(uint32_t push_constant_layer_base) nir_copy_deref(&b, nir_build_deref_var(&b, gs_out_pos), in_pos_i); /* gl_Layer from push constants */ - nir_ssa_def *layer = + nir_def *layer = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = push_constant_layer_base, .range = 4); nir_store_var(&b, gs_out_layer, layer, 0x1); @@ -434,7 +416,7 @@ get_color_clear_rect_fs(uint32_t rt_idx, VkFormat format) nir_variable_create(b.shader, nir_var_shader_out, fs_out_type, "out_color"); fs_out_color->data.location = FRAG_RESULT_DATA0 + rt_idx; - nir_ssa_def *color_load = nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 0, .range = 16); + nir_def *color_load = nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 0, .range = 16); nir_store_var(&b, fs_out_color, color_load, 0xf); return b.shader; @@ -452,7 +434,7 @@ get_depth_clear_rect_fs() "out_depth"); fs_out_depth->data.location = FRAG_RESULT_DEPTH; - nir_ssa_def *depth_load = + nir_def *depth_load = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4); nir_store_var(&b, fs_out_depth, depth_load, 0x1); @@ -475,12 +457,11 @@ create_pipeline(struct v3dv_device *device, VkPipeline *pipeline) { VkPipelineShaderStageCreateInfo stages[3] = { 0 }; - struct vk_shader_module vs_m; + struct vk_shader_module vs_m = vk_shader_module_from_nir(vs_nir); struct vk_shader_module gs_m; struct vk_shader_module fs_m; uint32_t stage_count = 0; - v3dv_shader_module_internal_init(device, &vs_m, vs_nir); stages[stage_count].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; stages[stage_count].stage = VK_SHADER_STAGE_VERTEX_BIT; stages[stage_count].module = vk_shader_module_to_handle(&vs_m); @@ -488,7 +469,7 @@ create_pipeline(struct v3dv_device *device, stage_count++; if (gs_nir) { - v3dv_shader_module_internal_init(device, &gs_m, gs_nir); + gs_m = vk_shader_module_from_nir(gs_nir); stages[stage_count].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; stages[stage_count].stage = VK_SHADER_STAGE_GEOMETRY_BIT; stages[stage_count].module = vk_shader_module_to_handle(&gs_m); @@ -497,7 +478,7 @@ create_pipeline(struct v3dv_device 
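/* Editor's aside: vk_shader_module_from_nir() is the common-runtime
 * replacement for the driver-local v3dv_shader_module_internal_init(); it
 * wraps an existing nir_shader in a stack-allocated vk_shader_module
 * without taking ownership of the NIR, which is why create_pipeline()
 * still calls ralloc_free() on the shaders once the pipeline is built.
 */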
*device, } if (fs_nir) { - v3dv_shader_module_internal_init(device, &fs_m, fs_nir); + fs_m = vk_shader_module_from_nir(fs_nir); stages[stage_count].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; stages[stage_count].stage = VK_SHADER_STAGE_FRAGMENT_BIT; stages[stage_count].module = vk_shader_module_to_handle(&fs_m); @@ -581,6 +562,7 @@ create_pipeline(struct v3dv_device *device, pipeline); ralloc_free(vs_nir); + ralloc_free(gs_nir); ralloc_free(fs_nir); return result; @@ -592,7 +574,7 @@ create_color_clear_pipeline(struct v3dv_device *device, uint32_t subpass_idx, uint32_t rt_idx, VkFormat format, - uint32_t samples, + VkSampleCountFlagBits samples, uint32_t components, bool is_layered, VkPipelineLayout pipeline_layout, @@ -709,10 +691,11 @@ static VkResult create_color_clear_render_pass(struct v3dv_device *device, uint32_t rt_idx, VkFormat format, - uint32_t samples, + VkSampleCountFlagBits samples, VkRenderPass *pass) { - VkAttachmentDescription att = { + VkAttachmentDescription2 att = { + .sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2, .format = format, .samples = samples, .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD, @@ -721,12 +704,14 @@ create_color_clear_render_pass(struct v3dv_device *device, .finalLayout = VK_IMAGE_LAYOUT_GENERAL, }; - VkAttachmentReference att_ref = { + VkAttachmentReference2 att_ref = { + .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2, .attachment = rt_idx, .layout = VK_IMAGE_LAYOUT_GENERAL, }; - VkSubpassDescription subpass = { + VkSubpassDescription2 subpass = { + .sType = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2, .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, .inputAttachmentCount = 0, .colorAttachmentCount = 1, @@ -737,8 +722,8 @@ create_color_clear_render_pass(struct v3dv_device *device, .pPreserveAttachments = NULL, }; - VkRenderPassCreateInfo info = { - .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, + VkRenderPassCreateInfo2 info = { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2, .attachmentCount = 1, .pAttachments = &att, .subpassCount = 1, @@ -747,14 +732,14 @@ create_color_clear_render_pass(struct v3dv_device *device, .pDependencies = NULL, }; - return v3dv_CreateRenderPass(v3dv_device_to_handle(device), - &info, &device->vk.alloc, pass); + return v3dv_CreateRenderPass2(v3dv_device_to_handle(device), + &info, &device->vk.alloc, pass); } static inline uint64_t get_color_clear_pipeline_cache_key(uint32_t rt_idx, VkFormat format, - uint32_t samples, + VkSampleCountFlagBits samples, uint32_t components, bool is_layered) { @@ -764,7 +749,7 @@ get_color_clear_pipeline_cache_key(uint32_t rt_idx, uint32_t bit_offset = 0; key |= rt_idx; - bit_offset += 2; + bit_offset += 3; key |= ((uint64_t) format) << bit_offset; bit_offset += 32; @@ -819,7 +804,7 @@ get_color_clear_pipeline(struct v3dv_device *device, uint32_t rt_idx, uint32_t attachment_idx, VkFormat format, - uint32_t samples, + VkSampleCountFlagBits samples, uint32_t components, bool is_layered, struct v3dv_meta_color_clear_pipeline **pipeline) @@ -1012,7 +997,7 @@ emit_subpass_color_clear_rects(struct v3dv_cmd_buffer *cmd_buffer, assert(attachment_idx < cmd_buffer->state.pass->attachment_count); const VkFormat format = cmd_buffer->state.pass->attachments[attachment_idx].desc.format; - const VkFormat samples = + const VkSampleCountFlagBits samples = cmd_buffer->state.pass->attachments[attachment_idx].desc.samples; const uint32_t components = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | @@ -1049,8 +1034,6 @@ emit_subpass_color_clear_rects(struct v3dv_cmd_buffer 
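/* Editor's aside on get_color_clear_pipeline_cache_key() above: rt_idx now
 * needs 3 bits because newer V3D hardware supports more than 4 render
 * targets, so the 64-bit key packs roughly as follows (field offsets past
 * the format are assumed from context, not shown in the hunk):
 *
 *    key = rt_idx                  // bits 0..2
 *        | (uint64_t)format << 3   // bits 3..34
 *        | samples, components and is_layered at higher offsets
 */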
*cmd_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline->pipeline); - uint32_t dynamic_states = V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR; - for (uint32_t i = 0; i < rect_count; i++) { const VkViewport viewport = { .x = rects[i].rect.offset.x, @@ -1087,7 +1070,7 @@ emit_subpass_color_clear_rects(struct v3dv_cmd_buffer *cmd_buffer, cmd_buffer, (uintptr_t)pipeline, (v3dv_cmd_buffer_private_obj_destroy_cb) destroy_color_clear_pipeline); - v3dv_cmd_buffer_meta_state_pop(cmd_buffer, dynamic_states, false); + v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false); } /* Emits a scissored quad, clearing the depth aspect by writing to gl_FragDepth @@ -1139,18 +1122,14 @@ emit_subpass_ds_clear_rects(struct v3dv_cmd_buffer *cmd_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline->pipeline); - uint32_t dynamic_states = V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR; if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { - v3dv_CmdSetStencilReference(cmd_buffer_handle, - VK_STENCIL_FACE_FRONT_AND_BACK, - clear_ds->stencil); - v3dv_CmdSetStencilWriteMask(cmd_buffer_handle, - VK_STENCIL_FACE_FRONT_AND_BACK, 0xff); - v3dv_CmdSetStencilCompareMask(cmd_buffer_handle, - VK_STENCIL_FACE_FRONT_AND_BACK, 0xff); - dynamic_states |= VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK | - VK_DYNAMIC_STATE_STENCIL_WRITE_MASK | - VK_DYNAMIC_STATE_STENCIL_REFERENCE; + vk_common_CmdSetStencilReference(cmd_buffer_handle, + VK_STENCIL_FACE_FRONT_AND_BACK, + clear_ds->stencil); + vk_common_CmdSetStencilWriteMask(cmd_buffer_handle, + VK_STENCIL_FACE_FRONT_AND_BACK, 0xff); + vk_common_CmdSetStencilCompareMask(cmd_buffer_handle, + VK_STENCIL_FACE_FRONT_AND_BACK, 0xff); } for (uint32_t i = 0; i < rect_count; i++) { @@ -1179,7 +1158,7 @@ emit_subpass_ds_clear_rects(struct v3dv_cmd_buffer *cmd_buffer, } } - v3dv_cmd_buffer_meta_state_pop(cmd_buffer, dynamic_states, false); + v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false); } static void @@ -1212,9 +1191,11 @@ v3dv_CmdClearAttachments(VkCommandBuffer commandBuffer, { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - /* We can only clear attachments in the current subpass */ - assert(attachmentCount <= 5); /* 4 color + D/S */ + /* We can have at most max_color_RTs + 1 D/S attachments */ + assert(attachmentCount <= + V3D_MAX_RENDER_TARGETS(cmd_buffer->device->devinfo.ver) + 1); + /* We can only clear attachments in the current subpass */ struct v3dv_render_pass *pass = cmd_buffer->state.pass; assert(cmd_buffer->state.subpass_idx < pass->subpass_count); @@ -1225,6 +1206,9 @@ v3dv_CmdClearAttachments(VkCommandBuffer commandBuffer, * framebuffers, we use a geometry shader to redirect clears to the * appropriate layers. 
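 * (Editor's aside: the geometry shader in question is get_clear_rect_gs()
 * above; it passes the clear rect's triangle through unchanged and writes
 * gl_Layer from a push constant loaded at push_constant_layer_base, so a
 * single pipeline can direct clears at any layer of a layered
 * framebuffer.)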
*/ + + v3dv_cmd_buffer_pause_occlusion_query(cmd_buffer); + bool is_layered, all_rects_same_layers; gather_layering_info(rectCount, pRects, &is_layered, &all_rects_same_layers); for (uint32_t i = 0; i < attachmentCount; i++) { @@ -1242,4 +1226,6 @@ v3dv_CmdClearAttachments(VkCommandBuffer commandBuffer, rectCount, pRects); } } + + v3dv_cmd_buffer_resume_occlusion_query(cmd_buffer); } diff --git a/src/broadcom/vulkan/v3dv_meta_common.h b/src/broadcom/vulkan/v3dv_meta_common.h index 555b55f90b7..3be51b56a1f 100644 --- a/src/broadcom/vulkan/v3dv_meta_common.h +++ b/src/broadcom/vulkan/v3dv_meta_common.h @@ -1,5 +1,5 @@ /* - * Copyright © 2021 Raspberry Pi + * Copyright © 2021 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -23,30 +23,6 @@ #ifndef V3DV_META_COMMON_H #define V3DV_META_COMMON_H -/* Disable level 0 write, just write following mipmaps */ -#define V3D_TFU_IOA_DIMTW (1 << 0) -#define V3D_TFU_IOA_FORMAT_SHIFT 3 -#define V3D_TFU_IOA_FORMAT_LINEARTILE 3 -#define V3D_TFU_IOA_FORMAT_UBLINEAR_1_COLUMN 4 -#define V3D_TFU_IOA_FORMAT_UBLINEAR_2_COLUMN 5 -#define V3D_TFU_IOA_FORMAT_UIF_NO_XOR 6 -#define V3D_TFU_IOA_FORMAT_UIF_XOR 7 - -#define V3D_TFU_ICFG_NUMMM_SHIFT 5 -#define V3D_TFU_ICFG_TTYPE_SHIFT 9 - -#define V3D_TFU_ICFG_OPAD_SHIFT 22 - -#define V3D_TFU_ICFG_FORMAT_SHIFT 18 -#define V3D_TFU_ICFG_FORMAT_RASTER 0 -#define V3D_TFU_ICFG_FORMAT_SAND_128 1 -#define V3D_TFU_ICFG_FORMAT_SAND_256 2 -#define V3D_TFU_ICFG_FORMAT_LINEARTILE 11 -#define V3D_TFU_ICFG_FORMAT_UBLINEAR_1_COLUMN 12 -#define V3D_TFU_ICFG_FORMAT_UBLINEAR_2_COLUMN 13 -#define V3D_TFU_ICFG_FORMAT_UIF_NO_XOR 14 -#define V3D_TFU_ICFG_FORMAT_UIF_XOR 15 - /** * Copy/Clear operations implemented in v3dv_meta_*.c that use the TLB hardware * need to figure out TLB programming from the target image data instead of an diff --git a/src/broadcom/vulkan/v3dv_meta_copy.c b/src/broadcom/vulkan/v3dv_meta_copy.c index 85cd8e06638..0713b1b4084 100644 --- a/src/broadcom/vulkan/v3dv_meta_copy.c +++ b/src/broadcom/vulkan/v3dv_meta_copy.c @@ -1,5 +1,5 @@ /* - * Copyright © 2019 Raspberry Pi + * Copyright © 2019 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -25,9 +25,8 @@ #include "v3dv_meta_common.h" #include "compiler/nir/nir_builder.h" -#include "vk_format_info.h" #include "util/u_pack_color.h" -#include "vulkan/util/vk_common_entrypoints.h" +#include "vk_common_entrypoints.h" static uint32_t meta_blit_key_hash(const void *key) @@ -42,6 +41,19 @@ meta_blit_key_compare(const void *key1, const void *key2) } static bool +texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer, + VkImageAspectFlags aspect, + struct v3dv_image *image, + VkFormat dst_format, + VkFormat src_format, + struct v3dv_buffer *buffer, + uint32_t buffer_bpp, + VkColorComponentFlags cmask, + VkComponentMapping *cswizzle, + uint32_t region_count, + const VkBufferImageCopy2 *regions); + +static bool create_blit_pipeline_layout(struct v3dv_device *device, VkDescriptorSetLayout *descriptor_set_layout, VkPipelineLayout *pipeline_layout) @@ -338,18 +350,41 @@ get_compatible_tlb_format(VkFormat format) /** * Checks if we can implement an image copy or clear operation using the TLB * hardware. 
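 * (Editor's aside: the extent validation described below reduces to
 * requiring that a tile store covers the entire miplevel, i.e.
 *
 *    slice->width == extent->width && slice->height == extent->height
 *
 * since partial regions would need tile-aligned bounds, which is not
 * checked here.)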
+ * + * The extent and miplevel are only used to validate tile stores (to match the + * region to store against the miplevel dimensions to avoid cases where + * the region to store is not aligned to tile boundaries). If extent is + * NULL, no checks are done (which is fine if the image will only be used for a + * TLB load or when we know in advance that the store will be for the entire + * size of the image miplevel). + * + * For TLB copies we are doing a per-plane copy, so for multi-plane formats, + * the compatible format will be single-plane. */ bool v3dv_meta_can_use_tlb(struct v3dv_image *image, + uint8_t plane, + uint8_t miplevel, const VkOffset3D *offset, + const VkExtent3D *extent, VkFormat *compat_format) { if (offset->x != 0 || offset->y != 0) return false; - if (image->format->rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO) { + /* FIXME: this is suboptimal: what we really want to check is that the + * extent of the region to copy is the full slice or a multiple of the + * tile size. + */ + if (extent) { + struct v3d_resource_slice *slice = &image->planes[plane].slices[miplevel]; + if (slice->width != extent->width || slice->height != extent->height) + return false; + } + + if (image->format->planes[plane].rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO) { if (compat_format) - *compat_format = image->vk.format; + *compat_format = image->planes[plane].vk_format; return true; } @@ -357,9 +392,11 @@ v3dv_meta_can_use_tlb(struct v3dv_image *image, * a compatible format instead. */ if (compat_format) { - *compat_format = get_compatible_tlb_format(image->vk.format); - if (*compat_format != VK_FORMAT_UNDEFINED) + *compat_format = get_compatible_tlb_format(image->planes[plane].vk_format); + if (*compat_format != VK_FORMAT_UNDEFINED) { + assert(vk_format_get_plane_count(*compat_format) == 1); return true; + } } return false; @@ -379,11 +416,17 @@ static bool copy_image_to_buffer_tlb(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_buffer *buffer, struct v3dv_image *image, - const VkBufferImageCopy2KHR *region) + const VkBufferImageCopy2 *region) { VkFormat fb_format; - if (!v3dv_meta_can_use_tlb(image, &region->imageOffset, &fb_format)) + uint8_t plane = v3dv_plane_from_aspect(region->imageSubresource.aspectMask); + assert(plane < image->plane_count); + + if (!v3dv_meta_can_use_tlb(image, plane, region->imageSubresource.mipLevel, + &region->imageOffset, &region->imageExtent, + &fb_format)) { return false; + } uint32_t internal_type, internal_bpp; v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects) @@ -403,13 +446,16 @@ copy_image_to_buffer_tlb(struct v3dv_cmd_buffer *cmd_buffer, return true; /* Handle copy from compressed format using a compatible format */ - const uint32_t block_w = vk_format_get_blockwidth(image->vk.format); - const uint32_t block_h = vk_format_get_blockheight(image->vk.format); + const uint32_t block_w = + vk_format_get_blockwidth(image->planes[plane].vk_format); + const uint32_t block_h = + vk_format_get_blockheight(image->planes[plane].vk_format); const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w); const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h); - v3dv_job_start_frame(job, width, height, num_layers, false, - 1, internal_bpp, false); + v3dv_job_start_frame(job, width, height, num_layers, false, true, 1, + internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), + false); struct v3dv_meta_framebuffer framebuffer; v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format, @@ -432,29 +478,110 @@ blit_shader(struct 
v3dv_cmd_buffer *cmd_buffer, VkFormat src_format, VkColorComponentFlags cmask, VkComponentMapping *cswizzle, - const VkImageBlit2KHR *region, + const VkImageBlit2 *region, VkFilter filter, bool dst_is_padded_image); + /** - * Returns true if the implementation supports the requested operation (even if - * it failed to process it, for example, due to an out-of-memory error). + * A structure that contains all the information needed by the various + * helpers that implement image to buffer copies with blit paths. + */ +struct image_to_buffer_info { + /* Source image info */ + VkFormat src_format; + uint8_t plane; + VkColorComponentFlags cmask; + VkComponentMapping cswizzle; + VkImageAspectFlags src_copy_aspect; + uint32_t block_width; + uint32_t block_height; + + /* Destination buffer info */ + VkFormat dst_format; + uint32_t buf_width; + uint32_t buf_height; + uint32_t buf_bpp; + VkImageAspectFlags dst_copy_aspect; +}; + +static VkImageBlit2 +blit_region_for_image_to_buffer(const VkOffset3D *offset, + const VkExtent3D *extent, + uint32_t mip_level, + uint32_t base_layer, + uint32_t layer_offset, + struct image_to_buffer_info *info) +{ + VkImageBlit2 output = { + .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2, + .srcSubresource = { + .aspectMask = info->src_copy_aspect, + .mipLevel = mip_level, + .baseArrayLayer = base_layer + layer_offset, + .layerCount = 1, + }, + .srcOffsets = { + { + DIV_ROUND_UP(offset->x, info->block_width), + DIV_ROUND_UP(offset->y, info->block_height), + offset->z + layer_offset, + }, + { + DIV_ROUND_UP(offset->x + extent->width, info->block_width), + DIV_ROUND_UP(offset->y + extent->height, info->block_height), + offset->z + layer_offset + 1, + }, + }, + .dstSubresource = { + .aspectMask = info->dst_copy_aspect, + .mipLevel = 0, + .baseArrayLayer = 0, + .layerCount = 1, + }, + .dstOffsets = { + { 0, 0, 0 }, + { + DIV_ROUND_UP(extent->width, info->block_width), + DIV_ROUND_UP(extent->height, info->block_height), + 1 + }, + }, + }; + + return output; +} + +/** + * Produces an image_to_buffer_info struct from a VkBufferImageCopy2 that we can + * use to implement image to buffer copies with blit paths. + * + * Returns false if the copy operation can't be implemented with a blit. + */ static bool -copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_buffer *buffer, - struct v3dv_image *image, - const VkBufferImageCopy2KHR *region) +gather_image_to_buffer_info(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_image *image, + const VkBufferImageCopy2 *region, + struct image_to_buffer_info *out_info) { - bool handled = false; + bool supported = false; + + VkImageAspectFlags dst_copy_aspect = region->imageSubresource.aspectMask; + /* For multi-planar images we copy one plane at a time using an image alias + * with a color aspect for each plane. + */ + if (image->plane_count > 1) + dst_copy_aspect = VK_IMAGE_ASPECT_COLOR_BIT; + + VkImageAspectFlags src_copy_aspect = region->imageSubresource.aspectMask; + uint8_t plane = v3dv_plane_from_aspect(src_copy_aspect); + assert(plane < image->plane_count); /* Generally, the bpp of the data in the buffer matches that of the * source image. The exception is the case where we are copying * stencil (8bpp) to a combined d24s8 image (32bpp). 
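 * (Editor's aside: the switch that follows maps bpp to an uncompressed
 * UINT transfer format so raw data can be moved through a renderable
 * color format regardless of the real image format:
 *
 *    16 -> R32G32B32A32_UINT    8 -> R16G16B16A16_UINT
 *     4 -> R8G8B8A8_UINT        2 -> R16_UINT    1 -> R8_UINT
 * )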
*/ - uint32_t buffer_bpp = image->cpp; - - VkImageAspectFlags copy_aspect = region->imageSubresource.aspectMask; + uint32_t buffer_bpp = image->planes[plane].cpp; /* Because we are going to implement the copy as a blit, we need to create * a linear image from the destination buffer and we also want our blit @@ -477,22 +604,23 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, }; switch (buffer_bpp) { case 16: - assert(copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT); + assert(dst_copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT); dst_format = VK_FORMAT_R32G32B32A32_UINT; src_format = dst_format; break; case 8: - assert(copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT); + assert(dst_copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT); dst_format = VK_FORMAT_R16G16B16A16_UINT; src_format = dst_format; break; case 4: - switch (copy_aspect) { + switch (dst_copy_aspect) { case VK_IMAGE_ASPECT_COLOR_BIT: src_format = VK_FORMAT_R8G8B8A8_UINT; dst_format = VK_FORMAT_R8G8B8A8_UINT; break; case VK_IMAGE_ASPECT_DEPTH_BIT: + assert(image->plane_count == 1); assert(image->vk.format == VK_FORMAT_D32_SFLOAT || image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT || image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32); @@ -517,7 +645,8 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, } break; case VK_IMAGE_ASPECT_STENCIL_BIT: - assert(copy_aspect == VK_IMAGE_ASPECT_STENCIL_BIT); + assert(image->plane_count == 1); + assert(dst_copy_aspect == VK_IMAGE_ASPECT_STENCIL_BIT); assert(image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT); /* Copying from S8D24. We want to write 8-bit stencil values only, * so adjust the buffer bpp for that. Since the hardware stores stencil @@ -529,23 +658,23 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, break; default: unreachable("unsupported aspect"); - return handled; + return supported; }; break; case 2: - assert(copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT || - copy_aspect == VK_IMAGE_ASPECT_DEPTH_BIT); + assert(dst_copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT || + dst_copy_aspect == VK_IMAGE_ASPECT_DEPTH_BIT); dst_format = VK_FORMAT_R16_UINT; src_format = dst_format; break; case 1: - assert(copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT); + assert(dst_copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT); dst_format = VK_FORMAT_R8_UINT; src_format = dst_format; break; default: unreachable("unsupported bit-size"); - return handled; + return supported; }; /* The hardware doesn't support linear depth/stencil stores, so we @@ -554,10 +683,10 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, */ assert(vk_format_is_color(src_format)); assert(vk_format_is_color(dst_format)); - copy_aspect = VK_IMAGE_ASPECT_COLOR_BIT; + dst_copy_aspect = VK_IMAGE_ASPECT_COLOR_BIT; /* We should be able to handle the blit if we got this far */ - handled = true; + supported = true; /* Obtain the 2D buffer region spec */ uint32_t buf_width, buf_height; @@ -572,99 +701,250 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, buf_height = region->bufferImageHeight; /* If the image is compressed, the bpp refers to blocks, not pixels */ - uint32_t block_width = vk_format_get_blockwidth(image->vk.format); - uint32_t block_height = vk_format_get_blockheight(image->vk.format); - buf_width = buf_width / block_width; - buf_height = buf_height / block_height; + uint32_t block_width = + vk_format_get_blockwidth(image->planes[plane].vk_format); + uint32_t block_height = + vk_format_get_blockheight(image->planes[plane].vk_format); + buf_width = DIV_ROUND_UP(buf_width, block_width); + buf_height = 
DIV_ROUND_UP(buf_height, block_height); + + out_info->src_format = src_format; + out_info->dst_format = dst_format; + out_info->src_copy_aspect = src_copy_aspect; + out_info->dst_copy_aspect = dst_copy_aspect; + out_info->buf_width = buf_width; + out_info->buf_height = buf_height; + out_info->buf_bpp = buffer_bpp; + out_info->block_width = block_width; + out_info->block_height = block_height; + out_info->cmask = cmask; + out_info->cswizzle = cswizzle; + out_info->plane = plane; + + return supported; +} - /* Compute layers to copy */ - uint32_t num_layers; - if (image->vk.image_type != VK_IMAGE_TYPE_3D) - num_layers = region->imageSubresource.layerCount; - else - num_layers = region->imageExtent.depth; - assert(num_layers > 0); +/* Creates a linear image to alias buffer memory. It also includes that image + * as a private object in the cmd_buffer. + * + * This is used for cases where we want to implement an image to buffer copy, + * but we need to rely on a mechanism that uses an image as destination, like + * blitting. + */ +static VkResult +create_image_from_buffer(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_buffer *buffer, + const VkBufferImageCopy2 *region, + struct image_to_buffer_info *info, + uint32_t layer, + VkImage *out_image) +{ + VkImageCreateInfo image_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + .imageType = VK_IMAGE_TYPE_2D, + .format = info->dst_format, + .extent = { info->buf_width, info->buf_height, 1 }, + .mipLevels = 1, + .arrayLayers = 1, + .samples = VK_SAMPLE_COUNT_1_BIT, + .tiling = VK_IMAGE_TILING_LINEAR, + .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .initialLayout = VK_IMAGE_LAYOUT_GENERAL, + }; - /* Our blit interface can see the real format of the images to detect - * copies between compressed and uncompressed images and adapt the - * blit region accordingly. Here we are just doing a raw copy of - * compressed data, but we are passing an uncompressed view of the - * buffer for the blit destination image (since compressed formats are - * not renderable), so we also want to provide an uncompressed view of - * the source image. 
- */ VkResult result; struct v3dv_device *device = cmd_buffer->device; VkDevice _device = v3dv_device_to_handle(device); - if (vk_format_is_compressed(image->vk.format)) { - VkImage uiview; - VkImageCreateInfo uiview_info = { - .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, - .imageType = VK_IMAGE_TYPE_3D, - .format = dst_format, - .extent = { buf_width, buf_height, image->vk.extent.depth }, - .mipLevels = image->vk.mip_levels, - .arrayLayers = image->vk.array_layers, - .samples = image->vk.samples, - .tiling = image->vk.tiling, - .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT, - .sharingMode = VK_SHARING_MODE_EXCLUSIVE, - .queueFamilyIndexCount = 0, - .initialLayout = VK_IMAGE_LAYOUT_GENERAL, - }; - result = v3dv_CreateImage(_device, &uiview_info, &device->vk.alloc, &uiview); - if (result != VK_SUCCESS) - return handled; - v3dv_cmd_buffer_add_private_obj( - cmd_buffer, (uintptr_t)uiview, - (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage); + VkImage buffer_image; + result = + v3dv_CreateImage(_device, &image_info, &device->vk.alloc, &buffer_image); + if (result != VK_SUCCESS) + return result; - result = - vk_common_BindImageMemory(_device, uiview, - v3dv_device_memory_to_handle(image->mem), - image->mem_offset); - if (result != VK_SUCCESS) - return handled; + *out_image = buffer_image; + + v3dv_cmd_buffer_add_private_obj( + cmd_buffer, (uintptr_t)buffer_image, + (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage); + + /* Bind the buffer memory to the image + */ + VkDeviceSize buffer_offset = buffer->mem_offset + region->bufferOffset + + layer * info->buf_width * info->buf_height * info->buf_bpp; + + result = + vk_common_BindImageMemory(_device, buffer_image, + v3dv_device_memory_to_handle(buffer->mem), + buffer_offset); + return result; +} - image = v3dv_image_from_handle(uiview); +/** + * Creates an image with a single mip level that aliases the memory of a + * mip level in another image, re-interpreting the memory with an uncompressed + * format. The image is added to the command buffer as a private object for + * disposal. + */ +static bool +create_image_mip_level_alias(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_image *image, + VkFormat format, + uint32_t plane, + uint32_t mip_level, + uint32_t layer, + VkImage *alias) +{ + VkResult result; + assert(!vk_format_is_compressed(format)); + + struct v3dv_device *device = cmd_buffer->device; + VkDevice vk_device = v3dv_device_to_handle(device); + uint32_t mip_width = image->planes[plane].slices[mip_level].width; + uint32_t mip_height = image->planes[plane].slices[mip_level].height; + + uint32_t block_width = + vk_format_get_blockwidth(image->planes[plane].vk_format); + uint32_t block_height = + vk_format_get_blockheight(image->planes[plane].vk_format); + + VkImageCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + .imageType = image->vk.image_type, + .format = format, + .extent = { DIV_ROUND_UP(mip_width, block_width), + DIV_ROUND_UP(mip_height, block_height), + 1 }, + .mipLevels = 1, + .arrayLayers = 1, + .samples = image->vk.samples, + .tiling = image->tiled ? VK_IMAGE_TILING_OPTIMAL : VK_IMAGE_TILING_LINEAR, + .usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .initialLayout = VK_IMAGE_LAYOUT_GENERAL, + }; + result = v3dv_CreateImage(vk_device, &info, &device->vk.alloc, alias); + if (result != VK_SUCCESS) + return false; + + /* The alias we have just created has just one mip, but we may be aliasing + * any mip in the original image. 
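 * (Editor's aside: concretely, the rewrite described here keeps the source
 * mip's tiling and stride but re-bases and re-sizes the alias slice:
 *
 *    alias->slices[0]        = src->slices[mip];
 *    alias->slices[0].width  = block-aligned mip width;
 *    alias->slices[0].height = block-aligned mip height;
 *    alias->slices[0].offset = 0;   // memory bound at the mip's start
 *    alias->alignment        = 64;  // minimum Texture Base Address align
 * )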
Because the slice setup changes based on + * the mip (particularly, for mips >= 2 it uses power of 2 sizes internally) + * and this can influence the tiling layout selected for the slice, we want + * to make sure we copy the slice description from the actual mip level in + * the original image, and then rewrite any fields that we need for the + * alias. Particularly, we want to make the offset 0 because we are going to + * bind the underlying image memory exactly at the start of the selected mip. + * We also want to relax the image alignment requirements to the minimum + * (the one imposed by the Texture Base Address field) since we may not be + * aliasing a level 0 (for which we typically want a page alignment for + * optimal performance). + */ + V3DV_FROM_HANDLE(v3dv_image, v3dv_alias, *alias); + v3dv_alias->planes[plane].slices[0] = image->planes[plane].slices[mip_level]; + v3dv_alias->planes[plane].slices[0].width = info.extent.width; + v3dv_alias->planes[plane].slices[0].height = info.extent.height; + v3dv_alias->planes[plane].slices[0].offset = 0; + v3dv_alias->planes[plane].alignment = 64; + + v3dv_cmd_buffer_add_private_obj( + cmd_buffer, (uintptr_t)*alias, + (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage); + + result = + vk_common_BindImageMemory(vk_device, *alias, + v3dv_device_memory_to_handle(image->planes[plane].mem), + v3dv_layer_offset(image, mip_level, layer, plane)); + return result == VK_SUCCESS; +} + +/** + * Returns true if the implementation supports the requested operation (even if + * it failed to process it, for example, due to an out-of-memory error). + */ +static bool +copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_buffer *buffer, + struct v3dv_image *image, + const VkBufferImageCopy2 *region) +{ + bool handled = false; + struct image_to_buffer_info info; + + /* This path uses a shader blit which doesn't support linear images. Return + * early to avoid all the heavy lifting in preparation for the + * blit_shader() call that is bound to fail in that scenario. 
+ */ + if (!image->tiled && image->vk.image_type != VK_IMAGE_TYPE_1D) { + return handled; } + handled = gather_image_to_buffer_info(cmd_buffer, image, region, + &info); + + if (!handled) + return handled; + + /* We should be able to handle the blit if we got this far */ + handled = true; + + /* Compute layers to copy */ + uint32_t num_layers; + if (image->vk.image_type != VK_IMAGE_TYPE_3D) + num_layers = region->imageSubresource.layerCount; + else + num_layers = region->imageExtent.depth; + assert(num_layers > 0); + /* Copy requested layers */ + VkResult result; + VkImageBlit2 blit_region; + uint32_t mip_level = region->imageSubresource.mipLevel; + uint32_t base_layer = region->imageSubresource.baseArrayLayer; for (uint32_t i = 0; i < num_layers; i++) { - /* Create the destination blit image from the destination buffer */ - VkImageCreateInfo image_info = { - .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, - .imageType = VK_IMAGE_TYPE_2D, - .format = dst_format, - .extent = { buf_width, buf_height, 1 }, - .mipLevels = 1, - .arrayLayers = 1, - .samples = VK_SAMPLE_COUNT_1_BIT, - .tiling = VK_IMAGE_TILING_LINEAR, - .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT, - .sharingMode = VK_SHARING_MODE_EXCLUSIVE, - .queueFamilyIndexCount = 0, - .initialLayout = VK_IMAGE_LAYOUT_GENERAL, - }; - - VkImage buffer_image; - result = - v3dv_CreateImage(_device, &image_info, &device->vk.alloc, &buffer_image); - if (result != VK_SUCCESS) - return handled; + uint32_t layer_offset = i; + + if (vk_format_is_compressed(image->vk.format)) { + /* Our blit interface can see the real format of the images to detect + * copies between compressed and uncompressed images and adapt the + * blit region accordingly. Here we are just doing a raw copy of + * compressed data, but we are passing an uncompressed view of the + * buffer for the blit destination image (since compressed formats are + * not renderable), so we also want to provide an uncompressed view of + * the source image. + * + * It is important that we create the alias over the selected mip + * level (instead of aliasing the entire image) because an uncompressed + * view of the image won't have the same number of mip levels as the + * original image and the implicit mip size calculations the hw will + * do to sample from a non-zero mip level may not match exactly between + * compressed and uncompressed views. + */ + VkImage alias; + if (!create_image_mip_level_alias(cmd_buffer, image, info.dst_format, + info.plane, mip_level, + base_layer + layer_offset, + &alias)) { + return handled; + } - v3dv_cmd_buffer_add_private_obj( - cmd_buffer, (uintptr_t)buffer_image, - (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage); + /* We are aliasing the selected mip level and layer with a + * single-mip and single-layer image. 
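 * (Editor's aside: once the alias is in place, mip_level, base_layer and
 * layer_offset are all reset to 0 below, because the alias contains
 * exactly the one mip level and layer selected above.)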
+ */ + image = v3dv_image_from_handle(alias); + mip_level = 0; + base_layer = 0; + layer_offset = 0; + } - /* Bind the buffer memory to the image */ - VkDeviceSize buffer_offset = buffer->mem_offset + region->bufferOffset + - i * buf_width * buf_height * buffer_bpp; + /* Create the destination blit image from the destination buffer */ + VkImage buffer_image; result = - vk_common_BindImageMemory(_device, buffer_image, - v3dv_device_memory_to_handle(buffer->mem), - buffer_offset); + create_image_from_buffer(cmd_buffer, buffer, region, &info, + i, &buffer_image); if (result != VK_SUCCESS) return handled; @@ -676,48 +956,17 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, * image, but that we need to blit to a S8D24 destination (the only * stencil format we support). */ - const VkImageBlit2KHR blit_region = { - .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2_KHR, - .srcSubresource = { - .aspectMask = copy_aspect, - .mipLevel = region->imageSubresource.mipLevel, - .baseArrayLayer = region->imageSubresource.baseArrayLayer + i, - .layerCount = 1, - }, - .srcOffsets = { - { - DIV_ROUND_UP(region->imageOffset.x, block_width), - DIV_ROUND_UP(region->imageOffset.y, block_height), - region->imageOffset.z + i, - }, - { - DIV_ROUND_UP(region->imageOffset.x + region->imageExtent.width, - block_width), - DIV_ROUND_UP(region->imageOffset.y + region->imageExtent.height, - block_height), - region->imageOffset.z + i + 1, - }, - }, - .dstSubresource = { - .aspectMask = copy_aspect, - .mipLevel = 0, - .baseArrayLayer = 0, - .layerCount = 1, - }, - .dstOffsets = { - { 0, 0, 0 }, - { - DIV_ROUND_UP(region->imageExtent.width, block_width), - DIV_ROUND_UP(region->imageExtent.height, block_height), - 1 - }, - }, - }; + blit_region = + blit_region_for_image_to_buffer(®ion->imageOffset, + ®ion->imageExtent, + mip_level, base_layer, layer_offset, + &info); handled = blit_shader(cmd_buffer, - v3dv_image_from_handle(buffer_image), dst_format, - image, src_format, - cmask, &cswizzle, + v3dv_image_from_handle(buffer_image), + info.dst_format, + image, info.src_format, + info.cmask, &info.cswizzle, &blit_region, VK_FILTER_NEAREST, false); if (!handled) { /* This is unexpected, we should have a supported blit spec */ @@ -730,9 +979,110 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, return true; } +static bool +copy_image_linear_texel_buffer(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_image *dst, + struct v3dv_image *src, + const VkImageCopy2 *region); + +static VkImageCopy2 +image_copy_region_for_image_to_buffer(const VkBufferImageCopy2 *region, + struct image_to_buffer_info *info, + uint32_t layer) +{ + VkImageCopy2 output = { + .sType = VK_STRUCTURE_TYPE_IMAGE_COPY_2, + .srcSubresource = { + .aspectMask = info->src_copy_aspect, + .mipLevel = region->imageSubresource.mipLevel, + .baseArrayLayer = region->imageSubresource.baseArrayLayer + layer, + .layerCount = 1, + }, + .srcOffset = { + DIV_ROUND_UP(region->imageOffset.x, info->block_width), + DIV_ROUND_UP(region->imageOffset.y, info->block_height), + region->imageOffset.z, + }, + .dstSubresource = { + .aspectMask = info->dst_copy_aspect, + .mipLevel = 0, + .baseArrayLayer = 0, + .layerCount = 1, + }, + .dstOffset = { 0, 0, 0 }, + .extent = { + DIV_ROUND_UP(region->imageExtent.width, info->block_width), + DIV_ROUND_UP(region->imageExtent.height, info->block_height), + 1 + }, + }; + + return output; +} + +/** + * Returns true if the implementation supports the requested operation (even if + * it failed to process it, for example, due to an 
out-of-memory error). */ +static bool +copy_image_to_buffer_texel_buffer(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_buffer *dst_buffer, + struct v3dv_image *src_image, + const VkBufferImageCopy2 *region) +{ + bool handled = false; + VkImage dst_buffer_image; + struct image_to_buffer_info info; + + /* This is a requirement for copy_image_linear_texel_buffer below. We check + * it in advance in order to do an early return. + */ + if (src_image->tiled) + return false; + + handled = + gather_image_to_buffer_info(cmd_buffer, src_image, region, + &info); + if (!handled) + return handled; + + /* At this point the implementation should support the copy; any possible + * errors below are for other reasons, such as an out-of-memory error. + */ + handled = true; + + uint32_t num_layers; + if (src_image->vk.image_type != VK_IMAGE_TYPE_3D) + num_layers = region->imageSubresource.layerCount; + else + num_layers = region->imageExtent.depth; + assert(num_layers > 0); + + VkResult result; + VkImageCopy2 image_region; + for (uint32_t layer = 0; layer < num_layers; layer++) { + /* Create the destination image from the destination buffer */ + result = + create_image_from_buffer(cmd_buffer, dst_buffer, region, &info, + layer, &dst_buffer_image); + if (result != VK_SUCCESS) + return handled; + + image_region = + image_copy_region_for_image_to_buffer(region, &info, layer); + + handled = + copy_image_linear_texel_buffer(cmd_buffer, + v3dv_image_from_handle(dst_buffer_image), + src_image, &image_region); + } + + return handled; +} + VKAPI_ATTR void VKAPI_CALL -v3dv_CmdCopyImageToBuffer2KHR(VkCommandBuffer commandBuffer, - const VkCopyImageToBufferInfo2KHR *info) +v3dv_CmdCopyImageToBuffer2(VkCommandBuffer commandBuffer, + const VkCopyImageToBufferInfo2 *info) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); @@ -741,13 +1091,23 @@ v3dv_CmdCopyImageToBuffer2KHR(VkCommandBuffer commandBuffer, assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT); + cmd_buffer->state.is_transfer = true; + for (uint32_t i = 0; i < info->regionCount; i++) { - if (copy_image_to_buffer_tlb(cmd_buffer, buffer, image, &info->pRegions[i])) + const VkBufferImageCopy2 *region = &info->pRegions[i]; + + if (copy_image_to_buffer_tlb(cmd_buffer, buffer, image, region)) continue; - if (copy_image_to_buffer_blit(cmd_buffer, buffer, image, &info->pRegions[i])) + + if (copy_image_to_buffer_blit(cmd_buffer, buffer, image, region)) continue; + + if (copy_image_to_buffer_texel_buffer(cmd_buffer, buffer, image, region)) + continue; + unreachable("Unsupported image to buffer copy."); } + cmd_buffer->state.is_transfer = false; } /** @@ -758,10 +1118,15 @@ static bool copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *dst, struct v3dv_image *src, - const VkImageCopy2KHR *region) + const VkImageCopy2 *region) { + if (V3D_DBG(DISABLE_TFU)) { + perf_debug("Copy images: TFU disabled, fallbacks could be slower.\n"); + return false; + } + /* Destination can't be raster format */ - if (dst->vk.tiling == VK_IMAGE_TILING_LINEAR) + if (!dst->tiled) return false; /* We can only do full copies, so if the format is D24S8 both aspects need @@ -772,7 +1137,7 @@ copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, const VkImageAspectFlags ds_aspects = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; if (region->dstSubresource.aspectMask != ds_aspects) - return false; + return false; } /* Don't handle copies between uncompressed and compressed formats for now. 
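/* Editor's aside: taken together, the checks in copy_image_tfu() gate the
 * TFU path to full-miplevel copies into tiled destinations. A condensed
 * sketch of the gating (a summary of the checks above and below, not
 * additional driver logic):
 *
 *    if (V3D_DBG(DISABLE_TFU))               return false;  // debug opt-out
 *    if (!dst->tiled)                        return false;  // no raster dst
 *    if (dstOffset.x || dstOffset.y)         return false;  // full copy only
 *    if (extent != dst miplevel dimensions)  return false;
 */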
@@ -797,9 +1162,14 @@ copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, if (region->dstOffset.x != 0 || region->dstOffset.y != 0) return false; + uint8_t src_plane = + v3dv_plane_from_aspect(region->srcSubresource.aspectMask); + uint8_t dst_plane = + v3dv_plane_from_aspect(region->dstSubresource.aspectMask); + const uint32_t dst_mip_level = region->dstSubresource.mipLevel; - uint32_t dst_width = u_minify(dst->vk.extent.width, dst_mip_level); - uint32_t dst_height = u_minify(dst->vk.extent.height, dst_mip_level); + uint32_t dst_width = u_minify(dst->planes[dst_plane].width, dst_mip_level); + uint32_t dst_height = u_minify(dst->planes[dst_plane].height, dst_mip_level); if (region->extent.width != dst_width || region->extent.height != dst_height) return false; @@ -809,8 +1179,10 @@ copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, * members represent the texel dimensions of the source image and not * the destination." */ - const uint32_t block_w = vk_format_get_blockwidth(src->vk.format); - const uint32_t block_h = vk_format_get_blockheight(src->vk.format); + const uint32_t block_w = + vk_format_get_blockwidth(src->planes[src_plane].vk_format); + const uint32_t block_h = + vk_format_get_blockheight(src->planes[src_plane].vk_format); uint32_t width = DIV_ROUND_UP(region->extent.width, block_w); uint32_t height = DIV_ROUND_UP(region->extent.height, block_h); @@ -834,10 +1206,10 @@ copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, * the underlying pixel data according to its format, we can always choose * to use compatible formats that are supported with the TFU unit. */ - assert(dst->cpp == src->cpp); + assert(dst->planes[dst_plane].cpp == src->planes[src_plane].cpp); const struct v3dv_format *format = v3dv_get_compatible_tfu_format(cmd_buffer->device, - dst->cpp, NULL); + dst->planes[dst_plane].cpp, NULL); /* Emit a TFU job for each layer to blit */ const uint32_t layer_count = dst->vk.image_type != VK_IMAGE_TYPE_3D ? @@ -850,15 +1222,47 @@ copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, const uint32_t base_dst_layer = dst->vk.image_type != VK_IMAGE_TYPE_3D ? region->dstSubresource.baseArrayLayer : region->dstOffset.z; for (uint32_t i = 0; i < layer_count; i++) { - v3dv_X(cmd_buffer->device, meta_emit_tfu_job) - (cmd_buffer, dst, dst_mip_level, base_dst_layer + i, - src, src_mip_level, base_src_layer + i, - width, height, format); + const uint32_t dst_offset = + dst->planes[dst_plane].mem->bo->offset + + v3dv_layer_offset(dst, dst_mip_level, base_dst_layer + i, dst_plane); + const uint32_t src_offset = + src->planes[src_plane].mem->bo->offset + + v3dv_layer_offset(src, src_mip_level, base_src_layer + i, src_plane); + + const struct v3d_resource_slice *dst_slice = + &dst->planes[dst_plane].slices[dst_mip_level]; + const struct v3d_resource_slice *src_slice = + &src->planes[src_plane].slices[src_mip_level]; + + v3dv_X(cmd_buffer->device, meta_emit_tfu_job)( + cmd_buffer, + dst->planes[dst_plane].mem->bo->handle, + dst_offset, + dst_slice->tiling, + dst_slice->padded_height, + dst->planes[dst_plane].cpp, + src->planes[src_plane].mem->bo->handle, + src_offset, + src_slice->tiling, + src_slice->tiling == V3D_TILING_RASTER ? 
+ src_slice->stride : src_slice->padded_height, + src->planes[src_plane].cpp, + /* All compatible TFU formats are single-plane */ + width, height, &format->planes[0]); } return true; } +inline bool +v3dv_cmd_buffer_copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_image *dst, + struct v3dv_image *src, + const VkImageCopy2 *region) +{ + return copy_image_tfu(cmd_buffer, dst, src, region); +} + /** * Returns true if the implementation supports the requested operation (even if * it failed to process it, for example, due to an out-of-memory error). @@ -867,11 +1271,20 @@ static bool copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *dst, struct v3dv_image *src, - const VkImageCopy2KHR *region) + const VkImageCopy2 *region) { + uint8_t src_plane = + v3dv_plane_from_aspect(region->srcSubresource.aspectMask); + assert(src_plane < src->plane_count); + uint8_t dst_plane = + v3dv_plane_from_aspect(region->dstSubresource.aspectMask); + assert(dst_plane < dst->plane_count); + VkFormat fb_format; - if (!v3dv_meta_can_use_tlb(src, ®ion->srcOffset, &fb_format) || - !v3dv_meta_can_use_tlb(dst, ®ion->dstOffset, &fb_format)) { + if (!v3dv_meta_can_use_tlb(src, src_plane, region->srcSubresource.mipLevel, + ®ion->srcOffset, NULL, &fb_format) || + !v3dv_meta_can_use_tlb(dst, dst_plane, region->dstSubresource.mipLevel, + ®ion->dstOffset, ®ion->extent, &fb_format)) { return false; } @@ -881,7 +1294,8 @@ copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, * dstImage has a multi-planar image format then the aspectMask member * of srcSubresource and dstSubresource must match." */ - assert(region->dstSubresource.aspectMask == + assert(src->plane_count != 1 || dst->plane_count != 1 || + region->dstSubresource.aspectMask == region->srcSubresource.aspectMask); uint32_t internal_type, internal_bpp; v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects) @@ -911,12 +1325,15 @@ copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, return true; /* Handle copy to compressed image using compatible format */ - const uint32_t block_w = vk_format_get_blockwidth(dst->vk.format); - const uint32_t block_h = vk_format_get_blockheight(dst->vk.format); + const uint32_t block_w = + vk_format_get_blockwidth(dst->planes[dst_plane].vk_format); + const uint32_t block_h = + vk_format_get_blockheight(dst->planes[dst_plane].vk_format); const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w); const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h); - v3dv_job_start_frame(job, width, height, num_layers, false, 1, internal_bpp, + v3dv_job_start_frame(job, width, height, num_layers, false, true, 1, + internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), src->vk.samples > VK_SAMPLE_COUNT_1_BIT); struct v3dv_meta_framebuffer framebuffer; @@ -951,6 +1368,8 @@ create_image_alias(struct v3dv_cmd_buffer *cmd_buffer, VkFormat format) { assert(!vk_format_is_compressed(format)); + /* We don't support ycbcr compressed formats */ + assert(src->plane_count == 1); VkDevice _device = v3dv_device_to_handle(cmd_buffer->device); @@ -966,7 +1385,7 @@ create_image_alias(struct v3dv_cmd_buffer *cmd_buffer, .mipLevels = src->vk.mip_levels, .arrayLayers = src->vk.array_layers, .samples = src->vk.samples, - .tiling = src->vk.tiling, + .tiling = src->tiled ? 
VK_IMAGE_TILING_OPTIMAL : VK_IMAGE_TILING_LINEAR, .usage = src->vk.usage, }; @@ -979,8 +1398,8 @@ create_image_alias(struct v3dv_cmd_buffer *cmd_buffer, } struct v3dv_image *image = v3dv_image_from_handle(_image); - image->mem = src->mem; - image->mem_offset = src->mem_offset; + image->planes[0].mem = src->planes[0].mem; + image->planes[0].mem_offset = src->planes[0].mem_offset; return image; } @@ -992,12 +1411,26 @@ static bool copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *dst, struct v3dv_image *src, - const VkImageCopy2KHR *region) + const VkImageCopy2 *region) { - const uint32_t src_block_w = vk_format_get_blockwidth(src->vk.format); - const uint32_t src_block_h = vk_format_get_blockheight(src->vk.format); - const uint32_t dst_block_w = vk_format_get_blockwidth(dst->vk.format); - const uint32_t dst_block_h = vk_format_get_blockheight(dst->vk.format); + if (!src->tiled && src->vk.image_type != VK_IMAGE_TYPE_1D) + return false; + + uint8_t src_plane = + v3dv_plane_from_aspect(region->srcSubresource.aspectMask); + assert(src_plane < src->plane_count); + uint8_t dst_plane = + v3dv_plane_from_aspect(region->dstSubresource.aspectMask); + assert(dst_plane < dst->plane_count); + + const uint32_t src_block_w = + vk_format_get_blockwidth(src->planes[src_plane].vk_format); + const uint32_t src_block_h = + vk_format_get_blockheight(src->planes[src_plane].vk_format); + const uint32_t dst_block_w = + vk_format_get_blockwidth(dst->planes[dst_plane].vk_format); + const uint32_t dst_block_h = + vk_format_get_blockheight(dst->planes[dst_plane].vk_format); const float block_scale_w = (float)src_block_w / (float)dst_block_w; const float block_scale_h = (float)src_block_h / (float)dst_block_h; @@ -1033,10 +1466,10 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer, * divisors for the width and height depending on the source image's * bpp. */ - assert(src->cpp == dst->cpp); + assert(src->planes[src_plane].cpp == dst->planes[dst_plane].cpp); format = VK_FORMAT_R32G32_UINT; - switch (src->cpp) { + switch (src->planes[src_plane].cpp) { case 16: format = VK_FORMAT_R32G32B32A32_UINT; break; @@ -1061,13 +1494,15 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer, dst = create_image_alias(cmd_buffer, dst, dst_scale_w, dst_scale_h, format); } else { - format = src->format->rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO ? - src->vk.format : get_compatible_tlb_format(src->vk.format); + format = src->format->planes[src_plane].rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO ? + src->planes[src_plane].vk_format : + get_compatible_tlb_format(src->planes[src_plane].vk_format); if (format == VK_FORMAT_UNDEFINED) return false; const struct v3dv_format *f = v3dv_X(cmd_buffer->device, get_format)(format); - if (!f->supported || f->tex_type == TEXTURE_DATA_FORMAT_NO) + assert(f->plane_count < 2); + if (!f->plane_count || f->planes[0].tex_type == TEXTURE_DATA_FORMAT_NO) return false; } @@ -1090,14 +1525,21 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer, * (since the region dimensions are already specified in terms of the source * image). 
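 * (Editor's aside: the power-of-two rounding introduced below appears to
 * account for the smallest mips of compressed images, e.g. a 2x2 or 1x1
 * mip still occupies a full 4x4 block, where scaling the region by the
 * block size would otherwise undershoot the aliased uncompressed extent.
 * This rationale is inferred, not stated by the patch.)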
*/ + uint32_t region_width = region->extent.width * src_scale_w; + uint32_t region_height = region->extent.height * src_scale_h; + if (src_block_w > 1) + region_width = util_next_power_of_two(region_width); + if (src_block_h > 1) + region_height = util_next_power_of_two(region_height); + const VkOffset3D src_start = { region->srcOffset.x * src_scale_w, region->srcOffset.y * src_scale_h, region->srcOffset.z, }; const VkOffset3D src_end = { - src_start.x + region->extent.width * src_scale_w, - src_start.y + region->extent.height * src_scale_h, + src_start.x + region_width, + src_start.y + region_height, src_start.z + region->extent.depth, }; @@ -1107,13 +1549,13 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer, region->dstOffset.z, }; const VkOffset3D dst_end = { - dst_start.x + region->extent.width * src_scale_w, - dst_start.y + region->extent.height * src_scale_h, + dst_start.x + region_width, + dst_start.y + region_height, dst_start.z + region->extent.depth, }; - const VkImageBlit2KHR blit_region = { - .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2_KHR, + const VkImageBlit2 blit_region = { + .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2, .srcSubresource = region->srcSubresource, .srcOffsets = { src_start, src_end }, .dstSubresource = region->dstSubresource, @@ -1130,9 +1572,113 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer, return handled; } +static bool +copy_image_linear_texel_buffer(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_image *dst, + struct v3dv_image *src, + const VkImageCopy2 *region) +{ + if (src->tiled) + return false; + + /* Implementations are allowed to restrict linear images like this */ + assert(region->srcOffset.z == 0); + assert(region->dstOffset.z == 0); + assert(region->srcSubresource.mipLevel == 0); + assert(region->srcSubresource.baseArrayLayer == 0); + assert(region->srcSubresource.layerCount == 1); + assert(region->dstSubresource.mipLevel == 0); + assert(region->dstSubresource.baseArrayLayer == 0); + assert(region->dstSubresource.layerCount == 1); + + uint8_t src_plane = + v3dv_plane_from_aspect(region->srcSubresource.aspectMask); + uint8_t dst_plane = + v3dv_plane_from_aspect(region->dstSubresource.aspectMask); + + assert(src->planes[src_plane].cpp == dst->planes[dst_plane].cpp); + const uint32_t bpp = src->planes[src_plane].cpp; + + VkFormat format; + switch (bpp) { + case 16: + format = VK_FORMAT_R32G32B32A32_UINT; + break; + case 8: + format = VK_FORMAT_R16G16B16A16_UINT; + break; + case 4: + format = VK_FORMAT_R8G8B8A8_UINT; + break; + case 2: + format = VK_FORMAT_R16_UINT; + break; + case 1: + format = VK_FORMAT_R8_UINT; + break; + default: + unreachable("unsupported bit-size"); + return false; + } + + VkComponentMapping ident_swizzle = { + .r = VK_COMPONENT_SWIZZLE_IDENTITY, + .g = VK_COMPONENT_SWIZZLE_IDENTITY, + .b = VK_COMPONENT_SWIZZLE_IDENTITY, + .a = VK_COMPONENT_SWIZZLE_IDENTITY, + }; + + const uint32_t buf_stride = src->planes[src_plane].slices[0].stride; + const VkDeviceSize buf_offset = + region->srcOffset.y * buf_stride + region->srcOffset.x * bpp; + + struct v3dv_buffer src_buffer; + vk_object_base_init(&cmd_buffer->device->vk, &src_buffer.base, + VK_OBJECT_TYPE_BUFFER); + + const struct VkBufferCreateInfo buf_create_info = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .size = src->planes[src_plane].size, + .usage = VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + }; + v3dv_buffer_init(cmd_buffer->device, &buf_create_info, &src_buffer, + src->planes[src_plane].alignment); + + const 
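The power-of-two rounding added above applies only when the source is block-compressed (src_block_w/h > 1), plausibly so the aliased blit covers whole blocks even for NPOT mip extents; that rationale is inferred, not stated in the hunk. Standalone, with parameter types assumed and util_next_power_of_two() spelled out:

#include <assert.h>
#include <stdint.h>

static uint32_t
next_power_of_two(uint32_t v) /* stand-in for util_next_power_of_two() */
{
   assert(v > 0);
   v--;
   v |= v >> 1;
   v |= v >> 2;
   v |= v >> 4;
   v |= v >> 8;
   v |= v >> 16;
   return v + 1;
}

static void
scaled_blit_extent(uint32_t w, uint32_t h,
                   float src_scale_w, float src_scale_h,
                   uint32_t src_block_w, uint32_t src_block_h,
                   uint32_t *region_w, uint32_t *region_h)
{
   *region_w = (uint32_t)(w * src_scale_w);
   *region_h = (uint32_t)(h * src_scale_h);
   if (src_block_w > 1)
      *region_w = next_power_of_two(*region_w);
   if (src_block_h > 1)
      *region_h = next_power_of_two(*region_h);
}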
VkBindBufferMemoryInfo buf_bind_info = { + .sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO, + .buffer = v3dv_buffer_to_handle(&src_buffer), + .memory = v3dv_device_memory_to_handle(src->planes[src_plane].mem), + .memoryOffset = src->planes[src_plane].mem_offset + + v3dv_layer_offset(src, 0, 0, src_plane), + }; + v3dv_buffer_bind_memory(&buf_bind_info); + + const VkBufferImageCopy2 copy_region = { + .sType = VK_STRUCTURE_TYPE_BUFFER_IMAGE_COPY_2, + .pNext = NULL, + .bufferOffset = buf_offset, + .bufferRowLength = buf_stride / bpp, + .bufferImageHeight = src->vk.extent.height, + .imageSubresource = region->dstSubresource, + .imageOffset = region->dstOffset, + .imageExtent = region->extent, + }; + + return texel_buffer_shader_copy(cmd_buffer, + region->dstSubresource.aspectMask, + dst, + format, + format, + &src_buffer, + src->planes[src_plane].cpp, + 0 /* color mask: full */, &ident_swizzle, + 1, ©_region); +} + VKAPI_ATTR void VKAPI_CALL -v3dv_CmdCopyImage2KHR(VkCommandBuffer commandBuffer, - const VkCopyImageInfo2KHR *info) +v3dv_CmdCopyImage2(VkCommandBuffer commandBuffer, + const VkCopyImageInfo2 *info) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); @@ -1141,25 +1687,34 @@ v3dv_CmdCopyImage2KHR(VkCommandBuffer commandBuffer, assert(src->vk.samples == dst->vk.samples); + cmd_buffer->state.is_transfer = true; + for (uint32_t i = 0; i < info->regionCount; i++) { - if (copy_image_tfu(cmd_buffer, dst, src, &info->pRegions[i])) + const VkImageCopy2 *region = &info->pRegions[i]; + if (copy_image_tfu(cmd_buffer, dst, src, region)) + continue; + if (copy_image_tlb(cmd_buffer, dst, src, region)) continue; - if (copy_image_tlb(cmd_buffer, dst, src, &info->pRegions[i])) + if (copy_image_blit(cmd_buffer, dst, src, region)) continue; - if (copy_image_blit(cmd_buffer, dst, src, &info->pRegions[i])) + if (copy_image_linear_texel_buffer(cmd_buffer, dst, src, region)) continue; unreachable("Image copy not supported"); } + + cmd_buffer->state.is_transfer = false; } VKAPI_ATTR void VKAPI_CALL -v3dv_CmdCopyBuffer2KHR(VkCommandBuffer commandBuffer, - const VkCopyBufferInfo2KHR *pCopyBufferInfo) +v3dv_CmdCopyBuffer2(VkCommandBuffer commandBuffer, + const VkCopyBufferInfo2 *pCopyBufferInfo) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); V3DV_FROM_HANDLE(v3dv_buffer, src_buffer, pCopyBufferInfo->srcBuffer); V3DV_FROM_HANDLE(v3dv_buffer, dst_buffer, pCopyBufferInfo->dstBuffer); + cmd_buffer->state.is_transfer = true; + for (uint32_t i = 0; i < pCopyBufferInfo->regionCount; i++) { v3dv_X(cmd_buffer->device, meta_copy_buffer) (cmd_buffer, @@ -1167,6 +1722,8 @@ v3dv_CmdCopyBuffer2KHR(VkCommandBuffer commandBuffer, src_buffer->mem->bo, src_buffer->mem_offset, &pCopyBufferInfo->pRegions[i]); } + + cmd_buffer->state.is_transfer = false; } static void @@ -1202,12 +1759,14 @@ v3dv_CmdUpdateBuffer(VkCommandBuffer commandBuffer, return; } + cmd_buffer->state.is_transfer = true; + memcpy(src_bo->map, pData, dataSize); v3dv_bo_unmap(cmd_buffer->device, src_bo); - VkBufferCopy2KHR region = { - .sType = VK_STRUCTURE_TYPE_BUFFER_COPY_2_KHR, + VkBufferCopy2 region = { + .sType = VK_STRUCTURE_TYPE_BUFFER_COPY_2, .srcOffset = 0, .dstOffset = dstOffset, .size = dataSize, @@ -1217,11 +1776,12 @@ v3dv_CmdUpdateBuffer(VkCommandBuffer commandBuffer, (cmd_buffer, dst_buffer->mem->bo, dst_buffer->mem_offset, src_bo, 0, ®ion); - if (!copy_job) - return; + if (copy_job) { + v3dv_cmd_buffer_add_private_obj( + cmd_buffer, (uint64_t)(uintptr_t)src_bo, destroy_update_buffer_cb); + } - 
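The addressing behind copy_image_linear_texel_buffer() is worth spelling out: the linear source plane is rebound as a buffer, the copy origin becomes a plain byte offset, and the row pitch is handed to VkBufferImageCopy2 in texels. In isolation:

#include <stdint.h>

static uint64_t
linear_copy_origin_bytes(uint32_t stride_bytes, uint32_t cpp,
                         int32_t x, int32_t y)
{
   /* Matches buf_offset = y * stride + x * bpp above. */
   return (uint64_t)y * stride_bytes + (uint64_t)x * cpp;
}

static uint32_t
linear_row_length_texels(uint32_t stride_bytes, uint32_t cpp)
{
   /* Becomes VkBufferImageCopy2::bufferRowLength. */
   return stride_bytes / cpp;
}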
v3dv_cmd_buffer_add_private_obj( - cmd_buffer, (uint64_t)(uintptr_t)src_bo, destroy_update_buffer_cb); + cmd_buffer->state.is_transfer = false; } VKAPI_ATTR void VKAPI_CALL @@ -1234,6 +1794,8 @@ v3dv_CmdFillBuffer(VkCommandBuffer commandBuffer, V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); V3DV_FROM_HANDLE(v3dv_buffer, dst_buffer, dstBuffer); + cmd_buffer->state.is_transfer = true; + struct v3dv_bo *bo = dst_buffer->mem->bo; /* From the Vulkan spec: @@ -1248,6 +1810,8 @@ v3dv_CmdFillBuffer(VkCommandBuffer commandBuffer, v3dv_X(cmd_buffer->device, meta_fill_buffer) (cmd_buffer, bo, dstOffset, size, data); + + cmd_buffer->state.is_transfer = false; } /** @@ -1258,19 +1822,24 @@ static bool copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *image, struct v3dv_buffer *buffer, - const VkBufferImageCopy2KHR *region) + const VkBufferImageCopy2 *region) { + if (V3D_DBG(DISABLE_TFU)) { + perf_debug("Copy buffer to image: TFU disabled, fallbacks could be slower.\n"); + return false; + } + assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT); /* Destination can't be raster format */ - if (image->vk.tiling == VK_IMAGE_TILING_LINEAR) + if (!image->tiled) return false; /* We can't copy D24S8 because buffer to image copies only copy one aspect * at a time, and the TFU copies full images. Also, V3D depth bits for * both D24S8 and D24X8 stored in the 24-bit MSB of each 32-bit word, but * the Vulkan spec has the buffer data specified the other way around, so it - * is not a straight copy, we would havew to swizzle the channels, which the + * is not a straight copy, we would have to swizzle the channels, which the * TFU can't do. */ if (image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT || @@ -1295,12 +1864,20 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, else height = region->bufferImageHeight; - if (width != image->vk.extent.width || height != image->vk.extent.height) + const uint8_t plane = + v3dv_plane_from_aspect(region->imageSubresource.aspectMask); + + const uint32_t mip_level = region->imageSubresource.mipLevel; + const struct v3d_resource_slice *slice = &image->planes[plane].slices[mip_level]; + + if (width != slice->width || height != slice->height) return false; /* Handle region semantics for compressed images */ - const uint32_t block_w = vk_format_get_blockwidth(image->vk.format); - const uint32_t block_h = vk_format_get_blockheight(image->vk.format); + const uint32_t block_w = + vk_format_get_blockwidth(image->planes[plane].vk_format); + const uint32_t block_h = + vk_format_get_blockheight(image->planes[plane].vk_format); width = DIV_ROUND_UP(width, block_w); height = DIV_ROUND_UP(height, block_h); @@ -1311,10 +1888,10 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, */ const struct v3dv_format *format = v3dv_get_compatible_tfu_format(cmd_buffer->device, - image->cpp, NULL); - - const uint32_t mip_level = region->imageSubresource.mipLevel; - const struct v3d_resource_slice *slice = &image->slices[mip_level]; + image->planes[plane].cpp, NULL); + /* We only use single-plane formats with the TFU */ + assert(format->plane_count == 1); + const struct v3dv_format_plane *format_plane = &format->planes[0]; uint32_t num_layers; if (image->vk.image_type != VK_IMAGE_TYPE_3D) @@ -1323,14 +1900,14 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, num_layers = region->imageExtent.depth; assert(num_layers > 0); - assert(image->mem && image->mem->bo); - const struct v3dv_bo *dst_bo = image->mem->bo; + 
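To make the D24 limitation above concrete: for the depth aspect of D24_UNORM_S8_UINT / X8_D24_UNORM_PACK32, Vulkan buffer words carry depth in the 24 LSBs while V3D stores it in the 24 MSBs, so a raster TFU copy would land the bits in the wrong place. The per-word fix-up that the shader paths express as a swizzle is, byte-wise (a sketch; the top source byte is unused per the buffer copy layout):

#include <stdint.h>

static uint32_t
d24_buffer_word_to_v3d(uint32_t w)
{
   return (w & 0x00ffffffu) << 8;
}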
assert(image->planes[plane].mem && image->planes[plane].mem->bo); + const struct v3dv_bo *dst_bo = image->planes[plane].mem->bo; assert(buffer->mem && buffer->mem->bo); const struct v3dv_bo *src_bo = buffer->mem->bo; /* Emit a TFU job per layer to copy */ - const uint32_t buffer_stride = width * image->cpp; + const uint32_t buffer_stride = width * image->planes[plane].cpp; for (int i = 0; i < num_layers; i++) { uint32_t layer; if (image->vk.image_type != VK_IMAGE_TYPE_3D) @@ -1338,46 +1915,27 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, else layer = region->imageOffset.z + i; - struct drm_v3d_submit_tfu tfu = { - .ios = (height << 16) | width, - .bo_handles = { - dst_bo->handle, - src_bo->handle != dst_bo->handle ? src_bo->handle : 0 - }, - }; - const uint32_t buffer_offset = buffer->mem_offset + region->bufferOffset + height * buffer_stride * i; - const uint32_t src_offset = src_bo->offset + buffer_offset; - tfu.iia |= src_offset; - tfu.icfg |= V3D_TFU_ICFG_FORMAT_RASTER << V3D_TFU_ICFG_FORMAT_SHIFT; - tfu.iis |= width; const uint32_t dst_offset = - dst_bo->offset + v3dv_layer_offset(image, mip_level, layer); - tfu.ioa |= dst_offset; - - tfu.ioa |= (V3D_TFU_IOA_FORMAT_LINEARTILE + - (slice->tiling - V3D_TILING_LINEARTILE)) << - V3D_TFU_IOA_FORMAT_SHIFT; - tfu.icfg |= format->tex_type << V3D_TFU_ICFG_TTYPE_SHIFT; - - /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the - * OPAD field for the destination (how many extra UIF blocks beyond - * those necessary to cover the height). - */ - if (slice->tiling == V3D_TILING_UIF_NO_XOR || - slice->tiling == V3D_TILING_UIF_XOR) { - uint32_t uif_block_h = 2 * v3d_utile_height(image->cpp); - uint32_t implicit_padded_height = align(height, uif_block_h); - uint32_t icfg = - (slice->padded_height - implicit_padded_height) / uif_block_h; - tfu.icfg |= icfg << V3D_TFU_ICFG_OPAD_SHIFT; - } - - v3dv_cmd_buffer_add_tfu_job(cmd_buffer, &tfu); + dst_bo->offset + v3dv_layer_offset(image, mip_level, layer, plane); + + v3dv_X(cmd_buffer->device, meta_emit_tfu_job)( + cmd_buffer, + dst_bo->handle, + dst_offset, + slice->tiling, + slice->padded_height, + image->planes[plane].cpp, + src_bo->handle, + src_offset, + V3D_TILING_RASTER, + width, + 1, + width, height, format_plane); } return true; @@ -1391,11 +1949,17 @@ static bool copy_buffer_to_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *image, struct v3dv_buffer *buffer, - const VkBufferImageCopy2KHR *region) + const VkBufferImageCopy2 *region) { VkFormat fb_format; - if (!v3dv_meta_can_use_tlb(image, ®ion->imageOffset, &fb_format)) + uint8_t plane = v3dv_plane_from_aspect(region->imageSubresource.aspectMask); + assert(plane < image->plane_count); + + if (!v3dv_meta_can_use_tlb(image, plane, region->imageSubresource.mipLevel, + ®ion->imageOffset, ®ion->imageExtent, + &fb_format)) { return false; + } uint32_t internal_type, internal_bpp; v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects) @@ -1415,13 +1979,16 @@ copy_buffer_to_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, return true; /* Handle copy to compressed format using a compatible format */ - const uint32_t block_w = vk_format_get_blockwidth(image->vk.format); - const uint32_t block_h = vk_format_get_blockheight(image->vk.format); + const uint32_t block_w = + vk_format_get_blockwidth(image->planes[plane].vk_format); + const uint32_t block_h = + vk_format_get_blockheight(image->planes[plane].vk_format); const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w); const uint32_t 
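The open-coded TFU packing removed above now lives in meta_emit_tfu_job, but the OPAD computation it performed remains a useful reference: for UIF destinations, OPAD is the number of extra UIF blocks per column beyond what the image height implies. As a standalone sketch with align() spelled out and the utile height taken as a parameter (the driver derives it from cpp):

#include <stdint.h>

static uint32_t
align_u32(uint32_t v, uint32_t a)
{
   return (v + a - 1) / a * a;
}

static uint32_t
uif_opad(uint32_t height, uint32_t padded_height, uint32_t utile_height)
{
   const uint32_t uif_block_h = 2 * utile_height;
   const uint32_t implicit_padded_height = align_u32(height, uif_block_h);
   return (padded_height - implicit_padded_height) / uif_block_h;
}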
height = DIV_ROUND_UP(region->imageExtent.height, block_h); - v3dv_job_start_frame(job, width, height, num_layers, false, - 1, internal_bpp, false); + v3dv_job_start_frame(job, width, height, num_layers, false, true, 1, + internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), + false); struct v3dv_meta_framebuffer framebuffer; v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format, @@ -1440,7 +2007,7 @@ static bool create_tiled_image_from_buffer(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *image, struct v3dv_buffer *buffer, - const VkBufferImageCopy2KHR *region) + const VkBufferImageCopy2 *region) { if (copy_buffer_to_image_tfu(cmd_buffer, image, buffer, region)) return true; @@ -1569,8 +2136,6 @@ create_blit_render_pass(struct v3dv_device *device, VkRenderPass *pass_load, VkRenderPass *pass_no_load); -static nir_ssa_def *gen_rect_vertices(nir_builder *b); - static bool create_pipeline(struct v3dv_device *device, struct v3dv_render_pass *pass, @@ -1595,7 +2160,7 @@ get_texel_buffer_copy_vs() glsl_vec4_type(), "gl_Position"); vs_out_pos->data.location = VARYING_SLOT_POS; - nir_ssa_def *pos = gen_rect_vertices(&b); + nir_def *pos = nir_gen_rect_vertices(&b, NULL, NULL); nir_store_var(&b, vs_out_pos, pos, 0xf); return b.shader; @@ -1618,8 +2183,8 @@ get_texel_buffer_copy_gs() nir->info.inputs_read = 1ull << VARYING_SLOT_POS; nir->info.outputs_written = (1ull << VARYING_SLOT_POS) | (1ull << VARYING_SLOT_LAYER); - nir->info.gs.input_primitive = GL_TRIANGLES; - nir->info.gs.output_primitive = GL_TRIANGLE_STRIP; + nir->info.gs.input_primitive = MESA_PRIM_TRIANGLES; + nir->info.gs.output_primitive = MESA_PRIM_TRIANGLE_STRIP; nir->info.gs.vertices_in = 3; nir->info.gs.vertices_out = 3; nir->info.gs.invocations = 1; @@ -1652,7 +2217,7 @@ get_texel_buffer_copy_gs() nir_copy_deref(&b, nir_build_deref_var(&b, gs_out_pos), in_pos_i); /* gl_Layer from push constants */ - nir_ssa_def *layer = + nir_def *layer = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = TEXEL_BUFFER_COPY_GS_LAYER_PC_OFFSET, .range = 4); @@ -1666,7 +2231,7 @@ get_texel_buffer_copy_gs() return nir; } -static nir_ssa_def * +static nir_def * load_frag_coord(nir_builder *b) { nir_foreach_shader_in_variable(var, b->shader) { @@ -1730,24 +2295,24 @@ get_texel_buffer_copy_fs(struct v3dv_device *device, VkFormat format, /* Load the box describing the pixel region we want to copy from the * texel buffer. */ - nir_ssa_def *box = + nir_def *box = nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = TEXEL_BUFFER_COPY_FS_BOX_PC_OFFSET, .range = 16); /* Load the buffer stride (this comes in texel units) */ - nir_ssa_def *stride = + nir_def *stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = TEXEL_BUFFER_COPY_FS_STRIDE_PC_OFFSET, .range = 4); /* Load the buffer offset (this comes in texel units) */ - nir_ssa_def *offset = + nir_def *offset = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = TEXEL_BUFFER_COPY_FS_OFFSET_PC_OFFSET, .range = 4); - nir_ssa_def *coord = nir_f2i32(&b, load_frag_coord(&b)); + nir_def *coord = nir_f2i32(&b, load_frag_coord(&b)); /* Load pixel data from texel buffer based on the x,y offset of the pixel * within the box. Texel buffers are 1D arrays of texels. @@ -1757,28 +2322,26 @@ get_texel_buffer_copy_fs(struct v3dv_device *device, VkFormat format, * texel buffer should always be within its bounds and we we don't need * to add a check for that here. 
*/ - nir_ssa_def *x_offset = + nir_def *x_offset = nir_isub(&b, nir_channel(&b, coord, 0), nir_channel(&b, box, 0)); - nir_ssa_def *y_offset = + nir_def *y_offset = nir_isub(&b, nir_channel(&b, coord, 1), nir_channel(&b, box, 1)); - nir_ssa_def *texel_offset = + nir_def *texel_offset = nir_iadd(&b, nir_iadd(&b, offset, x_offset), nir_imul(&b, y_offset, stride)); - nir_ssa_def *tex_deref = &nir_build_deref_var(&b, sampler)->dest.ssa; + nir_def *tex_deref = &nir_build_deref_var(&b, sampler)->def; nir_tex_instr *tex = nir_tex_instr_create(b.shader, 2); tex->sampler_dim = GLSL_SAMPLER_DIM_BUF; tex->op = nir_texop_txf; - tex->src[0].src_type = nir_tex_src_coord; - tex->src[0].src = nir_src_for_ssa(texel_offset); - tex->src[1].src_type = nir_tex_src_texture_deref; - tex->src[1].src = nir_src_for_ssa(tex_deref); + tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, texel_offset); + tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_texture_deref, tex_deref); tex->dest_type = nir_type_uint32; tex->is_array = false; tex->coord_components = 1; - nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "texel buffer result"); + nir_def_init(&tex->instr, &tex->def, 4, 32); nir_builder_instr_insert(&b, &tex->instr); uint32_t swiz[4]; @@ -1790,7 +2353,7 @@ get_texel_buffer_copy_fs(struct v3dv_device *device, VkFormat format, component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_B, cswizzle->b); swiz[3] = component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_A, cswizzle->a); - nir_ssa_def *s = nir_swizzle(&b, &tex->dest.ssa, swiz, 4); + nir_def *s = nir_swizzle(&b, &tex->def, swiz, 4); nir_store_var(&b, fs_out_color, s, 0xf); return b.shader; @@ -1876,7 +2439,7 @@ get_copy_texel_buffer_pipeline( mtx_lock(&device->meta.mtx); struct hash_entry *entry = _mesa_hash_table_search(device->meta.texel_buffer_copy.cache[image_type], - &key); + key); if (entry) { mtx_unlock(&device->meta.mtx); *pipeline = entry->data; @@ -1905,8 +2468,10 @@ get_copy_texel_buffer_pipeline( if (!ok) goto fail; + uint8_t *dupkey = malloc(V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE); + memcpy(dupkey, key, V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE); _mesa_hash_table_insert(device->meta.texel_buffer_copy.cache[image_type], - &key, *pipeline); + dupkey, *pipeline); mtx_unlock(&device->meta.mtx); return true; @@ -1938,7 +2503,7 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer, VkColorComponentFlags cmask, VkComponentMapping *cswizzle, uint32_t region_count, - const VkBufferImageCopy2KHR *regions) + const VkBufferImageCopy2 *regions) { VkResult result; bool handled = false; @@ -1957,7 +2522,7 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer, /* We only handle color copies. Callers can copy D/S aspects by using * a compatible color format and maybe a cmask/cswizzle for D24 formats. */ - if (aspect != VK_IMAGE_ASPECT_COLOR_BIT) + if (!vk_format_is_color(dst_format) || !vk_format_is_color(src_format)) return handled; /* FIXME: we only handle uncompressed images for now. 
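The fragment shader above reduces each destination pixel to 1D indexing into the texel buffer. A CPU rendering of the same copy, everything in texel units:

#include <stdint.h>

static void
copy_box_from_texel_buffer(uint32_t *dst, uint32_t dst_pitch,
                           const uint32_t *texels, uint32_t stride,
                           uint32_t offset,
                           uint32_t box_x, uint32_t box_y,
                           uint32_t box_w, uint32_t box_h)
{
   for (uint32_t y = box_y; y < box_y + box_h; y++) {
      for (uint32_t x = box_x; x < box_x + box_w; x++) {
         /* texel_offset = offset + x_offset + y_offset * stride */
         uint32_t src = offset + (x - box_x) + (y - box_y) * stride;
         dst[y * dst_pitch + x] = texels[src];
      }
   }
}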
*/ @@ -1978,7 +2543,7 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer, if (!(buffer->usage & VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT)) { if (v3dv_buffer_format_supports_features( cmd_buffer->device, src_format, - VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT)) { + VK_FORMAT_FEATURE_2_UNIFORM_TEXEL_BUFFER_BIT)) { buffer->usage |= VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT; } else { return handled; @@ -2027,13 +2592,10 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer, if (result != VK_SUCCESS) return handled; - /* FIXME: for some reason passing region->bufferOffset here for the - * offset field doesn't work, making the following CTS tests fail: - * - * dEQP-VK.api.copy_and_blit.core.buffer_to_image.*buffer_offset* - * - * So instead we pass 0 here and we pass the offset in texels as a push - * constant to the shader, which seems to work correctly. + /* We can't pass region->bufferOffset here for the offset field because + * the texture base pointer in the texture shader state must be a 64-byte + * aligned value. Instead, we use 0 here and we pass the offset in texels + * as a push constant to the shader. */ VkDevice _device = v3dv_device_to_handle(cmd_buffer->device); VkBufferViewCreateInfo buffer_view_info = { @@ -2068,7 +2630,6 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer, /* Push command buffer state before starting meta operation */ v3dv_cmd_buffer_meta_state_push(cmd_buffer, true); - uint32_t dirty_dynamic_state = 0; /* Bind common state for all layers and regions */ VkCommandBuffer _cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer); @@ -2087,8 +2648,10 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer, * For 3D images, this creates a layered framebuffer with a number of * layers matching the depth extent of the 3D image. 
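The reworked comment above (buffer view at offset 0, offset passed in texels via push constant) works because any Vulkan-legal bufferOffset for these copies is texel-aligned even when it is not 64-byte aligned, so the byte-to-texel conversion is exact:

#include <assert.h>
#include <stdint.h>

static uint32_t
offset_in_texels(uint64_t buffer_offset_bytes, uint32_t cpp)
{
   assert(buffer_offset_bytes % cpp == 0);
   return (uint32_t)(buffer_offset_bytes / cpp);
}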
*/ - uint32_t fb_width = u_minify(image->vk.extent.width, resource->mipLevel); - uint32_t fb_height = u_minify(image->vk.extent.height, resource->mipLevel); + uint8_t plane = v3dv_plane_from_aspect(aspect); + uint32_t fb_width = u_minify(image->planes[plane].width, resource->mipLevel); + uint32_t fb_height = u_minify(image->planes[plane].height, resource->mipLevel); + VkImageViewCreateInfo image_view_info = { .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, .image = v3dv_image_to_handle(image), @@ -2103,8 +2666,8 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer, }, }; VkImageView image_view; - result = v3dv_CreateImageView(_device, &image_view_info, - &cmd_buffer->device->vk.alloc, &image_view); + result = v3dv_create_image_view(cmd_buffer->device, + &image_view_info, &image_view); if (result != VK_SUCCESS) goto fail; @@ -2173,7 +2736,12 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer, .clearValueCount = 0, }; - v3dv_CmdBeginRenderPass(_cmd_buffer, &rp_info, VK_SUBPASS_CONTENTS_INLINE); + VkSubpassBeginInfo sp_info = { + .sType = VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO, + .contents = VK_SUBPASS_CONTENTS_INLINE, + }; + + v3dv_CmdBeginRenderPass2(_cmd_buffer, &rp_info, &sp_info); struct v3dv_job *job = cmd_buffer->state.job; if (!job) goto fail; @@ -2190,9 +2758,8 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer, } /* For each region */ - dirty_dynamic_state = V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR; for (uint32_t r = 0; r < region_count; r++) { - const VkBufferImageCopy2KHR *region = ®ions[r]; + const VkBufferImageCopy2 *region = ®ions[r]; /* Obtain the 2D buffer region spec */ uint32_t buf_width, buf_height; @@ -2240,11 +2807,15 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer, v3dv_CmdDraw(_cmd_buffer, 4, 1, 0, 0); } /* For each region */ - v3dv_CmdEndRenderPass(_cmd_buffer); + VkSubpassEndInfo sp_end_info = { + .sType = VK_STRUCTURE_TYPE_SUBPASS_END_INFO, + }; + + v3dv_CmdEndRenderPass2(_cmd_buffer, &sp_end_info); } /* For each layer */ fail: - v3dv_cmd_buffer_meta_state_pop(cmd_buffer, dirty_dynamic_state, true); + v3dv_cmd_buffer_meta_state_pop(cmd_buffer, true); return handled; } @@ -2263,7 +2834,7 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer, VkColorComponentFlags cmask, VkComponentMapping *cswizzle, uint32_t region_count, - const VkBufferImageCopy2KHR *regions) + const VkBufferImageCopy2 *regions) { /* Since we can't sample linear images we need to upload the linear * buffer to a tiled image that we can use as a blit source, which @@ -2338,14 +2909,19 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer, */ assert(num_layers == 1 || region_count == 1); - const uint32_t block_width = vk_format_get_blockwidth(image->vk.format); - const uint32_t block_height = vk_format_get_blockheight(image->vk.format); + uint8_t plane = v3dv_plane_from_aspect(aspect); + assert(plane < image->plane_count); + + const uint32_t block_width = + vk_format_get_blockwidth(image->planes[plane].vk_format); + const uint32_t block_height = + vk_format_get_blockheight(image->planes[plane].vk_format); /* Copy regions by uploading each region to a temporary tiled image using * the memory we have just allocated as storage. 
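u_minify(), now applied to the plane dimensions rather than the image extent when sizing the per-level framebuffer, is the usual mip reduction (halve per level, clamp to 1):

#include <stdint.h>

static uint32_t
minify(uint32_t base, uint32_t level) /* mirrors u_minify() */
{
   uint32_t v = base >> level;
   return v > 0 ? v : 1;
}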
*/ for (uint32_t r = 0; r < region_count; r++) { - const VkBufferImageCopy2KHR *region = ®ions[r]; + const VkBufferImageCopy2 *region = ®ions[r]; /* Obtain the 2D buffer region spec */ uint32_t buf_width, buf_height; @@ -2396,16 +2972,23 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer, if (result != VK_SUCCESS) return handled; + /* When copying a multi-plane image the aspect indicates the plane to + * copy. For these, we only copy one plane at a time, which is always + * a color plane. + */ + VkImageAspectFlags copy_aspect = + image->plane_count == 1 ? aspect : VK_IMAGE_ASPECT_COLOR_BIT; + /* Upload buffer contents for the selected layer */ const VkDeviceSize buf_offset_bytes = region->bufferOffset + i * buf_height * buf_width * buffer_bpp; - const VkBufferImageCopy2KHR buffer_image_copy = { - .sType = VK_STRUCTURE_TYPE_BUFFER_IMAGE_COPY_2_KHR, + const VkBufferImageCopy2 buffer_image_copy = { + .sType = VK_STRUCTURE_TYPE_BUFFER_IMAGE_COPY_2, .bufferOffset = buf_offset_bytes, .bufferRowLength = region->bufferRowLength / block_width, .bufferImageHeight = region->bufferImageHeight / block_height, .imageSubresource = { - .aspectMask = aspect, + .aspectMask = copy_aspect, .mipLevel = 0, .baseArrayLayer = 0, .layerCount = 1, @@ -2434,10 +3017,10 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer, * image, but that we need to blit to a S8D24 destination (the only * stencil format we support). */ - const VkImageBlit2KHR blit_region = { - .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2_KHR, + const VkImageBlit2 blit_region = { + .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2, .srcSubresource = { - .aspectMask = aspect, + .aspectMask = copy_aspect, .mipLevel = 0, .baseArrayLayer = 0, .layerCount = 1, @@ -2493,7 +3076,7 @@ copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *image, struct v3dv_buffer *buffer, uint32_t region_count, - const VkBufferImageCopy2KHR *regions, + const VkBufferImageCopy2 *regions, bool use_texel_buffer) { /* We can only call this with region_count > 1 if we can batch the regions @@ -2501,12 +3084,20 @@ copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer, * the same aspect. */ VkImageAspectFlags aspect = regions[0].imageSubresource.aspectMask; + const VkImageAspectFlagBits any_plane_aspect = + VK_IMAGE_ASPECT_PLANE_0_BIT | + VK_IMAGE_ASPECT_PLANE_1_BIT | + VK_IMAGE_ASPECT_PLANE_2_BIT; + + bool is_plane_aspect = aspect & any_plane_aspect; /* Generally, the bpp of the data in the buffer matches that of the * destination image. The exception is the case where we are uploading * stencil (8bpp) to a combined d24s8 image (32bpp). */ - uint32_t buf_bpp = image->cpp; + uint8_t plane = v3dv_plane_from_aspect(aspect); + assert(plane < image->plane_count); + uint32_t buf_bpp = image->planes[plane].cpp; /* We are about to upload the buffer data to an image so we can then * blit that to our destination region. 
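The per-layer source offset in the upload loop above assumes layers are packed back to back as buf_width x buf_height texels of buffer_bpp bytes, matching buf_offset_bytes in the hunk:

#include <stdint.h>

static uint64_t
layer_buffer_offset(uint64_t region_offset, uint32_t layer,
                    uint32_t buf_width, uint32_t buf_height,
                    uint32_t buffer_bpp)
{
   return region_offset +
          (uint64_t)layer * buf_height * buf_width * buffer_bpp;
}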
Because we are going to implement @@ -2539,6 +3130,9 @@ copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer, case 4: switch (aspect) { case VK_IMAGE_ASPECT_COLOR_BIT: + case VK_IMAGE_ASPECT_PLANE_0_BIT: + case VK_IMAGE_ASPECT_PLANE_1_BIT: + case VK_IMAGE_ASPECT_PLANE_2_BIT: src_format = VK_FORMAT_R8G8B8A8_UINT; dst_format = src_format; break; @@ -2548,7 +3142,6 @@ copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer, image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32); src_format = VK_FORMAT_R8G8B8A8_UINT; dst_format = src_format; - aspect = VK_IMAGE_ASPECT_COLOR_BIT; /* For D24 formats, the Vulkan spec states that the depth component * in the buffer is stored in the 24-LSB, but V3D wants it in the @@ -2578,7 +3171,6 @@ copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer, src_format = VK_FORMAT_R8_UINT; dst_format = VK_FORMAT_R8G8B8A8_UINT; cmask = VK_COLOR_COMPONENT_R_BIT; - aspect = VK_IMAGE_ASPECT_COLOR_BIT; break; default: unreachable("unsupported aspect"); @@ -2586,12 +3178,14 @@ copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer, }; break; case 2: - aspect = VK_IMAGE_ASPECT_COLOR_BIT; + assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT || + aspect == VK_IMAGE_ASPECT_DEPTH_BIT || + is_plane_aspect); src_format = VK_FORMAT_R16_UINT; dst_format = src_format; break; case 1: - assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT); + assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT || is_plane_aspect); src_format = VK_FORMAT_R8_UINT; dst_format = src_format; break; @@ -2615,75 +3209,9 @@ copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer, } } -/** - * Returns true if the implementation supports the requested operation (even if - * it failed to process it, for example, due to an out-of-memory error). - */ -static bool -copy_buffer_to_image_cpu(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_image *image, - struct v3dv_buffer *buffer, - const VkBufferImageCopy2KHR *region) -{ - /* FIXME */ - if (vk_format_is_depth_or_stencil(image->vk.format)) - return false; - - if (vk_format_is_compressed(image->vk.format)) - return false; - - if (image->vk.tiling == VK_IMAGE_TILING_LINEAR) - return false; - - uint32_t buffer_width, buffer_height; - if (region->bufferRowLength == 0) - buffer_width = region->imageExtent.width; - else - buffer_width = region->bufferRowLength; - - if (region->bufferImageHeight == 0) - buffer_height = region->imageExtent.height; - else - buffer_height = region->bufferImageHeight; - - uint32_t buffer_stride = buffer_width * image->cpp; - uint32_t buffer_layer_stride = buffer_stride * buffer_height; - - uint32_t num_layers; - if (image->vk.image_type != VK_IMAGE_TYPE_3D) - num_layers = region->imageSubresource.layerCount; - else - num_layers = region->imageExtent.depth; - assert(num_layers > 0); - - struct v3dv_job *job = - v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, - V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE, - cmd_buffer, -1); - if (!job) - return true; - - job->cpu.copy_buffer_to_image.image = image; - job->cpu.copy_buffer_to_image.buffer = buffer; - job->cpu.copy_buffer_to_image.buffer_stride = buffer_stride; - job->cpu.copy_buffer_to_image.buffer_layer_stride = buffer_layer_stride; - job->cpu.copy_buffer_to_image.buffer_offset = region->bufferOffset; - job->cpu.copy_buffer_to_image.image_extent = region->imageExtent; - job->cpu.copy_buffer_to_image.image_offset = region->imageOffset; - job->cpu.copy_buffer_to_image.mip_level = - region->imageSubresource.mipLevel; - job->cpu.copy_buffer_to_image.base_layer = - 
region->imageSubresource.baseArrayLayer; - job->cpu.copy_buffer_to_image.layer_count = num_layers; - - list_addtail(&job->list_link, &cmd_buffer->jobs); - - return true; -} - VKAPI_ATTR void VKAPI_CALL -v3dv_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer, - const VkCopyBufferToImageInfo2KHR *info) +v3dv_CmdCopyBufferToImage2(VkCommandBuffer commandBuffer, + const VkCopyBufferToImageInfo2 *info) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); V3DV_FROM_HANDLE(v3dv_buffer, buffer, info->srcBuffer); @@ -2691,6 +3219,8 @@ v3dv_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer, assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT); + cmd_buffer->state.is_transfer = true; + uint32_t r = 0; while (r < info->regionCount) { /* The TFU and TLB paths can only copy one region at a time and the region @@ -2739,12 +3269,6 @@ v3dv_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer, * slow it might not be worth it and we should instead put more effort * in handling more cases with the other paths. */ - if (copy_buffer_to_image_cpu(cmd_buffer, image, buffer, - &info->pRegions[r])) { - batch_size = 1; - goto handled; - } - if (copy_buffer_to_image_shader(cmd_buffer, image, buffer, batch_size, &info->pRegions[r], false)) { goto handled; @@ -2755,6 +3279,8 @@ v3dv_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer, handled: r += batch_size; } + + cmd_buffer->state.is_transfer = false; } static void @@ -2773,17 +3299,31 @@ static bool blit_tfu(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *dst, struct v3dv_image *src, - const VkImageBlit2KHR *region) + const VkImageBlit2 *region) { + if (V3D_DBG(DISABLE_TFU)) { + perf_debug("Blit: TFU disabled, fallbacks could be slower."); + return false; + } + assert(dst->vk.samples == VK_SAMPLE_COUNT_1_BIT); assert(src->vk.samples == VK_SAMPLE_COUNT_1_BIT); + /* From vkCmdBlitImage: + * "srcImage must not use a format that requires a sampler YCBCR + * conversion" + * "dstImage must not use a format that requires a sampler YCBCR + * conversion" + */ + assert(dst->plane_count == 1); + assert(src->plane_count == 1); + /* Format must match */ if (src->vk.format != dst->vk.format) return false; /* Destination can't be raster format */ - if (dst->vk.tiling == VK_IMAGE_TILING_LINEAR) + if (!dst->tiled) return false; /* Source region must start at (0,0) */ @@ -2825,7 +3365,7 @@ blit_tfu(struct v3dv_cmd_buffer *cmd_buffer, */ const struct v3dv_format *format = v3dv_get_compatible_tfu_format(cmd_buffer->device, - dst->cpp, NULL); + dst->planes[0].cpp, NULL); /* Emit a TFU job for each layer to blit */ assert(region->dstSubresource.layerCount == @@ -2871,10 +3411,31 @@ blit_tfu(struct v3dv_cmd_buffer *cmd_buffer, dst_mirror_z ? max_dst_layer - i - 1: min_dst_layer + i; const uint32_t src_layer = src_mirror_z ? 
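The loop above batches consecutive regions for the shader fallbacks, since the TFU and TLB paths consume one region per job. The exact batching predicate is only partially visible in this hunk; given the texel-buffer path's stated requirement that batched regions share an aspect, a plausible sketch is (an assumption, not the driver's verified rule):

#include <stdint.h>
#include <vulkan/vulkan.h>

static uint32_t
batch_regions_by_aspect(const VkBufferImageCopy2 *regions,
                        uint32_t region_count, uint32_t first)
{
   uint32_t n = 1;
   while (first + n < region_count &&
          regions[first + n].imageSubresource.aspectMask ==
          regions[first].imageSubresource.aspectMask) {
      n++;
   }
   return n;
}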
max_src_layer - i - 1: min_src_layer + i; - v3dv_X(cmd_buffer->device, meta_emit_tfu_job) - (cmd_buffer, dst, dst_mip_level, dst_layer, - src, src_mip_level, src_layer, - dst_width, dst_height, format); + + const uint32_t dst_offset = + dst->planes[0].mem->bo->offset + v3dv_layer_offset(dst, dst_mip_level, + dst_layer, 0); + const uint32_t src_offset = + src->planes[0].mem->bo->offset + v3dv_layer_offset(src, src_mip_level, + src_layer, 0); + + const struct v3d_resource_slice *dst_slice = &dst->planes[0].slices[dst_mip_level]; + const struct v3d_resource_slice *src_slice = &src->planes[0].slices[src_mip_level]; + + v3dv_X(cmd_buffer->device, meta_emit_tfu_job)( + cmd_buffer, + dst->planes[0].mem->bo->handle, + dst_offset, + dst_slice->tiling, + dst_slice->padded_height, + dst->planes[0].cpp, + src->planes[0].mem->bo->handle, + src_offset, + src_slice->tiling, + src_slice->tiling == V3D_TILING_RASTER ? + src_slice->stride : src_slice->padded_height, + src->planes[0].cpp, + dst_width, dst_height, &format->planes[0]); } return true; @@ -2941,7 +3502,8 @@ create_blit_render_pass(struct v3dv_device *device, const bool is_color_blit = vk_format_is_color(dst_format); /* Attachment load operation is specified below */ - VkAttachmentDescription att = { + VkAttachmentDescription2 att = { + .sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2, .format = dst_format, .samples = VK_SAMPLE_COUNT_1_BIT, .storeOp = VK_ATTACHMENT_STORE_OP_STORE, @@ -2949,12 +3511,14 @@ create_blit_render_pass(struct v3dv_device *device, .finalLayout = VK_IMAGE_LAYOUT_GENERAL, }; - VkAttachmentReference att_ref = { + VkAttachmentReference2 att_ref = { + .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2, .attachment = 0, .layout = VK_IMAGE_LAYOUT_GENERAL, }; - VkSubpassDescription subpass = { + VkSubpassDescription2 subpass = { + .sType = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2, .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, .inputAttachmentCount = 0, .colorAttachmentCount = is_color_blit ? 1 : 0, @@ -2965,8 +3529,8 @@ create_blit_render_pass(struct v3dv_device *device, .pPreserveAttachments = NULL, }; - VkRenderPassCreateInfo info = { - .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, + VkRenderPassCreateInfo2 info = { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2, .attachmentCount = 1, .pAttachments = &att, .subpassCount = 1, @@ -2977,60 +3541,27 @@ create_blit_render_pass(struct v3dv_device *device, VkResult result; att.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; - result = v3dv_CreateRenderPass(v3dv_device_to_handle(device), - &info, &device->vk.alloc, pass_load); + result = v3dv_CreateRenderPass2(v3dv_device_to_handle(device), + &info, &device->vk.alloc, pass_load); if (result != VK_SUCCESS) return false; att.loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; - result = v3dv_CreateRenderPass(v3dv_device_to_handle(device), - &info, &device->vk.alloc, pass_no_load); + result = v3dv_CreateRenderPass2(v3dv_device_to_handle(device), + &info, &device->vk.alloc, pass_no_load); return result == VK_SUCCESS; } -static nir_ssa_def * -gen_rect_vertices(nir_builder *b) -{ - nir_ssa_def *vertex_id = nir_load_vertex_id(b); - - /* vertex 0: -1.0, -1.0 - * vertex 1: -1.0, 1.0 - * vertex 2: 1.0, -1.0 - * vertex 3: 1.0, 1.0 - * - * so: - * - * channel 0 is vertex_id < 2 ? -1.0 : 1.0 - * channel 1 is vertex id & 1 ? 
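Layer iteration in blit_tfu(), isolated: walk i over the layer count and mirror in Z when the blit is reversed on that axis:

#include <stdbool.h>
#include <stdint.h>

static uint32_t
blit_layer(uint32_t min_layer, uint32_t max_layer, uint32_t i,
           bool mirror_z)
{
   return mirror_z ? max_layer - i - 1 : min_layer + i;
}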
1.0 : -1.0 - */ - - nir_ssa_def *one = nir_imm_int(b, 1); - nir_ssa_def *c0cmp = nir_ilt(b, vertex_id, nir_imm_int(b, 2)); - nir_ssa_def *c1cmp = nir_ieq(b, nir_iand(b, vertex_id, one), one); - - nir_ssa_def *comp[4]; - comp[0] = nir_bcsel(b, c0cmp, - nir_imm_float(b, -1.0f), - nir_imm_float(b, 1.0f)); - - comp[1] = nir_bcsel(b, c1cmp, - nir_imm_float(b, 1.0f), - nir_imm_float(b, -1.0f)); - comp[2] = nir_imm_float(b, 0.0f); - comp[3] = nir_imm_float(b, 1.0f); - return nir_vec(b, comp, 4); -} - -static nir_ssa_def * +static nir_def * gen_tex_coords(nir_builder *b) { - nir_ssa_def *tex_box = + nir_def *tex_box = nir_load_push_constant(b, 4, 32, nir_imm_int(b, 0), .base = 0, .range = 16); - nir_ssa_def *tex_z = + nir_def *tex_z = nir_load_push_constant(b, 1, 32, nir_imm_int(b, 0), .base = 16, .range = 4); - nir_ssa_def *vertex_id = nir_load_vertex_id(b); + nir_def *vertex_id = nir_load_vertex_id(b); /* vertex 0: src0_x, src0_y * vertex 1: src0_x, src1_y @@ -3043,11 +3574,11 @@ gen_tex_coords(nir_builder *b) * channel 1 is vertex id & 1 ? src1_y : src0_y */ - nir_ssa_def *one = nir_imm_int(b, 1); - nir_ssa_def *c0cmp = nir_ilt(b, vertex_id, nir_imm_int(b, 2)); - nir_ssa_def *c1cmp = nir_ieq(b, nir_iand(b, vertex_id, one), one); + nir_def *one = nir_imm_int(b, 1); + nir_def *c0cmp = nir_ilt_imm(b, vertex_id, 2); + nir_def *c1cmp = nir_ieq(b, nir_iand(b, vertex_id, one), one); - nir_ssa_def *comp[4]; + nir_def *comp[4]; comp[0] = nir_bcsel(b, c0cmp, nir_channel(b, tex_box, 0), nir_channel(b, tex_box, 2)); @@ -3060,9 +3591,9 @@ gen_tex_coords(nir_builder *b) return nir_vec(b, comp, 4); } -static nir_ssa_def * +static nir_def * build_nir_tex_op_read(struct nir_builder *b, - nir_ssa_def *tex_pos, + nir_def *tex_pos, enum glsl_base_type tex_type, enum glsl_sampler_dim dim) { @@ -3075,57 +3606,49 @@ build_nir_tex_op_read(struct nir_builder *b, sampler->data.descriptor_set = 0; sampler->data.binding = 0; - nir_ssa_def *tex_deref = &nir_build_deref_var(b, sampler)->dest.ssa; + nir_def *tex_deref = &nir_build_deref_var(b, sampler)->def; nir_tex_instr *tex = nir_tex_instr_create(b->shader, 3); tex->sampler_dim = dim; tex->op = nir_texop_tex; - tex->src[0].src_type = nir_tex_src_coord; - tex->src[0].src = nir_src_for_ssa(tex_pos); - tex->src[1].src_type = nir_tex_src_texture_deref; - tex->src[1].src = nir_src_for_ssa(tex_deref); - tex->src[2].src_type = nir_tex_src_sampler_deref; - tex->src[2].src = nir_src_for_ssa(tex_deref); + tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, tex_pos); + tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_texture_deref, tex_deref); + tex->src[2] = nir_tex_src_for_ssa(nir_tex_src_sampler_deref, tex_deref); tex->dest_type = nir_get_nir_type_for_glsl_base_type(tex_type); tex->is_array = glsl_sampler_type_is_array(sampler_type); tex->coord_components = tex_pos->num_components; - nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex"); + nir_def_init(&tex->instr, &tex->def, 4, 32); nir_builder_instr_insert(b, &tex->instr); - return &tex->dest.ssa; + return &tex->def; } -static nir_ssa_def * +static nir_def * build_nir_tex_op_ms_fetch_sample(struct nir_builder *b, nir_variable *sampler, - nir_ssa_def *tex_deref, + nir_def *tex_deref, enum glsl_base_type tex_type, - nir_ssa_def *tex_pos, - nir_ssa_def *sample_idx) + nir_def *tex_pos, + nir_def *sample_idx) { - nir_tex_instr *tex = nir_tex_instr_create(b->shader, 4); + nir_tex_instr *tex = nir_tex_instr_create(b->shader, 3); tex->sampler_dim = GLSL_SAMPLER_DIM_MS; tex->op = nir_texop_txf_ms; - tex->src[0].src_type = 
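gen_tex_coords() above (and nir_gen_rect_vertices(), which replaced the removed open-coded position generator) both use the vertex_id selection spelled out in the comments. On the CPU the corner pick reads:

/* box = (x0, y0, x1, y1); vertex_id in [0, 3] of a triangle strip. */
static void
rect_corner(const float box[4], int vertex_id, float out[2])
{
   out[0] = vertex_id < 2 ? box[0] : box[2];   /* ids 0,1 -> x0 */
   out[1] = (vertex_id & 1) ? box[3] : box[1]; /* odd ids -> y1 */
}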
nir_tex_src_coord; - tex->src[0].src = nir_src_for_ssa(tex_pos); - tex->src[1].src_type = nir_tex_src_texture_deref; - tex->src[1].src = nir_src_for_ssa(tex_deref); - tex->src[2].src_type = nir_tex_src_sampler_deref; - tex->src[2].src = nir_src_for_ssa(tex_deref); - tex->src[3].src_type = nir_tex_src_ms_index; - tex->src[3].src = nir_src_for_ssa(sample_idx); + tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, tex_pos); + tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_texture_deref, tex_deref); + tex->src[2] = nir_tex_src_for_ssa(nir_tex_src_ms_index, sample_idx); tex->dest_type = nir_get_nir_type_for_glsl_base_type(tex_type); tex->is_array = false; tex->coord_components = tex_pos->num_components; - nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex"); + nir_def_init(&tex->instr, &tex->def, 4, 32); nir_builder_instr_insert(b, &tex->instr); - return &tex->dest.ssa; + return &tex->def; } /* Fetches all samples at the given position and averages them */ -static nir_ssa_def * +static nir_def * build_nir_tex_op_ms_resolve(struct nir_builder *b, - nir_ssa_def *tex_pos, + nir_def *tex_pos, enum glsl_base_type tex_type, VkSampleCountFlagBits src_samples) { @@ -3139,10 +3662,10 @@ build_nir_tex_op_ms_resolve(struct nir_builder *b, const bool is_int = glsl_base_type_is_integer(tex_type); - nir_ssa_def *tmp = NULL; - nir_ssa_def *tex_deref = &nir_build_deref_var(b, sampler)->dest.ssa; + nir_def *tmp = NULL; + nir_def *tex_deref = &nir_build_deref_var(b, sampler)->def; for (uint32_t i = 0; i < src_samples; i++) { - nir_ssa_def *s = + nir_def *s = build_nir_tex_op_ms_fetch_sample(b, sampler, tex_deref, tex_type, tex_pos, nir_imm_int(b, i)); @@ -3157,13 +3680,13 @@ build_nir_tex_op_ms_resolve(struct nir_builder *b, } assert(!is_int); - return nir_fmul(b, tmp, nir_imm_float(b, 1.0f / src_samples)); + return nir_fmul_imm(b, tmp, 1.0f / src_samples); } /* Fetches the current sample (gl_SampleID) at the given position */ -static nir_ssa_def * +static nir_def * build_nir_tex_op_ms_read(struct nir_builder *b, - nir_ssa_def *tex_pos, + nir_def *tex_pos, enum glsl_base_type tex_type) { const struct glsl_type *sampler_type = @@ -3173,17 +3696,17 @@ build_nir_tex_op_ms_read(struct nir_builder *b, sampler->data.descriptor_set = 0; sampler->data.binding = 0; - nir_ssa_def *tex_deref = &nir_build_deref_var(b, sampler)->dest.ssa; + nir_def *tex_deref = &nir_build_deref_var(b, sampler)->def; return build_nir_tex_op_ms_fetch_sample(b, sampler, tex_deref, tex_type, tex_pos, nir_load_sample_id(b)); } -static nir_ssa_def * +static nir_def * build_nir_tex_op(struct nir_builder *b, struct v3dv_device *device, - nir_ssa_def *tex_pos, + nir_def *tex_pos, enum glsl_base_type tex_type, VkSampleCountFlagBits dst_samples, VkSampleCountFlagBits src_samples, @@ -3227,10 +3750,10 @@ get_blit_vs() vs_out_tex_coord->data.location = VARYING_SLOT_VAR0; vs_out_tex_coord->data.interpolation = INTERP_MODE_SMOOTH; - nir_ssa_def *pos = gen_rect_vertices(&b); + nir_def *pos = nir_gen_rect_vertices(&b, NULL, NULL); nir_store_var(&b, vs_out_pos, pos, 0xf); - nir_ssa_def *tex_coord = gen_tex_coords(&b); + nir_def *tex_coord = gen_tex_coords(&b); nir_store_var(&b, vs_out_tex_coord, tex_coord, 0xf); return b.shader; @@ -3281,11 +3804,11 @@ get_color_blit_fs(struct v3dv_device *device, nir_variable_create(b.shader, nir_var_shader_out, fs_out_type, "out_color"); fs_out_color->data.location = FRAG_RESULT_DATA0; - nir_ssa_def *tex_coord = nir_load_var(&b, fs_in_tex_coord); + nir_def *tex_coord = nir_load_var(&b, fs_in_tex_coord); const uint32_t 
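build_nir_tex_op_ms_resolve() emits one txf_ms per sample and averages the results; integer formats never reach the multiply (note the assert(!is_int)). The scalar equivalent:

#include <stdint.h>

static float
resolve_average(const float *samples, uint32_t sample_count)
{
   float sum = 0.0f;
   for (uint32_t i = 0; i < sample_count; i++)
      sum += samples[i];
   /* Matches nir_fmul_imm(b, tmp, 1.0f / src_samples). */
   return sum * (1.0f / sample_count);
}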
channel_mask = get_channel_mask_for_sampler_dim(sampler_dim); tex_coord = nir_channels(&b, tex_coord, channel_mask); - nir_ssa_def *color = build_nir_tex_op(&b, device, tex_coord, src_base_type, + nir_def *color = build_nir_tex_op(&b, device, tex_coord, src_base_type, dst_samples, src_samples, sampler_dim); /* For integer textures, if the bit-size of the destination is too small to @@ -3300,7 +3823,7 @@ get_color_blit_fs(struct v3dv_device *device, enum pipe_format src_pformat = vk_format_to_pipe_format(src_format); enum pipe_format dst_pformat = vk_format_to_pipe_format(dst_format); - nir_ssa_def *c[4]; + nir_def *c[4]; for (uint32_t i = 0; i < 4; i++) { c[i] = nir_channel(&b, color, i); @@ -3318,11 +3841,11 @@ get_color_blit_fs(struct v3dv_device *device, assert(dst_bit_size > 0); if (util_format_is_pure_uint(dst_pformat)) { - nir_ssa_def *max = nir_imm_int(&b, (1 << dst_bit_size) - 1); + nir_def *max = nir_imm_int(&b, (1 << dst_bit_size) - 1); c[i] = nir_umin(&b, c[i], max); } else { - nir_ssa_def *max = nir_imm_int(&b, (1 << (dst_bit_size - 1)) - 1); - nir_ssa_def *min = nir_imm_int(&b, -(1 << (dst_bit_size - 1))); + nir_def *max = nir_imm_int(&b, (1 << (dst_bit_size - 1)) - 1); + nir_def *min = nir_imm_int(&b, -(1 << (dst_bit_size - 1))); c[i] = nir_imax(&b, nir_imin(&b, c[i], max), min); } } @@ -3348,14 +3871,12 @@ create_pipeline(struct v3dv_device *device, const VkPipelineLayout layout, VkPipeline *pipeline) { - struct vk_shader_module vs_m; + struct vk_shader_module vs_m = vk_shader_module_from_nir(vs_nir); + struct vk_shader_module fs_m = vk_shader_module_from_nir(fs_nir); struct vk_shader_module gs_m; - struct vk_shader_module fs_m; uint32_t num_stages = gs_nir ? 3 : 2; - v3dv_shader_module_internal_init(device, &vs_m, vs_nir); - v3dv_shader_module_internal_init(device, &fs_m, fs_nir); VkPipelineShaderStageCreateInfo stages[3] = { { @@ -3379,7 +3900,7 @@ create_pipeline(struct v3dv_device *device, }; if (gs_nir) { - v3dv_shader_module_internal_init(device, &gs_m, gs_nir); + gs_m = vk_shader_module_from_nir(gs_nir); stages[2].module = vk_shader_module_to_handle(&gs_m); } @@ -3452,6 +3973,7 @@ create_pipeline(struct v3dv_device *device, pipeline); ralloc_free(vs_nir); + ralloc_free(gs_nir); ralloc_free(fs_nir); return result == VK_SUCCESS; @@ -3762,6 +4284,8 @@ allocate_blit_source_descriptor_set(struct v3dv_cmd_buffer *cmd_buffer, * cmask parameter (which can be 0 to default to all channels), as well as a * swizzle to apply to the source via the cswizzle parameter (which can be NULL * to use the default identity swizzle). + * + * Supports multi-plane formats too. */ static bool blit_shader(struct v3dv_cmd_buffer *cmd_buffer, @@ -3771,25 +4295,23 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, VkFormat src_format, VkColorComponentFlags cmask, VkComponentMapping *cswizzle, - const VkImageBlit2KHR *_region, + const VkImageBlit2 *region, VkFilter filter, bool dst_is_padded_image) { bool handled = true; VkResult result; - uint32_t dirty_dynamic_state = 0; /* We don't support rendering to linear depth/stencil, this should have * been rewritten to a compatible color blit by the caller. 
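The clamping in get_color_blit_fs() keeps integer sources representable when the destination channel is narrower; scalar versions of the umin and imax(imin()) pairs (valid for 0 < bits < 32, as in that path):

#include <stdint.h>

static uint32_t
clamp_uint_to_bits(uint32_t v, unsigned bits)
{
   const uint32_t max = (1u << bits) - 1;
   return v < max ? v : max;
}

static int32_t
clamp_int_to_bits(int32_t v, unsigned bits)
{
   const int32_t max = (1 << (bits - 1)) - 1;
   const int32_t min = -(1 << (bits - 1));
   return v > max ? max : (v < min ? min : v);
}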
*/ - assert(dst->vk.tiling != VK_IMAGE_TILING_LINEAR || - !vk_format_is_depth_or_stencil(dst_format)); + assert(dst->tiled || !vk_format_is_depth_or_stencil(dst_format)); /* Can't sample from linear images */ - if (src->vk.tiling == VK_IMAGE_TILING_LINEAR && src->vk.image_type != VK_IMAGE_TYPE_1D) + if (!src->tiled && src->vk.image_type != VK_IMAGE_TYPE_1D) { return false; + } - VkImageBlit2KHR region = *_region; /* Rewrite combined D/S blits to compatible color blits */ if (vk_format_is_depth_or_stencil(dst_format)) { assert(src_format == dst_format); @@ -3803,12 +4325,12 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, break; case VK_FORMAT_X8_D24_UNORM_PACK32: case VK_FORMAT_D24_UNORM_S8_UINT: - if (region.srcSubresource.aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) { + if (region->srcSubresource.aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) { cmask |= VK_COLOR_COMPONENT_G_BIT | VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; } - if (region.srcSubresource.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) { + if (region->srcSubresource.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) { assert(dst_format == VK_FORMAT_D24_UNORM_S8_UINT); cmask |= VK_COLOR_COMPONENT_R_BIT; } @@ -3818,10 +4340,15 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, unreachable("Unsupported depth/stencil format"); }; src_format = dst_format; - region.srcSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; - region.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; } + uint8_t src_plane = + v3dv_plane_from_aspect(region->srcSubresource.aspectMask); + assert(src_plane < src->plane_count); + uint8_t dst_plane = + v3dv_plane_from_aspect(region->dstSubresource.aspectMask); + assert(dst_plane < dst->plane_count); + const VkColorComponentFlags full_cmask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | VK_COLOR_COMPONENT_B_BIT | @@ -3844,34 +4371,40 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, * need to apply those same semantics here when we compute the size of the * destination image level. 
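The rewrite of combined depth/stencil blits to color blits above hinges on the channel aliasing of the RGBA8 view of D24S8/X8D24 data: stencil occupies the byte aliased by R, the 24 depth bits the bytes aliased by G/B/A. As a mask table:

#include <vulkan/vulkan.h>

static VkColorComponentFlags
d24_blit_cmask(VkImageAspectFlags aspects)
{
   VkColorComponentFlags cmask = 0;
   if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
      cmask |= VK_COLOR_COMPONENT_G_BIT | VK_COLOR_COMPONENT_B_BIT |
               VK_COLOR_COMPONENT_A_BIT;
   if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT)
      cmask |= VK_COLOR_COMPONENT_R_BIT;
   return cmask;
}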
*/ - const uint32_t dst_block_w = vk_format_get_blockwidth(dst->vk.format); - const uint32_t dst_block_h = vk_format_get_blockheight(dst->vk.format); - const uint32_t src_block_w = vk_format_get_blockwidth(src->vk.format); - const uint32_t src_block_h = vk_format_get_blockheight(src->vk.format); + const uint32_t dst_block_w = + vk_format_get_blockwidth(dst->planes[dst_plane].vk_format); + const uint32_t dst_block_h = + vk_format_get_blockheight(dst->planes[dst_plane].vk_format); + const uint32_t src_block_w = + vk_format_get_blockwidth(src->planes[src_plane].vk_format); + const uint32_t src_block_h = + vk_format_get_blockheight(src->planes[src_plane].vk_format); const uint32_t dst_level_w = u_minify(DIV_ROUND_UP(dst->vk.extent.width * src_block_w, dst_block_w), - region.dstSubresource.mipLevel); + region->dstSubresource.mipLevel); const uint32_t dst_level_h = u_minify(DIV_ROUND_UP(dst->vk.extent.height * src_block_h, dst_block_h), - region.dstSubresource.mipLevel); + region->dstSubresource.mipLevel); const uint32_t src_level_w = - u_minify(src->vk.extent.width, region.srcSubresource.mipLevel); + u_minify(src->planes[src_plane].width, region->srcSubresource.mipLevel); const uint32_t src_level_h = - u_minify(src->vk.extent.height, region.srcSubresource.mipLevel); + u_minify(src->planes[src_plane].height, region->srcSubresource.mipLevel); + + assert(src->plane_count == 1 || src->vk.image_type != VK_IMAGE_TYPE_3D); const uint32_t src_level_d = - u_minify(src->vk.extent.depth, region.srcSubresource.mipLevel); + u_minify(src->vk.extent.depth, region->srcSubresource.mipLevel); uint32_t dst_x, dst_y, dst_w, dst_h; bool dst_mirror_x, dst_mirror_y; - compute_blit_box(region.dstOffsets, + compute_blit_box(region->dstOffsets, dst_level_w, dst_level_h, &dst_x, &dst_y, &dst_w, &dst_h, &dst_mirror_x, &dst_mirror_y); uint32_t src_x, src_y, src_w, src_h; bool src_mirror_x, src_mirror_y; - compute_blit_box(region.srcOffsets, + compute_blit_box(region->srcOffsets, src_level_w, src_level_h, &src_x, &src_y, &src_w, &src_h, &src_mirror_x, &src_mirror_y); @@ -3880,10 +4413,10 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, uint32_t max_dst_layer; bool dst_mirror_z = false; if (dst->vk.image_type != VK_IMAGE_TYPE_3D) { - min_dst_layer = region.dstSubresource.baseArrayLayer; - max_dst_layer = min_dst_layer + region.dstSubresource.layerCount; + min_dst_layer = region->dstSubresource.baseArrayLayer; + max_dst_layer = min_dst_layer + region->dstSubresource.layerCount; } else { - compute_blit_3d_layers(region.dstOffsets, + compute_blit_3d_layers(region->dstOffsets, &min_dst_layer, &max_dst_layer, &dst_mirror_z); } @@ -3892,10 +4425,10 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, uint32_t max_src_layer; bool src_mirror_z = false; if (src->vk.image_type != VK_IMAGE_TYPE_3D) { - min_src_layer = region.srcSubresource.baseArrayLayer; - max_src_layer = min_src_layer + region.srcSubresource.layerCount; + min_src_layer = region->srcSubresource.baseArrayLayer; + max_src_layer = min_src_layer + region->srcSubresource.layerCount; } else { - compute_blit_3d_layers(region.srcOffsets, + compute_blit_3d_layers(region->srcOffsets, &min_src_layer, &max_src_layer, &src_mirror_z); } @@ -4010,7 +4543,6 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, }; /* Record per-layer commands */ - VkImageAspectFlags aspects = region.dstSubresource.aspectMask; for (uint32_t i = 0; i < layer_count; i++) { /* Setup framebuffer */ VkImageViewCreateInfo dst_image_view_info = { @@ -4019,16 +4551,16 @@ blit_shader(struct v3dv_cmd_buffer 
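compute_blit_box(), called above for both endpoints, normalizes a possibly reversed offset pair into an origin, a size and a mirror flag per axis. Its body is outside this hunk; a plausible single-axis sketch consistent with the callers (an assumption, including the clamping):

#include <stdbool.h>
#include <stdint.h>

static void
blit_box_1d(int32_t o0, int32_t o1, uint32_t level_size,
            uint32_t *x, uint32_t *w, bool *mirror)
{
   *mirror = o1 < o0;
   int32_t lo = *mirror ? o1 : o0;
   int32_t hi = *mirror ? o0 : o1;
   if (lo < 0)
      lo = 0;
   if (hi > (int32_t)level_size)
      hi = level_size;
   *x = (uint32_t)lo;
   *w = (uint32_t)(hi - lo);
}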
*cmd_buffer, .viewType = v3dv_image_type_to_view_type(dst->vk.image_type), .format = dst_format, .subresourceRange = { - .aspectMask = aspects, - .baseMipLevel = region.dstSubresource.mipLevel, + .aspectMask = region->dstSubresource.aspectMask, + .baseMipLevel = region->dstSubresource.mipLevel, .levelCount = 1, .baseArrayLayer = min_dst_layer + i, .layerCount = 1 }, }; VkImageView dst_image_view; - result = v3dv_CreateImageView(_device, &dst_image_view_info, - &device->vk.alloc, &dst_image_view); + result = v3dv_create_image_view(device, &dst_image_view_info, + &dst_image_view); if (result != VK_SUCCESS) goto fail; @@ -4078,8 +4610,8 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, .format = src_format, .components = *cswizzle, .subresourceRange = { - .aspectMask = aspects, - .baseMipLevel = region.srcSubresource.mipLevel, + .aspectMask = region->srcSubresource.aspectMask, + .baseMipLevel = region->srcSubresource.mipLevel, .levelCount = 1, .baseArrayLayer = src->vk.image_type == VK_IMAGE_TYPE_3D ? 0 : min_src_layer + i, @@ -4087,8 +4619,8 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, }, }; VkImageView src_image_view; - result = v3dv_CreateImageView(_device, &src_image_view_info, - &device->vk.alloc, &src_image_view); + result = v3dv_create_image_view(device, &src_image_view_info, + &src_image_view); if (result != VK_SUCCESS) goto fail; @@ -4146,7 +4678,12 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, .clearValueCount = 0, }; - v3dv_CmdBeginRenderPass(_cmd_buffer, &rp_info, VK_SUBPASS_CONTENTS_INLINE); + VkSubpassBeginInfo sp_info = { + .sType = VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO, + .contents = VK_SUBPASS_CONTENTS_INLINE, + }; + + v3dv_CmdBeginRenderPass2(_cmd_buffer, &rp_info, &sp_info); struct v3dv_job *job = cmd_buffer->state.job; if (!job) goto fail; @@ -4170,25 +4707,37 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, v3dv_CmdDraw(_cmd_buffer, 4, 1, 0, 0); - v3dv_CmdEndRenderPass(_cmd_buffer); - dirty_dynamic_state = V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR; + VkSubpassEndInfo sp_end_info = { + .sType = VK_STRUCTURE_TYPE_SUBPASS_END_INFO, + }; + + v3dv_CmdEndRenderPass2(_cmd_buffer, &sp_end_info); } fail: - v3dv_cmd_buffer_meta_state_pop(cmd_buffer, dirty_dynamic_state, true); + v3dv_cmd_buffer_meta_state_pop(cmd_buffer, true); return handled; } VKAPI_ATTR void VKAPI_CALL -v3dv_CmdBlitImage2KHR(VkCommandBuffer commandBuffer, - const VkBlitImageInfo2KHR *pBlitImageInfo) +v3dv_CmdBlitImage2(VkCommandBuffer commandBuffer, + const VkBlitImageInfo2 *pBlitImageInfo) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); V3DV_FROM_HANDLE(v3dv_image, src, pBlitImageInfo->srcImage); V3DV_FROM_HANDLE(v3dv_image, dst, pBlitImageInfo->dstImage); - /* This command can only happen outside a render pass */ + /* From vkCmdBlitImage: + * "srcImage must not use a format that requires a sampler YCBCR + * conversion" + * "dstImage must not use a format that requires a sampler YCBCR + * conversion" + */ + assert(src->plane_count == 1); + assert(dst->plane_count == 1); + + /* This command can only happen outside a render pass */ assert(cmd_buffer->state.pass == NULL); assert(cmd_buffer->state.job == NULL); @@ -4199,29 +4748,41 @@ v3dv_CmdBlitImage2KHR(VkCommandBuffer commandBuffer, /* We don't export VK_FORMAT_FEATURE_BLIT_DST_BIT on compressed formats */ assert(!vk_format_is_compressed(dst->vk.format)); + cmd_buffer->state.is_transfer = true; + for (uint32_t i = 0; i < pBlitImageInfo->regionCount; i++) { - if (blit_tfu(cmd_buffer, dst, src, &pBlitImageInfo->pRegions[i])) + 
const VkImageBlit2 *region = &pBlitImageInfo->pRegions[i]; + + if (blit_tfu(cmd_buffer, dst, src, region)) continue; if (blit_shader(cmd_buffer, dst, dst->vk.format, src, src->vk.format, 0, NULL, - &pBlitImageInfo->pRegions[i], + region, pBlitImageInfo->filter, true)) { continue; } unreachable("Unsupported blit operation"); } + + cmd_buffer->state.is_transfer = false; } static bool resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *dst, struct v3dv_image *src, - const VkImageResolve2KHR *region) + const VkImageResolve2 *region) { - if (!v3dv_meta_can_use_tlb(src, &region->srcOffset, NULL) || - !v3dv_meta_can_use_tlb(dst, &region->dstOffset, NULL)) { + /* No resolve for multi-planar images. Using plane 0 */ + assert(dst->plane_count == 1); + assert(src->plane_count == 1); + + if (!v3dv_meta_can_use_tlb(src, 0, region->srcSubresource.mipLevel, + &region->srcOffset, NULL, NULL) || + !v3dv_meta_can_use_tlb(dst, 0, region->dstSubresource.mipLevel, + &region->dstOffset, &region->extent, NULL)) { return false; } @@ -4242,8 +4803,10 @@ resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, if (!job) return true; - const uint32_t block_w = vk_format_get_blockwidth(dst->vk.format); - const uint32_t block_h = vk_format_get_blockheight(dst->vk.format); + const uint32_t block_w = + vk_format_get_blockwidth(dst->planes[0].vk_format); + const uint32_t block_h = + vk_format_get_blockheight(dst->planes[0].vk_format); const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w); const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h); @@ -4252,8 +4815,9 @@ resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, (fb_format, region->srcSubresource.aspectMask, &internal_type, &internal_bpp); - v3dv_job_start_frame(job, width, height, num_layers, false, - 1, internal_bpp, true); + v3dv_job_start_frame(job, width, height, num_layers, false, true, 1, + internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), + true); struct v3dv_meta_framebuffer framebuffer; v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format, @@ -4271,10 +4835,10 @@ static bool resolve_image_blit(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *dst, struct v3dv_image *src, - const VkImageResolve2KHR *region) + const VkImageResolve2 *region) { - const VkImageBlit2KHR blit_region = { - .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2_KHR, + const VkImageBlit2 blit_region = { + .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2, .srcSubresource = region->srcSubresource, .srcOffsets = { region->srcOffset, @@ -4300,8 +4864,8 @@ resolve_image_blit(struct v3dv_cmd_buffer *cmd_buffer, } VKAPI_ATTR void VKAPI_CALL -v3dv_CmdResolveImage2KHR(VkCommandBuffer commandBuffer, - const VkResolveImageInfo2KHR *info) +v3dv_CmdResolveImage2(VkCommandBuffer commandBuffer, + const VkResolveImageInfo2 *info) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); @@ -4315,6 +4879,12 @@ v3dv_CmdResolveImage2KHR(VkCommandBuffer commandBuffer, assert(src->vk.samples == VK_SAMPLE_COUNT_4_BIT); assert(dst->vk.samples == VK_SAMPLE_COUNT_1_BIT); + /* We don't support multi-sampled multi-plane images */ + assert(src->plane_count == 1); + assert(dst->plane_count == 1); + + cmd_buffer->state.is_transfer = true; + for (uint32_t i = 0; i < info->regionCount; i++) { if (resolve_image_tlb(cmd_buffer, dst, src, &info->pRegions[i])) continue; @@ -4322,4 +4892,6 @@ v3dv_CmdResolveImage2KHR(VkCommandBuffer commandBuffer, continue; unreachable("Unsupported multisample resolve operation"); } + + cmd_buffer->state.is_transfer = false; } diff --git
a/src/broadcom/vulkan/v3dv_pass.c b/src/broadcom/vulkan/v3dv_pass.c index 464703e42a4..ae6e37159d4 100644 --- a/src/broadcom/vulkan/v3dv_pass.c +++ b/src/broadcom/vulkan/v3dv_pass.c @@ -1,5 +1,5 @@ /* - * Copyright © 2019 Raspberry Pi + * Copyright © 2019 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -24,7 +24,7 @@ #include "v3dv_private.h" static uint32_t -num_subpass_attachments(const VkSubpassDescription *desc) +num_subpass_attachments(const VkSubpassDescription2 *desc) { return desc->inputAttachmentCount + desc->colorAttachmentCount + @@ -33,11 +33,11 @@ num_subpass_attachments(const VkSubpassDescription *desc) } static void -set_use_tlb_resolve(struct v3dv_device *device, +set_try_tlb_resolve(struct v3dv_device *device, struct v3dv_render_pass_attachment *att) { const struct v3dv_format *format = v3dv_X(device, get_format)(att->desc.format); - att->use_tlb_resolve = v3dv_X(device, format_supports_tlb_resolve)(format); + att->try_tlb_resolve = v3dv_X(device, format_supports_tlb_resolve)(format); } static void @@ -82,7 +82,7 @@ pass_find_subpass_range_for_attachments(struct v3dv_device *device, if (subpass->resolve_attachments && subpass->resolve_attachments[j].attachment != VK_ATTACHMENT_UNUSED) { - set_use_tlb_resolve(device, att); + set_try_tlb_resolve(device, att); } } @@ -92,6 +92,9 @@ pass_find_subpass_range_for_attachments(struct v3dv_device *device, pass->attachments[ds_attachment_idx].first_subpass = i; if (i > pass->attachments[ds_attachment_idx].last_subpass) pass->attachments[ds_attachment_idx].last_subpass = i; + + if (subpass->ds_resolve_attachment.attachment != VK_ATTACHMENT_UNUSED) + set_try_tlb_resolve(device, &pass->attachments[ds_attachment_idx]); } for (uint32_t j = 0; j < subpass->input_count; j++) { @@ -118,21 +121,57 @@ pass_find_subpass_range_for_attachments(struct v3dv_device *device, } } +/* GFXH-1461: if depth is cleared but stencil is loaded (or vice versa), + * the clear might get lost. If a subpass has this then we can't emit + * the clear using the TLB and we have to do it as a draw call. This + * issue is fixed since V3D 4.3.18. + * + * FIXME: separate stencil. 
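+ * + * For example, a VK_FORMAT_D24_UNORM_S8_UINT attachment with loadOp = + * VK_ATTACHMENT_LOAD_OP_CLEAR and stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD + * takes this path and sets do_depth_clear_with_draw; the swapped + * combination sets do_stencil_clear_with_draw instead.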
+ */ +static void +check_do_depth_stencil_clear_with_draw(struct v3dv_device *device, + struct v3dv_render_pass *pass, + struct v3dv_subpass *subpass) +{ + if (device->devinfo.ver > 42 || + subpass->ds_attachment.attachment == VK_ATTACHMENT_UNUSED) { + return; + } + + struct v3dv_render_pass_attachment *att = + &pass->attachments[subpass->ds_attachment.attachment]; + if (att->desc.format != VK_FORMAT_D24_UNORM_S8_UINT) + return; + + if (att->desc.loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR && + att->desc.stencilLoadOp == VK_ATTACHMENT_LOAD_OP_LOAD) { + subpass->do_depth_clear_with_draw = true; + } else if (att->desc.loadOp == VK_ATTACHMENT_LOAD_OP_LOAD && + att->desc.stencilLoadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) { + subpass->do_stencil_clear_with_draw = true; + } +} VKAPI_ATTR VkResult VKAPI_CALL -v3dv_CreateRenderPass(VkDevice _device, - const VkRenderPassCreateInfo *pCreateInfo, - const VkAllocationCallbacks *pAllocator, - VkRenderPass *pRenderPass) +v3dv_CreateRenderPass2(VkDevice _device, + const VkRenderPassCreateInfo2 *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkRenderPass *pRenderPass) { V3DV_FROM_HANDLE(v3dv_device, device, _device); struct v3dv_render_pass *pass; - assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO); + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2); - const VkRenderPassMultiviewCreateInfo *multiview_info = - vk_find_struct_const(pCreateInfo->pNext, RENDER_PASS_MULTIVIEW_CREATE_INFO); - bool multiview_enabled = multiview_info && multiview_info->subpassCount > 0; + /* From the VK_KHR_multiview spec: + * + * When a subpass uses a non-zero view mask, multiview functionality is + * considered to be enabled. Multiview is all-or-nothing for a render + * pass - that is, either all subpasses must have a non-zero view mask + * (though some subpasses may have only one view) or all must be zero. 
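+ * + * Because of that all-or-nothing rule, checking the view mask of the + * first subpass below is enough to tell whether the whole render pass + * is multiview.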
+ */ + bool multiview_enabled = pCreateInfo->subpassCount && + pCreateInfo->pSubpasses[0].viewMask; size_t size = sizeof(*pass); size_t subpasses_offset = size; @@ -143,7 +182,7 @@ v3dv_CreateRenderPass(VkDevice _device, pass = vk_object_zalloc(&device->vk, pAllocator, size, VK_OBJECT_TYPE_RENDER_PASS); if (pass == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); pass->multiview_enabled = multiview_enabled; pass->attachment_count = pCreateInfo->attachmentCount; @@ -156,7 +195,7 @@ v3dv_CreateRenderPass(VkDevice _device, uint32_t subpass_attachment_count = 0; for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) { - const VkSubpassDescription *desc = &pCreateInfo->pSubpasses[i]; + const VkSubpassDescription2 *desc = &pCreateInfo->pSubpasses[i]; subpass_attachment_count += num_subpass_attachments(desc); } @@ -168,7 +207,7 @@ v3dv_CreateRenderPass(VkDevice _device, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (pass->subpass_attachments == NULL) { vk_object_free(&device->vk, pAllocator, pass); - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } } else { pass->subpass_attachments = NULL; @@ -176,13 +215,12 @@ v3dv_CreateRenderPass(VkDevice _device, struct v3dv_subpass_attachment *p = pass->subpass_attachments; for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) { - const VkSubpassDescription *desc = &pCreateInfo->pSubpasses[i]; + const VkSubpassDescription2 *desc = &pCreateInfo->pSubpasses[i]; struct v3dv_subpass *subpass = &pass->subpasses[i]; subpass->input_count = desc->inputAttachmentCount; subpass->color_count = desc->colorAttachmentCount; - if (multiview_enabled) - subpass->view_mask = multiview_info->pViewMasks[i]; + subpass->view_mask = desc->viewMask; if (desc->inputAttachmentCount > 0) { subpass->input_attachments = p; @@ -226,27 +264,38 @@ v3dv_CreateRenderPass(VkDevice _device, .layout = desc->pDepthStencilAttachment->layout, }; - /* GFXH-1461: if depth is cleared but stencil is loaded (or viceversa), - * the clear might get lost. If a subpass has this then we can't emit - * the clear using the TLB and we have to do it as a draw call. - * - * FIXME: separate stencil. - */ - if (subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) { - struct v3dv_render_pass_attachment *att = - &pass->attachments[subpass->ds_attachment.attachment]; - if (att->desc.format == VK_FORMAT_D24_UNORM_S8_UINT) { - if (att->desc.loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR && - att->desc.stencilLoadOp == VK_ATTACHMENT_LOAD_OP_LOAD) { - subpass->do_depth_clear_with_draw = true; - } else if (att->desc.loadOp == VK_ATTACHMENT_LOAD_OP_LOAD && - att->desc.stencilLoadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) { - subpass->do_stencil_clear_with_draw = true; - } - } + check_do_depth_stencil_clear_with_draw(device, pass, subpass); + + /* VK_KHR_depth_stencil_resolve */ + const VkSubpassDescriptionDepthStencilResolve *resolve_desc = + vk_find_struct_const(desc->pNext, SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE); + const VkAttachmentReference2 *resolve_att = + resolve_desc && resolve_desc->pDepthStencilResolveAttachment && + resolve_desc->pDepthStencilResolveAttachment->attachment != VK_ATTACHMENT_UNUSED ? 
+ resolve_desc->pDepthStencilResolveAttachment : NULL; + if (resolve_att) { + subpass->ds_resolve_attachment = (struct v3dv_subpass_attachment) { + .attachment = resolve_att->attachment, + .layout = resolve_att->layout, + }; + assert(resolve_desc->depthResolveMode == VK_RESOLVE_MODE_SAMPLE_ZERO_BIT || + resolve_desc->stencilResolveMode == VK_RESOLVE_MODE_SAMPLE_ZERO_BIT); + subpass->resolve_depth = + resolve_desc->depthResolveMode != VK_RESOLVE_MODE_NONE && + resolve_att->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT; + subpass->resolve_stencil = + resolve_desc->stencilResolveMode != VK_RESOLVE_MODE_NONE && + resolve_att->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT; + } else { + subpass->ds_resolve_attachment.attachment = VK_ATTACHMENT_UNUSED; + subpass->resolve_depth = false; + subpass->resolve_stencil = false; } } else { subpass->ds_attachment.attachment = VK_ATTACHMENT_UNUSED; + subpass->ds_resolve_attachment.attachment = VK_ATTACHMENT_UNUSED; + subpass->resolve_depth = false; + subpass->resolve_stencil = false; } } @@ -280,50 +329,44 @@ subpass_get_granularity(struct v3dv_device *device, uint32_t subpass_idx, VkExtent2D *granularity) { - static const uint8_t tile_sizes[] = { - 64, 64, - 64, 32, - 32, 32, - 32, 16, - 16, 16, - 16, 8, - 8, 8 - }; - - /* Our tile size depends on the number of color attachments and the maximum - * bpp across them. - */ + /* Granularity is defined by the tile size */ assert(subpass_idx < pass->subpass_count); struct v3dv_subpass *subpass = &pass->subpasses[subpass_idx]; - const uint32_t color_attachment_count = subpass->color_count; + const uint32_t color_count = subpass->color_count; + bool msaa = false; uint32_t max_internal_bpp = 0; - for (uint32_t i = 0; i < color_attachment_count; i++) { + uint32_t total_color_bpp = 0; + for (uint32_t i = 0; i < color_count; i++) { uint32_t attachment_idx = subpass->color_attachments[i].attachment; if (attachment_idx == VK_ATTACHMENT_UNUSED) continue; - const VkAttachmentDescription *desc = + const VkAttachmentDescription2 *desc = &pass->attachments[attachment_idx].desc; const struct v3dv_format *format = v3dv_X(device, get_format)(desc->format); uint32_t internal_type, internal_bpp; + /* We don't support rendering to YCbCr images */ + assert(format->plane_count == 1); v3dv_X(device, get_internal_type_bpp_for_output_format) - (format->rt_type, &internal_type, &internal_bpp); + (format->planes[0].rt_type, &internal_type, &internal_bpp); max_internal_bpp = MAX2(max_internal_bpp, internal_bpp); - } - - uint32_t idx = 0; - if (color_attachment_count > 2) - idx += 2; - else if (color_attachment_count > 1) - idx += 1; + total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp); - idx += max_internal_bpp; + if (desc->samples > VK_SAMPLE_COUNT_1_BIT) + msaa = true; + } - assert(idx < ARRAY_SIZE(tile_sizes)); + /* If requested, double-buffer may or may not be enabled depending on + * heuristics so we choose a conservative granularity here, with it disabled. 
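+ * + * Multiples of the full-size tile are also multiples of the smaller + * tile used when double-buffer does get enabled, so a render area + * aligned to this granularity should stay tile-aligned either way.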
+ */ + uint32_t width, height; + v3d_choose_tile_size(&device->devinfo, color_count, + max_internal_bpp, total_color_bpp, msaa, + false /* double-buffer */, &width, &height); *granularity = (VkExtent2D) { - .width = tile_sizes[idx * 2], - .height = tile_sizes[idx * 2 + 1] + .width = width, + .height = height }; } @@ -390,3 +433,264 @@ v3dv_subpass_area_is_tile_aligned(struct v3dv_device *device, (fb->has_edge_padding && area->offset.y + area->extent.height >= fb->height)); } + +static void +setup_dynamic_attachment(struct v3dv_device *device, + struct v3dv_render_pass_attachment *att, + const VkRenderingAttachmentInfo *info, + bool is_stencil, + bool is_resolve) +{ + struct v3dv_image_view *view = v3dv_image_view_from_handle(info->imageView); + + VkAttachmentLoadOp load_op, stencil_load_op; + VkAttachmentStoreOp store_op, stencil_store_op; + + if (!is_stencil) { + stencil_load_op = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + stencil_store_op = VK_ATTACHMENT_STORE_OP_DONT_CARE; + if (!is_resolve) { + load_op = info->loadOp; + store_op = info->storeOp; + } else { + load_op = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + store_op = VK_ATTACHMENT_STORE_OP_STORE; + } + } else { + load_op = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + store_op = VK_ATTACHMENT_STORE_OP_DONT_CARE; + if (!is_resolve) { + stencil_load_op = info->loadOp; + stencil_store_op = info->storeOp; + } else { + stencil_load_op = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + stencil_store_op = VK_ATTACHMENT_STORE_OP_STORE; + } + } + + att->desc = (VkAttachmentDescription2) { + .sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2, + .flags = 0, + .format = view->vk.format, + .samples = view->vk.image->samples, + .loadOp = load_op, + .storeOp = store_op, + .stencilLoadOp = stencil_load_op, + .stencilStoreOp = stencil_store_op, + .initialLayout = info->imageLayout, + .finalLayout = info->imageLayout, + }; + + if (is_resolve) + set_try_tlb_resolve(device, att); +} + +void +v3dv_setup_dynamic_render_pass(struct v3dv_cmd_buffer *cmd_buffer, + const VkRenderingInfoKHR *info) +{ + struct v3dv_device *device = cmd_buffer->device; + struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + + struct v3dv_render_pass *pass = &state->dynamic_pass; + struct v3dv_subpass *subpass = &state->dynamic_subpass; + struct v3dv_render_pass_attachment *pass_attachments = + &state->dynamic_attachments[0]; + struct v3dv_subpass_attachment *subpass_attachments = + &state->dynamic_subpass_attachments[0]; + + memset(pass, 0, sizeof(*pass)); + memset(subpass, 0, sizeof(*subpass)); + memset(pass_attachments, 0, sizeof(state->dynamic_subpass_attachments)); + memset(subpass_attachments, 0, sizeof(state->dynamic_subpass_attachments)); + + vk_object_base_init(&device->vk, (struct vk_object_base *) pass, + VK_OBJECT_TYPE_RENDER_PASS); + + pass->attachments = pass_attachments; + pass->subpass_attachments = subpass_attachments; + + subpass->view_mask = info->viewMask; + subpass->color_count = info->colorAttachmentCount; + subpass->color_attachments = &subpass_attachments[0]; + subpass->resolve_attachments = &subpass_attachments[subpass->color_count]; + + pass->multiview_enabled = info->viewMask != 0; + pass->subpass_count = 1; + pass->subpasses = subpass; + + int a = 0; + for (int i = 0; i < info->colorAttachmentCount; i++) { + struct v3dv_render_pass_attachment *att = &pass->attachments[a]; + const VkRenderingAttachmentInfo *att_info = &info->pColorAttachments[i]; + + if (att_info->imageView == VK_NULL_HANDLE) { + subpass->color_attachments[i].attachment = VK_ATTACHMENT_UNUSED; + 
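/* A VK_NULL_HANDLE imageView marks the attachment as unused; its + * paired resolve slot must be marked unused as well. */ +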
subpass->resolve_attachments[i].attachment = VK_ATTACHMENT_UNUSED; + continue; + } + + setup_dynamic_attachment(device, att, att_info, false, false); + subpass->color_attachments[i].attachment = a++; + subpass->color_attachments[i].layout = att_info->imageLayout; + + if (att_info->resolveMode != VK_RESOLVE_MODE_NONE) { + struct v3dv_render_pass_attachment *resolve_att = &pass->attachments[a]; + setup_dynamic_attachment(device, resolve_att, att_info, false, true); + subpass->resolve_attachments[i].attachment = a++; + subpass->resolve_attachments[i].layout = att_info->resolveImageLayout; + } else { + subpass->resolve_attachments[i].attachment = VK_ATTACHMENT_UNUSED; + } + } + + bool has_depth = info->pDepthAttachment && + info->pDepthAttachment->imageView != VK_NULL_HANDLE; + bool has_stencil = info->pStencilAttachment && + info->pStencilAttachment->imageView != VK_NULL_HANDLE; + if (has_depth || has_stencil) { + struct v3dv_render_pass_attachment *att = &pass->attachments[a]; + subpass->ds_attachment.attachment = a++; + + bool has_depth_resolve = false; + bool has_stencil_resolve = false; + + if (has_depth) { + setup_dynamic_attachment(device, att, info->pDepthAttachment, + false, false); + subpass->ds_attachment.layout = info->pDepthAttachment->imageLayout; + has_depth_resolve = + info->pDepthAttachment->resolveMode != VK_RESOLVE_MODE_NONE; + } + + if (has_stencil) { + if (has_depth) { + att->desc.stencilLoadOp = info->pStencilAttachment->loadOp; + att->desc.stencilStoreOp = info->pStencilAttachment->storeOp; + } else { + setup_dynamic_attachment(device, att, info->pStencilAttachment, + true, false); + subpass->ds_attachment.layout = + info->pStencilAttachment->imageLayout; + } + has_stencil_resolve = + info->pStencilAttachment->resolveMode != VK_RESOLVE_MODE_NONE; + } + + if (has_depth_resolve || has_stencil_resolve) { + struct v3dv_render_pass_attachment *att = &pass->attachments[a]; + subpass->ds_resolve_attachment.attachment = a++; + if (has_depth_resolve) { + setup_dynamic_attachment(device, att, info->pDepthAttachment, + false, true); + subpass->ds_resolve_attachment.layout = + info->pDepthAttachment->resolveImageLayout; + subpass->resolve_depth = true; + } + if (has_stencil_resolve) { + if (has_depth_resolve) { + att->desc.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + att->desc.stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE; + } else { + setup_dynamic_attachment(device, att, info->pStencilAttachment, + true, true); + subpass->ds_resolve_attachment.layout = + info->pStencilAttachment->resolveImageLayout; + } + subpass->resolve_stencil = true; + } + } else { + subpass->ds_resolve_attachment.attachment = VK_ATTACHMENT_UNUSED; + } + } else { + subpass->ds_attachment.attachment = VK_ATTACHMENT_UNUSED; + } + + check_do_depth_stencil_clear_with_draw(device, pass, subpass); + + pass->attachment_count = a; +} + +void +v3dv_setup_dynamic_render_pass_inheritance(struct v3dv_cmd_buffer *cmd_buffer, + const VkCommandBufferInheritanceRenderingInfo *info) +{ + struct v3dv_device *device = cmd_buffer->device; + struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + + struct v3dv_render_pass *pass = &state->dynamic_pass; + struct v3dv_subpass *subpass = &state->dynamic_subpass; + struct v3dv_render_pass_attachment *pass_attachments = + &state->dynamic_attachments[0]; + struct v3dv_subpass_attachment *subpass_attachments = + &state->dynamic_subpass_attachments[0]; + + memset(pass, 0, sizeof(*pass)); + memset(subpass, 0, sizeof(*subpass)); + memset(pass_attachments, 0, 
sizeof(state->dynamic_subpass_attachments)); + memset(subpass_attachments, 0, sizeof(state->dynamic_subpass_attachments)); + + vk_object_base_init(&device->vk, (struct vk_object_base *) pass, + VK_OBJECT_TYPE_RENDER_PASS); + + pass->attachments = pass_attachments; + pass->subpass_attachments = subpass_attachments; + + subpass->view_mask = info->viewMask; + subpass->color_count = info->colorAttachmentCount; + subpass->color_attachments = &subpass_attachments[0]; + subpass->resolve_attachments = NULL; + + pass->multiview_enabled = info->viewMask != 0; + pass->subpass_count = 1; + pass->subpasses = subpass; + + int a = 0; + for (int i = 0; i < info->colorAttachmentCount; i++) { + struct v3dv_render_pass_attachment *att = &pass->attachments[a]; + const VkFormat format = info->pColorAttachmentFormats[i]; + + if (format == VK_FORMAT_UNDEFINED) { + subpass->color_attachments[i].attachment = VK_ATTACHMENT_UNUSED; + continue; + } + + /* We don't have info about load/store, so we assume we load and we + * store. + */ + att->desc.format = format; + att->desc.samples = info->rasterizationSamples; + att->desc.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; + att->desc.storeOp = VK_ATTACHMENT_STORE_OP_STORE; + att->desc.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + att->desc.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; + subpass->color_attachments[i].attachment = a++; + } + + if (info->depthAttachmentFormat != VK_FORMAT_UNDEFINED || + info->stencilAttachmentFormat != VK_FORMAT_UNDEFINED) { + struct v3dv_render_pass_attachment *att = &pass->attachments[a]; + att->desc.format = info->depthAttachmentFormat != VK_FORMAT_UNDEFINED ? + info->depthAttachmentFormat : info->stencilAttachmentFormat; + att->desc.samples = info->rasterizationSamples; + if (vk_format_has_depth(att->desc.format)) { + att->desc.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; + att->desc.storeOp = VK_ATTACHMENT_STORE_OP_STORE; + } else { + att->desc.loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + att->desc.storeOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; + } + if (vk_format_has_stencil(att->desc.format)) { + att->desc.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD; + att->desc.stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE; + } else { + att->desc.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + att->desc.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; + } + subpass->ds_attachment.attachment = a++; + } else { + subpass->ds_attachment.attachment = VK_ATTACHMENT_UNUSED; + } + + pass->attachment_count = a; +} diff --git a/src/broadcom/vulkan/v3dv_pipeline.c b/src/broadcom/vulkan/v3dv_pipeline.c index 44962c50508..9851a24c2cd 100644 --- a/src/broadcom/vulkan/v3dv_pipeline.c +++ b/src/broadcom/vulkan/v3dv_pipeline.c @@ -1,5 +1,5 @@ /* - * Copyright © 2019 Raspberry Pi + * Copyright © 2019 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -26,18 +26,18 @@ #include "v3dv_debug.h" #include "v3dv_private.h" -#include "vk_format_info.h" - #include "common/v3d_debug.h" +#include "qpu/qpu_disasm.h" #include "compiler/nir/nir_builder.h" #include "nir/nir_serialize.h" #include "util/u_atomic.h" -#include "util/u_prim.h" #include "util/os_time.h" -#include "vulkan/util/vk_format.h" +#include "vk_format.h" +#include "vk_nir_convert_ycbcr.h" +#include "vk_pipeline.h" static VkResult compute_vpm_config(struct v3dv_pipeline *pipeline); @@ -61,31 +61,15 @@ v3dv_print_v3d_key(struct v3d_key *key, } static void -pipeline_compute_sha1_from_nir(nir_shader 
*nir, - unsigned char sha1[20]) -{ - assert(nir); - struct blob blob; - blob_init(&blob); - - nir_serialize(&blob, nir, false); - if (!blob.out_of_memory) - _mesa_sha1_compute(blob.data, blob.size, sha1); - - blob_finish(&blob); -} - -void -v3dv_shader_module_internal_init(struct v3dv_device *device, - struct vk_shader_module *module, - nir_shader *nir) +pipeline_compute_sha1_from_nir(struct v3dv_pipeline_stage *p_stage) { - vk_object_base_init(&device->vk, &module->base, - VK_OBJECT_TYPE_SHADER_MODULE); - module->nir = nir; - module->size = 0; + VkPipelineShaderStageCreateInfo info = { + .module = vk_shader_module_handle_from_nir(p_stage->nir), + .pName = p_stage->entrypoint, + .stage = mesa_to_vk_shader_stage(p_stage->nir->info.stage), + }; - pipeline_compute_sha1_from_nir(nir, module->sha1); + vk_pipeline_hash_shader_stage(&info, NULL, p_stage->shader_sha1); } void @@ -95,6 +79,10 @@ v3dv_shader_variant_destroy(struct v3dv_device *device, /* The assembly BO is shared by all variants in the pipeline, so it can't * be freed here and should be freed with the pipeline */ + if (variant->qpu_insts) { + free(variant->qpu_insts); + variant->qpu_insts = NULL; + } ralloc_free(variant->prog_data.base); vk_free(&device->vk.alloc, variant); } @@ -118,22 +106,10 @@ pipeline_free_stages(struct v3dv_device *device, { assert(pipeline); - /* FIXME: we can't just use a loop over mesa stage due the bin, would be - * good to find an alternative. - */ - destroy_pipeline_stage(device, pipeline->vs, pAllocator); - destroy_pipeline_stage(device, pipeline->vs_bin, pAllocator); - destroy_pipeline_stage(device, pipeline->gs, pAllocator); - destroy_pipeline_stage(device, pipeline->gs_bin, pAllocator); - destroy_pipeline_stage(device, pipeline->fs, pAllocator); - destroy_pipeline_stage(device, pipeline->cs, pAllocator); - - pipeline->vs = NULL; - pipeline->vs_bin = NULL; - pipeline->gs = NULL; - pipeline->gs_bin = NULL; - pipeline->fs = NULL; - pipeline->cs = NULL; + for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) { + destroy_pipeline_stage(device, pipeline->stages[stage], pAllocator); + pipeline->stages[stage] = NULL; + } } static void @@ -161,6 +137,12 @@ v3dv_destroy_pipeline(struct v3dv_pipeline *pipeline, pipeline->default_attribute_values = NULL; } + if (pipeline->executables.mem_ctx) + ralloc_free(pipeline->executables.mem_ctx); + + if (pipeline->layout) + v3dv_pipeline_layout_unref(device, pipeline->layout, pAllocator); + vk_object_free(&device->vk, pAllocator, pipeline); } @@ -181,31 +163,44 @@ v3dv_DestroyPipeline(VkDevice _device, static const struct spirv_to_nir_options default_spirv_options = { .caps = { .device_group = true, + .float_controls = true, .multiview = true, + .storage_8bit = true, + .storage_16bit = true, + .subgroup_ballot = true, .subgroup_basic = true, + .subgroup_quad = true, + .subgroup_shuffle = true, + .subgroup_vote = true, .variable_pointers = true, + .vk_memory_model = true, + .vk_memory_model_device_scope = true, + .physical_storage_buffer_address = true, + .workgroup_memory_explicit_layout = true, + .image_read_without_format = true, + .demote_to_helper_invocation = true, }, .ubo_addr_format = nir_address_format_32bit_index_offset, .ssbo_addr_format = nir_address_format_32bit_index_offset, - .phys_ssbo_addr_format = nir_address_format_64bit_global, + .phys_ssbo_addr_format = nir_address_format_2x32bit_global, .push_const_addr_format = nir_address_format_logical, .shared_addr_format = nir_address_format_32bit_offset, - .frag_coord_is_sysval = false, }; const 
nir_shader_compiler_options v3dv_nir_options = { .lower_uadd_sat = true, + .lower_usub_sat = true, .lower_iadd_sat = true, .lower_all_io_to_temps = true, .lower_extract_byte = true, .lower_extract_word = true, .lower_insert_byte = true, .lower_insert_word = true, - .lower_bitfield_insert_to_shifts = true, - .lower_bitfield_extract_to_shifts = true, + .lower_bitfield_insert = true, + .lower_bitfield_extract = true, .lower_bitfield_reverse = true, .lower_bit_count = true, - .lower_cs_local_id_from_index = true, + .lower_cs_local_id_to_index = true, .lower_ffract = true, .lower_fmod = true, .lower_pack_unorm_2x16 = true, @@ -218,14 +213,9 @@ const nir_shader_compiler_options v3dv_nir_options = { .lower_unpack_snorm_4x8 = true, .lower_pack_half_2x16 = true, .lower_unpack_half_2x16 = true, - /* FIXME: see if we can avoid the uadd_carry and usub_borrow lowering and - * get the tests to pass since it might produce slightly better code. - */ - .lower_uadd_carry = true, - .lower_usub_borrow = true, - /* FIXME: check if we can use multop + umul24 to implement mul2x32_64 - * without lowering. - */ + .lower_pack_32_2x16 = true, + .lower_pack_32_2x16_split = true, + .lower_unpack_32_2x16_split = true, .lower_mul_2x32_64 = true, .lower_fdiv = true, .lower_find_lsb = true, @@ -240,10 +230,10 @@ const nir_shader_compiler_options v3dv_nir_options = { .lower_isign = true, .lower_ldexp = true, .lower_mul_high = true, - .lower_wpos_pntc = true, - .lower_rotate = true, + .lower_wpos_pntc = false, .lower_to_scalar = true, .lower_device_index_to_zero = true, + .lower_fquantize2f16 = true, .has_fsub = true, .has_isub = true, .vertex_id_zero_based = false, /* FIXME: to set this to true, the intrinsic @@ -252,7 +242,7 @@ const nir_shader_compiler_options v3dv_nir_options = { .max_unroll_iterations = 16, .force_indirect_unrolling = (nir_var_shader_in | nir_var_function_temp), .divergence_analysis_options = - nir_divergence_multiple_workgroup_per_compute_subgroup + nir_divergence_multiple_workgroup_per_compute_subgroup, }; const nir_shader_compiler_options * @@ -261,95 +251,39 @@ v3dv_pipeline_get_nir_options(void) return &v3dv_nir_options; } -#define OPT(pass, ...) ({ \ - bool this_progress = false; \ - NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \ - if (this_progress) \ - progress = true; \ - this_progress; \ -}) - -static void -nir_optimize(nir_shader *nir, bool allow_copies) -{ - bool progress; - - do { - progress = false; - OPT(nir_split_array_vars, nir_var_function_temp); - OPT(nir_shrink_vec_array_vars, nir_var_function_temp); - OPT(nir_opt_deref); - OPT(nir_lower_vars_to_ssa); - if (allow_copies) { - /* Only run this pass in the first call to nir_optimize. Later calls - * assume that we've lowered away any copy_deref instructions and we - * don't want to introduce any more. - */ - OPT(nir_opt_find_array_copies); - } - OPT(nir_opt_copy_prop_vars); - OPT(nir_opt_dead_write_vars); - OPT(nir_opt_combine_stores, nir_var_all); - - OPT(nir_lower_alu_to_scalar, NULL, NULL); - - OPT(nir_copy_prop); - OPT(nir_lower_phis_to_scalar, false); - - OPT(nir_copy_prop); - OPT(nir_opt_dce); - OPT(nir_opt_cse); - OPT(nir_opt_combine_stores, nir_var_all); - - /* Passing 0 to the peephole select pass causes it to convert - * if-statements that contain only move instructions in the branches - * regardless of the count. - * - * Passing 1 to the peephole select pass causes it to convert - * if-statements that contain at most a single ALU instruction (total) - * in both branches. 
- */ - OPT(nir_opt_peephole_select, 0, false, false); - OPT(nir_opt_peephole_select, 8, false, true); - - OPT(nir_opt_intrinsics); - OPT(nir_opt_idiv_const, 32); - OPT(nir_opt_algebraic); - OPT(nir_opt_constant_folding); - - OPT(nir_opt_dead_cf); +static const struct vk_ycbcr_conversion_state * +lookup_ycbcr_conversion(const void *_pipeline_layout, uint32_t set, + uint32_t binding, uint32_t array_index) +{ + struct v3dv_pipeline_layout *pipeline_layout = + (struct v3dv_pipeline_layout *) _pipeline_layout; - OPT(nir_opt_if, false); - OPT(nir_opt_conditional_discard); + assert(set < pipeline_layout->num_sets); + struct v3dv_descriptor_set_layout *set_layout = + pipeline_layout->set[set].layout; - OPT(nir_opt_remove_phis); - OPT(nir_opt_undef); - OPT(nir_lower_pack); - } while (progress); + assert(binding < set_layout->binding_count); + struct v3dv_descriptor_set_binding_layout *bind_layout = + &set_layout->binding[binding]; - OPT(nir_remove_dead_variables, nir_var_function_temp, NULL); + if (bind_layout->immutable_samplers_offset) { + const struct v3dv_sampler *immutable_samplers = + v3dv_immutable_samplers(set_layout, bind_layout); + const struct v3dv_sampler *sampler = &immutable_samplers[array_index]; + return sampler->conversion ? &sampler->conversion->state : NULL; + } else { + return NULL; + } } static void preprocess_nir(nir_shader *nir) { - /* We have to lower away local variable initializers right before we - * inline functions. That way they get properly initialized at the top - * of the function and not at the top of its caller. - */ - NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp); - NIR_PASS_V(nir, nir_lower_returns); - NIR_PASS_V(nir, nir_inline_functions); - NIR_PASS_V(nir, nir_opt_deref); - - /* Pick off the single entrypoint that we want */ - foreach_list_typed_safe(nir_function, func, node, &nir->functions) { - if (func->is_entrypoint) - func->name = ralloc_strdup(func, "main"); - else - exec_node_remove(&func->node); - } - assert(exec_list_length(&nir->functions) == 1); + const struct nir_lower_sysvals_to_varyings_options sysvals_to_varyings = { + .frag_coord = true, + .point_coord = true, + }; + NIR_PASS(_, nir, nir_lower_sysvals_to_varyings, &sysvals_to_varyings); /* Vulkan uses the separate-shader linking model */ nir->info.separate_shader = true; @@ -357,76 +291,63 @@ preprocess_nir(nir_shader *nir) /* Make sure we lower variable initializers on output variables so that * nir_remove_dead_variables below sees the corresponding stores */ - NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_shader_out); - - /* Now that we've deleted all but the main function, we can go ahead and - * lower the rest of the variable initializers. - */ - NIR_PASS_V(nir, nir_lower_variable_initializers, ~0); - - /* Split member structs. We do this before lower_io_to_temporaries so that - * it doesn't lower system values to temporaries by accident. 
- */ - NIR_PASS_V(nir, nir_split_var_copies); - NIR_PASS_V(nir, nir_split_per_member_structs); + NIR_PASS(_, nir, nir_lower_variable_initializers, nir_var_shader_out); if (nir->info.stage == MESA_SHADER_FRAGMENT) - NIR_PASS_V(nir, nir_lower_io_to_vector, nir_var_shader_out); + NIR_PASS(_, nir, nir_lower_io_to_vector, nir_var_shader_out); if (nir->info.stage == MESA_SHADER_FRAGMENT) { - NIR_PASS_V(nir, nir_lower_input_attachments, + NIR_PASS(_, nir, nir_lower_input_attachments, &(nir_input_attachment_options) { .use_fragcoord_sysval = false, }); } - NIR_PASS_V(nir, nir_lower_explicit_io, - nir_var_mem_push_const, - nir_address_format_32bit_offset); + NIR_PASS_V(nir, nir_lower_io_to_temporaries, + nir_shader_get_entrypoint(nir), true, false); - NIR_PASS_V(nir, nir_lower_explicit_io, - nir_var_mem_ubo | nir_var_mem_ssbo, - nir_address_format_32bit_index_offset); + NIR_PASS(_, nir, nir_lower_system_values); - NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_shader_in | - nir_var_shader_out | nir_var_system_value | nir_var_mem_shared, - NULL); + NIR_PASS(_, nir, nir_lower_alu_to_scalar, NULL, NULL); - NIR_PASS_V(nir, nir_propagate_invariant, false); - NIR_PASS_V(nir, nir_lower_io_to_temporaries, - nir_shader_get_entrypoint(nir), true, false); + NIR_PASS(_, nir, nir_normalize_cubemap_coords); - NIR_PASS_V(nir, nir_lower_system_values); - NIR_PASS_V(nir, nir_lower_clip_cull_distance_arrays); + NIR_PASS(_, nir, nir_lower_global_vars_to_local); - NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL); + NIR_PASS(_, nir, nir_split_var_copies); + NIR_PASS(_, nir, nir_split_struct_vars, nir_var_function_temp); - NIR_PASS_V(nir, nir_normalize_cubemap_coords); + v3d_optimize_nir(NULL, nir); - NIR_PASS_V(nir, nir_lower_global_vars_to_local); + NIR_PASS(_, nir, nir_lower_explicit_io, + nir_var_mem_push_const, + nir_address_format_32bit_offset); - NIR_PASS_V(nir, nir_split_var_copies); - NIR_PASS_V(nir, nir_split_struct_vars, nir_var_function_temp); + NIR_PASS(_, nir, nir_lower_explicit_io, + nir_var_mem_ubo | nir_var_mem_ssbo, + nir_address_format_32bit_index_offset); - nir_optimize(nir, true); + NIR_PASS(_, nir, nir_lower_explicit_io, + nir_var_mem_global, + nir_address_format_2x32bit_global); - NIR_PASS_V(nir, nir_lower_load_const_to_scalar); + NIR_PASS(_, nir, nir_lower_load_const_to_scalar); /* Lower a bunch of stuff */ - NIR_PASS_V(nir, nir_lower_var_copies); + NIR_PASS(_, nir, nir_lower_var_copies); - NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_shader_in, UINT32_MAX); + NIR_PASS(_, nir, nir_lower_indirect_derefs, nir_var_shader_in, UINT32_MAX); - NIR_PASS_V(nir, nir_lower_indirect_derefs, - nir_var_function_temp, 2); + NIR_PASS(_, nir, nir_lower_indirect_derefs, + nir_var_function_temp, 2); - NIR_PASS_V(nir, nir_lower_array_deref_of_vec, - nir_var_mem_ubo | nir_var_mem_ssbo, - nir_lower_direct_array_deref_of_vec_load); + NIR_PASS(_, nir, nir_lower_array_deref_of_vec, + nir_var_mem_ubo | nir_var_mem_ssbo, + nir_lower_direct_array_deref_of_vec_load); - NIR_PASS_V(nir, nir_lower_frexp); + NIR_PASS(_, nir, nir_lower_frexp); /* Get rid of split copies */ - nir_optimize(nir, false); + v3d_optimize_nir(NULL, nir); } static nir_shader * @@ -435,42 +356,35 @@ shader_module_compile_to_nir(struct v3dv_device *device, { nir_shader *nir; const nir_shader_compiler_options *nir_options = &v3dv_nir_options; + gl_shader_stage gl_stage = broadcom_shader_stage_to_gl(stage->stage); - if (!stage->module->nir) { - uint32_t *spirv = (uint32_t *) stage->module->data; - assert(stage->module->size % 4 == 0); - - if 
(V3D_DEBUG & V3D_DEBUG_DUMP_SPIRV) - v3dv_print_spirv(stage->module->data, stage->module->size, stderr); - - uint32_t num_spec_entries = 0; - struct nir_spirv_specialization *spec_entries = - vk_spec_info_to_nir_spirv(stage->spec_info, &num_spec_entries); - const struct spirv_to_nir_options spirv_options = default_spirv_options; - nir = spirv_to_nir(spirv, stage->module->size / 4, - spec_entries, num_spec_entries, - broadcom_shader_stage_to_gl(stage->stage), - stage->entrypoint, - &spirv_options, nir_options); - assert(nir); - nir_validate_shader(nir, "after spirv_to_nir"); - free(spec_entries); - } else { - /* For NIR modules created by the driver we can't consume the NIR - * directly, we need to clone it first, since ownership of the NIR code - * (as with SPIR-V code for SPIR-V shaders), belongs to the creator - * of the module and modules can be destroyed immediately after been used - * to create pipelines. - */ - nir = nir_shader_clone(NULL, stage->module->nir); - nir_validate_shader(nir, "nir module"); + + if (V3D_DBG(DUMP_SPIRV) && stage->module->nir == NULL) + v3dv_print_spirv(stage->module->data, stage->module->size, stderr); + + /* vk_shader_module_to_nir also handles internal shaders, when module->nir + * != NULL. It also calls nir_validate_shader in both cases, so we don't + * call it again here. + */ + VkResult result = vk_shader_module_to_nir(&device->vk, stage->module, + gl_stage, + stage->entrypoint, + stage->spec_info, + &default_spirv_options, + nir_options, + NULL, &nir); + if (result != VK_SUCCESS) + return NULL; + assert(nir->info.stage == gl_stage); + + if (V3D_DBG(SHADERDB) && stage->module->nir == NULL) { + char sha1buf[41]; + _mesa_sha1_format(sha1buf, stage->pipeline->sha1); + nir->info.name = ralloc_strdup(nir, sha1buf); } - assert(nir->info.stage == broadcom_shader_stage_to_gl(stage->stage)); - if (V3D_DEBUG & (V3D_DEBUG_NIR | - v3d_debug_flag_for_shader_stage( - broadcom_shader_stage_to_gl(stage->stage)))) { - fprintf(stderr, "Initial form: %s prog %d NIR:\n", + if (V3D_DBG(NIR) || v3d_debug_flag_for_shader_stage(gl_stage)) { + fprintf(stderr, "NIR after vk_shader_module_to_nir: %s prog %d NIR:\n", broadcom_shader_stage_name(stage->stage), stage->program_id); nir_print_shader(nir, stderr); @@ -497,17 +411,21 @@ descriptor_map_add(struct v3dv_descriptor_map *map, int binding, int array_index, int array_size, - uint8_t return_size) + int start_index, + uint8_t return_size, + uint8_t plane) { assert(array_index < array_size); assert(return_size == 16 || return_size == 32); - unsigned index = 0; - for (unsigned i = 0; i < map->num_desc; i++) { - if (set == map->set[i] && - binding == map->binding[i] && - array_index == map->array_index[i]) { - assert(array_size == map->array_size[i]); + unsigned index = start_index; + for (; index < map->num_desc; index++) { + if (map->used[index] && + set == map->set[index] && + binding == map->binding[index] && + array_index == map->array_index[index] && + plane == map->plane[index]) { + assert(array_size == map->array_size[index]); if (return_size != map->return_size[index]) { /* If the return_size is different it means that the same sampler * was used for operations with different precision @@ -517,26 +435,36 @@ descriptor_map_add(struct v3dv_descriptor_map *map, map->return_size[index] = 32; } return index; + } else if (!map->used[index]) { + break; } - index++; } - assert(index == map->num_desc); + assert(index < DESCRIPTOR_MAP_SIZE); + assert(!map->used[index]); - map->set[map->num_desc] = set; - map->binding[map->num_desc] =
binding; - map->array_index[map->num_desc] = array_index; - map->array_size[map->num_desc] = array_size; - map->return_size[map->num_desc] = return_size; - map->num_desc++; + map->used[index] = true; + map->set[index] = set; + map->binding[index] = binding; + map->array_index[index] = array_index; + map->array_size[index] = array_size; + map->return_size[index] = return_size; + map->plane[index] = plane; + map->num_desc = MAX2(map->num_desc, index + 1); return index; } +struct lower_pipeline_layout_state { + struct v3dv_pipeline *pipeline; + const struct v3dv_pipeline_layout *layout; + bool needs_default_sampler_state; +}; + static void lower_load_push_constant(nir_builder *b, nir_intrinsic_instr *instr, - struct v3dv_pipeline *pipeline) + struct lower_pipeline_layout_state *state) { assert(instr->intrinsic == nir_intrinsic_load_push_constant); instr->intrinsic = nir_intrinsic_load_uniform; @@ -568,8 +496,11 @@ pipeline_get_descriptor_map(struct v3dv_pipeline *pipeline, &pipeline->shared_data->maps[broadcom_stage]->sampler_map : &pipeline->shared_data->maps[broadcom_stage]->texture_map; case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: return &pipeline->shared_data->maps[broadcom_stage]->ubo_map; case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: return &pipeline->shared_data->maps[broadcom_stage]->ssbo_map; default: unreachable("Descriptor type unknown or not having a descriptor map"); @@ -581,9 +512,7 @@ static void lower_vulkan_resource_index(nir_builder *b, nir_intrinsic_instr *instr, - nir_shader *shader, - struct v3dv_pipeline *pipeline, - const struct v3dv_pipeline_layout *layout) + struct lower_pipeline_layout_state *state) { assert(instr->intrinsic == nir_intrinsic_vulkan_resource_index); @@ -591,35 +520,50 @@ lower_vulkan_resource_index(nir_builder *b, unsigned set = nir_intrinsic_desc_set(instr); unsigned binding = nir_intrinsic_binding(instr); - struct v3dv_descriptor_set_layout *set_layout = layout->set[set].layout; + struct v3dv_descriptor_set_layout *set_layout = state->layout->set[set].layout; struct v3dv_descriptor_set_binding_layout *binding_layout = &set_layout->binding[binding]; unsigned index = 0; - const VkDescriptorType desc_type = nir_intrinsic_desc_type(instr); - switch (desc_type) { + switch (binding_layout->type) { case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: { + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: { struct v3dv_descriptor_map *descriptor_map = - pipeline_get_descriptor_map(pipeline, desc_type, shader->info.stage, false); + pipeline_get_descriptor_map(state->pipeline, binding_layout->type, + b->shader->info.stage, false); if (!const_val) unreachable("non-constant vulkan_resource_index array index"); + /* At compile-time we will need to know if we are processing a UBO load + * for an inline or a regular UBO so we can handle inline loads like + * push constants. At the NIR level, however, the inline + * information is gone, so we rely on the index to make this distinction. + * Particularly, we reserve indices 1..MAX_INLINE_UNIFORM_BUFFERS for + * inline buffers.
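+ * (UBO index 0 is reserved for the push constant buffer, which is why + * the driver-visible indices for inline buffers start at 1.)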
This means that at the descriptor map level + * we store inline buffers at slots 0..MAX_INLINE_UNIFORM_BUFFERS - 1, + * and regular UBOs at indices starting from MAX_INLINE_UNIFORM_BUFFERS. + */ + uint32_t start_index = 0; + if (binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER || + binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) { + start_index += MAX_INLINE_UNIFORM_BUFFERS; + } + index = descriptor_map_add(descriptor_map, set, binding, const_val->u32, binding_layout->array_size, - 32 /* return_size: doesn't really apply for this case */); - - if (desc_type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) { - /* skip index 0 which is used for push constants */ - index++; - } + start_index, + 32 /* return_size: doesn't really apply for this case */, + 0); break; } default: - unreachable("unsupported desc_type for vulkan_resource_index"); + unreachable("unsupported descriptor type for vulkan_resource_index"); break; } @@ -627,30 +571,43 @@ lower_vulkan_resource_index(nir_builder *b, * vulkan_load_descriptor return a vec2 providing an index and * offset. Our backend compiler only cares about the index part. */ - nir_ssa_def_rewrite_uses(&instr->dest.ssa, + nir_def_rewrite_uses(&instr->def, nir_imm_ivec2(b, index, 0)); nir_instr_remove(&instr->instr); } +static uint8_t +tex_instr_get_and_remove_plane_src(nir_tex_instr *tex) +{ + int plane_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_plane); + if (plane_src_idx < 0) + return 0; + + uint8_t plane = nir_src_as_uint(tex->src[plane_src_idx].src); + nir_tex_instr_remove_src(tex, plane_src_idx); + return plane; +} + /* Returns return_size, so it could be used for the case of not having a * sampler object */ static uint8_t -lower_tex_src_to_offset(nir_builder *b, nir_tex_instr *instr, unsigned src_idx, - nir_shader *shader, - struct v3dv_pipeline *pipeline, - const struct v3dv_pipeline_layout *layout) +lower_tex_src(nir_builder *b, + nir_tex_instr *instr, + unsigned src_idx, + struct lower_pipeline_layout_state *state) { - nir_ssa_def *index = NULL; + nir_def *index = NULL; unsigned base_index = 0; unsigned array_elements = 1; nir_tex_src *src = &instr->src[src_idx]; bool is_sampler = src->src_type == nir_tex_src_sampler_deref; + uint8_t plane = tex_instr_get_and_remove_plane_src(instr); + /* We compute first the offsets */ nir_deref_instr *deref = nir_instr_as_deref(src->src.ssa->parent_instr); while (deref->deref_type != nir_deref_type_var) { - assert(deref->parent.is_ssa); nir_deref_instr *parent = nir_instr_as_deref(deref->parent.ssa->parent_instr); @@ -667,8 +624,8 @@ lower_tex_src_to_offset(nir_builder *b, nir_tex_instr *instr, unsigned src_idx, } index = nir_iadd(b, index, - nir_imul(b, nir_imm_int(b, array_elements), - nir_ssa_for_src(b, deref->arr.index, 1))); + nir_imul_imm(b, deref->arr.index.ssa, + array_elements)); } array_elements *= glsl_get_length(parent->type); @@ -683,8 +640,7 @@ lower_tex_src_to_offset(nir_builder *b, nir_tex_instr *instr, unsigned src_idx, * instr if needed */ if (index) { - nir_instr_rewrite_src(&instr->instr, &src->src, - nir_src_for_ssa(index)); + nir_src_rewrite(&src->src, index); src->src_type = is_sampler ? nir_tex_src_sampler_offset : @@ -696,13 +652,13 @@ lower_tex_src_to_offset(nir_builder *b, nir_tex_instr *instr, unsigned src_idx, uint32_t set = deref->var->data.descriptor_set; uint32_t binding = deref->var->data.binding; /* FIXME: this is a really simplified check for the precision to be used - * for the sampling. 
Right now we are ony checking for the variables used + * for the sampling. Right now we are only checking for the variables used * on the operation itself, but there are other cases that we could use to * infer the precision requirement. */ bool relaxed_precision = deref->var->data.precision == GLSL_PRECISION_MEDIUM || deref->var->data.precision == GLSL_PRECISION_LOW; - struct v3dv_descriptor_set_layout *set_layout = layout->set[set].layout; + struct v3dv_descriptor_set_layout *set_layout = state->layout->set[set].layout; struct v3dv_descriptor_set_binding_layout *binding_layout = &set_layout->binding[binding]; @@ -714,23 +670,25 @@ lower_tex_src_to_offset(nir_builder *b, nir_tex_instr *instr, unsigned src_idx, base_index; uint8_t return_size; - if (unlikely(V3D_DEBUG & V3D_DEBUG_TMU_16BIT)) + if (V3D_DBG(TMU_16BIT)) return_size = 16; - else if (unlikely(V3D_DEBUG & V3D_DEBUG_TMU_32BIT)) + else if (V3D_DBG(TMU_32BIT)) return_size = 32; else - return_size = relaxed_precision || instr->is_shadow ? 16 : 32; + return_size = relaxed_precision ? 16 : 32; struct v3dv_descriptor_map *map = - pipeline_get_descriptor_map(pipeline, binding_layout->type, - shader->info.stage, is_sampler); + pipeline_get_descriptor_map(state->pipeline, binding_layout->type, + b->shader->info.stage, is_sampler); int desc_index = descriptor_map_add(map, deref->var->data.descriptor_set, deref->var->data.binding, array_index, binding_layout->array_size, - return_size); + 0, + return_size, + plane); if (is_sampler) instr->sampler_index = desc_index; @@ -741,10 +699,9 @@ lower_tex_src_to_offset(nir_builder *b, nir_tex_instr *instr, unsigned src_idx, } static bool -lower_sampler(nir_builder *b, nir_tex_instr *instr, - nir_shader *shader, - struct v3dv_pipeline *pipeline, - const struct v3dv_pipeline_layout *layout) +lower_sampler(nir_builder *b, + nir_tex_instr *instr, + struct lower_pipeline_layout_state *state) { uint8_t return_size = 0; @@ -752,44 +709,43 @@ lower_sampler(nir_builder *b, nir_tex_instr *instr, nir_tex_instr_src_index(instr, nir_tex_src_texture_deref); if (texture_idx >= 0) - return_size = lower_tex_src_to_offset(b, instr, texture_idx, shader, - pipeline, layout); + return_size = lower_tex_src(b, instr, texture_idx, state); int sampler_idx = nir_tex_instr_src_index(instr, nir_tex_src_sampler_deref); - if (sampler_idx >= 0) - lower_tex_src_to_offset(b, instr, sampler_idx, shader, pipeline, layout); + if (sampler_idx >= 0) { + assert(nir_tex_instr_need_sampler(instr)); + lower_tex_src(b, instr, sampler_idx, state); + } if (texture_idx < 0 && sampler_idx < 0) return false; - /* If we don't have a sampler, we assign it the idx we reserve for this - * case, and we ensure that it is using the correct return size. + /* If the instruction doesn't have a sampler (i.e. txf) we use backend_flags + * to bind a default sampler state to configure precision. */ if (sampler_idx < 0) { - instr->sampler_index = return_size == 16 ? + state->needs_default_sampler_state = true; + instr->backend_flags = return_size == 16 ? V3DV_NO_SAMPLER_16BIT_IDX : V3DV_NO_SAMPLER_32BIT_IDX; } return true; } -/* FIXME: really similar to lower_tex_src_to_offset, perhaps refactor? */ +/* FIXME: really similar to lower_tex_src, perhaps refactor?
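+ * Both walk a deref chain down to a flat array index and then add a + * descriptor map entry, so a shared helper could cover the two.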
*/ static void lower_image_deref(nir_builder *b, nir_intrinsic_instr *instr, - nir_shader *shader, - struct v3dv_pipeline *pipeline, - const struct v3dv_pipeline_layout *layout) + struct lower_pipeline_layout_state *state) { nir_deref_instr *deref = nir_src_as_deref(instr->src[0]); - nir_ssa_def *index = NULL; + nir_def *index = NULL; unsigned array_elements = 1; unsigned base_index = 0; while (deref->deref_type != nir_deref_type_var) { - assert(deref->parent.is_ssa); nir_deref_instr *parent = nir_instr_as_deref(deref->parent.ssa->parent_instr); @@ -806,8 +762,8 @@ lower_image_deref(nir_builder *b, } index = nir_iadd(b, index, - nir_imul(b, nir_imm_int(b, array_elements), - nir_ssa_for_src(b, deref->arr.index, 1))); + nir_imul_imm(b, deref->arr.index.ssa, + array_elements)); } array_elements *= glsl_get_length(parent->type); @@ -820,7 +776,7 @@ lower_image_deref(nir_builder *b, uint32_t set = deref->var->data.descriptor_set; uint32_t binding = deref->var->data.binding; - struct v3dv_descriptor_set_layout *set_layout = layout->set[set].layout; + struct v3dv_descriptor_set_layout *set_layout = state->layout->set[set].layout; struct v3dv_descriptor_set_binding_layout *binding_layout = &set_layout->binding[binding]; @@ -830,8 +786,8 @@ lower_image_deref(nir_builder *b, binding_layout->type == VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER); struct v3dv_descriptor_map *map = - pipeline_get_descriptor_map(pipeline, binding_layout->type, - shader->info.stage, false); + pipeline_get_descriptor_map(state->pipeline, binding_layout->type, + b->shader->info.stage, false); int desc_index = descriptor_map_add(map, @@ -839,7 +795,9 @@ lower_image_deref(nir_builder *b, deref->var->data.binding, array_index, binding_layout->array_size, - 32 /* return_size: doesn't apply for textures */); + 0, + 32 /* return_size: doesn't apply for textures */, + 0); /* Note: we don't need to do anything here in relation to the precision and * the output size because for images we can infer that info from the image @@ -853,53 +811,35 @@ lower_image_deref(nir_builder *b, } static bool -lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr, - nir_shader *shader, - struct v3dv_pipeline *pipeline, - const struct v3dv_pipeline_layout *layout) +lower_intrinsic(nir_builder *b, + nir_intrinsic_instr *instr, + struct lower_pipeline_layout_state *state) { switch (instr->intrinsic) { - case nir_intrinsic_load_layer_id: - /* FIXME: if layered rendering gets supported, this would need a real - * lowering - */ - nir_ssa_def_rewrite_uses(&instr->dest.ssa, - nir_imm_int(b, 0)); - nir_instr_remove(&instr->instr); - return true; - case nir_intrinsic_load_push_constant: - lower_load_push_constant(b, instr, pipeline); + lower_load_push_constant(b, instr, state); return true; case nir_intrinsic_vulkan_resource_index: - lower_vulkan_resource_index(b, instr, shader, pipeline, layout); + lower_vulkan_resource_index(b, instr, state); return true; case nir_intrinsic_load_vulkan_descriptor: { /* Loading the descriptor happens as part of load/store instructions, * so for us this is a no-op. 
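* The source already carries the (index, 0) vec2 that the lowered * vulkan_resource_index produced, so its uses can consume it directly.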
*/ - nir_ssa_def_rewrite_uses(&instr->dest.ssa, instr->src[0].ssa); + nir_def_rewrite_uses(&instr->def, instr->src[0].ssa); nir_instr_remove(&instr->instr); return true; } case nir_intrinsic_image_deref_load: case nir_intrinsic_image_deref_store: - case nir_intrinsic_image_deref_atomic_add: - case nir_intrinsic_image_deref_atomic_imin: - case nir_intrinsic_image_deref_atomic_umin: - case nir_intrinsic_image_deref_atomic_imax: - case nir_intrinsic_image_deref_atomic_umax: - case nir_intrinsic_image_deref_atomic_and: - case nir_intrinsic_image_deref_atomic_or: - case nir_intrinsic_image_deref_atomic_xor: - case nir_intrinsic_image_deref_atomic_exchange: - case nir_intrinsic_image_deref_atomic_comp_swap: + case nir_intrinsic_image_deref_atomic: + case nir_intrinsic_image_deref_atomic_swap: case nir_intrinsic_image_deref_size: case nir_intrinsic_image_deref_samples: - lower_image_deref(b, instr, shader, pipeline, layout); + lower_image_deref(b, instr, state); return true; default: @@ -908,32 +848,23 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr, } static bool -lower_impl(nir_function_impl *impl, - nir_shader *shader, - struct v3dv_pipeline *pipeline, - const struct v3dv_pipeline_layout *layout) +lower_pipeline_layout_cb(nir_builder *b, + nir_instr *instr, + void *_state) { - nir_builder b; - nir_builder_init(&b, impl); bool progress = false; + struct lower_pipeline_layout_state *state = _state; - nir_foreach_block(block, impl) { - nir_foreach_instr_safe(instr, block) { - b.cursor = nir_before_instr(instr); - switch (instr->type) { - case nir_instr_type_tex: - progress |= - lower_sampler(&b, nir_instr_as_tex(instr), shader, pipeline, layout); - break; - case nir_instr_type_intrinsic: - progress |= - lower_intrinsic(&b, nir_instr_as_intrinsic(instr), shader, - pipeline, layout); - break; - default: - break; - } - } + b->cursor = nir_before_instr(instr); + switch (instr->type) { + case nir_instr_type_tex: + progress |= lower_sampler(b, nir_instr_as_tex(instr), state); + break; + case nir_instr_type_intrinsic: + progress |= lower_intrinsic(b, nir_instr_as_intrinsic(instr), state); + break; + default: + break; } return progress; @@ -942,25 +873,62 @@ lower_impl(nir_function_impl *impl, static bool lower_pipeline_layout_info(nir_shader *shader, struct v3dv_pipeline *pipeline, - const struct v3dv_pipeline_layout *layout) + const struct v3dv_pipeline_layout *layout, + bool *needs_default_sampler_state) { bool progress = false; - nir_foreach_function(function, shader) { - if (function->impl) - progress |= lower_impl(function->impl, shader, pipeline, layout); - } + struct lower_pipeline_layout_state state = { + .pipeline = pipeline, + .layout = layout, + .needs_default_sampler_state = false, + }; + + progress = nir_shader_instructions_pass(shader, lower_pipeline_layout_cb, + nir_metadata_block_index | + nir_metadata_dominance, + &state); + + *needs_default_sampler_state = state.needs_default_sampler_state; return progress; } +/* This flips gl_PointCoord.y to match Vulkan requirements */ +static bool +lower_point_coord_cb(nir_builder *b, nir_intrinsic_instr *intr, void *_state) +{ + if (intr->intrinsic != nir_intrinsic_load_input) + return false; + + if (nir_intrinsic_io_semantics(intr).location != VARYING_SLOT_PNTC) + return false; + + b->cursor = nir_after_instr(&intr->instr); + nir_def *result = &intr->def; + result = + nir_vector_insert_imm(b, result, + nir_fsub_imm(b, 1.0, nir_channel(b, result, 1)), 1); + nir_def_rewrite_uses_after(&intr->def, + result, result->parent_instr); + 
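/* Rewriting only the uses after the new vector_insert keeps that + * instruction reading the original, unflipped value. */ +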
return true; +} + +static bool +v3d_nir_lower_point_coord(nir_shader *s) +{ + assert(s->info.stage == MESA_SHADER_FRAGMENT); + return nir_shader_intrinsics_pass(s, lower_point_coord_cb, + nir_metadata_block_index | + nir_metadata_dominance, NULL); +} static void lower_fs_io(nir_shader *nir) { /* Our backend doesn't handle array fragment shader outputs */ NIR_PASS_V(nir, nir_lower_io_arrays_to_elements_no_indirects, false); - NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_shader_out, NULL); + NIR_PASS(_, nir, nir_remove_dead_variables, nir_var_shader_out, NULL); nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, MESA_SHADER_FRAGMENT); @@ -968,8 +936,8 @@ lower_fs_io(nir_shader *nir) nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, MESA_SHADER_FRAGMENT); - NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, - type_size_vec4, 0); + NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, + type_size_vec4, 0); } static void @@ -1014,8 +982,7 @@ shader_debug_output(const char *message, void *data) static void pipeline_populate_v3d_key(struct v3d_key *key, const struct v3dv_pipeline_stage *p_stage, - uint32_t ucp_enables, - bool robust_buffer_access) + uint32_t ucp_enables) { assert(p_stage->pipeline->shared_data && p_stage->pipeline->shared_data->maps[p_stage->stage]); @@ -1051,7 +1018,8 @@ pipeline_populate_v3d_key(struct v3d_key *key, switch (p_stage->stage) { case BROADCOM_SHADER_VERTEX: case BROADCOM_SHADER_VERTEX_BIN: - key->is_last_geometry_stage = p_stage->pipeline->gs == NULL; + key->is_last_geometry_stage = + p_stage->pipeline->stages[BROADCOM_SHADER_GEOMETRY] == NULL; break; case BROADCOM_SHADER_GEOMETRY: case BROADCOM_SHADER_GEOMETRY_BIN: @@ -1078,27 +1046,42 @@ pipeline_populate_v3d_key(struct v3d_key *key, */ key->ucp_enables = ucp_enables; - key->robust_buffer_access = robust_buffer_access; + const VkPipelineRobustnessBufferBehaviorEXT robust_buffer_enabled = + VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_EXT; - key->environment = V3D_ENVIRONMENT_VULKAN; + const VkPipelineRobustnessImageBehaviorEXT robust_image_enabled = + VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_ROBUST_IMAGE_ACCESS_EXT; + + key->robust_uniform_access = + p_stage->robustness.uniform_buffers == robust_buffer_enabled; + key->robust_storage_access = + p_stage->robustness.storage_buffers == robust_buffer_enabled; + key->robust_image_access = + p_stage->robustness.images == robust_image_enabled; } /* FIXME: anv maps to hw primitive type. Perhaps eventually we would do the * same. 
For now we use prim_mode, which is the one already used on v3d. */ -static const enum pipe_prim_type vk_to_pipe_prim_type[] = { - [VK_PRIMITIVE_TOPOLOGY_POINT_LIST] = PIPE_PRIM_POINTS, - [VK_PRIMITIVE_TOPOLOGY_LINE_LIST] = PIPE_PRIM_LINES, - [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP] = PIPE_PRIM_LINE_STRIP, - [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST] = PIPE_PRIM_TRIANGLES, - [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP] = PIPE_PRIM_TRIANGLE_STRIP, - [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN] = PIPE_PRIM_TRIANGLE_FAN, - [VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY] = PIPE_PRIM_LINES_ADJACENCY, - [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY] = PIPE_PRIM_LINE_STRIP_ADJACENCY, - [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY] = PIPE_PRIM_TRIANGLES_ADJACENCY, - [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY, +static const enum mesa_prim vk_to_mesa_prim[] = { + [VK_PRIMITIVE_TOPOLOGY_POINT_LIST] = MESA_PRIM_POINTS, + [VK_PRIMITIVE_TOPOLOGY_LINE_LIST] = MESA_PRIM_LINES, + [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP] = MESA_PRIM_LINE_STRIP, + [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST] = MESA_PRIM_TRIANGLES, + [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP] = MESA_PRIM_TRIANGLE_STRIP, + [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN] = MESA_PRIM_TRIANGLE_FAN, + [VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY] = MESA_PRIM_LINES_ADJACENCY, + [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY] = MESA_PRIM_LINE_STRIP_ADJACENCY, + [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY] = MESA_PRIM_TRIANGLES_ADJACENCY, + [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = MESA_PRIM_TRIANGLE_STRIP_ADJACENCY, }; +uint32_t +v3dv_pipeline_primitive(VkPrimitiveTopology vk_prim) +{ + return v3d_hw_prim_type(vk_to_mesa_prim[vk_prim]); +} + static const enum pipe_logicop vk_to_pipe_logicop[] = { [VK_LOGIC_OP_CLEAR] = PIPE_LOGICOP_CLEAR, [VK_LOGIC_OP_AND] = PIPE_LOGICOP_AND, @@ -1118,9 +1101,74 @@ static const enum pipe_logicop vk_to_pipe_logicop[] = { [VK_LOGIC_OP_SET] = PIPE_LOGICOP_SET, }; +static bool +enable_line_smooth(uint8_t topology, + const VkPipelineRasterizationStateCreateInfo *rs_info) +{ + if (!rs_info || rs_info->rasterizerDiscardEnable) + return false; + + const VkPipelineRasterizationLineStateCreateInfoKHR *ls_info = + vk_find_struct_const(rs_info->pNext, + PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_KHR); + + if (!ls_info) + return false; + + switch(topology) { + case MESA_PRIM_LINES: + case MESA_PRIM_LINE_LOOP: + case MESA_PRIM_LINE_STRIP: + case MESA_PRIM_LINES_ADJACENCY: + case MESA_PRIM_LINE_STRIP_ADJACENCY: + return ls_info->lineRasterizationMode == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_KHR; + default: + return false; + } +} + +static void +v3d_fs_key_set_color_attachment(struct v3d_fs_key *key, + const struct v3dv_pipeline_stage *p_stage, + uint32_t index, + VkFormat fb_format) +{ + key->cbufs |= 1 << index; + + enum pipe_format fb_pipe_format = vk_format_to_pipe_format(fb_format); + + /* If logic operations are enabled then we might emit color reads and we + * need to know the color buffer format and swizzle for that + */ + if (key->logicop_func != PIPE_LOGICOP_COPY) { + /* Framebuffer formats should be single plane */ + assert(vk_format_get_plane_count(fb_format) == 1); + key->color_fmt[index].format = fb_pipe_format; + memcpy(key->color_fmt[index].swizzle, + v3dv_get_format_swizzle(p_stage->pipeline->device, fb_format, 0), + sizeof(key->color_fmt[index].swizzle)); + } + + const struct util_format_description *desc = + vk_format_description(fb_format); + + if (desc->channel[0].type ==
UTIL_FORMAT_TYPE_FLOAT && + desc->channel[0].size == 32) { + key->f32_color_rb |= 1 << index; + } + + if (p_stage->nir->info.fs.untyped_color_outputs) { + if (util_format_is_pure_uint(fb_pipe_format)) + key->uint_color_rb |= 1 << index; + else if (util_format_is_pure_sint(fb_pipe_format)) + key->int_color_rb |= 1 << index; + } +} + static void pipeline_populate_v3d_fs_key(struct v3d_fs_key *key, const VkGraphicsPipelineCreateInfo *pCreateInfo, + const struct vk_render_pass_state *rendering_info, const struct v3dv_pipeline_stage *p_stage, bool has_geometry_shader, uint32_t ucp_enables) @@ -1129,16 +1177,29 @@ pipeline_populate_v3d_fs_key(struct v3d_fs_key *key, memset(key, 0, sizeof(*key)); - const bool rba = p_stage->pipeline->device->features.robustBufferAccess; - pipeline_populate_v3d_key(&key->base, p_stage, ucp_enables, rba); + struct v3dv_device *device = p_stage->pipeline->device; + assert(device); + + pipeline_populate_v3d_key(&key->base, p_stage, ucp_enables); const VkPipelineInputAssemblyStateCreateInfo *ia_info = pCreateInfo->pInputAssemblyState; - uint8_t topology = vk_to_pipe_prim_type[ia_info->topology]; + uint8_t topology = vk_to_mesa_prim[ia_info->topology]; + + key->is_points = (topology == MESA_PRIM_POINTS); + key->is_lines = (topology >= MESA_PRIM_LINES && + topology <= MESA_PRIM_LINE_STRIP); + + if (key->is_points) { + /* This mask represents state for GL_ARB_point_sprite which is not + * relevant to Vulkan. + */ + key->point_sprite_mask = 0; + + /* Vulkan mandates upper left. */ + key->point_coord_upper_left = true; + } - key->is_points = (topology == PIPE_PRIM_POINTS); - key->is_lines = (topology >= PIPE_PRIM_LINES && - topology <= PIPE_PRIM_LINE_STRIP); key->has_gs = has_geometry_shader; const VkPipelineColorBlendStateCreateInfo *cb_info = @@ -1150,6 +1211,7 @@ pipeline_populate_v3d_fs_key(struct v3d_fs_key *key, PIPE_LOGICOP_COPY; const bool raster_enabled = + pCreateInfo->pRasterizationState && !pCreateInfo->pRasterizationState->rasterizerDiscardEnable; /* Multisample rasterization state must be ignored if rasterization @@ -1162,68 +1224,24 @@ pipeline_populate_v3d_fs_key(struct v3d_fs_key *key, ms_info->rasterizationSamples == VK_SAMPLE_COUNT_4_BIT); key->msaa = ms_info->rasterizationSamples > VK_SAMPLE_COUNT_1_BIT; - if (key->msaa) { - key->sample_coverage = - p_stage->pipeline->sample_mask != (1 << V3D_MAX_SAMPLES) - 1; + if (key->msaa) key->sample_alpha_to_coverage = ms_info->alphaToCoverageEnable; - key->sample_alpha_to_one = ms_info->alphaToOneEnable; - } + + key->sample_alpha_to_one = ms_info->alphaToOneEnable; } + key->line_smoothing = enable_line_smooth(topology, pCreateInfo->pRasterizationState); + /* This is intended for V3D versions before 4.1, otherwise we just use the * tile buffer load/store swap R/B bit. 
*/ key->swap_color_rb = 0; - const struct v3dv_render_pass *pass = - v3dv_render_pass_from_handle(pCreateInfo->renderPass); - const struct v3dv_subpass *subpass = p_stage->pipeline->subpass; - for (uint32_t i = 0; i < subpass->color_count; i++) { - const uint32_t att_idx = subpass->color_attachments[i].attachment; - if (att_idx == VK_ATTACHMENT_UNUSED) + for (uint32_t i = 0; i < rendering_info->color_attachment_count; i++) { + if (rendering_info->color_attachment_formats[i] == VK_FORMAT_UNDEFINED) continue; - - key->cbufs |= 1 << i; - - VkFormat fb_format = pass->attachments[att_idx].desc.format; - enum pipe_format fb_pipe_format = vk_format_to_pipe_format(fb_format); - - /* If logic operations are enabled then we might emit color reads and we - * need to know the color buffer format and swizzle for that - */ - if (key->logicop_func != PIPE_LOGICOP_COPY) { - key->color_fmt[i].format = fb_pipe_format; - key->color_fmt[i].swizzle = - v3dv_get_format_swizzle(p_stage->pipeline->device, fb_format); - } - - const struct util_format_description *desc = - vk_format_description(fb_format); - - if (desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT && - desc->channel[0].size == 32) { - key->f32_color_rb |= 1 << i; - } - - if (p_stage->nir->info.fs.untyped_color_outputs) { - if (util_format_is_pure_uint(fb_pipe_format)) - key->uint_color_rb |= 1 << i; - else if (util_format_is_pure_sint(fb_pipe_format)) - key->int_color_rb |= 1 << i; - } - - if (key->is_points) { - /* FIXME: The mask would need to be computed based on the shader - * inputs. On gallium it is done at st_atom_rasterizer - * (sprite_coord_enable). anv seems (need to confirm) to do that on - * genX_pipeline (PointSpriteTextureCoordinateEnable). Would be also - * better to have tests to guide filling the mask. - */ - key->point_sprite_mask = 0; - - /* Vulkan mandates upper left. 
*/ - key->point_coord_upper_left = true; - } + v3d_fs_key_set_color_attachment(key, p_stage, i, + rendering_info->color_attachment_formats[i]); } } @@ -1247,10 +1265,12 @@ pipeline_populate_v3d_gs_key(struct v3d_gs_key *key, assert(p_stage->stage == BROADCOM_SHADER_GEOMETRY || p_stage->stage == BROADCOM_SHADER_GEOMETRY_BIN); + struct v3dv_device *device = p_stage->pipeline->device; + assert(device); + memset(key, 0, sizeof(*key)); - const bool rba = p_stage->pipeline->device->features.robustBufferAccess; - pipeline_populate_v3d_key(&key->base, p_stage, 0, rba); + pipeline_populate_v3d_key(&key->base, p_stage, 0); struct v3dv_pipeline *pipeline = p_stage->pipeline; @@ -1289,10 +1309,11 @@ pipeline_populate_v3d_vs_key(struct v3d_vs_key *key, assert(p_stage->stage == BROADCOM_SHADER_VERTEX || p_stage->stage == BROADCOM_SHADER_VERTEX_BIN); - memset(key, 0, sizeof(*key)); + struct v3dv_device *device = p_stage->pipeline->device; + assert(device); - const bool rba = p_stage->pipeline->device->features.robustBufferAccess; - pipeline_populate_v3d_key(&key->base, p_stage, 0, rba); + memset(key, 0, sizeof(*key)); + pipeline_populate_v3d_key(&key->base, p_stage, 0); struct v3dv_pipeline *pipeline = p_stage->pipeline; @@ -1301,11 +1322,11 @@ pipeline_populate_v3d_vs_key(struct v3d_vs_key *key, */ const VkPipelineInputAssemblyStateCreateInfo *ia_info = pCreateInfo->pInputAssemblyState; - uint8_t topology = vk_to_pipe_prim_type[ia_info->topology]; + uint8_t topology = vk_to_mesa_prim[ia_info->topology]; /* FIXME: PRIM_POINTS is not enough, in gallium the full check is - * PIPE_PRIM_POINTS && v3d->rasterizer->base.point_size_per_vertex */ - key->per_vertex_point_size = (topology == PIPE_PRIM_POINTS); + * MESA_PRIM_POINTS && v3d->rasterizer->base.point_size_per_vertex */ + key->per_vertex_point_size = (topology == MESA_PRIM_POINTS); key->is_coord = broadcom_shader_stage_is_binning(p_stage->stage); @@ -1318,7 +1339,7 @@ pipeline_populate_v3d_vs_key(struct v3d_vs_key *key, key->num_used_outputs = 0; } else { /* Linking against GS binning program */ - assert(pipeline->gs); + assert(pipeline->stages[BROADCOM_SHADER_GEOMETRY]); struct v3dv_shader_variant *gs_bin_variant = pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN]; @@ -1333,7 +1354,7 @@ pipeline_populate_v3d_vs_key(struct v3d_vs_key *key, sizeof(key->used_outputs)); } } else { /* Render VS */ - if (pipeline->gs) { + if (pipeline->stages[BROADCOM_SHADER_GEOMETRY]) { /* Linking against GS render program */ struct v3dv_shader_variant *gs_variant = pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY]; @@ -1370,8 +1391,10 @@ pipeline_populate_v3d_vs_key(struct v3d_vs_key *key, const VkVertexInputAttributeDescription *desc = &vi_info->pVertexAttributeDescriptions[i]; assert(desc->location < MAX_VERTEX_ATTRIBS); - if (desc->format == VK_FORMAT_B8G8R8A8_UNORM) + if (desc->format == VK_FORMAT_B8G8R8A8_UNORM || + desc->format == VK_FORMAT_A2R10G10B10_UNORM_PACK32) { key->va_swap_rb_mask |= 1 << (VERT_ATTRIB_GENERIC0 + desc->location); + } } } @@ -1407,14 +1430,33 @@ pipeline_stage_create_binning(const struct v3dv_pipeline_stage *src, p_stage->stage = bin_stage; p_stage->entrypoint = src->entrypoint; p_stage->module = src->module; - p_stage->nir = src->nir ? nir_shader_clone(NULL, src->nir) : NULL; + /* For binning shaders we will clone the NIR code from the corresponding + * render shader later, when we call pipeline_compile_xxx_shader. 
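The va_swap_rb_mask bits set above (and again in pipeline_populate_graphics_key further down) tell the compiled vertex shader to exchange the red and blue channels of attributes stored in B,G,R,A order. A minimal sketch of that swap for the 8-bit case; the helper is hypothetical, not driver code:

#include <stdint.h>

/* Re-pack a packed B8G8R8A8 value (byte 0 = blue) so the shader sees R,G,B,A. */
static uint32_t
swap_rb_bgra8(uint32_t bgra)
{
   uint32_t b = (bgra >> 0) & 0xff;
   uint32_t g = (bgra >> 8) & 0xff;
   uint32_t r = (bgra >> 16) & 0xff;
   uint32_t a = (bgra >> 24) & 0xff;

   return (r << 0) | (g << 8) | (b << 16) | (a << 24);
}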
This way + we only have to run the relevant NIR lowerings once for render shaders + */ + p_stage->nir = NULL; + p_stage->program_id = src->program_id; p_stage->spec_info = src->spec_info; - p_stage->feedback = (VkPipelineCreationFeedbackEXT) { 0 }; + p_stage->feedback = (VkPipelineCreationFeedback) { 0 }; + p_stage->robustness = src->robustness; memcpy(p_stage->shader_sha1, src->shader_sha1, 20); return p_stage; } +/* + * Based on some creation flags we assume that the QPU code will be needed later + * to gather further info. In that case we just keep the qpu_insts around, + * instead of mapping/unmapping the bo later. + */ +static bool +pipeline_keep_qpu(struct v3dv_pipeline *pipeline) +{ + return pipeline->flags & + (VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR | + VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR); +} + /** * Returns false if it was not able to allocate or map the assembly bo memory. */ @@ -1454,9 +1496,10 @@ upload_assembly(struct v3dv_pipeline *pipeline) memcpy(bo->map + offset, variant->qpu_insts, variant->qpu_insts_size); offset += variant->qpu_insts_size; - /* We dont need qpu_insts anymore. */ - free(variant->qpu_insts); - variant->qpu_insts = NULL; + if (!pipeline_keep_qpu(pipeline)) { + free(variant->qpu_insts); + variant->qpu_insts = NULL; + } } } assert(total_size == offset); @@ -1474,20 +1517,27 @@ pipeline_hash_graphics(const struct v3dv_pipeline *pipeline, struct mesa_sha1 ctx; _mesa_sha1_init(&ctx); - /* We need to include all shader stages in the sha1 key as linking may modify - * the shader code in any stage. An alternative would be to use the + if (pipeline->layout) { + _mesa_sha1_update(&ctx, &pipeline->layout->sha1, + sizeof(pipeline->layout->sha1)); + } + + /* We need to include all shader stages in the sha1 key as linking may + * modify the shader code in any stage. An alternative would be to use the * serialized NIR, but that seems like an overkill. */ - _mesa_sha1_update(&ctx, pipeline->vs->shader_sha1, - sizeof(pipeline->vs->shader_sha1)); + for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) { + if (broadcom_shader_stage_is_binning(stage)) + continue; - if (pipeline->gs) { - _mesa_sha1_update(&ctx, pipeline->gs->shader_sha1, - sizeof(pipeline->gs->shader_sha1)); - } + struct v3dv_pipeline_stage *p_stage = pipeline->stages[stage]; + if (p_stage == NULL) + continue; - _mesa_sha1_update(&ctx, pipeline->fs->shader_sha1, - sizeof(pipeline->fs->shader_sha1)); + assert(stage != BROADCOM_SHADER_COMPUTE); + + _mesa_sha1_update(&ctx, p_stage->shader_sha1, sizeof(p_stage->shader_sha1)); + } _mesa_sha1_update(&ctx, key, sizeof(struct v3dv_pipeline_key)); @@ -1502,8 +1552,15 @@ pipeline_hash_compute(const struct v3dv_pipeline *pipeline, struct mesa_sha1 ctx; _mesa_sha1_init(&ctx); - _mesa_sha1_update(&ctx, pipeline->cs->shader_sha1, - sizeof(pipeline->cs->shader_sha1)); + if (pipeline->layout) { + _mesa_sha1_update(&ctx, &pipeline->layout->sha1, + sizeof(pipeline->layout->sha1)); + } + + struct v3dv_pipeline_stage *p_stage = + pipeline->stages[BROADCOM_SHADER_COMPUTE]; + + _mesa_sha1_update(&ctx, p_stage->shader_sha1, sizeof(p_stage->shader_sha1)); _mesa_sha1_update(&ctx, key, sizeof(struct v3dv_pipeline_key)); @@ -1553,7 +1610,7 @@ pipeline_check_spill_size(struct v3dv_pipeline *pipeline) * so it is assumed that the caller will provide a pointer that the * shader_variant will own.
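upload_assembly() above packs the QPU code of every variant into one shared BO, remembering each variant's byte offset, and now frees the instructions only when pipeline_keep_qpu() says they will not be queried later. A reduced sketch of the packing pattern with stand-in types:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct blob { const uint8_t *data; size_t size; size_t offset; };

/* Lay consecutive blobs into one allocation, recording their offsets. */
static uint8_t *
pack_blobs(struct blob *blobs, int count, size_t *total_out)
{
   size_t total = 0;
   for (int i = 0; i < count; i++) {
      blobs[i].offset = total;
      total += blobs[i].size;
   }

   uint8_t *buf = malloc(total);
   if (!buf)
      return NULL;

   for (int i = 0; i < count; i++)
      memcpy(buf + blobs[i].offset, blobs[i].data, blobs[i].size);

   *total_out = total;
   return buf;
}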
* - * Creation doesn't include allocate a BD to store the content of qpu_insts, + * Creation doesn't include allocating a BO to store the content of qpu_insts, * as we will try to share the same bo for several shader variants. Also note * that qpu_insts being NULL is valid, for example if we are creating the * shader_variants from the cache, so we can just upload the assembly of all @@ -1615,13 +1672,11 @@ pipeline_compile_shader_variant(struct v3dv_pipeline_stage *p_stage, int64_t stage_start = os_time_get_nano(); struct v3dv_pipeline *pipeline = p_stage->pipeline; - struct v3dv_physical_device *physical_device = - &pipeline->device->instance->physicalDevice; + struct v3dv_physical_device *physical_device = pipeline->device->pdevice; const struct v3d_compiler *compiler = physical_device->compiler; + gl_shader_stage gl_stage = broadcom_shader_stage_to_gl(p_stage->stage); - if (V3D_DEBUG & (V3D_DEBUG_NIR | - v3d_debug_flag_for_shader_stage - (broadcom_shader_stage_to_gl(p_stage->stage)))) { + if (V3D_DBG(NIR) || v3d_debug_flag_for_shader_stage(gl_stage)) { fprintf(stderr, "Just before v3d_compile: %s prog %d NIR:\n", broadcom_shader_stage_name(p_stage->stage), p_stage->program_id); @@ -1632,8 +1687,7 @@ pipeline_compile_shader_variant(struct v3dv_pipeline_stage *p_stage, uint64_t *qpu_insts; uint32_t qpu_insts_size; struct v3d_prog_data *prog_data; - uint32_t prog_data_size = - v3d_prog_data_size(broadcom_shader_stage_to_gl(p_stage->stage)); + uint32_t prog_data_size = v3d_prog_data_size(gl_stage); qpu_insts = v3d_compile(compiler, key, &prog_data, @@ -1646,7 +1700,7 @@ pipeline_compile_shader_variant(struct v3dv_pipeline_stage *p_stage, if (!qpu_insts) { fprintf(stderr, "Failed to compile %s prog %d NIR to VIR\n", - gl_shader_stage_name(p_stage->stage), + broadcom_shader_stage_name(p_stage->stage), p_stage->program_id); *out_vk_result = VK_ERROR_UNKNOWN; } else { @@ -1667,59 +1721,6 @@ pipeline_compile_shader_variant(struct v3dv_pipeline_stage *p_stage, return variant; } -/* FIXME: C&P from st, common place? */ -static void -st_nir_opts(nir_shader *nir) -{ - bool progress; - - do { - progress = false; - - NIR_PASS_V(nir, nir_lower_vars_to_ssa); - - /* Linking deals with unused inputs/outputs, but here we can remove - * things local to the shader in the hopes that we can cleanup other - * things. This pass will also remove variables with only stores, so we - * might be able to make progress after it.
- */ - NIR_PASS(progress, nir, nir_remove_dead_variables, - (nir_variable_mode)(nir_var_function_temp | - nir_var_shader_temp | - nir_var_mem_shared), - NULL); - - NIR_PASS(progress, nir, nir_opt_copy_prop_vars); - NIR_PASS(progress, nir, nir_opt_dead_write_vars); - - if (nir->options->lower_to_scalar) { - NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL); - NIR_PASS_V(nir, nir_lower_phis_to_scalar, false); - } - - NIR_PASS_V(nir, nir_lower_alu); - NIR_PASS_V(nir, nir_lower_pack); - NIR_PASS(progress, nir, nir_copy_prop); - NIR_PASS(progress, nir, nir_opt_remove_phis); - NIR_PASS(progress, nir, nir_opt_dce); - if (nir_opt_trivial_continues(nir)) { - progress = true; - NIR_PASS(progress, nir, nir_copy_prop); - NIR_PASS(progress, nir, nir_opt_dce); - } - NIR_PASS(progress, nir, nir_opt_if, false); - NIR_PASS(progress, nir, nir_opt_dead_cf); - NIR_PASS(progress, nir, nir_opt_cse); - NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true); - - NIR_PASS(progress, nir, nir_opt_algebraic); - NIR_PASS(progress, nir, nir_opt_constant_folding); - - NIR_PASS(progress, nir, nir_opt_undef); - NIR_PASS(progress, nir, nir_opt_conditional_discard); - } while (progress); -} - static void link_shaders(nir_shader *producer, nir_shader *consumer) { @@ -1727,34 +1728,34 @@ link_shaders(nir_shader *producer, nir_shader *consumer) assert(consumer); if (producer->options->lower_to_scalar) { - NIR_PASS_V(producer, nir_lower_io_to_scalar_early, nir_var_shader_out); - NIR_PASS_V(consumer, nir_lower_io_to_scalar_early, nir_var_shader_in); + NIR_PASS(_, producer, nir_lower_io_to_scalar_early, nir_var_shader_out); + NIR_PASS(_, consumer, nir_lower_io_to_scalar_early, nir_var_shader_in); } nir_lower_io_arrays_to_elements(producer, consumer); - st_nir_opts(producer); - st_nir_opts(consumer); + v3d_optimize_nir(NULL, producer); + v3d_optimize_nir(NULL, consumer); if (nir_link_opt_varyings(producer, consumer)) - st_nir_opts(consumer); + v3d_optimize_nir(NULL, consumer); - NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_out, NULL); - NIR_PASS_V(consumer, nir_remove_dead_variables, nir_var_shader_in, NULL); + NIR_PASS(_, producer, nir_remove_dead_variables, nir_var_shader_out, NULL); + NIR_PASS(_, consumer, nir_remove_dead_variables, nir_var_shader_in, NULL); if (nir_remove_unused_varyings(producer, consumer)) { - NIR_PASS_V(producer, nir_lower_global_vars_to_local); - NIR_PASS_V(consumer, nir_lower_global_vars_to_local); + NIR_PASS(_, producer, nir_lower_global_vars_to_local); + NIR_PASS(_, consumer, nir_lower_global_vars_to_local); - st_nir_opts(producer); - st_nir_opts(consumer); + v3d_optimize_nir(NULL, producer); + v3d_optimize_nir(NULL, consumer); /* Optimizations can cause varyings to become unused. * nir_compact_varyings() depends on all dead varyings being removed so * we need to call nir_remove_dead_variables() again here. 
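Both the removed st_nir_opts() and the v3d_optimize_nir() that replaces it in link_shaders() share the same shape: re-run the whole pass list until one full iteration reports no progress. A sketch of that fixed-point loop, with a callback standing in for NIR_PASS():

#include <stdbool.h>

typedef bool (*opt_pass)(void *shader); /* returns true if the IR changed */

static void
optimize_until_fixed_point(void *shader, const opt_pass *passes, int count)
{
   bool progress;
   do {
      progress = false;
      for (int i = 0; i < count; i++)
         progress |= passes[i](shader); /* any change forces another round */
   } while (progress);
}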
*/ - NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_out, NULL); - NIR_PASS_V(consumer, nir_remove_dead_variables, nir_var_shader_in, NULL); + NIR_PASS(_, producer, nir_remove_dead_variables, nir_var_shader_out, NULL); + NIR_PASS(_, consumer, nir_remove_dead_variables, nir_var_shader_in, NULL); } } @@ -1768,6 +1769,9 @@ pipeline_lower_nir(struct v3dv_pipeline *pipeline, assert(pipeline->shared_data && pipeline->shared_data->maps[p_stage->stage]); + NIR_PASS_V(p_stage->nir, nir_vk_lower_ycbcr_tex, + lookup_ycbcr_conversion, layout); + nir_shader_gather_info(p_stage->nir, nir_shader_get_entrypoint(p_stage->nir)); /* We add this because we need a valid sampler for nir_lower_tex to do @@ -1777,18 +1781,27 @@ pipeline_lower_nir(struct v3dv_pipeline *pipeline, * We add two of those, one for the case we need a 16bit return_size, and * another for the case we need a 32bit return size. */ - UNUSED unsigned index = - descriptor_map_add(&pipeline->shared_data->maps[p_stage->stage]->sampler_map, - -1, -1, -1, 0, 16); + struct v3dv_descriptor_maps *maps = + pipeline->shared_data->maps[p_stage->stage]; + + UNUSED unsigned index; + index = descriptor_map_add(&maps->sampler_map, -1, -1, -1, 0, 0, 16, 0); assert(index == V3DV_NO_SAMPLER_16BIT_IDX); - index = - descriptor_map_add(&pipeline->shared_data->maps[p_stage->stage]->sampler_map, - -2, -2, -2, 0, 32); + index = descriptor_map_add(&maps->sampler_map, -2, -2, -2, 0, 0, 32, 0); assert(index == V3DV_NO_SAMPLER_32BIT_IDX); /* Apply the actual pipeline layout to UBOs, SSBOs, and textures */ - NIR_PASS_V(p_stage->nir, lower_pipeline_layout_info, pipeline, layout); + bool needs_default_sampler_state = false; + NIR_PASS(_, p_stage->nir, lower_pipeline_layout_info, pipeline, layout, + &needs_default_sampler_state); + + /* If in the end we didn't need to use the default sampler states and the + * shader doesn't need any other samplers, get rid of them so we can + * recognize that this program doesn't use any samplers at all. 
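pipeline_lower_nir() above pre-reserves two default sampler entries (16-bit and 32-bit return size) so nir_lower_tex always has a valid sampler, then discards them again if lowering never used them. A sketch of that reserve-then-drop bookkeeping; the struct is a reduced stand-in for the driver's descriptor map:

#include <stdbool.h>

struct sampler_map { int num_desc; }; /* slots 0/1 hold the two defaults */

static void
drop_unused_default_samplers(struct sampler_map *map, bool needs_defaults)
{
   /* Exactly the two reserved defaults and nothing else: the program uses
    * no samplers at all, so make the map reflect that. */
   if (!needs_defaults && map->num_desc == 2)
      map->num_desc = 0;
}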
+ */ + if (!needs_default_sampler_state && maps->sampler_map.num_desc == 2) + maps->sampler_map.num_desc = 0; p_stage->feedback.duration += os_time_get_nano() - stage_start; } @@ -1830,7 +1843,7 @@ pipeline_stage_get_nir(struct v3dv_pipeline_stage *p_stage, if (nir) { assert(nir->info.stage == broadcom_shader_stage_to_gl(p_stage->stage)); - /* A NIR cach hit doesn't avoid the large majority of pipeline stage + /* A NIR cache hit doesn't avoid the large majority of pipeline stage * creation so the cache hit is not recorded in the pipeline feedback * flags */ @@ -1866,53 +1879,34 @@ pipeline_stage_get_nir(struct v3dv_pipeline_stage *p_stage, return NULL; } -static void -pipeline_hash_shader(const struct vk_shader_module *module, - const char *entrypoint, - gl_shader_stage stage, - const VkSpecializationInfo *spec_info, - unsigned char *sha1_out) -{ - struct mesa_sha1 ctx; - _mesa_sha1_init(&ctx); - - _mesa_sha1_update(&ctx, module->sha1, sizeof(module->sha1)); - _mesa_sha1_update(&ctx, entrypoint, strlen(entrypoint)); - _mesa_sha1_update(&ctx, &stage, sizeof(stage)); - if (spec_info) { - _mesa_sha1_update(&ctx, spec_info->pMapEntries, - spec_info->mapEntryCount * - sizeof(*spec_info->pMapEntries)); - _mesa_sha1_update(&ctx, spec_info->pData, - spec_info->dataSize); - } - - _mesa_sha1_final(&ctx, sha1_out); -} - static VkResult pipeline_compile_vertex_shader(struct v3dv_pipeline *pipeline, const VkAllocationCallbacks *pAllocator, const VkGraphicsPipelineCreateInfo *pCreateInfo) { - assert(pipeline->vs_bin != NULL); - if (pipeline->vs_bin->nir == NULL) { - assert(pipeline->vs->nir); - pipeline->vs_bin->nir = nir_shader_clone(NULL, pipeline->vs->nir); + struct v3dv_pipeline_stage *p_stage_vs = + pipeline->stages[BROADCOM_SHADER_VERTEX]; + struct v3dv_pipeline_stage *p_stage_vs_bin = + pipeline->stages[BROADCOM_SHADER_VERTEX_BIN]; + + assert(p_stage_vs_bin != NULL); + if (p_stage_vs_bin->nir == NULL) { + assert(p_stage_vs->nir); + p_stage_vs_bin->nir = nir_shader_clone(NULL, p_stage_vs->nir); } VkResult vk_result; struct v3d_vs_key key; - pipeline_populate_v3d_vs_key(&key, pCreateInfo, pipeline->vs); + pipeline_populate_v3d_vs_key(&key, pCreateInfo, p_stage_vs); pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX] = - pipeline_compile_shader_variant(pipeline->vs, &key.base, sizeof(key), + pipeline_compile_shader_variant(p_stage_vs, &key.base, sizeof(key), pAllocator, &vk_result); if (vk_result != VK_SUCCESS) return vk_result; - pipeline_populate_v3d_vs_key(&key, pCreateInfo, pipeline->vs_bin); + pipeline_populate_v3d_vs_key(&key, pCreateInfo, p_stage_vs_bin); pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN] = - pipeline_compile_shader_variant(pipeline->vs_bin, &key.base, sizeof(key), + pipeline_compile_shader_variant(p_stage_vs_bin, &key.base, sizeof(key), pAllocator, &vk_result); return vk_result; @@ -1923,26 +1917,30 @@ pipeline_compile_geometry_shader(struct v3dv_pipeline *pipeline, const VkAllocationCallbacks *pAllocator, const VkGraphicsPipelineCreateInfo *pCreateInfo) { - assert(pipeline->gs); + struct v3dv_pipeline_stage *p_stage_gs = + pipeline->stages[BROADCOM_SHADER_GEOMETRY]; + struct v3dv_pipeline_stage *p_stage_gs_bin = + pipeline->stages[BROADCOM_SHADER_GEOMETRY_BIN]; - assert(pipeline->gs_bin != NULL); - if (pipeline->gs_bin->nir == NULL) { - assert(pipeline->gs->nir); - pipeline->gs_bin->nir = nir_shader_clone(NULL, pipeline->gs->nir); + assert(p_stage_gs); + assert(p_stage_gs_bin != NULL); + if (p_stage_gs_bin->nir == NULL) { + assert(p_stage_gs->nir); + 
p_stage_gs_bin->nir = nir_shader_clone(NULL, p_stage_gs->nir); } VkResult vk_result; struct v3d_gs_key key; - pipeline_populate_v3d_gs_key(&key, pCreateInfo, pipeline->gs); + pipeline_populate_v3d_gs_key(&key, pCreateInfo, p_stage_gs); pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY] = - pipeline_compile_shader_variant(pipeline->gs, &key.base, sizeof(key), + pipeline_compile_shader_variant(p_stage_gs, &key.base, sizeof(key), pAllocator, &vk_result); if (vk_result != VK_SUCCESS) return vk_result; - pipeline_populate_v3d_gs_key(&key, pCreateInfo, pipeline->gs_bin); + pipeline_populate_v3d_gs_key(&key, pCreateInfo, p_stage_gs_bin); pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN] = - pipeline_compile_shader_variant(pipeline->gs_bin, &key.base, sizeof(key), + pipeline_compile_shader_variant(p_stage_gs_bin, &key.base, sizeof(key), pAllocator, &vk_result); return vk_result; @@ -1953,19 +1951,26 @@ pipeline_compile_fragment_shader(struct v3dv_pipeline *pipeline, const VkAllocationCallbacks *pAllocator, const VkGraphicsPipelineCreateInfo *pCreateInfo) { - struct v3dv_pipeline_stage *p_stage = pipeline->vs; - - p_stage = pipeline->fs; + struct v3dv_pipeline_stage *p_stage_vs = + pipeline->stages[BROADCOM_SHADER_VERTEX]; + struct v3dv_pipeline_stage *p_stage_fs = + pipeline->stages[BROADCOM_SHADER_FRAGMENT]; + struct v3dv_pipeline_stage *p_stage_gs = + pipeline->stages[BROADCOM_SHADER_GEOMETRY]; struct v3d_fs_key key; + pipeline_populate_v3d_fs_key(&key, pCreateInfo, &pipeline->rendering_info, + p_stage_fs, p_stage_gs != NULL, + get_ucp_enable_mask(p_stage_vs)); - pipeline_populate_v3d_fs_key(&key, pCreateInfo, p_stage, - pipeline->gs != NULL, - get_ucp_enable_mask(pipeline->vs)); + if (key.is_points) { + assert(key.point_coord_upper_left); + NIR_PASS(_, p_stage_fs->nir, v3d_nir_lower_point_coord); + } VkResult vk_result; pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT] = - pipeline_compile_shader_variant(p_stage, &key.base, sizeof(key), + pipeline_compile_shader_variant(p_stage_fs, &key.base, sizeof(key), pAllocator, &vk_result); return vk_result; @@ -1976,16 +1981,20 @@ pipeline_populate_graphics_key(struct v3dv_pipeline *pipeline, struct v3dv_pipeline_key *key, const VkGraphicsPipelineCreateInfo *pCreateInfo) { + struct v3dv_device *device = pipeline->device; + assert(device); + memset(key, 0, sizeof(*key)); - key->robust_buffer_access = - pipeline->device->features.robustBufferAccess; + + key->line_smooth = pipeline->line_smooth; const bool raster_enabled = + pCreateInfo->pRasterizationState && !pCreateInfo->pRasterizationState->rasterizerDiscardEnable; const VkPipelineInputAssemblyStateCreateInfo *ia_info = pCreateInfo->pInputAssemblyState; - key->topology = vk_to_pipe_prim_type[ia_info->topology]; + key->topology = vk_to_mesa_prim[ia_info->topology]; const VkPipelineColorBlendStateCreateInfo *cb_info = raster_enabled ? 
pCreateInfo->pColorBlendState : NULL; @@ -2004,34 +2013,32 @@ pipeline_populate_graphics_key(struct v3dv_pipeline *pipeline, ms_info->rasterizationSamples == VK_SAMPLE_COUNT_4_BIT); key->msaa = ms_info->rasterizationSamples > VK_SAMPLE_COUNT_1_BIT; - if (key->msaa) { - key->sample_coverage = - pipeline->sample_mask != (1 << V3D_MAX_SAMPLES) - 1; + if (key->msaa) key->sample_alpha_to_coverage = ms_info->alphaToCoverageEnable; - key->sample_alpha_to_one = ms_info->alphaToOneEnable; - } + + key->sample_alpha_to_one = ms_info->alphaToOneEnable; } - const struct v3dv_render_pass *pass = - v3dv_render_pass_from_handle(pCreateInfo->renderPass); - const struct v3dv_subpass *subpass = pipeline->subpass; - for (uint32_t i = 0; i < subpass->color_count; i++) { - const uint32_t att_idx = subpass->color_attachments[i].attachment; - if (att_idx == VK_ATTACHMENT_UNUSED) + struct vk_render_pass_state *ri = &pipeline->rendering_info; + for (uint32_t i = 0; i < ri->color_attachment_count; i++) { + if (ri->color_attachment_formats[i] == VK_FORMAT_UNDEFINED) continue; key->cbufs |= 1 << i; - VkFormat fb_format = pass->attachments[att_idx].desc.format; + VkFormat fb_format = ri->color_attachment_formats[i]; enum pipe_format fb_pipe_format = vk_format_to_pipe_format(fb_format); /* If logic operations are enabled then we might emit color reads and we * need to know the color buffer format and swizzle for that */ if (key->logicop_func != PIPE_LOGICOP_COPY) { + /* Framebuffer formats should be single plane */ + assert(vk_format_get_plane_count(fb_format) == 1); key->color_fmt[i].format = fb_pipe_format; - key->color_fmt[i].swizzle = v3dv_get_format_swizzle(pipeline->device, - fb_format); + memcpy(key->color_fmt[i].swizzle, + v3dv_get_format_swizzle(pipeline->device, fb_format, 0), + sizeof(key->color_fmt[i].swizzle)); } const struct util_format_description *desc = @@ -2049,12 +2056,13 @@ pipeline_populate_graphics_key(struct v3dv_pipeline *pipeline, const VkVertexInputAttributeDescription *desc = &vi_info->pVertexAttributeDescriptions[i]; assert(desc->location < MAX_VERTEX_ATTRIBS); - if (desc->format == VK_FORMAT_B8G8R8A8_UNORM) + if (desc->format == VK_FORMAT_B8G8R8A8_UNORM || + desc->format == VK_FORMAT_A2R10G10B10_UNORM_PACK32) { key->va_swap_rb_mask |= 1 << (VERT_ATTRIB_GENERIC0 + desc->location); + } } - assert(pipeline->subpass); - key->has_multiview = pipeline->subpass->view_mask != 0; + key->has_multiview = ri->view_mask != 0; } static void @@ -2062,14 +2070,15 @@ pipeline_populate_compute_key(struct v3dv_pipeline *pipeline, struct v3dv_pipeline_key *key, const VkComputePipelineCreateInfo *pCreateInfo) { + struct v3dv_device *device = pipeline->device; + assert(device); + /* We use the same pipeline key for graphics and compute, but we don't need * to add a field to flag compute keys because this key is not used alone * to search in the cache, we also use the SPIR-V or the serialized NIR for * example, which already flags compute shaders. 
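pipeline_populate_graphics_key() above folds the bound color attachments into bitmasks: one bit per used attachment plus a mask of 32-bit float render buffers. A self-contained sketch of that mask building; is_32bit_float_format() is a stand-in (covering a few representative formats) for the util_format_description() channel check in the real code:

#include <stdbool.h>
#include <stdint.h>
#include <vulkan/vulkan.h>

static bool
is_32bit_float_format(VkFormat f)
{
   return f == VK_FORMAT_R32_SFLOAT ||
          f == VK_FORMAT_R32G32_SFLOAT ||
          f == VK_FORMAT_R32G32B32A32_SFLOAT;
}

static void
build_color_masks(const VkFormat *formats, uint32_t count,
                  uint32_t *cbufs, uint32_t *f32_color_rb)
{
   *cbufs = 0;
   *f32_color_rb = 0;
   for (uint32_t i = 0; i < count; i++) {
      if (formats[i] == VK_FORMAT_UNDEFINED)
         continue; /* unused attachment slot */
      *cbufs |= 1u << i;
      if (is_32bit_float_format(formats[i]))
         *f32_color_rb |= 1u << i;
   }
}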
*/ memset(key, 0, sizeof(*key)); - key->robust_buffer_access = - pipeline->device->features.robustBufferAccess; } static struct v3dv_pipeline_shared_data * @@ -2102,9 +2111,10 @@ v3dv_pipeline_shared_data_new_empty(const unsigned char sha1_key[20], continue; } - if (stage == BROADCOM_SHADER_GEOMETRY && !pipeline->gs) { + if (stage == BROADCOM_SHADER_GEOMETRY && + !pipeline->stages[BROADCOM_SHADER_GEOMETRY]) { /* We always inject a custom GS if we have multiview */ - if (!pipeline->subpass->view_mask) + if (!pipeline->rendering_info.view_mask) continue; } @@ -2146,69 +2156,52 @@ fail: static void write_creation_feedback(struct v3dv_pipeline *pipeline, const void *next, - const VkPipelineCreationFeedbackEXT *pipeline_feedback, + const VkPipelineCreationFeedback *pipeline_feedback, uint32_t stage_count, const VkPipelineShaderStageCreateInfo *stages) { - const VkPipelineCreationFeedbackCreateInfoEXT *create_feedback = - vk_find_struct_const(next, PIPELINE_CREATION_FEEDBACK_CREATE_INFO_EXT); + const VkPipelineCreationFeedbackCreateInfo *create_feedback = + vk_find_struct_const(next, PIPELINE_CREATION_FEEDBACK_CREATE_INFO); if (create_feedback) { typed_memcpy(create_feedback->pPipelineCreationFeedback, pipeline_feedback, 1); - assert(stage_count == create_feedback->pipelineStageCreationFeedbackCount); + const uint32_t feedback_stage_count = + create_feedback->pipelineStageCreationFeedbackCount; + assert(feedback_stage_count <= stage_count); - for (uint32_t i = 0; i < stage_count; i++) { + for (uint32_t i = 0; i < feedback_stage_count; i++) { gl_shader_stage s = vk_to_mesa_shader_stage(stages[i].stage); - switch (s) { - case MESA_SHADER_VERTEX: - create_feedback->pPipelineStageCreationFeedbacks[i] = - pipeline->vs->feedback; - - create_feedback->pPipelineStageCreationFeedbacks[i].duration += - pipeline->vs_bin->feedback.duration; - break; + enum broadcom_shader_stage bs = gl_shader_stage_to_broadcom(s); - case MESA_SHADER_GEOMETRY: - create_feedback->pPipelineStageCreationFeedbacks[i] = - pipeline->gs->feedback; + create_feedback->pPipelineStageCreationFeedbacks[i] = + pipeline->stages[bs]->feedback; + if (broadcom_shader_stage_is_render_with_binning(bs)) { + enum broadcom_shader_stage bs_bin = + broadcom_binning_shader_stage_for_render_stage(bs); create_feedback->pPipelineStageCreationFeedbacks[i].duration += - pipeline->gs_bin->feedback.duration; - break; - - case MESA_SHADER_FRAGMENT: - create_feedback->pPipelineStageCreationFeedbacks[i] = - pipeline->fs->feedback; - break; - - case MESA_SHADER_COMPUTE: - create_feedback->pPipelineStageCreationFeedbacks[i] = - pipeline->cs->feedback; - break; - - default: - unreachable("not supported shader stage"); + pipeline->stages[bs_bin]->feedback.duration; } } } } -static uint32_t +static enum mesa_prim multiview_gs_input_primitive_from_pipeline(struct v3dv_pipeline *pipeline) { switch (pipeline->topology) { - case PIPE_PRIM_POINTS: - return GL_POINTS; - case PIPE_PRIM_LINES: - case PIPE_PRIM_LINE_STRIP: - return GL_LINES; - case PIPE_PRIM_TRIANGLES: - case PIPE_PRIM_TRIANGLE_STRIP: - case PIPE_PRIM_TRIANGLE_FAN: - return GL_TRIANGLES; + case MESA_PRIM_POINTS: + return MESA_PRIM_POINTS; + case MESA_PRIM_LINES: + case MESA_PRIM_LINE_STRIP: + return MESA_PRIM_LINES; + case MESA_PRIM_TRIANGLES: + case MESA_PRIM_TRIANGLE_STRIP: + case MESA_PRIM_TRIANGLE_FAN: + return MESA_PRIM_TRIANGLES; default: /* Since we don't allow GS with multiview, we can only see non-adjacency * primitives. 
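write_creation_feedback() above reports one feedback slot per API stage, so the binning variant's compile time is folded into its render stage's entry. A trivial sketch of that merge with simplified stand-in types:

#include <stdint.h>

struct stage_feedback { uint64_t duration; }; /* nanoseconds */

static void
merge_binning_feedback(struct stage_feedback *render,
                       const struct stage_feedback *binning)
{
   render->duration += binning->duration; /* report combined compile time */
}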
@@ -2217,19 +2210,19 @@ multiview_gs_input_primitive_from_pipeline(struct v3dv_pipeline *pipeline) } } -static uint32_t +static enum mesa_prim multiview_gs_output_primitive_from_pipeline(struct v3dv_pipeline *pipeline) { switch (pipeline->topology) { - case PIPE_PRIM_POINTS: - return GL_POINTS; - case PIPE_PRIM_LINES: - case PIPE_PRIM_LINE_STRIP: - return GL_LINE_STRIP; - case PIPE_PRIM_TRIANGLES: - case PIPE_PRIM_TRIANGLE_STRIP: - case PIPE_PRIM_TRIANGLE_FAN: - return GL_TRIANGLE_STRIP; + case MESA_PRIM_POINTS: + return MESA_PRIM_POINTS; + case MESA_PRIM_LINES: + case MESA_PRIM_LINE_STRIP: + return MESA_PRIM_LINE_STRIP; + case MESA_PRIM_TRIANGLES: + case MESA_PRIM_TRIANGLE_STRIP: + case MESA_PRIM_TRIANGLE_FAN: + return MESA_PRIM_TRIANGLE_STRIP; default: /* Since we don't allow GS with multiview, we can only see non-adjacency * primitives. @@ -2244,8 +2237,9 @@ pipeline_add_multiview_gs(struct v3dv_pipeline *pipeline, const VkAllocationCallbacks *pAllocator) { /* Create the passthrough GS from the VS output interface */ - pipeline->vs->nir = pipeline_stage_get_nir(pipeline->vs, pipeline, cache); - nir_shader *vs_nir = pipeline->vs->nir; + struct v3dv_pipeline_stage *p_stage_vs = pipeline->stages[BROADCOM_SHADER_VERTEX]; + p_stage_vs->nir = pipeline_stage_get_nir(p_stage_vs, pipeline, cache); + nir_shader *vs_nir = p_stage_vs->nir; const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options(); nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_GEOMETRY, options, @@ -2255,7 +2249,7 @@ pipeline_add_multiview_gs(struct v3dv_pipeline *pipeline, nir->info.outputs_written = vs_nir->info.outputs_written | (1ull << VARYING_SLOT_LAYER); - uint32_t vertex_count = u_vertices_per_prim(pipeline->topology); + uint32_t vertex_count = mesa_vertices_per_prim(pipeline->topology); nir->info.gs.input_primitive = multiview_gs_input_primitive_from_pipeline(pipeline); nir->info.gs.output_primitive = @@ -2297,7 +2291,7 @@ pipeline_add_multiview_gs(struct v3dv_pipeline *pipeline, out_layer->data.location = VARYING_SLOT_LAYER; /* Get the view index value that we will write to gl_Layer */ - nir_ssa_def *layer = + nir_def *layer = nir_load_system_value(&b, nir_intrinsic_load_view_index, 0, 1, 32); /* Emit all output vertices */ @@ -2323,8 +2317,7 @@ pipeline_add_multiview_gs(struct v3dv_pipeline *pipeline, /* Attach the geometry shader to the pipeline */ struct v3dv_device *device = pipeline->device; - struct v3dv_physical_device *physical_device = - &device->instance->physicalDevice; + struct v3dv_physical_device *physical_device = device->pdevice; struct v3dv_pipeline_stage *p_stage = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*p_stage), 8, @@ -2340,21 +2333,36 @@ pipeline_add_multiview_gs(struct v3dv_pipeline *pipeline, p_stage->entrypoint = "main"; p_stage->module = 0; p_stage->nir = nir; - pipeline_compute_sha1_from_nir(p_stage->nir, p_stage->shader_sha1); + pipeline_compute_sha1_from_nir(p_stage); p_stage->program_id = p_atomic_inc_return(&physical_device->next_program_id); + p_stage->robustness = pipeline->stages[BROADCOM_SHADER_VERTEX]->robustness; pipeline->has_gs = true; - pipeline->gs = p_stage; + pipeline->stages[BROADCOM_SHADER_GEOMETRY] = p_stage; pipeline->active_stages |= MESA_SHADER_GEOMETRY; - pipeline->gs_bin = - pipeline_stage_create_binning(pipeline->gs, pAllocator); - if (pipeline->gs_bin == NULL) - return false; + pipeline->stages[BROADCOM_SHADER_GEOMETRY_BIN] = + pipeline_stage_create_binning(p_stage, pAllocator); + if (pipeline->stages[BROADCOM_SHADER_GEOMETRY_BIN] 
== NULL) + return false; return true; } +static void +pipeline_check_buffer_device_address(struct v3dv_pipeline *pipeline) +{ + for (int i = BROADCOM_SHADER_VERTEX; i < BROADCOM_SHADER_STAGES; i++) { + struct v3dv_shader_variant *variant = pipeline->shared_data->variants[i]; + if (variant && variant->prog_data.base->has_global_address) { + pipeline->uses_buffer_device_address = true; + return; + } + } + + pipeline->uses_buffer_device_address = false; +} + /* * This compiles a pipeline. Note that it also allocates internal objects, but if * some allocations succeed while others fail, the method does not free the @@ -2371,14 +2379,13 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline, const VkGraphicsPipelineCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator) { - VkPipelineCreationFeedbackEXT pipeline_feedback = { - .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT, + VkPipelineCreationFeedback pipeline_feedback = { + .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT, }; int64_t pipeline_start = os_time_get_nano(); struct v3dv_device *device = pipeline->device; - struct v3dv_physical_device *physical_device = - &device->instance->physicalDevice; + struct v3dv_physical_device *physical_device = device->pdevice; /* First pass to get some common info from the shader, and create the * individual pipeline_stage objects @@ -2394,26 +2401,24 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline, if (p_stage == NULL) return VK_ERROR_OUT_OF_HOST_MEMORY; - /* Note that we are assigning program_id slightly differently that - * v3d. Here we are assigning one per pipeline stage, so vs and vs_bin - * would have a different program_id, while v3d would have the same for - * both. For the case of v3dv, it is more natural to have an id this way, - * as right now we are using it for debugging, not for shader-db. - */ p_stage->program_id = p_atomic_inc_return(&physical_device->next_program_id); + enum broadcom_shader_stage broadcom_stage = + gl_shader_stage_to_broadcom(stage); + p_stage->pipeline = pipeline; - p_stage->stage = gl_shader_stage_to_broadcom(stage); + p_stage->stage = broadcom_stage; p_stage->entrypoint = sinfo->pName; p_stage->module = vk_shader_module_from_handle(sinfo->module); p_stage->spec_info = sinfo->pSpecializationInfo; - pipeline_hash_shader(p_stage->module, - p_stage->entrypoint, - stage, - p_stage->spec_info, - p_stage->shader_sha1); + vk_pipeline_robustness_state_fill(&device->vk, &p_stage->robustness, + pCreateInfo->pNext, sinfo->pNext); + + vk_pipeline_hash_shader_stage(&pCreateInfo->pStages[i], + &p_stage->robustness, + p_stage->shader_sha1); pipeline->active_stages |= sinfo->stage; @@ -2421,36 +2426,24 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline, * worry about getting the nir shader for now.
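pipeline_check_buffer_device_address() above derives a pipeline-wide flag from the compiled variants. The scan pattern, reduced to stand-in types:

#include <stdbool.h>
#include <stddef.h>

struct variant { bool has_global_address; };

static bool
pipeline_uses_bda(struct variant *const *variants, int count)
{
   for (int i = 0; i < count; i++) {
      if (variants[i] && variants[i]->has_global_address)
         return true; /* one variant using global addresses is enough */
   }
   return false;
}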
*/ p_stage->nir = NULL; - - switch(stage) { - case MESA_SHADER_VERTEX: - pipeline->vs = p_stage; - pipeline->vs_bin = - pipeline_stage_create_binning(pipeline->vs, pAllocator); - if (pipeline->vs_bin == NULL) - return VK_ERROR_OUT_OF_HOST_MEMORY; - break; - - case MESA_SHADER_GEOMETRY: + pipeline->stages[broadcom_stage] = p_stage; + if (broadcom_stage == BROADCOM_SHADER_GEOMETRY) pipeline->has_gs = true; - pipeline->gs = p_stage; - pipeline->gs_bin = - pipeline_stage_create_binning(pipeline->gs, pAllocator); - if (pipeline->gs_bin == NULL) - return VK_ERROR_OUT_OF_HOST_MEMORY; - break; - case MESA_SHADER_FRAGMENT: - pipeline->fs = p_stage; - break; + if (broadcom_shader_stage_is_render_with_binning(broadcom_stage)) { + enum broadcom_shader_stage broadcom_stage_bin = + broadcom_binning_shader_stage_for_render_stage(broadcom_stage); - default: - unreachable("not supported shader stage"); + pipeline->stages[broadcom_stage_bin] = + pipeline_stage_create_binning(p_stage, pAllocator); + + if (pipeline->stages[broadcom_stage_bin] == NULL) + return VK_ERROR_OUT_OF_HOST_MEMORY; } } /* Add a no-op fragment shader if needed */ - if (!pipeline->fs) { + if (!pipeline->stages[BROADCOM_SHADER_FRAGMENT]) { nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, &v3dv_nir_options, "noop_fs"); @@ -2467,109 +2460,126 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline, p_stage->entrypoint = "main"; p_stage->module = 0; p_stage->nir = b.shader; - pipeline_compute_sha1_from_nir(p_stage->nir, p_stage->shader_sha1); + vk_pipeline_robustness_state_fill(&device->vk, &p_stage->robustness, + NULL, NULL); + pipeline_compute_sha1_from_nir(p_stage); p_stage->program_id = p_atomic_inc_return(&physical_device->next_program_id); - pipeline->fs = p_stage; + pipeline->stages[BROADCOM_SHADER_FRAGMENT] = p_stage; pipeline->active_stages |= MESA_SHADER_FRAGMENT; } /* If multiview is enabled, we inject a custom passthrough geometry shader * to broadcast draw calls to the appropriate views. 
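The injected passthrough geometry shader exists to broadcast each draw to every view selected by the view mask, writing gl_Layer accordingly. A sketch of the broadcast idea only; emit_for_view() is hypothetical and the real work happens in the generated NIR:

#include <stdint.h>

static void
broadcast_draw(uint32_t view_mask, void (*emit_for_view)(uint32_t view))
{
   while (view_mask) {
      uint32_t view = __builtin_ctz(view_mask); /* lowest set bit */
      emit_for_view(view);
      view_mask &= view_mask - 1; /* clear that bit, continue with the rest */
   }
}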
*/ - assert(!pipeline->subpass->view_mask || (!pipeline->has_gs && !pipeline->gs)); - if (pipeline->subpass->view_mask) { + const uint32_t view_mask = pipeline->rendering_info.view_mask; + assert(!view_mask || + (!pipeline->has_gs && !pipeline->stages[BROADCOM_SHADER_GEOMETRY])); + if (view_mask) { if (!pipeline_add_multiview_gs(pipeline, cache, pAllocator)) return VK_ERROR_OUT_OF_HOST_MEMORY; } - /* First we try to get the variants from the pipeline cache */ - struct v3dv_pipeline_key pipeline_key; - pipeline_populate_graphics_key(pipeline, &pipeline_key, pCreateInfo); - unsigned char pipeline_sha1[20]; - pipeline_hash_graphics(pipeline, &pipeline_key, pipeline_sha1); - - bool cache_hit = false; - - pipeline->shared_data = - v3dv_pipeline_cache_search_for_pipeline(cache, - pipeline_sha1, - &cache_hit); - - if (pipeline->shared_data != NULL) { - /* A correct pipeline must have at least a VS and FS */ - assert(pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]); - assert(pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]); - assert(pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]); - assert(!pipeline->gs || - pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY]); - assert(!pipeline->gs || - pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN]); - - if (cache_hit && cache != &pipeline->device->default_pipeline_cache) - pipeline_feedback.flags |= - VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT; - - goto success; + /* First we try to get the variants from the pipeline cache (unless we are + * required to capture internal representations, since in that case we need + * to compile). + */ + bool needs_executable_info = + pCreateInfo->flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR; + if (!needs_executable_info) { + struct v3dv_pipeline_key pipeline_key; + pipeline_populate_graphics_key(pipeline, &pipeline_key, pCreateInfo); + pipeline_hash_graphics(pipeline, &pipeline_key, pipeline->sha1); + + bool cache_hit = false; + + pipeline->shared_data = + v3dv_pipeline_cache_search_for_pipeline(cache, + pipeline->sha1, + &cache_hit); + + if (pipeline->shared_data != NULL) { + /* A correct pipeline must have at least a VS and FS */ + assert(pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]); + assert(pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]); + assert(pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]); + assert(!pipeline->stages[BROADCOM_SHADER_GEOMETRY] || + pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY]); + assert(!pipeline->stages[BROADCOM_SHADER_GEOMETRY] || + pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN]); + + if (cache_hit && cache != &pipeline->device->default_pipeline_cache) + pipeline_feedback.flags |= + VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT; + + goto success; + } } - if (pCreateInfo->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT) - return VK_PIPELINE_COMPILE_REQUIRED_EXT; + if (pCreateInfo->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT) + return VK_PIPELINE_COMPILE_REQUIRED; /* Otherwise we try to get the NIR shaders (either from the original SPIR-V * shader or the pipeline cache) and compile.
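The control flow above: skip the cache when executable info must be captured, otherwise try the cache, and only then honor FAIL_ON_PIPELINE_COMPILE_REQUIRED before compiling. A condensed sketch with hypothetical callbacks:

#include <stdbool.h>
#include <stddef.h>

enum get_variants_result { HIT, COMPILE_REQUIRED, COMPILED };

static enum get_variants_result
get_variants(bool needs_executable_info, bool fail_on_compile_required,
             bool (*cache_search)(void), void (*compile)(void))
{
   if (!needs_executable_info && cache_search())
      return HIT;                 /* reuse cached shared data */
   if (fail_on_compile_required)
      return COMPILE_REQUIRED;    /* app forbids compiling here */
   compile();
   return COMPILED;
}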
*/ pipeline->shared_data = - v3dv_pipeline_shared_data_new_empty(pipeline_sha1, pipeline, true); - - pipeline->vs->feedback.flags |= - VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT; - if (pipeline->gs) - pipeline->gs->feedback.flags |= - VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT; - pipeline->fs->feedback.flags |= - VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT; - - if (!pipeline->vs->nir) - pipeline->vs->nir = pipeline_stage_get_nir(pipeline->vs, pipeline, cache); - if (pipeline->gs && !pipeline->gs->nir) - pipeline->gs->nir = pipeline_stage_get_nir(pipeline->gs, pipeline, cache); - if (!pipeline->fs->nir) - pipeline->fs->nir = pipeline_stage_get_nir(pipeline->fs, pipeline, cache); + v3dv_pipeline_shared_data_new_empty(pipeline->sha1, pipeline, true); + if (!pipeline->shared_data) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + struct v3dv_pipeline_stage *p_stage_vs = pipeline->stages[BROADCOM_SHADER_VERTEX]; + struct v3dv_pipeline_stage *p_stage_fs = pipeline->stages[BROADCOM_SHADER_FRAGMENT]; + struct v3dv_pipeline_stage *p_stage_gs = pipeline->stages[BROADCOM_SHADER_GEOMETRY]; + + p_stage_vs->feedback.flags |= + VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT; + if (p_stage_gs) + p_stage_gs->feedback.flags |= + VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT; + p_stage_fs->feedback.flags |= + VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT; + + if (!p_stage_vs->nir) + p_stage_vs->nir = pipeline_stage_get_nir(p_stage_vs, pipeline, cache); + if (p_stage_gs && !p_stage_gs->nir) + p_stage_gs->nir = pipeline_stage_get_nir(p_stage_gs, pipeline, cache); + if (!p_stage_fs->nir) + p_stage_fs->nir = pipeline_stage_get_nir(p_stage_fs, pipeline, cache); /* Linking + pipeline lowerings */ - if (pipeline->gs) { - link_shaders(pipeline->gs->nir, pipeline->fs->nir); - link_shaders(pipeline->vs->nir, pipeline->gs->nir); + if (p_stage_gs) { + link_shaders(p_stage_gs->nir, p_stage_fs->nir); + link_shaders(p_stage_vs->nir, p_stage_gs->nir); } else { - link_shaders(pipeline->vs->nir, pipeline->fs->nir); + link_shaders(p_stage_vs->nir, p_stage_fs->nir); } - pipeline_lower_nir(pipeline, pipeline->fs, pipeline->layout); - lower_fs_io(pipeline->fs->nir); + pipeline_lower_nir(pipeline, p_stage_fs, pipeline->layout); + lower_fs_io(p_stage_fs->nir); - if (pipeline->gs) { - pipeline_lower_nir(pipeline, pipeline->gs, pipeline->layout); - lower_gs_io(pipeline->gs->nir); + if (p_stage_gs) { + pipeline_lower_nir(pipeline, p_stage_gs, pipeline->layout); + lower_gs_io(p_stage_gs->nir); } - pipeline_lower_nir(pipeline, pipeline->vs, pipeline->layout); - lower_vs_io(pipeline->vs->nir); + pipeline_lower_nir(pipeline, p_stage_vs, pipeline->layout); + lower_vs_io(p_stage_vs->nir); /* Compiling to vir */ VkResult vk_result; /* We should have got all the variants or no variants from the cache */ assert(!pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]); - vk_result = pipeline_compile_fragment_shader(pipeline, pAllocator, pCreateInfo); + vk_result = pipeline_compile_fragment_shader(pipeline, pAllocator, + pCreateInfo); if (vk_result != VK_SUCCESS) return vk_result; assert(!pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY] && !pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN]); - if (pipeline->gs) { + if (p_stage_gs) { vk_result = pipeline_compile_geometry_shader(pipeline, pAllocator, pCreateInfo); if (vk_result != VK_SUCCESS) @@ -2590,6 +2600,8 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline, success: + pipeline_check_buffer_device_address(pipeline); + pipeline_feedback.duration = os_time_get_nano() - pipeline_start; 
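The feedback durations above come from differencing monotonic timestamps around each stage and around the whole pipeline build. A portable approximation of os_time_get_nano() using POSIX clock_gettime():

#include <stdint.h>
#include <time.h>

static int64_t
now_ns(void)
{
   struct timespec ts;
   clock_gettime(CLOCK_MONOTONIC, &ts);
   return (int64_t)ts.tv_sec * 1000000000 + ts.tv_nsec;
}

/* usage: int64_t start = now_ns(); ... duration += now_ns() - start; */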
write_creation_feedback(pipeline, pCreateInfo->pNext, @@ -2600,7 +2612,8 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline, /* Since we have the variants in the pipeline shared data we can now free * the pipeline stages. */ - pipeline_free_stages(device, pipeline, pAllocator); + if (!needs_executable_info) + pipeline_free_stages(device, pipeline, pAllocator); pipeline_check_spill_size(pipeline); @@ -2638,139 +2651,11 @@ compute_vpm_config(struct v3dv_pipeline *pipeline) return VK_SUCCESS; } -static unsigned -v3dv_dynamic_state_mask(VkDynamicState state) -{ - switch(state) { - case VK_DYNAMIC_STATE_VIEWPORT: - return V3DV_DYNAMIC_VIEWPORT; - case VK_DYNAMIC_STATE_SCISSOR: - return V3DV_DYNAMIC_SCISSOR; - case VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK: - return V3DV_DYNAMIC_STENCIL_COMPARE_MASK; - case VK_DYNAMIC_STATE_STENCIL_WRITE_MASK: - return V3DV_DYNAMIC_STENCIL_WRITE_MASK; - case VK_DYNAMIC_STATE_STENCIL_REFERENCE: - return V3DV_DYNAMIC_STENCIL_REFERENCE; - case VK_DYNAMIC_STATE_BLEND_CONSTANTS: - return V3DV_DYNAMIC_BLEND_CONSTANTS; - case VK_DYNAMIC_STATE_DEPTH_BIAS: - return V3DV_DYNAMIC_DEPTH_BIAS; - case VK_DYNAMIC_STATE_LINE_WIDTH: - return V3DV_DYNAMIC_LINE_WIDTH; - case VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT: - return V3DV_DYNAMIC_COLOR_WRITE_ENABLE; - - /* Depth bounds testing is not available in in V3D 4.2 so here we are just - * ignoring this dynamic state. We are already asserting at pipeline creation - * time that depth bounds testing is not enabled. - */ - case VK_DYNAMIC_STATE_DEPTH_BOUNDS: - return 0; - - default: - unreachable("Unhandled dynamic state"); - } -} - -static void -pipeline_init_dynamic_state( - struct v3dv_pipeline *pipeline, - const VkPipelineDynamicStateCreateInfo *pDynamicState, - const VkPipelineViewportStateCreateInfo *pViewportState, - const VkPipelineDepthStencilStateCreateInfo *pDepthStencilState, - const VkPipelineColorBlendStateCreateInfo *pColorBlendState, - const VkPipelineRasterizationStateCreateInfo *pRasterizationState, - const VkPipelineColorWriteCreateInfoEXT *pColorWriteState) -{ - pipeline->dynamic_state = default_dynamic_state; - struct v3dv_dynamic_state *dynamic = &pipeline->dynamic_state; - - /* Create a mask of enabled dynamic states */ - uint32_t dynamic_states = 0; - if (pDynamicState) { - uint32_t count = pDynamicState->dynamicStateCount; - for (uint32_t s = 0; s < count; s++) { - dynamic_states |= - v3dv_dynamic_state_mask(pDynamicState->pDynamicStates[s]); - } - } - - /* For any pipeline states that are not dynamic, set the dynamic state - * from the static pipeline state. 
- */ - if (pViewportState) { - if (!(dynamic_states & V3DV_DYNAMIC_VIEWPORT)) { - dynamic->viewport.count = pViewportState->viewportCount; - typed_memcpy(dynamic->viewport.viewports, pViewportState->pViewports, - pViewportState->viewportCount); - - for (uint32_t i = 0; i < dynamic->viewport.count; i++) { - v3dv_viewport_compute_xform(&dynamic->viewport.viewports[i], - dynamic->viewport.scale[i], - dynamic->viewport.translate[i]); - } - } - - if (!(dynamic_states & V3DV_DYNAMIC_SCISSOR)) { - dynamic->scissor.count = pViewportState->scissorCount; - typed_memcpy(dynamic->scissor.scissors, pViewportState->pScissors, - pViewportState->scissorCount); - } - } - - if (pDepthStencilState) { - if (!(dynamic_states & V3DV_DYNAMIC_STENCIL_COMPARE_MASK)) { - dynamic->stencil_compare_mask.front = - pDepthStencilState->front.compareMask; - dynamic->stencil_compare_mask.back = - pDepthStencilState->back.compareMask; - } - - if (!(dynamic_states & V3DV_DYNAMIC_STENCIL_WRITE_MASK)) { - dynamic->stencil_write_mask.front = pDepthStencilState->front.writeMask; - dynamic->stencil_write_mask.back = pDepthStencilState->back.writeMask; - } - - if (!(dynamic_states & V3DV_DYNAMIC_STENCIL_REFERENCE)) { - dynamic->stencil_reference.front = pDepthStencilState->front.reference; - dynamic->stencil_reference.back = pDepthStencilState->back.reference; - } - } - - if (pColorBlendState && !(dynamic_states & V3DV_DYNAMIC_BLEND_CONSTANTS)) { - memcpy(dynamic->blend_constants, pColorBlendState->blendConstants, - sizeof(dynamic->blend_constants)); - } - - if (pRasterizationState) { - if (pRasterizationState->depthBiasEnable && - !(dynamic_states & V3DV_DYNAMIC_DEPTH_BIAS)) { - dynamic->depth_bias.constant_factor = - pRasterizationState->depthBiasConstantFactor; - dynamic->depth_bias.depth_bias_clamp = - pRasterizationState->depthBiasClamp; - dynamic->depth_bias.slope_factor = - pRasterizationState->depthBiasSlopeFactor; - } - if (!(dynamic_states & V3DV_DYNAMIC_LINE_WIDTH)) - dynamic->line_width = pRasterizationState->lineWidth; - } - - if (pColorWriteState && !(dynamic_states & V3DV_DYNAMIC_COLOR_WRITE_ENABLE)) { - dynamic->color_write_enable = 0; - for (uint32_t i = 0; i < pColorWriteState->attachmentCount; i++) - dynamic->color_write_enable |= pColorWriteState->pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0; - } - - pipeline->dynamic_state.mask = dynamic_states; -} - static bool -stencil_op_is_no_op(const VkStencilOpState *stencil) +stencil_op_is_no_op(struct vk_stencil_test_face_state *stencil) { - return stencil->depthFailOp == VK_STENCIL_OP_KEEP && - stencil->compareOp == VK_COMPARE_OP_ALWAYS; + return stencil->op.depth_fail == VK_STENCIL_OP_KEEP && + stencil->op.compare == VK_COMPARE_OP_ALWAYS; } static void @@ -2786,113 +2671,63 @@ enable_depth_bias(struct v3dv_pipeline *pipeline, /* Check the depth/stencil attachment description for the subpass used with * this pipeline. 
*/ - assert(pipeline->pass && pipeline->subpass); - struct v3dv_render_pass *pass = pipeline->pass; - struct v3dv_subpass *subpass = pipeline->subpass; - - if (subpass->ds_attachment.attachment == VK_ATTACHMENT_UNUSED) - return; - - assert(subpass->ds_attachment.attachment < pass->attachment_count); - struct v3dv_render_pass_attachment *att = - &pass->attachments[subpass->ds_attachment.attachment]; - - if (att->desc.format == VK_FORMAT_D16_UNORM) + VkFormat ds_format = pipeline->rendering_info.depth_attachment_format; + if (ds_format == VK_FORMAT_D16_UNORM) pipeline->depth_bias.is_z16 = true; pipeline->depth_bias.enabled = true; } -static void -pipeline_set_ez_state(struct v3dv_pipeline *pipeline, - const VkPipelineDepthStencilStateCreateInfo *ds_info) +/* Computes the ez_state based on a given vk_dynamic_graphics_state. Note + * that the parameter dyn doesn't need to be pipeline->dynamic_graphics_state, + * as this method can be used by the cmd_buffer too. + */ +void +v3dv_compute_ez_state(struct vk_dynamic_graphics_state *dyn, + struct v3dv_pipeline *pipeline, + enum v3dv_ez_state *ez_state, + bool *incompatible_ez_test) { - if (!ds_info || !ds_info->depthTestEnable) { - pipeline->ez_state = V3D_EZ_DISABLED; + if (!dyn->ds.depth.test_enable) { + *ez_state = V3D_EZ_DISABLED; return; } - switch (ds_info->depthCompareOp) { + switch (dyn->ds.depth.compare_op) { case VK_COMPARE_OP_LESS: case VK_COMPARE_OP_LESS_OR_EQUAL: - pipeline->ez_state = V3D_EZ_LT_LE; + *ez_state = V3D_EZ_LT_LE; break; case VK_COMPARE_OP_GREATER: case VK_COMPARE_OP_GREATER_OR_EQUAL: - pipeline->ez_state = V3D_EZ_GT_GE; + *ez_state = V3D_EZ_GT_GE; break; case VK_COMPARE_OP_NEVER: case VK_COMPARE_OP_EQUAL: - pipeline->ez_state = V3D_EZ_UNDECIDED; + *ez_state = V3D_EZ_UNDECIDED; break; default: - pipeline->ez_state = V3D_EZ_DISABLED; + *ez_state = V3D_EZ_DISABLED; + *incompatible_ez_test = true; break; } /* If stencil is enabled and is not a no-op, we need to disable EZ */ - if (ds_info->stencilTestEnable && - (!stencil_op_is_no_op(&ds_info->front) || - !stencil_op_is_no_op(&ds_info->back))) { - pipeline->ez_state = V3D_EZ_DISABLED; + if (dyn->ds.stencil.test_enable && + (!stencil_op_is_no_op(&dyn->ds.stencil.front) || + !stencil_op_is_no_op(&dyn->ds.stencil.back))) { + *ez_state = V3D_EZ_DISABLED; } -} -static bool -pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline) -{ - for (uint8_t i = 0; i < pipeline->va_count; i++) { - if (vk_format_is_int(pipeline->va[i].vk_format)) - return true; + /* If the FS writes Z, then it may update against the chosen EZ direction */ + struct v3dv_shader_variant *fs_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]; + if (fs_variant && fs_variant->prog_data.fs->writes_z && + !fs_variant->prog_data.fs->writes_z_from_fep) { + *ez_state = V3D_EZ_DISABLED; } - return false; } -/* @pipeline can be NULL. We assume in that case that all the attributes have - * a float format (we only create an all-float BO once and we reuse it with - * all float pipelines), otherwise we look at the actual type of each - * attribute used with the specific pipeline passed in. 
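Since v3dv_compute_ez_state() above now receives the dynamic state as a parameter, the same logic can be re-evaluated at draw time when depth/stencil state is set dynamically. A hypothetical command-buffer caller (the field location on cmd_buffer is an assumption):

enum v3dv_ez_state ez_state;
bool incompatible_ez_test = false;
v3dv_compute_ez_state(&cmd_buffer->vk.dynamic_graphics_state, /* assumed location */
                      pipeline, &ez_state, &incompatible_ez_test);
/* e.g. VK_COMPARE_OP_LESS yields V3D_EZ_LT_LE, while VK_COMPARE_OP_NOT_EQUAL
 * hits the default case: EZ disabled and the test flagged incompatible. */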
- */ -struct v3dv_bo * -v3dv_pipeline_create_default_attribute_values(struct v3dv_device *device, - struct v3dv_pipeline *pipeline) -{ - uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4; - struct v3dv_bo *bo; - - bo = v3dv_bo_alloc(device, size, "default_vi_attributes", true); - - if (!bo) { - fprintf(stderr, "failed to allocate memory for the default " - "attribute values\n"); - return NULL; - } - - bool ok = v3dv_bo_map(device, bo, size); - if (!ok) { - fprintf(stderr, "failed to map default attribute values buffer\n"); - return false; - } - - uint32_t *attrs = bo->map; - uint8_t va_count = pipeline != NULL ? pipeline->va_count : 0; - for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) { - attrs[i * 4 + 0] = 0; - attrs[i * 4 + 1] = 0; - attrs[i * 4 + 2] = 0; - VkFormat attr_format = - pipeline != NULL ? pipeline->va[i].vk_format : VK_FORMAT_UNDEFINED; - if (i < va_count && vk_format_is_int(attr_format)) { - attrs[i * 4 + 3] = 1; - } else { - attrs[i * 4 + 3] = fui(1.0); - } - } - - v3dv_bo_unmap(device, bo); - - return bo; -} static void pipeline_set_sample_mask(struct v3dv_pipeline *pipeline, @@ -2918,6 +2753,135 @@ pipeline_set_sample_rate_shading(struct v3dv_pipeline *pipeline, ms_info->sampleShadingEnable; } +static void +pipeline_setup_rendering_info(struct v3dv_device *device, + struct v3dv_pipeline *pipeline, + const VkGraphicsPipelineCreateInfo *pCreateInfo, + const VkAllocationCallbacks *alloc) +{ + struct vk_render_pass_state *rp = &pipeline->rendering_info; + + if (pipeline->pass) { + assert(pipeline->subpass); + struct v3dv_render_pass *pass = pipeline->pass; + struct v3dv_subpass *subpass = pipeline->subpass; + const uint32_t attachment_idx = subpass->ds_attachment.attachment; + + rp->view_mask = subpass->view_mask; + + rp->depth_attachment_format = VK_FORMAT_UNDEFINED; + rp->stencil_attachment_format = VK_FORMAT_UNDEFINED; + rp->attachments = MESA_VK_RP_ATTACHMENT_NONE; + if (attachment_idx != VK_ATTACHMENT_UNUSED) { + VkFormat ds_format = pass->attachments[attachment_idx].desc.format; + if (vk_format_has_depth(ds_format)) { + rp->depth_attachment_format = ds_format; + rp->attachments |= MESA_VK_RP_ATTACHMENT_DEPTH_BIT; + } + if (vk_format_has_stencil(ds_format)) { + rp->stencil_attachment_format = ds_format; + rp->attachments |= MESA_VK_RP_ATTACHMENT_STENCIL_BIT; + } + } + + rp->color_attachment_count = subpass->color_count; + for (uint32_t i = 0; i < subpass->color_count; i++) { + const uint32_t attachment_idx = subpass->color_attachments[i].attachment; + if (attachment_idx == VK_ATTACHMENT_UNUSED) { + rp->color_attachment_formats[i] = VK_FORMAT_UNDEFINED; + continue; + } + rp->color_attachment_formats[i] = + pass->attachments[attachment_idx].desc.format; + rp->attachments |= MESA_VK_RP_ATTACHMENT_COLOR_BIT(i); + } + return; + } + + const VkPipelineRenderingCreateInfo *ri = + vk_find_struct_const(pCreateInfo->pNext, + PIPELINE_RENDERING_CREATE_INFO); + if (ri) { + rp->view_mask = ri->viewMask; + + rp->color_attachment_count = ri->colorAttachmentCount; + for (int i = 0; i < ri->colorAttachmentCount; i++) { + rp->color_attachment_formats[i] = ri->pColorAttachmentFormats[i]; + if (rp->color_attachment_formats[i] != VK_FORMAT_UNDEFINED) { + rp->attachments |= MESA_VK_RP_ATTACHMENT_COLOR_BIT(i); + } + } + + rp->depth_attachment_format = ri->depthAttachmentFormat; + if (ri->depthAttachmentFormat != VK_FORMAT_UNDEFINED) + rp->attachments |= MESA_VK_RP_ATTACHMENT_DEPTH_BIT; + + rp->stencil_attachment_format = ri->stencilAttachmentFormat; + if (ri->stencilAttachmentFormat != 
VK_FORMAT_UNDEFINED) + rp->attachments |= MESA_VK_RP_ATTACHMENT_STENCIL_BIT; + + return; + } + + /* From the Vulkan spec for VkPipelineRenderingCreateInfo: + * + * "if this structure is not specified, and the pipeline does not include + * a VkRenderPass, viewMask and colorAttachmentCount are 0, and + * depthAttachmentFormat and stencilAttachmentFormat are + * VK_FORMAT_UNDEFINED." + */ + pipeline->rendering_info = (struct vk_render_pass_state) { + .view_mask = 0, + .attachments = 0, + .color_attachment_count = 0, + .depth_attachment_format = VK_FORMAT_UNDEFINED, + .stencil_attachment_format = VK_FORMAT_UNDEFINED, + }; +} + +static VkResult +pipeline_init_dynamic_state(struct v3dv_device *device, + struct v3dv_pipeline *pipeline, + struct vk_graphics_pipeline_state *pipeline_state, + const VkGraphicsPipelineCreateInfo *pCreateInfo, + const VkPipelineColorWriteCreateInfoEXT *cw_info) +{ + VkResult result = VK_SUCCESS; + struct vk_graphics_pipeline_all_state all; + result = vk_graphics_pipeline_state_fill(&pipeline->device->vk, pipeline_state, + pCreateInfo, &pipeline->rendering_info, 0, + &all, NULL, 0, NULL); + if (result != VK_SUCCESS) + return result; + + vk_dynamic_graphics_state_fill(&pipeline->dynamic_graphics_state, pipeline_state); + + struct v3dv_dynamic_state *v3dv_dyn = &pipeline->dynamic; + struct vk_dynamic_graphics_state *dyn = &pipeline->dynamic_graphics_state; + + if (BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_VP_VIEWPORTS) || + BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_VP_SCISSORS)) { + /* FIXME: right now we don't support multiViewport so viewports[0] would + * work now, but would need to change if we allow multiple viewports. + */ + v3dv_X(device, viewport_compute_xform)(&dyn->vp.viewports[0], + v3dv_dyn->viewport.scale[0], + v3dv_dyn->viewport.translate[0]); + + } + + v3dv_dyn->color_write_enable = + (1ull << (4 * V3D_MAX_RENDER_TARGETS(device->devinfo.ver))) - 1; + if (cw_info && BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES)) { + v3dv_dyn->color_write_enable = 0; + for (uint32_t i = 0; i < cw_info->attachmentCount; i++) + v3dv_dyn->color_write_enable |= + cw_info->pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0; + } + + return result; +} + static VkResult pipeline_init(struct v3dv_pipeline *pipeline, struct v3dv_device *device, @@ -2928,25 +2892,34 @@ pipeline_init(struct v3dv_pipeline *pipeline, VkResult result = VK_SUCCESS; pipeline->device = device; + pipeline->flags = pCreateInfo->flags; V3DV_FROM_HANDLE(v3dv_pipeline_layout, layout, pCreateInfo->layout); pipeline->layout = layout; + v3dv_pipeline_layout_ref(pipeline->layout); V3DV_FROM_HANDLE(v3dv_render_pass, render_pass, pCreateInfo->renderPass); - assert(pCreateInfo->subpass < render_pass->subpass_count); - pipeline->pass = render_pass; - pipeline->subpass = &render_pass->subpasses[pCreateInfo->subpass]; + if (render_pass) { + assert(pCreateInfo->subpass < render_pass->subpass_count); + pipeline->pass = render_pass; + pipeline->subpass = &render_pass->subpasses[pCreateInfo->subpass]; + } + + pipeline_setup_rendering_info(device, pipeline, pCreateInfo, pAllocator); const VkPipelineInputAssemblyStateCreateInfo *ia_info = pCreateInfo->pInputAssemblyState; - pipeline->topology = vk_to_pipe_prim_type[ia_info->topology]; + pipeline->topology = vk_to_mesa_prim[ia_info->topology]; /* If rasterization is not enabled, various CreateInfo structs must be * ignored. 
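For reference, this is the application-side structure that pipeline_setup_rendering_info() above parses when no render pass is provided (standard VK_KHR_dynamic_rendering / Vulkan 1.3 API; the formats are just examples):

VkFormat color_format = VK_FORMAT_R8G8B8A8_UNORM;
VkPipelineRenderingCreateInfo rendering_info = {
   .sType = VK_STRUCTURE_TYPE_PIPELINE_RENDERING_CREATE_INFO,
   .viewMask = 0,
   .colorAttachmentCount = 1,
   .pColorAttachmentFormats = &color_format,
   .depthAttachmentFormat = VK_FORMAT_D32_SFLOAT_S8_UINT,
   .stencilAttachmentFormat = VK_FORMAT_D32_SFLOAT_S8_UINT,
};
/* Chained into VkGraphicsPipelineCreateInfo::pNext with renderPass set to
 * VK_NULL_HANDLE; the driver copies these formats into rendering_info. */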
*/ const bool raster_enabled = + pCreateInfo->pRasterizationState && !pCreateInfo->pRasterizationState->rasterizerDiscardEnable; + pipeline->rasterization_enabled = raster_enabled; + const VkPipelineViewportStateCreateInfo *vp_info = raster_enabled ? pCreateInfo->pViewportState : NULL; @@ -2957,11 +2930,17 @@ pipeline_init(struct v3dv_pipeline *pipeline, raster_enabled ? pCreateInfo->pRasterizationState : NULL; const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *pv_info = - rs_info ? vk_find_struct_const( + raster_enabled ? vk_find_struct_const( rs_info->pNext, PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT) : NULL; + const VkPipelineRasterizationLineStateCreateInfoEXT *ls_info = + raster_enabled ? vk_find_struct_const( + rs_info->pNext, + PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT) : + NULL; + const VkPipelineColorBlendStateCreateInfo *cb_info = raster_enabled ? pCreateInfo->pColorBlendState : NULL; @@ -2973,22 +2952,35 @@ pipeline_init(struct v3dv_pipeline *pipeline, PIPELINE_COLOR_WRITE_CREATE_INFO_EXT) : NULL; - pipeline_init_dynamic_state(pipeline, - pCreateInfo->pDynamicState, - vp_info, ds_info, cb_info, rs_info, cw_info); + struct vk_graphics_pipeline_state pipeline_state = { }; + result = pipeline_init_dynamic_state(device, pipeline, &pipeline_state, + pCreateInfo, cw_info); - /* V3D 4.2 doesn't support depth bounds testing so we don't advertise that - * feature and it shouldn't be used by any pipeline. - */ - assert(!ds_info || !ds_info->depthBoundsTestEnable); + if (result != VK_SUCCESS) { + /* Caller would already destroy the pipeline, and we didn't allocate any + * extra info. We don't need to do anything else. + */ + return result; + } - v3dv_X(device, pipeline_pack_state)(pipeline, cb_info, ds_info, - rs_info, pv_info, ms_info); + const VkPipelineViewportDepthClipControlCreateInfoEXT *depth_clip_control = + vp_info ? 
vk_find_struct_const(vp_info->pNext, + PIPELINE_VIEWPORT_DEPTH_CLIP_CONTROL_CREATE_INFO_EXT) : + NULL; + + if (depth_clip_control) + pipeline->negative_one_to_one = depth_clip_control->negativeOneToOne; - pipeline_set_ez_state(pipeline, ds_info); enable_depth_bias(pipeline, rs_info); + + v3dv_X(device, pipeline_pack_state)(pipeline, cb_info, ds_info, + rs_info, pv_info, ls_info, + ms_info, + &pipeline_state); + pipeline_set_sample_mask(pipeline, ms_info); pipeline_set_sample_rate_shading(pipeline, ms_info); + pipeline->line_smooth = enable_line_smooth(pipeline->topology, rs_info); pipeline->primitive_restart = pCreateInfo->pInputAssemblyState->primitiveRestartEnable; @@ -3011,15 +3003,22 @@ pipeline_init(struct v3dv_pipeline *pipeline, v3dv_X(device, pipeline_pack_compile_state)(pipeline, vi_info, vd_info); - if (pipeline_has_integer_vertex_attrib(pipeline)) { + if (v3dv_X(device, pipeline_needs_default_attribute_values)(pipeline)) { pipeline->default_attribute_values = - v3dv_pipeline_create_default_attribute_values(pipeline->device, pipeline); + v3dv_X(pipeline->device, create_default_attribute_values)(pipeline->device, pipeline); + if (!pipeline->default_attribute_values) return VK_ERROR_OUT_OF_DEVICE_MEMORY; } else { pipeline->default_attribute_values = NULL; } + /* This must be done after the pipeline has been compiled */ + v3dv_compute_ez_state(&pipeline->dynamic_graphics_state, + pipeline, + &pipeline->ez_state, + &pipeline->incompatible_ez_test); + return result; } @@ -3044,15 +3043,13 @@ graphics_pipeline_create(VkDevice _device, VK_OBJECT_TYPE_PIPELINE); if (pipeline == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - result = pipeline_init(pipeline, device, cache, - pCreateInfo, - pAllocator); + result = pipeline_init(pipeline, device, cache, pCreateInfo, pAllocator); if (result != VK_SUCCESS) { v3dv_destroy_pipeline(pipeline, device, pAllocator); - if (result == VK_PIPELINE_COMPILE_REQUIRED_EXT) + if (result == VK_PIPELINE_COMPILE_REQUIRED) *pPipeline = VK_NULL_HANDLE; return result; } @@ -3073,7 +3070,7 @@ v3dv_CreateGraphicsPipelines(VkDevice _device, V3DV_FROM_HANDLE(v3dv_device, device, _device); VkResult result = VK_SUCCESS; - if (unlikely(V3D_DEBUG & V3D_DEBUG_SHADERS)) + if (V3D_DBG(SHADERS)) mtx_lock(&device->pdevice->mutex); uint32_t i = 0; @@ -3091,7 +3088,7 @@ v3dv_CreateGraphicsPipelines(VkDevice _device, pPipelines[i] = VK_NULL_HANDLE; if (pCreateInfos[i].flags & - VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT) + VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT) break; } } @@ -3099,7 +3096,7 @@ v3dv_CreateGraphicsPipelines(VkDevice _device, for (; i < count; i++) pPipelines[i] = VK_NULL_HANDLE; - if (unlikely(V3D_DEBUG & V3D_DEBUG_SHADERS)) + if (V3D_DBG(SHADERS)) mtx_unlock(&device->pdevice->mutex); return result; @@ -3118,12 +3115,20 @@ shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align) } static void -lower_cs_shared(struct nir_shader *nir) +lower_compute(struct nir_shader *nir) { - NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, - nir_var_mem_shared, shared_type_info); - NIR_PASS_V(nir, nir_lower_explicit_io, - nir_var_mem_shared, nir_address_format_32bit_offset); + if (!nir->info.shared_memory_explicit_layout) { + NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, + nir_var_mem_shared, shared_type_info); + } + + NIR_PASS(_, nir, nir_lower_explicit_io, + nir_var_mem_shared, nir_address_format_32bit_offset); + + struct 
nir_lower_compute_system_values_options sysval_options = { + .has_base_workgroup_id = true, + }; + NIR_PASS_V(nir, nir_lower_compute_system_values, &sysval_options); } static VkResult @@ -3132,14 +3137,13 @@ pipeline_compile_compute(struct v3dv_pipeline *pipeline, const VkComputePipelineCreateInfo *info, const VkAllocationCallbacks *alloc) { - VkPipelineCreationFeedbackEXT pipeline_feedback = { - .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT, + VkPipelineCreationFeedback pipeline_feedback = { + .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT, }; int64_t pipeline_start = os_time_get_nano(); struct v3dv_device *device = pipeline->device; - struct v3dv_physical_device *physical_device = - &device->instance->physicalDevice; + struct v3dv_physical_device *physical_device = device->pdevice; const VkPipelineShaderStageCreateInfo *sinfo = &info->stage; gl_shader_stage stage = vk_to_mesa_shader_stage(sinfo->stage); @@ -3156,61 +3160,69 @@ pipeline_compile_compute(struct v3dv_pipeline *pipeline, p_stage->entrypoint = sinfo->pName; p_stage->module = vk_shader_module_from_handle(sinfo->module); p_stage->spec_info = sinfo->pSpecializationInfo; - p_stage->feedback = (VkPipelineCreationFeedbackEXT) { 0 }; + p_stage->feedback = (VkPipelineCreationFeedback) { 0 }; - pipeline_hash_shader(p_stage->module, - p_stage->entrypoint, - stage, - p_stage->spec_info, - p_stage->shader_sha1); + vk_pipeline_robustness_state_fill(&device->vk, &p_stage->robustness, + info->pNext, sinfo->pNext); + + vk_pipeline_hash_shader_stage(&info->stage, + &p_stage->robustness, + p_stage->shader_sha1); - /* We try to get directly the variant first from the cache */ p_stage->nir = NULL; - pipeline->cs = p_stage; + pipeline->stages[BROADCOM_SHADER_COMPUTE] = p_stage; pipeline->active_stages |= sinfo->stage; - struct v3dv_pipeline_key pipeline_key; - pipeline_populate_compute_key(pipeline, &pipeline_key, info); - unsigned char pipeline_sha1[20]; - pipeline_hash_compute(pipeline, &pipeline_key, pipeline_sha1); - - bool cache_hit = false; - pipeline->shared_data = - v3dv_pipeline_cache_search_for_pipeline(cache, pipeline_sha1, &cache_hit); - - if (pipeline->shared_data != NULL) { - assert(pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]); - if (cache_hit && cache != &pipeline->device->default_pipeline_cache) - pipeline_feedback.flags |= - VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT; - - goto success; + /* First we try to get the variants from the pipeline cache (unless we are + * required to capture internal representations, since in that case we need + * to compile). 
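The has_base_workgroup_id option set in lower_compute() above makes the NIR pass compute the workgroup ID as a zero-based ID plus a uniform base, which is what backs vkCmdDispatchBase(). On the application side this is the core Vulkan 1.1 entry point:

/* Dispatch an 8x8x1 grid whose workgroup IDs start at (16, 0, 0). */
vkCmdDispatchBase(cmd_buffer, 16, 0, 0, 8, 8, 1);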
+ */ + bool needs_executable_info = + info->flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR; + if (!needs_executable_info) { + struct v3dv_pipeline_key pipeline_key; + pipeline_populate_compute_key(pipeline, &pipeline_key, info); + pipeline_hash_compute(pipeline, &pipeline_key, pipeline->sha1); + + bool cache_hit = false; + pipeline->shared_data = + v3dv_pipeline_cache_search_for_pipeline(cache, pipeline->sha1, &cache_hit); + + if (pipeline->shared_data != NULL) { + assert(pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]); + if (cache_hit && cache != &pipeline->device->default_pipeline_cache) + pipeline_feedback.flags |= + VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT; + + goto success; + } } - if (info->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT) - return VK_PIPELINE_COMPILE_REQUIRED_EXT; + if (info->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT) + return VK_PIPELINE_COMPILE_REQUIRED; - pipeline->shared_data = v3dv_pipeline_shared_data_new_empty(pipeline_sha1, + pipeline->shared_data = v3dv_pipeline_shared_data_new_empty(pipeline->sha1, pipeline, false); + if (!pipeline->shared_data) + return VK_ERROR_OUT_OF_HOST_MEMORY; - p_stage->feedback.flags |= VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT; + p_stage->feedback.flags |= VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT; /* If not found on cache, compile it */ p_stage->nir = pipeline_stage_get_nir(p_stage, pipeline, cache); assert(p_stage->nir); - st_nir_opts(p_stage->nir); + v3d_optimize_nir(NULL, p_stage->nir); pipeline_lower_nir(pipeline, p_stage, pipeline->layout); - lower_cs_shared(p_stage->nir); + lower_compute(p_stage->nir); VkResult result = VK_SUCCESS; struct v3d_key key; memset(&key, 0, sizeof(key)); - pipeline_populate_v3d_key(&key, p_stage, 0, - pipeline->device->features.robustBufferAccess); + pipeline_populate_v3d_key(&key, p_stage, 0); pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE] = pipeline_compile_shader_variant(p_stage, &key, sizeof(key), alloc, &result); @@ -3225,6 +3237,8 @@ pipeline_compile_compute(struct v3dv_pipeline *pipeline, success: + pipeline_check_buffer_device_address(pipeline); + pipeline_feedback.duration = os_time_get_nano() - pipeline_start; write_creation_feedback(pipeline, info->pNext, @@ -3233,9 +3247,10 @@ success: &info->stage); /* As we got the variants in pipeline->shared_data, after compiling we - * don't need the pipeline_stages + * don't need the pipeline_stages. 
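The feedback flags maintained in this function are reported back through VK_EXT_pipeline_creation_feedback (core in Vulkan 1.3). A sketch of how an application checks for the cache hit recorded above:

VkPipelineCreationFeedback feedback = { 0 };
VkPipelineCreationFeedbackCreateInfo feedback_info = {
   .sType = VK_STRUCTURE_TYPE_PIPELINE_CREATION_FEEDBACK_CREATE_INFO,
   .pPipelineCreationFeedback = &feedback,
};
/* Chain feedback_info into VkComputePipelineCreateInfo::pNext, create the
 * pipeline, then: */
if (feedback.flags &
    VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT)
   printf("compute pipeline was served from the application cache\n");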
*/ - pipeline_free_stages(device, pipeline, alloc); + if (!needs_executable_info) + pipeline_free_stages(device, pipeline, alloc); pipeline_check_spill_size(pipeline); @@ -3253,8 +3268,11 @@ compute_pipeline_init(struct v3dv_pipeline *pipeline, pipeline->device = device; pipeline->layout = layout; + v3dv_pipeline_layout_ref(pipeline->layout); VkResult result = pipeline_compile_compute(pipeline, cache, info, alloc); + if (result != VK_SUCCESS) + return result; return result; } @@ -3279,13 +3297,13 @@ compute_pipeline_create(VkDevice _device, pipeline = vk_object_zalloc(&device->vk, pAllocator, sizeof(*pipeline), VK_OBJECT_TYPE_PIPELINE); if (pipeline == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); result = compute_pipeline_init(pipeline, device, cache, pCreateInfo, pAllocator); if (result != VK_SUCCESS) { v3dv_destroy_pipeline(pipeline, device, pAllocator); - if (result == VK_PIPELINE_COMPILE_REQUIRED_EXT) + if (result == VK_PIPELINE_COMPILE_REQUIRED) *pPipeline = VK_NULL_HANDLE; return result; } @@ -3306,7 +3324,7 @@ v3dv_CreateComputePipelines(VkDevice _device, V3DV_FROM_HANDLE(v3dv_device, device, _device); VkResult result = VK_SUCCESS; - if (unlikely(V3D_DEBUG & V3D_DEBUG_SHADERS)) + if (V3D_DBG(SHADERS)) mtx_lock(&device->pdevice->mutex); uint32_t i = 0; @@ -3323,7 +3341,7 @@ v3dv_CreateComputePipelines(VkDevice _device, pPipelines[i] = VK_NULL_HANDLE; if (pCreateInfos[i].flags & - VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT) + VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT) break; } } @@ -3331,8 +3349,303 @@ v3dv_CreateComputePipelines(VkDevice _device, for (; i < createInfoCount; i++) pPipelines[i] = VK_NULL_HANDLE; - if (unlikely(V3D_DEBUG & V3D_DEBUG_SHADERS)) + if (V3D_DBG(SHADERS)) mtx_unlock(&device->pdevice->mutex); return result; } + +static nir_shader * +pipeline_get_nir(struct v3dv_pipeline *pipeline, + enum broadcom_shader_stage stage) +{ + assert(stage >= 0 && stage < BROADCOM_SHADER_STAGES); + if (pipeline->stages[stage]) + return pipeline->stages[stage]->nir; + + return NULL; +} + +static struct v3d_prog_data * +pipeline_get_prog_data(struct v3dv_pipeline *pipeline, + enum broadcom_shader_stage stage) +{ + if (pipeline->shared_data->variants[stage]) + return pipeline->shared_data->variants[stage]->prog_data.base; + return NULL; +} + +static uint64_t * +pipeline_get_qpu(struct v3dv_pipeline *pipeline, + enum broadcom_shader_stage stage, + uint32_t *qpu_size) +{ + struct v3dv_shader_variant *variant = + pipeline->shared_data->variants[stage]; + if (!variant) { + *qpu_size = 0; + return NULL; + } + + *qpu_size = variant->qpu_insts_size; + return variant->qpu_insts; +} + +/* FIXME: we use the same macro in various drivers, maybe move it to + * the common vk_util.h? + */ +#define WRITE_STR(field, ...) ({ \ + memset(field, 0, sizeof(field)); \ + UNUSED int _i = snprintf(field, sizeof(field), __VA_ARGS__); \ + assert(_i > 0 && _i < sizeof(field)); \ +}) + +static bool +write_ir_text(VkPipelineExecutableInternalRepresentationKHR* ir, + const char *data) +{ + ir->isText = VK_TRUE; + + size_t data_len = strlen(data) + 1; + + if (ir->pData == NULL) { + ir->dataSize = data_len; + return true; + } + + strncpy(ir->pData, data, ir->dataSize); + if (ir->dataSize < data_len) + return false; + + ir->dataSize = data_len; + return true; +} + +static void +append(char **str, size_t *offset, const char *fmt, ...) 
+{ + va_list args; + va_start(args, fmt); + ralloc_vasprintf_rewrite_tail(str, offset, fmt, args); + va_end(args); +} + +static void +pipeline_collect_executable_data(struct v3dv_pipeline *pipeline) +{ + if (pipeline->executables.mem_ctx) + return; + + pipeline->executables.mem_ctx = ralloc_context(NULL); + util_dynarray_init(&pipeline->executables.data, + pipeline->executables.mem_ctx); + + /* Don't crash for failed/bogus pipelines */ + if (!pipeline->shared_data) + return; + + for (int s = BROADCOM_SHADER_VERTEX; s <= BROADCOM_SHADER_COMPUTE; s++) { + VkShaderStageFlags vk_stage = + mesa_to_vk_shader_stage(broadcom_shader_stage_to_gl(s)); + if (!(vk_stage & pipeline->active_stages)) + continue; + + char *nir_str = NULL; + char *qpu_str = NULL; + + if (pipeline_keep_qpu(pipeline)) { + nir_shader *nir = pipeline_get_nir(pipeline, s); + nir_str = nir ? + nir_shader_as_str(nir, pipeline->executables.mem_ctx) : NULL; + + uint32_t qpu_size; + uint64_t *qpu = pipeline_get_qpu(pipeline, s, &qpu_size); + if (qpu) { + uint32_t qpu_inst_count = qpu_size / sizeof(uint64_t); + qpu_str = rzalloc_size(pipeline->executables.mem_ctx, + qpu_inst_count * 96); + size_t offset = 0; + for (int i = 0; i < qpu_inst_count; i++) { + const char *str = v3d_qpu_disasm(&pipeline->device->devinfo, qpu[i]); + append(&qpu_str, &offset, "%s\n", str); + ralloc_free((void *)str); + } + } + } + + struct v3dv_pipeline_executable_data data = { + .stage = s, + .nir_str = nir_str, + .qpu_str = qpu_str, + }; + util_dynarray_append(&pipeline->executables.data, + struct v3dv_pipeline_executable_data, data); + } +} + +static const struct v3dv_pipeline_executable_data * +pipeline_get_executable(struct v3dv_pipeline *pipeline, uint32_t index) +{ + assert(index < util_dynarray_num_elements(&pipeline->executables.data, + struct v3dv_pipeline_executable_data)); + return util_dynarray_element(&pipeline->executables.data, + struct v3dv_pipeline_executable_data, + index); +} + +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_GetPipelineExecutableInternalRepresentationsKHR( + VkDevice device, + const VkPipelineExecutableInfoKHR *pExecutableInfo, + uint32_t *pInternalRepresentationCount, + VkPipelineExecutableInternalRepresentationKHR *pInternalRepresentations) +{ + V3DV_FROM_HANDLE(v3dv_pipeline, pipeline, pExecutableInfo->pipeline); + + pipeline_collect_executable_data(pipeline); + + VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out, + pInternalRepresentations, pInternalRepresentationCount); + + bool incomplete = false; + const struct v3dv_pipeline_executable_data *exe = + pipeline_get_executable(pipeline, pExecutableInfo->executableIndex); + + if (exe->nir_str) { + vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, + &out, ir) { + WRITE_STR(ir->name, "NIR (%s)", broadcom_shader_stage_name(exe->stage)); + WRITE_STR(ir->description, "Final NIR form"); + if (!write_ir_text(ir, exe->nir_str)) + incomplete = true; + } + } + + if (exe->qpu_str) { + vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, + &out, ir) { + WRITE_STR(ir->name, "QPU (%s)", broadcom_shader_stage_name(exe->stage)); + WRITE_STR(ir->description, "Final QPU assembly"); + if (!write_ir_text(ir, exe->qpu_str)) + incomplete = true; + } + } + + return incomplete ? 
VK_INCOMPLETE : vk_outarray_status(&out); +} + +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_GetPipelineExecutablePropertiesKHR( + VkDevice device, + const VkPipelineInfoKHR *pPipelineInfo, + uint32_t *pExecutableCount, + VkPipelineExecutablePropertiesKHR *pProperties) +{ + V3DV_FROM_HANDLE(v3dv_pipeline, pipeline, pPipelineInfo->pipeline); + + pipeline_collect_executable_data(pipeline); + + VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out, + pProperties, pExecutableCount); + + util_dynarray_foreach(&pipeline->executables.data, + struct v3dv_pipeline_executable_data, exe) { + vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) { + gl_shader_stage mesa_stage = broadcom_shader_stage_to_gl(exe->stage); + props->stages = mesa_to_vk_shader_stage(mesa_stage); + + WRITE_STR(props->name, "%s (%s)", + _mesa_shader_stage_to_abbrev(mesa_stage), + broadcom_shader_stage_is_binning(exe->stage) ? + "Binning" : "Render"); + + WRITE_STR(props->description, "%s", + _mesa_shader_stage_to_string(mesa_stage)); + + props->subgroupSize = V3D_CHANNELS; + } + } + + return vk_outarray_status(&out); +} + +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_GetPipelineExecutableStatisticsKHR( + VkDevice device, + const VkPipelineExecutableInfoKHR *pExecutableInfo, + uint32_t *pStatisticCount, + VkPipelineExecutableStatisticKHR *pStatistics) +{ + V3DV_FROM_HANDLE(v3dv_pipeline, pipeline, pExecutableInfo->pipeline); + + pipeline_collect_executable_data(pipeline); + + const struct v3dv_pipeline_executable_data *exe = + pipeline_get_executable(pipeline, pExecutableInfo->executableIndex); + + struct v3d_prog_data *prog_data = + pipeline_get_prog_data(pipeline, exe->stage); + + struct v3dv_shader_variant *variant = + pipeline->shared_data->variants[exe->stage]; + uint32_t qpu_inst_count = variant->qpu_insts_size / sizeof(uint64_t); + + VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out, + pStatistics, pStatisticCount); + + if (qpu_inst_count > 0) { + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { + WRITE_STR(stat->name, "Compile Strategy"); + WRITE_STR(stat->description, "Chosen compile strategy index"); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = prog_data->compile_strategy_idx; + } + + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { + WRITE_STR(stat->name, "Instruction Count"); + WRITE_STR(stat->description, "Number of QPU instructions"); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = qpu_inst_count; + } + + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { + WRITE_STR(stat->name, "Thread Count"); + WRITE_STR(stat->description, "Number of QPU threads dispatched"); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = prog_data->threads; + } + + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { + WRITE_STR(stat->name, "Spill Size"); + WRITE_STR(stat->description, "Size of the spill buffer in bytes"); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = prog_data->spill_size; + } + + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { + WRITE_STR(stat->name, "TMU Spills"); + WRITE_STR(stat->description, "Number of times a register was spilled " + "to memory"); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = prog_data->spill_size; + } + + 
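These statistics are consumed with the usual two-call Vulkan pattern; a minimal application-side sketch (standard VK_KHR_pipeline_executable_properties API, error handling elided):

VkPipelineExecutableInfoKHR exec_info = {
   .sType = VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_INFO_KHR,
   .pipeline = pipeline,   /* an existing VkPipeline handle */
   .executableIndex = 0,
};
uint32_t count = 0;
vkGetPipelineExecutableStatisticsKHR(device, &exec_info, &count, NULL);
VkPipelineExecutableStatisticKHR *stats = calloc(count, sizeof(*stats));
for (uint32_t i = 0; i < count; i++)
   stats[i].sType = VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_STATISTIC_KHR;
vkGetPipelineExecutableStatisticsKHR(device, &exec_info, &count, stats);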
vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { + WRITE_STR(stat->name, "TMU Fills"); + WRITE_STR(stat->description, "Number of times a register was filled " + "from memory"); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = prog_data->spill_size; + } + + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { + WRITE_STR(stat->name, "QPU Read Stalls"); + WRITE_STR(stat->description, "Number of cycles the QPU stalls for a " + "register read dependency"); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = prog_data->qpu_read_stalls; + } + } + + return vk_outarray_status(&out); +} diff --git a/src/broadcom/vulkan/v3dv_pipeline_cache.c b/src/broadcom/vulkan/v3dv_pipeline_cache.c index 02721ec1d79..d2124ee0b08 100644 --- a/src/broadcom/vulkan/v3dv_pipeline_cache.c +++ b/src/broadcom/vulkan/v3dv_pipeline_cache.c @@ -1,5 +1,5 @@ /* - * Copyright © 2019 Raspberry Pi + * Copyright © 2019 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -22,7 +22,7 @@ */ #include "v3dv_private.h" -#include "vulkan/util/vk_util.h" +#include "vk_util.h" #include "util/blob.h" #include "nir/nir_serialize.h" @@ -61,20 +61,22 @@ cache_dump_stats(struct v3dv_pipeline_cache *cache) fprintf(stderr, " cache entries: %d\n", cache->stats.count); fprintf(stderr, " cache miss count: %d\n", cache->stats.miss); fprintf(stderr, " cache hit count: %d\n", cache->stats.hit); + + fprintf(stderr, " on-disk cache hit count: %d\n", cache->stats.on_disk_hit); } static void pipeline_cache_lock(struct v3dv_pipeline_cache *cache) { if (!cache->externally_synchronized) - pthread_mutex_lock(&cache->mutex); + mtx_lock(&cache->mutex); } static void pipeline_cache_unlock(struct v3dv_pipeline_cache *cache) { if (!cache->externally_synchronized) - pthread_mutex_unlock(&cache->mutex); + mtx_unlock(&cache->mutex); } void @@ -178,7 +180,7 @@ v3dv_pipeline_cache_search_for_nir(struct v3dv_pipeline *pipeline, } else { cache->nir_stats.hit++; if (debug_cache) { - fprintf(stderr, "\tnir cache hit: %p\n", nir); + fprintf(stderr, "[v3dv nir cache] hit: %p\n", nir); if (dump_stats) cache_dump_stats(cache); } @@ -188,7 +190,7 @@ v3dv_pipeline_cache_search_for_nir(struct v3dv_pipeline *pipeline, cache->nir_stats.miss++; if (debug_cache) { - fprintf(stderr, "\tnir cache miss\n"); + fprintf(stderr, "[v3dv nir cache] miss\n"); if (dump_stats) cache_dump_stats(cache); } @@ -203,7 +205,7 @@ v3dv_pipeline_cache_init(struct v3dv_pipeline_cache *cache, bool cache_enabled) { cache->device = device; - pthread_mutex_init(&cache->mutex, NULL); + mtx_init(&cache->mutex, mtx_plain); if (cache_enabled) { cache->nir_cache = _mesa_hash_table_create(NULL, sha1_hash_func, @@ -219,7 +221,7 @@ v3dv_pipeline_cache_init(struct v3dv_pipeline_cache *cache, cache->stats.count = 0; cache->externally_synchronized = flags & - VK_PIPELINE_CACHE_CREATE_EXTERNALLY_SYNCHRONIZED_BIT_EXT; + VK_PIPELINE_CACHE_CREATE_EXTERNALLY_SYNCHRONIZED_BIT; } else { cache->nir_cache = NULL; cache->cache = NULL; @@ -241,7 +243,7 @@ v3dv_pipeline_shared_data_write_to_blob(const struct v3dv_pipeline_shared_data * struct blob *blob); /** - * It searchs for pipeline cached data, and returns a v3dv_pipeline_shared_data with + * It searches for pipeline cached data, and returns a v3dv_pipeline_shared_data with * it, or NULL if doesn't have it cached. 
In the former case, it will increase the ref_count, so the caller is responsible for unreffing it. */ @@ -273,7 +275,7 @@ v3dv_pipeline_cache_search_for_pipeline(struct v3dv_pipeline_cache *cache, cache->stats.hit++; *cache_hit = true; if (debug_cache) { - fprintf(stderr, "\tcache hit: %p\n", cache_entry); + fprintf(stderr, "[v3dv cache] hit: %p\n", cache_entry); if (dump_stats) cache_dump_stats(cache); } @@ -288,7 +290,7 @@ cache->stats.miss++; if (debug_cache) { - fprintf(stderr, "\tcache miss\n"); + fprintf(stderr, "[v3dv cache] miss\n"); if (dump_stats) cache_dump_stats(cache); } @@ -300,7 +302,7 @@ struct disk_cache *disk_cache = device->pdevice->disk_cache; /* Note that the on-disk-cache can be independently disabled, while keeping * the pipeline cache working, by using the environment variable - * MESA_GLSL_CACHE_DISABLE. In that case the calls to disk_cache_put/get + * MESA_SHADER_CACHE_DISABLE. In that case the calls to disk_cache_put/get * will not do anything. */ if (disk_cache && device->instance->pipeline_cache_enabled) { @@ -309,25 +311,32 @@ size_t buffer_size; uint8_t *buffer = disk_cache_get(disk_cache, cache_key, &buffer_size); + if (V3D_DBG(CACHE)) { + char sha1buf[41]; + _mesa_sha1_format(sha1buf, cache_key); + fprintf(stderr, "[v3dv on-disk cache] %s %s\n", + buffer ? "hit" : "miss", + sha1buf); + } + if (buffer) { struct blob_reader blob; struct v3dv_pipeline_shared_data *shared_data; - if (debug_cache) - fprintf(stderr, "\ton-disk-cache hit\n"); - blob_reader_init(&blob, buffer, buffer_size); shared_data = v3dv_pipeline_shared_data_create_from_blob(cache, &blob); free(buffer); if (shared_data) { + /* Technically we could increase on_disk_hit as soon as we have a + * buffer, but we are more interested in hits that got a valid + * shared_data. + */ + cache->stats.on_disk_hit++; if (cache) pipeline_cache_upload_shared_data(cache, shared_data, true); return shared_data; } - } else { - if (debug_cache) - fprintf(stderr, "\ton-disk-cache miss\n"); } } #endif @@ -393,15 +402,13 @@ v3dv_pipeline_shared_data_new(struct v3dv_pipeline_cache *cache, "pipeline shader assembly", true); if (!bo) { fprintf(stderr, "failed to allocate memory for shaders assembly\n"); - v3dv_pipeline_shared_data_unref(cache->device, new_entry); - return NULL; + goto fail; } bool ok = v3dv_bo_map(cache->device, bo, total_assembly_size); if (!ok) { fprintf(stderr, "failed to map source shader buffer\n"); - v3dv_pipeline_shared_data_unref(cache->device, new_entry); - return NULL; + goto fail; } memcpy(bo->map, total_assembly, total_assembly_size); @@ -409,6 +416,10 @@ new_entry->assembly_bo = bo; return new_entry; + +fail: + v3dv_pipeline_shared_data_unref(cache->device, new_entry); + return NULL; } static void @@ -425,8 +436,13 @@ pipeline_cache_upload_shared_data(struct v3dv_pipeline_cache *cache, return; pipeline_cache_lock(cache); - struct hash_entry *entry = - _mesa_hash_table_search(cache->cache, shared_data->sha1_key); + struct hash_entry *entry = NULL; + + /* If this is being called from the disk cache, we already know that the + * entry is not in the hash table. + */ + if (!from_disk_cache) + entry = _mesa_hash_table_search(cache->cache, shared_data->sha1_key); if (entry) { pipeline_cache_unlock(cache); @@ -464,14 +480,12 
@@ pipeline_cache_upload_shared_data(struct v3dv_pipeline_cache *cache, cache_key cache_key; disk_cache_compute_key(disk_cache, shared_data->sha1_key, 20, cache_key); - disk_cache_put(disk_cache, cache_key, binary.data, binary.size, NULL); - if (debug_cache) { + if (V3D_DBG(CACHE)) { char sha1buf[41]; _mesa_sha1_format(sha1buf, shared_data->sha1_key); - - fprintf(stderr, "on-disk-cache, new cache entry with sha1 key %s:%p\n\n", - sha1buf, shared_data); + fprintf(stderr, "[v3dv on-disk cache] storing %s\n", sha1buf); } + disk_cache_put(disk_cache, cache_key, binary.data, binary.size, NULL); } blob_finish(&binary); @@ -528,7 +542,7 @@ shader_variant_create_from_blob(struct v3dv_device *device, if (blob->overrun) return NULL; - uint ulist_data_size = sizeof(uint32_t) * ulist_count; + size_t ulist_data_size = sizeof(uint32_t) * ulist_count; const void *ulist_data_data = blob_read_bytes(blob, ulist_data_size); if (blob->overrun) return NULL; @@ -564,6 +578,7 @@ v3dv_pipeline_shared_data_create_from_blob(struct v3dv_pipeline_cache *cache, const unsigned char *sha1_key = blob_read_bytes(blob, 20); struct v3dv_descriptor_maps *maps[BROADCOM_SHADER_STAGES] = { 0 }; + struct v3dv_shader_variant *variants[BROADCOM_SHADER_STAGES] = { 0 }; uint8_t descriptor_maps_count = blob_read_uint8(blob); for (uint8_t count = 0; count < descriptor_maps_count; count++) { @@ -573,14 +588,14 @@ v3dv_pipeline_shared_data_create_from_blob(struct v3dv_pipeline_cache *cache, blob_read_bytes(blob, sizeof(struct v3dv_descriptor_maps)); if (blob->overrun) - return NULL; + goto fail; maps[stage] = vk_zalloc2(&cache->device->vk.alloc, NULL, sizeof(struct v3dv_descriptor_maps), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (maps[stage] == NULL) - return NULL; + goto fail; memcpy(maps[stage], current_maps, sizeof(struct v3dv_descriptor_maps)); if (broadcom_shader_stage_is_render_with_binning(stage)) { @@ -592,8 +607,6 @@ v3dv_pipeline_shared_data_create_from_blob(struct v3dv_pipeline_cache *cache, uint8_t variant_count = blob_read_uint8(blob); - struct v3dv_shader_variant *variants[BROADCOM_SHADER_STAGES] = { 0 }; - for (uint8_t count = 0; count < variant_count; count++) { uint8_t stage = blob_read_uint8(blob); struct v3dv_shader_variant *variant = @@ -606,10 +619,25 @@ v3dv_pipeline_shared_data_create_from_blob(struct v3dv_pipeline_cache *cache, blob_read_bytes(blob, total_assembly_size); if (blob->overrun) - return NULL; + goto fail; + + struct v3dv_pipeline_shared_data *data = + v3dv_pipeline_shared_data_new(cache, sha1_key, maps, variants, + total_assembly, total_assembly_size); + + if (!data) + goto fail; - return v3dv_pipeline_shared_data_new(cache, sha1_key, maps, variants, - total_assembly, total_assembly_size); + return data; + +fail: + for (int i = 0; i < BROADCOM_SHADER_STAGES; i++) { + if (maps[i]) + vk_free2(&cache->device->vk.alloc, NULL, maps[i]); + if (variants[i]) + v3dv_shader_variant_destroy(cache->device, variants[i]); + } + return NULL; } static void @@ -618,7 +646,7 @@ pipeline_cache_load(struct v3dv_pipeline_cache *cache, const void *data) { struct v3dv_device *device = cache->device; - struct v3dv_physical_device *pdevice = &device->instance->physicalDevice; + struct v3dv_physical_device *pdevice = device->pdevice; struct vk_pipeline_cache_header header; if (cache->cache == NULL || cache->nir_cache == NULL) @@ -695,7 +723,7 @@ v3dv_CreatePipelineCache(VkDevice _device, VK_OBJECT_TYPE_PIPELINE_CACHE); if (cache == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return 
vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); v3dv_pipeline_cache_init(cache, device, pCreateInfo->flags, device->instance->pipeline_cache_enabled); @@ -714,7 +742,7 @@ v3dv_CreatePipelineCache(VkDevice _device, void v3dv_pipeline_cache_finish(struct v3dv_pipeline_cache *cache) { - pthread_mutex_destroy(&cache->mutex); + mtx_destroy(&cache->mutex); if (dump_stats_on_destroy) cache_dump_stats(cache); @@ -934,7 +962,7 @@ v3dv_GetPipelineCacheData(VkDevice _device, blob_init_fixed(&blob, NULL, SIZE_MAX); } - struct v3dv_physical_device *pdevice = &device->instance->physicalDevice; + struct v3dv_physical_device *pdevice = device->pdevice; VkResult result = VK_INCOMPLETE; pipeline_cache_lock(cache); diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h index b5ab7ed2c59..892afcf3ab8 100644 --- a/src/broadcom/vulkan/v3dv_private.h +++ b/src/broadcom/vulkan/v3dv_private.h @@ -1,5 +1,5 @@ /* - * Copyright © 2019 Raspberry Pi + * Copyright © 2019 Raspberry Pi Ltd * * based in part on anv driver which is: * Copyright © 2015 Intel Corporation @@ -36,12 +36,24 @@ #include <vulkan/vk_icd.h> #include <vk_enum_to_str.h> +#include "vk_descriptor_update_template.h" #include "vk_device.h" +#include "vk_device_memory.h" +#include "vk_format.h" #include "vk_instance.h" #include "vk_image.h" +#include "vk_log.h" #include "vk_physical_device.h" #include "vk_shader_module.h" +#include "vk_sync.h" +#include "vk_sync_timeline.h" #include "vk_util.h" +#include "vk_ycbcr_conversion.h" + +#include "vk_command_buffer.h" +#include "vk_command_pool.h" +#include "vk_queue.h" +#include "vk_pipeline.h" #include <xf86drm.h> @@ -53,6 +65,13 @@ #define VG(x) ((void)0) #endif +#include "util/detect_os.h" + +#if DETECT_OS_ANDROID +#include <vndk/hardware_buffer.h> +#include "util/u_gralloc/u_gralloc.h" +#endif + #include "v3dv_limits.h" #include "common/v3d_device_info.h" @@ -68,8 +87,9 @@ #include "vk_debug_report.h" #include "util/set.h" #include "util/hash_table.h" +#include "util/sparse_array.h" #include "util/xmlconfig.h" -#include "u_atomic.h" +#include "util/u_atomic.h" #include "v3dv_entrypoints.h" #include "v3dv_bo.h" @@ -84,7 +104,7 @@ #include "wsi_common.h" /* A non-fatal assert. Useful for debugging. */ -#ifdef DEBUG +#if MESA_DEBUG #define v3dv_assert(x) ({ \ if (unlikely(!(x))) \ fprintf(stderr, "%s:%d ASSERT: %s", __FILE__, __LINE__, #x); \ @@ -94,7 +114,7 @@ #endif #define perf_debug(...) 
do { \ - if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF)) \ + if (V3D_DBG(PERF)) \ fprintf(stderr, __VA_ARGS__); \ } while (0) @@ -111,13 +131,15 @@ struct v3d_simulator_file; /* Minimum required by the Vulkan 1.1 spec */ #define MAX_MEMORY_ALLOCATION_SIZE (1ull << 30) +/* Maximum number of performance counters */ +#define V3D_MAX_PERFCNT 93 + struct v3dv_physical_device { struct vk_physical_device vk; char *name; int32_t render_fd; int32_t display_fd; - int32_t master_fd; /* We need these because it is not clear how to detect * valid devids in a portable way @@ -128,11 +150,19 @@ dev_t primary_devid; dev_t render_devid; +#if using_v3d_simulator + uint32_t device_id; +#endif + uint8_t driver_build_sha1[20]; uint8_t pipeline_cache_uuid[VK_UUID_SIZE]; uint8_t device_uuid[VK_UUID_SIZE]; uint8_t driver_uuid[VK_UUID_SIZE]; + struct vk_sync_type drm_syncobj_type; + struct vk_sync_timeline_type sync_timeline_type; + const struct vk_sync_type *sync_types[3]; + struct disk_cache *disk_cache; mtx_t mutex; @@ -148,14 +178,41 @@ const struct v3d_compiler *compiler; uint32_t next_program_id; + alignas(8) uint64_t heap_used; + + /* This array holds all our 'struct v3dv_bo' allocations. We use this + * so we can add a refcount to our BOs and check if a particular BO + * was already allocated in this device using its GEM handle. This is + * necessary to properly manage BO imports, because the kernel doesn't + * refcount the underlying BO memory. + * + * Specifically, when self-importing (i.e. importing a BO into the same + * device that created it), the kernel will give us the same BO handle + * for both BOs and we must only free it once when both references are + * freed. Otherwise, if we are not self-importing, we get two different BO + * handles, and we want to free each one individually. + * + * The BOs in this map are all reference counted via refcnt, and + * only self-imported BOs will ever have a refcnt > 1. + */ + struct util_sparse_array bo_map; + struct { bool merge_jobs; } options; + + struct { + bool cpu_queue; + bool multisync; + bool perfmon; + } caps; }; -VkResult v3dv_physical_device_acquire_display(struct v3dv_instance *instance, - struct v3dv_physical_device *pdevice, - VkIcdSurfaceBase *surface); +static inline struct v3dv_bo * +v3dv_device_lookup_bo(struct v3dv_physical_device *device, uint32_t handle) +{ + return (struct v3dv_bo *) util_sparse_array_get(&device->bo_map, handle); +} VkResult v3dv_wsi_init(struct v3dv_physical_device *physical_device); void v3dv_wsi_finish(struct v3dv_physical_device *physical_device); @@ -172,64 +229,72 @@ void v3dv_meta_texel_buffer_copy_init(struct v3dv_device *device); void v3dv_meta_texel_buffer_copy_finish(struct v3dv_device *device); bool v3dv_meta_can_use_tlb(struct v3dv_image *image, + uint8_t plane, + uint8_t miplevel, const VkOffset3D *offset, + const VkExtent3D *extent, VkFormat *compat_format); struct v3dv_instance { struct vk_instance vk; - int physicalDeviceCount; - struct v3dv_physical_device physicalDevice; - bool pipeline_cache_enabled; bool default_pipeline_cache_enabled; }; -/* Tracks wait threads spawned from a single vkQueueSubmit call */ -struct v3dv_queue_submit_wait_info { - /* struct vk_object_base base; ?*/ - struct list_head list_link; - - struct v3dv_device *device; - - /* List of wait threads spawned for any command buffers in a particular - * call to vkQueueSubmit. 
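The bo_map comment above implies a simple import rule: look the GEM handle up first, and only initialize a fresh entry when it has never been seen. A sketch of that flow (refcnt is the counter the comment mentions; the surrounding logic is illustrative, not the driver's actual import path):

struct v3dv_bo *bo = v3dv_device_lookup_bo(pdevice, gem_handle);
if (bo->refcnt == 0) {
   /* First time we see this handle: initialize the BO entry. */
} else {
   /* Self-import: the kernel returned an existing handle, so just take
    * another reference instead of creating a second BO. */
   p_atomic_inc(&bo->refcnt); /* assumed atomic helper from util/u_atomic.h */
}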
- */ - uint32_t wait_thread_count; - struct { - pthread_t thread; - bool finished; - } wait_threads[16]; - - /* The master wait thread for the entire submit. This will wait for all - * other threads in this submit to complete before processing signal - * semaphores and fences. +/* FIXME: In addition to tracking the last job submitted by GPU queue (cl, csd, + * tfu), we still need a syncobj to track the last overall job submitted + * (V3DV_QUEUE_ANY) for the case we don't support multisync. Someday we can + * start expecting multisync to be present and drop the legacy implementation + * together with this V3DV_QUEUE_ANY tracker. + */ +enum v3dv_queue_type { + V3DV_QUEUE_CL = 0, + V3DV_QUEUE_CSD, + V3DV_QUEUE_TFU, + V3DV_QUEUE_CPU, + V3DV_QUEUE_ANY, + V3DV_QUEUE_COUNT, +}; + +/* For each GPU queue, we use a syncobj to track the last job submitted. We + * set the flag `first` to determine when we are starting a new cmd buffer + * batch and therefore a job submitted to a given queue will be the first in a + * cmd buf batch. + */ +struct v3dv_last_job_sync { + /* If the job is the first submitted to a GPU queue in a cmd buffer batch. + * + * We use V3DV_QUEUE_{CL,CSD,TFU} both with and without multisync. */ - pthread_t master_wait_thread; - - /* List of semaphores (and fence) to signal after all wait threads completed - * and all command buffer jobs in the submission have been sent to the GPU. + bool first[V3DV_QUEUE_COUNT]; + /* Array of syncobj to track the last job submitted to a GPU queue. + * + * With multisync we use V3DV_QUEUE_{CL,CSD,TFU} to track syncobjs for each + * queue, but without multisync we only track the last job submitted to any + * queue in V3DV_QUEUE_ANY. */ - uint32_t signal_semaphore_count; - VkSemaphore *signal_semaphores; - VkFence fence; + uint32_t syncs[V3DV_QUEUE_COUNT]; }; struct v3dv_queue { - struct vk_object_base base; + struct vk_queue vk; struct v3dv_device *device; - VkDeviceQueueCreateFlags flags; - /* A list of active v3dv_queue_submit_wait_info */ - struct list_head submit_wait_list; - - /* A mutex to prevent concurrent access to the list of wait threads */ - mtx_t mutex; + struct v3dv_last_job_sync last_job_syncs; struct v3dv_job *noop_job; + + /* The last active perfmon ID to prevent mixing of counter results when a + * job is submitted with a different perfmon id. + */ + uint32_t last_perfmon_id; }; +VkResult v3dv_queue_driver_submit(struct vk_queue *vk_queue, + struct vk_queue_submit *submit); + #define V3DV_META_BLIT_CACHE_KEY_SIZE (4 * sizeof(uint32_t)) #define V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE (3 * sizeof(uint32_t) + \ sizeof(VkComponentMapping)) @@ -261,27 +326,27 @@ struct v3dv_meta_texel_buffer_copy_pipeline { }; struct v3dv_pipeline_key { - bool robust_buffer_access; uint8_t topology; uint8_t logicop_func; bool msaa; - bool sample_coverage; bool sample_alpha_to_coverage; bool sample_alpha_to_one; uint8_t cbufs; struct { enum pipe_format format; - const uint8_t *swizzle; + uint8_t swizzle[4]; } color_fmt[V3D_MAX_DRAW_BUFFERS]; uint8_t f32_color_rb; uint32_t va_swap_rb_mask; bool has_multiview; + bool line_smooth; }; struct v3dv_pipeline_cache_stats { uint32_t miss; uint32_t hit; uint32_t count; + uint32_t on_disk_hit; }; /* Equivalent to gl_shader_stage, but including the coordinate shaders @@ -411,11 +476,11 @@ struct v3dv_device { struct v3d_device_info devinfo; struct v3dv_queue queue; - /* A sync object to track the last job submitted to the GPU. 
*/ - uint32_t last_job_sync; + /* Guards query->maybe_available and value for timestamps */ + mtx_t query_mutex; - /* A mutex to prevent concurrent access to last_job_sync from the queue */ - mtx_t mutex; + /* Signaled whenever a query is ended */ + cnd_t query_ended; /* Resources used for meta operations */ struct { @@ -457,37 +522,107 @@ uint32_t bo_size; uint32_t bo_count; + /* Event handling resources. + * + * Our implementation of events uses a BO to store event state (signaled vs + * reset) and dispatches compute shaders to handle GPU event functions + * (signal, reset, wait). This struct holds all the resources required + * by the implementation. + */ + struct { + mtx_t lock; + + /* BO for the event states: signaled (1) or reset (0) */ + struct v3dv_bo *bo; + + /* We pre-allocate all the events we can fit for the size of the BO we + * create to track their states, where each event has an index which is + * basically the offset of its state in that BO. We keep a free list with + * the pre-allocated events that are available. + */ + uint32_t event_count; + struct v3dv_event *events; + struct list_head free_list; + + /* Vulkan resources to access the event BO from shaders. We have a + * pipeline that sets the state of an event and another that waits on + * a single event. Both pipelines require access to the event state BO, + * for which we need to allocate a single descriptor set. + */ + VkBuffer buffer; + VkDeviceMemory mem; + VkDescriptorSetLayout descriptor_set_layout; + VkPipelineLayout pipeline_layout; + VkDescriptorPool descriptor_pool; + VkDescriptorSet descriptor_set; + VkPipeline set_event_pipeline; + VkPipeline wait_event_pipeline; + } events; + + /* Query handling resources. + * + * Our implementation of occlusion queries uses a BO per pool to keep track + * of the per-query availability state and dispatches compute shaders to + * handle GPU query functions that read and write that state. This struct + * holds Vulkan resources that can be shared across all query pools to + * implement this. This framework may be extended in the future to handle + * more query types. + */ + struct { + VkDescriptorSetLayout buf_descriptor_set_layout; + + /* Set query availability */ + VkPipelineLayout avail_pipeline_layout; + VkPipeline avail_pipeline; + + /* Reset query availability and clear occlusion counters */ + VkPipelineLayout reset_occlusion_pipeline_layout; + VkPipeline reset_occlusion_pipeline; + + /* Copy query results */ + VkPipelineLayout copy_pipeline_layout; + VkPipeline copy_pipeline[8]; + } queries; + struct v3dv_pipeline_cache default_pipeline_cache; - /* GL_SHADER_STATE_RECORD needs to speficy default attribute values. The + /* GL_SHADER_STATE_RECORD needs to specify default attribute values. The * following covers the most common case, that is all attributes format * being float, allowing us to reuse the same BO for all * pipelines matching this requirement. Pipelines that need integer * attributes will create their own BO. + * + * Note that since v71 the default attribute values are not needed, so this + * can be NULL. 
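The default-attribute BO described above is laid out as four 32-bit channels per attribute, (0, 0, 0, 1), with the w component written as integer 1 or float 1.0 depending on the attribute format, mirroring the per-pipeline path removed earlier in this patch. A sketch of the fill loop under those assumptions (attr_is_int() is an illustrative helper):

uint32_t *attrs = bo->map; /* mapped default-attribute BO */
for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) {
   attrs[i * 4 + 0] = 0;
   attrs[i * 4 + 1] = 0;
   attrs[i * 4 + 2] = 0;
   attrs[i * 4 + 3] = attr_is_int(i) ? 1 : fui(1.0);
}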
*/ struct v3dv_bo *default_attribute_float; - VkPhysicalDeviceFeatures features; + + void *device_address_mem_ctx; + struct util_dynarray device_address_bo_list; /* Array of struct v3dv_bo * */ + +#if DETECT_OS_ANDROID + struct u_gralloc *gralloc; +#endif }; struct v3dv_device_memory { - struct vk_object_base base; + struct vk_device_memory vk; struct v3dv_bo *bo; const VkMemoryType *type; - bool has_bo_ownership; bool is_for_wsi; + bool is_for_device_address; }; #define V3D_OUTPUT_IMAGE_FORMAT_NO 255 #define TEXTURE_DATA_FORMAT_NO 255 -struct v3dv_format { - bool supported; - - /* One of V3D33_OUTPUT_IMAGE_FORMAT_*, or OUTPUT_IMAGE_FORMAT_NO */ +#define V3DV_MAX_PLANE_COUNT 3 +struct v3dv_format_plane { + /* One of V3D42_OUTPUT_IMAGE_FORMAT_*, or OUTPUT_IMAGE_FORMAT_NO */ uint8_t rt_type; - /* One of V3D33_TEXTURE_DATA_FORMAT_*. */ + /* One of V3D42_TEXTURE_DATA_FORMAT_*. */ uint8_t tex_type; /* Swizzle to apply to the RGBA shader output for storing to the tile @@ -499,15 +634,54 @@ uint8_t return_size; +}; + +struct v3dv_format { + /* Non 0 plane count implies supported */ + uint8_t plane_count; + + struct v3dv_format_plane planes[V3DV_MAX_PLANE_COUNT]; /* If the format supports (linear) filtering when texturing. */ bool supports_filtering; }; +/* Note that although VkImageAspectFlags would allow combining more than one + * PLANE bit, for all the use cases we implement that use VkImageAspectFlags, + * only one plane is allowed, like for example vkCmdCopyImage: + * + * "If srcImage has a VkFormat with two planes then for each element of + * pRegions, srcSubresource.aspectMask must be VK_IMAGE_ASPECT_PLANE_0_BIT + * or VK_IMAGE_ASPECT_PLANE_1_BIT" + * + */ +static uint8_t v3dv_plane_from_aspect(VkImageAspectFlags aspect) +{ + switch (aspect) { + case VK_IMAGE_ASPECT_COLOR_BIT: + case VK_IMAGE_ASPECT_DEPTH_BIT: + case VK_IMAGE_ASPECT_STENCIL_BIT: + case VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT: + case VK_IMAGE_ASPECT_PLANE_0_BIT: + case VK_IMAGE_ASPECT_MEMORY_PLANE_0_BIT_EXT: + return 0; + case VK_IMAGE_ASPECT_PLANE_1_BIT: + case VK_IMAGE_ASPECT_MEMORY_PLANE_1_BIT_EXT: + return 1; + case VK_IMAGE_ASPECT_PLANE_2_BIT: + case VK_IMAGE_ASPECT_MEMORY_PLANE_2_BIT_EXT: + return 2; + default: + unreachable("invalid image aspect"); + } +} + struct v3d_resource_slice { uint32_t offset; uint32_t stride; uint32_t padded_height; + uint32_t width; + uint32_t height; /* Size of a single pane of the slice. For 3D textures, there will be * a number of panes equal to the minified, power-of-two-aligned * depth. @@ -518,24 +692,85 @@ uint32_t padded_height_of_output_image_in_uif_blocks; }; +bool v3dv_format_swizzle_needs_rb_swap(const uint8_t *swizzle); +bool v3dv_format_swizzle_needs_reverse(const uint8_t *swizzle); + struct v3dv_image { struct vk_image vk; const struct v3dv_format *format; - uint32_t cpp; bool tiled; - struct v3d_resource_slice slices[V3D_MAX_MIP_LEVELS]; - uint64_t size; /* Total size in bytes */ - uint32_t cube_map_stride; + uint8_t plane_count; - struct v3dv_device_memory *mem; - VkDeviceSize mem_offset; - uint32_t alignment; + /* If 0, this is a multi-plane image that uses disjoint memory, where each + * plane binds a different device memory. Otherwise, all the planes share + * the same device memory and this stores the total size of the image in + * bytes. 
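For the disjoint case described above, each plane is bound to its own allocation on the application side with the standard Vulkan 1.1 structures (the handles below are assumed to exist):

VkBindImagePlaneMemoryInfo plane_info = {
   .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO,
   .planeAspect = VK_IMAGE_ASPECT_PLANE_0_BIT,
};
VkBindImageMemoryInfo bind_info = {
   .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO,
   .pNext = &plane_info,
   .image = image,           /* created with VK_IMAGE_CREATE_DISJOINT_BIT */
   .memory = plane0_memory,  /* a dedicated allocation for plane 0 */
   .memoryOffset = 0,
};
vkBindImageMemory2(device, 1, &bind_info);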
+ */ + uint32_t non_disjoint_size; + + struct { + uint32_t cpp; + + struct v3d_resource_slice slices[V3D_MAX_MIP_LEVELS]; + /* Total size of the plane in bytes. */ + uint64_t size; + uint32_t cube_map_stride; + + /* If not using disjoint memory, mem and mem_offset are the same for all + * planes, in which case mem_offset is the offset of plane 0. + */ + struct v3dv_device_memory *mem; + VkDeviceSize mem_offset; + uint32_t alignment; + + /* Pre-subsampled per plane width and height + */ + uint32_t width; + uint32_t height; + + /* Even if we can get it from the parent image format, we keep the + * format here for convenience + */ + VkFormat vk_format; + } planes[V3DV_MAX_PLANE_COUNT]; + + /* Used only when sampling a linear texture (which V3D doesn't support). + * This holds a tiled copy of the image we can use for that purpose. + */ + struct v3dv_image *shadow; + +#if DETECT_OS_ANDROID + /* Image is backed by VK_ANDROID_native_buffer. */ + bool is_native_buffer_memory; + /* Image is backed by VK_ANDROID_external_memory_android_hardware_buffer */ + bool is_ahb; + VkImageDrmFormatModifierExplicitCreateInfoEXT *android_explicit_layout; + VkSubresourceLayout *android_plane_layouts; +#endif }; +VkResult +v3dv_image_init(struct v3dv_device *device, + const VkImageCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + struct v3dv_image *image); + VkImageViewType v3dv_image_type_to_view_type(VkImageType type); +static uint32_t +v3dv_image_aspect_to_plane(const struct v3dv_image *image, + VkImageAspectFlagBits aspect) +{ + assert(util_bitcount(aspect) == 1 && (aspect & image->vk.aspects)); + + /* Because we always put image and view planes in aspect-bit-order, the + * plane index is the number of bits in the image aspect before aspect. + */ + return util_bitcount(image->vk.aspects & (aspect - 1)); +} + /* Pre-generating packets needs to consider changes in packet sizes across hw * versions. Keep things simple and allocate enough space for any supported * version. We ensure the size is large enough through static asserts. @@ -553,31 +788,50 @@ struct v3dv_image_view { struct vk_image_view vk; const struct v3dv_format *format; - bool swap_rb; - uint32_t internal_bpp; - uint32_t internal_type; - uint32_t offset; - /* Precomputed (composed from createinfo->components and formar swizzle) - * swizzles to pass in to the shader key. - * - * This could be also included on the descriptor bo, but the shader state - * packet doesn't need it on a bo, so we can just avoid a memory copy - */ - uint8_t swizzle[4]; + uint8_t view_swizzle[4]; - /* Prepacked TEXTURE_SHADER_STATE. It will be copied to the descriptor info - * during UpdateDescriptorSets. - * - * Empirical tests show that cube arrays need a different shader state - * depending on whether they are used with a sampler or not, so for these - * we generate two states and select the one to use based on the descriptor - * type. + uint8_t plane_count; + struct { + uint8_t image_plane; + + bool swap_rb; + bool channel_reverse; + uint32_t internal_bpp; + uint32_t internal_type; + uint32_t offset; + + /* Precomputed swizzle (composed from the view swizzle and the format + * swizzle). + * + * This could also be included on the descriptor bo, but the shader state + * packet doesn't need it on a bo, so we can just avoid a memory copy + */ + uint8_t swizzle[4]; + + /* Prepacked TEXTURE_SHADER_STATE. It will be copied to the descriptor info + * during UpdateDescriptorSets.
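+ *
+ * A hedged sketch of the copy that implies at descriptor update time,
+ * one state per plane (the names and the [0] index are illustrative
+ * assumptions, not the exact driver code):
+ *
+ *    for (uint8_t p = 0; p < iview->plane_count; p++)
+ *       memcpy(desc + p * V3DV_TEXTURE_SHADER_STATE_LENGTH,
+ *              iview->planes[p].texture_shader_state[0],
+ *              V3DV_TEXTURE_SHADER_STATE_LENGTH);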
+ * + * Empirical tests show that cube arrays need a different shader state + * depending on whether they are used with a sampler or not, so for these + * we generate two states and select the one to use based on the descriptor + * type. + */ + uint8_t texture_shader_state[2][V3DV_TEXTURE_SHADER_STATE_LENGTH]; + } planes[V3DV_MAX_PLANE_COUNT]; + + /* Used only when sampling a linear texture (which V3D doesn't support). + * This would represent a view over the tiled shadow image. */ - uint8_t texture_shader_state[2][V3DV_TEXTURE_SHADER_STATE_LENGTH]; + struct v3dv_image_view *shadow; }; -uint32_t v3dv_layer_offset(const struct v3dv_image *image, uint32_t level, uint32_t layer); +VkResult v3dv_create_image_view(struct v3dv_device *device, + const VkImageViewCreateInfo *pCreateInfo, + VkImageView *pView); + +uint32_t v3dv_layer_offset(const struct v3dv_image *image, uint32_t level, uint32_t layer, + uint8_t plane); struct v3dv_buffer { struct vk_object_base base; @@ -590,6 +844,15 @@ struct v3dv_buffer { VkDeviceSize mem_offset; }; +void +v3dv_buffer_init(struct v3dv_device *device, + const VkBufferCreateInfo *pCreateInfo, + struct v3dv_buffer *buffer, + uint32_t alignment); + +void +v3dv_buffer_bind_memory(const VkBindBufferMemoryInfo *info); + struct v3dv_buffer_view { struct vk_object_base base; @@ -622,6 +885,8 @@ struct v3dv_subpass { struct v3dv_subpass_attachment *resolve_attachments; struct v3dv_subpass_attachment ds_attachment; + struct v3dv_subpass_attachment ds_resolve_attachment; + bool resolve_depth, resolve_stencil; /* If we need to emit the clear of the depth/stencil attachment using a * a draw call instead of using the TLB (GFXH-1461). @@ -634,7 +899,7 @@ struct v3dv_subpass { }; struct v3dv_render_pass_attachment { - VkAttachmentDescription desc; + VkAttachmentDescription2 desc; uint32_t first_subpass; uint32_t last_subpass; @@ -650,10 +915,11 @@ struct v3dv_render_pass_attachment { uint32_t last_subpass; } views[MAX_MULTIVIEW_VIEW_COUNT]; - /* If this is a multismapled attachment that is going to be resolved, - * whether we can use the TLB resolve on store. + /* If this is a multisampled attachment that is going to be resolved, + * whether we may be able to use the TLB hardware resolve based on the + * attachment format. */ - bool use_tlb_resolve; + bool try_tlb_resolve; }; struct v3dv_render_pass { @@ -678,7 +944,7 @@ struct v3dv_framebuffer { uint32_t layers; /* Typically, edge tiles in the framebuffer have padding depending on the - * underlying tiling layout. One consequnce of this is that when the + * underlying tiling layout. One consequence of this is that when the * framebuffer dimensions are not aligned to tile boundaries, tile stores * would still write full tiles on the edges and write to the padded area. * If the framebuffer is aliasing a smaller region of a larger image, then @@ -690,6 +956,11 @@ struct v3dv_framebuffer { uint32_t attachment_count; uint32_t color_attachment_count; + + /* Notice that elements in 'attachments' will be NULL if the framebuffer + * was created imageless. The driver is expected to access attachment info + * from the command buffer state instead. 
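+ *
+ * i.e. consumers of this array would be expected to do something like
+ * this (an illustrative sketch, not the exact driver code):
+ *
+ *    struct v3dv_image_view *iview = fb->attachments[i] ?
+ *       fb->attachments[i] :
+ *       cmd_buffer->state.attachments[i].image_view;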
+ */ struct v3dv_image_view *attachments[0]; }; @@ -699,7 +970,9 @@ struct v3dv_frame_tiling { uint32_t layers; uint32_t render_target_count; uint32_t internal_bpp; + uint32_t total_color_bpp; bool msaa; + bool double_buffer; uint32_t tile_width; uint32_t tile_height; uint32_t draw_tiles_x; @@ -710,22 +983,26 @@ struct v3dv_frame_tiling { uint32_t frame_height_in_supertiles; }; -void v3dv_framebuffer_compute_internal_bpp_msaa(const struct v3dv_framebuffer *framebuffer, - const struct v3dv_subpass *subpass, - uint8_t *max_bpp, bool *msaa); - bool v3dv_subpass_area_is_tile_aligned(struct v3dv_device *device, const VkRect2D *area, struct v3dv_framebuffer *fb, struct v3dv_render_pass *pass, uint32_t subpass_idx); -struct v3dv_cmd_pool { - struct vk_object_base base; - - VkAllocationCallbacks alloc; - struct list_head cmd_buffers; -}; +/* Checks if we need to emit 2 initial tile clears for double buffer mode. + * This happens when we render at least 2 tiles, because in this mode each + * tile uses a different half of the tile buffer memory so we can have 2 tiles + * in flight (one being stored to memory and the next being rendered). In this + * scenario, if we emit a single initial tile clear we would only clear the + * first half of the tile buffer. + */ +static inline bool +v3dv_do_double_initial_tile_clear(const struct v3dv_frame_tiling *tiling) +{ + return tiling->double_buffer && + (tiling->draw_tiles_x > 1 || tiling->draw_tiles_y > 1 || + tiling->layers > 1); +} enum v3dv_cmd_buffer_status { V3DV_CMD_BUFFER_STATUS_NEW = 0, @@ -748,100 +1025,67 @@ struct v3dv_cmd_buffer_attachment_state { /* The hardware clear value */ union v3dv_clear_value clear_value; + + /* The underlying image view (from the framebuffer or, if imageless + * framebuffer is used, from VkRenderPassAttachmentBeginInfo). + */ + struct v3dv_image_view *image_view; + + /* If this is a multisampled attachment with a resolve operation. */ + bool has_resolve; + + /* If this is a multisampled attachment with a resolve operation, + * whether we can use the TLB for the resolve. + */ + bool use_tlb_resolve; }; +/* Cached values derived from Vulkan viewport/count */ struct v3dv_viewport_state { - uint32_t count; - VkViewport viewports[MAX_VIEWPORTS]; float translate[MAX_VIEWPORTS][3]; float scale[MAX_VIEWPORTS][3]; }; -struct v3dv_scissor_state { - uint32_t count; - VkRect2D scissors[MAX_SCISSORS]; -}; - -/* Mostly a v3dv mapping of VkDynamicState, used to track which data as - * defined as dynamic - */ -enum v3dv_dynamic_state_bits { - V3DV_DYNAMIC_VIEWPORT = 1 << 0, - V3DV_DYNAMIC_SCISSOR = 1 << 1, - V3DV_DYNAMIC_STENCIL_COMPARE_MASK = 1 << 2, - V3DV_DYNAMIC_STENCIL_WRITE_MASK = 1 << 3, - V3DV_DYNAMIC_STENCIL_REFERENCE = 1 << 4, - V3DV_DYNAMIC_BLEND_CONSTANTS = 1 << 5, - V3DV_DYNAMIC_DEPTH_BIAS = 1 << 6, - V3DV_DYNAMIC_LINE_WIDTH = 1 << 7, - V3DV_DYNAMIC_COLOR_WRITE_ENABLE = 1 << 8, - V3DV_DYNAMIC_ALL = (1 << 9) - 1, -}; - -/* Flags for dirty pipeline state. +/* Flags for custom dirty state that could lead to packet emission. + * + * Note *custom*: for all the dynamic state tracking coming from the Vulkan + * API, we use the Mesa runtime framework and its predefined flags + * (MESA_VK_DYNAMIC_XXX). + * + * Here we define additional flags used to track dirty state.
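+ *
+ * The expected usage pattern (an illustrative sketch) is the usual
+ * set / test / clear sequence around packet emission:
+ *
+ *    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PIPELINE;
+ *    ...
+ *    if (cmd_buffer->state.dirty & V3DV_CMD_DIRTY_PIPELINE)
+ *       emit_graphics_pipeline(cmd_buffer);  // hypothetical emitter
+ *    cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_PIPELINE;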
*/ enum v3dv_cmd_dirty_bits { - V3DV_CMD_DIRTY_VIEWPORT = 1 << 0, - V3DV_CMD_DIRTY_SCISSOR = 1 << 1, - V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK = 1 << 2, - V3DV_CMD_DIRTY_STENCIL_WRITE_MASK = 1 << 3, - V3DV_CMD_DIRTY_STENCIL_REFERENCE = 1 << 4, - V3DV_CMD_DIRTY_PIPELINE = 1 << 5, - V3DV_CMD_DIRTY_COMPUTE_PIPELINE = 1 << 6, - V3DV_CMD_DIRTY_VERTEX_BUFFER = 1 << 7, - V3DV_CMD_DIRTY_INDEX_BUFFER = 1 << 8, - V3DV_CMD_DIRTY_DESCRIPTOR_SETS = 1 << 9, - V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS = 1 << 10, - V3DV_CMD_DIRTY_PUSH_CONSTANTS = 1 << 11, - V3DV_CMD_DIRTY_BLEND_CONSTANTS = 1 << 12, - V3DV_CMD_DIRTY_OCCLUSION_QUERY = 1 << 13, - V3DV_CMD_DIRTY_DEPTH_BIAS = 1 << 14, - V3DV_CMD_DIRTY_LINE_WIDTH = 1 << 15, - V3DV_CMD_DIRTY_VIEW_INDEX = 1 << 16, - V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE = 1 << 17, + V3DV_CMD_DIRTY_PIPELINE = 1 << 0, + V3DV_CMD_DIRTY_COMPUTE_PIPELINE = 1 << 1, + V3DV_CMD_DIRTY_VERTEX_BUFFER = 1 << 2, + V3DV_CMD_DIRTY_INDEX_BUFFER = 1 << 3, + V3DV_CMD_DIRTY_DESCRIPTOR_SETS = 1 << 4, + V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS = 1 << 5, + V3DV_CMD_DIRTY_PUSH_CONSTANTS = 1 << 6, + V3DV_CMD_DIRTY_PUSH_CONSTANTS_UBO = 1 << 7, + V3DV_CMD_DIRTY_OCCLUSION_QUERY = 1 << 8, + V3DV_CMD_DIRTY_VIEW_INDEX = 1 << 9, + V3DV_CMD_DIRTY_DRAW_ID = 1 << 10, + V3DV_CMD_DIRTY_ALL = (1 << 11) - 1, }; struct v3dv_dynamic_state { - /** - * Bitmask of (1 << VK_DYNAMIC_STATE_*). - * Defines the set of saved dynamic state. + /* FIXME: we keep some viewport info cached (translate, scale) because we + * use it in more than one place. But note that translate_z and scale_z + * are also used in several places, and we recompute them based on + * scissor/viewport info all the time. So perhaps we could do the same with + * the x and y components. */ - uint32_t mask; - struct v3dv_viewport_state viewport; - struct v3dv_scissor_state scissor; - - struct { - uint32_t front; - uint32_t back; - } stencil_compare_mask; - - struct { - uint32_t front; - uint32_t back; - } stencil_write_mask; - - struct { - uint32_t front; - uint32_t back; - } stencil_reference; - - float blend_constants[4]; - - struct { - float constant_factor; - float depth_bias_clamp; - float slope_factor; - } depth_bias; - - float line_width; - + /* We cache the color_write_enable because the Vulkan runtime keeps an + * 8-bit bitset with a bit per attachment, but in order to combine it with + * the color_write_masks it is easier to cache a 32-bit bitset with 4 bits + * per attachment.
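+ *
+ * A sketch of the 1-bit to 4-bit per attachment expansion this refers
+ * to (illustrative, not the exact driver code):
+ *
+ *    uint32_t expanded = 0;
+ *    for (int i = 0; i < 8; i++) {
+ *       if (vk_bitset & (1 << i))
+ *          expanded |= 0xfu << (4 * i);
+ *    }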
+ */ uint32_t color_write_enable; }; -extern const struct v3dv_dynamic_state default_dynamic_state; - void v3dv_viewport_compute_xform(const VkViewport *viewport, float scale[3], float translate[3]); @@ -855,15 +1099,12 @@ enum v3dv_ez_state { enum v3dv_job_type { V3DV_JOB_TYPE_GPU_CL = 0, - V3DV_JOB_TYPE_GPU_CL_SECONDARY, + V3DV_JOB_TYPE_GPU_CL_INCOMPLETE, V3DV_JOB_TYPE_GPU_TFU, V3DV_JOB_TYPE_GPU_CSD, V3DV_JOB_TYPE_CPU_RESET_QUERIES, V3DV_JOB_TYPE_CPU_END_QUERY, V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS, - V3DV_JOB_TYPE_CPU_SET_EVENT, - V3DV_JOB_TYPE_CPU_WAIT_EVENTS, - V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE, V3DV_JOB_TYPE_CPU_CSD_INDIRECT, V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY, }; @@ -874,7 +1115,7 @@ struct v3dv_reset_query_cpu_job_info { uint32_t count; }; -struct v3dv_end_query_cpu_job_info { +struct v3dv_end_query_info { struct v3dv_query_pool *pool; uint32_t query; @@ -892,31 +1133,14 @@ struct v3dv_copy_query_results_cpu_job_info { VkQueryResultFlags flags; }; -struct v3dv_event_set_cpu_job_info { - struct v3dv_event *event; - int state; -}; - -struct v3dv_event_wait_cpu_job_info { - /* List of events to wait on */ - uint32_t event_count; - struct v3dv_event **events; - - /* Whether any postponed jobs after the wait should wait on semaphores */ - bool sem_wait; -}; +struct v3dv_submit_sync_info { + /* List of syncs to wait before running a job */ + uint32_t wait_count; + struct vk_sync_wait *waits; -struct v3dv_copy_buffer_to_image_cpu_job_info { - struct v3dv_image *image; - struct v3dv_buffer *buffer; - uint32_t buffer_offset; - uint32_t buffer_stride; - uint32_t buffer_layer_stride; - VkOffset3D image_offset; - VkExtent3D image_extent; - uint32_t mip_level; - uint32_t base_layer; - uint32_t layer_count; + /* List of syncs to signal when all jobs complete */ + uint32_t signal_count; + struct vk_sync_signal *signals; }; struct v3dv_csd_indirect_cpu_job_info { @@ -936,6 +1160,19 @@ struct v3dv_timestamp_query_cpu_job_info { uint32_t count; }; +/* Number of perfmons required to handle all supported performance counters */ +#define V3DV_MAX_PERFMONS DIV_ROUND_UP(V3D_MAX_PERFCNT, \ + DRM_V3D_MAX_PERF_COUNTERS) + +struct v3dv_perf_query { + uint32_t kperfmon_ids[V3DV_MAX_PERFMONS]; + + /* A DRM syncobj to wait on the GPU jobs for which we are collecting + * performance data. + */ + struct vk_sync *last_job_sync; +}; + struct v3dv_job { struct list_head list_link; @@ -945,6 +1182,61 @@ struct v3dv_job { */ bool is_clone; + /* If this is a cloned job, whether it has its own BCL resource. This + * happens when we suspend jobs in command buffers with the + * VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT flag. + */ + bool clone_owns_bcl; + + /* VK_KHR_dynamic_rendering */ + bool suspending; + bool resuming; + struct v3dv_cl_out *suspend_branch_inst_ptr; + uint32_t suspended_bcl_end; + + /* If the job executes on the transfer stage of the pipeline */ + bool is_transfer; + + /* VK_KHR_buffer_device_address allows shaders to use pointers that can + * dereference memory in any buffer that has been flagged with + * VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT. These buffers may not + * be bound via descriptor sets, so we need to make sure that a job that + * uses this functionality includes all these buffers in its kernel + * submission. + */ + bool uses_buffer_device_address; + + /* True if we have not identified anything that would be incompatible + * with double-buffer (like MSAA) or that would make double-buffer mode + * not efficient (like tile loads or not having any stores).
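+ *
+ * A purely hypothetical sketch of how this flag could feed the final
+ * decision (the real heuristic also weighs double_buffer_score below):
+ *
+ *    if (job->can_use_double_buffer &&
+ *        double_buffer_score_is_favorable(job))  // hypothetical helper
+ *       enable_double_buffer(job);               // hypothetical helper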
+ */ + bool can_use_double_buffer; + + /* This structure keeps track of various scores to inform a heuristic + * for double-buffer mode. + */ + struct { + /* Cost of geometry shading */ + uint32_t geom; + /* Cost of shader rendering */ + uint32_t render; + } double_buffer_score; + + /* We only need to allocate tile state for all layers if the binner + * writes primitives to layers other than the first. This can only be + * done using layered rendering (writing gl_Layer from a geometry shader), + * so for other cases of multilayered framebuffers (typically with + * meta copy/clear operations) that won't use layered rendering, we only + * need one layer worth of tile state for the binner. + */ + bool allocate_tile_state_for_all_layers; + + /* A pointer to the location of the TILE_BINNING_MODE_CFG packet so we can + * rewrite it to enable double-buffer mode by the time we have enough info + * about the job to make that decision. + */ + struct v3dv_cl_out *bcl_tile_binning_mode_ptr; + enum v3dv_job_type type; struct v3dv_device *device; @@ -988,6 +1280,9 @@ struct v3dv_job { */ bool decided_global_ez_enable; + /* If the job emitted any draw calls with Early Z/S enabled */ + bool has_ez_draws; + /* If this job has been configured to use early Z/S clear */ bool early_zs_clear; @@ -1000,8 +1295,10 @@ struct v3dv_job { */ bool always_flush; - /* Whether we need to serialize this job in our command stream */ - bool serialize; + /* A mask of V3DV_BARRIER_* indicating the source(s) of the barrier. We + * can use this to select the hw queues where we need to serialize the job. + */ + uint8_t serialize; /* If this is a CL job, whether we should sync before binning */ bool needs_bcl_sync; @@ -1009,11 +1306,8 @@ struct v3dv_job { /* Job specs for CPU jobs */ union { struct v3dv_reset_query_cpu_job_info query_reset; - struct v3dv_end_query_cpu_job_info query_end; + struct v3dv_end_query_info query_end; struct v3dv_copy_query_results_cpu_job_info query_copy_results; - struct v3dv_event_set_cpu_job_info event_set; - struct v3dv_event_wait_cpu_job_info event_wait; - struct v3dv_copy_buffer_to_image_cpu_job_info copy_buffer_to_image; struct v3dv_csd_indirect_cpu_job_info csd_indirect; struct v3dv_timestamp_query_cpu_job_info query_timestamp; } cpu; @@ -1028,6 +1322,9 @@ struct v3dv_job { uint32_t wg_base[3]; struct drm_v3d_submit_csd submit; } csd; + + /* Perfmons with last job sync for CSD and CL jobs */ + struct v3dv_perf_query *perf; }; void v3dv_job_init(struct v3dv_job *job, @@ -1045,10 +1342,17 @@ void v3dv_job_start_frame(struct v3dv_job *job, uint32_t height, uint32_t layers, bool allocate_tile_state_for_all_layers, + bool allocate_tile_state_now, uint32_t render_target_count, uint8_t max_internal_bpp, + uint8_t total_color_bpp, bool msaa); +bool v3dv_job_type_is_gpu(struct v3dv_job *job); + +struct v3dv_job * +v3dv_job_clone(struct v3dv_job *job, bool skip_bcl); + struct v3dv_job * v3dv_job_clone_in_cmd_buffer(struct v3dv_job *job, struct v3dv_cmd_buffer *cmd_buffer); @@ -1065,7 +1369,26 @@ v3dv_cmd_buffer_ensure_array_state(struct v3dv_cmd_buffer *cmd_buffer, uint32_t *alloc_count, void **ptr); -void v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer); +void v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer, + bool indexed, bool indirect, + uint32_t vertex_count); + +bool v3dv_job_allocate_tile_state(struct v3dv_job *job); + +void +v3dv_setup_dynamic_framebuffer(struct v3dv_cmd_buffer *cmd_buffer, + const VkRenderingInfoKHR *pRenderingInfo); + +void
+v3dv_destroy_dynamic_framebuffer(struct v3dv_cmd_buffer *cmd_buffer); + +void +v3dv_setup_dynamic_render_pass(struct v3dv_cmd_buffer *cmd_buffer, + const VkRenderingInfoKHR *pRenderingInfo); + +void +v3dv_setup_dynamic_render_pass_inheritance(struct v3dv_cmd_buffer *cmd_buffer, + const VkCommandBufferInheritanceRenderingInfo *info); /* FIXME: only used on v3dv_cmd_buffer and v3dvx_cmd_buffer, perhaps move to a * cmd_buffer specific header? @@ -1094,9 +1417,46 @@ struct v3dv_cmd_pipeline_state { struct v3dv_descriptor_state descriptor_state; }; +enum { + V3DV_BARRIER_GRAPHICS_BIT = (1 << 0), + V3DV_BARRIER_COMPUTE_BIT = (1 << 1), + V3DV_BARRIER_TRANSFER_BIT = (1 << 2), + V3DV_BARRIER_CPU_BIT = (1 << 3), +}; +#define V3DV_BARRIER_ALL (V3DV_BARRIER_GRAPHICS_BIT | \ + V3DV_BARRIER_TRANSFER_BIT | \ + V3DV_BARRIER_COMPUTE_BIT | \ + V3DV_BARRIER_CPU_BIT); + +struct v3dv_barrier_state { + /* Mask of V3DV_BARRIER_* indicating where we consume a barrier. */ + uint8_t dst_mask; + + /* For each possible consumer of a barrier, a mask of V3DV_BARRIER_* + * indicating the sources of the dependency. + */ + uint8_t src_mask_graphics; + uint8_t src_mask_transfer; + uint8_t src_mask_compute; + + /* For graphics barriers, access masks involved. Used to decide if we need + * to execute a binning or render barrier. + */ + VkAccessFlags2 bcl_buffer_access; + VkAccessFlags2 bcl_image_access; +}; + struct v3dv_cmd_buffer_state { struct v3dv_render_pass *pass; struct v3dv_framebuffer *framebuffer; + + /* VK_KHR_dynamic_rendering */ + struct v3dv_render_pass dynamic_pass; + struct v3dv_subpass dynamic_subpass; + struct v3dv_render_pass_attachment dynamic_attachments[18 /* (8 color + D/S) x 2 (for resolves) */]; + struct v3dv_subpass_attachment dynamic_subpass_attachments[18]; + struct v3dv_framebuffer *dynamic_framebuffer; + VkRect2D render_area; /* Current job being recorded */ @@ -1107,8 +1467,16 @@ struct v3dv_cmd_buffer_state { struct v3dv_cmd_pipeline_state gfx; struct v3dv_cmd_pipeline_state compute; + /* For most state tracking we rely on vk_dynamic_graphics_state, but we + * maintain a custom structure for some state-related data that we want to + * cache. + */ struct v3dv_dynamic_state dynamic; + /* This dirty is for v3dv_cmd_dirty_bits (FIXME: perhaps we should be more + * explicit about it). For dirty flags coming from Vulkan dynamic state, + * use the vk_dynamic_graphics_state handled by the vk_cmd_buffer + */ uint32_t dirty; VkShaderStageFlagBits dirty_descriptor_stages; VkShaderStageFlagBits dirty_push_constants_stages; @@ -1128,6 +1496,14 @@ struct v3dv_cmd_buffer_state { */ bool tile_aligned_render_area; + /* FIXME: we have just one client-side BO for the push constants, + * independently of the stageFlags in vkCmdPushConstants, and the + * pipelineBindPoint in vkCmdBindPipeline. We could probably do more stage + * tuning in the future if it makes sense. + */ + uint32_t push_constants_size; + uint32_t push_constants_data[MAX_PUSH_CONSTANTS_SIZE / 4]; + uint32_t attachment_alloc_count; struct v3dv_cmd_buffer_attachment_state *attachments; @@ -1151,14 +1527,21 @@ struct v3dv_cmd_buffer_state { /* Current view index for multiview rendering */ uint32_t view_index; + /* Current draw ID for multidraw */ + uint32_t draw_id; + /* Used to flag OOM conditions during command buffer recording */ bool oom; - /* Whether we have recorded a pipeline barrier that we still need to - * process. 
- */ - bool has_barrier; - bool has_bcl_barrier; + /* If we are currently recording job(s) for a transfer operation */ + bool is_transfer; + + /* VK_KHR_dynamic_rendering */ + bool suspending; + bool resuming; + + /* Barrier state tracking */ + struct v3dv_barrier_state barrier; /* Secondary command buffer state */ struct { @@ -1178,12 +1561,14 @@ struct v3dv_cmd_buffer_state { bool tile_aligned_render_area; VkRect2D render_area; + struct vk_dynamic_graphics_state dynamic_graphics_state; struct v3dv_dynamic_state dynamic; struct v3dv_cmd_pipeline_state gfx; bool has_descriptor_state; uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4]; + uint32_t push_constants_size; } meta; /* Command buffer state for queries */ struct { uint32_t used_count; uint32_t alloc_count; - struct v3dv_end_query_cpu_job_info *states; + struct v3dv_end_query_info *states; } end; - /* This BO is not NULL if we have an active query, that is, we have - * called vkCmdBeginQuery but not vkCmdEndQuery. - */ struct { + /* This BO is not NULL if we have an active occlusion query, that is, + * we have called vkCmdBeginQuery but not vkCmdEndQuery. + */ struct v3dv_bo *bo; uint32_t offset; + /* When the driver emits draw calls to implement other operations in + * the middle of a render pass (such as an attachment clear), we need + * to pause occlusion query recording and resume it later so that + * these draw calls don't register in occlusion counters. We use + * this to store the BO reference in which we should resume occlusion + * query counters after the driver is done emitting its draw calls. + */ + struct v3dv_bo *paused_bo; + + /* This pointer is not NULL if we have an active performance query */ + struct v3dv_perf_query *perf; } active_query; } query; + + /* This is dynamic state since VK_EXT_extended_dynamic_state. */ + bool z_updates_enable; + + /* ez_state can be dynamic since VK_EXT_extended_dynamic_state so we need + * to keep track of it in the cmd_buffer state + */ + enum v3dv_ez_state ez_state; + + /* incompatible_ez_test can be dynamic since VK_EXT_extended_dynamic_state + * so we need to keep track of it in the cmd_buffer state + */ + bool incompatible_ez_test; + }; +void +v3dv_cmd_buffer_state_get_viewport_z_xform(struct v3dv_cmd_buffer *cmd_buffer, + uint32_t vp_idx, + float *translate_z, float *scale_z); + /* The following struct represents the info from a descriptor that we store on * the host memory. They are mostly links to other existing vulkan objects, * like the image_view in order to access to swizzle info, or the buffer used @@ -1228,8 +1643,8 @@ struct v3dv_descriptor { struct { struct v3dv_buffer *buffer; - uint32_t offset; - uint32_t range; + size_t offset; + size_t range; }; struct v3dv_buffer_view *buffer_view; @@ -1237,28 +1652,90 @@ }; struct v3dv_query { + /* Used by queries where we implement result copying in the CPU so we can + * tell if the relevant jobs have been submitted for execution. Currently + * these are all but occlusion queries.
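+ *
+ * A sketch of the CPU-side availability check this enables (illustrative
+ * pseudo-code; the exact wait also depends on the query type):
+ *
+ *    bool available = query->maybe_available &&
+ *                     relevant_jobs_completed(query);  // e.g. waiting on
+ *                                                      // a vk_sync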
+ */ bool maybe_available; + union { - /* Used by GPU queries (occlusion) */ + /* Used by occlusion queries */ struct { - struct v3dv_bo *bo; + /* Offset of this query in the occlusion query counter BO */ uint32_t offset; - }; - /* Used by CPU queries (timestamp) */ - uint64_t value; + } occlusion; + + /* Used by timestamp queries */ + struct { + /* Offset of this query in the timestamp BO for its value */ + uint32_t offset; + + /* Syncobj to signal timestamp query availability */ + struct vk_sync *sync; + } timestamp; + + /* Used by performance queries */ + struct v3dv_perf_query perf; }; }; struct v3dv_query_pool { struct vk_object_base base; - struct v3dv_bo *bo; /* Only used with GPU queries (occlusion) */ + /* Per-pool Vulkan resources required to implement GPU-side query + * functions (only occlusion queries for now). + */ + struct { + /* Buffer to access the BO with the occlusion query results and + * availability info. + */ + VkBuffer buf; + VkDeviceMemory mem; + + /* Descriptor set for accessing the buffer from a pipeline. */ + VkDescriptorPool descriptor_pool; + VkDescriptorSet descriptor_set; + } meta; + + /* Only used with occlusion queries */ + struct { + /* BO with the occlusion counters and query availability */ + struct v3dv_bo *bo; + /* Offset of the availability info in the BO */ + uint32_t avail_offset; + } occlusion; + + /* Only used with timestamp queries */ + struct { + /* BO with the query timestamp values */ + struct v3dv_bo *bo; + } timestamp; + + /* Only used with performance queries */ + struct { + uint32_t ncounters; + uint8_t counters[V3D_MAX_PERFCNT]; + + /* V3D has a limit on the number of counters we can track in a + * single performance monitor, so if too many counters are requested + * we need to create multiple monitors to record all of them. This + * field represents the number of monitors required for the number + * of counters requested. + */ + uint8_t nperfmons; + } perfmon; VkQueryType query_type; uint32_t query_count; struct v3dv_query *queries; }; +VkResult +v3dv_query_allocate_resources(struct v3dv_device *device); + +void +v3dv_query_free_resources(struct v3dv_device *device); + VkResult v3dv_get_query_pool_results_cpu(struct v3dv_device *device, struct v3dv_query_pool *pool, uint32_t first, @@ -1267,6 +1744,16 @@ VkResult v3dv_get_query_pool_results_cpu(struct v3dv_device *device, VkDeviceSize stride, VkQueryResultFlags flags); +void v3dv_reset_query_pool_cpu(struct v3dv_device *device, + struct v3dv_query_pool *query_pool, + uint32_t first, + uint32_t last); + +void v3dv_cmd_buffer_emit_set_query_availability(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_query_pool *pool, + uint32_t query, uint32_t count, + uint8_t availability); + typedef void (*v3dv_cmd_buffer_private_obj_destroy_cb)(VkDevice device, uint64_t pobj, VkAllocationCallbacks *alloc); @@ -1276,33 +1763,20 @@ struct v3dv_cmd_buffer_private_obj { v3dv_cmd_buffer_private_obj_destroy_cb destroy_cb; }; +extern const struct vk_command_buffer_ops v3dv_cmd_buffer_ops; + struct v3dv_cmd_buffer { - struct vk_object_base base; + struct vk_command_buffer vk; struct v3dv_device *device; - struct v3dv_cmd_pool *pool; - struct list_head pool_link; - - /* Used at submit time to link command buffers in the submission that have - * spawned wait threads, so we can then wait on all of them to complete - * before we process any signal sempahores or fences.
- */ - struct list_head list_link; - VkCommandBufferUsageFlags usage_flags; - VkCommandBufferLevel level; enum v3dv_cmd_buffer_status status; struct v3dv_cmd_buffer_state state; - /* FIXME: we have just one client-side and bo for the push constants, - * independently of the stageFlags in vkCmdPushConstants, and the - * pipelineBindPoint in vkCmdBindPipeline. We could probably do more stage - * tunning in the future if it makes sense. - */ - uint32_t push_constants_data[MAX_PUSH_CONSTANTS_SIZE / 4]; + /* Buffer where we upload push constant data to resolve indirect indexing */ struct v3dv_cl_reloc push_constants_resource; /* Collection of Vulkan objects created internally by the driver (typically @@ -1321,6 +1795,10 @@ struct v3dv_cmd_buffer { /* The current descriptor pool for texel buffer copy sources */ VkDescriptorPool dspool; } texel_buffer_copy; + struct { + /* The current descriptor pool for the copy query results output buffer */ + VkDescriptorPool dspool; + } query; } meta; /* List of jobs in the command buffer. For primary command buffers it @@ -1346,19 +1824,16 @@ void v3dv_cmd_buffer_subpass_finish(struct v3dv_cmd_buffer *cmd_buffer); void v3dv_cmd_buffer_meta_state_push(struct v3dv_cmd_buffer *cmd_buffer, bool push_descriptor_state); void v3dv_cmd_buffer_meta_state_pop(struct v3dv_cmd_buffer *cmd_buffer, - uint32_t dirty_dynamic_state, bool needs_subpass_resume); -void v3dv_cmd_buffer_reset_queries(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_query_pool *pool, - uint32_t first, - uint32_t count); - void v3dv_cmd_buffer_begin_query(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_query_pool *pool, uint32_t query, VkQueryControlFlags flags); +void v3dv_cmd_buffer_pause_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer); +void v3dv_cmd_buffer_resume_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer); + void v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_query_pool *pool, uint32_t query); @@ -1375,38 +1850,58 @@ void v3dv_cmd_buffer_copy_query_results(struct v3dv_cmd_buffer *cmd_buffer, void v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer, struct drm_v3d_submit_tfu *tfu); -void v3dv_cmd_buffer_rewrite_indirect_csd_job(struct v3dv_csd_indirect_cpu_job_info *info, +void v3dv_cmd_buffer_rewrite_indirect_csd_job(struct v3dv_device *device, + struct v3dv_csd_indirect_cpu_job_info *info, const uint32_t *wg_counts); void v3dv_cmd_buffer_add_private_obj(struct v3dv_cmd_buffer *cmd_buffer, uint64_t obj, v3dv_cmd_buffer_private_obj_destroy_cb destroy_cb); -struct v3dv_semaphore { - struct vk_object_base base; +void v3dv_cmd_buffer_merge_barrier_state(struct v3dv_barrier_state *dst, + struct v3dv_barrier_state *src); - /* A syncobject handle associated with this semaphore */ - uint32_t sync; +void v3dv_cmd_buffer_consume_bcl_sync(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_job *job); - /* A temporary syncobject handle produced from a vkImportSemaphoreFd. 
*/ - uint32_t temp_sync; -}; +bool v3dv_cmd_buffer_check_needs_load(const struct v3dv_cmd_buffer_state *state, + VkImageAspectFlags aspect, + uint32_t first_subpass_idx, + VkAttachmentLoadOp load_op, + uint32_t last_subpass_idx, + VkAttachmentStoreOp store_op); -struct v3dv_fence { - struct vk_object_base base; +bool v3dv_cmd_buffer_check_needs_store(const struct v3dv_cmd_buffer_state *state, + VkImageAspectFlags aspect, + uint32_t last_subpass_idx, + VkAttachmentStoreOp store_op); - /* A syncobject handle associated with this fence */ - uint32_t sync; +void v3dv_cmd_buffer_emit_pipeline_barrier(struct v3dv_cmd_buffer *cmd_buffer, + const VkDependencyInfo *info); - /* A temporary syncobject handle produced from a vkImportFenceFd. */ - uint32_t temp_sync; -}; +bool v3dv_cmd_buffer_copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_image *dst, + struct v3dv_image *src, + const VkImageCopy2 *region); struct v3dv_event { struct vk_object_base base; - int state; + + /* Link in the device list of pre-allocated free events */ + struct list_head link; + + /* Each event gets a different index, which we use to compute the offset + * in the BO we use to track their state (signaled vs reset). + */ + uint32_t index; }; +VkResult +v3dv_event_allocate_resources(struct v3dv_device *device); + +void +v3dv_event_free_resources(struct v3dv_device *device); + struct v3dv_shader_variant { enum broadcom_shader_stage stage; @@ -1428,9 +1923,11 @@ struct v3dv_shader_variant { */ uint32_t assembly_offset; - /* Note: it is really likely that qpu_insts would be NULL, as it will be - * used only temporarily, to upload it to the shared bo, as we compile the - * different stages individually. + /* Note: don't assume qpu_insts is always NULL or always non-NULL. In + * general we will try to free it as soon as we upload it to the shared bo + * while we compile the different stages. But we can decide to keep it + * around based on some pipeline creation flags, like + * VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT. */ uint64_t *qpu_insts; uint32_t qpu_insts_size; @@ -1462,7 +1959,9 @@ struct v3dv_pipeline_stage { /** A name for this program, so you can track it in shader-db output. */ uint32_t program_id; - VkPipelineCreationFeedbackEXT feedback; + VkPipelineCreationFeedback feedback; + + struct vk_pipeline_robustness_state robustness; }; /* We are using the descriptor pool entry for two things: @@ -1486,6 +1985,9 @@ struct v3dv_descriptor_pool_entry struct v3dv_descriptor_pool { struct vk_object_base base; + /* A list with all descriptor sets allocated from the pool. */ + struct list_head set_list; + /* If this descriptor pool has been allocated for the driver for internal * use, typically to implement meta operations.
*/ @@ -1515,9 +2017,12 @@ struct v3dv_descriptor_pool { struct v3dv_descriptor_set { struct vk_object_base base; + /* List link into the list of all sets allocated from the pool */ + struct list_head pool_link; + struct v3dv_descriptor_pool *pool; - const struct v3dv_descriptor_set_layout *layout; + struct v3dv_descriptor_set_layout *layout; /* Offset relative to the descriptor pool bo for this set */ uint32_t base_offset; @@ -1533,7 +2038,7 @@ struct v3dv_descriptor_set_binding_layout { /* Number of array elements in this binding */ uint32_t array_size; - /* Index into the flattend descriptor set */ + /* Index into the flattened descriptor set */ uint32_t descriptor_index; uint32_t dynamic_offset_count; @@ -1548,6 +2053,11 @@ struct v3dv_descriptor_set_binding_layout { * if there are no immutable samplers. */ uint32_t immutable_samplers_offset; + + /* Descriptors for multiplanar combined image samplers are larger. + * For mutable descriptors, this is always 1. + */ + uint8_t plane_stride; }; struct v3dv_descriptor_set_layout { @@ -1571,10 +2081,35 @@ struct v3dv_descriptor_set_layout { /* Number of dynamic offsets used by this descriptor set */ uint16_t dynamic_offset_count; + /* Descriptor set layouts can be destroyed even if they are still being + * used. + */ + uint32_t ref_cnt; + /* Bindings in this descriptor set */ struct v3dv_descriptor_set_binding_layout binding[0]; }; +void +v3dv_descriptor_set_layout_destroy(struct v3dv_device *device, + struct v3dv_descriptor_set_layout *set_layout); + +static inline void +v3dv_descriptor_set_layout_ref(struct v3dv_descriptor_set_layout *set_layout) +{ + assert(set_layout && set_layout->ref_cnt >= 1); + p_atomic_inc(&set_layout->ref_cnt); +} + +static inline void +v3dv_descriptor_set_layout_unref(struct v3dv_device *device, + struct v3dv_descriptor_set_layout *set_layout) +{ + assert(set_layout && set_layout->ref_cnt >= 1); + if (p_atomic_dec_zero(&set_layout->ref_cnt)) + v3dv_descriptor_set_layout_destroy(device, set_layout); +} + struct v3dv_pipeline_layout { struct vk_object_base base; @@ -1590,8 +2125,37 @@ struct v3dv_pipeline_layout { uint32_t dynamic_offset_count; uint32_t push_constant_size; + + /* Pipeline layouts can be destroyed after creating pipelines since + * maintenance4. + */ + uint32_t ref_cnt; + + unsigned char sha1[20]; }; +void +v3dv_pipeline_layout_destroy(struct v3dv_device *device, + struct v3dv_pipeline_layout *layout, + const VkAllocationCallbacks *alloc); + +static inline void +v3dv_pipeline_layout_ref(struct v3dv_pipeline_layout *layout) +{ + assert(layout && layout->ref_cnt >= 1); + p_atomic_inc(&layout->ref_cnt); +} + +static inline void +v3dv_pipeline_layout_unref(struct v3dv_device *device, + struct v3dv_pipeline_layout *layout, + const VkAllocationCallbacks *alloc) +{ + assert(layout && layout->ref_cnt >= 1); + if (p_atomic_dec_zero(&layout->ref_cnt)) + v3dv_pipeline_layout_destroy(device, layout, alloc); +} + /* * We are using descriptor maps for ubo/ssbo and texture/samplers, so we need * it to be big enough to include the max value for all of them. @@ -1599,18 +2163,20 @@ struct v3dv_pipeline_layout { * FIXME: one alternative would be to allocate the map as big as you need for * each descriptor type. That would means more individual allocations. 
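 *
 * For illustration (with made-up limits, not the real driver values): if
 * V3D_MAX_TEXTURE_SAMPLERS were 24, MAX_UNIFORM_BUFFERS +
 * MAX_INLINE_UNIFORM_BUFFERS were 12 + 4, and MAX_STORAGE_BUFFERS were 8,
 * the MAX3() below would size every map at 24 entries.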
*/ -#define DESCRIPTOR_MAP_SIZE MAX3(V3D_MAX_TEXTURE_SAMPLERS, \ - MAX_UNIFORM_BUFFERS, \ +#define DESCRIPTOR_MAP_SIZE MAX3(V3D_MAX_TEXTURE_SAMPLERS, \ + MAX_UNIFORM_BUFFERS + MAX_INLINE_UNIFORM_BUFFERS, \ MAX_STORAGE_BUFFERS) struct v3dv_descriptor_map { - /* TODO: avoid fixed size array/justify the size */ + /* FIXME: avoid fixed size array/justify the size */ unsigned num_desc; /* Number of descriptors */ int set[DESCRIPTOR_MAP_SIZE]; int binding[DESCRIPTOR_MAP_SIZE]; int array_index[DESCRIPTOR_MAP_SIZE]; int array_size[DESCRIPTOR_MAP_SIZE]; + uint8_t plane[DESCRIPTOR_MAP_SIZE]; + bool used[DESCRIPTOR_MAP_SIZE]; /* NOTE: the following is only for sampler, but this is the easier place to * put it. @@ -1620,57 +2186,19 @@ struct v3dv_descriptor_map { struct v3dv_sampler { struct vk_object_base base; + struct vk_ycbcr_conversion *conversion; bool compare_enable; bool unnormalized_coordinates; - bool clamp_to_transparent_black_border; - /* Prepacked SAMPLER_STATE, that is referenced as part of the tmu + /* Prepacked per plane SAMPLER_STATE, that is referenced as part of the tmu * configuration. If needed it will be copied to the descriptor info during * UpdateDescriptorSets */ + uint8_t plane_count; uint8_t sampler_state[V3DV_SAMPLER_STATE_LENGTH]; }; -struct v3dv_descriptor_template_entry { - /* The type of descriptor in this entry */ - VkDescriptorType type; - - /* Binding in the descriptor set */ - uint32_t binding; - - /* Offset at which to write into the descriptor set binding */ - uint32_t array_element; - - /* Number of elements to write into the descriptor set binding */ - uint32_t array_count; - - /* Offset into the user provided data */ - size_t offset; - - /* Stride between elements into the user provided data */ - size_t stride; -}; - -struct v3dv_descriptor_update_template { - struct vk_object_base base; - - VkPipelineBindPoint bind_point; - - /* The descriptor set this template corresponds to. This value is only - * valid if the template was created with the templateType - * VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET. - */ - uint8_t set; - - /* Number of entries in this template */ - uint32_t entry_count; - - /* Entries of the template */ - struct v3dv_descriptor_template_entry entries[0]; -}; - - /* We keep two special values for the sampler idx that represents exactly when a * sampler is not needed/provided. The main use is that even if we don't have * sampler, we still need to do the output unpacking (through @@ -1685,32 +2213,6 @@ struct v3dv_descriptor_update_template { #define V3DV_NO_SAMPLER_16BIT_IDX 0 #define V3DV_NO_SAMPLER_32BIT_IDX 1 -/* - * Following two methods are using on the combined to/from texture/sampler - * indices maps at v3dv_pipeline. 
- */ -static inline uint32_t -v3dv_pipeline_combined_index_key_create(uint32_t texture_index, - uint32_t sampler_index) -{ - return texture_index << 24 | sampler_index; -} - -static inline void -v3dv_pipeline_combined_index_key_unpack(uint32_t combined_index_key, - uint32_t *texture_index, - uint32_t *sampler_index) -{ - uint32_t texture = combined_index_key >> 24; - uint32_t sampler = combined_index_key & 0xffffff; - - if (texture_index) - *texture_index = texture; - - if (sampler_index) - *sampler_index = sampler; -} - struct v3dv_descriptor_maps { struct v3dv_descriptor_map ubo_map; struct v3dv_descriptor_map ssbo_map; @@ -1733,50 +2235,59 @@ struct v3dv_pipeline_shared_data { struct v3dv_bo *assembly_bo; }; +struct v3dv_pipeline_executable_data { + enum broadcom_shader_stage stage; + char *nir_str; + char *qpu_str; +}; + struct v3dv_pipeline { struct vk_object_base base; struct v3dv_device *device; VkShaderStageFlags active_stages; + VkPipelineCreateFlags flags; struct v3dv_render_pass *pass; struct v3dv_subpass *subpass; - /* Note: We can't use just a MESA_SHADER_STAGES array because we also need - * to track binning shaders. Note these will be freed once the pipeline - * has been compiled. - */ - struct v3dv_pipeline_stage *vs; - struct v3dv_pipeline_stage *vs_bin; - struct v3dv_pipeline_stage *gs; - struct v3dv_pipeline_stage *gs_bin; - struct v3dv_pipeline_stage *fs; - struct v3dv_pipeline_stage *cs; + struct v3dv_pipeline_stage *stages[BROADCOM_SHADER_STAGES]; + + /* For VK_KHR_dynamic_rendering */ + struct vk_render_pass_state rendering_info; /* Flags for whether optional pipeline stages are present, for convenience */ bool has_gs; + /* Whether any stage in this pipeline uses VK_KHR_buffer_device_address */ + bool uses_buffer_device_address; + /* Spilling memory requirements */ struct { struct v3dv_bo *bo; uint32_t size_per_thread; } spill; - struct v3dv_dynamic_state dynamic_state; + struct vk_dynamic_graphics_state dynamic_graphics_state; + struct v3dv_dynamic_state dynamic; struct v3dv_pipeline_layout *layout; - /* Whether this pipeline enables depth writes */ - bool z_updates_enable; - enum v3dv_ez_state ez_state; + /* If ez_state is V3D_EZ_DISABLED, whether the reason for disabling is that + * the pipeline selects an incompatible depth test function. + */ + bool incompatible_ez_test; + + bool rasterization_enabled; bool msaa; bool sample_rate_shading; uint32_t sample_mask; bool primitive_restart; + bool negative_one_to_one; /* Accessed by binding. So vb[binding]->stride is the stride of the vertex * array with such binding @@ -1799,12 +2310,18 @@ struct v3dv_pipeline { } va[MAX_VERTEX_ATTRIBS]; uint32_t va_count; - enum pipe_prim_type topology; + enum mesa_prim topology; + + bool line_smooth; struct v3dv_pipeline_shared_data *shared_data; + /* It is the combined stages sha1, layout sha1, plus the pipeline key sha1. */ + unsigned char sha1[20]; + /* In general we can reuse v3dv_device->default_attribute_float, so note - * that the following can be NULL. + * that the following can be NULL. In 7.x this is not used, so it will + * always be NULL. * * FIXME: the content of this BO will be small, so it could be improved to * be uploaded to a common BO.
But as in most cases it will be NULL, it is @@ -1838,6 +2355,11 @@ struct v3dv_pipeline { bool is_z16; } depth_bias; + struct { + void *mem_ctx; + struct util_dynarray data; /* Array of v3dv_pipeline_executable_data */ + } executables; + /* Packets prepacked during pipeline creation */ uint8_t cfg_bits[V3DV_CFG_BITS_LENGTH]; @@ -1848,6 +2370,13 @@ struct v3dv_pipeline { uint8_t stencil_cfg[2][V3DV_STENCIL_CFG_LENGTH]; }; +static inline bool +v3dv_texture_shader_state_has_rb_swap_reverse_bits(const struct v3dv_device *device) +{ + return device->devinfo.ver > 71 || + (device->devinfo.ver == 71 && device->devinfo.rev >= 5); +} + static inline VkPipelineBindPoint v3dv_pipeline_get_binding_point(struct v3dv_pipeline *pipeline) { @@ -1872,28 +2401,17 @@ const nir_shader_compiler_options *v3dv_pipeline_get_nir_options(void); uint32_t v3dv_physical_device_vendor_id(struct v3dv_physical_device *dev); uint32_t v3dv_physical_device_device_id(struct v3dv_physical_device *dev); -VkResult __vk_errorf(struct v3dv_instance *instance, VkResult error, - const char *file, int line, - const char *format, ...); - -#define vk_error(instance, error) __vk_errorf(instance, error, __FILE__, __LINE__, NULL); -#define vk_errorf(instance, error, format, ...) __vk_errorf(instance, error, __FILE__, __LINE__, format, ## __VA_ARGS__); - -#ifdef DEBUG #define v3dv_debug_ignored_stype(sType) \ - fprintf(stderr, "%s: ignored VkStructureType %u:%s\n\n", __func__, (sType), vk_StructureType_to_str(sType)) -#else -#define v3dv_debug_ignored_stype(sType) -#endif + mesa_logd("%s: ignored VkStructureType %u:%s\n\n", __func__, (sType), vk_StructureType_to_str(sType)) -const uint8_t *v3dv_get_format_swizzle(struct v3dv_device *device, VkFormat f); -uint8_t v3dv_get_tex_return_size(const struct v3dv_format *vf, bool compare_enable); +const uint8_t *v3dv_get_format_swizzle(struct v3dv_device *device, VkFormat f, + uint8_t plane); const struct v3dv_format * v3dv_get_compatible_tfu_format(struct v3dv_device *device, uint32_t bpp, VkFormat *out_vk_format); bool v3dv_buffer_format_supports_features(struct v3dv_device *device, VkFormat vk_format, - VkFormatFeatureFlags features); + VkFormatFeatureFlags2 features); struct v3dv_cl_reloc v3dv_write_uniforms(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_pipeline *pipeline, @@ -1953,6 +2471,14 @@ v3dv_descriptor_map_get_descriptor(struct v3dv_descriptor_state *descriptor_stat uint32_t index, uint32_t *dynamic_offset); +struct v3dv_cl_reloc +v3dv_descriptor_map_get_descriptor_bo(struct v3dv_device *device, + struct v3dv_descriptor_state *descriptor_state, + struct v3dv_descriptor_map *map, + struct v3dv_pipeline_layout *pipeline_layout, + uint32_t index, + VkDescriptorType *out_type); + const struct v3dv_sampler * v3dv_descriptor_map_get_sampler(struct v3dv_descriptor_state *descriptor_state, struct v3dv_descriptor_map *map, @@ -1973,13 +2499,6 @@ v3dv_descriptor_map_get_texture_shader_state(struct v3dv_device *device, struct v3dv_pipeline_layout *pipeline_layout, uint32_t index); -const struct v3dv_format* -v3dv_descriptor_map_get_texture_format(struct v3dv_descriptor_state *descriptor_state, - struct v3dv_descriptor_map *map, - struct v3dv_pipeline_layout *pipeline_layout, - uint32_t index, - VkFormat *out_vk_format); - struct v3dv_bo* v3dv_descriptor_map_get_texture_bo(struct v3dv_descriptor_state *descriptor_state, struct v3dv_descriptor_map *map, @@ -2020,71 +2539,56 @@ void v3dv_pipeline_cache_upload_pipeline(struct v3dv_pipeline *pipeline, struct v3dv_pipeline_cache *cache); -struct 
v3dv_bo * -v3dv_pipeline_create_default_attribute_values(struct v3dv_device *device, - struct v3dv_pipeline *pipeline); - -void v3dv_shader_module_internal_init(struct v3dv_device *device, - struct vk_shader_module *module, - nir_shader *nir); - -#define V3DV_DEFINE_HANDLE_CASTS(__v3dv_type, __VkType) \ - \ - static inline struct __v3dv_type * \ - __v3dv_type ## _from_handle(__VkType _handle) \ - { \ - return (struct __v3dv_type *) _handle; \ - } \ - \ - static inline __VkType \ - __v3dv_type ## _to_handle(struct __v3dv_type *_obj) \ - { \ - return (__VkType) _obj; \ - } - -#define V3DV_DEFINE_NONDISP_HANDLE_CASTS(__v3dv_type, __VkType) \ - \ - static inline struct __v3dv_type * \ - __v3dv_type ## _from_handle(__VkType _handle) \ - { \ - return (struct __v3dv_type *)(uintptr_t) _handle; \ - } \ - \ - static inline __VkType \ - __v3dv_type ## _to_handle(struct __v3dv_type *_obj) \ - { \ - return (__VkType)(uintptr_t) _obj; \ - } +VkResult +v3dv_create_compute_pipeline_from_nir(struct v3dv_device *device, + nir_shader *nir, + VkPipelineLayout pipeline_layout, + VkPipeline *pipeline); #define V3DV_FROM_HANDLE(__v3dv_type, __name, __handle) \ - struct __v3dv_type *__name = __v3dv_type ## _from_handle(__handle) - -V3DV_DEFINE_HANDLE_CASTS(v3dv_cmd_buffer, VkCommandBuffer) -V3DV_DEFINE_HANDLE_CASTS(v3dv_device, VkDevice) -V3DV_DEFINE_HANDLE_CASTS(v3dv_instance, VkInstance) -V3DV_DEFINE_HANDLE_CASTS(v3dv_physical_device, VkPhysicalDevice) -V3DV_DEFINE_HANDLE_CASTS(v3dv_queue, VkQueue) - -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_cmd_pool, VkCommandPool) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_buffer, VkBuffer) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_buffer_view, VkBufferView) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_device_memory, VkDeviceMemory) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_pool, VkDescriptorPool) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_set, VkDescriptorSet) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_set_layout, VkDescriptorSetLayout) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_update_template, VkDescriptorUpdateTemplate) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_event, VkEvent) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_fence, VkFence) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_framebuffer, VkFramebuffer) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_image, VkImage) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_image_view, VkImageView) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_pipeline, VkPipeline) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_pipeline_cache, VkPipelineCache) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_pipeline_layout, VkPipelineLayout) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_query_pool, VkQueryPool) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_render_pass, VkRenderPass) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_sampler, VkSampler) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_semaphore, VkSemaphore) + VK_FROM_HANDLE(__v3dv_type, __name, __handle) + +VK_DEFINE_HANDLE_CASTS(v3dv_cmd_buffer, vk.base, VkCommandBuffer, + VK_OBJECT_TYPE_COMMAND_BUFFER) +VK_DEFINE_HANDLE_CASTS(v3dv_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE) +VK_DEFINE_HANDLE_CASTS(v3dv_instance, vk.base, VkInstance, + VK_OBJECT_TYPE_INSTANCE) +VK_DEFINE_HANDLE_CASTS(v3dv_physical_device, vk.base, VkPhysicalDevice, + VK_OBJECT_TYPE_PHYSICAL_DEVICE) +VK_DEFINE_HANDLE_CASTS(v3dv_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE) + +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_buffer, base, VkBuffer, + VK_OBJECT_TYPE_BUFFER) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_buffer_view, base, VkBufferView, + VK_OBJECT_TYPE_BUFFER_VIEW) 
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_device_memory, vk.base, VkDeviceMemory, + VK_OBJECT_TYPE_DEVICE_MEMORY) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_pool, base, VkDescriptorPool, + VK_OBJECT_TYPE_DESCRIPTOR_POOL) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_set, base, VkDescriptorSet, + VK_OBJECT_TYPE_DESCRIPTOR_SET) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_set_layout, base, + VkDescriptorSetLayout, + VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_event, base, VkEvent, VK_OBJECT_TYPE_EVENT) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_framebuffer, base, VkFramebuffer, + VK_OBJECT_TYPE_FRAMEBUFFER) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_image, vk.base, VkImage, + VK_OBJECT_TYPE_IMAGE) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_image_view, vk.base, VkImageView, + VK_OBJECT_TYPE_IMAGE_VIEW) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_pipeline, base, VkPipeline, + VK_OBJECT_TYPE_PIPELINE) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_pipeline_cache, base, VkPipelineCache, + VK_OBJECT_TYPE_PIPELINE_CACHE) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_pipeline_layout, base, VkPipelineLayout, + VK_OBJECT_TYPE_PIPELINE_LAYOUT) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_query_pool, base, VkQueryPool, + VK_OBJECT_TYPE_QUERY_POOL) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_render_pass, base, VkRenderPass, + VK_OBJECT_TYPE_RENDER_PASS) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_sampler, base, VkSampler, + VK_OBJECT_TYPE_SAMPLER) static inline int v3dv_ioctl(int fd, unsigned long request, void *arg) @@ -2132,19 +2636,39 @@ u64_compare(const void *key1, const void *key2) return memcmp(key1, key2, sizeof(uint64_t)) == 0; } -/* Helper to call hw ver speficic functions */ +/* Helper to call hw ver specific functions */ #define v3dv_X(device, thing) ({ \ __typeof(&v3d42_##thing) v3d_X_thing; \ switch (device->devinfo.ver) { \ case 42: \ v3d_X_thing = &v3d42_##thing; \ break; \ + case 71: \ + v3d_X_thing = &v3d71_##thing; \ + break; \ default: \ unreachable("Unsupported hardware generation"); \ } \ v3d_X_thing; \ }) +/* Helper to get hw-specific macro values */ +#define V3DV_X(device, thing) ({ \ + __typeof(V3D42_##thing) V3D_X_THING; \ + switch (device->devinfo.ver) { \ + case 42: \ + V3D_X_THING = V3D42_##thing; \ + break; \ + case 71: \ + V3D_X_THING = V3D71_##thing; \ + break; \ + default: \ + unreachable("Unsupported hardware generation"); \ + } \ + V3D_X_THING; \ +}) + + /* v3d_macros from common requires v3dX and V3DX definitions. 
Below we need to * define v3dX for each version supported, because when we compile code that @@ -2157,6 +2681,45 @@ u64_compare(const void *key1, const void *key2) # define v3dX(x) v3d42_##x # include "v3dvx_private.h" # undef v3dX + +# define v3dX(x) v3d71_##x +# include "v3dvx_private.h" +# undef v3dX #endif +VkResult +v3dv_update_image_layout(struct v3dv_device *device, + struct v3dv_image *image, + uint64_t modifier, + bool disjoint, + const VkImageDrmFormatModifierExplicitCreateInfoEXT *explicit_mod_info); + +float +v3dv_get_aa_line_width(struct v3dv_pipeline *pipeline, + struct v3dv_cmd_buffer *buffer); + + +void +v3dv_compute_ez_state(struct vk_dynamic_graphics_state *dyn, + struct v3dv_pipeline *pipeline, + enum v3dv_ez_state *ez_state, + bool *incompatible_ez_test); + +uint32_t v3dv_pipeline_primitive(VkPrimitiveTopology vk_prim); + +#if DETECT_OS_ANDROID +VkResult +v3dv_gralloc_to_drm_explicit_layout(struct u_gralloc *gralloc, + struct u_gralloc_buffer_handle *in_hnd, + VkImageDrmFormatModifierExplicitCreateInfoEXT *out, + VkSubresourceLayout *out_layouts, + int max_planes); + +VkResult +v3dv_import_native_buffer_fd(VkDevice device_h, + int dma_buf, + const VkAllocationCallbacks *alloc, + VkImage image_h); +#endif /* DETECT_OS_ANDROID */ + #endif /* V3DV_PRIVATE_H */ diff --git a/src/broadcom/vulkan/v3dv_query.c b/src/broadcom/vulkan/v3dv_query.c index 0deb430fc16..7231c694fff 100644 --- a/src/broadcom/vulkan/v3dv_query.c +++ b/src/broadcom/vulkan/v3dv_query.c @@ -1,5 +1,5 @@ /* - * Copyright © 2020 Raspberry Pi + * Copyright © 2020 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -23,6 +23,224 @@ #include "v3dv_private.h" +#include "util/timespec.h" +#include "compiler/nir/nir_builder.h" + +static void +kperfmon_create(struct v3dv_device *device, + struct v3dv_query_pool *pool, + uint32_t query) +{ + for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) { + assert(i * DRM_V3D_MAX_PERF_COUNTERS < pool->perfmon.ncounters); + + struct drm_v3d_perfmon_create req = { + .ncounters = MIN2(pool->perfmon.ncounters - + i * DRM_V3D_MAX_PERF_COUNTERS, + DRM_V3D_MAX_PERF_COUNTERS), + }; + memcpy(req.counters, + &pool->perfmon.counters[i * DRM_V3D_MAX_PERF_COUNTERS], + req.ncounters); + + int ret = v3dv_ioctl(device->pdevice->render_fd, + DRM_IOCTL_V3D_PERFMON_CREATE, + &req); + if (ret) + fprintf(stderr, "Failed to create perfmon for query %d: %s\n", query, strerror(ret)); + + pool->queries[query].perf.kperfmon_ids[i] = req.id; + } +} + +static void +kperfmon_destroy(struct v3dv_device *device, + struct v3dv_query_pool *pool, + uint32_t query) +{ + /* Skip destroying if never created */ + if (!pool->queries[query].perf.kperfmon_ids[0]) + return; + + for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) { + struct drm_v3d_perfmon_destroy req = { + .id = pool->queries[query].perf.kperfmon_ids[i] + }; + + int ret = v3dv_ioctl(device->pdevice->render_fd, + DRM_IOCTL_V3D_PERFMON_DESTROY, + &req); + + if (ret) { + fprintf(stderr, "Failed to destroy perfmon %u: %s\n", + req.id, strerror(ret)); + } + } +} + +/** + * Creates a VkBuffer (and VkDeviceMemory) to access a BO. 
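+ *
+ * For example, the occlusion query code below uses it to expose a query
+ * pool's counter BO to the meta compute pipelines:
+ *
+ *    create_vk_storage_buffer(device, pool->occlusion.bo,
+ *                             &pool->meta.buf, &pool->meta.mem);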
+ */ +static VkResult +create_vk_storage_buffer(struct v3dv_device *device, + struct v3dv_bo *bo, + VkBuffer *vk_buf, + VkDeviceMemory *vk_mem) +{ + VkDevice vk_device = v3dv_device_to_handle(device); + + VkBufferCreateInfo buf_info = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .size = bo->size, + .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + }; + VkResult result = v3dv_CreateBuffer(vk_device, &buf_info, NULL, vk_buf); + if (result != VK_SUCCESS) + return result; + + struct v3dv_device_memory *mem = + vk_object_zalloc(&device->vk, NULL, sizeof(*mem), + VK_OBJECT_TYPE_DEVICE_MEMORY); + if (!mem) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + mem->bo = bo; + mem->type = &device->pdevice->memory.memoryTypes[0]; + + *vk_mem = v3dv_device_memory_to_handle(mem); + VkBindBufferMemoryInfo bind_info = { + .sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO, + .buffer = *vk_buf, + .memory = *vk_mem, + .memoryOffset = 0, + }; + v3dv_BindBufferMemory2(vk_device, 1, &bind_info); + + return VK_SUCCESS; +} + +static void +destroy_vk_storage_buffer(struct v3dv_device *device, + VkBuffer *vk_buf, + VkDeviceMemory *vk_mem) +{ + if (*vk_mem) { + vk_object_free(&device->vk, NULL, v3dv_device_memory_from_handle(*vk_mem)); + *vk_mem = VK_NULL_HANDLE; + } + + v3dv_DestroyBuffer(v3dv_device_to_handle(device), *vk_buf, NULL); + *vk_buf = VK_NULL_HANDLE; +} + +/** + * Allocates descriptor sets to access query pool BO (availability and + * occlusion query results) from Vulkan pipelines. + */ +static VkResult +create_pool_descriptors(struct v3dv_device *device, + struct v3dv_query_pool *pool) +{ + assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION); + VkDevice vk_device = v3dv_device_to_handle(device); + + VkDescriptorPoolSize pool_size = { + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + }; + VkDescriptorPoolCreateInfo pool_info = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, + .flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, + .maxSets = 1, + .poolSizeCount = 1, + .pPoolSizes = &pool_size, + }; + VkResult result = + v3dv_CreateDescriptorPool(vk_device, &pool_info, NULL, + &pool->meta.descriptor_pool); + + if (result != VK_SUCCESS) + return result; + + VkDescriptorSetAllocateInfo alloc_info = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, + .descriptorPool = pool->meta.descriptor_pool, + .descriptorSetCount = 1, + .pSetLayouts = &device->queries.buf_descriptor_set_layout, + }; + result = v3dv_AllocateDescriptorSets(vk_device, &alloc_info, + &pool->meta.descriptor_set); + if (result != VK_SUCCESS) + return result; + + VkDescriptorBufferInfo desc_buf_info = { + .buffer = pool->meta.buf, + .offset = 0, + .range = VK_WHOLE_SIZE, + }; + + VkWriteDescriptorSet write = { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = pool->meta.descriptor_set, + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .pBufferInfo = &desc_buf_info, + }; + v3dv_UpdateDescriptorSets(vk_device, 1, &write, 0, NULL); + + return VK_SUCCESS; +} + +static void +destroy_pool_descriptors(struct v3dv_device *device, + struct v3dv_query_pool *pool) +{ + assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION); + + v3dv_FreeDescriptorSets(v3dv_device_to_handle(device), + pool->meta.descriptor_pool, + 1, &pool->meta.descriptor_set); + pool->meta.descriptor_set = VK_NULL_HANDLE; + + v3dv_DestroyDescriptorPool(v3dv_device_to_handle(device), + pool->meta.descriptor_pool, NULL); + pool->meta.descriptor_pool = 
VK_NULL_HANDLE; +} + +static VkResult +pool_create_meta_resources(struct v3dv_device *device, + struct v3dv_query_pool *pool) +{ + VkResult result; + + if (pool->query_type != VK_QUERY_TYPE_OCCLUSION) + return VK_SUCCESS; + + result = create_vk_storage_buffer(device, pool->occlusion.bo, + &pool->meta.buf, &pool->meta.mem); + if (result != VK_SUCCESS) + return result; + + result = create_pool_descriptors(device, pool); + if (result != VK_SUCCESS) + return result; + + return VK_SUCCESS; +} + +static void +pool_destroy_meta_resources(struct v3dv_device *device, + struct v3dv_query_pool *pool) +{ + if (pool->query_type != VK_QUERY_TYPE_OCCLUSION) + return; + + destroy_pool_descriptors(device, pool); + destroy_vk_storage_buffer(device, &pool->meta.buf, &pool->meta.mem); +} + VKAPI_ATTR VkResult VKAPI_CALL v3dv_CreateQueryPool(VkDevice _device, const VkQueryPoolCreateInfo *pCreateInfo, @@ -32,74 +250,149 @@ v3dv_CreateQueryPool(VkDevice _device, V3DV_FROM_HANDLE(v3dv_device, device, _device); assert(pCreateInfo->queryType == VK_QUERY_TYPE_OCCLUSION || - pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP); + pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP || + pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR); assert(pCreateInfo->queryCount > 0); struct v3dv_query_pool *pool = vk_object_zalloc(&device->vk, pAllocator, sizeof(*pool), VK_OBJECT_TYPE_QUERY_POOL); if (pool == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); pool->query_type = pCreateInfo->queryType; pool->query_count = pCreateInfo->queryCount; + uint32_t query_idx = 0; VkResult result; const uint32_t pool_bytes = sizeof(struct v3dv_query) * pool->query_count; pool->queries = vk_alloc2(&device->vk.alloc, pAllocator, pool_bytes, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (pool->queries == NULL) { - result = vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); goto fail; } - if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) { + switch (pool->query_type) { + case VK_QUERY_TYPE_OCCLUSION: { /* The hardware allows us to setup groups of 16 queries in consecutive * 4-byte addresses, requiring only that each group of 16 queries is * aligned to a 1024 byte boundary. 
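+       * That is, the counter for query i lives at byte offset
+       * (i / 16) * 1024 + (i % 16) * 4 within the BO; query 17, for
+       * example, is stored at 1 * 1024 + 1 * 4 = 1028.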
       */
      const uint32_t query_groups = DIV_ROUND_UP(pool->query_count, 16);
-      const uint32_t bo_size = query_groups * 1024;
-      pool->bo = v3dv_bo_alloc(device, bo_size, "query", true);
-      if (!pool->bo) {
-         result = vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+      uint32_t bo_size = query_groups * 1024;
+      /* After the counters we store availability data, 1 byte/query */
+      pool->occlusion.avail_offset = bo_size;
+      bo_size += pool->query_count;
+      pool->occlusion.bo = v3dv_bo_alloc(device, bo_size, "query:o", true);
+      if (!pool->occlusion.bo) {
+         result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
          goto fail;
       }
-      if (!v3dv_bo_map(device, pool->bo, bo_size)) {
-         result = vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+      if (!v3dv_bo_map(device, pool->occlusion.bo, bo_size)) {
+         result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
          goto fail;
       }
+      break;
    }
+   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
+      const VkQueryPoolPerformanceCreateInfoKHR *pq_info =
+         vk_find_struct_const(pCreateInfo->pNext,
+                              QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
+
+      assert(pq_info);
+
+      pool->perfmon.ncounters = pq_info->counterIndexCount;
+      for (uint32_t i = 0; i < pq_info->counterIndexCount; i++)
+         pool->perfmon.counters[i] = pq_info->pCounterIndices[i];
-   uint32_t i;
-   for (i = 0; i < pool->query_count; i++) {
-      pool->queries[i].maybe_available = false;
+      pool->perfmon.nperfmons = DIV_ROUND_UP(pool->perfmon.ncounters,
+                                             DRM_V3D_MAX_PERF_COUNTERS);
+
+      assert(pool->perfmon.nperfmons <= V3DV_MAX_PERFMONS);
+      break;
+   }
+   case VK_QUERY_TYPE_TIMESTAMP: {
+      /* 8 bytes per query used for the timestamp value. We have all
+       * timestamps tightly packed first in the buffer.
+       */
+      const uint32_t bo_size = pool->query_count * 8;
+      pool->timestamp.bo = v3dv_bo_alloc(device, bo_size, "query:t", true);
+      if (!pool->timestamp.bo) {
+         result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+         goto fail;
+      }
+      if (!v3dv_bo_map(device, pool->timestamp.bo, bo_size)) {
+         result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+         goto fail;
+      }
+      break;
+   }
+   default:
+      unreachable("Unsupported query type");
+   }
+
+   /* Initialize queries in the pool */
+   for (; query_idx < pool->query_count; query_idx++) {
+      pool->queries[query_idx].maybe_available = false;
       switch (pool->query_type) {
       case VK_QUERY_TYPE_OCCLUSION: {
-         const uint32_t query_group = i / 16;
-         const uint32_t query_offset = query_group * 1024 + (i % 16) * 4;
-         pool->queries[i].bo = pool->bo;
-         pool->queries[i].offset = query_offset;
+         const uint32_t query_group = query_idx / 16;
+         const uint32_t query_offset = query_group * 1024 + (query_idx % 16) * 4;
+         pool->queries[query_idx].occlusion.offset = query_offset;
          break;
       }
       case VK_QUERY_TYPE_TIMESTAMP:
-         pool->queries[i].value = 0;
+         pool->queries[query_idx].timestamp.offset = query_idx * 8;
+         result = vk_sync_create(&device->vk,
+                                 &device->pdevice->drm_syncobj_type, 0, 0,
+                                 &pool->queries[query_idx].timestamp.sync);
+         if (result != VK_SUCCESS)
+            goto fail;
+         break;
+      case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
+         result = vk_sync_create(&device->vk,
+                                 &device->pdevice->drm_syncobj_type, 0, 0,
+                                 &pool->queries[query_idx].perf.last_job_sync);
+         if (result != VK_SUCCESS)
+            goto fail;
+
+         kperfmon_create(device, pool, query_idx);
          break;
+      }
       default:
          unreachable("Unsupported query type");
       }
    }
 
+   /* Create meta resources */
+   result = pool_create_meta_resources(device, pool);
+   if (result != VK_SUCCESS)
+      goto fail;
+
    *pQueryPool = v3dv_query_pool_to_handle(pool);
 
    return VK_SUCCESS;
 
 fail:
-   if (pool->bo)
-
v3dv_bo_free(device, pool->bo); + if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) { + for (uint32_t j = 0; j < query_idx; j++) + vk_sync_destroy(&device->vk, pool->queries[j].timestamp.sync); + } + + if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { + for (uint32_t j = 0; j < query_idx; j++) + vk_sync_destroy(&device->vk, pool->queries[j].perf.last_job_sync); + } + + if (pool->occlusion.bo) + v3dv_bo_free(device, pool->occlusion.bo); + if (pool->timestamp.bo) + v3dv_bo_free(device, pool->timestamp.bo); if (pool->queries) vk_free2(&device->vk.alloc, pAllocator, pool->queries); + pool_destroy_meta_resources(device, pool); vk_object_free(&device->vk, pAllocator, pool); return result; @@ -116,17 +409,34 @@ v3dv_DestroyQueryPool(VkDevice _device, if (!pool) return; - if (pool->bo) - v3dv_bo_free(device, pool->bo); + if (pool->occlusion.bo) + v3dv_bo_free(device, pool->occlusion.bo); + + if (pool->timestamp.bo) + v3dv_bo_free(device, pool->timestamp.bo); + + if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) { + for (uint32_t i = 0; i < pool->query_count; i++) + vk_sync_destroy(&device->vk, pool->queries[i].timestamp.sync); + } + + if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { + for (uint32_t i = 0; i < pool->query_count; i++) { + kperfmon_destroy(device, pool, i); + vk_sync_destroy(&device->vk, pool->queries[i].perf.last_job_sync); + } + } if (pool->queries) vk_free2(&device->vk.alloc, pAllocator, pool->queries); + pool_destroy_meta_resources(device, pool); + vk_object_free(&device->vk, pAllocator, pool); } static void -write_query_result(void *dst, uint32_t idx, bool do_64bit, uint64_t value) +write_to_buffer(void *dst, uint32_t idx, bool do_64bit, uint64_t value) { if (do_64bit) { uint64_t *dst64 = (uint64_t *) dst; @@ -138,89 +448,255 @@ write_query_result(void *dst, uint32_t idx, bool do_64bit, uint64_t value) } static VkResult -get_occlusion_query_result(struct v3dv_device *device, - struct v3dv_query_pool *pool, - uint32_t query, - bool do_wait, - bool *available, - uint64_t *value) +query_wait_available(struct v3dv_device *device, + struct v3dv_query_pool *pool, + struct v3dv_query *q, + uint32_t query_idx) { - assert(pool && pool->query_type == VK_QUERY_TYPE_OCCLUSION); + /* For occlusion queries we prefer to poll the availability BO in a loop + * to waiting on the query results BO, because the latter would + * make us wait for any job running queries from the pool, even if those + * queries do not involve the one we want to wait on. + */ + if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) { + uint8_t *q_addr = ((uint8_t *) pool->occlusion.bo->map) + + pool->occlusion.avail_offset + query_idx; + while (*q_addr == 0) + usleep(250); + return VK_SUCCESS; + } - struct v3dv_query *q = &pool->queries[query]; - assert(q->bo && q->bo->map); + if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) { + if (vk_sync_wait(&device->vk, q->timestamp.sync, + 0, VK_SYNC_WAIT_COMPLETE, UINT64_MAX) != VK_SUCCESS) { + return vk_device_set_lost(&device->vk, "Query job wait failed"); + } + return VK_SUCCESS; + } - if (do_wait) { - /* From the Vulkan 1.0 spec: - * - * "If VK_QUERY_RESULT_WAIT_BIT is set, (...) If the query does not - * become available in a finite amount of time (e.g. due to not - * issuing a query since the last reset), a VK_ERROR_DEVICE_LOST - * error may occur." 
+ assert(pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR); + + /* For performance queries we need to wait for the queue to signal that + * the query has been submitted for execution before anything else. + */ + VkResult result = VK_SUCCESS; + if (!q->maybe_available) { + struct timespec timeout; + timespec_get(&timeout, TIME_UTC); + timespec_add_msec(&timeout, &timeout, 2000); + + mtx_lock(&device->query_mutex); + while (!q->maybe_available) { + if (vk_device_is_lost(&device->vk)) { + result = VK_ERROR_DEVICE_LOST; + break; + } + + int ret = cnd_timedwait(&device->query_ended, + &device->query_mutex, + &timeout); + if (ret != thrd_success) { + mtx_unlock(&device->query_mutex); + result = vk_device_set_lost(&device->vk, "Query wait failed"); + break; + } + } + mtx_unlock(&device->query_mutex); + + if (result != VK_SUCCESS) + return result; + + /* For performance queries, we also need to wait for the relevant syncobj + * to be signaled to ensure completion of the GPU work. */ - if (!q->maybe_available) - return vk_error(device->instance, VK_ERROR_DEVICE_LOST); + if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR && + vk_sync_wait(&device->vk, q->perf.last_job_sync, + 0, VK_SYNC_WAIT_COMPLETE, UINT64_MAX) != VK_SUCCESS) { + return vk_device_set_lost(&device->vk, "Query job wait failed"); + } + } + + return result; +} + +static VkResult +query_check_available(struct v3dv_device *device, + struct v3dv_query_pool *pool, + struct v3dv_query *q, + uint32_t query_idx) +{ + /* For occlusion we check the availability BO */ + if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) { + const uint8_t *q_addr = ((uint8_t *) pool->occlusion.bo->map) + + pool->occlusion.avail_offset + query_idx; + return (*q_addr != 0) ? VK_SUCCESS : VK_NOT_READY; + } - if (!v3dv_bo_wait(device, q->bo, 0xffffffffffffffffull)) - return vk_error(device->instance, VK_ERROR_DEVICE_LOST); + /* For timestamp queries, we need to check if the relevant job + * has completed. + */ + if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) { + if (vk_sync_wait(&device->vk, q->timestamp.sync, + 0, VK_SYNC_WAIT_COMPLETE, 0) != VK_SUCCESS) { + return VK_NOT_READY; + } + return VK_SUCCESS; + } + + /* For other queries we need to check if the queue has submitted the query + * for execution at all. + */ + assert(pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR); + if (!q->maybe_available) + return VK_NOT_READY; + + /* For performance queries, we also need to check if the relevant GPU job + * has completed. 
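+    * Note the vk_sync_wait call below uses a timeout of 0, so this is a
+    * non-blocking check: if the job has not finished yet the wait returns
+    * immediately and we report VK_NOT_READY.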
+ */ + if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR && + vk_sync_wait(&device->vk, q->perf.last_job_sync, + 0, VK_SYNC_WAIT_COMPLETE, 0) != VK_SUCCESS) { + return VK_NOT_READY; + } + + return VK_SUCCESS; +} + +static VkResult +query_is_available(struct v3dv_device *device, + struct v3dv_query_pool *pool, + uint32_t query, + bool do_wait, + bool *available) +{ + struct v3dv_query *q = &pool->queries[query]; + + if (do_wait) { + VkResult result = query_wait_available(device, pool, q, query); + if (result != VK_SUCCESS) { + *available = false; + return result; + } *available = true; } else { - *available = q->maybe_available && v3dv_bo_wait(device, q->bo, 0); + VkResult result = query_check_available(device, pool, q, query); + assert(result == VK_SUCCESS || result == VK_NOT_READY); + *available = (result == VK_SUCCESS); } - const uint8_t *query_addr = ((uint8_t *) q->bo->map) + q->offset; - *value = (uint64_t) *((uint32_t *)query_addr); return VK_SUCCESS; } static VkResult -get_timestamp_query_result(struct v3dv_device *device, - struct v3dv_query_pool *pool, - uint32_t query, - bool do_wait, - bool *available, - uint64_t *value) +write_occlusion_query_result(struct v3dv_device *device, + struct v3dv_query_pool *pool, + uint32_t query, + bool do_64bit, + void *data, + uint32_t slot) +{ + assert(pool && pool->query_type == VK_QUERY_TYPE_OCCLUSION); + + if (vk_device_is_lost(&device->vk)) + return VK_ERROR_DEVICE_LOST; + + struct v3dv_query *q = &pool->queries[query]; + assert(pool->occlusion.bo && pool->occlusion.bo->map); + + const uint8_t *query_addr = + ((uint8_t *) pool->occlusion.bo->map) + q->occlusion.offset; + write_to_buffer(data, slot, do_64bit, (uint64_t) *((uint32_t *)query_addr)); + return VK_SUCCESS; +} + +static VkResult +write_timestamp_query_result(struct v3dv_device *device, + struct v3dv_query_pool *pool, + uint32_t query, + bool do_64bit, + void *data, + uint32_t slot) { assert(pool && pool->query_type == VK_QUERY_TYPE_TIMESTAMP); struct v3dv_query *q = &pool->queries[query]; - if (do_wait) { - /* From the Vulkan 1.0 spec: - * - * "If VK_QUERY_RESULT_WAIT_BIT is set, (...) If the query does not - * become available in a finite amount of time (e.g. due to not - * issuing a query since the last reset), a VK_ERROR_DEVICE_LOST - * error may occur." 
- */ - if (!q->maybe_available) - return vk_error(device->instance, VK_ERROR_DEVICE_LOST); + const uint8_t *query_addr = + ((uint8_t *) pool->timestamp.bo->map) + q->timestamp.offset; - *available = true; - } else { - *available = q->maybe_available; + write_to_buffer(data, slot, do_64bit, *((uint64_t *)query_addr)); + return VK_SUCCESS; +} + +static VkResult +write_performance_query_result(struct v3dv_device *device, + struct v3dv_query_pool *pool, + uint32_t query, + bool do_64bit, + void *data, + uint32_t slot) +{ + assert(pool && pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR); + + struct v3dv_query *q = &pool->queries[query]; + uint64_t counter_values[V3D_MAX_PERFCNT]; + + for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) { + struct drm_v3d_perfmon_get_values req = { + .id = q->perf.kperfmon_ids[i], + .values_ptr = (uintptr_t)(&counter_values[i * + DRM_V3D_MAX_PERF_COUNTERS]) + }; + + int ret = v3dv_ioctl(device->pdevice->render_fd, + DRM_IOCTL_V3D_PERFMON_GET_VALUES, + &req); + + if (ret) { + fprintf(stderr, "failed to get perfmon values: %s\n", strerror(ret)); + return vk_error(device, VK_ERROR_DEVICE_LOST); + } } - *value = q->value; + for (uint32_t i = 0; i < pool->perfmon.ncounters; i++) + write_to_buffer(data, slot + i, do_64bit, counter_values[i]); + return VK_SUCCESS; } static VkResult -get_query_result(struct v3dv_device *device, - struct v3dv_query_pool *pool, - uint32_t query, - bool do_wait, - bool *available, - uint64_t *value) +write_query_result(struct v3dv_device *device, + struct v3dv_query_pool *pool, + uint32_t query, + bool do_64bit, + void *data, + uint32_t slot) +{ + switch (pool->query_type) { + case VK_QUERY_TYPE_OCCLUSION: + return write_occlusion_query_result(device, pool, query, do_64bit, + data, slot); + case VK_QUERY_TYPE_TIMESTAMP: + return write_timestamp_query_result(device, pool, query, do_64bit, + data, slot); + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: + return write_performance_query_result(device, pool, query, do_64bit, + data, slot); + default: + unreachable("Unsupported query type"); + } +} + +static uint32_t +get_query_result_count(struct v3dv_query_pool *pool) { switch (pool->query_type) { case VK_QUERY_TYPE_OCCLUSION: - return get_occlusion_query_result(device, pool, query, do_wait, - available, value); case VK_QUERY_TYPE_TIMESTAMP: - return get_timestamp_query_result(device, pool, query, do_wait, - available, value); + return 1; + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: + return pool->perfmon.ncounters; default: unreachable("Unsupported query type"); } @@ -239,16 +715,18 @@ v3dv_get_query_pool_results_cpu(struct v3dv_device *device, assert(first + count <= pool->query_count); assert(data); - const bool do_64bit = flags & VK_QUERY_RESULT_64_BIT; + const bool do_64bit = flags & VK_QUERY_RESULT_64_BIT || + pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR; const bool do_wait = flags & VK_QUERY_RESULT_WAIT_BIT; const bool do_partial = flags & VK_QUERY_RESULT_PARTIAL_BIT; + uint32_t result_count = get_query_result_count(pool); + VkResult result = VK_SUCCESS; for (uint32_t i = first; i < first + count; i++) { bool available = false; - uint64_t value = 0; VkResult query_result = - get_query_result(device, pool, i, do_wait, &available, &value); + query_is_available(device, pool, i, do_wait, &available); if (query_result == VK_ERROR_DEVICE_LOST) result = VK_ERROR_DEVICE_LOST; @@ -266,11 +744,11 @@ v3dv_get_query_pool_results_cpu(struct v3dv_device *device, const bool write_result = available || do_partial; if (write_result) - 
write_query_result(data, slot, do_64bit, value); - slot++; + write_query_result(device, pool, i, do_64bit, data, slot); + slot += result_count; if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) - write_query_result(data, slot++, do_64bit, available ? 1u : 0u); + write_to_buffer(data, slot++, do_64bit, available ? 1u : 0u); if (!write_result && result != VK_ERROR_DEVICE_LOST) result = VK_NOT_READY; @@ -298,6 +776,170 @@ v3dv_GetQueryPoolResults(VkDevice _device, pData, stride, flags); } +/* Emits a series of vkCmdDispatchBase calls to execute all the workgroups + * required to handle a number of queries considering per-dispatch limits. + */ +static void +cmd_buffer_emit_dispatch_queries(struct v3dv_cmd_buffer *cmd_buffer, + uint32_t query_count) +{ + VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer); + + uint32_t dispatched = 0; + const uint32_t max_batch_size = 65535; + while (dispatched < query_count) { + uint32_t batch_size = MIN2(query_count - dispatched, max_batch_size); + v3dv_CmdDispatchBase(vk_cmd_buffer, dispatched, 0, 0, batch_size, 1, 1); + dispatched += batch_size; + } +} + +void +v3dv_cmd_buffer_emit_set_query_availability(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_query_pool *pool, + uint32_t query, uint32_t count, + uint8_t availability) +{ + assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION || + pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR); + + struct v3dv_device *device = cmd_buffer->device; + VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer); + + /* We are about to emit a compute job to set query availability and we need + * to ensure this executes after the graphics work using the queries has + * completed. + */ + VkMemoryBarrier2 barrier = { + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2, + .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + }; + VkDependencyInfo barrier_info = { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .memoryBarrierCount = 1, + .pMemoryBarriers = &barrier, + }; + v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info); + + /* Dispatch queries */ + v3dv_cmd_buffer_meta_state_push(cmd_buffer, true); + + v3dv_CmdBindPipeline(vk_cmd_buffer, + VK_PIPELINE_BIND_POINT_COMPUTE, + device->queries.avail_pipeline); + + v3dv_CmdBindDescriptorSets(vk_cmd_buffer, + VK_PIPELINE_BIND_POINT_COMPUTE, + device->queries.avail_pipeline_layout, + 0, 1, &pool->meta.descriptor_set, + 0, NULL); + + struct { + uint32_t offset; + uint32_t query; + uint8_t availability; + } push_data = { pool->occlusion.avail_offset, query, availability }; + v3dv_CmdPushConstants(vk_cmd_buffer, + device->queries.avail_pipeline_layout, + VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(push_data), &push_data); + cmd_buffer_emit_dispatch_queries(cmd_buffer, count); + + v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false); +} + +static void +cmd_buffer_emit_reset_occlusion_query_pool(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_query_pool *pool, + uint32_t query, uint32_t count) +{ + struct v3dv_device *device = cmd_buffer->device; + VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer); + + /* Ensure the GPU is done with the queries in the graphics queue before + * we reset in the compute queue. 
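+    * Occlusion counters are produced by fragment processing, which is why
+    * the barrier below uses the color attachment output stage as its source
+    * scope before allowing the compute reset to run.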
+ */ + VkMemoryBarrier2 barrier = { + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2, + .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + }; + VkDependencyInfo barrier_info = { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .memoryBarrierCount = 1, + .pMemoryBarriers = &barrier, + }; + v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info); + + /* Emit compute reset */ + v3dv_cmd_buffer_meta_state_push(cmd_buffer, true); + + v3dv_CmdBindPipeline(vk_cmd_buffer, + VK_PIPELINE_BIND_POINT_COMPUTE, + device->queries.reset_occlusion_pipeline); + + v3dv_CmdBindDescriptorSets(vk_cmd_buffer, + VK_PIPELINE_BIND_POINT_COMPUTE, + device->queries.reset_occlusion_pipeline_layout, + 0, 1, &pool->meta.descriptor_set, + 0, NULL); + struct { + uint32_t offset; + uint32_t query; + } push_data = { pool->occlusion.avail_offset, query }; + v3dv_CmdPushConstants(vk_cmd_buffer, + device->queries.reset_occlusion_pipeline_layout, + VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(push_data), &push_data); + + cmd_buffer_emit_dispatch_queries(cmd_buffer, count); + + v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false); + + /* Ensure future work in the graphics queue using the queries doesn't start + * before the reset completed. + */ + barrier = (VkMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2, + .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .dstStageMask = VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT, + }; + barrier_info = (VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .memoryBarrierCount = 1, + .pMemoryBarriers = &barrier, + }; + v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info); +} + +static void +cmd_buffer_emit_reset_query_pool(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_query_pool *pool, + uint32_t first, uint32_t count) +{ + assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION); + cmd_buffer_emit_reset_occlusion_query_pool(cmd_buffer, pool, first, count); +} + +static void +cmd_buffer_emit_reset_query_pool_cpu(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_query_pool *pool, + uint32_t first, uint32_t count) +{ + assert(pool->query_type != VK_QUERY_TYPE_OCCLUSION); + + struct v3dv_job *job = + v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, + V3DV_JOB_TYPE_CPU_RESET_QUERIES, + cmd_buffer, -1); + v3dv_return_if_oom(cmd_buffer, NULL); + job->cpu.query_reset.pool = pool; + job->cpu.query_reset.first = first; + job->cpu.query_reset.count = count; + list_addtail(&job->list_link, &cmd_buffer->jobs); +} + VKAPI_ATTR void VKAPI_CALL v3dv_CmdResetQueryPool(VkCommandBuffer commandBuffer, VkQueryPool queryPool, @@ -307,7 +949,261 @@ v3dv_CmdResetQueryPool(VkCommandBuffer commandBuffer, V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool); - v3dv_cmd_buffer_reset_queries(cmd_buffer, pool, firstQuery, queryCount); + /* Resets can only happen outside a render pass instance so we should not + * be in the middle of job recording. + */ + assert(cmd_buffer->state.pass == NULL); + assert(cmd_buffer->state.job == NULL); + + assert(firstQuery < pool->query_count); + assert(firstQuery + queryCount <= pool->query_count); + + /* We can reset occlusion queries in the GPU, but for other query types + * we emit a CPU job that will call v3dv_reset_query_pool_cpu when executed + * in the queue. 
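+    * Timestamp and performance queries keep CPU-side state (a vk_sync per
+    * query, plus kernel perfmons for performance queries) that a compute
+    * shader cannot reset, so those query types take the CPU job path.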
+   */
+   if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
+      cmd_buffer_emit_reset_query_pool(cmd_buffer, pool, firstQuery, queryCount);
+   } else {
+      cmd_buffer_emit_reset_query_pool_cpu(cmd_buffer, pool,
+                                           firstQuery, queryCount);
+   }
+}
+
+/**
+ * Creates a descriptor pool so we can create descriptors for the destination
+ * buffers of vkCmdCopyQueryPoolResults for queries where the copy is
+ * implemented in the GPU.
+ */
+static VkResult
+create_storage_buffer_descriptor_pool(struct v3dv_cmd_buffer *cmd_buffer)
+{
+   /* If this is not the first pool we create for this command buffer,
+    * size it based on the size of the currently exhausted pool.
+    */
+   uint32_t descriptor_count = 32;
+   if (cmd_buffer->meta.query.dspool != VK_NULL_HANDLE) {
+      struct v3dv_descriptor_pool *exhausted_pool =
+         v3dv_descriptor_pool_from_handle(cmd_buffer->meta.query.dspool);
+      descriptor_count = MIN2(exhausted_pool->max_entry_count * 2, 1024);
+   }
+
+   /* Create the descriptor pool */
+   cmd_buffer->meta.query.dspool = VK_NULL_HANDLE;
+   VkDescriptorPoolSize pool_size = {
+      .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+      .descriptorCount = descriptor_count,
+   };
+   VkDescriptorPoolCreateInfo info = {
+      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
+      .maxSets = descriptor_count,
+      .poolSizeCount = 1,
+      .pPoolSizes = &pool_size,
+      .flags = 0,
+   };
+   VkResult result =
+      v3dv_CreateDescriptorPool(v3dv_device_to_handle(cmd_buffer->device),
+                                &info,
+                                &cmd_buffer->device->vk.alloc,
+                                &cmd_buffer->meta.query.dspool);
+
+   if (result == VK_SUCCESS) {
+      assert(cmd_buffer->meta.query.dspool != VK_NULL_HANDLE);
+      const VkDescriptorPool vk_pool = cmd_buffer->meta.query.dspool;
+
+      v3dv_cmd_buffer_add_private_obj(
+         cmd_buffer, (uintptr_t) vk_pool,
+         (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyDescriptorPool);
+
+      struct v3dv_descriptor_pool *pool =
+         v3dv_descriptor_pool_from_handle(vk_pool);
+      pool->is_driver_internal = true;
+   }
+
+   return result;
+}
+
+static VkResult
+allocate_storage_buffer_descriptor_set(struct v3dv_cmd_buffer *cmd_buffer,
+                                       VkDescriptorSet *set)
+{
+   /* Make sure we have a descriptor pool */
+   VkResult result;
+   if (cmd_buffer->meta.query.dspool == VK_NULL_HANDLE) {
+      result = create_storage_buffer_descriptor_pool(cmd_buffer);
+      if (result != VK_SUCCESS)
+         return result;
+   }
+   assert(cmd_buffer->meta.query.dspool != VK_NULL_HANDLE);
+
+   /* Allocate descriptor set */
+   struct v3dv_device *device = cmd_buffer->device;
+   VkDevice vk_device = v3dv_device_to_handle(device);
+   VkDescriptorSetAllocateInfo info = {
+      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
+      .descriptorPool = cmd_buffer->meta.query.dspool,
+      .descriptorSetCount = 1,
+      .pSetLayouts = &device->queries.buf_descriptor_set_layout,
+   };
+   result = v3dv_AllocateDescriptorSets(vk_device, &info, set);
+
+   /* If we ran out of pool space, grow the pool and try again */
+   if (result == VK_ERROR_OUT_OF_POOL_MEMORY) {
+      result = create_storage_buffer_descriptor_pool(cmd_buffer);
+      if (result == VK_SUCCESS) {
+         info.descriptorPool = cmd_buffer->meta.query.dspool;
+         result = v3dv_AllocateDescriptorSets(vk_device, &info, set);
+      }
+   }
+
+   return result;
+}
+
+static uint32_t
+copy_pipeline_index_from_flags(VkQueryResultFlags flags)
+{
+   uint32_t index = 0;
+   if (flags & VK_QUERY_RESULT_64_BIT)
+      index |= 1;
+   if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
+      index |= 2;
+   if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
+      index |= 4;
+   assert(index < 8);
+   return index;
+}
+
+static nir_shader *
+get_copy_query_results_cs(VkQueryResultFlags flags); + +static void +cmd_buffer_emit_copy_query_pool_results(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_query_pool *pool, + uint32_t first, uint32_t count, + struct v3dv_buffer *buf, + uint32_t offset, uint32_t stride, + VkQueryResultFlags flags) +{ + struct v3dv_device *device = cmd_buffer->device; + VkDevice vk_device = v3dv_device_to_handle(device); + VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer); + + /* Create the required copy pipeline if not yet created */ + uint32_t pipeline_idx = copy_pipeline_index_from_flags(flags); + if (!device->queries.copy_pipeline[pipeline_idx]) { + nir_shader *copy_query_results_cs_nir = get_copy_query_results_cs(flags); + VkResult result = + v3dv_create_compute_pipeline_from_nir( + device, copy_query_results_cs_nir, + device->queries.copy_pipeline_layout, + &device->queries.copy_pipeline[pipeline_idx]); + ralloc_free(copy_query_results_cs_nir); + if (result != VK_SUCCESS) { + fprintf(stderr, "Failed to create copy query results pipeline\n"); + return; + } + } + + /* FIXME: do we need this barrier? Since vkCmdEndQuery should've been called + * and that already waits maybe we don't (since this is serialized + * in the compute queue with EndQuery anyway). + */ + if (flags & VK_QUERY_RESULT_WAIT_BIT) { + VkMemoryBarrier2 barrier = { + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2, + .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + }; + VkDependencyInfo barrier_info = { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .memoryBarrierCount = 1, + .pMemoryBarriers = &barrier, + }; + v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info); + } + + /* Allocate and setup descriptor set for output buffer */ + VkDescriptorSet out_buf_descriptor_set; + VkResult result = + allocate_storage_buffer_descriptor_set(cmd_buffer, + &out_buf_descriptor_set); + if (result != VK_SUCCESS) { + fprintf(stderr, "vkCmdCopyQueryPoolResults failed: " + "could not allocate descriptor.\n"); + return; + } + + VkDescriptorBufferInfo desc_buf_info = { + .buffer = v3dv_buffer_to_handle(buf), + .offset = 0, + .range = VK_WHOLE_SIZE, + }; + VkWriteDescriptorSet write = { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = out_buf_descriptor_set, + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .pBufferInfo = &desc_buf_info, + }; + v3dv_UpdateDescriptorSets(vk_device, 1, &write, 0, NULL); + + /* Dispatch copy */ + v3dv_cmd_buffer_meta_state_push(cmd_buffer, true); + + assert(device->queries.copy_pipeline[pipeline_idx]); + v3dv_CmdBindPipeline(vk_cmd_buffer, + VK_PIPELINE_BIND_POINT_COMPUTE, + device->queries.copy_pipeline[pipeline_idx]); + + VkDescriptorSet sets[2] = { + pool->meta.descriptor_set, + out_buf_descriptor_set, + }; + v3dv_CmdBindDescriptorSets(vk_cmd_buffer, + VK_PIPELINE_BIND_POINT_COMPUTE, + device->queries.copy_pipeline_layout, + 0, 2, sets, 0, NULL); + + struct { + uint32_t avail_offset, first, offset, stride, flags; + } push_data = { pool->occlusion.avail_offset, first, offset, stride, flags }; + v3dv_CmdPushConstants(vk_cmd_buffer, + device->queries.copy_pipeline_layout, + VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(push_data), &push_data); + + cmd_buffer_emit_dispatch_queries(cmd_buffer, count); + + v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false); +} + +static void +cmd_buffer_emit_copy_query_pool_results_cpu(struct v3dv_cmd_buffer 
*cmd_buffer, + struct v3dv_query_pool *pool, + uint32_t first, + uint32_t count, + struct v3dv_buffer *dst, + uint32_t offset, + uint32_t stride, + VkQueryResultFlags flags) +{ + struct v3dv_job *job = + v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, + V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS, + cmd_buffer, -1); + v3dv_return_if_oom(cmd_buffer, NULL); + + job->cpu.query_copy_results.pool = pool; + job->cpu.query_copy_results.first = first; + job->cpu.query_copy_results.count = count; + job->cpu.query_copy_results.dst = dst; + job->cpu.query_copy_results.offset = offset; + job->cpu.query_copy_results.stride = stride; + job->cpu.query_copy_results.flags = flags; + + list_addtail(&job->list_link, &cmd_buffer->jobs); } VKAPI_ATTR void VKAPI_CALL @@ -324,9 +1220,30 @@ v3dv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool); V3DV_FROM_HANDLE(v3dv_buffer, dst, dstBuffer); - v3dv_cmd_buffer_copy_query_results(cmd_buffer, pool, - firstQuery, queryCount, - dst, dstOffset, stride, flags); + /* Copies can only happen outside a render pass instance so we should not + * be in the middle of job recording. + */ + assert(cmd_buffer->state.pass == NULL); + assert(cmd_buffer->state.job == NULL); + + assert(firstQuery < pool->query_count); + assert(firstQuery + queryCount <= pool->query_count); + + /* For occlusion queries we implement the copy in the GPU but for other + * queries we emit a CPU job that will call v3dv_get_query_pool_results_cpu + * when executed in the queue. + */ + if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) { + cmd_buffer_emit_copy_query_pool_results(cmd_buffer, pool, + firstQuery, queryCount, + dst, (uint32_t) dstOffset, + (uint32_t) stride, flags); + } else { + cmd_buffer_emit_copy_query_pool_results_cpu(cmd_buffer, pool, + firstQuery, queryCount, + dst, (uint32_t)dstOffset, + (uint32_t) stride, flags); + } } VKAPI_ATTR void VKAPI_CALL @@ -351,3 +1268,537 @@ v3dv_CmdEndQuery(VkCommandBuffer commandBuffer, v3dv_cmd_buffer_end_query(cmd_buffer, pool, query); } + +void +v3dv_reset_query_pool_cpu(struct v3dv_device *device, + struct v3dv_query_pool *pool, + uint32_t first, + uint32_t count) +{ + mtx_lock(&device->query_mutex); + + if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) { + assert(first + count <= pool->query_count); + + /* Reset timestamp */ + uint8_t *base_addr; + base_addr = ((uint8_t *) pool->timestamp.bo->map) + + pool->queries[first].timestamp.offset; + memset(base_addr, 0, 8 * count); + + for (uint32_t i = first; i < first + count; i++) { + if (vk_sync_reset(&device->vk, pool->queries[i].timestamp.sync) != VK_SUCCESS) + fprintf(stderr, "Failed to reset sync"); + } + + mtx_unlock(&device->query_mutex); + return; + } + + for (uint32_t i = first; i < first + count; i++) { + assert(i < pool->query_count); + struct v3dv_query *q = &pool->queries[i]; + q->maybe_available = false; + switch (pool->query_type) { + case VK_QUERY_TYPE_OCCLUSION: { + /* Reset availability */ + uint8_t *base_addr = ((uint8_t *) pool->occlusion.bo->map) + + pool->occlusion.avail_offset + first; + memset(base_addr, 0, count); + + /* Reset occlusion counter */ + const uint8_t *q_addr = + ((uint8_t *) pool->occlusion.bo->map) + q->occlusion.offset; + uint32_t *counter = (uint32_t *) q_addr; + *counter = 0; + break; + } + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: + kperfmon_destroy(device, pool, i); + kperfmon_create(device, pool, i); + if (vk_sync_reset(&device->vk, q->perf.last_job_sync) != VK_SUCCESS) + fprintf(stderr, "Failed to reset sync"); + 
break; + default: + unreachable("Unsupported query type"); + } + } + + mtx_unlock(&device->query_mutex); +} + +VKAPI_ATTR void VKAPI_CALL +v3dv_ResetQueryPool(VkDevice _device, + VkQueryPool queryPool, + uint32_t firstQuery, + uint32_t queryCount) +{ + V3DV_FROM_HANDLE(v3dv_device, device, _device); + V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool); + + v3dv_reset_query_pool_cpu(device, pool, firstQuery, queryCount); +} + +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR( + VkPhysicalDevice physicalDevice, + uint32_t queueFamilyIndex, + uint32_t *pCounterCount, + VkPerformanceCounterKHR *pCounters, + VkPerformanceCounterDescriptionKHR *pCounterDescriptions) +{ + V3DV_FROM_HANDLE(v3dv_physical_device, pDevice, physicalDevice); + + return v3dv_X(pDevice, enumerate_performance_query_counters)(pCounterCount, + pCounters, + pCounterDescriptions); +} + +VKAPI_ATTR void VKAPI_CALL +v3dv_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR( + VkPhysicalDevice physicalDevice, + const VkQueryPoolPerformanceCreateInfoKHR *pPerformanceQueryCreateInfo, + uint32_t *pNumPasses) +{ + *pNumPasses = DIV_ROUND_UP(pPerformanceQueryCreateInfo->counterIndexCount, + DRM_V3D_MAX_PERF_COUNTERS); +} + +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_AcquireProfilingLockKHR( + VkDevice _device, + const VkAcquireProfilingLockInfoKHR *pInfo) +{ + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +v3dv_ReleaseProfilingLockKHR(VkDevice device) +{ +} + +static inline void +nir_set_query_availability(nir_builder *b, + nir_def *buf, + nir_def *offset, + nir_def *query_idx, + nir_def *avail) +{ + offset = nir_iadd(b, offset, query_idx); /* we use 1B per query */ + nir_store_ssbo(b, avail, buf, offset, .write_mask = 0x1, .align_mul = 1); +} + +static inline nir_def * +nir_get_query_availability(nir_builder *b, + nir_def *buf, + nir_def *offset, + nir_def *query_idx) +{ + offset = nir_iadd(b, offset, query_idx); /* we use 1B per query */ + nir_def *avail = nir_load_ssbo(b, 1, 8, buf, offset, .align_mul = 1); + return nir_i2i32(b, avail); +} + +static nir_shader * +get_set_query_availability_cs() +{ + const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options(); + nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, + "set query availability cs"); + + nir_def *buf = + nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0), + .desc_set = 0, + .binding = 0, + .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); + + /* This assumes a local size of 1 and a horizontal-only dispatch. If we + * ever change any of these parameters we need to update how we compute the + * query index here. 
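+    * For example, cmd_buffer_emit_dispatch_queries() launches one workgroup
+    * per query, so the workgroup with id X handles query
+    * (base query push constant) + X.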
+ */ + nir_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b), 0); + + nir_def *offset = + nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4); + + nir_def *query_idx = + nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 4, .range = 4); + + nir_def *avail = + nir_load_push_constant(&b, 1, 8, nir_imm_int(&b, 0), .base = 8, .range = 1); + + query_idx = nir_iadd(&b, query_idx, wg_id); + nir_set_query_availability(&b, buf, offset, query_idx, avail); + + return b.shader; +} + +static inline nir_def * +nir_get_occlusion_counter_offset(nir_builder *b, nir_def *query_idx) +{ + nir_def *query_group = nir_udiv_imm(b, query_idx, 16); + nir_def *query_group_offset = nir_umod_imm(b, query_idx, 16); + nir_def *offset = + nir_iadd(b, nir_imul_imm(b, query_group, 1024), + nir_imul_imm(b, query_group_offset, 4)); + return offset; +} + +static inline void +nir_reset_occlusion_counter(nir_builder *b, + nir_def *buf, + nir_def *query_idx) +{ + nir_def *offset = nir_get_occlusion_counter_offset(b, query_idx); + nir_def *zero = nir_imm_int(b, 0); + nir_store_ssbo(b, zero, buf, offset, .write_mask = 0x1, .align_mul = 4); +} + +static inline nir_def * +nir_read_occlusion_counter(nir_builder *b, + nir_def *buf, + nir_def *query_idx) +{ + nir_def *offset = nir_get_occlusion_counter_offset(b, query_idx); + return nir_load_ssbo(b, 1, 32, buf, offset, .access = 0, .align_mul = 4); +} + +static nir_shader * +get_reset_occlusion_query_cs() +{ + const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options(); + nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, + "reset occlusion query cs"); + + nir_def *buf = + nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0), + .desc_set = 0, + .binding = 0, + .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); + + /* This assumes a local size of 1 and a horizontal-only dispatch. If we + * ever change any of these parameters we need to update how we compute the + * query index here. + */ + nir_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b), 0); + + nir_def *avail_offset = + nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4); + + nir_def *base_query_idx = + nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 4, .range = 4); + + nir_def *query_idx = nir_iadd(&b, base_query_idx, wg_id); + + nir_set_query_availability(&b, buf, avail_offset, query_idx, + nir_imm_intN_t(&b, 0, 8)); + nir_reset_occlusion_counter(&b, buf, query_idx); + + return b.shader; +} + +static void +write_query_buffer(nir_builder *b, + nir_def *buf, + nir_def **offset, + nir_def *value, + bool flag_64bit) +{ + if (flag_64bit) { + /* Create a 64-bit value using a vec2 with the .Y component set to 0 + * so we can write a 64-bit value in a single store. 
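+       * The high 32 bits are always zero here: both the occlusion counter
+       * and the availability value fit in 32 bits.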
+       */
+      nir_def *value64 = nir_vec2(b, value, nir_imm_int(b, 0));
+      nir_store_ssbo(b, value64, buf, *offset, .write_mask = 0x3, .align_mul = 8);
+      *offset = nir_iadd_imm(b, *offset, 8);
+   } else {
+      nir_store_ssbo(b, value, buf, *offset, .write_mask = 0x1, .align_mul = 4);
+      *offset = nir_iadd_imm(b, *offset, 4);
+   }
+}
+
+static nir_shader *
+get_copy_query_results_cs(VkQueryResultFlags flags)
+{
+   bool flag_64bit = flags & VK_QUERY_RESULT_64_BIT;
+   bool flag_avail = flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT;
+   bool flag_partial = flags & VK_QUERY_RESULT_PARTIAL_BIT;
+
+   const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
+   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options,
+                                                  "copy query results cs");
+
+   nir_def *buf =
+      nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0),
+                                .desc_set = 0,
+                                .binding = 0,
+                                .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
+
+   nir_def *buf_out =
+      nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0),
+                                .desc_set = 1,
+                                .binding = 0,
+                                .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
+
+   /* Read push constants */
+   nir_def *avail_offset =
+      nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4);
+
+   nir_def *base_query_idx =
+      nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 4, .range = 4);
+
+   nir_def *base_offset_out =
+      nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 8, .range = 4);
+
+   nir_def *stride =
+      nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 12, .range = 4);
+
+   /* This assumes a local size of 1 and a horizontal-only dispatch. If we
+    * ever change any of these parameters we need to update how we compute the
+    * query index here.
+    */
+   nir_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b), 0);
+   nir_def *query_idx = nir_iadd(&b, base_query_idx, wg_id);
+
+   /* Read query availability if needed */
+   nir_def *avail = NULL;
+   if (flag_avail || !flag_partial)
+      avail = nir_get_query_availability(&b, buf, avail_offset, query_idx);
+
+   /* Write occlusion query result... */
+   nir_def *offset =
+      nir_iadd(&b, base_offset_out, nir_imul(&b, wg_id, stride));
+
+   /* ...if partial is requested, we always write */
+   if (flag_partial) {
+      nir_def *query_res = nir_read_occlusion_counter(&b, buf, query_idx);
+      write_query_buffer(&b, buf_out, &offset, query_res, flag_64bit);
+   } else {
+      /* ...otherwise, we only write if the query is available */
+      nir_if *if_stmt = nir_push_if(&b, nir_ine_imm(&b, avail, 0));
+      nir_def *query_res = nir_read_occlusion_counter(&b, buf, query_idx);
+      write_query_buffer(&b, buf_out, &offset, query_res, flag_64bit);
+      nir_pop_if(&b, if_stmt);
+   }
+
+   /* Write query availability */
+   if (flag_avail)
+      write_query_buffer(&b, buf_out, &offset, avail, flag_64bit);
+
+   return b.shader;
+}
+
+static bool
+create_query_pipelines(struct v3dv_device *device)
+{
+   VkResult result;
+   VkPipeline pipeline;
+
+   /* Set layout: single storage buffer */
+   if (!device->queries.buf_descriptor_set_layout) {
+      VkDescriptorSetLayoutBinding descriptor_set_layout_binding = {
+         .binding = 0,
+         .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+         .descriptorCount = 1,
+         .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+      };
+      VkDescriptorSetLayoutCreateInfo descriptor_set_layout_info = {
+         .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
+         .bindingCount = 1,
+         .pBindings = &descriptor_set_layout_binding,
+      };
+      result =
+         v3dv_CreateDescriptorSetLayout(v3dv_device_to_handle(device),
+                                        &descriptor_set_layout_info,
+                                        &device->vk.alloc,
+                                        &device->queries.buf_descriptor_set_layout);
+      if (result != VK_SUCCESS)
+         return false;
+   }
+
+   /* Set availability pipeline.
+    *
+    * Pipeline layout:
+    * - 1 storage buffer for the BO with the query availability.
+    * - Push constants:
+    *      0B: offset of the availability info in the buffer (4 bytes)
+    *      4B: base query index (4 bytes)
+    *      8B: availability (1 byte)
+    */
+   if (!device->queries.avail_pipeline_layout) {
+      VkPipelineLayoutCreateInfo pipeline_layout_info = {
+         .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+         .setLayoutCount = 1,
+         .pSetLayouts = &device->queries.buf_descriptor_set_layout,
+         .pushConstantRangeCount = 1,
+         .pPushConstantRanges =
+            &(VkPushConstantRange) { VK_SHADER_STAGE_COMPUTE_BIT, 0, 9 },
+      };
+
+      result =
+         v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
+                                   &pipeline_layout_info,
+                                   &device->vk.alloc,
+                                   &device->queries.avail_pipeline_layout);
+
+      if (result != VK_SUCCESS)
+         return false;
+   }
+
+   if (!device->queries.avail_pipeline) {
+      nir_shader *set_query_availability_cs_nir = get_set_query_availability_cs();
+      result = v3dv_create_compute_pipeline_from_nir(device,
+                                                     set_query_availability_cs_nir,
+                                                     device->queries.avail_pipeline_layout,
+                                                     &pipeline);
+      ralloc_free(set_query_availability_cs_nir);
+      if (result != VK_SUCCESS)
+         return false;
+
+      device->queries.avail_pipeline = pipeline;
+   }
+
+   /* Reset occlusion query pipeline.
+    *
+    * Pipeline layout:
+    * - 1 storage buffer for the BO with the occlusion and availability data.
+ * - Push constants: + * 0B: offset of the availability info in the buffer (4B) + * 4B: base query index (4B) + */ + if (!device->queries.reset_occlusion_pipeline_layout) { + VkPipelineLayoutCreateInfo pipeline_layout_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .setLayoutCount = 1, + .pSetLayouts = &device->queries.buf_descriptor_set_layout, + .pushConstantRangeCount = 1, + .pPushConstantRanges = + &(VkPushConstantRange) { VK_SHADER_STAGE_COMPUTE_BIT, 0, 8 }, + }; + + result = + v3dv_CreatePipelineLayout(v3dv_device_to_handle(device), + &pipeline_layout_info, + &device->vk.alloc, + &device->queries.reset_occlusion_pipeline_layout); + + if (result != VK_SUCCESS) + return false; + } + + if (!device->queries.reset_occlusion_pipeline) { + nir_shader *reset_occlusion_query_cs_nir = get_reset_occlusion_query_cs(); + result = v3dv_create_compute_pipeline_from_nir( + device, + reset_occlusion_query_cs_nir, + device->queries.reset_occlusion_pipeline_layout, + &pipeline); + ralloc_free(reset_occlusion_query_cs_nir); + if (result != VK_SUCCESS) + return false; + + device->queries.reset_occlusion_pipeline = pipeline; + } + + /* Copy query results pipelines. + * + * Pipeline layout: + * - 1 storage buffer for the BO with the query availability and occlusion. + * - 1 storage buffer for the output. + * - Push constants: + * 0B: offset of the availability info in the buffer (4B) + * 4B: base query index (4B) + * 8B: offset into output buffer (4B) + * 12B: stride (4B) + * + * We create multiple specialized pipelines depending on the copy flags + * to remove conditionals from the copy shader and get more optimized + * pipelines. + */ + if (!device->queries.copy_pipeline_layout) { + VkDescriptorSetLayout set_layouts[2] = { + device->queries.buf_descriptor_set_layout, + device->queries.buf_descriptor_set_layout + }; + VkPipelineLayoutCreateInfo pipeline_layout_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .setLayoutCount = 2, + .pSetLayouts = set_layouts, + .pushConstantRangeCount = 1, + .pPushConstantRanges = + &(VkPushConstantRange) { VK_SHADER_STAGE_COMPUTE_BIT, 0, 16 }, + }; + + result = + v3dv_CreatePipelineLayout(v3dv_device_to_handle(device), + &pipeline_layout_info, + &device->vk.alloc, + &device->queries.copy_pipeline_layout); + + if (result != VK_SUCCESS) + return false; + } + + /* Actual copy pipelines are created lazily on demand since there can be up + * to 8 depending on the flags used, however it is likely that applications + * will use the same flags every time and only one pipeline is required. 
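+    * The index into the copy_pipeline array packs the three flags into
+    * bits 0-2 (see copy_pipeline_index_from_flags): 64_BIT into bit 0,
+    * WITH_AVAILABILITY into bit 1 and PARTIAL into bit 2.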
+ */ + + return true; +} + +static void +destroy_query_pipelines(struct v3dv_device *device) +{ + VkDevice _device = v3dv_device_to_handle(device); + + /* Availability pipeline */ + v3dv_DestroyPipeline(_device, device->queries.avail_pipeline, + &device->vk.alloc); + device->queries.avail_pipeline = VK_NULL_HANDLE; + v3dv_DestroyPipelineLayout(_device, device->queries.avail_pipeline_layout, + &device->vk.alloc); + device->queries.avail_pipeline_layout = VK_NULL_HANDLE; + + /* Reset occlusion pipeline */ + v3dv_DestroyPipeline(_device, device->queries.reset_occlusion_pipeline, + &device->vk.alloc); + device->queries.reset_occlusion_pipeline = VK_NULL_HANDLE; + v3dv_DestroyPipelineLayout(_device, + device->queries.reset_occlusion_pipeline_layout, + &device->vk.alloc); + device->queries.reset_occlusion_pipeline_layout = VK_NULL_HANDLE; + + /* Copy pipelines */ + for (int i = 0; i < 8; i++) { + v3dv_DestroyPipeline(_device, device->queries.copy_pipeline[i], + &device->vk.alloc); + device->queries.copy_pipeline[i] = VK_NULL_HANDLE; + } + v3dv_DestroyPipelineLayout(_device, device->queries.copy_pipeline_layout, + &device->vk.alloc); + device->queries.copy_pipeline_layout = VK_NULL_HANDLE; + + v3dv_DestroyDescriptorSetLayout(_device, + device->queries.buf_descriptor_set_layout, + &device->vk.alloc); + device->queries.buf_descriptor_set_layout = VK_NULL_HANDLE; +} + +/** + * Allocates device resources for implementing certain types of queries. + */ +VkResult +v3dv_query_allocate_resources(struct v3dv_device *device) +{ + if (!create_query_pipelines(device)) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + return VK_SUCCESS; +} + +void +v3dv_query_free_resources(struct v3dv_device *device) +{ + destroy_query_pipelines(device); +} diff --git a/src/broadcom/vulkan/v3dv_queue.c b/src/broadcom/vulkan/v3dv_queue.c index 05343b0a24c..ac981984c4f 100644 --- a/src/broadcom/vulkan/v3dv_queue.c +++ b/src/broadcom/vulkan/v3dv_queue.c @@ -1,5 +1,5 @@ /* - * Copyright © 2019 Raspberry Pi + * Copyright © 2019 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -25,6 +25,9 @@ #include "drm-uapi/v3d_drm.h" #include "broadcom/clif/clif_dump.h" +#include "util/libsync.h" +#include "util/os_time.h" +#include "vk_drm_syncobj.h" #include <errno.h> #include <time.h> @@ -34,16 +37,16 @@ v3dv_clif_dump(struct v3dv_device *device, struct v3dv_job *job, struct drm_v3d_submit_cl *submit) { - if (!(V3D_DEBUG & (V3D_DEBUG_CL | - V3D_DEBUG_CL_NO_BIN | - V3D_DEBUG_CLIF))) + if (!(V3D_DBG(CL) || + V3D_DBG(CL_NO_BIN) || + V3D_DBG(CLIF))) return; struct clif_dump *clif = clif_dump_init(&device->devinfo, stderr, - V3D_DEBUG & (V3D_DEBUG_CL | - V3D_DEBUG_CL_NO_BIN), - V3D_DEBUG & V3D_DEBUG_CL_NO_BIN); + V3D_DBG(CL) || + V3D_DBG(CL_NO_BIN), + V3D_DBG(CL_NO_BIN)); set_foreach(job->bos, entry) { struct v3dv_bo *bo = (void *)entry->key; @@ -67,131 +70,415 @@ v3dv_clif_dump(struct v3dv_device *device, clif_dump_destroy(clif); } -static uint64_t -gettime_ns() +static VkResult +queue_wait_idle(struct v3dv_queue *queue, + struct v3dv_submit_sync_info *sync_info) { - struct timespec current; - clock_gettime(CLOCK_MONOTONIC, ¤t); - return (uint64_t)current.tv_sec * NSEC_PER_SEC + current.tv_nsec; -} + int ret = drmSyncobjWait(queue->device->pdevice->render_fd, + queue->last_job_syncs.syncs, 4, + INT64_MAX, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, + NULL); + if (ret) + return vk_errorf(queue, VK_ERROR_DEVICE_LOST, "syncobj 
wait failed: %m"); -static uint64_t -get_absolute_timeout(uint64_t timeout) -{ - uint64_t current_time = gettime_ns(); - uint64_t max_timeout = (uint64_t) INT64_MAX - current_time; + bool first = true; + for (int i = 0; i < 4; i++) { + if (!queue->last_job_syncs.first[i]) + first = false; + } - timeout = MIN2(max_timeout, timeout); + /* If we're not the first job, that means we're waiting on some + * per-queue-type syncobj which transitively waited on the semaphores + * so we can skip the semaphore wait. + */ + if (first) { + VkResult result = vk_sync_wait_many(&queue->device->vk, + sync_info->wait_count, + sync_info->waits, + VK_SYNC_WAIT_COMPLETE, + UINT64_MAX); + if (result != VK_SUCCESS) + return result; + } - return (current_time + timeout); -} + for (int i = 0; i < 4; i++) + queue->last_job_syncs.first[i] = false; -static VkResult -queue_submit_job(struct v3dv_queue *queue, - struct v3dv_job *job, - bool do_sem_wait, - pthread_t *wait_thread); + return VK_SUCCESS; +} -/* Waits for active CPU wait threads spawned before the current thread to - * complete and submit all their GPU jobs. - */ static void -cpu_queue_wait_idle(struct v3dv_queue *queue) +multisync_free(struct v3dv_device *device, + struct drm_v3d_multi_sync *ms) { - const pthread_t this_thread = pthread_self(); - -retry: - mtx_lock(&queue->mutex); - list_for_each_entry(struct v3dv_queue_submit_wait_info, info, - &queue->submit_wait_list, list_link) { - for (uint32_t i = 0; i < info->wait_thread_count; i++) { - if (info->wait_threads[i].finished) - continue; - - /* Because we are testing this against the list of spawned threads - * it will never match for the main thread, so when we call this from - * the main thread we are effectively waiting for all active threads - * to complete, and otherwise we are only waiting for work submitted - * before the wait thread that called this (a wait thread should never - * be waiting for work submitted after it). - */ - if (info->wait_threads[i].thread == this_thread) - goto done; + vk_free(&device->vk.alloc, (void *)(uintptr_t)ms->out_syncs); + vk_free(&device->vk.alloc, (void *)(uintptr_t)ms->in_syncs); +} - /* Wait and try again */ - mtx_unlock(&queue->mutex); - usleep(500); /* 0.5 ms */ - goto retry; - } +static struct drm_v3d_sem * +set_in_syncs(struct v3dv_queue *queue, + struct v3dv_job *job, + enum v3dv_queue_type queue_sync, + uint32_t *count, + struct vk_sync_wait *waits, + unsigned wait_count, + struct v3dv_submit_sync_info *sync_info) +{ + struct v3dv_device *device = queue->device; + uint32_t n_syncs = 0; + + /* If this is the first job submitted to a given GPU queue in this cmd buf + * batch, it has to wait on wait semaphores (if any) before running. + */ + if (queue->last_job_syncs.first[queue_sync]) + n_syncs = sync_info->wait_count; + + /* If the serialize flag is set the job needs to be serialized in the + * corresponding queues. Notice that we may implement transfer operations + * as both CL or TFU jobs. + * + * FIXME: maybe we could track more precisely if the source of a transfer + * barrier is a CL and/or a TFU job. 
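+    * Until then we are conservative: notice that with the flags below a
+    * transfer barrier makes the job wait on both the last CL and the last
+    * TFU job.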
+ */ + bool sync_csd = job->serialize & V3DV_BARRIER_COMPUTE_BIT; + bool sync_tfu = job->serialize & V3DV_BARRIER_TRANSFER_BIT; + bool sync_cl = job->serialize & (V3DV_BARRIER_GRAPHICS_BIT | + V3DV_BARRIER_TRANSFER_BIT); + bool sync_cpu = job->serialize & V3DV_BARRIER_CPU_BIT; + + *count = n_syncs; + if (sync_cl) + (*count)++; + if (sync_tfu) + (*count)++; + if (sync_csd) + (*count)++; + if (sync_cpu) + (*count)++; + + *count += wait_count; + + if (!*count) + return NULL; + + struct drm_v3d_sem *syncs = + vk_zalloc(&device->vk.alloc, *count * sizeof(struct drm_v3d_sem), + 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + + if (!syncs) + return NULL; + + for (int i = 0; i < n_syncs; i++) { + syncs[i].handle = + vk_sync_as_drm_syncobj(sync_info->waits[i].sync)->syncobj; } -done: - mtx_unlock(&queue->mutex); + for (int i = 0; i < wait_count; i++) { + syncs[n_syncs++].handle = + vk_sync_as_drm_syncobj(waits[i].sync)->syncobj; + } + + if (sync_cl) + syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CL]; + + if (sync_csd) + syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CSD]; + + if (sync_tfu) + syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_TFU]; + + if (sync_cpu) + syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CPU]; + + assert(n_syncs == *count); + return syncs; } -static VkResult -gpu_queue_wait_idle(struct v3dv_queue *queue) +static struct drm_v3d_sem * +set_out_syncs(struct v3dv_queue *queue, + struct v3dv_job *job, + enum v3dv_queue_type queue_sync, + uint32_t *count, + struct v3dv_submit_sync_info *sync_info, + bool signal_syncs) { struct v3dv_device *device = queue->device; - mtx_lock(&device->mutex); - uint32_t last_job_sync = device->last_job_sync; - mtx_unlock(&device->mutex); + uint32_t n_vk_syncs = signal_syncs ? sync_info->signal_count : 0; - int ret = drmSyncobjWait(device->pdevice->render_fd, - &last_job_sync, 1, INT64_MAX, 0, NULL); - if (ret) - return VK_ERROR_DEVICE_LOST; + /* We always signal the syncobj from `device->last_job_syncs` related to + * this v3dv_queue_type to track the last job submitted to this queue. + */ + (*count) = n_vk_syncs + 1; - return VK_SUCCESS; + struct drm_v3d_sem *syncs = + vk_zalloc(&device->vk.alloc, *count * sizeof(struct drm_v3d_sem), + 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + + if (!syncs) + return NULL; + + if (n_vk_syncs) { + for (unsigned i = 0; i < n_vk_syncs; i++) { + syncs[i].handle = + vk_sync_as_drm_syncobj(sync_info->signals[i].sync)->syncobj; + } + } + + syncs[n_vk_syncs].handle = queue->last_job_syncs.syncs[queue_sync]; + + return syncs; } -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_QueueWaitIdle(VkQueue _queue) +static void +set_ext(struct drm_v3d_extension *ext, + struct drm_v3d_extension *next, + uint32_t id, + uintptr_t flags) { - V3DV_FROM_HANDLE(v3dv_queue, queue, _queue); + ext->next = (uintptr_t)(void *)next; + ext->id = id; + ext->flags = flags; +} - /* Check that we don't have any wait threads running in the CPU first, - * as these can spawn new GPU jobs. - */ - cpu_queue_wait_idle(queue); +/* This function sets the extension for multiple in/out syncobjs. When it is + * successful, it sets the extension id to DRM_V3D_EXT_ID_MULTI_SYNC. + * Otherwise, the extension id is 0, which means an out-of-memory error. 
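+ *
+ * Callers are expected to check ms->base.id after this returns and to map
+ * a zero id to VK_ERROR_OUT_OF_HOST_MEMORY before using the submission.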
+ */ +static void +set_multisync(struct drm_v3d_multi_sync *ms, + struct v3dv_submit_sync_info *sync_info, + struct vk_sync_wait *waits, + unsigned wait_count, + struct drm_v3d_extension *next, + struct v3dv_device *device, + struct v3dv_job *job, + enum v3dv_queue_type in_queue_sync, + enum v3dv_queue_type out_queue_sync, + enum v3d_queue wait_stage, + bool signal_syncs) +{ + struct v3dv_queue *queue = &device->queue; + uint32_t out_sync_count = 0, in_sync_count = 0; + struct drm_v3d_sem *out_syncs = NULL, *in_syncs = NULL; - /* Check we don't have any GPU jobs running */ - return gpu_queue_wait_idle(queue); + in_syncs = set_in_syncs(queue, job, in_queue_sync, + &in_sync_count, waits, wait_count, sync_info); + if (!in_syncs && in_sync_count) + goto fail; + + out_syncs = set_out_syncs(queue, job, out_queue_sync, + &out_sync_count, sync_info, signal_syncs); + + assert(out_sync_count > 0); + + if (!out_syncs) + goto fail; + + set_ext(&ms->base, next, DRM_V3D_EXT_ID_MULTI_SYNC, 0); + ms->wait_stage = wait_stage; + ms->out_sync_count = out_sync_count; + ms->out_syncs = (uintptr_t)(void *)out_syncs; + ms->in_sync_count = in_sync_count; + ms->in_syncs = (uintptr_t)(void *)in_syncs; + + return; + +fail: + if (in_syncs) + vk_free(&device->vk.alloc, in_syncs); + assert(!out_syncs); + + return; } static VkResult -handle_reset_query_cpu_job(struct v3dv_job *job) +handle_reset_query_cpu_job(struct v3dv_queue *queue, + struct v3dv_job *job, + struct v3dv_submit_sync_info *sync_info, + bool signal_syncs) { + struct v3dv_device *device = queue->device; struct v3dv_reset_query_cpu_job_info *info = &job->cpu.query_reset; assert(info->pool); - /* We are about to reset query counters so we need to make sure that - * The GPU is not using them. The exception is timestamp queries, since - * we handle those in the CPU. - * - * FIXME: we could avoid blocking the main thread for this if we use - * submission thread. 
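+ /* Occlusion queries are not reset here: those are handled on the GPU
+ * through the dedicated reset_occlusion_pipeline, so this CPU path only
+ * sees timestamp and performance queries.
+ */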
+ assert(info->pool->query_type != VK_QUERY_TYPE_OCCLUSION);
+
+ if (device->pdevice->caps.cpu_queue) {
+ assert(info->first + info->count <= info->pool->query_count);
+
+ struct drm_v3d_submit_cpu submit = {0};
+ struct drm_v3d_multi_sync ms = {0};
+
+ uint32_t *syncs = (uint32_t *) malloc(sizeof(uint32_t) * info->count);
+ uintptr_t *kperfmon_ids = NULL;
+
+ if (info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
+ submit.bo_handle_count = 1;
+ submit.bo_handles = (uintptr_t)(void *)&info->pool->timestamp.bo->handle;
+
+ struct drm_v3d_reset_timestamp_query reset = {0};
+
+ set_ext(&reset.base, NULL, DRM_V3D_EXT_ID_CPU_RESET_TIMESTAMP_QUERY, 0);
+
+ reset.count = info->count;
+ reset.offset = info->pool->queries[info->first].timestamp.offset;
+
+ for (uint32_t i = 0; i < info->count; i++) {
+ struct v3dv_query *query = &info->pool->queries[info->first + i];
+ syncs[i] = vk_sync_as_drm_syncobj(query->timestamp.sync)->syncobj;
+ }
+
+ reset.syncs = (uintptr_t)(void *)syncs;
+
+ set_multisync(&ms, sync_info, NULL, 0, (void *)&reset, device, job,
+ V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
+ if (!ms.base.id)
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ } else {
+ assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
+ struct drm_v3d_reset_performance_query reset = {0};
+
+ set_ext(&reset.base, NULL, DRM_V3D_EXT_ID_CPU_RESET_PERFORMANCE_QUERY, 0);
+
+ struct vk_sync_wait waits[info->count];
+ unsigned wait_count = 0;
+ for (int i = 0; i < info->count; i++) {
+ struct v3dv_query *query = &info->pool->queries[info->first + i];
+ /* Only wait for a query if we've used it; otherwise we will be
+ * waiting forever for the fence to become signaled.
+ */
+ if (query->maybe_available) {
+ waits[wait_count] = (struct vk_sync_wait){
+ .sync = query->perf.last_job_sync
+ };
+ wait_count++;
+ }
+ }
+
+ reset.count = info->count;
+ reset.nperfmons = info->pool->perfmon.nperfmons;
+
+ kperfmon_ids = (uintptr_t *) malloc(sizeof(uintptr_t) * info->count);
+
+ for (uint32_t i = 0; i < info->count; i++) {
+ struct v3dv_query *query = &info->pool->queries[info->first + i];
+
+ syncs[i] = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
+ kperfmon_ids[i] = (uintptr_t)(void *)query->perf.kperfmon_ids;
+ }
+
+ reset.syncs = (uintptr_t)(void *)syncs;
+ reset.kperfmon_ids = (uintptr_t)(void *)kperfmon_ids;
+
+ set_multisync(&ms, sync_info, waits, wait_count, (void *)&reset, device, job,
+ V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
+ if (!ms.base.id)
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ }
+
+ submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
+ submit.extensions = (uintptr_t)(void *)&ms;
+
+ /* From the Vulkan spec for vkCmdResetQueryPool:
+ *
+ * "This command defines an execution dependency between other query commands
+ * that reference the same query.
+ * ...
+ * The second synchronization scope includes all commands which reference the
+ * queries in queryPool indicated by firstQuery and queryCount that occur later
+ * in submission order."
+ *
+ * This means we should ensure that any timestamps after a reset don't execute
+ * before the reset. However, for timestamp queries in particular we don't have
+ * to do anything special because timestamp queries have to wait for all previously
+ * submitted work to complete before executing (which we accomplish by using
+ * V3DV_BARRIER_ALL on them) and that includes reset jobs submitted to the CPU queue.
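+ *
+ * Performance query resets, on the other hand, wait explicitly on each
+ * query's last_job_sync through the wait array built above.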
+ */
+ int ret = v3dv_ioctl(device->pdevice->render_fd,
+ DRM_IOCTL_V3D_SUBMIT_CPU, &submit);
+
+ free(syncs);
+ free(kperfmon_ids);
+ multisync_free(device, &ms);
+
+ queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;
+
+ if (ret)
+ return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");
+
+ return VK_SUCCESS;
+ }
+
+ /* We are about to reset query counters in user-space so we need to make
+ * sure that the GPU is not using them.
 */
- if (info->pool->query_type == VK_QUERY_TYPE_OCCLUSION)
- v3dv_bo_wait(job->device, info->pool->bo, PIPE_TIMEOUT_INFINITE);
-
- for (uint32_t i = info->first; i < info->first + info->count; i++) {
- assert(i < info->pool->query_count);
- struct v3dv_query *q = &info->pool->queries[i];
- q->maybe_available = false;
- switch (info->pool->query_type) {
- case VK_QUERY_TYPE_OCCLUSION: {
- const uint8_t *q_addr = ((uint8_t *) q->bo->map) + q->offset;
- uint32_t *counter = (uint32_t *) q_addr;
- *counter = 0;
- break;
+ if (info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
+ VkResult result = queue_wait_idle(queue, sync_info);
+ if (result != VK_SUCCESS)
+ return result;
+
+ v3dv_bo_wait(job->device, info->pool->timestamp.bo, OS_TIMEOUT_INFINITE);
+ }
+
+ if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+ struct vk_sync_wait waits[info->count];
+ unsigned wait_count = 0;
+ for (int i = 0; i < info->count; i++) {
+ struct v3dv_query *query = &info->pool->queries[info->first + i];
+ /* Only wait for a query if we've used it; otherwise we will be
+ * waiting forever for the fence to become signaled.
+ */
+ if (query->maybe_available) {
+ waits[wait_count] = (struct vk_sync_wait){
+ .sync = query->perf.last_job_sync
+ };
+ wait_count++;
+ }
 }
- case VK_QUERY_TYPE_TIMESTAMP:
- q->value = 0;
- break;
- default:
- unreachable("Unsupported query type");
+
+ VkResult result = vk_sync_wait_many(&job->device->vk, wait_count, waits,
+ VK_SYNC_WAIT_COMPLETE, UINT64_MAX);
+
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
+ v3dv_reset_query_pool_cpu(job->device, info->pool, info->first, info->count);
+
+ return VK_SUCCESS;
+}
+
+static VkResult
+export_perfmon_last_job_sync(struct v3dv_queue *queue, struct v3dv_job *job, int *fd)
+{
+ int err;
+ static const enum v3dv_queue_type queues_to_sync[] = {
+ V3DV_QUEUE_CL,
+ V3DV_QUEUE_CSD,
+ };
+
+ for (uint32_t i = 0; i < ARRAY_SIZE(queues_to_sync); i++) {
+ enum v3dv_queue_type queue_type = queues_to_sync[i];
+ int tmp_fd = -1;
+
+ err = drmSyncobjExportSyncFile(job->device->pdevice->render_fd,
+ queue->last_job_syncs.syncs[queue_type],
+ &tmp_fd);
+
+ if (err) {
+ close(*fd);
+ return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
+ "sync file export failed: %m");
+ }
+
+ err = sync_accumulate("v3dv", fd, tmp_fd);
+
+ if (err) {
+ close(tmp_fd);
+ close(*fd);
+ return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
+ "failed to accumulate sync files: %m");
 }
 }

@@ -199,36 +486,200 @@ handle_reset_query_cpu_job(struct v3dv_job *job)
 }

 static VkResult
-handle_end_query_cpu_job(struct v3dv_job *job)
+handle_end_query_cpu_job(struct v3dv_job *job, uint32_t counter_pass_idx)
 {
- struct v3dv_end_query_cpu_job_info *info = &job->cpu.query_end;
+ VkResult result = VK_SUCCESS;
+
+ mtx_lock(&job->device->query_mutex);
+
+ struct v3dv_end_query_info *info = &job->cpu.query_end;
+ struct v3dv_queue *queue = &job->device->queue;
+
+ int err = 0;
+ int fd = -1;
+
+ assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
+
+ if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
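+ /* Accumulate the last CL and CSD job syncs into a single sync file so
+ * it can be imported into each query's last_job_sync below.
+ */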
+ result = export_perfmon_last_job_sync(queue, job, &fd); + + if (result != VK_SUCCESS) + goto fail; + + assert(fd >= 0); + } + for (uint32_t i = 0; i < info->count; i++) { assert(info->query + i < info->pool->query_count); struct v3dv_query *query = &info->pool->queries[info->query + i]; + + if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { + uint32_t syncobj = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj; + err = drmSyncobjImportSyncFile(job->device->pdevice->render_fd, + syncobj, fd); + + if (err) { + result = vk_errorf(queue, VK_ERROR_UNKNOWN, + "sync file import failed: %m"); + goto fail; + } + } + query->maybe_available = true; } - return VK_SUCCESS; +fail: + if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) + close(fd); + + cnd_broadcast(&job->device->query_ended); + mtx_unlock(&job->device->query_mutex); + + return result; } static VkResult -handle_copy_query_results_cpu_job(struct v3dv_job *job) +handle_copy_query_results_cpu_job(struct v3dv_queue *queue, + struct v3dv_job *job, + struct v3dv_submit_sync_info *sync_info, + bool signal_syncs) { + struct v3dv_device *device = queue->device; struct v3dv_copy_query_results_cpu_job_info *info = &job->cpu.query_copy_results; + assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR || + info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP); + assert(info->dst && info->dst->mem && info->dst->mem->bo); struct v3dv_bo *bo = info->dst->mem->bo; + if (device->pdevice->caps.cpu_queue) { + struct drm_v3d_submit_cpu submit = {0}; + struct drm_v3d_multi_sync ms = {0}; + + uint32_t *offsets = (uint32_t *) malloc(sizeof(uint32_t) * info->count); + uint32_t *syncs = (uint32_t *) malloc(sizeof(uint32_t) * info->count); + uint32_t *bo_handles = NULL; + uintptr_t *kperfmon_ids = NULL; + + if (info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP) { + submit.bo_handle_count = 2; + + bo_handles = (uint32_t *) + malloc(sizeof(uint32_t) * submit.bo_handle_count); + + bo_handles[0] = bo->handle; + bo_handles[1] = info->pool->timestamp.bo->handle; + submit.bo_handles = (uintptr_t)(void *)bo_handles; + + struct drm_v3d_copy_timestamp_query copy = {0}; + + set_ext(©.base, NULL, DRM_V3D_EXT_ID_CPU_COPY_TIMESTAMP_QUERY, 0); + + copy.do_64bit = info->flags & VK_QUERY_RESULT_64_BIT; + copy.do_partial = info->flags & VK_QUERY_RESULT_PARTIAL_BIT; + copy.availability_bit = info->flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT; + copy.offset = info->offset + info->dst->mem_offset; + copy.stride = info->stride; + copy.count = info->count; + + for (uint32_t i = 0; i < info->count; i++) { + assert(info->first < info->pool->query_count); + assert(info->first + info->count <= info->pool->query_count); + struct v3dv_query *query = &info->pool->queries[info->first + i]; + + offsets[i] = query->timestamp.offset; + syncs[i] = vk_sync_as_drm_syncobj(query->timestamp.sync)->syncobj; + } + + copy.offsets = (uintptr_t)(void *)offsets; + copy.syncs = (uintptr_t)(void *)syncs; + + set_multisync(&ms, sync_info, NULL, 0, (void *)©, device, job, + V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs); + if (!ms.base.id) + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + } else { + assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR); + + submit.bo_handle_count = 1; + submit.bo_handles = (uintptr_t)(void *)&bo->handle; + + struct drm_v3d_copy_performance_query copy = {0}; + + set_ext(©.base, NULL, DRM_V3D_EXT_ID_CPU_COPY_PERFORMANCE_QUERY, 0); + + /* If the queryPool was created with 
VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR, + * results for each query are written as an array of the type indicated + * by VkPerformanceCounterKHR::storage for the counter being queried. + * For v3dv, VkPerformanceCounterKHR::storage is + * VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR. + */ + copy.do_64bit = true; + copy.do_partial = info->flags & VK_QUERY_RESULT_PARTIAL_BIT; + copy.availability_bit = info->flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT; + copy.offset = info->offset + info->dst->mem_offset; + copy.stride = info->stride; + copy.count = info->count; + copy.nperfmons = info->pool->perfmon.nperfmons; + copy.ncounters = info->pool->perfmon.ncounters; + + kperfmon_ids = (uintptr_t *) malloc(sizeof(uintptr_t) * info->count); + + struct vk_sync_wait waits[info->count]; + unsigned wait_count = 0; + + for (uint32_t i = 0; i < info->count; i++) { + assert(info->first < info->pool->query_count); + assert(info->first + info->count <= info->pool->query_count); + struct v3dv_query *query = &info->pool->queries[info->first + i]; + + syncs[i] = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj; + kperfmon_ids[i] = (uintptr_t)(void *)query->perf.kperfmon_ids; + + if (info->flags & VK_QUERY_RESULT_WAIT_BIT) { + waits[wait_count] = (struct vk_sync_wait){ + .sync = query->perf.last_job_sync + }; + wait_count++; + } + } + + copy.syncs = (uintptr_t)(void *)syncs; + copy.kperfmon_ids = (uintptr_t)(void *)kperfmon_ids; + + set_multisync(&ms, sync_info, waits, wait_count, (void *)©, device, job, + V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs); + if (!ms.base.id) + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + submit.flags |= DRM_V3D_SUBMIT_EXTENSION; + submit.extensions = (uintptr_t)(void *)&ms; + + int ret = v3dv_ioctl(device->pdevice->render_fd, + DRM_IOCTL_V3D_SUBMIT_CPU, &submit); + + free(kperfmon_ids); + free(bo_handles); + free(offsets); + free(syncs); + multisync_free(device, &ms); + + queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false; + + if (ret) + return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m"); + + return VK_SUCCESS; + } + /* Map the entire dst buffer for the CPU copy if needed */ assert(!bo->map || bo->map_size == bo->size); if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size)) - return vk_error(job->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY); - /* FIXME: if flags includes VK_QUERY_RESULT_WAIT_BIT this could trigger a - * sync wait on the CPU for the corresponding GPU jobs to finish. We might - * want to use a submission thread to avoid blocking on the main thread. - */ uint8_t *offset = ((uint8_t *) bo->map) + info->offset + info->dst->mem_offset; v3dv_get_query_pool_results_cpu(job->device, @@ -243,344 +694,213 @@ handle_copy_query_results_cpu_job(struct v3dv_job *job) } static VkResult -handle_set_event_cpu_job(struct v3dv_job *job, bool is_wait_thread) +handle_timestamp_query_cpu_job(struct v3dv_queue *queue, + struct v3dv_job *job, + struct v3dv_submit_sync_info *sync_info, + bool signal_syncs) { - /* From the Vulkan 1.0 spec: - * - * "When vkCmdSetEvent is submitted to a queue, it defines an execution - * dependency on commands that were submitted before it, and defines an - * event signal operation which sets the event to the signaled state. - * The first synchronization scope includes every command previously - * submitted to the same queue, including those in the same command - * buffer and batch". 
- * - * So we should wait for all prior work to be completed before signaling - * the event, this includes all active CPU wait threads spawned for any - * command buffer submitted *before* this. - * - * FIXME: we could avoid blocking the main thread for this if we use a - * submission thread. - */ + struct v3dv_device *device = queue->device; - /* If we are calling this from a wait thread it will only wait - * wait threads sspawned before it, otherwise it will wait for - * all active threads to complete. - */ - cpu_queue_wait_idle(&job->device->queue); + assert(job->type == V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY); + struct v3dv_timestamp_query_cpu_job_info *info = &job->cpu.query_timestamp; - VkResult result = gpu_queue_wait_idle(&job->device->queue); - if (result != VK_SUCCESS) - return result; + if (!device->pdevice->caps.cpu_queue) { + /* Wait for completion of all work queued before the timestamp query */ + VkResult result = queue_wait_idle(queue, sync_info); + if (result != VK_SUCCESS) + return result; - struct v3dv_event_set_cpu_job_info *info = &job->cpu.event_set; - p_atomic_set(&info->event->state, info->state); + mtx_lock(&job->device->query_mutex); - return VK_SUCCESS; -} + /* Compute timestamp */ + struct timespec t; + clock_gettime(CLOCK_MONOTONIC, &t); -static bool -check_wait_events_complete(struct v3dv_job *job) -{ - assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS); + for (uint32_t i = 0; i < info->count; i++) { + assert(info->query + i < info->pool->query_count); + struct v3dv_query *query = &info->pool->queries[info->query + i]; + query->maybe_available = true; - struct v3dv_event_wait_cpu_job_info *info = &job->cpu.event_wait; - for (uint32_t i = 0; i < info->event_count; i++) { - if (!p_atomic_read(&info->events[i]->state)) - return false; - } - return true; -} + /* Value */ + uint8_t *value_addr = + ((uint8_t *) info->pool->timestamp.bo->map) + query->timestamp.offset; + *((uint64_t*)value_addr) = (i == 0) ? t.tv_sec * 1000000000ull + t.tv_nsec : 0ull; -static void -wait_thread_finish(struct v3dv_queue *queue, pthread_t thread) -{ - mtx_lock(&queue->mutex); - list_for_each_entry(struct v3dv_queue_submit_wait_info, info, - &queue->submit_wait_list, list_link) { - for (uint32_t i = 0; i < info->wait_thread_count; i++) { - if (info->wait_threads[i].thread == thread) { - info->wait_threads[i].finished = true; - goto done; - } + /* Availability */ + result = vk_sync_signal(&job->device->vk, query->timestamp.sync, 0); } - } - unreachable(!"Failed to finish wait thread: not found"); + cnd_broadcast(&job->device->query_ended); + mtx_unlock(&job->device->query_mutex); -done: - mtx_unlock(&queue->mutex); -} - -static void * -event_wait_thread_func(void *_job) -{ - struct v3dv_job *job = (struct v3dv_job *) _job; - assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS); - struct v3dv_event_wait_cpu_job_info *info = &job->cpu.event_wait; - - /* Wait for events to be signaled */ - const useconds_t wait_interval_ms = 1; - while (!check_wait_events_complete(job)) - usleep(wait_interval_ms * 1000); - - /* Now continue submitting pending jobs for the same command buffer after - * the wait job. - */ - struct v3dv_queue *queue = &job->device->queue; - list_for_each_entry_from(struct v3dv_job, pjob, job->list_link.next, - &job->cmd_buffer->jobs, list_link) { - /* We don't want to spawn more than one wait thread per command buffer. - * If this job also requires a wait for events, we will do the wait here. 
- */ - VkResult result = queue_submit_job(queue, pjob, info->sem_wait, NULL); - if (result == VK_NOT_READY) { - while (!check_wait_events_complete(pjob)) { - usleep(wait_interval_ms * 1000); - } - result = VK_SUCCESS; - } - - if (result != VK_SUCCESS) { - fprintf(stderr, "Wait thread job execution failed.\n"); - goto done; - } + return result; } -done: - wait_thread_finish(queue, pthread_self()); - return NULL; -} + struct drm_v3d_submit_cpu submit = {0}; -static VkResult -spawn_event_wait_thread(struct v3dv_job *job, pthread_t *wait_thread) + submit.bo_handle_count = 1; + submit.bo_handles = (uintptr_t)(void *)&info->pool->timestamp.bo->handle; -{ - assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS); - assert(job->cmd_buffer); - assert(wait_thread != NULL); + struct drm_v3d_timestamp_query timestamp = {0}; - if (pthread_create(wait_thread, NULL, event_wait_thread_func, job)) - return vk_error(job->device->instance, VK_ERROR_DEVICE_LOST); + set_ext(×tamp.base, NULL, DRM_V3D_EXT_ID_CPU_TIMESTAMP_QUERY, 0); - return VK_NOT_READY; -} + timestamp.count = info->count; -static VkResult -handle_wait_events_cpu_job(struct v3dv_job *job, - bool sem_wait, - pthread_t *wait_thread) -{ - assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS); - struct v3dv_event_wait_cpu_job_info *info = &job->cpu.event_wait; + uint32_t *offsets = + (uint32_t *) malloc(sizeof(uint32_t) * info->count); + uint32_t *syncs = + (uint32_t *) malloc(sizeof(uint32_t) * info->count); - /* If all events are signaled then we are done and can continue submitting - * the rest of the command buffer normally. - */ - if (check_wait_events_complete(job)) - return VK_SUCCESS; + for (uint32_t i = 0; i < info->count; i++) { + assert(info->query + i < info->pool->query_count); + struct v3dv_query *query = &info->pool->queries[info->query + i]; + query->maybe_available = true; - /* Otherwise, we put the rest of the command buffer on a wait thread until - * all events are signaled. We only spawn a new thread on the first - * wait job we see for a command buffer, any additional wait jobs in the - * same command buffer will run in that same wait thread and will get here - * with a NULL wait_thread pointer. - * - * Also, whether we spawn a wait thread or not, we always return - * VK_NOT_READY (unless an error happened), so we stop trying to submit - * any jobs in the same command buffer after the wait job. The wait thread - * will attempt to submit them after the wait completes. - */ - info->sem_wait = sem_wait; - if (wait_thread) - return spawn_event_wait_thread(job, wait_thread); - else - return VK_NOT_READY; -} + offsets[i] = query->timestamp.offset; + syncs[i] = vk_sync_as_drm_syncobj(query->timestamp.sync)->syncobj; + } -static VkResult -handle_copy_buffer_to_image_cpu_job(struct v3dv_job *job) -{ - assert(job->type == V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE); - struct v3dv_copy_buffer_to_image_cpu_job_info *info = - &job->cpu.copy_buffer_to_image; + timestamp.offsets = (uintptr_t)(void *)offsets; + timestamp.syncs = (uintptr_t)(void *)syncs; - /* Wait for all GPU work to finish first, since we may be accessing - * the BOs involved in the operation. 
+ struct drm_v3d_multi_sync ms = {0}; + + /* The CPU job should be serialized so it only executes after all previously + * submitted work has completed */ - v3dv_QueueWaitIdle(v3dv_queue_to_handle(&job->device->queue)); - - /* Map BOs */ - struct v3dv_bo *dst_bo = info->image->mem->bo; - assert(!dst_bo->map || dst_bo->map_size == dst_bo->size); - if (!dst_bo->map && !v3dv_bo_map(job->device, dst_bo, dst_bo->size)) - return vk_error(job->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - void *dst_ptr = dst_bo->map; - - struct v3dv_bo *src_bo = info->buffer->mem->bo; - assert(!src_bo->map || src_bo->map_size == src_bo->size); - if (!src_bo->map && !v3dv_bo_map(job->device, src_bo, src_bo->size)) - return vk_error(job->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - void *src_ptr = src_bo->map; - - const struct v3d_resource_slice *slice = - &info->image->slices[info->mip_level]; - - const struct pipe_box box = { - info->image_offset.x, info->image_offset.y, info->base_layer, - info->image_extent.width, info->image_extent.height, info->layer_count, - }; + job->serialize = V3DV_BARRIER_ALL; - /* Copy each layer */ - for (uint32_t i = 0; i < info->layer_count; i++) { - const uint32_t dst_offset = - v3dv_layer_offset(info->image, info->mip_level, info->base_layer + i); - const uint32_t src_offset = - info->buffer->mem_offset + info->buffer_offset + - info->buffer_layer_stride * i; - v3d_store_tiled_image( - dst_ptr + dst_offset, slice->stride, - src_ptr + src_offset, info->buffer_stride, - slice->tiling, info->image->cpp, slice->padded_height, &box); - } + set_multisync(&ms, sync_info, NULL, 0, (void *)×tamp, device, job, + V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs); + if (!ms.base.id) + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - return VK_SUCCESS; -} + submit.flags |= DRM_V3D_SUBMIT_EXTENSION; + submit.extensions = (uintptr_t)(void *)&ms; -static VkResult -handle_timestamp_query_cpu_job(struct v3dv_job *job) -{ - assert(job->type == V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY); - struct v3dv_timestamp_query_cpu_job_info *info = &job->cpu.query_timestamp; + int ret = v3dv_ioctl(device->pdevice->render_fd, + DRM_IOCTL_V3D_SUBMIT_CPU, &submit); - /* Wait for completion of all work queued before the timestamp query */ - v3dv_QueueWaitIdle(v3dv_queue_to_handle(&job->device->queue)); + free(offsets); + free(syncs); + multisync_free(device, &ms); - /* Compute timestamp */ - struct timespec t; - clock_gettime(CLOCK_MONOTONIC, &t); + queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false; - for (uint32_t i = 0; i < info->count; i++) { - assert(info->query + i < info->pool->query_count); - struct v3dv_query *query = &info->pool->queries[info->query + i]; - query->maybe_available = true; - if (i == 0) - query->value = t.tv_sec * 1000000000ull + t.tv_nsec; - } + if (ret) + return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m"); return VK_SUCCESS; } static VkResult -handle_csd_job(struct v3dv_queue *queue, - struct v3dv_job *job, - bool do_sem_wait); - -static VkResult handle_csd_indirect_cpu_job(struct v3dv_queue *queue, struct v3dv_job *job, - bool do_sem_wait) + struct v3dv_submit_sync_info *sync_info, + bool signal_syncs) { + struct v3dv_device *device = queue->device; + assert(job->type == V3DV_JOB_TYPE_CPU_CSD_INDIRECT); struct v3dv_csd_indirect_cpu_job_info *info = &job->cpu.csd_indirect; assert(info->csd_job); - /* Make sure the GPU is no longer using the indirect buffer*/ - assert(info->buffer && info->buffer->mem && info->buffer->mem->bo); - 
v3dv_bo_wait(queue->device, info->buffer->mem->bo, PIPE_TIMEOUT_INFINITE);
-
- /* Map the indirect buffer and read the dispatch parameters */
 assert(info->buffer && info->buffer->mem && info->buffer->mem->bo);
 struct v3dv_bo *bo = info->buffer->mem->bo;
- if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size))
- return vk_error(job->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
- assert(bo->map);
- const uint32_t offset = info->buffer->mem_offset + info->offset;
- const uint32_t *group_counts = (uint32_t *) (bo->map + offset);
- if (group_counts[0] == 0 || group_counts[1] == 0|| group_counts[2] == 0)
- return VK_SUCCESS;
+ if (!device->pdevice->caps.cpu_queue) {
+ /* Make sure the GPU is no longer using the indirect buffer */
+ v3dv_bo_wait(queue->device, bo, OS_TIMEOUT_INFINITE);

- if (memcmp(group_counts, info->csd_job->csd.wg_count,
- sizeof(info->csd_job->csd.wg_count)) != 0) {
- v3dv_cmd_buffer_rewrite_indirect_csd_job(info, group_counts);
- }
+ /* Map the indirect buffer and read the dispatch parameters */
+ if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size))
+ return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ assert(bo->map);

- handle_csd_job(queue, info->csd_job, do_sem_wait);
+ const uint32_t offset = info->buffer->mem_offset + info->offset;
+ const uint32_t *group_counts = (uint32_t *) (bo->map + offset);
+ if (group_counts[0] == 0 || group_counts[1] == 0 || group_counts[2] == 0)
+ return VK_SUCCESS;

- return VK_SUCCESS;
-}
+ if (memcmp(group_counts, info->csd_job->csd.wg_count,
+ sizeof(info->csd_job->csd.wg_count)) != 0) {
+ v3dv_cmd_buffer_rewrite_indirect_csd_job(queue->device, info, group_counts);
+ }

-static VkResult
-process_semaphores_to_signal(struct v3dv_device *device,
- uint32_t count, const VkSemaphore *sems)
-{
- if (count == 0) return VK_SUCCESS;
+ return VK_SUCCESS;
+ }

- int render_fd = device->pdevice->render_fd;
+ struct v3dv_job *csd_job = info->csd_job;

- int fd;
- mtx_lock(&device->mutex);
- drmSyncobjExportSyncFile(render_fd, device->last_job_sync, &fd);
- mtx_unlock(&device->mutex);
- if (fd == -1)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ struct drm_v3d_submit_cpu submit = {0};

- VkResult result = VK_SUCCESS;
- for (uint32_t i = 0; i < count; i++) {
- struct v3dv_semaphore *sem = v3dv_semaphore_from_handle(sems[i]);
-
- int ret;
- if (!sem->temp_sync)
- ret = drmSyncobjImportSyncFile(render_fd, sem->sync, fd);
- else
- ret = drmSyncobjImportSyncFile(render_fd, sem->temp_sync, fd);
-
- if (ret) {
- result = VK_ERROR_OUT_OF_HOST_MEMORY;
- break;
- }
+ submit.bo_handle_count = 1;
+ submit.bo_handles = (uintptr_t)(void *)&bo->handle;
+
+ csd_job->csd.submit.bo_handle_count = csd_job->bo_count;
+ uint32_t *bo_handles = (uint32_t *) malloc(sizeof(uint32_t) * csd_job->bo_count);
+ uint32_t bo_idx = 0;
+ set_foreach (csd_job->bos, entry) {
+ struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
+ bo_handles[bo_idx++] = bo->handle;
 }
+ csd_job->csd.submit.bo_handles = (uintptr_t)(void *)bo_handles;

- assert(fd >= 0);
- close(fd);
+ struct drm_v3d_indirect_csd indirect = {0};

- return result;
-}
+ set_ext(&indirect.base, NULL, DRM_V3D_EXT_ID_CPU_INDIRECT_CSD, 0);

-static VkResult
-process_fence_to_signal(struct v3dv_device *device, VkFence _fence)
-{
- if (_fence == VK_NULL_HANDLE)
- return VK_SUCCESS;
+ indirect.submit = csd_job->csd.submit;
+ indirect.offset = info->buffer->mem_offset + info->offset;
+ indirect.wg_size = info->wg_size;
+
+ for (int i = 0; i < 3; i++) {
+ if (info->wg_uniform_offsets[i]) {
+ assert(info->wg_uniform_offsets[i] >= 
(uint32_t *) csd_job->indirect.base);
+ indirect.wg_uniform_offsets[i] = info->wg_uniform_offsets[i] - (uint32_t *) csd_job->indirect.base;
+ } else {
+ indirect.wg_uniform_offsets[i] = 0xffffffff; /* No rewrite */
+ }
+ }

- struct v3dv_fence *fence = v3dv_fence_from_handle(_fence);
+ indirect.indirect = csd_job->indirect.bo->handle;

- int render_fd = device->pdevice->render_fd;
+ struct drm_v3d_multi_sync ms = {0};

- int fd;
- mtx_lock(&device->mutex);
- drmSyncobjExportSyncFile(render_fd, device->last_job_sync, &fd);
- mtx_unlock(&device->mutex);
- if (fd == -1)
+ /* We need to configure the semaphores of this job with the indirect
+ * CSD job, as the CPU job must obey the CSD job's synchronization
+ * demands, such as barriers.
+ */
+ set_multisync(&ms, sync_info, NULL, 0, (void *)&indirect, device, csd_job,
+ V3DV_QUEUE_CPU, V3DV_QUEUE_CSD, V3D_CPU, signal_syncs);
+ if (!ms.base.id)
 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

- int ret;
- if (!fence->temp_sync)
- ret = drmSyncobjImportSyncFile(render_fd, fence->sync, fd);
- else
- ret = drmSyncobjImportSyncFile(render_fd, fence->temp_sync, fd);
+ submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
+ submit.extensions = (uintptr_t)(void *)&ms;
+
+ int ret = v3dv_ioctl(device->pdevice->render_fd,
+ DRM_IOCTL_V3D_SUBMIT_CPU, &submit);
+
+ free(bo_handles);
+ multisync_free(device, &ms);

- assert(fd >= 0);
- close(fd);
+ queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;
+ queue->last_job_syncs.first[V3DV_QUEUE_CSD] = false;

- return ret ? VK_ERROR_OUT_OF_HOST_MEMORY : VK_SUCCESS;
+ if (ret)
+ return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");
+
+ return VK_SUCCESS;
 }

 static VkResult
 handle_cl_job(struct v3dv_queue *queue,
 struct v3dv_job *job,
- bool do_sem_wait)
+ uint32_t counter_pass_idx,
+ struct v3dv_submit_sync_info *sync_info,
+ bool signal_syncs)
 {
 struct v3dv_device *device = queue->device;
@@ -599,7 +919,8 @@ handle_cl_job(struct v3dv_queue *queue,
 struct v3dv_bo *bcl_fist_bo =
 list_first_entry(&job->bcl.bo_list, struct v3dv_bo, list_link);
 submit.bcl_start = bcl_fist_bo->offset;
- submit.bcl_end = job->bcl.bo->offset + v3dv_cl_offset(&job->bcl);
+ submit.bcl_end = job->suspending ? job->suspended_bcl_end :
+ job->bcl.bo->offset + v3dv_cl_offset(&job->bcl);
 submit.rcl_start = job->rcl.bo->offset;
 submit.rcl_end = job->rcl.bo->offset + v3dv_cl_offset(&job->rcl);
@@ -611,6 +932,17 @@ handle_cl_job(struct v3dv_queue *queue,
 if (job->tmu_dirty_rcl)
 submit.flags |= DRM_V3D_SUBMIT_CL_FLUSH_CACHE;
+ /* If the job uses VK_KHR_buffer_device_address we need to ensure all
+ * buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT
+ * are included.
+ */
+ if (job->uses_buffer_device_address) {
+ util_dynarray_foreach(&queue->device->device_address_bo_list,
+ struct v3dv_bo *, bo) {
+ v3dv_job_add_bo(job, *bo);
+ }
+ }
+
 submit.bo_handle_count = job->bo_count;
 uint32_t *bo_handles = (uint32_t *)
 malloc(sizeof(uint32_t) * submit.bo_handle_count);
@@ -622,34 +954,64 @@ handle_cl_job(struct v3dv_queue *queue,
 assert(bo_idx == submit.bo_handle_count);
 submit.bo_handles = (uintptr_t)(void *)bo_handles;
- /* We need a binning sync if we are waiting on a sempahore (do_sem_wait) or
- * if the job comes after a pipeline barrier than involves geometry stages
- * (needs_bcl_sync).
+ submit.perfmon_id = job->perf ? 
+ job->perf->kperfmon_ids[counter_pass_idx] : 0; + const bool needs_perf_sync = queue->last_perfmon_id != submit.perfmon_id; + queue->last_perfmon_id = submit.perfmon_id; + + /* We need a binning sync if we are the first CL job waiting on a semaphore + * with a wait stage that involves the geometry pipeline, or if the job + * comes after a pipeline barrier that involves geometry stages + * (needs_bcl_sync) or when performance queries are in use. * * We need a render sync if the job doesn't need a binning sync but has * still been flagged for serialization. It should be noted that RCL jobs * don't start until the previous RCL job has finished so we don't really * need to add a fence for those, however, we might need to wait on a CSD or * TFU job, which are not automatically serialized with CL jobs. - * - * FIXME: for now, if we are asked to wait on any semaphores, we just wait - * on the last job we submitted. In the future we might want to pass the - * actual syncobj of the wait semaphores so we don't block on the last RCL - * if we only need to wait for a previous CSD or TFU, for example, but - * we would have to extend our kernel interface to support the case where - * we have more than one semaphore to wait on. */ - const bool needs_bcl_sync = do_sem_wait || job->needs_bcl_sync; - const bool needs_rcl_sync = job->serialize && !needs_bcl_sync; + bool needs_bcl_sync = job->needs_bcl_sync || needs_perf_sync; + if (queue->last_job_syncs.first[V3DV_QUEUE_CL]) { + for (int i = 0; !needs_bcl_sync && i < sync_info->wait_count; i++) { + needs_bcl_sync = sync_info->waits[i].stage_mask & + (VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT | + VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT | + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT | + VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | + VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT | + VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT | + VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT | + VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT | + VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT | + VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT | + VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT | + VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT); + } + } + + bool needs_rcl_sync = job->serialize && !needs_bcl_sync; + + /* Replace single semaphore settings whenever our kernel-driver supports + * multiple semaphores extension. + */ + struct drm_v3d_multi_sync ms = { 0 }; + enum v3d_queue wait_stage = needs_rcl_sync ? V3D_RENDER : V3D_BIN; + set_multisync(&ms, sync_info, NULL, 0, NULL, device, job, + V3DV_QUEUE_CL, V3DV_QUEUE_CL, wait_stage, signal_syncs); + if (!ms.base.id) + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + + submit.flags |= DRM_V3D_SUBMIT_EXTENSION; + submit.extensions = (uintptr_t)(void *)&ms; + + /* We are using multisync so disable legacy single-sync interface */ + submit.in_sync_rcl = 0; + submit.in_sync_bcl = 0; + submit.out_sync = 0; - mtx_lock(&queue->device->mutex); - submit.in_sync_bcl = needs_bcl_sync ? device->last_job_sync : 0; - submit.in_sync_rcl = needs_rcl_sync ? 
device->last_job_sync : 0; - submit.out_sync = device->last_job_sync; v3dv_clif_dump(device, job, &submit); int ret = v3dv_ioctl(device->pdevice->render_fd, DRM_IOCTL_V3D_SUBMIT_CL, &submit); - mtx_unlock(&queue->device->mutex); static bool warned = false; if (ret && !warned) { @@ -659,9 +1021,12 @@ handle_cl_job(struct v3dv_queue *queue, } free(bo_handles); + multisync_free(device, &ms); + + queue->last_job_syncs.first[V3DV_QUEUE_CL] = false; if (ret) - return vk_error(device->instance, VK_ERROR_DEVICE_LOST); + return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CL failed: %m"); return VK_SUCCESS; } @@ -669,23 +1034,37 @@ handle_cl_job(struct v3dv_queue *queue, static VkResult handle_tfu_job(struct v3dv_queue *queue, struct v3dv_job *job, - bool do_sem_wait) + struct v3dv_submit_sync_info *sync_info, + bool signal_syncs) { + assert(!V3D_DBG(DISABLE_TFU)); + struct v3dv_device *device = queue->device; - const bool needs_sync = do_sem_wait || job->serialize; + /* Replace single semaphore settings whenever our kernel-driver supports + * multiple semaphore extension. + */ + struct drm_v3d_multi_sync ms = { 0 }; + set_multisync(&ms, sync_info, NULL, 0, NULL, device, job, + V3DV_QUEUE_TFU, V3DV_QUEUE_TFU, V3D_TFU, signal_syncs); + if (!ms.base.id) + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + + job->tfu.flags |= DRM_V3D_SUBMIT_EXTENSION; + job->tfu.extensions = (uintptr_t)(void *)&ms; + + /* We are using multisync so disable legacy single-sync interface */ + job->tfu.in_sync = 0; + job->tfu.out_sync = 0; - mtx_lock(&device->mutex); - job->tfu.in_sync = needs_sync ? device->last_job_sync : 0; - job->tfu.out_sync = device->last_job_sync; int ret = v3dv_ioctl(device->pdevice->render_fd, DRM_IOCTL_V3D_SUBMIT_TFU, &job->tfu); - mtx_unlock(&device->mutex); - if (ret != 0) { - fprintf(stderr, "Failed to submit TFU job: %d\n", ret); - return vk_error(device->instance, VK_ERROR_DEVICE_LOST); - } + multisync_free(device, &ms); + queue->last_job_syncs.first[V3DV_QUEUE_TFU] = false; + + if (ret != 0) + return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_TFU failed: %m"); return VK_SUCCESS; } @@ -693,12 +1072,25 @@ handle_tfu_job(struct v3dv_queue *queue, static VkResult handle_csd_job(struct v3dv_queue *queue, struct v3dv_job *job, - bool do_sem_wait) + uint32_t counter_pass_idx, + struct v3dv_submit_sync_info *sync_info, + bool signal_syncs) { struct v3dv_device *device = queue->device; struct drm_v3d_submit_csd *submit = &job->csd.submit; + /* If the job uses VK_KHR_buffer_device_address we need to ensure all + * buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT + * are included. + */ + if (job->uses_buffer_device_address) { + util_dynarray_foreach(&queue->device->device_address_bo_list, + struct v3dv_bo *, bo) { + v3dv_job_add_bo(job, *bo); + } + } + submit->bo_handle_count = job->bo_count; uint32_t *bo_handles = (uint32_t *) malloc(sizeof(uint32_t) * MAX2(4, submit->bo_handle_count * 2)); @@ -710,14 +1102,28 @@ handle_csd_job(struct v3dv_queue *queue, assert(bo_idx == submit->bo_handle_count); submit->bo_handles = (uintptr_t)(void *)bo_handles; - const bool needs_sync = do_sem_wait || job->serialize; + /* Replace single semaphore settings whenever our kernel-driver supports + * multiple semaphore extension. 
+ */ + struct drm_v3d_multi_sync ms = { 0 }; + set_multisync(&ms, sync_info, NULL, 0, NULL, device, job, + V3DV_QUEUE_CSD, V3DV_QUEUE_CSD, V3D_CSD, signal_syncs); + if (!ms.base.id) + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + + submit->flags |= DRM_V3D_SUBMIT_EXTENSION; + submit->extensions = (uintptr_t)(void *)&ms; + + /* We are using multisync so disable legacy single-sync interface */ + submit->in_sync = 0; + submit->out_sync = 0; + + submit->perfmon_id = job->perf ? + job->perf->kperfmon_ids[counter_pass_idx] : 0; + queue->last_perfmon_id = submit->perfmon_id; - mtx_lock(&queue->device->mutex); - submit->in_sync = needs_sync ? device->last_job_sync : 0; - submit->out_sync = device->last_job_sync; int ret = v3dv_ioctl(device->pdevice->render_fd, DRM_IOCTL_V3D_SUBMIT_CSD, submit); - mtx_unlock(&queue->device->mutex); static bool warned = false; if (ret && !warned) { @@ -728,43 +1134,39 @@ handle_csd_job(struct v3dv_queue *queue, free(bo_handles); + multisync_free(device, &ms); + queue->last_job_syncs.first[V3DV_QUEUE_CSD] = false; + if (ret) - return vk_error(device->instance, VK_ERROR_DEVICE_LOST); + return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CSD failed: %m"); return VK_SUCCESS; } static VkResult -queue_submit_job(struct v3dv_queue *queue, +queue_handle_job(struct v3dv_queue *queue, struct v3dv_job *job, - bool do_sem_wait, - pthread_t *wait_thread) + uint32_t counter_pass_idx, + struct v3dv_submit_sync_info *sync_info, + bool signal_syncs) { - assert(job); - switch (job->type) { case V3DV_JOB_TYPE_GPU_CL: - return handle_cl_job(queue, job, do_sem_wait); + return handle_cl_job(queue, job, counter_pass_idx, sync_info, signal_syncs); case V3DV_JOB_TYPE_GPU_TFU: - return handle_tfu_job(queue, job, do_sem_wait); + return handle_tfu_job(queue, job, sync_info, signal_syncs); case V3DV_JOB_TYPE_GPU_CSD: - return handle_csd_job(queue, job, do_sem_wait); + return handle_csd_job(queue, job, counter_pass_idx, sync_info, signal_syncs); case V3DV_JOB_TYPE_CPU_RESET_QUERIES: - return handle_reset_query_cpu_job(job); + return handle_reset_query_cpu_job(queue, job, sync_info, signal_syncs); case V3DV_JOB_TYPE_CPU_END_QUERY: - return handle_end_query_cpu_job(job); + return handle_end_query_cpu_job(job, counter_pass_idx); case V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS: - return handle_copy_query_results_cpu_job(job); - case V3DV_JOB_TYPE_CPU_SET_EVENT: - return handle_set_event_cpu_job(job, wait_thread != NULL); - case V3DV_JOB_TYPE_CPU_WAIT_EVENTS: - return handle_wait_events_cpu_job(job, do_sem_wait, wait_thread); - case V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE: - return handle_copy_buffer_to_image_cpu_job(job); + return handle_copy_query_results_cpu_job(queue, job, sync_info, signal_syncs); case V3DV_JOB_TYPE_CPU_CSD_INDIRECT: - return handle_csd_indirect_cpu_job(queue, job, do_sem_wait); + return handle_csd_indirect_cpu_job(queue, job, sync_info, signal_syncs); case V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY: - return handle_timestamp_query_cpu_job(job); + return handle_timestamp_query_cpu_job(queue, job, sync_info, signal_syncs); default: unreachable("Unhandled job type"); } @@ -777,772 +1179,128 @@ queue_create_noop_job(struct v3dv_queue *queue) queue->noop_job = vk_zalloc(&device->vk.alloc, sizeof(struct v3dv_job), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!queue->noop_job) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); v3dv_job_init(queue->noop_job, V3DV_JOB_TYPE_GPU_CL, device, NULL, -1); v3dv_X(device, 
job_emit_noop)(queue->noop_job);

+ /* We use no-op jobs to signal semaphores/fences. These jobs need to be
+ * serialized across all hw queues to comply with Vulkan's signal operation
+ * order requirements, which basically require that signal operations occur
+ * in submission order.
+ */
+ queue->noop_job->serialize = V3DV_BARRIER_ALL;
+
 return VK_SUCCESS;
 }

 static VkResult
-queue_submit_noop_job(struct v3dv_queue *queue, const VkSubmitInfo *pSubmit)
+queue_submit_noop_job(struct v3dv_queue *queue,
+ uint32_t counter_pass_idx,
+ struct v3dv_submit_sync_info *sync_info,
+ bool signal_syncs)
 {
- /* VkQueue host access is externally synchronized so we don't need to lock
- * here for the static variable.
- */
 if (!queue->noop_job) {
 VkResult result = queue_create_noop_job(queue);
 if (result != VK_SUCCESS)
 return result;
 }

- return queue_submit_job(queue, queue->noop_job,
- pSubmit->waitSemaphoreCount > 0, NULL);
-}
-
-static VkResult
-queue_submit_cmd_buffer(struct v3dv_queue *queue,
- struct v3dv_cmd_buffer *cmd_buffer,
- const VkSubmitInfo *pSubmit,
- pthread_t *wait_thread)
-{
- assert(cmd_buffer);
- assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_EXECUTABLE);
-
- if (list_is_empty(&cmd_buffer->jobs))
- return queue_submit_noop_job(queue, pSubmit);
-
- list_for_each_entry_safe(struct v3dv_job, job,
- &cmd_buffer->jobs, list_link) {
- VkResult result = queue_submit_job(queue, job,
- pSubmit->waitSemaphoreCount > 0,
- wait_thread);
- if (result != VK_SUCCESS)
- return result;
- }
-
- return VK_SUCCESS;
-}
-
-static void
-add_wait_thread_to_list(struct v3dv_device *device,
- pthread_t thread,
- struct v3dv_queue_submit_wait_info **wait_info)
-{
- /* If this is the first time we spawn a wait thread for this queue
- * submission create a v3dv_queue_submit_wait_info to track this and
- * any other threads in the same submission and add it to the global list
- * in the queue.
- */
- if (*wait_info == NULL) {
- *wait_info =
- vk_zalloc(&device->vk.alloc, sizeof(struct v3dv_queue_submit_wait_info), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
- (*wait_info)->device = device;
- }
-
- /* And add the thread to the list of wait threads for this submission */
- const uint32_t thread_idx = (*wait_info)->wait_thread_count;
- assert(thread_idx < 16);
- (*wait_info)->wait_threads[thread_idx].thread = thread;
- (*wait_info)->wait_threads[thread_idx].finished = false;
- (*wait_info)->wait_thread_count++;
-}
-
-static void
-add_signal_semaphores_to_wait_list(struct v3dv_device *device,
- const VkSubmitInfo *pSubmit,
- struct v3dv_queue_submit_wait_info *wait_info)
-{
- assert(wait_info);
-
- if (pSubmit->signalSemaphoreCount == 0)
- return;
-
- /* FIXME: We put all the semaphores in a list and we signal all of them
- * together from the submit master thread when the last wait thread in the
- * submit completes. We could do better though: group the semaphores per
- * submit and signal them as soon as all wait threads for a particular
- * submit completes. Not sure if the extra work would be worth it though,
- * since we only spawn waith threads for event waits and only when the
- * event if set from the host after the queue submission. 
- */ - - /* Check the size of the current semaphore list */ - const uint32_t prev_count = wait_info->signal_semaphore_count; - const uint32_t prev_alloc_size = prev_count * sizeof(VkSemaphore); - VkSemaphore *prev_list = wait_info->signal_semaphores; - - /* Resize the list to hold the additional semaphores */ - const uint32_t extra_alloc_size = - pSubmit->signalSemaphoreCount * sizeof(VkSemaphore); - wait_info->signal_semaphore_count += pSubmit->signalSemaphoreCount; - wait_info->signal_semaphores = - vk_alloc(&device->vk.alloc, prev_alloc_size + extra_alloc_size, 8, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - - /* Copy the old list to the new allocation and free the old list */ - if (prev_count > 0) { - memcpy(wait_info->signal_semaphores, prev_list, prev_alloc_size); - vk_free(&device->vk.alloc, prev_list); - } - - /* Add the new semaphores to the list */ - memcpy(wait_info->signal_semaphores + prev_count, - pSubmit->pSignalSemaphores, extra_alloc_size); -} - -static VkResult -queue_submit_cmd_buffer_batch(struct v3dv_queue *queue, - const VkSubmitInfo *pSubmit, - struct v3dv_queue_submit_wait_info **wait_info) -{ - VkResult result = VK_SUCCESS; - bool has_wait_threads = false; - - /* Even if we don't have any actual work to submit we still need to wait - * on the wait semaphores and signal the signal semaphores and fence, so - * in this scenario we just submit a trivial no-op job so we don't have - * to do anything special, it should not be a common case anyway. - */ - if (pSubmit->commandBufferCount == 0) { - result = queue_submit_noop_job(queue, pSubmit); - } else { - for (uint32_t i = 0; i < pSubmit->commandBufferCount; i++) { - pthread_t wait_thread; - struct v3dv_cmd_buffer *cmd_buffer = - v3dv_cmd_buffer_from_handle(pSubmit->pCommandBuffers[i]); - result = queue_submit_cmd_buffer(queue, cmd_buffer, pSubmit, - &wait_thread); - - /* We get VK_NOT_READY if we had to spawn a wait thread for the - * command buffer. In that scenario, we want to continue submitting - * any pending command buffers in the batch, but we don't want to - * process any signal semaphores for the batch until we know we have - * submitted every job for every command buffer in the batch. - */ - if (result == VK_NOT_READY) { - result = VK_SUCCESS; - add_wait_thread_to_list(queue->device, wait_thread, wait_info); - has_wait_threads = true; - } - - if (result != VK_SUCCESS) - break; - } - } - - if (result != VK_SUCCESS) - return result; - - /* If had to emit any wait threads in this submit we need to wait for all - * of them to complete before we can signal any semaphores. 
- */ - if (!has_wait_threads) { - return process_semaphores_to_signal(queue->device, - pSubmit->signalSemaphoreCount, - pSubmit->pSignalSemaphores); - } else { - assert(*wait_info); - add_signal_semaphores_to_wait_list(queue->device, pSubmit, *wait_info); - return VK_NOT_READY; - } + assert(queue->noop_job); + return queue_handle_job(queue, queue->noop_job, counter_pass_idx, + sync_info, signal_syncs); } -static void * -master_wait_thread_func(void *_wait_info) +VkResult +v3dv_queue_driver_submit(struct vk_queue *vk_queue, + struct vk_queue_submit *submit) { - struct v3dv_queue_submit_wait_info *wait_info = - (struct v3dv_queue_submit_wait_info *) _wait_info; - - struct v3dv_queue *queue = &wait_info->device->queue; - - /* Wait for all command buffer wait threads to complete */ - for (uint32_t i = 0; i < wait_info->wait_thread_count; i++) { - int res = pthread_join(wait_info->wait_threads[i].thread, NULL); - if (res != 0) - fprintf(stderr, "Wait thread failed to join.\n"); - } - - /* Signal semaphores and fences */ + struct v3dv_queue *queue = container_of(vk_queue, struct v3dv_queue, vk); VkResult result; - result = process_semaphores_to_signal(wait_info->device, - wait_info->signal_semaphore_count, - wait_info->signal_semaphores); - if (result != VK_SUCCESS) - fprintf(stderr, "Wait thread semaphore signaling failed."); - - result = process_fence_to_signal(wait_info->device, wait_info->fence); - if (result != VK_SUCCESS) - fprintf(stderr, "Wait thread fence signaling failed."); - - /* Release wait_info */ - mtx_lock(&queue->mutex); - list_del(&wait_info->list_link); - mtx_unlock(&queue->mutex); - - vk_free(&wait_info->device->vk.alloc, wait_info->signal_semaphores); - vk_free(&wait_info->device->vk.alloc, wait_info); - - return NULL; -} - - -static VkResult -spawn_master_wait_thread(struct v3dv_queue *queue, - struct v3dv_queue_submit_wait_info *wait_info) - -{ - VkResult result = VK_SUCCESS; - - mtx_lock(&queue->mutex); - if (pthread_create(&wait_info->master_wait_thread, NULL, - master_wait_thread_func, wait_info)) { - result = vk_error(queue->device->instance, VK_ERROR_DEVICE_LOST); - goto done; - } - - list_addtail(&wait_info->list_link, &queue->submit_wait_list); - -done: - mtx_unlock(&queue->mutex); - return result; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_QueueSubmit(VkQueue _queue, - uint32_t submitCount, - const VkSubmitInfo* pSubmits, - VkFence fence) -{ - V3DV_FROM_HANDLE(v3dv_queue, queue, _queue); - - struct v3dv_queue_submit_wait_info *wait_info = NULL; - - VkResult result = VK_SUCCESS; - for (uint32_t i = 0; i < submitCount; i++) { - result = queue_submit_cmd_buffer_batch(queue, &pSubmits[i], &wait_info); - if (result != VK_SUCCESS && result != VK_NOT_READY) - goto done; - } - - if (!wait_info) { - assert(result != VK_NOT_READY); - result = process_fence_to_signal(queue->device, fence); - goto done; - } - - /* We emitted wait threads, so we have to spwan a master thread for this - * queue submission that waits for all other threads to complete and then - * will signal any semaphores and fences. 
- */ - assert(wait_info); - wait_info->fence = fence; - result = spawn_master_wait_thread(queue, wait_info); - -done: - return result; -} - -static void -destroy_syncobj(uint32_t device_fd, uint32_t *sync) -{ - assert(sync); - drmSyncobjDestroy(device_fd, *sync); - *sync = 0; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_CreateSemaphore(VkDevice _device, - const VkSemaphoreCreateInfo *pCreateInfo, - const VkAllocationCallbacks *pAllocator, - VkSemaphore *pSemaphore) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - - assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO); - - struct v3dv_semaphore *sem = - vk_object_zalloc(&device->vk, pAllocator, sizeof(struct v3dv_semaphore), - VK_OBJECT_TYPE_SEMAPHORE); - if (sem == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - - int ret = drmSyncobjCreate(device->pdevice->render_fd, 0, &sem->sync); - if (ret) { - vk_object_free(&device->vk, pAllocator, sem); - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - } - - *pSemaphore = v3dv_semaphore_to_handle(sem); - return VK_SUCCESS; -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_GetPhysicalDeviceExternalSemaphoreProperties( - VkPhysicalDevice physicalDevice, - const VkPhysicalDeviceExternalSemaphoreInfo *pExternalSemaphoreInfo, - VkExternalSemaphoreProperties *pExternalSemaphoreProperties) -{ - switch (pExternalSemaphoreInfo->handleType) { - case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT: - case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT: - pExternalSemaphoreProperties->exportFromImportedHandleTypes = - VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT | - VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT; - pExternalSemaphoreProperties->compatibleHandleTypes = - VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT | - VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT; - - /* FIXME: we can't import external semaphores until we improve the kernel - * submit interface to handle multiple in syncobjs, because once we have - * an imported semaphore in our list of semaphores to wait on, we can no - * longer use the workaround of waiting on the last syncobj fence produced - * from the device, since the imported semaphore may not (and in fact, it - * would typically not) have been produced from same device. - * - * This behavior is exercised via dEQP-VK.synchronization.cross_instance.*. - * Particularly, this test: - * dEQP-VK.synchronization.cross_instance.dedicated. - * write_ssbo_compute_read_vertex_input.buffer_16384_binary_semaphore_fd - * fails consistently because of this, so it'll be a good reference to - * verify the implementation when the kernel bits are in place. - */ - pExternalSemaphoreProperties->externalSemaphoreFeatures = 0; - - /* FIXME: See comment in GetPhysicalDeviceExternalFenceProperties - * for details on why we can't export to SYNC_FD. 
- */ - if (pExternalSemaphoreInfo->handleType != - VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT) { - pExternalSemaphoreProperties->externalSemaphoreFeatures |= - VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT; - } - break; - default: - pExternalSemaphoreProperties->exportFromImportedHandleTypes = 0; - pExternalSemaphoreProperties->compatibleHandleTypes = 0; - pExternalSemaphoreProperties->externalSemaphoreFeatures = 0; - break; - } -} + struct v3dv_submit_sync_info sync_info = { + .wait_count = submit->wait_count, + .waits = submit->waits, + .signal_count = submit->signal_count, + .signals = submit->signals, + }; -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_ImportSemaphoreFdKHR( - VkDevice _device, - const VkImportSemaphoreFdInfoKHR *pImportSemaphoreFdInfo) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_semaphore, sem, pImportSemaphoreFdInfo->semaphore); - - assert(pImportSemaphoreFdInfo->sType == - VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_FD_INFO_KHR); - - int fd = pImportSemaphoreFdInfo->fd; - int render_fd = device->pdevice->render_fd; - - bool is_temporary = - pImportSemaphoreFdInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT || - (pImportSemaphoreFdInfo->flags & VK_SEMAPHORE_IMPORT_TEMPORARY_BIT); - - uint32_t new_sync; - switch (pImportSemaphoreFdInfo->handleType) { - case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT: { - /* "If handleType is VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT, the - * special value -1 for fd is treated like a valid sync file descriptor - * referring to an object that has already signaled. The import - * operation will succeed and the VkSemaphore will have a temporarily - * imported payload as if a valid file descriptor had been provided." - */ - unsigned flags = fd == -1 ? DRM_SYNCOBJ_CREATE_SIGNALED : 0; - if (drmSyncobjCreate(render_fd, flags, &new_sync)) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - - if (fd != -1) { - if (drmSyncobjImportSyncFile(render_fd, new_sync, fd)) { - drmSyncobjDestroy(render_fd, new_sync); - return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE); + for (int i = 0; i < V3DV_QUEUE_COUNT; i++) + queue->last_job_syncs.first[i] = true; + + struct v3dv_job *first_suspend_job = NULL; + struct v3dv_job *current_suspend_job = NULL; + for (uint32_t i = 0; i < submit->command_buffer_count; i++) { + struct v3dv_cmd_buffer *cmd_buffer = + container_of(submit->command_buffers[i], struct v3dv_cmd_buffer, vk); + list_for_each_entry_safe(struct v3dv_job, job, + &cmd_buffer->jobs, list_link) { + if (job->suspending) { + job = v3dv_X(job->device, + cmd_buffer_prepare_suspend_job_for_submit)(job); + if (!job) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; } - } - break; - } - case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT: { - if (drmSyncobjFDToHandle(render_fd, fd, &new_sync)) - return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE); - break; - } - default: - return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE); - } - destroy_syncobj(render_fd, &sem->temp_sync); - if (is_temporary) { - sem->temp_sync = new_sync; - } else { - destroy_syncobj(render_fd, &sem->sync); - sem->sync = new_sync; - } - - /* From the Vulkan 1.0.53 spec: - * - * "Importing a semaphore payload from a file descriptor transfers - * ownership of the file descriptor from the application to the - * Vulkan implementation. The application must not perform any - * operations on the file descriptor after a successful import." 
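
The SYNC_FD import path quoted here condenses to one helper. A sketch under the same spec rule (fd == -1 means an already-signaled payload); import_sync_fd is a hypothetical name.

#include <stdint.h>
#include <unistd.h>
#include <xf86drm.h>

static int
import_sync_fd(int render_fd, int fd, uint32_t *out_sync)
{
   /* fd == -1 must behave like a valid, already-signaled sync file. */
   uint32_t flags = (fd == -1) ? DRM_SYNCOBJ_CREATE_SIGNALED : 0;

   if (drmSyncobjCreate(render_fd, flags, out_sync))
      return -1;

   if (fd != -1) {
      if (drmSyncobjImportSyncFile(render_fd, *out_sync, fd)) {
         drmSyncobjDestroy(render_fd, *out_sync);
         return -1; /* import failed: leave the fd open for the caller */
      }
      /* On success, ownership of the fd transfers to the implementation. */
      close(fd);
   }
   return 0;
}
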
- * - * If the import fails, we leave the file descriptor open. - */ - if (fd != -1) - close(fd); - - return VK_SUCCESS; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetSemaphoreFdKHR(VkDevice _device, - const VkSemaphoreGetFdInfoKHR *pGetFdInfo, - int *pFd) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_semaphore, sem, pGetFdInfo->semaphore); - - assert(pGetFdInfo->sType == VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR); - - *pFd = -1; - int render_fd = device->pdevice->render_fd; - switch (pGetFdInfo->handleType) { - case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT: { - drmSyncobjExportSyncFile(render_fd, sem->sync, pFd); - if (*pFd == -1) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - break; - case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT: - drmSyncobjHandleToFD(render_fd, sem->sync, pFd); - if (*pFd == -1) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - break; - } - default: - unreachable("Unsupported external semaphore handle type"); - } - - return VK_SUCCESS; -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_DestroySemaphore(VkDevice _device, - VkSemaphore semaphore, - const VkAllocationCallbacks *pAllocator) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_semaphore, sem, semaphore); - - if (sem == NULL) - return; - - destroy_syncobj(device->pdevice->render_fd, &sem->sync); - destroy_syncobj(device->pdevice->render_fd, &sem->temp_sync); - - vk_object_free(&device->vk, pAllocator, sem); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_CreateFence(VkDevice _device, - const VkFenceCreateInfo *pCreateInfo, - const VkAllocationCallbacks *pAllocator, - VkFence *pFence) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - - assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_FENCE_CREATE_INFO); - - struct v3dv_fence *fence = - vk_object_zalloc(&device->vk, pAllocator, sizeof(struct v3dv_fence), - VK_OBJECT_TYPE_FENCE); - if (fence == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - - unsigned flags = 0; - if (pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT) - flags |= DRM_SYNCOBJ_CREATE_SIGNALED; - int ret = drmSyncobjCreate(device->pdevice->render_fd, flags, &fence->sync); - if (ret) { - vk_object_free(&device->vk, pAllocator, fence); - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - } - - *pFence = v3dv_fence_to_handle(fence); + if (job->suspending && !job->resuming) { + assert(!first_suspend_job); + assert(!current_suspend_job); + first_suspend_job = job; + } - return VK_SUCCESS; -} + if (job->resuming) { + assert(first_suspend_job); + assert(current_suspend_job); + v3dv_X(job->device, job_patch_resume_address)(first_suspend_job, + current_suspend_job, + job); + current_suspend_job = NULL; + } -VKAPI_ATTR void VKAPI_CALL -v3dv_GetPhysicalDeviceExternalFenceProperties( - VkPhysicalDevice physicalDevice, - const VkPhysicalDeviceExternalFenceInfo *pExternalFenceInfo, - VkExternalFenceProperties *pExternalFenceProperties) + if (job->suspending) { + current_suspend_job = job; + } else { + assert(!current_suspend_job); + struct v3dv_job *submit_job = first_suspend_job ? 
+ first_suspend_job : job; + result = + queue_handle_job(queue, submit_job, submit->perf_pass_index, + &sync_info, false); -{ - switch (pExternalFenceInfo->handleType) { - case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT: - case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT: - pExternalFenceProperties->exportFromImportedHandleTypes = - VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT | - VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT; - pExternalFenceProperties->compatibleHandleTypes = - VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT | - VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT; - pExternalFenceProperties->externalFenceFeatures = - VK_EXTERNAL_FENCE_FEATURE_IMPORTABLE_BIT; - - /* FIXME: SYNC_FD exports the actual fence referenced by the syncobj, not - * the syncobj itself, and that fence is only created after we have - * submitted to the kernel and updated the syncobj for the fence to import - * the actual DRM fence created with the submission. Unfortunately, if the - * queue submission has a 'wait for events' we may hold any jobs after the - * wait in a user-space thread until the events are signaled, and in that - * case we don't update the out fence of the submit until the events are - * signaled and we can submit all the jobs involved with the vkQueueSubmit - * call. This means that if the applications submits with an out fence and - * a wait for events, trying to export the out fence to a SYNC_FD rigth - * after the submission and before the events are signaled will fail, - * because the actual DRM fence won't exist yet. This is not a problem - * with OPAQUE_FD because in this case we export the entire syncobj, not - * the underlying DRM fence. To fix this we need to rework our kernel - * interface to be more flexible and accept multiple in/out syncobjs so - * we can implement event waits as regular fence waits on the kernel side, - * until then, we can only reliably export OPAQUE_FD. - */ - if (pExternalFenceInfo->handleType != - VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT) { - pExternalFenceProperties->externalFenceFeatures |= - VK_EXTERNAL_FENCE_FEATURE_EXPORTABLE_BIT; - } - break; - default: - pExternalFenceProperties->exportFromImportedHandleTypes = 0; - pExternalFenceProperties->compatibleHandleTypes = 0; - pExternalFenceProperties->externalFenceFeatures = 0; - break; - } -} + if (result != VK_SUCCESS) + return result; -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_ImportFenceFdKHR(VkDevice _device, - const VkImportFenceFdInfoKHR *pImportFenceFdInfo) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_fence, fence, pImportFenceFdInfo->fence); - - assert(pImportFenceFdInfo->sType == - VK_STRUCTURE_TYPE_IMPORT_FENCE_FD_INFO_KHR); - - int fd = pImportFenceFdInfo->fd; - int render_fd = device->pdevice->render_fd; - - bool is_temporary = - pImportFenceFdInfo->handleType == VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT || - (pImportFenceFdInfo->flags & VK_FENCE_IMPORT_TEMPORARY_BIT); - - uint32_t new_sync; - switch (pImportFenceFdInfo->handleType) { - case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT: { - /* "If handleType is VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT, the - * special value -1 for fd is treated like a valid sync file descriptor - * referring to an object that has already signaled. The import - * operation will succeed and the VkFence will have a temporarily - * imported payload as if a valid file descriptor had been provided." - */ - unsigned flags = fd == -1 ? 
DRM_SYNCOBJ_CREATE_SIGNALED : 0; - if (drmSyncobjCreate(render_fd, flags, &new_sync)) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - - if (fd != -1) { - if (drmSyncobjImportSyncFile(render_fd, new_sync, fd)) { - drmSyncobjDestroy(render_fd, new_sync); - return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE); + first_suspend_job = NULL; } } - break; - } - case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT: { - if (drmSyncobjFDToHandle(render_fd, fd, &new_sync)) - return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE); - break; - } - default: - return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE); - } - - destroy_syncobj(render_fd, &fence->temp_sync); - if (is_temporary) { - fence->temp_sync = new_sync; - } else { - destroy_syncobj(render_fd, &fence->sync); - fence->sync = new_sync; - } - - /* From the Vulkan 1.0.53 spec: - * - * "Importing a fence payload from a file descriptor transfers - * ownership of the file descriptor from the application to the - * Vulkan implementation. The application must not perform any - * operations on the file descriptor after a successful import." - * - * If the import fails, we leave the file descriptor open. - */ - if (fd != -1) - close(fd); - - return VK_SUCCESS; -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_DestroyFence(VkDevice _device, - VkFence _fence, - const VkAllocationCallbacks *pAllocator) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_fence, fence, _fence); - - if (fence == NULL) - return; - - destroy_syncobj(device->pdevice->render_fd, &fence->sync); - destroy_syncobj(device->pdevice->render_fd, &fence->temp_sync); - vk_object_free(&device->vk, pAllocator, fence); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetFenceStatus(VkDevice _device, VkFence _fence) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_fence, fence, _fence); - - int ret = drmSyncobjWait(device->pdevice->render_fd, &fence->sync, 1, - 0, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT, NULL); - if (ret == -ETIME) - return VK_NOT_READY; - else if (ret) - return vk_error(device->instance, VK_ERROR_DEVICE_LOST); - return VK_SUCCESS; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetFenceFdKHR(VkDevice _device, - const VkFenceGetFdInfoKHR *pGetFdInfo, - int *pFd) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_fence, fence, pGetFdInfo->fence); - - assert(pGetFdInfo->sType == VK_STRUCTURE_TYPE_FENCE_GET_FD_INFO_KHR); - - *pFd = -1; - int render_fd = device->pdevice->render_fd; - switch (pGetFdInfo->handleType) { - case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT: { - drmSyncobjExportSyncFile(render_fd, fence->sync, pFd); - if (*pFd == -1) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - break; - case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT: - drmSyncobjHandleToFD(render_fd, fence->sync, pFd); - if (*pFd == -1) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - break; - } - default: - unreachable("Unsupported external fence handle type"); - } - - return VK_SUCCESS; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_ResetFences(VkDevice _device, uint32_t fenceCount, const VkFence *pFences) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - - uint32_t *syncobjs = vk_alloc(&device->vk.alloc, - sizeof(*syncobjs) * fenceCount, 8, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (!syncobjs) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - - int render_fd = device->pdevice->render_fd; - uint32_t reset_count = 0; - for 
(uint32_t i = 0; i < fenceCount; i++) { - struct v3dv_fence *fence = v3dv_fence_from_handle(pFences[i]); - /* From the Vulkan spec, section 'Importing Fence Payloads': + /* If the command buffer ends with a barrier we need to consume it now. * - * "If the import is temporary, the fence will be restored to its - * permanent state the next time that fence is passed to - * vkResetFences. - * - * Note: Restoring a fence to its prior permanent payload is a - * distinct operation from resetting a fence payload." - * - * To restore the previous state, we just need to destroy the temporary. + * FIXME: this will drain all hw queues. Instead, we could use the pending + * barrier state to limit the queues we serialize against. */ - if (fence->temp_sync) - destroy_syncobj(render_fd, &fence->temp_sync); - else - syncobjs[reset_count++] = fence->sync; + if (cmd_buffer->state.barrier.dst_mask) { + result = queue_submit_noop_job(queue, submit->perf_pass_index, + &sync_info, false); + if (result != VK_SUCCESS) + return result; + } } - int ret = 0; - if (reset_count > 0) - ret = drmSyncobjReset(render_fd, syncobjs, reset_count); + assert(!first_suspend_job); + assert(!current_suspend_job); - vk_free(&device->vk.alloc, syncobjs); - - if (ret) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - return VK_SUCCESS; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_WaitForFences(VkDevice _device, - uint32_t fenceCount, - const VkFence *pFences, - VkBool32 waitAll, - uint64_t timeout) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - - const uint64_t abs_timeout = get_absolute_timeout(timeout); - - uint32_t *syncobjs = vk_alloc(&device->vk.alloc, - sizeof(*syncobjs) * fenceCount, 8, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (!syncobjs) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - - for (uint32_t i = 0; i < fenceCount; i++) { - struct v3dv_fence *fence = v3dv_fence_from_handle(pFences[i]); - syncobjs[i] = fence->temp_sync ? fence->temp_sync : fence->sync; + /* Handle signaling now */ + if (submit->signal_count > 0) { + /* Finish by submitting a no-op job that synchronizes across all queues. + * This will ensure that the signal semaphores don't get triggered until + * all work on any queue completes. See Vulkan's signal operation order + * requirements. 
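
For reference, the removed fence-wait loop boils down to retrying drmSyncobjWait until an absolute deadline passes. A self-contained sketch, assuming the deadline is expressed on the kernel's monotonic clock:

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <time.h>
#include <xf86drm.h>

static int64_t
now_ns(void)
{
   struct timespec ts;
   clock_gettime(CLOCK_MONOTONIC, &ts);
   return (int64_t)ts.tv_sec * 1000000000 + ts.tv_nsec;
}

/* Wait on a set of syncobj-backed fences until an absolute deadline,
 * retrying on -ETIME the way the removed vkWaitForFences loop did.
 */
static int
wait_fences(int render_fd, uint32_t *syncobjs, uint32_t count,
            int64_t abs_timeout_ns, bool wait_all)
{
   uint32_t flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT;
   if (wait_all)
      flags |= DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL;

   int ret;
   do {
      ret = drmSyncobjWait(render_fd, syncobjs, count,
                           abs_timeout_ns, flags, NULL);
   } while (ret == -ETIME && now_ns() < abs_timeout_ns);

   return ret; /* 0 on success, -ETIME on timeout */
}
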
+ */ + return queue_submit_noop_job(queue, submit->perf_pass_index, + &sync_info, true); } - unsigned flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT; - if (waitAll) - flags |= DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL; - - int ret; - do { - ret = drmSyncobjWait(device->pdevice->render_fd, syncobjs, fenceCount, - timeout, flags, NULL); - } while (ret == -ETIME && gettime_ns() < abs_timeout); - - vk_free(&device->vk.alloc, syncobjs); - - if (ret == -ETIME) - return VK_TIMEOUT; - else if (ret) - return vk_error(device->instance, VK_ERROR_DEVICE_LOST); return VK_SUCCESS; } @@ -1553,5 +1311,5 @@ v3dv_QueueBindSparse(VkQueue _queue, VkFence fence) { V3DV_FROM_HANDLE(v3dv_queue, queue, _queue); - return vk_error(queue->device->instance, VK_ERROR_FEATURE_NOT_PRESENT); + return vk_error(queue, VK_ERROR_FEATURE_NOT_PRESENT); } diff --git a/src/broadcom/vulkan/v3dv_uniforms.c b/src/broadcom/vulkan/v3dv_uniforms.c index 47bc3a0b17c..eab8c0f0840 100644 --- a/src/broadcom/vulkan/v3dv_uniforms.c +++ b/src/broadcom/vulkan/v3dv_uniforms.c @@ -1,5 +1,5 @@ /* - * Copyright © 2019 Raspberry Pi + * Copyright © 2019 Raspberry Pi Ltd * * Based in part on v3d driver which is: * @@ -26,16 +26,6 @@ */ #include "v3dv_private.h" -#include "vk_format_info.h" - -/* The only version specific structure that we need is - * TMU_CONFIG_PARAMETER_1. This didn't seem to change significantly from - * previous V3D versions and we don't expect that to change, so for now let's - * just hardcode the V3D version here. - */ -#define V3D_VERSION 41 -#include "broadcom/common/v3d_macros.h" -#include "broadcom/cle/v3dx_pack.h" /* Our Vulkan resource indices represent indices in descriptor maps which * include all shader stages, so we need to size the arrays below @@ -57,7 +47,8 @@ struct state_bo_list { struct v3dv_bo *states[MAX_TOTAL_STATES]; }; -#define MAX_TOTAL_UNIFORM_BUFFERS (1 + MAX_UNIFORM_BUFFERS * MAX_STAGES) +#define MAX_TOTAL_UNIFORM_BUFFERS ((MAX_UNIFORM_BUFFERS + \ + MAX_INLINE_UNIFORM_BUFFERS) * MAX_STAGES) #define MAX_TOTAL_STORAGE_BUFFERS (MAX_STORAGE_BUFFERS * MAX_STAGES) struct buffer_bo_list { struct v3dv_bo *ubo[MAX_TOTAL_UNIFORM_BUFFERS]; @@ -74,29 +65,36 @@ state_bo_in_list(struct state_bo_list *list, struct v3dv_bo *bo) return false; } +static void +push_constants_bo_free(VkDevice _device, + uint64_t bo_ptr, + VkAllocationCallbacks *alloc) +{ + V3DV_FROM_HANDLE(v3dv_device, device, _device); + v3dv_bo_free(device, (struct v3dv_bo *)(uintptr_t) bo_ptr); +} + /* * This method checks if the ubo used for push constants is needed to be * updated or not. * - * push contants ubo is only used for push constants accessed by a non-const + * push constants ubo is only used for push constants accessed by a non-const * index. - * - * FIXME: right now for this cases we are uploading the full - * push_constants_data. An improvement would be to upload only the data that - * we need to rely on a UBO. 
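
Why a non-const index forces a UBO: with a constant index the compiler can bake the value straight into the uniform stream at record time (QUNIFORM_UNIFORM), while a dynamic index means the shader must address the data itself (QUNIFORM_UBO_ADDR). A conceptual sketch with hypothetical names, not driver code:

#include <stdint.h>

static void
emit_push_constant_uniform(uint32_t **stream, const uint32_t *push_data,
                           int const_index, uint32_t ubo_gpu_addr)
{
   if (const_index >= 0)
      *(*stream)++ = push_data[const_index];  /* QUNIFORM_UNIFORM */
   else
      *(*stream)++ = ubo_gpu_addr;            /* QUNIFORM_UBO_ADDR */
}
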
*/ static void check_push_constants_ubo(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_pipeline *pipeline) { - if (!(cmd_buffer->state.dirty & V3DV_CMD_DIRTY_PUSH_CONSTANTS) || + if (!(cmd_buffer->state.dirty & V3DV_CMD_DIRTY_PUSH_CONSTANTS_UBO) || pipeline->layout->push_constant_size == 0) return; if (cmd_buffer->push_constants_resource.bo == NULL) { cmd_buffer->push_constants_resource.bo = - v3dv_bo_alloc(cmd_buffer->device, MAX_PUSH_CONSTANTS_SIZE, - "push constants", true); + v3dv_bo_alloc(cmd_buffer->device, 4096, "push constants", true); + + v3dv_job_add_bo(cmd_buffer->state.job, + cmd_buffer->push_constants_resource.bo); if (!cmd_buffer->push_constants_resource.bo) { fprintf(stderr, "Failed to allocate memory for push constants\n"); @@ -105,28 +103,41 @@ check_push_constants_ubo(struct v3dv_cmd_buffer *cmd_buffer, bool ok = v3dv_bo_map(cmd_buffer->device, cmd_buffer->push_constants_resource.bo, - MAX_PUSH_CONSTANTS_SIZE); + cmd_buffer->push_constants_resource.bo->size); if (!ok) { fprintf(stderr, "failed to map push constants buffer\n"); abort(); } } else { - if (cmd_buffer->push_constants_resource.offset + MAX_PUSH_CONSTANTS_SIZE <= + if (cmd_buffer->push_constants_resource.offset + + cmd_buffer->state.push_constants_size <= cmd_buffer->push_constants_resource.bo->size) { - cmd_buffer->push_constants_resource.offset += MAX_PUSH_CONSTANTS_SIZE; + cmd_buffer->push_constants_resource.offset += + cmd_buffer->state.push_constants_size; } else { - /* FIXME: we got out of space for push descriptors. Should we create - * a new bo? This could be easier with a uploader + /* We ran out of space so we'll have to allocate a new buffer but we + * need to ensure the old one is preserved until the end of the command + * buffer life and make sure it is eventually freed. We use the + * private object machinery in the command buffer for this. */ + v3dv_cmd_buffer_add_private_obj( + cmd_buffer, (uintptr_t) cmd_buffer->push_constants_resource.bo, + (v3dv_cmd_buffer_private_obj_destroy_cb) push_constants_bo_free); + + /* Now call back so we create a new BO */ + cmd_buffer->push_constants_resource.bo = NULL; + check_push_constants_ubo(cmd_buffer, pipeline); + return; } } + assert(cmd_buffer->state.push_constants_size <= MAX_PUSH_CONSTANTS_SIZE); memcpy(cmd_buffer->push_constants_resource.bo->map + cmd_buffer->push_constants_resource.offset, - cmd_buffer->push_constants_data, - MAX_PUSH_CONSTANTS_SIZE); + cmd_buffer->state.push_constants_data, + cmd_buffer->state.push_constants_size); - cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_PUSH_CONSTANTS; + cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_PUSH_CONSTANTS_UBO; } /** V3D 4.x TMU configuration parameter 0 (texture) */ @@ -203,11 +214,8 @@ write_tmu_p1(struct v3dv_cmd_buffer *cmd_buffer, /* Set unnormalized coordinates flag from sampler object */ uint32_t p1_packed = v3d_unit_data_get_offset(data); if (sampler->unnormalized_coordinates) { - struct V3DX(TMU_CONFIG_PARAMETER_1) p1_unpacked; - V3DX(TMU_CONFIG_PARAMETER_1_unpack)((uint8_t *)&p1_packed, &p1_unpacked); - p1_unpacked.unnormalized_coordinates = true; - V3DX(TMU_CONFIG_PARAMETER_1_pack)(NULL, (uint8_t *)&p1_packed, - &p1_unpacked); + v3d_pack_unnormalized_coordinates(&cmd_buffer->device->devinfo, &p1_packed, + sampler->unnormalized_coordinates); } cl_aligned_u32(uniforms, sampler_state_reloc.bo->offset + @@ -248,13 +256,14 @@ write_ubo_ssbo_uniforms(struct v3dv_cmd_buffer *cmd_buffer, uint32_t dynamic_offset = 0; - /* For ubos, index is shifted, as 0 is reserved for push constants. 
+   /* For ubos, index is shifted, as 0 is reserved for push constants
+    * and 1..MAX_INLINE_UNIFORM_BUFFERS are reserved for inline uniform
+    * buffers.
     */
-   if (content == QUNIFORM_UBO_ADDR &&
-       v3d_unit_data_get_unit(data) == 0) {
-      /* This calls is to ensure that the push_constant_ubo is
-       * updated. It already take into account it is should do the
-       * update or not
+   uint32_t index = v3d_unit_data_get_unit(data);
+   if (content == QUNIFORM_UBO_ADDR && index == 0) {
+      /* Ensure the push constants UBO is created and updated. This also
+       * adds the BO to the job so we don't need to track it in buffer_bos.
        */
       check_push_constants_ubo(cmd_buffer, pipeline);

@@ -265,42 +274,99 @@ write_ubo_ssbo_uniforms(struct v3dv_cmd_buffer *cmd_buffer,

       cl_aligned_u32(uniforms, resource->bo->offset +
                                resource->offset +
                                offset + dynamic_offset);
-      buffer_bos->ubo[0] = resource->bo;
    } else {
-      uint32_t index =
-         content == QUNIFORM_UBO_ADDR ?
-         v3d_unit_data_get_unit(data) - 1 :
-         data;
+      if (content == QUNIFORM_UBO_ADDR) {
+         /* We reserve UBO index 0 for push constants in Vulkan (and for the
+          * constant buffer in GL), so the compiler always adds one to all UBO
+          * indices; fix it up before we access the descriptor map, since
+          * indices there start from 0.
+          */
+         assert(index > 0);
+         index--;
+      } else {
+         index = data;
+      }

       struct v3dv_descriptor *descriptor =
          v3dv_descriptor_map_get_descriptor(descriptor_state, map,
                                             pipeline->layout,
                                             index, &dynamic_offset);
+
+      /* Inline UBO descriptors store UBO data in descriptor pool memory
+       * instead of an external buffer.
+       */
       assert(descriptor);
-      assert(descriptor->buffer);
-      assert(descriptor->buffer->mem);
-      assert(descriptor->buffer->mem->bo);

       if (content == QUNIFORM_GET_SSBO_SIZE ||
           content == QUNIFORM_GET_UBO_SIZE) {
          cl_aligned_u32(uniforms, descriptor->range);
       } else {
-         cl_aligned_u32(uniforms, descriptor->buffer->mem->bo->offset +
-                                  descriptor->buffer->mem_offset +
-                                  descriptor->offset +
-                                  offset + dynamic_offset);
+         /* Inline uniform buffers store their contents in pool memory instead
+          * of an external buffer.
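
Inline uniform blocks make that concrete: their words live in a CPU-mapped descriptor-pool BO, so feeding one into the uniform stream is a plain memory read, as write_inline_uniform below does via v3dv_cl_reloc. A stand-alone sketch with stand-in parameters:

#include <stdint.h>

/* bo_map/bo_offset stand in for the v3dv_cl_reloc fields; word_offset is
 * in 32-bit units, matching the "4 * offset" in write_inline_uniform.
 */
static uint32_t
read_inline_uniform_word(const uint8_t *bo_map, uint32_t bo_offset,
                         uint32_t word_offset)
{
   const uint32_t *words = (const uint32_t *)(bo_map + bo_offset);
   return words[word_offset];
}
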
+ */ + struct v3dv_bo *bo; + uint32_t addr; + if (descriptor->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { + assert(dynamic_offset == 0); + struct v3dv_cl_reloc reloc = + v3dv_descriptor_map_get_descriptor_bo(cmd_buffer->device, + descriptor_state, map, + pipeline->layout, index, + NULL); + bo = reloc.bo; + addr = reloc.bo->offset + reloc.offset + offset; + } else { + assert(descriptor->buffer); + assert(descriptor->buffer->mem); + assert(descriptor->buffer->mem->bo); + + bo = descriptor->buffer->mem->bo; + addr = bo->offset + + descriptor->buffer->mem_offset + + descriptor->offset + + offset + dynamic_offset; + } + + cl_aligned_u32(uniforms, addr); if (content == QUNIFORM_UBO_ADDR) { - assert(index + 1 < MAX_TOTAL_UNIFORM_BUFFERS); - buffer_bos->ubo[index + 1] = descriptor->buffer->mem->bo; + assert(index < MAX_TOTAL_UNIFORM_BUFFERS); + buffer_bos->ubo[index] = bo; } else { assert(index < MAX_TOTAL_STORAGE_BUFFERS); - buffer_bos->ssbo[index] = descriptor->buffer->mem->bo; + buffer_bos->ssbo[index] = bo; } } } } +static void +write_inline_uniform(struct v3dv_cl_out **uniforms, + uint32_t index, + uint32_t offset, + struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_pipeline *pipeline, + enum broadcom_shader_stage stage) +{ + assert(index < MAX_INLINE_UNIFORM_BUFFERS); + + struct v3dv_descriptor_state *descriptor_state = + v3dv_cmd_buffer_get_descriptor_state(cmd_buffer, pipeline); + + struct v3dv_descriptor_map *map = + &pipeline->shared_data->maps[stage]->ubo_map; + + struct v3dv_cl_reloc reloc = + v3dv_descriptor_map_get_descriptor_bo(cmd_buffer->device, + descriptor_state, map, + pipeline->layout, index, + NULL); + + /* Offset comes in 32-bit units */ + uint32_t *addr = reloc.bo->map + reloc.offset + 4 * offset; + cl_aligned_u32(uniforms, *addr); +} + static uint32_t get_texture_size_from_image_view(struct v3dv_image_view *image_view, enum quniform_contents contents, @@ -420,7 +486,6 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_cl_reloc uniform_stream = v3dv_cl_get_address(&job->indirect); struct v3dv_cl_out *uniforms = cl_start(&job->indirect); - for (int i = 0; i < uinfo->count; i++) { uint32_t data = uinfo->data[i]; @@ -430,24 +495,45 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer, break; case QUNIFORM_UNIFORM: - cl_aligned_u32(&uniforms, cmd_buffer->push_constants_data[data]); + cl_aligned_u32(&uniforms, cmd_buffer->state.push_constants_data[data]); break; - case QUNIFORM_VIEWPORT_X_SCALE: - cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * 256.0f); + case QUNIFORM_INLINE_UBO_0: + case QUNIFORM_INLINE_UBO_1: + case QUNIFORM_INLINE_UBO_2: + case QUNIFORM_INLINE_UBO_3: + write_inline_uniform(&uniforms, + uinfo->contents[i] - QUNIFORM_INLINE_UBO_0, data, + cmd_buffer, pipeline, variant->stage); break; - case QUNIFORM_VIEWPORT_Y_SCALE: - cl_aligned_f(&uniforms, dynamic->viewport.scale[0][1] * 256.0f); + case QUNIFORM_VIEWPORT_X_SCALE: { + float clipper_xy_granularity = V3DV_X(cmd_buffer->device, CLIPPER_XY_GRANULARITY); + cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * clipper_xy_granularity); break; + } - case QUNIFORM_VIEWPORT_Z_OFFSET: - cl_aligned_f(&uniforms, dynamic->viewport.translate[0][2]); + case QUNIFORM_VIEWPORT_Y_SCALE: { + float clipper_xy_granularity = V3DV_X(cmd_buffer->device, CLIPPER_XY_GRANULARITY); + cl_aligned_f(&uniforms, dynamic->viewport.scale[0][1] * clipper_xy_granularity); break; + } - case QUNIFORM_VIEWPORT_Z_SCALE: - cl_aligned_f(&uniforms, dynamic->viewport.scale[0][2]); + case 
QUNIFORM_VIEWPORT_Z_OFFSET: { + float translate_z; + v3dv_cmd_buffer_state_get_viewport_z_xform(cmd_buffer, 0, + &translate_z, NULL); + cl_aligned_f(&uniforms, translate_z); break; + } + + case QUNIFORM_VIEWPORT_Z_SCALE: { + float scale_z; + v3dv_cmd_buffer_state_get_viewport_z_xform(cmd_buffer, 0, + NULL, &scale_z); + cl_aligned_f(&uniforms, scale_z); + break; + } case QUNIFORM_SSBO_OFFSET: case QUNIFORM_UBO_ADDR: @@ -527,9 +613,9 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer, } else if (cmd_buffer->state.framebuffer) { num_layers = cmd_buffer->state.framebuffer->layers; } else { - assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); + assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); num_layers = 2048; -#if DEBUG +#if MESA_DEBUG fprintf(stderr, "Skipping gl_LayerID shader sanity check for " "secondary command buffer\n"); #endif @@ -571,6 +657,20 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer, cl_aligned_u32(&uniforms, pipeline->spill.size_per_thread); break; + case QUNIFORM_DRAW_ID: + cl_aligned_u32(&uniforms, job->cmd_buffer->state.draw_id); + break; + + case QUNIFORM_LINE_WIDTH: + cl_aligned_u32(&uniforms, + job->cmd_buffer->vk.dynamic_graphics_state.rs.line.width); + break; + + case QUNIFORM_AA_LINE_WIDTH: + cl_aligned_u32(&uniforms, + v3dv_get_aa_line_width(pipeline, job->cmd_buffer)); + break; + default: unreachable("unsupported quniform_contents uniform type\n"); } diff --git a/src/broadcom/vulkan/v3dv_wsi.c b/src/broadcom/vulkan/v3dv_wsi.c index 23c542cbc05..78af39448ce 100644 --- a/src/broadcom/vulkan/v3dv_wsi.c +++ b/src/broadcom/vulkan/v3dv_wsi.c @@ -1,5 +1,5 @@ /* - * Copyright © 2020 Raspberry Pi + * Copyright © 2020 Raspberry Pi Ltd * based on intel anv code: * Copyright © 2015 Intel Corporation @@ -24,123 +24,40 @@ */ #include "v3dv_private.h" -#include "drm-uapi/drm_fourcc.h" -#include "vk_format_info.h" #include "vk_util.h" #include "wsi_common.h" +#include "wsi_common_drm.h" +#include "wsi_common_entrypoints.h" static VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL v3dv_wsi_proc_addr(VkPhysicalDevice physicalDevice, const char *pName) { V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physicalDevice); - PFN_vkVoidFunction func; - - func = vk_instance_dispatch_table_get(&pdevice->vk.instance->dispatch_table, pName); - if (func != NULL) - return func; - - func = vk_physical_device_dispatch_table_get(&pdevice->vk.dispatch_table, pName); - if (func != NULL) - return func; - - return vk_device_dispatch_table_get(&vk_device_trampolines, pName); + return vk_instance_get_proc_addr_unchecked(pdevice->vk.instance, pName); } static bool v3dv_wsi_can_present_on_device(VkPhysicalDevice _pdevice, int fd) { V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, _pdevice); - - drmDevicePtr fd_devinfo, display_devinfo; - int ret; - - ret = drmGetDevice2(fd, 0, &fd_devinfo); - if (ret) - return false; - - ret = drmGetDevice2(pdevice->display_fd, 0, &display_devinfo); - if (ret) { - drmFreeDevice(&fd_devinfo); - return false; - } - - bool result = drmDevicesEqual(fd_devinfo, display_devinfo); - - drmFreeDevice(&fd_devinfo); - drmFreeDevice(&display_devinfo); - return result; + assert(pdevice->display_fd != -1); + return wsi_common_drm_devices_equal(fd, pdevice->display_fd); } -VkResult -v3dv_wsi_init(struct v3dv_physical_device *physical_device) -{ - VkResult result; - - result = wsi_device_init(&physical_device->wsi_device, - v3dv_physical_device_to_handle(physical_device), - v3dv_wsi_proc_addr, - 
                            &physical_device->vk.instance->alloc,
-                            physical_device->master_fd, NULL, false);
-
-   if (result != VK_SUCCESS)
-      return result;
-
-   physical_device->wsi_device.supports_modifiers = true;
-   physical_device->wsi_device.can_present_on_device =
-      v3dv_wsi_can_present_on_device;
-
-   return VK_SUCCESS;
-}
-
-void
-v3dv_wsi_finish(struct v3dv_physical_device *physical_device)
-{
-   wsi_device_finish(&physical_device->wsi_device,
-                     &physical_device->vk.instance->alloc);
-}
-
-VKAPI_ATTR void VKAPI_CALL
-v3dv_DestroySurfaceKHR(
-    VkInstance                                   _instance,
-    VkSurfaceKHR                                 _surface,
-    const VkAllocationCallbacks*                 pAllocator)
+static void
+filter_surface_capabilities(VkSurfaceKHR _surface,
+                            VkSurfaceCapabilitiesKHR *caps)
 {
-   V3DV_FROM_HANDLE(v3dv_instance, instance, _instance);
    ICD_FROM_HANDLE(VkIcdSurfaceBase, surface, _surface);
-
-   if (!surface)
-      return;
-
-   vk_free2(&instance->vk.alloc, pAllocator, surface);
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_GetPhysicalDeviceSurfaceSupportKHR(
-    VkPhysicalDevice                            physicalDevice,
-    uint32_t                                    queueFamilyIndex,
-    VkSurfaceKHR                                surface,
-    VkBool32*                                   pSupported)
-{
-   V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice);
-
-   return wsi_common_get_surface_support(&device->wsi_device,
-                                         queueFamilyIndex,
-                                         surface,
-                                         pSupported);
-}
-
-static void
-constraint_surface_capabilities(VkSurfaceCapabilitiesKHR *caps)
-{
-   /* Our display pipeline requires that images are linear, so we cannot
-    * ensure that our swapchain images can be sampled. If we are running under
-    * a compositor in windowed mode, the DRM modifier negotiation should
-    * probably end up selecting an UIF layout for the swapchain images but it
-    * may still choose linear and send images directly for scanout if the
-    * surface is in fullscreen mode for example. If we are not running under
-    * a compositor, then we would always need them to be linear anyway.
+   /* Display images must be linear, so their supported usage is restricted.
+    * This would affect sampling usage too, but we don't restrict that one
+    * since we support on-the-fly conversion to UIF when sampling simple 2D
+    * images, at a performance penalty.
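
The new code therefore delegates to the shared WSI entrypoints and only post-processes their answer. A sketch of that delegate-then-filter pattern; is_display stands in for the VK_ICD_WSI_PLATFORM_DISPLAY check performed by filter_surface_capabilities:

#include <stdbool.h>
#include <vulkan/vulkan.h>
#include "wsi_common_entrypoints.h"

static VkResult
get_filtered_surface_caps(VkPhysicalDevice pdev, VkSurfaceKHR surface,
                          bool is_display, VkSurfaceCapabilitiesKHR *caps)
{
   /* Let the shared WSI code compute the generic capabilities... */
   VkResult result =
      wsi_GetPhysicalDeviceSurfaceCapabilitiesKHR(pdev, surface, caps);
   /* ...then subtract what the scanout path cannot honor. */
   if (result == VK_SUCCESS && is_display)
      caps->supportedUsageFlags &= ~VK_IMAGE_USAGE_STORAGE_BIT;
   return result;
}
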
*/ - caps->supportedUsageFlags &= ~VK_IMAGE_USAGE_SAMPLED_BIT; + if (surface->platform == VK_ICD_WSI_PLATFORM_DISPLAY) + caps->supportedUsageFlags &= ~VK_IMAGE_USAGE_STORAGE_BIT; } VKAPI_ATTR VkResult VKAPI_CALL @@ -149,13 +66,11 @@ v3dv_GetPhysicalDeviceSurfaceCapabilitiesKHR( VkSurfaceKHR surface, VkSurfaceCapabilitiesKHR* pSurfaceCapabilities) { - V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice); - VkResult result; - result = wsi_common_get_surface_capabilities(&device->wsi_device, - surface, - pSurfaceCapabilities); - constraint_surface_capabilities(pSurfaceCapabilities); + result = wsi_GetPhysicalDeviceSurfaceCapabilitiesKHR(physicalDevice, + surface, + pSurfaceCapabilities); + filter_surface_capabilities(surface, pSurfaceCapabilities); return result; } @@ -165,227 +80,50 @@ v3dv_GetPhysicalDeviceSurfaceCapabilities2KHR( const VkPhysicalDeviceSurfaceInfo2KHR* pSurfaceInfo, VkSurfaceCapabilities2KHR* pSurfaceCapabilities) { - V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice); - VkResult result; - result = wsi_common_get_surface_capabilities2(&device->wsi_device, - pSurfaceInfo, - pSurfaceCapabilities); - constraint_surface_capabilities(&pSurfaceCapabilities->surfaceCapabilities); + result = wsi_GetPhysicalDeviceSurfaceCapabilities2KHR(physicalDevice, + pSurfaceInfo, + pSurfaceCapabilities); + filter_surface_capabilities(pSurfaceInfo->surface, + &pSurfaceCapabilities->surfaceCapabilities); return result; } -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetPhysicalDeviceSurfaceFormatsKHR( - VkPhysicalDevice physicalDevice, - VkSurfaceKHR surface, - uint32_t* pSurfaceFormatCount, - VkSurfaceFormatKHR* pSurfaceFormats) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice); - - return wsi_common_get_surface_formats(&device->wsi_device, surface, - pSurfaceFormatCount, pSurfaceFormats); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetPhysicalDeviceSurfaceFormats2KHR( - VkPhysicalDevice physicalDevice, - const VkPhysicalDeviceSurfaceInfo2KHR* pSurfaceInfo, - uint32_t* pSurfaceFormatCount, - VkSurfaceFormat2KHR* pSurfaceFormats) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice); - - return wsi_common_get_surface_formats2(&device->wsi_device, pSurfaceInfo, - pSurfaceFormatCount, pSurfaceFormats); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetPhysicalDeviceSurfacePresentModesKHR( - VkPhysicalDevice physicalDevice, - VkSurfaceKHR surface, - uint32_t* pPresentModeCount, - VkPresentModeKHR* pPresentModes) +VkResult +v3dv_wsi_init(struct v3dv_physical_device *physical_device) { - V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice); - - return wsi_common_get_surface_present_modes(&device->wsi_device, surface, - pPresentModeCount, - pPresentModes); -} + VkResult result; -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_CreateSwapchainKHR( - VkDevice _device, - const VkSwapchainCreateInfoKHR* pCreateInfo, - const VkAllocationCallbacks* pAllocator, - VkSwapchainKHR* pSwapchain) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - struct v3dv_instance *instance = device->instance; - struct v3dv_physical_device *pdevice = &instance->physicalDevice; - struct wsi_device *wsi_device = &pdevice->wsi_device; + result = wsi_device_init(&physical_device->wsi_device, + v3dv_physical_device_to_handle(physical_device), + v3dv_wsi_proc_addr, + &physical_device->vk.instance->alloc, + physical_device->display_fd, NULL, + &(struct wsi_device_options){.sw_device = false}); - ICD_FROM_HANDLE(VkIcdSurfaceBase, surface, pCreateInfo->surface); - VkResult result = - 
v3dv_physical_device_acquire_display(instance, pdevice, surface); if (result != VK_SUCCESS) return result; - const VkAllocationCallbacks *alloc; - if (pAllocator) - alloc = pAllocator; - else - alloc = &device->vk.alloc; - - return wsi_common_create_swapchain(wsi_device, _device, - pCreateInfo, alloc, pSwapchain); -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_DestroySwapchainKHR( - VkDevice _device, - VkSwapchainKHR swapchain, - const VkAllocationCallbacks* pAllocator) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - const VkAllocationCallbacks *alloc; + physical_device->wsi_device.supports_modifiers = true; + physical_device->wsi_device.can_present_on_device = + v3dv_wsi_can_present_on_device; - if (pAllocator) - alloc = pAllocator; - else - alloc = &device->vk.alloc; + physical_device->vk.wsi_device = &physical_device->wsi_device; - wsi_common_destroy_swapchain(_device, swapchain, alloc); + return VK_SUCCESS; } -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetSwapchainImagesKHR( - VkDevice device, - VkSwapchainKHR swapchain, - uint32_t* pSwapchainImageCount, - VkImage* pSwapchainImages) +void +v3dv_wsi_finish(struct v3dv_physical_device *physical_device) { - return wsi_common_get_images(swapchain, - pSwapchainImageCount, - pSwapchainImages); + physical_device->vk.wsi_device = NULL; + wsi_device_finish(&physical_device->wsi_device, + &physical_device->vk.instance->alloc); } struct v3dv_image * v3dv_wsi_get_image_from_swapchain(VkSwapchainKHR swapchain, uint32_t index) { - uint32_t n_images = index + 1; - VkImage *images = malloc(sizeof(*images) * n_images); - VkResult result = wsi_common_get_images(swapchain, &n_images, images); - - if (result != VK_SUCCESS && result != VK_INCOMPLETE) { - free(images); - return NULL; - } - - V3DV_FROM_HANDLE(v3dv_image, image, images[index]); - free(images); - - return image; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_AcquireNextImageKHR( - VkDevice device, - VkSwapchainKHR swapchain, - uint64_t timeout, - VkSemaphore semaphore, - VkFence fence, - uint32_t* pImageIndex) -{ - VkAcquireNextImageInfoKHR acquire_info = { - .sType = VK_STRUCTURE_TYPE_ACQUIRE_NEXT_IMAGE_INFO_KHR, - .swapchain = swapchain, - .timeout = timeout, - .semaphore = semaphore, - .fence = fence, - .deviceMask = 0, - }; - - return v3dv_AcquireNextImage2KHR(device, &acquire_info, pImageIndex); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_AcquireNextImage2KHR( - VkDevice _device, - const VkAcquireNextImageInfoKHR* pAcquireInfo, - uint32_t* pImageIndex) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_fence, fence, pAcquireInfo->fence); - V3DV_FROM_HANDLE(v3dv_semaphore, semaphore, pAcquireInfo->semaphore); - - struct v3dv_physical_device *pdevice = &device->instance->physicalDevice; - - VkResult result; - result = wsi_common_acquire_next_image2(&pdevice->wsi_device, _device, - pAcquireInfo, pImageIndex); - - if (result == VK_SUCCESS || result == VK_SUBOPTIMAL_KHR) { - if (fence) - drmSyncobjSignal(pdevice->render_fd, &fence->sync, 1); - if (semaphore) - drmSyncobjSignal(pdevice->render_fd, &semaphore->sync, 1); - } - - return result; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_QueuePresentKHR( - VkQueue _queue, - const VkPresentInfoKHR* pPresentInfo) -{ - V3DV_FROM_HANDLE(v3dv_queue, queue, _queue); - struct v3dv_physical_device *pdevice = - &queue->device->instance->physicalDevice; - - return wsi_common_queue_present(&pdevice->wsi_device, - v3dv_device_to_handle(queue->device), - _queue, 0, - pPresentInfo); -} - -VKAPI_ATTR VkResult VKAPI_CALL 
-v3dv_GetDeviceGroupPresentCapabilitiesKHR( - VkDevice device, - VkDeviceGroupPresentCapabilitiesKHR* pCapabilities) -{ - memset(pCapabilities->presentMask, 0, - sizeof(pCapabilities->presentMask)); - pCapabilities->presentMask[0] = 0x1; - pCapabilities->modes = VK_DEVICE_GROUP_PRESENT_MODE_LOCAL_BIT_KHR; - - return VK_SUCCESS; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetDeviceGroupSurfacePresentModesKHR( - VkDevice device, - VkSurfaceKHR surface, - VkDeviceGroupPresentModeFlagsKHR* pModes) -{ - *pModes = VK_DEVICE_GROUP_PRESENT_MODE_LOCAL_BIT_KHR; - - return VK_SUCCESS; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetPhysicalDevicePresentRectanglesKHR( - VkPhysicalDevice physicalDevice, - VkSurfaceKHR surface, - uint32_t* pRectCount, - VkRect2D* pRects) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice); - - return wsi_common_get_present_rectangles(&device->wsi_device, - surface, - pRectCount, pRects); + VkImage image = wsi_common_get_image(swapchain, index); + return v3dv_image_from_handle(image); } diff --git a/src/broadcom/vulkan/v3dv_wsi_display.c b/src/broadcom/vulkan/v3dv_wsi_display.c deleted file mode 100644 index 3d1cf91ecbe..00000000000 --- a/src/broadcom/vulkan/v3dv_wsi_display.c +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright © 2020 Raspberry Pi - * based on KHR_display extension code: - * Copyright © 2017 Keith Packard - * - * Permission to use, copy, modify, distribute, and sell this software and its - * documentation for any purpose is hereby granted without fee, provided that - * the above copyright notice appear in all copies and that both that copyright - * notice and this permission notice appear in supporting documentation, and - * that the name of the copyright holders not be used in advertising or - * publicity pertaining to distribution of the software without specific, - * written prior permission. The copyright holders make no representations - * about the suitability of this software for any purpose. It is provided "as - * is" without express or implied warranty. - * - * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, - * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO - * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR - * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, - * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER - * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THIS SOFTWARE. 
- */ -#include "v3dv_private.h" -#include "wsi_common_display.h" - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetPhysicalDeviceDisplayPropertiesKHR(VkPhysicalDevice physical_device, - uint32_t *property_count, - VkDisplayPropertiesKHR *properties) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physical_device); - - return wsi_display_get_physical_device_display_properties( - physical_device, - &pdevice->wsi_device, - property_count, - properties); -} - -VkResult -v3dv_GetPhysicalDeviceDisplayProperties2KHR( - VkPhysicalDevice physical_device, - uint32_t *pPropertyCount, - VkDisplayProperties2KHR *pProperties) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physical_device); - - return wsi_display_get_physical_device_display_properties2( - physical_device, - &pdevice->wsi_device, - pPropertyCount, - pProperties); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetPhysicalDeviceDisplayPlanePropertiesKHR( - VkPhysicalDevice physical_device, - uint32_t *property_count, - VkDisplayPlanePropertiesKHR *properties) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physical_device); - - return wsi_display_get_physical_device_display_plane_properties( - physical_device, - &pdevice->wsi_device, - property_count, - properties); -} - -VkResult -v3dv_GetPhysicalDeviceDisplayPlaneProperties2KHR( - VkPhysicalDevice physical_device, - uint32_t *pPropertyCount, - VkDisplayPlaneProperties2KHR *pProperties) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physical_device); - - return wsi_display_get_physical_device_display_plane_properties2( - physical_device, - &pdevice->wsi_device, - pPropertyCount, - pProperties); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetDisplayPlaneSupportedDisplaysKHR(VkPhysicalDevice physical_device, - uint32_t plane_index, - uint32_t *display_count, - VkDisplayKHR *displays) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physical_device); - - return wsi_display_get_display_plane_supported_displays( - physical_device, - &pdevice->wsi_device, - plane_index, - display_count, - displays); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetDisplayModePropertiesKHR(VkPhysicalDevice physical_device, - VkDisplayKHR display, - uint32_t *property_count, - VkDisplayModePropertiesKHR *properties) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physical_device); - - return wsi_display_get_display_mode_properties(physical_device, - &pdevice->wsi_device, - display, - property_count, - properties); -} - -VkResult -v3dv_GetDisplayModeProperties2KHR(VkPhysicalDevice physical_device, - VkDisplayKHR display, - uint32_t *pPropertyCount, - VkDisplayModeProperties2KHR *pProperties) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physical_device); - - return wsi_display_get_display_mode_properties2(physical_device, - &pdevice->wsi_device, - display, - pPropertyCount, - pProperties); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_CreateDisplayModeKHR(VkPhysicalDevice physical_device, - VkDisplayKHR display, - const VkDisplayModeCreateInfoKHR *create_info, - const VkAllocationCallbacks *allocator, - VkDisplayModeKHR *mode) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physical_device); - - return wsi_display_create_display_mode(physical_device, - &pdevice->wsi_device, - display, - create_info, - allocator, - mode); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetDisplayPlaneCapabilitiesKHR(VkPhysicalDevice physical_device, - VkDisplayModeKHR mode_khr, - uint32_t plane_index, - VkDisplayPlaneCapabilitiesKHR *capabilities) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, 
physical_device); - - return wsi_get_display_plane_capabilities(physical_device, - &pdevice->wsi_device, - mode_khr, - plane_index, - capabilities); -} - -VkResult -v3dv_GetDisplayPlaneCapabilities2KHR( - VkPhysicalDevice physical_device, - const VkDisplayPlaneInfo2KHR *pDisplayPlaneInfo, - VkDisplayPlaneCapabilities2KHR *pCapabilities) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physical_device); - - return wsi_get_display_plane_capabilities2(physical_device, - &pdevice->wsi_device, - pDisplayPlaneInfo, - pCapabilities); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_CreateDisplayPlaneSurfaceKHR( - VkInstance _instance, - const VkDisplaySurfaceCreateInfoKHR *create_info, - const VkAllocationCallbacks *allocator, - VkSurfaceKHR *surface) -{ - V3DV_FROM_HANDLE(v3dv_instance, instance, _instance); - const VkAllocationCallbacks *alloc; - - if (allocator) - alloc = allocator; - else - alloc = &instance->vk.alloc; - - return wsi_create_display_surface(_instance, alloc, - create_info, surface); -} diff --git a/src/broadcom/vulkan/v3dv_wsi_wayland.c b/src/broadcom/vulkan/v3dv_wsi_wayland.c deleted file mode 100644 index e61abf3c724..00000000000 --- a/src/broadcom/vulkan/v3dv_wsi_wayland.c +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright © 2020 Ella Stanforth - * based on intel anv code: - * Copyright © 2015 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- */ - -#include "wsi_common_wayland.h" -#include "v3dv_private.h" - -VKAPI_ATTR VkBool32 VKAPI_CALL -v3dv_GetPhysicalDeviceWaylandPresentationSupportKHR( - VkPhysicalDevice physicalDevice, - uint32_t queueFamilyIndex, - struct wl_display* display) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, physical_device, physicalDevice); - - return wsi_wl_get_presentation_support(&physical_device->wsi_device, display); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_CreateWaylandSurfaceKHR( - VkInstance _instance, - const VkWaylandSurfaceCreateInfoKHR* pCreateInfo, - const VkAllocationCallbacks* pAllocator, - VkSurfaceKHR* pSurface) -{ - V3DV_FROM_HANDLE(v3dv_instance, instance, _instance); - const VkAllocationCallbacks *alloc; - assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_WAYLAND_SURFACE_CREATE_INFO_KHR); - - if (pAllocator) - alloc = pAllocator; - else - alloc = &instance->vk.alloc; - - return wsi_create_wl_surface(alloc, pCreateInfo, pSurface); -} diff --git a/src/broadcom/vulkan/v3dv_wsi_x11.c b/src/broadcom/vulkan/v3dv_wsi_x11.c deleted file mode 100644 index 4fa99ccd5ab..00000000000 --- a/src/broadcom/vulkan/v3dv_wsi_x11.c +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright © 2020 Raspberry Pi - * - * based mostly on anv driver which is: - * Copyright © 2015 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- */ - -#include <X11/Xlib-xcb.h> -#include <X11/xshmfence.h> -#include <xcb/xcb.h> -#include <xcb/dri3.h> -#include <xcb/present.h> - -#include "wsi_common_x11.h" -#include "v3dv_private.h" - -VKAPI_ATTR VkBool32 VKAPI_CALL -v3dv_GetPhysicalDeviceXcbPresentationSupportKHR( - VkPhysicalDevice physicalDevice, - uint32_t queueFamilyIndex, - xcb_connection_t* connection, - xcb_visualid_t visual_id) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice); - - return wsi_get_physical_device_xcb_presentation_support( - &device->wsi_device, - queueFamilyIndex, - connection, visual_id); -} - -VKAPI_ATTR VkBool32 VKAPI_CALL -v3dv_GetPhysicalDeviceXlibPresentationSupportKHR( - VkPhysicalDevice physicalDevice, - uint32_t queueFamilyIndex, - Display* dpy, - VisualID visualID) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice); - - return wsi_get_physical_device_xcb_presentation_support( - &device->wsi_device, - queueFamilyIndex, - XGetXCBConnection(dpy), visualID); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_CreateXcbSurfaceKHR( - VkInstance _instance, - const VkXcbSurfaceCreateInfoKHR* pCreateInfo, - const VkAllocationCallbacks* pAllocator, - VkSurfaceKHR* pSurface) -{ - V3DV_FROM_HANDLE(v3dv_instance, instance, _instance); - const VkAllocationCallbacks *alloc; - assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_XCB_SURFACE_CREATE_INFO_KHR); - - if (pAllocator) - alloc = pAllocator; - else - alloc = &instance->vk.alloc; - - return wsi_create_xcb_surface(alloc, pCreateInfo, pSurface); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_CreateXlibSurfaceKHR( - VkInstance _instance, - const VkXlibSurfaceCreateInfoKHR* pCreateInfo, - const VkAllocationCallbacks* pAllocator, - VkSurfaceKHR* pSurface) -{ - V3DV_FROM_HANDLE(v3dv_instance, instance, _instance); - const VkAllocationCallbacks *alloc; - - assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_XLIB_SURFACE_CREATE_INFO_KHR); - - if (pAllocator) - alloc = pAllocator; - else - alloc = &instance->vk.alloc; - - return wsi_create_xlib_surface(alloc, pCreateInfo, pSurface); -} diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c index c2f2c77864b..d7fb087d9a8 100644 --- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c +++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c @@ -1,5 +1,5 @@ /* - * Copyright © 2021 Raspberry Pi + * Copyright © 2021 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -23,14 +23,13 @@ #include "v3dv_private.h" #include "broadcom/common/v3d_macros.h" +#include "broadcom/common/v3d_util.h" #include "broadcom/cle/v3dx_pack.h" #include "broadcom/compiler/v3d_compiler.h" #include "util/half_float.h" -#include "vulkan/util/vk_format.h" #include "util/u_pack_color.h" - -#include "vk_format_info.h" +#include "vk_format.h" void v3dX(job_emit_binning_flush)(struct v3dv_job *job) @@ -44,6 +43,34 @@ v3dX(job_emit_binning_flush)(struct v3dv_job *job) } void +v3dX(job_emit_enable_double_buffer)(struct v3dv_job *job) +{ + assert(job->can_use_double_buffer); + assert(job->frame_tiling.double_buffer); + assert(!job->frame_tiling.msaa); + assert(job->bcl_tile_binning_mode_ptr); + + const struct v3dv_frame_tiling *tiling = &job->frame_tiling; + struct cl_packet_struct(TILE_BINNING_MODE_CFG) config = { + cl_packet_header(TILE_BINNING_MODE_CFG), + }; + config.width_in_pixels = tiling->width; + config.height_in_pixels = tiling->height; +#if V3D_VERSION == 42 + config.number_of_render_targets = 
MAX2(tiling->render_target_count, 1);
+   config.multisample_mode_4x = tiling->msaa;
+   config.double_buffer_in_non_ms_mode = tiling->double_buffer;
+   config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
+#endif
+#if V3D_VERSION >= 71
+   unreachable("HW generation 71 not supported yet.");
+#endif
+
+   uint8_t *rewrite_addr = (uint8_t *)job->bcl_tile_binning_mode_ptr;
+   cl_packet_pack(TILE_BINNING_MODE_CFG)(NULL, rewrite_addr, &config);
+}
+
+void
 v3dX(job_emit_binning_prolog)(struct v3dv_job *job,
                               const struct v3dv_frame_tiling *tiling,
                               uint32_t layers)
@@ -55,12 +82,27 @@ v3dX(job_emit_binning_prolog)(struct v3dv_job *job,
       config.number_of_layers = layers;
    }

+   assert(!tiling->double_buffer || !tiling->msaa);
+
+   job->bcl_tile_binning_mode_ptr = cl_start(&job->bcl);
    cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
       config.width_in_pixels = tiling->width;
       config.height_in_pixels = tiling->height;
+#if V3D_VERSION == 42
       config.number_of_render_targets = MAX2(tiling->render_target_count, 1);
       config.multisample_mode_4x = tiling->msaa;
+      config.double_buffer_in_non_ms_mode = tiling->double_buffer;
       config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
+#endif
+#if V3D_VERSION >= 71
+      config.log2_tile_width = log2_tile_size(tiling->tile_width);
+      config.log2_tile_height = log2_tile_size(tiling->tile_height);
+      /* FIXME: ideally we would like the next assert to live in the packet
+       * header (it is generic, so it also applies to GL), but we would need
+       * to expand gen_pack_header for that.
+       */
+      assert(config.log2_tile_width == config.log2_tile_height ||
+             config.log2_tile_width == config.log2_tile_height + 1);
+#endif
    }

    /* There's definitely nothing in the VCD cache we want. */
@@ -106,18 +148,45 @@ cmd_buffer_render_pass_emit_load(struct v3dv_cmd_buffer *cmd_buffer,
                                  uint32_t buffer)
 {
    const struct v3dv_image *image = (struct v3dv_image *) iview->vk.image;
+
+   /* We don't support rendering to ycbcr images, so the image view should be
+    * single-plane and use a single-plane format. But note that the underlying
+    * image can be a ycbcr format, as we support rendering to a specific plane
+    * of an image. This is used for example on some meta_copy code paths, in
+    * order to copy from/to a plane of a ycbcr image.
+    */
+   assert(iview->plane_count == 1);
+   assert(iview->format->plane_count == 1);
+
+   uint8_t image_plane = v3dv_plane_from_aspect(iview->vk.aspects);
    const struct v3d_resource_slice *slice =
-      &image->slices[iview->vk.base_mip_level];
+      &image->planes[image_plane].slices[iview->vk.base_mip_level];
+
    uint32_t layer_offset = v3dv_layer_offset(image,
                                              iview->vk.base_mip_level,
-                                             iview->vk.base_array_layer + layer);
+                                             iview->vk.base_array_layer + layer, image_plane);

    cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) {
       load.buffer_to_load = buffer;
-      load.address = v3dv_cl_address(image->mem->bo, layer_offset);
+      load.address = v3dv_cl_address(image->planes[image_plane].mem->bo, layer_offset);
+
+      load.input_image_format = iview->format->planes[0].rt_type;
+
+      /* If we create an image view with only the stencil format, we
+       * re-interpret the format as RGBA8_UINT, as it is what we want in
+       * general (see CreateImageView).
+       *
+       * However, when we are loading/storing tiles from the ZSTENCIL tile
+       * buffer, we need to use the underlying DS format.
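
The format override described here reduces to one rule. A stand-alone sketch where the rt-type codes and the ZSTENCIL buffer id (normally taken from the v3dx packet headers) are passed in as plain parameters:

#include <stdbool.h>
#include <stdint.h>

/* A stencil-only view is exposed as RGBA8_UINT in general, but Z/S
 * tile-buffer traffic must use the image's real depth/stencil rt type
 * (e.g. D24S8).
 */
static uint32_t
tile_buffer_format(bool is_zstencil_buffer, bool view_is_rgba8ui_alias,
                   uint32_t view_rt_type, uint32_t image_rt_type)
{
   if (is_zstencil_buffer && view_is_rgba8ui_alias)
      return image_rt_type; /* underlying depth/stencil format */
   return view_rt_type;     /* normal case: the view's rt type */
}
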
+ */ + if (buffer == ZSTENCIL && + iview->format->planes[0].rt_type == V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI) { + assert(image->format->planes[image_plane].rt_type == V3D_OUTPUT_IMAGE_FORMAT_D24S8); + load.input_image_format = image->format->planes[image_plane].rt_type; + } - load.input_image_format = iview->format->rt_type; - load.r_b_swap = iview->swap_rb; + load.r_b_swap = iview->planes[0].swap_rb; + load.channel_reverse = iview->planes[0].channel_reverse; load.memory_format = slice->tiling; if (slice->tiling == V3D_TILING_UIF_NO_XOR || @@ -135,38 +204,6 @@ cmd_buffer_render_pass_emit_load(struct v3dv_cmd_buffer *cmd_buffer, } } -static bool -check_needs_load(const struct v3dv_cmd_buffer_state *state, - VkImageAspectFlags aspect, - uint32_t first_subpass_idx, - VkAttachmentLoadOp load_op) -{ - /* We call this with image->vk.aspects & aspect, so 0 means the aspect we are - * testing does not exist in the image. - */ - if (!aspect) - return false; - - /* Attachment (or view) load operations apply on the first subpass that - * uses the attachment (or view), otherwise we always need to load. - */ - if (state->job->first_subpass > first_subpass_idx) - return true; - - /* If the job is continuing a subpass started in another job, we always - * need to load. - */ - if (state->job->is_subpass_continue) - return true; - - /* If the area is not aligned to tile boundaries, we always need to load */ - if (!state->tile_aligned_render_area) - return true; - - /* The attachment load operations must be LOAD */ - return load_op == VK_ATTACHMENT_LOAD_OP_LOAD; -} - static inline uint32_t v3dv_zs_buffer(bool depth, bool stencil) { @@ -185,7 +222,6 @@ cmd_buffer_render_pass_emit_loads(struct v3dv_cmd_buffer *cmd_buffer, uint32_t layer) { const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; - const struct v3dv_framebuffer *framebuffer = state->framebuffer; const struct v3dv_render_pass *pass = state->pass; const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx]; @@ -222,12 +258,20 @@ cmd_buffer_render_pass_emit_loads(struct v3dv_cmd_buffer *cmd_buffer, attachment->first_subpass : attachment->views[layer].first_subpass; - bool needs_load = check_needs_load(state, - VK_IMAGE_ASPECT_COLOR_BIT, - first_subpass, - attachment->desc.loadOp); + uint32_t last_subpass = !pass->multiview_enabled ? + attachment->last_subpass : + attachment->views[layer].last_subpass; + + bool needs_load = + v3dv_cmd_buffer_check_needs_load(state, + VK_IMAGE_ASPECT_COLOR_BIT, + first_subpass, + attachment->desc.loadOp, + last_subpass, + attachment->desc.storeOp); if (needs_load) { - struct v3dv_image_view *iview = framebuffer->attachments[attachment_idx]; + struct v3dv_image_view *iview = + state->attachments[attachment_idx].image_view; cmd_buffer_render_pass_emit_load(cmd_buffer, cl, iview, layer, RENDER_TARGET_0 + i); } @@ -245,21 +289,29 @@ cmd_buffer_render_pass_emit_loads(struct v3dv_cmd_buffer *cmd_buffer, ds_attachment->first_subpass : ds_attachment->views[layer].first_subpass; + uint32_t ds_last_subpass = !pass->multiview_enabled ? 
+ ds_attachment->last_subpass : + ds_attachment->views[layer].last_subpass; + const bool needs_depth_load = - check_needs_load(state, - ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, - ds_first_subpass, - ds_attachment->desc.loadOp); + v3dv_cmd_buffer_check_needs_load(state, + ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + ds_first_subpass, + ds_attachment->desc.loadOp, + ds_last_subpass, + ds_attachment->desc.storeOp); const bool needs_stencil_load = - check_needs_load(state, - ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT, - ds_first_subpass, - ds_attachment->desc.stencilLoadOp); + v3dv_cmd_buffer_check_needs_load(state, + ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT, + ds_first_subpass, + ds_attachment->desc.stencilLoadOp, + ds_last_subpass, + ds_attachment->desc.stencilStoreOp); if (needs_depth_load || needs_stencil_load) { struct v3dv_image_view *iview = - framebuffer->attachments[ds_attachment_idx]; + state->attachments[ds_attachment_idx].image_view; /* From the Vulkan spec: * * "When an image view of a depth/stencil image is used as a @@ -290,21 +342,53 @@ cmd_buffer_render_pass_emit_store(struct v3dv_cmd_buffer *cmd_buffer, bool is_multisample_resolve) { const struct v3dv_image_view *iview = - cmd_buffer->state.framebuffer->attachments[attachment_idx]; + cmd_buffer->state.attachments[attachment_idx].image_view; const struct v3dv_image *image = (struct v3dv_image *) iview->vk.image; + + /* We don't support rendering to ycbcr images, so the image view should be + * single-plane, and using a single-plane format. But note that the underlying + * image can be a ycbcr format, as we support rendering to a specific plane + * of an image. This is used for example on some meta_copy code paths, in + * order to copy from/to a plane of a ycbcr image. + */ + assert(iview->plane_count == 1); + assert(iview->format->plane_count == 1); + + uint8_t image_plane = v3dv_plane_from_aspect(iview->vk.aspects); const struct v3d_resource_slice *slice = - &image->slices[iview->vk.base_mip_level]; + &image->planes[image_plane].slices[iview->vk.base_mip_level]; uint32_t layer_offset = v3dv_layer_offset(image, iview->vk.base_mip_level, - iview->vk.base_array_layer + layer); + iview->vk.base_array_layer + layer, + image_plane); + + /* The Clear Buffer bit is not supported for Z/Stencil stores in 7.x and it + * is broken in earlier V3D versions. + */ + assert((buffer != Z && buffer != STENCIL && buffer != ZSTENCIL) || !clear); cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) { store.buffer_to_store = buffer; - store.address = v3dv_cl_address(image->mem->bo, layer_offset); + store.address = v3dv_cl_address(image->planes[image_plane].mem->bo, layer_offset); store.clear_buffer_being_stored = clear; - store.output_image_format = iview->format->rt_type; - store.r_b_swap = iview->swap_rb; + store.output_image_format = iview->format->planes[0].rt_type; + + /* If we create an image view with only the stencil format, we + * re-interpret the format as RGBA8_UINT, as it is what we want in + * general (see CreateImageView). + * + * However, when we are loading/storing tiles from the ZSTENCIL tile + * buffer, we need to use the underlying DS format.
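+ * (Same re-interpretation as in cmd_buffer_render_pass_emit_load + * above.)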
+ */ + if (buffer == ZSTENCIL && + iview->format->planes[0].rt_type == V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI) { + assert(image->format->planes[image_plane].rt_type == V3D_OUTPUT_IMAGE_FORMAT_D24S8); + store.output_image_format = image->format->planes[image_plane].rt_type; + } + + store.r_b_swap = iview->planes[0].swap_rb; + store.channel_reverse = iview->planes[0].channel_reverse; store.memory_format = slice->tiling; if (slice->tiling == V3D_TILING_UIF_NO_XOR || @@ -349,7 +433,7 @@ check_needs_clear(const struct v3dv_cmd_buffer_state *state, if (state->job->is_subpass_continue) return false; - /* If the render area is not aligned to tile boudaries we can't use the + /* If the render area is not aligned to tile boundaries we can't use the * TLB for a clear. */ if (!state->tile_aligned_render_area) @@ -366,36 +450,6 @@ check_needs_clear(const struct v3dv_cmd_buffer_state *state, return load_op == VK_ATTACHMENT_LOAD_OP_CLEAR; } -static bool -check_needs_store(const struct v3dv_cmd_buffer_state *state, - VkImageAspectFlags aspect, - uint32_t last_subpass_idx, - VkAttachmentStoreOp store_op) -{ - /* We call this with image->vk.aspects & aspect, so 0 means the aspect we are - * testing does not exist in the image. - */ - if (!aspect) - return false; - - /* Attachment (or view) store operations only apply on the last subpass - * where the attachment (or view) is used, in other subpasses we always - * need to store. - */ - if (state->subpass_idx < last_subpass_idx) - return true; - - /* Attachment store operations only apply on the last job we emit on the the - * last subpass where the attachment is used, otherwise we always need to - * store. - */ - if (!state->job->is_subpass_finish) - return true; - - /* The attachment store operation must be STORE */ - return store_op == VK_ATTACHMENT_STORE_OP_STORE; -} - static void cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_cl *cl, @@ -435,6 +489,30 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, const VkImageAspectFlags aspects = vk_format_aspects(ds_attachment->desc.format); +#if V3D_VERSION <= 42 + /* GFXH-1689: The per-buffer store command's clear buffer bit is broken + * for depth/stencil. + * + * There used to be some confusion regarding the Clear Tile Buffers + * Z/S bit also being broken, but we confirmed with Broadcom that this + * is not the case, it was just that some other hardware bugs (that we + * need to work around, such as GFXH-1461) could cause this bit to behave + * incorrectly. + * + * There used to be another issue where the RTs bit in the Clear Tile + * Buffers packet also cleared Z/S, but Broadcom confirmed this is + * fixed since V3D 4.1. + * + * So if we have to emit a clear of depth or stencil we don't use + * the per-buffer store clear bit, even if we need to store the buffers, + * instead we always have to use the Clear Tile Buffers Z/S bit. + * If we have configured the job to do early Z/S clearing, then we + * don't want to emit any Clear Tile Buffers command at all here. + * + * Note that GFXH-1689 is not reproduced in the simulator, where + * using the clear buffer bit in depth/stencil stores works fine. + */ + /* Only clear once on the first subpass that uses the attachment */ uint32_t ds_first_subpass = !state->pass->multiview_enabled ? 
ds_attachment->first_subpass : @@ -454,47 +532,59 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, ds_attachment->desc.stencilLoadOp, subpass->do_stencil_clear_with_draw); + use_global_zs_clear = !state->job->early_zs_clear && + (needs_depth_clear || needs_stencil_clear); +#endif +#if V3D_VERSION >= 71 + /* The store command's clear buffer bit cannot be used for Z/S stencil: + * since V3D 4.5.6 Z/S buffers are automatically cleared between tiles, + * so we don't want to emit redundant clears here. + */ + use_global_zs_clear = false; +#endif + /* Skip the last store if it is not required */ uint32_t ds_last_subpass = !pass->multiview_enabled ? ds_attachment->last_subpass : ds_attachment->views[layer].last_subpass; bool needs_depth_store = - check_needs_store(state, - aspects & VK_IMAGE_ASPECT_DEPTH_BIT, - ds_last_subpass, - ds_attachment->desc.storeOp); + v3dv_cmd_buffer_check_needs_store(state, + aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + ds_last_subpass, + ds_attachment->desc.storeOp); bool needs_stencil_store = - check_needs_store(state, - aspects & VK_IMAGE_ASPECT_STENCIL_BIT, - ds_last_subpass, - ds_attachment->desc.stencilStoreOp); + v3dv_cmd_buffer_check_needs_store(state, + aspects & VK_IMAGE_ASPECT_STENCIL_BIT, + ds_last_subpass, + ds_attachment->desc.stencilStoreOp); + + /* If we have a resolve, handle it before storing the tile */ + const struct v3dv_cmd_buffer_attachment_state *ds_att_state = + &state->attachments[ds_attachment_idx]; + if (ds_att_state->use_tlb_resolve) { + assert(ds_att_state->has_resolve); + assert(subpass->resolve_depth || subpass->resolve_stencil); + const uint32_t resolve_attachment_idx = + subpass->ds_resolve_attachment.attachment; + assert(resolve_attachment_idx != VK_ATTACHMENT_UNUSED); + + const uint32_t zs_buffer = + v3dv_zs_buffer(subpass->resolve_depth, subpass->resolve_stencil); + cmd_buffer_render_pass_emit_store(cmd_buffer, cl, + resolve_attachment_idx, layer, + zs_buffer, + false, false); + has_stores = true; + } else if (ds_att_state->has_resolve) { + /* If we can't use the TLB to implement the resolve we will need to + * store the attachment so we can implement it later using a blit. + */ + needs_depth_store = subpass->resolve_depth; + needs_stencil_store = subpass->resolve_stencil; + } - /* GFXH-1689: The per-buffer store command's clear buffer bit is broken - * for depth/stencil. - * - * There used to be some confusion regarding the Clear Tile Buffers - * Z/S bit also being broken, but we confirmed with Broadcom that this - * is not the case, it was just that some other hardware bugs (that we - * need to work around, such as GFXH-1461) could cause this bit to behave - * incorrectly. - * - * There used to be another issue where the RTs bit in the Clear Tile - * Buffers packet also cleared Z/S, but Broadcom confirmed this is - * fixed since V3D 4.1. - * - * So if we have to emit a clear of depth or stencil we don't use - * the per-buffer store clear bit, even if we need to store the buffers, - * instead we always have to use the Clear Tile Buffers Z/S bit. - * If we have configured the job to do early Z/S clearing, then we - * don't want to emit any Clear Tile Buffers command at all here. - * - * Note that GFXH-1689 is not reproduced in the simulator, where - * using the clear buffer bit in depth/stencil stores works fine. 
- */ - use_global_zs_clear = !state->job->early_zs_clear && - (needs_depth_clear || needs_stencil_clear); if (needs_depth_store || needs_stencil_store) { const uint32_t zs_buffer = v3dv_zs_buffer(needs_depth_store, needs_stencil_store); @@ -536,10 +626,10 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, attachment->views[layer].last_subpass; bool needs_store = - check_needs_store(state, - VK_IMAGE_ASPECT_COLOR_BIT, - last_subpass, - attachment->desc.storeOp); + v3dv_cmd_buffer_check_needs_store(state, + VK_IMAGE_ASPECT_COLOR_BIT, + last_subpass, + attachment->desc.storeOp); /* If we need to resolve this attachment emit that store first. Notice * that we must not request a tile buffer clear here in that case, since * color attachment store below, since the clear happens after the * store is completed. - * - * If the attachment doesn't support TLB resolves then we will have to - * fallback to doing the resolve in a shader separately after this - * job, so we will need to store the multisampled sttachment even if that - * wansn't requested by the client. + * If the attachment doesn't support TLB resolves (or the render area + * is not aligned to tile boundaries) then we will have to fallback to + * doing the resolve in a shader separately after this job, so we will + * need to store the multisampled attachment even if that wasn't + * requested by the client. */ - const bool needs_resolve = - subpass->resolve_attachments && - subpass->resolve_attachments[i].attachment != VK_ATTACHMENT_UNUSED; - if (needs_resolve && attachment->use_tlb_resolve) { + const struct v3dv_cmd_buffer_attachment_state *att_state = + &state->attachments[attachment_idx]; + if (att_state->use_tlb_resolve) { + assert(att_state->has_resolve); const uint32_t resolve_attachment_idx = subpass->resolve_attachments[i].attachment; cmd_buffer_render_pass_emit_store(cmd_buffer, cl, @@ -563,7 +654,7 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, RENDER_TARGET_0 + i, false, true); has_stores = true; - } else if (needs_resolve) { + } else if (att_state->has_resolve) { needs_store = true; } @@ -591,10 +682,15 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, * bit and instead we have to emit a single clear of all tile buffers. */ if (use_global_zs_clear || use_global_rt_clear) { +#if V3D_VERSION == 42 cl_emit(cl, CLEAR_TILE_BUFFERS, clear) { clear.clear_z_stencil_buffer = use_global_zs_clear; clear.clear_all_render_targets = use_global_rt_clear; } +#endif +#if V3D_VERSION >= 71 + cl_emit(cl, CLEAR_RENDER_TARGETS, clear); +#endif } } @@ -698,11 +794,8 @@ set_rcl_early_z_config(struct v3dv_job *job, bool *early_z_disable, uint32_t *early_z_test_and_update_direction) { - /* If this is true then we have not emitted any draw calls in this job - * and we don't get any benefits form early Z. - */ - if (!job->decided_global_ez_enable) { - assert(job->draw_count == 0); + /* Disable if none of the draw calls in this job enabled EZ */ + if (!job->has_ez_draws) { *early_z_disable = true; return; } @@ -723,6 +816,103 @@ set_rcl_early_z_config(struct v3dv_job *job, } } +/* Note that for v71, the render target cfg packet has just one field that + * combines the internal type and clamp mode. For simplicity we keep just one + * helper. + * + * Note: rt_type is in fact an "enum V3DX(Internal_Type)".
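+ * As a concrete example: on 4.2 an sRGB target returns just + * V3D_RENDER_TARGET_CLAMP_NORM, while on 7.1 a 16F target returns the + * combined V3D_RENDER_TARGET_TYPE_CLAMP_16F value, folding type and + * clamp together.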
+ * + * FIXME: for v71 we are not returning all the possible combinations for + * render target internal type and clamp. For example for int types we are + * always using clamp int, and for 16f we are using clamp none or pos (that + * seems to be the equivalent for no-clamp on 4.2), but not pq or hlg. In + * summary right now we are just porting what we were doing on 4.2 + */ +uint32_t +v3dX(clamp_for_format_and_type)(uint32_t rt_type, + VkFormat vk_format) +{ +#if V3D_VERSION == 42 + if (vk_format_is_int(vk_format)) + return V3D_RENDER_TARGET_CLAMP_INT; + else if (vk_format_is_srgb(vk_format)) + return V3D_RENDER_TARGET_CLAMP_NORM; + else + return V3D_RENDER_TARGET_CLAMP_NONE; +#endif +#if V3D_VERSION >= 71 + switch (rt_type) { + case V3D_INTERNAL_TYPE_8I: + return V3D_RENDER_TARGET_TYPE_CLAMP_8I_CLAMPED; + case V3D_INTERNAL_TYPE_8UI: + return V3D_RENDER_TARGET_TYPE_CLAMP_8UI_CLAMPED; + case V3D_INTERNAL_TYPE_8: + return V3D_RENDER_TARGET_TYPE_CLAMP_8; + case V3D_INTERNAL_TYPE_16I: + return V3D_RENDER_TARGET_TYPE_CLAMP_16I_CLAMPED; + case V3D_INTERNAL_TYPE_16UI: + return V3D_RENDER_TARGET_TYPE_CLAMP_16UI_CLAMPED; + case V3D_INTERNAL_TYPE_16F: + return vk_format_is_srgb(vk_format) ? + V3D_RENDER_TARGET_TYPE_CLAMP_16F_CLAMP_NORM : + V3D_RENDER_TARGET_TYPE_CLAMP_16F; + case V3D_INTERNAL_TYPE_32I: + return V3D_RENDER_TARGET_TYPE_CLAMP_32I_CLAMPED; + case V3D_INTERNAL_TYPE_32UI: + return V3D_RENDER_TARGET_TYPE_CLAMP_32UI_CLAMPED; + case V3D_INTERNAL_TYPE_32F: + return V3D_RENDER_TARGET_TYPE_CLAMP_32F; + default: + unreachable("Unknown internal render target type"); + } + + return V3D_RENDER_TARGET_TYPE_CLAMP_INVALID; +#endif +} + +static void +cmd_buffer_render_pass_setup_render_target(struct v3dv_cmd_buffer *cmd_buffer, + int rt, + uint32_t *rt_bpp, +#if V3D_VERSION == 42 + uint32_t *rt_type, + uint32_t *rt_clamp) +#else + uint32_t *rt_type_clamp) +#endif +{ + const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + + assert(state->subpass_idx < state->pass->subpass_count); + const struct v3dv_subpass *subpass = + &state->pass->subpasses[state->subpass_idx]; + + if (rt >= subpass->color_count) + return; + + struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt]; + const uint32_t attachment_idx = attachment->attachment; + if (attachment_idx == VK_ATTACHMENT_UNUSED) + return; + + assert(attachment_idx < state->framebuffer->attachment_count && + attachment_idx < state->attachment_alloc_count); + struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view; + assert(vk_format_is_color(iview->vk.format)); + + assert(iview->plane_count == 1); + *rt_bpp = iview->planes[0].internal_bpp; +#if V3D_VERSION == 42 + *rt_type = iview->planes[0].internal_type; + *rt_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type, + iview->vk.format); +#endif +#if V3D_VERSION >= 71 + *rt_type_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type, + iview->vk.format); +#endif +} + void v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) { @@ -738,7 +928,7 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) * buffer. 
*/ if (!framebuffer) { - assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); + assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); return; } @@ -756,23 +946,44 @@ const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx]; struct v3dv_cl *rcl = &job->rcl; - /* Comon config must be the first TILE_RENDERING_MODE_CFG and + /* Common config must be the first TILE_RENDERING_MODE_CFG and * Z_STENCIL_CLEAR_VALUES must be last. The ones in between are optional * updates to the previous HW state. */ bool do_early_zs_clear = false; const uint32_t ds_attachment_idx = subpass->ds_attachment.attachment; + assert(!tiling->msaa || !tiling->double_buffer); cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) { config.image_width_pixels = framebuffer->width; config.image_height_pixels = framebuffer->height; config.number_of_render_targets = MAX2(subpass->color_count, 1); config.multisample_mode_4x = tiling->msaa; + config.double_buffer_in_non_ms_mode = tiling->double_buffer; +#if V3D_VERSION == 42 config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; +#endif +#if V3D_VERSION >= 71 + config.log2_tile_width = log2_tile_size(tiling->tile_width); + config.log2_tile_height = log2_tile_size(tiling->tile_height); + /* FIXME: ideally we would like the next assert on the packet header (as + * it is generic, so it also applies to GL). We would need to expand + * gen_pack_header for that. + */ + assert(config.log2_tile_width == config.log2_tile_height || + config.log2_tile_width == config.log2_tile_height + 1); +#endif if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) { const struct v3dv_image_view *iview = - framebuffer->attachments[ds_attachment_idx]; - config.internal_depth_type = iview->internal_type; + state->attachments[ds_attachment_idx].image_view; + + /* At this point the image view should be single-plane. But note that + * the underlying image can be multi-plane, and the image view refers + * to one specific plane. + */ + assert(iview->plane_count == 1); + assert(iview->format->plane_count == 1); + config.internal_depth_type = iview->planes[0].internal_type; set_rcl_early_z_config(job, &config.early_z_disable, @@ -787,6 +998,10 @@ * Early-Z/S clearing is independent of Early Z/S testing, so it is * possible to enable one but not the other so long as their * respective requirements are met. + * + * From V3D 4.5.6, Z/S buffers are always cleared automatically + * between tiles, but we still want to enable early ZS clears + * when Z/S are not loaded or stored.
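+ * (On 7.x this is computed below as: no depth/stencil load and no + * depth/stencil store required.)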
*/ struct v3dv_render_pass_attachment *ds_attachment = &pass->attachments[ds_attachment_idx]; @@ -794,6 +1009,13 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) const VkImageAspectFlags ds_aspects = vk_format_aspects(ds_attachment->desc.format); + bool needs_depth_store = + v3dv_cmd_buffer_check_needs_store(state, + ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + ds_attachment->last_subpass, + ds_attachment->desc.storeOp) || + subpass->resolve_depth; +#if V3D_VERSION <= 42 bool needs_depth_clear = check_needs_clear(state, ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, @@ -801,26 +1023,35 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) ds_attachment->desc.loadOp, subpass->do_depth_clear_with_draw); - bool needs_depth_store = - check_needs_store(state, - ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, - ds_attachment->last_subpass, - ds_attachment->desc.storeOp); - do_early_zs_clear = needs_depth_clear && !needs_depth_store; +#endif +#if V3D_VERSION >= 71 + bool needs_depth_load = + v3dv_cmd_buffer_check_needs_load(state, + ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + ds_attachment->first_subpass, + ds_attachment->desc.loadOp, + ds_attachment->last_subpass, + ds_attachment->desc.storeOp); + do_early_zs_clear = !needs_depth_load && !needs_depth_store; +#endif + if (do_early_zs_clear && vk_format_has_stencil(ds_attachment->desc.format)) { bool needs_stencil_load = - check_needs_load(state, - ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT, - ds_attachment->first_subpass, - ds_attachment->desc.stencilLoadOp); + v3dv_cmd_buffer_check_needs_load(state, + ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT, + ds_attachment->first_subpass, + ds_attachment->desc.stencilLoadOp, + ds_attachment->last_subpass, + ds_attachment->desc.stencilStoreOp); bool needs_stencil_store = - check_needs_store(state, - ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT, - ds_attachment->last_subpass, - ds_attachment->desc.stencilStoreOp); + v3dv_cmd_buffer_check_needs_store(state, + ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT, + ds_attachment->last_subpass, + ds_attachment->desc.stencilStoreOp) || + subpass->resolve_stencil; do_early_zs_clear = !needs_stencil_load && !needs_stencil_store; } @@ -837,25 +1068,38 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) */ job->early_zs_clear = do_early_zs_clear; +#if V3D_VERSION >= 71 + uint32_t base_addr = 0; +#endif for (uint32_t i = 0; i < subpass->color_count; i++) { uint32_t attachment_idx = subpass->color_attachments[i].attachment; - if (attachment_idx == VK_ATTACHMENT_UNUSED) + if (attachment_idx == VK_ATTACHMENT_UNUSED) { +#if V3D_VERSION >= 71 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { + rt.render_target_number = i; + rt.stride = 1; /* Unused */ + } +#endif continue; + } struct v3dv_image_view *iview = - state->framebuffer->attachments[attachment_idx]; + state->attachments[attachment_idx].image_view; + assert(iview->plane_count == 1); const struct v3dv_image *image = (struct v3dv_image *) iview->vk.image; + + uint8_t plane = v3dv_plane_from_aspect(iview->vk.aspects); const struct v3d_resource_slice *slice = - &image->slices[iview->vk.base_mip_level]; + &image->planes[plane].slices[iview->vk.base_mip_level]; - const uint32_t *clear_color = + UNUSED const uint32_t *clear_color = &state->attachments[attachment_idx].clear_value.color[0]; - uint32_t clear_pad = 0; + UNUSED uint32_t clear_pad = 0; if (slice->tiling == V3D_TILING_UIF_NO_XOR || slice->tiling == V3D_TILING_UIF_XOR) { - int uif_block_height = 
v3d_utile_height(image->cpp) * 2; + int uif_block_height = v3d_utile_height(image->planes[plane].cpp) * 2; uint32_t implicit_padded_height = align(framebuffer->height, uif_block_height) / uif_block_height; @@ -866,13 +1110,14 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) } } +#if V3D_VERSION == 42 cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) { clear.clear_color_low_32_bits = clear_color[0]; clear.clear_color_next_24_bits = clear_color[1] & 0xffffff; clear.render_target_number = i; }; - if (iview->internal_bpp >= V3D_INTERNAL_BPP_64) { + if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) { cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) { clear.clear_color_mid_low_32_bits = ((clear_color[1] >> 24) | (clear_color[2] << 8)); @@ -882,29 +1127,81 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) }; } - if (iview->internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) { + if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) { cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) { clear.uif_padded_height_in_uif_blocks = clear_pad; clear.clear_color_high_16_bits = clear_color[3] >> 16; clear.render_target_number = i; }; } +#endif + +#if V3D_VERSION >= 71 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { + rt.clear_color_low_bits = clear_color[0]; + cmd_buffer_render_pass_setup_render_target(cmd_buffer, i, &rt.internal_bpp, + &rt.internal_type_and_clamping); + rt.stride = + v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width, + v3d_internal_bpp_words(rt.internal_bpp)); + rt.base_address = base_addr; + rt.render_target_number = i; + + /* base_addr in multiples of 512 bits. We divide by 8 because stride + * is in 128-bit units, but it is packing 2 rows worth of data, so we + * need to divide it by 2 so it is only 1 row, and then again by 4 so + * it is in 512-bit units. + */ + base_addr += (tiling->tile_height * rt.stride) / 8; + } + + if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) { + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) { + rt.clear_color_mid_bits = /* 40 bits (32 + 8) */ + ((uint64_t) clear_color[1]) | + (((uint64_t) (clear_color[2] & 0xff)) << 32); + rt.render_target_number = i; + } + } + + if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_128) { + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) { + rt.clear_color_top_bits = /* 56 bits (24 + 32) */ + (((uint64_t) (clear_color[2] & 0xffffff00)) >> 8) | + (((uint64_t) (clear_color[3])) << 24); + rt.render_target_number = i; + } + } +#endif + } + +#if V3D_VERSION >= 71 + /* If we don't have any color RTs, we still need to emit one and flag + * it as not used using stride = 1. 
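+ * (This matches the stride = 1 convention used above for + * VK_ATTACHMENT_UNUSED attachment slots.)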
+ */ + if (subpass->color_count == 0) { + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { + rt.stride = 1; + } } +#endif +#if V3D_VERSION == 42 cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { - v3dX(cmd_buffer_render_pass_setup_render_target) + cmd_buffer_render_pass_setup_render_target (cmd_buffer, 0, &rt.render_target_0_internal_bpp, &rt.render_target_0_internal_type, &rt.render_target_0_clamp); - v3dX(cmd_buffer_render_pass_setup_render_target) + cmd_buffer_render_pass_setup_render_target (cmd_buffer, 1, &rt.render_target_1_internal_bpp, &rt.render_target_1_internal_type, &rt.render_target_1_clamp); - v3dX(cmd_buffer_render_pass_setup_render_target) + cmd_buffer_render_pass_setup_render_target (cmd_buffer, 2, &rt.render_target_2_internal_bpp, &rt.render_target_2_internal_type, &rt.render_target_2_clamp); - v3dX(cmd_buffer_render_pass_setup_render_target) + cmd_buffer_render_pass_setup_render_target (cmd_buffer, 3, &rt.render_target_3_internal_bpp, &rt.render_target_3_internal_type, &rt.render_target_3_clamp); } +#endif /* Ends rendering mode config. */ if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) { @@ -944,12 +1241,6 @@ tiling->frame_height_in_supertiles; } - /* Start by clearing the tile buffer. */ - cl_emit(rcl, TILE_COORDINATES, coords) { - coords.tile_column_number = 0; - coords.tile_row_number = 0; - } - /* Emit an initial clear of the tile buffers. This is necessary * for any buffers that should be cleared (since clearing * normally happens at the *end* of the generic tile list), but * changes on V3D 3.x, and 2 dummy stores on 4.x. */ for (int i = 0; i < 2; i++) { - if (i > 0) - cl_emit(rcl, TILE_COORDINATES, coords); + cl_emit(rcl, TILE_COORDINATES, coords); cl_emit(rcl, END_OF_LOADS, end); cl_emit(rcl, STORE_TILE_BUFFER_GENERAL, store) { store.buffer_to_store = NONE; } - if (i == 0 && cmd_buffer->state.tile_aligned_render_area) { + if (cmd_buffer->state.tile_aligned_render_area && + (i == 0 || v3dv_do_double_initial_tile_clear(tiling))) { +#if V3D_VERSION == 42 cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) { clear.clear_z_stencil_buffer = !job->early_zs_clear; clear.clear_all_render_targets = true; } +#endif +#if V3D_VERSION >= 71 + cl_emit(rcl, CLEAR_RENDER_TARGETS, clear_rt); +#endif } cl_emit(rcl, END_OF_TILE_MARKER, end); } @@ -990,11 +1286,51 @@ } void +v3dX(viewport_compute_xform)(const VkViewport *viewport, + float scale[3], + float translate[3]) +{ + float x = viewport->x; + float y = viewport->y; + float half_width = 0.5f * viewport->width; + float half_height = 0.5f * viewport->height; + double n = viewport->minDepth; + double f = viewport->maxDepth; + + scale[0] = half_width; + translate[0] = half_width + x; + scale[1] = half_height; + translate[1] = half_height + y; + + scale[2] = (f - n); + translate[2] = n; + + /* It seems that if the scale is small enough the hardware won't clip + * correctly so we work around this by choosing the smallest scale that + * seems to work. + * + * This case is exercised by CTS: + * dEQP-VK.draw.renderpass.inverted_depth_ranges.nodepthclamp_deltazero + * + * V3D 7.x fixes this by using the new + * CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND. + */ +#if V3D_VERSION <= 42 + const float min_abs_scale = 0.0005f; + if (fabs(scale[2]) < min_abs_scale) + scale[2] = scale[2] < 0 ?
-min_abs_scale : min_abs_scale; +#endif +} + +void v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer) { struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic; - /* FIXME: right now we only support one viewport. viewporst[0] would work - * now, would need to change if we allow multiple viewports + struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + assert(pipeline); + + /* FIXME: right now we don't support multiViewport so viewports[0] would + * work now, but would need to change if we allow multiple viewports. */ float *vptranslate = dynamic->viewport.translate[0]; float *vpscale = dynamic->viewport.scale[0]; @@ -1010,29 +1346,83 @@ v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer) v3dv_cl_ensure_space_with_branch(&job->bcl, required_cl_size); v3dv_return_if_oom(cmd_buffer, NULL); +#if V3D_VERSION == 42 cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) { clip.viewport_half_width_in_1_256th_of_pixel = vpscale[0] * 256.0f; clip.viewport_half_height_in_1_256th_of_pixel = vpscale[1] * 256.0f; } +#endif +#if V3D_VERSION >= 71 + cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) { + clip.viewport_half_width_in_1_64th_of_pixel = vpscale[0] * 64.0f; + clip.viewport_half_height_in_1_64th_of_pixel = vpscale[1] * 64.0f; + } +#endif + float translate_z, scale_z; + v3dv_cmd_buffer_state_get_viewport_z_xform(cmd_buffer, 0, + &translate_z, &scale_z); + +#if V3D_VERSION == 42 cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) { - clip.viewport_z_offset_zc_to_zs = vptranslate[2]; - clip.viewport_z_scale_zc_to_zs = vpscale[2]; + clip.viewport_z_offset_zc_to_zs = translate_z; + clip.viewport_z_scale_zc_to_zs = scale_z; + } +#endif + +#if V3D_VERSION >= 71 + /* If the Z scale is too small guardband clipping may not clip correctly */ + if (fabsf(scale_z) < 0.01f) { + cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND, clip) { + clip.viewport_z_offset_zc_to_zs = translate_z; + clip.viewport_z_scale_zc_to_zs = scale_z; + } + } else { + cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) { + clip.viewport_z_offset_zc_to_zs = translate_z; + clip.viewport_z_scale_zc_to_zs = scale_z; + } } +#endif + cl_emit(&job->bcl, CLIPPER_Z_MIN_MAX_CLIPPING_PLANES, clip) { - /* Vulkan's Z NDC is [0..1], unlile OpenGL which is [-1, 1] */ - float z1 = vptranslate[2]; - float z2 = vptranslate[2] + vpscale[2]; + /* Vulkan's default Z NDC is [0..1]. If 'negative_one_to_one' is enabled, + * we are using OpenGL's [-1, 1] instead. + */ + float z1 = pipeline->negative_one_to_one ? 
translate_z - scale_z : + translate_z; + float z2 = translate_z + scale_z; clip.minimum_zw = MIN2(z1, z2); clip.maximum_zw = MAX2(z1, z2); } cl_emit(&job->bcl, VIEWPORT_OFFSET, vp) { - vp.viewport_centre_x_coordinate = vptranslate[0]; - vp.viewport_centre_y_coordinate = vptranslate[1]; + float vp_fine_x = vptranslate[0]; + float vp_fine_y = vptranslate[1]; + int32_t vp_coarse_x = 0; + int32_t vp_coarse_y = 0; + + /* The fine coordinates must be unsigned, but coarse can be signed */ + if (unlikely(vp_fine_x < 0)) { + int32_t blocks_64 = DIV_ROUND_UP(fabsf(vp_fine_x), 64); + vp_fine_x += 64.0f * blocks_64; + vp_coarse_x -= blocks_64; + } + + if (unlikely(vp_fine_y < 0)) { + int32_t blocks_64 = DIV_ROUND_UP(fabsf(vp_fine_y), 64); + vp_fine_y += 64.0f * blocks_64; + vp_coarse_y -= blocks_64; + } + + vp.fine_x = vp_fine_x; + vp.fine_y = vp_fine_y; + vp.coarse_x = vp_coarse_x; + vp.coarse_y = vp_coarse_y; } - cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_VIEWPORT; + BITSET_CLEAR(cmd_buffer->vk.dynamic_graphics_state.dirty, + MESA_VK_DYNAMIC_VP_VIEWPORTS); } void @@ -1042,52 +1432,62 @@ v3dX(cmd_buffer_emit_stencil)(struct v3dv_cmd_buffer *cmd_buffer) assert(job); struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; - struct v3dv_dynamic_state *dynamic_state = &cmd_buffer->state.dynamic; - - const uint32_t dynamic_stencil_states = V3DV_DYNAMIC_STENCIL_COMPARE_MASK | - V3DV_DYNAMIC_STENCIL_WRITE_MASK | - V3DV_DYNAMIC_STENCIL_REFERENCE; + struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; v3dv_cl_ensure_space_with_branch(&job->bcl, 2 * cl_packet_length(STENCIL_CFG)); v3dv_return_if_oom(cmd_buffer, NULL); + bool any_dynamic_stencil_state = + BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) || + BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) || + BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) || + BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_OP); + bool emitted_stencil = false; - for (uint32_t i = 0; i < 2; i++) { + const struct vk_stencil_test_face_state *front = &dyn->ds.stencil.front; + const struct vk_stencil_test_face_state *back = &dyn->ds.stencil.back; + + const bool needs_front_and_back = any_dynamic_stencil_state ? + memcmp(front, back, sizeof(*front)) != 0 : + pipeline->emit_stencil_cfg[1] == true; + const unsigned stencil_packets = needs_front_and_back ? 2 : 1; + + for (uint32_t i = 0; i < stencil_packets; i++) { if (pipeline->emit_stencil_cfg[i]) { - if (dynamic_state->mask & dynamic_stencil_states) { - cl_emit_with_prepacked(&job->bcl, STENCIL_CFG, - pipeline->stencil_cfg[i], config) { - if (dynamic_state->mask & V3DV_DYNAMIC_STENCIL_COMPARE_MASK) { - config.stencil_test_mask = - i == 0 ? dynamic_state->stencil_compare_mask.front : - dynamic_state->stencil_compare_mask.back; - } - if (dynamic_state->mask & V3DV_DYNAMIC_STENCIL_WRITE_MASK) { - config.stencil_write_mask = - i == 0 ? dynamic_state->stencil_write_mask.front : - dynamic_state->stencil_write_mask.back; - } - if (dynamic_state->mask & V3DV_DYNAMIC_STENCIL_REFERENCE) { - config.stencil_ref_value = - i == 0 ? dynamic_state->stencil_reference.front : - dynamic_state->stencil_reference.back; - } + if (any_dynamic_stencil_state) { + const struct vk_stencil_test_face_state *stencil_state = + i == 0 ? 
front : back; + + /* If we have any dynamic stencil state we just emit the entire + * packet, for simplicity. + */ + cl_emit(&job->bcl, STENCIL_CFG, config) { + config.front_config = !needs_front_and_back || i == 0; + config.back_config = !needs_front_and_back || i == 1; + config.stencil_test_mask = stencil_state->compare_mask & 0xff; + config.stencil_write_mask = stencil_state->write_mask & 0xff; + config.stencil_ref_value = stencil_state->reference & 0xff; + config.stencil_test_function = stencil_state->op.compare; + config.stencil_pass_op = + v3dX(translate_stencil_op)(stencil_state->op.pass); + config.depth_test_fail_op = + v3dX(translate_stencil_op)(stencil_state->op.depth_fail); + config.stencil_test_fail_op = + v3dX(translate_stencil_op)(stencil_state->op.fail); } } else { cl_emit_prepacked(&job->bcl, &pipeline->stencil_cfg[i]); } - emitted_stencil = true; } } - if (emitted_stencil) { - const uint32_t dynamic_stencil_dirty_flags = - V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK | - V3DV_CMD_DIRTY_STENCIL_WRITE_MASK | - V3DV_CMD_DIRTY_STENCIL_REFERENCE; - cmd_buffer->state.dirty &= ~dynamic_stencil_dirty_flags; + BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK); + BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE); + BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK); + BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP); } } @@ -1103,19 +1503,51 @@ v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer) struct v3dv_job *job = cmd_buffer->state.job; assert(job); + struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state; + v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(DEPTH_OFFSET)); v3dv_return_if_oom(cmd_buffer, NULL); - struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic; cl_emit(&job->bcl, DEPTH_OFFSET, bias) { - bias.depth_offset_factor = dynamic->depth_bias.slope_factor; - bias.depth_offset_units = dynamic->depth_bias.constant_factor; + bias.depth_offset_factor = dyn->rs.depth_bias.slope; + bias.depth_offset_units = dyn->rs.depth_bias.constant; +#if V3D_VERSION <= 42 if (pipeline->depth_bias.is_z16) bias.depth_offset_units *= 256.0f; +#endif + bias.limit = dyn->rs.depth_bias.clamp; } - cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DEPTH_BIAS; + BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS); +} + +void +v3dX(cmd_buffer_emit_depth_bounds)(struct v3dv_cmd_buffer *cmd_buffer) +{ + /* No depthBounds support for v42, so this method is empty in that case. + * + * Note that this method still gets called, because v3dv_job_init flags all + * state as dirty. See FIXME note in v3dv_job_init.
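+ * + * On 7.x the DEPTH_BOUNDS_TEST_LIMITS packet below is only emitted when + * the dynamic depth bounds test is enabled; on v42 everything below is + * compiled out.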
+ */ +#if V3D_VERSION >= 71 + struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + + if (!dyn->ds.depth.bounds_test.enable) + return; + + struct v3dv_job *job = cmd_buffer->state.job; + assert(job); + + v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(DEPTH_BOUNDS_TEST_LIMITS)); + v3dv_return_if_oom(cmd_buffer, NULL); + + cl_emit(&job->bcl, DEPTH_BOUNDS_TEST_LIMITS, bounds) { + bounds.lower_test_limit = dyn->ds.depth.bounds_test.min; + bounds.upper_test_limit = dyn->ds.depth.bounds_test.max; + } + BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS); +#endif } void @@ -1124,14 +1556,17 @@ v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer) struct v3dv_job *job = cmd_buffer->state.job; assert(job); + struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state; + v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(LINE_WIDTH)); v3dv_return_if_oom(cmd_buffer, NULL); cl_emit(&job->bcl, LINE_WIDTH, line) { - line.line_width = cmd_buffer->state.dynamic.line_width; + line.line_width = v3dv_get_aa_line_width(cmd_buffer->state.gfx.pipeline, + cmd_buffer); } - cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_LINE_WIDTH; + BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH); } void @@ -1161,10 +1596,13 @@ v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer) struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; assert(pipeline); + const struct v3d_device_info *devinfo = &cmd_buffer->device->devinfo; + const uint32_t max_color_rts = V3D_MAX_RENDER_TARGETS(devinfo->ver); + const uint32_t blend_packets_size = cl_packet_length(BLEND_ENABLES) + cl_packet_length(BLEND_CONSTANT_COLOR) + - cl_packet_length(BLEND_CFG) * V3D_MAX_DRAW_BUFFERS; + cl_packet_length(BLEND_CFG) * max_color_rts; v3dv_cl_ensure_space_with_branch(&job->bcl, blend_packets_size); v3dv_return_if_oom(cmd_buffer, NULL); @@ -1176,23 +1614,26 @@ v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer) } } - for (uint32_t i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) { + for (uint32_t i = 0; i < max_color_rts; i++) { if (pipeline->blend.enables & (1 << i)) cl_emit_prepacked(&job->bcl, &pipeline->blend.cfg[i]); } } - if (pipeline->blend.needs_color_constants && - cmd_buffer->state.dirty & V3DV_CMD_DIRTY_BLEND_CONSTANTS) { - struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic; + if (pipeline->blend.needs_color_constants) { + const struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + cl_emit(&job->bcl, BLEND_CONSTANT_COLOR, color) { - color.red_f16 = _mesa_float_to_half(dynamic->blend_constants[0]); - color.green_f16 = _mesa_float_to_half(dynamic->blend_constants[1]); - color.blue_f16 = _mesa_float_to_half(dynamic->blend_constants[2]); - color.alpha_f16 = _mesa_float_to_half(dynamic->blend_constants[3]); + color.red_f16 = _mesa_float_to_half(dyn->cb.blend_constants[0]); + color.green_f16 = _mesa_float_to_half(dyn->cb.blend_constants[1]); + color.blue_f16 = _mesa_float_to_half(dyn->cb.blend_constants[2]); + color.alpha_f16 = _mesa_float_to_half(dyn->cb.blend_constants[3]); } - cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_BLEND_CONSTANTS; } + + BITSET_CLEAR(cmd_buffer->vk.dynamic_graphics_state.dirty, + MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS); } void @@ -1202,13 +1643,21 @@ v3dX(cmd_buffer_emit_color_write_mask)(struct v3dv_cmd_buffer *cmd_buffer) v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(COLOR_WRITE_MASKS)); struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; 
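+ /* Presumably each render target takes 4 bits (one per channel) in this + * mask, so the 4 RTs available on 4.2 fit in the 16-bit mask applied + * below, while 7.x can use the wider mask as-is. */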
- struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic; + struct v3dv_dynamic_state *v3dv_dyn = &cmd_buffer->state.dynamic; + uint32_t color_write_mask = ~v3dv_dyn->color_write_enable | + pipeline->blend.color_write_masks; + +#if V3D_VERSION <= 42 + /* Only 4 RTs */ + color_write_mask &= 0xffff; +#endif + cl_emit(&job->bcl, COLOR_WRITE_MASKS, mask) { - mask.mask = (~dynamic->color_write_enable | - pipeline->blend.color_write_masks) & 0xffff; + mask.mask = color_write_mask; } - cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE; + BITSET_CLEAR(cmd_buffer->vk.dynamic_graphics_state.dirty, + MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES); } static void @@ -1346,11 +1795,33 @@ v3dX(cmd_buffer_emit_varyings_state)(struct v3dv_cmd_buffer *cmd_buffer) } } -static void -job_update_ez_state(struct v3dv_job *job, - struct v3dv_pipeline *pipeline, - struct v3dv_cmd_buffer *cmd_buffer) +#if V3D_VERSION == 42 +/* Updates the cmd_buffer's, and its job's, early Z state tracking. Returns + * false if EZ must be disabled for the current draw call. + */ +static bool +cmd_buffer_update_ez_state(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_pipeline *pipeline) { + struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state; + /* First update the cmd_buffer's ez_state tracking. If possible we reuse + * the values from the pipeline. + */ + if (!BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_OP) && + !BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE) && + !BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE) && + !BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP)) { + cmd_buffer->state.ez_state = pipeline->ez_state; + cmd_buffer->state.incompatible_ez_test = + pipeline->incompatible_ez_test; + } else { + v3dv_compute_ez_state(dyn, pipeline, + &cmd_buffer->state.ez_state, + &cmd_buffer->state.incompatible_ez_test); + } + + struct v3dv_job *job = cmd_buffer->state.job; + assert(job); /* If first_ez_state is V3D_EZ_DISABLED it means that we have already * determined that we should disable EZ completely for all draw calls in * this job. This will cause us to disable EZ for the entire job in the * @@ -1360,9 +1831,15 @@ */ if (job->first_ez_state == V3D_EZ_DISABLED) { assert(job->ez_state == V3D_EZ_DISABLED); - return; + return false; } + /* If ez_state is V3D_EZ_DISABLED it means that we have already decided + * that EZ must be disabled for the remainder of the frame. + */ + if (job->ez_state == V3D_EZ_DISABLED) + return false; + /* This is part of the pre draw call handling, so we should be inside a * render pass. */ @@ -1371,7 +1848,7 @@ /* If this is the first time we update EZ state for this job we first check * if there is anything that requires disabling it completely for the entire * job (based on state that is not related to the current draw call and - * pipeline state). + * pipeline/cmd_buffer state). */ if (!job->decided_global_ez_enable) { job->decided_global_ez_enable = true; @@ -1382,13 +1859,14 @@ if (subpass->ds_attachment.attachment == VK_ATTACHMENT_UNUSED) { job->first_ez_state = V3D_EZ_DISABLED; job->ez_state = V3D_EZ_DISABLED; - return; + return false; } - /* GFXH-1918: the early-z buffer may load incorrect depth values - * if the frame has odd width or height. + /* GFXH-1918: the early-z buffer may load incorrect depth values if the + * frame has odd width or height, or if the buffer is 16-bit and + * multisampled.
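+ * (e.g. a 1921x1080 framebuffer, or a multisampled D16_UNORM depth + * buffer, as checked below).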
* - * So we need to disable EZ in this case. + * So we need to disable EZ in these cases. */ const struct v3dv_render_pass_attachment *ds_attachment = &state->pass->attachments[subpass->ds_attachment.attachment]; @@ -1397,21 +1875,32 @@ job_update_ez_state(struct v3dv_job *job, vk_format_aspects(ds_attachment->desc.format); bool needs_depth_load = - check_needs_load(state, - ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, - ds_attachment->first_subpass, - ds_attachment->desc.loadOp); + v3dv_cmd_buffer_check_needs_load(state, + ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + ds_attachment->first_subpass, + ds_attachment->desc.loadOp, + ds_attachment->last_subpass, + ds_attachment->desc.storeOp); if (needs_depth_load) { + if (ds_attachment->desc.format == VK_FORMAT_D16_UNORM && + ds_attachment->desc.samples != VK_SAMPLE_COUNT_1_BIT) { + perf_debug("Loading depth aspect from a multisampled 16-bit " + "depth buffer disables early-Z tests.\n"); + job->first_ez_state = V3D_EZ_DISABLED; + job->ez_state = V3D_EZ_DISABLED; + return false; + } + struct v3dv_framebuffer *fb = state->framebuffer; if (!fb) { - assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); + assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); perf_debug("Loading depth aspect in a secondary command buffer " "without framebuffer info disables early-z tests.\n"); job->first_ez_state = V3D_EZ_DISABLED; job->ez_state = V3D_EZ_DISABLED; - return; + return false; } if (((fb->width % 2) != 0 || (fb->height % 2) != 0)) { @@ -1419,24 +1908,18 @@ job_update_ez_state(struct v3dv_job *job, "or height disables early-Z tests.\n"); job->first_ez_state = V3D_EZ_DISABLED; job->ez_state = V3D_EZ_DISABLED; - return; + return false; } } } /* Otherwise, we can decide to selectively enable or disable EZ for draw - * calls using the CFG_BITS packet based on the bound pipeline state. + * calls using the CFG_BITS packet based on the bound pipeline state, or + * cmd_buffer state if some stencil/depth flags were dynamic. */ - - /* If the FS writes Z, then it may update against the chosen EZ direction */ - struct v3dv_shader_variant *fs_variant = - pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]; - if (fs_variant->prog_data.fs->writes_z) { - job->ez_state = V3D_EZ_DISABLED; - return; - } - - switch (pipeline->ez_state) { + bool disable_ez = false; + bool incompatible_test = false; + switch (cmd_buffer->state.ez_state) { case V3D_EZ_UNDECIDED: /* If the pipeline didn't pick a direction but didn't disable, then go * along with the current EZ state. This allows EZ optimization for Z @@ -1449,25 +1932,40 @@ job_update_ez_state(struct v3dv_job *job, /* If the pipeline picked a direction, then it needs to match the current * direction if we've decided on one. */ - if (job->ez_state == V3D_EZ_UNDECIDED) - job->ez_state = pipeline->ez_state; - else if (job->ez_state != pipeline->ez_state) - job->ez_state = V3D_EZ_DISABLED; + if (job->ez_state == V3D_EZ_UNDECIDED) { + job->ez_state = cmd_buffer->state.ez_state; + } else if (job->ez_state != pipeline->ez_state) { + disable_ez = true; + incompatible_test = true; + } break; case V3D_EZ_DISABLED: - /* If the pipeline disables EZ because of a bad Z func or stencil - * operation, then we can't do any more EZ in this frame. 
- */ - job->ez_state = V3D_EZ_DISABLED; + disable_ez = true; + incompatible_test = cmd_buffer->state.incompatible_ez_test; break; } - if (job->first_ez_state == V3D_EZ_UNDECIDED && - job->ez_state != V3D_EZ_DISABLED) { + if (job->first_ez_state == V3D_EZ_UNDECIDED && !disable_ez) { + assert(job->ez_state != V3D_EZ_DISABLED); job->first_ez_state = job->ez_state; } + + /* If we had to disable EZ because of an incompatible test direction + * and the cmd buffer writes depth then we need to disable EZ for the rest + * of the frame. + */ + if (incompatible_test && cmd_buffer->state.z_updates_enable) { + assert(disable_ez); + job->ez_state = V3D_EZ_DISABLED; + } + + if (!disable_ez) + job->has_ez_draws = true; + + return !disable_ez; } +#endif void v3dX(cmd_buffer_emit_configuration_bits)(struct v3dv_cmd_buffer *cmd_buffer) @@ -1478,16 +1976,60 @@ struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; assert(pipeline); - job_update_ez_state(job, pipeline, cmd_buffer); - v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CFG_BITS)); v3dv_return_if_oom(cmd_buffer, NULL); + struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + + /* Disable depth/stencil if we don't have a D/S attachment */ + bool has_depth = + pipeline->rendering_info.depth_attachment_format != VK_FORMAT_UNDEFINED; + bool has_stencil = + pipeline->rendering_info.stencil_attachment_format != VK_FORMAT_UNDEFINED; + cl_emit_with_prepacked(&job->bcl, CFG_BITS, pipeline->cfg_bits, config) { - config.early_z_enable = job->ez_state != V3D_EZ_DISABLED; + if (dyn->ds.depth.test_enable && has_depth) { + config.z_updates_enable = dyn->ds.depth.write_enable; + config.depth_test_function = dyn->ds.depth.compare_op; + } else { + config.depth_test_function = VK_COMPARE_OP_ALWAYS; + } + + config.stencil_enable = dyn->ds.stencil.test_enable && has_stencil; + + cmd_buffer->state.z_updates_enable = config.z_updates_enable; +#if V3D_VERSION == 42 + bool enable_ez = cmd_buffer_update_ez_state(cmd_buffer, pipeline); + config.early_z_enable = enable_ez; config.early_z_updates_enable = config.early_z_enable && - pipeline->z_updates_enable; - } + cmd_buffer->state.z_updates_enable; +#endif + + if (pipeline->rasterization_enabled) { + assert(BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_RS_CULL_MODE)); + assert(BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_RS_FRONT_FACE)); + config.enable_forward_facing_primitive = !(dyn->rs.cull_mode & VK_CULL_MODE_FRONT_BIT); + config.enable_reverse_facing_primitive = !(dyn->rs.cull_mode & VK_CULL_MODE_BACK_BIT); + /* Seems like the hardware is backwards regarding this setting... */ + config.clockwise_primitives = dyn->rs.front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE; + } + + /* V3D 4.2 doesn't support depth bounds testing so we don't advertise that + * feature and it shouldn't be used by any pipeline.
*/ + assert(cmd_buffer->device->devinfo.ver >= 71 || + !dyn->ds.depth.bounds_test.enable); +#if V3D_VERSION >= 71 + config.depth_bounds_test_enable = + dyn->ds.depth.bounds_test.enable && has_depth; +#endif + } + + BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE); + BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE); + BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE); + BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE); } void @@ -1523,7 +2065,8 @@ cmd_buffer_subpass_split_for_barrier(struct v3dv_cmd_buffer *cmd_buffer, if (!job) return NULL; - job->serialize = true; + /* FIXME: we can do better than all barriers */ + job->serialize = V3DV_BARRIER_ALL; job->needs_bcl_sync = is_bcl_barrier; return job; } @@ -1538,21 +2081,20 @@ cmd_buffer_copy_secondary_end_query_state(struct v3dv_cmd_buffer *primary, const uint32_t total_state_count = p_state->query.end.used_count + s_state->query.end.used_count; v3dv_cmd_buffer_ensure_array_state(primary, - sizeof(struct v3dv_end_query_cpu_job_info), + sizeof(struct v3dv_end_query_info), total_state_count, &p_state->query.end.alloc_count, (void **) &p_state->query.end.states); v3dv_return_if_oom(primary, NULL); for (uint32_t i = 0; i < s_state->query.end.used_count; i++) { - const struct v3dv_end_query_cpu_job_info *s_qstate = + const struct v3dv_end_query_info *s_qstate = &secondary->state.query.end.states[i]; - struct v3dv_end_query_cpu_job_info *p_qstate = + struct v3dv_end_query_info *p_qstate = &p_state->query.end.states[p_state->query.end.used_count++]; - p_qstate->pool = s_qstate->pool; - p_qstate->query = s_qstate->query; + memcpy(p_qstate, s_qstate, sizeof(struct v3dv_end_query_info)); } } @@ -1563,6 +2105,20 @@ v3dX(cmd_buffer_execute_inside_pass)(struct v3dv_cmd_buffer *primary, { assert(primary->state.job); + /* Typically we postpone applying binning syncs until we see a draw call + * that may actually access protected resources in the binning stage. However, + * if the draw calls are recorded in a secondary command buffer and the + * barriers were recorded in a primary command buffer, that won't work + * and we will have to check if we need a binning sync when executing the + * secondary. + */ + struct v3dv_job *primary_job = primary->state.job; + if (primary_job->serialize && + (primary->state.barrier.bcl_buffer_access || + primary->state.barrier.bcl_image_access)) { + v3dv_cmd_buffer_consume_bcl_sync(primary, primary_job); + } + /* Emit occlusion query state if needed so the draw calls inside our * secondaries update the counters. */ @@ -1575,8 +2131,7 @@ * pipelines used by the secondaries do, we need to re-start the primary * job to enable MSAA. See cmd_buffer_restart_job_for_msaa_if_needed. */ - bool pending_barrier = false; - bool pending_bcl_barrier = false; + struct v3dv_barrier_state pending_barrier = { 0 }; for (uint32_t i = 0; i < cmd_buffer_count; i++) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, secondary, cmd_buffers[i]); @@ -1585,7 +2140,7 @@ list_for_each_entry(struct v3dv_job, secondary_job, &secondary->jobs, list_link) { - if (secondary_job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY) { + if (secondary_job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE) { /* If the job is a CL, then we branch to it from the primary BCL.
* In this case the secondary's BCL is finished with a * RETURN_FROM_SUB_LIST command to return back to the primary BCL @@ -1609,10 +2164,14 @@ v3dX(cmd_buffer_execute_inside_pass)(struct v3dv_cmd_buffer *primary, * the RETURN_FROM_SUB_LIST into the primary job to skip the * branch? */ - struct v3dv_job *primary_job = primary->state.job; - if (!primary_job || secondary_job->serialize || pending_barrier) { + primary_job = primary->state.job; + if (!primary_job || secondary_job->serialize || + pending_barrier.dst_mask) { const bool needs_bcl_barrier = - secondary_job->needs_bcl_sync || pending_bcl_barrier; + secondary_job->needs_bcl_sync || + pending_barrier.bcl_buffer_access || + pending_barrier.bcl_image_access; + primary_job = cmd_buffer_subpass_split_for_barrier(primary, needs_bcl_barrier); @@ -1644,6 +2203,14 @@ v3dX(cmd_buffer_execute_inside_pass)(struct v3dv_cmd_buffer *primary, } } + if (!secondary_job->can_use_double_buffer) { + primary_job->can_use_double_buffer = false; + } else { + primary_job->double_buffer_score.geom += + secondary_job->double_buffer_score.geom; + primary_job->double_buffer_score.render += + secondary_job->double_buffer_score.render; + } primary_job->tmu_dirty_rcl |= secondary_job->tmu_dirty_rcl; } else { /* This is a regular job (CPU or GPU), so just finish the current @@ -1652,15 +2219,21 @@ v3dX(cmd_buffer_execute_inside_pass)(struct v3dv_cmd_buffer *primary, */ v3dv_cmd_buffer_finish_job(primary); v3dv_job_clone_in_cmd_buffer(secondary_job, primary); - if (pending_barrier) { - secondary_job->serialize = true; - if (pending_bcl_barrier) + if (pending_barrier.dst_mask) { + /* FIXME: do the same we do for primaries and only choose the + * relevant src masks. + */ + secondary_job->serialize = pending_barrier.src_mask_graphics | + pending_barrier.src_mask_transfer | + pending_barrier.src_mask_compute; + if (pending_barrier.bcl_buffer_access || + pending_barrier.bcl_image_access) { secondary_job->needs_bcl_sync = true; + } } } - pending_barrier = false; - pending_bcl_barrier = false; + memset(&pending_barrier, 0, sizeof(pending_barrier)); } /* If the secondary has recorded any vkCmdEndQuery commands, we need to @@ -1672,14 +2245,16 @@ v3dX(cmd_buffer_execute_inside_pass)(struct v3dv_cmd_buffer *primary, /* If this secondary had any pending barrier state we will need that * barrier state consumed with whatever comes next in the primary. 
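Folding a secondary's double-buffer heuristics into the primary job follows a simple rule, sketched below with simplified stand-in types (the real fields live in struct v3dv_job).

#include <stdbool.h>
#include <stdint.h>

/* Simplified stand-ins for the job fields used above. */
struct db_score { uint32_t geom, render; };

struct job_db_state {
   bool can_use_double_buffer;
   struct db_score score;
};

/* One secondary that cannot double-buffer disqualifies the whole primary
 * job; otherwise its heuristic scores accumulate into the primary's.
 */
static void
merge_double_buffer_state(struct job_db_state *primary,
                          const struct job_db_state *secondary)
{
   if (!secondary->can_use_double_buffer) {
      primary->can_use_double_buffer = false;
   } else {
      primary->score.geom += secondary->score.geom;
      primary->score.render += secondary->score.render;
   }
}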
*/ - assert(secondary->state.has_barrier || !secondary->state.has_bcl_barrier); - pending_barrier = secondary->state.has_barrier; - pending_bcl_barrier = secondary->state.has_bcl_barrier; + assert(secondary->state.barrier.dst_mask || + (!secondary->state.barrier.bcl_buffer_access && + !secondary->state.barrier.bcl_image_access)); + + pending_barrier = secondary->state.barrier; } - if (pending_barrier) { - primary->state.has_barrier = true; - primary->state.has_bcl_barrier |= pending_bcl_barrier; + if (pending_barrier.dst_mask) { + v3dv_cmd_buffer_merge_barrier_state(&primary->state.barrier, + &pending_barrier); } } @@ -1698,7 +2273,9 @@ emit_gs_shader_state_record(struct v3dv_job *job, gs_bin->prog_data.gs->base.threads == 4; shader.geometry_bin_mode_shader_start_in_final_thread_section = gs_bin->prog_data.gs->base.single_seg; +#if V3D_VERSION <= 42 shader.geometry_bin_mode_shader_propagate_nans = true; +#endif shader.geometry_bin_mode_shader_uniforms_address = gs_bin_uniforms; @@ -1708,21 +2285,23 @@ emit_gs_shader_state_record(struct v3dv_job *job, gs->prog_data.gs->base.threads == 4; shader.geometry_render_mode_shader_start_in_final_thread_section = gs->prog_data.gs->base.single_seg; +#if V3D_VERSION <= 42 shader.geometry_render_mode_shader_propagate_nans = true; +#endif shader.geometry_render_mode_shader_uniforms_address = gs_render_uniforms; } } static uint8_t -v3d_gs_output_primitive(uint32_t prim_type) +v3d_gs_output_primitive(enum mesa_prim prim_type) { switch (prim_type) { - case GL_POINTS: + case MESA_PRIM_POINTS: return GEOMETRY_SHADER_POINTS; - case GL_LINE_STRIP: + case MESA_PRIM_LINE_STRIP: return GEOMETRY_SHADER_LINE_STRIP; - case GL_TRIANGLE_STRIP: + case MESA_PRIM_TRIANGLE_STRIP: return GEOMETRY_SHADER_TRI_STRIP; default: unreachable("Unsupported primitive type"); @@ -1884,10 +2463,12 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer) pipeline->vpm_cfg.Gv); } +#if V3D_VERSION == 42 struct v3dv_bo *default_attribute_values = pipeline->default_attribute_values != NULL ? pipeline->default_attribute_values : pipeline->device->default_attribute_float; +#endif cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_RECORD, pipeline->shader_state_record, shader) { @@ -1913,8 +2494,10 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer) shader.vertex_shader_uniforms_address = cmd_buffer->state.uniforms.vs; shader.fragment_shader_uniforms_address = cmd_buffer->state.uniforms.fs; +#if V3D_VERSION == 42 shader.address_of_default_attribute_values = v3dv_cl_address(default_attribute_values, 0); +#endif shader.any_shader_reads_hardware_written_primitive_id = (pipeline->has_gs && prog_data_gs->uses_pid) || prog_data_fs->uses_pid; @@ -1979,6 +2562,8 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer) cs_loaded_any = true; } + attr.stride = + cmd_buffer->vk.dynamic_graphics_state.vi_binding_strides[binding]; attr.maximum_index = 0xffffff; } @@ -2027,6 +2612,11 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer) } } + /* Clearing push constants and descriptor sets for all stages is not quite + * correct (some shader stages may not be used at all or they may not be + * consuming push constants), however this is not relevant because if we + * bind a different pipeline we always have to rebuild the uniform streams. 
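The dirty-bit handling that follows clears vertex-buffer, descriptor-set and push-constant state in one pass. A minimal sketch of the pattern, with illustrative flag values rather than the driver's actual V3DV_CMD_DIRTY_* encoding:

#include <stdint.h>

/* Illustrative flag values; the driver's V3DV_CMD_DIRTY_* bits differ. */
enum {
   DIRTY_VERTEX_BUFFER   = 1u << 0,
   DIRTY_DESCRIPTOR_SETS = 1u << 1,
   DIRTY_PUSH_CONSTANTS  = 1u << 2,
};

/* Once the shader state record is emitted, everything it consumed is up
 * to date, so all three bits can be dropped at once. Over-clearing for
 * unused stages is harmless: binding a different pipeline rebuilds the
 * uniform streams anyway.
 */
static inline uint32_t
clear_consumed_state(uint32_t dirty)
{
   return dirty & ~(DIRTY_VERTEX_BUFFER |
                    DIRTY_DESCRIPTOR_SETS |
                    DIRTY_PUSH_CONSTANTS);
}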
+ */ cmd_buffer->state.dirty &= ~(V3DV_CMD_DIRTY_VERTEX_BUFFER | V3DV_CMD_DIRTY_DESCRIPTOR_SETS | V3DV_CMD_DIRTY_PUSH_CONSTANTS); @@ -2034,44 +2624,15 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer) cmd_buffer->state.dirty_push_constants_stages &= ~VK_SHADER_STAGE_ALL_GRAPHICS; } -/* FIXME: C&P from v3dx_draw. Refactor to common place? */ -static uint32_t -v3d_hw_prim_type(enum pipe_prim_type prim_type) -{ - switch (prim_type) { - case PIPE_PRIM_POINTS: - case PIPE_PRIM_LINES: - case PIPE_PRIM_LINE_LOOP: - case PIPE_PRIM_LINE_STRIP: - case PIPE_PRIM_TRIANGLES: - case PIPE_PRIM_TRIANGLE_STRIP: - case PIPE_PRIM_TRIANGLE_FAN: - return prim_type; - - case PIPE_PRIM_LINES_ADJACENCY: - case PIPE_PRIM_LINE_STRIP_ADJACENCY: - case PIPE_PRIM_TRIANGLES_ADJACENCY: - case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: - return 8 + (prim_type - PIPE_PRIM_LINES_ADJACENCY); - - default: - unreachable("Unsupported primitive type"); - } -} - void v3dX(cmd_buffer_emit_draw)(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_draw_info *info) { struct v3dv_job *job = cmd_buffer->state.job; assert(job); - - struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; - struct v3dv_pipeline *pipeline = state->gfx.pipeline; - - assert(pipeline); - - uint32_t hw_prim_type = v3d_hw_prim_type(pipeline->topology); + const struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + uint32_t hw_prim_type = v3dv_pipeline_primitive(dyn->ia.primitive_topology); if (info->first_instance > 0) { v3dv_cl_ensure_space_with_branch( @@ -2226,7 +2787,9 @@ v3dX(cmd_buffer_emit_indexed_indirect)(struct v3dv_cmd_buffer *cmd_buffer, assert(job); const struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; - uint32_t hw_prim_type = v3d_hw_prim_type(pipeline->topology); + const struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + uint32_t hw_prim_type = v3dv_pipeline_primitive(dyn->ia.primitive_topology); uint8_t index_type = ffs(cmd_buffer->state.index_buffer.index_size) - 1; v3dv_cl_ensure_space_with_branch( @@ -2245,37 +2808,159 @@ v3dX(cmd_buffer_emit_indexed_indirect)(struct v3dv_cmd_buffer *cmd_buffer, } void -v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer, - int rt, - uint32_t *rt_bpp, - uint32_t *rt_type, - uint32_t *rt_clamp) +v3dX(cmd_buffer_suspend)(struct v3dv_cmd_buffer *cmd_buffer) { - const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + struct v3dv_job *job = cmd_buffer->state.job; + assert(job); - assert(state->subpass_idx < state->pass->subpass_count); - const struct v3dv_subpass *subpass = - &state->pass->subpasses[state->subpass_idx]; + job->suspending = true; - if (rt >= subpass->color_count) - return; + v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(BRANCH)); - struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt]; - const uint32_t attachment_idx = attachment->attachment; - if (attachment_idx == VK_ATTACHMENT_UNUSED) - return; + job->suspend_branch_inst_ptr = cl_start(&job->bcl); + cl_emit(&job->bcl, BRANCH, branch) { + branch.address = v3dv_cl_address(NULL, 0); + } - const struct v3dv_framebuffer *framebuffer = state->framebuffer; - assert(attachment_idx < framebuffer->attachment_count); - struct v3dv_image_view *iview = framebuffer->attachments[attachment_idx]; - assert(iview->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT); - - *rt_bpp = iview->internal_bpp; - *rt_type = iview->internal_type; - if (vk_format_is_int(iview->vk.format)) - *rt_clamp = 
V3D_RENDER_TARGET_CLAMP_INT; - else if (vk_format_is_srgb(iview->vk.format)) - *rt_clamp = V3D_RENDER_TARGET_CLAMP_NORM; - else - *rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE; + /* The sim complains if the command list ends with a branch */ + cl_emit(&job->bcl, NOP, nop); +} + +void +v3dX(job_patch_resume_address)(struct v3dv_job *first_suspend, + struct v3dv_job *suspend, + struct v3dv_job *resume) +{ + assert(resume && resume->resuming); + assert(first_suspend && first_suspend->suspending); + assert(suspend && suspend->suspending); + assert(suspend->suspend_branch_inst_ptr != NULL); + + struct v3dv_bo *resume_bo = + list_first_entry(&resume->bcl.bo_list, struct v3dv_bo, list_link); + struct cl_packet_struct(BRANCH) branch = { + cl_packet_header(BRANCH), + }; + branch.address = v3dv_cl_address(NULL, resume_bo->offset); + + uint8_t *rewrite_addr = (uint8_t *) suspend->suspend_branch_inst_ptr; + cl_packet_pack(BRANCH)(NULL, rewrite_addr, &branch); + + if (resume != first_suspend) { + set_foreach(resume->bos, entry) { + struct v3dv_bo *bo = (void *)entry->key; + v3dv_job_add_bo(first_suspend, bo); + } + } + + first_suspend->suspended_bcl_end = resume->bcl.bo->offset + + v3dv_cl_offset(&resume->bcl); +} + +static void +job_destroy_cb(VkDevice device, uint64_t pobj, VkAllocationCallbacks *allocb) +{ + struct v3dv_job *clone = (struct v3dv_job *) (uintptr_t) pobj; + v3dv_job_destroy(clone); +} + +/** + * This checks if the command buffer has been created with + * VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT, in which case we won't be + * able to safely patch the resume address into the job (since we could have + * another instance of this job running in the GPU, potentially resuming in a + * different address). In that case, we clone the job and make the clone have + * its own BCL copied from the original job so we can later patch the resume + * address into it safely. + */ +struct v3dv_job * +v3dX(cmd_buffer_prepare_suspend_job_for_submit)(struct v3dv_job *job) +{ + assert(job->suspending); + assert(job->cmd_buffer); + assert(job->type == V3DV_JOB_TYPE_GPU_CL); + + if (!(job->cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)) + return job; + + /* Create the clone job, but skip the BCL since we are going to create + * our own below. + */ + struct v3dv_job *clone = v3dv_job_clone(job, true); + if (!clone) + return NULL; + + /* Compute total size of BCL we need to copy */ + uint32_t bcl_size = 0; + list_for_each_entry(struct v3dv_bo, bo, &job->bcl.bo_list, list_link) + bcl_size += bo->size; + + /* Prepare the BCL for the cloned job. For this we go over the BOs in the + * BCL of the original job and we copy their contents into the single BO + * in the BCL of the cloned job. + */ + clone->clone_owns_bcl = true; + v3dv_cl_init(clone, &clone->bcl); + v3dv_cl_ensure_space(&clone->bcl, bcl_size, 4); + if (!clone->bcl.bo) + return NULL; + + assert(clone->bcl.base); + assert(clone->bcl.base == clone->bcl.next); + + /* Unlink this job from the command buffer's execution list */ + list_inithead(&clone->list_link); + + /* Copy the contents of each BO in the original job's BCL into the single + * BO we have in the clone's BCL. + * + * If the BO is the last in the BCL (which we can tell because it wouldn't + * have emitted a BRANCH instruction to link to another BO) we need to copy + * up to the current BCL offset, otherwise we need to copy up to the BRANCH + * instruction (excluded, since we are putting everything together into a + * single BO here). 
+ */
+ list_for_each_entry(struct v3dv_bo, bo, &job->bcl.bo_list, list_link) {
+ assert(bo->map);
+ uint32_t copy_size;
+ if (bo->cl_branch_offset == 0xffffffff) { /* Last BO in BCL */
+ assert(bo == list_last_entry(&job->bcl.bo_list, struct v3dv_bo, list_link));
+ copy_size = v3dv_cl_offset(&job->bcl);
+ } else {
+ assert(bo->cl_branch_offset >= cl_packet_length(BRANCH));
+ copy_size = bo->cl_branch_offset - cl_packet_length(BRANCH);
+ }
+
+ assert(v3dv_cl_offset(&job->bcl) + copy_size < bcl_size);
+ memcpy(cl_start(&clone->bcl), bo->map, copy_size);
+ cl_advance_and_end(&clone->bcl, copy_size);
+ }
+
+ /* Now we need to fixup the pointer to the suspend BRANCH instruction at the
+ * end of the BCL so it points to the address in the new BCL. We know that
+ * to suspend a command buffer we always emit a BRANCH+NOP combo, so we just
+ * need to go back that many bytes into the BCL to find the instruction.
+ */
+ uint32_t suspend_terminator_size =
+ cl_packet_length(BRANCH) + cl_packet_length(NOP);
+ clone->suspend_branch_inst_ptr = (struct v3dv_cl_out *)
+ (((uint8_t *)cl_start(&clone->bcl)) - suspend_terminator_size);
+ assert(*(((uint8_t *)clone->suspend_branch_inst_ptr)) == V3DX(BRANCH_opcode));
+
+ /* This job is not in the execution list of the command buffer so it
+ * won't be destroyed with it; add it as a private object to get it freed.
+ *
+ * FIXME: every time this job is submitted we clone the job and we only
+ * destroy it when the command buffer is destroyed. If the user keeps the
+ * command buffer for the entire lifetime of the application, this command
+ * buffer could grow significantly, so maybe we want to do something smarter
+ * like having a syncobj bound to these jobs and every time we submit the
+ * command buffer again we first check these syncobjs to see if we can free
+ * some of these clones so we avoid blowing up memory.
+ */
+ v3dv_cmd_buffer_add_private_obj(
+ job->cmd_buffer, (uintptr_t)clone,
+ (v3dv_cmd_buffer_private_obj_destroy_cb)job_destroy_cb);
+
+ return clone;
 }
 diff --git a/src/broadcom/vulkan/v3dvx_descriptor_set.c b/src/broadcom/vulkan/v3dvx_descriptor_set.c
index 2c28ce46aa5..ced7b7e8c85 100644
--- a/src/broadcom/vulkan/v3dvx_descriptor_set.c
+++ b/src/broadcom/vulkan/v3dvx_descriptor_set.c
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2021 Raspberry Pi
+ * Copyright © 2021 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -51,7 +51,7 @@ v3dX(descriptor_bo_size)(VkDescriptorType type)
 }

 /* To compute the max_bo_size we want to iterate through the descriptor
- * types. Unfourtunately we can't just use the descriptor type enum values, as
+ * types. Unfortunately we can't just use the descriptor type enum values, as
 * the values are not defined consecutively (so extensions could add new
 * descriptor types), and VK_DESCRIPTOR_TYPE_MAX_ENUM is also a really big
 * number.
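Because the enum is sparse, computing the maximum has to walk an explicit list of core descriptor types. The sketch below shows that iteration; descriptor_bo_size() here is a local stub with dummy sizes, not the driver's v3dX(descriptor_bo_size).

#include <stdint.h>
#include <vulkan/vulkan.h>

/* Stand-in with dummy sizes; the driver uses v3dX(descriptor_bo_size). */
static uint32_t
descriptor_bo_size(VkDescriptorType type)
{
   switch (type) {
   case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
      return 128;   /* dummy value */
   default:
      return 32;    /* dummy value */
   }
}

/* Walk an explicit list of the core descriptor types: the enum is sparse,
 * so looping from 0 to VK_DESCRIPTOR_TYPE_MAX_ENUM is not an option.
 */
static uint32_t
max_descriptor_bo_size(void)
{
   static const VkDescriptorType types[] = {
      VK_DESCRIPTOR_TYPE_SAMPLER,
      VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
      VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE,
      VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
      VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
      VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER,
      VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
      VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
      VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT,
   };

   uint32_t max = 0;
   for (uint32_t i = 0; i < sizeof(types) / sizeof(types[0]); i++) {
      uint32_t size = descriptor_bo_size(types[i]);
      if (size > max)
         max = size;
   }
   return max;
}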
@@ -86,13 +86,15 @@ v3dX(max_descriptor_bo_size)(void) uint32_t -v3dX(combined_image_sampler_texture_state_offset)(void) +v3dX(combined_image_sampler_texture_state_offset)(uint8_t plane) { - return 0; + return v3dX(descriptor_bo_size)(VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) * + plane; } uint32_t -v3dX(combined_image_sampler_sampler_state_offset)(void) +v3dX(combined_image_sampler_sampler_state_offset)(uint8_t plane) { - return cl_aligned_packet_length(TEXTURE_SHADER_STATE, 32); + return v3dX(combined_image_sampler_texture_state_offset)(plane) + + cl_aligned_packet_length(TEXTURE_SHADER_STATE, 32); } diff --git a/src/broadcom/vulkan/v3dvx_device.c b/src/broadcom/vulkan/v3dvx_device.c index a48738aec42..a27d65cfd23 100644 --- a/src/broadcom/vulkan/v3dvx_device.c +++ b/src/broadcom/vulkan/v3dvx_device.c @@ -1,5 +1,5 @@ /* - * Copyright © 2021 Raspberry Pi + * Copyright © 2021 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -26,7 +26,6 @@ #include "broadcom/common/v3d_macros.h" #include "broadcom/cle/v3dx_pack.h" #include "broadcom/compiler/v3d_compiler.h" -#include "vk_format_info.h" #include "util/u_pack_color.h" #include "util/half_float.h" @@ -50,8 +49,8 @@ vk_to_v3d_compare_func[] = { [VK_COMPARE_OP_ALWAYS] = V3D_COMPARE_FUNC_ALWAYS, }; - static union pipe_color_union encode_border_color( + const struct v3dv_device *device, const VkSamplerCustomBorderColorCreateInfoEXT *bc_info) { const struct util_format_description *desc = @@ -59,10 +58,55 @@ static union pipe_color_union encode_border_color( const struct v3dv_format *format = v3dX(get_format)(bc_info->format); + /* YCbCr doesn't interact with border color at all. From spec: + * + * "If sampler YCBCR conversion is enabled, addressModeU, addressModeV, + * and addressModeW must be VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE, + * anisotropyEnable must be VK_FALSE, and unnormalizedCoordinates must + * be VK_FALSE" + */ + assert(format->plane_count == 1); + + /* We use the swizzle in our format table to determine swizzle configuration + * for sampling as well as to decide if we need to use the Swap R/B and + * Reverse Channels bits for Tile Load/Store operations. The order of the + * R/B swap and Reverse operations matters and gives different swizzles. + * Our format table assumes that Reverse happens first and R/B Swap second. + * This seems to match semantics for texture sampling and Tile load/store, + * however, it seems that the semantics are reversed for custom border + * colors so we need to fix up the swizzle manually for this case. + */ + uint8_t swizzle[4]; + const bool v3d_has_reverse_swap_rb_bits = + v3dv_texture_shader_state_has_rb_swap_reverse_bits(device); + if (!v3d_has_reverse_swap_rb_bits && + v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle) && + v3dv_format_swizzle_needs_rb_swap(format->planes[0].swizzle)) { + swizzle[0] = PIPE_SWIZZLE_W; + swizzle[1] = PIPE_SWIZZLE_X; + swizzle[2] = PIPE_SWIZZLE_Y; + swizzle[3] = PIPE_SWIZZLE_Z; + } + /* In v3d 7.x we no longer have a reverse flag for the border color. Instead + * we have to use the new reverse and swap_r/b flags in the texture shader + * state which will apply the format swizzle automatically when sampling + * the border color too and we should not apply it manually here. 
+ */ + else if (v3d_has_reverse_swap_rb_bits && + (v3dv_format_swizzle_needs_rb_swap(format->planes[0].swizzle) || + v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle))) { + swizzle[0] = PIPE_SWIZZLE_X; + swizzle[1] = PIPE_SWIZZLE_Y; + swizzle[2] = PIPE_SWIZZLE_Z; + swizzle[3] = PIPE_SWIZZLE_W; + } else { + memcpy(swizzle, format->planes[0].swizzle, sizeof (swizzle)); + } + union pipe_color_union border; for (int i = 0; i < 4; i++) { - if (format->swizzle[i] <= 3) - border.ui[i] = bc_info->customBorderColor.uint32[format->swizzle[i]]; + if (format->planes[0].swizzle[i] <= 3) + border.ui[i] = bc_info->customBorderColor.uint32[swizzle[i]]; else border.ui[i] = 0; } @@ -90,7 +134,11 @@ static union pipe_color_union encode_border_color( (1 << (desc->channel[i].size - 1)) - 1); } - /* convert from float to expected format */ +#if V3D_VERSION <= 42 + /* The TMU in V3D 7.x always takes 32-bit floats and handles conversions + * for us. In V3D 4.x we need to manually convert floating point color + * values to the expected format. + */ if (vk_format_is_srgb(bc_info->format) || vk_format_is_compressed(bc_info->format)) { for (int i = 0; i < 4; i++) @@ -142,12 +190,14 @@ static union pipe_color_union encode_border_color( } } } +#endif return border; } void -v3dX(pack_sampler_state)(struct v3dv_sampler *sampler, +v3dX(pack_sampler_state)(const struct v3dv_device *device, + struct v3dv_sampler *sampler, const VkSamplerCreateInfo *pCreateInfo, const VkSamplerCustomBorderColorCreateInfoEXT *bc_info) { @@ -175,21 +225,6 @@ v3dX(pack_sampler_state)(struct v3dv_sampler *sampler, break; } - /* For some texture formats, when clamping to transparent black border the - * CTS expects alpha to be set to 1 instead of 0, but the border color mode - * will take priority over the texture state swizzle, so the only way to - * fix that is to apply a swizzle in the shader. Here we keep track of - * whether we are activating that mode and we will decide if we need to - * activate the texture swizzle lowering in the shader key at compile time - * depending on the actual texture format. 
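Applying the fixed-up swizzle to the user's border color is a component-wise remap. A standalone sketch follows; the PIPE_SWIZZLE_* constants are re-declared locally (following Gallium's 0..5 encoding) so the snippet is self-contained.

#include <stdint.h>

/* Gallium's PIPE_SWIZZLE_* encoding: X..W = 0..3, then 0 = 4, 1 = 5. */
enum { SWZ_X, SWZ_Y, SWZ_Z, SWZ_W, SWZ_0, SWZ_1 };

/* Remap a border color through a format swizzle: X..W select a source
 * component; constant channels (0/1) are written as 0 here, mirroring
 * the "swizzle[i] <= 3" check in encode_border_color().
 *
 * Example: the B/G/R/A-style remap {SWZ_Z, SWZ_Y, SWZ_X, SWZ_W} swaps
 * the R and B components of the incoming color.
 */
static void
swizzle_border_color(uint32_t dst[4], const uint32_t src[4],
                     const uint8_t swizzle[4])
{
   for (int i = 0; i < 4; i++)
      dst[i] = swizzle[i] <= SWZ_W ? src[swizzle[i]] : 0;
}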
- */ - if ((pCreateInfo->addressModeU == VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER || - pCreateInfo->addressModeV == VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER || - pCreateInfo->addressModeW == VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER) && - border_color_mode == V3D_BORDER_COLOR_0000) { - sampler->clamp_to_transparent_black_border = true; - } - v3dvx_pack(sampler->sampler_state, SAMPLER_STATE, s) { if (pCreateInfo->anisotropyEnable) { s.anisotropy_enable = true; @@ -204,7 +239,7 @@ v3dX(pack_sampler_state)(struct v3dv_sampler *sampler, s.border_color_mode = border_color_mode; if (s.border_color_mode == V3D_BORDER_COLOR_FOLLOWS) { - union pipe_color_union border = encode_border_color(bc_info); + union pipe_color_union border = encode_border_color(device, bc_info); s.border_color_word_0 = border.ui[0]; s.border_color_word_1 = border.ui[1]; @@ -238,12 +273,15 @@ v3dX(pack_sampler_state)(struct v3dv_sampler *sampler, void v3dX(framebuffer_compute_internal_bpp_msaa)( const struct v3dv_framebuffer *framebuffer, + const struct v3dv_cmd_buffer_attachment_state *attachments, const struct v3dv_subpass *subpass, - uint8_t *max_bpp, + uint8_t *max_internal_bpp, + uint8_t *total_color_bpp, bool *msaa) { STATIC_ASSERT(V3D_INTERNAL_BPP_32 == 0); - *max_bpp = V3D_INTERNAL_BPP_32; + *max_internal_bpp = V3D_INTERNAL_BPP_32; + *total_color_bpp = 0; *msaa = false; if (subpass) { @@ -252,11 +290,15 @@ v3dX(framebuffer_compute_internal_bpp_msaa)( if (att_idx == VK_ATTACHMENT_UNUSED) continue; - const struct v3dv_image_view *att = framebuffer->attachments[att_idx]; + const struct v3dv_image_view *att = attachments[att_idx].image_view; assert(att); + assert(att->plane_count == 1); - if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) - *max_bpp = MAX2(*max_bpp, att->internal_bpp); + if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) { + const uint32_t internal_bpp = att->planes[0].internal_bpp; + *max_internal_bpp = MAX2(*max_internal_bpp, internal_bpp); + *total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp); + } if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT) *msaa = true; @@ -264,23 +306,26 @@ v3dX(framebuffer_compute_internal_bpp_msaa)( if (!*msaa && subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) { const struct v3dv_image_view *att = - framebuffer->attachments[subpass->ds_attachment.attachment]; + attachments[subpass->ds_attachment.attachment].image_view; assert(att); if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT) *msaa = true; } - return; } assert(framebuffer->attachment_count <= 4); for (uint32_t i = 0; i < framebuffer->attachment_count; i++) { - const struct v3dv_image_view *att = framebuffer->attachments[i]; + const struct v3dv_image_view *att = attachments[i].image_view; assert(att); + assert(att->plane_count == 1); - if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) - *max_bpp = MAX2(*max_bpp, att->internal_bpp); + if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) { + const uint32_t internal_bpp = att->planes[0].internal_bpp; + *max_internal_bpp = MAX2(*max_internal_bpp, internal_bpp); + *total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp); + } if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT) *msaa = true; @@ -342,7 +387,7 @@ v3dX(get_hw_clear_color)(const VkClearColorValue *color, } } -#ifdef DEBUG +#if MESA_DEBUG void v3dX(device_check_prepacked_sizes)(void) { diff --git a/src/broadcom/vulkan/v3dvx_formats.c b/src/broadcom/vulkan/v3dvx_formats.c index 4f77dd0086a..4fe548faee0 100644 --- a/src/broadcom/vulkan/v3dvx_formats.c +++ b/src/broadcom/vulkan/v3dvx_formats.c @@ -1,5 
+1,5 @@ /* - * Copyright © 2021 Raspberry Pi + * Copyright © 2021 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -26,6 +26,9 @@ #include "broadcom/cle/v3dx_pack.h" #include "util/format/u_format.h" +#include "vk_enum_to_str.h" +#include "vk_enum_defines.h" +#include "vk_util.h" #define SWIZ(x,y,z,w) { \ PIPE_SWIZZLE_##x, \ @@ -35,15 +38,34 @@ } #define FORMAT(vk, rt, tex, swiz, return_size, supports_filtering) \ - [VK_FORMAT_##vk] = { \ - true, \ - V3D_OUTPUT_IMAGE_FORMAT_##rt, \ - TEXTURE_DATA_FORMAT_##tex, \ - swiz, \ - return_size, \ + [VK_ENUM_OFFSET(VK_FORMAT_##vk)] = { \ + 1, \ + {{ \ + V3D_OUTPUT_IMAGE_FORMAT_##rt, \ + TEXTURE_DATA_FORMAT_##tex, \ + swiz, \ + return_size, \ + }}, \ supports_filtering, \ } +#define PLANE(rt, tex, swiz, return_size) \ + { \ + V3D_OUTPUT_IMAGE_FORMAT_##rt, \ + TEXTURE_DATA_FORMAT_##tex, \ + swiz, \ + return_size \ + } + +#define YCBCR_FORMAT(vk, supports_filtering, plane_count, ...) \ + [VK_ENUM_OFFSET(VK_FORMAT_##vk)] = { \ + plane_count, \ + { \ + __VA_ARGS__, \ + }, \ + supports_filtering, \ + } + #define SWIZ_X001 SWIZ(X, 0, 0, 1) #define SWIZ_XY01 SWIZ(X, Y, 0, 1) #define SWIZ_XYZ1 SWIZ(X, Y, Z, 1) @@ -57,6 +79,7 @@ #define SWIZ_XXXX SWIZ(X, X, X, X) #define SWIZ_000X SWIZ(0, 0, 0, X) #define SWIZ_WXYZ SWIZ(W, X, Y, Z) +#define SWIZ_WZYX SWIZ(W, Z, Y, X) /* FIXME: expand format table to describe whether the format is supported * for buffer surfaces (texel buffers, vertex buffers, etc). @@ -132,6 +155,7 @@ static const struct v3dv_format format_table[] = { FORMAT(A8B8G8R8_SRGB_PACK32, SRGB8_ALPHA8, RGBA8, SWIZ_XYZW, 16, true), /* RGBA8 sRGB */ FORMAT(A2B10G10R10_UNORM_PACK32,RGB10_A2, RGB10_A2, SWIZ_XYZW, 16, true), FORMAT(A2B10G10R10_UINT_PACK32, RGB10_A2UI, RGB10_A2UI, SWIZ_XYZW, 16, false), + FORMAT(A2R10G10B10_UNORM_PACK32,RGB10_A2, RGB10_A2, SWIZ_ZYXW, 16, true), FORMAT(E5B9G9R9_UFLOAT_PACK32, NO, RGB9_E5, SWIZ_XYZ1, 16, true), FORMAT(B10G11R11_UFLOAT_PACK32, R11F_G11F_B10F,R11F_G11F_B10F, SWIZ_XYZ1, 16, true), @@ -196,13 +220,61 @@ static const struct v3dv_format format_table[] = { FORMAT(ASTC_12x12_SRGB_BLOCK, NO, ASTC_12X12, SWIZ_XYZW, 16, true), }; +/** + * Vulkan layout for 4444 formats is defined like this: + * + * Vulkan ABGR4: (LSB) R | G | B | A (MSB) + * Vulkan ARGB4: (LSB) B | G | R | A (MSB) + * + * We map this to the V3D RGB4 texture format, which really, is ABGR4 with + * R in the MSB, so: + * + * V3D ABGR4 : (LSB) A | B | G | R (MSB) + * + * Which is reversed from Vulkan's ABGR4 layout. 
So in order to match Vulkan + * semantics we need to apply the following swizzles: + * + * ABGR4: WZYX (reverse) + * ARGB4: YZWX (reverse + swap R/B) + */ +static const struct v3dv_format format_table_4444[] = { + FORMAT(A4B4G4R4_UNORM_PACK16, ABGR4444, RGBA4, SWIZ_WZYX, 16, true), /* Reverse */ + FORMAT(A4R4G4B4_UNORM_PACK16, ABGR4444, RGBA4, SWIZ_YZWX, 16, true), /* Reverse + RB swap */ +}; + +static const struct v3dv_format format_table_ycbcr[] = { + YCBCR_FORMAT(G8_B8R8_2PLANE_420_UNORM, false, 2, + PLANE(R8, R8, SWIZ(X, 0, 0, 1), 16), + PLANE(RG8, RG8, SWIZ(X, Y, 0, 1), 16) + ), + YCBCR_FORMAT(G8_B8_R8_3PLANE_420_UNORM, false, 3, + PLANE(R8, R8, SWIZ(X, 0, 0, 1), 16), + PLANE(R8, R8, SWIZ(X, 0, 0, 1), 16), + PLANE(R8, R8, SWIZ(X, 0, 0, 1), 16) + ), +}; + const struct v3dv_format * v3dX(get_format)(VkFormat format) { - if (format < ARRAY_SIZE(format_table) && format_table[format].supported) + /* Core formats */ + if (format < ARRAY_SIZE(format_table) && format_table[format].plane_count) return &format_table[format]; - else + + uint32_t ext_number = VK_ENUM_EXTENSION(format); + uint32_t enum_offset = VK_ENUM_OFFSET(format); + + switch (ext_number) { + case _VK_EXT_4444_formats_number: + return &format_table_4444[enum_offset]; + case _VK_KHR_sampler_ycbcr_conversion_number: + if (enum_offset < ARRAY_SIZE(format_table_ycbcr)) + return &format_table_ycbcr[enum_offset]; + else + return NULL; + default: return NULL; + } } void @@ -339,18 +411,32 @@ bool v3dX(format_supports_tlb_resolve)(const struct v3dv_format *format) { uint32_t type, bpp; - v3dX(get_internal_type_bpp_for_output_format)(format->rt_type, &type, &bpp); + + /* Multiplanar images cannot be multisampled: + * + * "sampleCounts will be set to VK_SAMPLE_COUNT_1_BIT if at least one of + * the following conditions is true: (...) format is one of the formats + * that require a sampler Y′CBCR conversion (...)" + */ + if (!format->plane_count || format->plane_count > 1) + return false; + + v3dX(get_internal_type_bpp_for_output_format)(format->planes[0].rt_type, &type, &bpp); return type == V3D_INTERNAL_TYPE_8 || type == V3D_INTERNAL_TYPE_16F; } bool v3dX(format_supports_blending)(const struct v3dv_format *format) { + /* ycbcr formats don't support blending */ + if (!format->plane_count || format->plane_count > 1) + return false; + /* Hardware blending is only supported on render targets that are configured * 4x8-bit unorm, 2x16-bit float or 4x16-bit float. */ uint32_t type, bpp; - v3dX(get_internal_type_bpp_for_output_format)(format->rt_type, &type, &bpp); + v3dX(get_internal_type_bpp_for_output_format)(format->planes[0].rt_type, &type, &bpp); switch (type) { case V3D_INTERNAL_TYPE_8: return bpp == V3D_INTERNAL_BPP_32; @@ -426,23 +512,17 @@ v3dX(get_internal_type_bpp_for_image_aspects)(VkFormat vk_format, uint32_t *internal_type, uint32_t *internal_bpp) { - const VkImageAspectFlags ds_aspects = VK_IMAGE_ASPECT_DEPTH_BIT | - VK_IMAGE_ASPECT_STENCIL_BIT; - /* We can't store depth/stencil pixel formats to a raster format, so - * so instead we load our depth/stencil aspects to a compatible color - * format. + * instead we load our depth/stencil aspects to a compatible color format. */ - /* FIXME: pre-compute this at image creation time? 
*/ - if (aspect_mask & ds_aspects) { + if (aspect_mask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { + *internal_bpp = V3D_INTERNAL_BPP_32; switch (vk_format) { case VK_FORMAT_D16_UNORM: *internal_type = V3D_INTERNAL_TYPE_16UI; - *internal_bpp = V3D_INTERNAL_BPP_64; break; case VK_FORMAT_D32_SFLOAT: *internal_type = V3D_INTERNAL_TYPE_32F; - *internal_bpp = V3D_INTERNAL_BPP_128; break; case VK_FORMAT_X8_D24_UNORM_PACK32: case VK_FORMAT_D24_UNORM_S8_UINT: @@ -451,7 +531,6 @@ v3dX(get_internal_type_bpp_for_image_aspects)(VkFormat vk_format, * load command for more details. */ *internal_type = V3D_INTERNAL_TYPE_8UI; - *internal_bpp = V3D_INTERNAL_BPP_32; break; default: assert(!"unsupported format"); @@ -459,7 +538,9 @@ v3dX(get_internal_type_bpp_for_image_aspects)(VkFormat vk_format, } } else { const struct v3dv_format *format = v3dX(get_format)(vk_format); - v3dX(get_internal_type_bpp_for_output_format)(format->rt_type, + /* We only expect this to be called for single-plane formats */ + assert(format->plane_count == 1); + v3dX(get_internal_type_bpp_for_output_format)(format->planes[0].rt_type, internal_type, internal_bpp); } } diff --git a/src/broadcom/vulkan/v3dvx_image.c b/src/broadcom/vulkan/v3dvx_image.c index a9aa0fb9797..de984e81220 100644 --- a/src/broadcom/vulkan/v3dvx_image.c +++ b/src/broadcom/vulkan/v3dvx_image.c @@ -1,5 +1,5 @@ /* - * Copyright © 2021 Raspberry Pi + * Copyright © 2021 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -26,32 +26,6 @@ #include "broadcom/cle/v3dx_pack.h" #include "broadcom/compiler/v3d_compiler.h" -#include "vk_format_info.h" - -/* - * This method translates pipe_swizzle to the swizzle values used at the - * packet TEXTURE_SHADER_STATE - * - * FIXME: C&P from v3d, common place? - */ -static uint32_t -translate_swizzle(unsigned char pipe_swizzle) -{ - switch (pipe_swizzle) { - case PIPE_SWIZZLE_0: - return 0; - case PIPE_SWIZZLE_1: - return 1; - case PIPE_SWIZZLE_X: - case PIPE_SWIZZLE_Y: - case PIPE_SWIZZLE_Z: - case PIPE_SWIZZLE_W: - return 2 + pipe_swizzle; - default: - unreachable("unknown swizzle"); - } -} - /* * Packs and ensure bo for the shader state (the latter can be temporal). */ @@ -71,78 +45,125 @@ pack_texture_shader_state_helper(struct v3dv_device *device, image->vk.samples == VK_SAMPLE_COUNT_4_BIT); const uint32_t msaa_scale = image->vk.samples == VK_SAMPLE_COUNT_1_BIT ? 
1 : 2; - v3dvx_pack(image_view->texture_shader_state[index], TEXTURE_SHADER_STATE, tex) { - - tex.level_0_is_strictly_uif = - (image->slices[0].tiling == V3D_TILING_UIF_XOR || - image->slices[0].tiling == V3D_TILING_UIF_NO_XOR); - - tex.level_0_xor_enable = (image->slices[0].tiling == V3D_TILING_UIF_XOR); - - if (tex.level_0_is_strictly_uif) - tex.level_0_ub_pad = image->slices[0].ub_pad; - - /* FIXME: v3d never sets uif_xor_disable, but uses it on the following - * check so let's set the default value - */ - tex.uif_xor_disable = false; - if (tex.uif_xor_disable || - tex.level_0_is_strictly_uif) { - tex.extended = true; - } - - tex.base_level = image_view->vk.base_mip_level; - tex.max_level = image_view->vk.base_mip_level + - image_view->vk.level_count - 1; - - tex.swizzle_r = translate_swizzle(image_view->swizzle[0]); - tex.swizzle_g = translate_swizzle(image_view->swizzle[1]); - tex.swizzle_b = translate_swizzle(image_view->swizzle[2]); - tex.swizzle_a = translate_swizzle(image_view->swizzle[3]); - - tex.texture_type = image_view->format->tex_type; - - if (image->vk.image_type == VK_IMAGE_TYPE_3D) { - tex.image_depth = image->vk.extent.depth; - } else { - tex.image_depth = image_view->vk.layer_count; + for (uint8_t plane = 0; plane < image_view->plane_count; plane++) { + uint8_t iplane = image_view->planes[plane].image_plane; + v3dvx_pack(image_view->planes[plane].texture_shader_state[index], TEXTURE_SHADER_STATE, tex) { + + tex.level_0_is_strictly_uif = + (image->planes[iplane].slices[0].tiling == V3D_TILING_UIF_XOR || + image->planes[iplane].slices[0].tiling == V3D_TILING_UIF_NO_XOR); + + tex.level_0_xor_enable = (image->planes[iplane].slices[0].tiling == V3D_TILING_UIF_XOR); + + if (tex.level_0_is_strictly_uif) + tex.level_0_ub_pad = image->planes[iplane].slices[0].ub_pad; + + /* FIXME: v3d never sets uif_xor_disable, but uses it on the following + * check so let's set the default value + */ + tex.uif_xor_disable = false; + if (tex.uif_xor_disable || + tex.level_0_is_strictly_uif) { + tex.extended = true; + } + + tex.base_level = image_view->vk.base_mip_level; + tex.max_level = image_view->vk.base_mip_level + + image_view->vk.level_count - 1; + + tex.swizzle_r = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[0]); + tex.swizzle_g = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[1]); + tex.swizzle_b = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[2]); + tex.swizzle_a = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[3]); + + tex.texture_type = image_view->format->planes[plane].tex_type; + + if (image->vk.image_type == VK_IMAGE_TYPE_3D) { + tex.image_depth = image->vk.extent.depth; + } else { + tex.image_depth = image_view->vk.layer_count; + } + + /* Empirical testing with CTS shows that when we are sampling from cube + * arrays we want to set image depth to layers / 6, but not when doing + * image load/store. + */ + if (image_view->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY && + !for_cube_map_array_storage) { + assert(tex.image_depth % 6 == 0); + tex.image_depth /= 6; + } + + tex.image_height = image->planes[iplane].height * msaa_scale; + tex.image_width = image->planes[iplane].width * msaa_scale; + + /* On 4.x, the height of a 1D texture is redefined to be the + * upper 14 bits of the width (which is only usable with txf). 
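The 14-bit width/height split used for 1D textures below is easier to see with a concrete number; here is a small worked sketch (the packing mirrors the code in this function, the helper itself is illustrative).

#include <assert.h>
#include <stdint.h>

/* For 1D textures the height field carries the upper 14 bits of the
 * width. E.g. width = 100000 (0x186A0) packs as height = 6 and
 * width = 0x06A0, since 6 * 16384 + 0x6A0 == 100000.
 */
static void
pack_1d_texture_size(uint32_t width,
                     uint32_t *hw_width, uint32_t *hw_height)
{
   *hw_height = width >> 14;
   *hw_width = width & ((1u << 14) - 1);
   assert(((*hw_height << 14) | *hw_width) == width);
}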
+ */
+ if (image->vk.image_type == VK_IMAGE_TYPE_1D)
+ tex.image_height = tex.image_width >> 14;
+
+ tex.image_width &= (1 << 14) - 1;
+ tex.image_height &= (1 << 14) - 1;
+
+ tex.array_stride_64_byte_aligned = image->planes[iplane].cube_map_stride / 64;
+
+ /* At this point we don't have the job. That's the reason the first
+ * parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to
+ * add the bo to the job. This also means that we need to manually add
+ * the image bo to the job using the texture.
+ */
+ const uint32_t base_offset =
+ image->planes[iplane].mem->bo->offset +
+ v3dv_layer_offset(image, 0, image_view->vk.base_array_layer,
+ iplane);
+ tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset);
+
+ bool is_srgb = vk_format_is_srgb(image_view->vk.format);
+
+ /* V3D 4.x doesn't have the reverse and swap_r/b bits, so we compose
+ * the reverse and/or swap_r/b swizzle from the format table with the
+ * image view swizzle. This, however, doesn't work for border colors,
+ * for that there is the reverse_standard_border_color.
+ *
+ * In v3d 7.x, however, there is no reverse_standard_border_color bit,
+ * since the reverse and swap_r/b bits also affect border colors. It is
+ * because of this that we absolutely need to use these bits with
+ * reversed and swapped formats, since that's the only way to ensure
+ * correct border colors. In that case we don't want to program the
+ * swizzle to the composition of the format swizzle and the view
+ * swizzle like we do in v3d 4.x, since the format swizzle is applied
+ * via the reverse and swap_r/b bits.
+ */
+#if V3D_VERSION == 42
+ tex.srgb = is_srgb;
+ tex.reverse_standard_border_color =
+ image_view->planes[plane].channel_reverse;
+#endif
+#if V3D_VERSION >= 71
+ tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE;
+
+ tex.reverse = image_view->planes[plane].channel_reverse;
+ tex.r_b_swap = image_view->planes[plane].swap_rb;
+
+ if (tex.reverse || tex.r_b_swap) {
+ tex.swizzle_r =
+ v3d_translate_pipe_swizzle(image_view->view_swizzle[0]);
+ tex.swizzle_g =
+ v3d_translate_pipe_swizzle(image_view->view_swizzle[1]);
+ tex.swizzle_b =
+ v3d_translate_pipe_swizzle(image_view->view_swizzle[2]);
+ tex.swizzle_a =
+ v3d_translate_pipe_swizzle(image_view->view_swizzle[3]);
+ }
+
+ tex.chroma_offset_x = 1;
+ tex.chroma_offset_y = 1;
+ /* See comment in XML field definition for rationale of the shifts */
+ tex.texture_base_pointer_cb = base_offset >> 6;
+ tex.texture_base_pointer_cr = base_offset >> 6;
+#endif
 }
-
- /* Empirical testing with CTS shows that when we are sampling from cube
- * arrays we want to set image depth to layers / 6, but not when doing
- * image load/store.
- */
- if (image_view->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY &&
- !for_cube_map_array_storage) {
- assert(tex.image_depth % 6 == 0);
- tex.image_depth /= 6;
- }
-
- tex.image_height = image->vk.extent.height * msaa_scale;
- tex.image_width = image->vk.extent.width * msaa_scale;
-
- /* On 4.x, the height of a 1D texture is redefined to be the
- * upper 14 bits of the width (which is only usable with txf).
- */
- if (image->vk.image_type == VK_IMAGE_TYPE_1D) {
- tex.image_height = tex.image_width >> 14;
- }
- tex.image_width &= (1 << 14) - 1;
- tex.image_height &= (1 << 14) - 1;
-
- tex.array_stride_64_byte_aligned = image->cube_map_stride / 64;
-
- tex.srgb = vk_format_is_srgb(image_view->vk.format);
-
- /* At this point we don't have the job.
That's the reason the first - * parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to - * add the bo to the job. This also means that we need to add manually - * the image bo to the job using the texture. - */ - const uint32_t base_offset = - image->mem->bo->offset + - v3dv_layer_offset(image, 0, image_view->vk.base_array_layer); - tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset); } } @@ -163,10 +184,14 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device, const struct v3dv_buffer *buffer = buffer_view->buffer; v3dvx_pack(buffer_view->texture_shader_state, TEXTURE_SHADER_STATE, tex) { - tex.swizzle_r = translate_swizzle(PIPE_SWIZZLE_X); - tex.swizzle_g = translate_swizzle(PIPE_SWIZZLE_Y); - tex.swizzle_b = translate_swizzle(PIPE_SWIZZLE_Z); - tex.swizzle_a = translate_swizzle(PIPE_SWIZZLE_W); + tex.swizzle_r = + v3d_translate_pipe_swizzle(buffer_view->format->planes[0].swizzle[0]); + tex.swizzle_g = + v3d_translate_pipe_swizzle(buffer_view->format->planes[0].swizzle[1]); + tex.swizzle_b = + v3d_translate_pipe_swizzle(buffer_view->format->planes[0].swizzle[2]); + tex.swizzle_a = + v3d_translate_pipe_swizzle(buffer_view->format->planes[0].swizzle[3]); tex.image_depth = 1; @@ -180,8 +205,16 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device, tex.image_width &= (1 << 14) - 1; tex.image_height &= (1 << 14) - 1; - tex.texture_type = buffer_view->format->tex_type; - tex.srgb = vk_format_is_srgb(buffer_view->vk_format); + assert(buffer_view->format->plane_count == 1); + tex.texture_type = buffer_view->format->planes[0].tex_type; + + bool is_srgb = vk_format_is_srgb(buffer_view->vk_format); +#if V3D_VERSION == 42 + tex.srgb = is_srgb; +#endif +#if V3D_VERSION >= 71 + tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE; +#endif /* At this point we don't have the job. 
That's the reason the first * parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to @@ -194,5 +227,13 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device, buffer_view->offset; tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset); + +#if V3D_VERSION >= 71 + tex.chroma_offset_x = 1; + tex.chroma_offset_y = 1; + /* See comment in XML field definition for rationale of the shifts */ + tex.texture_base_pointer_cb = base_offset >> 6; + tex.texture_base_pointer_cr = base_offset >> 6; +#endif } } diff --git a/src/broadcom/vulkan/v3dvx_meta_common.c b/src/broadcom/vulkan/v3dvx_meta_common.c index 2f79e4e9c32..858096f9e4b 100644 --- a/src/broadcom/vulkan/v3dvx_meta_common.c +++ b/src/broadcom/vulkan/v3dvx_meta_common.c @@ -1,5 +1,5 @@ /* - * Copyright © 2021 Raspberry Pi + * Copyright © 2021 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -25,11 +25,11 @@ #include "v3dv_meta_common.h" #include "broadcom/common/v3d_macros.h" +#include "broadcom/common/v3d_tfu.h" +#include "broadcom/common/v3d_util.h" #include "broadcom/cle/v3dx_pack.h" #include "broadcom/compiler/v3d_compiler.h" -#include "vk_format_info.h" - struct rcl_clear_info { const union v3dv_clear_value *clear_value; struct v3dv_image *image; @@ -51,25 +51,46 @@ emit_rcl_prologue(struct v3dv_job *job, if (job->cmd_buffer->state.oom) return NULL; + assert(!tiling->msaa || !tiling->double_buffer); cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) { config.early_z_disable = true; config.image_width_pixels = tiling->width; config.image_height_pixels = tiling->height; config.number_of_render_targets = 1; config.multisample_mode_4x = tiling->msaa; + config.double_buffer_in_non_ms_mode = tiling->double_buffer; +#if V3D_VERSION == 42 config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; +#endif +#if V3D_VERSION >= 71 + config.log2_tile_width = log2_tile_size(tiling->tile_width); + config.log2_tile_height = log2_tile_size(tiling->tile_height); + /* FIXME: ideallly we would like next assert on the packet header (as is + * general, so also applies to GL). We would need to expand + * gen_pack_header for that. 
+ */ + assert(config.log2_tile_width == config.log2_tile_height || + config.log2_tile_width == config.log2_tile_height + 1); +#endif config.internal_depth_type = fb->internal_depth_type; } + const uint32_t *color = NULL; if (clear_info && (clear_info->aspects & VK_IMAGE_ASPECT_COLOR_BIT)) { - uint32_t clear_pad = 0; + UNUSED uint32_t clear_pad = 0; if (clear_info->image) { const struct v3dv_image *image = clear_info->image; + + /* From vkCmdClearColorImage: + * "image must not use any of the formats that require a sampler + * YCBCR conversion" + */ + assert(image->plane_count == 1); const struct v3d_resource_slice *slice = - &image->slices[clear_info->level]; + &image->planes[0].slices[clear_info->level]; if (slice->tiling == V3D_TILING_UIF_NO_XOR || slice->tiling == V3D_TILING_UIF_XOR) { - int uif_block_height = v3d_utile_height(image->cpp) * 2; + int uif_block_height = v3d_utile_height(image->planes[0].cpp) * 2; uint32_t implicit_padded_height = align(tiling->height, uif_block_height) / uif_block_height; @@ -81,7 +102,9 @@ emit_rcl_prologue(struct v3dv_job *job, } } - const uint32_t *color = &clear_info->clear_value->color[0]; + color = &clear_info->clear_value->color[0]; + +#if V3D_VERSION == 42 cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) { clear.clear_color_low_32_bits = color[0]; clear.clear_color_next_24_bits = color[1] & 0x00ffffff; @@ -105,13 +128,49 @@ emit_rcl_prologue(struct v3dv_job *job, clear.render_target_number = 0; }; } +#endif } +#if V3D_VERSION == 42 cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { rt.render_target_0_internal_bpp = tiling->internal_bpp; rt.render_target_0_internal_type = fb->internal_type; rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE; } +#endif + +#if V3D_VERSION >= 71 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { + if (color) + rt.clear_color_low_bits = color[0]; + rt.internal_bpp = tiling->internal_bpp; + rt.internal_type_and_clamping = v3dX(clamp_for_format_and_type)(fb->internal_type, + fb->vk_format); + rt.stride = + v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width, + v3d_internal_bpp_words(rt.internal_bpp)); + rt.base_address = 0; + rt.render_target_number = 0; + } + + if (color && tiling->internal_bpp >= V3D_INTERNAL_BPP_64) { + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) { + rt.clear_color_mid_bits = /* 40 bits (32 + 8) */ + ((uint64_t) color[1]) | + (((uint64_t) (color[2] & 0xff)) << 32); + rt.render_target_number = 0; + } + } + + if (color && tiling->internal_bpp >= V3D_INTERNAL_BPP_128) { + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) { + rt.clear_color_top_bits = /* 56 bits (24 + 32) */ + (((uint64_t) (color[2] & 0xffffff00)) >> 8) | + (((uint64_t) (color[3])) << 24); + rt.render_target_number = 0; + } + } +#endif cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) { clear.z_clear_value = clear_info ? clear_info->clear_value->z : 1.0f; @@ -167,11 +226,20 @@ emit_frame_setup(struct v3dv_job *job, cl_emit(rcl, STORE_TILE_BUFFER_GENERAL, store) { store.buffer_to_store = NONE; } - if (clear_value && i == 0) { + /* When using double-buffering, we need to clear both buffers (unless + * we only have a single tile to render). 
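With double buffering the initial tile state has to clear both tile buffers. A minimal sketch of the loop shape follows; emit_clear() is a stand-in for the CLEAR packet emission, and the single-tile exception mirrors the i == 0 || v3dv_do_double_initial_tile_clear(tiling) condition in the code below.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for emitting a CLEAR packet for one tile buffer. */
static void
emit_clear(int tile_buffer)
{
   printf("clear tile buffer %d\n", tile_buffer);
}

/* Tile buffer 0 is always cleared; with double buffering and more than
 * one tile, buffer 1 needs its own clear because the hardware alternates
 * buffers between tiles.
 */
static void
emit_initial_clears(bool clear_value, bool double_buffer, bool single_tile)
{
   for (int i = 0; i < 2; i++) {
      if (clear_value && (i == 0 || (double_buffer && !single_tile)))
         emit_clear(i);
   }
}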
+ */ + if (clear_value && + (i == 0 || v3dv_do_double_initial_tile_clear(tiling))) { +#if V3D_VERSION == 42 cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) { clear.clear_z_stencil_buffer = true; clear.clear_all_render_targets = true; } +#endif +#if V3D_VERSION >= 71 + cl_emit(rcl, CLEAR_RENDER_TARGETS, clear); +#endif } cl_emit(rcl, END_OF_TILE_MARKER, end); } @@ -254,6 +322,9 @@ choose_tlb_format(struct v3dv_meta_framebuffer *framebuffer, bool is_copy_to_buffer, bool is_copy_from_buffer) { + /* At this point the framebuffer was already lowered to single-plane */ + assert(framebuffer->format->plane_count == 1); + if (is_copy_to_buffer || is_copy_from_buffer) { switch (framebuffer->vk_format) { case VK_FORMAT_D16_UNORM: @@ -295,11 +366,11 @@ choose_tlb_format(struct v3dv_meta_framebuffer *framebuffer, } } default: /* Color formats */ - return framebuffer->format->rt_type; + return framebuffer->format->planes[0].rt_type; break; } } else { - return framebuffer->format->rt_type; + return framebuffer->format->planes[0].rt_type; } } @@ -307,8 +378,24 @@ static inline bool format_needs_rb_swap(struct v3dv_device *device, VkFormat format) { - const uint8_t *swizzle = v3dv_get_format_swizzle(device, format); - return swizzle[0] == PIPE_SWIZZLE_Z; + /* We are calling these methods for framebuffer formats, that at this point + * should be single-plane + */ + assert(vk_format_get_plane_count(format) == 1); + const uint8_t *swizzle = v3dv_get_format_swizzle(device, format, 0); + return v3dv_format_swizzle_needs_rb_swap(swizzle); +} + +static inline bool +format_needs_reverse(struct v3dv_device *device, + VkFormat format) +{ + /* We are calling these methods for framebuffer formats, that at this point + * should be single-plane + */ + assert(vk_format_get_plane_count(format) == 1); + const uint8_t *swizzle = v3dv_get_format_swizzle(device, format, 0); + return v3dv_format_swizzle_needs_reverse(swizzle); } static void @@ -322,22 +409,29 @@ emit_image_load(struct v3dv_device *device, bool is_copy_to_buffer, bool is_copy_from_buffer) { - uint32_t layer_offset = v3dv_layer_offset(image, mip_level, layer); + uint8_t plane = v3dv_plane_from_aspect(aspect); + uint32_t layer_offset = v3dv_layer_offset(image, mip_level, layer, plane); + /* For multi-plane formats we are copying plane by plane to the color + * tlb. Framebuffer format was already selected to be a tlb single-plane + * compatible format. We still need to use the real plane to get the + * address etc from the source image. + */ + assert(framebuffer->format->plane_count == 1); /* For image to/from buffer copies we always load to and store from RT0, * even for depth/stencil aspects, because the hardware can't do raster * stores or loads from/to the depth/stencil tile buffers. */ bool load_to_color_tlb = is_copy_to_buffer || is_copy_from_buffer || + image->format->plane_count > 1 || aspect == VK_IMAGE_ASPECT_COLOR_BIT; - const struct v3d_resource_slice *slice = &image->slices[mip_level]; + const struct v3d_resource_slice *slice = &image->planes[plane].slices[mip_level]; cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) { load.buffer_to_load = load_to_color_tlb ? 
RENDER_TARGET_0 : v3dX(zs_buffer_from_aspect_bits)(aspect); - load.address = v3dv_cl_address(image->mem->bo, layer_offset); - + load.address = v3dv_cl_address(image->planes[plane].mem->bo, layer_offset); load.input_image_format = choose_tlb_format(framebuffer, aspect, false, is_copy_to_buffer, is_copy_from_buffer); @@ -374,6 +468,7 @@ emit_image_load(struct v3dv_device *device, * so we need to make sure we respect the format swizzle. */ needs_rb_swap = format_needs_rb_swap(device, framebuffer->vk_format); + needs_chan_reverse = format_needs_reverse(device, framebuffer->vk_format); } load.r_b_swap = needs_rb_swap; @@ -406,17 +501,28 @@ emit_image_store(struct v3dv_device *device, bool is_copy_from_buffer, bool is_multisample_resolve) { - uint32_t layer_offset = v3dv_layer_offset(image, mip_level, layer); + uint8_t plane = v3dv_plane_from_aspect(aspect); + uint32_t layer_offset = v3dv_layer_offset(image, mip_level, layer, plane); + + /* + * For multi-plane formats we are copying plane by plane to the color + * tlb. Framebuffer format was already selected to be a tlb single-plane + * compatible format. We still need to use the real plane to get the + * address etc. + */ + assert(framebuffer->format->plane_count == 1); bool store_from_color_tlb = is_copy_to_buffer || is_copy_from_buffer || + image->format->plane_count > 1 || aspect == VK_IMAGE_ASPECT_COLOR_BIT; - const struct v3d_resource_slice *slice = &image->slices[mip_level]; + const struct v3d_resource_slice *slice = &image->planes[plane].slices[mip_level]; cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) { store.buffer_to_store = store_from_color_tlb ? RENDER_TARGET_0 : v3dX(zs_buffer_from_aspect_bits)(aspect); - store.address = v3dv_cl_address(image->mem->bo, layer_offset); + store.address = v3dv_cl_address(image->planes[plane].mem->bo, layer_offset); + store.clear_buffer_being_stored = false; /* See rationale in emit_image_load() */ @@ -431,6 +537,7 @@ emit_image_store(struct v3dv_device *device, } else if (!is_copy_from_buffer && !is_copy_to_buffer && (aspect & VK_IMAGE_ASPECT_COLOR_BIT)) { needs_rb_swap = format_needs_rb_swap(device, framebuffer->vk_format); + needs_chan_reverse = format_needs_reverse(device, framebuffer->vk_format); } store.r_b_swap = needs_rb_swap; @@ -463,7 +570,7 @@ emit_copy_layer_to_buffer_per_tile_list(struct v3dv_job *job, struct v3dv_buffer *buffer, struct v3dv_image *image, uint32_t layer_offset, - const VkBufferImageCopy2KHR *region) + const VkBufferImageCopy2 *region) { struct v3dv_cl *cl = &job->indirect; v3dv_cl_ensure_space(cl, 200, 1); @@ -512,9 +619,10 @@ emit_copy_layer_to_buffer_per_tile_list(struct v3dv_job *job, * Vulkan spec states that the output buffer must have packed stencil * values, where each stencil value is 1 byte. */ + uint8_t plane = v3dv_plane_from_aspect(region->imageSubresource.aspectMask); uint32_t cpp = region->imageSubresource.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT ? 
- 1 : image->cpp; + 1 : image->planes[plane].cpp; uint32_t buffer_stride = width * cpp; uint32_t buffer_offset = buffer->mem_offset + region->bufferOffset + height * buffer_stride * layer_offset; @@ -543,7 +651,7 @@ emit_copy_layer_to_buffer(struct v3dv_job *job, struct v3dv_image *image, struct v3dv_meta_framebuffer *framebuffer, uint32_t layer, - const VkBufferImageCopy2KHR *region) + const VkBufferImageCopy2 *region) { emit_copy_layer_to_buffer_per_tile_list(job, framebuffer, buffer, image, layer, region); @@ -555,7 +663,7 @@ v3dX(meta_emit_copy_image_to_buffer_rcl)(struct v3dv_job *job, struct v3dv_buffer *buffer, struct v3dv_image *image, struct v3dv_meta_framebuffer *framebuffer, - const VkBufferImageCopy2KHR *region) + const VkBufferImageCopy2 *region) { struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL); v3dv_return_if_oom(NULL, job); @@ -572,7 +680,7 @@ emit_resolve_image_layer_per_tile_list(struct v3dv_job *job, struct v3dv_image *dst, struct v3dv_image *src, uint32_t layer_offset, - const VkImageResolve2KHR *region) + const VkImageResolve2 *region) { struct v3dv_cl *cl = &job->indirect; v3dv_cl_ensure_space(cl, 200, 1); @@ -608,11 +716,14 @@ emit_resolve_image_layer_per_tile_list(struct v3dv_job *job, region->dstSubresource.baseArrayLayer + layer_offset : region->dstOffset.z + layer_offset; + bool is_depth_or_stencil = + region->dstSubresource.aspectMask & + (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT); emit_image_store(job->device, cl, framebuffer, dst, region->dstSubresource.aspectMask, dst_layer, region->dstSubresource.mipLevel, - false, false, true); + false, false, !is_depth_or_stencil); cl_emit(cl, END_OF_TILE_MARKER, end); @@ -630,7 +741,7 @@ emit_resolve_image_layer(struct v3dv_job *job, struct v3dv_image *src, struct v3dv_meta_framebuffer *framebuffer, uint32_t layer, - const VkImageResolve2KHR *region) + const VkImageResolve2 *region) { emit_resolve_image_layer_per_tile_list(job, framebuffer, dst, src, layer, region); @@ -642,7 +753,7 @@ v3dX(meta_emit_resolve_image_rcl)(struct v3dv_job *job, struct v3dv_image *dst, struct v3dv_image *src, struct v3dv_meta_framebuffer *framebuffer, - const VkImageResolve2KHR *region) + const VkImageResolve2 *region) { struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL); v3dv_return_if_oom(NULL, job); @@ -733,7 +844,7 @@ emit_copy_image_layer_per_tile_list(struct v3dv_job *job, struct v3dv_image *dst, struct v3dv_image *src, uint32_t layer_offset, - const VkImageCopy2KHR *region) + const VkImageCopy2 *region) { struct v3dv_cl *cl = &job->indirect; v3dv_cl_ensure_space(cl, 200, 1); @@ -791,7 +902,7 @@ emit_copy_image_layer(struct v3dv_job *job, struct v3dv_image *src, struct v3dv_meta_framebuffer *framebuffer, uint32_t layer, - const VkImageCopy2KHR *region) + const VkImageCopy2 *region) { emit_copy_image_layer_per_tile_list(job, framebuffer, dst, src, layer, region); emit_supertile_coordinates(job, framebuffer); @@ -802,7 +913,7 @@ v3dX(meta_emit_copy_image_rcl)(struct v3dv_job *job, struct v3dv_image *dst, struct v3dv_image *src, struct v3dv_meta_framebuffer *framebuffer, - const VkImageCopy2KHR *region) + const VkImageCopy2 *region) { struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL); v3dv_return_if_oom(NULL, job); @@ -815,79 +926,108 @@ v3dX(meta_emit_copy_image_rcl)(struct v3dv_job *job, void v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_image *dst, - uint32_t dst_mip_level, - uint32_t dst_layer, - struct v3dv_image *src, - uint32_t src_mip_level, - 
uint32_t src_layer, + uint32_t dst_bo_handle, + uint32_t dst_offset, + enum v3d_tiling_mode dst_tiling, + uint32_t dst_padded_height_or_stride, + uint32_t dst_cpp, + uint32_t src_bo_handle, + uint32_t src_offset, + enum v3d_tiling_mode src_tiling, + uint32_t src_padded_height_or_stride, + uint32_t src_cpp, uint32_t width, uint32_t height, - const struct v3dv_format *format) + const struct v3dv_format_plane *format_plane) { - const struct v3d_resource_slice *src_slice = &src->slices[src_mip_level]; - const struct v3d_resource_slice *dst_slice = &dst->slices[dst_mip_level]; - - assert(dst->mem && dst->mem->bo); - const struct v3dv_bo *dst_bo = dst->mem->bo; - - assert(src->mem && src->mem->bo); - const struct v3dv_bo *src_bo = src->mem->bo; - struct drm_v3d_submit_tfu tfu = { .ios = (height << 16) | width, .bo_handles = { - dst_bo->handle, - src_bo->handle != dst_bo->handle ? src_bo->handle : 0 + dst_bo_handle, + src_bo_handle != dst_bo_handle ? src_bo_handle : 0 }, }; - const uint32_t src_offset = - src_bo->offset + v3dv_layer_offset(src, src_mip_level, src_layer); tfu.iia |= src_offset; - uint32_t icfg; - if (src_slice->tiling == V3D_TILING_RASTER) { - icfg = V3D_TFU_ICFG_FORMAT_RASTER; +#if V3D_VERSION <= 42 + if (src_tiling == V3D_TILING_RASTER) { + tfu.icfg = V3D33_TFU_ICFG_FORMAT_RASTER << V3D33_TFU_ICFG_FORMAT_SHIFT; + } else { + tfu.icfg = (V3D33_TFU_ICFG_FORMAT_LINEARTILE + + (src_tiling - V3D_TILING_LINEARTILE)) << + V3D33_TFU_ICFG_FORMAT_SHIFT; + } + tfu.icfg |= format_plane->tex_type << V3D33_TFU_ICFG_TTYPE_SHIFT; +#endif +#if V3D_VERSION >= 71 + if (src_tiling == V3D_TILING_RASTER) { + tfu.icfg = V3D71_TFU_ICFG_FORMAT_RASTER << V3D71_TFU_ICFG_IFORMAT_SHIFT; } else { - icfg = V3D_TFU_ICFG_FORMAT_LINEARTILE + - (src_slice->tiling - V3D_TILING_LINEARTILE); + tfu.icfg = (V3D71_TFU_ICFG_FORMAT_LINEARTILE + + (src_tiling - V3D_TILING_LINEARTILE)) << + V3D71_TFU_ICFG_IFORMAT_SHIFT; } - tfu.icfg |= icfg << V3D_TFU_ICFG_FORMAT_SHIFT; + tfu.icfg |= format_plane->tex_type << V3D71_TFU_ICFG_OTYPE_SHIFT; +#endif - const uint32_t dst_offset = - dst_bo->offset + v3dv_layer_offset(dst, dst_mip_level, dst_layer); - tfu.ioa |= dst_offset; + tfu.ioa = dst_offset; - tfu.ioa |= (V3D_TFU_IOA_FORMAT_LINEARTILE + - (dst_slice->tiling - V3D_TILING_LINEARTILE)) << - V3D_TFU_IOA_FORMAT_SHIFT; - tfu.icfg |= format->tex_type << V3D_TFU_ICFG_TTYPE_SHIFT; +#if V3D_VERSION <= 42 + tfu.ioa |= (V3D33_TFU_IOA_FORMAT_LINEARTILE + + (dst_tiling - V3D_TILING_LINEARTILE)) << + V3D33_TFU_IOA_FORMAT_SHIFT; +#endif - switch (src_slice->tiling) { +#if V3D_VERSION >= 71 + tfu.v71.ioc = (V3D71_TFU_IOC_FORMAT_LINEARTILE + + (dst_tiling - V3D_TILING_LINEARTILE)) << + V3D71_TFU_IOC_FORMAT_SHIFT; + + switch (dst_tiling) { case V3D_TILING_UIF_NO_XOR: case V3D_TILING_UIF_XOR: - tfu.iis |= src_slice->padded_height / (2 * v3d_utile_height(src->cpp)); + tfu.v71.ioc |= + (dst_padded_height_or_stride / (2 * v3d_utile_height(dst_cpp))) << + V3D71_TFU_IOC_STRIDE_SHIFT; break; case V3D_TILING_RASTER: - tfu.iis |= src_slice->stride / src->cpp; + tfu.v71.ioc |= (dst_padded_height_or_stride / dst_cpp) << + V3D71_TFU_IOC_STRIDE_SHIFT; break; default: break; } +#endif + switch (src_tiling) { + case V3D_TILING_UIF_NO_XOR: + case V3D_TILING_UIF_XOR: + tfu.iis |= src_padded_height_or_stride / (2 * v3d_utile_height(src_cpp)); + break; + case V3D_TILING_RASTER: + tfu.iis |= src_padded_height_or_stride / src_cpp; + break; + default: + break; + } + + /* The TFU can handle raster sources but always produces UIF results */ + assert(dst_tiling != 
V3D_TILING_RASTER); + +#if V3D_VERSION <= 42 /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the * OPAD field for the destination (how many extra UIF blocks beyond * those necessary to cover the height). */ - if (dst_slice->tiling == V3D_TILING_UIF_NO_XOR || - dst_slice->tiling == V3D_TILING_UIF_XOR) { - uint32_t uif_block_h = 2 * v3d_utile_height(dst->cpp); + if (dst_tiling == V3D_TILING_UIF_NO_XOR || dst_tiling == V3D_TILING_UIF_XOR) { + uint32_t uif_block_h = 2 * v3d_utile_height(dst_cpp); uint32_t implicit_padded_height = align(height, uif_block_h); - uint32_t icfg = - (dst_slice->padded_height - implicit_padded_height) / uif_block_h; - tfu.icfg |= icfg << V3D_TFU_ICFG_OPAD_SHIFT; + uint32_t icfg = (dst_padded_height_or_stride - implicit_padded_height) / + uif_block_h; + tfu.icfg |= icfg << V3D33_TFU_ICFG_OPAD_SHIFT; } +#endif v3dv_cmd_buffer_add_tfu_job(cmd_buffer, &tfu); } @@ -1042,7 +1182,7 @@ emit_copy_buffer_to_layer_per_tile_list(struct v3dv_job *job, struct v3dv_image *image, struct v3dv_buffer *buffer, uint32_t layer, - const VkBufferImageCopy2KHR *region) + const VkBufferImageCopy2 *region) { struct v3dv_cl *cl = &job->indirect; v3dv_cl_ensure_space(cl, 200, 1); @@ -1072,8 +1212,9 @@ emit_copy_buffer_to_layer_per_tile_list(struct v3dv_job *job, width = DIV_ROUND_UP(width, vk_format_get_blockwidth(image->vk.format)); height = DIV_ROUND_UP(height, vk_format_get_blockheight(image->vk.format)); + uint8_t plane = v3dv_plane_from_aspect(imgrsc->aspectMask); uint32_t cpp = imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT ? - 1 : image->cpp; + 1 : image->planes[plane].cpp; uint32_t buffer_stride = width * cpp; uint32_t buffer_offset = buffer->mem_offset + region->bufferOffset + height * buffer_stride * layer; @@ -1081,6 +1222,9 @@ emit_copy_buffer_to_layer_per_tile_list(struct v3dv_job *job, uint32_t format = choose_tlb_format(framebuffer, imgrsc->aspectMask, false, false, true); + uint32_t image_layer = layer + (image->vk.image_type != VK_IMAGE_TYPE_3D ? 
+ imgrsc->baseArrayLayer : region->imageOffset.z); + emit_linear_load(cl, RENDER_TARGET_0, buffer->mem->bo, buffer_offset, buffer_stride, format); @@ -1100,13 +1244,13 @@ emit_copy_buffer_to_layer_per_tile_list(struct v3dv_job *job, if (imgrsc->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) { emit_image_load(job->device, cl, framebuffer, image, VK_IMAGE_ASPECT_STENCIL_BIT, - imgrsc->baseArrayLayer + layer, imgrsc->mipLevel, + image_layer, imgrsc->mipLevel, false, false); } else { assert(imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT); emit_image_load(job->device, cl, framebuffer, image, VK_IMAGE_ASPECT_DEPTH_BIT, - imgrsc->baseArrayLayer + layer, imgrsc->mipLevel, + image_layer, imgrsc->mipLevel, false, false); } } @@ -1117,20 +1261,20 @@ emit_copy_buffer_to_layer_per_tile_list(struct v3dv_job *job, /* Store TLB to image */ emit_image_store(job->device, cl, framebuffer, image, imgrsc->aspectMask, - imgrsc->baseArrayLayer + layer, imgrsc->mipLevel, + image_layer, imgrsc->mipLevel, false, true, false); if (framebuffer->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) { if (imgrsc->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) { emit_image_store(job->device, cl, framebuffer, image, VK_IMAGE_ASPECT_STENCIL_BIT, - imgrsc->baseArrayLayer + layer, imgrsc->mipLevel, + image_layer, imgrsc->mipLevel, false, false, false); } else { assert(imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT); emit_image_store(job->device, cl, framebuffer, image, VK_IMAGE_ASPECT_DEPTH_BIT, - imgrsc->baseArrayLayer + layer, imgrsc->mipLevel, + image_layer, imgrsc->mipLevel, false, false, false); } } @@ -1151,7 +1295,7 @@ emit_copy_buffer_to_layer(struct v3dv_job *job, struct v3dv_buffer *buffer, struct v3dv_meta_framebuffer *framebuffer, uint32_t layer, - const VkBufferImageCopy2KHR *region) + const VkBufferImageCopy2 *region) { emit_copy_buffer_to_layer_per_tile_list(job, framebuffer, image, buffer, layer, region); @@ -1163,7 +1307,7 @@ v3dX(meta_emit_copy_buffer_to_image_rcl)(struct v3dv_job *job, struct v3dv_image *image, struct v3dv_buffer *buffer, struct v3dv_meta_framebuffer *framebuffer, - const VkBufferImageCopy2KHR *region) + const VkBufferImageCopy2 *region) { struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL); v3dv_return_if_oom(NULL, job); @@ -1175,8 +1319,8 @@ v3dX(meta_emit_copy_buffer_to_image_rcl)(struct v3dv_job *job, } /* Figure out a TLB size configuration for a number of pixels to process. - * Beware that we can't "render" more than 4096x4096 pixels in a single job, - * if the pixel count is larger than this, the caller might need to split + * Beware that we can't "render" more than MAX_DIMxMAX_DIM pixels in a single + * job; if the pixel count is larger than this, the caller might need to split the job and call this function multiple times.
*/ static void @@ -1186,7 +1330,7 @@ framebuffer_size_for_pixel_count(uint32_t num_pixels, { assert(num_pixels > 0); - const uint32_t max_dim_pixels = 4096; + const uint32_t max_dim_pixels = V3D_MAX_IMAGE_DIMENSION; const uint32_t max_pixels = max_dim_pixels * max_dim_pixels; uint32_t w, h; @@ -1215,7 +1359,7 @@ v3dX(meta_copy_buffer)(struct v3dv_cmd_buffer *cmd_buffer, uint32_t dst_offset, struct v3dv_bo *src, uint32_t src_offset, - const VkBufferCopy2KHR *region) + const VkBufferCopy2 *region) { const uint32_t internal_bpp = V3D_INTERNAL_BPP_32; const uint32_t internal_type = V3D_INTERNAL_TYPE_8UI; @@ -1264,7 +1408,9 @@ v3dX(meta_copy_buffer)(struct v3dv_cmd_buffer *cmd_buffer, uint32_t width, height; framebuffer_size_for_pixel_count(num_items, &width, &height); - v3dv_job_start_frame(job, width, height, 1, true, 1, internal_bpp, false); + v3dv_job_start_frame(job, width, height, 1, true, true, 1, + internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), + false); struct v3dv_meta_framebuffer framebuffer; v3dX(meta_framebuffer_init)(&framebuffer, vk_format, internal_type, @@ -1310,7 +1456,9 @@ v3dX(meta_fill_buffer)(struct v3dv_cmd_buffer *cmd_buffer, uint32_t width, height; framebuffer_size_for_pixel_count(num_items, &width, &height); - v3dv_job_start_frame(job, width, height, 1, true, 1, internal_bpp, false); + v3dv_job_start_frame(job, width, height, 1, true, true, 1, + internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), + false); struct v3dv_meta_framebuffer framebuffer; v3dX(meta_framebuffer_init)(&framebuffer, VK_FORMAT_R8G8B8A8_UINT, diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c index 8623a453701..616a7730cd4 100644 --- a/src/broadcom/vulkan/v3dvx_pipeline.c +++ b/src/broadcom/vulkan/v3dvx_pipeline.c @@ -1,5 +1,5 @@ /* - * Copyright © 2021 Raspberry Pi + * Copyright © 2021 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -26,8 +26,6 @@ #include "broadcom/cle/v3dx_pack.h" #include "broadcom/compiler/v3d_compiler.h" -#include "vk_format_info.h" - static uint8_t blend_factor(VkBlendFactor factor, bool dst_alpha_one, bool *needs_constants) { @@ -58,15 +56,10 @@ blend_factor(VkBlendFactor factor, bool dst_alpha_one, bool *needs_constants) case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR: case VK_BLEND_FACTOR_SRC1_ALPHA: case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA: - assert(!"Invalid blend factor: dual source blending not supported."); + unreachable("Invalid blend factor: dual source blending not supported."); default: - assert(!"Unknown blend factor."); + unreachable("Unknown blend factor."); } - - /* Should be handled by the switch, added to avoid a "end of non-void - * function" error - */ - unreachable("Unknown blend factor."); } static void @@ -86,21 +79,19 @@ pack_blend(struct v3dv_pipeline *pipeline, if (!cb_info) return; - assert(pipeline->subpass); - if (pipeline->subpass->color_count == 0) + const struct vk_render_pass_state *ri = &pipeline->rendering_info; + if (ri->color_attachment_count == 0) return; - assert(pipeline->subpass->color_count == cb_info->attachmentCount); - + assert(ri->color_attachment_count == cb_info->attachmentCount); pipeline->blend.needs_color_constants = false; uint32_t color_write_masks = 0; - for (uint32_t i = 0; i < pipeline->subpass->color_count; i++) { + for (uint32_t i = 0; i < ri->color_attachment_count; i++) { const VkPipelineColorBlendAttachmentState *b_state = 
&cb_info->pAttachments[i]; - uint32_t attachment_idx = - pipeline->subpass->color_attachments[i].attachment; - if (attachment_idx == VK_ATTACHMENT_UNUSED) + const VkFormat vk_format = ri->color_attachment_formats[i]; + if (vk_format == VK_FORMAT_UNDEFINED) continue; color_write_masks |= (~b_state->colorWriteMask & 0xf) << (4 * i); @@ -108,10 +99,13 @@ pack_blend(struct v3dv_pipeline *pipeline, if (!b_state->blendEnable) continue; - VkAttachmentDescription *desc = - &pipeline->pass->attachments[attachment_idx].desc; - const struct v3dv_format *format = v3dX(get_format)(desc->format); - bool dst_alpha_one = (format->swizzle[3] == PIPE_SWIZZLE_1); + const struct v3dv_format *format = v3dX(get_format)(vk_format); + + /* We only do blending with render pass attachments, so we should not have + * multiplanar images here + */ + assert(format->plane_count == 1); + bool dst_alpha_one = (format->planes[0].swizzle[3] == PIPE_SWIZZLE_1); uint8_t rt_mask = 1 << i; pipeline->blend.enables |= rt_mask; @@ -148,6 +142,7 @@ pack_cfg_bits(struct v3dv_pipeline *pipeline, const VkPipelineDepthStencilStateCreateInfo *ds_info, const VkPipelineRasterizationStateCreateInfo *rs_info, const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *pv_info, + const VkPipelineRasterizationLineStateCreateInfoEXT *ls_info, const VkPipelineMultisampleStateCreateInfo *ms_info) { assert(sizeof(pipeline->cfg_bits) == cl_packet_length(CFG_BITS)); @@ -156,23 +151,21 @@ pack_cfg_bits(struct v3dv_pipeline *pipeline, ms_info && ms_info->rasterizationSamples > VK_SAMPLE_COUNT_1_BIT; v3dvx_pack(pipeline->cfg_bits, CFG_BITS, config) { - config.enable_forward_facing_primitive = - rs_info ? !(rs_info->cullMode & VK_CULL_MODE_FRONT_BIT) : false; - - config.enable_reverse_facing_primitive = - rs_info ? !(rs_info->cullMode & VK_CULL_MODE_BACK_BIT) : false; - - /* Seems like the hardware is backwards regarding this setting... */ - config.clockwise_primitives = - rs_info ? rs_info->frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE : false; - - config.enable_depth_offset = rs_info ? rs_info->depthBiasEnable: false; + /* Even if rs_info->depthBiasEnable is true, we may decide not to + * enable it, e.g. when the pipeline doesn't have a depth/stencil + * attachment. + */ + config.enable_depth_offset = pipeline->depth_bias.enabled; /* This is required to pass line rasterization tests in CTS while * exposing, at least, a minimum of 4 bits of subpixel precision * (the minimum requirement). */ - config.line_rasterization = 1; /* perp end caps */ + if (ls_info && + ls_info->lineRasterizationMode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT) + config.line_rasterization = V3D_LINE_RASTERIZATION_DIAMOND_EXIT; + else + config.line_rasterization = V3D_LINE_RASTERIZATION_PERP_END_CAPS; if (rs_info && rs_info->polygonMode != VK_POLYGON_MODE_FILL) { config.direct3d_wireframe_triangles_mode = true; @@ -180,7 +173,10 @@ pack_cfg_bits(struct v3dv_pipeline *pipeline, rs_info->polygonMode == VK_POLYGON_MODE_POINT; } - config.rasterizer_oversample_mode = pipeline->msaa ? 1 : 0; + /* diamond-exit rasterization does not support oversampling */ + config.rasterizer_oversample_mode = + (config.line_rasterization == V3D_LINE_RASTERIZATION_PERP_END_CAPS && + pipeline->msaa) ?
1 : 0; /* From the Vulkan spec: * @@ -203,30 +199,42 @@ pack_cfg_bits(struct v3dv_pipeline *pipeline, config.blend_enable = pipeline->blend.enables != 0; - /* Disable depth/stencil if we don't have a D/S attachment */ - bool has_ds_attachment = - pipeline->subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED; - - if (ds_info && ds_info->depthTestEnable && has_ds_attachment) { - config.z_updates_enable = ds_info->depthWriteEnable; - config.depth_test_function = ds_info->depthCompareOp; +#if V3D_VERSION >= 71 + /* From the Vulkan spec: + * + * "depthClampEnable controls whether to clamp the fragment’s depth + * values as described in Depth Test. If the pipeline is not created + * with VkPipelineRasterizationDepthClipStateCreateInfoEXT present + * then enabling depth clamp will also disable clipping primitives to + * the z planes of the frustum as described in Primitive Clipping. + * Otherwise depth clipping is controlled by the state set in + * VkPipelineRasterizationDepthClipStateCreateInfoEXT." + */ + bool z_clamp_enable = rs_info && rs_info->depthClampEnable; + bool z_clip_enable = false; + const VkPipelineRasterizationDepthClipStateCreateInfoEXT *clip_info = + rs_info ? vk_find_struct_const(rs_info->pNext, + PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT) : + NULL; + if (clip_info) + z_clip_enable = clip_info->depthClipEnable; + else if (!z_clamp_enable) + z_clip_enable = true; + + if (z_clip_enable) { + config.z_clipping_mode = pipeline->negative_one_to_one ? + V3D_Z_CLIP_MODE_MIN_ONE_TO_ONE : V3D_Z_CLIP_MODE_ZERO_TO_ONE; } else { - config.depth_test_function = VK_COMPARE_OP_ALWAYS; + config.z_clipping_mode = V3D_Z_CLIP_MODE_NONE; } - /* EZ state will be updated at draw time based on bound pipeline state */ - config.early_z_updates_enable = false; - config.early_z_enable = false; - - config.stencil_enable = - ds_info ? ds_info->stencilTestEnable && has_ds_attachment: false; - - pipeline->z_updates_enable = config.z_updates_enable; + config.z_clamp_mode = z_clamp_enable; +#endif }; } -static uint32_t -translate_stencil_op(enum pipe_stencil_op op) +uint32_t +v3dX(translate_stencil_op)(VkStencilOp op) { switch (op) { case VK_STENCIL_OP_KEEP: @@ -255,7 +263,8 @@ pack_single_stencil_cfg(struct v3dv_pipeline *pipeline, uint8_t *stencil_cfg, bool is_front, bool is_back, - const VkStencilOpState *stencil_state) + const VkStencilOpState *stencil_state, + const struct vk_graphics_pipeline_state *state) { /* From the Vulkan spec: * @@ -267,60 +276,54 @@ pack_single_stencil_cfg(struct v3dv_pipeline *pipeline, * * In our case, 's' is always 8, so we clamp to that to prevent our packing * functions from asserting in debug mode if they see larger values. - * - * If we have dynamic state we need to make sure we set the corresponding - * state bits to 0, since cl_emit_with_prepacked ORs the new value with - * the old. */ - const uint8_t write_mask = - pipeline->dynamic_state.mask & V3DV_DYNAMIC_STENCIL_WRITE_MASK ? - 0 : stencil_state->writeMask & 0xff; - - const uint8_t compare_mask = - pipeline->dynamic_state.mask & V3DV_DYNAMIC_STENCIL_COMPARE_MASK ? - 0 : stencil_state->compareMask & 0xff; - - const uint8_t reference = - pipeline->dynamic_state.mask & V3DV_DYNAMIC_STENCIL_COMPARE_MASK ?
- 0 : stencil_state->reference & 0xff; - v3dvx_pack(stencil_cfg, STENCIL_CFG, config) { config.front_config = is_front; config.back_config = is_back; - config.stencil_write_mask = write_mask; - config.stencil_test_mask = compare_mask; + config.stencil_write_mask = stencil_state->writeMask & 0xff; + config.stencil_test_mask = stencil_state->compareMask & 0xff; config.stencil_test_function = stencil_state->compareOp; - config.stencil_pass_op = translate_stencil_op(stencil_state->passOp); - config.depth_test_fail_op = translate_stencil_op(stencil_state->depthFailOp); - config.stencil_test_fail_op = translate_stencil_op(stencil_state->failOp); - config.stencil_ref_value = reference; + config.stencil_pass_op = + v3dX(translate_stencil_op)(stencil_state->passOp); + config.depth_test_fail_op = + v3dX(translate_stencil_op)(stencil_state->depthFailOp); + config.stencil_test_fail_op = + v3dX(translate_stencil_op)(stencil_state->failOp); + config.stencil_ref_value = stencil_state->reference & 0xff; } } static void pack_stencil_cfg(struct v3dv_pipeline *pipeline, - const VkPipelineDepthStencilStateCreateInfo *ds_info) + const VkPipelineDepthStencilStateCreateInfo *ds_info, + const struct vk_graphics_pipeline_state *state) { assert(sizeof(pipeline->stencil_cfg) == 2 * cl_packet_length(STENCIL_CFG)); - if (!ds_info || !ds_info->stencilTestEnable) + if ((!ds_info || !ds_info->stencilTestEnable) && + (!BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE))) { return; + } - if (pipeline->subpass->ds_attachment.attachment == VK_ATTACHMENT_UNUSED) + const struct vk_render_pass_state *ri = &pipeline->rendering_info; + if (ri->stencil_attachment_format == VK_FORMAT_UNDEFINED) return; - const uint32_t dynamic_stencil_states = V3DV_DYNAMIC_STENCIL_COMPARE_MASK | - V3DV_DYNAMIC_STENCIL_WRITE_MASK | - V3DV_DYNAMIC_STENCIL_REFERENCE; - + const bool any_dynamic_stencil_states = + BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) || + BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) || + BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) || + BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE) || + BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_DS_STENCIL_OP); /* If front != back or we have dynamic stencil state we can't emit a single * packet for both faces. */ bool needs_front_and_back = false; - if ((pipeline->dynamic_state.mask & dynamic_stencil_states) || - memcmp(&ds_info->front, &ds_info->back, sizeof(ds_info->front))) + if ((any_dynamic_stencil_states) || + memcmp(&ds_info->front, &ds_info->back, sizeof(ds_info->front))) { needs_front_and_back = true; + } /* If the front and back configurations are the same we can emit both with * a single packet. @@ -328,33 +331,41 @@ pack_stencil_cfg(struct v3dv_pipeline *pipeline, pipeline->emit_stencil_cfg[0] = true; if (!needs_front_and_back) { pack_single_stencil_cfg(pipeline, pipeline->stencil_cfg[0], - true, true, &ds_info->front); + true, true, &ds_info->front, state); } else { pipeline->emit_stencil_cfg[1] = true; pack_single_stencil_cfg(pipeline, pipeline->stencil_cfg[0], - true, false, &ds_info->front); + true, false, &ds_info->front, state); pack_single_stencil_cfg(pipeline, pipeline->stencil_cfg[1], - false, true, &ds_info->back); + false, true, &ds_info->back, state); } } + +/* FIXME: Now that we are passing the vk_graphics_pipeline_state we could + * avoid passing all those parameters. 
But doing that would require changing + * all the code that uses the VkXXX structures to use the equivalent + * vk_xxx ones instead. + */ void v3dX(pipeline_pack_state)(struct v3dv_pipeline *pipeline, const VkPipelineColorBlendStateCreateInfo *cb_info, const VkPipelineDepthStencilStateCreateInfo *ds_info, const VkPipelineRasterizationStateCreateInfo *rs_info, const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *pv_info, - const VkPipelineMultisampleStateCreateInfo *ms_info) + const VkPipelineRasterizationLineStateCreateInfoEXT *ls_info, + const VkPipelineMultisampleStateCreateInfo *ms_info, + const struct vk_graphics_pipeline_state *state) { pack_blend(pipeline, cb_info); - pack_cfg_bits(pipeline, ds_info, rs_info, pv_info, ms_info); - pack_stencil_cfg(pipeline, ds_info); + pack_cfg_bits(pipeline, ds_info, rs_info, pv_info, ls_info, ms_info); + pack_stencil_cfg(pipeline, ds_info, state); } static void pack_shader_state_record(struct v3dv_pipeline *pipeline) { - assert(sizeof(pipeline->shader_state_record) == + assert(sizeof(pipeline->shader_state_record) >= cl_packet_length(GL_SHADER_STATE_RECORD)); struct v3d_fs_prog_data *prog_data_fs = @@ -378,7 +389,7 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline) if (!pipeline->has_gs) { shader.point_size_in_shaded_vertex_data = - pipeline->topology == PIPE_PRIM_POINTS; + pipeline->topology == MESA_PRIM_POINTS; } else { struct v3d_gs_prog_data *prog_data_gs = pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY]->prog_data.gs; @@ -390,6 +401,7 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline) * shader needs to write the Z value (even just discards). */ shader.fragment_shader_does_z_writes = prog_data_fs->writes_z; + /* Set if the EZ test must be disabled (due to shader side * effects and the early_z flag not being present in the * shader). @@ -428,15 +440,16 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline) shader.number_of_varyings_in_fragment_shader = prog_data_fs->num_inputs; - shader.coordinate_shader_propagate_nans = true; - shader.vertex_shader_propagate_nans = true; - shader.fragment_shader_propagate_nans = true; - - /* Note: see previous note about adresses */ + /* Note: see previous note about addresses */ /* shader.coordinate_shader_code_address */ /* shader.vertex_shader_code_address */ /* shader.fragment_shader_code_address */ +#if V3D_VERSION == 42 + shader.coordinate_shader_propagate_nans = true; + shader.vertex_shader_propagate_nans = true; + shader.fragment_shader_propagate_nans = true; + /* FIXME: Use combined input/output size flag in the common case (also * on v3d, see v3dx_draw). */ @@ -444,20 +457,32 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline) prog_data_vs_bin->separate_segments; shader.vertex_shader_has_separate_input_and_output_vpm_blocks = prog_data_vs->separate_segments; - shader.coordinate_shader_input_vpm_segment_size = prog_data_vs_bin->separate_segments ? prog_data_vs_bin->vpm_input_size : 1; shader.vertex_shader_input_vpm_segment_size = prog_data_vs->separate_segments ? prog_data_vs->vpm_input_size : 1; +#endif + + /* On V3D 7.1 there isn't a specific flag to select whether we are using + * shared or separate segments. We just set the value of + * vpm_input_size to 0, and set the output to the max needed.
That should + * already be properly set in prog_data_vs_bin. + */ +#if V3D_VERSION == 71 + shader.coordinate_shader_input_vpm_segment_size = + prog_data_vs_bin->vpm_input_size; + shader.vertex_shader_input_vpm_segment_size = + prog_data_vs->vpm_input_size; +#endif shader.coordinate_shader_output_vpm_segment_size = prog_data_vs_bin->vpm_output_size; shader.vertex_shader_output_vpm_segment_size = prog_data_vs->vpm_output_size; - /* Note: see previous note about adresses */ + /* Note: see previous note about addresses */ /* shader.coordinate_shader_uniforms_address */ /* shader.vertex_shader_uniforms_address */ /* shader.fragment_shader_uniforms_address */ @@ -499,7 +524,7 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline) shader.instance_id_read_by_vertex_shader = prog_data_vs->uses_iid; - /* Note: see previous note about adresses */ + /* Note: see previous note about addresses */ /* shader.address_of_default_attribute_values */ } } @@ -592,7 +617,6 @@ pack_shader_state_attribute_record(struct v3dv_pipeline *pipeline, attr.instance_divisor = MIN2(pipeline->vb[binding].instance_divisor, 0xffff); - attr.stride = pipeline->vb[binding].stride; attr.type = get_attr_type(desc); } } @@ -652,3 +676,76 @@ v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline, } } } + +#if V3D_VERSION == 42 +static bool +pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline) +{ + for (uint8_t i = 0; i < pipeline->va_count; i++) { + if (vk_format_is_int(pipeline->va[i].vk_format)) + return true; + } + return false; +} +#endif + +bool +v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline) +{ +#if V3D_VERSION == 42 + return pipeline_has_integer_vertex_attrib(pipeline); +#endif + + return false; +} + +/* @pipeline can be NULL. In that case we assume the most common case. For + * example, for v42 we assume that all the attributes have a float format + * (we only create an all-float BO once and reuse it with all float + * pipelines); otherwise we look at the actual type of each attribute used + * with the specific pipeline passed in. + */ +struct v3dv_bo * +v3dX(create_default_attribute_values)(struct v3dv_device *device, + struct v3dv_pipeline *pipeline) +{ +#if V3D_VERSION >= 71 + return NULL; +#endif + + uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4; + struct v3dv_bo *bo; + + bo = v3dv_bo_alloc(device, size, "default_vi_attributes", true); + + if (!bo) { + fprintf(stderr, "failed to allocate memory for the default " + "attribute values\n"); + return NULL; + } + + bool ok = v3dv_bo_map(device, bo, size); + if (!ok) { + fprintf(stderr, "failed to map default attribute values buffer\n"); + return NULL; + } + + uint32_t *attrs = bo->map; + uint8_t va_count = pipeline != NULL ? pipeline->va_count : 0; + for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) { + attrs[i * 4 + 0] = 0; + attrs[i * 4 + 1] = 0; + attrs[i * 4 + 2] = 0; + VkFormat attr_format = + pipeline != NULL ?
pipeline->va[i].vk_format : VK_FORMAT_UNDEFINED; + if (i < va_count && vk_format_is_int(attr_format)) { + attrs[i * 4 + 3] = 1; + } else { + attrs[i * 4 + 3] = fui(1.0); + } + } + + v3dv_bo_unmap(device, bo); + + return bo; +} diff --git a/src/broadcom/vulkan/v3dvx_private.h b/src/broadcom/vulkan/v3dvx_private.h index ab134225a3a..68df5db74ad 100644 --- a/src/broadcom/vulkan/v3dvx_private.h +++ b/src/broadcom/vulkan/v3dvx_private.h @@ -1,5 +1,5 @@ /* - * Copyright © 2021 Raspberry Pi + * Copyright © 2021 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -55,6 +55,9 @@ void v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer); void +v3dX(cmd_buffer_emit_depth_bounds)(struct v3dv_cmd_buffer *cmd_buffer); + +void v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer); void @@ -75,6 +78,14 @@ v3dX(job_emit_binning_prolog)(struct v3dv_job *job, uint32_t layers); void +v3dX(job_emit_enable_double_buffer)(struct v3dv_job *job); + +void +v3dX(job_patch_resume_address)(struct v3dv_job *first_suspend, + struct v3dv_job *suspend, + struct v3dv_job *resume); + +void v3dX(cmd_buffer_execute_inside_pass)(struct v3dv_cmd_buffer *primary, uint32_t cmd_buffer_count, const VkCommandBuffer *cmd_buffers); @@ -117,31 +128,34 @@ v3dX(cmd_buffer_emit_indexed_indirect)(struct v3dv_cmd_buffer *cmd_buffer, uint32_t stride); void +v3dX(cmd_buffer_suspend)(struct v3dv_cmd_buffer *cmd_buffer); + +struct v3dv_job * +v3dX(cmd_buffer_prepare_suspend_job_for_submit)(struct v3dv_job *job); + +void v3dX(get_hw_clear_color)(const VkClearColorValue *color, uint32_t internal_type, uint32_t internal_size, uint32_t *hw_color); -void -v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer, - int rt, - uint32_t *rt_bpp, - uint32_t *rt_type, - uint32_t *rt_clamp); - /* Used at v3dv_device */ void -v3dX(pack_sampler_state)(struct v3dv_sampler *sampler, +v3dX(pack_sampler_state)(const struct v3dv_device *device, + struct v3dv_sampler *sampler, const VkSamplerCreateInfo *pCreateInfo, const VkSamplerCustomBorderColorCreateInfoEXT *bc_info); void v3dX(framebuffer_compute_internal_bpp_msaa)(const struct v3dv_framebuffer *framebuffer, + const struct v3dv_cmd_buffer_attachment_state *attachments, const struct v3dv_subpass *subpass, - uint8_t *max_bpp, bool *msaa); + uint8_t *max_internal_bpp, + uint8_t *total_color_bpp, + bool *msaa); -#ifdef DEBUG +#if MESA_DEBUG void v3dX(device_check_prepacked_sizes)(void); #endif @@ -161,6 +175,10 @@ v3dX(format_supports_tlb_resolve)(const struct v3dv_format *format); bool v3dX(format_supports_blending)(const struct v3dv_format *format); +/* FIXME: tex_format should be `enum V3DX(Texture_Data_Formats)`, but using + * that enum type in the header requires including v3dx_pack.h, which triggers + * circular include dependency issues, so we're using a `uint32_t` for now.
+ */ bool v3dX(tfu_supports_tex_format)(uint32_t tex_format); @@ -189,14 +207,14 @@ v3dX(meta_emit_copy_image_to_buffer_rcl)(struct v3dv_job *job, struct v3dv_buffer *buffer, struct v3dv_image *image, struct v3dv_meta_framebuffer *framebuffer, - const VkBufferImageCopy2KHR *region); + const VkBufferImageCopy2 *region); void v3dX(meta_emit_resolve_image_rcl)(struct v3dv_job *job, struct v3dv_image *dst, struct v3dv_image *src, struct v3dv_meta_framebuffer *framebuffer, - const VkImageResolve2KHR *region); + const VkImageResolve2 *region); void v3dX(meta_emit_copy_buffer)(struct v3dv_job *job, @@ -223,19 +241,23 @@ v3dX(meta_emit_copy_image_rcl)(struct v3dv_job *job, struct v3dv_image *dst, struct v3dv_image *src, struct v3dv_meta_framebuffer *framebuffer, - const VkImageCopy2KHR *region); + const VkImageCopy2 *region); void v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_image *dst, - uint32_t dst_mip_level, - uint32_t dst_layer, - struct v3dv_image *src, - uint32_t src_mip_level, - uint32_t src_layer, + uint32_t dst_bo_handle, + uint32_t dst_offset, + enum v3d_tiling_mode dst_tiling, + uint32_t dst_padded_height_or_stride, + uint32_t dst_cpp, + uint32_t src_bo_handle, + uint32_t src_offset, + enum v3d_tiling_mode src_tiling, + uint32_t src_padded_height_or_stride, + uint32_t src_cpp, uint32_t width, uint32_t height, - const struct v3dv_format *format); + const struct v3dv_format_plane *format_plane); void v3dX(meta_emit_clear_image_rcl)(struct v3dv_job *job, @@ -259,7 +281,7 @@ v3dX(meta_emit_copy_buffer_to_image_rcl)(struct v3dv_job *job, struct v3dv_image *image, struct v3dv_buffer *buffer, struct v3dv_meta_framebuffer *framebuffer, - const VkBufferImageCopy2KHR *region); + const VkBufferImageCopy2 *region); void v3dX(get_internal_type_bpp_for_image_aspects)(VkFormat vk_format, @@ -273,7 +295,7 @@ v3dX(meta_copy_buffer)(struct v3dv_cmd_buffer *cmd_buffer, uint32_t dst_offset, struct v3dv_bo *src, uint32_t src_offset, - const VkBufferCopy2KHR *region); + const VkBufferCopy2 *region); void v3dX(meta_fill_buffer)(struct v3dv_cmd_buffer *cmd_buffer, @@ -295,20 +317,57 @@ v3dX(pipeline_pack_state)(struct v3dv_pipeline *pipeline, const VkPipelineDepthStencilStateCreateInfo *ds_info, const VkPipelineRasterizationStateCreateInfo *rs_info, const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *pv_info, - const VkPipelineMultisampleStateCreateInfo *ms_info); + const VkPipelineRasterizationLineStateCreateInfoEXT *ls_info, + const VkPipelineMultisampleStateCreateInfo *ms_info, + const struct vk_graphics_pipeline_state *state); void v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline, const VkPipelineVertexInputStateCreateInfo *vi_info, const VkPipelineVertexInputDivisorStateCreateInfoEXT *vd_info); + +bool +v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline); + +struct v3dv_bo * +v3dX(create_default_attribute_values)(struct v3dv_device *device, + struct v3dv_pipeline *pipeline); + /* Used at v3dv_queue */ void v3dX(job_emit_noop)(struct v3dv_job *job); +/* Used at v3dv_query */ +VkResult +v3dX(enumerate_performance_query_counters)(uint32_t *pCounterCount, + VkPerformanceCounterKHR *pCounters, + VkPerformanceCounterDescriptionKHR *pCounterDescriptions); + /* Used at v3dv_descriptor_set, and other descriptor set utils */ uint32_t v3dX(descriptor_bo_size)(VkDescriptorType type); uint32_t v3dX(max_descriptor_bo_size)(void); -uint32_t v3dX(combined_image_sampler_texture_state_offset)(void); +uint32_t 
v3dX(combined_image_sampler_texture_state_offset)(uint8_t plane); + +uint32_t v3dX(combined_image_sampler_sampler_state_offset)(uint8_t plane); + +/* General utils */ + +uint32_t +v3dX(clamp_for_format_and_type)(uint32_t rt_type, + VkFormat vk_format); + +#define V3D42_CLIPPER_XY_GRANULARITY 256.0f +#define V3D71_CLIPPER_XY_GRANULARITY 64.0f + -uint32_t v3dX(combined_image_sampler_sampler_state_offset)(void); +void +v3dX(viewport_compute_xform)(const VkViewport *viewport, + float scale[3], + float translate[3]); + +uint32_t +v3dX(translate_stencil_op)(VkStencilOp op); diff --git a/src/broadcom/vulkan/v3dvx_query.c b/src/broadcom/vulkan/v3dvx_query.c new file mode 100644 index 00000000000..e59a1e84ff6 --- /dev/null +++ b/src/broadcom/vulkan/v3dvx_query.c @@ -0,0 +1,67 @@ +/* + * Copyright © 2023 Raspberry Pi Ltd + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE.
+ */ + +#include "v3dv_private.h" + +#include "common/v3d_performance_counters.h" + +VkResult +v3dX(enumerate_performance_query_counters)(uint32_t *pCounterCount, + VkPerformanceCounterKHR *pCounters, + VkPerformanceCounterDescriptionKHR *pCounterDescriptions) +{ + uint32_t desc_count = *pCounterCount; + + VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR, + out, pCounters, pCounterCount); + VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR, + out_desc, pCounterDescriptions, &desc_count); + + for (int i = 0; i < ARRAY_SIZE(v3d_performance_counters); i++) { + vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) { + counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR; + counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR; + counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR; + + unsigned char sha1_result[20]; + _mesa_sha1_compute(v3d_performance_counters[i][V3D_PERFCNT_NAME], + strlen(v3d_performance_counters[i][V3D_PERFCNT_NAME]), + sha1_result); + + memcpy(counter->uuid, sha1_result, sizeof(counter->uuid)); + } + + vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR, + &out_desc, desc) { + desc->flags = 0; + snprintf(desc->name, sizeof(desc->name), "%s", + v3d_performance_counters[i][V3D_PERFCNT_NAME]); + snprintf(desc->category, sizeof(desc->category), "%s", + v3d_performance_counters[i][V3D_PERFCNT_CATEGORY]); + snprintf(desc->description, sizeof(desc->description), "%s", + v3d_performance_counters[i][V3D_PERFCNT_DESCRIPTION]); + } + } + + return vk_outarray_status(&out); +} diff --git a/src/broadcom/vulkan/v3dvx_queue.c b/src/broadcom/vulkan/v3dvx_queue.c index 38f9efbfa5d..6eed2de9d54 100644 --- a/src/broadcom/vulkan/v3dvx_queue.c +++ b/src/broadcom/vulkan/v3dvx_queue.c @@ -1,5 +1,5 @@ /* - * Copyright © 2021 Raspberry Pi + * Copyright © 2021 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -29,7 +29,8 @@ void v3dX(job_emit_noop)(struct v3dv_job *job) { - v3dv_job_start_frame(job, 1, 1, 1, true, 1, V3D_INTERNAL_BPP_32, false); + v3dv_job_start_frame(job, 1, 1, 1, true, true, 1, + V3D_INTERNAL_BPP_32, 4, false); v3dX(job_emit_binning_flush)(job); struct v3dv_cl *rcl = &job->rcl; @@ -42,14 +43,29 @@ v3dX(job_emit_noop)(struct v3dv_job *job) config.image_height_pixels = 1; config.number_of_render_targets = 1; config.multisample_mode_4x = false; +#if V3D_VERSION == 42 config.maximum_bpp_of_all_render_targets = V3D_INTERNAL_BPP_32; +#endif +#if V3D_VERSION >= 71 + config.log2_tile_width = 3; /* Tile size 64 */ + config.log2_tile_height = 3; /* Tile size 64 */ +#endif } +#if V3D_VERSION == 42 cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { rt.render_target_0_internal_bpp = V3D_INTERNAL_BPP_32; rt.render_target_0_internal_type = V3D_INTERNAL_TYPE_8; rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE; } +#endif +#if V3D_VERSION >= 71 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { + rt.internal_bpp = V3D_INTERNAL_BPP_32; + rt.internal_type_and_clamping = V3D_RENDER_TARGET_TYPE_CLAMP_8; + rt.stride = 1; /* Unused RT */ + } +#endif cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) { clear.z_clear_value = 1.0f; diff --git a/src/broadcom/vulkan/vk_format_info.h b/src/broadcom/vulkan/vk_format_info.h deleted file mode 100644 index da85cb5b5dd..00000000000 --- a/src/broadcom/vulkan/vk_format_info.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright © 2016 Intel Corporation - * - * Permission is 
hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#ifndef VK_FORMAT_INFO_H -#define VK_FORMAT_INFO_H - -#include <stdbool.h> -#include <vulkan/vulkan.h> - -#include "util/format/u_format.h" -#include "vulkan/util/vk_format.h" - -/* FIXME: from freedreno vk_format.h, common place?*/ -static inline bool -vk_format_is_int(VkFormat format) -{ - return util_format_is_pure_integer(vk_format_to_pipe_format(format)); -} - -static inline bool -vk_format_is_sint(VkFormat format) -{ - return util_format_is_pure_sint(vk_format_to_pipe_format(format)); -} - -static inline bool -vk_format_is_uint(VkFormat format) -{ - return util_format_is_pure_uint(vk_format_to_pipe_format(format)); -} - -static inline bool -vk_format_is_unorm(VkFormat format) -{ - return util_format_is_unorm(vk_format_to_pipe_format(format)); -} - -static inline bool -vk_format_is_snorm(VkFormat format) -{ - return util_format_is_snorm(vk_format_to_pipe_format(format)); -} - -static inline bool -vk_format_is_float(VkFormat format) -{ - return util_format_is_float(vk_format_to_pipe_format(format)); -} - -static inline bool -vk_format_is_srgb(VkFormat format) -{ - return util_format_is_srgb(vk_format_to_pipe_format(format)); -} - -static inline unsigned -vk_format_get_blocksize(VkFormat format) -{ - return util_format_get_blocksize(vk_format_to_pipe_format(format)); -} - -static inline unsigned -vk_format_get_blockwidth(VkFormat format) -{ - return util_format_get_blockwidth(vk_format_to_pipe_format(format)); -} - -static inline unsigned -vk_format_get_blockheight(VkFormat format) -{ - return util_format_get_blockheight(vk_format_to_pipe_format(format)); -} - -static inline bool -vk_format_is_compressed(VkFormat format) -{ - return util_format_is_compressed(vk_format_to_pipe_format(format)); -} - -static inline const struct util_format_description * -vk_format_description(VkFormat format) -{ - return util_format_description(vk_format_to_pipe_format(format)); -} - -#endif /* VK_FORMAT_INFO_H */
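A few illustrative sketches for the changes above. First, the per-plane plumbing in emit_image_load()/emit_image_store() relies on v3dv_plane_from_aspect(), whose body is not part of this diff. This is a minimal sketch of the mapping such a helper has to implement; the function name is hypothetical and the upstream version may cover additional aspect bits:

#include <stdint.h>
#include <vulkan/vulkan.h>

/* Hypothetical sketch, not the upstream implementation: map a Vulkan
 * aspect mask to a plane index. Color, depth and stencil aspects all
 * live in plane 0; only multi-planar aspects select other planes. */
static uint8_t
sketch_plane_from_aspect(VkImageAspectFlags aspect)
{
   switch (aspect) {
   case VK_IMAGE_ASPECT_PLANE_1_BIT:
      return 1;
   case VK_IMAGE_ASPECT_PLANE_2_BIT:
      return 2;
   default:
      return 0;
   }
}

This is consistent with the assert above that the meta framebuffer format is always single-plane: each plane of a multi-planar image is copied through RENDER_TARGET_0 one at a time.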
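The reworked v3dX(meta_emit_tfu_job)() takes one padded_height_or_stride parameter per side and interprets it by tiling mode, as the iis/ioc packing above shows: for the UIF tilings it is a padded height in pixels converted to UIF blocks (a UIF block is two utiles tall), while for raster it is a byte stride converted to pixels. Here is that derivation factored into a single helper, as a sketch; it assumes enum v3d_tiling_mode and v3d_utile_height() from the common Broadcom tiling code, and the helper name is ours:

#include <stdint.h>
#include "broadcom/common/v3d_tiling.h"

/* Sketch: stride field value for the TFU iis/ioc registers. */
static uint32_t
sketch_tfu_stride_field(enum v3d_tiling_mode tiling,
                        uint32_t padded_height_or_stride,
                        uint32_t cpp)
{
   switch (tiling) {
   case V3D_TILING_UIF_NO_XOR:
   case V3D_TILING_UIF_XOR:
      /* Padded height in pixels, expressed in UIF blocks. */
      return padded_height_or_stride / (2 * v3d_utile_height(cpp));
   case V3D_TILING_RASTER:
      /* Byte stride, expressed in pixels. */
      return padded_height_or_stride / cpp;
   default:
      return 0; /* LT tilings carry no explicit stride. */
   }
}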
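framebuffer_size_for_pixel_count(), of which the diff only shows the switch to V3D_MAX_IMAGE_DIMENSION, picks a width/height pair whose product covers the requested pixel count without exceeding the maximum dimension on either axis. One plausible shape of that computation, self-contained for illustration (the 4096 constant stands in for V3D_MAX_IMAGE_DIMENSION and the helper name is ours; the driver's actual heuristic may differ):

#include <assert.h>
#include <stdint.h>

#define SKETCH_MAX_DIM 4096u
#define SKETCH_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Sketch: choose w * h >= num_pixels with w, h <= SKETCH_MAX_DIM. Per
 * the comment in the diff, the caller is responsible for splitting the
 * work when num_pixels exceeds the maximum "renderable" area. */
static void
sketch_fb_size_for_pixel_count(uint32_t num_pixels,
                               uint32_t *width, uint32_t *height)
{
   assert(num_pixels > 0);
   assert(num_pixels <= SKETCH_MAX_DIM * SKETCH_MAX_DIM);

   const uint32_t w = num_pixels < SKETCH_MAX_DIM ? num_pixels
                                                  : SKETCH_MAX_DIM;
   *width = w;
   *height = SKETCH_DIV_ROUND_UP(num_pixels, w);
}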
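Finally, the V3D 7.1 depth clamp/clip handling added to pack_cfg_bits() implements the spec rule quoted in the diff: an explicit VkPipelineRasterizationDepthClipStateCreateInfoEXT wins, and otherwise depth clipping is enabled exactly when depth clamping is not. The decision condensed into a standalone predicate (the function name is ours; the types are core and EXT Vulkan):

#include <stdbool.h>
#include <vulkan/vulkan.h>

/* Sketch of the depth-clip resolution used above. */
static bool
sketch_depth_clip_enabled(bool depth_clamp_enable,
                          const VkPipelineRasterizationDepthClipStateCreateInfoEXT *clip_info)
{
   if (clip_info)
      return clip_info->depthClipEnable; /* extension overrides */
   return !depth_clamp_enable;           /* spec default */
}

The z_clipping_mode then follows from pipeline->negative_one_to_one when clipping is enabled, and is V3D_Z_CLIP_MODE_NONE otherwise, as in the diff.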