Diffstat (limited to 'src/broadcom')
-rw-r--r--  src/broadcom/ci/broadcom-rpi3-fails.txt (renamed from src/broadcom/ci/piglit-vc4-rpi3-fails.txt)  | 456
-rw-r--r--  src/broadcom/ci/broadcom-rpi3-flakes.txt  | 52
-rw-r--r--  src/broadcom/ci/broadcom-rpi3-skips.txt (renamed from src/broadcom/ci/deqp-vc4-rpi3-skips.txt)  | 25
-rw-r--r--  src/broadcom/ci/broadcom-rpi4-fails.txt  | 602
-rw-r--r--  src/broadcom/ci/broadcom-rpi4-flakes.txt  | 48
-rw-r--r--  src/broadcom/ci/broadcom-rpi4-skips.txt  | 293
-rw-r--r--  src/broadcom/ci/broadcom-rpi5-fails.txt  | 11
-rw-r--r--  src/broadcom/ci/broadcom-rpi5-flakes.txt  | 15
-rw-r--r--  src/broadcom/ci/broadcom-rpi5-skips.txt  | 96
-rw-r--r--  src/broadcom/ci/deqp-broadcom-rpi3-piglit-full.toml  | 6
-rw-r--r--  src/broadcom/ci/deqp-broadcom-rpi3.toml  | 61
-rw-r--r--  src/broadcom/ci/deqp-broadcom-rpi4.toml  | 89
-rw-r--r--  src/broadcom/ci/deqp-v3d-rpi4-fails.txt  | 4
-rw-r--r--  src/broadcom/ci/deqp-v3d-rpi4-flakes.txt  | 3
-rw-r--r--  src/broadcom/ci/deqp-v3d-rpi4-gles.toml  | 47
-rw-r--r--  src/broadcom/ci/deqp-v3dv-rpi4-fails.txt  | 32
-rw-r--r--  src/broadcom/ci/deqp-v3dv-rpi4-flakes.txt  | 5
-rw-r--r--  src/broadcom/ci/deqp-v3dv-rpi4-skips.txt  | 21
-rw-r--r--  src/broadcom/ci/deqp-vc4-rpi3-fails.txt  | 420
-rw-r--r--  src/broadcom/ci/deqp-vc4-rpi3-flakes.txt  | 30
-rw-r--r--  src/broadcom/ci/deqp-vc4-rpi3-gles.toml  | 23
-rw-r--r--  src/broadcom/ci/gitlab-ci-inc.yml  | 156
-rw-r--r--  src/broadcom/ci/gitlab-ci.yml  | 190
-rw-r--r--  src/broadcom/ci/piglit-v3d-rpi4-fails.txt  | 337
-rw-r--r--  src/broadcom/ci/piglit-v3d-rpi4-flakes.txt  | 7
-rw-r--r--  src/broadcom/ci/piglit-v3d-rpi4-skips.txt  | 20
-rw-r--r--  src/broadcom/ci/piglit-vc4-rpi3-flakes.txt  | 8
-rw-r--r--  src/broadcom/ci/piglit-vc4-rpi3-skips.txt  | 19
-rw-r--r--  src/broadcom/ci/traces-broadcom.yml  | 205
-rw-r--r--  src/broadcom/cle/gen_pack_header.py  | 37
-rw-r--r--  src/broadcom/cle/meson.build  | 26
-rw-r--r--  src/broadcom/cle/v3d_decoder.c  | 45
-rw-r--r--  src/broadcom/cle/v3d_packet.xml (renamed from src/broadcom/cle/v3d_packet_v33.xml)  | 825
-rw-r--r--  src/broadcom/cle/v3d_packet_helpers.h  | 115
-rw-r--r--  src/broadcom/cle/v3dx_pack.h  | 6
-rw-r--r--  src/broadcom/cle/vc4_packet.xml (renamed from src/broadcom/cle/v3d_packet_v21.xml)  | 0
-rw-r--r--  src/broadcom/clif/clif_dump.c  | 32
-rw-r--r--  src/broadcom/clif/clif_private.h  | 7
-rw-r--r--  src/broadcom/clif/v3dx_dump.c  | 19
-rw-r--r--  src/broadcom/common/v3d_cpu_tiling.h  | 8
-rw-r--r--  src/broadcom/common/v3d_csd.h (renamed from src/broadcom/vulkan/v3dv_util.c)  | 60
-rw-r--r--  src/broadcom/common/v3d_debug.c  | 46
-rw-r--r--  src/broadcom/common/v3d_debug.h  | 16
-rw-r--r--  src/broadcom/common/v3d_device_info.c  | 19
-rw-r--r--  src/broadcom/common/v3d_device_info.h  | 6
-rw-r--r--  src/broadcom/common/v3d_limits.h  | 45
-rw-r--r--  src/broadcom/common/v3d_macros.h  | 9
-rw-r--r--  src/broadcom/common/v3d_performance_counters.h  | 229
-rw-r--r--  src/broadcom/common/v3d_tfu.h  | 74
-rw-r--r--  src/broadcom/common/v3d_tiling.c  | 1
-rw-r--r--  src/broadcom/common/v3d_tiling.h  | 4
-rw-r--r--  src/broadcom/common/v3d_util.c  | 186
-rw-r--r--  src/broadcom/common/v3d_util.h  | 47
-rw-r--r--  src/broadcom/compiler/meson.build  | 13
-rw-r--r--  src/broadcom/compiler/nir_to_vir.c  | 1994
-rw-r--r--  src/broadcom/compiler/qpu_schedule.c  | 1158
-rw-r--r--  src/broadcom/compiler/qpu_validate.c  | 102
-rw-r--r--  src/broadcom/compiler/v3d33_tex.c  | 195
-rw-r--r--  src/broadcom/compiler/v3d33_vpm_setup.c  | 75
-rw-r--r--  src/broadcom/compiler/v3d_compiler.h  | 309
-rw-r--r--  src/broadcom/compiler/v3d_nir_lower_image_load_store.c  | 352
-rw-r--r--  src/broadcom/compiler/v3d_nir_lower_io.c  | 221
-rw-r--r--  src/broadcom/compiler/v3d_nir_lower_line_smooth.c  | 84
-rw-r--r--  src/broadcom/compiler/v3d_nir_lower_load_store_bitsize.c  | 260
-rw-r--r--  src/broadcom/compiler/v3d_nir_lower_logic_ops.c  | 153
-rw-r--r--  src/broadcom/compiler/v3d_nir_lower_robust_buffer_access.c  | 167
-rw-r--r--  src/broadcom/compiler/v3d_nir_lower_scratch.c  | 83
-rw-r--r--  src/broadcom/compiler/v3d_nir_lower_txf_ms.c  | 33
-rw-r--r--  src/broadcom/compiler/v3d_packing.c  | 50
-rw-r--r--  src/broadcom/compiler/v3d_tex.c (renamed from src/broadcom/compiler/v3d40_tex.c)  | 202
-rw-r--r--  src/broadcom/compiler/vir.c  | 745
-rw-r--r--  src/broadcom/compiler/vir_dump.c  | 16
-rw-r--r--  src/broadcom/compiler/vir_live_variables.c  | 23
-rw-r--r--  src/broadcom/compiler/vir_opt_constant_alu.c  | 3
-rw-r--r--  src/broadcom/compiler/vir_opt_copy_propagate.c  | 97
-rw-r--r--  src/broadcom/compiler/vir_opt_dead_code.c  | 25
-rw-r--r--  src/broadcom/compiler/vir_opt_redundant_flags.c  | 9
-rw-r--r--  src/broadcom/compiler/vir_opt_small_immediates.c  | 26
-rw-r--r--  src/broadcom/compiler/vir_register_allocate.c  | 1349
-rw-r--r--  src/broadcom/compiler/vir_to_qpu.c  | 202
-rw-r--r--  src/broadcom/drm-shim/README.md  | 9
-rw-r--r--  src/broadcom/drm-shim/meson.build  | 44
-rw-r--r--  src/broadcom/drm-shim/v3d.c  | 101
-rw-r--r--  src/broadcom/drm-shim/v3d.h  | 70
-rw-r--r--  src/broadcom/drm-shim/v3d_noop.c  | 9
-rw-r--r--  src/broadcom/drm-shim/v3dx.c  | 370
-rw-r--r--  src/broadcom/drm-shim/vc4_noop.c  | 15
-rw-r--r--  src/broadcom/meson.build  | 12
-rw-r--r--  src/broadcom/qpu/meson.build  | 8
-rw-r--r--  src/broadcom/qpu/qpu_disasm.c  | 89
-rw-r--r--  src/broadcom/qpu/qpu_instr.c  | 197
-rw-r--r--  src/broadcom/qpu/qpu_instr.h  | 96
-rw-r--r--  src/broadcom/qpu/qpu_pack.c  | 1451
-rw-r--r--  src/broadcom/qpu/tests/qpu_disasm.c  | 90
-rw-r--r--  src/broadcom/simulator/meson.build  | 14
-rw-r--r--  src/broadcom/simulator/v3d_simulator.c  | 568
-rw-r--r--  src/broadcom/simulator/v3d_simulator.h  | 24
-rw-r--r--  src/broadcom/simulator/v3d_simulator_wrapper.cpp  | 28
-rw-r--r--  src/broadcom/simulator/v3d_simulator_wrapper.h  | 7
-rw-r--r--  src/broadcom/simulator/v3dx_simulator.c  | 161
-rw-r--r--  src/broadcom/simulator/v3dx_simulator.h  | 3
-rw-r--r--  src/broadcom/vulkan/meson.build  | 69
-rw-r--r--  src/broadcom/vulkan/v3dv_android.c  | 544
-rw-r--r--  src/broadcom/vulkan/v3dv_bo.c  | 100
-rw-r--r--  src/broadcom/vulkan/v3dv_bo.h  | 13
-rw-r--r--  src/broadcom/vulkan/v3dv_cl.c  | 31
-rw-r--r--  src/broadcom/vulkan/v3dv_cl.h  | 26
-rw-r--r--  src/broadcom/vulkan/v3dv_cmd_buffer.c  | 3312
-rw-r--r--  src/broadcom/vulkan/v3dv_debug.c  | 2
-rw-r--r--  src/broadcom/vulkan/v3dv_debug.h  | 2
-rw-r--r--  src/broadcom/vulkan/v3dv_descriptor_set.c  | 600
-rw-r--r--  src/broadcom/vulkan/v3dv_device.c  | 2587
-rw-r--r--  src/broadcom/vulkan/v3dv_event.c  | 712
-rw-r--r--  src/broadcom/vulkan/v3dv_formats.c  | 445
-rw-r--r--  src/broadcom/vulkan/v3dv_image.c  | 634
-rw-r--r--  src/broadcom/vulkan/v3dv_limits.h  | 9
-rw-r--r--  src/broadcom/vulkan/v3dv_meta_clear.c  | 142
-rw-r--r--  src/broadcom/vulkan/v3dv_meta_common.h  | 26
-rw-r--r--  src/broadcom/vulkan/v3dv_meta_copy.c  | 1692
-rw-r--r--  src/broadcom/vulkan/v3dv_pass.c  | 434
-rw-r--r--  src/broadcom/vulkan/v3dv_pipeline.c  | 2425
-rw-r--r--  src/broadcom/vulkan/v3dv_pipeline_cache.c  | 110
-rw-r--r--  src/broadcom/vulkan/v3dv_private.h  | 1459
-rw-r--r--  src/broadcom/vulkan/v3dv_query.c  | 1625
-rw-r--r--  src/broadcom/vulkan/v3dv_queue.c  | 2086
-rw-r--r--  src/broadcom/vulkan/v3dv_uniforms.c  | 232
-rw-r--r--  src/broadcom/vulkan/v3dv_wsi.c  | 352
-rw-r--r--  src/broadcom/vulkan/v3dv_wsi_display.c  | 195
-rw-r--r--  src/broadcom/vulkan/v3dv_wsi_wayland.c  | 57
-rw-r--r--  src/broadcom/vulkan/v3dv_wsi_x11.c  | 103
-rw-r--r--  src/broadcom/vulkan/v3dvx_cmd_buffer.c  | 1435
-rw-r--r--  src/broadcom/vulkan/v3dvx_descriptor_set.c  | 14
-rw-r--r--  src/broadcom/vulkan/v3dvx_device.c  | 113
-rw-r--r--  src/broadcom/vulkan/v3dvx_formats.c  | 125
-rw-r--r--  src/broadcom/vulkan/v3dvx_image.c  | 249
-rw-r--r--  src/broadcom/vulkan/v3dvx_meta_common.c  | 322
-rw-r--r--  src/broadcom/vulkan/v3dvx_pipeline.c  | 301
-rw-r--r--  src/broadcom/vulkan/v3dvx_private.h  | 111
-rw-r--r--  src/broadcom/vulkan/v3dvx_query.c  | 67
-rw-r--r--  src/broadcom/vulkan/v3dvx_queue.c  | 20
-rw-r--r--  src/broadcom/vulkan/vk_format_info.h  | 106
141 files changed, 25515 insertions, 13827 deletions
diff --git a/src/broadcom/ci/piglit-vc4-rpi3-fails.txt b/src/broadcom/ci/broadcom-rpi3-fails.txt
index cb9dfaa6eb6..fdcf09f1fef 100644
--- a/src/broadcom/ci/piglit-vc4-rpi3-fails.txt
+++ b/src/broadcom/ci/broadcom-rpi3-fails.txt
@@ -1,35 +1,116 @@
-glx@glx-copy-sub-buffer samples=2,Crash
-glx@glx-copy-sub-buffer samples=4,Crash
-glx@glx-make-current,Crash
-glx@glx-multithread-buffer,Fail
-glx@glx-query-drawable-glx_fbconfig_id-window,Fail
+# Test expects red instead of luminance, contra OES_depth_texture spec.
+# https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/3815
+KHR-GLES2.core.internalformat.texture2d.depth_component_unsigned_int_depth_component16,Fail
+KHR-GLES2.core.internalformat.texture2d.depth_component_unsigned_int_depth_component24,Fail
+KHR-GLES2.core.internalformat.texture2d.depth_component_unsigned_short_depth_component16,Fail
+
+# Creating OpenGL ES 3 context
+# Fail, context: 0x00000000, error: EGL_BAD_MATCH
+# https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/3816
+x11-dEQP-EGL.functional.create_context.no_config,Fail
+wayland-dEQP-EGL.functional.create_context.no_config,Fail
+
+# Wide lines outside the viewport are incorrectly clipped out when ES wants them
+# rendered as a quad and clipped appropriately. I think that by expanding
+# CLIPPER_XY_SCALING to have a guard band we might get these to work.
+dEQP-GLES2.functional.clipping.line.wide_line_clip_viewport_center,Fail
+dEQP-GLES2.functional.clipping.line.wide_line_clip_viewport_corner,Fail
+
+dEQP-GLES2.functional.depth_stencil_clear.depth_stencil_masked,Fail
+
+dEQP-GLES2.functional.uniform_api.random.3,Fail
+dEQP-GLES2.functional.uniform_api.random.79,Fail
+
+# Sampling grid slightly off in test 2?
+dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_linear_linear_mirror_rgba8888,Fail
+dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_linear_linear_repeat_rgba8888,Fail
+dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_nearest_linear_mirror_rgba8888,Fail
+dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_nearest_linear_repeat_rgba8888,Fail
+
+# " Warning: High precision not supported in fragment shaders.
+# ERROR: Image verification failed, found 2048 invalid pixels!"
+# One of the magnified pixels is (0xff, 0x29, 0xd6) instead of (0xff, 0x2d, 0xd2).
+# We do support highp, so we should fix glGetShaderPrecisionFormat reporting.
+dEQP-GLES2.functional.texture.mipmap.2d.basic.linear_linear_repeat_non_square,Fail
+dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_clamp_non_square,Fail
+dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_mirror_non_square,Fail
+dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_repeat_non_square,Fail
+
+# One of the pixels on the left edge near the bottom is wrong for both min and
+# mag. Also a line of pixels through the image in minification.
+dEQP-GLES2.functional.texture.wrap.clamp_clamp_nearest_npot_etc1,Fail
+
+# Despite exposing GL 2.1, the HW doesn't actually support 3D textures, so we set
+# the max levels to 0. These tests fail (or hit assertion failures) as a result.
+spec@!opengl 1.1@max-texture-size,Crash
+spec@!opengl 1.2@copyteximage 3d,Fail
+spec@!opengl 1.2@getteximage-targets 3d,Fail
+spec@!opengl 1.2@tex3d-maxsize,Fail
+spec@!opengl 1.2@tex3d,Fail
+spec@!opengl 1.2@texture-packed-formats,Fail
+spec@!opengl 1.2@texwrap 3d bordercolor,Fail
+spec@!opengl 1.2@texwrap 3d proj bordercolor,Fail
+spec@!opengl 1.2@texwrap 3d proj,Fail
+spec@!opengl 1.2@texwrap 3d proj@GL_RGBA8- NPOT- projected,Fail
+spec@!opengl 1.2@texwrap 3d proj@GL_RGBA8- projected,Fail
+spec@!opengl 1.2@texwrap 3d proj@GL_RGBA8- swizzled- projected,Fail
+spec@!opengl 1.2@texwrap 3d,Fail
+spec@!opengl 1.2@texwrap 3d@GL_RGBA8- NPOT,Fail
+spec@!opengl 1.2@texwrap 3d@GL_RGBA8- swizzled,Fail
+spec@!opengl 1.2@texwrap 3d@GL_RGBA8,Fail
+spec@!opengl 1.3@tex3d-depth1,Fail
+spec@!opengl 2.0@tex3d-npot,Fail
+spec@!opengl 2.1@minmax,Fail
+spec@arb_framebuffer_object@fbo-generatemipmap-3d,Fail
+spec@arb_framebuffer_object@fbo-incomplete,Fail
+spec@arb_framebuffer_object@fbo-incomplete@invalid slice of 3D texture,Fail
+spec@arb_get_texture_sub_image@arb_get_texture_sub_image-get,Fail
+spec@arb_robustness@arb_robustness_client-mem-bounds,Fail
+spec@arb_texture_multisample@arb_texture_multisample-teximage-3d-multisample,Fail
+spec@arb_texture_storage@texture-storage,Crash
+spec@arb_texture_storage@texture-storage@3D mipmapped ,Fail
+spec@arb_texture_storage@texture-storage@3D mipmapped (EXT_dsa),Fail
+spec@arb_texture_storage@texture-storage@3D non-mipmapped ,Fail
+spec@arb_texture_storage@texture-storage@3D non-mipmapped (EXT_dsa),Fail
+spec@ext_direct_state_access@multi-texture@MultiTexImage3DEXT,Fail
+spec@ext_direct_state_access@textures,Crash
+spec@ext_direct_state_access@textures@CopyTextureSubImage3DEXT + display list GL_COMPILE_AND_EXECUTE,Fail
+spec@ext_direct_state_access@textures@CopyTextureSubImage3DEXT + display list GL_COMPILE,Fail
+spec@ext_direct_state_access@textures@CopyTextureSubImage3DEXT,Fail
+spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_3D + glTex* + display list GL_COMPILE_AND_EXECUTE,Fail
+spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_3D + glTex* + display list GL_COMPILE,Fail
+spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_3D + glTex*,Fail
+spec@ext_direct_state_access@textures@TextureImage3DEXT + display list GL_COMPILE_AND_EXECUTE,Fail
+spec@ext_direct_state_access@textures@TextureImage3DEXT + display list GL_COMPILE,Fail
+spec@ext_direct_state_access@textures@TextureImage3DEXT,Fail
+spec@ext_direct_state_access@textures@TextureSubImage3DEXT + display list GL_COMPILE_AND_EXECUTE,Fail
+spec@ext_direct_state_access@textures@TextureSubImage3DEXT + display list GL_COMPILE,Fail
+spec@ext_direct_state_access@textures@TextureSubImage3DEXT,Fail
+spec@ext_framebuffer_object@fbo-3d,Fail
+spec@glsl-1.10@execution@texture3d-computed-coord,Fail
+spec@glsl-1.10@execution@texture3d,Fail
+spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture() 3d,Fail
+spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture(bias) 3d,Fail
+spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj 3d,Fail
+spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj(bias) 3d,Fail
+spec@khr_texture_compression_astc@basic-gl,Fail
+
+glx@glx-make-current,Fail
glx@glx-swap-pixmap-bad,Fail
-glx@glx-visuals-depth -pixmap,Crash
-glx@glx-visuals-depth,Crash
-glx@glx-visuals-stencil -pixmap,Crash
-glx@glx-visuals-stencil,Crash
glx@glx_arb_create_context_es2_profile@invalid opengl es version,Fail
glx@glx_arb_create_context_no_error@no error,Fail
-glx@glx_ext_import_context@free context,Fail
-glx@glx_ext_import_context@get context id,Fail
-glx@glx_ext_import_context@get current display,Fail
-glx@glx_ext_import_context@import context- multi process,Fail
-glx@glx_ext_import_context@import context- single process,Fail
-glx@glx_ext_import_context@imported context has same context id,Fail
-glx@glx_ext_import_context@make current- multi process,Fail
-glx@glx_ext_import_context@make current- single process,Fail
-glx@glx_ext_import_context@query context info,Fail
+
+# piglit: error: Test timed out.
+glx@glx_arb_sync_control@waitformsc,Fail
+
+glslparsertest@glsl2@gst-gl-text-download-i420-yv12.frag,Fail
shaders@glsl-arb-fragment-coord-conventions,Fail
shaders@glsl-bug-110796,Fail
shaders@glsl-max-vertex-attrib,Fail
-shaders@glsl-predication-on-large-array,Fail
-spec@!opengl 1.0@gl-1.0-bitmap-heart-dance,Fail
-spec@!opengl 1.0@gl-1.0-dlist-bitmap,Crash
spec@!opengl 1.0@gl-1.0-drawbuffer-modes,Fail
spec@!opengl 1.0@gl-1.0-edgeflag,Fail
spec@!opengl 1.0@gl-1.0-edgeflag-const,Fail
spec@!opengl 1.0@gl-1.0-edgeflag-quads,Fail
-spec@!opengl 1.0@gl-1.0-logicop,Crash
spec@!opengl 1.0@gl-1.0-no-op-paths,Fail
spec@!opengl 1.0@gl-1.0-scissor-offscreen,Fail
spec@!opengl 1.0@gl-1.0-user-clip-all-planes,Fail
@@ -682,33 +763,53 @@ spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)-
spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail
spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail
spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail
-spec@!opengl 1.1@depthstencil-default_fb-blit samples=2,Crash
-spec@!opengl 1.1@depthstencil-default_fb-blit samples=4,Crash
-spec@!opengl 1.1@depthstencil-default_fb-clear samples=2,Crash
-spec@!opengl 1.1@depthstencil-default_fb-clear samples=4,Crash
+spec@arb_clear_texture@arb_clear_texture-3d,Fail
+spec@arb_clear_texture@arb_clear_texture-sized-formats,Fail
+spec@arb_clear_texture@arb_clear_texture-supported-formats,Fail
+spec@glsl-1.10@execution@glsl-fs-inline-explosion,Crash
+spec@glsl-1.10@execution@glsl-vs-inline-explosion,Crash
+spec@glsl-1.20@compiler@invalid-vec4-array-to-vec3-array-conversion.vert,Fail
+
+# fails on arm64, passes on armhf
+spec@arb_depth_buffer_float@depthstencil-render-miplevels 1024 s=z24_s8_d=z32f,Fail
+
+# Crashes in this group are CMA allocation failures.
+spec@!opengl 1.1@depthstencil-default_fb-clear samples=2,Fail
+spec@!opengl 1.1@depthstencil-default_fb-clear samples=4,Fail
spec@!opengl 1.1@depthstencil-default_fb-clear,Fail
-spec@!opengl 1.1@depthstencil-default_fb-copypixels samples=2,Crash
-spec@!opengl 1.1@depthstencil-default_fb-copypixels samples=4,Crash
-spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=2,Crash
-spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=4,Crash
+spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=2,Fail
+spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=4,Fail
spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=2,Crash
spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=4,Crash
-spec@!opengl 1.1@depthstencil-default_fb-readpixels-24_8 samples=2,Crash
-spec@!opengl 1.1@depthstencil-default_fb-readpixels-24_8 samples=4,Crash
-spec@!opengl 1.1@depthstencil-default_fb-readpixels-float-and-ushort samples=2,Crash
-spec@!opengl 1.1@depthstencil-default_fb-readpixels-float-and-ushort samples=4,Crash
-spec@!opengl 1.1@draw-pixels,Fail
-spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 varray gl_line_loop,Fail
+
+# Draws with these unsupported primitives are converted by Mesa into
+# indexed draws with supported primitives. But those indexed draws
+# require 4-byte indices due to the number of vertices to draw, while our
+# hardware supports 2-byte indices at most.
spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 varray gl_polygon,Crash
spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 varray gl_quad_strip,Crash
spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 varray gl_quads,Crash
-spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 varray gl_triangle_fan,Fail
-spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 vbo gl_line_loop,Fail
spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 vbo gl_polygon,Crash
spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 vbo gl_quad_strip,Crash
spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 vbo gl_quads,Crash
+
+# GFXH-515 / SW-5891: the binner uses a 16-bit index for drawarrays, so the
+# draw is split into multiple calls. This is not supported for triangle fans
+# or line loops because the 1st vertex must always be included, which
+# would require creating a new vertex buffer containing the remaining
+# vertices plus the 1st one.
+spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 varray gl_line_loop,Fail
+spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 varray gl_triangle_fan,Fail
+spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 vbo gl_line_loop,Fail
spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 vbo gl_triangle_fan,Fail
+
+spec@!opengl 1.1@draw-pixels,Fail
spec@!opengl 1.1@line-flat-clip-color,Fail
+
+# The hardware does not support line/polygon stipple. In fact, this feature
+# was deprecated/removed in newer OpenGL spec versions. It could be
+# emulated using shaders (a sketch follows this file's diff).
+spec@!opengl 1.1@line-smooth-stipple,Fail
spec@!opengl 1.1@linestipple,Fail
spec@!opengl 1.1@linestipple@Baseline,Fail
spec@!opengl 1.1@linestipple@Factor 2x,Fail
@@ -716,6 +817,10 @@ spec@!opengl 1.1@linestipple@Factor 3x,Fail
spec@!opengl 1.1@linestipple@Line loop,Fail
spec@!opengl 1.1@linestipple@Line strip,Fail
spec@!opengl 1.1@linestipple@Restarting lines within a single Begin-End block,Fail
+spec@!opengl 2.1@pbo,Fail
+spec@!opengl 2.1@pbo@test_polygon_stip,Fail
+spec@!opengl 2.1@polygon-stipple-fs,Fail
+
spec@!opengl 1.1@polygon-mode,Fail
spec@!opengl 1.1@polygon-mode-offset,Fail
spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on bottom edge,Fail
@@ -742,11 +847,6 @@ spec@!opengl 1.1@polygon-mode-offset@config 5: Expected white pixel on top edge,
spec@!opengl 1.1@polygon-mode-offset@config 6: Expected blue pixel in center,Fail
spec@!opengl 1.1@polygon-mode-offset@config 6: Expected white pixel on right edge,Fail
spec@!opengl 1.1@polygon-mode-offset@config 6: Expected white pixel on top edge,Fail
-spec@!opengl 1.1@read-front clear-front-first samples=2,Crash
-spec@!opengl 1.1@read-front clear-front-first samples=4,Crash
-spec@!opengl 1.1@read-front samples=2,Crash
-spec@!opengl 1.1@read-front samples=4,Crash
-spec@!opengl 1.1@tex-upside-down-miptree,Fail
spec@!opengl 1.1@texsubimage-unpack,Fail
spec@!opengl 1.1@texwrap 2d proj,Fail
spec@!opengl 1.1@texwrap 2d proj@GL_RGBA8- NPOT- projected,Fail
@@ -787,25 +887,10 @@ spec@!opengl 1.1@texwrap formats@GL_RGBA16- swizzled,Fail
spec@!opengl 1.1@texwrap formats@GL_RGBA8,Fail
spec@!opengl 1.1@texwrap formats@GL_RGBA8- NPOT,Fail
spec@!opengl 1.1@texwrap formats@GL_RGBA8- swizzled,Fail
-spec@!opengl 1.1@windowoverlap,Fail
-spec@!opengl 1.2@copyteximage 3d,Fail
-spec@!opengl 1.2@getteximage-targets 3d,Fail
spec@!opengl 1.2@lodclamp,Fail
spec@!opengl 1.2@lodclamp-between,Fail
spec@!opengl 1.2@lodclamp-between-max,Fail
spec@!opengl 1.2@mipmap-setup,Fail
-spec@!opengl 1.2@tex3d,Fail
-spec@!opengl 1.2@tex3d-maxsize,Fail
-spec@!opengl 1.2@teximage-errors,Fail
-spec@!opengl 1.2@texwrap 3d proj,Fail
-spec@!opengl 1.2@texwrap 3d proj@GL_RGBA8- NPOT- projected,Fail
-spec@!opengl 1.2@texwrap 3d proj@GL_RGBA8- projected,Fail
-spec@!opengl 1.2@texwrap 3d proj@GL_RGBA8- swizzled- projected,Fail
-spec@!opengl 1.2@texwrap 3d,Fail
-spec@!opengl 1.2@texwrap 3d@GL_RGBA8,Fail
-spec@!opengl 1.2@texwrap 3d@GL_RGBA8- NPOT,Fail
-spec@!opengl 1.2@texwrap 3d@GL_RGBA8- swizzled,Fail
-spec@!opengl 1.3@tex3d-depth1,Fail
spec@!opengl 1.4@gl-1.4-polygon-offset,Fail
spec@!opengl 1.4@tex-miplevel-selection,Fail
spec@!opengl 1.4@tex-miplevel-selection-lod,Fail
@@ -814,14 +899,6 @@ spec@!opengl 1.5@depth-tex-compare,Fail
spec@!opengl 2.0@attrib-assignments,Fail
spec@!opengl 2.0@gl-2.0-edgeflag,Fail
spec@!opengl 2.0@gl-2.0-edgeflag-immediate,Fail
-spec@!opengl 2.0@occlusion-query-discard,Fail
-spec@!opengl 2.0@tex3d-npot,Fail
-spec@!opengl 2.1@minmax,Fail
-spec@!opengl 2.1@pbo,Fail
-spec@!opengl 2.1@pbo@test_polygon_stip,Fail
-spec@!opengl 2.1@polygon-stipple-fs,Fail
-spec@!opengl es 2.0@draw_buffers_gles2,Fail
-spec@arb_arrays_of_arrays@execution@glsl-arrays-copy-size-mismatch,Fail
spec@arb_depth_texture@depth-level-clamp,Fail
spec@arb_depth_texture@texwrap formats,Fail
spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT16,Fail
@@ -835,7 +912,6 @@ spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT32- NPOT,Fail
spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT32- swizzled,Fail
spec@arb_draw_elements_base_vertex@arb_draw_elements_base_vertex-negative-index,Crash
spec@arb_draw_elements_base_vertex@arb_draw_elements_base_vertex-negative-index-user_varrays,Crash
-spec@arb_es2_compatibility@arb_es2_compatibility-drawbuffers,Fail
spec@arb_es2_compatibility@texwrap formats,Fail
spec@arb_es2_compatibility@texwrap formats@GL_RGB565,Fail
spec@arb_es2_compatibility@texwrap formats@GL_RGB565- NPOT,Fail
@@ -844,58 +920,24 @@ spec@arb_fragment_coord_conventions@fp-arb-fragment-coord-conventions-integer,Fa
spec@arb_fragment_coord_conventions@fp-arb-fragment-coord-conventions-none,Fail
spec@arb_fragment_program@fp-indirections2,Fail
spec@arb_fragment_program@minmax,Fail
-spec@arb_framebuffer_object@arb_framebuffer_object-depth-stencil-blit stencil gl_depth24_stencil8,Fail
-spec@arb_framebuffer_object@arb_framebuffer_object-depth-stencil-blit stencil gl_stencil_index1,Fail
-spec@arb_framebuffer_object@arb_framebuffer_object-depth-stencil-blit stencil gl_stencil_index16,Fail
-spec@arb_framebuffer_object@arb_framebuffer_object-depth-stencil-blit stencil gl_stencil_index4,Fail
-spec@arb_framebuffer_object@arb_framebuffer_object-depth-stencil-blit stencil gl_stencil_index8,Fail
spec@arb_framebuffer_object@fbo-attachments-blit-scaled-linear,Fail
spec@arb_framebuffer_object@fbo-blit-stretch,Fail
-spec@arb_framebuffer_object@fbo-generatemipmap-3d,Fail
spec@arb_framebuffer_object@fbo-mipmap-copypix,Fail
-spec@arb_framebuffer_object@framebuffer-blit-levels draw stencil,Fail
-spec@arb_framebuffer_object@framebuffer-blit-levels read stencil,Fail
spec@arb_framebuffer_object@mixed-buffer-sizes,Fail
-spec@arb_framebuffer_object@same-attachment-glframebuffertexture2d-gl_depth_stencil_attachment,Fail
+spec@arb_framebuffer_object@same-attachment-tex2d-depth_stencil,Fail
spec@arb_framebuffer_srgb@arb_framebuffer_srgb-srgb_conformance,Fail
-spec@arb_framebuffer_srgb@blit renderbuffer linear_to_srgb downsample disabled clear,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer linear_to_srgb downsample disabled render,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer linear_to_srgb downsample enabled clear,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer linear_to_srgb downsample enabled render,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer linear_to_srgb msaa disabled clear,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer linear_to_srgb msaa disabled render,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer linear_to_srgb msaa enabled clear,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer linear_to_srgb msaa enabled render,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer srgb_to_linear downsample disabled clear,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer srgb_to_linear downsample disabled render,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer srgb_to_linear downsample enabled clear,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer srgb_to_linear downsample enabled render,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer srgb_to_linear msaa disabled clear,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer srgb_to_linear msaa disabled render,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer srgb_to_linear msaa enabled clear,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer srgb_to_linear msaa enabled render,Crash
-spec@arb_framebuffer_srgb@blit texture linear_to_srgb downsample disabled clear,Crash
-spec@arb_framebuffer_srgb@blit texture linear_to_srgb downsample disabled render,Crash
-spec@arb_framebuffer_srgb@blit texture linear_to_srgb downsample enabled clear,Crash
-spec@arb_framebuffer_srgb@blit texture linear_to_srgb downsample enabled render,Crash
-spec@arb_framebuffer_srgb@blit texture linear_to_srgb msaa disabled clear,Crash
-spec@arb_framebuffer_srgb@blit texture linear_to_srgb msaa disabled render,Crash
-spec@arb_framebuffer_srgb@blit texture linear_to_srgb msaa enabled clear,Crash
-spec@arb_framebuffer_srgb@blit texture linear_to_srgb msaa enabled render,Crash
-spec@arb_framebuffer_srgb@blit texture srgb_to_linear downsample disabled clear,Crash
-spec@arb_framebuffer_srgb@blit texture srgb_to_linear downsample disabled render,Crash
-spec@arb_framebuffer_srgb@blit texture srgb_to_linear downsample enabled clear,Crash
-spec@arb_framebuffer_srgb@blit texture srgb_to_linear downsample enabled render,Crash
-spec@arb_framebuffer_srgb@blit texture srgb_to_linear msaa disabled clear,Crash
-spec@arb_framebuffer_srgb@blit texture srgb_to_linear msaa disabled render,Crash
-spec@arb_framebuffer_srgb@blit texture srgb_to_linear msaa enabled clear,Crash
-spec@arb_framebuffer_srgb@blit texture srgb_to_linear msaa enabled render,Crash
spec@arb_internalformat_query2@all internalformat_<x>_size pname checks,Fail
spec@arb_internalformat_query2@all internalformat_<x>_size pname checks@GL_INTERNALFORMAT_ALPHA_SIZE,Fail
spec@arb_internalformat_query2@all internalformat_<x>_size pname checks@GL_INTERNALFORMAT_BLUE_SIZE,Fail
spec@arb_internalformat_query2@all internalformat_<x>_size pname checks@GL_INTERNALFORMAT_DEPTH_SIZE,Fail
spec@arb_internalformat_query2@all internalformat_<x>_size pname checks@GL_INTERNALFORMAT_GREEN_SIZE,Fail
spec@arb_internalformat_query2@all internalformat_<x>_size pname checks@GL_INTERNALFORMAT_RED_SIZE,Fail
+spec@arb_internalformat_query2@all internalformat_<x>_type pname checks,Fail
+spec@arb_internalformat_query2@all internalformat_<x>_type pname checks@GL_INTERNALFORMAT_ALPHA_TYPE,Fail
+spec@arb_internalformat_query2@all internalformat_<x>_type pname checks@GL_INTERNALFORMAT_BLUE_TYPE,Fail
+spec@arb_internalformat_query2@all internalformat_<x>_type pname checks@GL_INTERNALFORMAT_DEPTH_TYPE,Fail
+spec@arb_internalformat_query2@all internalformat_<x>_type pname checks@GL_INTERNALFORMAT_GREEN_TYPE,Fail
+spec@arb_internalformat_query2@all internalformat_<x>_type pname checks@GL_INTERNALFORMAT_RED_TYPE,Fail
spec@arb_internalformat_query2@api error checks,Fail
spec@arb_internalformat_query2@max dimensions related pname checks,Fail
spec@arb_internalformat_query2@max dimensions related pname checks@GL_MAX_COMBINED_DIMENSIONS,Fail
@@ -903,147 +945,47 @@ spec@arb_internalformat_query2@max dimensions related pname checks@GL_MAX_DEPTH,
spec@arb_internalformat_query2@max dimensions related pname checks@GL_MAX_HEIGHT,Fail
spec@arb_internalformat_query2@max dimensions related pname checks@GL_MAX_WIDTH,Fail
spec@arb_occlusion_query2@render,Fail
-spec@arb_occlusion_query@occlusion_query,Fail
spec@arb_occlusion_query@occlusion_query_conform,Fail
-spec@arb_occlusion_query@occlusion_query_meta_fragments,Fail
-spec@arb_occlusion_query@occlusion_query_meta_save,Fail
+spec@arb_occlusion_query@occlusion_query_conform@GetObjivAval_multi2,Fail
spec@arb_pixel_buffer_object@fbo-pbo-readpixels-small,Fail
spec@arb_pixel_buffer_object@pbo-getteximage,Fail
spec@arb_pixel_buffer_object@texsubimage-unpack pbo,Fail
spec@arb_point_sprite@arb_point_sprite-mipmap,Fail
spec@arb_provoking_vertex@arb-provoking-vertex-render,Fail
spec@arb_sampler_objects@sampler-objects,Fail
-spec@arb_shader_texture_lod@execution@glsl-fs-texturelod-01,Fail
-spec@arb_texture_multisample@arb_texture_multisample-teximage-3d-multisample,Fail
spec@arb_texture_rectangle@1-1-linear-texture,Fail
-spec@arb_texture_rectangle@copyteximage rect samples=2,Crash
-spec@arb_texture_rectangle@copyteximage rect samples=4,Crash
spec@arb_texture_rectangle@texrect-many,Crash
-spec@arb_texture_storage@texture-storage,Fail
-spec@arb_texture_storage@texture-storage@3D mipmapped ,Fail
-spec@arb_texture_storage@texture-storage@3D non-mipmapped ,Fail
spec@arb_vertex_program@minmax,Fail
-spec@egl 1.4@egl-copy-buffers,Crash
spec@egl 1.4@eglterminate then unbind context,Fail
spec@egl 1.4@largest possible eglcreatepbuffersurface and then glclear,Fail
-spec@egl_ext_protected_content@conformance,Fail
spec@egl_khr_gl_image@egl_khr_gl_renderbuffer_image-clear-shared-image gl_depth_component24,Fail
spec@egl_khr_gl_image@egl_khr_gl_renderbuffer_image-clear-shared-image gl_rgba,Fail
spec@egl_khr_surfaceless_context@viewport,Fail
spec@egl_mesa_configless_context@basic,Fail
-spec@ext_direct_state_access@indexed-state-queries 12,Fail
-spec@ext_direct_state_access@indexed-state-queries 12@GetIntegerIndexedvEXT,Fail
spec@ext_direct_state_access@multi-texture,Crash
-spec@ext_direct_state_access@multi-texture@MultiTexImage3DEXT,Fail
spec@ext_direct_state_access@multi-texture@MultiTexSubImage1DEXT,Fail
-spec@ext_direct_state_access@textures,Fail
-spec@ext_direct_state_access@textures@CopyTextureSubImage3DEXT + display list GL_COMPILE,Fail
-spec@ext_direct_state_access@textures@CopyTextureSubImage3DEXT + display list GL_COMPILE_AND_EXECUTE,Fail
-spec@ext_direct_state_access@textures@CopyTextureSubImage3DEXT,Fail
spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_1D + glTex* + display list GL_COMPILE,Fail
spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_1D + glTex* + display list GL_COMPILE_AND_EXECUTE,Fail
spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_1D + glTex*,Fail
-spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_3D + glTex* + display list GL_COMPILE,Fail
-spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_3D + glTex* + display list GL_COMPILE_AND_EXECUTE,Fail
-spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_3D + glTex*,Fail
-spec@ext_direct_state_access@textures@TextureImage2DEXT + display list GL_COMPILE,Fail
-spec@ext_direct_state_access@textures@TextureImage2DEXT + display list GL_COMPILE_AND_EXECUTE,Fail
-spec@ext_direct_state_access@textures@TextureImage2DEXT,Fail
-spec@ext_direct_state_access@textures@TextureImage3DEXT + display list GL_COMPILE,Fail
-spec@ext_direct_state_access@textures@TextureImage3DEXT + display list GL_COMPILE_AND_EXECUTE,Fail
-spec@ext_direct_state_access@textures@TextureImage3DEXT,Fail
-spec@ext_direct_state_access@textures@TextureParameterfEXT + display list GL_COMPILE,Fail
-spec@ext_direct_state_access@textures@TextureParameterfEXT + display list GL_COMPILE_AND_EXECUTE,Fail
-spec@ext_direct_state_access@textures@TextureParameterfEXT,Fail
-spec@ext_direct_state_access@textures@TextureParameteriEXT + display list GL_COMPILE,Fail
-spec@ext_direct_state_access@textures@TextureParameteriEXT + display list GL_COMPILE_AND_EXECUTE,Fail
-spec@ext_direct_state_access@textures@TextureParameteriEXT,Fail
-spec@ext_direct_state_access@textures@TextureParameterivEXT + display list GL_COMPILE,Fail
-spec@ext_direct_state_access@textures@TextureParameterivEXT + display list GL_COMPILE_AND_EXECUTE,Fail
-spec@ext_direct_state_access@textures@TextureParameterivEXT,Fail
spec@ext_direct_state_access@textures@TextureSubImage2DEXT + display list GL_COMPILE,Fail
spec@ext_direct_state_access@textures@TextureSubImage2DEXT + display list GL_COMPILE_AND_EXECUTE,Fail
spec@ext_direct_state_access@textures@TextureSubImage2DEXT,Fail
-spec@ext_direct_state_access@textures@TextureSubImage3DEXT + display list GL_COMPILE,Fail
-spec@ext_direct_state_access@textures@TextureSubImage3DEXT + display list GL_COMPILE_AND_EXECUTE,Fail
-spec@ext_direct_state_access@textures@TextureSubImage3DEXT,Fail
spec@ext_framebuffer_blit@fbo-blit-check-limits,Fail
-spec@ext_framebuffer_multisample@blit-flipped 2 x,Crash
-spec@ext_framebuffer_multisample@blit-flipped 2 y,Crash
-spec@ext_framebuffer_multisample@blit-flipped 4 x,Crash
-spec@ext_framebuffer_multisample@blit-flipped 4 y,Crash
+
+# Remaining crashes are CMA allocation failures.
spec@ext_framebuffer_multisample@blit-mismatched-formats,Fail
-spec@ext_framebuffer_multisample@clip-and-scissor-blit 2 downsample,Crash
-spec@ext_framebuffer_multisample@clip-and-scissor-blit 2 msaa,Crash
-spec@ext_framebuffer_multisample@clip-and-scissor-blit 2 upsample,Crash
-spec@ext_framebuffer_multisample@clip-and-scissor-blit 4 downsample,Crash
-spec@ext_framebuffer_multisample@clip-and-scissor-blit 4 msaa,Crash
-spec@ext_framebuffer_multisample@clip-and-scissor-blit 4 upsample,Crash
-spec@ext_framebuffer_multisample@enable-flag,Crash
+spec@ext_framebuffer_multisample@clip-and-scissor-blit 2 msaa,Fail
+spec@ext_framebuffer_multisample@clip-and-scissor-blit 4 msaa,Fail
spec@ext_framebuffer_multisample@interpolation 2 centroid-edges,Fail
spec@ext_framebuffer_multisample@interpolation 4 centroid-edges,Fail
-spec@ext_framebuffer_multisample@line-smooth 2,Crash
-spec@ext_framebuffer_multisample@line-smooth 4,Crash
-spec@ext_framebuffer_multisample@multisample-blit 2 color linear,Crash
-spec@ext_framebuffer_multisample@multisample-blit 2 color,Crash
-spec@ext_framebuffer_multisample@multisample-blit 2 depth,Crash
-spec@ext_framebuffer_multisample@multisample-blit 2 stencil,Crash
-spec@ext_framebuffer_multisample@multisample-blit 4 color linear,Crash
-spec@ext_framebuffer_multisample@multisample-blit 4 color,Crash
-spec@ext_framebuffer_multisample@multisample-blit 4 depth,Crash
-spec@ext_framebuffer_multisample@multisample-blit 4 stencil,Crash
-spec@ext_framebuffer_multisample@no-color 2 depth combined,Crash
-spec@ext_framebuffer_multisample@no-color 2 depth single,Crash
-spec@ext_framebuffer_multisample@no-color 2 depth-computed combined,Crash
-spec@ext_framebuffer_multisample@no-color 2 depth-computed single,Crash
-spec@ext_framebuffer_multisample@no-color 2 stencil combined,Crash
-spec@ext_framebuffer_multisample@no-color 2 stencil single,Crash
-spec@ext_framebuffer_multisample@no-color 4 depth combined,Crash
-spec@ext_framebuffer_multisample@no-color 4 depth single,Crash
-spec@ext_framebuffer_multisample@no-color 4 depth-computed combined,Crash
-spec@ext_framebuffer_multisample@no-color 4 depth-computed single,Crash
-spec@ext_framebuffer_multisample@no-color 4 stencil combined,Crash
-spec@ext_framebuffer_multisample@no-color 4 stencil single,Crash
-spec@ext_framebuffer_multisample@point-smooth 2,Crash
-spec@ext_framebuffer_multisample@point-smooth 4,Crash
-spec@ext_framebuffer_multisample@polygon-smooth 2,Crash
-spec@ext_framebuffer_multisample@polygon-smooth 4,Crash
spec@ext_framebuffer_multisample@sample-alpha-to-coverage 2 color,Fail
-spec@ext_framebuffer_multisample@sample-alpha-to-coverage 2 depth,Crash
spec@ext_framebuffer_multisample@sample-alpha-to-coverage 4 color,Fail
-spec@ext_framebuffer_multisample@sample-alpha-to-coverage 4 depth,Crash
-spec@ext_framebuffer_multisample@sample-coverage 2 inverted,Crash
-spec@ext_framebuffer_multisample@sample-coverage 2 non-inverted,Crash
-spec@ext_framebuffer_multisample@sample-coverage 4 inverted,Crash
-spec@ext_framebuffer_multisample@sample-coverage 4 non-inverted,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 2 color downsample,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 2 color msaa,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 2 color upsample,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 2 depth downsample,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 2 depth msaa,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 2 depth upsample,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 4 color downsample,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 4 color msaa,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 4 color upsample,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 4 depth downsample,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 4 depth msaa,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 4 depth upsample,Crash
-spec@ext_framebuffer_multisample@upsample 2 color linear,Crash
-spec@ext_framebuffer_multisample@upsample 2 color,Crash
-spec@ext_framebuffer_multisample@upsample 2 depth,Crash
-spec@ext_framebuffer_multisample@upsample 2 stencil,Crash
-spec@ext_framebuffer_multisample@upsample 4 color linear,Crash
-spec@ext_framebuffer_multisample@upsample 4 color,Crash
-spec@ext_framebuffer_multisample@upsample 4 depth,Crash
-spec@ext_framebuffer_multisample@upsample 4 stencil,Crash
-spec@ext_framebuffer_multisample_blit_scaled@negative-blit-scaled,Crash
-spec@ext_framebuffer_object@fbo-3d,Fail
-spec@ext_framebuffer_object@fbo-blending-format-quirks,Fail
+spec@ext_framebuffer_multisample@sample-coverage 2 inverted,Fail
+spec@ext_framebuffer_multisample@sample-coverage 2 non-inverted,Fail
+spec@ext_framebuffer_multisample@sample-coverage 4 inverted,Fail
+spec@ext_framebuffer_multisample@sample-coverage 4 non-inverted,Fail
+
spec@ext_framebuffer_object@fbo-depth-sample-compare,Fail
-spec@ext_framebuffer_object@fbo-stencil-gl_stencil_index1-blit,Fail
-spec@ext_framebuffer_object@fbo-stencil-gl_stencil_index16-blit,Fail
-spec@ext_framebuffer_object@fbo-stencil-gl_stencil_index4-blit,Fail
-spec@ext_framebuffer_object@fbo-stencil-gl_stencil_index8-blit,Fail
spec@ext_image_dma_buf_import@ext_image_dma_buf_import-export,Fail
spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_p010,Fail
spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_p012,Fail
@@ -1054,10 +996,8 @@ spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y216,Fail
spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y410,Fail
spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y412,Fail
spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y416,Fail
-spec@ext_occlusion_query_boolean@any-samples,Fail
spec@ext_packed_depth_stencil@depth_stencil texture,Fail
spec@ext_packed_depth_stencil@fbo-depthstencil-gl_depth24_stencil8-clear,Fail
-spec@ext_packed_depth_stencil@fbo-stencil-gl_depth24_stencil8-blit,Fail
spec@ext_packed_depth_stencil@texwrap formats,Fail
spec@ext_packed_depth_stencil@texwrap formats@GL_DEPTH24_STENCIL8,Fail
spec@ext_packed_depth_stencil@texwrap formats@GL_DEPTH24_STENCIL8- NPOT,Fail
@@ -1087,6 +1027,24 @@ spec@ext_texture_srgb@texwrap formats@GL_SRGB8- swizzled,Fail
spec@ext_texture_srgb@texwrap formats@GL_SRGB8_ALPHA8,Fail
spec@ext_texture_srgb@texwrap formats@GL_SRGB8_ALPHA8- NPOT,Fail
spec@ext_texture_srgb@texwrap formats@GL_SRGB8_ALPHA8- swizzled,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled@GL_COMPRESSED_SLUMINANCE- swizzled- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled@GL_COMPRESSED_SLUMINANCE_ALPHA- swizzled- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled@GL_COMPRESSED_SRGB- swizzled- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled@GL_COMPRESSED_SRGB_ALPHA- swizzled- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT- swizzled- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT- swizzled- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT- swizzled- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled@GL_COMPRESSED_SRGB_S3TC_DXT1_EXT- swizzled- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor@GL_COMPRESSED_SLUMINANCE- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor@GL_COMPRESSED_SLUMINANCE_ALPHA- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor@GL_COMPRESSED_SRGB- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor@GL_COMPRESSED_SRGB_ALPHA- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor@GL_COMPRESSED_SRGB_S3TC_DXT1_EXT- border color only,Fail
spec@glsl-1.10@built-in constants,Fail
spec@glsl-1.10@built-in constants@gl_MaxVertexAttribs,Fail
spec@glsl-1.10@execution@built-in-functions@fs-cos-float,Fail
@@ -1153,12 +1111,7 @@ spec@glsl-1.10@execution@built-in-functions@vs-tan-float,Fail
spec@glsl-1.10@execution@built-in-functions@vs-tan-vec2,Fail
spec@glsl-1.10@execution@built-in-functions@vs-tan-vec3,Fail
spec@glsl-1.10@execution@built-in-functions@vs-tan-vec4,Fail
-spec@glsl-1.10@execution@fs-texture-select,Fail
spec@glsl-1.10@execution@glsl-fs-convolution-2,Fail
-spec@glsl-1.10@execution@samplers@glsl-fs-sampler-numbering-2,Fail
-spec@glsl-1.10@execution@samplers@glsl-fs-sampler-numbering-3,Fail
-spec@glsl-1.10@execution@samplers@in-parameter-array,Fail
-spec@glsl-1.10@execution@texture3d,Fail
spec@glsl-1.20@built-in constants,Fail
spec@glsl-1.20@built-in constants@gl_MaxVertexAttribs,Fail
spec@glsl-1.20@execution@fs-nan-builtin-max,Fail
@@ -1167,13 +1120,11 @@ spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture() 1d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture() 1dshadow,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture() 2d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture() 2dshadow,Fail
-spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture() 3d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture() cube,Crash
spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture(bias) 1d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture(bias) 1dshadow,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture(bias) 2d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture(bias) 2dshadow,Fail
-spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture(bias) 3d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture(bias) cube,Crash
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj 1d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj 1d_projvec4,Fail
@@ -1181,19 +1132,15 @@ spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj 1dshadow,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj 2d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj 2d_projvec4,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj 2dshadow,Fail
-spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj 3d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj(bias) 1d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj(bias) 1d_projvec4,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj(bias) 1dshadow,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj(bias) 2d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj(bias) 2d_projvec4,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj(bias) 2dshadow,Fail
-spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj(bias) 3d,Fail
-spec@glsl-1.20@execution@variable-indexing@fs-temp-array-mat4-index-col-row-wr,Fail
-spec@glsl-1.20@execution@variable-indexing@vs-temp-array-mat4-index-col-row-wr,Fail
+
spec@glsl-1.20@execution@vs-nan-builtin-max,Fail
spec@glsl-1.20@execution@vs-nan-builtin-min,Fail
-spec@intel_performance_query@intel_performance_query-issue_2235,Fail
spec@khr_texture_compression_astc@basic-gles,Fail
spec@khr_texture_compression_astc@miptree-gl ldr,Fail
spec@khr_texture_compression_astc@miptree-gl ldr@LDR Profile,Fail
@@ -1208,3 +1155,48 @@ spec@khr_texture_compression_astc@miptree-gles ldr@LDR Profile,Fail
spec@khr_texture_compression_astc@miptree-gles srgb,Fail
spec@khr_texture_compression_astc@miptree-gles srgb-fp,Fail
spec@oes_compressed_etc1_rgb8_texture@miptree,Fail
+spec@!opengl 1.0@depth-clear-precision-check,Fail
+spec@!opengl 1.0@depth-clear-precision-check@depth16,Fail
+spec@!opengl 1.0@depth-clear-precision-check@depth32,Fail
+
+spec@glsl-1.10@execution@variable-indexing@vs-output-array-vec2-index-wr-no-unroll,Fail
+
+spec@ext_framebuffer_multisample@accuracy 2 depth_draw depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy 2 depth_draw small depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy 2 depth_resolve depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy 2 depth_resolve small depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy 4 depth_draw depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy 4 depth_draw small depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy 4 depth_resolve depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy 4 depth_resolve small depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy 4 srgb depthstencil linear,Fail
+spec@ext_framebuffer_multisample@accuracy 4 srgb depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy 4 srgb small depthstencil linear,Fail
+spec@ext_framebuffer_multisample@accuracy 4 srgb small depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy all_samples depth_draw depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy all_samples depth_draw small depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy all_samples depth_resolve depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy all_samples depth_resolve small depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy all_samples srgb depthstencil linear,Fail
+spec@ext_framebuffer_multisample@accuracy all_samples srgb depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy all_samples srgb small depthstencil linear,Fail
+spec@ext_framebuffer_multisample@accuracy all_samples srgb small depthstencil,Fail
+spec@ext_framebuffer_multisample@multisample-blit 2 depth,Fail
+spec@ext_framebuffer_multisample@multisample-blit 4 depth,Fail
+spec@ext_framebuffer_multisample@no-color 2 depth combined,Fail
+spec@ext_framebuffer_multisample@no-color 2 depth-computed combined,Fail
+spec@ext_framebuffer_multisample@no-color 4 depth combined,Fail
+spec@ext_framebuffer_multisample@no-color 4 depth-computed combined,Fail
+spec@ext_framebuffer_multisample@unaligned-blit 2 depth msaa,Fail
+spec@ext_framebuffer_multisample@unaligned-blit 2 stencil msaa,Fail
+spec@ext_framebuffer_multisample@unaligned-blit 4 depth msaa,Fail
+spec@ext_framebuffer_multisample@unaligned-blit 4 stencil msaa,Fail
+
+# https://gitlab.freedesktop.org/mesa/piglit/-/merge_requests/817
+spec@intel_performance_query@intel_performance_query-issue_2235,Fail
+
+# Bisected to 35ae5dce39c ("mesa: don't pass Infs to the shader via gl_Fog.scale")
+spec@glsl-1.10@execution@glsl-1.10-built-in-uniform-state,Fail
+
+# Couldn't reproduce locally
+spec@oes_packed_depth_stencil@depth_stencil texture gles2,Fail
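
As the line/polygon stipple comment above notes, stipple could be emulated with shaders. Below is a minimal, hypothetical GLSL ES 1.00 fragment-shader sketch of line-stipple emulation; it is not part of this change. It assumes the vertex stage writes a stipple_coord varying carrying the distance in pixels travelled along the line, and that the 16-bit glLineStipple pattern has been expanded into a 16x1 stipple_pattern texture (1.0 = draw, 0.0 = gap):

    #version 100
    precision mediump float;

    uniform float stipple_factor;       // repeat factor, as passed to glLineStipple()
    uniform sampler2D stipple_pattern;  // 16x1 texture: 1.0 = draw, 0.0 = gap
    varying float stipple_coord;        // pixels travelled along the line, from the VS
    varying vec4 color;

    void main()
    {
        // Which of the 16 pattern bits covers this fragment.
        float bit = mod(floor(stipple_coord / stipple_factor), 16.0);
        // Sample the centre of that bit's texel; discard if it falls in a gap.
        if (texture2D(stipple_pattern, vec2((bit + 0.5) / 16.0, 0.5)).r < 0.5)
            discard;
        gl_FragColor = color;
    }

The floor/mod arithmetic and the pattern texture keep the sketch within GLSL ES 1.00, which has no integer bitwise operators.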
diff --git a/src/broadcom/ci/broadcom-rpi3-flakes.txt b/src/broadcom/ci/broadcom-rpi3-flakes.txt
new file mode 100644
index 00000000000..7e11d7da34e
--- /dev/null
+++ b/src/broadcom/ci/broadcom-rpi3-flakes.txt
@@ -0,0 +1,52 @@
+dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_neg_x_and_pos_x_and_neg_x_neg_y_neg_z
+dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_neg_x_and_pos_x_and_pos_y_pos_z
+dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_neg_x_and_pos_x_neg_y_pos_z_and_neg_x_pos_y_neg_z
+dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_pos_x_and_neg_x_neg_y_pos_z_and_neg_x_pos_y_neg_z
+
+glx@glx-multi-window-single-context
+glx@glx-visuals-stencil
+shaders@glsl-vs-loop
+shaders@glsl-vs-loop-nested
+spec@ext_framebuffer_blit@fbo-sys-blit
+spec@ext_framebuffer_blit@fbo-sys-sub-blit
+spec@egl_chromium_sync_control@conformance
+
+# CMA allocations that may sometimes succeed
+spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=2
+spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=4
+spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=2
+spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=4
+spec@!opengl 1.1@depthstencil-default_fb-clear samples=2
+
+# https://gitlab.freedesktop.org/mesa/mesa/-/issues/7186
+spec@!opengl 1.0@rasterpos
+
+# Sometimes fail when run along with other tests, never when run by themselves
+spec@!opengl 1.1@copypixels-sync
+spec@!opengl 1.1@copypixels-draw-sync
+spec@!opengl 1.1@draw-copypixels-sync
+spec@!opengl 1.1@draw-sync
+
+# flaky on wayland, was stable on x11
+spec@ext_image_dma_buf_import@ext_image_dma_buf_import
+
+# fails on arm64, passes on armhf
+spec@arb_depth_buffer_float@depthstencil-render-miplevels 1024 s=z24_s8_d=z32f
+
+# Sometimes goes into an infinite loop and times out
+spec@arb_depth_buffer_float@depthstencil-render-miplevels 146 s=z24_s8_d=z32f_s8
+
+spec@arb_depth_texture@depthstencil-render-miplevels 273 d=z24
+spec@arb_shader_texture_lod@execution@tex-miplevel-selection *lod 1d
+spec@arb_occlusion_query2@render
+
+# Updated by ci-collate, found in this job run: https://gitlab.freedesktop.org/mesa/mesa/-/jobs/56164970
+glx@glx-multithread-clearbuffer
+
+spec@arb_vertex_buffer_object@vbo-subdata-many drawarrays
+spec@arb_vertex_buffer_object@vbo-subdata-many drawelements
+spec@arb_vertex_buffer_object@vbo-subdata-many drawrangeelements
+
+# Nightly run expectations update
+spec@glsl-1.20@execution@variable-indexing@fs-uniform-mat2-rd
+
diff --git a/src/broadcom/ci/deqp-vc4-rpi3-skips.txt b/src/broadcom/ci/broadcom-rpi3-skips.txt
index 62d4d939d2d..6da79a463a7 100644
--- a/src/broadcom/ci/deqp-vc4-rpi3-skips.txt
+++ b/src/broadcom/ci/broadcom-rpi3-skips.txt
@@ -5,10 +5,6 @@
# This is causing a binning memory overflow problem
dEQP-GLES2.functional.fragment_ops.scissor.outside_render_line
-# These are very slow
-dEQP-GLES2.functional.uniform_api.random.3
-dEQP-GLES2.functional.uniform_api.random.79
-
# Conformance issue: VC4 needs dynamic loops in the VS to cause a
# shader link failure.
#
@@ -20,6 +16,21 @@ dEQP-GLES2.functional.uniform_api.random.79
# list for tracking.
dEQP-GLES2.functional.shaders.loops.*dynamic.*vertex
-# Timeout tests (> 1 minute to run)
-KHR-GLES2.texture_3d.filtering.sizes.3x7x5_linear_mipmap_linear
-KHR-GLES2.texture_3d.filtering.sizes.4x8x8_linear_mipmap_linear
+# Slow tests (> 1 minute to run)
+spec@!opengl 1.1@streaming-texture-leak
+
+# Versions / Extensions not supported
+spec@!opengl 3.*
+spec@!opengl 4.*
+spec@!opengl es 3.*
+spec@arb_gpu_shader5.*
+spec@arb_gpu_shader_fp64.*
+spec@arb_gpu_shader_int64.*
+spec@arb_tessellation_shader.*
+spec@arb_texture_cube_map.*
+spec@glsl-1.30.*
+spec@glsl-1.40.*
+spec@glsl-1.50.*
+spec@glsl-3.*
+spec@glsl-4.*
+spec@glsl-es-3.*
diff --git a/src/broadcom/ci/broadcom-rpi4-fails.txt b/src/broadcom/ci/broadcom-rpi4-fails.txt
new file mode 100644
index 00000000000..bac3d618634
--- /dev/null
+++ b/src/broadcom/ci/broadcom-rpi4-fails.txt
@@ -0,0 +1,602 @@
+glx@glx-make-current,Fail
+glx@glx-multi-window-single-context,Fail
+glx@glx-swap-pixmap-bad,Fail
+glx@glx-visuals-depth -pixmap,Fail
+glx@glx-visuals-stencil -pixmap,Fail
+glx@glx_arb_create_context_es2_profile@invalid opengl es version,Fail
+glx@glx_arb_create_context_no_error@no error,Fail
+shaders@glsl-bug-110796,Fail
+shaders@point-vertex-id divisor,Fail
+shaders@point-vertex-id gl_instanceid divisor,Fail
+shaders@point-vertex-id gl_instanceid,Fail
+shaders@point-vertex-id gl_vertexid divisor,Fail
+shaders@point-vertex-id gl_vertexid gl_instanceid divisor,Fail
+shaders@point-vertex-id gl_vertexid gl_instanceid,Fail
+shaders@point-vertex-id gl_vertexid,Fail
+spec@!opengl 1.0@gl-1.0-edgeflag,Fail
+spec@!opengl 1.0@gl-1.0-edgeflag-quads,Fail
+spec@!opengl 1.0@gl-1.0-no-op-paths,Fail
+spec@!opengl 1.0@gl-1.0-user-clip-all-planes,Fail
+spec@!opengl 1.1@point-line-no-cull,Fail
+spec@!opengl 1.1@teximage-colors gl_alpha16@Exact upload-download of GL_ALPHA16,Fail
+spec@!opengl 1.1@texwrap formats bordercolor,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_INTENSITY12- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_INTENSITY16- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE12- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE12_ALPHA12- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE12_ALPHA4- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE16- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE16_ALPHA16- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_RGB12- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_RGB16- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_RGBA12- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_RGBA16- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_INTENSITY12- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_INTENSITY16- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE12- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE12_ALPHA12- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE12_ALPHA4- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE16- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE16_ALPHA16- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_RGB12- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_RGB16- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_RGBA12- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_RGBA16- border color only,Fail
+spec@!opengl 1.4@gl-1.4-polygon-offset,Fail
+spec@!opengl 2.0@gl-2.0-edgeflag,Fail
+spec@!opengl 2.0@gl-2.0-edgeflag-immediate,Fail
+spec@arb_color_buffer_float@gl_rgba32f-render,Fail
+spec@arb_color_buffer_float@gl_rgba32f-render-fog,Fail
+spec@arb_color_buffer_float@gl_rgba32f-render-sanity,Fail
+spec@arb_color_buffer_float@gl_rgba32f-render-sanity-fog,Fail
+spec@arb_copy_image@arb_copy_image-formats,Fail
+spec@arb_copy_image@arb_copy_image-formats@Source: GL_ALPHA16/Destination: GL_ALPHA16,Fail
+spec@arb_depth_buffer_float@fbo-generatemipmap-formats,Fail
+spec@arb_depth_buffer_float@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT32F NPOT,Fail
+spec@arb_depth_buffer_float@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT32F,Fail
+spec@arb_depth_buffer_float@texwrap formats bordercolor,Fail
+spec@arb_depth_buffer_float@texwrap formats bordercolor-swizzled,Fail
+spec@arb_depth_buffer_float@texwrap formats bordercolor-swizzled@GL_DEPTH32F_STENCIL8- swizzled- border color only,Fail
+spec@arb_depth_buffer_float@texwrap formats bordercolor-swizzled@GL_DEPTH_COMPONENT32F- swizzled- border color only,Fail
+spec@arb_depth_buffer_float@texwrap formats bordercolor@GL_DEPTH32F_STENCIL8- border color only,Fail
+spec@arb_depth_buffer_float@texwrap formats bordercolor@GL_DEPTH_COMPONENT32F- border color only,Fail
+spec@arb_depth_buffer_float@texwrap formats offset,Fail
+spec@arb_depth_buffer_float@texwrap formats offset@GL_DEPTH32F_STENCIL8- NPOT,Fail
+spec@arb_depth_buffer_float@texwrap formats offset@GL_DEPTH_COMPONENT32F- NPOT,Fail
+spec@arb_depth_buffer_float@texwrap formats,Fail
+spec@arb_depth_buffer_float@texwrap formats@GL_DEPTH32F_STENCIL8- NPOT,Fail
+spec@arb_depth_buffer_float@texwrap formats@GL_DEPTH_COMPONENT32F- NPOT,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT NPOT,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT16 NPOT,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT16,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT24 NPOT,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT24,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT32 NPOT,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT32,Fail
+spec@arb_depth_texture@texwrap formats bordercolor,Fail
+spec@arb_depth_texture@texwrap formats bordercolor-swizzled,Fail
+spec@arb_depth_texture@texwrap formats bordercolor-swizzled@GL_DEPTH_COMPONENT16- swizzled- border color only,Fail
+spec@arb_depth_texture@texwrap formats bordercolor-swizzled@GL_DEPTH_COMPONENT24- swizzled- border color only,Fail
+spec@arb_depth_texture@texwrap formats bordercolor-swizzled@GL_DEPTH_COMPONENT32- swizzled- border color only,Fail
+spec@arb_depth_texture@texwrap formats bordercolor@GL_DEPTH_COMPONENT16- border color only,Fail
+spec@arb_depth_texture@texwrap formats bordercolor@GL_DEPTH_COMPONENT24- border color only,Fail
+spec@arb_depth_texture@texwrap formats bordercolor@GL_DEPTH_COMPONENT32- border color only,Fail
+spec@arb_depth_texture@texwrap formats offset,Fail
+spec@arb_depth_texture@texwrap formats offset@GL_DEPTH_COMPONENT16- NPOT,Fail
+spec@arb_depth_texture@texwrap formats offset@GL_DEPTH_COMPONENT24- NPOT,Fail
+spec@arb_depth_texture@texwrap formats offset@GL_DEPTH_COMPONENT32- NPOT,Fail
+spec@arb_depth_texture@texwrap formats,Fail
+spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT16- NPOT,Fail
+spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT24- NPOT,Fail
+spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT32- NPOT,Fail
+spec@arb_direct_state_access@gettextureimage-formats init-by-rendering,Fail
+spec@arb_direct_state_access@gettextureimage-formats,Fail
+spec@arb_framebuffer_object@fbo-blit-scaled-linear,Fail
+spec@arb_point_sprite@arb_point_sprite-checkerboard,Fail
+spec@arb_point_sprite@arb_point_sprite-mipmap,Fail
+spec@arb_shader_storage_buffer_object@compiler@atomicmin-swizzle.vert,Fail
+spec@arb_shader_texture_lod@execution@arb_shader_texture_lod-texgrad,Fail
+spec@arb_shader_texture_lod@execution@arb_shader_texture_lod-texgradcube,Fail
+spec@arb_texture_buffer_object@formats (fs- arb),Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_ALPHA16F_ARB,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_ALPHA32F_ARB,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_INTENSITY16F_ARB,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_INTENSITY16I_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_INTENSITY16UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_INTENSITY32F_ARB,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_INTENSITY32I_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_INTENSITY32UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_INTENSITY8I_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_INTENSITY8UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE16F_ARB,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE16I_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE16UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE32F_ARB,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE32I_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE32UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE8I_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE8UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE_ALPHA16F_ARB,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE_ALPHA16I_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE_ALPHA16UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE_ALPHA32F_ARB,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE_ALPHA32I_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE_ALPHA32UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE_ALPHA8I_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE_ALPHA8UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb),Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_ALPHA16F_ARB,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_ALPHA32F_ARB,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_INTENSITY16F_ARB,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_INTENSITY16I_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_INTENSITY16UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_INTENSITY32F_ARB,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_INTENSITY32I_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_INTENSITY32UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_INTENSITY8I_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_INTENSITY8UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE16F_ARB,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE16I_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE16UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE32F_ARB,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE32I_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE32UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE8I_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE8UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE_ALPHA16F_ARB,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE_ALPHA16I_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE_ALPHA16UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE_ALPHA32F_ARB,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE_ALPHA32I_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE_ALPHA32UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE_ALPHA8I_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE_ALPHA8UI_EXT,Fail
+spec@arb_texture_buffer_object@texture-buffer-size-clamp,Fail
+spec@arb_texture_buffer_object@texture-buffer-size-clamp@r8ui_texture_buffer_size_via_sampler,Fail
+spec@arb_texture_buffer_object@texture-buffer-size-clamp@rg8ui_texture_buffer_size_via_sampler,Fail
+spec@arb_texture_buffer_object@texture-buffer-size-clamp@rgba8ui_texture_buffer_size_via_sampler,Fail
+spec@arb_texture_float@fbo-blending-formats,Fail
+spec@arb_texture_float@fbo-blending-formats@GL_ALPHA32F_ARB,Fail
+spec@arb_texture_float@fbo-blending-formats@GL_INTENSITY32F_ARB,Fail
+spec@arb_texture_float@fbo-blending-formats@GL_LUMINANCE32F_ARB,Fail
+spec@arb_texture_float@fbo-blending-formats@GL_LUMINANCE_ALPHA32F_ARB,Fail
+spec@arb_texture_float@fbo-blending-formats@GL_RGB32F,Fail
+spec@arb_texture_float@fbo-blending-formats@GL_RGBA32F,Fail
+spec@arb_texture_float@texwrap formats bordercolor,Fail
+spec@arb_texture_float@texwrap formats bordercolor-swizzled,Fail
+spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_ALPHA32F_ARB- swizzled- border color only,Fail
+spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_INTENSITY32F_ARB- swizzled- border color only,Fail
+spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_LUMINANCE32F_ARB- swizzled- border color only,Fail
+spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_LUMINANCE_ALPHA32F_ARB- swizzled- border color only,Fail
+spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_RGB32F- swizzled- border color only,Fail
+spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_RGBA32F- swizzled- border color only,Fail
+spec@arb_texture_float@texwrap formats bordercolor@GL_ALPHA32F_ARB- border color only,Fail
+spec@arb_texture_float@texwrap formats bordercolor@GL_INTENSITY32F_ARB- border color only,Fail
+spec@arb_texture_float@texwrap formats bordercolor@GL_LUMINANCE32F_ARB- border color only,Fail
+spec@arb_texture_float@texwrap formats bordercolor@GL_LUMINANCE_ALPHA32F_ARB- border color only,Fail
+spec@arb_texture_float@texwrap formats bordercolor@GL_RGB32F- border color only,Fail
+spec@arb_texture_float@texwrap formats bordercolor@GL_RGBA32F- border color only,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch.*,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_R16I,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_R16_SNORM,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_R32I,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_R8I,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_R8_SNORM,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RG16F,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RG16I,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RG16_SNORM,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RG32F,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RG32I,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RG8I,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RG8_SNORM,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGB10,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGB10_A2,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGB10_A2UI,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGB16I,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGB16_SNORM,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGB32I,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGB4,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGB8,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGB8I,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGB9_E5,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGBA16,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGBA16F,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGBA16I,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGBA32F,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGBA32I,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGBA4,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGBA8,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGBA8I,Fail
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_SRGB8_ALPHA8,Fail
+spec@arb_texture_rectangle@1-1-linear-texture,Fail
+spec@arb_texture_rg@fbo-blending-formats-float,Fail
+spec@arb_texture_rg@fbo-blending-formats-float@GL_R32F,Fail
+spec@arb_texture_rg@fbo-blending-formats-float@GL_RG32F,Fail
+spec@arb_texture_rg@texwrap formats bordercolor,Fail
+spec@arb_texture_rg@texwrap formats bordercolor-swizzled,Fail
+spec@arb_texture_rg@texwrap formats bordercolor-swizzled@GL_R16- swizzled- border color only,Fail
+spec@arb_texture_rg@texwrap formats bordercolor-swizzled@GL_RG16- swizzled- border color only,Fail
+spec@arb_texture_rg@texwrap formats bordercolor@GL_R16- border color only,Fail
+spec@arb_texture_rg@texwrap formats bordercolor@GL_RG16- border color only,Fail
+spec@arb_texture_rg@texwrap formats-float bordercolor,Fail
+spec@arb_texture_rg@texwrap formats-float bordercolor-swizzled,Fail
+spec@arb_texture_rg@texwrap formats-float bordercolor-swizzled@GL_R32F- swizzled- border color only,Fail
+spec@arb_texture_rg@texwrap formats-float bordercolor-swizzled@GL_RG32F- swizzled- border color only,Fail
+spec@arb_texture_rg@texwrap formats-float bordercolor@GL_R32F- border color only,Fail
+spec@arb_texture_rg@texwrap formats-float bordercolor@GL_RG32F- border color only,Fail
+spec@arb_texture_rg@texwrap formats-float offset,Fail
+spec@arb_texture_rg@texwrap formats-float offset@GL_R32F- NPOT,Fail
+spec@arb_texture_rg@texwrap formats-float offset@GL_RG32F- NPOT,Fail
+spec@arb_texture_rg@texwrap formats-float,Fail
+spec@arb_texture_rg@texwrap formats-float@GL_R32F- NPOT,Fail
+spec@arb_texture_rg@texwrap formats-float@GL_RG32F- NPOT,Fail
+spec@arb_texture_storage@texture-storage@cube array texture,Fail
+spec@egl 1.4@eglterminate then unbind context,Fail
+spec@egl_khr_gl_image@egl_khr_gl_renderbuffer_image-clear-shared-image gl_depth_component24,Fail
+spec@egl_khr_gl_image@egl_khr_gl_renderbuffer_image-clear-shared-image gl_rgba,Fail
+spec@egl_khr_surfaceless_context@viewport,Fail
+spec@egl_mesa_configless_context@basic,Fail
+spec@ext_framebuffer_multisample@blit-mismatched-formats,Fail
+spec@ext_framebuffer_multisample@interpolation 2 centroid-edges,Fail
+spec@ext_framebuffer_multisample@interpolation 4 centroid-edges,Fail
+spec@ext_framebuffer_object@getteximage-formats init-by-clear-and-render,Fail
+spec@ext_framebuffer_object@getteximage-formats init-by-rendering,Fail
+spec@ext_image_dma_buf_import@ext_image_dma_buf_import-export,Fail
+spec@ext_image_dma_buf_import@ext_image_dma_buf_import-modifiers,Fail
+spec@ext_image_dma_buf_import@ext_image_dma_buf_import-modifiers@autogen-R16-DRM_FORMAT_MOD_LINEAR-clear_reimport,Fail
+spec@ext_image_dma_buf_import@ext_image_dma_buf_import-refcount,Fail
+spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_argb8888,Fail
+spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_nv12,Fail
+spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_nv21,Fail
+spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_p010,Fail
+spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_p012,Fail
+spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_p016,Fail
+spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_uyvy,Fail
+spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_vyuy,Fail
+spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_xrgb8888,Fail
+spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y210,Fail
+spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y212,Fail
+spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y216,Fail
+spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y412,Fail
+spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y416,Fail
+spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_yuv420,Fail
+spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_yuyv,Fail
+spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_yvu420,Fail
+spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_yvyu,Fail
+spec@ext_image_dma_buf_import@ext_image_dma_buf_import-transcode-nv12-as-r8-gr88,Fail
+spec@ext_packed_depth_stencil@texwrap formats bordercolor,Fail
+spec@ext_packed_depth_stencil@texwrap formats bordercolor-swizzled,Fail
+spec@ext_packed_depth_stencil@texwrap formats bordercolor-swizzled@GL_DEPTH24_STENCIL8- swizzled- border color only,Fail
+spec@ext_packed_depth_stencil@texwrap formats bordercolor@GL_DEPTH24_STENCIL8- border color only,Fail
+spec@ext_packed_depth_stencil@texwrap formats offset,Fail
+spec@ext_packed_depth_stencil@texwrap formats offset@GL_DEPTH24_STENCIL8- NPOT,Fail
+spec@ext_packed_depth_stencil@texwrap formats,Fail
+spec@ext_packed_depth_stencil@texwrap formats@GL_DEPTH24_STENCIL8- NPOT,Fail
+spec@ext_packed_float@query-rgba-signed-components,Fail
+spec@ext_texture_integer@fbo-blending,Fail
+spec@ext_texture_integer@getteximage-clamping gl_arb_texture_rg,Fail
+spec@ext_texture_integer@getteximage-clamping,Fail
+spec@ext_texture_integer@multisample-formats 2 gl_ext_texture_integer,Fail
+spec@ext_texture_integer@multisample-formats 4 gl_ext_texture_integer,Fail
+spec@ext_texture_integer@texwrap formats bordercolor,Fail
+spec@ext_texture_integer@texwrap formats bordercolor-swizzled,Fail
+spec@ext_texture_integer@texwrap formats bordercolor-swizzled@GL_ALPHA16I_EXT- swizzled- border color only,Fail
+spec@ext_texture_integer@texwrap formats bordercolor-swizzled@GL_ALPHA16UI_EXT- swizzled- border color only,Fail
+spec@ext_texture_integer@texwrap formats bordercolor-swizzled@GL_ALPHA32I_EXT- swizzled- border color only,Fail
+spec@ext_texture_integer@texwrap formats bordercolor-swizzled@GL_ALPHA32UI_EXT- swizzled- border color only,Fail
+spec@ext_texture_integer@texwrap formats bordercolor-swizzled@GL_ALPHA8I_EXT- swizzled- border color only,Fail
+spec@ext_texture_integer@texwrap formats bordercolor-swizzled@GL_ALPHA8UI_EXT- swizzled- border color only,Fail
+spec@ext_texture_integer@texwrap formats bordercolor@GL_ALPHA16I_EXT- border color only,Fail
+spec@ext_texture_integer@texwrap formats bordercolor@GL_ALPHA16UI_EXT- border color only,Fail
+spec@ext_texture_integer@texwrap formats bordercolor@GL_ALPHA32I_EXT- border color only,Fail
+spec@ext_texture_integer@texwrap formats bordercolor@GL_ALPHA32UI_EXT- border color only,Fail
+spec@ext_texture_integer@texwrap formats bordercolor@GL_ALPHA8I_EXT- border color only,Fail
+spec@ext_texture_integer@texwrap formats bordercolor@GL_ALPHA8UI_EXT- border color only,Fail
+spec@ext_texture_integer@texwrap formats offset,Fail
+spec@ext_texture_integer@texwrap formats offset@GL_ALPHA8I_EXT,Fail
+spec@ext_texture_integer@texwrap formats offset@GL_ALPHA8I_EXT- NPOT,Fail
+spec@ext_texture_integer@texwrap formats offset@GL_ALPHA8I_EXT- swizzled,Fail
+spec@ext_texture_integer@texwrap formats,Fail
+spec@ext_texture_integer@texwrap formats@GL_ALPHA8I_EXT,Fail
+spec@ext_texture_integer@texwrap formats@GL_ALPHA8I_EXT- NPOT,Fail
+spec@ext_texture_integer@texwrap formats@GL_ALPHA8I_EXT- swizzled,Fail
+spec@ext_texture_lod_bias@lodbias,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor-swizzled,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_ALPHA16_SNORM- swizzled- border color only,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_INTENSITY16_SNORM- swizzled- border color only,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_LUMINANCE16_ALPHA16_SNORM- swizzled- border color only,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_LUMINANCE16_SNORM- swizzled- border color only,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_R16_SNORM- swizzled- border color only,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_RG16_SNORM- swizzled- border color only,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_RGB16_SNORM- swizzled- border color only,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_RGBA16_SNORM- swizzled- border color only,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor@GL_ALPHA16_SNORM- border color only,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor@GL_INTENSITY16_SNORM- border color only,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor@GL_LUMINANCE16_ALPHA16_SNORM- border color only,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor@GL_LUMINANCE16_SNORM- border color only,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor@GL_R16_SNORM- border color only,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor@GL_RG16_SNORM- border color only,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor@GL_RGB16_SNORM- border color only,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor@GL_RGBA16_SNORM- border color only,Fail
+spec@ext_transform_feedback@tessellation line_loop flat_first,Fail
+spec@ext_transform_feedback@tessellation line_loop flat_last,Fail
+spec@ext_transform_feedback@tessellation line_loop monochrome,Fail
+spec@ext_transform_feedback@tessellation line_loop smooth,Fail
+spec@ext_transform_feedback@tessellation triangle_fan flat_first,Fail
+spec@ext_transform_feedback@tessellation triangle_strip flat_first,Fail
+spec@glsl-1.10@execution@glsl-fs-inline-explosion,Crash
+spec@glsl-1.10@execution@glsl-vs-inline-explosion,Crash
+spec@glsl-1.20@compiler@invalid-vec4-array-to-vec3-array-conversion.vert,Fail
+spec@glsl-1.20@execution@clipping@vs-clip-vertex-primitives,Fail
+spec@glsl-1.20@execution@fs-underflow-mul-compare-zero,Fail
+spec@khr_texture_compression_astc@miptree-gles srgb-fp,Fail
+spec@khr_texture_compression_astc@miptree-gles srgb-fp@sRGB decode full precision,Fail
+spec@khr_texture_compression_astc@sliced-3d-miptree-gles srgb-fp,Fail
+spec@khr_texture_compression_astc@sliced-3d-miptree-gles srgb-fp@sRGB decode full precision,Fail
+spec@nv_copy_image@nv_copy_image-formats,Fail
+spec@nv_copy_image@nv_copy_image-formats@Source: GL_ALPHA16/Destination: GL_ALPHA16,Fail
+spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-2d-array.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-2d-array.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-2d.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-2d.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-3d.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-3d.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-cube.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-cube.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-2d-array.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-2d-array.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-2d.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-2d.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-3d.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-3d.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-cube.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-cube.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-2d-array.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-2d-array.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-2d.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-2d.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-3d.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-3d.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-cube.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-cube.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-2d-array.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-2d-array.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-2d.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-2d.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-3d.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-3d.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-cube.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-cube.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-2d-array.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-2d-array.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-2d.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-2d.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-3d.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-3d.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-cube.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-cube.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-2d-array.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-2d-array.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-2d.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-2d.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-3d.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-3d.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-cube.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-cube.vert,Fail
+spec@nv_read_depth@read_depth_gles3,Fail
+spec@oes_point_sprite@arb_point_sprite-checkerboard_gles1,Fail
+spec@oes_shader_io_blocks@compiler@layout-location-aliasing.vert,Fail
+
+# This crashes only when LLVM is not enabled. This is because the Gallium
+# backend uses TGSI to do some task that does not contain a sampler; when
+# LLVM is enabled, it uses LLVM instead, which is complete.
+spec@!opengl 1.0@rasterpos,Crash
+
+# https://gitlab.freedesktop.org/mesa/piglit/-/merge_requests/899
+spec@!opengl 1.0@depth-clear-precision-check,Fail
+
+# There are two problems here. First, the hardware does not support
+# different polygon modes for front and back faces. By default we
+# choose the mode set for the front face, unless it is being culled,
+# in which case we choose the mode set for the back face. Second, we
+# do not support rendering quads, so Gallium decomposes them into
+# triangles. The drawback is that when the polygon mode is set to
+# lines, we render an extra edge.
+spec@!opengl 1.1@polygon-mode,Fail
+spec@!opengl 1.1@polygon-mode-facing,Fail
+spec@!opengl 1.1@polygon-mode-offset,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on bottom edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on left edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on right edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on top edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 1: Expected blue pixel in center,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 1: Expected white pixel on right edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 1: Expected white pixel on top edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 2: Expected blue pixel in center,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 2: Expected white pixel on right edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 2: Expected white pixel on top edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on bottom edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on left edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on right edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on top edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on bottom edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on left edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on right edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on top edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 5: Expected blue pixel in center,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 5: Expected white pixel on right edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 5: Expected white pixel on top edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 6: Expected blue pixel in center,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 6: Expected white pixel on right edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 6: Expected white pixel on top edge,Fail
+
+# V3D does not support PIPE_FORMAT_{R16,R16G16,R16G16B16A16}_UNORM for
+# rendering
+spec@!opengl 3.0@required-texture-attachment-formats,Fail
+spec@!opengl 3.1@required-texture-attachment-formats,Fail
+spec@arb_texture_view@rendering-formats,Crash
+
+# V3D does not support blending for GL_R{GBA}32F
+spec@!opengl 1.1@getteximage-formats,Fail
+
+# OpenGL 3.x requires 8 render targets (MAX_DRAW_BUFFERS) / color attachments (MAX_COLOR_ATTACHMENTS)
+spec@!opengl 3.0@bindfragdata-link-error,Fail
+spec@!opengl 3.0@bindfragdata-nonexistent-variable,Fail
+spec@!opengl 3.0@clearbuffer-mixed-format,Fail
+spec@!opengl 3.0@getfragdatalocation,Fail
+spec@!opengl 3.0@minmax,Fail
+spec@!opengl 3.1@minmax,Fail
+spec@glsl-1.30@built-in constants,Fail
+spec@glsl-1.30@built-in constants@gl_MaxDrawBuffers,Fail
+spec@glsl-1.40@built-in constants,Fail
+spec@glsl-1.40@built-in constants@gl_MaxDrawBuffers,Fail
+
+# OpenGL 3.x applies non-seamless cubemap texturing, while our
+# driver/GLES uses seamless cubemap texturing.
+spec@!opengl 3.0@sampler-cube-shadow,Fail
+spec@arb_texture_cube_map_array@arb_texture_cube_map_array-sampler-cube-array-shadow,Fail
+
+# Precision differences between the expected and obtained results; these
+# work when exporting V3D_DEBUG=tmu32.
+spec@oes_texture_view@rendering-formats,Fail
+spec@oes_texture_view@rendering-formats@clear GL_R8 as GL_R8I,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RG8 as GL_R16F,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RG8 as GL_R16I,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RG8 as GL_RG8I,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RGBA8 as GL_R32F,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RGBA8 as GL_R32I,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RGBA8 as GL_RG16F,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RGBA8 as GL_RG16I,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RGBA8 as GL_RGBA8I,Fail
+
+# Also related to precision issues
+spec@oes_texture_view@rendering-formats@clear GL_RGB10_A2 as GL_R32F,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RGB10_A2 as GL_R32I,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RGB10_A2 as GL_RG16F,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RGB10_A2 as GL_RG16I,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RGB10_A2 as GL_RGBA8I,Fail
+
+spec@!opengl 1.0@depth-clear-precision-check@depth16,Fail
+spec@!opengl 1.0@depth-clear-precision-check@depth24,Fail
+
+# This fails the subtest for GL_ALPHA16 because we don't support a 16-bit unorm format for rendering,
+# so Gallium falls back to an 8-bit unorm format and we lose some precision in the result.
+spec@arb_clear_texture@arb_clear_texture-sized-formats,Fail
+
+# These fail because the shaders use indirect indexing on samplers, which we
+# don't support (the GLSL linker fails to link the shaders because of this).
+# If loop unrolling kicks in for these tests it removes the indirect indexing
+# and the tests pass, but that would just be working around an issue in the
+# tests.
+spec@!opengl 2.0@max-samplers,Fail
+spec@!opengl 2.0@max-samplers border,Fail
+
+# The hardware does not support line/polygon stipple. This feature was
+# deprecated/removed in newer OpenGL spec versions. It could be
+# emulated using shaders.
+spec@!opengl 1.1@line-smooth-stipple,Fail
+spec@!opengl 1.1@linestipple,Fail
+spec@!opengl 1.1@linestipple@Factor 2x,Fail
+spec@!opengl 1.1@linestipple@Factor 3x,Fail
+spec@!opengl 1.1@linestipple@Line loop,Fail
+spec@!opengl 1.1@linestipple@Line strip,Fail
+spec@!opengl 1.1@linestipple@Restarting lines within a single Begin-End block,Fail
+spec@!opengl 2.1@pbo,Fail
+spec@!opengl 2.1@pbo@test_polygon_stip,Fail
+spec@!opengl 2.1@polygon-stipple-fs,Fail
+
+# Works when run individually, but fails consistently on the CI
+dEQP-GLES3.functional.texture.specification.teximage2d_pbo.rgba32i_cube,Fail
+
+# https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/4422
+KHR-GL31.texture_size_promotion.functional,Fail
+
+# uprev Piglit in Mesa
+spec@glsl-1.40@uniform_buffer@two-stages,Fail
+
+# RPI4 only supports 4 RTs, so these tests with 8 RTs will fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 128 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 128 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 128 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 128 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 128 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 128 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 64 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 64 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 64 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 64 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 64 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 64 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 8 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 8 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 8 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 8 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 8 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 8 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 128 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 128 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 128 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 128 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 128 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 128 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 64 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 64 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 64 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 64 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 64 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 64 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 8 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 8 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 8 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 8 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 8 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 8 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 128 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 128 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 128 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 128 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 128 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 128 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 64 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 64 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 64 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 64 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 64 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 64 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 8 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 8 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 8 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 8 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 8 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 8 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 128 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 128 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 128 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 128 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 128 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 128 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 64 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 64 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 64 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 64 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 64 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 64 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 8 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 8 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 8 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 8 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 8 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 8 8,Fail
+
+# This seems to be a Vulkan Loader issue. It can be fixed by compiling the loader from the GitHub repo.
+dEQP-VK.api.get_device_proc_addr.non_enabled,Fail
+# This is a bug in CTS: https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/5096
+dEQP-VK.api.command_buffers.many_indirect_draws_on_secondary,Fail
+
+# New CTS failures in 1.3.8.2
+dEQP-VK.api.info.vulkan1p2_limits_validation.khr_vertex_attribute_divisor,Fail
diff --git a/src/broadcom/ci/broadcom-rpi4-flakes.txt b/src/broadcom/ci/broadcom-rpi4-flakes.txt
new file mode 100644
index 00000000000..c1a2cd94b04
--- /dev/null
+++ b/src/broadcom/ci/broadcom-rpi4-flakes.txt
@@ -0,0 +1,48 @@
+KHR-GLES31.core.shader_image_load_store.basic-glsl-earlyFragTests
+dEQP-GLES31.functional.ssbo.layout.instance_array_basic_type.std430.ivec4
+
+glx@glx_arb_sync_control@waitformsc
+spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=2
+spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=4
+spec@!opengl 1.1@depthstencil-default_fb-drawpixels-32f_24_8_rev samples=2
+spec@!opengl 1.1@depthstencil-default_fb-drawpixels-32f_24_8_rev samples=4
+spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=2
+spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=4
+spec@!opengl 1.1@masked-clear
+spec@arb_occlusion_query@occlusion_query_order
+spec@arb_texture_multisample@large-float-texture
+spec@egl_chromium_sync_control@conformance
+spec@ext_packed_depth_stencil@depthstencil-render-miplevels 585 ds=z24_s8
+
+# Seen this one flake a few times already
+spec@egl 1.4@largest possible eglcreatepbuffersurface and then glclear
+
+# These tests work when run alone, but fail when executing all the tests together
+# https://gitlab.freedesktop.org/mesa/mesa/-/issues/8684
+dEQP-GLES3.functional.texture.specification.teximage2d_pbo.rgba32f_cube
+dEQP-GLES3.functional.texture.specification.teximage2d_pbo.rgba32i_cube
+
+# These seem reliable on arm64, but they flake on armhf
+dEQP-VK.glsl.builtin.function.integer.findMSB.ivec2_mediump_geometry
+dEQP-VK.glsl.builtin.function.integer.findMSB.ivec2_highp_geometry
+
+# Failed twice one day with two different bad renders, and never since:
+# https://gitlab.freedesktop.org/eric/mesa/-/jobs/37556931
+# https://gitlab.freedesktop.org/eric/mesa/-/jobs/37596148
+dEQP-VK.renderpass2.suballocation.load_store_op_none.depthstencil_d24_unorm_s8_uint_load_op_depth_load_stencil_none_store_op_depth_store_stencil_none_stencil_write_off
+
+# First encountered on 01/04/2023
+spec@ext_framebuffer_blit@fbo-sys-blit
+spec@ext_framebuffer_blit@fbo-sys-sub-blit
+
+dEQP-VK.fragment_operations.occlusion_query.precise_test_scissors_depth_write_stencil_clear_stencil_write
+dEQP-VK.memory.pipeline_barrier.transfer_dst_uniform_texel_buffer.1024
+dEQP-VK.memory.pipeline_barrier.transfer_dst_uniform_texel_buffer.1048576
+dEQP-VK.memory_model.message_passing.ext.u32.noncoherent.atomic_atomic.atomicrmw.device.payload_local.image.guard_local.image.frag
+dEQP-VK.memory_model.message_passing.ext.u32.noncoherent.fence_fence.atomicwrite.device.payload_local.image.guard_local.buffer.frag
+dEQP-VK.pipeline.monolithic.image.suballocation.sampling_type.combined.view_type.1d_array.format.r8_unorm.count_1.size.443x1_array_of_6
+dEQP-VK.renderpass.suballocation.load_store_op_none.depthstencil_d24_unorm_s8_uint_load_op_depth_load_stencil_none_store_op_depth_store_stencil_none_stencil_write_off
+dEQP-VK.synchronization.basic.timeline_semaphore.one_queue
+dEQP-VK.synchronization2.basic.timeline_semaphore.one_queue
+dEQP-VK.synchronization2.signal_order.shared_binary_semaphore.write_ssbo_compute_indirect_read_ssbo_geometry.buffer_262144_opaque_fd
+dEQP-VK.texture.shadow.cube.linear.less_d24_unorm_s8_uint
diff --git a/src/broadcom/ci/broadcom-rpi4-skips.txt b/src/broadcom/ci/broadcom-rpi4-skips.txt
new file mode 100644
index 00000000000..66d371eaae2
--- /dev/null
+++ b/src/broadcom/ci/broadcom-rpi4-skips.txt
@@ -0,0 +1,293 @@
+# Slow tests (> 1 minute to run)
+spec@!opengl 1.1@streaming-texture-leak
+spec@!opengl 1.2@tex3d-maxsize
+spec@arb_texture_multisample@texelfetch fs sampler2dms 4 1x130-501x130
+spec@arb_texture_multisample@texelfetch fs sampler2dms 4 1x71-501x71
+spec@arb_texture_multisample@texelfetch fs sampler2dmsarray 4 98x1x9-98x129x9
+spec@glsl-1.30@execution@texelfetch fs sampler2d 1x281-501x281
+
+# Versions / Extensions not supported
+spec@!opengl 3.2@.*
+spec@!opengl 3.3@.*
+spec@!opengl 4.2@.*
+spec@!opengl 4.3@.*
+spec@!opengl 4.4@.*
+spec@!opengl 4.5@.*
+spec@arb_gpu_shader5.*
+spec@arb_gpu_shader_fp64.*
+spec@arb_gpu_shader_int64.*
+spec@arb_tessellation_shader.*
+spec@glsl-1.50.*
+spec@glsl-3.*
+spec@glsl-4.*
+spec@glsl-es-3.20.*
+
+# Broadcom waivers
+dEQP-VK.draw.renderpass.inverted_depth_ranges.nodepthclamp_deltazero
+dEQP-VK.rasterization.depth_bias.d32_sfloat
+
+# Kernel blocks (probably GMP violations)
+spec@arb_shading_language_420pack@active sampler conflict
+spec@arb_texture_buffer_object@render-no-bo
+
+# Slow tests (> 1 minute to run)
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.multi.std140.comp
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.multi.std140.frag
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.multi.std140.vert
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.single.std140.comp
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.single.std140.comp_offset_nonzero
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.single.std140.frag
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.single.std140.vert
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.single.std140.vert_offset_nonzero
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.multi.std140.comp
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.multi.std140.frag
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.multi.std140.vert
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.single.std140.comp
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.single.std140.comp_offset_nonzero
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.single.std140.frag
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.single.std140.frag_offset_nonzero
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.single.std140.vert
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.single.std140.vert_offset_nonzero
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertuvec2.nostore.multi.std140.vert
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertuvec2.nostore.single.std140.vert
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertuvec2.nostore.single.std140.vert_offset_nonzero
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertuvec2.store.multi.std140.vert
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertuvec2.store.single.std140.vert
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertuvec2.store.single.std140.vert_offset_nonzero
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.load.nostore.multi.std140.vert
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.load.nostore.single.std140.vert
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.load.nostore.single.std140.vert_offset_nonzero
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.load.store.multi.std140.vert
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.load.store.single.std140.vert
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.load.store.single.std140.vert_offset_nonzero
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.nostore.multi.std140.comp
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.nostore.multi.std140.frag
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.nostore.multi.std140.vert
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.nostore.single.std140.comp
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.nostore.single.std140.frag
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.nostore.single.std140.frag_offset_nonzero
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.nostore.single.std140.vert
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.nostore.single.std140.vert_offset_nonzero
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.store.multi.std140.comp
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.store.multi.std140.frag
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.store.multi.std140.vert
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.store.single.std140.comp
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.store.single.std140.frag
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.store.single.std140.frag_offset_nonzero
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.store.single.std140.vert
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.store.single.std140.vert_offset_nonzero
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertuvec2.nostore.multi.std140.vert
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertuvec2.nostore.single.std140.vert
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertuvec2.nostore.single.std140.vert_offset_nonzero
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertuvec2.store.multi.std140.vert
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertuvec2.store.single.std140.vert
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertuvec2.store.single.std140.vert_offset_nonzero
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.load.nostore.multi.std140.vert
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.load.nostore.single.std140.vert
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.load.nostore.single.std140.vert_offset_nonzero
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.load.store.multi.std140.vert
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.load.store.single.std140.vert
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.load.store.single.std140.vert_offset_nonzero
+dEQP-VK.graphicsfuzz.spv-stable-maze-flatten-copy-composite
+dEQP-VK.memory.mapping.dedicated_alloc.buffer.full.variable.implicit_unmap
+dEQP-VK.memory.mapping.dedicated_alloc.image.full.variable.implicit_unmap
+dEQP-VK.memory.mapping.suballocation.full.variable.implicit_unmap
+dEQP-VK.spirv_assembly.instruction.graphics.spirv_ids_abuse.lots_ids_geom
+dEQP-VK.spirv_assembly.instruction.graphics.spirv_ids_abuse.lots_ids_vert
+dEQP-VK.ssbo.layout.3_level_array.std140.column_major_mat4
+dEQP-VK.ssbo.layout.3_level_array.std140.column_major_mat4_comp_access
+dEQP-VK.ssbo.layout.3_level_array.std140.column_major_mat4_comp_access_store_cols
+dEQP-VK.ssbo.layout.3_level_array.std140.column_major_mat4_store_cols
+dEQP-VK.ssbo.layout.3_level_array.std140.mat4
+dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4
+dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4_comp_access
+dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4_comp_access_store_cols
+dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4_store_cols
+dEQP-VK.ssbo.layout.3_level_array.std430.column_major_mat4
+dEQP-VK.ssbo.layout.3_level_array.std430.column_major_mat4_comp_access
+dEQP-VK.ssbo.layout.3_level_array.std430.column_major_mat4_comp_access_store_cols
+dEQP-VK.ssbo.layout.3_level_array.std430.column_major_mat4_store_cols
+dEQP-VK.ssbo.layout.3_level_array.std430.mat4
+dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4
+dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4_comp_access
+dEQP-VK.ssbo.layout.3_level_unsized_array.std140.column_major_mat4
+dEQP-VK.ssbo.layout.3_level_unsized_array.std140.mat4
+dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4
+dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4_comp_access
+dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4_store_cols
+dEQP-VK.ssbo.layout.3_level_unsized_array.std430.column_major_mat4
+dEQP-VK.ssbo.layout.3_level_unsized_array.std430.column_major_mat4_comp_access
+dEQP-VK.ssbo.layout.3_level_unsized_array.std430.column_major_mat4_comp_access_store_cols
+dEQP-VK.ssbo.layout.3_level_unsized_array.std430.column_major_mat4_store_cols
+dEQP-VK.ssbo.layout.3_level_unsized_array.std430.mat4
+dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4
+dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4_comp_access
+dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4_store_cols
+dEQP-VK.ssbo.layout.random.16bit.all_per_block_buffers.47
+dEQP-VK.ssbo.layout.random.16bit.all_per_block_buffers.5
+dEQP-VK.ssbo.layout.random.8bit.all_per_block_buffers.5
+dEQP-VK.ssbo.layout.random.8bit.all_per_block_buffers.6
+dEQP-VK.ssbo.layout.random.8bit.nested_structs_arrays_instance_arrays.15
+dEQP-VK.ssbo.layout.random.8bit.nested_structs_arrays_instance_arrays.9
+dEQP-VK.ssbo.layout.random.all_shared_buffer.3
+dEQP-VK.ssbo.layout.random.arrays_of_arrays.13
+dEQP-VK.ssbo.layout.random.nested_structs_arrays.17
+dEQP-VK.ssbo.phys.layout.2_level_array.std140.row_major_mat4
+dEQP-VK.ssbo.phys.layout.2_level_array.std430.row_major_mat4
+dEQP-VK.ssbo.phys.layout.3_level_array.std140.column_major_mat4
+dEQP-VK.ssbo.phys.layout.3_level_array.std140.column_major_mat4_comp_access
+dEQP-VK.ssbo.phys.layout.3_level_array.std140.column_major_mat4_comp_access_store_cols
+dEQP-VK.ssbo.phys.layout.3_level_array.std140.column_major_mat4_store_cols
+dEQP-VK.ssbo.phys.layout.3_level_array.std140.column_major_mat4x3_comp_access
+dEQP-VK.ssbo.phys.layout.3_level_array.std140.mat4
+dEQP-VK.ssbo.phys.layout.3_level_array.std140.mat4x3
+dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat2x4
+dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat3
+dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat3x4
+dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat4
+dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat4x2
+dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat4x3
+dEQP-VK.ssbo.phys.layout.3_level_array.std430.column_major_mat4
+dEQP-VK.ssbo.phys.layout.3_level_array.std430.column_major_mat4_comp_access
+dEQP-VK.ssbo.phys.layout.3_level_array.std430.column_major_mat4_comp_access_store_cols
+dEQP-VK.ssbo.phys.layout.3_level_array.std430.column_major_mat4_store_cols
+dEQP-VK.ssbo.phys.layout.3_level_array.std430.column_major_mat4x3
+dEQP-VK.ssbo.phys.layout.3_level_array.std430.column_major_mat4x3_comp_access
+dEQP-VK.ssbo.phys.layout.3_level_array.std430.column_major_mat4x3_comp_access_store_cols
+dEQP-VK.ssbo.phys.layout.3_level_array.std430.mat4
+dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat2x4
+dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat3
+dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat3x4
+dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat4
+dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat4x2
+dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat4x3
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.column_major_mat4
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.column_major_mat4_comp_access
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.column_major_mat4_comp_access_store_cols
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.column_major_mat4_store_cols
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.column_major_mat4x3
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.mat4
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat2x4
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat3
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat3x4
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat4
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat4x2
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat4x3
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.column_major_mat4
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.column_major_mat4_comp_access
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.column_major_mat4_comp_access_store_cols
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.column_major_mat4_store_cols
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.column_major_mat4x3_comp_access
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.column_major_mat4x3_comp_access_store_cols
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.column_major_mat4x3_store_cols
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.mat4
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.mat4x3
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat2x4
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat2x4_comp_access
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat2x4_comp_access_store_cols
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat2x4_store_cols
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat3
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat3x4
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat4
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat4x2
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat4x3
+dEQP-VK.ssbo.phys.layout.basic_unsized_array.std140.column_major_mat4
+dEQP-VK.ssbo.phys.layout.basic_unsized_array.std140.column_major_mat4_comp_access_store_cols
+dEQP-VK.ssbo.phys.layout.basic_unsized_array.std140.mat4
+dEQP-VK.ssbo.phys.layout.basic_unsized_array.std140.row_major_mat3
+dEQP-VK.ssbo.phys.layout.basic_unsized_array.std140.row_major_mat3_store_cols
+dEQP-VK.ssbo.phys.layout.basic_unsized_array.std140.row_major_mat3x4
+dEQP-VK.ssbo.phys.layout.basic_unsized_array.std140.row_major_mat4
+dEQP-VK.ssbo.phys.layout.basic_unsized_array.std140.row_major_mat4x3
+dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.column_major_mat4
+dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.column_major_mat4_comp_access
+dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.column_major_mat4_comp_access_store_cols
+dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.column_major_mat4_store_cols
+dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.mat4
+dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat3
+dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat3_comp_access
+dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat3_comp_access_store_cols
+dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat3_store_cols
+dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat3x4
+dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat4
+dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat4x3
+dEQP-VK.ssbo.phys.layout.random.16bit.all_per_block_buffers.45
+dEQP-VK.ssbo.phys.layout.random.16bit.all_shared_buffer.23
+dEQP-VK.ssbo.phys.layout.random.16bit.all_shared_buffer.36
+dEQP-VK.ssbo.phys.layout.random.16bit.all_shared_buffer.40
+dEQP-VK.ssbo.phys.layout.random.16bit.nested_structs_arrays.23
+dEQP-VK.ssbo.phys.layout.random.8bit.all_per_block_buffers.17
+dEQP-VK.ssbo.phys.layout.random.8bit.all_per_block_buffers.38
+dEQP-VK.ssbo.phys.layout.random.8bit.all_per_block_buffers.46
+dEQP-VK.ssbo.phys.layout.random.8bit.all_per_block_buffers.49
+dEQP-VK.ssbo.phys.layout.random.8bit.all_shared_buffer.19
+dEQP-VK.ssbo.phys.layout.random.8bit.nested_structs_arrays.17
+dEQP-VK.ssbo.phys.layout.random.8bit.nested_structs_arrays.20
+dEQP-VK.ssbo.phys.layout.random.8bit.nested_structs_arrays_instance_arrays.12
+dEQP-VK.ssbo.phys.layout.random.8bit.unsized_arrays.0
+dEQP-VK.ssbo.phys.layout.random.all_per_block_buffers.14
+dEQP-VK.ssbo.phys.layout.random.all_per_block_buffers.18
+dEQP-VK.ssbo.phys.layout.random.all_per_block_buffers.22
+dEQP-VK.ssbo.phys.layout.random.all_per_block_buffers.46
+dEQP-VK.ssbo.phys.layout.random.all_shared_buffer.20
+dEQP-VK.ssbo.phys.layout.random.all_shared_buffer.3
+dEQP-VK.ssbo.phys.layout.random.all_shared_buffer.8
+dEQP-VK.ssbo.phys.layout.random.nested_structs_arrays.13
+dEQP-VK.ssbo.phys.layout.random.nested_structs_arrays_instance_arrays.23
+dEQP-VK.ssbo.phys.layout.random.nested_structs_arrays_instance_arrays.3
+dEQP-VK.ssbo.phys.layout.single_struct_array.per_block_buffer.std140_instance_array
+dEQP-VK.ssbo.phys.layout.single_struct_array.per_block_buffer.std430_instance_array
+dEQP-VK.ssbo.phys.layout.single_struct_array.per_block_buffer.std430_instance_array_comp_access
+dEQP-VK.ssbo.phys.layout.single_struct_array.per_block_buffer.std430_instance_array_store_cols
+dEQP-VK.ssbo.phys.layout.single_struct_array.single_buffer.std140_instance_array
+dEQP-VK.ssbo.phys.layout.single_struct_array.single_buffer.std140_instance_array_comp_access
+dEQP-VK.ssbo.phys.layout.single_struct_array.single_buffer.std140_instance_array_store_cols
+dEQP-VK.ssbo.phys.layout.single_struct_array.single_buffer.std430_instance_array
+dEQP-VK.ssbo.phys.layout.single_struct_array.single_buffer.std430_instance_array_comp_access
+dEQP-VK.ssbo.phys.layout.single_struct_array.single_buffer.std430_instance_array_comp_access_store_cols
+dEQP-VK.ssbo.phys.layout.single_struct_array.single_buffer.std430_instance_array_store_cols
+dEQP-VK.ssbo.phys.layout.unsized_struct_array.per_block_buffer.std140_instance_array
+dEQP-VK.ssbo.phys.layout.unsized_struct_array.per_block_buffer.std140_instance_array_comp_access
+dEQP-VK.ssbo.phys.layout.unsized_struct_array.per_block_buffer.std140_instance_array_comp_access_store_cols
+dEQP-VK.ssbo.phys.layout.unsized_struct_array.per_block_buffer.std140_instance_array_store_cols
+dEQP-VK.ssbo.phys.layout.unsized_struct_array.per_block_buffer.std430_instance_array
+dEQP-VK.ssbo.phys.layout.unsized_struct_array.per_block_buffer.std430_instance_array_comp_access
+dEQP-VK.ssbo.phys.layout.unsized_struct_array.per_block_buffer.std430_instance_array_comp_access_store_cols
+dEQP-VK.ssbo.phys.layout.unsized_struct_array.per_block_buffer.std430_instance_array_store_cols
+dEQP-VK.ssbo.phys.layout.unsized_struct_array.single_buffer.std140_instance_array
+dEQP-VK.ssbo.phys.layout.unsized_struct_array.single_buffer.std140_instance_array_comp_access
+dEQP-VK.ssbo.phys.layout.unsized_struct_array.single_buffer.std140_instance_array_store_cols
+dEQP-VK.ssbo.phys.layout.unsized_struct_array.single_buffer.std430_instance_array
+dEQP-VK.ssbo.phys.layout.unsized_struct_array.single_buffer.std430_instance_array_comp_access
+dEQP-VK.ssbo.phys.layout.unsized_struct_array.single_buffer.std430_instance_array_comp_access_store_cols
+dEQP-VK.ssbo.phys.layout.unsized_struct_array.single_buffer.std430_instance_array_store_cols
+dEQP-VK.synchronization.basic.timeline_semaphore.chain
+dEQP-VK.synchronization2.basic.timeline_semaphore.chain
+dEQP-VK.texture.explicit_lod.2d.sizes.128x128_linear_linear_mipmap_linear_clamp
+dEQP-VK.texture.explicit_lod.2d.sizes.128x128_linear_linear_mipmap_linear_repeat
+dEQP-VK.texture.explicit_lod.2d.sizes.128x128_nearest_linear_mipmap_linear_clamp
+dEQP-VK.texture.explicit_lod.2d.sizes.128x128_nearest_linear_mipmap_linear_repeat
+
+# WSI tests are too flaky to be useful
+dEQP-VK.image.swapchain_mutable.*
+dEQP-VK.wsi.*
+
+# These require VK_KHR_shader_draw_parameters but they don't check for it
+# (Seems to be fixed in some later release of CTS 1.3.7).
+dEQP-VK.draw.dynamic_rendering.primary_cmd_buff.multi_draw.*
+
+# Skip tests for unsupported features so we can increase the number of tests
+# that are actually useful in the limited CI time we have per job.
+dEQP-VK.pipeline.monolithic.multisample_with_fragment_shading_rate.*
+dEQP-VK.pipeline.monolithic.bind_point.graphics_raytracing.*
+dEQP-VK.pipeline.monolithic.bind_point.compute_raytracing.*
+dEQP-VK.pipeline.pipeline_library.*
+dEQP-VK.pipeline.fast_linked_library.*
+dEQP-VK.pipeline.shader_object*
+dEQP-VK.protected_memory.*
+dEQP-VK.transform_feedback.*
+dEQP-VK.ray_tracing_pipeline.*
+dEQP-VK.ray_query.*
+dEQP-VK.fragment_shading_rate.*
+dEQP-VK.mesh_shader.*
+dEQP-VK.shader_object.rendering.*
diff --git a/src/broadcom/ci/broadcom-rpi5-fails.txt b/src/broadcom/ci/broadcom-rpi5-fails.txt
new file mode 100644
index 00000000000..3241bf827dc
--- /dev/null
+++ b/src/broadcom/ci/broadcom-rpi5-fails.txt
@@ -0,0 +1,11 @@
+# New CTS failures in 1.3.8.0
+dEQP-VK.query_pool.performance_query.query_compute,Fail
+dEQP-VK.query_pool.performance_query.query_compute_copy,Fail
+dEQP-VK.query_pool.performance_query.query_graphic,Fail
+dEQP-VK.query_pool.performance_query.query_graphic_copy,Fail
+# This seems to be a Vulkan Loader issue. It can be fixed by compiling the loader from the GitHub repo.
+dEQP-VK.api.get_device_proc_addr.non_enabled,Fail
+# This is a bug in CTS: https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/5096
+dEQP-VK.api.command_buffers.many_indirect_draws_on_secondary,Fail
+
+dEQP-VK.api.info.vulkan1p2_limits_validation.khr_vertex_attribute_divisor,Fail
diff --git a/src/broadcom/ci/broadcom-rpi5-flakes.txt b/src/broadcom/ci/broadcom-rpi5-flakes.txt
new file mode 100644
index 00000000000..35a53c59666
--- /dev/null
+++ b/src/broadcom/ci/broadcom-rpi5-flakes.txt
@@ -0,0 +1,15 @@
+dEQP-VK.draw.dynamic_rendering.primary_cmd_buff.multiple_interpolation.structured.no_sample_decoration.4_samples
+dEQP-VK.draw.dynamic_rendering.primary_cmd_buff.multiple_interpolation.structured.no_sample_decoration.4_samples
+dEQP-VK.memory.pipeline_barrier.transfer_dst_uniform_texel_buffer.1024
+dEQP-VK.memory.pipeline_barrier.transfer_dst_uniform_texel_buffer.1024
+dEQP-VK.memory.pipeline_barrier.transfer_dst_uniform_texel_buffer.1048576
+dEQP-VK.memory.pipeline_barrier.transfer_dst_uniform_texel_buffer.8192
+dEQP-VK.memory_model.message_passing.ext.u32.coherent.atomic_fence.atomicwrite.queuefamily.payload_local.image.guard_local.image.frag
+dEQP-VK.memory_model.message_passing.ext.u32.coherent.atomic_fence.atomicwrite.subgroup.payload_local.image.guard_local.buffer.frag
+dEQP-VK.memory_model.message_passing.ext.u32.coherent.atomic_fence.atomicwrite.subgroup.payload_local.image.guard_local.buffer.frag
+dEQP-VK.memory_model.message_passing.ext.u32.noncoherent.fence_fence.atomicwrite.device.payload_local.image.guard_local.buffer.frag
+dEQP-VK.memory_model.message_passing.ext.u32.noncoherent.fence_fence.atomicwrite.queuefamily.payload_local.image.guard_local.image.frag
+dEQP-VK.pipeline.monolithic.image.suballocation.sampling_type.combined.view_type.1d_array.format.r8_unorm.count_1.size.443x1_array_of_6
+dEQP-VK.spirv_assembly.type.scalar.i8.shift_left_logical_shift16_tesse
+dEQP-VK.synchronization2.cross_instance.suballocated.write_blit_image_read_image_tess_eval.image_128x128_r32g32b32a32_sfloat_binary_semaphore_fence_fd
+dEQP-VK.texture.shadow.cube.linear.less_d24_unorm_s8_uint
diff --git a/src/broadcom/ci/broadcom-rpi5-skips.txt b/src/broadcom/ci/broadcom-rpi5-skips.txt
new file mode 100644
index 00000000000..17110a448da
--- /dev/null
+++ b/src/broadcom/ci/broadcom-rpi5-skips.txt
@@ -0,0 +1,96 @@
+# Slow tests (> 1 minute to run)
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.multi.std140.vert
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.single.std140.vert
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.single.std140.vert_offset_nonzero
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.multi.std140.vert
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.single.std140.vert
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.nostore.multi.std140.vert
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.nostore.single.std140.vert
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.store.multi.std140.vert
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.store.single.std140.vert
+dEQP-VK.spirv_assembly.instruction.graphics.spirv_ids_abuse.lots_ids_geom
+dEQP-VK.ssbo.layout.random.8bit.all_per_block_buffers.5
+dEQP-VK.ssbo.layout.random.8bit.all_per_block_buffers.6
+dEQP-VK.ssbo.layout.random.8bit.scalar.78
+dEQP-VK.ssbo.layout.random.nested_structs_arrays.17
+dEQP-VK.ssbo.layout.random.scalar.75
+dEQP-VK.ssbo.phys.layout.3_level_array.scalar.row_major_mat3x4
+dEQP-VK.ssbo.phys.layout.3_level_array.scalar.row_major_mat3x4_comp_access_store_cols
+dEQP-VK.ssbo.phys.layout.3_level_array.scalar.row_major_mat4
+dEQP-VK.ssbo.phys.layout.3_level_array.scalar.row_major_mat4_comp_access
+dEQP-VK.ssbo.phys.layout.3_level_array.scalar.row_major_mat4x3
+dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat3x4
+dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat3x4_comp_access
+dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat4
+dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat4_comp_access
+dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat4_comp_access_store_cols
+dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat4x3_comp_access
+dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat3x4
+dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat4
+dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat4_comp_access
+dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat4_comp_access_store_cols
+dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat4_store_cols
+dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat4x3_comp_access
+dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat4x3_store_cols
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.scalar.row_major_mat3x4
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.scalar.row_major_mat3x4_comp_access
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.scalar.row_major_mat3x4_comp_access_store_cols
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.scalar.row_major_mat3x4_store_cols
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.scalar.row_major_mat4
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.scalar.row_major_mat4_comp_access
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.scalar.row_major_mat4_comp_access_store_cols
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.scalar.row_major_mat4_store_cols
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.scalar.row_major_mat4x3
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.scalar.row_major_mat4x3_store_cols
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat3x4
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat3x4_comp_access_store_cols
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat3x4_store_cols
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat4
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat4_comp_access
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat4_store_cols
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat4x3_comp_access_store_cols
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat3x4
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat3x4_comp_access
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat4
+dEQP-VK.ssbo.phys.layout.basic_unsized_array.scalar.row_major_mat4
+dEQP-VK.ssbo.phys.layout.basic_unsized_array.scalar.row_major_mat4_comp_access
+dEQP-VK.ssbo.phys.layout.basic_unsized_array.scalar.row_major_mat4_store_cols
+dEQP-VK.ssbo.phys.layout.basic_unsized_array.std140.row_major_mat4
+dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat4
+dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat4_comp_access
+dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat4_comp_access_store_cols
+dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat4_store_cols
+dEQP-VK.ssbo.phys.layout.random.16bit.scalar.78
+dEQP-VK.ssbo.phys.layout.random.8bit.all_per_block_buffers.46
+dEQP-VK.ssbo.phys.layout.random.8bit.nested_structs_arrays.17
+dEQP-VK.ssbo.phys.layout.random.8bit.nested_structs_arrays_instance_arrays.12
+dEQP-VK.ssbo.phys.layout.random.8bit.scalar.78
+dEQP-VK.ssbo.phys.layout.random.8bit.scalar.96
+dEQP-VK.ssbo.phys.layout.random.all_per_block_buffers.22
+dEQP-VK.ssbo.phys.layout.random.all_shared_buffer.3
+dEQP-VK.ssbo.phys.layout.random.scalar.3
+dEQP-VK.ssbo.phys.layout.random.scalar.93
+
+# WSI tests are too flaky to be useful
+dEQP-VK.image.swapchain_mutable.*
+dEQP-VK.wsi.*
+
+# These require VK_KHR_shader_draw_parameters but they don't check for it
+# (Seems to be fixed in some later release of CTS 1.3.7).
+dEQP-VK.draw.dynamic_rendering.primary_cmd_buff.multi_draw.*
+
+# Skip tests for unsupported features so we can increase the number of tests
+# that are actually useful in the limited CI time we have per job.
+dEQP-VK.pipeline.monolithic.multisample_with_fragment_shading_rate.*
+dEQP-VK.pipeline.monolithic.bind_point.graphics_raytracing.*
+dEQP-VK.pipeline.monolithic.bind_point.compute_raytracing.*
+dEQP-VK.pipeline.pipeline_library.*
+dEQP-VK.pipeline.fast_linked_library.*
+dEQP-VK.pipeline.shader_object*
+dEQP-VK.protected_memory.*
+dEQP-VK.transform_feedback.*
+dEQP-VK.ray_tracing_pipeline.*
+dEQP-VK.ray_query.*
+dEQP-VK.fragment_shading_rate.*
+dEQP-VK.mesh_shader.*
+dEQP-VK.shader_object.rendering.*
diff --git a/src/broadcom/ci/deqp-broadcom-rpi3-piglit-full.toml b/src/broadcom/ci/deqp-broadcom-rpi3-piglit-full.toml
new file mode 100644
index 00000000000..a9649cbe516
--- /dev/null
+++ b/src/broadcom/ci/deqp-broadcom-rpi3-piglit-full.toml
@@ -0,0 +1,6 @@
+[[piglit]]
+piglit_folder = "/piglit"
+profile = "gpu"
+process_isolation = true
+ [piglit.env]
+ PIGLIT_PLATFORM = "mixed_glx_egl"
diff --git a/src/broadcom/ci/deqp-broadcom-rpi3.toml b/src/broadcom/ci/deqp-broadcom-rpi3.toml
new file mode 100644
index 00000000000..1b7293b7c5c
--- /dev/null
+++ b/src/broadcom/ci/deqp-broadcom-rpi3.toml
@@ -0,0 +1,61 @@
+[[deqp]]
+deqp = "/deqp/modules/gles2/deqp-gles2"
+caselists = ["/deqp/mustpass/gles2-main.txt"]
+tests_per_group = 250
+deqp_args = [
+ "--deqp-gl-config-name=rgba8888d24s8ms0",
+ "--deqp-surface-height=256",
+ "--deqp-surface-type=pbuffer",
+ "--deqp-surface-width=256",
+ "--deqp-visibility=hidden",
+]
+version_check = "GL ES 2.0.*git"
+renderer_check = "VC4"
+
+[[deqp]]
+deqp = "/deqp/external/openglcts/modules/glcts"
+caselists = ["/deqp/mustpass/gles2-khr-main.txt"]
+tests_per_group = 250
+deqp_args = [
+ "--deqp-gl-config-name=rgba8888d24s8ms0",
+ "--deqp-surface-height=256",
+ "--deqp-surface-type=pbuffer",
+ "--deqp-surface-width=256",
+ "--deqp-visibility=hidden",
+]
+
+# We are getting frequent GPU hangs with piglit, but still haven't identified
+# the cause. So let's disable it for now.
+# [[piglit]]
+# piglit_folder = "/piglit"
+# profile = "quick_gl"
+# process_isolation = true
+# [piglit.env]
+# PIGLIT_PLATFORM = "mixed_glx_egl"
+
+[[piglit]]
+piglit_folder = "/piglit"
+profile = "quick_shader"
+process_isolation = true
+
+# wayland
+[[deqp]]
+deqp = "/deqp/modules/egl/deqp-egl-wayland"
+caselists = ["/deqp/mustpass/egl-main.txt"]
+deqp_args = [
+ "--deqp-surface-width=256", "--deqp-surface-height=256",
+ "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden",
+ "--deqp-gl-config-name=rgba8888d24s8ms0",
+]
+prefix = "wayland-"
+
+# x11
+[[deqp]]
+deqp = "/deqp/modules/egl/deqp-egl-x11"
+caselists = ["/deqp/mustpass/egl-main.txt"]
+deqp_args = [
+ "--deqp-surface-width=256", "--deqp-surface-height=256",
+ "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden",
+ "--deqp-gl-config-name=rgba8888d24s8ms0",
+]
+prefix = "x11-"
diff --git a/src/broadcom/ci/deqp-broadcom-rpi4.toml b/src/broadcom/ci/deqp-broadcom-rpi4.toml
new file mode 100644
index 00000000000..930077f31f2
--- /dev/null
+++ b/src/broadcom/ci/deqp-broadcom-rpi4.toml
@@ -0,0 +1,89 @@
+[[deqp]]
+deqp = "/deqp/modules/gles31/deqp-gles31"
+caselists = ["/deqp/mustpass/gles31-main.txt"]
+deqp_args = [
+ "--deqp-gl-config-name=rgba8888d24s8ms0",
+ "--deqp-surface-height=256",
+ "--deqp-surface-type=pbuffer",
+ "--deqp-surface-width=256",
+ "--deqp-visibility=hidden",
+]
+version_check = "GL ES 3.1.*git"
+renderer_check = "V3D"
+
+[[deqp]]
+deqp = "/deqp/modules/gles3/deqp-gles3"
+caselists = ["/deqp/mustpass/gles3-main.txt"]
+deqp_args = [
+ "--deqp-gl-config-name=rgba8888d24s8ms0",
+ "--deqp-surface-height=256",
+ "--deqp-surface-type=pbuffer",
+ "--deqp-surface-width=256",
+ "--deqp-visibility=hidden",
+]
+
+[[deqp]]
+deqp = "/deqp/modules/gles2/deqp-gles2"
+caselists = ["/deqp/mustpass/gles2-main.txt"]
+deqp_args = [
+ "--deqp-gl-config-name=rgba8888d24s8ms0",
+ "--deqp-surface-height=256",
+ "--deqp-surface-type=pbuffer",
+ "--deqp-surface-width=256",
+ "--deqp-visibility=hidden",
+]
+
+[[deqp]]
+deqp = "/deqp/external/openglcts/modules/glcts"
+caselists = [
+ "/deqp/mustpass/gles31-khr-main.txt",
+ "/deqp/mustpass/gles3-khr-main.txt",
+ "/deqp/mustpass/gles2-khr-main.txt",
+]
+deqp_args = [
+ "--deqp-gl-config-name=rgba8888d24s8ms0",
+ "--deqp-surface-height=256",
+ "--deqp-surface-type=pbuffer",
+ "--deqp-surface-width=256",
+ "--deqp-visibility=hidden",
+]
+
+[[deqp]]
+deqp = "/deqp/external/openglcts/modules/glcts"
+caselists = ["/deqp/mustpass/gl31-main.txt"]
+deqp_args = [
+ "--deqp-gl-config-name=rgba8888d24s8ms0",
+ "--deqp-surface-height=256",
+ "--deqp-surface-type=pbuffer",
+ "--deqp-surface-width=256",
+ "--deqp-visibility=hidden",
+]
+
+[[piglit]]
+piglit_folder = "/piglit"
+profile = "gpu"
+process_isolation = true
+ [piglit.env]
+ PIGLIT_PLATFORM = "mixed_glx_egl"
+
+# wayland
+[[deqp]]
+deqp = "/deqp/modules/egl/deqp-egl-wayland"
+caselists = ["/deqp/mustpass/egl-main.txt"]
+deqp_args = [
+ "--deqp-surface-width=256", "--deqp-surface-height=256",
+ "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden",
+ "--deqp-gl-config-name=rgba8888d24s8ms0",
+]
+prefix = "wayland-"
+
+# x11
+[[deqp]]
+deqp = "/deqp/modules/egl/deqp-egl-x11"
+caselists = ["/deqp/mustpass/egl-main.txt"]
+deqp_args = [
+ "--deqp-surface-width=256", "--deqp-surface-height=256",
+ "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden",
+ "--deqp-gl-config-name=rgba8888d24s8ms0",
+]
+prefix = "x11-"
diff --git a/src/broadcom/ci/deqp-v3d-rpi4-fails.txt b/src/broadcom/ci/deqp-v3d-rpi4-fails.txt
deleted file mode 100644
index 10ab688613d..00000000000
--- a/src/broadcom/ci/deqp-v3d-rpi4-fails.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-dEQP-GLES31.functional.geometry_shading.query.primitives_generated_amplification,Fail
-dEQP-GLES31.functional.geometry_shading.query.primitives_generated_instanced,Fail
-dEQP-GLES31.functional.geometry_shading.query.primitives_generated_no_amplification,Fail
-dEQP-GLES31.functional.geometry_shading.query.primitives_generated_partial_primitives,Fail
diff --git a/src/broadcom/ci/deqp-v3d-rpi4-flakes.txt b/src/broadcom/ci/deqp-v3d-rpi4-flakes.txt
deleted file mode 100644
index 673cc5b0941..00000000000
--- a/src/broadcom/ci/deqp-v3d-rpi4-flakes.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-dEQP-GLES31.functional.compute.shared_var.basic_type.ivec3_highp
-dEQP-GLES31.functional.ssbo.layout.single_basic_type.packed.highp_mat2
-KHR-GLES31.core.shader_image_load_store.basic-glsl-earlyFragTests
diff --git a/src/broadcom/ci/deqp-v3d-rpi4-gles.toml b/src/broadcom/ci/deqp-v3d-rpi4-gles.toml
deleted file mode 100644
index 32a569344d2..00000000000
--- a/src/broadcom/ci/deqp-v3d-rpi4-gles.toml
+++ /dev/null
@@ -1,47 +0,0 @@
-[[deqp]]
-deqp = "/deqp/modules/gles31/deqp-gles31"
-caselists = [ "/deqp/mustpass/gles31-master.txt" ]
-deqp_args = [
- "--deqp-gl-config-name=rgba8888d24s8ms0",
- "--deqp-surface-height=256",
- "--deqp-surface-type=pbuffer",
- "--deqp-surface-width=256",
- "--deqp-visibility=hidden",
-]
-
-[[deqp]]
-deqp = "/deqp/modules/gles3/deqp-gles3"
-caselists = [ "/deqp/mustpass/gles3-master.txt" ]
-deqp_args = [
- "--deqp-gl-config-name=rgba8888d24s8ms0",
- "--deqp-surface-height=256",
- "--deqp-surface-type=pbuffer",
- "--deqp-surface-width=256",
- "--deqp-visibility=hidden",
-]
-
-[[deqp]]
-deqp = "/deqp/modules/gles2/deqp-gles2"
-caselists = [ "/deqp/mustpass/gles2-master.txt" ]
-deqp_args = [
- "--deqp-gl-config-name=rgba8888d24s8ms0",
- "--deqp-surface-height=256",
- "--deqp-surface-type=pbuffer",
- "--deqp-surface-width=256",
- "--deqp-visibility=hidden",
-]
-
-[[deqp]]
-deqp = "/deqp/external/openglcts/modules/glcts"
-caselists = [
- "/deqp/mustpass/gles31-khr-master.txt",
- "/deqp/mustpass/gles3-khr-master.txt",
- "/deqp/mustpass/gles2-khr-master.txt",
-]
-deqp_args = [
- "--deqp-gl-config-name=rgba8888d24s8ms0",
- "--deqp-surface-height=256",
- "--deqp-surface-type=pbuffer",
- "--deqp-surface-width=256",
- "--deqp-visibility=hidden",
-]
diff --git a/src/broadcom/ci/deqp-v3dv-rpi4-fails.txt b/src/broadcom/ci/deqp-v3dv-rpi4-fails.txt
deleted file mode 100644
index 7898bc2a2d1..00000000000
--- a/src/broadcom/ci/deqp-v3dv-rpi4-fails.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-# This seems to fail due to the test error threshold being insufficient
-dEQP-VK.geometry.input.basic_primitive.line_strip_adjacency,Fail
-
-# CTS bug; fix submitted
-dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_single_buffer_geom,Fail
-
-# Multiview doesn't work with points
-dEQP-VK.multiview.point_size.15,Fail
-dEQP-VK.multiview.point_size.8,Fail
-dEQP-VK.multiview.point_size.1_2_4_8,Fail
-dEQP-VK.multiview.point_size.15_15_15_15,Fail
-dEQP-VK.multiview.point_size.8_1_1_8,Fail
-dEQP-VK.multiview.point_size.5_10_5_10,Fail
-dEQP-VK.multiview.point_size.1_2_4_8_16_32,Fail
-dEQP-VK.multiview.point_size.max_multi_view_view_count,Fail
-
-dEQP-VK.draw.instanced.draw_vk_primitive_topology_point_list_attrib_divisor_1_multiview,Fail
-dEQP-VK.draw.instanced.draw_vk_primitive_topology_point_list_attrib_divisor_2_multiview,Fail
-dEQP-VK.draw.instanced.draw_vk_primitive_topology_point_list_attrib_divisor_4_multiview,Fail
-dEQP-VK.draw.instanced.draw_vk_primitive_topology_point_list_attrib_divisor_20_multiview,Fail
-dEQP-VK.draw.instanced.draw_indexed_vk_primitive_topology_point_list_attrib_divisor_1_multiview,Fail
-dEQP-VK.draw.instanced.draw_indexed_vk_primitive_topology_point_list_attrib_divisor_2_multiview,Fail
-dEQP-VK.draw.instanced.draw_indexed_vk_primitive_topology_point_list_attrib_divisor_4_multiview,Fail
-dEQP-VK.draw.instanced.draw_indexed_vk_primitive_topology_point_list_attrib_divisor_20_multiview,Fail
-dEQP-VK.draw.instanced.draw_indirect_vk_primitive_topology_point_list_attrib_divisor_1_multiview,Fail
-dEQP-VK.draw.instanced.draw_indirect_vk_primitive_topology_point_list_attrib_divisor_2_multiview,Fail
-dEQP-VK.draw.instanced.draw_indirect_vk_primitive_topology_point_list_attrib_divisor_4_multiview,Fail
-dEQP-VK.draw.instanced.draw_indirect_vk_primitive_topology_point_list_attrib_divisor_20_multiview,Fail
-dEQP-VK.draw.instanced.draw_indexed_indirect_vk_primitive_topology_point_list_attrib_divisor_1_multiview,Fail
-dEQP-VK.draw.instanced.draw_indexed_indirect_vk_primitive_topology_point_list_attrib_divisor_2_multiview,Fail
-dEQP-VK.draw.instanced.draw_indexed_indirect_vk_primitive_topology_point_list_attrib_divisor_4_multiview,Fail
-dEQP-VK.draw.instanced.draw_indexed_indirect_vk_primitive_topology_point_list_attrib_divisor_20_multiview,Fail
diff --git a/src/broadcom/ci/deqp-v3dv-rpi4-flakes.txt b/src/broadcom/ci/deqp-v3dv-rpi4-flakes.txt
deleted file mode 100644
index 0d22f002dbd..00000000000
--- a/src/broadcom/ci/deqp-v3dv-rpi4-flakes.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-dEQP-VK.api.external.fence.opaque_fd.reset_permanent
-dEQP-VK.api.external.fence.opaque_fd.reset_temporary
-dEQP-VK.api.external.fence.opaque_fd.signal_export_import_wait_permanent
-dEQP-VK.ssbo.layout.instance_array_basic_type.std430.uvec4
-dEQP-VK.wsi.display.get_display_plane_capabilities
diff --git a/src/broadcom/ci/deqp-v3dv-rpi4-skips.txt b/src/broadcom/ci/deqp-v3dv-rpi4-skips.txt
deleted file mode 100644
index bf6a82c19bf..00000000000
--- a/src/broadcom/ci/deqp-v3dv-rpi4-skips.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-# Broadcom waivers
-dEQP-VK.draw.inverted_depth_ranges.nodepthclamp_deltazero
-dEQP-VK.rasterization.depth_bias.d32_sfloat
-
-# Timeout tests (> 1 minute to run)
-dEQP-VK.api.object_management.max_concurrent.query_pool
-dEQP-VK.graphicsfuzz.spv-stable-maze-flatten-copy-composite
-dEQP-VK.memory.mapping.dedicated_alloc.buffer.full.variable.implicit_unmap
-dEQP-VK.memory.mapping.dedicated_alloc.image.full.variable.implicit_unmap
-dEQP-VK.memory.mapping.suballocation.full.variable.implicit_unmap
-dEQP-VK.spirv_assembly.instruction.graphics.spirv_ids_abuse.lots_ids_geom
-dEQP-VK.spirv_assembly.instruction.graphics.spirv_ids_abuse.lots_ids_vert
-dEQP-VK.ssbo.layout.random.all_shared_buffer.5
-dEQP-VK.ssbo.layout.random.arrays_of_arrays.13
-dEQP-VK.ssbo.layout.random.nested_structs_arrays.0
-dEQP-VK.texture.explicit_lod.2d.sizes.128x128_linear_linear_mipmap_linear_clamp
-dEQP-VK.texture.explicit_lod.2d.sizes.128x128_linear_linear_mipmap_linear_repeat
-dEQP-VK.texture.explicit_lod.2d.sizes.128x128_nearest_linear_mipmap_linear_clamp
-dEQP-VK.texture.explicit_lod.2d.sizes.128x128_nearest_linear_mipmap_linear_repeat
-dEQP-VK.ubo.random.all_out_of_order_offsets.45
-dEQP-VK.ubo.random.all_shared_buffer.48
diff --git a/src/broadcom/ci/deqp-vc4-rpi3-fails.txt b/src/broadcom/ci/deqp-vc4-rpi3-fails.txt
deleted file mode 100644
index d0722563e60..00000000000
--- a/src/broadcom/ci/deqp-vc4-rpi3-fails.txt
+++ /dev/null
@@ -1,420 +0,0 @@
-KHR-GLES2.core.internalformat.copy_tex_image.alpha8_oes,Fail
-KHR-GLES2.core.internalformat.copy_tex_image.luminance4_alpha4_oes,Fail
-KHR-GLES2.core.internalformat.copy_tex_image.luminance8_alpha8_oes,Fail
-KHR-GLES2.core.internalformat.copy_tex_image.luminance8_oes,Fail
-KHR-GLES2.core.internalformat.copy_tex_image.rgb565,Fail
-KHR-GLES2.core.internalformat.copy_tex_image.rgb5_a1,Fail
-KHR-GLES2.core.internalformat.copy_tex_image.rgba4,Fail
-KHR-GLES2.core.internalformat.texture2d.depth_component_unsigned_int_depth_component16,Fail
-KHR-GLES2.core.internalformat.texture2d.depth_component_unsigned_int_depth_component24,Fail
-KHR-GLES2.core.internalformat.texture2d.depth_component_unsigned_short_depth_component16,Fail
-KHR-GLES2.texture_3d.copy_sub_image.negative,Fail
-KHR-GLES2.texture_3d.copy_sub_image.rgba,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_clamp_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_clamp_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_clamp_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_mirror_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_mirror_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_mirror_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_repeat_clamp,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_repeat_mirror,Fail
-KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_repeat_repeat,Fail
-KHR-GLES2.texture_3d.filtering.combinations.negative,Fail
-KHR-GLES2.texture_3d.filtering.formats.rgba8_linear,Fail
-KHR-GLES2.texture_3d.filtering.formats.rgba8_linear_mipmap_linear,Fail
-KHR-GLES2.texture_3d.filtering.formats.rgba8_linear_mipmap_nearest,Fail
-KHR-GLES2.texture_3d.filtering.formats.rgba8_nearest,Fail
-KHR-GLES2.texture_3d.filtering.formats.rgba8_nearest_mipmap_linear,Fail
-KHR-GLES2.texture_3d.filtering.formats.rgba8_nearest_mipmap_nearest,Fail
-KHR-GLES2.texture_3d.filtering.sizes.128x32x64_linear,Fail
-KHR-GLES2.texture_3d.filtering.sizes.128x32x64_linear_mipmap_linear,Fail
-KHR-GLES2.texture_3d.filtering.sizes.128x32x64_linear_mipmap_nearest,Fail
-KHR-GLES2.texture_3d.filtering.sizes.128x32x64_nearest,Fail
-KHR-GLES2.texture_3d.filtering.sizes.128x32x64_nearest_mipmap_linear,Fail
-KHR-GLES2.texture_3d.filtering.sizes.128x32x64_nearest_mipmap_nearest,Fail
-KHR-GLES2.texture_3d.filtering.sizes.32x64x16_linear,Fail
-KHR-GLES2.texture_3d.filtering.sizes.32x64x16_linear_mipmap_linear,Fail
-KHR-GLES2.texture_3d.filtering.sizes.32x64x16_linear_mipmap_nearest,Fail
-KHR-GLES2.texture_3d.filtering.sizes.32x64x16_nearest,Fail
-KHR-GLES2.texture_3d.filtering.sizes.32x64x16_nearest_mipmap_linear,Fail
-KHR-GLES2.texture_3d.filtering.sizes.32x64x16_nearest_mipmap_nearest,Fail
-KHR-GLES2.texture_3d.filtering.sizes.3x7x5_linear,Fail
-KHR-GLES2.texture_3d.filtering.sizes.3x7x5_linear_mipmap_nearest,Fail
-KHR-GLES2.texture_3d.filtering.sizes.3x7x5_nearest,Fail
-KHR-GLES2.texture_3d.filtering.sizes.3x7x5_nearest_mipmap_linear,Fail
-KHR-GLES2.texture_3d.filtering.sizes.3x7x5_nearest_mipmap_nearest,Fail
-KHR-GLES2.texture_3d.filtering.sizes.4x8x8_linear,Fail
-KHR-GLES2.texture_3d.filtering.sizes.4x8x8_linear_mipmap_nearest,Fail
-KHR-GLES2.texture_3d.filtering.sizes.4x8x8_nearest,Fail
-KHR-GLES2.texture_3d.filtering.sizes.4x8x8_nearest_mipmap_linear,Fail
-KHR-GLES2.texture_3d.filtering.sizes.4x8x8_nearest_mipmap_nearest,Fail
-KHR-GLES2.texture_3d.filtering.sizes.63x63x63_linear,Fail
-KHR-GLES2.texture_3d.filtering.sizes.63x63x63_linear_mipmap_linear,Fail
-KHR-GLES2.texture_3d.filtering.sizes.63x63x63_linear_mipmap_nearest,Fail
-KHR-GLES2.texture_3d.filtering.sizes.63x63x63_nearest,Fail
-KHR-GLES2.texture_3d.filtering.sizes.63x63x63_nearest_mipmap_linear,Fail
-KHR-GLES2.texture_3d.filtering.sizes.63x63x63_nearest_mipmap_nearest,Fail
-KHR-GLES2.texture_3d.framebuffer_texture.rgba,Fail
-KHR-GLES2.texture_3d.sub_image.rgba8,Fail
-dEQP-EGL.functional.color_clears.multi_context.gles2.rgb888_pbuffer,Crash
-dEQP-EGL.functional.color_clears.multi_context.gles2.rgb888_window,Crash
-dEQP-EGL.functional.color_clears.multi_context.gles2.rgba8888_pbuffer,Crash
-dEQP-EGL.functional.color_clears.multi_context.gles2.rgba8888_window,Crash
-dEQP-EGL.functional.color_clears.multi_thread.gles2.rgb888_pbuffer,Crash
-dEQP-EGL.functional.color_clears.multi_thread.gles2.rgb888_window,Crash
-dEQP-EGL.functional.color_clears.multi_thread.gles2.rgba8888_pbuffer,Crash
-dEQP-EGL.functional.color_clears.multi_thread.gles2.rgba8888_window,Crash
-dEQP-EGL.functional.color_clears.single_context.gles2.rgb888_pbuffer,Crash
-dEQP-EGL.functional.color_clears.single_context.gles2.rgb888_window,Crash
-dEQP-EGL.functional.color_clears.single_context.gles2.rgba8888_pbuffer,Crash
-dEQP-EGL.functional.color_clears.single_context.gles2.rgba8888_window,Crash
-dEQP-EGL.functional.create_context.no_config,Fail
-dEQP-EGL.functional.render.multi_context.gles2.rgb888_pbuffer,Crash
-dEQP-EGL.functional.render.multi_context.gles2.rgb888_window,Crash
-dEQP-EGL.functional.render.multi_context.gles2.rgba8888_pbuffer,Crash
-dEQP-EGL.functional.render.multi_context.gles2.rgba8888_window,Crash
-dEQP-EGL.functional.render.multi_thread.gles2.rgb888_pbuffer,Crash
-dEQP-EGL.functional.render.multi_thread.gles2.rgb888_window,Crash
-dEQP-EGL.functional.render.multi_thread.gles2.rgba8888_pbuffer,Crash
-dEQP-EGL.functional.render.multi_thread.gles2.rgba8888_window,Crash
-dEQP-EGL.functional.render.single_context.gles2.rgb888_pbuffer,Crash
-dEQP-EGL.functional.render.single_context.gles2.rgb888_window,Crash
-dEQP-EGL.functional.render.single_context.gles2.rgba8888_pbuffer,Crash
-dEQP-EGL.functional.render.single_context.gles2.rgba8888_window,Crash
-dEQP-GLES2.functional.clipping.line.wide_line_clip_viewport_center,Fail
-dEQP-GLES2.functional.clipping.line.wide_line_clip_viewport_corner,Fail
-dEQP-GLES2.functional.depth_stencil_clear.depth_stencil_masked,Fail
-dEQP-GLES2.functional.draw.draw_arrays.line_loop.multiple_attributes,Fail
-dEQP-GLES2.functional.draw.draw_arrays.line_loop.single_attribute,Fail
-dEQP-GLES2.functional.fbo.render.texsubimage.after_render_tex2d_rgba,Fail
-dEQP-GLES2.functional.fbo.render.texsubimage.between_render_tex2d_rgba,Fail
-dEQP-GLES2.functional.negative_api.shader.uniform_matrixfv_invalid_transpose,Fail
-dEQP-GLES2.functional.negative_api.texture.generatemipmap_zero_level_array_compressed,Fail
-dEQP-GLES2.functional.negative_api.vertex_array.vertex_attrib,Fail
-dEQP-GLES2.functional.negative_api.vertex_array.vertex_attribv,Fail
-dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_linear_linear_mirror_rgba8888,Fail
-dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_linear_linear_repeat_rgba8888,Fail
-dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_nearest_linear_mirror_rgba8888,Fail
-dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_nearest_linear_repeat_rgba8888,Fail
-dEQP-GLES2.functional.texture.mipmap.2d.basic.linear_linear_repeat_non_square,Fail
-dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_clamp_non_square,Fail
-dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_mirror_non_square,Fail
-dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_repeat_non_square,Fail
-dEQP-GLES2.functional.texture.specification.basic_copytexsubimage2d.2d_rgba,Fail
-dEQP-GLES2.functional.texture.specification.basic_copytexsubimage2d.cube_rgba,Fail
-dEQP-GLES2.functional.texture.wrap.clamp_clamp_nearest_npot_etc1,Fail
diff --git a/src/broadcom/ci/deqp-vc4-rpi3-flakes.txt b/src/broadcom/ci/deqp-vc4-rpi3-flakes.txt
deleted file mode 100644
index 497be959096..00000000000
--- a/src/broadcom/ci/deqp-vc4-rpi3-flakes.txt
+++ /dev/null
@@ -1,30 +0,0 @@
-dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_neg_x_and_pos_x_and_neg_x_neg_y_neg_z
-dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_neg_x_and_pos_x_and_pos_y_pos_z
-dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_neg_x_and_pos_x_neg_y_pos_z_and_neg_x_pos_y_neg_z
-dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_pos_x_and_neg_x_neg_y_pos_z_and_neg_x_pos_y_neg_z
-dEQP-GLES2.functional.draw.random.51
-dEQP-GLES2.functional.fragment_ops.blend.rgb_func_alpha_func.src.one_minus_src_alpha_constant_color
-dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec4_direct_write_dynamic_loop_subscript_read_vertex
-dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.basic_mediump_int_vertex
-dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.conditional_continue_vertex
-dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.function_call_inout_vertex
-dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.function_call_return_vertex
-dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.nested_sequence_vertex
-dEQP-GLES2.functional.shaders.loops.while_constant_iterations.select_iteration_count_vertex
-dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.function_call_return_vertex
-dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.infinite_with_conditional_break_vertex
-dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.post_increment_vertex
-dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.single_iteration_vertex
-dEQP-GLES2.functional.shaders.operator.unary_operator.pre_decrement_result.mediump_vec3_fragment
-dEQP-GLES2.functional.shaders.random.exponential.fragment.51
-dEQP-GLES2.functional.shaders.random.texture.fragment.129
-dEQP-GLES2.functional.shaders.return.output_write_in_func_never_vertex
-dEQP-GLES2.functional.texture.filtering.2d.linear_linear_clamp_rgb888_pot
-dEQP-GLES2.functional.texture.filtering.cube.linear_mipmap_linear_nearest_mirror_rgba8888
-dEQP-GLES2.functional.texture.filtering.cube.nearest_linear_mirror_rgba8888_pot
-dEQP-GLES2.functional.texture.filtering.cube.nearest_mipmap_linear_linear_clamp_rgba8888
-dEQP-GLES2.functional.texture.filtering.cube.nearest_mipmap_linear_nearest_repeat_l8
-dEQP-GLES2.functional.texture.filtering.cube.nearest_mipmap_nearest_linear_clamp_rgba8888
-dEQP-GLES2.functional.texture.filtering.cube.nearest_mipmap_nearest_linear_mirror_rgba8888
-dEQP-GLES2.functional.texture.mipmap.cube.generate.rgb565_fastest
-dEQP-GLES2.functional.texture.size.cube.256x256_rgb888
diff --git a/src/broadcom/ci/deqp-vc4-rpi3-gles.toml b/src/broadcom/ci/deqp-vc4-rpi3-gles.toml
deleted file mode 100644
index 4ca3ab03231..00000000000
--- a/src/broadcom/ci/deqp-vc4-rpi3-gles.toml
+++ /dev/null
@@ -1,23 +0,0 @@
-[[deqp]]
-deqp = "/deqp/modules/gles2/deqp-gles2"
-caselists = [ "/deqp/mustpass/gles2-master.txt" ]
-tests_per_group = 250
-deqp_args = [
- "--deqp-gl-config-name=rgba8888d24s8ms0",
- "--deqp-surface-height=256",
- "--deqp-surface-type=pbuffer",
- "--deqp-surface-width=256",
- "--deqp-visibility=hidden",
-]
-
-[[deqp]]
-deqp = "/deqp/external/openglcts/modules/glcts"
-caselists = [ "/deqp/mustpass/gles2-khr-master.txt" ]
-tests_per_group = 250
-deqp_args = [
- "--deqp-gl-config-name=rgba8888d24s8ms0",
- "--deqp-surface-height=256",
- "--deqp-surface-type=pbuffer",
- "--deqp-surface-width=256",
- "--deqp-visibility=hidden",
-]
diff --git a/src/broadcom/ci/gitlab-ci-inc.yml b/src/broadcom/ci/gitlab-ci-inc.yml
new file mode 100644
index 00000000000..4a106db4af2
--- /dev/null
+++ b/src/broadcom/ci/gitlab-ci-inc.yml
@@ -0,0 +1,156 @@
+.broadcom-common-rules:
+ rules:
+ - changes: &broadcom_file_list
+ - src/broadcom/meson.build
+ - src/broadcom/ci/gitlab-ci.yml
+ - src/broadcom/ci/gitlab-ci-inc.yml
+ - src/broadcom/ci/deqp-$DEQP_SUITE.toml
+ - src/broadcom/ci/$GPU_VERSION-fails.txt
+ - src/broadcom/ci/$GPU_VERSION-flakes.txt
+ - src/broadcom/ci/$GPU_VERSION-skips.txt
+ - src/broadcom/ci/$PIGLIT_TRACES_FILE
+ - src/broadcom/cle/**/*
+ - src/broadcom/clif/**/*
+ - src/broadcom/common/**/*
+ - src/broadcom/compiler/**/*
+ - src/broadcom/drm-shim/**/*
+ - src/broadcom/qpu/**/*
+ - src/broadcom/simulator/**/*
+ when: on_success
+
+.broadcom-common-manual-rules:
+ rules:
+ - changes: *broadcom_file_list
+ when: manual
+
+.vc4-rules:
+ stage: broadcom
+ rules:
+ - if: $FORCE_KERNEL_TAG != null
+ when: never
+ - !reference [.test, rules]
+ - !reference [.igalia-farm-rules, rules]
+ - !reference [.gl-rules, rules]
+ - !reference [.broadcom-common-rules, rules]
+ - changes: &vc4_file_list
+ - src/gallium/drivers/vc4/**/*
+ - src/gallium/winsys/vc4/**/*
+ - src/gallium/auxiliary/renderonly/**/*
+ - src/gallium/winsys/kmsro/**/*
+ when: on_success
+
+.vc4-manual-rules:
+ stage: broadcom
+ rules:
+ - !reference [.test, rules]
+ - !reference [.igalia-farm-manual-rules, rules]
+ - !reference [.gl-manual-rules, rules]
+ - !reference [.broadcom-common-manual-rules, rules]
+ - changes: *vc4_file_list
+ when: manual
+
+.v3d-rules:
+ stage: broadcom
+ rules:
+ - if: $FORCE_KERNEL_TAG != null
+ when: never
+ - !reference [.test, rules]
+ - !reference [.igalia-farm-rules, rules]
+ - !reference [.gl-rules, rules]
+ - !reference [.broadcom-common-rules, rules]
+ - changes: &v3d_file_list
+ - src/gallium/drivers/v3d/**/*
+ - src/gallium/winsys/v3d/**/*
+ - src/gallium/auxiliary/renderonly/**/*
+ - src/gallium/winsys/kmsro/**/*
+ when: on_success
+
+.v3d-manual-rules:
+ stage: broadcom
+ retry: !reference [.scheduled_pipeline-rules, retry]
+ rules:
+ - !reference [.test, rules]
+ - !reference [.igalia-farm-manual-rules, rules]
+ - !reference [.gl-manual-rules, rules]
+ - !reference [.broadcom-common-manual-rules, rules]
+ - changes:
+ *v3d_file_list
+ when: manual
+
+.v3dv-rules:
+ stage: broadcom
+ rules:
+ - if: $FORCE_KERNEL_TAG != null
+ when: never
+ - !reference [.test, rules]
+ - !reference [.igalia-farm-rules, rules]
+ - !reference [.vulkan-rules, rules]
+ - !reference [.broadcom-common-rules, rules]
+ - changes: &v3dv_file_list
+ - src/broadcom/vulkan/**/*
+ when: on_success
+
+.v3dv-manual-rules:
+ stage: broadcom
+ rules:
+ - !reference [.test, rules]
+ - !reference [.igalia-farm-manual-rules, rules]
+ - !reference [.vulkan-manual-rules, rules]
+ - !reference [.broadcom-common-manual-rules, rules]
+ - changes: *v3dv_file_list
+ when: manual
+
+# 8 devices (2023-12-18)
+.igalia-bcm2837-rpi-3-b:arm64:
+ variables:
+ DEVICE_TYPE: rpi3
+ GPU_VERSION: broadcom-rpi3
+ script:
+ - ./install/bare-metal/poe-powered.sh
+ tags:
+ - igalia-rpi3
+
+# 21 devices (2023-12-18)
+.igalia-bcm2711-rpi-4:arm64:
+ variables:
+ DEVICE_TYPE: rpi4
+ GPU_VERSION: broadcom-rpi4
+ VK_DRIVER: broadcom
+ script:
+ - ./install/bare-metal/poe-powered.sh
+ tags:
+ - igalia-rpi4
+
+# 1 device (2024-01-02)
+.igalia-bcm2712-rpi-5:arm64:
+ variables:
+ DEVICE_TYPE: rpi5
+ GPU_VERSION: broadcom-rpi5
+ VK_DRIVER: broadcom
+ script:
+ - ./install/bare-metal/poe-powered.sh
+ tags:
+ - igalia-rpi5
+
+.broadcom-test:
+ script:
+ - ./install/bare-metal/poe-powered.sh
+ variables:
+ HWCI_TEST_SCRIPT: "/install/deqp-runner.sh"
+ FLAKES_CHANNEL: "#videocore-ci"
+ FARM: igalia
+ timeout: 20m
+
+.broadcom-test:arm64:
+ extends:
+ - .broadcom-test
+ - .baremetal-test-arm64
+ variables:
+ BM_BOOTFS: /boot/raspberrypi_arm64
+
+.broadcom-test:arm32:
+ extends:
+ - .broadcom-test
+ - .baremetal-test-arm32
+ variables:
+ BM_BOOTFS: /boot/raspberrypi_armhf
diff --git a/src/broadcom/ci/gitlab-ci.yml b/src/broadcom/ci/gitlab-ci.yml
index 165f9959936..32ef88554fc 100644
--- a/src/broadcom/ci/gitlab-ci.yml
+++ b/src/broadcom/ci/gitlab-ci.yml
@@ -1,141 +1,113 @@
-.vc4-rpi3-test:armhf:
+include:
+ - local: 'src/broadcom/ci/gitlab-ci-inc.yml'
+
+vc4-rpi3-gl:arm32:
extends:
- - .baremetal-test-armhf
+ - .igalia-bcm2837-rpi-3-b:arm64
+ - .broadcom-test:arm32
- .vc4-rules
- - .use-debian/arm_test
+ parallel: 4
variables:
- BM_BOOTFS: /boot/raspberrypi_armhf
- BM_ROOTFS: /rootfs-armhf
- DEQP_EXPECTED_RENDERER: VC4
- GPU_VERSION: vc4-rpi3
- HWCI_KERNEL_MODULES: vc4
- FLAKES_CHANNEL: "#videocore-ci"
- script:
- - ./install/bare-metal/poe-powered.sh
- needs:
- - job: debian/arm_test
- artifacts: false
- - debian-armhf
- tags:
- - igalia-rpi3
+ DEQP_SUITE: broadcom-rpi3
+ HWCI_START_WESTON: 1
-vc4-rpi3-gles:armhf:
+vc4-rpi3-gl-piglit-full:arm32:
extends:
- - .vc4-rpi3-test:armhf
- parallel: 2
+ - vc4-rpi3-gl:arm32
+ - .vc4-manual-rules
+ tags:
+ - igalia-rpi3
+ - igalia-fullrun
variables:
- HWCI_TEST_SCRIPT: "/install/deqp-runner.sh"
- DEQP_SUITE: vc4-rpi3-gles
- DEQP_VER: gles2
+ DEQP_SUITE: broadcom-rpi3-piglit-full
-vc4-rpi3-egl:armhf:
- extends:
- - .vc4-rpi3-test:armhf
- variables:
- HWCI_TEST_SCRIPT: "/install/deqp-runner.sh"
- HWCI_START_XORG: 1
- DEQP_RUNNER_OPTIONS: "--tests-per-group 250"
- DEQP_VER: egl
-.vc4-rpi3-piglit:armhf:
+v3d-rpi4-gl:arm64:
extends:
- - .piglit-test
- - .vc4-rpi3-test:armhf
- - .test-manual
+ - .igalia-bcm2711-rpi-4:arm64
+ - .broadcom-test:arm64
+ - .v3d-rules
+ parallel: 8
variables:
- HWCI_TEST_SCRIPT: "/install/piglit/piglit-runner.sh"
- BM_POE_TIMEOUT: 180
- HWCI_START_XORG: 1
- PIGLIT_PLATFORM: mixed_glx_egl
+ HWCI_START_WESTON: 1
+ DEQP_SUITE: broadcom-rpi4
+ DEQP_FRACTION: 2
-vc4-rpi3-piglit-quick_gl:armhf:
+v3d-rpi4-gl-full:arm64:
extends:
- - .vc4-rpi3-piglit:armhf
- parallel: 4
+ - v3d-rpi4-gl:arm64
+ - .v3d-manual-rules
+ tags:
+ - igalia-rpi4
+ - igalia-fullrun
+ parallel: 6
+ timeout: 45m
variables:
- FDO_CI_CONCURRENT: 1
- PIGLIT_PROFILES: quick_gl
+ TEST_PHASE_TIMEOUT: 40
+ DEQP_FRACTION: 1
-vc4-rpi3-piglit-quick_shader:armhf:
- extends:
- - .vc4-rpi3-piglit:armhf
- parallel: 2
- variables:
- FDO_CI_CONCURRENT: 2
- PIGLIT_PROFILES: quick_shader
-.v3d-rpi4-test:armhf:
+v3d-rpi4-traces:arm64:
extends:
- - .baremetal-test-armhf
+ - .igalia-bcm2711-rpi-4:arm64
+ - .piglit-traces-test
+ - .broadcom-test:arm64
- .v3d-rules
- - .use-debian/arm_test
variables:
- HWCI_TEST_SCRIPT: "/install/deqp-runner.sh"
- BM_BOOTFS: /boot/raspberrypi_armhf
- BM_POE_TIMEOUT: 300
- BM_ROOTFS: /rootfs-armhf
- DEQP_EXPECTED_RENDERER: V3D
- FLAKES_CHANNEL: "#videocore-ci"
- GPU_VERSION: v3d-rpi4
- HWCI_KERNEL_MODULES: v3d,vc4
- script:
- - ./install/bare-metal/poe-powered.sh
- needs:
- - debian/arm_test
- - debian-armhf
- tags:
- - igalia-rpi4
+ HWCI_TEST_SCRIPT: "/install/piglit/piglit-traces.sh"
+ PIGLIT_TRACES_FILE: traces-broadcom.yml
+ PIGLIT_REPLAY_DEVICE_NAME: "broadcom-rpi4"
+ PIGLIT_RESULTS: "broadcom-rpi4-replay"
-v3d-rpi4-gles:armhf:
- extends:
- - .v3d-rpi4-test:armhf
- parallel: 8
- variables:
- DEQP_SUITE: v3d-rpi4-gles
- DEQP_VER: gles31
-v3d-rpi4-egl:armhf:
+v3dv-rpi4-vk:arm64:
extends:
- - .v3d-rpi4-test:armhf
+ - .igalia-bcm2711-rpi-4:arm64
+ - .broadcom-test:arm64
+ - .v3dv-rules
+ parallel: 10
variables:
- HWCI_START_XORG: 1
- DEQP_VER: egl
+ HWCI_TEST_SCRIPT: "/install/deqp-runner.sh"
+ HWCI_START_WESTON: 1
+ DEQP_EXPECTED_RENDERER: "V3D.4.2"
+ DEQP_FRACTION: 3
+ DEQP_VER: vk
+ FLAKES_CHANNEL: "#videocore-ci"
-v3d-rpi4-piglit:armhf:
+v3dv-rpi4-vk-full:arm64:
extends:
- - .piglit-test
- - .v3d-rpi4-test:armhf
- parallel: 4
+ - v3dv-rpi4-vk:arm64
+ - .v3dv-manual-rules
+ tags:
+ - igalia-rpi4
+ - igalia-fullrun
+ parallel: 6
+ timeout: 2h
variables:
- HWCI_TEST_SCRIPT: "/install/piglit/piglit-runner.sh"
- HWCI_START_XORG: 1
- PIGLIT_PLATFORM: mixed_glx_egl
- PIGLIT_PROFILES: all
+ # Keep 10 minutes for boot + setup + uploading the artifacts at the end
+ TEST_PHASE_TIMEOUT: 110
+ DEQP_FRACTION: 1
-v3dv-rpi4-vk:arm64:
+
+.v3dv-rpi5-vk:arm64:
extends:
- - .baremetal-test
- - .use-debian/arm_test
+ - .igalia-bcm2712-rpi-5:arm64
+ - .broadcom-test:arm64
- .v3dv-rules
- parallel: 8
variables:
HWCI_TEST_SCRIPT: "/install/deqp-runner.sh"
- BM_BOOTFS: /boot/raspberrypi_arm64
- BM_POE_TIMEOUT: 300
- BM_ROOTFS: /rootfs-arm64
- DEQP_EXPECTED_RENDERER: "V3D 4.2"
- DEQP_FRACTION: 5
+ HWCI_START_WESTON: 1
+ DEQP_EXPECTED_RENDERER: "V3D.7.1"
+ DEQP_FRACTION: 15
DEQP_VER: vk
FLAKES_CHANNEL: "#videocore-ci"
- GPU_VERSION: v3dv-rpi4
- HWCI_KERNEL_MODULES: v3d,vc4
- MINIO_ARTIFACT_NAME: mesa-arm64
- VK_DRIVER: broadcom
- script:
- - ./install/bare-metal/poe-powered.sh
- needs:
- - debian/arm_test
- - job: debian-arm64
- artifacts: false
- tags:
- - igalia-rpi4
+
+v3dv-rpi5-vk-full:arm64:
+ extends:
+ - .v3dv-rpi5-vk:arm64
+ - .v3dv-manual-rules
+ timeout: 3h
+ variables:
+ # Keep 10 minutes for boot + setup + uploading the artifacts at the end
+ TEST_PHASE_TIMEOUT: 170
+ DEQP_FRACTION: 1
diff --git a/src/broadcom/ci/piglit-v3d-rpi4-fails.txt b/src/broadcom/ci/piglit-v3d-rpi4-fails.txt
deleted file mode 100644
index 4557a55562f..00000000000
--- a/src/broadcom/ci/piglit-v3d-rpi4-fails.txt
+++ /dev/null
@@ -1,337 +0,0 @@
-glx@glx-make-current,Crash
-glx@glx-multi-window-single-context,Fail
-glx@glx-multithread-buffer,Fail
-glx@glx-query-drawable-glx_fbconfig_id-window,Fail
-glx@glx-swap-pixmap-bad,Fail
-glx@glx-visuals-depth -pixmap,Crash
-glx@glx-visuals-stencil -pixmap,Crash
-glx@glx_arb_create_context_es2_profile@invalid opengl es version,Fail
-glx@glx_arb_create_context_no_error@no error,Fail
-glx@glx_ext_import_context@free context,Fail
-glx@glx_ext_import_context@get context id,Fail
-glx@glx_ext_import_context@get current display,Fail
-glx@glx_ext_import_context@import context- multi process,Fail
-glx@glx_ext_import_context@import context- single process,Fail
-glx@glx_ext_import_context@imported context has same context id,Fail
-glx@glx_ext_import_context@make current- multi process,Fail
-glx@glx_ext_import_context@make current- single process,Fail
-glx@glx_ext_import_context@query context info,Fail
-shaders@glsl-bug-110796,Fail
-spec@!opengl 1.0@gl-1.0-bitmap-heart-dance,Fail
-spec@!opengl 1.0@gl-1.0-dlist-bitmap,Fail
-spec@!opengl 1.0@gl-1.0-edgeflag,Fail
-spec@!opengl 1.0@gl-1.0-edgeflag-const,Fail
-spec@!opengl 1.0@gl-1.0-edgeflag-quads,Fail
-spec@!opengl 1.0@gl-1.0-no-op-paths,Fail
-spec@!opengl 1.0@gl-1.0-spot-light,Fail
-spec@!opengl 1.0@gl-1.0-user-clip-all-planes,Fail
-spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=2,Fail
-spec@!opengl 1.1@depthstencil-default_fb-drawpixels-32f_24_8_rev samples=2,Fail
-spec@!opengl 1.1@depthstencil-default_fb-drawpixels-32f_24_8_rev samples=4,Fail
-spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=2,Fail
-spec@!opengl 1.1@getteximage-depth,Fail
-spec@!opengl 1.1@getteximage-depth@GL_TEXTURE_1D_ARRAY-GL_DEPTH_COMPONENT16,Fail
-spec@!opengl 1.1@getteximage-depth@GL_TEXTURE_1D_ARRAY-GL_DEPTH_COMPONENT24,Fail
-spec@!opengl 1.1@getteximage-depth@GL_TEXTURE_1D_ARRAY-GL_DEPTH_COMPONENT32,Fail
-spec@!opengl 1.1@getteximage-depth@GL_TEXTURE_1D_ARRAY-GL_DEPTH_COMPONENT,Fail
-spec@!opengl 1.1@getteximage-formats,Fail
-spec@!opengl 1.1@linestipple,Fail
-spec@!opengl 1.1@linestipple@Factor 2x,Fail
-spec@!opengl 1.1@linestipple@Factor 3x,Fail
-spec@!opengl 1.1@linestipple@Line loop,Fail
-spec@!opengl 1.1@linestipple@Line strip,Fail
-spec@!opengl 1.1@linestipple@Restarting lines within a single Begin-End block,Fail
-spec@!opengl 1.1@point-line-no-cull,Fail
-spec@!opengl 1.1@polygon-mode,Fail
-spec@!opengl 1.1@polygon-mode-offset,Fail
-spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on bottom edge,Fail
-spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on left edge,Fail
-spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on right edge,Fail
-spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on top edge,Fail
-spec@!opengl 1.1@polygon-mode-offset@config 1: Expected blue pixel in center,Fail
-spec@!opengl 1.1@polygon-mode-offset@config 1: Expected white pixel on right edge,Fail
-spec@!opengl 1.1@polygon-mode-offset@config 1: Expected white pixel on top edge,Fail
-spec@!opengl 1.1@polygon-mode-offset@config 2: Expected blue pixel in center,Fail
-spec@!opengl 1.1@polygon-mode-offset@config 2: Expected white pixel on right edge,Fail
-spec@!opengl 1.1@polygon-mode-offset@config 2: Expected white pixel on top edge,Fail
-spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on bottom edge,Fail
-spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on left edge,Fail
-spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on right edge,Fail
-spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on top edge,Fail
-spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on bottom edge,Fail
-spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on left edge,Fail
-spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on right edge,Fail
-spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on top edge,Fail
-spec@!opengl 1.1@polygon-mode-offset@config 5: Expected blue pixel in center,Fail
-spec@!opengl 1.1@polygon-mode-offset@config 5: Expected white pixel on right edge,Fail
-spec@!opengl 1.1@polygon-mode-offset@config 5: Expected white pixel on top edge,Fail
-spec@!opengl 1.1@polygon-mode-offset@config 6: Expected blue pixel in center,Fail
-spec@!opengl 1.1@polygon-mode-offset@config 6: Expected white pixel on right edge,Fail
-spec@!opengl 1.1@polygon-mode-offset@config 6: Expected white pixel on top edge,Fail
-spec@!opengl 1.1@texwrap formats bordercolor,Fail
-spec@!opengl 1.1@texwrap formats bordercolor@GL_INTENSITY12- border color only,Fail
-spec@!opengl 1.1@texwrap formats bordercolor@GL_INTENSITY16- border color only,Fail
-spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE12- border color only,Fail
-spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE12_ALPHA12- border color only,Fail
-spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE12_ALPHA4- border color only,Fail
-spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE16- border color only,Fail
-spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE16_ALPHA16- border color only,Fail
-spec@!opengl 1.1@texwrap formats bordercolor@GL_RGB12- border color only,Fail
-spec@!opengl 1.1@texwrap formats bordercolor@GL_RGB16- border color only,Fail
-spec@!opengl 1.1@texwrap formats bordercolor@GL_RGBA12- border color only,Fail
-spec@!opengl 1.1@texwrap formats bordercolor@GL_RGBA16- border color only,Fail
-spec@!opengl 1.1@texwrap formats bordercolor-swizzled,Fail
-spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_INTENSITY12- swizzled- border color only,Fail
-spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_INTENSITY16- swizzled- border color only,Fail
-spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE12- swizzled- border color only,Fail
-spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE12_ALPHA12- swizzled- border color only,Fail
-spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE12_ALPHA4- swizzled- border color only,Fail
-spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE16- swizzled- border color only,Fail
-spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE16_ALPHA16- swizzled- border color only,Fail
-spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_RGB12- swizzled- border color only,Fail
-spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_RGB16- swizzled- border color only,Fail
-spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_RGBA12- swizzled- border color only,Fail
-spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_RGBA16- swizzled- border color only,Fail
-spec@!opengl 1.1@windowoverlap,Fail
-spec@!opengl 1.4@gl-1.4-polygon-offset,Fail
-spec@!opengl 2.0@gl-2.0-edgeflag,Fail
-spec@!opengl 2.0@gl-2.0-edgeflag-immediate,Fail
-spec@!opengl 2.0@max-samplers,Fail
-spec@!opengl 2.0@max-samplers border,Fail
-spec@!opengl 2.1@pbo,Fail
-spec@!opengl 2.1@pbo@test_polygon_stip,Fail
-spec@!opengl 2.1@polygon-stipple-fs,Fail
-spec@!opengl es 3.0@gles-3.0-transform-feedback-uniform-buffer-object,Fail
-spec@arb_color_buffer_float@gl_rgba32f-render,Fail
-spec@arb_color_buffer_float@gl_rgba32f-render-fog,Fail
-spec@arb_color_buffer_float@gl_rgba32f-render-sanity,Fail
-spec@arb_color_buffer_float@gl_rgba32f-render-sanity-fog,Fail
-spec@arb_compute_shader@minmax,Fail
-spec@arb_copy_buffer@targets,Fail
-spec@arb_depth_buffer_float@fbo-generatemipmap-formats,Fail
-spec@arb_depth_buffer_float@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT32F,Fail
-spec@arb_depth_buffer_float@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT32F NPOT,Fail
-spec@arb_depth_buffer_float@texwrap formats bordercolor,Fail
-spec@arb_depth_buffer_float@texwrap formats bordercolor@GL_DEPTH32F_STENCIL8- border color only,Fail
-spec@arb_depth_buffer_float@texwrap formats bordercolor@GL_DEPTH_COMPONENT32F- border color only,Fail
-spec@arb_depth_buffer_float@texwrap formats bordercolor-swizzled,Fail
-spec@arb_depth_buffer_float@texwrap formats bordercolor-swizzled@GL_DEPTH32F_STENCIL8- swizzled- border color only,Fail
-spec@arb_depth_buffer_float@texwrap formats bordercolor-swizzled@GL_DEPTH_COMPONENT32F- swizzled- border color only,Fail
-spec@arb_depth_buffer_float@texwrap formats,Fail
-spec@arb_depth_buffer_float@texwrap formats@GL_DEPTH32F_STENCIL8- NPOT,Fail
-spec@arb_depth_buffer_float@texwrap formats@GL_DEPTH_COMPONENT32F- NPOT,Fail
-spec@arb_depth_texture@fbo-generatemipmap-formats,Fail
-spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT16,Fail
-spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT16 NPOT,Fail
-spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT24,Fail
-spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT24 NPOT,Fail
-spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT32,Fail
-spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT32 NPOT,Fail
-spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT,Fail
-spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT NPOT,Fail
-spec@arb_depth_texture@texwrap formats bordercolor,Fail
-spec@arb_depth_texture@texwrap formats bordercolor@GL_DEPTH_COMPONENT16- border color only,Fail
-spec@arb_depth_texture@texwrap formats bordercolor@GL_DEPTH_COMPONENT24- border color only,Fail
-spec@arb_depth_texture@texwrap formats bordercolor@GL_DEPTH_COMPONENT32- border color only,Fail
-spec@arb_depth_texture@texwrap formats bordercolor-swizzled,Fail
-spec@arb_depth_texture@texwrap formats bordercolor-swizzled@GL_DEPTH_COMPONENT16- swizzled- border color only,Fail
-spec@arb_depth_texture@texwrap formats bordercolor-swizzled@GL_DEPTH_COMPONENT24- swizzled- border color only,Fail
-spec@arb_depth_texture@texwrap formats bordercolor-swizzled@GL_DEPTH_COMPONENT32- swizzled- border color only,Fail
-spec@arb_depth_texture@texwrap formats,Fail
-spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT16- NPOT,Fail
-spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT24- NPOT,Fail
-spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT32- NPOT,Fail
-spec@arb_framebuffer_object@fbo-drawbuffers-none use_frag_out,Fail
-spec@arb_pixel_buffer_object@pbo-getteximage,Fail
-spec@arb_pixel_buffer_object@texsubimage array pbo,Fail
-spec@arb_point_sprite@arb_point_sprite-checkerboard,Fail
-spec@arb_point_sprite@arb_point_sprite-mipmap,Fail
-spec@arb_shader_storage_buffer_object@compiler@atomicmin-swizzle.vert,Fail
-spec@arb_shader_texture_lod@execution@arb_shader_texture_lod-texgrad,Fail
-spec@arb_shader_texture_lod@execution@arb_shader_texture_lod-texgradcube,Fail
-spec@arb_texture_float@fbo-blending-formats,Fail
-spec@arb_texture_float@fbo-blending-formats@GL_ALPHA32F_ARB,Fail
-spec@arb_texture_float@fbo-blending-formats@GL_INTENSITY16F_ARB,Fail
-spec@arb_texture_float@fbo-blending-formats@GL_INTENSITY32F_ARB,Fail
-spec@arb_texture_float@fbo-blending-formats@GL_LUMINANCE16F_ARB,Fail
-spec@arb_texture_float@fbo-blending-formats@GL_LUMINANCE32F_ARB,Fail
-spec@arb_texture_float@fbo-blending-formats@GL_LUMINANCE_ALPHA32F_ARB,Fail
-spec@arb_texture_float@fbo-blending-formats@GL_RGB16F,Fail
-spec@arb_texture_float@fbo-blending-formats@GL_RGB32F,Fail
-spec@arb_texture_float@fbo-blending-formats@GL_RGBA32F,Fail
-spec@arb_texture_float@texwrap formats bordercolor,Fail
-spec@arb_texture_float@texwrap formats bordercolor@GL_ALPHA32F_ARB- border color only,Fail
-spec@arb_texture_float@texwrap formats bordercolor@GL_INTENSITY32F_ARB- border color only,Fail
-spec@arb_texture_float@texwrap formats bordercolor@GL_LUMINANCE32F_ARB- border color only,Fail
-spec@arb_texture_float@texwrap formats bordercolor@GL_LUMINANCE_ALPHA32F_ARB- border color only,Fail
-spec@arb_texture_float@texwrap formats bordercolor@GL_RGB32F- border color only,Fail
-spec@arb_texture_float@texwrap formats bordercolor@GL_RGBA32F- border color only,Fail
-spec@arb_texture_float@texwrap formats bordercolor-swizzled,Fail
-spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_ALPHA32F_ARB- swizzled- border color only,Fail
-spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_INTENSITY32F_ARB- swizzled- border color only,Fail
-spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_LUMINANCE32F_ARB- swizzled- border color only,Fail
-spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_LUMINANCE_ALPHA32F_ARB- swizzled- border color only,Fail
-spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_RGB32F- swizzled- border color only,Fail
-spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_RGBA32F- swizzled- border color only,Fail
-spec@arb_texture_rectangle@1-1-linear-texture,Fail
-spec@arb_texture_rg@fbo-blending-formats-float,Fail
-spec@arb_texture_rg@fbo-blending-formats-float@GL_R32F,Fail
-spec@arb_texture_rg@fbo-blending-formats-float@GL_RG32F,Fail
-spec@arb_texture_rg@texwrap formats bordercolor,Fail
-spec@arb_texture_rg@texwrap formats bordercolor@GL_R16- border color only,Fail
-spec@arb_texture_rg@texwrap formats bordercolor@GL_RG16- border color only,Fail
-spec@arb_texture_rg@texwrap formats bordercolor-swizzled,Fail
-spec@arb_texture_rg@texwrap formats bordercolor-swizzled@GL_R16- swizzled- border color only,Fail
-spec@arb_texture_rg@texwrap formats bordercolor-swizzled@GL_RG16- swizzled- border color only,Fail
-spec@arb_texture_rg@texwrap formats-float bordercolor,Fail
-spec@arb_texture_rg@texwrap formats-float bordercolor@GL_R32F- border color only,Fail
-spec@arb_texture_rg@texwrap formats-float bordercolor@GL_RG32F- border color only,Fail
-spec@arb_texture_rg@texwrap formats-float bordercolor-swizzled,Fail
-spec@arb_texture_rg@texwrap formats-float bordercolor-swizzled@GL_R32F- swizzled- border color only,Fail
-spec@arb_texture_rg@texwrap formats-float bordercolor-swizzled@GL_RG32F- swizzled- border color only,Fail
-spec@arb_texture_rg@texwrap formats-float,Fail
-spec@arb_texture_rg@texwrap formats-float@GL_R32F- NPOT,Fail
-spec@arb_texture_rg@texwrap formats-float@GL_RG32F- NPOT,Fail
-spec@arb_transform_feedback2@change objects while paused (gles3),Fail
-spec@egl 1.4@egl-copy-buffers,Crash
-spec@egl 1.4@eglterminate then unbind context,Fail
-spec@egl_ext_protected_content@conformance,Fail
-spec@egl_khr_gl_image@egl_khr_gl_renderbuffer_image-clear-shared-image gl_depth_component24,Fail
-spec@egl_khr_gl_image@egl_khr_gl_renderbuffer_image-clear-shared-image gl_rgba,Fail
-spec@egl_khr_surfaceless_context@viewport,Fail
-spec@egl_mesa_configless_context@basic,Fail
-spec@ext_framebuffer_blit@fbo-blit-check-limits,Fail
-spec@ext_framebuffer_multisample@blit-mismatched-formats,Fail
-spec@ext_framebuffer_multisample@interpolation 2 centroid-edges,Fail
-spec@ext_framebuffer_multisample@interpolation 4 centroid-edges,Fail
-spec@ext_framebuffer_object@fbo-blending-format-quirks,Fail
-spec@ext_framebuffer_object@fbo-blending-formats,Fail
-spec@ext_framebuffer_object@fbo-blending-formats@GL_RGB10,Fail
-spec@ext_framebuffer_object@getteximage-formats init-by-clear-and-render,Fail
-spec@ext_framebuffer_object@getteximage-formats init-by-rendering,Fail
-spec@ext_gpu_shader4@execution@texelfetch@fs-texelfetch-isampler1darray,Fail
-spec@ext_gpu_shader4@execution@texelfetch@fs-texelfetch-sampler1darray,Fail
-spec@ext_gpu_shader4@execution@texelfetch@fs-texelfetch-usampler1darray,Fail
-spec@ext_gpu_shader4@execution@texelfetchoffset@fs-texelfetch-isampler1darray,Fail
-spec@ext_gpu_shader4@execution@texelfetchoffset@fs-texelfetch-sampler1darray,Fail
-spec@ext_gpu_shader4@execution@texelfetchoffset@fs-texelfetch-usampler1darray,Fail
-spec@ext_gpu_shader4@execution@texelfetchoffset@vs-texelfetch-isampler1darray,Fail
-spec@ext_gpu_shader4@execution@texelfetchoffset@vs-texelfetch-sampler1darray,Fail
-spec@ext_gpu_shader4@execution@texelfetchoffset@vs-texelfetch-usampler1darray,Fail
-spec@ext_gpu_shader4@execution@texelfetch@vs-texelfetch-isampler1darray,Fail
-spec@ext_gpu_shader4@execution@texelfetch@vs-texelfetch-sampler1darray,Fail
-spec@ext_gpu_shader4@execution@texelfetch@vs-texelfetch-usampler1darray,Fail
-spec@ext_gpu_shader4@tex-miplevel-selection gpu4texture() 1darray,Fail
-spec@ext_gpu_shader4@tex-miplevel-selection gpu4texture() 1darrayshadow,Fail
-spec@ext_gpu_shader4@tex-miplevel-selection gpu4texture(bias) 1darray,Fail
-spec@ext_gpu_shader4@tex-miplevel-selection gpu4texture(bias) 1darrayshadow,Fail
-spec@ext_gpu_shader4@tex-miplevel-selection gpu4texture() cubeshadow,Fail
-spec@ext_gpu_shader4@tex-miplevel-selection gpu4texturegrad 1darray,Fail
-spec@ext_gpu_shader4@tex-miplevel-selection gpu4texturegrad 1darrayshadow,Fail
-spec@ext_gpu_shader4@tex-miplevel-selection gpu4texturegradoffset 1darray,Fail
-spec@ext_gpu_shader4@tex-miplevel-selection gpu4texturegradoffset 1darrayshadow,Fail
-spec@ext_gpu_shader4@tex-miplevel-selection gpu4texturelod 1darray,Fail
-spec@ext_gpu_shader4@tex-miplevel-selection gpu4texturelod 1darrayshadow,Fail
-spec@ext_gpu_shader4@tex-miplevel-selection gpu4texturelodoffset 1darray,Fail
-spec@ext_gpu_shader4@tex-miplevel-selection gpu4texturelodoffset 1darrayshadow,Fail
-spec@ext_gpu_shader4@tex-miplevel-selection gpu4textureoffset 1darray,Fail
-spec@ext_gpu_shader4@tex-miplevel-selection gpu4textureoffset 1darrayshadow,Fail
-spec@ext_packed_depth_stencil@texwrap formats bordercolor,Fail
-spec@ext_packed_depth_stencil@texwrap formats bordercolor@GL_DEPTH24_STENCIL8- border color only,Fail
-spec@ext_packed_depth_stencil@texwrap formats bordercolor-swizzled,Fail
-spec@ext_packed_depth_stencil@texwrap formats bordercolor-swizzled@GL_DEPTH24_STENCIL8- swizzled- border color only,Fail
-spec@ext_packed_depth_stencil@texwrap formats,Fail
-spec@ext_packed_depth_stencil@texwrap formats@GL_DEPTH24_STENCIL8- NPOT,Fail
-spec@ext_packed_float@query-rgba-signed-components,Fail
-spec@ext_texture_array@array-texture,Fail
-spec@ext_texture_array@fbo-generatemipmap-array rgb9_e5,Fail
-spec@ext_texture_array@fbo-generatemipmap-array,Fail
-spec@ext_texture_array@texsubimage array,Fail
-spec@ext_texture_integer@getteximage-clamping gl_arb_texture_rg,Fail
-spec@ext_texture_integer@getteximage-clamping,Fail
-spec@ext_texture_lod_bias@lodbias,Fail
-spec@ext_texture_snorm@texwrap formats bordercolor,Fail
-spec@ext_texture_snorm@texwrap formats bordercolor@GL_ALPHA16_SNORM- border color only,Fail
-spec@ext_texture_snorm@texwrap formats bordercolor@GL_INTENSITY16_SNORM- border color only,Fail
-spec@ext_texture_snorm@texwrap formats bordercolor@GL_LUMINANCE16_ALPHA16_SNORM- border color only,Fail
-spec@ext_texture_snorm@texwrap formats bordercolor@GL_LUMINANCE16_SNORM- border color only,Fail
-spec@ext_texture_snorm@texwrap formats bordercolor@GL_R16_SNORM- border color only,Fail
-spec@ext_texture_snorm@texwrap formats bordercolor@GL_RG16_SNORM- border color only,Fail
-spec@ext_texture_snorm@texwrap formats bordercolor@GL_RGB16_SNORM- border color only,Fail
-spec@ext_texture_snorm@texwrap formats bordercolor@GL_RGBA16_SNORM- border color only,Fail
-spec@ext_texture_snorm@texwrap formats bordercolor-swizzled,Fail
-spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_ALPHA16_SNORM- swizzled- border color only,Fail
-spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_INTENSITY16_SNORM- swizzled- border color only,Fail
-spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_LUMINANCE16_ALPHA16_SNORM- swizzled- border color only,Fail
-spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_LUMINANCE16_SNORM- swizzled- border color only,Fail
-spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_R16_SNORM- swizzled- border color only,Fail
-spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_RG16_SNORM- swizzled- border color only,Fail
-spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_RGB16_SNORM- swizzled- border color only,Fail
-spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_RGBA16_SNORM- swizzled- border color only,Fail
-spec@arb_texture_storage@texture-storage@cube array texture,Fail
-spec@glsl-1.10@execution@glsl-fs-inline-explosion,Crash
-spec@glsl-1.10@execution@glsl-vs-inline-explosion,Crash
-spec@glsl-1.20@compiler@invalid-vec4-array-to-vec3-array-conversion.vert,Fail
-spec@glsl-1.20@execution@clipping@vs-clip-vertex-primitives,Fail
-spec@glsl-1.20@execution@fs-underflow-mul-compare-zero,Fail
-spec@intel_performance_query@intel_performance_query-issue_2235,Fail
-spec@khr_texture_compression_astc@miptree-gles srgb-fp,Fail
-spec@khr_texture_compression_astc@miptree-gles srgb-fp@sRGB decode full precision,Fail
-spec@khr_texture_compression_astc@sliced-3d-miptree-gles srgb-fp,Fail
-spec@khr_texture_compression_astc@sliced-3d-miptree-gles srgb-fp@sRGB decode full precision,Fail
-spec@nv_copy_depth_to_color@nv_copy_depth_to_color 0 0x223344ff,Crash
-spec@nv_copy_depth_to_color@nv_copy_depth_to_color 0 0x76356278,Crash
-spec@nv_copy_depth_to_color@nv_copy_depth_to_color 1 0x223344ff,Crash
-spec@nv_copy_depth_to_color@nv_copy_depth_to_color 1 0x76356278,Crash
-spec@nv_copy_depth_to_color@nv_copy_depth_to_color,Crash
-spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-2d-array.frag,Fail
-spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-2d-array.vert,Fail
-spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-2d.frag,Fail
-spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-2d.vert,Fail
-spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-3d.frag,Fail
-spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-3d.vert,Fail
-spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-cube.frag,Fail
-spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-cube.vert,Fail
-spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-2d-array.frag,Fail
-spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-2d-array.vert,Fail
-spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-2d.frag,Fail
-spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-2d.vert,Fail
-spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-3d.frag,Fail
-spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-3d.vert,Fail
-spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-cube.frag,Fail
-spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-cube.vert,Fail
-spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-2d-array.frag,Fail
-spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-2d-array.vert,Fail
-spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-2d.frag,Fail
-spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-2d.vert,Fail
-spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-3d.frag,Fail
-spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-3d.vert,Fail
-spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-cube.frag,Fail
-spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-cube.vert,Fail
-spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-2d-array.frag,Fail
-spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-2d-array.vert,Fail
-spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-2d.frag,Fail
-spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-2d.vert,Fail
-spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-3d.frag,Fail
-spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-3d.vert,Fail
-spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-cube.frag,Fail
-spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-cube.vert,Fail
-spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-2d-array.frag,Fail
-spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-2d-array.vert,Fail
-spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-2d.frag,Fail
-spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-2d.vert,Fail
-spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-3d.frag,Fail
-spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-3d.vert,Fail
-spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-cube.frag,Fail
-spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-cube.vert,Fail
-spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-2d-array.frag,Fail
-spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-2d-array.vert,Fail
-spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-2d.frag,Fail
-spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-2d.vert,Fail
-spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-3d.frag,Fail
-spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-3d.vert,Fail
-spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-cube.frag,Fail
-spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-cube.vert,Fail
-spec@nv_read_depth@read_depth_gles3,Fail
-spec@oes_egl_image_external_essl3@oes_egl_image_external_essl3,Crash
-spec@oes_shader_io_blocks@compiler@layout-location-aliasing.vert,Fail
diff --git a/src/broadcom/ci/piglit-v3d-rpi4-flakes.txt b/src/broadcom/ci/piglit-v3d-rpi4-flakes.txt
deleted file mode 100644
index 14d2b9b4fd8..00000000000
--- a/src/broadcom/ci/piglit-v3d-rpi4-flakes.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-glx@glx_arb_sync_control@swapbuffersmsc-divisor-zero
-glx@glx_arb_sync_control@waitformsc
-spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=4
-spec@!opengl 1.1@depthstencil-default_fb-drawpixels-32f_24_8_rev samples=2
-spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=4
-spec@arb_occlusion_query@occlusion_query_order
-spec@egl_chromium_sync_control@conformance
diff --git a/src/broadcom/ci/piglit-v3d-rpi4-skips.txt b/src/broadcom/ci/piglit-v3d-rpi4-skips.txt
deleted file mode 100644
index 2c70ff30c3f..00000000000
--- a/src/broadcom/ci/piglit-v3d-rpi4-skips.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-# Slow tests (> 1 minute to run)
-spec@!opengl 1.1@streaming-texture-leak
-spec@!opengl 1.2@tex3d-maxsize
-spec@ext_texture_env_combine@texture-env-combine
-spec@glsl-1.10@execution@loops@glsl-fs-unroll-explosion
-spec@glsl-1.10@execution@loops@glsl-vs-unroll-explosion
-spec@!opengl 1.0@gl-1.0-blend-func
-
-# Extensions not supported
-spec@arb_gpu_shader_fp64.*
-spec@arb_gpu_shader_gpu5.*
-spec@arb_gpu_shader_int64.*
-spec@arb_tessellation_shader.*
-spec@arb_texture_cube_map.*
-spec@glsl-1.30.*
-spec@glsl-1.40.*
-spec@glsl-1.50.*
-spec@glsl-3.*
-spec@glsl-4.*
-spec@glsl-es-3.20.*
diff --git a/src/broadcom/ci/piglit-vc4-rpi3-flakes.txt b/src/broadcom/ci/piglit-vc4-rpi3-flakes.txt
deleted file mode 100644
index afb7a908c87..00000000000
--- a/src/broadcom/ci/piglit-vc4-rpi3-flakes.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-glx@glx-multi-window-single-context
-shaders@glsl-vs-loop
-shaders@glsl-vs-loop-nested
-spec@arb_framebuffer_srgb@blit renderbuffer srgb single_sampled enabled clear
-spec@egl_chromium_sync_control@conformance
-spec@ext_packed_depth_stencil@fbo-stencil-gl_depth24_stencil8-readpixels
-spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=2
-spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=4
diff --git a/src/broadcom/ci/piglit-vc4-rpi3-skips.txt b/src/broadcom/ci/piglit-vc4-rpi3-skips.txt
deleted file mode 100644
index ae25a28bb9a..00000000000
--- a/src/broadcom/ci/piglit-vc4-rpi3-skips.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-# Slow tests (> 1 minute to run)
-spec@ext_framebuffer_multisample@accuracy
-glx@glx-multithread-texture
-spec@arb_internalformat_query2@all internalformat_<x>_type pname checks
-spec@!opengl 1.1@streaming-texture-leak
-spec@!opengl 1.0@gl-1.0-blend-func
-
-# Extensions not supported
-spec@arb_gpu_shader_fp64.*
-spec@arb_gpu_shader_gpu5.*
-spec@arb_gpu_shader_int64.*
-spec@arb_tessellation_shader.*
-spec@arb_texture_cube_map.*
-spec@glsl-1.30.*
-spec@glsl-1.40.*
-spec@glsl-1.50.*
-spec@glsl-3.*
-spec@glsl-4.*
-spec@glsl-es-3.*
diff --git a/src/broadcom/ci/traces-broadcom.yml b/src/broadcom/ci/traces-broadcom.yml
new file mode 100644
index 00000000000..d330ad0dcc8
--- /dev/null
+++ b/src/broadcom/ci/traces-broadcom.yml
@@ -0,0 +1,205 @@
+%YAML 1.2
+---
+traces-db:
+ download-url: "http://192.168.40.131:8888/cache/?uri=https://s3.freedesktop.org/mesa-tracie-public/"
+
+traces:
+ 0ad/0ad-v2.trace:
+ broadcom-rpi4:
+ checksum: 8bdca9e63f483ee71970075842f003db
+
+ behdad-glyphy/glyphy-v2.trace:
+ broadcom-rpi4:
+ checksum: ea49462ff1545f21506dbd7b5028df45
+
+ blender/blender-demo-cube_diorama.trace:
+ broadcom-rpi4:
+ label: [unsupported]
+ text: needs GL 4.5
+
+ blender/blender-demo-ellie_pose.trace:
+ broadcom-rpi4:
+ label: [unsupported]
+ text: needs GL 4.5
+
+ filament/filament-default.trace:
+ broadcom-rpi4:
+ label: [unsupported]
+ text: needs GL 4.1
+
+ glxgears/glxgears-2-v2.trace:
+ broadcom-rpi4:
+ label: [skip, flakes]
+ text: "Often fails when running on xwayland, with what looks like an incorrect resolution"
+ checksum: 2a9c5e35fa5693fd7d3a76f7b9746edb
+
+ godot/godot-thrive.trace:
+ broadcom-rpi4:
+ label: [unsupported]
+ text: needs GL 3.3
+
+ godot/godot-tps-gles3-high.trace:
+ broadcom-rpi4:
+ label: [unsupported]
+ text: needs GL 3.3
+
+ gputest/furmark-v2.trace:
+ broadcom-rpi4:
+ checksum: 800b2be5981d7e1a6570643f7dfd9a33
+
+ gputest/gimark-v2.trace:
+ broadcom-rpi4:
+ label: [unsupported]
+ text: needs GL 3.3
+
+ gputest/pixmark-julia-fp32-v2.trace:
+ broadcom-rpi4:
+ label: [skip, flakes]
+ checksum: be70fc9e3829fff5ad1b6ecfb6fa551c
+
+ gputest/pixmark-julia-fp64-v2.trace:
+ broadcom-rpi4:
+ label: [unsupported]
+ text: needs GL 4.0
+
+ gputest/pixmark-volplosion-v2.trace:
+ broadcom-rpi4:
+ checksum: 03f6b1c064af4e7eb117b800893cdba6
+
+ gputest/plot3d-v2.trace:
+ broadcom-rpi4:
+ checksum: 1ef33ad22679107a256501c79bfd9e7c
+
+ gputest/tessmark-v2.trace:
+ broadcom-rpi4:
+ label: [unsupported]
+ text: needs GL 4.0
+
+ gputest/triangle-v2.trace:
+ broadcom-rpi4:
+ checksum: df6df2af5fecfa42b5c2c332b726e93c
+
+ humus/AmbientAperture-v2.trace:
+ broadcom-rpi4:
+ checksum: a2d2a0141384a23e91ed30a27ed46bfe
+
+ humus/CelShading-v2.trace:
+ broadcom-rpi4:
+ checksum: 1135888a0e8723bbcded5ef9f0925964
+
+ humus/DynamicBranching3-v2.trace:
+ broadcom-rpi4:
+ checksum: 68011c66cfd83aa8a6b568de7c726d49
+
+ humus/HDR-v2.trace:
+ broadcom-rpi4:
+ checksum: de024f342418b578841f98ce697de8b5
+
+ humus/Portals-v2.trace:
+ broadcom-rpi4:
+ checksum: 269b9572113d6991cf58c96a833502bf
+
+ humus/RaytracedShadows-v2.trace:
+ broadcom-rpi4:
+ checksum: 6b572f241f4f9ee001ef849d10d03cc5
+
+ humus/VolumetricFogging2-v2.trace:
+ broadcom-rpi4:
+ checksum: d3b89dfaff0277be4b4b2ad2cf055d54
+
+ jvgs/jvgs-d27fb67-v2.trace:
+ broadcom-rpi4:
+ checksum: 831138a408cc9557528ef68381b080f2
+
+ neverball/neverball-v2.trace:
+ broadcom-rpi4:
+ checksum: c8e8ee352bdb303e4ed144b69272575e
+
+ nheko/nheko-colors.trace:
+ broadcom-rpi4:
+ checksum: 922597b0203ff18d6e430002bcf32ef4
+
+ supertuxkart/supertuxkart-mansion-egl-gles-v2.trace:
+ broadcom-rpi4:
+ checksum: 93fe17a18ab10d862b5a42b4ea05a658
+
+ valve/counterstrike-source-v2.trace:
+ broadcom-rpi4:
+ label: [skip, timeout]
+
+ valve/counterstrike-v2.trace:
+ broadcom-rpi4:
+ checksum: 547f6435bf21458e518bbcb2161962ab
+
+ valve/half-life-2-v2.trace:
+ broadcom-rpi4:
+ label: [crash]
+      text: v3d42_create_texture_shader_state_bo assertion about serial_id
+
+ valve/portal-2-v2.trace:
+ broadcom-rpi4:
+ label: [skip, timeout]
+
+ paraview/pv-manyspheres-v2.trace:
+ broadcom-rpi4:
+ label: [unsupported]
+ text: needs GL 3.3
+
+ paraview/pv-waveletcontour-v2.trace:
+ broadcom-rpi4:
+ label: [unsupported]
+ text: needs GL 3.3
+
+ paraview/pv-waveletvolume-v2.trace:
+ broadcom-rpi4:
+ label: [unsupported]
+ text: needs GL 3.3
+
+ pathfinder/canvas_moire-v2.trace:
+ broadcom-rpi4:
+ label: [unsupported]
+ text: needs GL 3.3
+
+ pathfinder/canvas_text_v2-v2.trace:
+ broadcom-rpi4:
+ label: [unsupported]
+ text: needs GL 3.3
+
+ pathfinder/demo-v2.trace:
+ broadcom-rpi4:
+ label: [unsupported]
+ text: needs GL 3.3
+
+ pioneer/pioneer.trace:
+ broadcom-rpi4:
+ label: [unsupported]
+ text: needs GL 3.2
+
+ ror/ror-default.trace:
+ broadcom-rpi4:
+ label: [skip, flakes]
+ checksum: 533edca21409981b4983db846de4355e
+
+ thedarkmod/thedarkmod.trace:
+ broadcom-rpi4:
+ label: [unsupported]
+ text: needs GL 3.3
+
+ unvanquished/unvanquished-lowest.trace:
+ broadcom-rpi4:
+ label: [unsupported]
+ text: needs GL 3.2
+
+ unvanquished/unvanquished-ultra.trace:
+ broadcom-rpi4:
+ label: [unsupported]
+ text: needs GL 3.2
+
+ warzone2100/warzone2100-default.trace:
+ broadcom-rpi4:
+ label: [unsupported]
+ text: needs GLSL 1.50
+
+ xonotic/xonotic-keybench-high-v2.trace:
+ broadcom-rpi4:
+ checksum: 3bc4ca2efa5a7b35701a8daad378e565
diff --git a/src/broadcom/cle/gen_pack_header.py b/src/broadcom/cle/gen_pack_header.py
index 0090b616d50..1cc2446d0bd 100644
--- a/src/broadcom/cle/gen_pack_header.py
+++ b/src/broadcom/cle/gen_pack_header.py
@@ -25,9 +25,8 @@
import xml.parsers.expat
import re
import sys
-import copy
-license = """/* Generated code, see v3d_packet_v21.xml, v3d_packet_v33.xml and gen_pack_header.py */
+license = """/* Generated code, see vc4_packet.xml, v3d_packet.xml and gen_pack_header.py */
"""
pack_header = """%(license)s
@@ -113,7 +112,7 @@ class Field(object):
self.type = attrs["type"]
if self.type == 'bool' and self.start != self.end:
- print("#error Field {} has bool type but more than one bit of size".format(self.name));
+ print("#error Field {} has bool type but more than one bit of size".format(self.name))
if "prefix" in attrs:
self.prefix = safe_name(attrs["prefix"]).upper()
@@ -215,7 +214,7 @@ class Group(object):
last_byte = field.end // 8
for b in range(first_byte, last_byte + 1):
- if not b in bytes:
+ if b not in bytes:
bytes[b] = self.Byte()
bytes[b].fields.append(field)
@@ -240,7 +239,7 @@ class Group(object):
for index in range(self.length):
# Handle MBZ bytes
- if not index in bytes:
+ if index not in bytes:
print(" cl[%2d] = 0;" % index)
continue
byte = bytes[index]
@@ -276,7 +275,6 @@ class Group(object):
byte_start = index * 8
- v = None
prefix = " cl[%2d] =" % index
field_index = 0
@@ -296,46 +294,46 @@ class Group(object):
value = "%s - 1" % value
if field.type == "mbo":
- s = "__gen_mbo(%d, %d)" % \
+ s = "util_bitpack_ones(%d, %d)" % \
(start, end)
elif field.type == "address":
extra_shift = (31 - (end - start)) // 8 * 8
s = "__gen_address_offset(&values->%s)" % byte.address.name
elif field.type == "uint":
- s = "__gen_uint(%s, %d, %d)" % \
+ s = "util_bitpack_uint(%s, %d, %d)" % \
(value, start, end)
elif field.type in self.parser.enums:
- s = "__gen_uint(%s, %d, %d)" % \
+ s = "util_bitpack_uint(%s, %d, %d)" % \
(value, start, end)
elif field.type == "int":
- s = "__gen_sint(%s, %d, %d)" % \
+ s = "util_bitpack_sint(%s, %d, %d)" % \
(value, start, end)
elif field.type == "bool":
- s = "__gen_uint(%s, %d, %d)" % \
+ s = "util_bitpack_uint(%s, %d, %d)" % \
(value, start, end)
elif field.type == "float":
s = "#error %s float value mixed in with other fields" % name
elif field.type == "f187":
- s = "__gen_uint(fui(%s) >> 16, %d, %d)" % \
+ s = "util_bitpack_uint(fui(%s) >> 16, %d, %d)" % \
(value, start, end)
elif field.type == "offset":
s = "__gen_offset(%s, %d, %d)" % \
(value, start, end)
elif field.type == 'ufixed':
- s = "__gen_ufixed(%s, %d, %d, %d)" % \
+ s = "util_bitpack_ufixed(%s, %d, %d, %d)" % \
(value, start, end, field.fractional_size)
elif field.type == 'sfixed':
- s = "__gen_sfixed(%s, %d, %d, %d)" % \
+ s = "util_bitpack_sfixed(%s, %d, %d, %d)" % \
(value, start, end, field.fractional_size)
elif field.type in self.parser.structs:
- s = "__gen_uint(v%d_%d, %d, %d)" % \
+ s = "util_bitpack_uint(v%d_%d, %d, %d)" % \
(index, field_index, start, end)
field_index = field_index + 1
else:
print("/* unhandled field %s, type %s */\n" % (name, field.type))
s = None
- if not s == None:
+ if s is not None:
shift = byte_start - field_byte_start + extra_shift
if shift:
s = "%s >> %d" % (s, shift)
@@ -383,7 +381,6 @@ class Group(object):
convert = "__gen_unpack_sfixed"
else:
print("/* unhandled field %s, type %s */\n" % (field.name, field.type))
- s = None
plusone = ""
if field.minus_one:
@@ -545,9 +542,9 @@ class Parser(object):
def emit_header(self, name):
default_fields = []
for field in self.group.fields:
- if not type(field) is Field:
+ if type(field) is not Field:
continue
- if field.default == None:
+ if field.default is None:
continue
default_fields.append(" .%-35s = %6d" % (field.name, field.default))
@@ -577,7 +574,7 @@ class Parser(object):
return
name = self.register
- if not self.reg_num == None:
+ if self.reg_num is not None:
print('#define %-33s 0x%04x' %
(self.gen_prefix(name + "_num"), self.reg_num))
diff --git a/src/broadcom/cle/meson.build b/src/broadcom/cle/meson.build
index 4cab2b38dda..da88cd220a5 100644
--- a/src/broadcom/cle/meson.build
+++ b/src/broadcom/cle/meson.build
@@ -18,27 +18,25 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
-# [version, cle XML version]
+# [version, cle XML file]
v3d_versions = [
- [21, 21],
- [33, 33],
- [41, 33],
- [42, 33]
+ [21, 'vc4_packet.xml'],
+ [42, 'v3d_packet.xml'],
+ [71, 'v3d_packet.xml']
]
v3d_xml_files = []
v3d_xml_pack = []
foreach _v : v3d_versions
v = _v[0]
- xmlver = _v[1]
- f = 'v3d_packet_v@0@.xml'.format(xmlver)
+ xmlfile = _v[1]
_name = 'v3d_packet_v@0@_pack.h'.format(v)
- if not v3d_xml_files.contains(f)
- v3d_xml_files += f
+ if not v3d_xml_files.contains(xmlfile)
+ v3d_xml_files += xmlfile
endif
v3d_xml_pack += custom_target(
_name,
- input : ['gen_pack_header.py', f],
+ input : ['gen_pack_header.py', xmlfile],
output : _name,
command : [prog_python, '@INPUT@', '@0@'.format(v)],
capture : true,
@@ -47,7 +45,7 @@ endforeach
v3d_xml_h = custom_target(
'v3d_xml.h',
- input : ['../../intel/genxml/gen_zipped_file.py', v3d_xml_files],
+ input : ['../../util/gen_zipped_xml_file.py', v3d_xml_files],
output : 'v3d_xml.h',
command : [prog_python, '@INPUT@'],
capture : true,
@@ -59,9 +57,9 @@ if dep_expat.found()
endif
libbroadcom_cle = static_library(
- ['broadcom_cle', v3d_xml_h],
- 'v3d_decoder.c',
- include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom],
+ 'broadcom_cle',
+ ['v3d_decoder.c', v3d_xml_h],
+ include_directories : [inc_include, inc_src, inc_broadcom],
c_args : [no_override_init_args, expat_args],
gnu_symbol_visibility : 'hidden',
dependencies : [dep_libdrm, dep_valgrind, dep_expat, dep_zlib],
diff --git a/src/broadcom/cle/v3d_decoder.c b/src/broadcom/cle/v3d_decoder.c
index 97dd8ce8423..46cd152e599 100644
--- a/src/broadcom/cle/v3d_decoder.c
+++ b/src/broadcom/cle/v3d_decoder.c
@@ -267,51 +267,6 @@ get_register_offset(const char **atts, uint32_t *offset)
return;
}
-static void
-get_start_end_pos(int *start, int *end)
-{
- /* start value has to be mod with 32 as we need the relative
- * start position in the first DWord. For the end position, add
- * the length of the field to the start position to get the
- * relative postion in the 64 bit address.
- */
- if (*end - *start > 32) {
- int len = *end - *start;
- *start = *start % 32;
- *end = *start + len;
- } else {
- *start = *start % 32;
- *end = *end % 32;
- }
-
- return;
-}
-
-static inline uint64_t
-mask(int start, int end)
-{
- uint64_t v;
-
- v = ~0ULL >> (63 - end + start);
-
- return v << start;
-}
-
-static inline uint64_t
-field(uint64_t value, int start, int end)
-{
- get_start_end_pos(&start, &end);
- return (value & mask(start, end)) >> (start);
-}
-
-static inline uint64_t
-field_address(uint64_t value, int start, int end)
-{
- /* no need to right shift for address/offset */
- get_start_end_pos(&start, &end);
- return (value & mask(start, end));
-}
-
static struct v3d_type
string_to_type(struct parser_context *ctx, const char *s)
{
diff --git a/src/broadcom/cle/v3d_packet_v33.xml b/src/broadcom/cle/v3d_packet.xml
index de80a6b64a1..09dde392fac 100644
--- a/src/broadcom/cle/v3d_packet_v33.xml
+++ b/src/broadcom/cle/v3d_packet.xml
@@ -1,4 +1,4 @@
-<vcxml gen="3.3" min_ver="33" max_ver="42">
+<vcxml gen="3.3" min_ver="42" max_ver="71">
<enum name="Compare Function" prefix="V3D_COMPARE_FUNC">
<value name="NEVER" value="0"/>
@@ -69,30 +69,7 @@
<value name="TRIANGLE_FAN_TF" value="22"/>
</enum>
- <enum name="TMU Filter" prefix="V3D_TMU_FILTER" max_ver="33">
- <!-- Names are mip filter, min filter, mag filter -->
- <value name="MIN_LIN_MIP_NONE_MAG_LIN" value="0"/>
- <value name="MIN_LIN_MIP_NONE_MAG_NEAR" value="1"/>
- <value name="MIN_NEAR_MIP_NONE_MAG_LIN" value="2"/>
- <value name="MIN_NEAR_MIP_NONE_MAG_NEAR" value="3"/>
-
- <value name="MIN_NEAR_MIP_NEAR_MAG_LIN" value="4"/>
- <value name="MIN_NEAR_MIP_NEAR_MAG_NEAR" value="5"/>
- <value name="MIN_NEAR_MIP_LIN_MAG_LIN" value="6"/>
- <value name="MIN_NEAR_MIP_LIN_MAG_NEAR" value="7"/>
-
- <value name="MIN_LIN_MIP_NEAR_MAG_LIN" value="8"/>
- <value name="MIN_LIN_MIP_NEAR_MAG_NEAR" value="9"/>
- <value name="MIN_LIN_MIP_LIN_MAG_LIN" value="10"/>
- <value name="MIN_LIN_MIP_LIN_MAG_NEAR" value="11"/>
-
- <value name="ANISOTROPIC_2_1" value="12"/>
- <value name="ANISOTROPIC_4_1" value="13"/>
- <value name="ANISOTROPIC_8_1" value="14"/>
- <value name="ANISOTROPIC_16_1" value="15"/>
- </enum>
-
- <enum name="Border Color Mode" prefix="V3D_BORDER_COLOR" min_ver="41">
+ <enum name="Border Color Mode" prefix="V3D_BORDER_COLOR">
<value name="0000" value="0"/>
<value name="0001" value="1"/>
<value name="1111" value="2"/>
@@ -107,7 +84,7 @@
<value name="MIRROR_ONCE" value="4"/>
</enum>
- <enum name="TMU Op" prefix="V3D_TMU_OP" min_ver="41">
+ <enum name="TMU Op" prefix="V3D_TMU_OP">
<value name="Write ADD, Read Prefetch" value="0"/>
<value name="Write SUB, Read Clear" value="1"/>
<value name="Write XCHG, Read Flush" value="2"/>
@@ -167,11 +144,34 @@
<value name="depth_16" value="2"/>
</enum>
- <enum name="Render Target Clamp" prefix="V3D_RENDER_TARGET_CLAMP" min_ver="41">
+ <enum name="Render Target Clamp" prefix="V3D_RENDER_TARGET_CLAMP" max_ver="42">
<value name="none" value="0"/> <!-- no clamping -->
<value name="norm" value="1"/> <!-- [0,1] for f16 -->
<value name="pos" value="2"/> <!-- [0, for f16 -->
- <value name="int" value="3" min_ver="42"/> <!-- clamp to integer RT's range -->
+ <value name="int" value="3"/> <!-- clamp to integer RT's range -->
+ </enum>
+
+ <enum name="Render Target Type Clamp" prefix="V3D_RENDER_TARGET_TYPE_CLAMP" min_ver="71">
+ <value name="8i" value="0"/> <!-- no clamping -->
+ <value name="16i" value="1"/> <!-- no clamping -->
+ <value name="32i" value="2"/> <!-- no clamping -->
+ <value name="8ui" value="4"/> <!-- no clamping -->
+ <value name="16ui" value="5"/> <!-- no clamping -->
+ <value name="32ui" value="6"/> <!-- no clamping -->
+ <value name="8" value="8"/> <!-- no clamping -->
+ <value name="16f" value="9"/> <!-- no clamping -->
+ <value name="32f" value="10"/> <!-- no clamping -->
+ <value name="8i_clamped" value="16"/> <!-- clamp to integer RT's range -->
+ <value name="16i_clamped" value="17"/> <!-- clamp to integer RT's range -->
+ <value name="32i_clamped" value="18"/> <!-- clamp to integer RT's range -->
+ <value name="8ui_clamped" value="20"/> <!-- clamp to integer RT's range -->
+ <value name="16ui_clamped" value="21"/> <!-- clamp to integer RT's range -->
+ <value name="32ui_clamped" value="22"/> <!-- clamp to integer RT's range -->
+ <value name="16f_clamp_norm" value="24"/> <!-- [0,1] for f16 -->
+ <value name="16f_clamp_pos" value="25"/> <!-- [0, for f16 -->
+ <value name="16f_clamp_pq" value="26"/> <!-- PQ lin range, colour to [0, 125], alpha to [0, 1] for f16 -->
+ <value name="16f_clamp_hlg" value="27"/> <!-- HLG lin range, colour to [0, 12], alpha to [0, 1] for f16 -->
+ <value name="invalid" value="32"/>
</enum>
<!---
@@ -261,22 +261,27 @@
<value name="rgba8ui" value="34"/>
<value name="rg8ui" value="35"/>
<value name="r8ui" value="36"/>
- <value name="srgbx8" value="37" max_ver="33"/>
- <value name="rgbx8" value="38" max_ver="33"/>
- <value name="bstc" value="39" min_ver="41"/>
- <value name="d32f" value="40" min_ver="41"/>
- <value name="d24" value="41" min_ver="41"/>
- <value name="d16" value="42" min_ver="41"/>
- <value name="d24s8" value="43" min_ver="41"/>
- <value name="s8" value="44" min_ver="41"/>
- <value name="rgba5551" value="45" min_ver="41"/>
- </enum>
-
- <enum name="Z/S Output Image Format" prefix="V3D_OUTPUT_IMAGE_FORMAT_ZS" max_ver="33">
- <value name="depth_component32f" value="0"/>
- <value name="depth_component24" value="1"/> <!-- depth low, pad high -->
- <value name="depth_component16" value="2"/>
- <value name="depth24_stencil8" value="3"/> <!-- stencil low, depth high -->
+ <value name="bstc8" value="39"/>
+ <value name="d32f" value="40"/>
+ <value name="d24" value="41"/>
+ <value name="d16" value="42"/>
+ <value name="d24s8" value="43"/>
+ <value name="s8" value="44"/>
+ <value name="rgba5551" value="45"/>
+ <value name="bstc8_srgb" value="46" min_ver="71"/>
+ <value name="bstc10" value="47" min_ver="71"/>
+ <value name="bstc10_srgb" value="48" min_ver="71"/>
+ <value name="bstc10_pq" value="49" min_ver="71"/>
+ <value name="rgba10x6" value="50" min_ver="71"/>
+ <value name="bstc10_hlg" value="55" min_ver="71"/>
+ <value name="rgba10x6_hlg" value="56" min_ver="71"/>
+ <value name="rgb10_a2_hlg" value="57" min_ver="71"/>
+ <value name="bstc10_pq_bt1886" value="58" min_ver="71"/>
+ <value name="rgba10x6_pq_bt1886" value="59" min_ver="71"/>
+ <value name="rgb10_a2_pq_bt1886" value="60" min_ver="71"/>
+ <value name="bstc10_hlg_bt1886" value="61" min_ver="71"/>
+ <value name="rgba10x6_hlg_bt1886" value="62" min_ver="71"/>
+ <value name="rgb10_a2_hlg_bt1886" value="63" min_ver="71"/>
</enum>
<enum name="Dither Mode" prefix="V3D_DITHER_MODE">
@@ -299,7 +304,7 @@
<value name="packed complete patches" value="2"/>
</enum>
- <enum name="Primitve counters" prefix="V3D_PRIM_COUNTS">
+ <enum name="Primitive counters" prefix="V3D_PRIM_COUNTS">
<value name="tf_words_buffer0" value="0"/>
<value name="tf_words_buffer1" value="1"/>
<value name="tf_words_buffer2" value="2"/>
@@ -309,6 +314,17 @@
<value name="tf_overflow" value="6"/>
</enum>
+ <enum name="Line Rasterization" prefix="V3D_LINE_RASTERIZATION">
+ <value name="diamond exit" value="0"/>
+ <value name="perp end caps" value="1"/>
+ </enum>
+
+ <enum name="Z Clip Mode" prefix="V3D_Z_CLIP_MODE">
+ <value name="NONE" value="0"/>
+ <value name="MIN_ONE_TO_ONE" value="1"/>
+ <value name="ZERO_TO_ONE" value="2"/>
+ </enum>
+
<packet code="0" name="Halt"/>
<packet code="1" name="NOP"/>
<packet code="4" name="Flush"/>
@@ -362,57 +378,18 @@
<field name="column number in supertiles" size="8" start="0" type="uint"/>
</packet>
- <packet code="24" shortname="store_subsample" name="Store Multi-Sample Resolved Tile Color Buffer" cl="R" max_ver="33"/>
-
- <packet code="25" shortname="store_subsample_ex" name="Store Multi-Sample Resolved Tile Color Buffer (extended)" cl="R" max_ver="33">
- <field name="Disable Color Buffer write" size="8" start="8" type="uint"/>
- <field name="Enable Z write" size="1" start="7" type="bool"/>
- <field name="Enable Stencil write" size="1" start="6" type="bool"/>
- <!-- bit 5 unused -->
- <field name="Disable Color buffer(s) clear on write" size="1" start="4" type="bool"/>
- <field name="Disable Stencil buffer clear on write" size="1" start="3" type="bool"/>
- <field name="Disable Z buffer clear on write" size="1" start="2" type="bool"/>
- <field name="Disable fast opportunistic write out in multisample mode" size="1" start="1" type="bool"/>
- <field name="Last Tile of Frame" size="1" start="0" type="bool"/>
- </packet>
-
- <packet code="25" shortname="clear" name="Clear Tile Buffers" cl="R" min_ver="41">
+ <packet code="25" shortname="clear" name="Clear Tile Buffers" cl="R" max_ver="42">
<field name="Clear Z/Stencil Buffer" size="1" start="1" type="bool"/>
<field name="Clear all Render Targets" size="1" start="0" type="bool"/>
</packet>
- <packet code="26" shortname="load" name="Reload Tile Color Buffer" cl="R" max_ver="33">
- <field name="Disable Color Buffer load" size="8" start="8" type="uint"/>
- <field name="Enable Z load" size="1" start="7" type="bool"/>
- <field name="Enable Stencil load" size="1" start="6" type="bool"/>
- </packet>
+ <packet code="25" shortname="clear_rt" name="Clear Render Targets" cl="R" min_ver="71"/>
- <packet code="26" shortname="end_loads" name="End of Loads" cl="R" min_ver="41"/>
+ <packet code="26" shortname="end_loads" name="End of Loads" cl="R"/>
<packet code="27" shortname="end_tile" name="End of Tile Marker" cl="R"/>
- <packet code="29" shortname="store_general" name="Store Tile Buffer General" cl="R" max_ver="33">
- <field name="Address" size="24" start="24" type="address"/>
- <field name="Padded height of output image in UIF blocks" size="13" start="11" type="uint"/>
- <field name="XOR UIF" size="1" start="10" type="bool"/>
- <field name="Last Tile of Frame" size="1" start="8" type="bool"/>
- <field name="Disable Color buffer(s) clear on write" size="1" start="7" type="bool"/>
- <field name="Disable Stencil buffer clear on write" size="1" start="6" type="bool"/>
- <field name="Disable Z buffer clear on write" size="1" start="5" type="bool"/>
- <field name="Raw Mode" size="1" start="4" type="bool"/>
- <field name="Buffer to Store" size="4" start="0" type="uint">
- <value name="Render target 0" value="0"/>
- <value name="Render target 1" value="1"/>
- <value name="Render target 2" value="2"/>
- <value name="Render target 3" value="3"/>
- <value name="None" value="8"/>
- <value name="Z" value="9"/>
- <value name="Stencil" value="10"/>
- <value name="Z+Stencil" value="11"/>
- </field>
- </packet>
-
- <packet code="29" shortname="store" name="Store Tile Buffer General" cl="R" min_ver="41">
+ <packet code="29" shortname="store" name="Store Tile Buffer General" cl="R">
<field name="Address" size="32" start="64" type="address"/>
<!-- used for y flip -->
@@ -438,6 +415,10 @@
<value name="Render target 1" value="1"/>
<value name="Render target 2" value="2"/>
<value name="Render target 3" value="3"/>
+ <value name="Render target 4" value="4" min_ver="71"/>
+ <value name="Render target 5" value="5" min_ver="71"/>
+ <value name="Render target 6" value="6" min_ver="71"/>
+ <value name="Render target 7" value="7" min_ver="71"/>
<value name="None" value="8"/>
<value name="Z" value="9"/>
<value name="Stencil" value="10"/>
@@ -445,24 +426,7 @@
</field>
</packet>
- <packet code="30" shortname="load_general" name="Load Tile Buffer General" cl="R" max_ver="33">
- <field name="Address" size="24" start="24" type="address"/>
- <field name="Padded height of output image in UIF blocks" size="13" start="11" type="uint"/>
- <field name="XOR UIF" size="1" start="10" type="bool"/>
- <field name="Raw Mode" size="1" start="4" type="bool"/>
- <field name="Buffer to Load" size="4" start="0" type="uint">
- <value name="Render target 0" value="0"/>
- <value name="Render target 1" value="1"/>
- <value name="Render target 2" value="2"/>
- <value name="Render target 3" value="3"/>
- <value name="None" value="8"/>
- <value name="Z" value="9"/>
- <value name="Stencil" value="10"/>
- <value name="Z+Stencil" value="11"/>
- </field>
- </packet>
-
- <packet code="30" shortname="load" name="Load Tile Buffer General" cl="R" min_ver="41">
+ <packet code="30" shortname="load" name="Load Tile Buffer General" cl="R">
<field name="Address" size="32" start="64" type="address"/>
<!-- used for y flip -->
@@ -496,23 +460,7 @@
<packet code="31" shortname="tf_draw_flush_and_count" name="Transform Feedback Flush and Count"/>
- <packet code="32" name="Indexed Prim List" cl="B" max_ver="33">
- <field name="Minimum index" size="32" start="104" type="uint"/>
- <field name="Enable Primitive Restarts" size="1" start="103" type="bool"/>
- <field name="Maximum index" size="31" start="72" type="uint"/>
- <field name="Address of Indices List" size="32" start="40" type="address"/>
- <field name="Length" size="32" start="8" type="uint"/>
-
- <field name="Index type" size="2" start="6" type="uint">
- <value name="Index type 8-bit" value="0"/>
- <value name="Index type 16-bit" value="1"/>
- <value name="Index type 32-bit" value="2"/>
- </field>
-
- <field name="mode" size="5" start="0" type="Primitive"/>
- </packet>
-
- <packet code="32" name="Indexed Prim List" cl="B" min_ver="41">
+ <packet code="32" name="Indexed Prim List" cl="B">
<field name="Index Offset" size="32" start="40" type="uint"/>
<field name="Enable Primitive Restarts" size="1" start="39" type="bool"/>
@@ -527,23 +475,7 @@
<field name="mode" size="6" start="0" type="Primitive"/>
</packet>
- <packet code="33" name="Indirect Indexed Instanced Prim List" cl="B" max_ver="33">
- <field name="Stride in Multiples of 4 Bytes" size="8" start="104" type="uint"/>
- <field name="Address of Indices List" size="32" start="72" type="address"/>
- <field name="Address" size="32" start="40" type="address"/>
- <field name="Enable Primitive Restarts" size="1" start="39" type="bool"/>
- <field name="Number of Draw Indirect Indexed Records" size="31" start="8" type="uint"/>
-
- <field name="Index type" size="2" start="6" type="uint">
- <value name="Index type 8-bit" value="0"/>
- <value name="Index type 16-bit" value="1"/>
- <value name="Index type 32-bit" value="2"/>
- </field>
-
- <field name="mode" size="6" start="0" type="Primitive"/>
- </packet>
-
- <packet code="33" name="Indirect Indexed Instanced Prim List" cl="B" min_ver="41">
+ <packet code="33" name="Indirect Indexed Instanced Prim List" cl="B">
<field name="Stride in Multiples of 4 Bytes" size="8" start="72" type="uint"/>
<field name="Address" size="32" start="40" type="address"/>
<field name="Enable Primitive Restarts" size="1" start="39" type="bool"/>
@@ -558,23 +490,7 @@
<field name="mode" size="6" start="0" type="Primitive"/>
</packet>
- <packet code="34" name="Indexed Instanced Prim List" cl="B" max_ver="33">
- <field name="Enable Primitive Restarts" size="1" start="135" type="bool"/>
- <field name="Maximum index" size="31" start="104" type="uint"/>
- <field name="Address of Indices List" size="32" start="72" type="address"/>
- <field name="Number of Instances" size="32" start="40" type="uint"/>
- <field name="Instance Length" size="32" start="8" type="uint"/>
-
- <field name="Index type" size="2" start="6" type="uint">
- <value name="Index type 8-bit" value="0"/>
- <value name="Index type 16-bit" value="1"/>
- <value name="Index type 32-bit" value="2"/>
- </field>
-
- <field name="mode" size="5" start="0" type="Primitive"/>
- </packet>
-
- <packet code="34" name="Indexed Instanced Prim List" cl="B" min_ver="41">
+ <packet code="34" name="Indexed Instanced Prim List" cl="B">
<field name="Index Offset" size="32" start="72" type="uint"/>
<field name="Number of Instances" size="32" start="40" type="uint"/>
<field name="Enable Primitive Restarts" size="1" start="39" type="bool"/>
@@ -626,16 +542,16 @@
<field name="Base Vertex" size="32" start="0" type="uint"/>
</packet>
- <packet code="44" name="Index Buffer Setup" cl="B" min_ver="41">
+ <packet code="44" name="Index Buffer Setup" cl="B">
<field name="Address" size="32" start="0" type="address"/>
<field name="Size" size="32" start="32" type="uint"/>
</packet>
- <packet code="54" name="Set InstanceID" cl="B" min_ver="41">
+ <packet code="54" name="Set InstanceID" cl="B">
<field name="Instance ID" size="32" start="0" type="uint"/>
</packet>
- <packet code="55" name="Set PrimitiveID" cl="B" min_ver="41">
+ <packet code="55" name="Set PrimitiveID" cl="B">
<field name="Primitive ID" size="32" start="0" type="uint"/>
</packet>
@@ -662,22 +578,22 @@
<field name="number of attribute arrays" size="5" start="0" type="uint"/>
</packet>
- <packet code="65" shortname="gl_t_shader" name="GL Shader State including TS" min_ver="41">
+ <packet code="65" shortname="gl_t_shader" name="GL Shader State including TS">
<field name="address" size="27" start="5" type="address"/>
<field name="number of attribute arrays" size="5" start="0" type="uint"/>
</packet>
- <packet code="66" shortname="gl_g_shader" name="GL Shader State including GS" min_ver="41">
+ <packet code="66" shortname="gl_g_shader" name="GL Shader State including GS">
<field name="address" size="27" start="5" type="address"/>
<field name="number of attribute arrays" size="5" start="0" type="uint"/>
</packet>
- <packet code="67" shortname="gl_tg_shader" name="GL Shader State including TS/GS" min_ver="41">
+ <packet code="67" shortname="gl_tg_shader" name="GL Shader State including TS/GS">
<field name="address" size="27" start="5" type="address"/>
<field name="number of attribute arrays" size="5" start="0" type="uint"/>
</packet>
- <packet code="71" name="VCM Cache Size" min_ver="41">
+ <packet code="71" name="VCM Cache Size">
<field name="Number of 16-vertex batches for rendering" size="4" start="4" type="uint"/>
<field name="Number of 16-vertex batches for binning" size="4" start="0" type="uint"/>
</packet>
@@ -706,23 +622,13 @@
</field>
</packet>
- <packet code="73" name="VCM Cache Size" max_ver="33">
- <field name="Number of 16-vertex batches for rendering" size="4" start="4" type="uint"/>
- <field name="Number of 16-vertex batches for binning" size="4" start="0" type="uint"/>
- </packet>
-
- <packet code="73" name="Transform Feedback Buffer" min_ver="41">
+ <packet code="73" name="Transform Feedback Buffer">
<field name="Buffer Address" size="32" start="32" type="address"/>
<field name="Buffer Size in 32-bit words" size="30" start="2" type="uint"/>
<field name="Buffer Number" size="2" start="0" type="uint"/>
</packet>
- <packet code="74" name="Transform Feedback Enable" max_ver="33">
- <field name="number of 32-bit Output Buffer Address following" size="3" start="8" type="uint"/>
- <field name="number of 16-bit Output Data Specs following" size="5" start="11" type="uint"/>
- </packet>
-
- <packet code="74" name="Transform Feedback Specs" min_ver="41">
+ <packet code="74" name="Transform Feedback Specs">
<field name="Enable" size="1" start="7" type="bool"/>
<field name="Number of 16-bit Output Data Specs following" size="5" start="0" type="uint"/>
</packet>
@@ -742,13 +648,7 @@
<field name="L2T Flush Start" size="32" start="0" type="address"/>
</packet>
- <struct name="Transform Feedback Output Data Spec" max_ver="33">
- <field name="First Shaded Vertex Value to output" size="8" start="0" type="uint"/>
- <field name="Number of consecutive Vertex Values to output as 32-bit values" size="4" start="8" type="uint" minus_one="true"/>
- <field name="Output Buffer to write to" size="2" start="12" type="uint"/>
- </struct>
-
- <struct name="Transform Feedback Output Data Spec" min_ver="41">
+ <struct name="Transform Feedback Output Data Spec">
<field name="First Shaded Vertex Value to output" size="8" start="0" type="uint"/>
<field name="Number of consecutive Vertex Values to output as 32-bit values" size="4" start="8" type="uint" minus_one="true"/>
<field name="Output Buffer to write to" size="2" start="12" type="uint"/>
@@ -771,11 +671,12 @@
<field name="Stencil Ref Value" size="8" start="0" type="uint"/>
</packet>
- <packet code="83" name="Blend Enables" min_ver="41">
+ <packet code="83" name="Blend Enables">
<field name="Mask" size="8" start="0" type="uint"/>
</packet>
- <packet code="84" name="Blend Cfg" max_ver="33">
+ <packet code="84" name="Blend Cfg" max_ver="42">
+ <field name="Render Target Mask" size="4" start="24" type="uint"/>
<field name="Color blend dst factor" size="4" start="20" type="Blend Factor"/>
<field name="Color blend src factor" size="4" start="16" type="Blend Factor"/>
<field name="Color blend mode" size="4" start="12" type="Blend Mode"/>
@@ -784,8 +685,8 @@
<field name="Alpha blend mode" size="4" start="0" type="Blend Mode"/>
</packet>
- <packet code="84" name="Blend Cfg" min_ver="41">
- <field name="Render Target Mask" size="4" start="24" type="uint"/>
+ <packet code="84" name="Blend Cfg" min_ver="71">
+ <field name="Render Target Mask" size="8" start="24" type="uint"/>
<field name="Color blend dst factor" size="4" start="20" type="Blend Factor"/>
<field name="Color blend src factor" size="4" start="16" type="Blend Factor"/>
<field name="Color blend mode" size="4" start="12" type="Blend Mode"/>
@@ -805,16 +706,16 @@
<field name="Mask" size="32" start="0" type="uint"/>
</packet>
- <packet code="88" name="Zero All Centroid Flags" min_ver="41"/>
+ <packet code="88" name="Zero All Centroid Flags" />
- <packet code="89" name="Centroid Flags" min_ver="41">
+ <packet code="89" name="Centroid Flags">
<field name="Centroid Flags for varyings V0*24" size="24" start="8" type="uint"/>
<field name="Action for Centroid Flags of higher numbered varyings" size="2" start="6" type="Varying Flags Action"/>
<field name="Action for Centroid Flags of lower numbered varyings" size="2" start="4" type="Varying Flags Action"/>
<field name="Varying offset V0" size="4" start="0" type="uint"/>
</packet>
- <packet code="91" name="Sample State" min_ver="41">
+ <packet code="91" name="Sample State">
<field name="Coverage" size="16" start="16" type="f187"/>
<field name="Mask" size="4" start="0" type="uint"/>
</packet>
@@ -823,7 +724,12 @@
<field name="address" size="32" start="0" type="address"/>
</packet>
- <packet code="96" name="Cfg Bits">
+ <packet code="93" name="Depth Bounds Test Limits" min_ver="71">
+ <field name="Lower Test Limit" size="32" start="0" type="float"/>
+ <field name="Upper Test Limit" size="32" start="32" type="float"/>
+ </packet>
+
+ <packet code="96" name="Cfg Bits" max_ver="42">
<field name="Direct3D Provoking Vertex" size="1" start="21" type="bool"/>
<field name="Direct3D 'Point-fill' mode" size="1" start="20" type="bool"/>
<field name="Blend enable" size="1" start="19" type="bool"/>
@@ -834,7 +740,26 @@
<field name="Depth-Test Function" size="3" start="12" type="Compare Function"/>
<field name="Direct3D Wireframe triangles mode" size="1" start="11" type="bool"/>
<field name="Rasterizer Oversample Mode" size="2" start="6" type="uint"/>
- <field name="Line Rasterization" size="2" start="4" type="uint"/>
+ <field name="Line Rasterization" size="2" start="4" type="Line Rasterization"/>
+ <field name="Enable Depth Offset" size="1" start="3" type="bool"/>
+ <field name="Clockwise Primitives" size="1" start="2" type="bool"/>
+ <field name="Enable Reverse Facing Primitive" size="1" start="1" type="bool"/>
+ <field name="Enable Forward Facing Primitive" size="1" start="0" type="bool"/>
+ </packet>
+
+ <packet code="96" name="Cfg Bits" min_ver="71">
+ <field name="Z Clipping mode" size="2" start="22" type="Z Clip Mode"/>
+ <field name="Direct3D Provoking Vertex" size="1" start="21" type="bool"/>
+ <field name="Direct3D 'Point-fill' mode" size="1" start="20" type="bool"/>
+ <field name="Blend enable" size="1" start="19" type="bool"/>
+ <field name="Stencil enable" size="1" start="18" type="bool"/>
+ <field name="Z updates enable" size="1" start="15" type="bool"/>
+ <field name="Depth-Test Function" size="3" start="12" type="Compare Function"/>
+ <field name="Direct3D Wireframe triangles mode" size="1" start="11" type="bool"/>
+ <field name="Z Clamp Mode" size="1" start="10" type="bool"/>
+ <field name="Rasterizer Oversample Mode" size="2" start="6" type="uint"/>
+ <field name="Depth Bounds Test Enable" size="1" start="5" type="bool"/>
+ <field name="Line Rasterization" size="1" start="4" type="uint"/>
<field name="Enable Depth Offset" size="1" start="3" type="bool"/>
<field name="Clockwise Primitives" size="1" start="2" type="bool"/>
<field name="Enable Reverse Facing Primitive" size="1" start="1" type="bool"/>
@@ -850,9 +775,9 @@
<field name="Varying offset V0" size="4" start="0" type="uint"/>
</packet>
- <packet code="99" shortname="zero_all_noperspective_flags" name="Zero All Non-perspective Flags" min_ver="41"/>
+ <packet code="99" shortname="zero_all_noperspective_flags" name="Zero All Non-perspective Flags" />
- <packet code="100" shortname="noperspective_flags" name="Non-perspective Flags" min_ver="41">
+ <packet code="100" shortname="noperspective_flags" name="Non-perspective Flags">
<field name="Non-perspective Flags for varyings V0*24" size="24" start="8" type="uint"/>
<field name="Action for Non-perspective Flags of higher numbered varyings" size="2" start="6" type="Varying Flags Action"/>
<field name="Action for Non-perspective Flags of lower numbered varyings" size="2" start="4" type="Varying Flags Action"/>
@@ -867,12 +792,7 @@
<field name="Line width" size="32" start="0" type="float"/>
</packet>
- <packet name="Depth Offset" code="106" max_ver="33">
- <field name="Depth Offset Units" size="16" start="16" type="f187"/>
- <field name="Depth Offset Factor" size="16" start="0" type="f187"/>
- </packet>
-
- <packet name="Depth Offset" code="106" min_ver="41">
+ <packet name="Depth Offset" code="106">
<field name="Limit" size="32" start="32" type="float"/>
<field name="Depth Offset Units" size="16" start="16" type="f187"/>
<field name="Depth Offset Factor" size="16" start="0" type="f187"/>
@@ -885,16 +805,11 @@
<field name="Clip Window Left Pixel Coordinate" size="16" start="0" type="uint"/>
</packet>
- <packet name="Viewport Offset" code="108" max_ver="33">
- <field name="Viewport Centre Y-coordinate" size="32" start="32" type="s24.8"/>
- <field name="Viewport Centre X-coordinate" size="32" start="0" type="s24.8"/>
- </packet>
-
- <packet name="Viewport Offset" code="108" min_ver="41">
- <field name="Coarse Y" size="10" start="54" type="uint"/>
- <field name="Viewport Centre Y-coordinate" size="22" start="32" type="s14.8"/>
- <field name="Coarse X" size="10" start="22" type="uint"/>
- <field name="Viewport Centre X-coordinate" size="22" start="0" type="s14.8"/>
+ <packet name="Viewport Offset" code="108">
+ <field name="Coarse Y" size="10" start="54" type="int"/>
+ <field name="Fine Y" size="22" start="32" type="u14.8"/>
+ <field name="Coarse X" size="10" start="22" type="int"/>
+ <field name="Fine X" size="22" start="0" type="u14.8"/>
</packet>
<packet shortname="clipz" name="Clipper Z min/max clipping planes" code="109">
@@ -902,31 +817,41 @@
<field name="Minimum Zw" size="32" start="0" type="float"/>
</packet>
- <packet shortname="clipper_xy" name="Clipper XY Scaling" code="110" cl="B">
+ <packet shortname="clipper_xy" name="Clipper XY Scaling" code="110" cl="B" max_ver="42">
<field name="Viewport Half-Height in 1/256th of pixel" size="32" start="32" type="float"/>
<field name="Viewport Half-Width in 1/256th of pixel" size="32" start="0" type="float"/>
</packet>
+ <packet shortname="clipper_xy" name="Clipper XY Scaling" code="110" cl="B" min_ver="71">
+ <field name="Viewport Half-Height in 1/64th of pixel" size="32" start="32" type="float"/>
+ <field name="Viewport Half-Width in 1/64th of pixel" size="32" start="0" type="float"/>
+ </packet>
+
<packet shortname="clipper_z" name="Clipper Z Scale and Offset" code="111" cl="B">
<field name="Viewport Z Offset (Zc to Zs)" size="32" start="32" type="float"/>
<field name="Viewport Z Scale (Zc to Zs)" size="32" start="0" type="float"/>
</packet>
- <packet name="Number of Layers" code="119" min_ver="41">
+ <packet shortname="clipper_z_no_guardband" name="Clipper Z Scale and Offset no guardband" code="112" cl="B" min_ver="71">
+ <field name="Viewport Z Offset (Zc to Zs)" size="32" start="32" type="float"/>
+ <field name="Viewport Z Scale (Zc to Zs)" size="32" start="0" type="float"/>
+ </packet>
+
+ <packet name="Number of Layers" code="119">
<field name="Number of Layers" size="8" start="0" type="uint" minus_one="true"/>
</packet>
- <packet code="120" name="Tile Binning Mode Cfg (Part1)" max_ver="33">
- <field name="Double-buffer in non-ms mode" size="1" start="63" type="bool"/>
- <field name="Multisample Mode (4x)" size="1" start="62" type="bool"/>
+ <packet code="120" name="Tile Binning Mode Cfg" max_ver="42">
- <field name="Maximum BPP of all render targets" size="2" start="60" type="Internal BPP"/>
+ <field name="Height (in pixels)" size="16" start="48" type="uint" minus_one="true"/>
+ <field name="Width (in pixels)" size="16" start="32" type="uint" minus_one="true"/>
- <field name="Number of Render Targets" size="4" start="56" type="uint"/>
- <field name="Height (in tiles)" size="12" start="44" type="uint"/>
- <field name="Width (in tiles)" size="12" start="32" type="uint"/>
+ <field name="Double-buffer in non-ms mode" size="1" start="15" type="bool"/>
+ <field name="Multisample Mode (4x)" size="1" start="14" type="bool"/>
- <field name="Tile State Data Array Base Address" size="26" start="6" type="address"/>
+ <field name="Maximum BPP of all render targets" size="2" start="12" type="Internal BPP"/>
+
+ <field name="Number of Render Targets" size="4" start="8" type="uint" minus_one="true"/>
<field name="tile allocation block size" size="2" start="4" type="uint">
<value name="tile allocation block size 64b" value="0"/>
@@ -938,21 +863,24 @@
<value name="tile allocation initial block size 128b" value="1"/>
<value name="tile allocation initial block size 256b" value="2"/>
</field>
- <field name="auto-initialize tile state data array" size="1" start="1" type="bool" default="1"/>
- <field name="sub-id" size="1" start="0" type="uint" default="0"/>
</packet>
- <packet code="120" name="Tile Binning Mode Cfg" min_ver="41">
-
+ <packet code="120" name="Tile Binning Mode Cfg" min_ver="71">
<field name="Height (in pixels)" size="16" start="48" type="uint" minus_one="true"/>
<field name="Width (in pixels)" size="16" start="32" type="uint" minus_one="true"/>
- <field name="Double-buffer in non-ms mode" size="1" start="15" type="bool"/>
- <field name="Multisample Mode (4x)" size="1" start="14" type="bool"/>
-
- <field name="Maximum BPP of all render targets" size="2" start="12" type="Internal BPP"/>
-
- <field name="Number of Render Targets" size="4" start="8" type="uint" minus_one="true"/>
+ <field name="Log2 Tile Height" size="3" start="11" type="uint">
+ <value name="tile height 8 pixels" value="0"/>
+ <value name="tile height 16 pixels" value="1"/>
+ <value name="tile height 32 pixels" value="2"/>
+ <value name="tile height 64 pixels" value="3"/>
+ </field>
+ <field name="Log2 Tile Width" size="3" start="8" type="uint">
+ <value name="tile width 8 pixels" value="0"/>
+ <value name="tile width 16 pixels" value="1"/>
+ <value name="tile width 32 pixels" value="2"/>
+ <value name="tile width 64 pixels" value="3"/>
+ </field>
<field name="tile allocation block size" size="2" start="4" type="uint">
<value name="tile allocation block size 64b" value="0"/>
@@ -966,17 +894,11 @@
</field>
</packet>
- <packet code="120" name="Tile Binning Mode Cfg (Part2)" cl="B" max_ver="33">
- <field name="Tile Allocation Memory Address" size="32" start="32" type="address"/>
- <field name="Tile Allocation Memory Size" size="32" start="0" type="uint"/>
-
- <field name="sub-id" size="1" start="0" type="uint" default="1"/>
- </packet>
+ <packet code="121" name="Tile Rendering Mode Cfg (Common)" cl="R" max_ver="42">
+ <field name="Pad" size="12" start="52" type="uint"/>
- <packet code="121" name="Tile Rendering Mode Cfg (Common)" cl="R" max_ver="33">
- <field name="Disable Render Target Stores" size="8" start="56" type="uint"/>
- <field name="Enable Z Store" size="1" start="55" type="bool"/>
- <field name="Enable Stencil Store" size="1" start="54" type="bool"/>
+ <field name="Early Depth/Stencil Clear" size="1" start="51" type="bool"/>
+ <field name="Internal Depth Type" size="4" start="47" type="Internal Depth Type"/>
<field name="Early-Z disable" size="1" start="46" type="bool"/>
@@ -988,7 +910,11 @@
<field name="Double-buffer in non-ms mode" size="1" start="43" type="bool"/>
<field name="Multisample Mode (4x)" size="1" start="42" type="bool"/>
- <field name="Maximum BPP of all render targets" size="2" start="40" type="Internal BPP"/>
+ <field name="Maximum BPP of all render targets" size="2" start="40" type="Internal BPP">
+ <value name="Render target maximum 32bpp" value="0"/>
+ <value name="Render target maximum 64bpp" value="1"/>
+ <value name="Render target maximum 128bpp" value="2"/>
+ </field>
<field name="Image Height (pixels)" size="16" start="24" type="uint"/>
<field name="Image Width (pixels)" size="16" start="8" type="uint"/>
@@ -997,8 +923,21 @@
<field name="sub-id" size="4" start="0" type="uint" default="0"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (Common)" cl="R" min_ver="41">
- <field name="Pad" size="12" start="52" type="uint"/>
+ <packet code="121" name="Tile Rendering Mode Cfg (Common)" cl="R" min_ver="71">
+ <field name="Pad" size="6" start="58" type="uint"/>
+
+ <field name="Log2 Tile Height" size="3" start="55" type="uint">
+ <value name="tile height 8 pixels" value="0"/>
+ <value name="tile height 16 pixels" value="1"/>
+ <value name="tile height 32 pixels" value="2"/>
+ <value name="tile height 64 pixels" value="3"/>
+ </field>
+ <field name="Log2 Tile Width" size="3" start="52" type="uint">
+ <value name="tile width 8 pixels" value="0"/>
+ <value name="tile width 16 pixels" value="1"/>
+ <value name="tile width 32 pixels" value="2"/>
+ <value name="tile width 64 pixels" value="3"/>
+ </field>
<field name="Early Depth/Stencil Clear" size="1" start="51" type="bool"/>
<field name="Internal Depth Type" size="4" start="47" type="Internal Depth Type"/>
@@ -1010,40 +949,18 @@
<value name="Early-Z direction GT/GE" value="1"/>
</field>
+ <field name="Depth-buffer disable" size="1" start="44" type="bool"/>
<field name="Double-buffer in non-ms mode" size="1" start="43" type="bool"/>
<field name="Multisample Mode (4x)" size="1" start="42" type="bool"/>
- <field name="Maximum BPP of all render targets" size="2" start="40" type="Internal BPP"/>
-
<field name="Image Height (pixels)" size="16" start="24" type="uint"/>
<field name="Image Width (pixels)" size="16" start="8" type="uint"/>
<field name="Number of Render Targets" size="4" start="4" type="uint" minus_one="true"/>
- <field name="sub-id" size="4" start="0" type="uint" default="0"/>
+ <field name="sub-id" size="3" start="0" type="uint" default="0"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (Color)" cl="R" max_ver="33">
- <field name="Address" size="32" start="32" type="address"/>
-
- <field name="Pad" size="4" start="28" type="uint"/>
-
- <field name="Flip Y" size="1" start="27" type="bool"/>
-
- <field name="Memory Format" size="3" start="24" type="Memory Format"/>
-
- <field name="Dither Mode" size="2" start="22" type="Dither Mode"/>
-
- <field name="Output image format" size="6" start="16" type="Output Image Format"/>
-
- <field name="Decimate mode" size="2" start="14" type="Decimate Mode"/>
-
- <field name="Internal Type" size="4" start="10" type="Internal Type"/>
- <field name="Internal BPP" size="2" start="8" type="Internal BPP"/>
- <field name="Render Target Number" size="4" start="4" type="uint"/>
- <field name="sub-id" size="4" start="0" type="uint" default="2"/>
- </packet>
-
- <packet code="121" name="Tile Rendering Mode Cfg (Color)" cl="R" min_ver="41">
+ <packet code="121" name="Tile Rendering Mode Cfg (Color)" cl="R" max_ver="42">
<field name="Pad" size="28" start="36" type="uint"/>
@@ -1066,53 +983,25 @@
<field name="sub-id" size="4" start="0" type="uint" default="1"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (Z/Stencil)" cl="R" max_ver="33">
- <field name="Address" size="26" start="38" type="address"/>
-
- <field name="Padded height of output image in UIF blocks" size="13" start="25" type="uint"/>
-
- <field name="Memory Format" size="3" start="22" type="Memory Format"/>
-
- <field name="Output image format" size="6" start="16" type="Z/S Output Image Format"/>
-
- <field name="Decimate mode" size="2" start="14" type="uint"/>
-
- <field name="Internal Type" size="4" start="10" type="Internal Depth Type"/>
-
- <field name="Internal BPP (ignored)" size="2" start="8" type="uint"/>
- <!-- selects between Z/Stencil config packet and Separate Stencil packet. -->
- <field name="Z/Stencil ID" size="4" start="4" type="uint" default="0"/>
- <field name="sub-id" size="4" start="0" type="uint" default="1"/>
- </packet>
-
- <packet code="121" name="Tile Rendering Mode Cfg (ZS Clear Values)" cl="R" max_ver="33">
+ <packet code="121" name="Tile Rendering Mode Cfg (ZS Clear Values)" cl="R" max_ver="42">
<field name="unused" size="16" start="48" type="uint"/>
<field name="Z Clear Value" size="32" start="16" type="float"/>
<field name="Stencil Clear Value" size="8" start="8" type="uint"/>
- <field name="sub-id" size="4" start="0" type="uint" default="3"/>
+ <field name="sub-id" size="4" start="0" type="uint" default="2"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (ZS Clear Values)" cl="R" min_ver="41">
+ <packet code="121" name="Tile Rendering Mode Cfg (ZS Clear Values)" cl="R" min_ver="71">
<field name="unused" size="16" start="48" type="uint"/>
<field name="Z Clear Value" size="32" start="16" type="float"/>
<field name="Stencil Clear Value" size="8" start="8" type="uint"/>
- <field name="sub-id" size="4" start="0" type="uint" default="2"/>
- </packet>
-
- <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part1)" cl="R" max_ver="33">
- <!-- Express this as a 56-bit field? -->
- <field name="Clear Color next 24 bits" size="24" start="40" type="uint"/>
- <field name="Clear Color low 32 bits" size="32" start="8" type="uint"/>
-
- <field name="Render Target number" size="4" start="4" type="uint"/>
- <field name="sub-id" size="4" start="0" type="uint" default="4"/>
+ <field name="sub-id" size="4" start="0" type="uint" default="1"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part1)" cl="R" min_ver="41">
+ <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part1)" cl="R" max_ver="42">
<!-- Express this as a 56-bit field? -->
<field name="Clear Color next 24 bits" size="24" start="40" type="uint"/>
<field name="Clear Color low 32 bits" size="32" start="8" type="uint"/>
@@ -1121,16 +1010,20 @@
<field name="sub-id" size="4" start="0" type="uint" default="3"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part2)" cl="R" max_ver="33">
- <!-- Express this as a 56-bit field? -->
- <field name="Clear Color mid-high 24 bits" size="24" start="40" type="uint"/>
- <field name="Clear Color mid-low 32 bits" size="32" start="8" type="uint"/>
+ <packet code="121" name="Tile Rendering Mode Cfg (Render Target Part1)" cl="R" min_ver="71">
- <field name="Render Target number" size="4" start="4" type="uint"/>
- <field name="sub-id" size="4" start="0" type="uint" default="5"/>
+ <field name="Clear Color low bits" size="32" start="32" type="uint"/>
+ <field name="Internal Type and Clamping" size="5" start="27" type="Render Target Type Clamp"/>
+ <field name="Internal BPP" size="2" start="25" type="Internal BPP"/>
+
+ <field name="Stride" size="7" start="18" type="uint" minus_one="true"/>
+ <!-- In multiples of 512 bits -->
+ <field name="Base Address" size="11" start="7" type="uint"/>
+ <field name="Render Target number" size="3" start="3" type="uint"/>
+ <field name="sub-id" size="3" start="0" type="uint" default="2"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part2)" cl="R" min_ver="41">
+ <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part2)" cl="R" max_ver="42">
<!-- Express this as a 56-bit field? -->
<field name="Clear Color mid-high 24 bits" size="24" start="40" type="uint"/>
<field name="Clear Color mid-low 32 bits" size="32" start="8" type="uint"/>
@@ -1139,18 +1032,14 @@
<field name="sub-id" size="4" start="0" type="uint" default="4"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part3)" cl="R" max_ver="33">
- <field name="pad" size="11" start="53" type="uint"/>
- <field name="UIF padded height in UIF blocks" size="13" start="40" type="uint"/>
- <!-- image height is for Y flipping -->
- <field name="Raster Row Stride or Image Height in Pixels" size="16" start="24" type="uint"/>
- <field name="Clear Color high 16 bits" size="16" start="8" type="uint"/>
+ <packet code="121" name="Tile Rendering Mode Cfg (Render Target Part2)" cl="R" min_ver="71">
+ <field name="Clear Color mid bits" size="40" start="24" type="uint"/>
- <field name="Render Target number" size="4" start="4" type="uint"/>
- <field name="sub-id" size="4" start="0" type="uint" default="6"/>
+ <field name="Render Target number" size="3" start="3" type="uint"/>
+ <field name="sub-id" size="3" start="0" type="uint" default="3"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part3)" cl="R" min_ver="41">
+ <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part3)" cl="R" max_ver="42">
<field name="pad" size="11" start="53" type="uint"/>
<field name="UIF padded height in UIF blocks" size="13" start="40" type="uint"/>
<!-- image height is for Y flipping -->
@@ -1161,6 +1050,13 @@
<field name="sub-id" size="4" start="0" type="uint" default="5"/>
</packet>
+ <packet code="121" name="Tile Rendering Mode Cfg (Render Target Part3)" cl="R" min_ver="71">
+ <field name="Clear Color top bits" size="56" start="8" type="uint"/>
+
+ <field name="Render Target number" size="3" start="3" type="uint"/>
+ <field name="sub-id" size="3" start="0" type="uint" default="4"/>
+ </packet>
+
<packet code="124" shortname="tile_coords" name="Tile Coordinates">
<field name="tile row number" size="12" start="12" type="uint"/>
<field name="tile column number" size="12" start="0" type="uint"/>
@@ -1199,43 +1095,7 @@
</field>
</packet>
- <struct name="GL Shader State Record" max_ver="33">
- <field name="Point size in shaded vertex data" size="1" start="0" type="bool"/>
- <field name="Enable clipping" size="1" start="1" type="bool"/>
- <field name="Vertex ID read by coordinate shader" size="1" start="2" type="bool"/>
- <field name="Instance ID read by coordinate shader" size="1" start="3" type="bool"/>
- <field name="Vertex ID read by vertex shader" size="1" start="4" type="bool"/>
- <field name="Instance ID read by vertex shader" size="1" start="5" type="bool"/>
- <field name="Fragment shader does Z writes" size="1" start="6" type="bool"/>
- <field name="Turn off early-z test" size="1" start="7" type="bool"/>
- <field name="Coordinate shader has separate input and output VPM blocks" size="1" start="8" type="bool"/>
- <field name="Vertex shader has separate input and output VPM blocks" size="1" start="9" type="bool"/>
- <field name="Fragment shader uses real pixel centre W in addition to centroid W2" size="1" start="10" type="bool"/>
-
- <field name="Number of varyings in Fragment Shader" size="8" start="2b" type="uint"/>
- <field name="Coordinate Shader output VPM segment size" size="8" start="4b" type="uint"/>
- <field name="Coordinate Shader input VPM segment size" size="8" start="5b" type="uint"/>
- <field name="Vertex Shader output VPM segment size" size="8" start="6b" type="uint"/>
- <field name="Vertex Shader input VPM segment size" size="8" start="7b" type="uint"/>
- <field name="Address of default attribute values" size="32" start="8b" type="address"/>
- <field name="Fragment Shader Code Address" size="29" start="99" type="address"/>
- <field name="Fragment Shader 2-way threadable" size="1" start="96" type="bool"/>
- <field name="Fragment Shader 4-way threadable" size="1" start="97" type="bool"/>
- <field name="Fragment Shader Propagate NaNs" size="1" start="98" type="bool"/>
- <field name="Fragment Shader Uniforms Address" size="32" start="16b" type="address"/>
- <field name="Vertex Shader Code Address" size="32" start="20b" type="address"/>
- <field name="Vertex Shader 2-way threadable" size="1" start="160" type="bool"/>
- <field name="Vertex Shader 4-way threadable" size="1" start="161" type="bool"/>
- <field name="Vertex Shader Propagate NaNs" size="1" start="162" type="bool"/>
- <field name="Vertex Shader Uniforms Address" size="32" start="24b" type="address"/>
- <field name="Coordinate Shader Code Address" size="32" start="28b" type="address"/>
- <field name="Coordinate Shader 2-way threadable" size="1" start="224" type="bool"/>
- <field name="Coordinate Shader 4-way threadable" size="1" start="225" type="bool"/>
- <field name="Coordinate Shader Propagate NaNs" size="1" start="226" type="bool"/>
- <field name="Coordinate Shader Uniforms Address" size="32" start="32b" type="address"/>
- </struct>
-
- <struct name="GL Shader State Record" min_ver="41">
+ <struct name="GL Shader State Record" max_ver="42">
<field name="Point size in shaded vertex data" size="1" start="0" type="bool"/>
<field name="Enable clipping" size="1" start="1" type="bool"/>
@@ -1294,7 +1154,64 @@
<field name="Coordinate Shader Uniforms Address" size="32" start="32b" type="address"/>
</struct>
- <struct name="Geometry Shader State Record" min_ver="41">
+ <struct name="GL Shader State Record" min_ver="71">
+ <field name="Point size in shaded vertex data" size="1" start="0" type="bool"/>
+ <field name="Enable clipping" size="1" start="1" type="bool"/>
+
+ <field name="Vertex ID read by coordinate shader" size="1" start="2" type="bool"/>
+ <field name="Instance ID read by coordinate shader" size="1" start="3" type="bool"/>
+ <field name="Base Instance ID read by coordinate shader" size="1" start="4" type="bool"/>
+ <field name="Vertex ID read by vertex shader" size="1" start="5" type="bool"/>
+ <field name="Instance ID read by vertex shader" size="1" start="6" type="bool"/>
+ <field name="Base Instance ID read by vertex shader" size="1" start="7" type="bool"/>
+
+ <field name="Fragment shader does Z writes" size="1" start="8" type="bool"/>
+ <field name="Turn off early-z test" size="1" start="9" type="bool"/>
+
+ <field name="Fragment shader uses real pixel centre W in addition to centroid W2" size="1" start="12" type="bool"/>
+ <field name="Enable Sample Rate Shading" size="1" start="13" type="bool"/>
+ <field name="Any shader reads hardware-written Primitive ID" size="1" start="14" type="bool"/>
+ <field name="Insert Primitive ID as first varying to fragment shader" size="1" start="15" type="bool"/>
+ <field name="Turn off scoreboard" size="1" start="16" type="bool"/>
+ <field name="Do scoreboard wait on first thread switch" size="1" start="17" type="bool"/>
+ <field name="Disable implicit point/line varyings" size="1" start="18" type="bool"/>
+ <field name="No prim pack" size="1" start="19" type="bool"/>
+ <field name="Never defer FEP depth writes" size="1" start="20" type="bool"/>
+
+ <field name="Number of varyings in Fragment Shader" size="8" start="3b" type="uint"/>
+
+ <field name="Coordinate Shader output VPM segment size" size="4" start="4b" type="uint"/>
+ <field name="Min Coord Shader output segments required in play in addition to VCM cache size" size="4" start="36" type="uint"/>
+
+ <field name="Coordinate Shader input VPM segment size" size="4" start="5b" type="uint"/>
+ <field name="Min Coord Shader input segments required in play" size="4" start="44" type="uint" minus_one="true"/>
+
+ <field name="Vertex Shader output VPM segment size" size="4" start="6b" type="uint"/>
+ <field name="Min Vertex Shader output segments required in play in addition to VCM cache size" size="4" start="52" type="uint"/>
+
+ <field name="Vertex Shader input VPM segment size" size="4" start="7b" type="uint"/>
+ <field name="Min Vertex Shader input segments required in play" size="4" start="60" type="uint" minus_one="true"/>
+
+ <field name="Fragment Shader Code Address" size="29" start="67" type="address"/>
+ <field name="Fragment Shader 4-way threadable" size="1" start="64" type="bool"/>
+ <field name="Fragment Shader start in final thread section" size="1" start="65" type="bool"/>
+ <field name="Fragment Shader Propagate NaNs" size="1" start="66" type="bool"/>
+ <field name="Fragment Shader Uniforms Address" size="32" start="12b" type="address"/>
+
+ <field name="Vertex Shader Code Address" size="29" start="131" type="address"/>
+ <field name="Vertex Shader 4-way threadable" size="1" start="128" type="bool"/>
+ <field name="Vertex Shader start in final thread section" size="1" start="129" type="bool"/>
+ <field name="Vertex Shader Propagate NaNs" size="1" start="130" type="bool"/>
+ <field name="Vertex Shader Uniforms Address" size="32" start="20b" type="address"/>
+
+ <field name="Coordinate Shader Code Address" size="29" start="195" type="address"/>
+ <field name="Coordinate Shader 4-way threadable" size="1" start="192" type="bool"/>
+ <field name="Coordinate Shader start in final thread section" size="1" start="193" type="bool"/>
+ <field name="Coordinate Shader Propagate NaNs" size="1" start="194" type="bool"/>
+ <field name="Coordinate Shader Uniforms Address" size="32" start="28b" type="address"/>
+ </struct>
+
+ <struct name="Geometry Shader State Record">
<field name="Geometry Bin Mode Shader Code Address" size="29" start="3" type="address"/>
<field name="Geometry Bin Mode Shader 4-way threadable" size="1" start="0" type="bool"/>
<field name="Geometry Bin Mode Shader Start in final thread section" size="1" start="1" type="bool"/>
@@ -1307,7 +1224,7 @@
<field name="Geometry Render Mode Shader Uniforms Address" size="32" start="12b" type="address"/>
</struct>
- <struct name="Tessellation Shader State Record" min_ver="41">
+ <struct name="Tessellation Shader State Record">
<field name="Tessellation Bin Mode Control Shader Code Address" size="29" start="3" type="address"/>
<field name="Tessellation Bin Mode Control Shader 4-way threadable" size="1" start="0" type="bool"/>
<field name="Tessellation Bin Mode Control Shader Start in final thread section" size="1" start="1" type="bool"/>
@@ -1331,7 +1248,7 @@
<field name="Tessellation Render Mode Evaluation Shader Uniforms Address" size="32" start="28b" type="address"/>
</struct>
- <struct name="Tessellation/Geometry Common Params" min_ver="41">
+ <struct name="Tessellation/Geometry Common Params">
<field name="Tessellation Type" size="2" start="1" type="uint">
<value name="Tessellation Type Triangle" value="0"/>
<value name="Tessellation Type Quads" value="1"/>
@@ -1391,31 +1308,7 @@
<field name="GBG min GS output segments required in play" size="3" start="59" type="uint" minus_one="true"/>
</struct>
- <struct name="GL Shader State Attribute Record" max_ver="33">
- <field name="Address" size="32" start="0" type="address"/>
-
- <field name="Vec size" size="2" start="32" type="uint"/>
- <field name="Type" size="3" start="34" type="uint">
- <value name="Attribute half-float" value="1"/>
- <value name="Attribute float" value="2"/>
- <value name="Attribute fixed" value="3"/>
- <value name="Attribute byte" value="4"/>
- <value name="Attribute short" value="5"/>
- <value name="Attribute int" value="6"/>
- <value name="Attribute int2_10_10_10" value="7"/>
- </field>
- <field name="Signed int type" size="1" start="37" type="bool"/>
- <field name="Normalized int type" size="1" start="38" type="bool"/>
- <field name="Read as int/uint" size="1" start="39" type="bool"/>
-
- <field name="Number of values read by Coordinate shader" size="4" start="40" type="uint"/>
- <field name="Number of values read by Vertex shader" size="4" start="44" type="uint"/>
-
- <field name="Instance Divisor" size="16" start="6b" type="uint"/>
- <field name="Stride" size="32" start="8b" type="uint"/>
- </struct>
-
- <struct name="GL Shader State Attribute Record" min_ver="41">
+ <struct name="GL Shader State Attribute Record">
<field name="Address" size="32" start="0" type="address"/>
<field name="Vec size" size="2" start="32" type="uint"/>
@@ -1476,55 +1369,19 @@
<field name="addr" size="13" start="0" type="uint"/>
</struct>
- <struct name="Texture Uniform Parameter 0 CFG_MODE=1" max_ver="33">
- <field name="Per-pixel mask enable" size="1" start="31" type="bool"/>
-
- <field name="Texel offset for r coordinate" size="4" start="27" type="int"/>
- <field name="Texel offset for t coordinate" size="4" start="23" type="int"/>
- <field name="Texel offset for s coordinate" size="4" start="19" type="int"/>
-
- <field name="R Wrap Mode" size="3" start="16" type="Wrap Mode"/>
- <field name="T Wrap Mode" size="3" start="13" type="Wrap Mode"/>
- <field name="S Wrap Mode" size="3" start="10" type="Wrap Mode"/>
-
- <field name="New configuration mode" size="1" start="9" type="bool" default="1"/>
-
- <field name="Shadow" size="1" start="8" type="bool"/>
- <field name="Coefficient lookup mode" size="1" start="7" type="bool"/>
- <field name="Disable AutoLOD, use bias only" size="1" start="6" type="bool"/>
- <field name="Bias supplied" size="1" start="5" type="bool"/>
- <field name="Gather sample mode" size="1" start="4" type="bool"/>
- <field name="Fetch sample mode" size="1" start="3" type="bool"/>
-
- <field name="Lookup Type" size="3" start="0" type="uint">
- <value name="Texture 2D" value="0"/>
- <value name="Texture 2D array" value="1"/>
- <value name="Texture 3D" value="2"/>
- <value name="Texture Cube Map" value="3"/>
- <value name="Texture 1D" value="4"/>
- <value name="Texture 1D Array" value="5"/>
- <value name="Texture Child Image" value="6"/>
- </field>
- </struct>
-
- <struct name="Texture Uniform Parameter 1 CFG_MODE=1" max_ver="33">
- <field name="Texture state record base address" size="28" start="4" type="address"/>
- <field name="Return words of texture data" size="4" start="0" type="uint"/>
- </struct>
-
- <struct name="TMU Config Parameter 0" min_ver="41">
+ <struct name="TMU Config Parameter 0">
<field name="Texture state address" size="32" start="0" type="address"/>
<field name="Return words of texture data" size="4" start="0" type="uint"/>
</struct>
- <struct name="TMU Config Parameter 1" min_ver="41">
+ <struct name="TMU Config Parameter 1">
<field name="Sampler state address" size="32" start="0" type="address"/>
<field name="Per-pixel mask enable" size="1" start="2" type="bool"/>
<field name="Unnormalized coordinates" size="1" start="1" type="bool"/>
<field name="Output Type 32-bit" size="1" start="0" type="bool"/>
</struct>
- <struct name="TMU Config Parameter 2" min_ver="41" max_ver="41">
+ <struct name="TMU Config Parameter 2" max_ver="41">
<field name="Pad" size="8" start="24" type="uint"/>
<field name="Op" size="4" start="20" type="TMU Op"/>
<field name="Offset R" size="4" start="16" type="int"/>
@@ -1538,7 +1395,7 @@
<field name="Offset Format 8" size="1" start="0" type="bool"/>
</struct>
- <struct name="TMU Config Parameter 2" min_ver="42">
+ <struct name="TMU Config Parameter 2" min_ver="42" max_ver="42">
<field name="Pad" size="7" start="25" type="uint"/>
<field name="LOD Query" size="1" start="24" type="bool"/>
<field name="Op" size="4" start="20" type="TMU Op"/>
@@ -1553,30 +1410,34 @@
<field name="Offset Format 8" size="1" start="0" type="bool"/>
</struct>
- <struct name="Texture Shader State" max_ver="33">
- <field name="UIF XOR disable" size="1" start="255" type="bool"/>
- <field name="Level 0 is strictly UIF" size="1" start="254" type="bool"/>
- <field name="Level 0 XOR enable" size="1" start="252" type="bool"/>
- <field name="Level 0 UB_PAD" size="4" start="248" type="uint"/>
- <field name="Output 32-bit" size="1" start="246" type="bool"/>
- <field name="Sample Number" size="2" start="244" type="uint"/>
-
- <field name="Base Level" size="4" start="240" type="uint"/>
- <field name="Fixed Bias" size="16" start="224" type="s8.8"/>
- <field name="Max Level-of-Detail" size="16" start="208" type="s8.8"/>
- <field name="Min Level-of-Detail" size="16" start="192" type="s8.8"/>
-
- <field name="Border Color alpha" size="16" start="176" type="uint"/>
- <field name="Border Color blue" size="16" start="160" type="uint"/>
- <field name="Border Color green" size="16" start="144" type="uint"/>
- <field name="Border Color red" size="16" start="128" type="uint"/>
-
- <field name="Flip S and T on incoming request" size="1" start="127" type="bool"/>
- <field name="Flip ETC Y" size="1" start="126" type="bool" default="1"/>
- <field name="Flip texture Y Axis" size="1" start="125" type="bool"/>
- <field name="Flip texture X Axis" size="1" start="124" type="bool"/>
-
- <field name="Swizzle A" size="3" start="121" type="uint">
+ <struct name="TMU Config Parameter 2" min_ver="71">
+ <field name="Pad" size="5" start="27" type="uint"/>
+ <field name="Write conversion" size="1" start="26" type="bool"/>
+ <field name="DIM query" size="1" start="25" type="bool"/>
+ <field name="LOD Query" size="1" start="24" type="bool"/>
+ <field name="Op" size="4" start="20" type="TMU Op"/>
+ <field name="Offset R" size="4" start="16" type="int"/>
+ <field name="Offset T" size="4" start="12" type="int"/>
+ <field name="Offset S" size="4" start="8" type="int"/>
+ <field name="Gather Mode" size="1" start="7" type="bool"/>
+ <field name="Gather Component" size="2" start="5" type="uint"/>
+ <field name="Coefficient Mode" size="1" start="4" type="bool"/>
+ <field name="Sample Number" size="2" start="2" type="uint"/>
+ <field name="Disable AutoLOD" size="1" start="1" type="bool"/>
+ <field name="Offset Format 8" size="1" start="0" type="bool"/>
+ </struct>
+
+ <struct name="Texture Shader State" max_ver="42">
+ <field name="Pad" size="56" start="136" type="uint"/>
+ <field name="UIF XOR disable" size="1" start="135" type="bool"/>
+ <field name="Level 0 is strictly UIF" size="1" start="134" type="bool"/>
+ <field name="Level 0 XOR enable" size="1" start="132" type="bool"/>
+ <field name="Level 0 UB_PAD" size="4" start="128" type="uint"/>
+
+ <field name="Base Level" size="4" start="124" type="uint"/>
+ <field name="Max Level" size="4" start="120" type="uint"/>
+
+ <field name="Swizzle A" size="3" start="117" type="uint">
<value name="Swizzle Zero" value="0"/>
<value name="Swizzle One" value="1"/>
<value name="Swizzle Red" value="2"/>
@@ -1585,29 +1446,54 @@
<value name="Swizzle Alpha" value="5"/>
</field>
- <field name="Swizzle B" size="3" start="118" type="uint"/>
- <field name="Swizzle G" size="3" start="115" type="uint"/>
- <field name="Swizzle R" size="3" start="112" type="uint"/>
-
- <field name="Depth Compare Function" size="3" start="109" type="Compare Function"/>
-
- <field name="sRGB" size="1" start="107" type="bool"/>
+ <field name="Swizzle B" size="3" start="114" type="uint"/>
+ <field name="Swizzle G" size="3" start="111" type="uint"/>
+ <field name="Swizzle R" size="3" start="108" type="uint"/>
+ <field name="Extended" size="1" start="107" type="bool"/>
<field name="Texture type" size="7" start="100" type="uint"/>
-
<field name="Image Depth" size="14" start="86" type="uint"/>
<field name="Image Height" size="14" start="72" type="uint"/>
<field name="Image Width" size="14" start="58" type="uint"/>
<field name="Array Stride (64-byte aligned)" size="26" start="32" type="uint"/>
- <field name="Texture base pointer" size="30" start="2" type="address"/>
+ <field name="Texture base pointer" size="32" start="0" type="address"/>
- <field name="Filter" size="4" start="0" type="TMU Filter"/>
+ <field name="Reverse Standard Border Color" size="1" start="5" type="bool"/>
+ <field name="AHDR" size="1" start="4" type="bool"/>
+ <field name="sRGB" size="1" start="3" type="bool"/>
+ <field name="Flip S and T on incoming request" size="1" start="2" type="bool"/>
+ <field name="Flip texture Y Axis" size="1" start="1" type="bool"/>
+ <field name="Flip texture X Axis" size="1" start="0" type="bool"/>
</struct>
- <struct name="Texture Shader State" min_ver="41">
- <field name="Pad" size="56" start="136" type="uint"/>
+ <struct name="Texture Shader State" min_ver="71">
+ <field name="Pad" size="2" start="190" type="uint"/>
+ <!-- When we use an address type, there is an implicit requirement
+ that the address is a 32-bit value encoded starting at a 32-bit
+ aligned bit offset into the packet. If the address field has fewer than
+ 32 bits, the address is assumed to be aligned accordingly. For example, a
+ 26-bit address field is expected to be 64-byte aligned (6 lsb bits
+ are 0) and will be encoded into a packet starting at bit
+ offset 6 into a 32-bit dword (since bits 0..5 of the address are
+ implicitly 0 and don't need to be explicitly encoded).
+
+ Unfortunately, the Cb address below doesn't match this requirement:
+ it starts at bit 138, which is 10 bits into a 32-bit dword, but it
+ represents a 64-byte aligned address (6 lsb bits are 0), so we cannot
+ encode it as an address type. To fix this we encode these addresses
+ as uint types, which has two implications:
+ 1. the driver is responsible for manually adding the buffer objects
+ for these addresses to the job BO list.
+ 2. the driver needs to pass an actual 26-bit address value, obtained by
+ manually shifting out the 6 lsb bits (which are implicitly 0).
+ -->
+ <field name="texture_base pointer_Cr" size="26" start="164" type="uint"/>
+ <field name="texture base pointer Cb" size="26" start="138" type="uint"/>
+ <field name="Chroma offset y" size="1" start="137" type="uint"/>
+ <field name="Chroma offset x" size="1" start="136" type="uint"/>
+
<field name="UIF XOR disable" size="1" start="135" type="bool"/>
<field name="Level 0 is strictly UIF" size="1" start="134" type="bool"/>
<field name="Level 0 XOR enable" size="1" start="132" type="bool"/>
@@ -1635,19 +1521,30 @@
<field name="Image Height" size="14" start="72" type="uint"/>
<field name="Image Width" size="14" start="58" type="uint"/>
- <field name="Array Stride (64-byte aligned)" size="26" start="32" type="uint"/>
+ <!-- V3D 7.1.2 doesn't have the RB swap bit and has Array Stride starting
+ at bit 32. However, 7.1.5 included the RB swap bit at bit 32 and has
+ Array Stride starting at bit 33, which is backwards incompatible.
+ We use the definition from 7.1.5.
+ -->
+ <field name="Array Stride (64-byte aligned)" size="24" start="33" type="uint"/>
+ <field name="R/B swap" size="1" start="32" type="bool"/>
<field name="Texture base pointer" size="32" start="0" type="address"/>
- <field name="Reverse Standard Border Color" size="1" start="5" type="bool"/>
- <field name="AHDR" size="1" start="4" type="bool"/>
- <field name="sRGB" size="1" start="3" type="bool"/>
- <field name="Flip S and T on incoming request" size="1" start="2" type="bool"/>
+ <field name="Reverse" size="1" start="5" type="bool"/>
+ <field name="Transfer func" size="3" start="2" type="uint">
+ <value name="Transfer Func None" value="0"/>
+ <value name="Transfer Func sRGB" value="1"/>
+ <value name="Transfer Func PQ" value="2"/>
+ <value name="Transfer Func HLG" value="3"/>
+ <value name="Transfer Func PQ BT1886" value="4"/>
+ <value name="Transfer Func HLG BT1886" value="5"/>
+ </field>
<field name="Flip texture Y Axis" size="1" start="1" type="bool"/>
<field name="Flip texture X Axis" size="1" start="0" type="bool"/>
</struct>
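
A minimal sketch (hypothetical helper, not the driver's actual code) of the manual packing the comment above asks for on the Cb/Cr base pointers:

#include <assert.h>
#include <stdint.h>

/* The Cb/Cr plane pointers are 64-byte aligned, so the 6 lsb are implicitly
 * zero and must be shifted out before writing the 26-bit uint fields above.
 * Because these are uint fields rather than address fields, the driver also
 * has to add the backing buffer object to the job BO list itself.
 */
static inline uint32_t
pack_chroma_plane_pointer(uint32_t plane_addr)
{
        assert((plane_addr & 0x3f) == 0); /* must be 64-byte aligned */
        return plane_addr >> 6;           /* value for the 26-bit field */
}
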
- <struct name="Sampler State" min_ver="41">
+ <struct name="Sampler State">
<field name="Border color word 3" size="32" start="160" type="uint"/>
<field name="Border color word 2" size="32" start="128" type="uint"/>
<field name="Border color word 1" size="32" start="96" type="uint"/>
diff --git a/src/broadcom/cle/v3d_packet_helpers.h b/src/broadcom/cle/v3d_packet_helpers.h
index 2b5e32ff215..41054618e3a 100644
--- a/src/broadcom/cle/v3d_packet_helpers.h
+++ b/src/broadcom/cle/v3d_packet_helpers.h
@@ -24,87 +24,20 @@
#ifndef MESA_V3D_PACKET_HELPERS_H
#define MESA_V3D_PACKET_HELPERS_H
-#include <stdio.h>
-#include <stdint.h>
-#include <stdbool.h>
-#include <assert.h>
-#include <math.h>
-#include "util/u_math.h"
+#include "util/bitpack_helpers.h"
#ifdef HAVE_VALGRIND
#include <valgrind.h>
#include <memcheck.h>
#define VG(x) x
-#ifndef NDEBUG
-#define __gen_validate_value(x) VALGRIND_CHECK_MEM_IS_DEFINED(&(x), sizeof(x))
-#endif
#else
#define VG(x) ((void)0)
#endif
-#ifndef __gen_validate_value
-#define __gen_validate_value(x)
-#endif
-/*
-#ifndef __gen_address_type
-#error #define __gen_address_type before including this file
-#endif
-
-#ifndef __gen_user_data
-#error #define __gen_combine_address before including this file
-#endif
-*/
-union __gen_value {
- float f;
- uint32_t dw;
-};
-
-static inline uint64_t
-__gen_mbo(uint32_t start, uint32_t end)
-{
- return (~0ull >> (64 - (end - start + 1))) << start;
-}
-
-static inline uint64_t
-__gen_uint(uint64_t v, uint32_t start, uint32_t end)
-{
- __gen_validate_value(v);
-
-#ifndef NDEBUG
- const int width = end - start + 1;
- if (width < 64) {
- const uint64_t max = (1ull << width) - 1;
- assert(v <= max);
- }
-#endif
-
- return v << start;
-}
-
-static inline uint64_t
-__gen_sint(int64_t v, uint32_t start, uint32_t end)
-{
- const int width = end - start + 1;
-
- __gen_validate_value(v);
-
-#ifndef NDEBUG
- if (width < 64) {
- const int64_t max = (1ll << (width - 1)) - 1;
- const int64_t min = -(1ll << (width - 1));
- assert(min <= v && v <= max);
- }
-#endif
-
- const uint64_t mask = ~0ull >> (64 - width);
-
- return (v & mask) << start;
-}
-
static inline uint64_t
__gen_offset(uint64_t v, uint32_t start, uint32_t end)
{
- __gen_validate_value(v);
+ util_bitpack_validate_value(v);
#ifndef NDEBUG
uint64_t mask = (~0ull >> (64 - (end - start + 1))) << start;
@@ -114,50 +47,6 @@ __gen_offset(uint64_t v, uint32_t start, uint32_t end)
return v;
}
-static inline uint32_t
-__gen_float(float v)
-{
- __gen_validate_value(v);
- return ((union __gen_value) { .f = (v) }).dw;
-}
-
-static inline uint64_t
-__gen_sfixed(float v, uint32_t start, uint32_t end, uint32_t fract_bits)
-{
- __gen_validate_value(v);
-
- const float factor = (1 << fract_bits);
-
-#ifndef NDEBUG
- const float max = ((1 << (end - start)) - 1) / factor;
- const float min = -(1 << (end - start)) / factor;
- assert(min <= v && v <= max);
-#endif
-
- const int64_t int_val = llroundf(v * factor);
- const uint64_t mask = ~0ull >> (64 - (end - start + 1));
-
- return (int_val & mask) << start;
-}
-
-static inline uint64_t
-__gen_ufixed(float v, uint32_t start, uint32_t end, uint32_t fract_bits)
-{
- __gen_validate_value(v);
-
- const float factor = (1 << fract_bits);
-
-#ifndef NDEBUG
- const float max = ((1 << (end - start + 1)) - 1) / factor;
- const float min = 0.0f;
- assert(min <= v && v <= max);
-#endif
-
- const uint64_t uint_val = llroundf(v * factor);
-
- return uint_val << start;
-}
-
static inline uint64_t
__gen_unpack_uint(const uint8_t *restrict cl, uint32_t start, uint32_t end)
{
diff --git a/src/broadcom/cle/v3dx_pack.h b/src/broadcom/cle/v3dx_pack.h
index 5762e5aaa70..0062ddbd516 100644
--- a/src/broadcom/cle/v3dx_pack.h
+++ b/src/broadcom/cle/v3dx_pack.h
@@ -31,12 +31,10 @@
#if (V3D_VERSION == 21)
# include "cle/v3d_packet_v21_pack.h"
-#elif (V3D_VERSION == 33)
-# include "cle/v3d_packet_v33_pack.h"
-#elif (V3D_VERSION == 41)
-# include "cle/v3d_packet_v41_pack.h"
#elif (V3D_VERSION == 42)
# include "cle/v3d_packet_v42_pack.h"
+#elif (V3D_VERSION == 71)
+# include "cle/v3d_packet_v71_pack.h"
#else
# error "Need to add a pack header include for this v3d version"
#endif
diff --git a/src/broadcom/cle/v3d_packet_v21.xml b/src/broadcom/cle/vc4_packet.xml
index df838a70845..df838a70845 100644
--- a/src/broadcom/cle/v3d_packet_v21.xml
+++ b/src/broadcom/cle/vc4_packet.xml
diff --git a/src/broadcom/clif/clif_dump.c b/src/broadcom/clif/clif_dump.c
index 0aaa6b6ad8b..db94edba113 100644
--- a/src/broadcom/clif/clif_dump.c
+++ b/src/broadcom/clif/clif_dump.c
@@ -106,12 +106,16 @@ static bool
clif_dump_packet(struct clif_dump *clif, uint32_t offset, const uint8_t *cl,
uint32_t *size, bool reloc_mode)
{
- if (clif->devinfo->ver >= 42)
+
+ switch (clif->devinfo->ver) {
+ case 42:
return v3d42_clif_dump_packet(clif, offset, cl, size, reloc_mode);
- else if (clif->devinfo->ver >= 41)
- return v3d41_clif_dump_packet(clif, offset, cl, size, reloc_mode);
- else
- return v3d33_clif_dump_packet(clif, offset, cl, size, reloc_mode);
+ case 71:
+ return v3d71_clif_dump_packet(clif, offset, cl, size, reloc_mode);
+ default:
+ break;
+ };
+ unreachable("Unknown HW version");
}
static uint32_t
@@ -160,7 +164,8 @@ clif_dump_cl(struct clif_dump *clif, uint32_t start, uint32_t end,
static uint32_t
clif_dump_gl_shader_state_record(struct clif_dump *clif,
struct reloc_worklist_entry *reloc,
- void *vaddr)
+ void *vaddr,
+ bool including_gs)
{
struct v3d_group *state = v3d_spec_find_struct(clif->spec,
"GL Shader State Record");
@@ -170,6 +175,16 @@ clif_dump_gl_shader_state_record(struct clif_dump *clif,
assert(attr);
uint32_t offset = 0;
+ if (including_gs) {
+ struct v3d_group *gs_state = v3d_spec_find_struct(clif->spec,
+ "Geometry Shader State Record");
+ assert(gs_state);
+ out(clif, "@format shadrec_gl_geom\n");
+ v3d_print_group(clif, gs_state, 0, vaddr + offset);
+ offset += v3d_group_get_length(gs_state);
+ /* Extra pad when geometry/tessellation shader is present */
+ offset += 20;
+ }
out(clif, "@format shadrec_gl_main\n");
v3d_print_group(clif, state, 0, vaddr + offset);
offset += v3d_group_get_length(state);
@@ -201,6 +216,7 @@ clif_process_worklist(struct clif_dump *clif)
break;
case reloc_gl_shader_state:
+ case reloc_gl_including_gs_shader_state:
break;
case reloc_generic_tile_list:
clif_dump_cl(clif, reloc->addr,
@@ -336,10 +352,12 @@ clif_dump_buffers(struct clif_dump *clif)
break;
case reloc_gl_shader_state:
+ case reloc_gl_including_gs_shader_state:
offset += clif_dump_gl_shader_state_record(clif,
reloc,
bo->vaddr +
- offset);
+ offset,
+ reloc->type == reloc_gl_including_gs_shader_state);
break;
case reloc_generic_tile_list:
offset = clif_dump_cl(clif, reloc->addr,
diff --git a/src/broadcom/clif/clif_private.h b/src/broadcom/clif/clif_private.h
index d96bfd12de9..d4e55e03730 100644
--- a/src/broadcom/clif/clif_private.h
+++ b/src/broadcom/clif/clif_private.h
@@ -64,6 +64,7 @@ struct clif_dump {
enum reloc_worklist_type {
reloc_cl,
reloc_gl_shader_state,
+ reloc_gl_including_gs_shader_state,
reloc_generic_tile_list,
};
@@ -94,12 +95,10 @@ clif_dump_add_address_to_worklist(struct clif_dump *clif,
enum reloc_worklist_type type,
uint32_t addr);
-bool v3d33_clif_dump_packet(struct clif_dump *clif, uint32_t offset,
- const uint8_t *cl, uint32_t *size, bool reloc_mode);
-bool v3d41_clif_dump_packet(struct clif_dump *clif, uint32_t offset,
- const uint8_t *cl, uint32_t *size, bool reloc_mode);
bool v3d42_clif_dump_packet(struct clif_dump *clif, uint32_t offset,
const uint8_t *cl, uint32_t *size, bool reloc_mode);
+bool v3d71_clif_dump_packet(struct clif_dump *clif, uint32_t offset,
+ const uint8_t *cl, uint32_t *size, bool reloc_mode);
static inline void
out(struct clif_dump *clif, const char *fmt, ...)
diff --git a/src/broadcom/clif/v3dx_dump.c b/src/broadcom/clif/v3dx_dump.c
index 9cf59f88920..454478531ff 100644
--- a/src/broadcom/clif/v3dx_dump.c
+++ b/src/broadcom/clif/v3dx_dump.c
@@ -94,6 +94,25 @@ v3dX(clif_dump_packet)(struct clif_dump *clif, uint32_t offset,
return true;
}
+#if V3D_VERSION >= 41
+ case V3DX(GL_SHADER_STATE_INCLUDING_GS_opcode): {
+ struct V3DX(GL_SHADER_STATE_INCLUDING_GS) values;
+ V3DX(GL_SHADER_STATE_INCLUDING_GS_unpack)(cl, &values);
+
+ if (reloc_mode) {
+ struct reloc_worklist_entry *reloc =
+ clif_dump_add_address_to_worklist(clif,
+ reloc_gl_including_gs_shader_state,
+ values.address);
+ if (reloc) {
+ reloc->shader_state.num_attrs =
+ values.number_of_attribute_arrays;
+ }
+ }
+ return true;
+ }
+#endif /* V3D_VERSION >= 41 */
+
#if V3D_VERSION < 40
case V3DX(STORE_MULTI_SAMPLE_RESOLVED_TILE_COLOR_BUFFER_EXTENDED_opcode): {
struct V3DX(STORE_MULTI_SAMPLE_RESOLVED_TILE_COLOR_BUFFER_EXTENDED) values;
diff --git a/src/broadcom/common/v3d_cpu_tiling.h b/src/broadcom/common/v3d_cpu_tiling.h
index cb1ee7c96f4..4cfd98f961b 100644
--- a/src/broadcom/common/v3d_cpu_tiling.h
+++ b/src/broadcom/common/v3d_cpu_tiling.h
@@ -31,7 +31,7 @@ static inline void
v3d_load_utile(void *cpu, uint32_t cpu_stride,
void *gpu, uint32_t gpu_stride)
{
-#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
+#if defined(V3D_BUILD_NEON) && DETECT_ARCH_ARM
if (gpu_stride == 8) {
__asm__ volatile (
/* Load from the GPU in one shot, no interleave, to
@@ -80,7 +80,7 @@ v3d_load_utile(void *cpu, uint32_t cpu_stride,
: "q0", "q1", "q2", "q3");
return;
}
-#elif defined (PIPE_ARCH_AARCH64)
+#elif DETECT_ARCH_AARCH64
if (gpu_stride == 8) {
__asm__ volatile (
/* Load from the GPU in one shot, no interleave, to
@@ -141,7 +141,7 @@ static inline void
v3d_store_utile(void *gpu, uint32_t gpu_stride,
void *cpu, uint32_t cpu_stride)
{
-#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
+#if defined(V3D_BUILD_NEON) && DETECT_ARCH_ARM
if (gpu_stride == 8) {
__asm__ volatile (
/* Load each 8-byte line from cpu-side source,
@@ -188,7 +188,7 @@ v3d_store_utile(void *gpu, uint32_t gpu_stride,
: "q0", "q1", "q2", "q3");
return;
}
-#elif defined (PIPE_ARCH_AARCH64)
+#elif DETECT_ARCH_AARCH64
if (gpu_stride == 8) {
__asm__ volatile (
/* Load each 8-byte line from cpu-side source,
diff --git a/src/broadcom/vulkan/v3dv_util.c b/src/broadcom/common/v3d_csd.h
index d26369f9f56..dc1bd11efc5 100644
--- a/src/broadcom/vulkan/v3dv_util.c
+++ b/src/broadcom/common/v3d_csd.h
@@ -1,12 +1,5 @@
/*
- * Copyright © 2019 Raspberry Pi
- *
- * based in part on anv driver which is:
- * Copyright © 2015 Intel Corporation
- *
- * based in part on radv driver which is:
- * Copyright © 2016 Red Hat.
- * Copyright © 2016 Bas Nieuwenhuizen
+ * Copyright © 2023 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -28,38 +21,23 @@
* IN THE SOFTWARE.
*/
-#include <stdarg.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <errno.h>
-#include <assert.h>
-
-#include "vk_enum_to_str.h"
-#include "v3dv_private.h"
-
-VkResult
-__vk_errorf(struct v3dv_instance *instance, VkResult error, const char *file,
- int line, const char *format, ...)
-{
- va_list ap;
- char buffer[256];
+#ifndef V3D_CSD_H
+#define V3D_CSD_H
+
+#define V3D_CSD_CFG012_WG_COUNT_SHIFT 16
+#define V3D_CSD_CFG012_WG_OFFSET_SHIFT 0
+/* Allow this dispatch to start while the last one is still running. */
+#define V3D_CSD_CFG3_OVERLAP_WITH_PREV (1 << 26)
+/* Maximum supergroup ID. 6 bits. */
+#define V3D_CSD_CFG3_MAX_SG_ID_SHIFT 20
+/* Batches per supergroup minus 1. 8 bits. */
+#define V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT 12
+/* Workgroups per supergroup, 0 means 16 */
+#define V3D_CSD_CFG3_WGS_PER_SG_SHIFT 8
+#define V3D_CSD_CFG3_WG_SIZE_SHIFT 0
+
+#define V3D_CSD_CFG5_PROPAGATE_NANS (1 << 2)
+#define V3D_CSD_CFG5_SINGLE_SEG (1 << 1)
+#define V3D_CSD_CFG5_THREADING (1 << 0)
-#ifndef DEBUG
- return error;
#endif
-
- const char *error_str = vk_Result_to_str(error);
-
- if (format) {
- va_start(ap, format);
- vsnprintf(buffer, sizeof(buffer), format, ap);
- va_end(ap);
-
- fprintf(stderr, "%s:%d: %s (%s)\n", file, line, buffer, error_str);
- } else {
- fprintf(stderr, "%s:%d: %s\n", file, line, error_str);
- }
-
- return error;
-}
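
A minimal sketch (hypothetical helper, placeholder arguments) of how the V3D_CSD_CFG3_* shifts above combine into the compute dispatch CFG3 word; real drivers derive these values from the compiled shader and the dispatch size (see v3d_csd_choose_workgroups_per_supergroup):

#include <stdint.h>
#include "broadcom/common/v3d_csd.h"

static uint32_t
build_csd_cfg3(uint32_t max_sg_id, uint32_t batches_per_sg,
               uint32_t wgs_per_sg, uint32_t wg_size)
{
        return (max_sg_id << V3D_CSD_CFG3_MAX_SG_ID_SHIFT) |
               ((batches_per_sg - 1) << V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT) |
               (wgs_per_sg << V3D_CSD_CFG3_WGS_PER_SG_SHIFT) | /* 0 means 16 */
               (wg_size << V3D_CSD_CFG3_WG_SIZE_SHIFT);
}
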
diff --git a/src/broadcom/common/v3d_debug.c b/src/broadcom/common/v3d_debug.c
index 508a2b7c74c..b6b32bc72ad 100644
--- a/src/broadcom/common/v3d_debug.c
+++ b/src/broadcom/common/v3d_debug.c
@@ -37,13 +37,13 @@
#include "util/u_debug.h"
#include "c11/threads.h"
-uint32_t V3D_DEBUG = 0;
+uint32_t v3d_mesa_debug = 0;
static const struct debug_named_value debug_control[] = {
{ "cl", V3D_DEBUG_CL,
"Dump command list during creation" },
{ "cl_nobin", V3D_DEBUG_CL_NO_BIN,
- "Dump command listduring creation, excluding binary resources" },
+ "Dump command list during creation, excluding binary resources" },
{ "clif", V3D_DEBUG_CLIF,
"Dump command list (CLIF format) during creation", },
{ "qpu", V3D_DEBUG_QPU,
@@ -53,15 +53,21 @@ static const struct debug_named_value debug_control[] = {
{ "nir", V3D_DEBUG_NIR,
"Dump NIR during program compile" },
{ "tgsi", V3D_DEBUG_TGSI,
- "Dump TGSI during program compile" },
+ "Dump TGSI during program compile (v3d only)" },
+ /* `shaderdb` is *not* used by shader-db, but is here so that any other
+ * game/app can dump its stats in the shader-db format, allowing them
+ * to be compared using shader-db's report.py tool.
+ */
{ "shaderdb", V3D_DEBUG_SHADERDB,
"Dump program compile information for shader-db analysis" },
{ "surface", V3D_DEBUG_SURFACE,
- "Print resource layout information" },
+ /* FIXME: evaluate whether to implement it on v3dv */
+ "Print resource layout information (v3d only)" },
{ "perf", V3D_DEBUG_PERF,
- "Print during runtime performance-related events" },
+ "Print performance-related events during runtime" },
{ "norast", V3D_DEBUG_NORAST,
- "Skip actual hardware execution of commands" },
+ /* FIXME: evaluate whether to implement it on v3dv */
+ "Skip actual hardware execution of commands (v3d only)" },
{ "fs", V3D_DEBUG_FS,
"Dump fragment shaders" },
{ "gs", V3D_DEBUG_GS,
@@ -73,11 +79,11 @@ static const struct debug_named_value debug_control[] = {
{ "always_flush", V3D_DEBUG_ALWAYS_FLUSH,
"Flush after each draw call" },
{ "precompile", V3D_DEBUG_PRECOMPILE,
- "Precompiles shader variant at shader state creation time" },
+ "Precompiles shader variant at shader state creation time (v3d only)" },
{ "ra", V3D_DEBUG_RA,
"Dump register allocation failures" },
{ "dump_spirv", V3D_DEBUG_DUMP_SPIRV,
- "Dump SPIR-V code" },
+ "Dump SPIR-V code (v3dv only)" },
{ "tmu32", V3D_DEBUG_TMU_32BIT,
"Force 32-bit precision on all TMU operations" },
/* This can lead to incorrect behavior for applications that do
@@ -88,12 +94,25 @@ static const struct debug_named_value debug_control[] = {
"Force 16-bit precision on all TMU operations" },
{ "noloopunroll", V3D_DEBUG_NO_LOOP_UNROLL,
"Disable loop unrolling" },
- { NULL }
+ { "db", V3D_DEBUG_DOUBLE_BUFFER,
+ "Enable double buffer for Tile Buffer when MSAA is disabled" },
+#ifdef ENABLE_SHADER_CACHE
+ { "cache", V3D_DEBUG_CACHE,
+ "Print on-disk cache events (only with cache enabled)" },
+#endif
+ { "no_merge_jobs", V3D_DEBUG_NO_MERGE_JOBS,
+ "Don't try to merge subpasses in the same job even if they share framebuffer configuration (v3dv only)" },
+ { "opt_compile_time", V3D_DEBUG_OPT_COMPILE_TIME,
+ "Don't try to reduce shader spilling, might improve compile times with expensive shaders." },
+ /* disable_tfu is v3dv only because v3d has some uses of the TFU without alternative codepaths */
+ { "disable_tfu", V3D_DEBUG_DISABLE_TFU,
+ "Disable TFU (v3dv only)" },
+ DEBUG_NAMED_VALUE_END
};
DEBUG_GET_ONCE_FLAGS_OPTION(v3d_debug, "V3D_DEBUG", debug_control, 0)
-uint32_t
+bool
v3d_debug_flag_for_shader_stage(gl_shader_stage stage)
{
uint32_t flags[] = {
@@ -105,14 +124,11 @@ v3d_debug_flag_for_shader_stage(gl_shader_stage stage)
[MESA_SHADER_COMPUTE] = V3D_DEBUG_CS,
};
STATIC_ASSERT(MESA_SHADER_STAGES == 6);
- return flags[stage];
+ return v3d_mesa_debug & flags[stage];
}
void
v3d_process_debug_variable(void)
{
- V3D_DEBUG = debug_get_option_v3d_debug();
-
- if (V3D_DEBUG & V3D_DEBUG_SHADERDB)
- V3D_DEBUG |= V3D_DEBUG_NORAST;
+ v3d_mesa_debug = debug_get_option_v3d_debug();
}
diff --git a/src/broadcom/common/v3d_debug.h b/src/broadcom/common/v3d_debug.h
index b5278c4c759..67112ebf361 100644
--- a/src/broadcom/common/v3d_debug.h
+++ b/src/broadcom/common/v3d_debug.h
@@ -39,7 +39,9 @@ extern "C" {
* list of debugging flags, as well as some macros for handling them.
*/
-extern uint32_t V3D_DEBUG;
+extern uint32_t v3d_mesa_debug;
+
+#define V3D_DBG(flag) unlikely(v3d_mesa_debug & V3D_DEBUG_ ## flag)
#define V3D_DEBUG_SHADERDB (1 << 0)
#define V3D_DEBUG_TGSI (1 << 1)
@@ -63,6 +65,11 @@ extern uint32_t V3D_DEBUG;
#define V3D_DEBUG_TMU_16BIT (1 << 19)
#define V3D_DEBUG_NO_LOOP_UNROLL (1 << 20)
#define V3D_DEBUG_CL_NO_BIN (1 << 21)
+#define V3D_DEBUG_DOUBLE_BUFFER (1 << 22)
+#define V3D_DEBUG_CACHE (1 << 23)
+#define V3D_DEBUG_NO_MERGE_JOBS (1 << 24)
+#define V3D_DEBUG_OPT_COMPILE_TIME (1 << 25)
+#define V3D_DEBUG_DISABLE_TFU (1 << 26)
#define V3D_DEBUG_SHADERS (V3D_DEBUG_TGSI | V3D_DEBUG_NIR | \
V3D_DEBUG_VIR | V3D_DEBUG_QPU | \
@@ -85,12 +92,7 @@ extern uint32_t V3D_DEBUG;
#define dbg_printf(...) fprintf(stderr, __VA_ARGS__)
#endif /* HAVE_ANDROID_PLATFORM */
-#define DBG(flag, ...) do { \
- if (unlikely(V3D_DEBUG & (flag))) \
- dbg_printf(__VA_ARGS__); \
-} while(0)
-
-extern uint32_t v3d_debug_flag_for_shader_stage(gl_shader_stage stage);
+extern bool v3d_debug_flag_for_shader_stage(gl_shader_stage stage);
extern void v3d_process_debug_variable(void);
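
A minimal usage sketch (hypothetical caller) for the new V3D_DBG() macro, which replaces the old DBG() helper by testing the renamed v3d_mesa_debug variable directly:

#include <stdio.h>
#include "broadcom/common/v3d_debug.h"

static void
log_perf_event(const char *msg)
{
        /* V3D_DBG(PERF) expands to unlikely(v3d_mesa_debug & V3D_DEBUG_PERF) */
        if (V3D_DBG(PERF))
                fprintf(stderr, "perf: %s\n", msg);
}
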
diff --git a/src/broadcom/common/v3d_device_info.c b/src/broadcom/common/v3d_device_info.c
index 272190eb2e5..fa85a7d5077 100644
--- a/src/broadcom/common/v3d_device_info.c
+++ b/src/broadcom/common/v3d_device_info.c
@@ -36,6 +36,9 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i
struct drm_v3d_get_param ident1 = {
.param = DRM_V3D_PARAM_V3D_CORE0_IDENT1,
};
+ struct drm_v3d_get_param hub_ident3 = {
+ .param = DRM_V3D_PARAM_V3D_HUB_IDENT3,
+ };
int ret;
ret = drm_ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &ident0);
@@ -62,10 +65,11 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i
int qups = (ident1.value >> 8) & 0xf;
devinfo->qpu_count = nslc * qups;
+ devinfo->has_accumulators = devinfo->ver < 71;
+
switch (devinfo->ver) {
- case 33:
- case 41:
case 42:
+ case 71:
break;
default:
fprintf(stderr,
@@ -75,5 +79,14 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i
return false;
}
- return true;
+ ret = drm_ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &hub_ident3);
+ if (ret != 0) {
+ fprintf(stderr, "Couldn't get V3D core HUB IDENT3: %s\n",
+ strerror(errno));
+ return false;
+ }
+
+ devinfo->rev = (hub_ident3.value >> 8) & 0xff;
+
+ return true;
}
diff --git a/src/broadcom/common/v3d_device_info.h b/src/broadcom/common/v3d_device_info.h
index 97abd9b8d9f..8dfc7858727 100644
--- a/src/broadcom/common/v3d_device_info.h
+++ b/src/broadcom/common/v3d_device_info.h
@@ -34,11 +34,17 @@ struct v3d_device_info {
/** Simple V3D version: major * 10 + minor */
uint8_t ver;
+ /** V3D revision number */
+ uint8_t rev;
+
/** Size of the VPM, in bytes. */
int vpm_size;
/* NSLC * QUPS from the core's IDENT registers. */
int qpu_count;
+
+ /* If the hw has accumulator registers */
+ bool has_accumulators;
};
typedef int (*v3d_ioctl_fun)(int fd, unsigned long request, void *arg);
diff --git a/src/broadcom/common/v3d_limits.h b/src/broadcom/common/v3d_limits.h
index 129e53e29a4..354c8784914 100644
--- a/src/broadcom/common/v3d_limits.h
+++ b/src/broadcom/common/v3d_limits.h
@@ -24,6 +24,8 @@
#ifndef V3D_LIMITS_H
#define V3D_LIMITS_H
+#define V3D_CL_MAX_INSTR_SIZE 25
+
/* Number of channels a QPU thread executes in parallel. Also known as
* gl_SubGroupSizeARB.
*/
@@ -36,32 +38,35 @@
V3D_MAX_GS_INPUTS, \
V3D_MAX_FS_INPUTS)
-/* For now we need to maintain a different limits for OpenGL and Vulkan due
- * some OpenGL CTS tests hitting register allocation when trying to use all
- * the texture available.
- *
- * FIXME: nir_schedule should be able to handle that. When fixed it would be
- * simpler to keep just one limit
- */
-#define V3D_VULKAN_MAX_TEXTURE_SAMPLERS 24
-#define V3D_OPENGL_MAX_TEXTURE_SAMPLERS 16
-
-/* Not specifically a hardware limit, just coordination between compiler and
- * driver.
- */
-#define V3D_MAX_TEXTURE_SAMPLERS MAX2(V3D_VULKAN_MAX_TEXTURE_SAMPLERS, \
- V3D_OPENGL_MAX_TEXTURE_SAMPLERS)
-
-/* The HW can do 16384 (15), but we run into hangs when we expose that. */
-#define V3D_MAX_MIP_LEVELS 13
+#define V3D_MAX_TEXTURE_SAMPLERS 24
#define V3D_MAX_SAMPLES 4
-#define V3D_MAX_DRAW_BUFFERS 4
+#define V3D_MAX_DRAW_BUFFERS 8
+#define V3D_MAX_RENDER_TARGETS(ver) (ver < 71 ? 4 : 8)
#define V3D_MAX_POINT_SIZE 512.0f
#define V3D_MAX_LINE_WIDTH 32
-#define V3D_MAX_BUFFER_RANGE (1 << 27)
+#define V3D_MAX_BUFFER_RANGE (1 << 30)
+
+/* Sub-pixel precision bits in the rasterizer */
+#define V3D_COORD_SHIFT 6
+
+/* Size of a cache line */
+#define V3D_NON_COHERENT_ATOM_SIZE 256
+
+/* Minimum alignment for texel buffers */
+#define V3D_TMU_TEXEL_ALIGN 64
+
+#define V3D_MAX_IMAGE_DIMENSION 4096
+
+/* The HW can do 16384 (15), but we run into hangs when we expose that. Also,
+ * since we are only exposing images up to 4096 pixels per dimension 13 is
+ * all we need.
+ */
+#define V3D_MAX_MIP_LEVELS 13
+
+#define V3D_MAX_ARRAY_LAYERS 2048
#endif /* V3D_LIMITS_H */
diff --git a/src/broadcom/common/v3d_macros.h b/src/broadcom/common/v3d_macros.h
index fe89398208a..4ab66f647ab 100644
--- a/src/broadcom/common/v3d_macros.h
+++ b/src/broadcom/common/v3d_macros.h
@@ -32,15 +32,12 @@
#if (V3D_VERSION == 21)
# define V3DX(x) V3D21_##x
# define v3dX(x) v3d21_##x
-#elif (V3D_VERSION == 33)
-# define V3DX(x) V3D33_##x
-# define v3dX(x) v3d33_##x
-#elif (V3D_VERSION == 41)
-# define V3DX(x) V3D41_##x
-# define v3dX(x) v3d41_##x
#elif (V3D_VERSION == 42)
# define V3DX(x) V3D42_##x
# define v3dX(x) v3d42_##x
+#elif (V3D_VERSION == 71)
+# define V3DX(x) V3D71_##x
+# define v3dX(x) v3d71_##x
#else
# error "Need to add prefixing macros for this v3d version"
#endif
diff --git a/src/broadcom/common/v3d_performance_counters.h b/src/broadcom/common/v3d_performance_counters.h
new file mode 100644
index 00000000000..33e3e0e78db
--- /dev/null
+++ b/src/broadcom/common/v3d_performance_counters.h
@@ -0,0 +1,229 @@
+/*
+ * Copyright © 2023 Raspberry Pi Ltd
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef V3D_PERFORMANCE_COUNTERS_H
+#define V3D_PERFORMANCE_COUNTERS_H
+
+#define V3D_PERFCNT_CATEGORY 0
+#define V3D_PERFCNT_NAME 1
+#define V3D_PERFCNT_DESCRIPTION 2
+
+#ifndef V3D_VERSION
+# error "The V3D_VERSION macro must be defined"
+#endif
+
+#if (V3D_VERSION >= 71)
+
+static const char *v3d_performance_counters[][3] = {
+ {"CORE", "cycle-count", "[CORE] Cycle counter"},
+ {"CORE", "core-active", "[CORE] Bin/Render/Compute active cycles"},
+ {"CLE", "CLE-bin-thread-active-cycles", "[CLE] Bin thread active cycles"},
+ {"CLE", "CLE-render-thread-active-cycles", "[CLE] Render thread active cycles"},
+ {"CORE", "compute-active-cycles", "[CORE] Compute active cycles"},
+ {"FEP", "FEP-valid-primitives-no-rendered-pixels", "[FEP] Valid primitives that result in no rendered pixels, for all rendered tiles"},
+ {"FEP", "FEP-valid-primitives-rendered-pixels", "[FEP] Valid primitives for all rendered tiles (primitives may be counted in more than one tile)"},
+ {"FEP", "FEP-clipped-quads", "[FEP] Early-Z/Near/Far clipped quads"},
+ {"FEP", "FEP-valid-quads", "[FEP] Valid quads"},
+ {"TLB", "TLB-quads-not-passing-stencil-test", "[TLB] Quads with no pixels passing the stencil test"},
+ {"TLB", "TLB-quads-not-passing-z-and-stencil-test", "[TLB] Quads with no pixels passing the Z and stencil tests"},
+ {"TLB", "TLB-quads-passing-z-and-stencil-test", "[TLB] Quads with any pixels passing the Z and stencil tests"},
+ {"TLB", "TLB-quads-written-to-color-buffer", "[TLB] Quads with valid pixels written to colour buffer"},
+ {"TLB", "TLB-partial-quads-written-to-color-buffer", "[TLB] Partial quads written to the colour buffer"},
+ {"PTB", "PTB-primitives-need-clipping", "[PTB] Primitives that need clipping"},
+ {"PTB", "PTB-primitives-discarded-outside-viewport", "[PTB] Primitives discarded by being outside the viewport"},
+ {"PTB", "PTB-primitives-binned", "[PTB] Total primitives binned"},
+ {"PTB", "PTB-primitives-discarded-reversed", "[PTB] Primitives that are discarded because they are reversed"},
+ {"QPU", "QPU-total-instr-cache-hit", "[QPU] Total instruction cache hits for all slices"},
+ {"QPU", "QPU-total-instr-cache-miss", "[QPU] Total instruction cache misses for all slices"},
+ {"QPU", "QPU-total-uniform-cache-hit", "[QPU] Total uniforms cache hits for all slices"},
+ {"QPU", "QPU-total-uniform-cache-miss", "[QPU] Total uniforms cache misses for all slices"},
+ {"TMU", "TMU-active-cycles", "[TMU] Active cycles"},
+ {"TMU", "TMU-stalled-cycles", "[TMU] Stalled cycles"},
+ {"TMU", "TMU-total-text-quads-access", "[TMU] Total texture cache accesses"},
+ {"TMU", "TMU-cache-x4-active-cycles", "[TMU] Cache active cycles for x4 access"},
+ {"TMU", "TMU-cache-x4-stalled-cycles", "[TMU] Cache stalled cycles for x4 access"},
+ {"TMU", "TMU-total-text-quads-x4-access", "[TMU] Total texture cache x4 access"},
+ {"L2T", "L2T-total-cache-hit", "[L2T] Total Level 2 cache hits"},
+ {"L2T", "L2T-total-cache-miss", "[L2T] Total Level 2 cache misses"},
+ {"L2T", "L2T-local", "[L2T] Local mode access"},
+ {"L2T", "L2T-writeback", "[L2T] Writeback"},
+ {"L2T", "L2T-zero", "[L2T] Zero"},
+ {"L2T", "L2T-merge", "[L2T] Merge"},
+ {"L2T", "L2T-fill", "[L2T] Fill"},
+ {"L2T", "L2T-stalls-no-wid", "[L2T] Stalls because no WID available"},
+ {"L2T", "L2T-stalls-no-rid", "[L2T] Stalls because no RID available"},
+ {"L2T", "L2T-stalls-queue-full", "[L2T] Stalls because internal queue full"},
+ {"L2T", "L2T-stalls-wrightback", "[L2T] Stalls because writeback in flight"},
+ {"L2T", "L2T-stalls-mem", "[L2T] Stalls because AXI blocks read"},
+ {"L2T", "L2T-stalls-fill", "[L2T] Stalls because fill pending for victim cache-line"},
+ {"L2T", "L2T-hitq", "[L2T] Sent request via hit queue"},
+ {"L2T", "L2T-hitq-full", "[L2T] Sent request via main queue because hit queue is full"},
+ {"L2T", "L2T-stalls-read-data", "[L2T] Stalls because waiting for data from SDRAM"},
+ {"L2T", "L2T-TMU-read-hits", "[L2T] TMU read hits"},
+ {"L2T", "L2T-TMU-read-miss", "[L2T] TMU read misses"},
+ {"L2T", "L2T-VCD-read-hits", "[L2T] VCD read hits"},
+ {"L2T", "L2T-VCD-read-miss", "[L2T] VCD read misses"},
+ {"L2T", "L2T-SLC-read-hits", "[L2T] SLC read hits (all slices)"},
+ {"L2T", "L2T-SLC-read-miss", "[L2T] SLC read misses (all slices)"},
+ {"AXI", "AXI-writes-seen-watch-0", "[AXI] Writes seen by watch 0"},
+ {"AXI", "AXI-reads-seen-watch-0", "[AXI] Reads seen by watch 0"},
+ {"AXI", "AXI-writes-stalled-seen-watch-0", "[AXI] Write stalls seen by watch 0"},
+ {"AXI", "AXI-reads-stalled-seen-watch-0", "[AXI] Read stalls seen by watch 0"},
+ {"AXI", "AXI-write-bytes-seen-watch-0", "[AXI] Total bytes written seen by watch 0"},
+ {"AXI", "AXI-read-bytes-seen-watch-0", "[AXI] Total bytes read seen by watch 0"},
+ {"AXI", "AXI-writes-seen-watch-1", "[AXI] Writes seen by watch 1"},
+ {"AXI", "AXI-reads-seen-watch-1", "[AXI] Reads seen by watch 1"},
+ {"AXI", "AXI-writes-stalled-seen-watch-1", "[AXI] Write stalls seen by watch 1"},
+ {"AXI", "AXI-reads-stalled-seen-watch-1", "[AXI] Read stalls seen by watch 1"},
+ {"AXI", "AXI-write-bytes-seen-watch-1", "[AXI] Total bytes written seen by watch 1"},
+ {"AXI", "AXI-read-bytes-seen-watch-1", "[AXI] Total bytes read seen by watch 1"},
+ {"CORE", "core-memory-writes", "[CORE] Total memory writes"},
+ {"L2T", "L2T-memory-writes", "[L2T] Total memory writes"},
+ {"PTB", "PTB-memory-writes", "[PTB] Total memory writes"},
+ {"TLB", "TLB-memory-writes", "[TLB] Total memory writes"},
+ {"CORE", "core-memory-reads", "[CORE] Total memory reads"},
+ {"L2T", "L2T-memory-reads", "[L2T] Total memory reads"},
+ {"PTB", "PTB-memory-reads", "[PTB] Total memory reads"},
+ {"PSE", "PSE-memory-reads", "[PSE] Total memory reads"},
+ {"TLB", "TLB-memory-reads", "[TLB] Total memory reads"},
+ {"PTB", "PTB-memory-words-writes", "[PTB] Total memory words written"},
+ {"TLB", "TLB-memory-words-writes", "[TLB] Total memory words written"},
+ {"PSE", "PSE-memory-words-reads", "[PSE] Total memory words read"},
+ {"TLB", "TLB-memory-words-reads", "[TLB] Total memory words read"},
+ {"AXI", "AXI-read-trans", "[AXI] Read transaction count"},
+ {"AXI", "AXI-write-trans", "[AXI] Write transaction count"},
+ {"AXI", "AXI-read-wait-cycles", "[AXI] Read total wait cycles"},
+ {"AXI", "AXI-write-wait-cycles", "[AXI] Write total wait cycles"},
+ {"AXI", "AXI-max-outstanding-reads", "[AXI] Maximium outstanding read transactions"},
+ {"AXI", "AXI-max-outstanding-writes", "[AXI] Maximum outstanding write transactions"},
+ {"QPU", "QPU-wait-bubble", "[QPU] Pipeline bubble in qcycles due all threads waiting"},
+ {"QPU", "QPU-ic-miss-bubble", "[QPU] Pipeline bubble in qcycles due instruction-cache miss"},
+ {"QPU", "QPU-active", "[QPU] Executed shader instruction"},
+ {"QPU", "QPU-total-active-clk-cycles-fragment-shading", "[QPU] Total active clock cycles for all QPUs doing fragment shading (counts only when QPU is not stalled)"},
+ {"QPU", "QPU-stalls", "[QPU] Stalled qcycles executing shader instruction"},
+ {"QPU", "QPU-total-clk-cycles-waiting-fragment-shading", "[QPU] Total stalled clock cycles for all QPUs doing fragment shading"},
+ {"QPU", "QPU-stalls-TMU", "[QPU] Stalled qcycles waiting for TMU"},
+ {"QPU", "QPU-stalls-TLB", "[QPU] Stalled qcycles waiting for TLB"},
+ {"QPU", "QPU-stalls-VPM", "[QPU] Stalled qcycles waiting for VPM"},
+ {"QPU", "QPU-stalls-uniforms", "[QPU] Stalled qcycles waiting for uniforms"},
+ {"QPU", "QPU-stalls-SFU", "[QPU] Stalled qcycles waiting for SFU"},
+ {"QPU", "QPU-stalls-other", "[QPU] Stalled qcycles waiting for any other reason (vary/W/Z)"},
+};
+
+#elif (V3D_VERSION >= 42)
+
+static const char *v3d_performance_counters[][3] = {
+ {"FEP", "FEP-valid-primitives-no-rendered-pixels", "[FEP] Valid primitives that result in no rendered pixels, for all rendered tiles"},
+ {"FEP", "FEP-valid-primitives-rendered-pixels", "[FEP] Valid primitives for all rendered tiles (primitives may be counted in more than one tile)"},
+ {"FEP", "FEP-clipped-quads", "[FEP] Early-Z/Near/Far clipped quads"},
+ {"FEP", "FEP-valid-quads", "[FEP] Valid quads"},
+ {"TLB", "TLB-quads-not-passing-stencil-test", "[TLB] Quads with no pixels passing the stencil test"},
+ {"TLB", "TLB-quads-not-passing-z-and-stencil-test", "[TLB] Quads with no pixels passing the Z and stencil tests"},
+ {"TLB", "TLB-quads-passing-z-and-stencil-test", "[TLB] Quads with any pixels passing the Z and stencil tests"},
+ {"TLB", "TLB-quads-with-zero-coverage", "[TLB] Quads with all pixels having zero coverage"},
+ {"TLB", "TLB-quads-with-non-zero-coverage", "[TLB] Quads with any pixels having non-zero coverage"},
+ {"TLB", "TLB-quads-written-to-color-buffer", "[TLB] Quads with valid pixels written to colour buffer"},
+ {"PTB", "PTB-primitives-discarded-outside-viewport", "[PTB] Primitives discarded by being outside the viewport"},
+ {"PTB", "PTB-primitives-need-clipping", "[PTB] Primitives that need clipping"},
+ {"PTB", "PTB-primitives-discarded-reversed", "[PTB] Primitives that are discarded because they are reversed"},
+ {"QPU", "QPU-total-idle-clk-cycles", "[QPU] Total idle clock cycles for all QPUs"},
+ {"QPU", "QPU-total-active-clk-cycles-vertex-coord-shading", "[QPU] Total active clock cycles for all QPUs doing vertex/coordinate/user shading (counts only when QPU is not stalled)"},
+ {"QPU", "QPU-total-active-clk-cycles-fragment-shading", "[QPU] Total active clock cycles for all QPUs doing fragment shading (counts only when QPU is not stalled)"},
+ {"QPU", "QPU-total-clk-cycles-executing-valid-instr", "[QPU] Total clock cycles for all QPUs executing valid instructions"},
+ {"QPU", "QPU-total-clk-cycles-waiting-TMU", "[QPU] Total clock cycles for all QPUs stalled waiting for TMUs only (counter won't increment if QPU also stalling for another reason)"},
+ {"QPU", "QPU-total-clk-cycles-waiting-scoreboard", "[QPU] Total clock cycles for all QPUs stalled waiting for Scoreboard only (counter won't increment if QPU also stalling for another reason)"},
+ {"QPU", "QPU-total-clk-cycles-waiting-varyings", "[QPU] Total clock cycles for all QPUs stalled waiting for Varyings only (counter won't increment if QPU also stalling for another reason)"},
+ {"QPU", "QPU-total-instr-cache-hit", "[QPU] Total instruction cache hits for all slices"},
+ {"QPU", "QPU-total-instr-cache-miss", "[QPU] Total instruction cache misses for all slices"},
+ {"QPU", "QPU-total-uniform-cache-hit", "[QPU] Total uniforms cache hits for all slices"},
+ {"QPU", "QPU-total-uniform-cache-miss", "[QPU] Total uniforms cache misses for all slices"},
+ {"TMU", "TMU-total-text-quads-access", "[TMU] Total texture cache accesses"},
+ {"TMU", "TMU-total-text-cache-miss", "[TMU] Total texture cache misses (number of fetches from memory/L2cache)"},
+ {"VPM", "VPM-total-clk-cycles-VDW-stalled", "[VPM] Total clock cycles VDW is stalled waiting for VPM access"},
+ {"VPM", "VPM-total-clk-cycles-VCD-stalled", "[VPM] Total clock cycles VCD is stalled waiting for VPM access"},
+ {"CLE", "CLE-bin-thread-active-cycles", "[CLE] Bin thread active cycles"},
+ {"CLE", "CLE-render-thread-active-cycles", "[CLE] Render thread active cycles"},
+ {"L2T", "L2T-total-cache-hit", "[L2T] Total Level 2 cache hits"},
+ {"L2T", "L2T-total-cache-miss", "[L2T] Total Level 2 cache misses"},
+ {"CORE", "cycle-count", "[CORE] Cycle counter"},
+ {"QPU", "QPU-total-clk-cycles-waiting-vertex-coord-shading", "[QPU] Total stalled clock cycles for all QPUs doing vertex/coordinate/user shading"},
+ {"QPU", "QPU-total-clk-cycles-waiting-fragment-shading", "[QPU] Total stalled clock cycles for all QPUs doing fragment shading"},
+ {"PTB", "PTB-primitives-binned", "[PTB] Total primitives binned"},
+ {"AXI", "AXI-writes-seen-watch-0", "[AXI] Writes seen by watch 0"},
+ {"AXI", "AXI-reads-seen-watch-0", "[AXI] Reads seen by watch 0"},
+ {"AXI", "AXI-writes-stalled-seen-watch-0", "[AXI] Write stalls seen by watch 0"},
+ {"AXI", "AXI-reads-stalled-seen-watch-0", "[AXI] Read stalls seen by watch 0"},
+ {"AXI", "AXI-write-bytes-seen-watch-0", "[AXI] Total bytes written seen by watch 0"},
+ {"AXI", "AXI-read-bytes-seen-watch-0", "[AXI] Total bytes read seen by watch 0"},
+ {"AXI", "AXI-writes-seen-watch-1", "[AXI] Writes seen by watch 1"},
+ {"AXI", "AXI-reads-seen-watch-1", "[AXI] Reads seen by watch 1"},
+ {"AXI", "AXI-writes-stalled-seen-watch-1", "[AXI] Write stalls seen by watch 1"},
+ {"AXI", "AXI-reads-stalled-seen-watch-1", "[AXI] Read stalls seen by watch 1"},
+ {"AXI", "AXI-write-bytes-seen-watch-1", "[AXI] Total bytes written seen by watch 1"},
+ {"AXI", "AXI-read-bytes-seen-watch-1", "[AXI] Total bytes read seen by watch 1"},
+ {"TLB", "TLB-partial-quads-written-to-color-buffer", "[TLB] Partial quads written to the colour buffer"},
+ {"TMU", "TMU-total-config-access", "[TMU] Total config accesses"},
+ {"L2T", "L2T-no-id-stalled", "[L2T] No ID stall"},
+ {"L2T", "L2T-command-queue-stalled", "[L2T] Command queue full stall"},
+ {"L2T", "L2T-TMU-writes", "[L2T] TMU write accesses"},
+ {"TMU", "TMU-active-cycles", "[TMU] Active cycles"},
+ {"TMU", "TMU-stalled-cycles", "[TMU] Stalled cycles"},
+ {"CLE", "CLE-thread-active-cycles", "[CLE] Bin or render thread active cycles"},
+ {"L2T", "L2T-TMU-reads", "[L2T] TMU read accesses"},
+ {"L2T", "L2T-CLE-reads", "[L2T] CLE read accesses"},
+ {"L2T", "L2T-VCD-reads", "[L2T] VCD read accesses"},
+ {"L2T", "L2T-TMU-config-reads", "[L2T] TMU CFG read accesses"},
+ {"L2T", "L2T-SLC0-reads", "[L2T] SLC0 read accesses"},
+ {"L2T", "L2T-SLC1-reads", "[L2T] SLC1 read accesses"},
+ {"L2T", "L2T-SLC2-reads", "[L2T] SLC2 read accesses"},
+ {"L2T", "L2T-TMU-write-miss", "[L2T] TMU write misses"},
+ {"L2T", "L2T-TMU-read-miss", "[L2T] TMU read misses"},
+ {"L2T", "L2T-CLE-read-miss", "[L2T] CLE read misses"},
+ {"L2T", "L2T-VCD-read-miss", "[L2T] VCD read misses"},
+ {"L2T", "L2T-TMU-config-read-miss", "[L2T] TMU CFG read misses"},
+ {"L2T", "L2T-SLC0-read-miss", "[L2T] SLC0 read misses"},
+ {"L2T", "L2T-SLC1-read-miss", "[L2T] SLC1 read misses"},
+ {"L2T", "L2T-SLC2-read-miss", "[L2T] SLC2 read misses"},
+ {"CORE", "core-memory-writes", "[CORE] Total memory writes"},
+ {"L2T", "L2T-memory-writes", "[L2T] Total memory writes"},
+ {"PTB", "PTB-memory-writes", "[PTB] Total memory writes"},
+ {"TLB", "TLB-memory-writes", "[TLB] Total memory writes"},
+ {"CORE", "core-memory-reads", "[CORE] Total memory reads"},
+ {"L2T", "L2T-memory-reads", "[L2T] Total memory reads"},
+ {"PTB", "PTB-memory-reads", "[PTB] Total memory reads"},
+ {"PSE", "PSE-memory-reads", "[PSE] Total memory reads"},
+ {"TLB", "TLB-memory-reads", "[TLB] Total memory reads"},
+ {"GMP", "GMP-memory-reads", "[GMP] Total memory reads"},
+ {"PTB", "PTB-memory-words-writes", "[PTB] Total memory words written"},
+ {"TLB", "TLB-memory-words-writes", "[TLB] Total memory words written"},
+ {"PSE", "PSE-memory-words-reads", "[PSE] Total memory words read"},
+ {"TLB", "TLB-memory-words-reads", "[TLB] Total memory words read"},
+ {"TMU", "TMU-MRU-hits", "[TMU] Total MRU hits"},
+ {"CORE", "compute-active-cycles", "[CORE] Compute active cycles"},
+};
+
+#else
+static const char *v3d_performance_counters[][3] = { };
+#endif
+
+#endif
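
A minimal sketch of consuming the counter table; the header #errors unless V3D_VERSION is defined first, and ARRAY_SIZE is assumed to come from Mesa's util/macros.h:

#define V3D_VERSION 71
#include <stdio.h>
#include "util/macros.h"
#include "broadcom/common/v3d_performance_counters.h"

static void
list_performance_counters(void)
{
        for (unsigned i = 0; i < ARRAY_SIZE(v3d_performance_counters); i++) {
                printf("%s | %s | %s\n",
                       v3d_performance_counters[i][V3D_PERFCNT_CATEGORY],
                       v3d_performance_counters[i][V3D_PERFCNT_NAME],
                       v3d_performance_counters[i][V3D_PERFCNT_DESCRIPTION]);
        }
}
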
diff --git a/src/broadcom/common/v3d_tfu.h b/src/broadcom/common/v3d_tfu.h
new file mode 100644
index 00000000000..572d0074794
--- /dev/null
+++ b/src/broadcom/common/v3d_tfu.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright © 2021 Raspberry Pi Ltd
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef V3D_TFU_H
+#define V3D_TFU_H
+
+/* Disable level 0 write, just write following mipmaps */
+#define V3D33_TFU_IOA_DIMTW (1 << 0)
+#define V3D33_TFU_IOA_FORMAT_SHIFT 3
+#define V3D33_TFU_IOA_FORMAT_LINEARTILE 3
+#define V3D33_TFU_IOA_FORMAT_UBLINEAR_1_COLUMN 4
+#define V3D33_TFU_IOA_FORMAT_UBLINEAR_2_COLUMN 5
+#define V3D33_TFU_IOA_FORMAT_UIF_NO_XOR 6
+#define V3D33_TFU_IOA_FORMAT_UIF_XOR 7
+
+#define V3D33_TFU_ICFG_NUMMM_SHIFT 5
+#define V3D33_TFU_ICFG_TTYPE_SHIFT 9
+
+#define V3D33_TFU_ICFG_OPAD_SHIFT 22
+
+#define V3D33_TFU_ICFG_FORMAT_SHIFT 18
+#define V3D33_TFU_ICFG_FORMAT_RASTER 0
+#define V3D33_TFU_ICFG_FORMAT_SAND_128 1
+#define V3D33_TFU_ICFG_FORMAT_SAND_256 2
+#define V3D33_TFU_ICFG_FORMAT_LINEARTILE 11
+#define V3D33_TFU_ICFG_FORMAT_UBLINEAR_1_COLUMN 12
+#define V3D33_TFU_ICFG_FORMAT_UBLINEAR_2_COLUMN 13
+#define V3D33_TFU_ICFG_FORMAT_UIF_NO_XOR 14
+#define V3D33_TFU_ICFG_FORMAT_UIF_XOR 15
+
+/* Disable level 0 write, just write following mipmaps */
+#define V3D71_TFU_IOC_DIMTW (1 << 0)
+#define V3D71_TFU_IOC_FORMAT_SHIFT 12
+#define V3D71_TFU_IOC_FORMAT_LINEARTILE 3
+#define V3D71_TFU_IOA_FORMAT_UBLINEAR_1_COLUMN 4
+#define V3D71_TFU_IOA_FORMAT_UBLINEAR_2_COLUMN 5
+#define V3D71_TFU_IOA_FORMAT_UIF_NO_XOR 6
+#define V3D71_TFU_IOA_FORMAT_UIF_XOR 7
+
+#define V3D71_TFU_IOC_STRIDE_SHIFT 16
+#define V3D71_TFU_IOC_NUMMM_SHIFT 4
+
+#define V3D71_TFU_ICFG_OTYPE_SHIFT 16
+#define V3D71_TFU_ICFG_IFORMAT_SHIFT 23
+#define V3D71_TFU_ICFG_FORMAT_RASTER 0
+#define V3D71_TFU_ICFG_FORMAT_SAND_128 1
+#define V3D71_TFU_ICFG_FORMAT_SAND_256 2
+#define V3D71_TFU_ICFG_FORMAT_LINEARTILE 11
+#define V3D71_TFU_ICFG_FORMAT_UBLINEAR_1_COLUMN 12
+#define V3D71_TFU_ICFG_FORMAT_UBLINEAR_2_COLUMN 13
+#define V3D71_TFU_ICFG_FORMAT_UIF_NO_XOR 14
+#define V3D71_TFU_ICFG_FORMAT_UIF_XOR 15
+
+#endif
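
A minimal sketch (made-up helper) showing how the format values pair with their shift macros; a real TFU job fills in many more fields, and the full ioc/icfg word layouts are not shown here:

#include <stdbool.h>
#include <stdint.h>
#include "broadcom/common/v3d_tfu.h"

static uint32_t
example_tfu_output_format_bits(bool is_v3d71)
{
        if (is_v3d71)
                return V3D71_TFU_IOC_FORMAT_LINEARTILE << V3D71_TFU_IOC_FORMAT_SHIFT;
        else
                return V3D33_TFU_IOA_FORMAT_LINEARTILE << V3D33_TFU_IOA_FORMAT_SHIFT;
}
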
diff --git a/src/broadcom/common/v3d_tiling.c b/src/broadcom/common/v3d_tiling.c
index 22f84811e19..6e785916578 100644
--- a/src/broadcom/common/v3d_tiling.c
+++ b/src/broadcom/common/v3d_tiling.c
@@ -28,6 +28,7 @@
*/
#include <stdint.h>
+#include "util/box.h"
#include "v3d_tiling.h"
#include "broadcom/common/v3d_cpu_tiling.h"
diff --git a/src/broadcom/common/v3d_tiling.h b/src/broadcom/common/v3d_tiling.h
index 08ae7cce805..2573c8a5f02 100644
--- a/src/broadcom/common/v3d_tiling.h
+++ b/src/broadcom/common/v3d_tiling.h
@@ -24,7 +24,7 @@
#ifndef V3D_TILING_H
#define V3D_TILING_H
-#include "util/u_box.h"
+#include "util/format/u_format.h"
/* A UIFblock is a 256-byte region of memory that's 256-byte aligned. These
* will be grouped in 4x4 blocks (left-to-right, then top-to-bottom) in a 4KB
@@ -63,6 +63,8 @@ enum v3d_tiling_mode {
V3D_TILING_UIF_XOR,
};
+struct pipe_box;
+
uint32_t v3d_utile_width(int cpp) ATTRIBUTE_CONST;
uint32_t v3d_utile_height(int cpp) ATTRIBUTE_CONST;
bool v3d_size_is_lt(uint32_t width, uint32_t height, int cpp) ATTRIBUTE_CONST;
diff --git a/src/broadcom/common/v3d_util.c b/src/broadcom/common/v3d_util.c
index 424656fd8b1..8a50d279985 100644
--- a/src/broadcom/common/v3d_util.c
+++ b/src/broadcom/common/v3d_util.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2021 Raspberry Pi
+ * Copyright © 2021 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -86,3 +86,187 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo,
return best_wgs_per_sg;
}
+
+#define V3D71_TLB_COLOR_SIZE (16 * 1024)
+#define V3D71_TLB_DETPH_SIZE (16 * 1024)
+#define V3D71_TLB_AUX_DETPH_SIZE (8 * 1024)
+
+static bool
+tile_size_valid(uint32_t pixel_count, uint32_t color_bpp, uint32_t depth_bpp)
+{
+ /* First, we check if we can fit this tile size allocating the depth
+ * TLB memory to color.
+ */
+ if (pixel_count * depth_bpp <= V3D71_TLB_AUX_DETPH_SIZE &&
+ pixel_count * color_bpp <= V3D71_TLB_COLOR_SIZE + V3D71_TLB_DETPH_SIZE) {
+ return true;
+ }
+
+ /* Otherwise the tile must fit in the main TLB buffers */
+ return pixel_count * depth_bpp <= V3D71_TLB_DETPH_SIZE &&
+ pixel_count * color_bpp <= V3D71_TLB_COLOR_SIZE;
+}
+
+void
+v3d_choose_tile_size(const struct v3d_device_info *devinfo,
+ uint32_t color_attachment_count,
+ /* V3D 4.x max internal bpp of all RTs */
+ uint32_t max_internal_bpp,
+ /* V3D 7.x accumulated bpp for all RTs (in bytes) */
+ uint32_t total_color_bpp,
+ bool msaa,
+ bool double_buffer,
+ uint32_t *width,
+ uint32_t *height)
+{
+ static const uint8_t tile_sizes[] = {
+ 64, 64,
+ 64, 32,
+ 32, 32,
+ 32, 16,
+ 16, 16,
+ 16, 8,
+ 8, 8
+ };
+
+ uint32_t idx = 0;
+ if (devinfo->ver >= 71) {
+ /* In V3D 7.x, we use the actual bpp used by color attachments to compute
+ * the tile size instead of the maximum bpp. This may allow us to choose a
+ * larger tile size than we would in 4.x in scenarios with multiple RTs
+ * with different bpps.
+ *
+ * Also, the TLB has an auxiliary buffer of 8KB that will be automatically
+ * used for depth instead of the main 16KB depth TLB buffer when the depth
+ * tile fits in the auxiliary buffer, allowing the hardware to allocate
+ * the 16KB from the main depth TLB to the color TLB. If we can do that,
+ * then we are effectively doubling the memory we have for color and we
+ * can also select a larger tile size. This is necessary to support
+ * the most expensive configuration: 8x128bpp RTs + MSAA.
+ *
+ * FIXME: the docs state that depth TLB memory can be used for color
+ * if depth testing is not used by setting the 'depth disable' bit in the
+ * rendering configuration. However, this comes with a requirement that
+ * occlusion queries must not be active. We need to clarify if this means
+ * active at the point at which we emit a tile rendering configuration
+ * item, meaning that we have a query spanning a full render pass
+ * (this is something we can tell before we emit the rendering
+ * configuration item) or active in the subpass for which we are enabling
+ * the bit (which we can't tell until later, when we record commands for
+ * the subpass). If it is the latter, then we cannot use this feature.
+ *
+ * FIXME: pending handling double_buffer.
+ */
+ const uint32_t color_bpp = total_color_bpp * (msaa ? 4 : 1);
+ const uint32_t depth_bpp = 4 * (msaa ? 4 : 1);
+ do {
+ const uint32_t tile_w = tile_sizes[idx * 2];
+ const uint32_t tile_h = tile_sizes[idx * 2 + 1];
+ if (tile_size_valid(tile_w * tile_h, color_bpp, depth_bpp))
+ break;
+ idx++;
+ } while (idx < ARRAY_SIZE(tile_sizes) / 2);
+
+ /* FIXME: pending handling double_buffer */
+ assert(!double_buffer);
+ } else {
+ /* On V3D 4.x tile size is selected based on the number of RTs, the
+ * maximum bpp across all of them and whether 4x MSAA is used.
+ */
+ if (color_attachment_count > 4)
+ idx += 3;
+ else if (color_attachment_count > 2)
+ idx += 2;
+ else if (color_attachment_count > 1)
+ idx += 1;
+
+ /* MSAA and double-buffer are mutually exclusive */
+ assert(!msaa || !double_buffer);
+ if (msaa)
+ idx += 2;
+ else if (double_buffer)
+ idx += 1;
+
+ idx += max_internal_bpp;
+ }
+
+ assert(idx < ARRAY_SIZE(tile_sizes) / 2);
+
+ *width = tile_sizes[idx * 2];
+ *height = tile_sizes[idx * 2 + 1];
+}
+
+/* Translates a pipe swizzle to the swizzle values used in the
+ * TEXTURE_SHADER_STATE packet.
+ */
+uint32_t
+v3d_translate_pipe_swizzle(enum pipe_swizzle swizzle)
+{
+ switch (swizzle) {
+ case PIPE_SWIZZLE_0:
+ return 0;
+ case PIPE_SWIZZLE_1:
+ return 1;
+ case PIPE_SWIZZLE_X:
+ case PIPE_SWIZZLE_Y:
+ case PIPE_SWIZZLE_Z:
+ case PIPE_SWIZZLE_W:
+ return 2 + swizzle;
+ default:
+ unreachable("unknown swizzle");
+ }
+}
+
+/* Translates a pipe primitive type to a hw value we can use in the various
+ * draw packets.
+ */
+uint32_t
+v3d_hw_prim_type(enum mesa_prim prim_type)
+{
+ switch (prim_type) {
+ case MESA_PRIM_POINTS:
+ case MESA_PRIM_LINES:
+ case MESA_PRIM_LINE_LOOP:
+ case MESA_PRIM_LINE_STRIP:
+ case MESA_PRIM_TRIANGLES:
+ case MESA_PRIM_TRIANGLE_STRIP:
+ case MESA_PRIM_TRIANGLE_FAN:
+ return prim_type;
+
+ case MESA_PRIM_LINES_ADJACENCY:
+ case MESA_PRIM_LINE_STRIP_ADJACENCY:
+ case MESA_PRIM_TRIANGLES_ADJACENCY:
+ case MESA_PRIM_TRIANGLE_STRIP_ADJACENCY:
+ return 8 + (prim_type - MESA_PRIM_LINES_ADJACENCY);
+
+ default:
+ unreachable("Unsupported primitive type");
+ }
+}
+
+uint32_t
+v3d_internal_bpp_words(uint32_t internal_bpp)
+{
+ switch (internal_bpp) {
+ case 0 /* V3D_INTERNAL_BPP_32 */:
+ return 1;
+ case 1 /* V3D_INTERNAL_BPP_64 */:
+ return 2;
+ case 2 /* V3D_INTERNAL_BPP_128 */:
+ return 4;
+ default:
+ unreachable("Unsupported internal BPP");
+ }
+}
+
+uint32_t
+v3d_compute_rt_row_row_stride_128_bits(uint32_t tile_width,
+ uint32_t bpp)
+{
+ /* The stride is expressed in multiples of 128 bits and covers 2 rows,
+ * which is why we divide the number of 32-bit words per row by 2
+ * instead of 4.
+ */
+
+ return (tile_width * bpp) / 2;
+}
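
(Illustration, not part of the patch: a self-contained sketch of the V3D 7.x tile-size check implemented by tile_size_valid()/v3d_choose_tile_size() above, using the same TLB budgets. The configuration is an assumed example: two RGBA8 RTs with 4x MSAA, i.e. total_color_bpp = 8, so color_bpp = 32 and depth_bpp = 16.)

    #include <stdint.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Same budgets as v3d_util.c above. */
    #define TLB_COLOR_SIZE     (16 * 1024)
    #define TLB_DEPTH_SIZE     (16 * 1024)
    #define TLB_AUX_DEPTH_SIZE (8 * 1024)

    static bool
    fits(uint32_t pixels, uint32_t color_bpp, uint32_t depth_bpp)
    {
            /* Depth in the 8KB aux buffer frees the 16KB depth TLB for color. */
            if (pixels * depth_bpp <= TLB_AUX_DEPTH_SIZE &&
                pixels * color_bpp <= TLB_COLOR_SIZE + TLB_DEPTH_SIZE)
                    return true;
            /* Otherwise both must fit in the main TLB buffers. */
            return pixels * depth_bpp <= TLB_DEPTH_SIZE &&
                   pixels * color_bpp <= TLB_COLOR_SIZE;
    }

    int main(void)
    {
            uint32_t color_bpp = 32, depth_bpp = 16;
            static const uint8_t sizes[] = { 64,64, 64,32, 32,32, 32,16, 16,16, 16,8, 8,8 };
            for (unsigned i = 0; i < sizeof(sizes) / 2; i++) {
                    uint32_t w = sizes[i * 2], h = sizes[i * 2 + 1];
                    if (fits(w * h, color_bpp, depth_bpp)) {
                            printf("tile %ux%u\n", w, h); /* prints "tile 32x16" */
                            break;
                    }
            }
            return 0;
    }
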
diff --git a/src/broadcom/common/v3d_util.h b/src/broadcom/common/v3d_util.h
index b9804f235ae..cc6b57b27b2 100644
--- a/src/broadcom/common/v3d_util.h
+++ b/src/broadcom/common/v3d_util.h
@@ -1,5 +1,5 @@
/*
- * Copyright © 2021 Raspberry Pi
+ * Copyright © 2021 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -24,7 +24,10 @@
#ifndef V3D_UTIL_H
#define V3D_UTIL_H
+#include "util/macros.h"
#include "common/v3d_device_info.h"
+#include "compiler/shader_enums.h"
+#include "util/format/u_formats.h"
uint32_t
v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo,
@@ -34,4 +37,46 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo,
uint32_t num_wgs,
uint32_t wg_size);
+void
+v3d_choose_tile_size(const struct v3d_device_info *devinfo,
+ uint32_t color_attachment_count,
+ uint32_t max_internal_bpp,
+ uint32_t total_color_bpp,
+ bool msaa,
+ bool double_buffer,
+ uint32_t *width,
+ uint32_t *height);
+
+uint32_t
+v3d_translate_pipe_swizzle(enum pipe_swizzle swizzle);
+
+uint32_t
+v3d_hw_prim_type(enum mesa_prim prim_type);
+
+uint32_t
+v3d_internal_bpp_words(uint32_t internal_bpp);
+
+/* Some configuration packets want the size as a log2, but starting at 0
+ * for size 8.
+ */
+static inline uint8_t
+log2_tile_size(uint32_t size)
+{
+ switch(size) {
+ case 8:
+ return 0;
+ case 16:
+ return 1;
+ case 32:
+ return 2;
+ case 64:
+ return 3;
+ default:
+ unreachable("Unsupported tile width/height");
+ }
+}
+
+uint32_t
+v3d_compute_rt_row_row_stride_128_bits(uint32_t tile_width,
+ uint32_t bpp);
#endif
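
(Illustration, not part of the patch: a hedged sketch of how the new helpers declared above are expected to combine when computing tile configuration values. The call site and the chosen numbers are assumptions for illustration only.)

    #include "common/v3d_util.h"

    static void
    example_tile_setup(void)
    {
            /* 64x64 tile with one 32-bit internal-bpp RT:
             * log2_tile_size(64) == 3, v3d_internal_bpp_words(0) == 1, so the
             * row-row stride is (64 * 1) / 2 == 32 units of 128 bits. */
            uint8_t w_log2 = log2_tile_size(64);
            uint8_t h_log2 = log2_tile_size(64);
            uint32_t stride =
                    v3d_compute_rt_row_row_stride_128_bits(64, v3d_internal_bpp_words(0));
            (void)w_log2; (void)h_log2; (void)stride;
    }
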
diff --git a/src/broadcom/compiler/meson.build b/src/broadcom/compiler/meson.build
index 95156140ad9..d5aafb3879e 100644
--- a/src/broadcom/compiler/meson.build
+++ b/src/broadcom/compiler/meson.build
@@ -32,23 +32,22 @@ libbroadcom_compiler_files = files(
'vir_to_qpu.c',
'qpu_schedule.c',
'qpu_validate.c',
- 'v3d33_tex.c',
- 'v3d40_tex.c',
- 'v3d33_vpm_setup.c',
+ 'v3d_tex.c',
'v3d_compiler.h',
'v3d_nir_lower_io.c',
'v3d_nir_lower_image_load_store.c',
'v3d_nir_lower_line_smooth.c',
+ 'v3d_nir_lower_load_store_bitsize.c',
'v3d_nir_lower_logic_ops.c',
- 'v3d_nir_lower_robust_buffer_access.c',
'v3d_nir_lower_scratch.c',
'v3d_nir_lower_txf_ms.c',
+ 'v3d_packing.c',
)
libbroadcom_compiler = static_library(
- ['broadcom_compiler', v3d_xml_pack],
- libbroadcom_compiler_files,
- include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom],
+ 'broadcom_compiler',
+ [libbroadcom_compiler_files, v3d_xml_pack],
+ include_directories : [inc_include, inc_src, inc_gallium, inc_gallium_aux, inc_broadcom],
c_args : [no_override_init_args],
gnu_symbol_visibility : 'hidden',
dependencies : [dep_libdrm, dep_valgrind, idep_nir_headers],
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index d0a89f1a7d4..acc62a092f2 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -38,7 +38,7 @@
#define __gen_address_type uint32_t
#define __gen_address_offset(reloc) (*reloc)
#define __gen_emit_reloc(cl, reloc)
-#include "cle/v3d_packet_v41_pack.h"
+#include "cle/v3d_packet_v42_pack.h"
#define GENERAL_TMU_LOOKUP_PER_QUAD (0 << 7)
#define GENERAL_TMU_LOOKUP_PER_PIXEL (1 << 7)
@@ -164,7 +164,7 @@ vir_emit_thrsw(struct v3d_compile *c)
c->last_thrsw->qpu.sig.thrsw = true;
c->last_thrsw_at_top_level = !c->in_control_flow;
- /* We need to lock the scoreboard before any tlb acess happens. If this
+ /* We need to lock the scoreboard before any tlb access happens. If this
* thread switch comes after we have emitted a tlb load, then it means
* that we can't lock on the last thread switch any more.
*/
@@ -187,6 +187,28 @@ v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src)
}
static uint32_t
+v3d_general_tmu_op_for_atomic(nir_intrinsic_instr *instr)
+{
+ nir_atomic_op atomic_op = nir_intrinsic_atomic_op(instr);
+ switch (atomic_op) {
+ case nir_atomic_op_iadd:
+ return instr->intrinsic == nir_intrinsic_ssbo_atomic ?
+ v3d_get_op_for_atomic_add(instr, 2) :
+ v3d_get_op_for_atomic_add(instr, 1);
+ case nir_atomic_op_imin: return V3D_TMU_OP_WRITE_SMIN;
+ case nir_atomic_op_umin: return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR;
+ case nir_atomic_op_imax: return V3D_TMU_OP_WRITE_SMAX;
+ case nir_atomic_op_umax: return V3D_TMU_OP_WRITE_UMAX;
+ case nir_atomic_op_iand: return V3D_TMU_OP_WRITE_AND_READ_INC;
+ case nir_atomic_op_ior: return V3D_TMU_OP_WRITE_OR_READ_DEC;
+ case nir_atomic_op_ixor: return V3D_TMU_OP_WRITE_XOR_READ_NOT;
+ case nir_atomic_op_xchg: return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH;
+ case nir_atomic_op_cmpxchg: return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH;
+ default: unreachable("unknown atomic op");
+ }
+}
+
+static uint32_t
v3d_general_tmu_op(nir_intrinsic_instr *instr)
{
switch (instr->intrinsic) {
@@ -195,41 +217,21 @@ v3d_general_tmu_op(nir_intrinsic_instr *instr)
case nir_intrinsic_load_uniform:
case nir_intrinsic_load_shared:
case nir_intrinsic_load_scratch:
+ case nir_intrinsic_load_global_2x32:
case nir_intrinsic_store_ssbo:
case nir_intrinsic_store_shared:
case nir_intrinsic_store_scratch:
+ case nir_intrinsic_store_global_2x32:
return V3D_TMU_OP_REGULAR;
- case nir_intrinsic_ssbo_atomic_add:
- return v3d_get_op_for_atomic_add(instr, 2);
- case nir_intrinsic_shared_atomic_add:
- return v3d_get_op_for_atomic_add(instr, 1);
- case nir_intrinsic_ssbo_atomic_imin:
- case nir_intrinsic_shared_atomic_imin:
- return V3D_TMU_OP_WRITE_SMIN;
- case nir_intrinsic_ssbo_atomic_umin:
- case nir_intrinsic_shared_atomic_umin:
- return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR;
- case nir_intrinsic_ssbo_atomic_imax:
- case nir_intrinsic_shared_atomic_imax:
- return V3D_TMU_OP_WRITE_SMAX;
- case nir_intrinsic_ssbo_atomic_umax:
- case nir_intrinsic_shared_atomic_umax:
- return V3D_TMU_OP_WRITE_UMAX;
- case nir_intrinsic_ssbo_atomic_and:
- case nir_intrinsic_shared_atomic_and:
- return V3D_TMU_OP_WRITE_AND_READ_INC;
- case nir_intrinsic_ssbo_atomic_or:
- case nir_intrinsic_shared_atomic_or:
- return V3D_TMU_OP_WRITE_OR_READ_DEC;
- case nir_intrinsic_ssbo_atomic_xor:
- case nir_intrinsic_shared_atomic_xor:
- return V3D_TMU_OP_WRITE_XOR_READ_NOT;
- case nir_intrinsic_ssbo_atomic_exchange:
- case nir_intrinsic_shared_atomic_exchange:
- return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH;
- case nir_intrinsic_ssbo_atomic_comp_swap:
- case nir_intrinsic_shared_atomic_comp_swap:
- return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH;
+
+ case nir_intrinsic_ssbo_atomic:
+ case nir_intrinsic_ssbo_atomic_swap:
+ case nir_intrinsic_shared_atomic:
+ case nir_intrinsic_shared_atomic_swap:
+ case nir_intrinsic_global_atomic_2x32:
+ case nir_intrinsic_global_atomic_swap_2x32:
+ return v3d_general_tmu_op_for_atomic(instr);
+
default:
unreachable("unknown intrinsic op");
}
@@ -270,13 +272,13 @@ ntq_flush_tmu(struct v3d_compile *c)
bool emitted_tmuwt = false;
for (int i = 0; i < c->tmu.flush_count; i++) {
if (c->tmu.flush[i].component_mask > 0) {
- nir_dest *dest = c->tmu.flush[i].dest;
- assert(dest);
+ nir_def *def = c->tmu.flush[i].def;
+ assert(def);
for (int j = 0; j < 4; j++) {
if (c->tmu.flush[i].component_mask & (1 << j)) {
- ntq_store_dest(c, dest, j,
- vir_MOV(c, vir_LDTMU(c)));
+ ntq_store_def(c, def, j,
+ vir_MOV(c, vir_LDTMU(c)));
}
}
} else if (!emitted_tmuwt) {
@@ -292,12 +294,12 @@ ntq_flush_tmu(struct v3d_compile *c)
/**
* Queues a pending thread switch + LDTMU/TMUWT for a TMU operation. The caller
- * is reponsible for ensuring that doing this doesn't overflow the TMU fifos,
+ * is responsible for ensuring that doing this doesn't overflow the TMU fifos,
* and more specifically, the output fifo, since that can't stall.
*/
void
ntq_add_pending_tmu_flush(struct v3d_compile *c,
- nir_dest *dest,
+ nir_def *def,
uint32_t component_mask)
{
const uint32_t num_components = util_bitcount(component_mask);
@@ -305,13 +307,18 @@ ntq_add_pending_tmu_flush(struct v3d_compile *c,
if (num_components > 0) {
c->tmu.output_fifo_size += num_components;
- if (!dest->is_ssa)
- _mesa_set_add(c->tmu.outstanding_regs, dest->reg.reg);
+
+ nir_intrinsic_instr *store = nir_store_reg_for_def(def);
+ if (store != NULL) {
+ nir_def *reg = store->src[1].ssa;
+ _mesa_set_add(c->tmu.outstanding_regs, reg);
+ }
}
- c->tmu.flush[c->tmu.flush_count].dest = dest;
+ c->tmu.flush[c->tmu.flush_count].def = def;
c->tmu.flush[c->tmu.flush_count].component_mask = component_mask;
c->tmu.flush_count++;
+ c->tmu.total_count++;
if (c->disable_tmu_pipelining)
ntq_flush_tmu(c);
@@ -342,6 +349,7 @@ emit_tmu_general_store_writes(struct v3d_compile *c,
uint32_t base_const_offset,
uint32_t *writemask,
uint32_t *const_offset,
+ uint32_t *type_size,
uint32_t *tmu_writes)
{
struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);
@@ -371,7 +379,9 @@ emit_tmu_general_store_writes(struct v3d_compile *c,
/* Update the offset for the TMU write based on the
* the first component we are writing.
*/
- *const_offset = base_const_offset + first_component * 4;
+ *type_size = nir_src_bit_size(instr->src[0]) / 8;
+ *const_offset =
+ base_const_offset + first_component * (*type_size);
/* Clear these components from the writemask */
uint32_t written_mask =
@@ -433,6 +443,7 @@ emit_tmu_general_address_write(struct v3d_compile *c,
int offset_src,
struct qreg base_offset,
uint32_t const_offset,
+ uint32_t dest_components,
uint32_t *tmu_writes)
{
if (mode == MODE_COUNT) {
@@ -478,6 +489,8 @@ emit_tmu_general_address_write(struct v3d_compile *c,
if (vir_in_nonuniform_control_flow(c))
vir_set_cond(tmu, V3D_QPU_COND_IFA);
+
+ tmu->ldtmu_count = dest_components;
}
/**
@@ -486,7 +499,7 @@ emit_tmu_general_address_write(struct v3d_compile *c,
*/
static void
ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
- bool is_shared_or_scratch)
+ bool is_shared_or_scratch, bool is_global)
{
uint32_t tmu_op = v3d_general_tmu_op(instr);
@@ -495,25 +508,32 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
* amount to add/sub, as that is implicit.
*/
bool atomic_add_replaced =
- ((instr->intrinsic == nir_intrinsic_ssbo_atomic_add ||
- instr->intrinsic == nir_intrinsic_shared_atomic_add) &&
+ (instr->intrinsic == nir_intrinsic_ssbo_atomic ||
+ instr->intrinsic == nir_intrinsic_shared_atomic ||
+ instr->intrinsic == nir_intrinsic_global_atomic_2x32) &&
+ nir_intrinsic_atomic_op(instr) == nir_atomic_op_iadd &&
(tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC ||
- tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC));
+ tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC);
bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
instr->intrinsic == nir_intrinsic_store_scratch ||
- instr->intrinsic == nir_intrinsic_store_shared);
+ instr->intrinsic == nir_intrinsic_store_shared ||
+ instr->intrinsic == nir_intrinsic_store_global_2x32);
bool is_load = (instr->intrinsic == nir_intrinsic_load_uniform ||
instr->intrinsic == nir_intrinsic_load_ubo ||
instr->intrinsic == nir_intrinsic_load_ssbo ||
instr->intrinsic == nir_intrinsic_load_scratch ||
- instr->intrinsic == nir_intrinsic_load_shared);
+ instr->intrinsic == nir_intrinsic_load_shared ||
+ instr->intrinsic == nir_intrinsic_load_global_2x32);
if (!is_load)
c->tmu_dirty_rcl = true;
- bool has_index = !is_shared_or_scratch;
+ if (is_global)
+ c->has_global_address = true;
+
+ bool has_index = !is_shared_or_scratch && !is_global;
int offset_src;
if (instr->intrinsic == nir_intrinsic_load_uniform) {
@@ -522,6 +542,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
instr->intrinsic == nir_intrinsic_load_ubo ||
instr->intrinsic == nir_intrinsic_load_scratch ||
instr->intrinsic == nir_intrinsic_load_shared ||
+ instr->intrinsic == nir_intrinsic_load_global_2x32 ||
atomic_add_replaced) {
offset_src = 0 + has_index;
} else if (is_store) {
@@ -542,13 +563,11 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
v3d_unit_data_create(0, const_offset));
const_offset = 0;
} else if (instr->intrinsic == nir_intrinsic_load_ubo) {
- uint32_t index = nir_src_as_uint(instr->src[0]);
- /* On OpenGL QUNIFORM_UBO_ADDR takes a UBO index
- * shifted up by 1 (0 is gallium's constant buffer 0).
+ /* QUNIFORM_UBO_ADDR takes a UBO index shifted up by 1 (0
+ * is gallium's constant buffer 0 in GL and push constants
+ * in Vulkan).
*/
- if (c->key->environment == V3D_ENVIRONMENT_OPENGL)
- index++;
-
+ uint32_t index = nir_src_as_uint(instr->src[0]) + 1;
base_offset =
vir_uniform(c, QUNIFORM_UBO_ADDR,
v3d_unit_data_create(index, const_offset));
@@ -565,10 +584,16 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
base_offset = c->cs_shared_offset;
const_offset += nir_intrinsic_base(instr);
}
+ } else if (is_global) {
+ /* Global load/store intrinsics use global addresses, so the
+ * offset is the target address and we don't need to add it
+ * to a base offset.
+ */
+ base_offset = vir_uniform_ui(c, 0);
} else {
+ uint32_t idx = is_store ? 1 : 0;
base_offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
- nir_src_as_uint(instr->src[is_store ?
- 1 : 0]));
+ nir_src_comp_as_uint(instr->src[idx], 0));
}
/* We are ready to emit TMU register writes now, but before we actually
@@ -588,16 +613,21 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
for (enum emit_mode mode = MODE_COUNT; mode != MODE_LAST; mode++) {
assert(mode == MODE_COUNT || tmu_writes > 0);
+ uint32_t type_size = 4;
+
if (is_store) {
emit_tmu_general_store_writes(c, mode, instr,
base_const_offset,
&writemask,
&const_offset,
+ &type_size,
&tmu_writes);
} else if (!is_load && !atomic_add_replaced) {
- emit_tmu_general_atomic_writes(c, mode, instr,
- tmu_op, has_index,
- &tmu_writes);
+ emit_tmu_general_atomic_writes(c, mode, instr,
+ tmu_op, has_index,
+ &tmu_writes);
+ } else if (is_load) {
+ type_size = instr->def.bit_size / 8;
}
/* For atomics we use 32bit except for CMPXCHG, that we need
@@ -618,17 +648,40 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
v3d_tmu_get_type_from_op(tmu_op, !is_load) ==
V3D_TMU_OP_TYPE_ATOMIC;
+ /* Only load per-quad if we can be certain that all
+ * lines in the quad are active. Notice that demoted
+ * invocations, unlike terminated ones, are still
+ * active: we want to skip memory writes for them but
+ * loads should still work.
+ */
uint32_t perquad =
- is_load && !vir_in_nonuniform_control_flow(c)
- ? GENERAL_TMU_LOOKUP_PER_QUAD
- : GENERAL_TMU_LOOKUP_PER_PIXEL;
+ is_load && !vir_in_nonuniform_control_flow(c) &&
+ ((c->s->info.stage == MESA_SHADER_FRAGMENT &&
+ c->s->info.fs.needs_quad_helper_invocations &&
+ !c->emitted_discard) ||
+ c->s->info.uses_wide_subgroup_intrinsics) ?
+ GENERAL_TMU_LOOKUP_PER_QUAD :
+ GENERAL_TMU_LOOKUP_PER_PIXEL;
config = 0xffffff00 | tmu_op << 3 | perquad;
if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
config |= GENERAL_TMU_LOOKUP_TYPE_VEC2;
} else if (is_atomic || num_components == 1) {
- config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
+ switch (type_size) {
+ case 4:
+ config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
+ break;
+ case 2:
+ config |= GENERAL_TMU_LOOKUP_TYPE_16BIT_UI;
+ break;
+ case 1:
+ config |= GENERAL_TMU_LOOKUP_TYPE_8BIT_UI;
+ break;
+ default:
+ unreachable("Unsupported bitsize");
+ }
} else {
+ assert(type_size == 4);
config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 +
num_components - 2;
}
@@ -637,7 +690,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
emit_tmu_general_address_write(c, mode, instr, config,
dynamic_src, offset_src,
base_offset, const_offset,
- &tmu_writes);
+ dest_components, &tmu_writes);
assert(tmu_writes > 0);
if (mode == MODE_COUNT) {
@@ -660,7 +713,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
*/
const uint32_t component_mask =
(1 << dest_components) - 1;
- ntq_add_pending_tmu_flush(c, &instr->dest,
+ ntq_add_pending_tmu_flush(c, &instr->def,
component_mask);
}
}
@@ -673,7 +726,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
}
static struct qreg *
-ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def)
+ntq_init_ssa_def(struct v3d_compile *c, nir_def *def)
{
struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
def->num_components);
@@ -717,8 +770,8 @@ is_ldunif_signal(const struct v3d_qpu_sig *sig)
* its destination to be the NIR reg's destination
*/
void
-ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
- struct qreg result)
+ntq_store_def(struct v3d_compile *c, nir_def *def, int chan,
+ struct qreg result)
{
struct qinst *last_inst = NULL;
if (!list_is_empty(&c->cur_block->instructions))
@@ -731,23 +784,25 @@ ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
assert(result.file == QFILE_TEMP && last_inst &&
(last_inst == c->defs[result.index] || is_reused_uniform));
- if (dest->is_ssa) {
- assert(chan < dest->ssa.num_components);
+ nir_intrinsic_instr *store = nir_store_reg_for_def(def);
+ if (store == NULL) {
+ assert(chan < def->num_components);
struct qreg *qregs;
struct hash_entry *entry =
- _mesa_hash_table_search(c->def_ht, &dest->ssa);
+ _mesa_hash_table_search(c->def_ht, def);
if (entry)
qregs = entry->data;
else
- qregs = ntq_init_ssa_def(c, &dest->ssa);
+ qregs = ntq_init_ssa_def(c, def);
qregs[chan] = result;
} else {
- nir_register *reg = dest->reg.reg;
- assert(dest->reg.base_offset == 0);
- assert(reg->num_array_elems == 0);
+ nir_def *reg = store->src[1].ssa;
+ ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg);
+ assert(nir_intrinsic_base(store) == 0);
+ assert(nir_intrinsic_num_array_elems(decl) == 0);
struct hash_entry *entry =
_mesa_hash_table_search(c->def_ht, reg);
struct qreg *qregs = entry->data;
@@ -802,7 +857,9 @@ struct qreg
ntq_get_src(struct v3d_compile *c, nir_src src, int i)
{
struct hash_entry *entry;
- if (src.is_ssa) {
+
+ nir_intrinsic_instr *load = nir_load_reg_for_def(src.ssa);
+ if (load == NULL) {
assert(i < src.ssa->num_components);
entry = _mesa_hash_table_search(c->def_ht, src.ssa);
@@ -811,10 +868,11 @@ ntq_get_src(struct v3d_compile *c, nir_src src, int i)
entry = _mesa_hash_table_search(c->def_ht, src.ssa);
}
} else {
- nir_register *reg = src.reg.reg;
- assert(reg->num_array_elems == 0);
- assert(src.reg.base_offset == 0);
- assert(i < reg->num_components);
+ nir_def *reg = load->src[0].ssa;
+ ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg);
+ assert(nir_intrinsic_base(load) == 0);
+ assert(nir_intrinsic_num_array_elems(decl) == 0);
+ assert(i < nir_intrinsic_num_components(decl));
if (_mesa_set_search(c->tmu.outstanding_regs, reg))
ntq_flush_tmu(c);
@@ -830,13 +888,8 @@ static struct qreg
ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr,
unsigned src)
{
- assert(util_is_power_of_two_or_zero(instr->dest.write_mask));
- unsigned chan = ffs(instr->dest.write_mask) - 1;
struct qreg r = ntq_get_src(c, instr->src[src].src,
- instr->src[src].swizzle[chan]);
-
- assert(!instr->src[src].abs);
- assert(!instr->src[src].negate);
+ instr->src[src].swizzle[0]);
return r;
};
@@ -876,6 +929,7 @@ ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr)
case GLSL_SAMPLER_DIM_3D:
case GLSL_SAMPLER_DIM_CUBE:
case GLSL_SAMPLER_DIM_BUF:
+ case GLSL_SAMPLER_DIM_EXTERNAL:
/* Don't minify the array size. */
if (!(instr->is_array && i == dest_size - 1)) {
size = ntq_minify(c, size, lod);
@@ -890,7 +944,7 @@ ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr)
unreachable("Bad sampler type");
}
- ntq_store_dest(c, &instr->dest, i, size);
+ ntq_store_def(c, &instr->def, i, size);
}
}
@@ -905,12 +959,12 @@ ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
*/
switch (instr->op) {
case nir_texop_query_levels:
- ntq_store_dest(c, &instr->dest, 0,
- vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit));
+ ntq_store_def(c, &instr->def, 0,
+ vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit));
return;
case nir_texop_texture_samples:
- ntq_store_dest(c, &instr->dest, 0,
- vir_uniform(c, QUNIFORM_TEXTURE_SAMPLES, unit));
+ ntq_store_def(c, &instr->def, 0,
+ vir_uniform(c, QUNIFORM_TEXTURE_SAMPLES, unit));
return;
case nir_texop_txs:
ntq_emit_txs(c, instr);
@@ -919,10 +973,7 @@ ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
break;
}
- if (c->devinfo->ver >= 40)
- v3d40_vir_emit_tex(c, instr);
- else
- v3d33_vir_emit_tex(c, instr);
+ v3d_vir_emit_tex(c, instr);
}
static struct qreg
@@ -963,44 +1014,43 @@ emit_fragcoord_input(struct v3d_compile *c, int attr)
static struct qreg
emit_smooth_varying(struct v3d_compile *c,
- struct qreg vary, struct qreg w, struct qreg r5)
+ struct qreg vary, struct qreg w, struct qreg c_reg)
{
- return vir_FADD(c, vir_FMUL(c, vary, w), r5);
+ return vir_FADD(c, vir_FMUL(c, vary, w), c_reg);
}
static struct qreg
emit_noperspective_varying(struct v3d_compile *c,
- struct qreg vary, struct qreg r5)
+ struct qreg vary, struct qreg c_reg)
{
- return vir_FADD(c, vir_MOV(c, vary), r5);
+ return vir_FADD(c, vir_MOV(c, vary), c_reg);
}
static struct qreg
emit_flat_varying(struct v3d_compile *c,
- struct qreg vary, struct qreg r5)
+ struct qreg vary, struct qreg c_reg)
{
vir_MOV_dest(c, c->undef, vary);
- return vir_MOV(c, r5);
+ return vir_MOV(c, c_reg);
}
static struct qreg
emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
int8_t input_idx, uint8_t swizzle, int array_index)
{
- struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3);
- struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
+ struct qreg c_reg; /* C coefficient */
+
+ if (c->devinfo->has_accumulators)
+ c_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
+ else
+ c_reg = vir_reg(QFILE_REG, 0);
struct qinst *ldvary = NULL;
struct qreg vary;
- if (c->devinfo->ver >= 41) {
- ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef,
- c->undef, c->undef);
- ldvary->qpu.sig.ldvary = true;
- vary = vir_emit_def(c, ldvary);
- } else {
- vir_NOP(c)->qpu.sig.ldvary = true;
- vary = r3;
- }
+ ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef,
+ c->undef, c->undef);
+ ldvary->qpu.sig.ldvary = true;
+ vary = vir_emit_def(c, ldvary);
/* Store the input value before interpolation so we can implement
* GLSL's interpolateAt functions if the shader uses them.
@@ -1008,7 +1058,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
if (input_idx >= 0) {
assert(var);
c->interp[input_idx].vp = vary;
- c->interp[input_idx].C = vir_MOV(c, r5);
+ c->interp[input_idx].C = vir_MOV(c, c_reg);
c->interp[input_idx].mode = var->data.interpolation;
}
@@ -1018,7 +1068,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
*/
if (!var) {
assert(input_idx < 0);
- return emit_smooth_varying(c, vary, c->payload_w, r5);
+ return emit_smooth_varying(c, vary, c->payload_w, c_reg);
}
int i = c->num_inputs++;
@@ -1033,20 +1083,20 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
if (var->data.centroid) {
BITSET_SET(c->centroid_flags, i);
result = emit_smooth_varying(c, vary,
- c->payload_w_centroid, r5);
+ c->payload_w_centroid, c_reg);
} else {
- result = emit_smooth_varying(c, vary, c->payload_w, r5);
+ result = emit_smooth_varying(c, vary, c->payload_w, c_reg);
}
break;
case INTERP_MODE_NOPERSPECTIVE:
BITSET_SET(c->noperspective_flags, i);
- result = emit_noperspective_varying(c, vary, r5);
+ result = emit_noperspective_varying(c, vary, c_reg);
break;
case INTERP_MODE_FLAT:
BITSET_SET(c->flat_shade_flags, i);
- result = emit_flat_varying(c, vary, r5);
+ result = emit_flat_varying(c, vary, c_reg);
break;
default:
@@ -1163,16 +1213,6 @@ ntq_emit_comparison(struct v3d_compile *c,
vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
break;
- case nir_op_i2b32:
- vir_set_pf(c, vir_MOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ);
- cond_invert = true;
- break;
-
- case nir_op_f2b32:
- vir_set_pf(c, vir_FMOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ);
- cond_invert = true;
- break;
-
default:
return false;
}
@@ -1188,7 +1228,7 @@ ntq_emit_comparison(struct v3d_compile *c,
static struct nir_alu_instr *
ntq_get_alu_parent(nir_src src)
{
- if (!src.is_ssa || src.ssa->parent_instr->type != nir_instr_type_alu)
+ if (src.ssa->parent_instr->type != nir_instr_type_alu)
return NULL;
nir_alu_instr *instr = nir_instr_as_alu(src.ssa->parent_instr);
if (!instr)
@@ -1199,7 +1239,7 @@ ntq_get_alu_parent(nir_src src)
* src.
*/
for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
- if (!instr->src[i].src.is_ssa)
+ if (nir_load_reg_for_def(instr->src[i].src.ssa))
return NULL;
}
@@ -1242,12 +1282,78 @@ ntq_emit_cond_to_bool(struct v3d_compile *c, enum v3d_qpu_cond cond)
return result;
}
+static struct qreg
+ntq_emit_cond_to_int(struct v3d_compile *c, enum v3d_qpu_cond cond)
+{
+ struct qreg result =
+ vir_MOV(c, vir_SEL(c, cond,
+ vir_uniform_ui(c, 1),
+ vir_uniform_ui(c, 0)));
+ c->flags_temp = result.index;
+ c->flags_cond = cond;
+ return result;
+}
+
+static struct qreg
+f2f16_rtz(struct v3d_compile *c, struct qreg f32)
+{
+ /* The GPU doesn't provide a mechanism to modify the f32->f16 rounding
+ * method and seems to be using RTE by default, so we need to implement
+ * RTZ rounding in software.
+ */
+ struct qreg rf16 = vir_FMOV(c, f32);
+ vir_set_pack(c->defs[rf16.index], V3D_QPU_PACK_L);
+
+ struct qreg rf32 = vir_FMOV(c, rf16);
+ vir_set_unpack(c->defs[rf32.index], 0, V3D_QPU_UNPACK_L);
+
+ struct qreg f32_abs = vir_FMOV(c, f32);
+ vir_set_unpack(c->defs[f32_abs.index], 0, V3D_QPU_UNPACK_ABS);
+
+ struct qreg rf32_abs = vir_FMOV(c, rf32);
+ vir_set_unpack(c->defs[rf32_abs.index], 0, V3D_QPU_UNPACK_ABS);
+
+ vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), f32_abs, rf32_abs),
+ V3D_QPU_PF_PUSHN);
+ return vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA,
+ vir_SUB(c, rf16, vir_uniform_ui(c, 1)), rf16));
+}
+
+/**
+ * Takes the result value of a signed integer width conversion from a smaller
+ * type to a larger type and, if needed, applies sign extension to it.
+ */
+static struct qreg
+sign_extend(struct v3d_compile *c,
+ struct qreg value,
+ uint32_t src_bit_size,
+ uint32_t dst_bit_size)
+{
+ assert(src_bit_size < dst_bit_size);
+
+ struct qreg tmp = vir_MOV(c, value);
+
+ /* Do we need to sign-extend? */
+ uint32_t sign_mask = 1 << (src_bit_size - 1);
+ struct qinst *sign_check =
+ vir_AND_dest(c, vir_nop_reg(),
+ tmp, vir_uniform_ui(c, sign_mask));
+ vir_set_pf(c, sign_check, V3D_QPU_PF_PUSHZ);
+
+ /* If so, fill in leading sign bits */
+ uint32_t extend_bits = ~(((1 << src_bit_size) - 1)) &
+ ((1ull << dst_bit_size) - 1);
+ struct qinst *extend_inst =
+ vir_OR_dest(c, tmp, tmp,
+ vir_uniform_ui(c, extend_bits));
+ vir_set_cond(extend_inst, V3D_QPU_COND_IFNA);
+
+ return tmp;
+}
+
static void
ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
{
- /* This should always be lowered to ALU operations for V3D. */
- assert(!instr->dest.saturate);
-
/* Vectors are special in that they have non-scalarized writemasks,
* and just take the first swizzle channel for each argument in order
* into each writemask channel.
@@ -1260,8 +1366,8 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
srcs[i] = ntq_get_src(c, instr->src[i].src,
instr->src[i].swizzle[0]);
for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
- ntq_store_dest(c, &instr->dest.dest, i,
- vir_MOV(c, srcs[i]));
+ ntq_store_def(c, &instr->def, i,
+ vir_MOV(c, srcs[i]));
return;
}
@@ -1327,6 +1433,94 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
result = vir_AND(c, src[0], vir_uniform_ui(c, 1));
break;
+ case nir_op_f2f16:
+ case nir_op_f2f16_rtne:
+ assert(nir_src_bit_size(instr->src[0].src) == 32);
+ result = vir_FMOV(c, src[0]);
+ vir_set_pack(c->defs[result.index], V3D_QPU_PACK_L);
+ break;
+
+ case nir_op_f2f16_rtz:
+ assert(nir_src_bit_size(instr->src[0].src) == 32);
+ result = f2f16_rtz(c, src[0]);
+ break;
+
+ case nir_op_f2f32:
+ assert(nir_src_bit_size(instr->src[0].src) == 16);
+ result = vir_FMOV(c, src[0]);
+ vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
+ break;
+
+ case nir_op_i2i16: {
+ uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
+ assert(bit_size == 32 || bit_size == 8);
+ if (bit_size == 32) {
+ /* We don't have integer pack/unpack methods for
+ * converting between 16-bit and 32-bit, so we implement
+ * the conversion manually by truncating the src.
+ */
+ result = vir_AND(c, src[0], vir_uniform_ui(c, 0xffff));
+ } else {
+ struct qreg tmp = vir_AND(c, src[0],
+ vir_uniform_ui(c, 0xff));
+ result = vir_MOV(c, sign_extend(c, tmp, bit_size, 16));
+ }
+ break;
+ }
+
+ case nir_op_u2u16: {
+ uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
+ assert(bit_size == 32 || bit_size == 8);
+
+ /* We don't have integer pack/unpack methods for converting
+ * between 16-bit and 32-bit, so we implement the conversion
+ * manually by truncating the src. For the 8-bit case, we
+ * want to make sure we don't copy garbage from any of the
+ * 24 MSB bits.
+ */
+ if (bit_size == 32)
+ result = vir_AND(c, src[0], vir_uniform_ui(c, 0xffff));
+ else
+ result = vir_AND(c, src[0], vir_uniform_ui(c, 0xff));
+ break;
+ }
+
+ case nir_op_i2i8:
+ case nir_op_u2u8:
+ assert(nir_src_bit_size(instr->src[0].src) == 32 ||
+ nir_src_bit_size(instr->src[0].src) == 16);
+ /* We don't have integer pack/unpack methods for converting
+ * between 8-bit and 32-bit, so we implement the conversion
+ * manually by truncating the src.
+ */
+ result = vir_AND(c, src[0], vir_uniform_ui(c, 0xff));
+ break;
+
+ case nir_op_u2u32: {
+ uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
+ assert(bit_size == 16 || bit_size == 8);
+
+ /* We don't have a native 8-bit/16-bit MOV, so we copy all 32 bits
+ * from the src but we make sure to clear any garbage bits that
+ * may be present in the invalid src bits.
+ */
+ uint32_t mask = (1 << bit_size) - 1;
+ result = vir_AND(c, src[0], vir_uniform_ui(c, mask));
+ break;
+ }
+
+ case nir_op_i2i32: {
+ uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
+ assert(bit_size == 16 || bit_size == 8);
+
+ uint32_t mask = (1 << bit_size) - 1;
+ struct qreg tmp = vir_AND(c, src[0],
+ vir_uniform_ui(c, mask));
+
+ result = vir_MOV(c, sign_extend(c, tmp, bit_size, 32));
+ break;
+ }
+
case nir_op_iadd:
result = vir_ADD(c, src[0], src[1]);
break;
@@ -1390,8 +1584,6 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
break;
}
- case nir_op_i2b32:
- case nir_op_f2b32:
case nir_op_feq32:
case nir_op_fneu32:
case nir_op_fge32:
@@ -1485,13 +1677,35 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
case nir_op_uadd_carry:
vir_set_pf(c, vir_ADD_dest(c, vir_nop_reg(), src[0], src[1]),
V3D_QPU_PF_PUSHC);
- result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
+ result = ntq_emit_cond_to_int(c, V3D_QPU_COND_IFA);
+ break;
+
+ case nir_op_usub_borrow:
+ vir_set_pf(c, vir_SUB_dest(c, vir_nop_reg(), src[0], src[1]),
+ V3D_QPU_PF_PUSHC);
+ result = ntq_emit_cond_to_int(c, V3D_QPU_COND_IFA);
break;
case nir_op_pack_half_2x16_split:
result = vir_VFPACK(c, src[0], src[1]);
break;
+ case nir_op_pack_2x32_to_2x16_v3d:
+ result = vir_VPACK(c, src[0], src[1]);
+ break;
+
+ case nir_op_pack_32_to_r11g11b10_v3d:
+ result = vir_V11FPACK(c, src[0], src[1]);
+ break;
+
+ case nir_op_pack_uint_32_to_r10g10b10a2_v3d:
+ result = vir_V10PACK(c, src[0], src[1]);
+ break;
+
+ case nir_op_pack_4x16_to_4x8_v3d:
+ result = vir_V8PACK(c, src[0], src[1]);
+ break;
+
case nir_op_unpack_half_2x16_split_x:
result = vir_FMOV(c, src[0]);
vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
@@ -1502,26 +1716,29 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_H);
break;
- case nir_op_fquantize2f16: {
- /* F32 -> F16 -> F32 conversion */
- struct qreg tmp = vir_FMOV(c, src[0]);
- vir_set_pack(c->defs[tmp.index], V3D_QPU_PACK_L);
- tmp = vir_FMOV(c, tmp);
- vir_set_unpack(c->defs[tmp.index], 0, V3D_QPU_UNPACK_L);
+ case nir_op_pack_2x16_to_unorm_2x8_v3d:
+ result = vir_VFTOUNORM8(c, src[0]);
+ break;
- /* Check for denorm */
- struct qreg abs_src = vir_FMOV(c, src[0]);
- vir_set_unpack(c->defs[abs_src.index], 0, V3D_QPU_UNPACK_ABS);
- struct qreg threshold = vir_uniform_f(c, ldexpf(1.0, -14));
- vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), abs_src, threshold),
- V3D_QPU_PF_PUSHC);
+ case nir_op_pack_2x16_to_snorm_2x8_v3d:
+ result = vir_VFTOSNORM8(c, src[0]);
+ break;
- /* Return +/-0 for denorms */
- struct qreg zero =
- vir_AND(c, src[0], vir_uniform_ui(c, 0x80000000));
- result = vir_FMOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, tmp, zero));
+ case nir_op_pack_2x16_to_unorm_2x10_v3d:
+ result = vir_VFTOUNORM10LO(c, src[0]);
+ break;
+
+ case nir_op_pack_2x16_to_unorm_10_2_v3d:
+ result = vir_VFTOUNORM10HI(c, src[0]);
+ break;
+
+ case nir_op_f2unorm_16_v3d:
+ result = vir_FTOUNORM16(c, src[0]);
+ break;
+
+ case nir_op_f2snorm_16_v3d:
+ result = vir_FTOSNORM16(c, src[0]);
break;
- }
default:
fprintf(stderr, "unknown NIR ALU inst: ");
@@ -1530,17 +1747,12 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
abort();
}
- /* We have a scalar result, so the instruction should only have a
- * single channel written to.
- */
- assert(util_is_power_of_two_or_zero(instr->dest.write_mask));
- ntq_store_dest(c, &instr->dest.dest,
- ffs(instr->dest.write_mask) - 1, result);
+ ntq_store_def(c, &instr->def, 0, result);
}
/* Each TLB read/write setup (a render target or depth buffer) takes an 8-bit
* specifier. They come from a register that's preloaded with 0xffffffff
- * (0xff gets you normal vec4 f16 RT0 writes), and when one is neaded the low
+ * (0xff gets you normal vec4 f16 RT0 writes), and when one is needed the low
* 8 bits are shifted off the bottom and 0xff shifted in from the top.
*/
#define TLB_TYPE_F16_COLOR (3 << 6)
@@ -1670,15 +1882,6 @@ vir_emit_tlb_color_write(struct v3d_compile *c, unsigned rt)
static void
emit_frag_end(struct v3d_compile *c)
{
- /* If the shader has no non-TLB side effects and doesn't write Z
- * we can promote it to enabling early_fragment_tests even
- * if the user didn't.
- */
- if (c->output_position_index == -1 &&
- !(c->s->info.num_images || c->s->info.num_ssbos)) {
- c->s->info.fs.early_fragment_tests = true;
- }
-
if (c->output_sample_mask_index != -1) {
vir_SETMSF_dest(c, vir_nop_reg(),
vir_AND(c,
@@ -1703,55 +1906,75 @@ emit_frag_end(struct v3d_compile *c)
}
struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);
- if (c->output_position_index != -1 &&
- !c->s->info.fs.early_fragment_tests) {
- struct qinst *inst = vir_MOV_dest(c, tlbu_reg,
- c->outputs[c->output_position_index]);
- uint8_t tlb_specifier = TLB_TYPE_DEPTH;
- if (c->devinfo->ver >= 42) {
- tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL |
- TLB_SAMPLE_MODE_PER_PIXEL);
- } else
- tlb_specifier |= TLB_DEPTH_TYPE_PER_PIXEL;
+ /* If the shader has no non-TLB side effects and doesn't write Z
+ * we can promote it to enabling early_fragment_tests even
+ * if the user didn't.
+ */
+ if (c->output_position_index == -1 &&
+ !(c->s->info.num_images || c->s->info.num_ssbos) &&
+ !c->s->info.fs.uses_discard &&
+ !c->s->info.fs.uses_demote &&
+ !c->fs_key->sample_alpha_to_coverage &&
+ c->output_sample_mask_index == -1 &&
+ has_any_tlb_color_write) {
+ c->s->info.fs.early_fragment_tests = true;
+ }
- inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
- tlb_specifier |
- 0xffffff00);
+ /* By default, Z buffer writes are implicit using the Z values produced
+ * from FEP (Z value produced from rasterization). When this is not
+ * desirable (shader writes Z explicitly, has discards, etc) we need
+ * to let the hardware know by setting c->writes_z to true, in which
+ * case we always need to write a Z value from the QPU, even if it is
+ * just the passthrough Z value produced from FEP.
+ *
+ * Also, from the V3D 4.2 spec:
+ *
+ * "If a shader performs a Z read the “Fragment shader does Z writes”
+ * bit in the shader record must be enabled to ensure deterministic
+ * results"
+ *
+ * So if c->reads_z is set we always need to write Z, even if it is
+ * a passthrough from the Z value produced from FEP.
+ */
+ if (!c->s->info.fs.early_fragment_tests || c->reads_z) {
c->writes_z = true;
- } else if (c->s->info.fs.uses_discard ||
- !c->s->info.fs.early_fragment_tests ||
- c->fs_key->sample_alpha_to_coverage ||
- !has_any_tlb_color_write) {
- /* Emit passthrough Z if it needed to be delayed until shader
- * end due to potential discards.
- *
- * Since (single-threaded) fragment shaders always need a TLB
- * write, emit passthrouh Z if we didn't have any color
- * buffers and flag us as potentially discarding, so that we
- * can use Z as the TLB write.
- */
- c->s->info.fs.uses_discard = true;
-
- struct qinst *inst = vir_MOV_dest(c, tlbu_reg,
- vir_nop_reg());
uint8_t tlb_specifier = TLB_TYPE_DEPTH;
+ struct qinst *inst;
+
+ if (c->output_position_index != -1) {
+ /* Shader writes to gl_FragDepth, use that */
+ inst = vir_MOV_dest(c, tlbu_reg,
+ c->outputs[c->output_position_index]);
+
+ tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL |
+ TLB_SAMPLE_MODE_PER_PIXEL);
+ } else {
+ /* Shader doesn't write to gl_FragDepth, take Z from
+ * FEP.
+ */
+ c->writes_z_from_fep = true;
+ inst = vir_MOV_dest(c, tlbu_reg, vir_nop_reg());
- if (c->devinfo->ver >= 42) {
/* The spec says the PER_PIXEL flag is ignored for
* invariant writes, but the simulator demands it.
*/
tlb_specifier |= (TLB_V42_DEPTH_TYPE_INVARIANT |
TLB_SAMPLE_MODE_PER_PIXEL);
- } else {
- tlb_specifier |= TLB_DEPTH_TYPE_INVARIANT;
+
+ /* Since (single-threaded) fragment shaders always need
+ * a TLB write, if we don't have any we emit a
+ * passthrough Z and flag ourselves as potentially discarding,
+ * so that we can use Z as the required TLB write.
+ */
+ if (!has_any_tlb_color_write)
+ c->s->info.fs.uses_discard = true;
}
- inst->uniform = vir_get_uniform_index(c,
- QUNIFORM_CONSTANT,
+ inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
tlb_specifier |
0xffffff00);
- c->writes_z = true;
+ inst->is_tlb_z_write = true;
}
/* XXX: Performance improvement: Merge Z write and color writes TLB
@@ -1767,7 +1990,6 @@ vir_VPM_WRITE_indirect(struct v3d_compile *c,
struct qreg vpm_index,
bool uniform_vpm_index)
{
- assert(c->devinfo->ver >= 40);
if (uniform_vpm_index)
vir_STVPMV(c, vpm_index, val);
else
@@ -1777,13 +1999,8 @@ vir_VPM_WRITE_indirect(struct v3d_compile *c,
static void
vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index)
{
- if (c->devinfo->ver >= 40) {
- vir_VPM_WRITE_indirect(c, val,
- vir_uniform_ui(c, vpm_index), true);
- } else {
- /* XXX: v3d33_vir_vpm_write_setup(c); */
- vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val);
- }
+ vir_VPM_WRITE_indirect(c, val,
+ vir_uniform_ui(c, vpm_index), true);
}
static void
@@ -1791,7 +2008,7 @@ emit_vert_end(struct v3d_compile *c)
{
/* GFXH-1684: VPM writes need to be complete by the end of the shader.
*/
- if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42)
+ if (c->devinfo->ver == 42)
vir_VPMWT(c);
}
@@ -1800,7 +2017,7 @@ emit_geom_end(struct v3d_compile *c)
{
/* GFXH-1684: VPM writes need to be complete by the end of the shader.
*/
- if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42)
+ if (c->devinfo->ver == 42)
vir_VPMWT(c);
}
@@ -1812,8 +2029,11 @@ mem_vectorize_callback(unsigned align_mul, unsigned align_offset,
nir_intrinsic_instr *high,
void *data)
{
- /* Our backend is 32-bit only at present */
- if (bit_size != 32)
+ /* TMU general access only supports 32-bit vectors */
+ if (bit_size > 32)
+ return false;
+
+ if ((bit_size == 8 || bit_size == 16) && num_components > 1)
return false;
if (align_mul % 4 != 0 || align_offset % 4 != 0)
@@ -1843,7 +2063,29 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s)
do {
progress = false;
- NIR_PASS_V(s, nir_lower_vars_to_ssa);
+ NIR_PASS(progress, s, nir_split_array_vars, nir_var_function_temp);
+ NIR_PASS(progress, s, nir_shrink_vec_array_vars, nir_var_function_temp);
+ NIR_PASS(progress, s, nir_opt_deref);
+
+ NIR_PASS(progress, s, nir_lower_vars_to_ssa);
+ if (!s->info.var_copies_lowered) {
+ /* Only run this pass if nir_lower_var_copies was not called
+ * yet. That would lower away any copy_deref instructions and we
+ * don't want to introduce any more.
+ */
+ NIR_PASS(progress, s, nir_opt_find_array_copies);
+ }
+
+ NIR_PASS(progress, s, nir_opt_copy_prop_vars);
+ NIR_PASS(progress, s, nir_opt_dead_write_vars);
+ NIR_PASS(progress, s, nir_opt_combine_stores, nir_var_all);
+
+ NIR_PASS(progress, s, nir_remove_dead_variables,
+ (nir_variable_mode)(nir_var_function_temp |
+ nir_var_shader_temp |
+ nir_var_mem_shared),
+ NULL);
+
NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL);
NIR_PASS(progress, s, nir_lower_phis_to_scalar, false);
NIR_PASS(progress, s, nir_copy_prop);
@@ -1851,10 +2093,39 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s)
NIR_PASS(progress, s, nir_opt_dce);
NIR_PASS(progress, s, nir_opt_dead_cf);
NIR_PASS(progress, s, nir_opt_cse);
- NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
+ NIR_PASS(progress, s, nir_opt_peephole_select, 0, false, false);
+ NIR_PASS(progress, s, nir_opt_peephole_select, 24, true, true);
NIR_PASS(progress, s, nir_opt_algebraic);
NIR_PASS(progress, s, nir_opt_constant_folding);
+ NIR_PASS(progress, s, nir_opt_intrinsics);
+ NIR_PASS(progress, s, nir_opt_idiv_const, 32);
+ NIR_PASS(progress, s, nir_lower_alu);
+
+ if (nir_opt_loop(s)) {
+ progress = true;
+ NIR_PASS(progress, s, nir_copy_prop);
+ NIR_PASS(progress, s, nir_opt_dce);
+ }
+
+ NIR_PASS(progress, s, nir_opt_conditional_discard);
+
+ NIR_PASS(progress, s, nir_opt_remove_phis);
+ NIR_PASS(progress, s, nir_opt_if, false);
+ if (c && !c->disable_gcm) {
+ bool local_progress = false;
+ NIR_PASS(local_progress, s, nir_opt_gcm, false);
+ c->gcm_progress |= local_progress;
+ progress |= local_progress;
+ }
+
+ /* Note that vectorization may undo the load/store scalarization
+ * pass we run for non 32-bit TMU general load/store by
+ * converting, for example, 2 consecutive 16-bit loads into a
+ * single 32-bit load. This is fine (and desirable) as long as
+ * the resulting 32-bit load meets 32-bit alignment requirements,
+ * which mem_vectorize_callback() should be enforcing.
+ */
nir_load_store_vectorize_options vectorize_opts = {
.modes = nir_var_mem_ssbo | nir_var_mem_ubo |
nir_var_mem_push_const | nir_var_mem_shared |
@@ -1862,7 +2133,24 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s)
.callback = mem_vectorize_callback,
.robust_modes = 0,
};
- NIR_PASS(progress, s, nir_opt_load_store_vectorize, &vectorize_opts);
+ bool vectorize_progress = false;
+
+
+ /* This requires that we have called
+ * nir_lower_vars_to_explicit_types / nir_lower_explicit_io
+ * first, which we may not have done yet if we call here too
+ * early during NIR pre-processing. We can detect this because
+ * in that case we won't have a compile object.
+ */
+ if (c) {
+ NIR_PASS(vectorize_progress, s, nir_opt_load_store_vectorize,
+ &vectorize_opts);
+ if (vectorize_progress) {
+ NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL);
+ NIR_PASS(progress, s, nir_lower_pack);
+ progress = true;
+ }
+ }
if (lower_flrp != 0) {
bool lower_flrp_progress = false;
@@ -1895,10 +2183,8 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s)
nir_move_options sink_opts =
nir_move_const_undef | nir_move_comparisons | nir_move_copies |
- nir_move_load_ubo;
+ nir_move_load_ubo | nir_move_load_ssbo | nir_move_load_uniform;
NIR_PASS(progress, s, nir_opt_sink, sink_opts);
-
- NIR_PASS(progress, s, nir_opt_move, nir_move_load_ubo);
}
static int
@@ -1915,27 +2201,9 @@ ntq_emit_vpm_read(struct v3d_compile *c,
uint32_t *remaining,
uint32_t vpm_index)
{
- struct qreg vpm = vir_reg(QFILE_VPM, vpm_index);
-
- if (c->devinfo->ver >= 40 ) {
- return vir_LDVPMV_IN(c,
- vir_uniform_ui(c,
- (*num_components_queued)++));
- }
-
- if (*num_components_queued != 0) {
- (*num_components_queued)--;
- return vir_MOV(c, vpm);
- }
-
- uint32_t num_components = MIN2(*remaining, 32);
-
- v3d33_vir_vpm_read_setup(c, num_components);
-
- *num_components_queued = num_components - 1;
- *remaining -= num_components;
-
- return vir_MOV(c, vpm);
+ return vir_LDVPMV_IN(c,
+ vir_uniform_ui(c,
+ (*num_components_queued)++));
}
static void
@@ -2005,31 +2273,8 @@ ntq_setup_vs_inputs(struct v3d_compile *c)
}
/* The actual loads will happen directly in nir_intrinsic_load_input
- * on newer versions.
*/
- if (c->devinfo->ver >= 40)
- return;
-
- for (int loc = 0; loc < ARRAY_SIZE(c->vattr_sizes); loc++) {
- resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
- (loc + 1) * 4);
-
- for (int i = 0; i < c->vattr_sizes[loc]; i++) {
- c->inputs[loc * 4 + i] =
- ntq_emit_vpm_read(c,
- &vpm_components_queued,
- &num_components,
- loc * 4 + i);
-
- }
- }
-
- if (c->devinfo->ver >= 40) {
- assert(vpm_components_queued == num_components);
- } else {
- assert(vpm_components_queued == 0);
- assert(num_components == 0);
- }
+ return;
}
static bool
@@ -2058,14 +2303,14 @@ ntq_setup_gs_inputs(struct v3d_compile *c)
*/
assert(glsl_type_is_array(var->type));
const struct glsl_type *type = glsl_get_array_element(var->type);
- unsigned array_len = MAX2(glsl_get_length(type), 1);
+ unsigned var_len = glsl_count_vec4_slots(type, false, false);
unsigned loc = var->data.driver_location;
resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
- (loc + array_len) * 4);
+ (loc + var_len) * 4);
if (var->data.compact) {
- for (unsigned j = 0; j < array_len; j++) {
+ for (unsigned j = 0; j < var_len; j++) {
unsigned input_idx = c->num_inputs++;
unsigned loc_frac = var->data.location_frac + j;
unsigned loc = var->data.location + loc_frac / 4;
@@ -2076,8 +2321,10 @@ ntq_setup_gs_inputs(struct v3d_compile *c)
continue;
}
- for (unsigned j = 0; j < array_len; j++) {
- unsigned num_elements = glsl_get_vector_elements(type);
+ for (unsigned j = 0; j < var_len; j++) {
+ unsigned num_elements =
+ glsl_type_is_struct(glsl_without_array(type)) ?
+ 4 : glsl_get_vector_elements(type);
for (unsigned k = 0; k < num_elements; k++) {
unsigned chan = var->data.location_frac + k;
unsigned input_idx = c->num_inputs++;
@@ -2124,7 +2371,7 @@ ntq_setup_fs_inputs(struct v3d_compile *c)
} else if (var->data.compact) {
for (int j = 0; j < var_len; j++)
emit_compact_fragment_input(c, loc, var, j);
- } else if (glsl_type_is_struct(var->type)) {
+ } else if (glsl_type_is_struct(glsl_without_array(var->type))) {
for (int j = 0; j < var_len; j++) {
emit_fragment_input(c, loc, var, j, 4);
}
@@ -2143,12 +2390,9 @@ ntq_setup_outputs(struct v3d_compile *c)
return;
nir_foreach_shader_out_variable(var, c->s) {
- unsigned array_len = MAX2(glsl_get_length(var->type), 1);
+ assert(glsl_type_is_vector_or_scalar(var->type));
unsigned loc = var->data.driver_location * 4;
- assert(array_len == 1);
- (void)array_len;
-
for (int i = 0; i < 4 - var->data.location_frac; i++) {
add_output(c, loc + var->data.location_frac + i,
var->data.location,
@@ -2157,15 +2401,17 @@ ntq_setup_outputs(struct v3d_compile *c)
switch (var->data.location) {
case FRAG_RESULT_COLOR:
- c->output_color_var[0] = var;
- c->output_color_var[1] = var;
- c->output_color_var[2] = var;
- c->output_color_var[3] = var;
+ for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++)
+ c->output_color_var[i] = var;
break;
case FRAG_RESULT_DATA0:
case FRAG_RESULT_DATA1:
case FRAG_RESULT_DATA2:
case FRAG_RESULT_DATA3:
+ case FRAG_RESULT_DATA4:
+ case FRAG_RESULT_DATA5:
+ case FRAG_RESULT_DATA6:
+ case FRAG_RESULT_DATA7:
c->output_color_var[var->data.location -
FRAG_RESULT_DATA0] = var;
break;
@@ -2185,17 +2431,19 @@ ntq_setup_outputs(struct v3d_compile *c)
* Each nir_register gets a struct qreg per 32-bit component being stored.
*/
static void
-ntq_setup_registers(struct v3d_compile *c, struct exec_list *list)
+ntq_setup_registers(struct v3d_compile *c, nir_function_impl *impl)
{
- foreach_list_typed(nir_register, nir_reg, node, list) {
- unsigned array_len = MAX2(nir_reg->num_array_elems, 1);
+ nir_foreach_reg_decl(decl, impl) {
+ unsigned num_components = nir_intrinsic_num_components(decl);
+ unsigned array_len = nir_intrinsic_num_array_elems(decl);
+ array_len = MAX2(array_len, 1);
struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
- array_len *
- nir_reg->num_components);
+ array_len * num_components);
+ nir_def *nir_reg = &decl->def;
_mesa_hash_table_insert(c->def_ht, nir_reg, qregs);
- for (int i = 0; i < array_len * nir_reg->num_components; i++)
+ for (int i = 0; i < array_len * num_components; i++)
qregs[i] = vir_get_temp(c);
}
}
@@ -2222,23 +2470,23 @@ ntq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr)
assert(nir_src_as_uint(instr->src[1]) == 0);
- ntq_store_dest(c, &instr->dest, 0,
+ ntq_store_def(c, &instr->def, 0,
vir_uniform(c, QUNIFORM_IMAGE_WIDTH, image_index));
if (instr->num_components > 1) {
- ntq_store_dest(c, &instr->dest, 1,
- vir_uniform(c,
- instr->num_components == 2 && is_array ?
- QUNIFORM_IMAGE_ARRAY_SIZE :
- QUNIFORM_IMAGE_HEIGHT,
- image_index));
+ ntq_store_def(c, &instr->def, 1,
+ vir_uniform(c,
+ instr->num_components == 2 && is_array ?
+ QUNIFORM_IMAGE_ARRAY_SIZE :
+ QUNIFORM_IMAGE_HEIGHT,
+ image_index));
}
if (instr->num_components > 2) {
- ntq_store_dest(c, &instr->dest, 2,
- vir_uniform(c,
- is_array ?
- QUNIFORM_IMAGE_ARRAY_SIZE :
- QUNIFORM_IMAGE_DEPTH,
- image_index));
+ ntq_store_def(c, &instr->def, 2,
+ vir_uniform(c,
+ is_array ?
+ QUNIFORM_IMAGE_ARRAY_SIZE :
+ QUNIFORM_IMAGE_DEPTH,
+ image_index));
}
}
@@ -2263,16 +2511,14 @@ vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr)
*
* To fix that, we make sure we always emit a thread switch before the
* first tlb color read. If that happens to be the last thread switch
- * we emit, then everything is fine, but otherwsie, if any code after
+ * we emit, then everything is fine, but otherwise, if any code after
* this point needs to emit additional thread switches, then we will
* switch the strategy to locking the scoreboard on the first thread
* switch instead -- see vir_emit_thrsw().
*/
if (!c->emitted_tlb_load) {
- if (!c->last_thrsw_at_top_level) {
- assert(c->devinfo->ver >= 41);
+ if (!c->last_thrsw_at_top_level)
vir_emit_thrsw(c);
- }
c->emitted_tlb_load = true;
}
@@ -2371,27 +2617,96 @@ vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr)
}
assert(color_reads_for_sample[component].file != QFILE_NULL);
- ntq_store_dest(c, &instr->dest, 0,
- vir_MOV(c, color_reads_for_sample[component]));
+ ntq_store_def(c, &instr->def, 0,
+ vir_MOV(c, color_reads_for_sample[component]));
+}
+
+static bool
+ntq_emit_load_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr);
+
+static bool
+try_emit_uniform(struct v3d_compile *c,
+ int offset,
+ int num_components,
+ nir_def *def,
+ enum quniform_contents contents)
+{
+ /* Even though ldunif is strictly 32-bit we can still use it
+ * to load scalar 8-bit/16-bit uniforms so long as their offset
+ * is 32-bit aligned. In this case, ldunif would still load
+ * 32-bit into the destination with the 8-bit/16-bit uniform
+ * data in the LSB and garbage in the MSB, but that is fine
+ * because we should only be accessing the valid bits of the
+ * destination.
+ *
+ * FIXME: if in the future we improve our register allocator to
+ * pack 2 16-bit variables in the MSB and LSB of the same
+ * register then this optimization would not be valid as is,
+ * since the load clobbers the MSB.
+ */
+ if (offset % 4 != 0)
+ return false;
+
+ /* We need dwords */
+ offset = offset / 4;
+
+ for (int i = 0; i < num_components; i++) {
+ ntq_store_def(c, def, i, vir_uniform(c, contents, offset + i));
+ }
+
+ return true;
}
static void
ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr)
{
+ /* We scalarize general TMU access for anything that is not 32-bit. */
+ assert(instr->def.bit_size == 32 ||
+ instr->num_components == 1);
+
+ /* Try to emit ldunif if possible, otherwise fall back to general TMU */
if (nir_src_is_const(instr->src[0])) {
int offset = (nir_intrinsic_base(instr) +
nir_src_as_uint(instr->src[0]));
- assert(offset % 4 == 0);
- /* We need dwords */
- offset = offset / 4;
- for (int i = 0; i < instr->num_components; i++) {
- ntq_store_dest(c, &instr->dest, i,
- vir_uniform(c, QUNIFORM_UNIFORM,
- offset + i));
+
+ if (try_emit_uniform(c, offset, instr->num_components,
+ &instr->def, QUNIFORM_UNIFORM)) {
+ return;
+ }
+ }
+
+ if (!ntq_emit_load_unifa(c, instr)) {
+ ntq_emit_tmu_general(c, instr, false, false);
+ c->has_general_tmu_load = true;
+ }
+}
+
+static bool
+ntq_emit_inline_ubo_load(struct v3d_compile *c, nir_intrinsic_instr *instr)
+{
+ if (c->compiler->max_inline_uniform_buffers <= 0)
+ return false;
+
+ /* Regular UBOs start after inline UBOs */
+ uint32_t index = nir_src_as_uint(instr->src[0]);
+ if (index >= c->compiler->max_inline_uniform_buffers)
+ return false;
+
+ /* We scalarize general TMU access for anything that is not 32-bit */
+ assert(instr->def.bit_size == 32 ||
+ instr->num_components == 1);
+
+ if (nir_src_is_const(instr->src[1])) {
+ int offset = nir_src_as_uint(instr->src[1]);
+ if (try_emit_uniform(c, offset, instr->num_components,
+ &instr->def,
+ QUNIFORM_INLINE_UBO_0 + index)) {
+ return true;
}
- } else {
- ntq_emit_tmu_general(c, instr, false);
}
+
+ /* Fall back to a regular UBO load */
+ return false;
}
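(Editor's aside, not part of the patch: the index split used above can be summarized by this small sketch; the helper name is invented, the field name is taken from the code above.)

/* Inline UBOs occupy descriptor indices [0, max_inline_uniform_buffers);
 * regular UBOs start right after them and take the regular load path. */
static bool
is_inline_ubo_index(const struct v3d_compiler *compiler, uint32_t index)
{
        return compiler->max_inline_uniform_buffers > 0 &&
               index < compiler->max_inline_uniform_buffers;
}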
static void
@@ -2411,7 +2726,7 @@ ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr)
unsigned offset =
nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0]);
- if (c->s->info.stage != MESA_SHADER_FRAGMENT && c->devinfo->ver >= 40) {
+ if (c->s->info.stage != MESA_SHADER_FRAGMENT) {
/* Emit the LDVPM directly now, rather than at the top
* of the shader like we did for V3D 3.x (which needs
* vpmsetup when not just taking the next offset).
@@ -2433,19 +2748,38 @@ ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr)
SYSTEM_VALUE_VERTEX_ID)) {
index++;
}
- for (int i = 0; i < offset; i++)
- index += c->vattr_sizes[i];
+
+ for (int i = 0; i < offset; i++) {
+ /* GFXH-1602: if any builtins (vid, iid, etc) are read then
+ * attribute 0 must be active (size > 0). When we hit this,
+ * the driver is expected to program attribute 0 to have a
+ * size of 1, so here we need to add that.
+ */
+ if (i == 0 && c->vs_key->is_coord &&
+ c->vattr_sizes[i] == 0 && index > 0) {
+ index++;
+ } else {
+ index += c->vattr_sizes[i];
+ }
+ }
+
index += nir_intrinsic_component(instr);
for (int i = 0; i < instr->num_components; i++) {
struct qreg vpm_offset = vir_uniform_ui(c, index++);
- ntq_store_dest(c, &instr->dest, i,
- vir_LDVPMV_IN(c, vpm_offset));
+ ntq_store_def(c, &instr->def, i,
+ vir_LDVPMV_IN(c, vpm_offset));
}
} else {
for (int i = 0; i < instr->num_components; i++) {
int comp = nir_intrinsic_component(instr) + i;
- ntq_store_dest(c, &instr->dest, i,
- vir_MOV(c, c->inputs[offset * 4 + comp]));
+ struct qreg input = c->inputs[offset * 4 + comp];
+ ntq_store_def(c, &instr->def, i, vir_MOV(c, input));
+
+ if (c->s->info.stage == MESA_SHADER_FRAGMENT &&
+ input.file == c->payload_z.file &&
+ input.index == c->payload_z.index) {
+ c->reads_z = true;
+ }
}
}
}
@@ -2610,18 +2944,18 @@ ntq_get_barycentric_centroid(struct v3d_compile *c,
/* sN = TRUE if sample N enabled in sample mask, FALSE otherwise */
struct qreg F = vir_uniform_ui(c, 0);
struct qreg T = vir_uniform_ui(c, ~0);
- struct qreg s0 = vir_XOR(c, vir_AND(c, sample_mask, i1), i1);
+ struct qreg s0 = vir_AND(c, sample_mask, i1);
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ);
- s0 = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
- struct qreg s1 = vir_XOR(c, vir_AND(c, sample_mask, i2), i2);
+ s0 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F);
+ struct qreg s1 = vir_AND(c, sample_mask, i2);
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ);
- s1 = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
- struct qreg s2 = vir_XOR(c, vir_AND(c, sample_mask, i4), i4);
+ s1 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F);
+ struct qreg s2 = vir_AND(c, sample_mask, i4);
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ);
- s2 = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
- struct qreg s3 = vir_XOR(c, vir_AND(c, sample_mask, i8), i8);
+ s2 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F);
+ struct qreg s3 = vir_AND(c, sample_mask, i8);
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s3), V3D_QPU_PF_PUSHZ);
- s3 = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
+ s3 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F);
/* sample_idx = s0 ? 0 : s2 ? 2 : s1 ? 1 : 3 */
struct qreg sample_idx = i3;
@@ -2708,28 +3042,142 @@ emit_ldunifa(struct v3d_compile *c, struct qreg *result)
c->current_unifa_offset += 4;
}
-static void
-ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr)
+/* Checks if the value of a nir src is derived from a nir register */
+static bool
+nir_src_derived_from_reg(nir_src src)
+{
+ nir_def *def = src.ssa;
+ if (nir_load_reg_for_def(def))
+ return true;
+
+ nir_instr *parent = def->parent_instr;
+ switch (parent->type) {
+ case nir_instr_type_alu: {
+ nir_alu_instr *alu = nir_instr_as_alu(parent);
+ int num_srcs = nir_op_infos[alu->op].num_inputs;
+ for (int i = 0; i < num_srcs; i++) {
+ if (nir_src_derived_from_reg(alu->src[i].src))
+ return true;
+ }
+ return false;
+ }
+ case nir_instr_type_intrinsic: {
+ nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent);
+ int num_srcs = nir_intrinsic_infos[intr->intrinsic].num_srcs;
+ for (int i = 0; i < num_srcs; i++) {
+ if (nir_src_derived_from_reg(intr->src[i]))
+ return true;
+ }
+ return false;
+ }
+ case nir_instr_type_load_const:
+ case nir_instr_type_undef:
+ return false;
+ default:
+ /* By default we assume it may come from a register; the above
+ * cases should be able to handle the majority of situations
+ * though.
+ */
+ return true;
+ };
+}
+
+static bool
+ntq_emit_load_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr)
{
+ assert(instr->intrinsic == nir_intrinsic_load_ubo ||
+ instr->intrinsic == nir_intrinsic_load_ssbo ||
+ instr->intrinsic == nir_intrinsic_load_uniform);
+
+ bool is_uniform = instr->intrinsic == nir_intrinsic_load_uniform;
+ bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo;
+ bool is_ssbo = instr->intrinsic == nir_intrinsic_load_ssbo;
+
/* Every ldunifa auto-increments the unifa address by 4 bytes, so our
* current unifa offset is 4 bytes ahead of the offset of the last load.
*/
static const int32_t max_unifa_skip_dist =
MAX_UNIFA_SKIP_DISTANCE - 4;
- bool dynamic_src = !nir_src_is_const(instr->src[1]);
- uint32_t const_offset =
- dynamic_src ? 0 : nir_src_as_uint(instr->src[1]);
+ /* We can only use unifa if the offset is uniform */
+ nir_src offset = is_uniform ? instr->src[0] : instr->src[1];
+ if (nir_src_is_divergent(offset))
+ return false;
- /* On OpenGL QUNIFORM_UBO_ADDR takes a UBO index
- * shifted up by 1 (0 is gallium's constant buffer 0).
+ /* Emitting loads from unifa may not be safe under non-uniform control
+ * flow. It seems the address that is used to write to the unifa
+ * register is taken from the first lane and if that lane is disabled
+ * by control flow then the value we read may be bogus and lead to
+ * invalid memory accesses on follow-up ldunifa instructions. However,
+ * ntq_store_def only emits conditional writes for nir registers, so
+ * as long as we can be certain that the offset isn't derived from a
+ * load_reg we should be fine.
+ *
+ * The following CTS test can be used to trigger the problem, which
+ * causes a GMP violation in the sim without this check:
+ * dEQP-VK.subgroups.ballot_broadcast.graphics.subgroupbroadcastfirst_int
*/
- uint32_t index = nir_src_as_uint(instr->src[0]);
- if (c->key->environment == V3D_ENVIRONMENT_OPENGL)
+ if (vir_in_nonuniform_control_flow(c) &&
+ nir_src_derived_from_reg(offset)) {
+ return false;
+ }
+
+ /* We can only use unifa with SSBOs if they are read-only. Otherwise
+ * ldunifa won't see the shader writes to that address (possibly
+ * because ldunifa doesn't read from the L2T cache).
+ */
+ if (is_ssbo && !(nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE))
+ return false;
+
+ /* Just as with SSBOs, we can't use ldunifa to read indirect uniforms
+ * that may have been written to scratch using the TMU.
+ */
+ bool dynamic_src = !nir_src_is_const(offset);
+ if (is_uniform && dynamic_src && c->s->scratch_size > 0)
+ return false;
+
+ uint32_t const_offset = dynamic_src ? 0 : nir_src_as_uint(offset);
+ if (is_uniform)
+ const_offset += nir_intrinsic_base(instr);
+
+ /* ldunifa is a 32-bit load instruction so we can only use it with
+ * 32-bit aligned addresses. We always produce 32-bit aligned addresses
+ * except for types smaller than 32-bit, so in these cases we can only
+ * use ldunifa if we can verify alignment, which we can only do for
+ * loads with a constant offset.
+ */
+ uint32_t bit_size = instr->def.bit_size;
+ uint32_t value_skips = 0;
+ if (bit_size < 32) {
+ if (dynamic_src) {
+ return false;
+ } else if (const_offset % 4 != 0) {
+ /* If we are loading from an unaligned offset, fix
+ * alignment and skip over unused elements in result.
+ */
+ value_skips = (const_offset % 4) / (bit_size / 8);
+ const_offset &= ~0x3;
+ }
+ }
+
+ assert((bit_size == 32 && value_skips == 0) ||
+ (bit_size == 16 && value_skips <= 1) ||
+ (bit_size == 8 && value_skips <= 3));
+
+ /* Both Vulkan and OpenGL reserve index 0 for uniforms / push
+ * constants.
+ */
+ uint32_t index = is_uniform ? 0 : nir_src_as_uint(instr->src[0]);
+
+ /* QUNIFORM_UBO_ADDR takes a UBO index shifted up by 1 since we use
+ * index 0 for Gallium's constant buffer (GL) or push constants
+ * (Vulkan).
+ */
+ if (is_ubo)
index++;
/* We can only keep track of the last unifa address we used with
- * constant offset loads. If the new load targets the same UBO and
+ * constant offset loads. If the new load targets the same buffer and
* is close enough to the previous load, we can skip the unifa register
* write by emitting dummy ldunifa instructions to update the unifa
* address.
@@ -2739,6 +3187,7 @@ ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr)
if (dynamic_src) {
c->current_unifa_block = NULL;
} else if (c->cur_block == c->current_unifa_block &&
+ c->current_unifa_is_ubo == !is_ssbo &&
c->current_unifa_index == index &&
c->current_unifa_offset <= const_offset &&
c->current_unifa_offset + max_unifa_skip_dist >= const_offset) {
@@ -2746,32 +3195,98 @@ ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr)
ldunifa_skips = (const_offset - c->current_unifa_offset) / 4;
} else {
c->current_unifa_block = c->cur_block;
+ c->current_unifa_is_ubo = !is_ssbo;
c->current_unifa_index = index;
c->current_unifa_offset = const_offset;
}
if (!skip_unifa) {
- struct qreg base_offset =
+ struct qreg base_offset = !is_ssbo ?
vir_uniform(c, QUNIFORM_UBO_ADDR,
- v3d_unit_data_create(index, const_offset));
+ v3d_unit_data_create(index, const_offset)) :
+ vir_uniform(c, QUNIFORM_SSBO_OFFSET, index);
struct qreg unifa = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA);
if (!dynamic_src) {
- vir_MOV_dest(c, unifa, base_offset);
+ if (!is_ssbo) {
+ /* Avoid the extra MOV to UNIFA by making
+ * ldunif load directly into it. We can't
+ * do this if we have not actually emitted
+ * ldunif and are instead reusing a previous
+ * one.
+ */
+ struct qinst *inst =
+ (struct qinst *)c->cur_block->instructions.prev;
+ if (inst == c->defs[base_offset.index]) {
+ inst->dst = unifa;
+ c->defs[base_offset.index] = NULL;
+ } else {
+ vir_MOV_dest(c, unifa, base_offset);
+ }
+ } else {
+ vir_ADD_dest(c, unifa, base_offset,
+ vir_uniform_ui(c, const_offset));
+ }
} else {
vir_ADD_dest(c, unifa, base_offset,
- ntq_get_src(c, instr->src[1], 0));
+ ntq_get_src(c, offset, 0));
}
} else {
for (int i = 0; i < ldunifa_skips; i++)
emit_ldunifa(c, NULL);
}
- for (uint32_t i = 0; i < nir_intrinsic_dest_components(instr); i++) {
+ uint32_t num_components = nir_intrinsic_dest_components(instr);
+ for (uint32_t i = 0; i < num_components; ) {
struct qreg data;
emit_ldunifa(c, &data);
- ntq_store_dest(c, &instr->dest, i, vir_MOV(c, data));
+
+ if (bit_size == 32) {
+ assert(value_skips == 0);
+ ntq_store_def(c, &instr->def, i, vir_MOV(c, data));
+ i++;
+ } else {
+ assert((bit_size == 16 && value_skips <= 1) ||
+ (bit_size == 8 && value_skips <= 3));
+
+ /* If we have any values to skip, shift to the first
+ * valid value in the ldunifa result.
+ */
+ if (value_skips > 0) {
+ data = vir_SHR(c, data,
+ vir_uniform_ui(c, bit_size *
+ value_skips));
+ }
+
+ /* Check how many valid components we have, discounting
+ * the components we need to skip.
+ */
+ uint32_t valid_count = (32 / bit_size) - value_skips;
+ assert((bit_size == 16 && valid_count <= 2) ||
+ (bit_size == 8 && valid_count <= 4));
+ assert(valid_count > 0);
+
+ /* Process the valid components */
+ do {
+ struct qreg tmp;
+ uint32_t mask = (1 << bit_size) - 1;
+ tmp = vir_AND(c, vir_MOV(c, data),
+ vir_uniform_ui(c, mask));
+ ntq_store_def(c, &instr->def, i,
+ vir_MOV(c, tmp));
+ i++;
+ valid_count--;
+
+ /* Shift to next component */
+ if (i < num_components && valid_count > 0) {
+ data = vir_SHR(c, data,
+ vir_uniform_ui(c, bit_size));
+ }
+ } while (i < num_components && valid_count > 0);
+ }
}
+
+ return true;
}
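(Editor's aside, not part of the patch: the shift/mask sequence emitted above for sub-32-bit loads behaves like this host-side model of a single ldunifa word; the helper name is invented.)

/* Extract 'count' elements of 'bit_size' (8 or 16) bits from one 32-bit
 * word returned by ldunifa, skipping 'value_skips' leading elements of
 * an unaligned constant offset. */
static void
unpack_unifa_word(uint32_t data, unsigned bit_size, unsigned value_skips,
                  unsigned count, uint32_t *out)
{
        const uint32_t mask = (1u << bit_size) - 1;
        if (value_skips > 0)
                data >>= bit_size * value_skips;   /* drop the skipped elements */
        for (unsigned i = 0; i < count; i++) {
                out[i] = data & mask;              /* AND with (1 << bit_size) - 1 */
                data >>= bit_size;                 /* step to the next element */
        }
}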
static inline struct qreg
@@ -2781,187 +3296,273 @@ emit_load_local_invocation_index(struct v3d_compile *c)
vir_uniform_ui(c, 32 - c->local_invocation_index_bits));
}
-/* Various subgroup operations rely on the A flags, so this helper ensures that
- * A flags represents currently active lanes in the subgroup.
+/* For the purposes of reduction operations (ballot, alleq, allfeq, bcastf) in
+ * fragment shaders a lane is considered active if any sample flags are set
+ * for *any* lane in the same quad; however, we still need to ensure that
+ * terminated lanes (OpTerminate) are not included. Further, we also need to
+ * disable lanes that may be disabled because of non-uniform control
+ * flow.
*/
-static void
-set_a_flags_for_subgroup(struct v3d_compile *c)
+static enum v3d_qpu_cond
+setup_subgroup_control_flow_condition(struct v3d_compile *c)
{
- /* MSF returns 0 for disabled lanes in compute shaders so
- * PUSHZ will set A=1 for disabled lanes. We want the inverse
- * of this but we don't have any means to negate the A flags
- * directly, but we can do it by repeating the same operation
- * with NORZ (A = ~A & ~Z).
+ assert(c->s->info.stage == MESA_SHADER_FRAGMENT ||
+ c->s->info.stage == MESA_SHADER_COMPUTE);
+
+ enum v3d_qpu_cond cond = V3D_QPU_COND_NONE;
+
+ /* We need to make sure that terminated lanes in fragment shaders are
+ * not included. We can identify these lanes by comparing the initial
+ * sample mask with the current. This fixes:
+ * dEQP-VK.spirv_assembly.instruction.terminate_invocation.terminate.subgroup_*
*/
- assert(c->s->info.stage == MESA_SHADER_COMPUTE);
- vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ);
- vir_set_uf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_UF_NORZ);
+ if (c->s->info.stage == MESA_SHADER_FRAGMENT && c->emitted_discard) {
+ vir_set_pf(c, vir_AND_dest(c, vir_nop_reg(), c->start_msf,
+ vir_NOT(c, vir_XOR(c, c->start_msf,
+ vir_MSF(c)))),
+ V3D_QPU_PF_PUSHZ);
+ cond = V3D_QPU_COND_IFNA;
+ }
- /* If we are under non-uniform control flow we also need to
- * AND the A flags with the current execute mask.
+ /* If we are in non-uniform control flow, update the condition to
+ * also limit lanes to those in the current execution mask.
*/
if (vir_in_nonuniform_control_flow(c)) {
- const uint32_t bidx = c->cur_block->index;
- vir_set_uf(c, vir_XOR_dest(c, vir_nop_reg(),
- c->execute,
- vir_uniform_ui(c, bidx)),
- V3D_QPU_UF_ANDZ);
+ if (cond == V3D_QPU_COND_IFNA) {
+ vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_UF_NORNZ);
+ } else {
+ assert(cond == V3D_QPU_COND_NONE);
+ vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_PF_PUSHZ);
+ }
+ cond = V3D_QPU_COND_IFA;
}
+
+ return cond;
+}
+
+static void
+emit_compute_barrier(struct v3d_compile *c)
+{
+ /* Ensure we flag the use of the control barrier. NIR's
+ * gather info pass usually takes care of this, but that
+ * requires that we call that pass after any other pass
+ * that may emit a control barrier, so this is safer.
+ */
+ c->s->info.uses_control_barrier = true;
+
+ /* Emit a TSY op to get all invocations in the workgroup
+ * (actually supergroup) to block until the last
+ * invocation reaches the TSY op.
+ */
+ vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_SYNCB));
+}
+
+static void
+emit_barrier(struct v3d_compile *c)
+{
+ struct qreg eidx = vir_EIDX(c);
+
+ /* The config for the TSY op should be set up like this:
+ * - Lane 0: Quorum
+ * - Lane 2: TSO id
+ * - Lane 3: TSY opcode
+ */
+
+ /* Lane 0: we want to synchronize across one subgroup. Here we write to
+ * all lanes unconditionally and will overwrite other lanes below.
+ */
+ struct qreg tsy_conf = vir_uniform_ui(c, 1);
+
+ /* Lane 2: TSO id. We choose a general purpose TSO (id=0..64) using the
+ * current QPU index and thread index to ensure we get a unique one for
+ * this group of invocations in this core.
+ */
+ struct qreg tso_id =
+ vir_AND(c, vir_TIDX(c), vir_uniform_ui(c, 0x0000003f));
+ vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), eidx, vir_uniform_ui(c, 2)),
+ V3D_QPU_PF_PUSHZ);
+ vir_MOV_cond(c, V3D_QPU_COND_IFA, tsy_conf, tso_id);
+
+ /* Lane 3: TSY opcode (set_quorum_wait_inc_check) */
+ struct qreg tsy_op = vir_uniform_ui(c, 16);
+ vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), eidx, vir_uniform_ui(c, 3)),
+ V3D_QPU_PF_PUSHZ);
+ vir_MOV_cond(c, V3D_QPU_COND_IFA, tsy_conf, tsy_op);
+
+ /* Emit TSY sync */
+ vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_SYNCB), tsy_conf);
}
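(Editor's aside, not part of the patch: the conditional MOVs above assemble a per-lane TSY config; as a scalar model, the value each lane writes to SYNCB is the following, with eidx the lane's element index and tso_id = TIDX & 0x3f.)

/* Scalar model of the per-lane TSY configuration built above. */
static uint32_t
tsy_conf_for_lane(uint32_t eidx, uint32_t tso_id)
{
        switch (eidx) {
        case 2:  return tso_id;   /* general purpose TSO id */
        case 3:  return 16;       /* TSY op: set_quorum_wait_inc_check */
        default: return 1;        /* quorum: synchronize one subgroup */
        }
}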
static void
ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
{
switch (instr->intrinsic) {
+ case nir_intrinsic_decl_reg:
+ case nir_intrinsic_load_reg:
+ case nir_intrinsic_store_reg:
+ break; /* Ignore these */
+
case nir_intrinsic_load_uniform:
ntq_emit_load_uniform(c, instr);
break;
+ case nir_intrinsic_load_global_2x32:
+ ntq_emit_tmu_general(c, instr, false, true);
+ c->has_general_tmu_load = true;
+ break;
+
case nir_intrinsic_load_ubo:
- if (!nir_src_is_divergent(instr->src[1]))
- ntq_emit_load_ubo_unifa(c, instr);
- else
- ntq_emit_tmu_general(c, instr, false);
- break;
-
- case nir_intrinsic_ssbo_atomic_add:
- case nir_intrinsic_ssbo_atomic_imin:
- case nir_intrinsic_ssbo_atomic_umin:
- case nir_intrinsic_ssbo_atomic_imax:
- case nir_intrinsic_ssbo_atomic_umax:
- case nir_intrinsic_ssbo_atomic_and:
- case nir_intrinsic_ssbo_atomic_or:
- case nir_intrinsic_ssbo_atomic_xor:
- case nir_intrinsic_ssbo_atomic_exchange:
- case nir_intrinsic_ssbo_atomic_comp_swap:
+ if (ntq_emit_inline_ubo_load(c, instr))
+ break;
+ FALLTHROUGH;
case nir_intrinsic_load_ssbo:
+ if (!ntq_emit_load_unifa(c, instr)) {
+ ntq_emit_tmu_general(c, instr, false, false);
+ c->has_general_tmu_load = true;
+ }
+ break;
+
case nir_intrinsic_store_ssbo:
- ntq_emit_tmu_general(c, instr, false);
- break;
-
- case nir_intrinsic_shared_atomic_add:
- case nir_intrinsic_shared_atomic_imin:
- case nir_intrinsic_shared_atomic_umin:
- case nir_intrinsic_shared_atomic_imax:
- case nir_intrinsic_shared_atomic_umax:
- case nir_intrinsic_shared_atomic_and:
- case nir_intrinsic_shared_atomic_or:
- case nir_intrinsic_shared_atomic_xor:
- case nir_intrinsic_shared_atomic_exchange:
- case nir_intrinsic_shared_atomic_comp_swap:
- case nir_intrinsic_load_shared:
+ case nir_intrinsic_ssbo_atomic:
+ case nir_intrinsic_ssbo_atomic_swap:
+ ntq_emit_tmu_general(c, instr, false, false);
+ break;
+
+ case nir_intrinsic_store_global_2x32:
+ case nir_intrinsic_global_atomic_2x32:
+ case nir_intrinsic_global_atomic_swap_2x32:
+ ntq_emit_tmu_general(c, instr, false, true);
+ break;
+
+ case nir_intrinsic_shared_atomic:
+ case nir_intrinsic_shared_atomic_swap:
case nir_intrinsic_store_shared:
- case nir_intrinsic_load_scratch:
case nir_intrinsic_store_scratch:
- ntq_emit_tmu_general(c, instr, true);
+ ntq_emit_tmu_general(c, instr, true, false);
+ break;
+
+ case nir_intrinsic_load_scratch:
+ case nir_intrinsic_load_shared:
+ ntq_emit_tmu_general(c, instr, true, false);
+ c->has_general_tmu_load = true;
break;
- case nir_intrinsic_image_load:
case nir_intrinsic_image_store:
- case nir_intrinsic_image_atomic_add:
- case nir_intrinsic_image_atomic_imin:
- case nir_intrinsic_image_atomic_umin:
- case nir_intrinsic_image_atomic_imax:
- case nir_intrinsic_image_atomic_umax:
- case nir_intrinsic_image_atomic_and:
- case nir_intrinsic_image_atomic_or:
- case nir_intrinsic_image_atomic_xor:
- case nir_intrinsic_image_atomic_exchange:
- case nir_intrinsic_image_atomic_comp_swap:
- v3d40_vir_emit_image_load_store(c, instr);
+ case nir_intrinsic_image_atomic:
+ case nir_intrinsic_image_atomic_swap:
+ v3d_vir_emit_image_load_store(c, instr);
+ break;
+
+ case nir_intrinsic_image_load:
+ v3d_vir_emit_image_load_store(c, instr);
+ /* Not really a general TMU load, but we only use this flag
+ * for NIR scheduling and we do schedule these under the same
+ * policy as general TMU.
+ */
+ c->has_general_tmu_load = true;
break;
case nir_intrinsic_get_ssbo_size:
- ntq_store_dest(c, &instr->dest, 0,
- vir_uniform(c, QUNIFORM_GET_SSBO_SIZE,
- nir_src_comp_as_uint(instr->src[0], 0)));
+ ntq_store_def(c, &instr->def, 0,
+ vir_uniform(c, QUNIFORM_GET_SSBO_SIZE,
+ nir_src_comp_as_uint(instr->src[0], 0)));
break;
case nir_intrinsic_get_ubo_size:
- ntq_store_dest(c, &instr->dest, 0,
- vir_uniform(c, QUNIFORM_GET_UBO_SIZE,
- nir_src_comp_as_uint(instr->src[0], 0)));
+ ntq_store_def(c, &instr->def, 0,
+ vir_uniform(c, QUNIFORM_GET_UBO_SIZE,
+ nir_src_comp_as_uint(instr->src[0], 0)));
break;
case nir_intrinsic_load_user_clip_plane:
for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) {
- ntq_store_dest(c, &instr->dest, i,
- vir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
- nir_intrinsic_ucp_id(instr) *
- 4 + i));
+ ntq_store_def(c, &instr->def, i,
+ vir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
+ nir_intrinsic_ucp_id(instr) *
+ 4 + i));
}
break;
case nir_intrinsic_load_viewport_x_scale:
- ntq_store_dest(c, &instr->dest, 0,
- vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0));
+ ntq_store_def(c, &instr->def, 0,
+ vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0));
break;
case nir_intrinsic_load_viewport_y_scale:
- ntq_store_dest(c, &instr->dest, 0,
- vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0));
+ ntq_store_def(c, &instr->def, 0,
+ vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0));
break;
case nir_intrinsic_load_viewport_z_scale:
- ntq_store_dest(c, &instr->dest, 0,
- vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0));
+ ntq_store_def(c, &instr->def, 0,
+ vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0));
break;
case nir_intrinsic_load_viewport_z_offset:
- ntq_store_dest(c, &instr->dest, 0,
- vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0));
+ ntq_store_def(c, &instr->def, 0,
+ vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0));
break;
case nir_intrinsic_load_line_coord:
- ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->line_x));
+ ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->line_x));
break;
case nir_intrinsic_load_line_width:
- ntq_store_dest(c, &instr->dest, 0,
- vir_uniform(c, QUNIFORM_LINE_WIDTH, 0));
+ ntq_store_def(c, &instr->def, 0,
+ vir_uniform(c, QUNIFORM_LINE_WIDTH, 0));
break;
case nir_intrinsic_load_aa_line_width:
- ntq_store_dest(c, &instr->dest, 0,
- vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0));
+ ntq_store_def(c, &instr->def, 0,
+ vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0));
break;
case nir_intrinsic_load_sample_mask_in:
- ntq_store_dest(c, &instr->dest, 0, vir_MSF(c));
+ ntq_store_def(c, &instr->def, 0, vir_MSF(c));
break;
case nir_intrinsic_load_helper_invocation:
vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ);
struct qreg qdest = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
- ntq_store_dest(c, &instr->dest, 0, qdest);
+ ntq_store_def(c, &instr->def, 0, qdest);
break;
case nir_intrinsic_load_front_face:
/* The register contains 0 (front) or 1 (back), and we need to
* turn it into a NIR bool where true means front.
*/
- ntq_store_dest(c, &instr->dest, 0,
- vir_ADD(c,
- vir_uniform_ui(c, -1),
- vir_REVF(c)));
+ ntq_store_def(c, &instr->def, 0,
+ vir_ADD(c,
+ vir_uniform_ui(c, -1),
+ vir_REVF(c)));
break;
case nir_intrinsic_load_base_instance:
- ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->biid));
+ ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->biid));
break;
case nir_intrinsic_load_instance_id:
- ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->iid));
+ ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->iid));
break;
case nir_intrinsic_load_vertex_id:
- ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid));
+ ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->vid));
+ break;
+
+ case nir_intrinsic_load_draw_id:
+ ntq_store_def(c, &instr->def, 0, vir_uniform(c, QUNIFORM_DRAW_ID, 0));
break;
case nir_intrinsic_load_tlb_color_v3d:
vir_emit_tlb_color_read(c, instr);
break;
+ case nir_intrinsic_load_fep_w_v3d:
+ ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->payload_w));
+ break;
+
case nir_intrinsic_load_input:
ntq_emit_load_input(c, instr);
break;
@@ -2978,7 +3579,19 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
ntq_emit_image_size(c, instr);
break;
+ /* FIXME: the Vulkan and SPIR-V specs specify that OpTerminate (which
+ * is intended to match the semantics of GLSL's discard) should
+ * terminate the invocation immediately. Our implementation doesn't
+ * do that. What we do is actually a demote by removing the invocations
+ * from the sample mask. Maybe we could be more strict and force an
+ * early termination by emitting a (maybe conditional) jump to the
+ * end section of the fragment shader for affected invocations.
+ */
case nir_intrinsic_discard:
+ case nir_intrinsic_terminate:
+ c->emitted_discard = true;
+ FALLTHROUGH;
+ case nir_intrinsic_demote:
ntq_flush_tmu(c);
if (vir_in_nonuniform_control_flow(c)) {
@@ -2993,7 +3606,11 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
}
break;
- case nir_intrinsic_discard_if: {
+ case nir_intrinsic_discard_if:
+ case nir_intrinsic_terminate_if:
+ c->emitted_discard = true;
+ FALLTHROUGH;
+ case nir_intrinsic_demote_if: {
ntq_flush_tmu(c);
enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, instr->src[0]);
@@ -3011,102 +3628,79 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(),
vir_uniform_ui(c, 0)), cond);
-
break;
}
- case nir_intrinsic_memory_barrier:
- case nir_intrinsic_memory_barrier_buffer:
- case nir_intrinsic_memory_barrier_image:
- case nir_intrinsic_memory_barrier_shared:
- case nir_intrinsic_memory_barrier_tcs_patch:
- case nir_intrinsic_group_memory_barrier:
- /* We don't do any instruction scheduling of these NIR
- * instructions between each other, so we just need to make
- * sure that the TMU operations before the barrier are flushed
+ case nir_intrinsic_barrier:
+ /* Ensure that the TMU operations before the barrier are flushed
* before the ones after the barrier.
*/
ntq_flush_tmu(c);
- break;
-
- case nir_intrinsic_control_barrier:
- /* Emit a TSY op to get all invocations in the workgroup
- * (actually supergroup) to block until the last invocation
- * reaches the TSY op.
- */
- ntq_flush_tmu(c);
- if (c->devinfo->ver >= 42) {
- vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC,
- V3D_QPU_WADDR_SYNCB));
- } else {
- struct qinst *sync =
- vir_BARRIERID_dest(c,
- vir_reg(QFILE_MAGIC,
- V3D_QPU_WADDR_SYNCU));
- sync->uniform =
- vir_get_uniform_index(c, QUNIFORM_CONSTANT,
- 0xffffff00 |
- V3D_TSY_WAIT_INC_CHECK);
+ if (nir_intrinsic_execution_scope(instr) != SCOPE_NONE) {
+ if (c->s->info.stage == MESA_SHADER_COMPUTE)
+ emit_compute_barrier(c);
+ else
+ emit_barrier(c);
+ /* The blocking of a TSY op only happens at the next
+ * thread switch. No texturing may be outstanding at the
+ * time of a TSY blocking operation.
+ */
+ vir_emit_thrsw(c);
}
-
- /* The blocking of a TSY op only happens at the next thread
- * switch. No texturing may be outstanding at the time of a
- * TSY blocking operation.
- */
- vir_emit_thrsw(c);
break;
case nir_intrinsic_load_num_workgroups:
for (int i = 0; i < 3; i++) {
- ntq_store_dest(c, &instr->dest, i,
- vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS,
- i));
+ ntq_store_def(c, &instr->def, i,
+ vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS,
+ i));
}
break;
case nir_intrinsic_load_workgroup_id: {
struct qreg x = vir_AND(c, c->cs_payload[0],
vir_uniform_ui(c, 0xffff));
+ ntq_store_def(c, &instr->def, 0, x);
struct qreg y = vir_SHR(c, c->cs_payload[0],
vir_uniform_ui(c, 16));
+ ntq_store_def(c, &instr->def, 1, y);
struct qreg z = vir_AND(c, c->cs_payload[1],
vir_uniform_ui(c, 0xffff));
+ ntq_store_def(c, &instr->def, 2, z);
+ break;
+ }
- /* We only support dispatch base in Vulkan */
- if (c->key->environment == V3D_ENVIRONMENT_VULKAN) {
- x = vir_ADD(c, x,
- vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 0));
- y = vir_ADD(c, y,
- vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 1));
- z = vir_ADD(c, z,
- vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 2));
- }
+ case nir_intrinsic_load_base_workgroup_id: {
+ struct qreg x = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 0);
+ ntq_store_def(c, &instr->def, 0, x);
- ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, x));
- ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, y));
- ntq_store_dest(c, &instr->dest, 2, vir_MOV(c, z));
+ struct qreg y = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 1);
+ ntq_store_def(c, &instr->def, 1, y);
+
+ struct qreg z = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 2);
+ ntq_store_def(c, &instr->def, 2, z);
break;
}
case nir_intrinsic_load_local_invocation_index:
- ntq_store_dest(c, &instr->dest, 0,
- emit_load_local_invocation_index(c));
+ ntq_store_def(c, &instr->def, 0,
+ emit_load_local_invocation_index(c));
break;
case nir_intrinsic_load_subgroup_id: {
/* This is basically the batch index, which is the Local
* Invocation Index divided by the SIMD width.
*/
- STATIC_ASSERT(util_is_power_of_two_nonzero(V3D_CHANNELS));
+ STATIC_ASSERT(IS_POT(V3D_CHANNELS) && V3D_CHANNELS > 0);
const uint32_t divide_shift = ffs(V3D_CHANNELS) - 1;
struct qreg lii = emit_load_local_invocation_index(c);
- ntq_store_dest(c, &instr->dest, 0,
- vir_SHR(c, lii,
- vir_uniform_ui(c, divide_shift)));
+ ntq_store_def(c, &instr->def, 0,
+ vir_SHR(c, lii,
+ vir_uniform_ui(c, divide_shift)));
break;
}
@@ -3143,8 +3737,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
struct qreg col = ntq_get_src(c, instr->src[0], 0);
for (int i = 0; i < instr->num_components; i++) {
struct qreg row = vir_uniform_ui(c, row_idx++);
- ntq_store_dest(c, &instr->dest, i,
- vir_LDVPMG_IN(c, row, col));
+ ntq_store_def(c, &instr->def, i,
+ vir_LDVPMG_IN(c, row, col));
}
break;
}
@@ -3160,47 +3754,47 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
* using ldvpm(v,d)_in (See Table 71).
*/
assert(c->s->info.stage == MESA_SHADER_GEOMETRY);
- ntq_store_dest(c, &instr->dest, 0,
- vir_LDVPMV_IN(c, vir_uniform_ui(c, 0)));
+ ntq_store_def(c, &instr->def, 0,
+ vir_LDVPMV_IN(c, vir_uniform_ui(c, 0)));
break;
}
case nir_intrinsic_load_invocation_id:
- ntq_store_dest(c, &instr->dest, 0, vir_IID(c));
+ ntq_store_def(c, &instr->def, 0, vir_IID(c));
break;
case nir_intrinsic_load_fb_layers_v3d:
- ntq_store_dest(c, &instr->dest, 0,
- vir_uniform(c, QUNIFORM_FB_LAYERS, 0));
+ ntq_store_def(c, &instr->def, 0,
+ vir_uniform(c, QUNIFORM_FB_LAYERS, 0));
break;
case nir_intrinsic_load_sample_id:
- ntq_store_dest(c, &instr->dest, 0, vir_SAMPID(c));
+ ntq_store_def(c, &instr->def, 0, vir_SAMPID(c));
break;
case nir_intrinsic_load_sample_pos:
- ntq_store_dest(c, &instr->dest, 0,
- vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c))));
- ntq_store_dest(c, &instr->dest, 1,
- vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c))));
+ ntq_store_def(c, &instr->def, 0,
+ vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c))));
+ ntq_store_def(c, &instr->def, 1,
+ vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c))));
break;
case nir_intrinsic_load_barycentric_at_offset:
- ntq_store_dest(c, &instr->dest, 0,
- vir_MOV(c, ntq_get_src(c, instr->src[0], 0)));
- ntq_store_dest(c, &instr->dest, 1,
- vir_MOV(c, ntq_get_src(c, instr->src[0], 1)));
+ ntq_store_def(c, &instr->def, 0,
+ vir_MOV(c, ntq_get_src(c, instr->src[0], 0)));
+ ntq_store_def(c, &instr->def, 1,
+ vir_MOV(c, ntq_get_src(c, instr->src[0], 1)));
break;
case nir_intrinsic_load_barycentric_pixel:
- ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f));
- ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f));
+ ntq_store_def(c, &instr->def, 0, vir_uniform_f(c, 0.0f));
+ ntq_store_def(c, &instr->def, 1, vir_uniform_f(c, 0.0f));
break;
case nir_intrinsic_load_barycentric_at_sample: {
if (!c->fs_key->msaa) {
- ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f));
- ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f));
+ ntq_store_def(c, &instr->def, 0, vir_uniform_f(c, 0.0f));
+ ntq_store_def(c, &instr->def, 1, vir_uniform_f(c, 0.0f));
return;
}
@@ -3208,8 +3802,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
struct qreg sample_idx = ntq_get_src(c, instr->src[0], 0);
ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y);
- ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x));
- ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y));
+ ntq_store_def(c, &instr->def, 0, vir_MOV(c, offset_x));
+ ntq_store_def(c, &instr->def, 1, vir_MOV(c, offset_y));
break;
}
@@ -3219,18 +3813,18 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
struct qreg offset_y =
vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)));
- ntq_store_dest(c, &instr->dest, 0,
- vir_FSUB(c, offset_x, vir_uniform_f(c, 0.5f)));
- ntq_store_dest(c, &instr->dest, 1,
- vir_FSUB(c, offset_y, vir_uniform_f(c, 0.5f)));
+ ntq_store_def(c, &instr->def, 0,
+ vir_FSUB(c, offset_x, vir_uniform_f(c, 0.5f)));
+ ntq_store_def(c, &instr->def, 1,
+ vir_FSUB(c, offset_y, vir_uniform_f(c, 0.5f)));
break;
}
case nir_intrinsic_load_barycentric_centroid: {
struct qreg offset_x, offset_y;
ntq_get_barycentric_centroid(c, &offset_x, &offset_y);
- ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x));
- ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y));
+ ntq_store_def(c, &instr->def, 0, vir_MOV(c, offset_x));
+ ntq_store_def(c, &instr->def, 1, vir_MOV(c, offset_y));
break;
}
@@ -3249,8 +3843,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
*/
if (!c->fs_key->msaa ||
c->interp[input_idx].vp.file == QFILE_NULL) {
- ntq_store_dest(c, &instr->dest, i,
- vir_MOV(c, c->inputs[input_idx]));
+ ntq_store_def(c, &instr->def, i,
+ vir_MOV(c, c->inputs[input_idx]));
continue;
}
@@ -3268,30 +3862,150 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
ntq_emit_load_interpolated_input(c, p, C,
offset_x, offset_y,
interp_mode);
- ntq_store_dest(c, &instr->dest, i, result);
+ ntq_store_def(c, &instr->def, i, result);
}
break;
}
case nir_intrinsic_load_subgroup_size:
- ntq_store_dest(c, &instr->dest, 0,
- vir_uniform_ui(c, V3D_CHANNELS));
+ ntq_store_def(c, &instr->def, 0,
+ vir_uniform_ui(c, V3D_CHANNELS));
break;
case nir_intrinsic_load_subgroup_invocation:
- ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c));
+ ntq_store_def(c, &instr->def, 0, vir_EIDX(c));
break;
case nir_intrinsic_elect: {
- set_a_flags_for_subgroup(c);
- struct qreg first = vir_FLAFIRST(c);
+ struct qreg first;
+ if (vir_in_nonuniform_control_flow(c)) {
+ /* Sets A=1 for lanes enabled in the execution mask */
+ vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_PF_PUSHZ);
+ /* Updates A ANDing with lanes enabled in MSF */
+ vir_set_uf(c, vir_MSF_dest(c, vir_nop_reg()),
+ V3D_QPU_UF_ANDNZ);
+ first = vir_FLAFIRST(c);
+ } else {
+ /* Sets A=1 for inactive lanes */
+ vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()),
+ V3D_QPU_PF_PUSHZ);
+ first = vir_FLNAFIRST(c);
+ }
- /* Produce a boolean result from Flafirst */
+ /* Produce a boolean result */
vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
first, vir_uniform_ui(c, 1)),
V3D_QPU_PF_PUSHZ);
struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
- ntq_store_dest(c, &instr->dest, 0, result);
+ ntq_store_def(c, &instr->def, 0, result);
+ break;
+ }
+
+ case nir_intrinsic_ballot: {
+ assert(c->devinfo->ver >= 71);
+ struct qreg value = ntq_get_src(c, instr->src[0], 0);
+ enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
+ struct qreg res = vir_get_temp(c);
+ vir_set_cond(vir_BALLOT_dest(c, res, value), cond);
+ ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
+ break;
+ }
+
+ case nir_intrinsic_read_invocation: {
+ assert(c->devinfo->ver >= 71);
+ struct qreg value = ntq_get_src(c, instr->src[0], 0);
+ struct qreg index = ntq_get_src(c, instr->src[1], 0);
+ struct qreg res = vir_SHUFFLE(c, value, index);
+ ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
+ break;
+ }
+
+ case nir_intrinsic_read_first_invocation: {
+ assert(c->devinfo->ver >= 71);
+ struct qreg value = ntq_get_src(c, instr->src[0], 0);
+ enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
+ struct qreg res = vir_get_temp(c);
+ vir_set_cond(vir_BCASTF_dest(c, res, value), cond);
+ ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
+ break;
+ }
+
+ case nir_intrinsic_shuffle: {
+ assert(c->devinfo->ver >= 71);
+ struct qreg value = ntq_get_src(c, instr->src[0], 0);
+ struct qreg indices = ntq_get_src(c, instr->src[1], 0);
+ struct qreg res = vir_SHUFFLE(c, value, indices);
+ ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
+ break;
+ }
+
+ case nir_intrinsic_vote_feq:
+ case nir_intrinsic_vote_ieq: {
+ assert(c->devinfo->ver >= 71);
+ struct qreg value = ntq_get_src(c, instr->src[0], 0);
+ enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
+ struct qreg res = vir_get_temp(c);
+ vir_set_cond(instr->intrinsic == nir_intrinsic_vote_ieq ?
+ vir_ALLEQ_dest(c, res, value) :
+ vir_ALLFEQ_dest(c, res, value),
+ cond);
+
+ /* Produce boolean result */
+ vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
+ V3D_QPU_PF_PUSHZ);
+ struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFNA);
+ ntq_store_def(c, &instr->def, 0, result);
+ break;
+ }
+
+ case nir_intrinsic_vote_all: {
+ assert(c->devinfo->ver >= 71);
+ struct qreg value = ntq_get_src(c, instr->src[0], 0);
+ enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
+ struct qreg res = vir_get_temp(c);
+ vir_set_cond(vir_ALLEQ_dest(c, res, value), cond);
+
+ /* We want to check if 'all lanes are equal (alleq != 0) and
+ * their value is True (value != 0)'.
+ *
+ * The first MOV.pushz generates predicate for 'alleq == 0'.
+ * The second MOV.NORZ generates predicate for:
+ * '!(alleq == 0) & !(value == 0)'.
+ */
+ vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
+ V3D_QPU_PF_PUSHZ);
+ vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), value),
+ V3D_QPU_UF_NORZ);
+ struct qreg result =
+ ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
+ ntq_store_def(c, &instr->def, 0, result);
+ break;
+ }
+
+ case nir_intrinsic_vote_any: {
+ assert(c->devinfo->ver >= 71);
+ struct qreg value = ntq_get_src(c, instr->src[0], 0);
+ enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
+ struct qreg res = vir_get_temp(c);
+ vir_set_cond(vir_ALLEQ_dest(c, res, value), cond);
+
+ /* We want to check 'not (all lanes are equal (alleq != 0) and
+ * their value is False (value == 0))'.
+ *
+ * The first MOV.pushz generates predicate for 'alleq == 0'.
+ * The second MOV.NORNZ generates predicate for:
+ * '!(alleq == 0) & (value == 0)'.
+ * The IFNA condition negates the predicate when evaluated:
+ * '!(!(alleq == 0) & (value == 0))'.
+ */
+ vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
+ V3D_QPU_PF_PUSHZ);
+ vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), value),
+ V3D_QPU_UF_NORNZ);
+ struct qreg result =
+ ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFNA);
+ ntq_store_def(c, &instr->def, 0, result);
break;
}
@@ -3300,8 +4014,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
break;
case nir_intrinsic_load_view_index:
- ntq_store_dest(c, &instr->dest, 0,
- vir_uniform(c, QUNIFORM_VIEW_INDEX, 0));
+ ntq_store_def(c, &instr->def, 0,
+ vir_uniform(c, QUNIFORM_VIEW_INDEX, 0));
break;
default:
@@ -3329,6 +4043,36 @@ ntq_activate_execute_for_block(struct v3d_compile *c)
vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
}
+static bool
+is_cheap_block(nir_block *block)
+{
+ int32_t cost = 3;
+ nir_foreach_instr(instr, block) {
+ switch (instr->type) {
+ case nir_instr_type_alu:
+ case nir_instr_type_undef:
+ case nir_instr_type_load_const:
+ if (--cost <= 0)
+ return false;
+ break;
+ case nir_instr_type_intrinsic: {
+ nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+ switch (intr->intrinsic) {
+ case nir_intrinsic_decl_reg:
+ case nir_intrinsic_load_reg:
+ case nir_intrinsic_store_reg:
+ continue;
+ default:
+ return false;
+ }
+ }
+ default:
+ return false;
+ }
+ }
+ return true;
+}
+
static void
ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt)
{
@@ -3473,15 +4217,27 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
c->execute,
vir_uniform_ui(c, else_block->index));
- /* Jump to ELSE if nothing is active for THEN, otherwise fall
- * through.
+ /* Set the flags for taking the THEN block */
+ vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_PF_PUSHZ);
+
+ /* Jump to ELSE if nothing is active for THEN (unless THEN block is
+ * so small it won't pay off), otherwise fall through.
*/
- vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ);
- vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA);
- vir_link_blocks(c->cur_block, else_block);
+ bool is_cheap = exec_list_is_singular(&if_stmt->then_list) &&
+ is_cheap_block(nir_if_first_then_block(if_stmt));
+ if (!is_cheap) {
+ vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA);
+ vir_link_blocks(c->cur_block, else_block);
+ }
vir_link_blocks(c->cur_block, then_block);
- /* Process the THEN block. */
+ /* Process the THEN block.
+ *
+ * Notice we don't call ntq_activate_execute_for_block here on purpose:
+ * c->execute is already set up to be 0 for lanes that must take the
+ * THEN block.
+ */
vir_set_emit_block(c, then_block);
ntq_emit_cf_list(c, &if_stmt->then_list);
@@ -3495,13 +4251,19 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
vir_uniform_ui(c, after_block->index));
- /* If everything points at ENDIF, then jump there immediately. */
- vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
- c->execute,
- vir_uniform_ui(c, after_block->index)),
- V3D_QPU_PF_PUSHZ);
- vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA);
- vir_link_blocks(c->cur_block, after_block);
+ /* If everything points at ENDIF, then jump there immediately
+ * (unless ELSE block is so small it won't pay off).
+ */
+ bool is_cheap = exec_list_is_singular(&if_stmt->else_list) &&
+ is_cheap_block(nir_else_block);
+ if (!is_cheap) {
+ vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
+ c->execute,
+ vir_uniform_ui(c, after_block->index)),
+ V3D_QPU_PF_PUSHZ);
+ vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA);
+ vir_link_blocks(c->cur_block, after_block);
+ }
vir_link_blocks(c->cur_block, else_block);
vir_set_emit_block(c, else_block);
@@ -3605,7 +4367,7 @@ ntq_emit_instr(struct v3d_compile *c, nir_instr *instr)
ntq_emit_load_const(c, nir_instr_as_load_const(instr));
break;
- case nir_instr_type_ssa_undef:
+ case nir_instr_type_undef:
unreachable("Should've been lowered by nir_lower_undef_to_zero");
break;
@@ -3699,7 +4461,6 @@ ntq_emit_nonuniform_loop(struct v3d_compile *c, nir_loop *loop)
static void
ntq_emit_uniform_loop(struct v3d_compile *c, nir_loop *loop)
{
-
c->loop_cont_block = vir_new_block(c);
c->loop_break_block = vir_new_block(c);
@@ -3719,6 +4480,25 @@ ntq_emit_uniform_loop(struct v3d_compile *c, nir_loop *loop)
static void
ntq_emit_loop(struct v3d_compile *c, nir_loop *loop)
{
+ assert(!nir_loop_has_continue_construct(loop));
+
+ /* Disable flags optimization for loop conditions. The problem here is
+ * that we can have code like this:
+ *
+ * // block_0
+ * vec1 32 con ssa_9 = ine32 ssa_8, ssa_2
+ * loop {
+ * // block_1
+ * if ssa_9 {
+ *
+ * In this example we emit flags to compute ssa_9 and the optimization
+ * will skip regenerating them again for the loop condition in the
+ * loop continue block (block_1). However, this is not safe after the
+ * first iteration because the loop body can stomp the flags if it has
+ * any conditionals.
+ */
+ c->flags_temp = -1;
+
bool was_in_control_flow = c->in_control_flow;
c->in_control_flow = true;
@@ -3777,7 +4557,7 @@ ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list)
static void
ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl)
{
- ntq_setup_registers(c, &impl->registers);
+ ntq_setup_registers(c, impl);
ntq_emit_cf_list(c, &impl->body);
}
@@ -3786,7 +4566,12 @@ nir_to_vir(struct v3d_compile *c)
{
switch (c->s->info.stage) {
case MESA_SHADER_FRAGMENT:
- c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
+ c->start_msf = vir_MSF(c);
+ if (c->devinfo->ver < 71)
+ c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
+ else
+ c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 3));
+
c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
@@ -3799,25 +4584,16 @@ nir_to_vir(struct v3d_compile *c)
emit_fragment_varying(c, NULL, -1, 0, 0);
}
- if (c->fs_key->is_points &&
- (c->devinfo->ver < 40 || program_reads_point_coord(c))) {
+ if (c->fs_key->is_points && program_reads_point_coord(c)) {
c->point_x = emit_fragment_varying(c, NULL, -1, 0, 0);
c->point_y = emit_fragment_varying(c, NULL, -1, 0, 0);
c->uses_implicit_point_line_varyings = true;
} else if (c->fs_key->is_lines &&
- (c->devinfo->ver < 40 ||
- BITSET_TEST(c->s->info.system_values_read,
+ (BITSET_TEST(c->s->info.system_values_read,
SYSTEM_VALUE_LINE_COORD))) {
c->line_x = emit_fragment_varying(c, NULL, -1, 0, 0);
c->uses_implicit_point_line_varyings = true;
}
-
- c->force_per_sample_msaa =
- c->s->info.fs.uses_sample_qualifier ||
- BITSET_TEST(c->s->info.system_values_read,
- SYSTEM_VALUE_SAMPLE_ID) ||
- BITSET_TEST(c->s->info.system_values_read,
- SYSTEM_VALUE_SAMPLE_POS);
break;
case MESA_SHADER_COMPUTE:
/* Set up the TSO for barriers, assuming we do some. */
@@ -3826,8 +4602,13 @@ nir_to_vir(struct v3d_compile *c)
V3D_QPU_WADDR_SYNC));
}
- c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
- c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
+ if (c->devinfo->ver == 42) {
+ c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
+ c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
+ } else if (c->devinfo->ver >= 71) {
+ c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 3));
+ c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
+ }
/* Set up the division between gl_LocalInvocationIndex and
* wg_in_mem in the payload reg.
@@ -3889,7 +4670,7 @@ nir_to_vir(struct v3d_compile *c)
/* Find the main function and emit the body. */
nir_foreach_function(function, c->s) {
- assert(strcmp(function->name, "main") == 0);
+ assert(function->is_entrypoint);
assert(function->impl);
ntq_emit_impl(c, function->impl);
}
@@ -3932,25 +4713,12 @@ vir_emit_last_thrsw(struct v3d_compile *c,
{
*restore_last_thrsw = c->last_thrsw;
- /* On V3D before 4.1, we need a TMU op to be outstanding when thread
- * switching, so disable threads if we didn't do any TMU ops (each of
- * which would have emitted a THRSW).
- */
- if (!c->last_thrsw_at_top_level && c->devinfo->ver < 41) {
- c->threads = 1;
- if (c->last_thrsw)
- vir_remove_thrsw(c);
- *restore_last_thrsw = NULL;
- }
-
/* If we're threaded and the last THRSW was in conditional code, then
* we need to emit another one so that we can flag it as the last
* thrsw.
*/
- if (c->last_thrsw && !c->last_thrsw_at_top_level) {
- assert(c->devinfo->ver >= 41);
+ if (c->last_thrsw && !c->last_thrsw_at_top_level)
vir_emit_thrsw(c);
- }
/* If we're threaded, then we need to mark the last THRSW instruction
* so we can emit a pair of them at QPU emit time.
@@ -3958,10 +4726,8 @@ vir_emit_last_thrsw(struct v3d_compile *c,
* For V3D 4.x, we can spawn the non-fragment shaders already in the
* post-last-THRSW state, so we can skip this.
*/
- if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) {
- assert(c->devinfo->ver >= 41);
+ if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT)
vir_emit_thrsw(c);
- }
/* If we have not inserted a last thread switch yet, do it now to ensure
* any potential spilling we do happens before this. If we don't spill
@@ -4006,8 +4772,8 @@ vir_check_payload_w(struct v3d_compile *c)
vir_for_each_inst_inorder(inst, c) {
for (int i = 0; i < vir_get_nsrc(inst); i++) {
- if (inst->src[i].file == QFILE_REG &&
- inst->src[i].index == 0) {
+ if (inst->src[i].file == c->payload_w.file &&
+ inst->src[i].index == c->payload_w.index) {
c->uses_center_w = true;
return;
}
@@ -4018,8 +4784,8 @@ vir_check_payload_w(struct v3d_compile *c)
void
v3d_nir_to_vir(struct v3d_compile *c)
{
- if (V3D_DEBUG & (V3D_DEBUG_NIR |
- v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
+ if (V3D_DBG(NIR) ||
+ v3d_debug_flag_for_shader_stage(c->s->info.stage)) {
fprintf(stderr, "%s prog %d/%d NIR:\n",
vir_get_stage_name(c),
c->program_id, c->variant_id);
@@ -4053,8 +4819,8 @@ v3d_nir_to_vir(struct v3d_compile *c)
unreachable("bad stage");
}
- if (V3D_DEBUG & (V3D_DEBUG_VIR |
- v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
+ if (V3D_DBG(VIR) ||
+ v3d_debug_flag_for_shader_stage(c->s->info.stage)) {
fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n",
vir_get_stage_name(c),
c->program_id, c->variant_id);
@@ -4075,8 +4841,8 @@ v3d_nir_to_vir(struct v3d_compile *c)
* instructions until the results are needed.
*/
- if (V3D_DEBUG & (V3D_DEBUG_VIR |
- v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
+ if (V3D_DBG(VIR) ||
+ v3d_debug_flag_for_shader_stage(c->s->info.stage)) {
fprintf(stderr, "%s prog %d/%d VIR:\n",
vir_get_stage_name(c),
c->program_id, c->variant_id);
@@ -4087,19 +4853,17 @@ v3d_nir_to_vir(struct v3d_compile *c)
/* Attempt to allocate registers for the temporaries. If we fail,
* reduce thread count and try again.
*/
- int min_threads = (c->devinfo->ver >= 41) ? 2 : 1;
+ int min_threads = 2;
struct qpu_reg *temp_registers;
while (true) {
- bool spilled;
- temp_registers = v3d_register_allocate(c, &spilled);
- if (spilled)
- continue;
-
- if (temp_registers)
+ temp_registers = v3d_register_allocate(c);
+ if (temp_registers) {
+ assert(c->spills + c->fills <= c->max_tmu_spills);
break;
+ }
if (c->threads == min_threads &&
- (V3D_DEBUG & V3D_DEBUG_RA)) {
+ V3D_DBG(RA)) {
fprintf(stderr,
"Failed to register allocate using %s\n",
c->fallback_scheduler ? "the fallback scheduler:" :
@@ -4116,18 +4880,20 @@ v3d_nir_to_vir(struct v3d_compile *c)
}
if (c->threads <= MAX2(c->min_threads_for_reg_alloc, min_threads)) {
- if (V3D_DEBUG & V3D_DEBUG_PERF) {
+ if (V3D_DBG(PERF)) {
fprintf(stderr,
- "Failed to register allocate %s at "
- "%d threads.\n", vir_get_stage_name(c),
- c->threads);
+ "Failed to register allocate %s "
+ "prog %d/%d at %d threads.\n",
+ vir_get_stage_name(c),
+ c->program_id, c->variant_id, c->threads);
}
c->compilation_result =
V3D_COMPILATION_FAILED_REGISTER_ALLOCATION;
return;
}
- c->spill_count = 0;
+ c->spills = 0;
+ c->fills = 0;
c->threads /= 2;
if (c->threads == 1)
@@ -4141,8 +4907,8 @@ v3d_nir_to_vir(struct v3d_compile *c)
vir_restore_last_thrsw(c, restore_last_thrsw, restore_scoreboard_lock);
if (c->spills &&
- (V3D_DEBUG & (V3D_DEBUG_VIR |
- v3d_debug_flag_for_shader_stage(c->s->info.stage)))) {
+ (V3D_DBG(VIR) ||
+ v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
fprintf(stderr, "%s prog %d/%d spilled VIR:\n",
vir_get_stage_name(c),
c->program_id, c->variant_id);
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index c559814b9ea..ba76ac87e1e 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -85,6 +85,7 @@ struct schedule_state {
struct schedule_node *last_unif;
struct schedule_node *last_rtop;
struct schedule_node *last_unifa;
+ struct schedule_node *last_setmsf;
enum direction dir;
/* Estimated cycle when the current instruction would start. */
uint32_t time;
@@ -97,7 +98,7 @@ add_dep(struct schedule_state *state,
bool write)
{
bool write_after_read = !write && state->dir == R;
- void *edge_data = (void *)(uintptr_t)write_after_read;
+ uintptr_t edge_data = write_after_read;
if (!before || !after)
return;
@@ -136,12 +137,14 @@ qpu_inst_is_tlb(const struct v3d_qpu_instr *inst)
if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
return false;
- if (inst->alu.add.magic_write &&
+ if (inst->alu.add.op != V3D_QPU_A_NOP &&
+ inst->alu.add.magic_write &&
(inst->alu.add.waddr == V3D_QPU_WADDR_TLB ||
inst->alu.add.waddr == V3D_QPU_WADDR_TLBU))
return true;
- if (inst->alu.mul.magic_write &&
+ if (inst->alu.mul.op != V3D_QPU_M_NOP &&
+ inst->alu.mul.magic_write &&
(inst->alu.mul.waddr == V3D_QPU_WADDR_TLB ||
inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU))
return true;
@@ -153,12 +156,13 @@ static void
process_mux_deps(struct schedule_state *state, struct schedule_node *n,
enum v3d_qpu_mux mux)
{
+ assert(state->devinfo->ver < 71);
switch (mux) {
case V3D_QPU_MUX_A:
add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
break;
case V3D_QPU_MUX_B:
- if (!n->inst->qpu.sig.small_imm) {
+ if (!n->inst->qpu.sig.small_imm_b) {
add_read_dep(state,
state->last_rf[n->inst->qpu.raddr_b], n);
}
@@ -169,6 +173,17 @@ process_mux_deps(struct schedule_state *state, struct schedule_node *n,
}
}
+
+static void
+process_raddr_deps(struct schedule_state *state, struct schedule_node *n,
+ uint8_t raddr, bool is_small_imm)
+{
+ assert(state->devinfo->ver >= 71);
+
+ if (!is_small_imm)
+ add_read_dep(state, state->last_rf[raddr], n);
+}
+
static bool
tmu_write_is_sequence_terminator(uint32_t waddr)
{
@@ -188,9 +203,6 @@ tmu_write_is_sequence_terminator(uint32_t waddr)
static bool
can_reorder_tmu_write(const struct v3d_device_info *devinfo, uint32_t waddr)
{
- if (devinfo->ver < 40)
- return false;
-
if (tmu_write_is_sequence_terminator(waddr))
return false;
@@ -253,8 +265,7 @@ process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
break;
case V3D_QPU_WADDR_UNIFA:
- if (state->devinfo->ver >= 40)
- add_write_dep(state, &state->last_unifa, n);
+ add_write_dep(state, &state->last_unifa, n);
break;
case V3D_QPU_WADDR_NOP:
@@ -283,6 +294,10 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
/* If the input and output segments are shared, then all VPM reads to
* a location need to happen before all writes. We handle this by
* serializing all VPM operations for now.
+ *
+ * FIXME: we are assuming that the segments are shared. That is
+ * correct right now as we are only using shared, but technically you
+ * can choose.
*/
bool separate_vpm_segment = false;
@@ -303,15 +318,39 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
/* XXX: LOAD_IMM */
- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0)
- process_mux_deps(state, n, inst->alu.add.a);
- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1)
- process_mux_deps(state, n, inst->alu.add.b);
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) {
+ if (devinfo->ver < 71) {
+ process_mux_deps(state, n, inst->alu.add.a.mux);
+ } else {
+ process_raddr_deps(state, n, inst->alu.add.a.raddr,
+ inst->sig.small_imm_a);
+ }
+ }
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) {
+ if (devinfo->ver < 71) {
+ process_mux_deps(state, n, inst->alu.add.b.mux);
+ } else {
+ process_raddr_deps(state, n, inst->alu.add.b.raddr,
+ inst->sig.small_imm_b);
+ }
+ }
- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0)
- process_mux_deps(state, n, inst->alu.mul.a);
- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1)
- process_mux_deps(state, n, inst->alu.mul.b);
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) {
+ if (devinfo->ver < 71) {
+ process_mux_deps(state, n, inst->alu.mul.a.mux);
+ } else {
+ process_raddr_deps(state, n, inst->alu.mul.a.raddr,
+ inst->sig.small_imm_c);
+ }
+ }
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) {
+ if (devinfo->ver < 71) {
+ process_mux_deps(state, n, inst->alu.mul.b.mux);
+ } else {
+ process_raddr_deps(state, n, inst->alu.mul.b.raddr,
+ inst->sig.small_imm_d);
+ }
+ }
switch (inst->alu.add.op) {
case V3D_QPU_A_VPMSETUP:
@@ -340,13 +379,24 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
case V3D_QPU_A_MSF:
add_read_dep(state, state->last_tlb, n);
+ add_read_dep(state, state->last_setmsf, n);
break;
case V3D_QPU_A_SETMSF:
+ add_write_dep(state, &state->last_setmsf, n);
+ add_write_dep(state, &state->last_tmu_write, n);
+ FALLTHROUGH;
case V3D_QPU_A_SETREVF:
add_write_dep(state, &state->last_tlb, n);
break;
+ case V3D_QPU_A_BALLOT:
+ case V3D_QPU_A_BCASTF:
+ case V3D_QPU_A_ALLEQ:
+ case V3D_QPU_A_ALLFEQ:
+ add_read_dep(state, state->last_setmsf, n);
+ break;
+
default:
break;
}
@@ -384,6 +434,8 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
add_write_dep(state, &state->last_r[4], n);
if (v3d_qpu_writes_r5(devinfo, inst))
add_write_dep(state, &state->last_r[5], n);
+ if (v3d_qpu_writes_rf0_implicitly(devinfo, inst))
+ add_write_dep(state, &state->last_rf[0], n);
/* If we add any more dependencies here we should consider whether we
* also need to update qpu_inst_after_thrsw_valid_in_delay_slot.
@@ -492,9 +544,16 @@ struct choose_scoreboard {
int last_thrsw_tick;
int last_branch_tick;
int last_setmsf_tick;
- bool tlb_locked;
+ bool first_thrsw_emitted;
+ bool last_thrsw_emitted;
bool fixup_ldvary;
int ldvary_count;
+ int pending_ldtmu_count;
+ bool first_ldtmu_after_thrsw;
+
+ /* V3D 7.x */
+ int last_implicit_rf0_write_tick;
+ bool has_rf0_flops_conflict;
};
static bool
@@ -519,7 +578,24 @@ mux_reads_too_soon(struct choose_scoreboard *scoreboard,
}
static bool
-reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
+reads_too_soon(struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst, uint8_t raddr)
+{
+ switch (raddr) {
+ case 0: /* ldvary delayed write of C coefficient to rf0 */
+ if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
+ return true;
+ break;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+static bool
+reads_too_soon_after_write(const struct v3d_device_info *devinfo,
+ struct choose_scoreboard *scoreboard,
struct qinst *qinst)
{
const struct v3d_qpu_instr *inst = &qinst->qpu;
@@ -531,24 +607,44 @@ reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
if (inst->alu.add.op != V3D_QPU_A_NOP) {
- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 &&
- mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) {
- return true;
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) {
+ if (devinfo->ver < 71) {
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.a.mux))
+ return true;
+ } else {
+ if (reads_too_soon(scoreboard, inst, inst->alu.add.a.raddr))
+ return true;
+ }
}
- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 &&
- mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) {
- return true;
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) {
+ if (devinfo->ver < 71) {
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.b.mux))
+ return true;
+ } else {
+ if (reads_too_soon(scoreboard, inst, inst->alu.add.b.raddr))
+ return true;
+ }
}
}
if (inst->alu.mul.op != V3D_QPU_M_NOP) {
- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 &&
- mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) {
- return true;
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) {
+ if (devinfo->ver < 71) {
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a.mux))
+ return true;
+ } else {
+ if (reads_too_soon(scoreboard, inst, inst->alu.mul.a.raddr))
+ return true;
+ }
}
- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 &&
- mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) {
- return true;
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) {
+ if (devinfo->ver < 71) {
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b.mux))
+ return true;
+ } else {
+ if (reads_too_soon(scoreboard, inst, inst->alu.mul.b.raddr))
+ return true;
+ }
}
}
@@ -572,45 +668,83 @@ writes_too_soon_after_write(const struct v3d_device_info *devinfo,
v3d_qpu_writes_r4(devinfo, inst))
return true;
+ if (devinfo->ver == 42)
+ return false;
+
+ /* Don't schedule anything that writes rf0 right after ldvary, since
+ * that would clash with the ldvary's delayed rf0 write (the exception
+ * is another ldvary, since its implicit rf0 write would also have
+ * one cycle of delay and would not clash).
+ */
+ if (scoreboard->last_ldvary_tick + 1 == scoreboard->tick &&
+ (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) ||
+ (v3d_qpu_writes_rf0_implicitly(devinfo, inst) &&
+ !inst->sig.ldvary))) {
+ return true;
+ }
+
return false;
}
static bool
-pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard,
+scoreboard_is_locked(struct choose_scoreboard *scoreboard,
+ bool lock_scoreboard_on_first_thrsw)
+{
+ if (lock_scoreboard_on_first_thrsw) {
+ return scoreboard->first_thrsw_emitted &&
+ scoreboard->tick - scoreboard->last_thrsw_tick >= 3;
+ }
+
+ return scoreboard->last_thrsw_emitted &&
+ scoreboard->tick - scoreboard->last_thrsw_tick >= 3;
+}
+
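
A worked timing example for the check above, as a minimal standalone sketch
(the helper name is hypothetical): a thrsw only takes effect after its three
delay slots, so if the relevant thrsw was emitted at tick T the scoreboard
lock is only guaranteed from tick T + 3 onwards, which is what the
"tick - last_thrsw_tick >= 3" test encodes.

#include <stdbool.h>

static inline bool
scoreboard_lock_held(int tick, int last_thrsw_tick, bool thrsw_emitted)
{
        /* The switch signalled at last_thrsw_tick happens after its three
         * delay slots, so the lock is held from last_thrsw_tick + 3 on.
         */
        return thrsw_emitted && tick - last_thrsw_tick >= 3;
}
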
+static bool
+pixel_scoreboard_too_soon(struct v3d_compile *c,
+ struct choose_scoreboard *scoreboard,
const struct v3d_qpu_instr *inst)
{
- return (scoreboard->tick == 0 && qpu_inst_is_tlb(inst));
+ return qpu_inst_is_tlb(inst) &&
+ !scoreboard_is_locked(scoreboard,
+ c->lock_scoreboard_on_first_thrsw);
}
static bool
-qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst,
+qpu_instruction_uses_rf(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *inst,
uint32_t waddr) {
if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
return false;
- if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
- inst->raddr_a == waddr)
- return true;
+ if (devinfo->ver < 71) {
+ if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
+ inst->raddr_a == waddr)
+ return true;
- if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
- !inst->sig.small_imm && (inst->raddr_b == waddr))
- return true;
+ if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
+ !inst->sig.small_imm_b && (inst->raddr_b == waddr))
+ return true;
+ } else {
+ if (v3d71_qpu_reads_raddr(inst, waddr))
+ return true;
+ }
return false;
}
static bool
-mux_read_stalls(struct choose_scoreboard *scoreboard,
- const struct v3d_qpu_instr *inst)
+read_stalls(const struct v3d_device_info *devinfo,
+ struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst)
{
return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 &&
- qpu_instruction_uses_rf(inst,
+ qpu_instruction_uses_rf(devinfo, inst,
scoreboard->last_stallable_sfu_reg);
}
/* We define a max schedule priority to allow negative priorities as result of
- * substracting this max when an instruction stalls. So instructions that
+ * subtracting this max when an instruction stalls. So instructions that
* stall have lower priority than regular instructions. */
#define MAX_SCHEDULE_PRIORITY 16
@@ -628,19 +762,32 @@ get_instruction_priority(const struct v3d_device_info *devinfo,
return next_score;
next_score++;
+ /* Empirical testing shows that using priorities to hide latency of
+ * TMU operations when scheduling QPU leads to slightly worse
+ * performance, even at 2 threads. We think this is because thread
+ * switching is already quite effective at hiding latency, and NIR
+ * scheduling (and possibly TMU pipelining too) is sufficient to hide
+ * TMU latency, so piling up on that here doesn't provide any benefit
+ * and instead may cause us to postpone critical paths that depend on
+ * the TMU results.
+ */
+#if 0
/* Schedule texture read results collection late to hide latency. */
if (v3d_qpu_waits_on_tmu(inst))
return next_score;
next_score++;
+#endif
/* Default score for things that aren't otherwise special. */
baseline_score = next_score;
next_score++;
+#if 0
/* Schedule texture read setup early to hide their latency better. */
if (v3d_qpu_writes_tmu(devinfo, inst))
return next_score;
next_score++;
+#endif
/* We should increase the maximum if we assert here */
assert(next_score < MAX_SCHEDULE_PRIORITY);
@@ -648,48 +795,59 @@ get_instruction_priority(const struct v3d_device_info *devinfo,
return baseline_score;
}
-static bool
-qpu_magic_waddr_is_periph(const struct v3d_device_info *devinfo,
- enum v3d_qpu_waddr waddr)
-{
- return (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) ||
- v3d_qpu_magic_waddr_is_sfu(waddr) ||
- v3d_qpu_magic_waddr_is_tlb(waddr) ||
- v3d_qpu_magic_waddr_is_vpm(waddr) ||
- v3d_qpu_magic_waddr_is_tsy(waddr));
-}
+enum {
+ V3D_PERIPHERAL_VPM_READ = (1 << 0),
+ V3D_PERIPHERAL_VPM_WRITE = (1 << 1),
+ V3D_PERIPHERAL_VPM_WAIT = (1 << 2),
+ V3D_PERIPHERAL_SFU = (1 << 3),
+ V3D_PERIPHERAL_TMU_WRITE = (1 << 4),
+ V3D_PERIPHERAL_TMU_READ = (1 << 5),
+ V3D_PERIPHERAL_TMU_WAIT = (1 << 6),
+ V3D_PERIPHERAL_TMU_WRTMUC_SIG = (1 << 7),
+ V3D_PERIPHERAL_TSY = (1 << 8),
+ V3D_PERIPHERAL_TLB_READ = (1 << 9),
+ V3D_PERIPHERAL_TLB_WRITE = (1 << 10),
+};
-static bool
-qpu_accesses_peripheral(const struct v3d_device_info *devinfo,
- const struct v3d_qpu_instr *inst)
+static uint32_t
+qpu_peripherals(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *inst)
{
- if (v3d_qpu_uses_vpm(inst))
- return true;
+ uint32_t result = 0;
+ if (v3d_qpu_reads_vpm(inst))
+ result |= V3D_PERIPHERAL_VPM_READ;
+ if (v3d_qpu_writes_vpm(inst))
+ result |= V3D_PERIPHERAL_VPM_WRITE;
+ if (v3d_qpu_waits_vpm(inst))
+ result |= V3D_PERIPHERAL_VPM_WAIT;
+
+ if (v3d_qpu_writes_tmu(devinfo, inst))
+ result |= V3D_PERIPHERAL_TMU_WRITE;
+ if (inst->sig.ldtmu)
+ result |= V3D_PERIPHERAL_TMU_READ;
+ if (inst->sig.wrtmuc)
+ result |= V3D_PERIPHERAL_TMU_WRTMUC_SIG;
+
if (v3d_qpu_uses_sfu(inst))
- return true;
+ result |= V3D_PERIPHERAL_SFU;
+
+ if (v3d_qpu_reads_tlb(inst))
+ result |= V3D_PERIPHERAL_TLB_READ;
+ if (v3d_qpu_writes_tlb(inst))
+ result |= V3D_PERIPHERAL_TLB_WRITE;
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
if (inst->alu.add.op != V3D_QPU_A_NOP &&
inst->alu.add.magic_write &&
- qpu_magic_waddr_is_periph(devinfo, inst->alu.add.waddr)) {
- return true;
+ v3d_qpu_magic_waddr_is_tsy(inst->alu.add.waddr)) {
+ result |= V3D_PERIPHERAL_TSY;
}
if (inst->alu.add.op == V3D_QPU_A_TMUWT)
- return true;
-
- if (inst->alu.mul.op != V3D_QPU_M_NOP &&
- inst->alu.mul.magic_write &&
- qpu_magic_waddr_is_periph(devinfo, inst->alu.mul.waddr)) {
- return true;
- }
+ result |= V3D_PERIPHERAL_TMU_WAIT;
}
- return (inst->sig.ldvpm ||
- inst->sig.ldtmu ||
- inst->sig.ldtlb ||
- inst->sig.ldtlbu ||
- inst->sig.wrtmuc);
+ return result;
}
static bool
@@ -697,30 +855,82 @@ qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *a,
const struct v3d_qpu_instr *b)
{
- const bool a_uses_peripheral = qpu_accesses_peripheral(devinfo, a);
- const bool b_uses_peripheral = qpu_accesses_peripheral(devinfo, b);
+ const uint32_t a_peripherals = qpu_peripherals(devinfo, a);
+ const uint32_t b_peripherals = qpu_peripherals(devinfo, b);
/* We can always do one peripheral access per instruction. */
- if (!a_uses_peripheral || !b_uses_peripheral)
+ if (util_bitcount(a_peripherals) + util_bitcount(b_peripherals) <= 1)
return true;
- if (devinfo->ver < 41)
+ /* V3D 4.x can't do more than one peripheral access except in a
+ * few cases:
+ */
+ if (devinfo->ver == 42) {
+ /* WRTMUC signal with TMU register write (other than tmuc). */
+ if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
+ b_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
+ return v3d_qpu_writes_tmu_not_tmuc(devinfo, b);
+ }
+ if (b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
+ a_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
+ return v3d_qpu_writes_tmu_not_tmuc(devinfo, a);
+ }
+
+ /* TMU read with VPM read/write. */
+ if (a_peripherals == V3D_PERIPHERAL_TMU_READ &&
+ (b_peripherals == V3D_PERIPHERAL_VPM_READ ||
+ b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
+ return true;
+ }
+ if (b_peripherals == V3D_PERIPHERAL_TMU_READ &&
+ (a_peripherals == V3D_PERIPHERAL_VPM_READ ||
+ a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
+ return true;
+ }
+
return false;
+ }
- /* V3D 4.1 and later allow TMU read along with a VPM read or write, and
- * WRTMUC with a TMU magic register write (other than tmuc).
- */
- if ((a->sig.ldtmu && v3d_qpu_reads_or_writes_vpm(b)) ||
- (b->sig.ldtmu && v3d_qpu_reads_or_writes_vpm(a))) {
- return true;
+ /* V3D 7.x can't have more than one of these restricted peripherals */
+ const uint32_t restricted = V3D_PERIPHERAL_TMU_WRITE |
+ V3D_PERIPHERAL_TMU_WRTMUC_SIG |
+ V3D_PERIPHERAL_TSY |
+ V3D_PERIPHERAL_TLB_READ |
+ V3D_PERIPHERAL_SFU |
+ V3D_PERIPHERAL_VPM_READ |
+ V3D_PERIPHERAL_VPM_WRITE;
+
+ const uint32_t a_restricted = a_peripherals & restricted;
+ const uint32_t b_restricted = b_peripherals & restricted;
+ if (a_restricted && b_restricted) {
+ /* WRTMUC signal with TMU register write (other than tmuc) is
+ * allowed though.
+ */
+ if (!((a_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
+ b_restricted == V3D_PERIPHERAL_TMU_WRITE &&
+ v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) ||
+ (b_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
+ a_restricted == V3D_PERIPHERAL_TMU_WRITE &&
+ v3d_qpu_writes_tmu_not_tmuc(devinfo, a)))) {
+ return false;
+ }
}
- if ((a->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) ||
- (b->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(devinfo, a))) {
- return true;
+ /* Only one TMU read per instruction */
+ if ((a_peripherals & V3D_PERIPHERAL_TMU_READ) &&
+ (b_peripherals & V3D_PERIPHERAL_TMU_READ)) {
+ return false;
}
- return false;
+ /* Only one TLB access per instruction */
+ if ((a_peripherals & (V3D_PERIPHERAL_TLB_WRITE |
+ V3D_PERIPHERAL_TLB_READ)) &&
+ (b_peripherals & (V3D_PERIPHERAL_TLB_WRITE |
+ V3D_PERIPHERAL_TLB_READ))) {
+ return false;
+ }
+
+ return true;
}
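
A minimal standalone sketch of the bitmask approach above: each instruction's
peripheral accesses are classified as flags, and a pair of instructions may
merge when at most one access is involved in total, or when the combination is
one of the explicitly allowed exceptions. The names and the reduced rule set
here are illustrative only, not the driver's actual version-dependent tables:

#include <stdbool.h>
#include <stdint.h>

enum {
        PERIPH_TMU_WRITE  = 1u << 0,  /* TMU register write */
        PERIPH_WRTMUC_SIG = 1u << 1,  /* wrtmuc signal */
        PERIPH_SFU        = 1u << 2,
        PERIPH_TLB        = 1u << 3,
};

static bool
periph_pair_ok(uint32_t a, uint32_t b)
{
        /* One peripheral access across the pair is always fine. */
        if (__builtin_popcount(a) + __builtin_popcount(b) <= 1)
                return true;

        /* Example exception: wrtmuc paired with a TMU register write. */
        return (a == PERIPH_WRTMUC_SIG && b == PERIPH_TMU_WRITE) ||
               (b == PERIPH_WRTMUC_SIG && a == PERIPH_TMU_WRITE);
}
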
/* Compute a bitmask of which rf registers are used between
@@ -736,42 +946,67 @@ qpu_raddrs_used(const struct v3d_qpu_instr *a,
uint64_t raddrs_used = 0;
if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A))
raddrs_used |= (1ll << a->raddr_a);
- if (!a->sig.small_imm && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B))
+ if (!a->sig.small_imm_b && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B))
raddrs_used |= (1ll << a->raddr_b);
if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A))
raddrs_used |= (1ll << b->raddr_a);
- if (!b->sig.small_imm && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B))
+ if (!b->sig.small_imm_b && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B))
raddrs_used |= (1ll << b->raddr_b);
return raddrs_used;
}
-/* Take two instructions and attempt to merge their raddr fields
- * into one merged instruction. Returns false if the two instructions
- * access more than two different rf registers between them, or more
- * than one rf register and one small immediate.
+/* Takes two instructions and attempts to merge their raddr fields (including
+ * small immediates) into one merged instruction. For V3D 4.x, returns false
+ * if the two instructions access more than two different rf registers between
+ * them, or more than one rf register and one small immediate. For 7.x returns
+ * false if both instructions use small immediates.
*/
static bool
qpu_merge_raddrs(struct v3d_qpu_instr *result,
const struct v3d_qpu_instr *add_instr,
- const struct v3d_qpu_instr *mul_instr)
+ const struct v3d_qpu_instr *mul_instr,
+ const struct v3d_device_info *devinfo)
{
+ if (devinfo->ver >= 71) {
+ assert(add_instr->sig.small_imm_a +
+ add_instr->sig.small_imm_b <= 1);
+ assert(add_instr->sig.small_imm_c +
+ add_instr->sig.small_imm_d == 0);
+ assert(mul_instr->sig.small_imm_a +
+ mul_instr->sig.small_imm_b == 0);
+ assert(mul_instr->sig.small_imm_c +
+ mul_instr->sig.small_imm_d <= 1);
+
+ result->sig.small_imm_a = add_instr->sig.small_imm_a;
+ result->sig.small_imm_b = add_instr->sig.small_imm_b;
+ result->sig.small_imm_c = mul_instr->sig.small_imm_c;
+ result->sig.small_imm_d = mul_instr->sig.small_imm_d;
+
+ return (result->sig.small_imm_a +
+ result->sig.small_imm_b +
+ result->sig.small_imm_c +
+ result->sig.small_imm_d) <= 1;
+ }
+
+ assert(devinfo->ver == 42);
+
uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr);
int naddrs = util_bitcount64(raddrs_used);
if (naddrs > 2)
return false;
- if ((add_instr->sig.small_imm || mul_instr->sig.small_imm)) {
+ if ((add_instr->sig.small_imm_b || mul_instr->sig.small_imm_b)) {
if (naddrs > 1)
return false;
- if (add_instr->sig.small_imm && mul_instr->sig.small_imm)
+ if (add_instr->sig.small_imm_b && mul_instr->sig.small_imm_b)
if (add_instr->raddr_b != mul_instr->raddr_b)
return false;
- result->sig.small_imm = true;
- result->raddr_b = add_instr->sig.small_imm ?
+ result->sig.small_imm_b = true;
+ result->raddr_b = add_instr->sig.small_imm_b ?
add_instr->raddr_b : mul_instr->raddr_b;
}
@@ -782,23 +1017,23 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result,
raddrs_used &= ~(1ll << raddr_a);
result->raddr_a = raddr_a;
- if (!result->sig.small_imm) {
+ if (!result->sig.small_imm_b) {
if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) &&
raddr_a == add_instr->raddr_b) {
- if (add_instr->alu.add.a == V3D_QPU_MUX_B)
- result->alu.add.a = V3D_QPU_MUX_A;
- if (add_instr->alu.add.b == V3D_QPU_MUX_B &&
+ if (add_instr->alu.add.a.mux == V3D_QPU_MUX_B)
+ result->alu.add.a.mux = V3D_QPU_MUX_A;
+ if (add_instr->alu.add.b.mux == V3D_QPU_MUX_B &&
v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
- result->alu.add.b = V3D_QPU_MUX_A;
+ result->alu.add.b.mux = V3D_QPU_MUX_A;
}
}
if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) &&
raddr_a == mul_instr->raddr_b) {
- if (mul_instr->alu.mul.a == V3D_QPU_MUX_B)
- result->alu.mul.a = V3D_QPU_MUX_A;
- if (mul_instr->alu.mul.b == V3D_QPU_MUX_B &&
+ if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_B)
+ result->alu.mul.a.mux = V3D_QPU_MUX_A;
+ if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_B &&
v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
- result->alu.mul.b = V3D_QPU_MUX_A;
+ result->alu.mul.b.mux = V3D_QPU_MUX_A;
}
}
}
@@ -809,20 +1044,20 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result,
result->raddr_b = raddr_b;
if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) &&
raddr_b == add_instr->raddr_a) {
- if (add_instr->alu.add.a == V3D_QPU_MUX_A)
- result->alu.add.a = V3D_QPU_MUX_B;
- if (add_instr->alu.add.b == V3D_QPU_MUX_A &&
+ if (add_instr->alu.add.a.mux == V3D_QPU_MUX_A)
+ result->alu.add.a.mux = V3D_QPU_MUX_B;
+ if (add_instr->alu.add.b.mux == V3D_QPU_MUX_A &&
v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
- result->alu.add.b = V3D_QPU_MUX_B;
+ result->alu.add.b.mux = V3D_QPU_MUX_B;
}
}
if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) &&
raddr_b == mul_instr->raddr_a) {
- if (mul_instr->alu.mul.a == V3D_QPU_MUX_A)
- result->alu.mul.a = V3D_QPU_MUX_B;
- if (mul_instr->alu.mul.b == V3D_QPU_MUX_A &&
+ if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_A)
+ result->alu.mul.a.mux = V3D_QPU_MUX_B;
+ if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_A &&
v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
- result->alu.mul.b = V3D_QPU_MUX_B;
+ result->alu.mul.b.mux = V3D_QPU_MUX_B;
}
}
@@ -855,7 +1090,8 @@ add_op_as_mul_op(enum v3d_qpu_add_op op)
}
static void
-qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
+qpu_convert_add_to_mul(const struct v3d_device_info *devinfo,
+ struct v3d_qpu_instr *inst)
{
STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add));
assert(inst->alu.add.op != V3D_QPU_A_NOP);
@@ -871,6 +1107,87 @@ qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
inst->flags.ac = V3D_QPU_COND_NONE;
inst->flags.apf = V3D_QPU_PF_NONE;
inst->flags.auf = V3D_QPU_UF_NONE;
+
+ inst->alu.mul.output_pack = inst->alu.add.output_pack;
+
+ inst->alu.mul.a.unpack = inst->alu.add.a.unpack;
+ inst->alu.mul.b.unpack = inst->alu.add.b.unpack;
+ inst->alu.add.output_pack = V3D_QPU_PACK_NONE;
+ inst->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
+ inst->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
+
+ if (devinfo->ver >= 71) {
+ assert(!inst->sig.small_imm_c && !inst->sig.small_imm_d);
+ assert(inst->sig.small_imm_a + inst->sig.small_imm_b <= 1);
+ if (inst->sig.small_imm_a) {
+ inst->sig.small_imm_c = true;
+ inst->sig.small_imm_a = false;
+ } else if (inst->sig.small_imm_b) {
+ inst->sig.small_imm_d = true;
+ inst->sig.small_imm_b = false;
+ }
+ }
+}
+
+static bool
+can_do_mul_as_add(const struct v3d_device_info *devinfo, enum v3d_qpu_mul_op op)
+{
+ switch (op) {
+ case V3D_QPU_M_MOV:
+ case V3D_QPU_M_FMOV:
+ return devinfo->ver >= 71;
+ default:
+ return false;
+ }
+}
+
+static enum v3d_qpu_mul_op
+mul_op_as_add_op(enum v3d_qpu_mul_op op)
+{
+ switch (op) {
+ case V3D_QPU_M_MOV:
+ return V3D_QPU_A_MOV;
+ case V3D_QPU_M_FMOV:
+ return V3D_QPU_A_FMOV;
+ default:
+ unreachable("unexpected mov opcode");
+ }
+}
+
+static void
+qpu_convert_mul_to_add(struct v3d_qpu_instr *inst)
+{
+ STATIC_ASSERT(sizeof(inst->alu.add) == sizeof(inst->alu.mul));
+ assert(inst->alu.mul.op != V3D_QPU_M_NOP);
+ assert(inst->alu.add.op == V3D_QPU_A_NOP);
+
+ memcpy(&inst->alu.add, &inst->alu.mul, sizeof(inst->alu.add));
+ inst->alu.add.op = mul_op_as_add_op(inst->alu.mul.op);
+ inst->alu.mul.op = V3D_QPU_M_NOP;
+
+ inst->flags.ac = inst->flags.mc;
+ inst->flags.apf = inst->flags.mpf;
+ inst->flags.auf = inst->flags.muf;
+ inst->flags.mc = V3D_QPU_COND_NONE;
+ inst->flags.mpf = V3D_QPU_PF_NONE;
+ inst->flags.muf = V3D_QPU_UF_NONE;
+
+ inst->alu.add.output_pack = inst->alu.mul.output_pack;
+ inst->alu.add.a.unpack = inst->alu.mul.a.unpack;
+ inst->alu.add.b.unpack = inst->alu.mul.b.unpack;
+ inst->alu.mul.output_pack = V3D_QPU_PACK_NONE;
+ inst->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE;
+ inst->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
+
+ assert(!inst->sig.small_imm_a && !inst->sig.small_imm_b);
+ assert(inst->sig.small_imm_c + inst->sig.small_imm_d <= 1);
+ if (inst->sig.small_imm_c) {
+ inst->sig.small_imm_a = true;
+ inst->sig.small_imm_c = false;
+ } else if (inst->sig.small_imm_d) {
+ inst->sig.small_imm_b = true;
+ inst->sig.small_imm_d = false;
+ }
}
static bool
@@ -909,20 +1226,20 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
else if (a->alu.mul.op == V3D_QPU_M_NOP &&
can_do_add_as_mul(b->alu.add.op)) {
mul_inst = *b;
- qpu_convert_add_to_mul(&mul_inst);
+ qpu_convert_add_to_mul(devinfo, &mul_inst);
merge.alu.mul = mul_inst.alu.mul;
- merge.flags.mc = b->flags.ac;
- merge.flags.mpf = b->flags.apf;
- merge.flags.muf = b->flags.auf;
+ merge.flags.mc = mul_inst.flags.mc;
+ merge.flags.mpf = mul_inst.flags.mpf;
+ merge.flags.muf = mul_inst.flags.muf;
add_instr = a;
mul_instr = &mul_inst;
} else if (a->alu.mul.op == V3D_QPU_M_NOP &&
can_do_add_as_mul(a->alu.add.op)) {
mul_inst = *a;
- qpu_convert_add_to_mul(&mul_inst);
+ qpu_convert_add_to_mul(devinfo, &mul_inst);
merge = mul_inst;
merge.alu.add = b->alu.add;
@@ -938,22 +1255,62 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
}
}
+ struct v3d_qpu_instr add_inst;
if (b->alu.mul.op != V3D_QPU_M_NOP) {
- if (a->alu.mul.op != V3D_QPU_M_NOP)
- return false;
- merge.alu.mul = b->alu.mul;
+ if (a->alu.mul.op == V3D_QPU_M_NOP) {
+ merge.alu.mul = b->alu.mul;
- merge.flags.mc = b->flags.mc;
- merge.flags.mpf = b->flags.mpf;
- merge.flags.muf = b->flags.muf;
+ merge.flags.mc = b->flags.mc;
+ merge.flags.mpf = b->flags.mpf;
+ merge.flags.muf = b->flags.muf;
- mul_instr = b;
- add_instr = a;
+ mul_instr = b;
+ add_instr = a;
+ }
+ /* If a's mul op is used but its add op is not, then see if we
+ * can convert either a's mul op or b's mul op to an add op
+ * so we can merge.
+ */
+ else if (a->alu.add.op == V3D_QPU_A_NOP &&
+ can_do_mul_as_add(devinfo, b->alu.mul.op)) {
+ add_inst = *b;
+ qpu_convert_mul_to_add(&add_inst);
+
+ merge.alu.add = add_inst.alu.add;
+
+ merge.flags.ac = add_inst.flags.ac;
+ merge.flags.apf = add_inst.flags.apf;
+ merge.flags.auf = add_inst.flags.auf;
+
+ mul_instr = a;
+ add_instr = &add_inst;
+ } else if (a->alu.add.op == V3D_QPU_A_NOP &&
+ can_do_mul_as_add(devinfo, a->alu.mul.op)) {
+ add_inst = *a;
+ qpu_convert_mul_to_add(&add_inst);
+
+ merge = add_inst;
+ merge.alu.mul = b->alu.mul;
+
+ merge.flags.mc = b->flags.mc;
+ merge.flags.mpf = b->flags.mpf;
+ merge.flags.muf = b->flags.muf;
+
+ mul_instr = b;
+ add_instr = &add_inst;
+ } else {
+ return false;
+ }
}
+ /* V3D 4.x and earlier use muxes to select the inputs for the ALUs and
+ * they have restrictions on the number of raddrs that can be addressed
+ * in a single instruction. In V3D 7.x, we don't have that restriction,
+ * but we are still limited to a single small immediate per instruction.
+ */
if (add_instr && mul_instr &&
- !qpu_merge_raddrs(&merge, add_instr, mul_instr)) {
- return false;
+ !qpu_merge_raddrs(&merge, add_instr, mul_instr, devinfo)) {
+ return false;
}
merge.sig.thrsw |= b->sig.thrsw;
@@ -964,7 +1321,6 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
merge.sig.ldtmu |= b->sig.ldtmu;
merge.sig.ldvary |= b->sig.ldvary;
merge.sig.ldvpm |= b->sig.ldvpm;
- merge.sig.small_imm |= b->sig.small_imm;
merge.sig.ldtlb |= b->sig.ldtlb;
merge.sig.ldtlbu |= b->sig.ldtlbu;
merge.sig.ucb |= b->sig.ucb;
@@ -1047,24 +1403,25 @@ retry:
* regfile A or B that was written to by the previous
* instruction."
*/
- if (reads_too_soon_after_write(scoreboard, n->inst))
+ if (reads_too_soon_after_write(c->devinfo, scoreboard, n->inst))
continue;
if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst))
continue;
- /* "A scoreboard wait must not occur in the first two
- * instructions of a fragment shader. This is either the
- * explicit Wait for Scoreboard signal or an implicit wait
- * with the first tile-buffer read or write instruction."
+ /* "Before doing a TLB access a scoreboard wait must have been
+ * done. This happens either on the first or last thread
+ * switch, depending on a setting (scb_wait_on_first_thrsw) in
+ * the shader state."
*/
- if (pixel_scoreboard_too_soon(scoreboard, inst))
+ if (pixel_scoreboard_too_soon(c, scoreboard, inst))
continue;
- /* ldunif and ldvary both write r5, but ldunif does so a tick
- * sooner. If the ldvary's r5 wasn't used, then ldunif might
+ /* ldunif and ldvary both write the same register (r5 for v42
+ * and below, rf0 for v71), but ldunif does so a tick sooner.
+ * If the ldvary's register wasn't used, then ldunif might
* otherwise get scheduled so ldunif and ldvary try to update
- * r5 in the same tick.
+ * the register in the same tick.
*/
if ((inst->sig.ldunif || inst->sig.ldunifa) &&
scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
@@ -1131,24 +1488,54 @@ retry:
continue;
}
- /* Don't merge in something that will lock the TLB.
- * Hopwefully what we have in inst will release some
- * other instructions, allowing us to delay the
- * TLB-locking instruction until later.
+ /* Don't merge TLB instructions before we have acquired
+ * the scoreboard lock.
*/
- if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst))
+ if (pixel_scoreboard_too_soon(c, scoreboard, inst))
continue;
- /* When we succesfully pair up an ldvary we then try
+ /* When we successfully pair up an ldvary we then try
* to merge it into the previous instruction if
* possible to improve pipelining. Don't pick up the
* ldvary now if the follow-up fixup would place
* it in the delay slots of a thrsw, which is not
* allowed and would prevent the fixup from being
- * successul.
+ * successful. In V3D 7.x we can allow this to happen
+ * as long as it is not the last delay slot.
*/
- if (inst->sig.ldvary &&
- scoreboard->last_thrsw_tick + 2 >= scoreboard->tick - 1) {
+ if (inst->sig.ldvary) {
+ if (c->devinfo->ver == 42 &&
+ scoreboard->last_thrsw_tick + 2 >=
+ scoreboard->tick - 1) {
+ continue;
+ }
+ if (c->devinfo->ver >= 71 &&
+ scoreboard->last_thrsw_tick + 2 ==
+ scoreboard->tick - 1) {
+ continue;
+ }
+ }
+
+ /* We can emit a new tmu lookup with a previous ldtmu
+ * if doing this would free just enough space in the
+ * TMU output fifo so we don't overflow; however, this
+ * is only safe if the ldtmu cannot stall.
+ *
+ * A ldtmu can stall if it is not the first following a
+ * thread switch and corresponds to the first word of a
+ * read request.
+ *
+ * FIXME: For now we forbid pairing up a new lookup
+ * with a previous ldtmu that is not the first after a
+ * thrsw if that could overflow the TMU output fifo
+ * regardless of whether the ldtmu is reading the first
+ * word of a TMU result or not, since we don't track
+ * this aspect in the compiler yet.
+ */
+ if (prev_inst->inst->qpu.sig.ldtmu &&
+ !scoreboard->first_ldtmu_after_thrsw &&
+ (scoreboard->pending_ldtmu_count +
+ n->inst->ldtmu_count > 16 / c->threads)) {
continue;
}
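
For reference, the budget in the check above is plain arithmetic: the TMU
output FIFO holds 16 words shared across the threads of a QPU, so each thread
may keep at most 16 / threads result words outstanding (4 words at 4 threads,
8 at 2 threads). A small sketch of the same bound, with hypothetical helper
names:

#include <stdbool.h>

static inline int
tmu_output_fifo_budget(int threads)
{
        /* 16-word output FIFO split evenly between the QPU threads. */
        return 16 / threads;
}

static inline bool
tmu_pairing_would_overflow(int pending_words, int new_words, int threads)
{
        /* Reject the pairing when pending plus newly requested words
         * would exceed the per-thread budget.
         */
        return pending_words + new_words > tmu_output_fifo_budget(threads);
}
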
@@ -1161,7 +1548,7 @@ retry:
int prio = get_instruction_priority(c->devinfo, inst);
- if (mux_read_stalls(scoreboard, inst)) {
+ if (read_stalls(c->devinfo, scoreboard, inst)) {
/* Don't merge an instruction that stalls */
if (prev_inst)
continue;
@@ -1225,7 +1612,7 @@ update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
{
if (v3d_qpu_magic_waddr_is_sfu(waddr))
scoreboard->last_magic_sfu_write_tick = scoreboard->tick;
- else if (devinfo->ver >= 40 && waddr == V3D_QPU_WADDR_UNIFA)
+ else if (waddr == V3D_QPU_WADDR_UNIFA)
scoreboard->last_unifa_write_tick = scoreboard->tick;
}
@@ -1240,10 +1627,87 @@ update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard,
}
static void
+update_scoreboard_tmu_tracking(struct choose_scoreboard *scoreboard,
+ const struct qinst *inst)
+{
+ /* Track whether we have seen any ldtmu after the last thread switch */
+ if (scoreboard->tick == scoreboard->last_thrsw_tick + 2)
+ scoreboard->first_ldtmu_after_thrsw = true;
+
+ /* Track the number of pending ldtmu instructions for outstanding
+ * TMU lookups.
+ */
+ scoreboard->pending_ldtmu_count += inst->ldtmu_count;
+ if (inst->qpu.sig.ldtmu) {
+ assert(scoreboard->pending_ldtmu_count > 0);
+ scoreboard->pending_ldtmu_count--;
+ scoreboard->first_ldtmu_after_thrsw = false;
+ }
+}
+
+static void
+set_has_rf0_flops_conflict(struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst,
+ const struct v3d_device_info *devinfo)
+{
+ if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick &&
+ v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
+ !inst->sig_magic) {
+ scoreboard->has_rf0_flops_conflict = true;
+ }
+}
+
+static void
+update_scoreboard_for_rf0_flops(struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst,
+ const struct v3d_device_info *devinfo)
+{
+ if (devinfo->ver < 71)
+ return;
+
+ /* Thread switch restrictions:
+ *
+ * At the point of a thread switch or thread end (when the actual
+ * thread switch or thread end happens, not when the signalling
+ * instruction is processed):
+ *
+ * - If the most recent write to rf0 was from a ldunif, ldunifa, or
+ * ldvary instruction in which another signal also wrote to the
+ * register file, and the final instruction of the thread section
+ * contained a signal which wrote to the register file, then the
+ * value of rf0 is undefined at the start of the new section
+ *
+ * Here we use the scoreboard to track if our last rf0 implicit write
+ * happens at the same time that another signal writes the register
+ * file (has_rf0_flops_conflict). We will use that information when
+ * scheduling thrsw instructions to avoid putting anything in their
+ * last delay slot which has a signal that writes to the register file.
+ */
+
+ /* Reset tracking if we have an explicit rf0 write or we are starting
+ * a new thread section.
+ */
+ if (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) ||
+ scoreboard->tick - scoreboard->last_thrsw_tick == 3) {
+ scoreboard->last_implicit_rf0_write_tick = -10;
+ scoreboard->has_rf0_flops_conflict = false;
+ }
+
+ if (v3d_qpu_writes_rf0_implicitly(devinfo, inst)) {
+ scoreboard->last_implicit_rf0_write_tick = inst->sig.ldvary ?
+ scoreboard->tick + 1 : scoreboard->tick;
+ }
+
+ set_has_rf0_flops_conflict(scoreboard, inst, devinfo);
+}
+
+static void
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
- const struct v3d_qpu_instr *inst,
+ const struct qinst *qinst,
const struct v3d_device_info *devinfo)
{
+ const struct v3d_qpu_instr *inst = &qinst->qpu;
+
if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
return;
@@ -1271,11 +1735,18 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
}
}
+ if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && inst->sig_magic) {
+ update_scoreboard_for_magic_waddr(scoreboard,
+ inst->sig_addr,
+ devinfo);
+ }
+
if (inst->sig.ldvary)
scoreboard->last_ldvary_tick = scoreboard->tick;
- if (qpu_inst_is_tlb(inst))
- scoreboard->tlb_locked = true;
+ update_scoreboard_for_rf0_flops(scoreboard, inst, devinfo);
+
+ update_scoreboard_tmu_tracking(scoreboard, qinst);
}
static void
@@ -1352,23 +1823,25 @@ instruction_latency(const struct v3d_device_info *devinfo,
after_inst->type != V3D_QPU_INSTR_TYPE_ALU)
return latency;
- if (before_inst->alu.add.magic_write) {
+ if (v3d_qpu_instr_is_sfu(before_inst))
+ return 2;
+
+ if (before_inst->alu.add.op != V3D_QPU_A_NOP &&
+ before_inst->alu.add.magic_write) {
latency = MAX2(latency,
magic_waddr_latency(devinfo,
before_inst->alu.add.waddr,
after_inst));
}
- if (before_inst->alu.mul.magic_write) {
+ if (before_inst->alu.mul.op != V3D_QPU_M_NOP &&
+ before_inst->alu.mul.magic_write) {
latency = MAX2(latency,
magic_waddr_latency(devinfo,
before_inst->alu.mul.waddr,
after_inst));
}
- if (v3d_qpu_instr_is_sfu(before_inst))
- return 2;
-
return latency;
}
@@ -1437,7 +1910,7 @@ insert_scheduled_instruction(struct v3d_compile *c,
{
list_addtail(&inst->link, &block->instructions);
- update_scoreboard_for_chosen(scoreboard, &inst->qpu, c->devinfo);
+ update_scoreboard_for_chosen(scoreboard, inst, c->devinfo);
c->qpu_inst_count++;
scoreboard->tick++;
}
@@ -1464,16 +1937,13 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
{
const struct v3d_qpu_instr *inst = &qinst->qpu;
- /* Only TLB Z writes are prohibited in the last slot, but we don't
- * have those flagged so prohibit all TLB ops for now.
- */
- if (slot == 2 && qpu_inst_is_tlb(inst))
+ if (slot == 2 && qinst->is_tlb_z_write)
return false;
if (slot > 0 && qinst->uniform != ~0)
return false;
- if (v3d_qpu_uses_vpm(inst))
+ if (c->devinfo->ver == 42 && v3d_qpu_waits_vpm(inst))
return false;
if (inst->sig.ldvary)
@@ -1481,36 +1951,64 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
/* GFXH-1625: TMUWT not allowed in the final instruction. */
- if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT)
+ if (c->devinfo->ver == 42 && slot == 2 &&
+ inst->alu.add.op == V3D_QPU_A_TMUWT) {
return false;
+ }
- /* No writing physical registers at the end. */
- if (!inst->alu.add.magic_write ||
- !inst->alu.mul.magic_write) {
- return false;
+ if (c->devinfo->ver == 42) {
+ /* No writing physical registers at the end. */
+ bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP;
+ bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP;
+ if ((!add_is_nop && !inst->alu.add.magic_write) ||
+ (!mul_is_nop && !inst->alu.mul.magic_write)) {
+ return false;
+ }
+
+ if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) &&
+ !inst->sig_magic) {
+ return false;
+ }
}
- if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF)
- return false;
+ if (c->devinfo->ver >= 71) {
+ /* The thread end instruction must not write to the
+ * register file via the add/mul ALUs.
+ */
+ if (slot == 0 &&
+ (!inst->alu.add.magic_write ||
+ !inst->alu.mul.magic_write)) {
+ return false;
+ }
+ }
- /* RF0-2 might be overwritten during the delay slots by
- * fragment shader setup.
- */
- if (inst->raddr_a < 3 &&
- (inst->alu.add.a == V3D_QPU_MUX_A ||
- inst->alu.add.b == V3D_QPU_MUX_A ||
- inst->alu.mul.a == V3D_QPU_MUX_A ||
- inst->alu.mul.b == V3D_QPU_MUX_A)) {
- return false;
+ if (c->devinfo->ver == 42) {
+ /* RF0-2 might be overwritten during the delay slots by
+ * fragment shader setup.
+ */
+ if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A))
+ return false;
+
+ if (inst->raddr_b < 3 &&
+ !inst->sig.small_imm_b &&
+ v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) {
+ return false;
+ }
}
- if (inst->raddr_b < 3 &&
- !inst->sig.small_imm &&
- (inst->alu.add.a == V3D_QPU_MUX_B ||
- inst->alu.add.b == V3D_QPU_MUX_B ||
- inst->alu.mul.a == V3D_QPU_MUX_B ||
- inst->alu.mul.b == V3D_QPU_MUX_B)) {
- return false;
+ if (c->devinfo->ver >= 71) {
+ /* RF2-3 might be overwritten during the delay slots by
+ * fragment shader setup.
+ */
+ if (v3d71_qpu_reads_raddr(inst, 2) ||
+ v3d71_qpu_reads_raddr(inst, 3)) {
+ return false;
+ }
+
+ if (v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 2) ||
+ v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 3)) {
+ return false;
+ }
}
}
@@ -1526,6 +2024,7 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
*/
static bool
qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
+ struct choose_scoreboard *scoreboard,
const struct qinst *qinst,
uint32_t slot)
{
@@ -1533,15 +2032,19 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
* thread. The simulator complains for safety, though it
* would only occur for dead code in our case.
*/
- if (slot > 0 &&
- qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
- (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) ||
- v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) {
- return false;
+ if (slot > 0) {
+ if (c->devinfo->ver == 42 && v3d_qpu_instr_is_legacy_sfu(&qinst->qpu))
+ return false;
+ if (c->devinfo->ver >= 71 && v3d_qpu_instr_is_sfu(&qinst->qpu))
+ return false;
}
- if (slot > 0 && qinst->qpu.sig.ldvary)
- return false;
+ if (qinst->qpu.sig.ldvary) {
+ if (c->devinfo->ver == 42 && slot > 0)
+ return false;
+ if (c->devinfo->ver >= 71 && slot == 2)
+ return false;
+ }
/* unifa and the following 3 instructions can't overlap a
* thread switch/end. The docs further clarify that this means
@@ -1560,6 +2063,17 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu))
return false;
+ /* See comment when we set has_rf0_flops_conflict for details */
+ if (c->devinfo->ver >= 71 &&
+ slot == 2 &&
+ v3d_qpu_sig_writes_address(c->devinfo, &qinst->qpu.sig) &&
+ !qinst->qpu.sig_magic) {
+ if (scoreboard->has_rf0_flops_conflict)
+ return false;
+ if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick)
+ return false;
+ }
+
return true;
}
@@ -1579,7 +2093,7 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
assert(slot <= 2);
/* We merge thrsw instructions back into the instruction stream
- * manually, so any instructions scheduled after a thrsw shold be
+ * manually, so any instructions scheduled after a thrsw should be
* in the actual delay slots and not in the same slot as the thrsw.
*/
assert(slot >= 1);
@@ -1592,7 +2106,7 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
* also apply to instructions scheduled after the thrsw that we want
* to place in its delay slots.
*/
- if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
+ if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard, qinst, slot))
return false;
/* TLB access is disallowed until scoreboard wait is executed, which
@@ -1648,6 +2162,14 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
if (v3d_qpu_writes_flags(&qinst->qpu))
return false;
+ /* TSY sync ops materialize at the point of the next thread switch;
+ * therefore, if we have a TSY sync right after a thread switch, we
+ * cannot place it in its delay slots, or we would be moving the sync
+ * to the thrsw before it instead.
+ */
+ if (qinst->qpu.alu.add.op == V3D_QPU_A_BARRIERID)
+ return false;
+
return true;
}
@@ -1656,15 +2178,11 @@ valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard
struct qinst *qinst, int instructions_in_sequence,
bool is_thrend)
{
- /* No emitting our thrsw while the previous thrsw hasn't happened yet. */
- if (scoreboard->last_thrsw_tick + 3 >
- scoreboard->tick - instructions_in_sequence) {
- return false;
- }
-
for (int slot = 0; slot < instructions_in_sequence; slot++) {
- if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
+ if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard,
+ qinst, slot)) {
return false;
+ }
if (is_thrend &&
!qpu_inst_valid_in_thrend_slot(c, qinst, slot)) {
@@ -1714,26 +2232,77 @@ emit_thrsw(struct v3d_compile *c,
/* Find how far back into previous instructions we can put the THRSW. */
int slots_filled = 0;
+ int invalid_sig_count = 0;
+ int invalid_seq_count = 0;
+ bool last_thrsw_after_invalid_ok = false;
struct qinst *merge_inst = NULL;
vir_for_each_inst_rev(prev_inst, block) {
- struct v3d_qpu_sig sig = prev_inst->qpu.sig;
- sig.thrsw = true;
- uint32_t packed_sig;
-
- if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig))
+ /* No emitting our thrsw while the previous thrsw hasn't
+ * happened yet.
+ */
+ if (scoreboard->last_thrsw_tick + 3 >
+ scoreboard->tick - (slots_filled + 1)) {
break;
+ }
+
if (!valid_thrsw_sequence(c, scoreboard,
prev_inst, slots_filled + 1,
is_thrend)) {
- break;
+ /* Even if the current sequence isn't valid, we may
+ * be able to get a valid sequence by trying to move the
+ * thrsw earlier, so keep going.
+ */
+ invalid_seq_count++;
+ goto cont_block;
+ }
+
+ struct v3d_qpu_sig sig = prev_inst->qpu.sig;
+ sig.thrsw = true;
+ uint32_t packed_sig;
+ if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) {
+ /* If we can't merge the thrsw here because of signal
+ * incompatibility, keep going, we might be able to
+ * merge it in an earlier instruction.
+ */
+ invalid_sig_count++;
+ goto cont_block;
}
+ /* For last thrsw we need 2 consecutive slots that are
+ * thrsw compatible, so if we have previously jumped over
+ * an incompatible signal, flag that we have found the first
+ * valid slot here and keep going.
+ */
+ if (inst->is_last_thrsw && invalid_sig_count > 0 &&
+ !last_thrsw_after_invalid_ok) {
+ last_thrsw_after_invalid_ok = true;
+ invalid_sig_count++;
+ goto cont_block;
+ }
+
+ /* We can merge the thrsw in this instruction */
+ last_thrsw_after_invalid_ok = false;
+ invalid_sig_count = 0;
+ invalid_seq_count = 0;
merge_inst = prev_inst;
+
+cont_block:
if (++slots_filled == 3)
break;
}
+ /* If we jumped over a signal incompatibility and did not manage to
+ * merge the thrsw in the end, we need to adjust slots filled to match
+ * the last valid merge point.
+ */
+ assert((invalid_sig_count == 0 && invalid_seq_count == 0) ||
+ slots_filled >= invalid_sig_count + invalid_seq_count);
+ if (invalid_sig_count > 0)
+ slots_filled -= invalid_sig_count;
+ if (invalid_seq_count > 0)
+ slots_filled -= invalid_seq_count;
+
bool needs_free = false;
if (merge_inst) {
merge_inst->qpu.sig.thrsw = true;
@@ -1747,6 +2316,8 @@ emit_thrsw(struct v3d_compile *c,
merge_inst = inst;
}
+ scoreboard->first_thrsw_emitted = true;
+
/* If we're emitting the last THRSW (other than program end), then
* signal that to the HW by emitting two THRSWs in a row.
*/
@@ -1758,6 +2329,7 @@ emit_thrsw(struct v3d_compile *c,
struct qinst *second_inst =
(struct qinst *)merge_inst->link.next;
second_inst->qpu.sig.thrsw = true;
+ scoreboard->last_thrsw_emitted = true;
}
/* Make sure the thread end executes within the program lifespan */
@@ -1811,10 +2383,11 @@ emit_branch(struct v3d_compile *c,
assert(scoreboard->last_branch_tick + 3 < branch_tick);
assert(scoreboard->last_unifa_write_tick + 3 < branch_tick);
- /* Can't place a branch with msfign != 0 and cond != 0,2,3 after
+ /* V3D 4.x can't place a branch with msfign != 0 and cond != 0,2,3 after
* setmsf.
*/
bool is_safe_msf_branch =
+ c->devinfo->ver >= 71 ||
inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE ||
inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS ||
inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 ||
@@ -1851,6 +2424,14 @@ emit_branch(struct v3d_compile *c,
break;
}
+ /* Do not move up a branch if it can disrupt an ldvary sequence
+ * as that can cause stomping of the r5 register.
+ */
+ if (scoreboard->last_ldvary_tick + 2 >=
+ branch_tick - slots_filled) {
+ break;
+ }
+
/* Can't move a conditional branch before the instruction
* that writes the flags for its condition.
*/
@@ -1890,46 +2471,72 @@ emit_branch(struct v3d_compile *c,
}
static bool
-alu_reads_register(struct v3d_qpu_instr *inst,
+alu_reads_register(const struct v3d_device_info *devinfo,
+ struct v3d_qpu_instr *inst,
bool add, bool magic, uint32_t index)
{
uint32_t num_src;
- enum v3d_qpu_mux mux_a, mux_b;
-
- if (add) {
+ if (add)
num_src = v3d_qpu_add_op_num_src(inst->alu.add.op);
- mux_a = inst->alu.add.a;
- mux_b = inst->alu.add.b;
- } else {
+ else
num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
- mux_a = inst->alu.mul.a;
- mux_b = inst->alu.mul.b;
- }
- for (int i = 0; i < num_src; i++) {
- if (magic) {
- if (i == 0 && mux_a == index)
- return true;
- if (i == 1 && mux_b == index)
- return true;
+ if (devinfo->ver == 42) {
+ enum v3d_qpu_mux mux_a, mux_b;
+ if (add) {
+ mux_a = inst->alu.add.a.mux;
+ mux_b = inst->alu.add.b.mux;
} else {
- if (i == 0 && mux_a == V3D_QPU_MUX_A &&
- inst->raddr_a == index) {
- return true;
- }
- if (i == 0 && mux_a == V3D_QPU_MUX_B &&
- inst->raddr_b == index) {
- return true;
- }
- if (i == 1 && mux_b == V3D_QPU_MUX_A &&
- inst->raddr_a == index) {
- return true;
- }
- if (i == 1 && mux_b == V3D_QPU_MUX_B &&
- inst->raddr_b == index) {
- return true;
+ mux_a = inst->alu.mul.a.mux;
+ mux_b = inst->alu.mul.b.mux;
+ }
+
+ for (int i = 0; i < num_src; i++) {
+ if (magic) {
+ if (i == 0 && mux_a == index)
+ return true;
+ if (i == 1 && mux_b == index)
+ return true;
+ } else {
+ if (i == 0 && mux_a == V3D_QPU_MUX_A &&
+ inst->raddr_a == index) {
+ return true;
+ }
+ if (i == 0 && mux_a == V3D_QPU_MUX_B &&
+ inst->raddr_b == index) {
+ return true;
+ }
+ if (i == 1 && mux_b == V3D_QPU_MUX_A &&
+ inst->raddr_a == index) {
+ return true;
+ }
+ if (i == 1 && mux_b == V3D_QPU_MUX_B &&
+ inst->raddr_b == index) {
+ return true;
+ }
}
}
+
+ return false;
+ }
+
+ assert(devinfo->ver >= 71);
+ assert(!magic);
+
+ uint32_t raddr_a, raddr_b;
+ if (add) {
+ raddr_a = inst->alu.add.a.raddr;
+ raddr_b = inst->alu.add.b.raddr;
+ } else {
+ raddr_a = inst->alu.mul.a.raddr;
+ raddr_b = inst->alu.mul.b.raddr;
+ }
+
+ for (int i = 0; i < num_src; i++) {
+ if (i == 0 && raddr_a == index)
+ return true;
+ if (i == 1 && raddr_b == index)
+ return true;
}
return false;
@@ -1964,7 +2571,9 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
struct qblock *block,
struct v3d_qpu_instr *inst)
{
- /* We only call this if we have successfuly merged an ldvary into a
+ const struct v3d_device_info *devinfo = c->devinfo;
+
+ /* We only call this if we have successfully merged an ldvary into a
* previous instruction.
*/
assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
@@ -1976,9 +2585,20 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
* the ldvary destination, if it does, then moving the ldvary before
* it would overwrite it.
*/
- if (alu_reads_register(inst, true, ldvary_magic, ldvary_index))
+ if (alu_reads_register(devinfo, inst, true, ldvary_magic, ldvary_index))
return false;
- if (alu_reads_register(inst, false, ldvary_magic, ldvary_index))
+ if (alu_reads_register(devinfo, inst, false, ldvary_magic, ldvary_index))
+ return false;
+
+ /* The implicit ldvary destination may not be written to by a signal
+ * in the instruction following ldvary. Since we are planning to move
+ * ldvary to the previous instruction, this means we need to check if
+ * the current instruction has any other signal that could create this
+ * conflict. The only other signal that can write to the implicit
+ * ldvary destination that is compatible with ldvary in the same
+ * instruction is ldunif.
+ */
+ if (inst->sig.ldunif)
return false;
/* The previous instruction can't write to the same destination as the
@@ -2003,7 +2623,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
}
/* The previous instruction cannot have a conflicting signal */
- if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig))
+ if (v3d_qpu_sig_writes_address(devinfo, &prev->qpu.sig))
+ return false;
+
+ uint32_t sig;
+ struct v3d_qpu_sig new_sig = prev->qpu.sig;
+ new_sig.ldvary = true;
+ if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig))
return false;
/* The previous instruction cannot use flags since ldvary uses the
@@ -2016,9 +2642,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
/* We can't put an ldvary in the delay slots of a thrsw. We should've
* prevented this when pairing up the ldvary with another instruction
- * and flagging it for a fixup.
+ * and flagging it for a fixup. In V3D 7.x this is limited only to the
+ * second delay slot.
*/
- assert(scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1);
+ assert((devinfo->ver == 42 &&
+ scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1) ||
+ (devinfo->ver >= 71 &&
+ scoreboard->last_thrsw_tick + 2 != scoreboard->tick - 1));
/* Move the ldvary to the previous instruction and remove it from the
* current one.
@@ -2032,14 +2662,25 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
inst->sig_magic = false;
inst->sig_addr = 0;
- /* By moving ldvary to the previous instruction we make it update
- * r5 in the current one, so nothing else in it should write r5.
+ /* Update rf0 flops tracking for new ldvary delayed rf0 write tick */
+ if (devinfo->ver >= 71) {
+ scoreboard->last_implicit_rf0_write_tick = scoreboard->tick;
+ set_has_rf0_flops_conflict(scoreboard, inst, devinfo);
+ }
+
+ /* By moving ldvary to the previous instruction we make it update r5
+ * (rf0 for ver >= 71) in the current one, so nothing else in it
+ * should write this register.
+ *
* This should've been prevented by our dependency tracking, which
* would not allow ldvary to be paired up with an instruction that
- * writes r5 (since our dependency tracking doesn't know that the
- * ldvary write r5 happens in the next instruction).
+ * writes r5/rf0 (since our dependency tracking doesn't know that the
+ * ldvary write to r5/rf0 happens in the next instruction).
*/
- assert(!v3d_qpu_writes_r5(c->devinfo, inst));
+ assert(!v3d_qpu_writes_r5(devinfo, inst));
+ assert(devinfo->ver == 42 ||
+ (!v3d_qpu_writes_rf0_implicitly(devinfo, inst) &&
+ !v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0)));
return true;
}
@@ -2102,6 +2743,9 @@ schedule_instructions(struct v3d_compile *c,
merge->inst->uniform;
}
+ chosen->inst->ldtmu_count +=
+ merge->inst->ldtmu_count;
+
if (debug) {
fprintf(stderr, "t=%4d: merging: ",
time);
@@ -2127,7 +2771,7 @@ schedule_instructions(struct v3d_compile *c,
}
}
}
- if (mux_read_stalls(scoreboard, inst))
+ if (read_stalls(c->devinfo, scoreboard, inst))
c->qpu_inst_stalled_count++;
}
@@ -2351,6 +2995,8 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c)
scoreboard.last_branch_tick = -10;
scoreboard.last_setmsf_tick = -10;
scoreboard.last_stallable_sfu_tick = -10;
+ scoreboard.first_ldtmu_after_thrsw = true;
+ scoreboard.last_implicit_rf0_write_tick = -10;
if (debug) {
fprintf(stderr, "Pre-schedule instructions\n");
diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c
index ec9ed66650c..538b247e3e0 100644
--- a/src/broadcom/compiler/qpu_validate.c
+++ b/src/broadcom/compiler/qpu_validate.c
@@ -41,6 +41,7 @@ struct v3d_qpu_validate_state {
int last_sfu_write;
int last_branch_ip;
int last_thrsw_ip;
+ int first_tlb_z_write;
/* Set when we've found the last-THRSW signal, or if we were started
* in single-segment mode.
@@ -110,11 +111,58 @@ static void
qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
{
const struct v3d_device_info *devinfo = state->c->devinfo;
+
+ if (qinst->is_tlb_z_write && state->ip < state->first_tlb_z_write)
+ state->first_tlb_z_write = state->ip;
+
const struct v3d_qpu_instr *inst = &qinst->qpu;
+ if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
+ state->first_tlb_z_write >= 0 &&
+ state->ip > state->first_tlb_z_write &&
+ inst->branch.msfign != V3D_QPU_MSFIGN_NONE &&
+ inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS &&
+ inst->branch.cond != V3D_QPU_BRANCH_COND_A0 &&
+ inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) {
+ fail_instr(state, "Implicit branch MSF read after TLB Z write");
+ }
+
if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
return;
+ if (inst->alu.add.op == V3D_QPU_A_SETMSF &&
+ state->first_tlb_z_write >= 0 &&
+ state->ip > state->first_tlb_z_write) {
+ fail_instr(state, "SETMSF after TLB Z write");
+ }
+
+ if (state->first_tlb_z_write >= 0 &&
+ state->ip > state->first_tlb_z_write &&
+ inst->alu.add.op == V3D_QPU_A_MSF) {
+ fail_instr(state, "MSF read after TLB Z write");
+ }
+
+ if (devinfo->ver < 71) {
+ if (inst->sig.small_imm_a || inst->sig.small_imm_c ||
+ inst->sig.small_imm_d) {
+ fail_instr(state, "small imm a/c/d added after V3D 7.1");
+ }
+ } else {
+ if ((inst->sig.small_imm_a || inst->sig.small_imm_b) &&
+ !vir_is_add(qinst)) {
+ fail_instr(state, "small imm a/b used but no ADD inst");
+ }
+ if ((inst->sig.small_imm_c || inst->sig.small_imm_d) &&
+ !vir_is_mul(qinst)) {
+ fail_instr(state, "small imm c/d used but no MUL inst");
+ }
+ if (inst->sig.small_imm_a + inst->sig.small_imm_b +
+ inst->sig.small_imm_c + inst->sig.small_imm_d > 1) {
+ fail_instr(state, "only one small immediate can be "
+ "enabled per instruction");
+ }
+ }
+
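
The V3D 7.x small-immediate rules enforced above reduce to a simple predicate:
at most one of the four small_imm signals may be set, a/b are only meaningful
with an add ALU op and c/d only with a mul ALU op. A standalone sketch with a
hypothetical struct, for illustration only:

#include <stdbool.h>

struct small_imm_sig {
        bool a, b, c, d;   /* small_imm_a .. small_imm_d */
};

static bool
small_imms_valid(struct small_imm_sig s, bool has_add_op, bool has_mul_op)
{
        if (s.a + s.b + s.c + s.d > 1)
                return false;   /* only one small immediate per instruction */
        if ((s.a || s.b) && !has_add_op)
                return false;   /* a/b feed the add ALU */
        if ((s.c || s.d) && !has_mul_op)
                return false;   /* c/d feed the mul ALU */
        return true;
}
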
/* LDVARY writes r5 two instructions later and LDUNIF writes
* r5 one instruction later, which is illegal to have
* together.
@@ -128,7 +176,7 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
*
* FIXME: This would not check correctly for V3D 4.2 versions lower
* than V3D 4.2.14, but that is not a real issue because the simulator
- * will still catch this, and we are not really targetting any such
+ * will still catch this, and we are not really targeting any such
* versions anyway.
*/
if (state->c->devinfo->ver < 42) {
@@ -194,8 +242,14 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
"SFU write started during THRSW delay slots ");
}
- if (inst->sig.ldvary)
- fail_instr(state, "LDVARY during THRSW delay slots");
+ if (inst->sig.ldvary) {
+ if (devinfo->ver == 42)
+ fail_instr(state, "LDVARY during THRSW delay slots");
+ if (devinfo->ver >= 71 &&
+ state->ip - state->last_thrsw_ip == 2) {
+ fail_instr(state, "LDVARY in 2nd THRSW delay slot");
+ }
+ }
}
(void)qpu_magic_waddr_matches; /* XXX */
@@ -222,7 +276,7 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
vpm_writes +
tlb_writes +
tsy_writes +
- inst->sig.ldtmu +
+ (devinfo->ver == 42 ? inst->sig.ldtmu : 0) +
inst->sig.ldtlb +
inst->sig.ldvpm +
inst->sig.ldtlbu > 1) {
@@ -262,17 +316,48 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
inst->type == V3D_QPU_INSTR_TYPE_ALU) {
if ((inst->alu.add.op != V3D_QPU_A_NOP &&
!inst->alu.add.magic_write)) {
- fail_instr(state, "RF write after THREND");
+ if (devinfo->ver == 42) {
+ fail_instr(state, "RF write after THREND");
+ } else if (devinfo->ver >= 71) {
+ if (state->last_thrsw_ip - state->ip == 0) {
+ fail_instr(state,
+ "ADD RF write at THREND");
+ }
+ if (inst->alu.add.waddr == 2 ||
+ inst->alu.add.waddr == 3) {
+ fail_instr(state,
+ "RF2-3 write after THREND");
+ }
+ }
}
if ((inst->alu.mul.op != V3D_QPU_M_NOP &&
!inst->alu.mul.magic_write)) {
- fail_instr(state, "RF write after THREND");
+ if (devinfo->ver == 42) {
+ fail_instr(state, "RF write after THREND");
+ } else if (devinfo->ver >= 71) {
+ if (state->last_thrsw_ip - state->ip == 0) {
+ fail_instr(state,
+ "MUL RF write at THREND");
+ }
+
+ if (inst->alu.mul.waddr == 2 ||
+ inst->alu.mul.waddr == 3) {
+ fail_instr(state,
+ "RF2-3 write after THREND");
+ }
+ }
}
if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
!inst->sig_magic) {
- fail_instr(state, "RF write after THREND");
+ if (devinfo->ver == 42) {
+ fail_instr(state, "RF write after THREND");
+ } else if (devinfo->ver >= 71 &&
+ (inst->sig_addr == 2 ||
+ inst->sig_addr == 3)) {
+ fail_instr(state, "RF2-3 write after THREND");
+ }
}
/* GFXH-1625: No TMUWT in the last instruction */
@@ -312,7 +397,7 @@ qpu_validate(struct v3d_compile *c)
* keep compiling the validation code to make sure it doesn't get
* broken.
*/
-#ifndef DEBUG
+#if !MESA_DEBUG
return;
#endif
@@ -321,6 +406,7 @@ qpu_validate(struct v3d_compile *c)
.last_sfu_write = -10,
.last_thrsw_ip = -10,
.last_branch_ip = -10,
+ .first_tlb_z_write = INT_MAX,
.ip = 0,
.last_thrsw_found = !c->last_thrsw,
diff --git a/src/broadcom/compiler/v3d33_tex.c b/src/broadcom/compiler/v3d33_tex.c
deleted file mode 100644
index b933635f6fe..00000000000
--- a/src/broadcom/compiler/v3d33_tex.c
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- * Copyright © 2016-2018 Broadcom
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "v3d_compiler.h"
-
-/* We don't do any address packing. */
-#define __gen_user_data void
-#define __gen_address_type uint32_t
-#define __gen_address_offset(reloc) (*reloc)
-#define __gen_emit_reloc(cl, reloc)
-#include "cle/v3d_packet_v33_pack.h"
-
-void
-v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
-{
- /* FIXME: We don't bother implementing pipelining for texture reads
- * for any pre 4.x hardware. It should be straight forward to do but
- * we are not really testing or even targetting this hardware at
- * present.
- */
- ntq_flush_tmu(c);
-
- unsigned unit = instr->texture_index;
-
- struct V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1 p0_unpacked = {
- V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1_header,
-
- .fetch_sample_mode = instr->op == nir_texop_txf,
- };
-
- struct V3D33_TEXTURE_UNIFORM_PARAMETER_1_CFG_MODE1 p1_unpacked = {
- };
-
- switch (instr->sampler_dim) {
- case GLSL_SAMPLER_DIM_1D:
- if (instr->is_array)
- p0_unpacked.lookup_type = TEXTURE_1D_ARRAY;
- else
- p0_unpacked.lookup_type = TEXTURE_1D;
- break;
- case GLSL_SAMPLER_DIM_2D:
- case GLSL_SAMPLER_DIM_RECT:
- if (instr->is_array)
- p0_unpacked.lookup_type = TEXTURE_2D_ARRAY;
- else
- p0_unpacked.lookup_type = TEXTURE_2D;
- break;
- case GLSL_SAMPLER_DIM_3D:
- p0_unpacked.lookup_type = TEXTURE_3D;
- break;
- case GLSL_SAMPLER_DIM_CUBE:
- p0_unpacked.lookup_type = TEXTURE_CUBE_MAP;
- break;
- default:
- unreachable("Bad sampler type");
- }
-
- struct qreg coords[5];
- int next_coord = 0;
- for (unsigned i = 0; i < instr->num_srcs; i++) {
- switch (instr->src[i].src_type) {
- case nir_tex_src_coord:
- for (int j = 0; j < instr->coord_components; j++) {
- coords[next_coord++] =
- ntq_get_src(c, instr->src[i].src, j);
- }
- if (instr->coord_components < 2)
- coords[next_coord++] = vir_uniform_f(c, 0.5);
- break;
- case nir_tex_src_bias:
- coords[next_coord++] =
- ntq_get_src(c, instr->src[i].src, 0);
-
- p0_unpacked.bias_supplied = true;
- break;
- case nir_tex_src_lod:
- coords[next_coord++] =
- vir_FADD(c,
- ntq_get_src(c, instr->src[i].src, 0),
- vir_uniform(c, QUNIFORM_TEXTURE_FIRST_LEVEL,
- unit));
-
- if (instr->op != nir_texop_txf &&
- instr->op != nir_texop_tg4) {
- p0_unpacked.disable_autolod_use_bias_only = true;
- }
- break;
- case nir_tex_src_comparator:
- coords[next_coord++] =
- ntq_get_src(c, instr->src[i].src, 0);
-
- p0_unpacked.shadow = true;
- break;
-
- case nir_tex_src_offset: {
- p0_unpacked.texel_offset_for_s_coordinate =
- nir_src_comp_as_int(instr->src[i].src, 0);
-
- if (instr->coord_components >= 2)
- p0_unpacked.texel_offset_for_t_coordinate =
- nir_src_comp_as_int(instr->src[i].src, 1);
-
- if (instr->coord_components >= 3)
- p0_unpacked.texel_offset_for_r_coordinate =
- nir_src_comp_as_int(instr->src[i].src, 2);
- break;
- }
-
- default:
- unreachable("unknown texture source");
- }
- }
-
- /* Limit the number of channels returned to both how many the NIR
- * instruction writes and how many the instruction could produce.
- */
- p1_unpacked.return_words_of_texture_data =
- instr->dest.is_ssa ?
- nir_ssa_def_components_read(&instr->dest.ssa) :
- (1 << instr->dest.reg.reg->num_components) - 1;
-
- uint32_t p0_packed;
- V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1_pack(NULL,
- (uint8_t *)&p0_packed,
- &p0_unpacked);
-
- uint32_t p1_packed;
- V3D33_TEXTURE_UNIFORM_PARAMETER_1_CFG_MODE1_pack(NULL,
- (uint8_t *)&p1_packed,
- &p1_unpacked);
- /* Load unit number into the address field, which will be be used by
- * the driver to decide which texture to put in the actual address
- * field.
- */
- p1_packed |= unit << 5;
-
- /* There is no native support for GL texture rectangle coordinates, so
- * we have to rescale from ([0, width], [0, height]) to ([0, 1], [0,
- * 1]).
- */
- if (instr->sampler_dim == GLSL_SAMPLER_DIM_RECT) {
- coords[0] = vir_FMUL(c, coords[0],
- vir_uniform(c, QUNIFORM_TEXRECT_SCALE_X,
- unit));
- coords[1] = vir_FMUL(c, coords[1],
- vir_uniform(c, QUNIFORM_TEXRECT_SCALE_Y,
- unit));
- }
-
- int texture_u[] = {
- vir_get_uniform_index(c, QUNIFORM_TEXTURE_CONFIG_P0_0 + unit, p0_packed),
- vir_get_uniform_index(c, QUNIFORM_TEXTURE_CONFIG_P1, p1_packed),
- };
-
- for (int i = 0; i < next_coord; i++) {
- struct qreg dst;
-
- if (i == next_coord - 1)
- dst = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUL);
- else
- dst = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMU);
-
- struct qinst *tmu = vir_MOV_dest(c, dst, coords[i]);
-
- if (i < 2)
- tmu->uniform = texture_u[i];
- }
-
- vir_emit_thrsw(c);
-
- for (int i = 0; i < 4; i++) {
- if (p1_unpacked.return_words_of_texture_data & (1 << i))
- ntq_store_dest(c, &instr->dest, i, vir_LDTMU(c));
- }
-}
diff --git a/src/broadcom/compiler/v3d33_vpm_setup.c b/src/broadcom/compiler/v3d33_vpm_setup.c
deleted file mode 100644
index 8bce67dfae9..00000000000
--- a/src/broadcom/compiler/v3d33_vpm_setup.c
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright © 2016-2018 Broadcom
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "v3d_compiler.h"
-
-/* We don't do any address packing. */
-#define __gen_user_data void
-#define __gen_address_type uint32_t
-#define __gen_address_offset(reloc) (*reloc)
-#define __gen_emit_reloc(cl, reloc)
-#include "broadcom/cle/v3d_packet_v33_pack.h"
-
-void
-v3d33_vir_vpm_read_setup(struct v3d_compile *c, int num_components)
-{
- struct V3D33_VPM_GENERIC_BLOCK_READ_SETUP unpacked = {
- V3D33_VPM_GENERIC_BLOCK_READ_SETUP_header,
-
- .horiz = true,
- .laned = false,
- /* If the field is 0, that means a read count of 32. */
- .num = num_components & 31,
- .segs = true,
- .stride = 1,
- .size = VPM_SETUP_SIZE_32_BIT,
- .addr = c->num_inputs,
- };
-
- uint32_t packed;
- V3D33_VPM_GENERIC_BLOCK_READ_SETUP_pack(NULL,
- (uint8_t *)&packed,
- &unpacked);
- vir_VPMSETUP(c, vir_uniform_ui(c, packed));
-}
-
-void
-v3d33_vir_vpm_write_setup(struct v3d_compile *c)
-{
- uint32_t packed;
- struct V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP unpacked = {
- V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP_header,
-
- .horiz = true,
- .laned = false,
- .segs = true,
- .stride = 1,
- .size = VPM_SETUP_SIZE_32_BIT,
- .addr = 0,
- };
-
- V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP_pack(NULL,
- (uint8_t *)&packed,
- &unpacked);
- vir_VPMSETUP(c, vir_uniform_ui(c, packed));
-}
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index 0c1419661d3..12aaacdc14a 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -31,6 +31,7 @@
#include <stdint.h>
#include <string.h>
+#include "util/blend.h"
#include "util/macros.h"
#include "common/v3d_debug.h"
#include "common/v3d_device_info.h"
@@ -40,7 +41,6 @@
#include "util/u_math.h"
#include "qpu/qpu_instr.h"
-#include "pipe/p_state.h"
/**
* Maximum number of outstanding TMU operations we can queue for execution.
@@ -87,7 +87,7 @@ enum qfile {
/** A physical register, such as the W coordinate payload. */
QFILE_REG,
- /** One of the regsiters for fixed function interactions. */
+ /** One of the registers for fixed function interactions. */
QFILE_MAGIC,
/**
@@ -97,12 +97,6 @@ enum qfile {
QFILE_TEMP,
/**
- * VPM reads use this with an index value to say what part of the VPM
- * is being read.
- */
- QFILE_VPM,
-
- /**
* Stores an immediate value in the index field that will be used
* directly by qpu_load_imm().
*/
@@ -169,6 +163,19 @@ struct qinst {
* otherwise.
*/
int uniform;
+
+ /* If this is a TLB Z write */
+ bool is_tlb_z_write;
+
+ /* If this is a retiring TMU instruction (the last in a lookup sequence),
+ * how many ldtmu instructions are required to read the results.
+ */
+ uint32_t ldtmu_count;
+
+ /* Position of this instruction in the program. Filled in during
+ * register allocation.
+ */
+ int32_t ip;
};
enum quniform_contents {
@@ -330,6 +337,19 @@ enum quniform_contents {
* Current value of gl_ViewIndex for Multiview rendering.
*/
QUNIFORM_VIEW_INDEX,
+
+ /**
+ * Inline uniform buffers
+ */
+ QUNIFORM_INLINE_UBO_0,
+ QUNIFORM_INLINE_UBO_1,
+ QUNIFORM_INLINE_UBO_2,
+ QUNIFORM_INLINE_UBO_3,
+
+ /**
+ * Current value of DrawIndex for Multidraw
+ */
+ QUNIFORM_DRAW_ID,
};
static inline uint32_t v3d_unit_data_create(uint32_t unit, uint32_t value)
@@ -369,13 +389,7 @@ static inline uint8_t v3d_slot_get_component(struct v3d_varying_slot slot)
return slot.slot_and_component & 3;
}
-enum v3d_execution_environment {
- V3D_ENVIRONMENT_OPENGL = 0,
- V3D_ENVIRONMENT_VULKAN,
-};
-
struct v3d_key {
- void *shader_state;
struct {
uint8_t swizzle[4];
} tex[V3D_MAX_TEXTURE_SAMPLERS];
@@ -388,9 +402,9 @@ struct v3d_key {
uint8_t num_samplers_used;
uint8_t ucp_enables;
bool is_last_geometry_stage;
- bool robust_buffer_access;
-
- enum v3d_execution_environment environment;
+ bool robust_uniform_access;
+ bool robust_storage_access;
+ bool robust_image_access;
};
struct v3d_fs_key {
@@ -400,7 +414,6 @@ struct v3d_fs_key {
bool line_smoothing;
bool point_coord_upper_left;
bool msaa;
- bool sample_coverage;
bool sample_alpha_to_coverage;
bool sample_alpha_to_one;
/* Mask of which color render targets are present. */
@@ -419,14 +432,12 @@ struct v3d_fs_key {
*/
struct {
enum pipe_format format;
- const uint8_t *swizzle;
+ uint8_t swizzle[4];
} color_fmt[V3D_MAX_DRAW_BUFFERS];
- uint8_t logicop_func;
+ enum pipe_logicop logicop_func;
uint32_t point_sprite_mask;
- struct pipe_rt_blend_state blend;
-
/* If the fragment shader reads gl_PrimitiveID then we have 2 scenarios:
*
* - If there is a geometry shader, then gl_PrimitiveID must be written
@@ -468,7 +479,7 @@ struct v3d_vs_key {
bool clamp_color;
};
-/** A basic block of VIR intructions. */
+/** A basic block of VIR instructions. */
struct qblock {
struct list_head link;
@@ -566,6 +577,7 @@ enum v3d_compilation_result {
*/
struct v3d_compiler {
const struct v3d_device_info *devinfo;
+ uint32_t max_inline_uniform_buffers;
struct ra_regs *regs;
struct ra_class *reg_class_any[3];
struct ra_class *reg_class_r5[3];
@@ -584,6 +596,19 @@ struct v3d_interp_input {
unsigned mode; /* interpolation mode */
};
+struct v3d_ra_node_info {
+ struct {
+ uint32_t priority;
+ uint8_t class_bits;
+ bool is_program_end;
+ bool unused;
+
+ /* V3D 7.x */
+ bool is_ldunif_dst;
+ } *info;
+ uint32_t alloc_count;
+};
+
struct v3d_compile {
const struct v3d_device_info *devinfo;
nir_shader *s;
@@ -596,7 +621,7 @@ struct v3d_compile {
void *debug_output_data;
/**
- * Mapping from nir_register * or nir_ssa_def * to array of struct
+ * Mapping from nir_register * or nir_def * to array of struct
* qreg for the values.
*/
struct hash_table *def_ht;
@@ -615,11 +640,12 @@ struct v3d_compile {
uint32_t output_fifo_size;
struct {
- nir_dest *dest;
+ nir_def *def;
uint8_t num_components;
uint8_t component_mask;
} flush[MAX_TMU_QUEUE_SIZE];
uint32_t flush_count;
+ uint32_t total_count;
} tmu;
/**
@@ -652,16 +678,13 @@ struct v3d_compile {
bool uses_center_w;
bool writes_z;
+ bool writes_z_from_fep;
+ bool reads_z;
bool uses_implicit_point_line_varyings;
/* True if a fragment shader reads gl_PrimitiveID */
bool fs_uses_primitive_id;
- /* If the fragment shader does anything that requires to force
- * per-sample MSAA, such as reading gl_SampleID.
- */
- bool force_per_sample_msaa;
-
/* Whether we are using the fallback scheduler. This will be set after
* register allocation has failed once.
*/
@@ -681,6 +704,11 @@ struct v3d_compile {
bool disable_constant_ubo_load_sorting;
bool sorted_any_ubo_loads;
+ /* Moves UBO/SSBO loads right before their first user (nir_opt_move).
+ * This can reduce register pressure.
+ */
+ bool move_buffer_loads;
+
/* Emits ldunif for each new uniform, even if the uniform was already
* emitted in the same block. Useful to compile shaders with high
* register pressure or to disable the optimization during uniform
@@ -692,6 +720,19 @@ struct v3d_compile {
bool disable_loop_unrolling;
bool unrolled_any_loops;
+ /* Disables nir_opt_gcm to reduce register pressure. */
+ bool disable_gcm;
+
+ /* If calling nir_opt_gcm made any progress. Used to skip new rebuilds
+ * if possible
+ */
+ bool gcm_progress;
+
+ /* Disables scheduling of general TMU loads (and unfiltered image load).
+ */
+ bool disable_general_tmu_sched;
+ bool has_general_tmu_load;
+
/* Minimum number of threads we are willing to use to register allocate
* a shader with the current compilation strategy. This only prevents
* us from lowering the thread count to register allocate successfully,
@@ -705,7 +746,9 @@ struct v3d_compile {
* strategies that can reduce register pressure and hopefully reduce or
* eliminate TMU spills in the shader.
*/
- bool tmu_spilling_allowed;
+ uint32_t max_tmu_spills;
+
+ uint32_t compile_strategy_idx;
/* The UBO index and block used with the last unifa load, as well as the
* current unifa offset *after* emitting that load. This is used to skip
@@ -715,6 +758,7 @@ struct v3d_compile {
struct qblock *current_unifa_block;
int32_t current_unifa_index;
uint32_t current_unifa_offset;
+ bool current_unifa_is_ubo;
/* State for whether we're executing on each channel currently. 0 if
* yes, otherwise a block number + 1 that the channel jumped to.
@@ -749,6 +793,11 @@ struct v3d_compile {
struct qreg cs_shared_offset;
int local_invocation_index_bits;
+ /* Starting value of the sample mask in a fragment shader. We use
+ * this to identify lanes that have been terminated/discarded.
+ */
+ struct qreg start_msf;
+
/* If the shader uses subgroup functionality */
bool has_subgroups;
@@ -761,14 +810,27 @@ struct v3d_compile {
uint32_t spill_size;
/* Shader-db stats */
uint32_t spills, fills, loops;
+
+ /* Whether we are in the process of spilling registers for
+ * register allocation
+ */
+ bool spilling;
+
/**
* Register spilling's per-thread base address, shared between each
- * spill/fill's addressing calculations.
+ * spill/fill's addressing calculations (also used for scratch
+ * access).
*/
struct qreg spill_base;
+
/* Bit vector of which temps may be spilled */
BITSET_WORD *spillable;
+ /* Used during register allocation */
+ int thread_index;
+ struct v3d_ra_node_info nodes;
+ struct ra_graph *g;
+
/**
* Array of the VARYING_SLOT_* of all FS QFILE_VARY reads.
*
@@ -799,11 +861,16 @@ struct v3d_compile {
uint32_t uniform_array_size;
uint32_t num_uniforms;
uint32_t output_position_index;
- nir_variable *output_color_var[4];
+ nir_variable *output_color_var[V3D_MAX_DRAW_BUFFERS];
uint32_t output_sample_mask_index;
struct qreg undef;
uint32_t num_temps;
+ /* Number of temps in the program right before we spill a new temp. We
+ * use this to know which temps existed before a spill and which were
+ * added with the spill itself.
+ */
+ uint32_t spill_start_num_temps;
struct vir_cursor cursor;
struct list_head blocks;
@@ -848,12 +915,16 @@ struct v3d_compile {
bool emitted_tlb_load;
bool lock_scoreboard_on_first_thrsw;
- /* Total number of spilled registers in the program */
- uint32_t spill_count;
-
enum v3d_compilation_result compilation_result;
bool tmu_dirty_rcl;
+ bool has_global_address;
+
+ /* If we have processed a discard/terminate instruction. This may
+ * cause some lanes to be inactive even during uniform control
+ * flow.
+ */
+ bool emitted_discard;
};
struct v3d_uniform_list {
@@ -866,6 +937,13 @@ struct v3d_prog_data {
struct v3d_uniform_list uniforms;
uint32_t spill_size;
+ uint32_t tmu_spills;
+ uint32_t tmu_fills;
+ uint32_t tmu_count;
+
+ uint32_t qpu_read_stalls;
+
+ uint8_t compile_strategy_idx;
uint8_t threads;
@@ -877,6 +955,8 @@ struct v3d_prog_data {
bool tmu_dirty_rcl;
bool has_control_barrier;
+
+ bool has_global_address;
};
struct v3d_vs_prog_data {
@@ -964,10 +1044,15 @@ struct v3d_fs_prog_data {
uint8_t num_inputs;
bool writes_z;
+ bool writes_z_from_fep;
bool disable_ez;
bool uses_center_w;
bool uses_implicit_point_line_varyings;
bool lock_scoreboard_on_first_thrsw;
+
+ /* If the fragment shader does anything that requires forcing
+ * per-sample MSAA, such as reading gl_SampleID.
+ */
bool force_per_sample_msaa;
};
@@ -998,6 +1083,10 @@ v3d_compute_vpm_config(struct v3d_device_info *devinfo,
struct v3d_gs_prog_data *gs,
struct vpm_config *vpm_cfg_bin,
struct vpm_config *vpm_cfg);
+void
+v3d_pack_unnormalized_coordinates(struct v3d_device_info *devinfo,
+ uint32_t *p1_packed,
+ bool unnormalized_coordinates);
static inline bool
vir_has_uniform(struct qinst *inst)
@@ -1005,7 +1094,8 @@ vir_has_uniform(struct qinst *inst)
return inst->uniform != ~0;
}
-const struct v3d_compiler *v3d_compiler_init(const struct v3d_device_info *devinfo);
+const struct v3d_compiler *v3d_compiler_init(const struct v3d_device_info *devinfo,
+ uint32_t max_inline_uniform_buffers);
void v3d_compiler_free(const struct v3d_compiler *compiler);
void v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s);
@@ -1066,15 +1156,14 @@ bool vir_is_raw_mov(struct qinst *inst);
bool vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst);
bool vir_is_add(struct qinst *inst);
bool vir_is_mul(struct qinst *inst);
-bool vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst);
-bool vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst);
+bool vir_writes_r4_implicitly(const struct v3d_device_info *devinfo, struct qinst *inst);
struct qreg vir_follow_movs(struct v3d_compile *c, struct qreg reg);
uint8_t vir_channels_written(struct qinst *inst);
struct qreg ntq_get_src(struct v3d_compile *c, nir_src src, int i);
-void ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
- struct qreg result);
+void ntq_store_def(struct v3d_compile *c, nir_def *def, int chan,
+ struct qreg result);
bool ntq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components);
-void ntq_add_pending_tmu_flush(struct v3d_compile *c, nir_dest *dest,
+void ntq_add_pending_tmu_flush(struct v3d_compile *c, nir_def *def,
uint32_t component_mask);
void ntq_flush_tmu(struct v3d_compile *c);
void vir_emit_thrsw(struct v3d_compile *c);
@@ -1095,32 +1184,27 @@ bool vir_opt_redundant_flags(struct v3d_compile *c);
bool vir_opt_small_immediates(struct v3d_compile *c);
bool vir_opt_vpm(struct v3d_compile *c);
bool vir_opt_constant_alu(struct v3d_compile *c);
-void v3d_nir_lower_blend(nir_shader *s, struct v3d_compile *c);
-void v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c);
-void v3d_nir_lower_line_smooth(nir_shader *shader);
-void v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c);
-void v3d_nir_lower_robust_buffer_access(nir_shader *shader, struct v3d_compile *c);
-void v3d_nir_lower_scratch(nir_shader *s);
-void v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c);
-void v3d_nir_lower_image_load_store(nir_shader *s);
-void vir_lower_uniforms(struct v3d_compile *c);
-
-void v3d33_vir_vpm_read_setup(struct v3d_compile *c, int num_components);
-void v3d33_vir_vpm_write_setup(struct v3d_compile *c);
-void v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr);
-void v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr);
-void v3d40_vir_emit_image_load_store(struct v3d_compile *c,
- nir_intrinsic_instr *instr);
+bool v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c);
+bool v3d_nir_lower_line_smooth(nir_shader *shader);
+bool v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c);
+bool v3d_nir_lower_scratch(nir_shader *s);
+bool v3d_nir_lower_txf_ms(nir_shader *s);
+bool v3d_nir_lower_image_load_store(nir_shader *s, struct v3d_compile *c);
+bool v3d_nir_lower_load_store_bitsize(nir_shader *s);
+
+void v3d_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr);
+void v3d_vir_emit_image_load_store(struct v3d_compile *c,
+ nir_intrinsic_instr *instr);
void v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers);
uint32_t v3d_qpu_schedule_instructions(struct v3d_compile *c);
void qpu_validate(struct v3d_compile *c);
-struct qpu_reg *v3d_register_allocate(struct v3d_compile *c, bool *spilled);
+struct qpu_reg *v3d_register_allocate(struct v3d_compile *c);
bool vir_init_reg_sets(struct v3d_compiler *compiler);
int v3d_shaderdb_dump(struct v3d_compile *c, char **shaderdb_str);
-bool v3d_gl_format_is_return_32(GLenum format);
+bool v3d_gl_format_is_return_32(enum pipe_format format);
uint32_t
v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src);
@@ -1220,28 +1304,35 @@ vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b) \
#define VIR_SFU(name) \
static inline struct qreg \
vir_##name(struct v3d_compile *c, struct qreg a) \
-{ \
- if (c->devinfo->ver >= 41) { \
- return vir_emit_def(c, vir_add_inst(V3D_QPU_A_##name, \
- c->undef, \
- a, c->undef)); \
- } else { \
- vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_##name), a); \
- return vir_FMOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); \
- } \
+{ \
+ return vir_emit_def(c, vir_add_inst(V3D_QPU_A_##name, \
+ c->undef, \
+ a, c->undef)); \
} \
static inline struct qinst * \
vir_##name##_dest(struct v3d_compile *c, struct qreg dest, \
struct qreg a) \
{ \
- if (c->devinfo->ver >= 41) { \
- return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_##name, \
- dest, \
- a, c->undef)); \
- } else { \
- vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_##name), a); \
- return vir_FMOV_dest(c, dest, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); \
- } \
+ return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_##name, \
+ dest, \
+ a, c->undef)); \
+}
+
+#define VIR_SFU2(name) \
+static inline struct qreg \
+vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b) \
+{ \
+ return vir_emit_def(c, vir_add_inst(V3D_QPU_A_##name, \
+ c->undef, \
+ a, b)); \
+} \
+static inline struct qinst * \
+vir_##name##_dest(struct v3d_compile *c, struct qreg dest, \
+ struct qreg a, struct qreg b) \
+{ \
+ return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_##name, \
+ dest, \
+ a, b)); \
}
#define VIR_A_ALU2(name) VIR_ALU2(name, vir_add_inst, V3D_QPU_A_##name)
@@ -1343,6 +1434,28 @@ VIR_SFU(LOG)
VIR_SFU(SIN)
VIR_SFU(RSQRT2)
+VIR_SFU(BALLOT)
+VIR_SFU(BCASTF)
+VIR_SFU(ALLEQ)
+VIR_SFU(ALLFEQ)
+VIR_SFU2(ROTQ)
+VIR_SFU2(ROT)
+VIR_SFU2(SHUFFLE)
+
+VIR_A_ALU2(VPACK)
+VIR_A_ALU2(V8PACK)
+VIR_A_ALU2(V10PACK)
+VIR_A_ALU2(V11FPACK)
+
+VIR_M_ALU1(FTOUNORM16)
+VIR_M_ALU1(FTOSNORM16)
+
+VIR_M_ALU1(VFTOUNORM8)
+VIR_M_ALU1(VFTOSNORM8)
+
+VIR_M_ALU1(VFTOUNORM10LO)
+VIR_M_ALU1(VFTOUNORM10HI)
+
static inline struct qinst *
vir_MOV_cond(struct v3d_compile *c, enum v3d_qpu_cond cond,
struct qreg dest, struct qreg src)
@@ -1372,16 +1485,11 @@ vir_NOP(struct v3d_compile *c)
static inline struct qreg
vir_LDTMU(struct v3d_compile *c)
{
- if (c->devinfo->ver >= 41) {
- struct qinst *ldtmu = vir_add_inst(V3D_QPU_A_NOP, c->undef,
- c->undef, c->undef);
- ldtmu->qpu.sig.ldtmu = true;
-
- return vir_emit_def(c, ldtmu);
- } else {
- vir_NOP(c)->qpu.sig.ldtmu = true;
- return vir_MOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4));
- }
+ struct qinst *ldtmu = vir_add_inst(V3D_QPU_A_NOP, c->undef,
+ c->undef, c->undef);
+ ldtmu->qpu.sig.ldtmu = true;
+
+ return vir_emit_def(c, ldtmu);
}
static inline struct qreg
@@ -1394,7 +1502,6 @@ vir_UMUL(struct v3d_compile *c, struct qreg src0, struct qreg src1)
static inline struct qreg
vir_TLBU_COLOR_READ(struct v3d_compile *c, uint32_t config)
{
- assert(c->devinfo->ver >= 41); /* XXX */
assert((config & 0xffffff00) == 0xffffff00);
struct qinst *ldtlb = vir_add_inst(V3D_QPU_A_NOP, c->undef,
@@ -1407,38 +1514,12 @@ vir_TLBU_COLOR_READ(struct v3d_compile *c, uint32_t config)
static inline struct qreg
vir_TLB_COLOR_READ(struct v3d_compile *c)
{
- assert(c->devinfo->ver >= 41); /* XXX */
-
struct qinst *ldtlb = vir_add_inst(V3D_QPU_A_NOP, c->undef,
c->undef, c->undef);
ldtlb->qpu.sig.ldtlb = true;
return vir_emit_def(c, ldtlb);
}
-/*
-static inline struct qreg
-vir_LOAD_IMM(struct v3d_compile *c, uint32_t val)
-{
- return vir_emit_def(c, vir_inst(QOP_LOAD_IMM, c->undef,
- vir_reg(QFILE_LOAD_IMM, val), c->undef));
-}
-
-static inline struct qreg
-vir_LOAD_IMM_U2(struct v3d_compile *c, uint32_t val)
-{
- return vir_emit_def(c, vir_inst(QOP_LOAD_IMM_U2, c->undef,
- vir_reg(QFILE_LOAD_IMM, val),
- c->undef));
-}
-static inline struct qreg
-vir_LOAD_IMM_I2(struct v3d_compile *c, uint32_t val)
-{
- return vir_emit_def(c, vir_inst(QOP_LOAD_IMM_I2, c->undef,
- vir_reg(QFILE_LOAD_IMM, val),
- c->undef));
-}
-*/
-
static inline struct qinst *
vir_BRANCH(struct v3d_compile *c, enum v3d_qpu_branch_cond cond)
{
diff --git a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
index 2706432d5ef..9a651bfc6a7 100644
--- a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
+++ b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
@@ -40,9 +40,20 @@
* calculations and load/store using the TMU general memory access path.
*/
+static const unsigned bits_8[4] = {8, 8, 8, 8};
+static const unsigned bits_16[4] = {16, 16, 16, 16};
+static const unsigned bits_1010102[4] = {10, 10, 10, 2};
+
bool
v3d_gl_format_is_return_32(enum pipe_format format)
{
+ /* We can get a NONE format in Vulkan because we support the
+ * shaderStorageImageReadWithoutFormat feature. We consider these to
+ * always use 32-bit precision.
+ */
+ if (format == PIPE_FORMAT_NONE)
+ return true;
+
const struct util_format_description *desc =
util_format_description(format);
const struct util_format_channel_description *chan = &desc->channel[0];
@@ -52,15 +63,17 @@ v3d_gl_format_is_return_32(enum pipe_format format)
/* Packs a 32-bit vector of colors in the range [0, (1 << bits[i]) - 1] to a
* 32-bit SSA value, with as many channels as necessary to store all the bits
+ *
+ * This is the generic helper, implemented using only common nir operations.
*/
-static nir_ssa_def *
-pack_bits(nir_builder *b, nir_ssa_def *color, const unsigned *bits,
+static nir_def *
+pack_bits(nir_builder *b, nir_def *color, const unsigned *bits,
int num_components, bool mask)
{
- nir_ssa_def *results[4];
+ nir_def *results[4];
int offset = 0;
for (int i = 0; i < num_components; i++) {
- nir_ssa_def *chan = nir_channel(b, color, i);
+ nir_def *chan = nir_channel(b, color, i);
/* Channels being stored shouldn't cross a 32-bit boundary. */
assert((offset & ~31) == ((offset + bits[i] - 1) & ~31));
@@ -84,10 +97,187 @@ pack_bits(nir_builder *b, nir_ssa_def *color, const unsigned *bits,
return nir_vec(b, results, DIV_ROUND_UP(offset, 32));
}
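
As a rough illustration of what pack_bits() emits, here is a hypothetical CPU-side C sketch of the same arithmetic, assuming four 8-bit unsigned channels (the signed-channel masking is folded in unconditionally here):

#include <assert.h>
#include <stdint.h>

/* CPU-side sketch of the packing arithmetic: mask each channel to its
 * width, shift it to its running bit offset and OR it into the 32-bit
 * result word.  Channel widths are per-format; RGBA8 is assumed here.
 */
static uint32_t
pack_bits_rgba8_sketch(const uint32_t color[4])
{
        static const unsigned bits[4] = {8, 8, 8, 8};
        uint32_t result = 0;
        unsigned offset = 0;

        for (int i = 0; i < 4; i++) {
                /* Channels being stored shouldn't cross a 32-bit boundary. */
                assert((offset & ~31u) == ((offset + bits[i] - 1) & ~31u));
                result |= (color[i] & ((1u << bits[i]) - 1)) << offset;
                offset += bits[i];
        }

        return result;
}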
-static void
-v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
+/* Utility wrapper: half_2x16_split is mapped to vfpack, and sometimes it is
+ * just easier to read vfpack in the code, especially when using the PRM as
+ * a reference.
+ */
+static inline nir_def *
+nir_vfpack(nir_builder *b, nir_def *p1, nir_def *p2)
+{
+ return nir_pack_half_2x16_split(b, p1, p2);
+}
+
+static inline nir_def *
+pack_11f11f10f(nir_builder *b, nir_def *color)
+{
+ nir_def *p1 = nir_vfpack(b, nir_channel(b, color, 0),
+ nir_channel(b, color, 1));
+ nir_def *undef = nir_undef(b, 1, color->bit_size);
+ nir_def *p2 = nir_vfpack(b, nir_channel(b, color, 2), undef);
+
+ return nir_pack_32_to_r11g11b10_v3d(b, p1, p2);
+}
+
+static inline nir_def *
+pack_r10g10b10a2_uint(nir_builder *b, nir_def *color)
+{
+ nir_def *p1 = nir_pack_2x32_to_2x16_v3d(b, nir_channel(b, color, 0),
+ nir_channel(b, color, 1));
+ nir_def *p2 = nir_pack_2x32_to_2x16_v3d(b, nir_channel(b, color, 2),
+ nir_channel(b, color, 3));
+
+ return nir_pack_uint_32_to_r10g10b10a2_v3d(b, p1, p2);
+}
+
+static inline nir_def *
+pack_r10g10b10a2_unorm(nir_builder *b, nir_def *color)
+{
+ nir_def *p1 = nir_vfpack(b, nir_channel(b, color, 0),
+ nir_channel(b, color, 1));
+ p1 = nir_pack_2x16_to_unorm_2x10_v3d(b, p1);
+
+ nir_def *p2 = nir_vfpack(b, nir_channel(b, color, 2),
+ nir_channel(b, color, 3));
+ p2 = nir_pack_2x16_to_unorm_10_2_v3d(b, p2);
+
+ return nir_pack_uint_32_to_r10g10b10a2_v3d(b, p1, p2);
+}
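
The custom packing opcodes above ultimately target the standard little-endian r10g10b10a2 layout; a hypothetical CPU-side sketch of that final bit layout, assuming channel values already converted to integers in range, is:

#include <stdint.h>

/* Sketch of the r10g10b10a2 layout targeted above: 10 bits each for
 * r/g/b starting at the low bits, and 2 bits for a at the top.
 * Channel values are assumed to already be converted and in range.
 */
static uint32_t
pack_r10g10b10a2_sketch(uint32_t r, uint32_t g, uint32_t b, uint32_t a)
{
        return (r & 0x3ff) |
               ((g & 0x3ff) << 10) |
               ((b & 0x3ff) << 20) |
               ((a & 0x3) << 30);
}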
+
+enum hw_conversion {
+ NONE,
+ TO_SNORM,
+ TO_UNORM
+};
+
+static inline nir_def *
+pack_8bit(nir_builder *b, nir_def *color,
+ unsigned num_components,
+ enum hw_conversion conversion)
+{
+ /* Note that usually you should not use this method (that relies on
+ * custom packing) for 1 component if we are not doing any
+ * conversion. But we also support that case, and let the caller
+ * decide which method to use.
+ */
+ nir_def *p1;
+ nir_def *p2;
+
+ if (conversion == NONE) {
+ p1 = nir_pack_2x32_to_2x16_v3d(b, nir_channel(b, color, 0),
+ nir_channel(b, color, num_components == 1 ? 0 : 1));
+ } else {
+ p1 = nir_vfpack(b, nir_channel(b, color, 0),
+ nir_channel(b, color, num_components == 1 ? 0 : 1));
+ p1 = (conversion == TO_UNORM) ?
+ nir_pack_2x16_to_unorm_2x8_v3d(b, p1) :
+ nir_pack_2x16_to_snorm_2x8_v3d(b, p1);
+ }
+ if (num_components == 4) {
+ if (conversion == NONE) {
+ p2 = nir_pack_2x32_to_2x16_v3d(b, nir_channel(b, color, 2),
+ nir_channel(b, color, 3));
+ } else {
+ p2 = nir_vfpack(b, nir_channel(b, color, 2),
+ nir_channel(b, color, 3));
+ p2 = (conversion == TO_UNORM) ?
+ nir_pack_2x16_to_unorm_2x8_v3d(b, p2) :
+ nir_pack_2x16_to_snorm_2x8_v3d(b, p2);
+ }
+ } else {
+ /* Using an undef here would be more correct. But for this
+ * case we are getting worse shader-db values with some CTS
+ * tests, so we just reuse the first packing.
+ */
+ p2 = p1;
+ }
+
+ return nir_pack_4x16_to_4x8_v3d(b, p1, p2);
+}
+
+static inline nir_def *
+pack_16bit(nir_builder *b, nir_def *color,
+ unsigned num_components,
+ enum hw_conversion conversion)
+{
+ nir_def *results[2] = {0};
+ nir_def *channels[4] = {0};
+
+ for (unsigned i = 0; i < num_components; i++) {
+ channels[i] = nir_channel(b, color, i);
+ switch (conversion) {
+ case TO_SNORM:
+ channels[i] = nir_f2snorm_16_v3d(b, channels[i]);
+ break;
+ case TO_UNORM:
+ channels[i] = nir_f2unorm_16_v3d(b, channels[i]);
+ break;
+ default:
+ /* Note that usually you should not use this method
+ * (that relies on custom packing) if we are not doing
+ * any conversion. But we also support that case, and
+ * let the caller decide which method to use.
+ */
+ break;
+ }
+ }
+
+ switch (num_components) {
+ case 1:
+ results[0] = channels[0];
+ break;
+ case 4:
+ results[1] = nir_pack_2x32_to_2x16_v3d(b, channels[2], channels[3]);
+ FALLTHROUGH;
+ case 2:
+ results[0] = nir_pack_2x32_to_2x16_v3d(b, channels[0], channels[1]);
+ break;
+ default:
+ unreachable("Invalid number of components");
+ }
+
+ return nir_vec(b, results, DIV_ROUND_UP(num_components, 2));
+}
+
+static inline nir_def *
+pack_xbit(nir_builder *b, nir_def *color,
+ unsigned num_components,
+ const struct util_format_channel_description *r_chan)
+{
+ bool pack_mask = (r_chan->type == UTIL_FORMAT_TYPE_SIGNED);
+ enum hw_conversion conversion = NONE;
+ if (r_chan->normalized) {
+ conversion =
+ (r_chan->type == UTIL_FORMAT_TYPE_UNSIGNED) ? TO_UNORM : TO_SNORM;
+ }
+
+ switch (r_chan->size) {
+ case 8:
+ if (conversion == NONE && num_components < 2)
+ return pack_bits(b, color, bits_8, num_components, pack_mask);
+ else
+ return pack_8bit(b, color, num_components, conversion);
+ break;
+ case 16:
+ /* pack_mask implies that the generic packing method would
+ * need to include extra operations to handle negative values,
+ * so in that case, even without a conversion, it is better to
+ * use the custom hw packing operations.
+ */
+ if (conversion == NONE && !pack_mask)
+ return pack_bits(b, color, bits_16, num_components, pack_mask);
+ else
+ return pack_16bit(b, color, num_components, conversion);
+ break;
+ default:
+ unreachable("unrecognized bits");
+ }
+}
+
+static bool
+v3d_nir_lower_image_store_v42(nir_builder *b, nir_intrinsic_instr *instr)
{
enum pipe_format format = nir_intrinsic_format(instr);
+ assert(format != PIPE_FORMAT_NONE);
const struct util_format_description *desc =
util_format_description(format);
const struct util_format_channel_description *r_chan = &desc->channel[0];
@@ -95,10 +285,10 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
b->cursor = nir_before_instr(&instr->instr);
- nir_ssa_def *color = nir_channels(b,
- nir_ssa_for_src(b, instr->src[3], 4),
- (1 << num_components) - 1);
- nir_ssa_def *formatted = NULL;
+ nir_def *color = nir_trim_vector(b,
+ instr->src[3].ssa,
+ num_components);
+ nir_def *formatted = NULL;
if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
formatted = nir_format_pack_11f11f10f(b, color);
@@ -110,9 +300,6 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
*/
formatted = color;
} else {
- static const unsigned bits_8[4] = {8, 8, 8, 8};
- static const unsigned bits_16[4] = {16, 16, 16, 16};
- static const unsigned bits_1010102[4] = {10, 10, 10, 2};
const unsigned *bits;
switch (r_chan->size) {
@@ -132,11 +319,13 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
bool pack_mask = false;
if (r_chan->pure_integer &&
r_chan->type == UTIL_FORMAT_TYPE_SIGNED) {
- formatted = nir_format_clamp_sint(b, color, bits);
+ /* We don't need to do any conversion or clamping in this case */
+ formatted = color;
pack_mask = true;
} else if (r_chan->pure_integer &&
r_chan->type == UTIL_FORMAT_TYPE_UNSIGNED) {
- formatted = nir_format_clamp_uint(b, color, bits);
+ /* We don't need to do any conversion or clamping in this case */
+ formatted = color;
} else if (r_chan->normalized &&
r_chan->type == UTIL_FORMAT_TYPE_SIGNED) {
formatted = nir_format_float_to_snorm(b, color, bits);
@@ -154,75 +343,116 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
pack_mask);
}
- nir_instr_rewrite_src(&instr->instr, &instr->src[3],
- nir_src_for_ssa(formatted));
+ nir_src_rewrite(&instr->src[3], formatted);
instr->num_components = formatted->num_components;
+
+ return true;
}
-static void
+
+static bool
+v3d_nir_lower_image_store_v71(nir_builder *b, nir_intrinsic_instr *instr)
+{
+ enum pipe_format format = nir_intrinsic_format(instr);
+ assert(format != PIPE_FORMAT_NONE);
+ const struct util_format_description *desc =
+ util_format_description(format);
+ const struct util_format_channel_description *r_chan = &desc->channel[0];
+ unsigned num_components = util_format_get_nr_components(format);
+ b->cursor = nir_before_instr(&instr->instr);
+
+ nir_def *color =
+ nir_trim_vector(b, instr->src[3].ssa, num_components);
+ nir_def *formatted = NULL;
+ if (format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
+ formatted = nir_format_pack_r9g9b9e5(b, color);
+ } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
+ formatted = pack_11f11f10f(b, color);
+ } else if (format == PIPE_FORMAT_R10G10B10A2_UINT) {
+ formatted = pack_r10g10b10a2_uint(b, color);
+ } else if (format == PIPE_FORMAT_R10G10B10A2_UNORM) {
+ formatted = pack_r10g10b10a2_unorm(b, color);
+ } else if (r_chan->size == 32) {
+ /* For 32-bit formats, we just have to move the vector
+ * across (possibly reducing the number of channels).
+ */
+ formatted = color;
+ } else if (r_chan->type == UTIL_FORMAT_TYPE_FLOAT) {
+ assert(r_chan->size == 16);
+ formatted = nir_format_float_to_half(b, color);
+ formatted = pack_bits(b, formatted, bits_16, num_components,
+ false);
+ } else {
+ assert(r_chan->size == 8 || r_chan->size == 16);
+ formatted = pack_xbit(b, color, num_components, r_chan);
+ }
+
+ nir_src_rewrite(&instr->src[3], formatted);
+ instr->num_components = formatted->num_components;
+
+ return true;
+}
+
+static bool
v3d_nir_lower_image_load(nir_builder *b, nir_intrinsic_instr *instr)
{
static const unsigned bits16[] = {16, 16, 16, 16};
enum pipe_format format = nir_intrinsic_format(instr);
if (v3d_gl_format_is_return_32(format))
- return;
+ return false;
b->cursor = nir_after_instr(&instr->instr);
- assert(instr->dest.is_ssa);
- nir_ssa_def *result = &instr->dest.ssa;
+ nir_def *result = &instr->def;
if (util_format_is_pure_uint(format)) {
result = nir_format_unpack_uint(b, result, bits16, 4);
} else if (util_format_is_pure_sint(format)) {
result = nir_format_unpack_sint(b, result, bits16, 4);
} else {
- nir_ssa_def *rg = nir_channel(b, result, 0);
- nir_ssa_def *ba = nir_channel(b, result, 1);
- result = nir_vec4(b,
- nir_unpack_half_2x16_split_x(b, rg),
- nir_unpack_half_2x16_split_y(b, rg),
- nir_unpack_half_2x16_split_x(b, ba),
- nir_unpack_half_2x16_split_y(b, ba));
+ nir_def *rg = nir_channel(b, result, 0);
+ nir_def *ba = nir_channel(b, result, 1);
+ result = nir_vec4(b,
+ nir_unpack_half_2x16_split_x(b, rg),
+ nir_unpack_half_2x16_split_y(b, rg),
+ nir_unpack_half_2x16_split_x(b, ba),
+ nir_unpack_half_2x16_split_y(b, ba));
}
- nir_ssa_def_rewrite_uses_after(&instr->dest.ssa, result,
+ nir_def_rewrite_uses_after(&instr->def, result,
result->parent_instr);
+
+ return true;
}
-void
-v3d_nir_lower_image_load_store(nir_shader *s)
+static bool
+v3d_nir_lower_image_load_store_cb(nir_builder *b,
+ nir_intrinsic_instr *intr,
+ void *_state)
{
- nir_foreach_function(function, s) {
- if (!function->impl)
- continue;
-
- nir_builder b;
- nir_builder_init(&b, function->impl);
-
- nir_foreach_block(block, function->impl) {
- nir_foreach_instr_safe(instr, block) {
- if (instr->type != nir_instr_type_intrinsic)
- continue;
-
- nir_intrinsic_instr *intr =
- nir_instr_as_intrinsic(instr);
-
- switch (intr->intrinsic) {
- case nir_intrinsic_image_load:
- v3d_nir_lower_image_load(&b, intr);
- break;
- case nir_intrinsic_image_store:
- v3d_nir_lower_image_store(&b, intr);
- break;
- default:
- break;
- }
- }
- }
+ struct v3d_compile *c = (struct v3d_compile *) _state;
- nir_metadata_preserve(function->impl,
- nir_metadata_block_index |
- nir_metadata_dominance);
+ switch (intr->intrinsic) {
+ case nir_intrinsic_image_load:
+ return v3d_nir_lower_image_load(b, intr);
+ case nir_intrinsic_image_store:
+ if (c->devinfo->ver >= 71)
+ return v3d_nir_lower_image_store_v71(b, intr);
+ else
+ return v3d_nir_lower_image_store_v42(b, intr);
+ break;
+ default:
+ return false;
}
+
+ return false;
+}
+
+bool
+v3d_nir_lower_image_load_store(nir_shader *s, struct v3d_compile *c)
+{
+ return nir_shader_intrinsics_pass(s,
+ v3d_nir_lower_image_load_store_cb,
+ nir_metadata_block_index |
+ nir_metadata_dominance, c);
}
diff --git a/src/broadcom/compiler/v3d_nir_lower_io.c b/src/broadcom/compiler/v3d_nir_lower_io.c
index 895b1a39163..55e2e4f2e11 100644
--- a/src/broadcom/compiler/v3d_nir_lower_io.c
+++ b/src/broadcom/compiler/v3d_nir_lower_io.c
@@ -24,8 +24,6 @@
#include "compiler/v3d_compiler.h"
#include "compiler/nir/nir_builder.h"
-#include "util/u_helpers.h"
-
/**
* Walks the NIR generated by TGSI-to-NIR or GLSL-to-NIR to lower its io
* intrinsics into something amenable to the V3D architecture.
@@ -64,7 +62,7 @@ struct v3d_nir_lower_io_state {
BITSET_WORD varyings_stored[BITSET_WORDS(V3D_MAX_ANY_STAGE_INPUTS)];
- nir_ssa_def *pos[4];
+ nir_def *pos[4];
};
static void
@@ -72,8 +70,8 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
struct v3d_nir_lower_io_state *state);
static void
-v3d_nir_store_output(nir_builder *b, int base, nir_ssa_def *offset,
- nir_ssa_def *chan)
+v3d_nir_store_output(nir_builder *b, int base, nir_def *offset,
+ nir_def *chan)
{
if (offset) {
/* When generating the VIR instruction, the base and the offset
@@ -90,29 +88,6 @@ v3d_nir_store_output(nir_builder *b, int base, nir_ssa_def *offset,
nir_store_output(b, chan, offset, .base = base, .write_mask = 0x1, .component = 0);
}
-/* Convert the uniform offset to bytes. If it happens to be a constant,
- * constant-folding will clean up the shift for us.
- */
-static void
-v3d_nir_lower_uniform(struct v3d_compile *c, nir_builder *b,
- nir_intrinsic_instr *intr)
-{
- /* On SPIR-V/Vulkan we are already getting our offsets in
- * bytes.
- */
- if (c->key->environment == V3D_ENVIRONMENT_VULKAN)
- return;
-
- b->cursor = nir_before_instr(&intr->instr);
-
- nir_intrinsic_set_base(intr, nir_intrinsic_base(intr) * 16);
-
- nir_instr_rewrite_src(&intr->instr,
- &intr->src[0],
- nir_src_for_ssa(nir_ishl(b, intr->src[0].ssa,
- nir_imm_int(b, 4))));
-}
-
static int
v3d_varying_slot_vpm_offset(struct v3d_compile *c, unsigned location, unsigned component)
{
@@ -159,14 +134,13 @@ v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b,
/* If this is a geometry shader we need to emit our outputs
* to the current vertex offset in the VPM.
*/
- nir_ssa_def *offset_reg =
+ nir_def *offset_reg =
c->s->info.stage == MESA_SHADER_GEOMETRY ?
nir_load_var(b, state->gs.output_offset_var) : NULL;
int start_comp = nir_intrinsic_component(intr);
unsigned location = nir_intrinsic_io_semantics(intr).location;
- nir_ssa_def *src = nir_ssa_for_src(b, intr->src[0],
- intr->num_components);
+ nir_def *src = intr->src[0].ssa;
/* Save off the components of the position for the setup of VPM inputs
* read by fixed function HW.
*/
@@ -184,8 +158,8 @@ v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b,
if (location == VARYING_SLOT_LAYER) {
assert(c->s->info.stage == MESA_SHADER_GEOMETRY);
- nir_ssa_def *header = nir_load_var(b, state->gs.header_var);
- header = nir_iand(b, header, nir_imm_int(b, 0xff00ffff));
+ nir_def *header = nir_load_var(b, state->gs.header_var);
+ header = nir_iand_imm(b, header, 0xff00ffff);
/* From the GLES 3.2 spec:
*
@@ -205,24 +179,26 @@ v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b,
* to 0 in that case (we always allocate tile state for at
* least one layer).
*/
- nir_ssa_def *fb_layers = nir_load_fb_layers_v3d(b, 32);
- nir_ssa_def *cond = nir_ige(b, src, fb_layers);
- nir_ssa_def *layer_id =
+ nir_def *fb_layers = nir_load_fb_layers_v3d(b, 32);
+ nir_def *cond = nir_ige(b, src, fb_layers);
+ nir_def *layer_id =
nir_bcsel(b, cond,
nir_imm_int(b, 0),
- nir_ishl(b, src, nir_imm_int(b, 16)));
+ nir_ishl_imm(b, src, 16));
header = nir_ior(b, header, layer_id);
nir_store_var(b, state->gs.header_var, header, 0x1);
}
/* Scalarize outputs if it hasn't happened already, since we want to
- * schedule each VPM write individually. We can skip any outut
+ * schedule each VPM write individually. We can skip any output
* components not read by the FS.
*/
for (int i = 0; i < intr->num_components; i++) {
int vpm_offset =
v3d_varying_slot_vpm_offset(c, location, start_comp + i);
+ if (!(nir_intrinsic_write_mask(intr) & (1 << i)))
+ continue;
if (vpm_offset == -1)
continue;
@@ -261,9 +237,9 @@ v3d_nir_lower_emit_vertex(struct v3d_compile *c, nir_builder *b,
{
b->cursor = nir_before_instr(&instr->instr);
- nir_ssa_def *header = nir_load_var(b, state->gs.header_var);
- nir_ssa_def *header_offset = nir_load_var(b, state->gs.header_offset_var);
- nir_ssa_def *output_offset = nir_load_var(b, state->gs.output_offset_var);
+ nir_def *header = nir_load_var(b, state->gs.header_var);
+ nir_def *header_offset = nir_load_var(b, state->gs.header_offset_var);
+ nir_def *output_offset = nir_load_var(b, state->gs.output_offset_var);
/* Emit fixed function outputs */
v3d_nir_emit_ff_vpm_outputs(c, b, state);
@@ -273,13 +249,13 @@ v3d_nir_lower_emit_vertex(struct v3d_compile *c, nir_builder *b,
/* Update VPM offset for next vertex output data and header */
output_offset =
- nir_iadd(b, output_offset,
- nir_imm_int(b, state->gs.output_vertex_data_size));
+ nir_iadd_imm(b, output_offset,
+ state->gs.output_vertex_data_size);
- header_offset = nir_iadd(b, header_offset, nir_imm_int(b, 1));
+ header_offset = nir_iadd_imm(b, header_offset, 1);
/* Reset the New Primitive bit */
- header = nir_iand(b, header, nir_imm_int(b, 0xfffffffe));
+ header = nir_iand_imm(b, header, 0xfffffffe);
nir_store_var(b, state->gs.output_offset_var, output_offset, 0x1);
nir_store_var(b, state->gs.header_offset_var, header_offset, 0x1);
@@ -304,7 +280,7 @@ v3d_nir_lower_end_primitive(struct v3d_compile *c, nir_builder *b,
* doesn't provide means to do that, so we need to apply the swizzle in the
* vertex shader.
*
- * This is required at least in Vulkan to support madatory vertex attribute
+ * This is required at least in Vulkan to support mandatory vertex attribute
* format VK_FORMAT_B8G8R8A8_UNORM.
*/
static void
@@ -327,59 +303,6 @@ v3d_nir_lower_vertex_input(struct v3d_compile *c, nir_builder *b,
nir_intrinsic_set_component(instr, (comp + 2) % 4);
}
-/* Sometimes the origin of gl_PointCoord is in the upper left rather than the
- * lower left so we need to flip it.
- *
- * This is needed for Vulkan, Gallium uses lower_wpos_pntc.
- */
-static void
-v3d_nir_lower_fragment_input(struct v3d_compile *c, nir_builder *b,
- nir_intrinsic_instr *intr)
-{
- assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
-
- /* Gallium uses lower_wpos_pntc */
- if (c->key->environment == V3D_ENVIRONMENT_OPENGL)
- return;
-
- b->cursor = nir_after_instr(&intr->instr);
-
- int comp = nir_intrinsic_component(intr);
-
- nir_variable *input_var =
- nir_find_variable_with_driver_location(c->s,
- nir_var_shader_in,
- nir_intrinsic_base(intr));
-
- if (input_var && util_varying_is_point_coord(input_var->data.location,
- c->fs_key->point_sprite_mask)) {
- assert(intr->num_components == 1);
-
- nir_ssa_def *result = &intr->dest.ssa;
-
- switch (comp) {
- case 0:
- case 1:
- if (!c->fs_key->is_points)
- result = nir_imm_float(b, 0.0);
- break;
- case 2:
- result = nir_imm_float(b, 0.0);
- break;
- case 3:
- result = nir_imm_float(b, 1.0);
- break;
- }
- if (c->fs_key->point_coord_upper_left && comp == 1)
- result = nir_fsub(b, nir_imm_float(b, 1.0), result);
- if (result != &intr->dest.ssa) {
- nir_ssa_def_rewrite_uses_after(&intr->dest.ssa,
- result,
- result->parent_instr);
- }
- }
-}
-
static void
v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b,
struct nir_instr *instr,
@@ -393,12 +316,6 @@ v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b,
case nir_intrinsic_load_input:
if (c->s->info.stage == MESA_SHADER_VERTEX)
v3d_nir_lower_vertex_input(c, b, intr);
- else if (c->s->info.stage == MESA_SHADER_FRAGMENT)
- v3d_nir_lower_fragment_input(c, b, intr);
- break;
-
- case nir_intrinsic_load_uniform:
- v3d_nir_lower_uniform(c, b, intr);
break;
case nir_intrinsic_store_output:
@@ -558,16 +475,16 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
/* If this is a geometry shader we need to emit our fixed function
* outputs to the current vertex offset in the VPM.
*/
- nir_ssa_def *offset_reg =
+ nir_def *offset_reg =
c->s->info.stage == MESA_SHADER_GEOMETRY ?
nir_load_var(b, state->gs.output_offset_var) : NULL;
for (int i = 0; i < 4; i++) {
if (!state->pos[i])
- state->pos[i] = nir_ssa_undef(b, 1, 32);
+ state->pos[i] = nir_undef(b, 1, 32);
}
- nir_ssa_def *rcp_wc = nir_frcp(b, state->pos[3]);
+ nir_def *rcp_wc = nir_frcp(b, state->pos[3]);
if (state->pos_vpm_offset != -1) {
for (int i = 0; i < 4; i++) {
@@ -578,8 +495,8 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
if (state->vp_vpm_offset != -1) {
for (int i = 0; i < 2; i++) {
- nir_ssa_def *pos;
- nir_ssa_def *scale;
+ nir_def *pos;
+ nir_def *scale;
pos = state->pos[i];
if (i == 0)
scale = nir_load_viewport_x_scale(b);
@@ -598,14 +515,18 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
* The correct fix for this as recommended by Broadcom
* is to convert to .8 fixed-point with ffloor().
*/
- pos = nir_f2i32(b, nir_ffloor(b, pos));
- v3d_nir_store_output(b, state->vp_vpm_offset + i,
- offset_reg, pos);
+ if (c->devinfo->ver == 42)
+ pos = nir_f2i32(b, nir_ffloor(b, pos));
+ else
+ pos = nir_f2i32(b, nir_fround_even(b, pos));
+
+ v3d_nir_store_output(b, state->vp_vpm_offset + i,
+ offset_reg, pos);
}
}
if (state->zs_vpm_offset != -1) {
- nir_ssa_def *z = state->pos[2];
+ nir_def *z = state->pos[2];
z = nir_fmul(b, z, nir_load_viewport_z_scale(b));
z = nir_fmul(b, z, rcp_wc);
z = nir_fadd(b, z, nir_load_viewport_z_offset(b));
@@ -679,21 +600,22 @@ emit_gs_vpm_output_header_prolog(struct v3d_compile *c, nir_builder *b,
* have a variable just to keep track of the number of vertices we
* emitted and instead we can just compute it here from the header
* offset variable by removing the one generic header slot that always
- * goes at the begining of out header.
+ * goes at the beginning of out header.
*/
- nir_ssa_def *header_offset =
+ nir_def *header_offset =
nir_load_var(b, state->gs.header_offset_var);
- nir_ssa_def *vertex_count =
- nir_isub(b, header_offset, nir_imm_int(b, 1));
- nir_ssa_def *header =
- nir_ior(b, nir_imm_int(b, state->gs.output_header_size),
- nir_ishl(b, vertex_count,
- nir_imm_int(b, VERTEX_COUNT_OFFSET)));
+ nir_def *vertex_count =
+ nir_iadd_imm(b, header_offset, -1);
+ nir_def *header =
+ nir_ior_imm(b,
+ nir_ishl_imm(b, vertex_count,
+ VERTEX_COUNT_OFFSET),
+ state->gs.output_header_size);
v3d_nir_store_output(b, 0, NULL, header);
}
-void
+bool
v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c)
{
struct v3d_nir_lower_io_state state = { 0 };
@@ -713,36 +635,39 @@ v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c)
unreachable("Unsupported shader stage");
}
- nir_foreach_function(function, s) {
- if (function->impl) {
- nir_builder b;
- nir_builder_init(&b, function->impl);
-
- if (c->s->info.stage == MESA_SHADER_GEOMETRY)
- emit_gs_prolog(c, &b, function->impl, &state);
-
- nir_foreach_block(block, function->impl) {
- nir_foreach_instr_safe(instr, block)
- v3d_nir_lower_io_instr(c, &b, instr,
- &state);
- }
-
- nir_block *last = nir_impl_last_block(function->impl);
- b.cursor = nir_after_block(last);
- if (s->info.stage == MESA_SHADER_VERTEX) {
- v3d_nir_emit_ff_vpm_outputs(c, &b, &state);
- } else if (s->info.stage == MESA_SHADER_GEOMETRY) {
- emit_gs_vpm_output_header_prolog(c, &b, &state);
- }
-
- nir_metadata_preserve(function->impl,
- nir_metadata_block_index |
- nir_metadata_dominance);
+ nir_foreach_function_impl(impl, s) {
+ nir_builder b = nir_builder_create(impl);
+
+ if (c->s->info.stage == MESA_SHADER_GEOMETRY)
+ emit_gs_prolog(c, &b, impl, &state);
+
+ nir_foreach_block(block, impl) {
+ nir_foreach_instr_safe(instr, block)
+ v3d_nir_lower_io_instr(c, &b, instr,
+ &state);
}
+
+ nir_block *last = nir_impl_last_block(impl);
+ b.cursor = nir_after_block(last);
+ if (s->info.stage == MESA_SHADER_VERTEX) {
+ v3d_nir_emit_ff_vpm_outputs(c, &b, &state);
+ } else if (s->info.stage == MESA_SHADER_GEOMETRY) {
+ emit_gs_vpm_output_header_prolog(c, &b, &state);
+ }
+
+ nir_metadata_preserve(impl,
+ nir_metadata_block_index |
+ nir_metadata_dominance);
}
if (s->info.stage == MESA_SHADER_VERTEX ||
s->info.stage == MESA_SHADER_GEOMETRY) {
v3d_nir_lower_io_update_output_var_base(c, &state);
}
+
+ /* It is really unlikely that we don't make progress here, and fully
+ * filtering the cases where we don't would make the code more complex,
+ * but we are still interested in running this lowering through NIR_PASS.
+ */
+ return true;
}
diff --git a/src/broadcom/compiler/v3d_nir_lower_line_smooth.c b/src/broadcom/compiler/v3d_nir_lower_line_smooth.c
index 8f6e7d4e648..05b5224bc52 100644
--- a/src/broadcom/compiler/v3d_nir_lower_line_smooth.c
+++ b/src/broadcom/compiler/v3d_nir_lower_line_smooth.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2020 Raspberry Pi
+ * Copyright © 2020 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -42,25 +42,23 @@ lower_line_smooth_intrinsic(struct lower_line_smooth_state *state,
{
b->cursor = nir_before_instr(&intr->instr);
- nir_ssa_def *one = nir_imm_float(b, 1.0f);
+ nir_def *one = nir_imm_float(b, 1.0f);
- nir_ssa_def *coverage = nir_load_var(b, state->coverage);
+ nir_def *coverage = nir_load_var(b, state->coverage);
- nir_ssa_def *new_val = nir_fmul(b, nir_vec4(b, one, one, one, coverage),
+ nir_def *new_val = nir_fmul(b, nir_vec4(b, one, one, one, coverage),
intr->src[0].ssa);
- nir_instr_rewrite_src(&intr->instr,
- &intr->src[0],
- nir_src_for_ssa(new_val));
+ nir_src_rewrite(&intr->src[0], new_val);
}
-static void
+static bool
lower_line_smooth_func(struct lower_line_smooth_state *state,
nir_function_impl *impl)
{
- nir_builder b;
+ bool progress = false;
- nir_builder_init(&b, impl);
+ nir_builder b = nir_builder_create(impl);
nir_foreach_block(block, impl) {
nir_foreach_instr_safe(instr, block) {
@@ -72,58 +70,66 @@ lower_line_smooth_func(struct lower_line_smooth_state *state,
if (intr->intrinsic != nir_intrinsic_store_output ||
nir_intrinsic_base(intr) != 0 ||
- intr->num_components != 4 ||
- !intr->src[0].is_ssa)
+ intr->num_components != 4)
continue;
lower_line_smooth_intrinsic(state, &b, intr);
+ progress = true;
}
}
+
+ return progress;
}
static void
initialise_coverage_var(struct lower_line_smooth_state *state,
nir_function_impl *impl)
{
- nir_builder b;
-
- nir_builder_init(&b, impl);
+ nir_builder b = nir_builder_at(nir_before_impl(impl));
- b.cursor = nir_before_block(nir_start_block(impl));
+ nir_def *line_width = nir_load_line_width(&b);
- nir_ssa_def *line_width = nir_load_line_width(&b);
+ nir_def *real_line_width = nir_load_aa_line_width(&b);
- nir_ssa_def *real_line_width = nir_load_aa_line_width(&b);
-
- /* The line coord varies from 0.0 to 1.0 across the width of the line */
- nir_ssa_def *line_coord = nir_load_line_coord(&b);
+ /* According to the PRM, the line coord varies from 0.0 to 1.0 across
+ * the width of the line. But actually, when a perspective projection
+ * is used, it is also applied to the line coords, so the values end
+ * up being between [min_coord, 1], based on the Wc coordinate. We
+ * need to re-map the values to be between [0.0, 1.0].
+ */
+ nir_def *line_coord = nir_load_line_coord(&b);
+ nir_def *wc = nir_load_fep_w_v3d(&b, 32);
+ nir_def *min_coord_val = nir_fsub(&b, nir_imm_float(&b, 1.0f), wc);
+ nir_def *normalized_line_coord = nir_fdiv(&b,
+ nir_fsub(&b, line_coord, min_coord_val),
+ nir_fsub_imm(&b, 1.0, min_coord_val));
/* fabs(line_coord - 0.5) * real_line_width */
- nir_ssa_def *pixels_from_center =
+ nir_def *pixels_from_center =
nir_fmul(&b, real_line_width,
- nir_fabs(&b, nir_fsub(&b, line_coord,
+ nir_fabs(&b, nir_fsub(&b, normalized_line_coord,
nir_imm_float(&b, 0.5f))));
/* 0.5 - 1/√2 * (pixels_from_center - line_width * 0.5) */
- nir_ssa_def *coverage =
+ nir_def *coverage =
nir_fsub(&b,
nir_imm_float(&b, 0.5f),
nir_fmul(&b,
nir_imm_float(&b, 1.0f / M_SQRT2),
nir_fsub(&b, pixels_from_center,
- nir_fmul(&b,
- line_width,
- nir_imm_float(&b, 0.5f)))));
+ nir_fmul_imm(&b,
+ line_width,
+ 0.5f))));
/* Discard fragments that aren’t covered at all by the line */
- nir_ssa_def *outside = nir_fge(&b, nir_imm_float(&b, 0.0f), coverage);
+ nir_def *outside = nir_fle_imm(&b, coverage, 0.0f);
nir_discard_if(&b, outside);
/* Clamp to at most 1.0. If it was less than 0.0 then the fragment will
* be discarded so we don’t need to handle that.
*/
- nir_ssa_def *clamped = nir_fmin(&b, coverage, nir_imm_float(&b, 1.0f));
+ nir_def *clamped = nir_fmin(&b, coverage, nir_imm_float(&b, 1.0f));
nir_store_var(&b, state->coverage, clamped, 0x1 /* writemask */);
}
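
A scalar C sketch of the coverage math built above, combining the [0.0, 1.0] remap with the distance-based coverage formula (assuming a fragment with coverage <= 0 is discarded before the clamp), could look like:

#include <math.h>

/* Scalar sketch of the coverage math above.  line_coord and wc are the
 * per-fragment line coordinate and Wc value, line_width and
 * real_line_width match the loaded system values.  A result <= 0 means
 * the fragment is discarded; otherwise it is clamped to at most 1.0.
 */
static float
line_smooth_coverage(float line_coord, float wc,
                     float line_width, float real_line_width)
{
        /* Re-map the perspective-projected line coord back to [0.0, 1.0]. */
        float min_coord = 1.0f - wc;
        float normalized = (line_coord - min_coord) / (1.0f - min_coord);

        float pixels_from_center =
                real_line_width * fabsf(normalized - 0.5f);
        float coverage = 0.5f -
                (float)M_SQRT1_2 * (pixels_from_center - line_width * 0.5f);

        return coverage < 1.0f ? coverage : 1.0f;
}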
@@ -140,9 +146,11 @@ make_coverage_var(nir_shader *s)
return var;
}
-void
+bool
v3d_nir_lower_line_smooth(nir_shader *s)
{
+ bool progress = false;
+
assert(s->info.stage == MESA_SHADER_FRAGMENT);
struct lower_line_smooth_state state = {
@@ -150,10 +158,20 @@ v3d_nir_lower_line_smooth(nir_shader *s)
.coverage = make_coverage_var(s),
};
- nir_foreach_function(function, s) {
+ nir_foreach_function_with_impl(function, impl, s) {
if (function->is_entrypoint)
- initialise_coverage_var(&state, function->impl);
+ initialise_coverage_var(&state, impl);
+
+ progress |= lower_line_smooth_func(&state, impl);
- lower_line_smooth_func(&state, function->impl);
+ if (progress) {
+ nir_metadata_preserve(impl,
+ nir_metadata_block_index |
+ nir_metadata_dominance);
+ } else {
+ nir_metadata_preserve(impl, nir_metadata_all);
+ }
}
+
+ return progress;
}
diff --git a/src/broadcom/compiler/v3d_nir_lower_load_store_bitsize.c b/src/broadcom/compiler/v3d_nir_lower_load_store_bitsize.c
new file mode 100644
index 00000000000..0caf5dbc92c
--- /dev/null
+++ b/src/broadcom/compiler/v3d_nir_lower_load_store_bitsize.c
@@ -0,0 +1,260 @@
+/*
+ * Copyright © 2021 Raspberry Pi Ltd
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "compiler/v3d_compiler.h"
+#include "compiler/nir/nir_builder.h"
+
+/**
+ * The V3D TMU can only do general vector access for 32-bit types, so for
+ * any other bit size we need to split vector load/store instructions into
+ * scalar ones.
+ *
+ * Note that a vectorization pass after this lowering may be able to
+ * re-vectorize some of these using 32-bit load/store instructions instead,
+ * which we do support.
+ */
+
+static int
+value_src(nir_intrinsic_op intrinsic)
+{
+ switch (intrinsic) {
+ case nir_intrinsic_store_ssbo:
+ case nir_intrinsic_store_scratch:
+ case nir_intrinsic_store_global_2x32:
+ return 0;
+ default:
+ unreachable("Unsupported intrinsic");
+ }
+}
+
+static int
+offset_src(nir_intrinsic_op intrinsic)
+{
+ switch (intrinsic) {
+ case nir_intrinsic_load_uniform:
+ case nir_intrinsic_load_shared:
+ case nir_intrinsic_load_scratch:
+ case nir_intrinsic_load_global_2x32:
+ return 0;
+ case nir_intrinsic_load_ubo:
+ case nir_intrinsic_load_ssbo:
+ case nir_intrinsic_store_scratch:
+ case nir_intrinsic_store_global_2x32:
+ return 1;
+ case nir_intrinsic_store_ssbo:
+ return 2;
+ default:
+ unreachable("Unsupported intrinsic");
+ }
+}
+
+static nir_intrinsic_instr *
+init_scalar_intrinsic(nir_builder *b,
+ nir_intrinsic_instr *intr,
+ uint32_t component,
+ nir_def *offset,
+ uint32_t bit_size,
+ nir_def **scalar_offset)
+{
+
+ nir_intrinsic_instr *new_intr =
+ nir_intrinsic_instr_create(b->shader, intr->intrinsic);
+
+ nir_intrinsic_copy_const_indices(new_intr, intr);
+
+ const int offset_units = bit_size / 8;
+ assert(offset_units >= 1);
+
+ if (nir_intrinsic_has_align_mul(intr)) {
+ assert(nir_intrinsic_has_align_offset(intr));
+ unsigned align_mul = nir_intrinsic_align_mul(intr);
+ unsigned align_off = nir_intrinsic_align_offset(intr);
+
+ align_off += offset_units * component;
+ align_off = align_off % align_mul;
+
+ nir_intrinsic_set_align(new_intr, align_mul, align_off);
+ }
+
+ *scalar_offset = offset;
+ unsigned offset_adj = offset_units * component;
+ if (nir_intrinsic_has_base(intr)) {
+ nir_intrinsic_set_base(
+ new_intr, nir_intrinsic_base(intr) + offset_adj);
+ } else {
+ *scalar_offset =
+ nir_iadd(b, offset,
+ nir_imm_intN_t(b, offset_adj,
+ offset->bit_size));
+ }
+
+ new_intr->num_components = 1;
+
+ return new_intr;
+}
+
+static bool
+lower_load_bitsize(nir_builder *b,
+ nir_intrinsic_instr *intr)
+{
+ uint32_t bit_size = intr->def.bit_size;
+ if (bit_size == 32)
+ return false;
+
+ /* No need to split if it is already scalar */
+ int num_comp = nir_intrinsic_dest_components(intr);
+ if (num_comp <= 1)
+ return false;
+
+ b->cursor = nir_before_instr(&intr->instr);
+
+ /* For global 2x32 we ignore Y component because it must be zero */
+ unsigned offset_idx = offset_src(intr->intrinsic);
+ nir_def *offset = nir_trim_vector(b, intr->src[offset_idx].ssa, 1);
+
+ /* Split the vector load into multiple scalar loads */
+ nir_def *dest_components[4] = { NULL };
+ const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
+ for (int component = 0; component < num_comp; component++) {
+ nir_def *scalar_offset;
+ nir_intrinsic_instr *new_intr =
+ init_scalar_intrinsic(b, intr, component, offset,
+ bit_size, &scalar_offset);
+
+ for (unsigned i = 0; i < info->num_srcs; i++) {
+ if (i == offset_idx) {
+ nir_def *final_offset;
+ final_offset = intr->intrinsic != nir_intrinsic_load_global_2x32 ?
+ scalar_offset :
+ nir_vec2(b, scalar_offset,
+ nir_imm_int(b, 0));
+ new_intr->src[i] = nir_src_for_ssa(final_offset);
+ } else {
+ new_intr->src[i] = intr->src[i];
+ }
+ }
+
+ nir_def_init(&new_intr->instr, &new_intr->def, 1,
+ bit_size);
+ dest_components[component] = &new_intr->def;
+
+ nir_builder_instr_insert(b, &new_intr->instr);
+ }
+
+ nir_def *new_dst = nir_vec(b, dest_components, num_comp);
+ nir_def_rewrite_uses(&intr->def, new_dst);
+
+ nir_instr_remove(&intr->instr);
+ return true;
+}
+
+static bool
+lower_store_bitsize(nir_builder *b,
+ nir_intrinsic_instr *intr)
+{
+ /* No need to split if it is already scalar */
+ int value_idx = value_src(intr->intrinsic);
+ int num_comp = nir_intrinsic_src_components(intr, value_idx);
+ if (num_comp <= 1)
+ return false;
+
+ /* No need to split if it is 32-bit */
+ if (nir_src_bit_size(intr->src[value_idx]) == 32)
+ return false;
+
+ nir_def *value = intr->src[value_idx].ssa;
+
+ b->cursor = nir_before_instr(&intr->instr);
+
+ /* For global 2x32 we ignore Y component because it must be zero */
+ unsigned offset_idx = offset_src(intr->intrinsic);
+ nir_def *offset = nir_trim_vector(b, intr->src[offset_idx].ssa, 1);
+
+ /* Split the vector store into multiple scalar stores */
+ const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
+ unsigned wrmask = nir_intrinsic_write_mask(intr);
+ while (wrmask) {
+ unsigned component = ffs(wrmask) - 1;
+
+ nir_def *scalar_offset;
+ nir_intrinsic_instr *new_intr =
+ init_scalar_intrinsic(b, intr, component, offset,
+ value->bit_size, &scalar_offset);
+
+ nir_intrinsic_set_write_mask(new_intr, 0x1);
+
+ for (unsigned i = 0; i < info->num_srcs; i++) {
+ if (i == value_idx) {
+ nir_def *scalar_value =
+ nir_channels(b, value, 1 << component);
+ new_intr->src[i] = nir_src_for_ssa(scalar_value);
+ } else if (i == offset_idx) {
+ nir_def *final_offset;
+ final_offset = intr->intrinsic != nir_intrinsic_store_global_2x32 ?
+ scalar_offset :
+ nir_vec2(b, scalar_offset,
+ nir_imm_int(b, 0));
+ new_intr->src[i] = nir_src_for_ssa(final_offset);
+ } else {
+ new_intr->src[i] = intr->src[i];
+ }
+ }
+
+ nir_builder_instr_insert(b, &new_intr->instr);
+
+ wrmask &= ~(1 << component);
+ }
+
+ nir_instr_remove(&intr->instr);
+ return true;
+}
+
+static bool
+lower_load_store_bitsize(nir_builder *b, nir_intrinsic_instr *intr,
+ void *data)
+{
+ switch (intr->intrinsic) {
+ case nir_intrinsic_load_ssbo:
+ case nir_intrinsic_load_ubo:
+ case nir_intrinsic_load_uniform:
+ case nir_intrinsic_load_scratch:
+ case nir_intrinsic_load_global_2x32:
+ return lower_load_bitsize(b, intr);
+
+ case nir_intrinsic_store_ssbo:
+ case nir_intrinsic_store_scratch:
+ case nir_intrinsic_store_global_2x32:
+ return lower_store_bitsize(b, intr);
+
+ default:
+ return false;
+ }
+}
+
+bool
+v3d_nir_lower_load_store_bitsize(nir_shader *s)
+{
+ return nir_shader_intrinsics_pass(s, lower_load_store_bitsize,
+ nir_metadata_block_index |
+ nir_metadata_dominance,
+ NULL);
+}
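The heart of the new pass is the per-component offset and alignment bookkeeping done in init_scalar_intrinsic(). Here is a rough standalone sketch of that arithmetic; all names below are hypothetical, since the real pass manipulates NIR intrinsic sources and indices rather than plain integers:

#include <stdint.h>
#include <stdio.h>

/* Per-component adjustment when a non-32-bit vector access is split into
 * scalars: component i of a bit_size access lands (bit_size / 8) * i bytes
 * past the original offset, and the alignment offset advances by the same
 * amount modulo align_mul. Illustrative sketch only. */
static void
split_component(uint32_t offset, uint32_t bit_size, uint32_t component,
                uint32_t align_mul, uint32_t align_offset,
                uint32_t *scalar_offset, uint32_t *scalar_align_offset)
{
        const uint32_t offset_units = bit_size / 8;
        const uint32_t offset_adj = offset_units * component;

        *scalar_offset = offset + offset_adj;
        *scalar_align_offset = (align_offset + offset_adj) % align_mul;
}

int
main(void)
{
        /* A vec4 16-bit store at offset 8 becomes four scalar stores at
         * offsets 8, 10, 12 and 14. */
        for (uint32_t c = 0; c < 4; c++) {
                uint32_t off, aoff;
                split_component(8, 16, c, 4, 0, &off, &aoff);
                printf("component %u -> offset %u, align_offset %u\n",
                       c, off, aoff);
        }
        return 0;
}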
diff --git a/src/broadcom/compiler/v3d_nir_lower_logic_ops.c b/src/broadcom/compiler/v3d_nir_lower_logic_ops.c
index 11782c7348f..4affb79a7e2 100644
--- a/src/broadcom/compiler/v3d_nir_lower_logic_ops.c
+++ b/src/broadcom/compiler/v3d_nir_lower_logic_ops.c
@@ -36,8 +36,8 @@
#include "v3d_compiler.h"
-typedef nir_ssa_def *(*nir_pack_func)(nir_builder *b, nir_ssa_def *c);
-typedef nir_ssa_def *(*nir_unpack_func)(nir_builder *b, nir_ssa_def *c);
+typedef nir_def *(*nir_pack_func)(nir_builder *b, nir_def *c);
+typedef nir_def *(*nir_unpack_func)(nir_builder *b, nir_def *c);
static bool
logicop_depends_on_dst_color(int logicop_func)
@@ -53,9 +53,9 @@ logicop_depends_on_dst_color(int logicop_func)
}
}
-static nir_ssa_def *
+static nir_def *
v3d_logicop(nir_builder *b, int logicop_func,
- nir_ssa_def *src, nir_ssa_def *dst)
+ nir_def *src, nir_def *dst)
{
switch (logicop_func) {
case PIPE_LOGICOP_CLEAR:
@@ -96,8 +96,8 @@ v3d_logicop(nir_builder *b, int logicop_func,
}
}
-static nir_ssa_def *
-v3d_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz)
+static nir_def *
+v3d_nir_get_swizzled_channel(nir_builder *b, nir_def **srcs, int swiz)
{
switch (swiz) {
default:
@@ -116,57 +116,57 @@ v3d_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz)
}
}
-static nir_ssa_def *
-v3d_nir_swizzle_and_pack(nir_builder *b, nir_ssa_def **chans,
+static nir_def *
+v3d_nir_swizzle_and_pack(nir_builder *b, nir_def **chans,
const uint8_t *swiz, nir_pack_func pack_func)
{
- nir_ssa_def *c[4];
+ nir_def *c[4];
for (int i = 0; i < 4; i++)
c[i] = v3d_nir_get_swizzled_channel(b, chans, swiz[i]);
return pack_func(b, nir_vec4(b, c[0], c[1], c[2], c[3]));
}
-static nir_ssa_def *
-v3d_nir_unpack_and_swizzle(nir_builder *b, nir_ssa_def *packed,
+static nir_def *
+v3d_nir_unpack_and_swizzle(nir_builder *b, nir_def *packed,
const uint8_t *swiz, nir_unpack_func unpack_func)
{
- nir_ssa_def *unpacked = unpack_func(b, packed);
+ nir_def *unpacked = unpack_func(b, packed);
- nir_ssa_def *unpacked_chans[4];
+ nir_def *unpacked_chans[4];
for (int i = 0; i < 4; i++)
unpacked_chans[i] = nir_channel(b, unpacked, i);
- nir_ssa_def *c[4];
+ nir_def *c[4];
for (int i = 0; i < 4; i++)
c[i] = v3d_nir_get_swizzled_channel(b, unpacked_chans, swiz[i]);
return nir_vec4(b, c[0], c[1], c[2], c[3]);
}
-static nir_ssa_def *
-pack_unorm_rgb10a2(nir_builder *b, nir_ssa_def *c)
+static nir_def *
+pack_unorm_rgb10a2(nir_builder *b, nir_def *c)
{
static const unsigned bits[4] = { 10, 10, 10, 2 };
- nir_ssa_def *unorm = nir_format_float_to_unorm(b, c, bits);
+ nir_def *unorm = nir_format_float_to_unorm(b, c, bits);
- nir_ssa_def *chans[4];
+ nir_def *chans[4];
for (int i = 0; i < 4; i++)
chans[i] = nir_channel(b, unorm, i);
- nir_ssa_def *result = nir_mov(b, chans[0]);
+ nir_def *result = nir_mov(b, chans[0]);
int offset = bits[0];
for (int i = 1; i < 4; i++) {
- nir_ssa_def *shifted_chan =
- nir_ishl(b, chans[i], nir_imm_int(b, offset));
+ nir_def *shifted_chan =
+ nir_ishl_imm(b, chans[i], offset);
result = nir_ior(b, result, shifted_chan);
offset += bits[i];
}
return result;
}
-static nir_ssa_def *
-unpack_unorm_rgb10a2(nir_builder *b, nir_ssa_def *c)
+static nir_def *
+unpack_unorm_rgb10a2(nir_builder *b, nir_def *c)
{
static const unsigned bits[4] = { 10, 10, 10, 2 };
const unsigned masks[4] = { BITFIELD_MASK(bits[0]),
@@ -174,11 +174,11 @@ unpack_unorm_rgb10a2(nir_builder *b, nir_ssa_def *c)
BITFIELD_MASK(bits[2]),
BITFIELD_MASK(bits[3]) };
- nir_ssa_def *chans[4];
+ nir_def *chans[4];
for (int i = 0; i < 4; i++) {
- nir_ssa_def *unorm = nir_iand(b, c, nir_imm_int(b, masks[i]));
+ nir_def *unorm = nir_iand_imm(b, c, masks[i]);
chans[i] = nir_format_unorm_to_float(b, unorm, &bits[i]);
- c = nir_ushr(b, c, nir_imm_int(b, bits[i]));
+ c = nir_ushr_imm(b, c, bits[i]);
}
return nir_vec4(b, chans[0], chans[1], chans[2], chans[3]);
@@ -201,13 +201,13 @@ v3d_get_format_swizzle_for_rt(struct v3d_compile *c, int rt)
}
}
-static nir_ssa_def *
+static nir_def *
v3d_nir_get_tlb_color(nir_builder *b, struct v3d_compile *c, int rt, int sample)
{
uint32_t num_components =
util_format_get_nr_components(c->fs_key->color_fmt[rt].format);
- nir_ssa_def *color[4];
+ nir_def *color[4];
for (int i = 0; i < 4; i++) {
if (i < num_components) {
color[i] =
@@ -222,71 +222,68 @@ v3d_nir_get_tlb_color(nir_builder *b, struct v3d_compile *c, int rt, int sample)
return nir_vec4(b, color[0], color[1], color[2], color[3]);
}
-static nir_ssa_def *
+static nir_def *
v3d_emit_logic_op_raw(struct v3d_compile *c, nir_builder *b,
- nir_ssa_def **src_chans, nir_ssa_def **dst_chans,
+ nir_def **src_chans, nir_def **dst_chans,
int rt, int sample)
{
const uint8_t *fmt_swz = v3d_get_format_swizzle_for_rt(c, rt);
- nir_ssa_def *op_res[4];
+ nir_def *op_res[4];
for (int i = 0; i < 4; i++) {
- nir_ssa_def *src = src_chans[i];
- nir_ssa_def *dst =
+ nir_def *src = src_chans[i];
+ nir_def *dst =
v3d_nir_get_swizzled_channel(b, dst_chans, fmt_swz[i]);
op_res[i] = v3d_logicop(b, c->fs_key->logicop_func, src, dst);
- /* In Vulkan we configure our integer RTs to clamp, so we need
- * to ignore result bits that don't fit in the destination RT
- * component size.
+ /* We configure our integer RTs to clamp, so we need to ignore
+ * result bits that don't fit in the destination RT component
+ * size.
*/
- if (c->key->environment == V3D_ENVIRONMENT_VULKAN) {
- uint32_t bits =
- util_format_get_component_bits(
- c->fs_key->color_fmt[rt].format,
- UTIL_FORMAT_COLORSPACE_RGB, i);
- if (bits > 0 && bits < 32) {
- nir_ssa_def *mask =
- nir_imm_int(b, (1u << bits) - 1);
- op_res[i] = nir_iand(b, op_res[i], mask);
- }
+ uint32_t bits =
+ util_format_get_component_bits(
+ c->fs_key->color_fmt[rt].format,
+ UTIL_FORMAT_COLORSPACE_RGB, i);
+ if (bits > 0 && bits < 32) {
+ op_res[i] =
+ nir_iand_imm(b, op_res[i], (1u << bits) - 1);
}
}
- nir_ssa_def *r[4];
+ nir_def *r[4];
for (int i = 0; i < 4; i++)
r[i] = v3d_nir_get_swizzled_channel(b, op_res, fmt_swz[i]);
return nir_vec4(b, r[0], r[1], r[2], r[3]);
}
-static nir_ssa_def *
+static nir_def *
v3d_emit_logic_op_unorm(struct v3d_compile *c, nir_builder *b,
- nir_ssa_def **src_chans, nir_ssa_def **dst_chans,
+ nir_def **src_chans, nir_def **dst_chans,
int rt, int sample,
nir_pack_func pack_func, nir_unpack_func unpack_func)
{
static const uint8_t src_swz[4] = { 0, 1, 2, 3 };
- nir_ssa_def *packed_src =
+ nir_def *packed_src =
v3d_nir_swizzle_and_pack(b, src_chans, src_swz, pack_func);
const uint8_t *fmt_swz = v3d_get_format_swizzle_for_rt(c, rt);
- nir_ssa_def *packed_dst =
+ nir_def *packed_dst =
v3d_nir_swizzle_and_pack(b, dst_chans, fmt_swz, pack_func);
- nir_ssa_def *packed_result =
+ nir_def *packed_result =
v3d_logicop(b, c->fs_key->logicop_func, packed_src, packed_dst);
return v3d_nir_unpack_and_swizzle(b, packed_result, fmt_swz, unpack_func);
}
-static nir_ssa_def *
+static nir_def *
v3d_nir_emit_logic_op(struct v3d_compile *c, nir_builder *b,
- nir_ssa_def *src, int rt, int sample)
+ nir_def *src, int rt, int sample)
{
- nir_ssa_def *dst = v3d_nir_get_tlb_color(b, c, rt, sample);
+ nir_def *dst = v3d_nir_get_tlb_color(b, c, rt, sample);
- nir_ssa_def *src_chans[4], *dst_chans[4];
+ nir_def *src_chans[4], *dst_chans[4];
for (unsigned i = 0; i < 4; i++) {
src_chans[i] = nir_channel(b, src, i);
dst_chans[i] = nir_channel(b, dst, i);
@@ -309,7 +306,7 @@ v3d_nir_emit_logic_op(struct v3d_compile *c, nir_builder *b,
static void
v3d_emit_ms_output(nir_builder *b,
- nir_ssa_def *color, nir_src *offset,
+ nir_def *color, nir_src *offset,
nir_alu_type type, int rt, int sample)
{
nir_store_tlb_sample_color_v3d(b, color, nir_imm_int(b, rt), .base = sample, .component = 0, .src_type = type);
@@ -321,7 +318,7 @@ v3d_nir_lower_logic_op_instr(struct v3d_compile *c,
nir_intrinsic_instr *intr,
int rt)
{
- nir_ssa_def *frag_color = intr->src[0].ssa;
+ nir_def *frag_color = intr->src[0].ssa;
const int logic_op = c->fs_key->logicop_func;
@@ -331,7 +328,7 @@ v3d_nir_lower_logic_op_instr(struct v3d_compile *c,
nir_src *offset = &intr->src[1];
nir_alu_type type = nir_intrinsic_src_type(intr);
for (int i = 0; i < V3D_MAX_SAMPLES; i++) {
- nir_ssa_def *sample =
+ nir_def *sample =
v3d_nir_emit_logic_op(c, b, frag_color, rt, i);
v3d_emit_ms_output(b, sample, offset, type, rt, i);
@@ -339,11 +336,10 @@ v3d_nir_lower_logic_op_instr(struct v3d_compile *c,
nir_instr_remove(&intr->instr);
} else {
- nir_ssa_def *result =
+ nir_def *result =
v3d_nir_emit_logic_op(c, b, frag_color, rt, 0);
- nir_instr_rewrite_src(&intr->instr, &intr->src[0],
- nir_src_for_ssa(result));
+ nir_src_rewrite(&intr->src[0], result);
intr->num_components = result->num_components;
}
}
@@ -351,6 +347,8 @@ v3d_nir_lower_logic_op_instr(struct v3d_compile *c,
static bool
v3d_nir_lower_logic_ops_block(nir_block *block, struct v3d_compile *c)
{
+ bool progress = false;
+
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
@@ -384,35 +382,40 @@ v3d_nir_lower_logic_ops_block(nir_block *block, struct v3d_compile *c)
continue;
}
- nir_function_impl *impl =
- nir_cf_node_get_function(&block->cf_node);
- nir_builder b;
- nir_builder_init(&b, impl);
- b.cursor = nir_before_instr(&intr->instr);
+ nir_builder b = nir_builder_at(nir_before_instr(&intr->instr));
v3d_nir_lower_logic_op_instr(c, &b, intr, rt);
+
+ progress = true;
}
}
- return true;
+ return progress;
}
-void
+bool
v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c)
{
+ bool progress = false;
+
/* Nothing to do if logic op is 'copy src to dst' or if logic ops are
* disabled (we set the logic op to copy in that case).
*/
if (c->fs_key->logicop_func == PIPE_LOGICOP_COPY)
- return;
+ return false;
- nir_foreach_function(function, s) {
- if (function->impl) {
- nir_foreach_block(block, function->impl)
- v3d_nir_lower_logic_ops_block(block, c);
+ nir_foreach_function_impl(impl, s) {
+ nir_foreach_block(block, impl)
+ progress |= v3d_nir_lower_logic_ops_block(block, c);
- nir_metadata_preserve(function->impl,
+ if (progress) {
+ nir_metadata_preserve(impl,
nir_metadata_block_index |
nir_metadata_dominance);
+ } else {
+ nir_metadata_preserve(impl,
+ nir_metadata_all);
}
}
+
+ return progress;
}
diff --git a/src/broadcom/compiler/v3d_nir_lower_robust_buffer_access.c b/src/broadcom/compiler/v3d_nir_lower_robust_buffer_access.c
deleted file mode 100644
index 40f1cc23b1a..00000000000
--- a/src/broadcom/compiler/v3d_nir_lower_robust_buffer_access.c
+++ /dev/null
@@ -1,167 +0,0 @@
-/*
- * Copyright © 2020 Raspberry Pi
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "compiler/v3d_compiler.h"
-#include "compiler/nir/nir_builder.h"
-
-static void
-rewrite_offset(nir_builder *b,
- nir_intrinsic_instr *instr,
- uint32_t buffer_idx,
- uint32_t offset_src,
- nir_intrinsic_op buffer_size_op)
-{
- b->cursor = nir_before_instr(&instr->instr);
-
- /* Get size of the buffer */
- nir_intrinsic_instr *size =
- nir_intrinsic_instr_create(b->shader, buffer_size_op);
- size->src[0] = nir_src_for_ssa(nir_imm_int(b, buffer_idx));
- nir_ssa_dest_init(&size->instr, &size->dest, 1, 32, NULL);
- nir_builder_instr_insert(b, &size->instr);
-
- /* All out TMU accesses are 32-bit aligned */
- nir_ssa_def *aligned_buffer_size =
- nir_iand(b, &size->dest.ssa, nir_imm_int(b, 0xfffffffc));
-
- /* Rewrite offset */
- nir_ssa_def *offset =
- nir_umin(b, instr->src[offset_src].ssa, aligned_buffer_size);
- nir_instr_rewrite_src(&instr->instr, &instr->src[offset_src],
- nir_src_for_ssa(offset));
-}
-
-static void
-lower_load(struct v3d_compile *c,
- nir_builder *b,
- nir_intrinsic_instr *instr)
-{
- uint32_t index = nir_src_comp_as_uint(instr->src[0], 0);
-
- nir_intrinsic_op op;
- if (instr->intrinsic == nir_intrinsic_load_ubo) {
- op = nir_intrinsic_get_ubo_size;
- if (c->key->environment == V3D_ENVIRONMENT_VULKAN)
- index--;
- } else {
- op = nir_intrinsic_get_ssbo_size;
- }
-
- rewrite_offset(b, instr, index, 1, op);
-}
-
-static void
-lower_store(struct v3d_compile *c,
- nir_builder *b,
- nir_intrinsic_instr *instr)
-{
- uint32_t index = nir_src_comp_as_uint(instr->src[1], 0);
- rewrite_offset(b, instr, index, 2, nir_intrinsic_get_ssbo_size);
-}
-
-static void
-lower_atomic(struct v3d_compile *c,
- nir_builder *b,
- nir_intrinsic_instr *instr)
-{
- uint32_t index = nir_src_comp_as_uint(instr->src[0], 0);
- rewrite_offset(b, instr, index, 1, nir_intrinsic_get_ssbo_size);
-}
-
-static void
-lower_shared(struct v3d_compile *c,
- nir_builder *b,
- nir_intrinsic_instr *instr)
-{
- b->cursor = nir_before_instr(&instr->instr);
- nir_ssa_def *aligned_size =
- nir_imm_int(b, c->s->info.shared_size & 0xfffffffc);
- nir_ssa_def *offset = nir_umin(b, instr->src[0].ssa, aligned_size);
- nir_instr_rewrite_src(&instr->instr, &instr->src[0],
- nir_src_for_ssa(offset));
-}
-
-static void
-lower_instr(struct v3d_compile *c, nir_builder *b, struct nir_instr *instr)
-{
- if (instr->type != nir_instr_type_intrinsic)
- return;
- nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-
- switch (intr->intrinsic) {
- case nir_intrinsic_load_ubo:
- case nir_intrinsic_load_ssbo:
- lower_load(c, b, intr);
- break;
- case nir_intrinsic_store_ssbo:
- lower_store(c, b, intr);
- break;
- case nir_intrinsic_ssbo_atomic_add:
- case nir_intrinsic_ssbo_atomic_imin:
- case nir_intrinsic_ssbo_atomic_umin:
- case nir_intrinsic_ssbo_atomic_imax:
- case nir_intrinsic_ssbo_atomic_umax:
- case nir_intrinsic_ssbo_atomic_and:
- case nir_intrinsic_ssbo_atomic_or:
- case nir_intrinsic_ssbo_atomic_xor:
- case nir_intrinsic_ssbo_atomic_exchange:
- case nir_intrinsic_ssbo_atomic_comp_swap:
- lower_atomic(c, b, intr);
- break;
- case nir_intrinsic_load_shared:
- case nir_intrinsic_shared_atomic_add:
- case nir_intrinsic_shared_atomic_imin:
- case nir_intrinsic_shared_atomic_umin:
- case nir_intrinsic_shared_atomic_imax:
- case nir_intrinsic_shared_atomic_umax:
- case nir_intrinsic_shared_atomic_and:
- case nir_intrinsic_shared_atomic_or:
- case nir_intrinsic_shared_atomic_xor:
- case nir_intrinsic_shared_atomic_exchange:
- case nir_intrinsic_shared_atomic_comp_swap:
- lower_shared(c, b, intr);
- break;
- default:
- break;
- }
-}
-
-void
-v3d_nir_lower_robust_buffer_access(nir_shader *s, struct v3d_compile *c)
-{
- nir_foreach_function(function, s) {
- if (function->impl) {
- nir_builder b;
- nir_builder_init(&b, function->impl);
-
- nir_foreach_block(block, function->impl) {
- nir_foreach_instr_safe(instr, block)
- lower_instr(c, &b, instr);
- }
-
- nir_metadata_preserve(function->impl,
- nir_metadata_block_index |
- nir_metadata_dominance);
- }
- }
-}
diff --git a/src/broadcom/compiler/v3d_nir_lower_scratch.c b/src/broadcom/compiler/v3d_nir_lower_scratch.c
index 893b6f6ae28..93ed1bb6e26 100644
--- a/src/broadcom/compiler/v3d_nir_lower_scratch.c
+++ b/src/broadcom/compiler/v3d_nir_lower_scratch.c
@@ -34,11 +34,11 @@
* writemasks in the process.
*/
-static nir_ssa_def *
+static nir_def *
v3d_nir_scratch_offset(nir_builder *b, nir_intrinsic_instr *instr)
{
bool is_store = instr->intrinsic == nir_intrinsic_store_scratch;
- nir_ssa_def *offset = nir_ssa_for_src(b, instr->src[is_store ? 1 : 0], 1);
+ nir_def *offset = instr->src[is_store ? 1 : 0].ssa;
assert(nir_intrinsic_align_mul(instr) >= 4);
assert(nir_intrinsic_align_offset(instr) == 0);
@@ -55,18 +55,18 @@ v3d_nir_lower_load_scratch(nir_builder *b, nir_intrinsic_instr *instr)
{
b->cursor = nir_before_instr(&instr->instr);
- nir_ssa_def *offset = v3d_nir_scratch_offset(b,instr);
+ nir_def *offset = v3d_nir_scratch_offset(b,instr);
- nir_ssa_def *chans[NIR_MAX_VEC_COMPONENTS];
+ nir_def *chans[NIR_MAX_VEC_COMPONENTS];
for (int i = 0; i < instr->num_components; i++) {
- nir_ssa_def *chan_offset =
+ nir_def *chan_offset =
nir_iadd_imm(b, offset, V3D_CHANNELS * i * 4);
nir_intrinsic_instr *chan_instr =
nir_intrinsic_instr_create(b->shader, instr->intrinsic);
chan_instr->num_components = 1;
- nir_ssa_dest_init(&chan_instr->instr, &chan_instr->dest, 1,
- instr->dest.ssa.bit_size, NULL);
+ nir_def_init(&chan_instr->instr, &chan_instr->def, 1,
+ instr->def.bit_size);
chan_instr->src[0] = nir_src_for_ssa(chan_offset);
@@ -74,11 +74,11 @@ v3d_nir_lower_load_scratch(nir_builder *b, nir_intrinsic_instr *instr)
nir_builder_instr_insert(b, &chan_instr->instr);
- chans[i] = &chan_instr->dest.ssa;
+ chans[i] = &chan_instr->def;
}
- nir_ssa_def *result = nir_vec(b, chans, instr->num_components);
- nir_ssa_def_rewrite_uses(&instr->dest.ssa, result);
+ nir_def *result = nir_vec(b, chans, instr->num_components);
+ nir_def_rewrite_uses(&instr->def, result);
nir_instr_remove(&instr->instr);
}
@@ -87,15 +87,14 @@ v3d_nir_lower_store_scratch(nir_builder *b, nir_intrinsic_instr *instr)
{
b->cursor = nir_before_instr(&instr->instr);
- nir_ssa_def *offset = v3d_nir_scratch_offset(b, instr);
- nir_ssa_def *value = nir_ssa_for_src(b, instr->src[0],
- instr->num_components);
+ nir_def *offset = v3d_nir_scratch_offset(b, instr);
+ nir_def *value = instr->src[0].ssa;
for (int i = 0; i < instr->num_components; i++) {
if (!(nir_intrinsic_write_mask(instr) & (1 << i)))
continue;
- nir_ssa_def *chan_offset =
+ nir_def *chan_offset =
nir_iadd_imm(b, offset, V3D_CHANNELS * i * 4);
nir_intrinsic_instr *chan_instr =
@@ -115,39 +114,29 @@ v3d_nir_lower_store_scratch(nir_builder *b, nir_intrinsic_instr *instr)
nir_instr_remove(&instr->instr);
}
-void
-v3d_nir_lower_scratch(nir_shader *s)
+static bool
+v3d_nir_lower_scratch_cb(nir_builder *b,
+ nir_intrinsic_instr *intr,
+ void *_state)
{
- nir_foreach_function(function, s) {
- if (!function->impl)
- continue;
-
- nir_builder b;
- nir_builder_init(&b, function->impl);
-
- nir_foreach_block(block, function->impl) {
- nir_foreach_instr_safe(instr, block) {
- if (instr->type != nir_instr_type_intrinsic)
- continue;
-
- nir_intrinsic_instr *intr =
- nir_instr_as_intrinsic(instr);
-
- switch (intr->intrinsic) {
- case nir_intrinsic_load_scratch:
- v3d_nir_lower_load_scratch(&b, intr);
- break;
- case nir_intrinsic_store_scratch:
- v3d_nir_lower_store_scratch(&b, intr);
- break;
- default:
- break;
- }
- }
- }
-
- nir_metadata_preserve(function->impl,
- nir_metadata_block_index |
- nir_metadata_dominance);
+ switch (intr->intrinsic) {
+ case nir_intrinsic_load_scratch:
+ v3d_nir_lower_load_scratch(b, intr);
+ return true;
+ case nir_intrinsic_store_scratch:
+ v3d_nir_lower_store_scratch(b, intr);
+ return true;
+ default:
+ return false;
}
+
+ return false;
+}
+
+bool
+v3d_nir_lower_scratch(nir_shader *s)
+{
+ return nir_shader_intrinsics_pass(s, v3d_nir_lower_scratch_cb,
+ nir_metadata_block_index |
+ nir_metadata_dominance, NULL);
}
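For context, the chan_offset computation kept by this lowering spaces consecutive vector components V3D_CHANNELS * 4 bytes apart in scratch. A tiny sketch of that addressing, assuming V3D_CHANNELS is 16 as on current V3D parts:

/* Illustration of the chan_offset calculation used above: component i of a
 * scratch vector is placed V3D_CHANNELS * 4 bytes after component i - 1,
 * leaving a dword per SIMD channel between consecutive components. */
#define V3D_CHANNELS 16

static unsigned
scratch_chan_offset(unsigned offset, unsigned component)
{
        return offset + V3D_CHANNELS * component * 4;
}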
diff --git a/src/broadcom/compiler/v3d_nir_lower_txf_ms.c b/src/broadcom/compiler/v3d_nir_lower_txf_ms.c
index d79969374d5..e78c3cb9e3e 100644
--- a/src/broadcom/compiler/v3d_nir_lower_txf_ms.c
+++ b/src/broadcom/compiler/v3d_nir_lower_txf_ms.c
@@ -32,25 +32,21 @@
* 2x2 quad.
*/
-#define V3D_MAX_SAMPLES 4
-
-static nir_ssa_def *
+static nir_def *
v3d_nir_lower_txf_ms_instr(nir_builder *b, nir_instr *in_instr, void *data)
{
nir_tex_instr *instr = nir_instr_as_tex(in_instr);
b->cursor = nir_before_instr(&instr->instr);
- int coord_index = nir_tex_instr_src_index(instr, nir_tex_src_coord);
- int sample_index = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
- nir_ssa_def *coord = instr->src[coord_index].src.ssa;
- nir_ssa_def *sample = instr->src[sample_index].src.ssa;
+ nir_def *coord = nir_steal_tex_src(instr, nir_tex_src_coord);
+ nir_def *sample = nir_steal_tex_src(instr, nir_tex_src_ms_index);
- nir_ssa_def *one = nir_imm_int(b, 1);
- nir_ssa_def *x = nir_iadd(b,
+ nir_def *one = nir_imm_int(b, 1);
+ nir_def *x = nir_iadd(b,
nir_ishl(b, nir_channel(b, coord, 0), one),
nir_iand(b, sample, one));
- nir_ssa_def *y = nir_iadd(b,
+ nir_def *y = nir_iadd(b,
nir_ishl(b, nir_channel(b, coord, 1), one),
nir_iand(b, nir_ushr(b, sample, one), one));
if (instr->is_array)
@@ -58,10 +54,7 @@ v3d_nir_lower_txf_ms_instr(nir_builder *b, nir_instr *in_instr, void *data)
else
coord = nir_vec2(b, x, y);
- nir_instr_rewrite_src(&instr->instr,
- &instr->src[nir_tex_src_coord].src,
- nir_src_for_ssa(coord));
- nir_tex_instr_remove_src(instr, sample_index);
+ nir_tex_instr_add_src(instr, nir_tex_src_coord, coord);
instr->op = nir_texop_txf;
instr->sampler_dim = GLSL_SAMPLER_DIM_2D;
@@ -75,11 +68,11 @@ v3d_nir_lower_txf_ms_filter(const nir_instr *instr, const void *data)
nir_instr_as_tex(instr)->op == nir_texop_txf_ms);
}
-void
-v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c)
+bool
+v3d_nir_lower_txf_ms(nir_shader *s)
{
- nir_shader_lower_instructions(s,
- v3d_nir_lower_txf_ms_filter,
- v3d_nir_lower_txf_ms_instr,
- NULL);
+ return nir_shader_lower_instructions(s,
+ v3d_nir_lower_txf_ms_filter,
+ v3d_nir_lower_txf_ms_instr,
+ NULL);
}
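The lowering above folds the sample index into the texel coordinates of a double-width, double-height view of the MSAA surface, where the four samples of a pixel form a 2x2 quad. A minimal sketch of the same index math (hypothetical helper, not driver API):

#include <stdint.h>

/* Sample s of pixel (x, y) maps to texel (2x + (s & 1), 2y + ((s >> 1) & 1))
 * in the 2x2-quad layout, mirroring the NIR emitted above. */
static void
txf_ms_texel(uint32_t x, uint32_t y, uint32_t sample,
             uint32_t *tx, uint32_t *ty)
{
        *tx = (x << 1) + (sample & 1);
        *ty = (y << 1) + ((sample >> 1) & 1);
}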
diff --git a/src/broadcom/compiler/v3d_packing.c b/src/broadcom/compiler/v3d_packing.c
new file mode 100644
index 00000000000..46643edd5e6
--- /dev/null
+++ b/src/broadcom/compiler/v3d_packing.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright © 2023 Raspberry Pi Ltd
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "v3d_compiler.h"
+
+#define __gen_user_data void
+#define __gen_address_type uint32_t
+#define __gen_address_offset(reloc) (*reloc)
+#define __gen_emit_reloc(cl, reloc)
+#define __gen_unpack_address(cl, s, e) (__gen_unpack_uint(cl, s, e) << (31 - (e - s)))
+#include "cle/v3d_packet_v42_pack.h"
+
+
+/* Typically this function would wrap a call to a version-specific variant,
+ * but as TMU_CONFIG_PARAMETER_1 doesn't change between v42 and v71, we can
+ * assume that p1_packed has the same layout and use the same code for both.
+ */
+void
+v3d_pack_unnormalized_coordinates(struct v3d_device_info *devinfo,
+ uint32_t *p1_packed,
+ bool unnormalized_coordinates)
+{
+ assert(devinfo->ver == 71 || devinfo->ver == 42);
+
+ struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked;
+ V3D42_TMU_CONFIG_PARAMETER_1_unpack((uint8_t *)p1_packed, &p1_unpacked);
+ p1_unpacked.unnormalized_coordinates = unnormalized_coordinates;
+ V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL, (uint8_t *)p1_packed,
+ &p1_unpacked);
+}
diff --git a/src/broadcom/compiler/v3d40_tex.c b/src/broadcom/compiler/v3d_tex.c
index 7bebfe95552..643c73c4e58 100644
--- a/src/broadcom/compiler/v3d40_tex.c
+++ b/src/broadcom/compiler/v3d_tex.c
@@ -28,27 +28,29 @@
#define __gen_address_type uint32_t
#define __gen_address_offset(reloc) (*reloc)
#define __gen_emit_reloc(cl, reloc)
-#include "cle/v3d_packet_v41_pack.h"
+#include "cle/v3d_packet_v42_pack.h"
-static inline void
+static inline struct qinst *
vir_TMU_WRITE(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val)
{
/* XXX perf: We should figure out how to merge ALU operations
* producing the val with this MOV, when possible.
*/
- vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val);
+ return vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val);
}
-static inline void
+static inline struct qinst *
vir_TMU_WRITE_or_count(struct v3d_compile *c,
enum v3d_qpu_waddr waddr,
struct qreg val,
uint32_t *tmu_writes)
{
- if (tmu_writes)
+ if (tmu_writes) {
(*tmu_writes)++;
- else
- vir_TMU_WRITE(c, waddr, val);
+ return NULL;
+ } else {
+ return vir_TMU_WRITE(c, waddr, val);
+ }
}
static void
@@ -59,11 +61,11 @@ vir_WRTMUC(struct v3d_compile *c, enum quniform_contents contents, uint32_t data
inst->uniform = vir_get_uniform_index(c, contents, data);
}
-static const struct V3D41_TMU_CONFIG_PARAMETER_1 p1_unpacked_default = {
+static const struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked_default = {
.per_pixel_mask_enable = true,
};
-static const struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked_default = {
+static const struct V3D42_TMU_CONFIG_PARAMETER_2 p2_unpacked_default = {
.op = V3D_TMU_OP_REGULAR,
};
@@ -84,7 +86,7 @@ handle_tex_src(struct v3d_compile *c,
nir_tex_instr *instr,
unsigned src_idx,
unsigned non_array_components,
- struct V3D41_TMU_CONFIG_PARAMETER_2 *p2_unpacked,
+ struct V3D42_TMU_CONFIG_PARAMETER_2 *p2_unpacked,
struct qreg *s_out,
unsigned *tmu_writes)
{
@@ -199,7 +201,7 @@ handle_tex_src(struct v3d_compile *c,
static void
vir_tex_handle_srcs(struct v3d_compile *c,
nir_tex_instr *instr,
- struct V3D41_TMU_CONFIG_PARAMETER_2 *p2_unpacked,
+ struct V3D42_TMU_CONFIG_PARAMETER_2 *p2_unpacked,
struct qreg *s,
unsigned *tmu_writes)
{
@@ -222,31 +224,62 @@ get_required_tex_tmu_writes(struct v3d_compile *c, nir_tex_instr *instr)
}
void
-v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
+v3d_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
{
- assert(instr->op != nir_texop_lod || c->devinfo->ver >= 42);
-
unsigned texture_idx = instr->texture_index;
- unsigned sampler_idx = instr->sampler_index;
- struct V3D41_TMU_CONFIG_PARAMETER_0 p0_unpacked = {
+ /* For instructions that don't have a sampler (e.g. txf) we bind
+ * default sampler state via the backend_flags to handle precision.
+ */
+ unsigned sampler_idx = nir_tex_instr_need_sampler(instr) ?
+ instr->sampler_index : instr->backend_flags;
+
+ /* Even if the texture operation doesn't need a sampler by
+ * itself, we still need to add the sampler configuration
+ * parameter if the output is 32 bit
+ */
+ assert(sampler_idx < c->key->num_samplers_used);
+ bool output_type_32_bit =
+ c->key->sampler[sampler_idx].return_size == 32;
+
+ struct V3D42_TMU_CONFIG_PARAMETER_0 p0_unpacked = {
};
/* Limit the number of channels returned to both how many the NIR
* instruction writes and how many the instruction could produce.
*/
- p0_unpacked.return_words_of_texture_data =
- instr->dest.is_ssa ?
- nir_ssa_def_components_read(&instr->dest.ssa) :
- (1 << instr->dest.reg.reg->num_components) - 1;
+ nir_intrinsic_instr *store = nir_store_reg_for_def(&instr->def);
+ if (store == NULL) {
+ p0_unpacked.return_words_of_texture_data =
+ nir_def_components_read(&instr->def);
+ } else {
+ nir_def *reg = store->src[1].ssa;
+ nir_intrinsic_instr *decl = nir_reg_get_decl(reg);
+ unsigned reg_num_components =
+ nir_intrinsic_num_components(decl);
+
+ /* For the non-ssa case we don't have a full equivalent to
+ * nir_def_components_read. This is a problem for the 16
+ * bit case. nir_lower_tex will not change the destination as
+ * nir_tex_instr_dest_size will still return 4. The driver is
+ * just expected to not store on other channels, so we
+ * manually ensure that here.
+ */
+ uint32_t num_components = output_type_32_bit ?
+ MIN2(reg_num_components, 4) :
+ MIN2(reg_num_components, 2);
+
+ p0_unpacked.return_words_of_texture_data = (1 << num_components) - 1;
+ }
assert(p0_unpacked.return_words_of_texture_data != 0);
- struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked = {
+ struct V3D42_TMU_CONFIG_PARAMETER_2 p2_unpacked = {
.op = V3D_TMU_OP_REGULAR,
.gather_mode = instr->op == nir_texop_tg4,
.gather_component = instr->component,
.coefficient_mode = instr->op == nir_texop_txd,
- .disable_autolod = instr->op == nir_texop_tg4
+ .disable_autolod = instr->op == nir_texop_tg4,
+ .lod_query = instr->op == nir_texop_lod,
};
const unsigned tmu_writes = get_required_tex_tmu_writes(c, instr);
@@ -270,22 +303,15 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
vir_tex_handle_srcs(c, instr, &p2_unpacked, &s, NULL);
uint32_t p0_packed;
- V3D41_TMU_CONFIG_PARAMETER_0_pack(NULL,
+ V3D42_TMU_CONFIG_PARAMETER_0_pack(NULL,
(uint8_t *)&p0_packed,
&p0_unpacked);
uint32_t p2_packed;
- V3D41_TMU_CONFIG_PARAMETER_2_pack(NULL,
+ V3D42_TMU_CONFIG_PARAMETER_2_pack(NULL,
(uint8_t *)&p2_packed,
&p2_unpacked);
- /* We manually set the LOD Query bit (see
- * V3D42_TMU_CONFIG_PARAMETER_2) as right now is the only V42 specific
- * feature over V41 we are using
- */
- if (instr->op == nir_texop_lod)
- p2_packed |= 1UL << 24;
-
/* Load texture_idx number into the high bits of the texture address field,
* which will be used by the driver to decide which texture to put
* in the actual address field.
@@ -294,14 +320,6 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
vir_WRTMUC(c, QUNIFORM_TMU_CONFIG_P0, p0_packed);
- /* Even if the texture operation doesn't need a sampler by
- * itself, we still need to add the sampler configuration
- * parameter if the output is 32 bit
- */
- bool output_type_32_bit =
- c->key->sampler[sampler_idx].return_size == 32 &&
- !instr->is_shadow;
-
/* p1 is optional, but we can skip it only if p2 can be skipped too */
bool needs_p2_config =
(instr->op == nir_texop_lod ||
@@ -313,7 +331,7 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
output_type_32_bit;
if (non_default_p1_config) {
- struct V3D41_TMU_CONFIG_PARAMETER_1 p1_unpacked = {
+ struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked = {
.output_type_32_bit = output_type_32_bit,
.unnormalized_coordinates = (instr->sampler_dim ==
@@ -330,7 +348,7 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
p0_unpacked.return_words_of_texture_data < (1 << 2));
uint32_t p1_packed;
- V3D41_TMU_CONFIG_PARAMETER_1_pack(NULL,
+ V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL,
(uint8_t *)&p1_packed,
&p1_unpacked);
@@ -358,7 +376,7 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
* address
*/
uint32_t p1_packed_default;
- V3D41_TMU_CONFIG_PARAMETER_1_pack(NULL,
+ V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL,
(uint8_t *)&p1_packed_default,
&p1_unpacked_default);
vir_WRTMUC(c, QUNIFORM_CONSTANT, p1_packed_default);
@@ -368,48 +386,54 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);
/* Emit retiring TMU write */
+ struct qinst *retiring;
if (instr->op == nir_texop_txf) {
assert(instr->sampler_dim != GLSL_SAMPLER_DIM_CUBE);
- vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s);
+ retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s);
} else if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
- vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s);
+ retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s);
} else if (instr->op == nir_texop_txl) {
- vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s);
+ retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s);
} else {
- vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s);
+ retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s);
}
- ntq_add_pending_tmu_flush(c, &instr->dest,
+ retiring->ldtmu_count = p0_unpacked.return_words_of_texture_data;
+ ntq_add_pending_tmu_flush(c, &instr->def,
p0_unpacked.return_words_of_texture_data);
}
static uint32_t
-v3d40_image_load_store_tmu_op(nir_intrinsic_instr *instr)
+v3d_image_atomic_tmu_op(nir_intrinsic_instr *instr)
+{
+ nir_atomic_op atomic_op = nir_intrinsic_atomic_op(instr);
+ switch (atomic_op) {
+ case nir_atomic_op_iadd: return v3d_get_op_for_atomic_add(instr, 3);
+ case nir_atomic_op_imin: return V3D_TMU_OP_WRITE_SMIN;
+ case nir_atomic_op_umin: return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR;
+ case nir_atomic_op_imax: return V3D_TMU_OP_WRITE_SMAX;
+ case nir_atomic_op_umax: return V3D_TMU_OP_WRITE_UMAX;
+ case nir_atomic_op_iand: return V3D_TMU_OP_WRITE_AND_READ_INC;
+ case nir_atomic_op_ior: return V3D_TMU_OP_WRITE_OR_READ_DEC;
+ case nir_atomic_op_ixor: return V3D_TMU_OP_WRITE_XOR_READ_NOT;
+ case nir_atomic_op_xchg: return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH;
+ case nir_atomic_op_cmpxchg: return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH;
+ default: unreachable("unknown atomic op");
+ }
+}
+
+static uint32_t
+v3d_image_load_store_tmu_op(nir_intrinsic_instr *instr)
{
switch (instr->intrinsic) {
case nir_intrinsic_image_load:
case nir_intrinsic_image_store:
return V3D_TMU_OP_REGULAR;
- case nir_intrinsic_image_atomic_add:
- return v3d_get_op_for_atomic_add(instr, 3);
- case nir_intrinsic_image_atomic_imin:
- return V3D_TMU_OP_WRITE_SMIN;
- case nir_intrinsic_image_atomic_umin:
- return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR;
- case nir_intrinsic_image_atomic_imax:
- return V3D_TMU_OP_WRITE_SMAX;
- case nir_intrinsic_image_atomic_umax:
- return V3D_TMU_OP_WRITE_UMAX;
- case nir_intrinsic_image_atomic_and:
- return V3D_TMU_OP_WRITE_AND_READ_INC;
- case nir_intrinsic_image_atomic_or:
- return V3D_TMU_OP_WRITE_OR_READ_DEC;
- case nir_intrinsic_image_atomic_xor:
- return V3D_TMU_OP_WRITE_XOR_READ_NOT;
- case nir_intrinsic_image_atomic_exchange:
- return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH;
- case nir_intrinsic_image_atomic_comp_swap:
- return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH;
+
+ case nir_intrinsic_image_atomic:
+ case nir_intrinsic_image_atomic_swap:
+ return v3d_image_atomic_tmu_op(instr);
+
default:
unreachable("unknown image intrinsic");
};
@@ -427,7 +451,7 @@ v3d40_image_load_store_tmu_op(nir_intrinsic_instr *instr)
* which is why we always call ntq_get_src() even if we are only interested in
* register write counts.
*/
-static void
+static struct qinst *
vir_image_emit_register_writes(struct v3d_compile *c,
nir_intrinsic_instr *instr,
bool atomic_add_replaced,
@@ -480,7 +504,8 @@ vir_image_emit_register_writes(struct v3d_compile *c,
}
/* Second atomic argument */
- if (instr->intrinsic == nir_intrinsic_image_atomic_comp_swap) {
+ if (instr->intrinsic == nir_intrinsic_image_atomic_swap &&
+ nir_intrinsic_atomic_op(instr) == nir_atomic_op_cmpxchg) {
struct qreg src_4_0 = ntq_get_src(c, instr->src[4], 0);
vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUD, src_4_0,
tmu_writes);
@@ -494,7 +519,8 @@ vir_image_emit_register_writes(struct v3d_compile *c,
V3D_QPU_PF_PUSHZ);
}
- vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUSF, src_1_0, tmu_writes);
+ struct qinst *retiring =
+ vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUSF, src_1_0, tmu_writes);
if (!tmu_writes && vir_in_nonuniform_control_flow(c) &&
instr->intrinsic != nir_intrinsic_image_load) {
@@ -502,6 +528,8 @@ vir_image_emit_register_writes(struct v3d_compile *c,
(struct qinst *)c->cur_block->instructions.prev;
vir_set_cond(last_inst, V3D_QPU_COND_IFA);
}
+
+ return retiring;
}
static unsigned
@@ -516,21 +544,21 @@ get_required_image_tmu_writes(struct v3d_compile *c,
}
void
-v3d40_vir_emit_image_load_store(struct v3d_compile *c,
- nir_intrinsic_instr *instr)
+v3d_vir_emit_image_load_store(struct v3d_compile *c,
+ nir_intrinsic_instr *instr)
{
unsigned format = nir_intrinsic_format(instr);
unsigned unit = nir_src_as_uint(instr->src[0]);
- struct V3D41_TMU_CONFIG_PARAMETER_0 p0_unpacked = {
+ struct V3D42_TMU_CONFIG_PARAMETER_0 p0_unpacked = {
};
- struct V3D41_TMU_CONFIG_PARAMETER_1 p1_unpacked = {
+ struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked = {
.per_pixel_mask_enable = true,
.output_type_32_bit = v3d_gl_format_is_return_32(format),
};
- struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked = { 0 };
+ struct V3D42_TMU_CONFIG_PARAMETER_2 p2_unpacked = { 0 };
/* Limit the number of channels returned to both how many the NIR
* instruction writes and how many the instruction could produce.
@@ -542,19 +570,20 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c,
p0_unpacked.return_words_of_texture_data =
(1 << instr_return_channels) - 1;
- p2_unpacked.op = v3d40_image_load_store_tmu_op(instr);
+ p2_unpacked.op = v3d_image_load_store_tmu_op(instr);
/* If we were able to replace atomic_add with an inc/dec, then we
* need/can do things slightly differently, like not loading the
* amount to add/sub, as that is implicit.
*/
bool atomic_add_replaced =
- (instr->intrinsic == nir_intrinsic_image_atomic_add &&
- (p2_unpacked.op == V3D_TMU_OP_WRITE_AND_READ_INC ||
- p2_unpacked.op == V3D_TMU_OP_WRITE_OR_READ_DEC));
+ instr->intrinsic == nir_intrinsic_image_atomic &&
+ nir_intrinsic_atomic_op(instr) == nir_atomic_op_iadd &&
+ (p2_unpacked.op == V3D_TMU_OP_WRITE_AND_READ_INC ||
+ p2_unpacked.op == V3D_TMU_OP_WRITE_OR_READ_DEC);
uint32_t p0_packed;
- V3D41_TMU_CONFIG_PARAMETER_0_pack(NULL,
+ V3D42_TMU_CONFIG_PARAMETER_0_pack(NULL,
(uint8_t *)&p0_packed,
&p0_unpacked);
@@ -565,12 +594,12 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c,
p0_packed |= unit << 24;
uint32_t p1_packed;
- V3D41_TMU_CONFIG_PARAMETER_1_pack(NULL,
+ V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL,
(uint8_t *)&p1_packed,
&p1_unpacked);
uint32_t p2_packed;
- V3D41_TMU_CONFIG_PARAMETER_2_pack(NULL,
+ V3D42_TMU_CONFIG_PARAMETER_2_pack(NULL,
(uint8_t *)&p2_packed,
&p2_unpacked);
@@ -599,8 +628,9 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c,
if (memcmp(&p2_unpacked, &p2_unpacked_default, sizeof(p2_unpacked)))
vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);
- vir_image_emit_register_writes(c, instr, atomic_add_replaced, NULL);
-
- ntq_add_pending_tmu_flush(c, &instr->dest,
+ struct qinst *retiring =
+ vir_image_emit_register_writes(c, instr, atomic_add_replaced, NULL);
+ retiring->ldtmu_count = p0_unpacked.return_words_of_texture_data;
+ ntq_add_pending_tmu_flush(c, &instr->def,
p0_unpacked.return_words_of_texture_data);
}
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
index 27869a35a3b..c59a8aac434 100644
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -23,7 +23,6 @@
#include "broadcom/common/v3d_device_info.h"
#include "v3d_compiler.h"
-#include "util/u_prim.h"
#include "compiler/nir/nir_schedule.h"
#include "compiler/nir/nir_builder.h"
@@ -89,7 +88,7 @@ vir_has_side_effects(struct v3d_compile *c, struct qinst *inst)
* pointer, so each read has a side effect (we don't care for ldunif
* because we reconstruct the uniform stream buffer after compiling
* with the surviving uniforms), so allowing DCE to remove
- * one would break follow-up loads. We could fix this by emiting a
+ * one would break follow-up loads. We could fix this by emitting a
* unifa for each ldunifa, but each unifa requires 3 delay slots
* before a ldunifa, so that would be quite expensive.
*/
@@ -113,10 +112,10 @@ vir_is_raw_mov(struct qinst *inst)
return false;
}
- if (inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE ||
- inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE ||
- inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE ||
- inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) {
+ if (inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE ||
+ inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE ||
+ inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
+ inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) {
return false;
}
@@ -156,30 +155,12 @@ vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst)
}
bool
-vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst)
+vir_writes_r4_implicitly(const struct v3d_device_info *devinfo,
+ struct qinst *inst)
{
- for (int i = 0; i < vir_get_nsrc(inst); i++) {
- switch (inst->src[i].file) {
- case QFILE_VPM:
- return true;
- default:
- break;
- }
- }
-
- if (devinfo->ver < 41 && (inst->qpu.sig.ldvary ||
- inst->qpu.sig.ldtlb ||
- inst->qpu.sig.ldtlbu ||
- inst->qpu.sig.ldvpm)) {
- return true;
- }
-
- return false;
-}
+ if (!devinfo->has_accumulators)
+ return false;
-bool
-vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst)
-{
switch (inst->dst.file) {
case QFILE_MAGIC:
switch (inst->dst.index) {
@@ -195,9 +176,6 @@ vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst)
break;
}
- if (devinfo->ver < 41 && inst->qpu.sig.ldtmu)
- return true;
-
return false;
}
@@ -209,15 +187,15 @@ vir_set_unpack(struct qinst *inst, int src,
if (vir_is_add(inst)) {
if (src == 0)
- inst->qpu.alu.add.a_unpack = unpack;
+ inst->qpu.alu.add.a.unpack = unpack;
else
- inst->qpu.alu.add.b_unpack = unpack;
+ inst->qpu.alu.add.b.unpack = unpack;
} else {
assert(vir_is_mul(inst));
if (src == 0)
- inst->qpu.alu.mul.a_unpack = unpack;
+ inst->qpu.alu.mul.a.unpack = unpack;
else
- inst->qpu.alu.mul.b_unpack = unpack;
+ inst->qpu.alu.mul.b.unpack = unpack;
}
}
@@ -369,6 +347,8 @@ vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst, struct qreg src0, struct q
inst->src[1] = src1;
inst->uniform = ~0;
+ inst->ip = -1;
+
return inst;
}
@@ -385,6 +365,8 @@ vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct q
inst->src[1] = src1;
inst->uniform = ~0;
+ inst->ip = -1;
+
return inst;
}
@@ -404,12 +386,16 @@ vir_branch_inst(struct v3d_compile *c, enum v3d_qpu_branch_cond cond)
inst->dst = vir_nop_reg();
inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, 0);
+ inst->ip = -1;
+
return inst;
}
static void
vir_emit(struct v3d_compile *c, struct qinst *inst)
{
+ inst->ip = -1;
+
switch (c->cursor.mode) {
case vir_cursor_add:
list_add(&inst->link, c->cursor.link);
@@ -509,13 +495,15 @@ vir_link_blocks(struct qblock *predecessor, struct qblock *successor)
}
const struct v3d_compiler *
-v3d_compiler_init(const struct v3d_device_info *devinfo)
+v3d_compiler_init(const struct v3d_device_info *devinfo,
+ uint32_t max_inline_uniform_buffers)
{
struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler);
if (!compiler)
return NULL;
compiler->devinfo = devinfo;
+ compiler->max_inline_uniform_buffers = max_inline_uniform_buffers;
if (!vir_init_reg_sets(compiler)) {
ralloc_free(compiler);
@@ -531,6 +519,19 @@ v3d_compiler_free(const struct v3d_compiler *compiler)
ralloc_free((void *)compiler);
}
+struct v3d_compiler_strategy {
+ const char *name;
+ uint32_t max_threads;
+ uint32_t min_threads;
+ bool disable_general_tmu_sched;
+ bool disable_gcm;
+ bool disable_loop_unrolling;
+ bool disable_ubo_load_sorting;
+ bool move_buffer_loads;
+ bool disable_tmu_pipelining;
+ uint32_t max_tmu_spills;
+};
+
static struct v3d_compile *
vir_compile_init(const struct v3d_compiler *compiler,
struct v3d_key *key,
@@ -539,12 +540,8 @@ vir_compile_init(const struct v3d_compiler *compiler,
void *debug_output_data),
void *debug_output_data,
int program_id, int variant_id,
- uint32_t max_threads,
- uint32_t min_threads_for_reg_alloc,
- bool tmu_spilling_allowed,
- bool disable_loop_unrolling,
- bool disable_constant_ubo_load_sorting,
- bool disable_tmu_pipelining,
+ uint32_t compile_strategy_idx,
+ const struct v3d_compiler_strategy *strategy,
bool fallback_scheduler)
{
struct v3d_compile *c = rzalloc(NULL, struct v3d_compile);
@@ -554,17 +551,22 @@ vir_compile_init(const struct v3d_compiler *compiler,
c->key = key;
c->program_id = program_id;
c->variant_id = variant_id;
- c->threads = max_threads;
+ c->compile_strategy_idx = compile_strategy_idx;
+ c->threads = strategy->max_threads;
c->debug_output = debug_output;
c->debug_output_data = debug_output_data;
c->compilation_result = V3D_COMPILATION_SUCCEEDED;
- c->min_threads_for_reg_alloc = min_threads_for_reg_alloc;
- c->tmu_spilling_allowed = tmu_spilling_allowed;
+ c->min_threads_for_reg_alloc = strategy->min_threads;
+ c->max_tmu_spills = strategy->max_tmu_spills;
c->fallback_scheduler = fallback_scheduler;
- c->disable_tmu_pipelining = disable_tmu_pipelining;
- c->disable_constant_ubo_load_sorting = disable_constant_ubo_load_sorting;
- c->disable_loop_unrolling = V3D_DEBUG & V3D_DEBUG_NO_LOOP_UNROLL
- ? true : disable_loop_unrolling;
+ c->disable_general_tmu_sched = strategy->disable_general_tmu_sched;
+ c->disable_tmu_pipelining = strategy->disable_tmu_pipelining;
+ c->disable_constant_ubo_load_sorting = strategy->disable_ubo_load_sorting;
+ c->move_buffer_loads = strategy->move_buffer_loads;
+ c->disable_gcm = strategy->disable_gcm;
+ c->disable_loop_unrolling = V3D_DBG(NO_LOOP_UNROLL)
+ ? true : strategy->disable_loop_unrolling;
+
s = nir_shader_clone(c, s);
c->s = s;
@@ -590,17 +592,107 @@ type_size_vec4(const struct glsl_type *type, bool bindless)
return glsl_count_attribute_slots(type, false);
}
+static enum nir_lower_tex_packing
+lower_tex_packing_cb(const nir_tex_instr *tex, const void *data)
+{
+ struct v3d_compile *c = (struct v3d_compile *) data;
+
+ int sampler_index = nir_tex_instr_need_sampler(tex) ?
+ tex->sampler_index : tex->backend_flags;
+
+ assert(sampler_index < c->key->num_samplers_used);
+ return c->key->sampler[sampler_index].return_size == 16 ?
+ nir_lower_tex_packing_16 : nir_lower_tex_packing_none;
+}
+
+static bool
+v3d_nir_lower_null_pointers_cb(nir_builder *b,
+ nir_intrinsic_instr *intr,
+ void *_state)
+{
+ uint32_t buffer_src_idx;
+
+ switch (intr->intrinsic) {
+ case nir_intrinsic_load_ubo:
+ case nir_intrinsic_load_ssbo:
+ buffer_src_idx = 0;
+ break;
+ case nir_intrinsic_store_ssbo:
+ buffer_src_idx = 1;
+ break;
+ default:
+ return false;
+ }
+
+ /* If the index is constant we are good */
+ nir_src *src = &intr->src[buffer_src_idx];
+ if (nir_src_is_const(*src))
+ return false;
+
+ /* Otherwise, see if it comes from a bcsel including a null pointer */
+ if (src->ssa->parent_instr->type != nir_instr_type_alu)
+ return false;
+
+ nir_alu_instr *alu = nir_instr_as_alu(src->ssa->parent_instr);
+ if (alu->op != nir_op_bcsel)
+ return false;
+
+ /* A null pointer is specified using block index 0xffffffff */
+ int32_t null_src_idx = -1;
+ for (int i = 1; i < 3; i++) {
+ /* FIXME: since we are running this before optimization, maybe we
+ * also need to handle the case where there is a chain of bcsels
+ * that we would need to recurse into?
+ */
+ if (!nir_src_is_const(alu->src[i].src))
+ continue;
+ if (nir_src_comp_as_uint(alu->src[i].src, 0) != 0xffffffff)
+ continue;
+
+ /* One of the bcsel srcs is a null pointer reference */
+ null_src_idx = i;
+ break;
+ }
+
+ if (null_src_idx < 0)
+ return false;
+
+ assert(null_src_idx == 1 || null_src_idx == 2);
+ int32_t copy_src_idx = null_src_idx == 1 ? 2 : 1;
+
+ /* Rewrite the null pointer reference so we use the same buffer index
+ * as the other bcsel branch. This will allow optimization to remove
+ * the bcsel and we should then end up with a constant buffer index
+ * like we need.
+ */
+ b->cursor = nir_before_instr(&alu->instr);
+ nir_def *copy = nir_mov(b, alu->src[copy_src_idx].src.ssa);
+ nir_src_rewrite(&alu->src[null_src_idx].src, copy);
+
+ return true;
+}
+
+static bool
+v3d_nir_lower_null_pointers(nir_shader *s)
+{
+ return nir_shader_intrinsics_pass(s, v3d_nir_lower_null_pointers_cb,
+ nir_metadata_block_index |
+ nir_metadata_dominance, NULL);
+}
+
static void
v3d_lower_nir(struct v3d_compile *c)
{
struct nir_lower_tex_options tex_options = {
.lower_txd = true,
+ .lower_tg4_offsets = true,
.lower_tg4_broadcom_swizzle = true,
.lower_rect = false, /* XXX: Use this on V3D 3.x */
.lower_txp = ~0,
/* Apply swizzles to all samplers. */
.swizzle_result = ~0,
+ .lower_invalid_implicit_lod = true,
};
/* Lower the format swizzle and (for 32-bit returns)
@@ -612,38 +704,35 @@ v3d_lower_nir(struct v3d_compile *c)
tex_options.swizzles[i][j] = c->key->tex[i].swizzle[j];
}
- assert(c->key->num_samplers_used <= ARRAY_SIZE(c->key->sampler));
- for (int i = 0; i < c->key->num_samplers_used; i++) {
- if (c->key->sampler[i].return_size == 16) {
- tex_options.lower_tex_packing[i] =
- nir_lower_tex_packing_16;
- }
- }
-
- /* CS textures may not have return_size reflecting the shadow state. */
- nir_foreach_uniform_variable(var, c->s) {
- const struct glsl_type *type = glsl_without_array(var->type);
- unsigned array_len = MAX2(glsl_get_length(var->type), 1);
+ tex_options.lower_tex_packing_cb = lower_tex_packing_cb;
+ tex_options.lower_tex_packing_data = c;
- if (!glsl_type_is_sampler(type) ||
- !glsl_sampler_type_is_shadow(type))
- continue;
+ NIR_PASS(_, c->s, nir_lower_tex, &tex_options);
+ NIR_PASS(_, c->s, nir_lower_system_values);
- for (int i = 0; i < array_len; i++) {
- tex_options.lower_tex_packing[var->data.binding + i] =
- nir_lower_tex_packing_16;
- }
+ if (c->s->info.zero_initialize_shared_memory &&
+ c->s->info.shared_size > 0) {
+ /* All our BOs allocate full pages, so the underlying allocation
+ * for shared memory will always be a multiple of 4KB. This
+ * ensures that we can do an exact number of full chunk_size
+ * writes to initialize the memory independently of the actual
+ * shared_size used by the shader, which is a requirement of
+ * the initialization pass.
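+                 *
+                 * For example, with chunk_size = 16 a shader using
+                 * shared_size = 100 bytes has its initialization size
+                 * aligned up to 112 bytes, i.e. exactly 7 chunk stores.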
+ */
+ const unsigned chunk_size = 16; /* max single store size */
+ NIR_PASS(_, c->s, nir_zero_initialize_shared_memory,
+ align(c->s->info.shared_size, chunk_size), chunk_size);
}
- NIR_PASS_V(c->s, nir_lower_tex, &tex_options);
- NIR_PASS_V(c->s, nir_lower_system_values);
- NIR_PASS_V(c->s, nir_lower_compute_system_values, NULL);
+ NIR_PASS(_, c->s, nir_lower_compute_system_values, NULL);
- NIR_PASS_V(c->s, nir_lower_vars_to_scratch,
- nir_var_function_temp,
- 0,
- glsl_get_natural_size_align_bytes);
- NIR_PASS_V(c->s, v3d_nir_lower_scratch);
+ NIR_PASS(_, c->s, nir_lower_vars_to_scratch,
+ nir_var_function_temp,
+ 0,
+ glsl_get_natural_size_align_bytes);
+ NIR_PASS(_, c->s, nir_lower_is_helper_invocation);
+ NIR_PASS(_, c->s, v3d_nir_lower_scratch);
+ NIR_PASS(_, c->s, v3d_nir_lower_null_pointers);
}
static void
@@ -711,6 +800,10 @@ v3d_vs_set_prog_data(struct v3d_compile *c,
/* Set us up for shared input/output segments. This is apparently
* necessary for our VCM setup to avoid varying corruption.
+ *
+ * FIXME: initial testing on V3D 7.1 seems to work fine when using
+         * separate segments, so we could reevaluate this in the future if
+         * there is any advantage to using separate segments.
*/
prog_data->separate_segments = false;
prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size,
@@ -807,13 +900,14 @@ v3d_fs_set_prog_data(struct v3d_compile *c,
{
v3d_set_fs_prog_data_inputs(c, prog_data);
prog_data->writes_z = c->writes_z;
+ prog_data->writes_z_from_fep = c->writes_z_from_fep;
prog_data->disable_ez = !c->s->info.fs.early_fragment_tests;
prog_data->uses_center_w = c->uses_center_w;
prog_data->uses_implicit_point_line_varyings =
c->uses_implicit_point_line_varyings;
prog_data->lock_scoreboard_on_first_thrsw =
c->lock_scoreboard_on_first_thrsw;
- prog_data->force_per_sample_msaa = c->force_per_sample_msaa;
+ prog_data->force_per_sample_msaa = c->s->info.fs.uses_sample_shading;
prog_data->uses_pid = c->fs_uses_primitive_id;
}
@@ -837,8 +931,14 @@ v3d_set_prog_data(struct v3d_compile *c,
prog_data->threads = c->threads;
prog_data->single_seg = !c->last_thrsw;
prog_data->spill_size = c->spill_size;
+ prog_data->tmu_spills = c->spills;
+ prog_data->tmu_fills = c->fills;
+ prog_data->tmu_count = c->tmu.total_count;
+ prog_data->qpu_read_stalls = c->qpu_inst_stalled_count;
+ prog_data->compile_strategy_idx = c->compile_strategy_idx;
prog_data->tmu_dirty_rcl = c->tmu_dirty_rcl;
prog_data->has_control_barrier = c->s->info.uses_control_barrier;
+ prog_data->has_global_address = c->has_global_address;
v3d_set_prog_data_uniforms(c, prog_data);
@@ -882,32 +982,32 @@ v3d_nir_lower_vs_early(struct v3d_compile *c)
/* Split our I/O vars and dead code eliminate the unused
* components.
*/
- NIR_PASS_V(c->s, nir_lower_io_to_scalar_early,
- nir_var_shader_in | nir_var_shader_out);
+ NIR_PASS(_, c->s, nir_lower_io_to_scalar_early,
+ nir_var_shader_in | nir_var_shader_out);
uint64_t used_outputs[4] = {0};
for (int i = 0; i < c->vs_key->num_used_outputs; i++) {
int slot = v3d_slot_get_slot(c->vs_key->used_outputs[i]);
int comp = v3d_slot_get_component(c->vs_key->used_outputs[i]);
used_outputs[comp] |= 1ull << slot;
}
- NIR_PASS_V(c->s, nir_remove_unused_io_vars,
- nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
- NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
+ NIR_PASS(_, c->s, nir_remove_unused_io_vars,
+ nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
+ NIR_PASS(_, c->s, nir_lower_global_vars_to_local);
v3d_optimize_nir(c, c->s);
- NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);
+ NIR_PASS(_, c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);
/* This must go before nir_lower_io */
if (c->vs_key->per_vertex_point_size)
- NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f);
+ NIR_PASS(_, c->s, nir_lower_point_size, 1.0f, 0.0f);
- NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
- type_size_vec4,
- (nir_lower_io_options)0);
+ NIR_PASS(_, c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
+ type_size_vec4,
+ (nir_lower_io_options)0);
/* clean up nir_lower_io's deref_var remains and do a constant folding pass
* on the code it generated.
*/
- NIR_PASS_V(c->s, nir_opt_dce);
- NIR_PASS_V(c->s, nir_opt_constant_folding);
+ NIR_PASS(_, c->s, nir_opt_dce);
+ NIR_PASS(_, c->s, nir_opt_constant_folding);
}
static void
@@ -916,29 +1016,32 @@ v3d_nir_lower_gs_early(struct v3d_compile *c)
/* Split our I/O vars and dead code eliminate the unused
* components.
*/
- NIR_PASS_V(c->s, nir_lower_io_to_scalar_early,
- nir_var_shader_in | nir_var_shader_out);
+ NIR_PASS(_, c->s, nir_lower_io_to_scalar_early,
+ nir_var_shader_in | nir_var_shader_out);
uint64_t used_outputs[4] = {0};
for (int i = 0; i < c->gs_key->num_used_outputs; i++) {
int slot = v3d_slot_get_slot(c->gs_key->used_outputs[i]);
int comp = v3d_slot_get_component(c->gs_key->used_outputs[i]);
used_outputs[comp] |= 1ull << slot;
}
- NIR_PASS_V(c->s, nir_remove_unused_io_vars,
- nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
- NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
+ NIR_PASS(_, c->s, nir_remove_unused_io_vars,
+ nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
+ NIR_PASS(_, c->s, nir_lower_global_vars_to_local);
v3d_optimize_nir(c, c->s);
- NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);
+ NIR_PASS(_, c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);
/* This must go before nir_lower_io */
if (c->gs_key->per_vertex_point_size)
- NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f);
+ NIR_PASS(_, c->s, nir_lower_point_size, 1.0f, 0.0f);
- NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
- type_size_vec4,
- (nir_lower_io_options)0);
- /* clean up nir_lower_io's deref_var remains */
- NIR_PASS_V(c->s, nir_opt_dce);
+ NIR_PASS(_, c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
+ type_size_vec4,
+ (nir_lower_io_options)0);
+ /* clean up nir_lower_io's deref_var remains and do a constant folding pass
+ * on the code it generated.
+ */
+ NIR_PASS(_, c->s, nir_opt_dce);
+ NIR_PASS(_, c->s, nir_opt_constant_folding);
}
static void
@@ -977,11 +1080,11 @@ v3d_nir_lower_fs_early(struct v3d_compile *c)
if (c->fs_key->int_color_rb || c->fs_key->uint_color_rb)
v3d_fixup_fs_output_types(c);
- NIR_PASS_V(c->s, v3d_nir_lower_logic_ops, c);
+ NIR_PASS(_, c->s, v3d_nir_lower_logic_ops, c);
if (c->fs_key->line_smoothing) {
- v3d_nir_lower_line_smooth(c->s);
- NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
+ NIR_PASS(_, c->s, v3d_nir_lower_line_smooth);
+ NIR_PASS(_, c->s, nir_lower_global_vars_to_local);
/* The lowering pass can introduce new sysval reads */
nir_shader_gather_info(c->s, nir_shader_get_entrypoint(c->s));
}
@@ -991,26 +1094,26 @@ static void
v3d_nir_lower_gs_late(struct v3d_compile *c)
{
if (c->key->ucp_enables) {
- NIR_PASS_V(c->s, nir_lower_clip_gs, c->key->ucp_enables,
- false, NULL);
+ NIR_PASS(_, c->s, nir_lower_clip_gs, c->key->ucp_enables,
+ true, NULL);
}
/* Note: GS output scalarizing must happen after nir_lower_clip_gs. */
- NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out);
+ NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
}
static void
v3d_nir_lower_vs_late(struct v3d_compile *c)
{
if (c->key->ucp_enables) {
- NIR_PASS_V(c->s, nir_lower_clip_vs, c->key->ucp_enables,
- false, false, NULL);
+ NIR_PASS(_, c->s, nir_lower_clip_vs, c->key->ucp_enables,
+ false, true, NULL);
NIR_PASS_V(c->s, nir_lower_io_to_scalar,
- nir_var_shader_out);
+ nir_var_shader_out, NULL, NULL);
}
/* Note: VS output scalarizing must happen after nir_lower_clip_vs. */
- NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out);
+ NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
}
static void
@@ -1024,9 +1127,9 @@ v3d_nir_lower_fs_late(struct v3d_compile *c)
* are using.
*/
if (c->key->ucp_enables)
- NIR_PASS_V(c->s, nir_lower_clip_fs, c->key->ucp_enables, true);
+ NIR_PASS(_, c->s, nir_lower_clip_fs, c->key->ucp_enables, true);
- NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in);
+ NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in, NULL, NULL);
}
static uint32_t
@@ -1107,6 +1210,69 @@ v3d_intrinsic_dependency_cb(nir_intrinsic_instr *intr,
return false;
}
+static unsigned
+v3d_instr_delay_cb(nir_instr *instr, void *data)
+{
+ struct v3d_compile *c = (struct v3d_compile *) data;
+
+ switch (instr->type) {
+ case nir_instr_type_undef:
+ case nir_instr_type_load_const:
+ case nir_instr_type_alu:
+ case nir_instr_type_deref:
+ case nir_instr_type_jump:
+ case nir_instr_type_parallel_copy:
+ case nir_instr_type_call:
+ case nir_instr_type_phi:
+ return 1;
+
+ /* We should not use very large delays for TMU instructions. Typically,
+ * thread switches will be sufficient to hide all or most of the latency,
+ * so we typically only need a little bit of extra room. If we over-estimate
+ * the latency here we may end up unnecessarily delaying the critical path in
+         * the shader, which would have a negative effect on performance, so here
+ * we are trying to strike a balance based on empirical testing.
+ */
+ case nir_instr_type_intrinsic: {
+ nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+ if (!c->disable_general_tmu_sched) {
+ switch (intr->intrinsic) {
+ case nir_intrinsic_decl_reg:
+ case nir_intrinsic_load_reg:
+ case nir_intrinsic_store_reg:
+ return 0;
+ case nir_intrinsic_load_ssbo:
+ case nir_intrinsic_load_scratch:
+ case nir_intrinsic_load_shared:
+ case nir_intrinsic_image_load:
+ return 3;
+ case nir_intrinsic_load_ubo:
+ if (nir_src_is_divergent(intr->src[1]))
+ return 3;
+ FALLTHROUGH;
+ default:
+ return 1;
+ }
+ } else {
+ switch (intr->intrinsic) {
+ case nir_intrinsic_decl_reg:
+ case nir_intrinsic_load_reg:
+ case nir_intrinsic_store_reg:
+ return 0;
+ default:
+ return 1;
+ }
+ }
+ break;
+ }
+
+ case nir_instr_type_tex:
+ return 5;
+ }
+
+ return 0;
+}
+
static bool
should_split_wrmask(const nir_instr *instr, const void *data)
{
@@ -1197,7 +1363,7 @@ v3d_nir_sort_constant_ubo_load(nir_block *block, nir_intrinsic_instr *ref)
* reference offset, since otherwise we would not be able to
* skip the unifa write for them. See ntq_emit_load_ubo_unifa.
*/
- if (abs(ref_offset - offset) > MAX_UNIFA_SKIP_DISTANCE)
+ if (abs((int)(ref_offset - offset)) > MAX_UNIFA_SKIP_DISTANCE)
continue;
/* We will move this load if its offset is smaller than ref's
@@ -1349,16 +1515,14 @@ v3d_nir_sort_constant_ubo_loads_block(struct v3d_compile *c,
static bool
v3d_nir_sort_constant_ubo_loads(nir_shader *s, struct v3d_compile *c)
{
- nir_foreach_function(function, s) {
- if (function->impl) {
- nir_foreach_block(block, function->impl) {
- c->sorted_any_ubo_loads |=
- v3d_nir_sort_constant_ubo_loads_block(c, block);
- }
- nir_metadata_preserve(function->impl,
- nir_metadata_block_index |
- nir_metadata_dominance);
+ nir_foreach_function_impl(impl, s) {
+ nir_foreach_block(block, impl) {
+ c->sorted_any_ubo_loads |=
+ v3d_nir_sort_constant_ubo_loads_block(c, block);
}
+ nir_metadata_preserve(impl,
+ nir_metadata_block_index |
+ nir_metadata_dominance);
}
return c->sorted_any_ubo_loads;
}
@@ -1376,8 +1540,8 @@ lower_load_num_subgroups(struct v3d_compile *c,
DIV_ROUND_UP(c->s->info.workgroup_size[0] *
c->s->info.workgroup_size[1] *
c->s->info.workgroup_size[2], V3D_CHANNELS);
- nir_ssa_def *result = nir_imm_int(b, num_subgroups);
- nir_ssa_def_rewrite_uses(&intr->dest.ssa, result);
+ nir_def *result = nir_imm_int(b, num_subgroups);
+ nir_def_rewrite_uses(&intr->def, result);
nir_instr_remove(&intr->instr);
}
@@ -1404,6 +1568,36 @@ lower_subgroup_intrinsics(struct v3d_compile *c,
case nir_intrinsic_load_subgroup_size:
case nir_intrinsic_load_subgroup_invocation:
case nir_intrinsic_elect:
+ case nir_intrinsic_ballot:
+ case nir_intrinsic_inverse_ballot:
+ case nir_intrinsic_ballot_bitfield_extract:
+ case nir_intrinsic_ballot_bit_count_reduce:
+ case nir_intrinsic_ballot_find_lsb:
+ case nir_intrinsic_ballot_find_msb:
+ case nir_intrinsic_ballot_bit_count_exclusive:
+ case nir_intrinsic_ballot_bit_count_inclusive:
+ case nir_intrinsic_reduce:
+ case nir_intrinsic_inclusive_scan:
+ case nir_intrinsic_exclusive_scan:
+ case nir_intrinsic_read_invocation:
+ case nir_intrinsic_read_first_invocation:
+ case nir_intrinsic_load_subgroup_eq_mask:
+ case nir_intrinsic_load_subgroup_ge_mask:
+ case nir_intrinsic_load_subgroup_gt_mask:
+ case nir_intrinsic_load_subgroup_le_mask:
+ case nir_intrinsic_load_subgroup_lt_mask:
+ case nir_intrinsic_shuffle:
+ case nir_intrinsic_shuffle_xor:
+ case nir_intrinsic_shuffle_up:
+ case nir_intrinsic_shuffle_down:
+ case nir_intrinsic_vote_all:
+ case nir_intrinsic_vote_any:
+ case nir_intrinsic_vote_feq:
+ case nir_intrinsic_vote_ieq:
+ case nir_intrinsic_quad_broadcast:
+ case nir_intrinsic_quad_swap_horizontal:
+ case nir_intrinsic_quad_swap_vertical:
+ case nir_intrinsic_quad_swap_diagonal:
c->has_subgroups = true;
break;
default:
@@ -1418,18 +1612,15 @@ static bool
v3d_nir_lower_subgroup_intrinsics(nir_shader *s, struct v3d_compile *c)
{
bool progress = false;
- nir_foreach_function(function, s) {
- if (function->impl) {
- nir_builder b;
- nir_builder_init(&b, function->impl);
+ nir_foreach_function_impl(impl, s) {
+ nir_builder b = nir_builder_create(impl);
- nir_foreach_block(block, function->impl)
- progress |= lower_subgroup_intrinsics(c, block, &b);
+ nir_foreach_block(block, impl)
+ progress |= lower_subgroup_intrinsics(c, block, &b);
- nir_metadata_preserve(function->impl,
- nir_metadata_block_index |
- nir_metadata_dominance);
- }
+ nir_metadata_preserve(impl,
+ nir_metadata_block_index |
+ nir_metadata_dominance);
}
return progress;
}
@@ -1483,30 +1674,54 @@ v3d_attempt_compile(struct v3d_compile *c)
break;
}
- NIR_PASS_V(c->s, v3d_nir_lower_io, c);
- NIR_PASS_V(c->s, v3d_nir_lower_txf_ms, c);
- NIR_PASS_V(c->s, v3d_nir_lower_image_load_store);
+ NIR_PASS(_, c->s, v3d_nir_lower_io, c);
+ NIR_PASS(_, c->s, v3d_nir_lower_txf_ms);
+ NIR_PASS(_, c->s, v3d_nir_lower_image_load_store, c);
+
+ NIR_PASS(_, c->s, nir_opt_idiv_const, 8);
nir_lower_idiv_options idiv_options = {
- .imprecise_32bit_lowering = true,
.allow_fp16 = true,
};
- NIR_PASS_V(c->s, nir_lower_idiv, &idiv_options);
-
- if (c->key->robust_buffer_access) {
- /* v3d_nir_lower_robust_buffer_access assumes constant buffer
- * indices on ubo/ssbo intrinsics so run copy propagation and
- * constant folding passes before we run the lowering to warrant
- * this. We also want to run the lowering before v3d_optimize to
- * clean-up redundant get_buffer_size calls produced in the pass.
- */
- NIR_PASS_V(c->s, nir_copy_prop);
- NIR_PASS_V(c->s, nir_opt_constant_folding);
- NIR_PASS_V(c->s, v3d_nir_lower_robust_buffer_access, c);
+ NIR_PASS(_, c->s, nir_lower_idiv, &idiv_options);
+ NIR_PASS(_, c->s, nir_lower_alu);
+
+ if (c->key->robust_uniform_access || c->key->robust_storage_access ||
+ c->key->robust_image_access) {
+ /* nir_lower_robust_access assumes constant buffer
+ * indices on ubo/ssbo intrinsics so run copy propagation and
+ * constant folding passes before we run the lowering to warrant
+ * this. We also want to run the lowering before v3d_optimize to
+ * clean-up redundant get_buffer_size calls produced in the pass.
+ */
+ NIR_PASS(_, c->s, nir_copy_prop);
+ NIR_PASS(_, c->s, nir_opt_constant_folding);
+
+ nir_lower_robust_access_options opts = {
+ .lower_image = c->key->robust_image_access,
+ .lower_ssbo = c->key->robust_storage_access,
+ .lower_ubo = c->key->robust_uniform_access,
+ };
+
+ NIR_PASS(_, c->s, nir_lower_robust_access, &opts);
}
- NIR_PASS_V(c->s, nir_lower_wrmasks, should_split_wrmask, c->s);
+ NIR_PASS(_, c->s, nir_lower_wrmasks, should_split_wrmask, c->s);
- NIR_PASS_V(c->s, v3d_nir_lower_subgroup_intrinsics, c);
+ NIR_PASS(_, c->s, v3d_nir_lower_load_store_bitsize);
+
+ NIR_PASS(_, c->s, v3d_nir_lower_subgroup_intrinsics, c);
+
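+        /* Subgroups on V3D are V3D_CHANNELS invocations wide, so a ballot
+         * always fits in a single 32-bit component; quad and relative
+         * shuffle operations are lowered here rather than handled natively.
+         */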
+ const nir_lower_subgroups_options subgroup_opts = {
+ .subgroup_size = V3D_CHANNELS,
+ .ballot_components = 1,
+ .ballot_bit_size = 32,
+ .lower_to_scalar = true,
+ .lower_inverse_ballot = true,
+ .lower_subgroup_masks = true,
+ .lower_relative_shuffle = true,
+ .lower_quad = true,
+ };
+ NIR_PASS(_, c->s, nir_lower_subgroups, &subgroup_opts);
v3d_optimize_nir(c, c->s);
@@ -1519,25 +1734,25 @@ v3d_attempt_compile(struct v3d_compile *c)
while (more_late_algebraic) {
more_late_algebraic = false;
NIR_PASS(more_late_algebraic, c->s, nir_opt_algebraic_late);
- NIR_PASS_V(c->s, nir_opt_constant_folding);
- NIR_PASS_V(c->s, nir_copy_prop);
- NIR_PASS_V(c->s, nir_opt_dce);
- NIR_PASS_V(c->s, nir_opt_cse);
+ NIR_PASS(_, c->s, nir_opt_constant_folding);
+ NIR_PASS(_, c->s, nir_copy_prop);
+ NIR_PASS(_, c->s, nir_opt_dce);
+ NIR_PASS(_, c->s, nir_opt_cse);
}
- NIR_PASS_V(c->s, nir_lower_bool_to_int32);
- nir_convert_to_lcssa(c->s, true, true);
+ NIR_PASS(_, c->s, nir_lower_bool_to_int32);
+ NIR_PASS(_, c->s, nir_convert_to_lcssa, true, true);
NIR_PASS_V(c->s, nir_divergence_analysis);
- NIR_PASS_V(c->s, nir_convert_from_ssa, true);
+ NIR_PASS(_, c->s, nir_convert_from_ssa, true);
struct nir_schedule_options schedule_options = {
/* Schedule for about half our register space, to enable more
* shaders to hit 4 threads.
*/
- .threshold = 24,
+ .threshold = c->threads == 4 ? 24 : 48,
/* Vertex shaders share the same memory for inputs and outputs,
- * fragement and geometry shaders do not.
+ * fragment and geometry shaders do not.
*/
.stages_with_shared_io_memory =
(((1 << MESA_ALL_SHADER_STAGES) - 1) &
@@ -1548,11 +1763,22 @@ v3d_attempt_compile(struct v3d_compile *c)
.intrinsic_cb = v3d_intrinsic_dependency_cb,
.intrinsic_cb_data = c,
+
+ .instr_delay_cb = v3d_instr_delay_cb,
+ .instr_delay_cb_data = c,
};
NIR_PASS_V(c->s, nir_schedule, &schedule_options);
if (!c->disable_constant_ubo_load_sorting)
- NIR_PASS_V(c->s, v3d_nir_sort_constant_ubo_loads, c);
+ NIR_PASS(_, c->s, v3d_nir_sort_constant_ubo_loads, c);
+
+ const nir_move_options buffer_opts = c->move_buffer_loads ?
+ (nir_move_load_ubo | nir_move_load_ssbo) : 0;
+ NIR_PASS(_, c->s, nir_opt_move, nir_move_load_uniform |
+ nir_move_const_undef |
+ buffer_opts);
+
+ NIR_PASS_V(c->s, nir_trivialize_registers);
v3d_nir_to_vir(c);
}
@@ -1611,32 +1837,28 @@ int v3d_shaderdb_dump(struct v3d_compile *c,
* register allocation to any particular thread count). This is fine
* because v3d_nir_to_vir will cap this to the actual minimum.
*/
-struct v3d_compiler_strategy {
- const char *name;
- uint32_t max_threads;
- uint32_t min_threads;
- bool disable_loop_unrolling;
- bool disable_ubo_load_sorting;
- bool disable_tmu_pipelining;
- bool tmu_spilling_allowed;
-} static const strategies[] = {
- /*0*/ { "default", 4, 4, false, false, false, false },
- /*1*/ { "disable loop unrolling", 4, 4, true, false, false, false },
- /*2*/ { "disable UBO load sorting", 4, 4, true, true, false, false },
- /*3*/ { "disable TMU pipelining", 4, 4, true, true, true, false },
- /*4*/ { "lower thread count", 2, 1, false, false, false, false },
- /*5*/ { "disable loop unrolling (ltc)", 2, 1, true, false, false, false },
- /*6*/ { "disable UBO load sorting (ltc)", 2, 1, true, true, false, false },
- /*7*/ { "disable TMU pipelining (ltc)", 2, 1, true, true, true, true },
- /*8*/ { "fallback scheduler", 2, 1, true, true, true, true }
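+/* Strategy fields, in the order used by the initializers below: name,
+ * max_threads, min_threads, disable_general_tmu_sched, disable_gcm,
+ * disable_loop_unrolling, disable_ubo_load_sorting, move_buffer_loads,
+ * disable_tmu_pipelining, max_tmu_spills. The struct itself is assumed to
+ * now be defined in v3d_compiler.h.
+ */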
+static const struct v3d_compiler_strategy strategies[] = {
+ /*0*/ { "default", 4, 4, false, false, false, false, false, false, 0 },
+ /*1*/ { "disable general TMU sched", 4, 4, true, false, false, false, false, false, 0 },
+ /*2*/ { "disable gcm", 4, 4, true, true, false, false, false, false, 0 },
+ /*3*/ { "disable loop unrolling", 4, 4, true, true, true, false, false, false, 0 },
+ /*4*/ { "disable UBO load sorting", 4, 4, true, true, true, true, false, false, 0 },
+ /*5*/ { "disable TMU pipelining", 4, 4, true, true, true, true, false, true, 0 },
+ /*6*/ { "lower thread count", 2, 1, false, false, false, false, false, false, -1 },
+ /*7*/ { "disable general TMU sched (2t)", 2, 1, true, false, false, false, false, false, -1 },
+ /*8*/ { "disable gcm (2t)", 2, 1, true, true, false, false, false, false, -1 },
+ /*9*/ { "disable loop unrolling (2t)", 2, 1, true, true, true, false, false, false, -1 },
+ /*10*/ { "Move buffer loads (2t)", 2, 1, true, true, true, true, true, false, -1 },
+ /*11*/ { "disable TMU pipelining (2t)", 2, 1, true, true, true, true, true, true, -1 },
+ /*12*/ { "fallback scheduler", 2, 1, true, true, true, true, true, true, -1 }
};
/**
* If a particular optimization didn't make any progress during a compile
- * attempt disabling it alone won't allow us to compile the shader successfuly,
+ * attempt disabling it alone won't allow us to compile the shader successfully,
* since we'll end up with the same code. Detect these scenarios so we can
* avoid wasting time with useless compiles. We should also consider if the
- * strategy changes other aspects of the compilation process though, like
+ * strategy changes other aspects of the compilation process though, like
* spilling, and not skip it in that case.
*/
static bool
@@ -1649,31 +1871,55 @@ skip_compile_strategy(struct v3d_compile *c, uint32_t idx)
assert(idx > 0);
/* Don't skip a strategy that changes spilling behavior */
- if (strategies[idx].tmu_spilling_allowed !=
- strategies[idx - 1].tmu_spilling_allowed) {
+ if (strategies[idx].max_tmu_spills !=
+ strategies[idx - 1].max_tmu_spills) {
return false;
}
switch (idx) {
- /* Loop unrolling: skip if we didn't unroll any loops */
+ /* General TMU sched.: skip if we didn't emit any TMU loads */
case 1:
- case 5:
+ case 7:
+ return !c->has_general_tmu_load;
+ /* Global code motion: skip if nir_opt_gcm didn't make any progress */
+ case 2:
+ case 8:
+ return !c->gcm_progress;
+ /* Loop unrolling: skip if we didn't unroll any loops */
+ case 3:
+ case 9:
return !c->unrolled_any_loops;
/* UBO load sorting: skip if we didn't sort any loads */
- case 2:
- case 6:
+ case 4:
return !c->sorted_any_ubo_loads;
+ /* Move buffer loads: we assume any shader with difficult RA
+ * most likely has UBO / SSBO loads so we never try to skip.
+ * For now, we only try this for 2-thread compiles since it
+ * is expected to impact instruction counts and latency.
+ */
+ case 10:
+ assert(c->threads < 4);
+ return false;
/* TMU pipelining: skip if we didn't pipeline any TMU ops */
- case 3:
- case 7:
+ case 5:
+ case 11:
return !c->pipelined_any_tmu;
        /* Lower thread count: skip if we already tried fewer than 4 threads */
- case 4:
+ case 6:
return c->threads < 4;
default:
return false;
};
}
+
+static inline void
+set_best_compile(struct v3d_compile **best, struct v3d_compile *c)
+{
+ if (*best)
+ vir_compile_destroy(*best);
+ *best = c;
+}
+
uint64_t *v3d_compile(const struct v3d_compiler *compiler,
struct v3d_key *key,
struct v3d_prog_data **out_prog_data,
@@ -1685,58 +1931,106 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
uint32_t *final_assembly_size)
{
struct v3d_compile *c = NULL;
- for (int i = 0; i < ARRAY_SIZE(strategies); i++) {
+
+ uint32_t best_spill_fill_count = UINT32_MAX;
+ struct v3d_compile *best_c = NULL;
+ for (int32_t strat = 0; strat < ARRAY_SIZE(strategies); strat++) {
/* Fallback strategy */
- if (i > 0) {
+ if (strat > 0) {
assert(c);
- if (skip_compile_strategy(c, i))
+ if (skip_compile_strategy(c, strat))
continue;
char *debug_msg;
int ret = asprintf(&debug_msg,
- "Falling back to strategy '%s' for %s",
- strategies[i].name,
- vir_get_stage_name(c));
+ "Falling back to strategy '%s' "
+ "for %s prog %d/%d",
+ strategies[strat].name,
+ vir_get_stage_name(c),
+ c->program_id, c->variant_id);
if (ret >= 0) {
- if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF))
+ if (V3D_DBG(PERF))
fprintf(stderr, "%s\n", debug_msg);
c->debug_output(debug_msg, c->debug_output_data);
free(debug_msg);
}
- vir_compile_destroy(c);
+ if (c != best_c)
+ vir_compile_destroy(c);
}
c = vir_compile_init(compiler, key, s,
debug_output, debug_output_data,
program_id, variant_id,
- strategies[i].max_threads,
- strategies[i].min_threads,
- strategies[i].tmu_spilling_allowed,
- strategies[i].disable_loop_unrolling,
- strategies[i].disable_ubo_load_sorting,
- strategies[i].disable_tmu_pipelining,
- i == ARRAY_SIZE(strategies) - 1);
+ strat, &strategies[strat],
+ strat == ARRAY_SIZE(strategies) - 1);
v3d_attempt_compile(c);
- if (i >= ARRAY_SIZE(strategies) - 1 ||
- c->compilation_result !=
- V3D_COMPILATION_FAILED_REGISTER_ALLOCATION) {
+ /* Broken shader or driver bug */
+ if (c->compilation_result == V3D_COMPILATION_FAILED)
break;
+
+ /* If we compiled without spills, choose this.
+ * Otherwise if this is a 4-thread compile, choose this (these
+ * have a very low cap on the allowed TMU spills so we assume
+ * it will be better than a 2-thread compile without spills).
+ * Otherwise, keep going while tracking the strategy with the
+ * lowest spill count.
+ */
+ if (c->compilation_result == V3D_COMPILATION_SUCCEEDED) {
+ if (c->spills == 0 ||
+ strategies[strat].min_threads == 4 ||
+ V3D_DBG(OPT_COMPILE_TIME)) {
+ set_best_compile(&best_c, c);
+ break;
+ } else if (c->spills + c->fills <
+ best_spill_fill_count) {
+ set_best_compile(&best_c, c);
+ best_spill_fill_count = c->spills + c->fills;
+ }
+
+ if (V3D_DBG(PERF)) {
+ char *debug_msg;
+ int ret = asprintf(&debug_msg,
+ "Compiled %s prog %d/%d with %d "
+ "spills and %d fills. Will try "
+ "more strategies.",
+ vir_get_stage_name(c),
+ c->program_id, c->variant_id,
+ c->spills, c->fills);
+ if (ret >= 0) {
+ fprintf(stderr, "%s\n", debug_msg);
+ c->debug_output(debug_msg, c->debug_output_data);
+ free(debug_msg);
+ }
+ }
}
+
+        /* Only try the next strategy if we failed to register allocate
+ * or we had to spill.
+ */
+ assert(c->compilation_result ==
+ V3D_COMPILATION_FAILED_REGISTER_ALLOCATION ||
+ c->spills > 0);
}
- if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF) &&
+ /* If the best strategy was not the last, choose that */
+ if (best_c && c != best_c)
+ set_best_compile(&c, best_c);
+
+ if (V3D_DBG(PERF) &&
c->compilation_result !=
V3D_COMPILATION_FAILED_REGISTER_ALLOCATION &&
c->spills > 0) {
char *debug_msg;
int ret = asprintf(&debug_msg,
- "Compiled %s with %d spills and %d fills",
+ "Compiled %s prog %d/%d with %d "
+ "spills and %d fills",
vir_get_stage_name(c),
+ c->program_id, c->variant_id,
c->spills, c->fills);
fprintf(stderr, "%s\n", debug_msg);
@@ -1747,8 +2041,12 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
}
if (c->compilation_result != V3D_COMPILATION_SUCCEEDED) {
- fprintf(stderr, "Failed to compile %s with any strategy.\n",
- vir_get_stage_name(c));
+ fprintf(stderr, "Failed to compile %s prog %d/%d "
+ "with any strategy.\n",
+ vir_get_stage_name(c), c->program_id, c->variant_id);
+
+ vir_compile_destroy(c);
+ return NULL;
}
struct v3d_prog_data *prog_data;
@@ -1762,8 +2060,8 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
char *shaderdb;
int ret = v3d_shaderdb_dump(c, &shaderdb);
if (ret >= 0) {
- if (V3D_DEBUG & V3D_DEBUG_SHADERDB)
- fprintf(stderr, "SHADER-DB: %s\n", shaderdb);
+ if (V3D_DBG(SHADERDB))
+ fprintf(stderr, "SHADER-DB-%s - %s\n", s->info.name, shaderdb);
c->debug_output(shaderdb, c->debug_output_data);
free(shaderdb);
@@ -1872,8 +2170,11 @@ try_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif)
struct qinst *prev_inst = NULL;
assert(c->cur_block);
-#ifdef DEBUG
- /* Check if the current instruction is part of the current block */
+#if MESA_DEBUG
+ /* We can only reuse a uniform if it was emitted in the same block,
+ * so callers must make sure the current instruction is being emitted
+ * in the current block.
+ */
bool found = false;
vir_for_each_inst(inst, c->cur_block) {
if (&inst->link == c->cursor.link) {
@@ -1882,7 +2183,7 @@ try_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif)
}
}
- assert(found || list_is_empty(&c->cur_block->instructions));
+ assert(found || &c->cur_block->instructions == c->cursor.link);
#endif
list_for_each_entry_from_rev(struct qinst, inst, c->cursor.link->prev,
@@ -1900,6 +2201,12 @@ try_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif)
if (!prev_inst)
return false;
+ /* Only reuse the ldunif result if it was written to a temp register,
+ * otherwise there may be special restrictions (for example, ldunif
+ * may write directly to unifa, which is a write-only register).
+ */
+ if (prev_inst->dst.file != QFILE_TEMP)
+ return false;
list_for_each_entry_from(struct qinst, inst, prev_inst->link.next,
&c->cur_block->instructions, link) {
diff --git a/src/broadcom/compiler/vir_dump.c b/src/broadcom/compiler/vir_dump.c
index 5c47bbdc1b0..631eeee52ab 100644
--- a/src/broadcom/compiler/vir_dump.c
+++ b/src/broadcom/compiler/vir_dump.c
@@ -182,11 +182,6 @@ vir_print_reg(struct v3d_compile *c, const struct qinst *inst,
break;
}
- case QFILE_VPM:
- fprintf(stderr, "vpm%d.%d",
- reg.index / 4, reg.index % 4);
- break;
-
case QFILE_TEMP:
fprintf(stderr, "t%d", reg.index);
break;
@@ -197,9 +192,6 @@ static void
vir_dump_sig_addr(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *instr)
{
- if (devinfo->ver < 41)
- return;
-
if (!instr->sig_magic)
fprintf(stderr, ".rf%d", instr->sig_addr);
else {
@@ -270,8 +262,8 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst)
vir_print_reg(c, inst, inst->dst);
fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.add.output_pack));
- unpack[0] = instr->alu.add.a_unpack;
- unpack[1] = instr->alu.add.b_unpack;
+ unpack[0] = instr->alu.add.a.unpack;
+ unpack[1] = instr->alu.add.b.unpack;
} else {
fprintf(stderr, "%s", v3d_qpu_mul_op_name(instr->alu.mul.op));
fprintf(stderr, "%s", v3d_qpu_cond_name(instr->flags.mc));
@@ -282,8 +274,8 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst)
vir_print_reg(c, inst, inst->dst);
fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.mul.output_pack));
- unpack[0] = instr->alu.mul.a_unpack;
- unpack[1] = instr->alu.mul.b_unpack;
+ unpack[0] = instr->alu.mul.a.unpack;
+ unpack[1] = instr->alu.mul.b.unpack;
}
for (int i = 0; i < nsrc; i++) {
diff --git a/src/broadcom/compiler/vir_live_variables.c b/src/broadcom/compiler/vir_live_variables.c
index 2fd6430a0f4..d1f44aa9cf7 100644
--- a/src/broadcom/compiler/vir_live_variables.c
+++ b/src/broadcom/compiler/vir_live_variables.c
@@ -179,17 +179,22 @@ vir_setup_def_use(struct v3d_compile *c)
flags_inst = NULL;
}
- /* Payload registers: r0/1/2 contain W, centroid W,
- * and Z at program start. Register allocation will
- * force their nodes to R0/1/2.
+ /* Payload registers: for fragment shaders, W,
+ * centroid W, and Z will be initialized in r0/1/2
+ * until v42, or r1/r2/r3 since v71.
+ *
+ * For compute shaders, payload is in r0/r2 up to v42,
+ * r2/r3 since v71.
+ *
+ * Register allocation will force their nodes to those
+ * registers.
*/
if (inst->src[0].file == QFILE_REG) {
- switch (inst->src[0].index) {
- case 0:
- case 1:
- case 2:
+ uint32_t min_payload_r = c->devinfo->ver >= 71 ? 1 : 0;
+ uint32_t max_payload_r = c->devinfo->ver >= 71 ? 3 : 2;
+                        if (inst->src[0].index >= min_payload_r &&
+ inst->src[0].index <= max_payload_r) {
c->temp_start[inst->dst.index] = 0;
- break;
}
}
@@ -306,6 +311,8 @@ vir_calculate_live_intervals(struct v3d_compile *c)
vir_for_each_block(block, c) {
ralloc_free(block->def);
+ ralloc_free(block->defin);
+ ralloc_free(block->defout);
ralloc_free(block->use);
ralloc_free(block->live_in);
ralloc_free(block->live_out);
diff --git a/src/broadcom/compiler/vir_opt_constant_alu.c b/src/broadcom/compiler/vir_opt_constant_alu.c
index 483646f882e..dc4c8a65026 100644
--- a/src/broadcom/compiler/vir_opt_constant_alu.c
+++ b/src/broadcom/compiler/vir_opt_constant_alu.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2021 Raspberry Pi
+ * Copyright © 2021 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -155,6 +155,7 @@ vir_opt_constant_alu(struct v3d_compile *c)
{
bool progress = false;
vir_for_each_block(block, c) {
+ c->cur_block = block;
vir_for_each_inst_safe(inst, block) {
progress = try_opt_constant_alu(c, inst) || progress;
}
diff --git a/src/broadcom/compiler/vir_opt_copy_propagate.c b/src/broadcom/compiler/vir_opt_copy_propagate.c
index c5bb6112173..611c4693ed3 100644
--- a/src/broadcom/compiler/vir_opt_copy_propagate.c
+++ b/src/broadcom/compiler/vir_opt_copy_propagate.c
@@ -35,7 +35,7 @@
#include "v3d_compiler.h"
static bool
-is_copy_mov(struct qinst *inst)
+is_copy_mov(const struct v3d_device_info *devinfo, struct qinst *inst)
{
if (!inst)
return false;
@@ -62,36 +62,65 @@ is_copy_mov(struct qinst *inst)
return false;
}
- switch (inst->src[0].file) {
- case QFILE_MAGIC:
- /* No copy propagating from R3/R4/R5 -- the MOVs from those
- * are there to register allocate values produced into R3/4/5
- * to other regs (though hopefully r3/4/5).
- */
- switch (inst->src[0].index) {
- case V3D_QPU_WADDR_R3:
- case V3D_QPU_WADDR_R4:
- case V3D_QPU_WADDR_R5:
- return false;
+ if (devinfo->ver == 42) {
+ switch (inst->src[0].file) {
+ case QFILE_MAGIC:
+ /* No copy propagating from R3/R4/R5 -- the MOVs from
+ * those are there to register allocate values produced
+ * into R3/4/5 to other regs (though hopefully r3/4/5).
+ */
+ switch (inst->src[0].index) {
+ case V3D_QPU_WADDR_R3:
+ case V3D_QPU_WADDR_R4:
+ case V3D_QPU_WADDR_R5:
+ return false;
+ default:
+ break;
+ }
+ break;
+
+ case QFILE_REG:
+ switch (inst->src[0].index) {
+ case 0:
+ case 1:
+ case 2:
+ /* MOVs from rf0/1/2 are only to track the live
+ * intervals for W/centroid W/Z.
+ */
+ return false;
+ }
+ break;
+
default:
break;
}
- break;
-
- case QFILE_REG:
- switch (inst->src[0].index) {
- case 0:
- case 1:
- case 2:
- /* MOVs from rf0/1/2 are only to track the live
+ } else {
+ assert(devinfo->ver >= 71);
+ switch (inst->src[0].file) {
+ case QFILE_REG:
+ switch (inst->src[0].index) {
+ /* MOVs from rf1/2/3 are only to track the live
* intervals for W/centroid W/Z.
+ *
+ * Note: rf0 can be implicitly written by ldvary
+ * (no temp involved), so it is not an SSA value and
+ * could clash with writes to other temps that are
+ * also allocated to rf0. In theory, that would mean
+ * that we can't copy propagate from it, but we handle
+ * this at register allocation time, preventing temps
+ * from being allocated to rf0 while the rf0 value from
+ * ldvary is still live.
*/
- return false;
- }
- break;
+ case 1:
+ case 2:
+ case 3:
+ return false;
+ }
+ break;
- default:
- break;
+ default:
+ break;
+ }
}
return true;
@@ -104,14 +133,14 @@ vir_has_unpack(struct qinst *inst, int chan)
if (vir_is_add(inst)) {
if (chan == 0)
- return inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE;
+ return inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE;
else
- return inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE;
+ return inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE;
} else {
if (chan == 0)
- return inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE;
+ return inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE;
else
- return inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE;
+ return inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE;
}
}
@@ -135,7 +164,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs)
*/
struct qinst *mov = movs[inst->src[i].index];
if (!mov) {
- if (!is_copy_mov(c->defs[inst->src[i].index]))
+ if (!is_copy_mov(c->devinfo, c->defs[inst->src[i].index]))
continue;
mov = c->defs[inst->src[i].index];
@@ -161,7 +190,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs)
continue;
/* these ops can't represent abs. */
- if (mov->qpu.alu.mul.a_unpack == V3D_QPU_UNPACK_ABS) {
+ if (mov->qpu.alu.mul.a.unpack == V3D_QPU_UNPACK_ABS) {
switch (inst->qpu.alu.add.op) {
case V3D_QPU_A_VFPACK:
case V3D_QPU_A_FROUND:
@@ -189,7 +218,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs)
inst->src[i] = mov->src[0];
if (vir_has_unpack(mov, 0)) {
- enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a_unpack;
+ enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a.unpack;
vir_set_unpack(inst, i, unpack);
}
@@ -238,12 +267,14 @@ vir_opt_copy_propagate(struct v3d_compile *c)
*/
memset(movs, 0, sizeof(struct qinst *) * c->num_temps);
+ c->cur_block = block;
vir_for_each_inst(inst, block) {
+
progress = try_copy_prop(c, inst, movs) || progress;
apply_kills(c, movs, inst);
- if (is_copy_mov(inst))
+ if (is_copy_mov(c->devinfo, inst))
movs[inst->dst.index] = inst;
}
}
diff --git a/src/broadcom/compiler/vir_opt_dead_code.c b/src/broadcom/compiler/vir_opt_dead_code.c
index 64c762c88db..fd1af944427 100644
--- a/src/broadcom/compiler/vir_opt_dead_code.c
+++ b/src/broadcom/compiler/vir_opt_dead_code.c
@@ -52,21 +52,10 @@ dce(struct v3d_compile *c, struct qinst *inst)
}
static bool
-has_nonremovable_reads(struct v3d_compile *c, struct qinst *inst)
-{
- for (int i = 0; i < vir_get_nsrc(inst); i++) {
- if (inst->src[i].file == QFILE_VPM)
- return true;
- }
-
- return false;
-}
-
-static bool
can_write_to_null(struct v3d_compile *c, struct qinst *inst)
{
/* The SFU instructions must write to a physical register. */
- if (c->devinfo->ver >= 41 && v3d_qpu_uses_sfu(&inst->qpu))
+ if (v3d_qpu_uses_sfu(&inst->qpu))
return false;
return true;
@@ -149,30 +138,25 @@ check_first_ldunifa(struct v3d_compile *c,
}
static bool
-increment_unifa_address(struct v3d_compile *c, struct qblock *block, struct qinst *unifa)
+increment_unifa_address(struct v3d_compile *c, struct qinst *unifa)
{
- struct qblock *current_block = c->cur_block;
if (unifa->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
unifa->qpu.alu.mul.op == V3D_QPU_M_MOV) {
c->cursor = vir_after_inst(unifa);
- c->cur_block = block;
struct qreg unifa_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA);
vir_ADD_dest(c, unifa_reg, unifa->src[0], vir_uniform_ui(c, 4u));
vir_remove_instruction(c, unifa);
- c->cur_block = current_block;
return true;
}
if (unifa->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
unifa->qpu.alu.add.op == V3D_QPU_A_ADD) {
c->cursor = vir_after_inst(unifa);
- c->cur_block = block;
struct qreg unifa_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA);
struct qreg tmp =
vir_ADD(c, unifa->src[1], vir_uniform_ui(c, 4u));
vir_ADD_dest(c, unifa_reg, unifa->src[0], tmp);
vir_remove_instruction(c, unifa);
- c->cur_block = current_block;
return true;
}
@@ -200,7 +184,7 @@ vir_opt_dead_code(struct v3d_compile *c)
vir_for_each_block(block, c) {
struct qinst *last_flags_write = NULL;
-
+ c->cur_block = block;
vir_for_each_inst_safe(inst, block) {
/* If this instruction reads the flags, we can't
* remove the flags generation for it.
@@ -246,7 +230,6 @@ vir_opt_dead_code(struct v3d_compile *c)
}
if (v3d_qpu_writes_flags(&inst->qpu) ||
- has_nonremovable_reads(c, inst) ||
(is_ldunifa && !is_first_ldunifa && !is_last_ldunifa)) {
/* If we can't remove the instruction, but we
* don't need its destination value, just
@@ -276,7 +259,7 @@ vir_opt_dead_code(struct v3d_compile *c)
*/
if (is_first_ldunifa) {
assert(unifa);
- if (!increment_unifa_address(c, block, unifa))
+ if (!increment_unifa_address(c, unifa))
continue;
}
diff --git a/src/broadcom/compiler/vir_opt_redundant_flags.c b/src/broadcom/compiler/vir_opt_redundant_flags.c
index 4609ef9c361..6b61ed6a39a 100644
--- a/src/broadcom/compiler/vir_opt_redundant_flags.c
+++ b/src/broadcom/compiler/vir_opt_redundant_flags.c
@@ -81,11 +81,11 @@ vir_instr_flags_op_equal(struct qinst *a, struct qinst *b)
a->qpu.flags.mpf != b->qpu.flags.mpf ||
a->qpu.alu.add.op != b->qpu.alu.add.op ||
a->qpu.alu.mul.op != b->qpu.alu.mul.op ||
- a->qpu.alu.add.a_unpack != b->qpu.alu.add.a_unpack ||
- a->qpu.alu.add.b_unpack != b->qpu.alu.add.b_unpack ||
+ a->qpu.alu.add.a.unpack != b->qpu.alu.add.a.unpack ||
+ a->qpu.alu.add.b.unpack != b->qpu.alu.add.b.unpack ||
a->qpu.alu.add.output_pack != b->qpu.alu.add.output_pack ||
- a->qpu.alu.mul.a_unpack != b->qpu.alu.mul.a_unpack ||
- a->qpu.alu.mul.b_unpack != b->qpu.alu.mul.b_unpack ||
+ a->qpu.alu.mul.a.unpack != b->qpu.alu.mul.a.unpack ||
+ a->qpu.alu.mul.b.unpack != b->qpu.alu.mul.b.unpack ||
a->qpu.alu.mul.output_pack != b->qpu.alu.mul.output_pack) {
return false;
}
@@ -99,6 +99,7 @@ vir_opt_redundant_flags_block(struct v3d_compile *c, struct qblock *block)
struct qinst *last_flags = NULL;
bool progress = false;
+ c->cur_block = block;
vir_for_each_inst(inst, block) {
if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
inst->qpu.flags.auf != V3D_QPU_UF_NONE ||
diff --git a/src/broadcom/compiler/vir_opt_small_immediates.c b/src/broadcom/compiler/vir_opt_small_immediates.c
index 47d7722968d..56f0bf20706 100644
--- a/src/broadcom/compiler/vir_opt_small_immediates.c
+++ b/src/broadcom/compiler/vir_opt_small_immediates.c
@@ -44,7 +44,9 @@ vir_opt_small_immediates(struct v3d_compile *c)
/* The small immediate value sits in the raddr B field, so we
* can't have 2 small immediates in one instruction (unless
* they're the same value, but that should be optimized away
- * elsewhere).
+ * elsewhere). Since 7.x we can encode small immediates in
+ * any raddr field, but each instruction can still only use
+ * one.
*/
bool uses_small_imm = false;
for (int i = 0; i < vir_get_nsrc(inst); i++) {
@@ -80,7 +82,22 @@ vir_opt_small_immediates(struct v3d_compile *c)
*/
struct v3d_qpu_sig new_sig = inst->qpu.sig;
uint32_t sig_packed;
- new_sig.small_imm = true;
+ if (c->devinfo->ver == 42) {
+ new_sig.small_imm_b = true;
+ } else {
+ if (vir_is_add(inst)) {
+ if (i == 0)
+ new_sig.small_imm_a = true;
+ else
+ new_sig.small_imm_b = true;
+ } else {
+ if (i == 0)
+ new_sig.small_imm_c = true;
+ else
+ new_sig.small_imm_d = true;
+ }
+ }
+
if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig_packed))
continue;
@@ -89,7 +106,10 @@ vir_opt_small_immediates(struct v3d_compile *c)
vir_dump_inst(c, inst);
fprintf(stderr, "\n");
}
- inst->qpu.sig.small_imm = true;
+ inst->qpu.sig.small_imm_a = new_sig.small_imm_a;
+ inst->qpu.sig.small_imm_b = new_sig.small_imm_b;
+ inst->qpu.sig.small_imm_c = new_sig.small_imm_c;
+ inst->qpu.sig.small_imm_d = new_sig.small_imm_d;
inst->qpu.raddr_b = packed;
inst->src[i].file = QFILE_SMALL_IMM;
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
index 08698b4ece1..53e84840899 100644
--- a/src/broadcom/compiler/vir_register_allocate.c
+++ b/src/broadcom/compiler/vir_register_allocate.c
@@ -26,12 +26,100 @@
#include "common/v3d_device_info.h"
#include "v3d_compiler.h"
-#define QPU_R(i) { .magic = false, .index = i }
-
#define ACC_INDEX 0
#define ACC_COUNT 6
-#define PHYS_INDEX (ACC_INDEX + ACC_COUNT)
-#define PHYS_COUNT 64
+
+/* RA nodes used to track RF registers with implicit writes */
+#define IMPLICIT_RF_COUNT 1
+
+#define PHYS_COUNT 64
+
+static uint8_t
+get_phys_index(const struct v3d_device_info *devinfo)
+{
+ if (devinfo->has_accumulators)
+ return ACC_INDEX + ACC_COUNT;
+ else
+ return 0;
+}
+
+/* ACC as accumulator */
+#define CLASS_BITS_PHYS (1 << 0)
+#define CLASS_BITS_ACC (1 << 1)
+#define CLASS_BITS_R5 (1 << 4)
+
+static uint8_t
+get_class_bit_any(const struct v3d_device_info *devinfo)
+{
+ if (devinfo->has_accumulators)
+ return (CLASS_BITS_PHYS | CLASS_BITS_ACC | CLASS_BITS_R5);
+ else
+ return CLASS_BITS_PHYS;
+}
+
+static uint8_t
+filter_class_bits(const struct v3d_device_info *devinfo, uint8_t class_bits)
+{
+ if (!devinfo->has_accumulators) {
+ assert(class_bits & CLASS_BITS_PHYS);
+ class_bits = CLASS_BITS_PHYS;
+ }
+ return class_bits;
+}
+
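+/* RA node numbering: when the device has accumulators (4.x) the first
+ * ACC_COUNT nodes are reserved for them; otherwise (7.x) the first
+ * IMPLICIT_RF_COUNT nodes track RF registers with implicit writes (rf0,
+ * e.g. from ldvary). Temp nodes follow, which is what temp_to_node() and
+ * node_to_temp() encode.
+ */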
+static inline uint32_t
+temp_to_node(struct v3d_compile *c, uint32_t temp)
+{
+ return temp + (c->devinfo->has_accumulators ? ACC_COUNT :
+ IMPLICIT_RF_COUNT);
+}
+
+static inline uint32_t
+node_to_temp(struct v3d_compile *c, uint32_t node)
+{
+ assert((c->devinfo->has_accumulators && node >= ACC_COUNT) ||
+ (!c->devinfo->has_accumulators && node >= IMPLICIT_RF_COUNT));
+ return node - (c->devinfo->has_accumulators ? ACC_COUNT :
+ IMPLICIT_RF_COUNT);
+}
+
+static inline uint8_t
+get_temp_class_bits(struct v3d_compile *c,
+ uint32_t temp)
+{
+ return c->nodes.info[temp_to_node(c, temp)].class_bits;
+}
+
+static inline void
+set_temp_class_bits(struct v3d_compile *c,
+ uint32_t temp, uint8_t class_bits)
+{
+ c->nodes.info[temp_to_node(c, temp)].class_bits = class_bits;
+}
+
+static struct ra_class *
+choose_reg_class(struct v3d_compile *c, uint8_t class_bits)
+{
+ if (class_bits == CLASS_BITS_PHYS) {
+ return c->compiler->reg_class_phys[c->thread_index];
+ } else if (class_bits == (CLASS_BITS_R5)) {
+ assert(c->devinfo->has_accumulators);
+ return c->compiler->reg_class_r5[c->thread_index];
+ } else if (class_bits == (CLASS_BITS_PHYS | CLASS_BITS_ACC)) {
+ assert(c->devinfo->has_accumulators);
+ return c->compiler->reg_class_phys_or_acc[c->thread_index];
+ } else {
+ assert(class_bits == get_class_bit_any(c->devinfo));
+ return c->compiler->reg_class_any[c->thread_index];
+ }
+}
+
+static inline struct ra_class *
+choose_reg_class_for_temp(struct v3d_compile *c, uint32_t temp)
+{
+ assert(temp < c->num_temps && temp < c->nodes.alloc_count);
+ return choose_reg_class(c, get_temp_class_bits(c, temp));
+}
static inline bool
qinst_writes_tmu(const struct v3d_device_info *devinfo,
@@ -46,23 +134,22 @@ static bool
is_end_of_tmu_sequence(const struct v3d_device_info *devinfo,
struct qinst *inst, struct qblock *block)
{
- if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
- inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) {
- return true;
- }
-
- if (!inst->qpu.sig.ldtmu)
+ /* Only tmuwt and ldtmu can finish TMU sequences */
+ bool is_tmuwt = inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
+ inst->qpu.alu.add.op == V3D_QPU_A_TMUWT;
+ bool is_ldtmu = inst->qpu.sig.ldtmu;
+ if (!is_tmuwt && !is_ldtmu)
return false;
+ /* Check if this is the last tmuwt or ldtmu in the sequence */
list_for_each_entry_from(struct qinst, scan_inst, inst->link.next,
&block->instructions, link) {
- if (scan_inst->qpu.sig.ldtmu)
- return false;
+ is_tmuwt = scan_inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
+ scan_inst->qpu.alu.add.op == V3D_QPU_A_TMUWT;
+ is_ldtmu = scan_inst->qpu.sig.ldtmu;
- if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
- inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) {
- return true;
- }
+ if (is_tmuwt || is_ldtmu)
+ return false;
if (qinst_writes_tmu(devinfo, scan_inst))
return true;
@@ -79,11 +166,101 @@ vir_is_mov_uniform(struct v3d_compile *c, int temp)
return def && def->qpu.sig.ldunif;
}
+static bool
+can_reconstruct_inst(struct qinst *inst)
+{
+ assert(inst);
+
+ if (vir_is_add(inst)) {
+ switch (inst->qpu.alu.add.op) {
+ case V3D_QPU_A_FXCD:
+ case V3D_QPU_A_FYCD:
+ case V3D_QPU_A_XCD:
+ case V3D_QPU_A_YCD:
+ case V3D_QPU_A_IID:
+ case V3D_QPU_A_EIDX:
+ case V3D_QPU_A_TIDX:
+ case V3D_QPU_A_SAMPID:
+ /* No need to check input unpacks because none of these
+ * opcodes read sources. FXCD,FYCD have pack variants.
+ */
+ return inst->qpu.flags.ac == V3D_QPU_COND_NONE &&
+ inst->qpu.flags.auf == V3D_QPU_UF_NONE &&
+ inst->qpu.flags.apf == V3D_QPU_PF_NONE &&
+ inst->qpu.alu.add.output_pack == V3D_QPU_PACK_NONE;
+ default:
+ return false;
+ }
+ }
+
+ return false;
+}
+
+static bool
+can_reconstruct_temp(struct v3d_compile *c, int temp)
+{
+ struct qinst *def = c->defs[temp];
+ return def && can_reconstruct_inst(def);
+}
+
+static struct qreg
+reconstruct_temp(struct v3d_compile *c, enum v3d_qpu_add_op op)
+{
+ struct qreg dest;
+ switch (op) {
+ case V3D_QPU_A_FXCD:
+ dest = vir_FXCD(c);
+ break;
+ case V3D_QPU_A_FYCD:
+ dest = vir_FYCD(c);
+ break;
+ case V3D_QPU_A_XCD:
+ dest = vir_XCD(c);
+ break;
+ case V3D_QPU_A_YCD:
+ dest = vir_YCD(c);
+ break;
+ case V3D_QPU_A_IID:
+ dest = vir_IID(c);
+ break;
+ case V3D_QPU_A_EIDX:
+ dest = vir_EIDX(c);
+ break;
+ case V3D_QPU_A_TIDX:
+ dest = vir_TIDX(c);
+ break;
+ case V3D_QPU_A_SAMPID:
+ dest = vir_SAMPID(c);
+ break;
+ default:
+ unreachable("Unexpected opcode for reconstruction");
+ }
+
+ return dest;
+}
+
+enum temp_spill_type {
+ SPILL_TYPE_UNIFORM,
+ SPILL_TYPE_RECONSTRUCT,
+ SPILL_TYPE_TMU
+};
+
+static enum temp_spill_type
+get_spill_type_for_temp(struct v3d_compile *c, int temp)
+{
+ if (vir_is_mov_uniform(c, temp))
+ return SPILL_TYPE_UNIFORM;
+
+ if (can_reconstruct_temp(c, temp))
+ return SPILL_TYPE_RECONSTRUCT;
+
+ return SPILL_TYPE_TMU;
+}
+
static int
-v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
- uint32_t *temp_to_node)
+v3d_choose_spill_node(struct v3d_compile *c)
{
- const float tmu_scale = 5;
+ const float tmu_scale = 10;
float block_scale = 1.0;
float spill_costs[c->num_temps];
bool in_tmu_operation = false;
@@ -99,7 +276,8 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
* starting output writes.
*/
bool no_spilling =
- c->threads > 1 && started_last_seg;
+ (c->threads > 1 && started_last_seg) ||
+ (c->max_tmu_spills == 0);
/* Discourage spilling of TMU operations */
for (int i = 0; i < vir_get_nsrc(inst); i++) {
@@ -107,7 +285,10 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
continue;
int temp = inst->src[i].index;
- if (vir_is_mov_uniform(c, temp)) {
+ enum temp_spill_type spill_type =
+ get_spill_type_for_temp(c, temp);
+
+ if (spill_type != SPILL_TYPE_TMU) {
spill_costs[temp] += block_scale;
} else if (!no_spilling) {
float tmu_op_scale = in_tmu_operation ?
@@ -122,11 +303,11 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
if (inst->dst.file == QFILE_TEMP) {
int temp = inst->dst.index;
+ enum temp_spill_type spill_type =
+ get_spill_type_for_temp(c, temp);
- if (vir_is_mov_uniform(c, temp)) {
- /* We just rematerialize the unform
- * later.
- */
+ if (spill_type != SPILL_TYPE_TMU) {
+ /* We just rematerialize it later */
} else if (!no_spilling) {
spill_costs[temp] += (block_scale *
tmu_scale);
@@ -147,10 +328,6 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
if (inst->is_last_thrsw)
started_last_seg = true;
- if (v3d_qpu_writes_vpm(&inst->qpu) ||
- v3d_qpu_uses_tlb(&inst->qpu))
- started_last_seg = true;
-
/* Track when we're in between a TMU setup and the
* final LDTMU or TMUWT from that TMU setup. We
* penalize spills during that time.
@@ -163,12 +340,53 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
}
}
+ /* We always emit a "last thrsw" to ensure all our spilling occurs
+ * before the last thread section. See vir_emit_last_thrsw.
+ */
+ assert(started_last_seg);
+
for (unsigned i = 0; i < c->num_temps; i++) {
- if (BITSET_TEST(c->spillable, i))
- ra_set_node_spill_cost(g, temp_to_node[i], spill_costs[i]);
+ if (BITSET_TEST(c->spillable, i)) {
+ ra_set_node_spill_cost(c->g, temp_to_node(c, i),
+ spill_costs[i]);
+ }
}
- return ra_get_best_spill_node(g);
+ return ra_get_best_spill_node(c->g);
+}
+
+static void
+ensure_nodes(struct v3d_compile *c)
+{
+ if (c->num_temps < c->nodes.alloc_count)
+ return;
+
+ c->nodes.alloc_count *= 2;
+ c->nodes.info = reralloc_array_size(c,
+ c->nodes.info,
+ sizeof(c->nodes.info[0]),
+ c->nodes.alloc_count +
+ MAX2(ACC_COUNT, IMPLICIT_RF_COUNT));
+}
+
+/* Creates the interference node for a new temp. We use this to keep the node
+ * list updated during the spilling process, which generates new temps/nodes.
+ */
+static void
+add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits)
+{
+ ensure_nodes(c);
+
+ int node = ra_add_node(c->g, choose_reg_class(c, class_bits));
+ assert(c->devinfo->has_accumulators ? node == temp + ACC_COUNT :
+ node == temp + IMPLICIT_RF_COUNT);
+
+ /* We fill the node priority after we are done inserting spills */
+ c->nodes.info[node].class_bits = class_bits;
+ c->nodes.info[node].priority = 0;
+ c->nodes.info[node].is_ldunif_dst = false;
+ c->nodes.info[node].is_program_end = false;
+ c->nodes.info[node].unused = false;
}
/* The spill offset for this thread takes a bit of setup, so do it once at
@@ -206,79 +424,224 @@ v3d_setup_spill_base(struct v3d_compile *c)
vir_uniform(c, QUNIFORM_SPILL_OFFSET, 0));
/* Make sure that we don't spill the spilling setup instructions. */
- for (int i = start_num_temps; i < c->num_temps; i++)
+ for (int i = start_num_temps; i < c->num_temps; i++) {
BITSET_CLEAR(c->spillable, i);
+ /* If we are spilling, update the RA map with the temps added
+ * by the spill setup. Our spill_base register can never be an
+ * accumulator because it is used for TMU spill/fill and thus
+ * needs to persist across thread switches.
+ */
+ if (c->spilling) {
+ int temp_class = CLASS_BITS_PHYS;
+ if (c->devinfo->has_accumulators &&
+ i != c->spill_base.index) {
+ temp_class |= CLASS_BITS_ACC;
+ }
+ add_node(c, i, temp_class);
+ }
+ }
+
/* Restore the current block. */
c->cur_block = current_block;
c->cursor = vir_after_block(c->cur_block);
}
-static struct qinst *
-v3d_emit_spill_tmua(struct v3d_compile *c, uint32_t spill_offset)
+/**
+ * Computes the address for a spill/fill sequence and completes the spill/fill
+ * sequence by emitting the following code:
+ *
+ * ldunif.spill_offset
+ * add tmua spill_base spill_offset
+ * thrsw
+ *
+ * If the sequence is for a spill, then it will emit a tmuwt after the thrsw,
+ * otherwise it will emit an ldtmu to load the fill result into 'fill_dst'.
+ *
+ * The parameter 'ip' represents the ip at which the spill/fill is happening.
+ * This is used to disallow accumulators on temps that cross this ip boundary
+ * due to the new thrsw introduced in the sequence above.
+ */
+static void
+v3d_emit_spill_tmua(struct v3d_compile *c,
+ uint32_t spill_offset,
+ enum v3d_qpu_cond cond,
+ int32_t ip,
+ struct qreg *fill_dst)
{
- return vir_ADD_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA),
- c->spill_base, vir_uniform_ui(c, spill_offset));
-}
+ assert(ip >= 0);
+
+ /* Load a uniform with the spill offset and add it to the spill base
+ * to obtain the TMUA address. It can be of class ANY because we know
+ * we are consuming it immediately without thrsw in between.
+ */
+ assert(c->disable_ldunif_opt);
+ struct qreg offset = vir_uniform_ui(c, spill_offset);
+ add_node(c, offset.index, get_class_bit_any(c->devinfo));
+ /* We always enable per-quad on spills/fills to ensure we spill
+ * any channels involved with helper invocations.
+ */
+ struct qreg tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
+ struct qinst *inst = vir_ADD_dest(c, tmua, c->spill_base, offset);
+ inst->qpu.flags.ac = cond;
+ inst->ldtmu_count = 1;
+ inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
+ 0xffffff7f); /* per-quad */
+
+ vir_emit_thrsw(c);
+
+        /* If this is for a spill, emit a TMUWT, otherwise an LDTMU to load
+         * the result of the fill. The TMUWT temp is not really read and the
+         * ldtmu temp will be used immediately, so just like the uniform
+         * above we can allow accumulators.
+ */
+ int temp_class =
+ filter_class_bits(c->devinfo, CLASS_BITS_PHYS | CLASS_BITS_ACC);
+ if (!fill_dst) {
+ struct qreg dst = vir_TMUWT(c);
+ assert(dst.file == QFILE_TEMP);
+ add_node(c, dst.index, temp_class);
+ } else {
+ *fill_dst = vir_LDTMU(c);
+ assert(fill_dst->file == QFILE_TEMP);
+ add_node(c, fill_dst->index, temp_class);
+ }
+
+ /* Temps across the thread switch we injected can't be assigned to
+ * accumulators.
+ *
+ * Fills inject code before ip, so anything that starts at ip or later
+ * is not affected by the thrsw. Something that ends at ip will be
+ * affected though.
+ *
+ * Spills inject code after ip, so anything that starts strictly later
+ * than ip is not affected (the temp starting at ip is usually the
+ * spilled temp except for postponed spills). Something that ends at ip
+ * won't be affected either.
+ */
+ for (int i = 0; i < c->spill_start_num_temps; i++) {
+ bool thrsw_cross = fill_dst ?
+ c->temp_start[i] < ip && c->temp_end[i] >= ip :
+ c->temp_start[i] <= ip && c->temp_end[i] > ip;
+ if (thrsw_cross) {
+ ra_set_node_class(c->g, temp_to_node(c, i),
+ choose_reg_class(c, CLASS_BITS_PHYS));
+ }
+ }
+}
static void
-v3d_emit_tmu_spill(struct v3d_compile *c, struct qinst *inst,
- struct qinst *position, uint32_t spill_offset)
+v3d_emit_tmu_spill(struct v3d_compile *c,
+ struct qinst *inst,
+ struct qreg spill_temp,
+ struct qinst *position,
+ uint32_t ip,
+ uint32_t spill_offset)
{
assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
+ assert(inst->dst.file == QFILE_TEMP);
c->cursor = vir_after_inst(position);
- inst->dst = vir_get_temp(c);
+
enum v3d_qpu_cond cond = vir_get_cond(inst);
+
+ /* If inst and position don't match, this is a postponed spill,
+ * in which case we have already allocated the temp for the spill
+ * and we should use that, otherwise create a new temp with the
+ * same register class bits as the original.
+ */
+ if (inst == position) {
+ uint8_t class_bits = get_temp_class_bits(c, inst->dst.index);
+ inst->dst = vir_get_temp(c);
+ add_node(c, inst->dst.index, class_bits);
+ } else {
+ inst->dst = spill_temp;
+
+ /* If this is a postponed spill the register being spilled may
+ * have been written more than once including conditional
+ * writes, so ignore predication on the spill instruction and
+ * always spill the full register.
+ */
+ cond = V3D_QPU_COND_NONE;
+ }
+
struct qinst *tmp =
vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
inst->dst);
tmp->qpu.flags.mc = cond;
- tmp = v3d_emit_spill_tmua(c, spill_offset);
- tmp->qpu.flags.ac = cond;
- vir_emit_thrsw(c);
- vir_TMUWT(c);
+
+ v3d_emit_spill_tmua(c, spill_offset, cond, ip, NULL);
+
c->spills++;
c->tmu_dirty_rcl = true;
}
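+/* Returns true if the two live ranges overlap. Ranges are treated as
+ * half-open ([start, end)), so ranges that merely touch at a boundary do
+ * not interfere.
+ */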
+static inline bool
+interferes(int32_t t0_start, int32_t t0_end, int32_t t1_start, int32_t t1_end)
+{
+ return !(t0_start >= t1_end || t1_start >= t0_end);
+}
+
static void
-v3d_spill_reg(struct v3d_compile *c, int spill_temp)
+v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int *implicit_rf_nodes,
+ int spill_temp)
{
- c->spill_count++;
+ c->spill_start_num_temps = c->num_temps;
+ c->spilling = true;
- bool is_uniform = vir_is_mov_uniform(c, spill_temp);
+ enum temp_spill_type spill_type = get_spill_type_for_temp(c, spill_temp);
uint32_t spill_offset = 0;
-
- if (!is_uniform) {
+ if (spill_type == SPILL_TYPE_TMU) {
spill_offset = c->spill_size;
c->spill_size += V3D_CHANNELS * sizeof(uint32_t);
- if (spill_offset == 0)
+ if (spill_offset == 0) {
v3d_setup_spill_base(c);
+
+ /* Don't allocate our spill base to rf0 to avoid
+ * conflicts with instructions doing implicit writes
+ * to that register.
+ */
+ if (!c->devinfo->has_accumulators) {
+ ra_add_node_interference(
+ c->g,
+ temp_to_node(c, c->spill_base.index),
+ implicit_rf_nodes[0]);
+ }
+ }
}
struct qinst *last_thrsw = c->last_thrsw;
assert(last_thrsw && last_thrsw->is_last_thrsw);
- int start_num_temps = c->num_temps;
-
int uniform_index = ~0;
- if (is_uniform) {
+ if (spill_type == SPILL_TYPE_UNIFORM) {
struct qinst *orig_unif = c->defs[spill_temp];
uniform_index = orig_unif->uniform;
}
+ enum v3d_qpu_add_op reconstruct_op = V3D_QPU_A_NOP;
+ if (spill_type == SPILL_TYPE_RECONSTRUCT) {
+ struct qinst *orig_def = c->defs[spill_temp];
+ assert(vir_is_add(orig_def));
+ reconstruct_op = orig_def->qpu.alu.add.op;
+ }
+
+ uint32_t spill_node = temp_to_node(c, spill_temp);
+
/* We must disable the ldunif optimization if we are spilling uniforms */
bool had_disable_ldunif_opt = c->disable_ldunif_opt;
c->disable_ldunif_opt = true;
struct qinst *start_of_tmu_sequence = NULL;
struct qinst *postponed_spill = NULL;
+ struct qreg postponed_spill_temp = { 0 };
vir_for_each_block(block, c) {
vir_for_each_inst_safe(inst, block) {
+ int32_t ip = inst->ip;
+
/* Track when we're in between a TMU setup and the final
* LDTMU or TMUWT from that TMU setup. We can't spill/fill any
* temps during that time, because that involves inserting a
@@ -289,7 +652,8 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp)
if (is_end_of_tmu_sequence(c->devinfo, inst, block)) {
if (postponed_spill) {
v3d_emit_tmu_spill(c, postponed_spill,
- inst, spill_offset);
+ postponed_spill_temp,
+ inst, ip, spill_offset);
}
start_of_tmu_sequence = NULL;
@@ -302,49 +666,103 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp)
}
/* fills */
+ int filled_src = -1;
for (int i = 0; i < vir_get_nsrc(inst); i++) {
if (inst->src[i].file != QFILE_TEMP ||
inst->src[i].index != spill_temp) {
continue;
}
+ if (filled_src >= 0) {
+ inst->src[i] = inst->src[filled_src];
+ continue;
+ }
+
c->cursor = vir_before_inst(inst);
- if (is_uniform) {
+ if (spill_type == SPILL_TYPE_UNIFORM) {
struct qreg unif =
vir_uniform(c,
c->uniform_contents[uniform_index],
c->uniform_data[uniform_index]);
inst->src[i] = unif;
+ /* We are using the uniform in the
+ * instruction immediately after, so
+ * we can use any register class for it.
+ */
+ add_node(c, unif.index,
+ get_class_bit_any(c->devinfo));
+ } else if (spill_type == SPILL_TYPE_RECONSTRUCT) {
+ struct qreg temp =
+ reconstruct_temp(c, reconstruct_op);
+ inst->src[i] = temp;
+ /* We are using the temp in the
+ * instruction immediately after, so we
+ * can use ACC.
+ */
+ int temp_class =
+ filter_class_bits(c->devinfo, CLASS_BITS_PHYS |
+ CLASS_BITS_ACC);
+ add_node(c, temp.index, temp_class);
} else {
- /* If we have a postponed spill, we don't need
- * a fill as the temp would not have been
- * spilled yet.
+ /* If we have a postponed spill, we
+ * don't need a fill as the temp would
+ * not have been spilled yet; however,
+ * we still need to update the temp index.
*/
- if (postponed_spill)
- continue;
- if (start_of_tmu_sequence)
- c->cursor = vir_before_inst(start_of_tmu_sequence);
-
- v3d_emit_spill_tmua(c, spill_offset);
- vir_emit_thrsw(c);
- inst->src[i] = vir_LDTMU(c);
- c->fills++;
+ if (postponed_spill) {
+ inst->src[i] =
+ postponed_spill_temp;
+ } else {
+ int32_t fill_ip = ip;
+ if (start_of_tmu_sequence) {
+ c->cursor = vir_before_inst(start_of_tmu_sequence);
+ fill_ip = start_of_tmu_sequence->ip;
+ }
+
+ v3d_emit_spill_tmua(c, spill_offset,
+ V3D_QPU_COND_NONE,
+ fill_ip, &inst->src[i]);
+ c->fills++;
+ }
}
+
+ filled_src = i;
}
/* spills */
if (inst->dst.file == QFILE_TEMP &&
inst->dst.index == spill_temp) {
- if (is_uniform) {
+ if (spill_type != SPILL_TYPE_TMU) {
c->cursor.link = NULL;
vir_remove_instruction(c, inst);
} else {
- if (start_of_tmu_sequence)
+ /* If we are in the middle of a TMU
+ * sequence, we postpone the actual
+ * spill until we have finished it. We
+ * still need to replace the spill temp
+ * with a new temp, though.
+ */
+ if (start_of_tmu_sequence) {
+ if (postponed_spill) {
+ postponed_spill->dst =
+ postponed_spill_temp;
+ }
+ if (!postponed_spill ||
+ vir_get_cond(inst) == V3D_QPU_COND_NONE) {
+ postponed_spill_temp =
+ vir_get_temp(c);
+ add_node(c,
+ postponed_spill_temp.index,
+ c->nodes.info[spill_node].class_bits);
+ }
postponed_spill = inst;
- else
- v3d_emit_tmu_spill(c, inst, inst,
+ } else {
+ v3d_emit_tmu_spill(c, inst,
+ postponed_spill_temp,
+ inst, ip,
spill_offset);
+ }
}
}
}
@@ -358,21 +776,64 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp)
/* Don't allow spilling of our spilling instructions. There's no way
* they can help get things colored.
*/
- for (int i = start_num_temps; i < c->num_temps; i++)
+ for (int i = c->spill_start_num_temps; i < c->num_temps; i++)
BITSET_CLEAR(c->spillable, i);
+ /* Reset interference for spilled node */
+ ra_set_node_spill_cost(c->g, spill_node, 0);
+ ra_reset_node_interference(c->g, spill_node);
+ BITSET_CLEAR(c->spillable, spill_temp);
+
+ /* Rebuild program ips */
+ int32_t ip = 0;
+ vir_for_each_inst_inorder(inst, c)
+ inst->ip = ip++;
+
+ /* Rebuild liveness */
+ vir_calculate_live_intervals(c);
+
+ /* Add interferences for the new spilled temps and update interferences
+ * for c->spill_base (since we may have modified its liveness). Also,
+ * update node priorities based on the new liveness data.
+ */
+ uint32_t sb_temp = c->spill_base.index;
+ uint32_t sb_node = temp_to_node(c, sb_temp);
+ for (uint32_t i = 0; i < c->num_temps; i++) {
+ if (c->temp_end[i] == -1)
+ continue;
+
+ uint32_t node_i = temp_to_node(c, i);
+ c->nodes.info[node_i].priority =
+ c->temp_end[i] - c->temp_start[i];
+
+ for (uint32_t j = MAX2(i + 1, c->spill_start_num_temps);
+ j < c->num_temps; j++) {
+ if (interferes(c->temp_start[i], c->temp_end[i],
+ c->temp_start[j], c->temp_end[j])) {
+ uint32_t node_j = temp_to_node(c, j);
+ ra_add_node_interference(c->g, node_i, node_j);
+ }
+ }
+
+ if (spill_type == SPILL_TYPE_TMU) {
+ if (i != sb_temp &&
+ interferes(c->temp_start[i], c->temp_end[i],
+ c->temp_start[sb_temp], c->temp_end[sb_temp])) {
+ ra_add_node_interference(c->g, node_i, sb_node);
+ }
+ }
+ }
+
c->disable_ldunif_opt = had_disable_ldunif_opt;
+ c->spilling = false;
}
-struct node_to_temp_map {
- uint32_t temp;
- uint32_t priority;
-};
-
struct v3d_ra_select_callback_data {
+ uint32_t phys_index;
uint32_t next_acc;
uint32_t next_phys;
- struct node_to_temp_map *map;
+ struct v3d_ra_node_info *nodes;
+ const struct v3d_device_info *devinfo;
};
/* Choosing accumulators improves chances of merging QPU instructions
@@ -384,6 +845,9 @@ v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra,
BITSET_WORD *regs,
int priority)
{
+ if (!v3d_ra->devinfo->has_accumulators)
+ return false;
+
/* Favor accumulators if we have less that this number of physical
* registers. Accumulators have more restrictions (like being
* invalidated through thrsw), so running out of physical registers
@@ -393,7 +857,7 @@ v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra,
static const int available_rf_threshold = 5;
int available_rf = 0 ;
for (int i = 0; i < PHYS_COUNT; i++) {
- if (BITSET_TEST(regs, PHYS_INDEX + i))
+ if (BITSET_TEST(regs, v3d_ra->phys_index + i))
available_rf++;
if (available_rf >= available_rf_threshold)
break;
@@ -419,6 +883,19 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra,
BITSET_WORD *regs,
unsigned int *out)
{
+ if (!v3d_ra->devinfo->has_accumulators)
+ return false;
+
+ /* Choose r5 for our ldunifs if possible (nobody else can load to that
+ * reg, and it keeps the QPU cond field free from being occupied by
+ * ldunifrf).
+ */
+ int r5 = ACC_INDEX + 5;
+ if (BITSET_TEST(regs, r5)) {
+ *out = r5;
+ return true;
+ }
+
/* Round-robin through our accumulators to give post-RA instruction
* selection more options.
*/
@@ -438,12 +915,47 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra,
static bool
v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
+ unsigned int node,
BITSET_WORD *regs,
unsigned int *out)
{
+ /* If this node is for an unused temp, ignore. */
+ if (v3d_ra->nodes->info[node].unused) {
+ *out = 0;
+ return true;
+ }
+
+ /* In V3D 7.x, try to assign rf0 to temps used as ldunif's dst
+ * so we can avoid turning them into ldunifrf (which uses the
+ * cond field to encode the dst and would prevent merge with
+ * instructions that use cond flags).
+ */
+ if (v3d_ra->nodes->info[node].is_ldunif_dst &&
+ BITSET_TEST(regs, v3d_ra->phys_index)) {
+ assert(v3d_ra->devinfo->ver >= 71);
+ *out = v3d_ra->phys_index;
+ return true;
+ }
+
+ /* The last 3 instructions in a shader can't use some specific registers
+ * (usually early rf registers, depending on the v3d version), so try to
+ * avoid allocating those registers to temps used by the last
+ * instructions in the shader.
+ */
+ const uint32_t safe_rf_start = v3d_ra->devinfo->ver == 42 ? 3 : 4;
+ if (v3d_ra->nodes->info[node].is_program_end &&
+ v3d_ra->next_phys < safe_rf_start) {
+ v3d_ra->next_phys = safe_rf_start;
+ }
+
for (int i = 0; i < PHYS_COUNT; i++) {
int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
- int phys = PHYS_INDEX + phys_off;
+
+ /* Try to keep rf0 available for ldunif in 7.x (see above). */
+ if (v3d_ra->devinfo->ver >= 71 && phys_off == 0)
+ continue;
+
+ int phys = v3d_ra->phys_index + phys_off;
if (BITSET_TEST(regs, phys)) {
v3d_ra->next_phys = phys_off + 1;
@@ -452,6 +964,14 @@ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
}
}
+ /* If we couldn't allocate, do try to assign rf0 if it is available. */
+ if (v3d_ra->devinfo->ver >= 71 &&
+ BITSET_TEST(regs, v3d_ra->phys_index)) {
+ v3d_ra->next_phys = 1;
+ *out = v3d_ra->phys_index;
+ return true;
+ }
+
return false;
}
@@ -459,22 +979,14 @@ static unsigned int
v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data)
{
struct v3d_ra_select_callback_data *v3d_ra = data;
- int r5 = ACC_INDEX + 5;
-
- /* Choose r5 for our ldunifs if possible (nobody else can load to that
- * reg, and it keeps the QPU cond field free from being occupied by
- * ldunifrf).
- */
- if (BITSET_TEST(regs, r5))
- return r5;
unsigned int reg;
- if (v3d_ra_favor_accum(v3d_ra, regs, v3d_ra->map[n].priority) &&
+ if (v3d_ra_favor_accum(v3d_ra, regs, v3d_ra->nodes->info[n].priority) &&
v3d_ra_select_accum(v3d_ra, regs, &reg)) {
return reg;
}
- if (v3d_ra_select_rf(v3d_ra, regs, &reg))
+ if (v3d_ra_select_rf(v3d_ra, n, regs, &reg))
return reg;
/* If we ran out of physical registers try to assign an accumulator
@@ -492,9 +1004,10 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
/* Allocate up to 3 regfile classes, for the ways the physical
* register file can be divided up for fragment shader threading.
*/
- int max_thread_index = (compiler->devinfo->ver >= 40 ? 2 : 3);
+ int max_thread_index = 2;
+ uint8_t phys_index = get_phys_index(compiler->devinfo);
- compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT,
+ compiler->regs = ra_alloc_reg_set(compiler, phys_index + PHYS_COUNT,
false);
if (!compiler->regs)
return false;
@@ -502,31 +1015,38 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
for (int threads = 0; threads < max_thread_index; threads++) {
compiler->reg_class_any[threads] =
ra_alloc_contig_reg_class(compiler->regs, 1);
- compiler->reg_class_r5[threads] =
- ra_alloc_contig_reg_class(compiler->regs, 1);
- compiler->reg_class_phys_or_acc[threads] =
- ra_alloc_contig_reg_class(compiler->regs, 1);
+ if (compiler->devinfo->has_accumulators) {
+ compiler->reg_class_r5[threads] =
+ ra_alloc_contig_reg_class(compiler->regs, 1);
+ compiler->reg_class_phys_or_acc[threads] =
+ ra_alloc_contig_reg_class(compiler->regs, 1);
+ }
compiler->reg_class_phys[threads] =
ra_alloc_contig_reg_class(compiler->regs, 1);
- for (int i = PHYS_INDEX;
- i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) {
- ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
+ /* Init physical regs */
+ for (int i = phys_index;
+ i < phys_index + (PHYS_COUNT >> threads); i++) {
+ if (compiler->devinfo->has_accumulators)
+ ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
ra_class_add_reg(compiler->reg_class_phys[threads], i);
ra_class_add_reg(compiler->reg_class_any[threads], i);
}
- for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
- ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
- ra_class_add_reg(compiler->reg_class_any[threads], i);
+ /* Init accumulator regs */
+ if (compiler->devinfo->has_accumulators) {
+ for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
+ ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
+ ra_class_add_reg(compiler->reg_class_any[threads], i);
+ }
+ /* r5 can only store a single 32-bit value, so not much can
+ * use it.
+ */
+ ra_class_add_reg(compiler->reg_class_r5[threads],
+ ACC_INDEX + 5);
+ ra_class_add_reg(compiler->reg_class_any[threads],
+ ACC_INDEX + 5);
}
- /* r5 can only store a single 32-bit value, so not much can
- * use it.
- */
- ra_class_add_reg(compiler->reg_class_r5[threads],
- ACC_INDEX + 5);
- ra_class_add_reg(compiler->reg_class_any[threads],
- ACC_INDEX + 5);
}
ra_set_finalize(compiler->regs, NULL);
@@ -534,52 +1054,220 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
return true;
}
-static int
-node_to_temp_priority(const void *in_a, const void *in_b)
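+/* TMU spills and fills are only allowed while we stay within the budget
+ * given by c->max_tmu_spills.
+ */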
+static inline bool
+tmu_spilling_allowed(struct v3d_compile *c)
{
- const struct node_to_temp_map *a = in_a;
- const struct node_to_temp_map *b = in_b;
-
- return a->priority - b->priority;
+ return c->spills + c->fills < c->max_tmu_spills;
}
-/**
- * Computes the number of registers to spill in a batch after a register
- * allocation failure.
- */
-static uint32_t
-get_spill_batch_size(struct v3d_compile *c)
-{
- /* Allow up to 10 spills in batches of 1 in any case to avoid any chance of
- * over-spilling if the program requires few spills to compile.
- */
- if (c->spill_count < 10)
- return 1;
-
- /* If we have to spill more than that we assume performance is not going to
- * be great and we shift focus to batching spills to cut down compile
- * time at the expense of over-spilling.
- */
- return 20;
-}
-
-/* Don't emit spills using the TMU until we've dropped thread count first. We,
- * may also disable spilling when certain optimizations that are known to
- * increase register pressure are active so we favor recompiling with
- * optimizations disabled instead of spilling.
- */
-static inline bool
-tmu_spilling_allowed(struct v3d_compile *c, int thread_index)
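+/* Adds the register-class restrictions and fixed-node interferences implied
+ * by a single instruction: implicit r4/rf0 writes, SFU and LDVPM results
+ * that must go to the phys regfile, payload setup registers, ldunif/ldvary
+ * rf0 handling, and accumulator invalidation across thread switches.
+ */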
+static void
+update_graph_and_reg_classes_for_inst(struct v3d_compile *c,
+ int *acc_nodes,
+ int *implicit_rf_nodes,
+ int last_ldvary_ip,
+ struct qinst *inst)
{
- return thread_index == 0 && c->tmu_spilling_allowed;
+ int32_t ip = inst->ip;
+ assert(ip >= 0);
+
+ /* If the instruction writes r4 (and optionally moves its
+ * result to a temp), nothing else can be stored in r4 across
+ * it.
+ */
+ if (vir_writes_r4_implicitly(c->devinfo, inst)) {
+ for (int i = 0; i < c->num_temps; i++) {
+ if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
+ ra_add_node_interference(c->g,
+ temp_to_node(c, i),
+ acc_nodes[4]);
+ }
+ }
+ }
+
+ /* If any instruction writes to a physical register implicitly
+ * nothing else can write the same register across it.
+ */
+ if (v3d_qpu_writes_rf0_implicitly(c->devinfo, &inst->qpu)) {
+ for (int i = 0; i < c->num_temps; i++) {
+ if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
+ ra_add_node_interference(c->g,
+ temp_to_node(c, i),
+ implicit_rf_nodes[0]);
+ }
+ }
+ }
+
+ if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
+ switch (inst->qpu.alu.add.op) {
+ case V3D_QPU_A_LDVPMV_IN:
+ case V3D_QPU_A_LDVPMV_OUT:
+ case V3D_QPU_A_LDVPMD_IN:
+ case V3D_QPU_A_LDVPMD_OUT:
+ case V3D_QPU_A_LDVPMP:
+ case V3D_QPU_A_LDVPMG_IN:
+ case V3D_QPU_A_LDVPMG_OUT: {
+ /* LDVPMs only store to temps (the MA flag
+ * decides whether the LDVPM is in or out)
+ */
+ assert(inst->dst.file == QFILE_TEMP);
+ set_temp_class_bits(c, inst->dst.index,
+ CLASS_BITS_PHYS);
+ break;
+ }
+
+ case V3D_QPU_A_RECIP:
+ case V3D_QPU_A_RSQRT:
+ case V3D_QPU_A_EXP:
+ case V3D_QPU_A_LOG:
+ case V3D_QPU_A_SIN:
+ case V3D_QPU_A_RSQRT2: {
+ /* The SFU instructions write directly to the
+ * phys regfile.
+ */
+ assert(inst->dst.file == QFILE_TEMP);
+ set_temp_class_bits(c, inst->dst.index,
+ CLASS_BITS_PHYS);
+ break;
+ }
+
+ default:
+ break;
+ }
+ }
+
+ if (inst->src[0].file == QFILE_REG) {
+ switch (inst->src[0].index) {
+ case 0:
+ /* V3D 7.x doesn't use rf0 for thread payload */
+ if (c->devinfo->ver >= 71)
+ break;
+ else
+ FALLTHROUGH;
+ case 1:
+ case 2:
+ case 3: {
+ /* Payload setup instructions: Force allocate
+ * the dst to the given register (so the MOV
+ * will disappear).
+ */
+ assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV);
+ assert(inst->dst.file == QFILE_TEMP);
+ uint32_t node = temp_to_node(c, inst->dst.index);
+ ra_set_node_reg(c->g, node,
+ get_phys_index(c->devinfo) +
+ inst->src[0].index);
+ break;
+ }
+ }
+ }
+
+ /* Don't allocate rf0 to temps that cross ranges where we have
+ * live implicit rf0 writes from ldvary. We can identify these
+ * by tracking the last ldvary instruction and explicit reads
+ * of rf0.
+ */
+ if (c->devinfo->ver >= 71 &&
+ ((inst->src[0].file == QFILE_REG && inst->src[0].index == 0) ||
+ (vir_get_nsrc(inst) > 1 &&
+ inst->src[1].file == QFILE_REG && inst->src[1].index == 0))) {
+ for (int i = 0; i < c->num_temps; i++) {
+ if (c->temp_start[i] < ip &&
+ c->temp_end[i] > last_ldvary_ip) {
+ ra_add_node_interference(c->g,
+ temp_to_node(c, i),
+ implicit_rf_nodes[0]);
+ }
+ }
+ }
+
+ if (inst->dst.file == QFILE_TEMP) {
+ /* Only a ldunif gets to write to R5, which only has a
+ * single 32-bit channel of storage.
+ *
+ * NOTE: ldunifa is subject to the same restriction; however,
+ * going by shader-db it is best to keep r5 exclusive to ldunif,
+ * probably because ldunif usually has a shorter lifespan,
+ * allowing for more accumulator reuse and QPU instruction merges.
+ */
+ if (c->devinfo->has_accumulators) {
+ if (!inst->qpu.sig.ldunif) {
+ uint8_t class_bits =
+ get_temp_class_bits(c, inst->dst.index) &
+ ~CLASS_BITS_R5;
+ set_temp_class_bits(c, inst->dst.index,
+ class_bits);
+
+ }
+ } else {
+ /* Make sure we don't allocate the ldvary's
+ * destination to rf0, since it would clash
+ * with its implicit write to that register.
+ */
+ if (inst->qpu.sig.ldvary) {
+ ra_add_node_interference(c->g,
+ temp_to_node(c, inst->dst.index),
+ implicit_rf_nodes[0]);
+ }
+ /* Flag dst temps from ldunif(a) instructions
+ * so we can try to assign rf0 to them and avoid
+ * converting these to ldunif(a)rf.
+ */
+ if (inst->qpu.sig.ldunif || inst->qpu.sig.ldunifa) {
+ const uint32_t dst_n =
+ temp_to_node(c, inst->dst.index);
+ c->nodes.info[dst_n].is_ldunif_dst = true;
+ }
+ }
+ }
+
+ /* All accumulators are invalidated across a thread switch. */
+ if (inst->qpu.sig.thrsw && c->devinfo->has_accumulators) {
+ for (int i = 0; i < c->num_temps; i++) {
+ if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
+ set_temp_class_bits(c, i,
+ CLASS_BITS_PHYS);
+ }
+ }
+ }
}
-#define CLASS_BIT_PHYS (1 << 0)
-#define CLASS_BIT_ACC (1 << 1)
-#define CLASS_BIT_R5 (1 << 4)
-#define CLASS_BITS_ANY (CLASS_BIT_PHYS | \
- CLASS_BIT_ACC | \
- CLASS_BIT_R5)
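+/* Flags the nodes for temps referenced by the last few instructions of the
+ * program so the register selection callback can steer them away from the
+ * registers that can't be used by the last 3 instructions of a shader.
+ */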
+static void
+flag_program_end_nodes(struct v3d_compile *c)
+{
+ /* Only look for registers used in this many instructions */
+ uint32_t last_set_count = 6;
+
+ struct qblock *last_block = vir_exit_block(c);
+ list_for_each_entry_rev(struct qinst, inst, &last_block->instructions, link) {
+ if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
+ continue;
+
+ int num_src = v3d_qpu_add_op_num_src(inst->qpu.alu.add.op);
+ for (int i = 0; i < num_src; i++) {
+ if (inst->src[i].file == QFILE_TEMP) {
+ int node = temp_to_node(c, inst->src[i].index);
+ c->nodes.info[node].is_program_end = true;
+ }
+ }
+
+ num_src = v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op);
+ for (int i = 0; i < num_src; i++) {
+ if (inst->src[i].file == QFILE_TEMP) {
+ int node = temp_to_node(c, inst->src[i].index);
+ c->nodes.info[node].is_program_end = true;
+
+ }
+ }
+
+ if (inst->dst.file == QFILE_TEMP) {
+ int node = temp_to_node(c, inst->dst.index);
+ c->nodes.info[node].is_program_end = true;
+ }
+
+ if (--last_set_count == 0)
+ break;
+ }
+}
/**
* Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
@@ -587,24 +1275,37 @@ tmu_spilling_allowed(struct v3d_compile *c, int thread_index)
* The return value should be freed by the caller.
*/
struct qpu_reg *
-v3d_register_allocate(struct v3d_compile *c, bool *spilled)
+v3d_register_allocate(struct v3d_compile *c)
{
- uint32_t UNUSED start_num_temps = c->num_temps;
- struct node_to_temp_map map[c->num_temps];
- uint32_t temp_to_node[c->num_temps];
- uint8_t class_bits[c->num_temps];
int acc_nodes[ACC_COUNT];
+ int implicit_rf_nodes[IMPLICIT_RF_COUNT];
+
+ unsigned num_ra_nodes = c->num_temps;
+ if (c->devinfo->has_accumulators)
+ num_ra_nodes += ARRAY_SIZE(acc_nodes);
+ else
+ num_ra_nodes += ARRAY_SIZE(implicit_rf_nodes);
+
+ c->nodes = (struct v3d_ra_node_info) {
+ .alloc_count = c->num_temps,
+ .info = ralloc_array_size(c, sizeof(c->nodes.info[0]),
+ num_ra_nodes),
+ };
+
+ uint32_t phys_index = get_phys_index(c->devinfo);
+
struct v3d_ra_select_callback_data callback_data = {
+ .phys_index = phys_index,
.next_acc = 0,
/* Start at RF3, to try to keep the TLB writes from using
- * RF0-2.
+ * RF0-2. Start at RF4 in 7.x to prevent TLB writes from
+ * using RF2-3.
*/
- .next_phys = 3,
- .map = map,
+ .next_phys = c->devinfo->ver == 42 ? 3 : 4,
+ .nodes = &c->nodes,
+ .devinfo = c->devinfo,
};
- *spilled = false;
-
vir_calculate_live_intervals(c);
/* Convert 1, 2, 4 threads to 0, 1, 2 index.
@@ -612,257 +1313,163 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled)
* V3D 4.x has double the physical register space, so 64 physical regs
* are available at both 1x and 2x threading, and 4x has 32.
*/
- int thread_index = ffs(c->threads) - 1;
- if (c->devinfo->ver >= 40) {
- if (thread_index >= 1)
- thread_index--;
- }
+ c->thread_index = ffs(c->threads) - 1;
+ if (c->thread_index >= 1)
+ c->thread_index--;
- struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs,
- c->num_temps +
- ARRAY_SIZE(acc_nodes));
- ra_set_select_reg_callback(g, v3d_ra_select_callback, &callback_data);
+ c->g = ra_alloc_interference_graph(c->compiler->regs, num_ra_nodes);
+ ra_set_select_reg_callback(c->g, v3d_ra_select_callback, &callback_data);
/* Make some fixed nodes for the accumulators, which we will need to
* interfere with when ops have implied r3/r4 writes or for the thread
* switches. We could represent these as classes for the nodes to
* live in, but the classes take up a lot of memory to set up, so we
- * don't want to make too many.
+ * don't want to make too many. We use the same mechanism on platforms
+ * without accumulators that can have implicit writes to phys regs.
*/
- for (int i = 0; i < ARRAY_SIZE(acc_nodes); i++) {
- acc_nodes[i] = c->num_temps + i;
- ra_set_node_reg(g, acc_nodes[i], ACC_INDEX + i);
- }
-
- for (uint32_t i = 0; i < c->num_temps; i++) {
- map[i].temp = i;
- map[i].priority = c->temp_end[i] - c->temp_start[i];
- }
- qsort(map, c->num_temps, sizeof(map[0]), node_to_temp_priority);
- for (uint32_t i = 0; i < c->num_temps; i++) {
- temp_to_node[map[i].temp] = i;
+ for (uint32_t i = 0; i < num_ra_nodes; i++) {
+ c->nodes.info[i].is_ldunif_dst = false;
+ c->nodes.info[i].is_program_end = false;
+ c->nodes.info[i].unused = false;
+ c->nodes.info[i].priority = 0;
+ c->nodes.info[i].class_bits = 0;
+ if (c->devinfo->has_accumulators && i < ACC_COUNT) {
+ acc_nodes[i] = i;
+ ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i);
+ } else if (!c->devinfo->has_accumulators &&
+ i < ARRAY_SIZE(implicit_rf_nodes)) {
+ implicit_rf_nodes[i] = i;
+ ra_set_node_reg(c->g, implicit_rf_nodes[i], phys_index + i);
+ } else {
+ uint32_t t = node_to_temp(c, i);
+ c->nodes.info[i].priority =
+ c->temp_end[t] - c->temp_start[t];
+ c->nodes.info[i].class_bits =
+ get_class_bit_any(c->devinfo);
+ }
}
- /* Figure out our register classes and preallocated registers. We
- * start with any temp being able to be in any file, then instructions
- * incrementally remove bits that the temp definitely can't be in.
+ /* Walk the instructions adding register class restrictions and
+ * interferences.
*/
- memset(class_bits, CLASS_BITS_ANY, sizeof(class_bits));
-
int ip = 0;
+ int last_ldvary_ip = -1;
vir_for_each_inst_inorder(inst, c) {
- /* If the instruction writes r3/r4 (and optionally moves its
- * result to a temp), nothing else can be stored in r3/r4 across
- * it.
+ inst->ip = ip++;
+
+ /* ldunif(a) always writes to a temporary, so we have
+ * liveness info available to decide whether rf0 is
+ * available for it. ldvary, however, is different:
+ * it always writes to rf0 directly, so we don't have
+ * liveness information for its implicit rf0 write.
+ *
+ * That means the allocator may assign rf0 to a temp
+ * that is defined while an implicit rf0 write from
+ * ldvary is still live. We fix that by manually
+ * tracking rf0 live ranges from ldvary instructions.
*/
- if (vir_writes_r3(c->devinfo, inst)) {
- for (int i = 0; i < c->num_temps; i++) {
- if (c->temp_start[i] < ip &&
- c->temp_end[i] > ip) {
- ra_add_node_interference(g,
- temp_to_node[i],
- acc_nodes[3]);
- }
- }
- }
- if (vir_writes_r4(c->devinfo, inst)) {
- for (int i = 0; i < c->num_temps; i++) {
- if (c->temp_start[i] < ip &&
- c->temp_end[i] > ip) {
- ra_add_node_interference(g,
- temp_to_node[i],
- acc_nodes[4]);
- }
- }
- }
-
- if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
- switch (inst->qpu.alu.add.op) {
- case V3D_QPU_A_LDVPMV_IN:
- case V3D_QPU_A_LDVPMV_OUT:
- case V3D_QPU_A_LDVPMD_IN:
- case V3D_QPU_A_LDVPMD_OUT:
- case V3D_QPU_A_LDVPMP:
- case V3D_QPU_A_LDVPMG_IN:
- case V3D_QPU_A_LDVPMG_OUT:
- /* LDVPMs only store to temps (the MA flag
- * decides whether the LDVPM is in or out)
- */
- assert(inst->dst.file == QFILE_TEMP);
- class_bits[inst->dst.index] &= CLASS_BIT_PHYS;
- break;
-
- case V3D_QPU_A_RECIP:
- case V3D_QPU_A_RSQRT:
- case V3D_QPU_A_EXP:
- case V3D_QPU_A_LOG:
- case V3D_QPU_A_SIN:
- case V3D_QPU_A_RSQRT2:
- /* The SFU instructions write directly to the
- * phys regfile.
- */
- assert(inst->dst.file == QFILE_TEMP);
- class_bits[inst->dst.index] &= CLASS_BIT_PHYS;
- break;
-
- default:
- break;
- }
- }
+ if (inst->qpu.sig.ldvary)
+ last_ldvary_ip = ip;
- if (inst->src[0].file == QFILE_REG) {
- switch (inst->src[0].index) {
- case 0:
- case 1:
- case 2:
- case 3:
- /* Payload setup instructions: Force allocate
- * the dst to the given register (so the MOV
- * will disappear).
- */
- assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV);
- assert(inst->dst.file == QFILE_TEMP);
- ra_set_node_reg(g,
- temp_to_node[inst->dst.index],
- PHYS_INDEX +
- inst->src[0].index);
- break;
- }
- }
-
- if (inst->dst.file == QFILE_TEMP) {
- /* Only a ldunif gets to write to R5, which only has a
- * single 32-bit channel of storage.
- */
- if (!inst->qpu.sig.ldunif) {
- class_bits[inst->dst.index] &= ~CLASS_BIT_R5;
- } else {
- /* Until V3D 4.x, we could only load a uniform
- * to r5, so we'll need to spill if uniform
- * loads interfere with each other.
- */
- if (c->devinfo->ver < 40) {
- class_bits[inst->dst.index] &=
- CLASS_BIT_R5;
- }
- }
- }
-
- if (inst->qpu.sig.thrsw) {
- /* All accumulators are invalidated across a thread
- * switch.
- */
- for (int i = 0; i < c->num_temps; i++) {
- if (c->temp_start[i] < ip && c->temp_end[i] > ip)
- class_bits[i] &= CLASS_BIT_PHYS;
- }
- }
-
- ip++;
+ update_graph_and_reg_classes_for_inst(c, acc_nodes,
+ implicit_rf_nodes,
+ last_ldvary_ip, inst);
}
+ /* Flag the nodes that are used in the last instructions of the program
+ * (there are some registers that cannot be used in the last 3
+ * instructions). We only do this for fragment shaders, because the idea
+ * is that by avoiding this conflict we may be able to emit the last
+ * thread switch earlier in some cases. In non-fragment shaders
+ * this won't happen because the last instructions are always VPM stores
+ * with a small immediate, which conflicts with other signals,
+ * preventing us from ever moving the thrsw earlier.
+ */
+ if (c->s->info.stage == MESA_SHADER_FRAGMENT)
+ flag_program_end_nodes(c);
+
+ /* Set the register classes for all our temporaries in the graph */
for (uint32_t i = 0; i < c->num_temps; i++) {
- if (class_bits[i] == CLASS_BIT_PHYS) {
- ra_set_node_class(g, temp_to_node[i],
- c->compiler->reg_class_phys[thread_index]);
- } else if (class_bits[i] == (CLASS_BIT_R5)) {
- ra_set_node_class(g, temp_to_node[i],
- c->compiler->reg_class_r5[thread_index]);
- } else if (class_bits[i] == (CLASS_BIT_PHYS | CLASS_BIT_ACC)) {
- ra_set_node_class(g, temp_to_node[i],
- c->compiler->reg_class_phys_or_acc[thread_index]);
- } else {
- assert(class_bits[i] == CLASS_BITS_ANY);
- ra_set_node_class(g, temp_to_node[i],
- c->compiler->reg_class_any[thread_index]);
- }
+ ra_set_node_class(c->g, temp_to_node(c, i),
+ choose_reg_class_for_temp(c, i));
}
+ /* Add register interferences based on liveness data */
for (uint32_t i = 0; i < c->num_temps; i++) {
+ /* And while we are here, let's also flag nodes for
+ * unused temps.
+ */
+ if (c->temp_start[i] > c->temp_end[i])
+ c->nodes.info[temp_to_node(c, i)].unused = true;
+
for (uint32_t j = i + 1; j < c->num_temps; j++) {
- if (!(c->temp_start[i] >= c->temp_end[j] ||
- c->temp_start[j] >= c->temp_end[i])) {
- ra_add_node_interference(g,
- temp_to_node[i],
- temp_to_node[j]);
+ if (interferes(c->temp_start[i], c->temp_end[i],
+ c->temp_start[j], c->temp_end[j])) {
+ ra_add_node_interference(c->g,
+ temp_to_node(c, i),
+ temp_to_node(c, j));
}
}
}
- /* Debug code to force a bit of register spilling, for running across
- * conformance tests to make sure that spilling works.
+ /* Debug option to force a bit of TMU spilling, for running
+ * across conformance tests to make sure that spilling works.
*/
- int force_register_spills = 0;
- if (c->spill_size <
- V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) {
- int node = v3d_choose_spill_node(c, g, temp_to_node);
- if (node != -1) {
- v3d_spill_reg(c, map[node].temp);
- ralloc_free(g);
- *spilled = true;
- return NULL;
+ const int force_register_spills = 0;
+ if (force_register_spills > 0)
+ c->max_tmu_spills = UINT32_MAX;
+
+ struct qpu_reg *temp_registers = NULL;
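+ /* Try to allocate, spilling a candidate temp and retrying on failure,
+ * until allocation succeeds or we run out of candidates or spill budget.
+ */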
+ while (true) {
+ if (c->spill_size <
+ V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) {
+ int node = v3d_choose_spill_node(c);
+ uint32_t temp = node_to_temp(c, node);
+ if (node != -1) {
+ v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp);
+ continue;
+ }
}
- }
-
- bool ok = ra_allocate(g);
- if (!ok) {
- const uint32_t spill_batch_size = get_spill_batch_size(c);
-
- for (uint32_t i = 0; i < spill_batch_size; i++) {
- int node = v3d_choose_spill_node(c, g, temp_to_node);
- if (node == -1)
- break;
-
- /* TMU spills inject thrsw signals that invalidate
- * accumulators, so we can't batch them.
- */
- bool is_uniform = vir_is_mov_uniform(c, map[node].temp);
- if (i > 0 && !is_uniform)
- break;
- if (is_uniform || tmu_spilling_allowed(c, thread_index)) {
- v3d_spill_reg(c, map[node].temp);
-
- /* Ask the outer loop to call back in. */
- *spilled = true;
+ if (ra_allocate(c->g))
+ break;
- /* See comment above about batching TMU spills.
- */
- if (!is_uniform) {
- assert(i == 0);
- break;
- }
- } else {
- break;
- }
+ /* Failed allocation, try to spill */
+ int node = v3d_choose_spill_node(c);
+ if (node == -1)
+ goto spill_fail;
+
+ uint32_t temp = node_to_temp(c, node);
+ enum temp_spill_type spill_type =
+ get_spill_type_for_temp(c, temp);
+ if (spill_type != SPILL_TYPE_TMU || tmu_spilling_allowed(c)) {
+ v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp);
+ if (c->spills + c->fills > c->max_tmu_spills)
+ goto spill_fail;
+ } else {
+ goto spill_fail;
}
-
- ralloc_free(g);
- return NULL;
}
- /* Ensure that we are not accessing temp_to_node out of bounds. We
- * should never trigger this assertion because `c->num_temps` only
- * grows when we spill, in which case we return early and don't get
- * here.
- */
- assert(start_num_temps == c->num_temps);
- struct qpu_reg *temp_registers = calloc(c->num_temps,
- sizeof(*temp_registers));
-
+ /* Allocation was successful, build the 'temp -> reg' map */
+ temp_registers = calloc(c->num_temps, sizeof(*temp_registers));
for (uint32_t i = 0; i < c->num_temps; i++) {
- int ra_reg = ra_get_node_reg(g, temp_to_node[i]);
- if (ra_reg < PHYS_INDEX) {
+ int ra_reg = ra_get_node_reg(c->g, temp_to_node(c, i));
+ if (ra_reg < phys_index) {
temp_registers[i].magic = true;
temp_registers[i].index = (V3D_QPU_WADDR_R0 +
ra_reg - ACC_INDEX);
} else {
temp_registers[i].magic = false;
- temp_registers[i].index = ra_reg - PHYS_INDEX;
+ temp_registers[i].index = ra_reg - phys_index;
}
}
- ralloc_free(g);
-
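+ /* Common cleanup path: reached both on success and when we bail out
+ * because spilling can't make the allocation succeed, in which case
+ * temp_registers is still NULL.
+ */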
+spill_fail:
+ ralloc_free(c->nodes.info);
+ c->nodes.info = NULL;
+ c->nodes.alloc_count = 0;
+ ralloc_free(c->g);
+ c->g = NULL;
return temp_registers;
}
diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c
index aa33545420e..605c3e4c7d5 100644
--- a/src/broadcom/compiler/vir_to_qpu.c
+++ b/src/broadcom/compiler/vir_to_qpu.c
@@ -45,12 +45,6 @@ qpu_magic(enum v3d_qpu_waddr waddr)
return reg;
}
-static inline struct qpu_reg
-qpu_acc(int acc)
-{
- return qpu_magic(V3D_QPU_WADDR_R0 + acc);
-}
-
struct v3d_qpu_instr
v3d_qpu_nop(void)
{
@@ -92,15 +86,32 @@ new_qpu_nop_before(struct qinst *inst)
return q;
}
+static void
+v3d71_set_src(struct v3d_qpu_instr *instr, uint8_t *raddr, struct qpu_reg src)
+{
+ /* If we have a small immediate, move it from inst->raddr_b to the
+ * corresponding raddr.
+ */
+ if (src.smimm) {
+ assert(instr->sig.small_imm_a || instr->sig.small_imm_b ||
+ instr->sig.small_imm_c || instr->sig.small_imm_d);
+ *raddr = instr->raddr_b;
+ return;
+ }
+
+ assert(!src.magic);
+ *raddr = src.index;
+}
+
/**
* Allocates the src register (accumulator or register file) into the RADDR
* fields of the instruction.
*/
static void
-set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
+v3d42_set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
{
if (src.smimm) {
- assert(instr->sig.small_imm);
+ assert(instr->sig.small_imm_b);
*mux = V3D_QPU_MUX_B;
return;
}
@@ -112,20 +123,20 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
return;
}
- if (instr->alu.add.a != V3D_QPU_MUX_A &&
- instr->alu.add.b != V3D_QPU_MUX_A &&
- instr->alu.mul.a != V3D_QPU_MUX_A &&
- instr->alu.mul.b != V3D_QPU_MUX_A) {
+ if (instr->alu.add.a.mux != V3D_QPU_MUX_A &&
+ instr->alu.add.b.mux != V3D_QPU_MUX_A &&
+ instr->alu.mul.a.mux != V3D_QPU_MUX_A &&
+ instr->alu.mul.b.mux != V3D_QPU_MUX_A) {
instr->raddr_a = src.index;
*mux = V3D_QPU_MUX_A;
} else {
if (instr->raddr_a == src.index) {
*mux = V3D_QPU_MUX_A;
} else {
- assert(!(instr->alu.add.a == V3D_QPU_MUX_B &&
- instr->alu.add.b == V3D_QPU_MUX_B &&
- instr->alu.mul.a == V3D_QPU_MUX_B &&
- instr->alu.mul.b == V3D_QPU_MUX_B) ||
+ assert(!(instr->alu.add.a.mux == V3D_QPU_MUX_B &&
+ instr->alu.add.b.mux == V3D_QPU_MUX_B &&
+ instr->alu.mul.a.mux == V3D_QPU_MUX_B &&
+ instr->alu.mul.b.mux == V3D_QPU_MUX_B) ||
src.index == instr->raddr_b);
instr->raddr_b = src.index;
@@ -134,33 +145,40 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
}
}
-static bool
-is_no_op_mov(struct qinst *qinst)
+/*
+ * The main purpose of the following wrapper is to make calling set_src
+ * cleaner, which is why it receives both mux and raddr pointers; only one
+ * of them is filled in, depending on the device version.
+ */
+static void
+set_src(struct v3d_qpu_instr *instr,
+ enum v3d_qpu_mux *mux,
+ uint8_t *raddr,
+ struct qpu_reg src,
+ const struct v3d_device_info *devinfo)
{
- static const struct v3d_qpu_sig no_sig = {0};
-
- /* Make sure it's just a lone MOV. */
- if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
- qinst->qpu.alu.mul.op != V3D_QPU_M_MOV ||
- qinst->qpu.alu.add.op != V3D_QPU_A_NOP ||
- memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) {
- return false;
- }
+ if (devinfo->ver < 71)
+ return v3d42_set_src(instr, mux, src);
+ else
+ return v3d71_set_src(instr, raddr, src);
+}
- /* Check if it's a MOV from a register to itself. */
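+/* Checks whether the mul MOV reads the same register it writes. This is
+ * per-version because sources are encoded as muxes on V3D 4.2 and as
+ * raddrs on V3D 7.x.
+ */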
+static bool
+v3d42_mov_src_and_dst_equal(struct qinst *qinst)
+{
enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr;
if (qinst->qpu.alu.mul.magic_write) {
if (waddr < V3D_QPU_WADDR_R0 || waddr > V3D_QPU_WADDR_R4)
return false;
- if (qinst->qpu.alu.mul.a !=
+ if (qinst->qpu.alu.mul.a.mux !=
V3D_QPU_MUX_R0 + (waddr - V3D_QPU_WADDR_R0)) {
return false;
}
} else {
int raddr;
- switch (qinst->qpu.alu.mul.a) {
+ switch (qinst->qpu.alu.mul.a.mux) {
case V3D_QPU_MUX_A:
raddr = qinst->qpu.raddr_a;
break;
@@ -174,10 +192,61 @@ is_no_op_mov(struct qinst *qinst)
return false;
}
+ return true;
+}
+
+static bool
+v3d71_mov_src_and_dst_equal(struct qinst *qinst)
+{
+ if (qinst->qpu.alu.mul.magic_write)
+ return false;
+
+ enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr;
+ int raddr;
+
+ raddr = qinst->qpu.alu.mul.a.raddr;
+ if (raddr != waddr)
+ return false;
+
+ return true;
+}
+
+static bool
+mov_src_and_dst_equal(struct qinst *qinst,
+ const struct v3d_device_info *devinfo)
+{
+ if (devinfo->ver < 71)
+ return v3d42_mov_src_and_dst_equal(qinst);
+ else
+ return v3d71_mov_src_and_dst_equal(qinst);
+}
+
+
+static bool
+is_no_op_mov(struct qinst *qinst,
+ const struct v3d_device_info *devinfo)
+{
+ static const struct v3d_qpu_sig no_sig = {0};
+
+ /* Make sure it's just a lone MOV. We only check for M_MOV. Although
+ * for V3D 7.x there is also A_MOV, we don't need to check for it as
+ * we always emit using M_MOV. We could use A_MOV later, during
+ * scheduling, to improve performance.
+ */
+ if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
+ qinst->qpu.alu.mul.op != V3D_QPU_M_MOV ||
+ qinst->qpu.alu.add.op != V3D_QPU_A_NOP ||
+ memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) {
+ return false;
+ }
+
+ if (!mov_src_and_dst_equal(qinst, devinfo))
+ return false;
+
/* No packing or flags updates, or we need to execute the
* instruction.
*/
- if (qinst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE ||
+ if (qinst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
qinst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE ||
qinst->qpu.flags.mc != V3D_QPU_COND_NONE ||
qinst->qpu.flags.mpf != V3D_QPU_PF_NONE ||
@@ -193,8 +262,6 @@ v3d_generate_code_block(struct v3d_compile *c,
struct qblock *block,
struct qpu_reg *temp_registers)
{
- int last_vpm_read_index = -1;
-
vir_for_each_inst_safe(qinst, block) {
#if 0
fprintf(stderr, "translating qinst to qpu: ");
@@ -202,8 +269,6 @@ v3d_generate_code_block(struct v3d_compile *c,
fprintf(stderr, "\n");
#endif
- struct qinst *temp;
-
if (vir_has_uniform(qinst))
c->num_uniforms++;
@@ -219,8 +284,14 @@ v3d_generate_code_block(struct v3d_compile *c,
src[i] = qpu_magic(qinst->src[i].index);
break;
case QFILE_NULL:
+ /* QFILE_NULL is an undef, so we can load
+ * anything. Using a reg that doesn't have
+ * scheduling restrictions.
+ */
+ src[i] = qpu_reg(5);
+ break;
case QFILE_LOAD_IMM:
- src[i] = qpu_acc(0);
+ assert(!"not reached");
break;
case QFILE_TEMP:
src[i] = temp_registers[index];
@@ -228,18 +299,6 @@ v3d_generate_code_block(struct v3d_compile *c,
case QFILE_SMALL_IMM:
src[i].smimm = true;
break;
-
- case QFILE_VPM:
- assert((int)qinst->src[i].index >=
- last_vpm_read_index);
- (void)last_vpm_read_index;
- last_vpm_read_index = qinst->src[i].index;
-
- temp = new_qpu_nop_before(qinst);
- temp->qpu.sig.ldvpm = true;
-
- src[i] = qpu_acc(3);
- break;
}
}
@@ -261,10 +320,6 @@ v3d_generate_code_block(struct v3d_compile *c,
dst = temp_registers[qinst->dst.index];
break;
- case QFILE_VPM:
- dst = qpu_magic(V3D_QPU_WADDR_VPM);
- break;
-
case QFILE_SMALL_IMM:
case QFILE_LOAD_IMM:
assert(!"not reached");
@@ -276,10 +331,15 @@ v3d_generate_code_block(struct v3d_compile *c,
assert(qinst->qpu.alu.add.op == V3D_QPU_A_NOP);
assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP);
- if (!dst.magic ||
- dst.index != V3D_QPU_WADDR_R5) {
- assert(c->devinfo->ver >= 40);
+ bool use_rf;
+ if (c->devinfo->has_accumulators) {
+ use_rf = !dst.magic ||
+ dst.index != V3D_QPU_WADDR_R5;
+ } else {
+ use_rf = dst.magic || dst.index != 0;
+ }
+ if (use_rf) {
if (qinst->qpu.sig.ldunif) {
qinst->qpu.sig.ldunif = false;
qinst->qpu.sig.ldunifrf = true;
@@ -299,13 +359,18 @@ v3d_generate_code_block(struct v3d_compile *c,
qinst->qpu.sig_magic = dst.magic;
} else if (qinst->qpu.alu.add.op != V3D_QPU_A_NOP) {
assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP);
+
if (nsrc >= 1) {
set_src(&qinst->qpu,
- &qinst->qpu.alu.add.a, src[0]);
+ &qinst->qpu.alu.add.a.mux,
+ &qinst->qpu.alu.add.a.raddr,
+ src[0], c->devinfo);
}
if (nsrc >= 2) {
set_src(&qinst->qpu,
- &qinst->qpu.alu.add.b, src[1]);
+ &qinst->qpu.alu.add.b.mux,
+ &qinst->qpu.alu.add.b.raddr,
+ src[1], c->devinfo);
}
qinst->qpu.alu.add.waddr = dst.index;
@@ -313,17 +378,21 @@ v3d_generate_code_block(struct v3d_compile *c,
} else {
if (nsrc >= 1) {
set_src(&qinst->qpu,
- &qinst->qpu.alu.mul.a, src[0]);
+ &qinst->qpu.alu.mul.a.mux,
+ &qinst->qpu.alu.mul.a.raddr,
+ src[0], c->devinfo);
}
if (nsrc >= 2) {
set_src(&qinst->qpu,
- &qinst->qpu.alu.mul.b, src[1]);
+ &qinst->qpu.alu.mul.b.mux,
+ &qinst->qpu.alu.mul.b.raddr,
+ src[1], c->devinfo);
}
qinst->qpu.alu.mul.waddr = dst.index;
qinst->qpu.alu.mul.magic_write = dst.magic;
- if (is_no_op_mov(qinst)) {
+ if (is_no_op_mov(qinst, c->devinfo)) {
vir_remove_instruction(c, qinst);
continue;
}
@@ -378,11 +447,7 @@ v3d_dump_qpu(struct v3d_compile *c)
const char *str = v3d_qpu_disasm(c->devinfo, c->qpu_insts[i]);
fprintf(stderr, "0x%016"PRIx64" %s", c->qpu_insts[i], str);
- /* We can only do this on 4.x, because we're not tracking TMU
- * implicit uniforms here on 3.x.
- */
- if (c->devinfo->ver >= 40 &&
- reads_uniform(c->devinfo, c->qpu_insts[i])) {
+ if (reads_uniform(c->devinfo, c->qpu_insts[i])) {
fprintf(stderr, " (");
vir_dump_uniform(c->uniform_contents[next_uniform],
c->uniform_data[next_uniform]);
@@ -394,8 +459,7 @@ v3d_dump_qpu(struct v3d_compile *c)
}
/* Make sure our dumping lined up. */
- if (c->devinfo->ver >= 40)
- assert(next_uniform == c->num_uniforms);
+ assert(next_uniform == c->num_uniforms);
fprintf(stderr, "\n");
}
@@ -431,8 +495,8 @@ v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers)
}
assert(i == c->qpu_inst_count);
- if (V3D_DEBUG & (V3D_DEBUG_QPU |
- v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
+ if (V3D_DBG(QPU) ||
+ v3d_debug_flag_for_shader_stage(c->s->info.stage)) {
v3d_dump_qpu(c);
}
diff --git a/src/broadcom/drm-shim/README.md b/src/broadcom/drm-shim/README.md
index 16cbff75825..614cc8304bf 100644
--- a/src/broadcom/drm-shim/README.md
+++ b/src/broadcom/drm-shim/README.md
@@ -1,12 +1,3 @@
-### v3d backend
-
-This implements some of v3d using the closed source v3dv3 tree's
-C/C++-based simulator. All execution is synchronous.
-
-Export: `MESA_LOADER_DRIVER_OVERRIDE=v3d
-LD_PRELOAD=$prefix/lib/libv3d_drm_shim.so`. The v3dv3 version exposed
-will depend on the v3dv3 build -- 3.3, 4.1, and 4.2 are supported.
-
### v3d_noop backend
This implements the minimum of v3d in order to make shader-db work.
diff --git a/src/broadcom/drm-shim/meson.build b/src/broadcom/drm-shim/meson.build
index b44b6c15d18..212c0287aa8 100644
--- a/src/broadcom/drm-shim/meson.build
+++ b/src/broadcom/drm-shim/meson.build
@@ -19,55 +19,19 @@
# SOFTWARE.
libvc4_noop_drm_shim = shared_library(
- ['vc4_noop_drm_shim'],
+ 'vc4_noop_drm_shim',
'vc4_noop.c',
- include_directories: [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux],
+ include_directories: [inc_include, inc_src],
dependencies: dep_drm_shim,
gnu_symbol_visibility : 'hidden',
install : true,
)
libv3d_noop_drm_shim = shared_library(
- ['v3d_noop_drm_shim'],
+ 'v3d_noop_drm_shim',
'v3d_noop.c',
- include_directories: [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux],
+ include_directories: [inc_include, inc_src],
dependencies: dep_drm_shim,
gnu_symbol_visibility : 'hidden',
install : true,
)
-
-dep_v3dv3 = dependency('v3dv3', required: false)
-if dep_v3dv3.found()
- v3dv3_c_args = '-DUSE_V3D_SIMULATOR'
-
- inc_gallium_v3d = include_directories('../../gallium/drivers/v3d')
-
- per_version_libs = []
- foreach ver : v3d_versions
- per_version_libs += static_library(
- 'libv3d_drm_shim-v' + ver,
- [
- 'v3dx.c',
- v3d_xml_pack
- ],
- include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom, inc_gallium_v3d, inc_simulator],
- c_args : [no_override_init_args, '-DV3D_VERSION=' + ver, v3dv3_c_args],
- gnu_symbol_visibility : 'hidden',
- dependencies: [dep_valgrind, dep_thread, dep_v3dv3],
- )
- endforeach
-
- libv3d_drm_shim = shared_library(
- ['v3d_drm_shim'],
- [
- 'v3d.c',
- '../simulator/v3d_simulator_wrapper.cpp',
- ],
- dependencies: [idep_mesautil, dep_dl, dep_drm_shim, dep_v3dv3],
- link_with: per_version_libs,
- include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom, inc_gallium_v3d, inc_simulator],
- c_args : [no_override_init_args, '-std=gnu99', v3dv3_c_args],
- gnu_symbol_visibility : 'hidden',
- cpp_args : [v3dv3_c_args]
- )
-endif
diff --git a/src/broadcom/drm-shim/v3d.c b/src/broadcom/drm-shim/v3d.c
deleted file mode 100644
index f4d5bd31323..00000000000
--- a/src/broadcom/drm-shim/v3d.c
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright © 2018 Broadcom
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-#include <stdio.h>
-#include <sys/ioctl.h>
-#include "drm-uapi/v3d_drm.h"
-#include "drm-shim/drm_shim.h"
-#include "v3d.h"
-#include "v3d_simulator_wrapper.h"
-
-bool drm_shim_driver_prefers_first_render_node = false;
-
-static struct v3d_device_info devinfo;
-struct v3d_shim_device v3d = {
- .devinfo = &devinfo
-};
-
-struct v3d_bo *v3d_bo_lookup(struct shim_fd *shim_fd, int handle)
-{
- return v3d_bo(drm_shim_bo_lookup(shim_fd, handle));
-}
-
-int
-v3d_ioctl_wait_bo(int fd, unsigned long request, void *arg)
-{
- /* No need to wait on anything yet, given that we submit
- * synchronously.
- */
- return 0;
-}
-
-int
-v3d_ioctl_mmap_bo(int fd, unsigned long request, void *arg)
-{
- struct shim_fd *shim_fd = drm_shim_fd_lookup(fd);
- struct drm_v3d_mmap_bo *map = arg;
- struct shim_bo *bo = drm_shim_bo_lookup(shim_fd, map->handle);
-
- map->offset = drm_shim_bo_get_mmap_offset(shim_fd, bo);
-
- drm_shim_bo_put(bo);
-
- return 0;
-}
-
-int
-v3d_ioctl_get_bo_offset(int fd, unsigned long request, void *arg)
-{
- struct shim_fd *shim_fd = drm_shim_fd_lookup(fd);
- struct drm_v3d_get_bo_offset *get = arg;
- struct v3d_bo *bo = v3d_bo_lookup(shim_fd, get->handle);
-
- get->offset = bo->offset;
-
- drm_shim_bo_put(&bo->base);
-
- return 0;
-}
-
-void
-drm_shim_driver_init(void)
-{
- shim_device.bus_type = DRM_BUS_PLATFORM;
- shim_device.driver_name = "v3d";
-
- drm_shim_override_file("OF_FULLNAME=/rdb/v3d\n"
- "OF_COMPATIBLE_N=1\n"
- "OF_COMPATIBLE_0=brcm,7278-v3d\n",
- "/sys/dev/char/%d:%d/device/uevent",
- DRM_MAJOR, render_node_minor);
-
- v3d.hw = v3d_hw_auto_new(NULL);
- v3d.devinfo->ver = v3d_hw_get_version(v3d.hw);
-
- if (v3d.devinfo->ver >= 42)
- v3d42_drm_shim_driver_init();
- else if (v3d.devinfo->ver >= 41)
- v3d41_drm_shim_driver_init();
- else
- v3d33_drm_shim_driver_init();
-}
diff --git a/src/broadcom/drm-shim/v3d.h b/src/broadcom/drm-shim/v3d.h
deleted file mode 100644
index 0712b8b3f24..00000000000
--- a/src/broadcom/drm-shim/v3d.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright © 2018 Broadcom
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#ifndef DRM_SHIM_V3D_H
-#define DRM_SHIM_V3D_H
-
-#include "broadcom/common/v3d_device_info.h"
-#include "util/vma.h"
-
-struct drm_shim_fd;
-
-struct v3d_shim_device {
- struct v3d_hw *hw;
- struct v3d_device_info *devinfo;
-
- /* Base virtual address of the heap. */
- void *mem;
- /* Base hardware address of the heap. */
- uint32_t mem_base;
- /* Size of the heap. */
- size_t mem_size;
-
- /* Allocator for the GPU virtual addresses. */
- struct util_vma_heap heap;
-};
-extern struct v3d_shim_device v3d;
-
-struct v3d_bo {
- struct shim_bo base;
- uint64_t offset;
- void *sim_vaddr;
- void *gem_vaddr;
-};
-
-static inline struct v3d_bo *
-v3d_bo(struct shim_bo *bo)
-{
- return (struct v3d_bo *)bo;
-}
-
-struct v3d_bo *v3d_bo_lookup(struct shim_fd *shim_fd, int handle);
-int v3d_ioctl_wait_bo(int fd, unsigned long request, void *arg);
-int v3d_ioctl_mmap_bo(int fd, unsigned long request, void *arg);
-int v3d_ioctl_get_bo_offset(int fd, unsigned long request, void *arg);
-
-void v3d33_drm_shim_driver_init(void);
-void v3d41_drm_shim_driver_init(void);
-void v3d42_drm_shim_driver_init(void);
-
-#endif /* DRM_SHIM_V3D_H */
diff --git a/src/broadcom/drm-shim/v3d_noop.c b/src/broadcom/drm-shim/v3d_noop.c
index fd92e8859c5..8a27052441b 100644
--- a/src/broadcom/drm-shim/v3d_noop.c
+++ b/src/broadcom/drm-shim/v3d_noop.c
@@ -122,6 +122,15 @@ v3d_ioctl_get_param(int fd, unsigned long request, void *arg)
case DRM_V3D_PARAM_SUPPORTS_TFU:
gp->value = 1;
return 0;
+ case DRM_V3D_PARAM_SUPPORTS_CSD:
+ gp->value = 1;
+ return 0;
+ case DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH:
+ gp->value = 1;
+ return 0;
+ case DRM_V3D_PARAM_SUPPORTS_PERFMON:
+ gp->value = 1;
+ return 0;
default:
break;
}
diff --git a/src/broadcom/drm-shim/v3dx.c b/src/broadcom/drm-shim/v3dx.c
deleted file mode 100644
index a22550a03a5..00000000000
--- a/src/broadcom/drm-shim/v3dx.c
+++ /dev/null
@@ -1,370 +0,0 @@
-/*
- * Copyright © 2014-2017 Broadcom
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-/* @file
- *
- * v3d driver code interacting v3dv3 simulator/fpga library.
- *
- * This is compiled per V3D version we support, since the register definitions
- * conflict.
- */
-
-#include <errno.h>
-#include <stdbool.h>
-#include <stdio.h>
-#include <string.h>
-#include <sys/mman.h>
-#include "util/macros.h"
-#include "util/u_mm.h"
-#include "broadcom/common/v3d_macros.h"
-#include "v3d_simulator_wrapper.h"
-#include "drm-shim/drm_shim.h"
-#include "drm-uapi/v3d_drm.h"
-#include "v3d.h"
-
-#define HW_REGISTER_RO(x) (x)
-#define HW_REGISTER_RW(x) (x)
-#if V3D_VERSION >= 41
-#include "libs/core/v3d/registers/4.1.34.0/v3d.h"
-#else
-#include "libs/core/v3d/registers/3.3.0.0/v3d.h"
-#endif
-
-#define V3D_WRITE(reg, val) v3d_hw_write_reg(v3d.hw, reg, val)
-#define V3D_READ(reg) v3d_hw_read_reg(v3d.hw, reg)
-
-static void
-v3d_flush_l3()
-{
- if (!v3d_hw_has_gca(v3d.hw))
- return;
-
-#if V3D_VERSION < 40
- uint32_t gca_ctrl = V3D_READ(V3D_GCA_CACHE_CTRL);
-
- V3D_WRITE(V3D_GCA_CACHE_CTRL, gca_ctrl | V3D_GCA_CACHE_CTRL_FLUSH_SET);
- V3D_WRITE(V3D_GCA_CACHE_CTRL, gca_ctrl & ~V3D_GCA_CACHE_CTRL_FLUSH_SET);
-#endif
-}
-
-/* Invalidates the L2 cache. This is a read-only cache. */
-static void
-v3d_flush_l2(void)
-{
- V3D_WRITE(V3D_CTL_0_L2CACTL,
- V3D_CTL_0_L2CACTL_L2CCLR_SET |
- V3D_CTL_0_L2CACTL_L2CENA_SET);
-}
-
-/* Invalidates texture L2 cachelines */
-static void
-v3d_flush_l2t(void)
-{
- V3D_WRITE(V3D_CTL_0_L2TFLSTA, 0);
- V3D_WRITE(V3D_CTL_0_L2TFLEND, ~0);
- V3D_WRITE(V3D_CTL_0_L2TCACTL,
- V3D_CTL_0_L2TCACTL_L2TFLS_SET |
- (0 << V3D_CTL_0_L2TCACTL_L2TFLM_LSB));
-}
-
-/* Invalidates the slice caches. These are read-only caches. */
-static void
-v3d_flush_slices(void)
-{
- V3D_WRITE(V3D_CTL_0_SLCACTL, ~0);
-}
-
-static void
-v3d_flush_caches(void)
-{
- v3d_flush_l3();
- v3d_flush_l2();
- v3d_flush_l2t();
- v3d_flush_slices();
-}
-
-static void
-v3d_simulator_copy_in_handle(struct shim_fd *shim_fd, int handle)
-{
- if (!handle)
- return;
-
- struct v3d_bo *bo = v3d_bo_lookup(shim_fd, handle);
-
- memcpy(bo->sim_vaddr, bo->gem_vaddr, bo->base.size);
-}
-
-static void
-v3d_simulator_copy_out_handle(struct shim_fd *shim_fd, int handle)
-{
- if (!handle)
- return;
-
- struct v3d_bo *bo = v3d_bo_lookup(shim_fd, handle);
-
- memcpy(bo->gem_vaddr, bo->sim_vaddr, bo->base.size);
-}
-
-static int
-v3dX(v3d_ioctl_submit_cl)(int fd, unsigned long request, void *arg)
-{
- struct shim_fd *shim_fd = drm_shim_fd_lookup(fd);
- struct drm_v3d_submit_cl *submit = arg;
- uint32_t *bo_handles = (uint32_t *)(uintptr_t)submit->bo_handles;
-
- for (int i = 0; i < submit->bo_handle_count; i++)
- v3d_simulator_copy_in_handle(shim_fd, bo_handles[i]);
-
- v3d_flush_caches();
-
- if (submit->qma) {
- V3D_WRITE(V3D_CLE_0_CT0QMA, submit->qma);
- V3D_WRITE(V3D_CLE_0_CT0QMS, submit->qms);
- }
-#if V3D_VERSION >= 41
- if (submit->qts) {
- V3D_WRITE(V3D_CLE_0_CT0QTS,
- V3D_CLE_0_CT0QTS_CTQTSEN_SET |
- submit->qts);
- }
-#endif
-
- fprintf(stderr, "submit %x..%x!\n", submit->bcl_start, submit->bcl_end);
-
- V3D_WRITE(V3D_CLE_0_CT0QBA, submit->bcl_start);
- V3D_WRITE(V3D_CLE_0_CT0QEA, submit->bcl_end);
-
- /* Wait for bin to complete before firing render, as it seems the
- * simulator doesn't implement the semaphores.
- */
- while (V3D_READ(V3D_CLE_0_CT0CA) !=
- V3D_READ(V3D_CLE_0_CT0EA)) {
- v3d_hw_tick(v3d.hw);
- }
-
- fprintf(stderr, "submit %x..%x!\n", submit->rcl_start, submit->rcl_end);
-
- v3d_flush_caches();
-
- V3D_WRITE(V3D_CLE_0_CT1QBA, submit->rcl_start);
- V3D_WRITE(V3D_CLE_0_CT1QEA, submit->rcl_end);
-
- while (V3D_READ(V3D_CLE_0_CT1CA) !=
- V3D_READ(V3D_CLE_0_CT1EA)) {
- v3d_hw_tick(v3d.hw);
- }
-
- for (int i = 0; i < submit->bo_handle_count; i++)
- v3d_simulator_copy_out_handle(shim_fd, bo_handles[i]);
-
- return 0;
-}
-
-static int
-v3dX(v3d_ioctl_submit_tfu)(int fd, unsigned long request, void *arg)
-{
- struct shim_fd *shim_fd = drm_shim_fd_lookup(fd);
- struct drm_v3d_submit_tfu *submit = arg;
-
- v3d_simulator_copy_in_handle(shim_fd, submit->bo_handles[0]);
- v3d_simulator_copy_in_handle(shim_fd, submit->bo_handles[1]);
- v3d_simulator_copy_in_handle(shim_fd, submit->bo_handles[2]);
- v3d_simulator_copy_in_handle(shim_fd, submit->bo_handles[3]);
-
- int last_vtct = V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET;
-
- V3D_WRITE(V3D_TFU_IIA, submit->iia);
- V3D_WRITE(V3D_TFU_IIS, submit->iis);
- V3D_WRITE(V3D_TFU_ICA, submit->ica);
- V3D_WRITE(V3D_TFU_IUA, submit->iua);
- V3D_WRITE(V3D_TFU_IOA, submit->ioa);
- V3D_WRITE(V3D_TFU_IOS, submit->ios);
- V3D_WRITE(V3D_TFU_COEF0, submit->coef[0]);
- V3D_WRITE(V3D_TFU_COEF1, submit->coef[1]);
- V3D_WRITE(V3D_TFU_COEF2, submit->coef[2]);
- V3D_WRITE(V3D_TFU_COEF3, submit->coef[3]);
-
- V3D_WRITE(V3D_TFU_ICFG, submit->icfg);
-
- while ((V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET) == last_vtct) {
- v3d_hw_tick(v3d.hw);
- }
-
- v3d_simulator_copy_out_handle(shim_fd, submit->bo_handles[0]);
-
- return 0;
-}
-
-static int
-v3dX(v3d_ioctl_create_bo)(int fd, unsigned long request, void *arg)
-{
- struct shim_fd *shim_fd = drm_shim_fd_lookup(fd);
- struct drm_v3d_create_bo *create = arg;
- struct v3d_bo *bo = calloc(1, sizeof(*bo));
-
- drm_shim_bo_init(&bo->base, create->size);
- bo->offset = util_vma_heap_alloc(&v3d.heap, create->size, 4096);
- if (bo->offset == 0)
- return -ENOMEM;
-
- bo->sim_vaddr = v3d.mem + bo->offset - v3d.mem_base;
-#if 0
- /* Place a mapping of the BO inside of the simulator's address space
- * for V3D memory. This lets us avoid copy in/out for simpenrose, but
- * I'm betting we'll need something else for FPGA.
- */
- void *sim_addr = v3d.mem + bo->block->ofs;
- void *mmap_ret = mmap(sim_addr, create->size, PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_FIXED, bo->base.fd, 0);
- assert(mmap_ret == sim_addr);
-#else
- /* Make a simulator-private mapping of the shim GEM object. */
- bo->gem_vaddr = mmap(NULL, bo->base.size,
- PROT_READ | PROT_WRITE,
- MAP_SHARED,
- bo->base.fd, 0);
- if (bo->gem_vaddr == MAP_FAILED) {
- fprintf(stderr, "v3d: mmap of shim bo failed\n");
- abort();
- }
-#endif
-
- create->offset = bo->offset;
- create->handle = drm_shim_bo_get_handle(shim_fd, &bo->base);
-
- drm_shim_bo_put(&bo->base);
-
- return 0;
-}
-
-static int
-v3dX(v3d_ioctl_get_param)(int fd, unsigned long request, void *arg)
-{
- struct drm_v3d_get_param *gp = arg;
- static const uint32_t reg_map[] = {
- [DRM_V3D_PARAM_V3D_UIFCFG] = V3D_HUB_CTL_UIFCFG,
- [DRM_V3D_PARAM_V3D_HUB_IDENT1] = V3D_HUB_CTL_IDENT1,
- [DRM_V3D_PARAM_V3D_HUB_IDENT2] = V3D_HUB_CTL_IDENT2,
- [DRM_V3D_PARAM_V3D_HUB_IDENT3] = V3D_HUB_CTL_IDENT3,
- [DRM_V3D_PARAM_V3D_CORE0_IDENT0] = V3D_CTL_0_IDENT0,
- [DRM_V3D_PARAM_V3D_CORE0_IDENT1] = V3D_CTL_0_IDENT1,
- [DRM_V3D_PARAM_V3D_CORE0_IDENT2] = V3D_CTL_0_IDENT2,
- };
-
- switch (gp->param) {
- case DRM_V3D_PARAM_SUPPORTS_TFU:
- gp->value = 1;
- return 0;
- }
-
- if (gp->param < ARRAY_SIZE(reg_map) && reg_map[gp->param]) {
- gp->value = V3D_READ(reg_map[gp->param]);
- return 0;
- }
-
- fprintf(stderr, "Unknown DRM_IOCTL_V3D_GET_PARAM %d\n", gp->param);
- return -1;
-}
-
-static ioctl_fn_t driver_ioctls[] = {
- [DRM_V3D_SUBMIT_CL] = v3dX(v3d_ioctl_submit_cl),
- [DRM_V3D_SUBMIT_TFU] = v3dX(v3d_ioctl_submit_tfu),
- [DRM_V3D_WAIT_BO] = v3d_ioctl_wait_bo,
- [DRM_V3D_CREATE_BO] = v3dX(v3d_ioctl_create_bo),
- [DRM_V3D_GET_PARAM] = v3dX(v3d_ioctl_get_param),
- [DRM_V3D_MMAP_BO] = v3d_ioctl_mmap_bo,
- [DRM_V3D_GET_BO_OFFSET] = v3d_ioctl_get_bo_offset,
-};
-
-static void
-v3d_isr(uint32_t hub_status)
-{
- /* Check the per-core bits */
- if (hub_status & (1 << 0)) {
- uint32_t core_status = V3D_READ(V3D_CTL_0_INT_STS);
-
- if (core_status & V3D_CTL_0_INT_STS_INT_GMPV_SET) {
- fprintf(stderr, "GMP violation at 0x%08x\n",
- V3D_READ(V3D_GMP_0_VIO_ADDR));
- abort();
- } else {
- fprintf(stderr,
- "Unexpected ISR with core status 0x%08x\n",
- core_status);
- }
- abort();
- }
-
- return;
-}
-
-static void
-v3dX(simulator_init_regs)(void)
-{
-#if V3D_VERSION == 33
- /* Set OVRTMUOUT to match kernel behavior.
- *
- * This means that the texture sampler uniform configuration's tmu
- * output type field is used, instead of using the hardware default
- * behavior based on the texture type. If you want the default
- * behavior, you can still put "2" in the indirect texture state's
- * output_type field.
- */
- V3D_WRITE(V3D_CTL_0_MISCCFG, V3D_CTL_1_MISCCFG_OVRTMUOUT_SET);
-#endif
-
- uint32_t core_interrupts = V3D_CTL_0_INT_STS_INT_GMPV_SET;
- V3D_WRITE(V3D_CTL_0_INT_MSK_SET, ~core_interrupts);
- V3D_WRITE(V3D_CTL_0_INT_MSK_CLR, core_interrupts);
-
- v3d_hw_set_isr(v3d.hw, v3d_isr);
-}
-
-static void
-v3d_bo_free(struct shim_bo *shim_bo)
-{
- struct v3d_bo *bo = v3d_bo(shim_bo);
-
- if (bo->gem_vaddr)
- munmap(bo->gem_vaddr, shim_bo->size);
-
- util_vma_heap_free(&v3d.heap, bo->offset, bo->base.size);
-}
-
-void
-v3dX(drm_shim_driver_init)(void)
-{
- shim_device.driver_ioctls = driver_ioctls;
- shim_device.driver_ioctl_count = ARRAY_SIZE(driver_ioctls);
-
- shim_device.driver_bo_free = v3d_bo_free;
-
- /* Allocate a gig of memory to play in. */
- v3d_hw_alloc_mem(v3d.hw, 1024 * 1024 * 1024);
- v3d.mem_base =
- v3d_hw_get_mem(v3d.hw, &v3d.mem_size,
- &v3d.mem);
- util_vma_heap_init(&v3d.heap, 4096, v3d.mem_size - 4096);
-
- v3dX(simulator_init_regs)();
-}
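
The shim above is built once per supported V3D version, with every version-specific symbol wrapped in v3dX(). A minimal sketch of that pattern, assuming the expansion lives in broadcom/common/v3d_macros.h as the include above suggests:

    /* Illustrative only: per-version name mangling driven by -DV3D_VERSION. */
    #if V3D_VERSION == 42
    #  define v3dX(x) v3d42_##x
    #elif V3D_VERSION == 71
    #  define v3dX(x) v3d71_##x
    #endif

    /* v3dX(drm_shim_driver_init) then expands to v3d42_drm_shim_driver_init or
     * v3d71_drm_shim_driver_init, depending on which copy is being compiled. */
    void v3dX(drm_shim_driver_init)(void);
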
diff --git a/src/broadcom/drm-shim/vc4_noop.c b/src/broadcom/drm-shim/vc4_noop.c
index 3f85158e6df..b9c83db8313 100644
--- a/src/broadcom/drm-shim/vc4_noop.c
+++ b/src/broadcom/drm-shim/vc4_noop.c
@@ -51,6 +51,20 @@ vc4_ioctl_create_bo(int fd, unsigned long request, void *arg)
}
static int
+vc4_ioctl_create_shader_bo(int fd, unsigned long request, void *arg)
+{
+ struct shim_fd *shim_fd = drm_shim_fd_lookup(fd);
+ struct drm_vc4_create_shader_bo *create = arg;
+ struct shim_bo *bo = calloc(1, sizeof(*bo));
+
+ drm_shim_bo_init(bo, create->size);
+ create->handle = drm_shim_bo_get_handle(shim_fd, bo);
+ drm_shim_bo_put(bo);
+
+ return 0;
+}
+
+static int
vc4_ioctl_mmap_bo(int fd, unsigned long request, void *arg)
{
struct shim_fd *shim_fd = drm_shim_fd_lookup(fd);
@@ -101,6 +115,7 @@ vc4_ioctl_get_param(int fd, unsigned long request, void *arg)
static ioctl_fn_t driver_ioctls[] = {
[DRM_VC4_CREATE_BO] = vc4_ioctl_create_bo,
+ [DRM_VC4_CREATE_SHADER_BO] = vc4_ioctl_create_shader_bo,
[DRM_VC4_MMAP_BO] = vc4_ioctl_mmap_bo,
[DRM_VC4_GET_PARAM] = vc4_ioctl_get_param,
[DRM_VC4_GET_TILING] = vc4_ioctl_noop,
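
For context, the new create_shader_bo handler services the ioctl the vc4 driver issues when uploading QPU code, so shader creation keeps working under drm-shim. A hedged caller-side sketch; the struct and ioctl names come from drm-uapi/vc4_drm.h and libdrm, while fd, shader_size and shader_words are placeholders:

    #include <stdint.h>
    #include <xf86drm.h>
    #include "drm-uapi/vc4_drm.h"

    /* Illustrative only: request a shader BO and read back its handle. */
    struct drm_vc4_create_shader_bo create = {
            .size = shader_size,              /* size of the QPU code in bytes */
            .data = (uintptr_t)shader_words,  /* pointer to the QPU instructions */
    };
    if (drmIoctl(fd, DRM_IOCTL_VC4_CREATE_SHADER_BO, &create) == 0) {
            /* create.handle now names the (no-op) shader BO. */
    }
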
diff --git a/src/broadcom/meson.build b/src/broadcom/meson.build
index 2e1145dd0c0..f8e93526300 100644
--- a/src/broadcom/meson.build
+++ b/src/broadcom/meson.build
@@ -22,7 +22,7 @@ inc_broadcom = include_directories('.', 'cle')
subdir('cle')
-v3d_versions = ['33', '41', '42']
+v3d_versions = ['42', '71']
v3d_libs = []
if with_gallium_v3d or with_broadcom_vk
@@ -38,12 +38,12 @@ endif
per_version_libs = []
foreach ver : v3d_versions
per_version_libs += static_library(
- 'libbroadcom-v' + ver,
+ 'broadcom-v' + ver,
[
files('clif/v3dx_dump.c'),
v3d_xml_pack
],
- include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom],
+ include_directories : [inc_include, inc_src, inc_broadcom],
c_args : [no_override_init_args, '-DV3D_VERSION=' + ver],
gnu_symbol_visibility : 'hidden',
dependencies: [dep_valgrind, dep_thread],
@@ -61,7 +61,7 @@ libv3d_neon = static_library(
'v3d_neon',
'common/v3d_tiling.c',
include_directories : [
- inc_src, inc_include, inc_gallium, inc_gallium_aux, inc_broadcom,
+ inc_src, inc_include, inc_broadcom,
],
c_args : [v3d_args, v3d_neon_c_args],
gnu_symbol_visibility : 'hidden',
@@ -69,12 +69,12 @@ libv3d_neon = static_library(
)
libbroadcom_v3d = static_library(
- 'libbroadcom_v3d',
+ 'broadcom_v3d',
[
files('common/v3d_debug.c', 'common/v3d_device_info.c', 'clif/clif_dump.c', 'common/v3d_util.c'),
v3d_xml_pack,
],
- include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom],
+ include_directories : [inc_include, inc_src, inc_broadcom],
c_args : [no_override_init_args],
gnu_symbol_visibility : 'hidden',
link_whole : v3d_libs + per_version_libs,
diff --git a/src/broadcom/qpu/meson.build b/src/broadcom/qpu/meson.build
index eea1f9bb058..fefc6a5cc56 100644
--- a/src/broadcom/qpu/meson.build
+++ b/src/broadcom/qpu/meson.build
@@ -25,9 +25,9 @@ libbroadcom_qpu_files = files(
)
libbroadcom_qpu = static_library(
- ['broadcom_qpu', v3d_xml_pack],
- libbroadcom_qpu_files,
- include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom],
+ 'broadcom_qpu',
+ [libbroadcom_qpu_files, v3d_xml_pack],
+ include_directories : [inc_include, inc_src, inc_broadcom],
c_args : [no_override_init_args],
gnu_symbol_visibility : 'hidden',
dependencies : [dep_libdrm, dep_valgrind],
@@ -42,7 +42,7 @@ test(
'qpu_disasm', 'tests/qpu_disasm.c',
link_with: libbroadcom_qpu,
dependencies : idep_mesautil,
- include_directories: [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux]
+ include_directories: [inc_include, inc_src]
),
suite : ['broadcom'],
)
diff --git a/src/broadcom/qpu/qpu_disasm.c b/src/broadcom/qpu/qpu_disasm.c
index b5648bd76e2..c1590a760de 100644
--- a/src/broadcom/qpu/qpu_disasm.c
+++ b/src/broadcom/qpu/qpu_disasm.c
@@ -56,13 +56,14 @@ pad_to(struct disasm_state *disasm, int n)
static void
-v3d_qpu_disasm_raddr(struct disasm_state *disasm,
- const struct v3d_qpu_instr *instr, uint8_t mux)
+v3d33_qpu_disasm_raddr(struct disasm_state *disasm,
+ const struct v3d_qpu_instr *instr,
+ enum v3d_qpu_mux mux)
{
if (mux == V3D_QPU_MUX_A) {
append(disasm, "rf%d", instr->raddr_a);
} else if (mux == V3D_QPU_MUX_B) {
- if (instr->sig.small_imm) {
+ if (instr->sig.small_imm_b) {
uint32_t val;
ASSERTED bool ok =
v3d_qpu_small_imm_unpack(disasm->devinfo,
@@ -82,6 +83,64 @@ v3d_qpu_disasm_raddr(struct disasm_state *disasm,
}
}
+enum v3d_qpu_input_class {
+ V3D_QPU_ADD_A,
+ V3D_QPU_ADD_B,
+ V3D_QPU_MUL_A,
+ V3D_QPU_MUL_B
+};
+
+static void
+v3d71_qpu_disasm_raddr(struct disasm_state *disasm,
+ const struct v3d_qpu_instr *instr,
+ uint8_t raddr,
+ enum v3d_qpu_input_class input_class)
+{
+ bool is_small_imm = false;
+ switch(input_class) {
+ case V3D_QPU_ADD_A:
+ is_small_imm = instr->sig.small_imm_a;
+ break;
+ case V3D_QPU_ADD_B:
+ is_small_imm = instr->sig.small_imm_b;
+ break;
+ case V3D_QPU_MUL_A:
+ is_small_imm = instr->sig.small_imm_c;
+ break;
+ case V3D_QPU_MUL_B:
+ is_small_imm = instr->sig.small_imm_d;
+ break;
+ }
+
+ if (is_small_imm) {
+ uint32_t val;
+ ASSERTED bool ok =
+ v3d_qpu_small_imm_unpack(disasm->devinfo,
+ raddr,
+ &val);
+
+ if ((int)val >= -16 && (int)val <= 15)
+ append(disasm, "%d", val);
+ else
+ append(disasm, "0x%08x", val);
+ assert(ok);
+ } else {
+ append(disasm, "rf%d", raddr);
+ }
+}
+
+static void
+v3d_qpu_disasm_raddr(struct disasm_state *disasm,
+ const struct v3d_qpu_instr *instr,
+ const struct v3d_qpu_input *input,
+ enum v3d_qpu_input_class input_class)
+{
+ if (disasm->devinfo->ver < 71)
+ v3d33_qpu_disasm_raddr(disasm, instr, input->mux);
+ else
+ v3d71_qpu_disasm_raddr(disasm, instr, input->raddr, input_class);
+}
+
static void
v3d_qpu_disasm_waddr(struct disasm_state *disasm, uint32_t waddr, bool magic)
{
@@ -110,7 +169,7 @@ v3d_qpu_disasm_add(struct disasm_state *disasm,
append(disasm, "%s", v3d_qpu_pf_name(instr->flags.apf));
append(disasm, "%s", v3d_qpu_uf_name(instr->flags.auf));
- append(disasm, " ");
+ append(disasm, " ");
if (has_dst) {
v3d_qpu_disasm_waddr(disasm, instr->alu.add.waddr,
@@ -121,16 +180,16 @@ v3d_qpu_disasm_add(struct disasm_state *disasm,
if (num_src >= 1) {
if (has_dst)
append(disasm, ", ");
- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.a);
+ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.add.a, V3D_QPU_ADD_A);
append(disasm, "%s",
- v3d_qpu_unpack_name(instr->alu.add.a_unpack));
+ v3d_qpu_unpack_name(instr->alu.add.a.unpack));
}
if (num_src >= 2) {
append(disasm, ", ");
- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.b);
+ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.add.b, V3D_QPU_ADD_B);
append(disasm, "%s",
- v3d_qpu_unpack_name(instr->alu.add.b_unpack));
+ v3d_qpu_unpack_name(instr->alu.add.b.unpack));
}
}
@@ -141,7 +200,7 @@ v3d_qpu_disasm_mul(struct disasm_state *disasm,
bool has_dst = v3d_qpu_mul_op_has_dst(instr->alu.mul.op);
int num_src = v3d_qpu_mul_op_num_src(instr->alu.mul.op);
- pad_to(disasm, 21);
+ pad_to(disasm, 30);
append(disasm, "; ");
append(disasm, "%s", v3d_qpu_mul_op_name(instr->alu.mul.op));
@@ -153,7 +212,7 @@ v3d_qpu_disasm_mul(struct disasm_state *disasm,
if (instr->alu.mul.op == V3D_QPU_M_NOP)
return;
- append(disasm, " ");
+ append(disasm, " ");
if (has_dst) {
v3d_qpu_disasm_waddr(disasm, instr->alu.mul.waddr,
@@ -164,16 +223,16 @@ v3d_qpu_disasm_mul(struct disasm_state *disasm,
if (num_src >= 1) {
if (has_dst)
append(disasm, ", ");
- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.a);
+ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.mul.a, V3D_QPU_MUL_A);
append(disasm, "%s",
- v3d_qpu_unpack_name(instr->alu.mul.a_unpack));
+ v3d_qpu_unpack_name(instr->alu.mul.a.unpack));
}
if (num_src >= 2) {
append(disasm, ", ");
- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.b);
+ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.mul.b, V3D_QPU_MUL_B);
append(disasm, "%s",
- v3d_qpu_unpack_name(instr->alu.mul.b_unpack));
+ v3d_qpu_unpack_name(instr->alu.mul.b.unpack));
}
}
@@ -217,7 +276,7 @@ v3d_qpu_disasm_sig(struct disasm_state *disasm,
return;
}
- pad_to(disasm, 41);
+ pad_to(disasm, 60);
if (sig->thrsw)
append(disasm, "; thrsw");
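
The v3d71 disassembly path above selects which small_imm_* signal applies from the input class; the same mapping, pulled out as a stand-alone helper (hypothetical, not part of this change), would look like:

    /* Illustrative only: which small-immediate signal covers a given input. */
    static bool
    v3d71_input_is_small_imm(const struct v3d_qpu_instr *instr,
                             enum v3d_qpu_input_class input_class)
    {
            switch (input_class) {
            case V3D_QPU_ADD_A: return instr->sig.small_imm_a;
            case V3D_QPU_ADD_B: return instr->sig.small_imm_b;
            case V3D_QPU_MUL_A: return instr->sig.small_imm_c;
            case V3D_QPU_MUL_B: return instr->sig.small_imm_d;
            }
            return false;
    }
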
diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
index 569c5fc4074..9a6434d94dd 100644
--- a/src/broadcom/qpu/qpu_instr.c
+++ b/src/broadcom/qpu/qpu_instr.c
@@ -35,6 +35,14 @@ v3d_qpu_magic_waddr_name(const struct v3d_device_info *devinfo,
if (devinfo->ver < 40 && waddr == V3D_QPU_WADDR_TMU)
return "tmu";
+ /* On V3D 7.x, QUAD and REP alias R5 and R5REP in the table below.
+ */
+ if (devinfo->ver >= 71 && waddr == V3D_QPU_WADDR_QUAD)
+ return "quad";
+
+ if (devinfo->ver >= 71 && waddr == V3D_QPU_WADDR_REP)
+ return "rep";
+
static const char *waddr_magic[] = {
[V3D_QPU_WADDR_R0] = "r0",
[V3D_QPU_WADDR_R1] = "r1",
@@ -169,6 +177,19 @@ v3d_qpu_add_op_name(enum v3d_qpu_add_op op)
[V3D_QPU_A_ITOF] = "itof",
[V3D_QPU_A_CLZ] = "clz",
[V3D_QPU_A_UTOF] = "utof",
+ [V3D_QPU_A_MOV] = "mov",
+ [V3D_QPU_A_FMOV] = "fmov",
+ [V3D_QPU_A_VPACK] = "vpack",
+ [V3D_QPU_A_V8PACK] = "v8pack",
+ [V3D_QPU_A_V10PACK] = "v10pack",
+ [V3D_QPU_A_V11FPACK] = "v11fpack",
+ [V3D_QPU_A_BALLOT] = "ballot",
+ [V3D_QPU_A_BCASTF] = "bcastf",
+ [V3D_QPU_A_ALLEQ] = "alleq",
+ [V3D_QPU_A_ALLFEQ] = "allfeq",
+ [V3D_QPU_A_ROTQ] = "rotq",
+ [V3D_QPU_A_ROT] = "rot",
+ [V3D_QPU_A_SHUFFLE] = "shuffle",
};
if (op >= ARRAY_SIZE(op_names))
@@ -191,6 +212,12 @@ v3d_qpu_mul_op_name(enum v3d_qpu_mul_op op)
[V3D_QPU_M_MOV] = "mov",
[V3D_QPU_M_NOP] = "nop",
[V3D_QPU_M_FMUL] = "fmul",
+ [V3D_QPU_M_FTOUNORM16] = "ftounorm16",
+ [V3D_QPU_M_FTOSNORM16] = "ftosnorm16",
+ [V3D_QPU_M_VFTOUNORM8] = "vftounorm8",
+ [V3D_QPU_M_VFTOSNORM8] = "vftosnorm8",
+ [V3D_QPU_M_VFTOUNORM10LO] = "vftounorm10lo",
+ [V3D_QPU_M_VFTOUNORM10HI] = "vftounorm10hi",
};
if (op >= ARRAY_SIZE(op_names))
@@ -450,6 +477,21 @@ static const uint8_t add_op_args[] = {
[V3D_QPU_A_ITOF] = D | A,
[V3D_QPU_A_CLZ] = D | A,
[V3D_QPU_A_UTOF] = D | A,
+
+ [V3D_QPU_A_MOV] = D | A,
+ [V3D_QPU_A_FMOV] = D | A,
+ [V3D_QPU_A_VPACK] = D | A | B,
+ [V3D_QPU_A_V8PACK] = D | A | B,
+ [V3D_QPU_A_V10PACK] = D | A | B,
+ [V3D_QPU_A_V11FPACK] = D | A | B,
+
+ [V3D_QPU_A_BALLOT] = D | A,
+ [V3D_QPU_A_BCASTF] = D | A,
+ [V3D_QPU_A_ALLEQ] = D | A,
+ [V3D_QPU_A_ALLFEQ] = D | A,
+ [V3D_QPU_A_ROTQ] = D | A | B,
+ [V3D_QPU_A_ROT] = D | A | B,
+ [V3D_QPU_A_SHUFFLE] = D | A | B,
};
static const uint8_t mul_op_args[] = {
@@ -463,6 +505,12 @@ static const uint8_t mul_op_args[] = {
[V3D_QPU_M_NOP] = 0,
[V3D_QPU_M_MOV] = D | A,
[V3D_QPU_M_FMUL] = D | A | B,
+ [V3D_QPU_M_FTOUNORM16] = D | A,
+ [V3D_QPU_M_FTOSNORM16] = D | A,
+ [V3D_QPU_M_VFTOUNORM8] = D | A,
+ [V3D_QPU_M_VFTOSNORM8] = D | A,
+ [V3D_QPU_M_VFTOUNORM10LO] = D | A,
+ [V3D_QPU_M_VFTOUNORM10HI] = D | A,
};
bool
@@ -636,19 +684,23 @@ v3d_qpu_add_op_writes_vpm(enum v3d_qpu_add_op op)
}
bool
-v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst)
+v3d_qpu_reads_tlb(const struct v3d_qpu_instr *inst)
{
- if (inst->sig.ldtlb ||
- inst->sig.ldtlbu)
- return true;
+ return inst->sig.ldtlb || inst->sig.ldtlbu;
+}
+bool
+v3d_qpu_writes_tlb(const struct v3d_qpu_instr *inst)
+{
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
- if (inst->alu.add.magic_write &&
+ if (inst->alu.add.op != V3D_QPU_A_NOP &&
+ inst->alu.add.magic_write &&
v3d_qpu_magic_waddr_is_tlb(inst->alu.add.waddr)) {
return true;
}
- if (inst->alu.mul.magic_write &&
+ if (inst->alu.mul.op != V3D_QPU_M_NOP &&
+ inst->alu.mul.magic_write &&
v3d_qpu_magic_waddr_is_tlb(inst->alu.mul.waddr)) {
return true;
}
@@ -658,18 +710,32 @@ v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst)
}
bool
+v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst)
+{
+ return v3d_qpu_writes_tlb(inst) || v3d_qpu_reads_tlb(inst);
+}
+
+bool
v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst)
{
- if (v3d_qpu_instr_is_sfu(inst))
- return true;
+ return v3d_qpu_instr_is_sfu(inst) || v3d_qpu_instr_is_legacy_sfu(inst);
+}
+/* Checks whether the instruction implements an SFU operation by writing to
+ * specific magic register addresses instead of using SFU ALU opcodes.
+ */
+bool
+v3d_qpu_instr_is_legacy_sfu(const struct v3d_qpu_instr *inst)
+{
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
- if (inst->alu.add.magic_write &&
+ if (inst->alu.add.op != V3D_QPU_A_NOP &&
+ inst->alu.add.magic_write &&
v3d_qpu_magic_waddr_is_sfu(inst->alu.add.waddr)) {
return true;
}
- if (inst->alu.mul.magic_write &&
+ if (inst->alu.mul.op != V3D_QPU_M_NOP &&
+ inst->alu.mul.magic_write &&
v3d_qpu_magic_waddr_is_sfu(inst->alu.mul.waddr)) {
return true;
}
@@ -689,6 +755,13 @@ v3d_qpu_instr_is_sfu(const struct v3d_qpu_instr *inst)
case V3D_QPU_A_LOG:
case V3D_QPU_A_SIN:
case V3D_QPU_A_RSQRT2:
+ case V3D_QPU_A_BALLOT:
+ case V3D_QPU_A_BCASTF:
+ case V3D_QPU_A_ALLEQ:
+ case V3D_QPU_A_ALLFEQ:
+ case V3D_QPU_A_ROTQ:
+ case V3D_QPU_A_ROT:
+ case V3D_QPU_A_SHUFFLE:
return true;
default:
return false;
@@ -702,9 +775,11 @@ v3d_qpu_writes_tmu(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *inst)
{
return (inst->type == V3D_QPU_INSTR_TYPE_ALU &&
- ((inst->alu.add.magic_write &&
+ ((inst->alu.add.op != V3D_QPU_A_NOP &&
+ inst->alu.add.magic_write &&
v3d_qpu_magic_waddr_is_tmu(devinfo, inst->alu.add.waddr)) ||
- (inst->alu.mul.magic_write &&
+ (inst->alu.mul.op != V3D_QPU_M_NOP &&
+ inst->alu.mul.magic_write &&
v3d_qpu_magic_waddr_is_tmu(devinfo, inst->alu.mul.waddr))));
}
@@ -740,12 +815,14 @@ v3d_qpu_writes_vpm(const struct v3d_qpu_instr *inst)
if (v3d_qpu_add_op_writes_vpm(inst->alu.add.op))
return true;
- if (inst->alu.add.magic_write &&
+ if (inst->alu.add.op != V3D_QPU_A_NOP &&
+ inst->alu.add.magic_write &&
v3d_qpu_magic_waddr_is_vpm(inst->alu.add.waddr)) {
return true;
}
- if (inst->alu.mul.magic_write &&
+ if (inst->alu.mul.op != V3D_QPU_M_NOP &&
+ inst->alu.mul.magic_write &&
v3d_qpu_magic_waddr_is_vpm(inst->alu.mul.waddr)) {
return true;
}
@@ -773,12 +850,18 @@ v3d_qpu_writes_unifa(const struct v3d_device_info *devinfo,
inst->alu.mul.waddr == V3D_QPU_WADDR_UNIFA) {
return true;
}
+
+ if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
+ inst->sig_magic &&
+ inst->sig_addr == V3D_QPU_WADDR_UNIFA) {
+ return true;
+ }
}
return false;
}
-static bool
+bool
v3d_qpu_waits_vpm(const struct v3d_qpu_instr *inst)
{
return inst->type == V3D_QPU_INSTR_TYPE_ALU &&
@@ -805,10 +888,12 @@ qpu_writes_magic_waddr_explicitly(const struct v3d_device_info *devinfo,
uint32_t waddr)
{
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
- if (inst->alu.add.magic_write && inst->alu.add.waddr == waddr)
+ if (inst->alu.add.op != V3D_QPU_A_NOP &&
+ inst->alu.add.magic_write && inst->alu.add.waddr == waddr)
return true;
- if (inst->alu.mul.magic_write && inst->alu.mul.waddr == waddr)
+ if (inst->alu.mul.op != V3D_QPU_M_NOP &&
+ inst->alu.mul.magic_write && inst->alu.mul.waddr == waddr)
return true;
}
@@ -824,6 +909,9 @@ bool
v3d_qpu_writes_r3(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *inst)
{
+ if (!devinfo->has_accumulators)
+ return false;
+
if (qpu_writes_magic_waddr_explicitly(devinfo, inst, V3D_QPU_WADDR_R3))
return true;
@@ -834,14 +922,19 @@ bool
v3d_qpu_writes_r4(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *inst)
{
+ if (!devinfo->has_accumulators)
+ return false;
+
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
- if (inst->alu.add.magic_write &&
+ if (inst->alu.add.op != V3D_QPU_A_NOP &&
+ inst->alu.add.magic_write &&
(inst->alu.add.waddr == V3D_QPU_WADDR_R4 ||
v3d_qpu_magic_waddr_is_sfu(inst->alu.add.waddr))) {
return true;
}
- if (inst->alu.mul.magic_write &&
+ if (inst->alu.mul.op != V3D_QPU_M_NOP &&
+ inst->alu.mul.magic_write &&
(inst->alu.mul.waddr == V3D_QPU_WADDR_R4 ||
v3d_qpu_magic_waddr_is_sfu(inst->alu.mul.waddr))) {
return true;
@@ -862,6 +955,9 @@ bool
v3d_qpu_writes_r5(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *inst)
{
+ if (!devinfo->has_accumulators)
+ return false;
+
if (qpu_writes_magic_waddr_explicitly(devinfo, inst, V3D_QPU_WADDR_R5))
return true;
@@ -872,6 +968,9 @@ bool
v3d_qpu_writes_accum(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *inst)
{
+ if (!devinfo->has_accumulators)
+ return false;
+
if (v3d_qpu_writes_r5(devinfo, inst))
return true;
if (v3d_qpu_writes_r4(devinfo, inst))
@@ -889,15 +988,67 @@ v3d_qpu_writes_accum(const struct v3d_device_info *devinfo,
}
bool
+v3d_qpu_writes_rf0_implicitly(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *inst)
+{
+ if (devinfo->ver >= 71 &&
+ (inst->sig.ldvary || inst->sig.ldunif || inst->sig.ldunifa)) {
+ return true;
+ }
+
+ return false;
+}
+
+bool
v3d_qpu_uses_mux(const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
{
int add_nsrc = v3d_qpu_add_op_num_src(inst->alu.add.op);
int mul_nsrc = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
- return ((add_nsrc > 0 && inst->alu.add.a == mux) ||
- (add_nsrc > 1 && inst->alu.add.b == mux) ||
- (mul_nsrc > 0 && inst->alu.mul.a == mux) ||
- (mul_nsrc > 1 && inst->alu.mul.b == mux));
+ return ((add_nsrc > 0 && inst->alu.add.a.mux == mux) ||
+ (add_nsrc > 1 && inst->alu.add.b.mux == mux) ||
+ (mul_nsrc > 0 && inst->alu.mul.a.mux == mux) ||
+ (mul_nsrc > 1 && inst->alu.mul.b.mux == mux));
+}
+
+bool
+v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr)
+{
+ int add_nsrc = v3d_qpu_add_op_num_src(inst->alu.add.op);
+ int mul_nsrc = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
+
+ return (add_nsrc > 0 && !inst->sig.small_imm_a && inst->alu.add.a.raddr == raddr) ||
+ (add_nsrc > 1 && !inst->sig.small_imm_b && inst->alu.add.b.raddr == raddr) ||
+ (mul_nsrc > 0 && !inst->sig.small_imm_c && inst->alu.mul.a.raddr == raddr) ||
+ (mul_nsrc > 1 && !inst->sig.small_imm_d && inst->alu.mul.b.raddr == raddr);
+}
+
+bool
+v3d71_qpu_writes_waddr_explicitly(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *inst,
+ uint8_t waddr)
+{
+ if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
+ return false;
+
+ if (v3d_qpu_add_op_has_dst(inst->alu.add.op) &&
+ !inst->alu.add.magic_write &&
+ inst->alu.add.waddr == waddr) {
+ return true;
+ }
+
+ if (v3d_qpu_mul_op_has_dst(inst->alu.mul.op) &&
+ !inst->alu.mul.magic_write &&
+ inst->alu.mul.waddr == waddr) {
+ return true;
+ }
+
+ if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
+ !inst->sig_magic && inst->sig_addr == waddr) {
+ return true;
+ }
+
+ return false;
}
bool
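
The new v3d71_qpu_reads_raddr()/v3d71_qpu_writes_waddr_explicitly() queries expose register-file reads and explicit writes by raddr/waddr. As a hedged example of how a consumer such as dependency tracking might combine them (the helper itself is hypothetical, not part of the patch):

    /* Illustrative only: true if "later" reads an rf register that "earlier"
     * wrote explicitly, i.e. a read-after-write dependency on V3D 7.x.
     */
    static bool
    v3d71_rf_raw_dep(const struct v3d_device_info *devinfo,
                     const struct v3d_qpu_instr *earlier,
                     const struct v3d_qpu_instr *later,
                     uint8_t rf)
    {
            return v3d71_qpu_writes_waddr_explicitly(devinfo, earlier, rf) &&
                   v3d71_qpu_reads_raddr(later, rf);
    }
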
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
index 4f165e93914..fe9b5d3a00f 100644
--- a/src/broadcom/qpu/qpu_instr.h
+++ b/src/broadcom/qpu/qpu_instr.h
@@ -50,10 +50,13 @@ struct v3d_qpu_sig {
bool ldvpm:1;
bool ldtlb:1;
bool ldtlbu:1;
- bool small_imm:1;
bool ucb:1;
bool rotate:1;
bool wrtmuc:1;
+ bool small_imm_a:1; /* raddr_a (add a), since V3D 7.x */
+ bool small_imm_b:1; /* raddr_b (add b) */
+ bool small_imm_c:1; /* raddr_c (mul a), since V3D 7.x */
+ bool small_imm_d:1; /* raddr_d (mul b), since V3D 7.x */
};
enum v3d_qpu_cond {
@@ -88,12 +91,13 @@ enum v3d_qpu_uf {
};
enum v3d_qpu_waddr {
- V3D_QPU_WADDR_R0 = 0,
- V3D_QPU_WADDR_R1 = 1,
- V3D_QPU_WADDR_R2 = 2,
- V3D_QPU_WADDR_R3 = 3,
- V3D_QPU_WADDR_R4 = 4,
- V3D_QPU_WADDR_R5 = 5,
+ V3D_QPU_WADDR_R0 = 0, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_R1 = 1, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_R2 = 2, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_R3 = 3, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_R4 = 4, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_R5 = 5, /* V3D 4.x */
+ V3D_QPU_WADDR_QUAD = 5, /* V3D 7.x */
V3D_QPU_WADDR_NOP = 6,
V3D_QPU_WADDR_TLB = 7,
V3D_QPU_WADDR_TLBU = 8,
@@ -108,12 +112,12 @@ enum v3d_qpu_waddr {
V3D_QPU_WADDR_SYNC = 16,
V3D_QPU_WADDR_SYNCU = 17,
V3D_QPU_WADDR_SYNCB = 18,
- V3D_QPU_WADDR_RECIP = 19,
- V3D_QPU_WADDR_RSQRT = 20,
- V3D_QPU_WADDR_EXP = 21,
- V3D_QPU_WADDR_LOG = 22,
- V3D_QPU_WADDR_SIN = 23,
- V3D_QPU_WADDR_RSQRT2 = 24,
+ V3D_QPU_WADDR_RECIP = 19, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_RSQRT = 20, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_EXP = 21, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_LOG = 22, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_SIN = 23, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_RSQRT2 = 24, /* Reserved on V3D 7.x */
V3D_QPU_WADDR_TMUC = 32,
V3D_QPU_WADDR_TMUS = 33,
V3D_QPU_WADDR_TMUT = 34,
@@ -129,7 +133,8 @@ enum v3d_qpu_waddr {
V3D_QPU_WADDR_TMUHSCM = 44,
V3D_QPU_WADDR_TMUHSF = 45,
V3D_QPU_WADDR_TMUHSLOD = 46,
- V3D_QPU_WADDR_R5REP = 55,
+ V3D_QPU_WADDR_R5REP = 55, /* V3D 4.x */
+ V3D_QPU_WADDR_REP = 55, /* V3D 7.x */
};
struct v3d_qpu_flags {
@@ -222,6 +227,21 @@ enum v3d_qpu_add_op {
V3D_QPU_A_ITOF,
V3D_QPU_A_CLZ,
V3D_QPU_A_UTOF,
+
+ /* V3D 7.x */
+ V3D_QPU_A_FMOV,
+ V3D_QPU_A_MOV,
+ V3D_QPU_A_VPACK,
+ V3D_QPU_A_V8PACK,
+ V3D_QPU_A_V10PACK,
+ V3D_QPU_A_V11FPACK,
+ V3D_QPU_A_BALLOT,
+ V3D_QPU_A_BCASTF,
+ V3D_QPU_A_ALLEQ,
+ V3D_QPU_A_ALLFEQ,
+ V3D_QPU_A_ROTQ,
+ V3D_QPU_A_ROT,
+ V3D_QPU_A_SHUFFLE,
};
enum v3d_qpu_mul_op {
@@ -235,6 +255,14 @@ enum v3d_qpu_mul_op {
V3D_QPU_M_MOV,
V3D_QPU_M_NOP,
V3D_QPU_M_FMUL,
+
+ /* V3D 7.x */
+ V3D_QPU_M_FTOUNORM16,
+ V3D_QPU_M_FTOSNORM16,
+ V3D_QPU_M_VFTOUNORM8,
+ V3D_QPU_M_VFTOSNORM8,
+ V3D_QPU_M_VFTOUNORM10LO,
+ V3D_QPU_M_VFTOUNORM10HI,
};
enum v3d_qpu_output_pack {
@@ -276,6 +304,15 @@ enum v3d_qpu_input_unpack {
/** Swap high and low 16 bits */
V3D_QPU_UNPACK_SWAP_16,
+
+ /** Convert low 16 bits from 16-bit integer to unsigned 32-bit int */
+ V3D_QPU_UNPACK_UL,
+ /** Convert high 16 bits from 16-bit integer to unsigned 32-bit int */
+ V3D_QPU_UNPACK_UH,
+ /** Convert low 16 bits from 16-bit integer to signed 32-bit int */
+ V3D_QPU_UNPACK_IL,
+ /** Convert high 16 bits from 16-bit integer to signed 32-bit int */
+ V3D_QPU_UNPACK_IH,
};
enum v3d_qpu_mux {
@@ -289,25 +326,29 @@ enum v3d_qpu_mux {
V3D_QPU_MUX_B,
};
+struct v3d_qpu_input {
+ union {
+ enum v3d_qpu_mux mux; /* V3D 4.x */
+ uint8_t raddr; /* V3D 7.x */
+ };
+ enum v3d_qpu_input_unpack unpack;
+};
+
struct v3d_qpu_alu_instr {
struct {
enum v3d_qpu_add_op op;
- enum v3d_qpu_mux a, b;
+ struct v3d_qpu_input a, b;
uint8_t waddr;
bool magic_write;
enum v3d_qpu_output_pack output_pack;
- enum v3d_qpu_input_unpack a_unpack;
- enum v3d_qpu_input_unpack b_unpack;
} add;
struct {
enum v3d_qpu_mul_op op;
- enum v3d_qpu_mux a, b;
+ struct v3d_qpu_input a, b;
uint8_t waddr;
bool magic_write;
enum v3d_qpu_output_pack output_pack;
- enum v3d_qpu_input_unpack a_unpack;
- enum v3d_qpu_input_unpack b_unpack;
} mul;
};
@@ -379,8 +420,8 @@ struct v3d_qpu_instr {
struct v3d_qpu_sig sig;
uint8_t sig_addr;
bool sig_magic; /* If the signal writes to a magic address */
- uint8_t raddr_a;
- uint8_t raddr_b;
+ uint8_t raddr_a; /* V3D 4.x */
+ uint8_t raddr_b; /* V3D 4.x (holds packed small immediate in 7.x too) */
struct v3d_qpu_flags flags;
union {
@@ -450,8 +491,11 @@ bool v3d_qpu_magic_waddr_is_tlb(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
bool v3d_qpu_magic_waddr_is_vpm(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
bool v3d_qpu_magic_waddr_is_tsy(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
bool v3d_qpu_magic_waddr_loads_unif(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
+bool v3d_qpu_reads_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
+bool v3d_qpu_writes_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_instr_is_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
+bool v3d_qpu_instr_is_legacy_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_writes_tmu(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
@@ -463,11 +507,14 @@ bool v3d_qpu_writes_r4(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST;
bool v3d_qpu_writes_r5(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST;
+bool v3d_qpu_writes_rf0_implicitly(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST;
bool v3d_qpu_writes_accum(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_waits_on_tmu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_uses_mux(const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux);
bool v3d_qpu_uses_vpm(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
+bool v3d_qpu_waits_vpm(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_reads_vpm(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_writes_vpm(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_reads_or_writes_vpm(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
@@ -481,4 +528,9 @@ bool v3d_qpu_unpacks_f32(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_unpacks_f16(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_is_nop(struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
+
+bool v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr);
+bool v3d71_qpu_writes_waddr_explicitly(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *inst,
+ uint8_t waddr);
#endif
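
Because struct v3d_qpu_input overlays the 4.x mux with the 7.x raddr, consumers are expected to branch on devinfo->ver, as the disassembler does. A minimal sketch, assuming qpu_instr.h and v3d_device_info.h are included:

    #include <stdio.h>

    /* Illustrative only: format one ALU source for either encoding. */
    static void
    describe_input(const struct v3d_device_info *devinfo,
                   const struct v3d_qpu_input *input, char *buf, size_t len)
    {
            if (devinfo->ver < 71)
                    snprintf(buf, len, "mux %d", input->mux);   /* V3D 4.x */
            else
                    snprintf(buf, len, "rf%d", input->raddr);   /* V3D 7.x */
    }
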
diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
index eee1e9f95a5..c4added7344 100644
--- a/src/broadcom/qpu/qpu_pack.c
+++ b/src/broadcom/qpu/qpu_pack.c
@@ -84,6 +84,9 @@
#define V3D_QPU_MUL_A_SHIFT 18
#define V3D_QPU_MUL_A_MASK QPU_MASK(20, 18)
+#define V3D_QPU_RADDR_C_SHIFT 18
+#define V3D_QPU_RADDR_C_MASK QPU_MASK(23, 18)
+
#define V3D_QPU_ADD_B_SHIFT 15
#define V3D_QPU_ADD_B_MASK QPU_MASK(17, 15)
@@ -98,6 +101,9 @@
#define V3D_QPU_BRANCH_BDI_SHIFT 12
#define V3D_QPU_BRANCH_BDI_MASK QPU_MASK(13, 12)
+#define V3D_QPU_RADDR_D_SHIFT 12
+#define V3D_QPU_RADDR_D_MASK QPU_MASK(17, 12)
+
#define V3D_QPU_RADDR_A_SHIFT 6
#define V3D_QPU_RADDR_A_MASK QPU_MASK(11, 6)
@@ -112,12 +118,15 @@
#define LDTMU .ldtmu = true
#define LDVARY .ldvary = true
#define LDVPM .ldvpm = true
-#define SMIMM .small_imm = true
#define LDTLB .ldtlb = true
#define LDTLBU .ldtlbu = true
#define UCB .ucb = true
#define ROT .rotate = true
#define WRTMUC .wrtmuc = true
+#define SMIMM_A .small_imm_a = true
+#define SMIMM_B .small_imm_b = true
+#define SMIMM_C .small_imm_c = true
+#define SMIMM_D .small_imm_d = true
static const struct v3d_qpu_sig v33_sig_map[] = {
/* MISC R3 R4 R5 */
@@ -135,8 +144,8 @@ static const struct v3d_qpu_sig v33_sig_map[] = {
[11] = { THRSW, LDVARY, LDUNIF },
[12] = { LDVARY, LDTMU, },
[13] = { THRSW, LDVARY, LDTMU, },
- [14] = { SMIMM, LDVARY, },
- [15] = { SMIMM, },
+ [14] = { SMIMM_B, LDVARY, },
+ [15] = { SMIMM_B, },
[16] = { LDTLB, },
[17] = { LDTLBU, },
/* 18-21 reserved */
@@ -148,8 +157,8 @@ static const struct v3d_qpu_sig v33_sig_map[] = {
[27] = { THRSW, LDVPM, LDUNIF },
[28] = { LDVPM, LDTMU, },
[29] = { THRSW, LDVPM, LDTMU, },
- [30] = { SMIMM, LDVPM, },
- [31] = { SMIMM, },
+ [30] = { SMIMM_B, LDVPM, },
+ [31] = { SMIMM_B, },
};
static const struct v3d_qpu_sig v40_sig_map[] = {
@@ -167,8 +176,8 @@ static const struct v3d_qpu_sig v40_sig_map[] = {
[10] = { LDVARY, LDUNIF },
[11] = { THRSW, LDVARY, LDUNIF },
/* 12-13 reserved */
- [14] = { SMIMM, LDVARY, },
- [15] = { SMIMM, },
+ [14] = { SMIMM_B, LDVARY, },
+ [15] = { SMIMM_B, },
[16] = { LDTLB, },
[17] = { LDTLBU, },
[18] = { WRTMUC },
@@ -178,7 +187,7 @@ static const struct v3d_qpu_sig v40_sig_map[] = {
[22] = { UCB, },
[23] = { ROT, },
/* 24-30 reserved */
- [31] = { SMIMM, LDTMU, },
+ [31] = { SMIMM_B, LDTMU, },
};
static const struct v3d_qpu_sig v41_sig_map[] = {
@@ -197,8 +206,8 @@ static const struct v3d_qpu_sig v41_sig_map[] = {
[11] = { THRSW, LDVARY, LDUNIF },
[12] = { LDUNIFRF },
[13] = { THRSW, LDUNIFRF },
- [14] = { SMIMM, LDVARY, },
- [15] = { SMIMM, },
+ [14] = { SMIMM_B, LDVARY },
+ [15] = { SMIMM_B, },
[16] = { LDTLB, },
[17] = { LDTLBU, },
[18] = { WRTMUC },
@@ -210,7 +219,41 @@ static const struct v3d_qpu_sig v41_sig_map[] = {
[24] = { LDUNIFA},
[25] = { LDUNIFARF },
/* 26-30 reserved */
- [31] = { SMIMM, LDTMU, },
+ [31] = { SMIMM_B, LDTMU, },
+};
+
+
+static const struct v3d_qpu_sig v71_sig_map[] = {
+ /* MISC phys RF0 */
+ [0] = { },
+ [1] = { THRSW, },
+ [2] = { LDUNIF },
+ [3] = { THRSW, LDUNIF },
+ [4] = { LDTMU, },
+ [5] = { THRSW, LDTMU, },
+ [6] = { LDTMU, LDUNIF },
+ [7] = { THRSW, LDTMU, LDUNIF },
+ [8] = { LDVARY, },
+ [9] = { THRSW, LDVARY, },
+ [10] = { LDVARY, LDUNIF },
+ [11] = { THRSW, LDVARY, LDUNIF },
+ [12] = { LDUNIFRF },
+ [13] = { THRSW, LDUNIFRF },
+ [14] = { SMIMM_A, },
+ [15] = { SMIMM_B, },
+ [16] = { LDTLB, },
+ [17] = { LDTLBU, },
+ [18] = { WRTMUC },
+ [19] = { THRSW, WRTMUC },
+ [20] = { LDVARY, WRTMUC },
+ [21] = { THRSW, LDVARY, WRTMUC },
+ [22] = { UCB, },
+ /* 23 reserved */
+ [24] = { LDUNIFA},
+ [25] = { LDUNIFARF },
+ /* 26-29 reserved */
+ [30] = { SMIMM_C, },
+ [31] = { SMIMM_D, },
};
bool
@@ -221,7 +264,9 @@ v3d_qpu_sig_unpack(const struct v3d_device_info *devinfo,
if (packed_sig >= ARRAY_SIZE(v33_sig_map))
return false;
- if (devinfo->ver >= 41)
+ if (devinfo->ver >= 71)
+ *sig = v71_sig_map[packed_sig];
+ else if (devinfo->ver >= 41)
*sig = v41_sig_map[packed_sig];
else if (devinfo->ver == 40)
*sig = v40_sig_map[packed_sig];
@@ -240,7 +285,9 @@ v3d_qpu_sig_pack(const struct v3d_device_info *devinfo,
{
static const struct v3d_qpu_sig *map;
- if (devinfo->ver >= 41)
+ if (devinfo->ver >= 71)
+ map = v71_sig_map;
+ else if (devinfo->ver >= 41)
map = v41_sig_map;
else if (devinfo->ver == 40)
map = v40_sig_map;
@@ -256,13 +303,6 @@ v3d_qpu_sig_pack(const struct v3d_device_info *devinfo,
return false;
}
-static inline unsigned
-fui( float f )
-{
- union {float f; unsigned ui;} fi;
- fi.f = f;
- return fi.ui;
-}
static const uint32_t small_immediates[] = {
0, 1, 2, 3,
@@ -425,8 +465,13 @@ v3d_qpu_flags_pack(const struct v3d_device_info *devinfo,
if (flags_present & MUF)
*packed_cond |= cond->muf - V3D_QPU_UF_ANDZ + 4;
- if (flags_present & AC)
- *packed_cond |= (cond->ac - V3D_QPU_COND_IFA) << 2;
+ if (flags_present & AC) {
+ if (*packed_cond & (1 << 6))
+ *packed_cond |= cond->ac - V3D_QPU_COND_IFA;
+ else
+ *packed_cond |= (cond->ac -
+ V3D_QPU_COND_IFA) << 2;
+ }
if (flags_present & MC) {
if (*packed_cond & (1 << 6))
@@ -445,16 +490,26 @@ v3d_qpu_flags_pack(const struct v3d_device_info *devinfo,
/* Make a mapping of the table of opcodes in the spec. The opcode is
* determined by a combination of the opcode field, and in the case of 0 or
- * 1-arg opcodes, the mux_b field as well.
+ * 1-arg opcodes, the mux (version <= 42) or raddr (version >= 71) field as
+ * well.
*/
-#define MUX_MASK(bot, top) (((1 << (top + 1)) - 1) - ((1 << (bot)) - 1))
-#define ANYMUX MUX_MASK(0, 7)
+#define OP_MASK(val) BITFIELD64_BIT(val)
+#define OP_RANGE(bot, top) BITFIELD64_RANGE(bot, top - bot + 1)
+#define ANYMUX OP_RANGE(0, 7)
+#define ANYOPMASK OP_RANGE(0, 63)
struct opcode_desc {
uint8_t opcode_first;
uint8_t opcode_last;
- uint8_t mux_b_mask;
- uint8_t mux_a_mask;
+
+ union {
+ struct {
+ uint8_t b_mask;
+ uint8_t a_mask;
+ } mux;
+ uint64_t raddr_mask;
+ };
+
uint8_t op;
/* first_ver == 0 if it's the same across all V3D versions.
@@ -467,122 +522,329 @@ struct opcode_desc {
uint8_t last_ver;
};
-static const struct opcode_desc add_ops[] = {
+static const struct opcode_desc add_ops_v33[] = {
/* FADD is FADDNF depending on the order of the mux_a/mux_b. */
- { 0, 47, ANYMUX, ANYMUX, V3D_QPU_A_FADD },
- { 0, 47, ANYMUX, ANYMUX, V3D_QPU_A_FADDNF },
- { 53, 55, ANYMUX, ANYMUX, V3D_QPU_A_VFPACK },
- { 56, 56, ANYMUX, ANYMUX, V3D_QPU_A_ADD },
- { 57, 59, ANYMUX, ANYMUX, V3D_QPU_A_VFPACK },
- { 60, 60, ANYMUX, ANYMUX, V3D_QPU_A_SUB },
- { 61, 63, ANYMUX, ANYMUX, V3D_QPU_A_VFPACK },
- { 64, 111, ANYMUX, ANYMUX, V3D_QPU_A_FSUB },
- { 120, 120, ANYMUX, ANYMUX, V3D_QPU_A_MIN },
- { 121, 121, ANYMUX, ANYMUX, V3D_QPU_A_MAX },
- { 122, 122, ANYMUX, ANYMUX, V3D_QPU_A_UMIN },
- { 123, 123, ANYMUX, ANYMUX, V3D_QPU_A_UMAX },
- { 124, 124, ANYMUX, ANYMUX, V3D_QPU_A_SHL },
- { 125, 125, ANYMUX, ANYMUX, V3D_QPU_A_SHR },
- { 126, 126, ANYMUX, ANYMUX, V3D_QPU_A_ASR },
- { 127, 127, ANYMUX, ANYMUX, V3D_QPU_A_ROR },
+ { 0, 47, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FADD },
+ { 0, 47, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FADDNF },
+ { 53, 55, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK },
+ { 56, 56, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ADD },
+ { 57, 59, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK },
+ { 60, 60, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SUB },
+ { 61, 63, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK },
+ { 64, 111, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FSUB },
+ { 120, 120, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_MIN },
+ { 121, 121, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_MAX },
+ { 122, 122, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_UMIN },
+ { 123, 123, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_UMAX },
+ { 124, 124, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SHL },
+ { 125, 125, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SHR },
+ { 126, 126, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ASR },
+ { 127, 127, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ROR },
/* FMIN is instead FMAX depending on the order of the mux_a/mux_b. */
- { 128, 175, ANYMUX, ANYMUX, V3D_QPU_A_FMIN },
- { 128, 175, ANYMUX, ANYMUX, V3D_QPU_A_FMAX },
- { 176, 180, ANYMUX, ANYMUX, V3D_QPU_A_VFMIN },
-
- { 181, 181, ANYMUX, ANYMUX, V3D_QPU_A_AND },
- { 182, 182, ANYMUX, ANYMUX, V3D_QPU_A_OR },
- { 183, 183, ANYMUX, ANYMUX, V3D_QPU_A_XOR },
-
- { 184, 184, ANYMUX, ANYMUX, V3D_QPU_A_VADD },
- { 185, 185, ANYMUX, ANYMUX, V3D_QPU_A_VSUB },
- { 186, 186, 1 << 0, ANYMUX, V3D_QPU_A_NOT },
- { 186, 186, 1 << 1, ANYMUX, V3D_QPU_A_NEG },
- { 186, 186, 1 << 2, ANYMUX, V3D_QPU_A_FLAPUSH },
- { 186, 186, 1 << 3, ANYMUX, V3D_QPU_A_FLBPUSH },
- { 186, 186, 1 << 4, ANYMUX, V3D_QPU_A_FLPOP },
- { 186, 186, 1 << 5, ANYMUX, V3D_QPU_A_RECIP },
- { 186, 186, 1 << 6, ANYMUX, V3D_QPU_A_SETMSF },
- { 186, 186, 1 << 7, ANYMUX, V3D_QPU_A_SETREVF },
- { 187, 187, 1 << 0, 1 << 0, V3D_QPU_A_NOP, 0 },
- { 187, 187, 1 << 0, 1 << 1, V3D_QPU_A_TIDX },
- { 187, 187, 1 << 0, 1 << 2, V3D_QPU_A_EIDX },
- { 187, 187, 1 << 0, 1 << 3, V3D_QPU_A_LR },
- { 187, 187, 1 << 0, 1 << 4, V3D_QPU_A_VFLA },
- { 187, 187, 1 << 0, 1 << 5, V3D_QPU_A_VFLNA },
- { 187, 187, 1 << 0, 1 << 6, V3D_QPU_A_VFLB },
- { 187, 187, 1 << 0, 1 << 7, V3D_QPU_A_VFLNB },
-
- { 187, 187, 1 << 1, MUX_MASK(0, 2), V3D_QPU_A_FXCD },
- { 187, 187, 1 << 1, 1 << 3, V3D_QPU_A_XCD },
- { 187, 187, 1 << 1, MUX_MASK(4, 6), V3D_QPU_A_FYCD },
- { 187, 187, 1 << 1, 1 << 7, V3D_QPU_A_YCD },
-
- { 187, 187, 1 << 2, 1 << 0, V3D_QPU_A_MSF },
- { 187, 187, 1 << 2, 1 << 1, V3D_QPU_A_REVF },
- { 187, 187, 1 << 2, 1 << 2, V3D_QPU_A_VDWWT, 33 },
- { 187, 187, 1 << 2, 1 << 2, V3D_QPU_A_IID, 40 },
- { 187, 187, 1 << 2, 1 << 3, V3D_QPU_A_SAMPID, 40 },
- { 187, 187, 1 << 2, 1 << 4, V3D_QPU_A_BARRIERID, 40 },
- { 187, 187, 1 << 2, 1 << 5, V3D_QPU_A_TMUWT },
- { 187, 187, 1 << 2, 1 << 6, V3D_QPU_A_VPMWT },
- { 187, 187, 1 << 2, 1 << 7, V3D_QPU_A_FLAFIRST, 41 },
- { 187, 187, 1 << 3, 1 << 0, V3D_QPU_A_FLNAFIRST, 41 },
- { 187, 187, 1 << 3, ANYMUX, V3D_QPU_A_VPMSETUP, 33 },
-
- { 188, 188, 1 << 0, ANYMUX, V3D_QPU_A_LDVPMV_IN, 40 },
- { 188, 188, 1 << 0, ANYMUX, V3D_QPU_A_LDVPMV_OUT, 40 },
- { 188, 188, 1 << 1, ANYMUX, V3D_QPU_A_LDVPMD_IN, 40 },
- { 188, 188, 1 << 1, ANYMUX, V3D_QPU_A_LDVPMD_OUT, 40 },
- { 188, 188, 1 << 2, ANYMUX, V3D_QPU_A_LDVPMP, 40 },
- { 188, 188, 1 << 3, ANYMUX, V3D_QPU_A_RSQRT, 41 },
- { 188, 188, 1 << 4, ANYMUX, V3D_QPU_A_EXP, 41 },
- { 188, 188, 1 << 5, ANYMUX, V3D_QPU_A_LOG, 41 },
- { 188, 188, 1 << 6, ANYMUX, V3D_QPU_A_SIN, 41 },
- { 188, 188, 1 << 7, ANYMUX, V3D_QPU_A_RSQRT2, 41 },
- { 189, 189, ANYMUX, ANYMUX, V3D_QPU_A_LDVPMG_IN, 40 },
- { 189, 189, ANYMUX, ANYMUX, V3D_QPU_A_LDVPMG_OUT, 40 },
+ { 128, 175, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FMIN },
+ { 128, 175, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FMAX },
+ { 176, 180, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMIN },
+
+ { 181, 181, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_AND },
+ { 182, 182, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_OR },
+ { 183, 183, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_XOR },
+
+ { 184, 184, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VADD },
+ { 185, 185, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VSUB },
+ { 186, 186, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_NOT },
+ { 186, 186, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_NEG },
+ { 186, 186, .mux.b_mask = OP_MASK(2), .mux.a_mask = ANYMUX, V3D_QPU_A_FLAPUSH },
+ { 186, 186, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FLBPUSH },
+ { 186, 186, .mux.b_mask = OP_MASK(4), .mux.a_mask = ANYMUX, V3D_QPU_A_FLPOP },
+ { 186, 186, .mux.b_mask = OP_MASK(5), .mux.a_mask = ANYMUX, V3D_QPU_A_RECIP },
+ { 186, 186, .mux.b_mask = OP_MASK(6), .mux.a_mask = ANYMUX, V3D_QPU_A_SETMSF },
+ { 186, 186, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_SETREVF },
+ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(0), V3D_QPU_A_NOP, 0 },
+ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(1), V3D_QPU_A_TIDX },
+ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(2), V3D_QPU_A_EIDX },
+ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(3), V3D_QPU_A_LR },
+ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(4), V3D_QPU_A_VFLA },
+ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(5), V3D_QPU_A_VFLNA },
+ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(6), V3D_QPU_A_VFLB },
+ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(7), V3D_QPU_A_VFLNB },
+
+ { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_RANGE(0, 2), V3D_QPU_A_FXCD },
+ { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_MASK(3), V3D_QPU_A_XCD },
+ { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_RANGE(4, 6), V3D_QPU_A_FYCD },
+ { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_MASK(7), V3D_QPU_A_YCD },
+
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(0), V3D_QPU_A_MSF },
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(1), V3D_QPU_A_REVF },
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(2), V3D_QPU_A_VDWWT, 33 },
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(2), V3D_QPU_A_IID, 40 },
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(3), V3D_QPU_A_SAMPID, 40 },
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(4), V3D_QPU_A_BARRIERID, 40 },
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(5), V3D_QPU_A_TMUWT },
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(6), V3D_QPU_A_VPMWT },
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(7), V3D_QPU_A_FLAFIRST, 41 },
+ { 187, 187, .mux.b_mask = OP_MASK(3), .mux.a_mask = OP_MASK(0), V3D_QPU_A_FLNAFIRST, 41 },
+ { 187, 187, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_VPMSETUP, 33 },
+
+ { 188, 188, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMV_IN, 40 },
+ { 188, 188, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMV_OUT, 40 },
+ { 188, 188, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMD_IN, 40 },
+ { 188, 188, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMD_OUT, 40 },
+ { 188, 188, .mux.b_mask = OP_MASK(2), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMP, 40 },
+ { 188, 188, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_RSQRT, 41 },
+ { 188, 188, .mux.b_mask = OP_MASK(4), .mux.a_mask = ANYMUX, V3D_QPU_A_EXP, 41 },
+ { 188, 188, .mux.b_mask = OP_MASK(5), .mux.a_mask = ANYMUX, V3D_QPU_A_LOG, 41 },
+ { 188, 188, .mux.b_mask = OP_MASK(6), .mux.a_mask = ANYMUX, V3D_QPU_A_SIN, 41 },
+ { 188, 188, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_RSQRT2, 41 },
+ { 189, 189, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMG_IN, 40 },
+ { 189, 189, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMG_OUT, 40 },
/* FIXME: MORE COMPLICATED */
- /* { 190, 191, ANYMUX, ANYMUX, V3D_QPU_A_VFMOVABSNEGNAB }, */
+ /* { 190, 191, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMOVABSNEGNAB }, */
- { 192, 239, ANYMUX, ANYMUX, V3D_QPU_A_FCMP },
- { 240, 244, ANYMUX, ANYMUX, V3D_QPU_A_VFMAX },
+ { 192, 239, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FCMP },
+ { 240, 244, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMAX },
- { 245, 245, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FROUND },
- { 245, 245, 1 << 3, ANYMUX, V3D_QPU_A_FTOIN },
- { 245, 245, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FTRUNC },
- { 245, 245, 1 << 7, ANYMUX, V3D_QPU_A_FTOIZ },
- { 246, 246, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FFLOOR },
- { 246, 246, 1 << 3, ANYMUX, V3D_QPU_A_FTOUZ },
- { 246, 246, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FCEIL },
- { 246, 246, 1 << 7, ANYMUX, V3D_QPU_A_FTOC },
+ { 245, 245, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FROUND },
+ { 245, 245, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOIN },
+ { 245, 245, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FTRUNC },
+ { 245, 245, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOIZ },
+ { 246, 246, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FFLOOR },
+ { 246, 246, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOUZ },
+ { 246, 246, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FCEIL },
+ { 246, 246, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOC },
- { 247, 247, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FDX },
- { 247, 247, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FDY },
+ { 247, 247, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FDX },
+ { 247, 247, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FDY },
/* The stvpms are distinguished by the waddr field. */
- { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMV },
- { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMD },
- { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMP },
+ { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMV },
+ { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMD },
+ { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMP },
+
+ { 252, 252, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_ITOF },
+ { 252, 252, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_CLZ },
+ { 252, 252, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_UTOF },
+};
+
+static const struct opcode_desc mul_ops_v33[] = {
+ { 1, 1, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_ADD },
+ { 2, 2, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_SUB },
+ { 3, 3, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_UMUL24 },
+ { 4, 8, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_VFMUL },
+ { 9, 9, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_SMUL24 },
+ { 10, 10, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_MULTOP },
+ { 14, 14, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_FMOV, 33, 42 },
+ { 15, 15, .mux.b_mask = OP_RANGE(0, 3), .mux.a_mask = ANYMUX, V3D_QPU_M_FMOV, 33, 42 },
+ { 15, 15, .mux.b_mask = OP_MASK(4), .mux.a_mask = OP_MASK(0), V3D_QPU_M_NOP, 33, 42 },
+ { 15, 15, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_M_MOV, 33, 42 },
+
+ { 16, 63, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_FMUL },
+};
- { 252, 252, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_ITOF },
- { 252, 252, 1 << 3, ANYMUX, V3D_QPU_A_CLZ },
- { 252, 252, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_UTOF },
+/* Note that it would have been possible to define all the add/mul opcodes in
+ * just one table, using first_ver/last_ver. But given how much changed for
+ * v71, separate tables are tidier, and since we do a linear search on these
+ * tables, splitting them also keeps each one smaller.
+ *
+ * In case the tables are ever merged, first_ver is set to 71 for the opcodes
+ * that changed on v71.
+ */
+static const struct opcode_desc add_ops_v71[] = {
+ /* FADD is FADDNF depending on the order of the raddr_a/raddr_b. */
+ { 0, 47, .raddr_mask = ANYOPMASK, V3D_QPU_A_FADD },
+ { 0, 47, .raddr_mask = ANYOPMASK, V3D_QPU_A_FADDNF },
+ { 53, 55, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK },
+ { 56, 56, .raddr_mask = ANYOPMASK, V3D_QPU_A_ADD },
+ { 57, 59, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK },
+ { 60, 60, .raddr_mask = ANYOPMASK, V3D_QPU_A_SUB },
+ { 61, 63, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK },
+ { 64, 111, .raddr_mask = ANYOPMASK, V3D_QPU_A_FSUB },
+ { 120, 120, .raddr_mask = ANYOPMASK, V3D_QPU_A_MIN },
+ { 121, 121, .raddr_mask = ANYOPMASK, V3D_QPU_A_MAX },
+ { 122, 122, .raddr_mask = ANYOPMASK, V3D_QPU_A_UMIN },
+ { 123, 123, .raddr_mask = ANYOPMASK, V3D_QPU_A_UMAX },
+ { 124, 124, .raddr_mask = ANYOPMASK, V3D_QPU_A_SHL },
+ { 125, 125, .raddr_mask = ANYOPMASK, V3D_QPU_A_SHR },
+ { 126, 126, .raddr_mask = ANYOPMASK, V3D_QPU_A_ASR },
+ { 127, 127, .raddr_mask = ANYOPMASK, V3D_QPU_A_ROR },
+ /* FMIN is instead FMAX depending on the raddr_a/b order. */
+ { 128, 175, .raddr_mask = ANYOPMASK, V3D_QPU_A_FMIN },
+ { 128, 175, .raddr_mask = ANYOPMASK, V3D_QPU_A_FMAX },
+ { 176, 180, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFMIN },
+
+ { 181, 181, .raddr_mask = ANYOPMASK, V3D_QPU_A_AND },
+ { 182, 182, .raddr_mask = ANYOPMASK, V3D_QPU_A_OR },
+ { 183, 183, .raddr_mask = ANYOPMASK, V3D_QPU_A_XOR },
+ { 184, 184, .raddr_mask = ANYOPMASK, V3D_QPU_A_VADD },
+ { 185, 185, .raddr_mask = ANYOPMASK, V3D_QPU_A_VSUB },
+
+ { 186, 186, .raddr_mask = OP_MASK(0), V3D_QPU_A_NOT },
+ { 186, 186, .raddr_mask = OP_MASK(1), V3D_QPU_A_NEG },
+ { 186, 186, .raddr_mask = OP_MASK(2), V3D_QPU_A_FLAPUSH },
+ { 186, 186, .raddr_mask = OP_MASK(3), V3D_QPU_A_FLBPUSH },
+ { 186, 186, .raddr_mask = OP_MASK(4), V3D_QPU_A_FLPOP },
+ { 186, 186, .raddr_mask = OP_MASK(5), V3D_QPU_A_CLZ },
+ { 186, 186, .raddr_mask = OP_MASK(6), V3D_QPU_A_SETMSF },
+ { 186, 186, .raddr_mask = OP_MASK(7), V3D_QPU_A_SETREVF },
+
+ { 187, 187, .raddr_mask = OP_MASK(0), V3D_QPU_A_NOP, 0 },
+ { 187, 187, .raddr_mask = OP_MASK(1), V3D_QPU_A_TIDX },
+ { 187, 187, .raddr_mask = OP_MASK(2), V3D_QPU_A_EIDX },
+ { 187, 187, .raddr_mask = OP_MASK(3), V3D_QPU_A_LR },
+ { 187, 187, .raddr_mask = OP_MASK(4), V3D_QPU_A_VFLA },
+ { 187, 187, .raddr_mask = OP_MASK(5), V3D_QPU_A_VFLNA },
+ { 187, 187, .raddr_mask = OP_MASK(6), V3D_QPU_A_VFLB },
+ { 187, 187, .raddr_mask = OP_MASK(7), V3D_QPU_A_VFLNB },
+ { 187, 187, .raddr_mask = OP_MASK(8), V3D_QPU_A_XCD },
+ { 187, 187, .raddr_mask = OP_MASK(9), V3D_QPU_A_YCD },
+ { 187, 187, .raddr_mask = OP_MASK(10), V3D_QPU_A_MSF },
+ { 187, 187, .raddr_mask = OP_MASK(11), V3D_QPU_A_REVF },
+ { 187, 187, .raddr_mask = OP_MASK(12), V3D_QPU_A_IID },
+ { 187, 187, .raddr_mask = OP_MASK(13), V3D_QPU_A_SAMPID },
+ { 187, 187, .raddr_mask = OP_MASK(14), V3D_QPU_A_BARRIERID },
+ { 187, 187, .raddr_mask = OP_MASK(15), V3D_QPU_A_TMUWT },
+ { 187, 187, .raddr_mask = OP_MASK(16), V3D_QPU_A_VPMWT },
+ { 187, 187, .raddr_mask = OP_MASK(17), V3D_QPU_A_FLAFIRST },
+ { 187, 187, .raddr_mask = OP_MASK(18), V3D_QPU_A_FLNAFIRST },
+
+ { 187, 187, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_FXCD },
+ { 187, 187, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_FYCD },
+
+ { 188, 188, .raddr_mask = OP_MASK(0), V3D_QPU_A_LDVPMV_IN, 71 },
+ { 188, 188, .raddr_mask = OP_MASK(1), V3D_QPU_A_LDVPMD_IN, 71 },
+ { 188, 188, .raddr_mask = OP_MASK(2), V3D_QPU_A_LDVPMP, 71 },
+
+ { 188, 188, .raddr_mask = OP_MASK(32), V3D_QPU_A_RECIP, 71 },
+ { 188, 188, .raddr_mask = OP_MASK(33), V3D_QPU_A_RSQRT, 71 },
+ { 188, 188, .raddr_mask = OP_MASK(34), V3D_QPU_A_EXP, 71 },
+ { 188, 188, .raddr_mask = OP_MASK(35), V3D_QPU_A_LOG, 71 },
+ { 188, 188, .raddr_mask = OP_MASK(36), V3D_QPU_A_SIN, 71 },
+ { 188, 188, .raddr_mask = OP_MASK(37), V3D_QPU_A_RSQRT2, 71 },
+ { 188, 188, .raddr_mask = OP_MASK(38), V3D_QPU_A_BALLOT, 71 },
+ { 188, 188, .raddr_mask = OP_MASK(39), V3D_QPU_A_BCASTF, 71 },
+ { 188, 188, .raddr_mask = OP_MASK(40), V3D_QPU_A_ALLEQ, 71 },
+ { 188, 188, .raddr_mask = OP_MASK(41), V3D_QPU_A_ALLFEQ, 71 },
+
+ { 189, 189, .raddr_mask = ANYOPMASK, V3D_QPU_A_LDVPMG_IN, 71 },
+
+ /* The stvpms are distinguished by the waddr field. */
+ { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMV, 71},
+ { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMD, 71},
+ { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMP, 71},
+
+ { 192, 207, .raddr_mask = ANYOPMASK, V3D_QPU_A_FCMP, 71 },
+
+ { 245, 245, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FROUND, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FROUND, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FROUND, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FROUND, 71 },
+
+ { 245, 245, .raddr_mask = OP_MASK(3), V3D_QPU_A_FTOIN, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(7), V3D_QPU_A_FTOIN, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(11), V3D_QPU_A_FTOIN, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(15), V3D_QPU_A_FTOIN, 71 },
+
+ { 245, 245, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FTRUNC, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FTRUNC, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FTRUNC, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(28, 30), V3D_QPU_A_FTRUNC, 71 },
+
+ { 245, 245, .raddr_mask = OP_MASK(19), V3D_QPU_A_FTOIZ, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(23), V3D_QPU_A_FTOIZ, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(27), V3D_QPU_A_FTOIZ, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(31), V3D_QPU_A_FTOIZ, 71 },
+
+ { 245, 245, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_FFLOOR, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_FFLOOR, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(40, 42), V3D_QPU_A_FFLOOR, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(44, 46), V3D_QPU_A_FFLOOR, 71 },
+
+ { 245, 245, .raddr_mask = OP_MASK(35), V3D_QPU_A_FTOUZ, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(39), V3D_QPU_A_FTOUZ, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(43), V3D_QPU_A_FTOUZ, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(47), V3D_QPU_A_FTOUZ, 71 },
+
+ { 245, 245, .raddr_mask = OP_RANGE(48, 50), V3D_QPU_A_FCEIL, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(52, 54), V3D_QPU_A_FCEIL, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(56, 58), V3D_QPU_A_FCEIL, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(60, 62), V3D_QPU_A_FCEIL, 71 },
+
+ { 245, 245, .raddr_mask = OP_MASK(51), V3D_QPU_A_FTOC },
+ { 245, 245, .raddr_mask = OP_MASK(55), V3D_QPU_A_FTOC },
+ { 245, 245, .raddr_mask = OP_MASK(59), V3D_QPU_A_FTOC },
+ { 245, 245, .raddr_mask = OP_MASK(63), V3D_QPU_A_FTOC },
+
+ { 246, 246, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FDX, 71 },
+ { 246, 246, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FDX, 71 },
+ { 246, 246, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FDX, 71 },
+ { 246, 246, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FDX, 71 },
+ { 246, 246, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FDY, 71 },
+ { 246, 246, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FDY, 71 },
+ { 246, 246, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FDY, 71 },
+ { 246, 246, .raddr_mask = OP_RANGE(28, 30), V3D_QPU_A_FDY, 71 },
+
+ { 246, 246, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_ITOF, 71 },
+ { 246, 246, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_UTOF, 71 },
+
+ { 247, 247, .raddr_mask = ANYOPMASK, V3D_QPU_A_VPACK, 71 },
+ { 248, 248, .raddr_mask = ANYOPMASK, V3D_QPU_A_V8PACK, 71 },
+
+ { 249, 249, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FMOV, 71 },
+ { 249, 249, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FMOV, 71 },
+ { 249, 249, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FMOV, 71 },
+ { 249, 249, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FMOV, 71 },
+ { 249, 249, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FMOV, 71 },
+ { 249, 249, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FMOV, 71 },
+ { 249, 249, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FMOV, 71 },
+
+ { 249, 249, .raddr_mask = OP_MASK(3), V3D_QPU_A_MOV, 71 },
+ { 249, 249, .raddr_mask = OP_MASK(7), V3D_QPU_A_MOV, 71 },
+ { 249, 249, .raddr_mask = OP_MASK(11), V3D_QPU_A_MOV, 71 },
+ { 249, 249, .raddr_mask = OP_MASK(15), V3D_QPU_A_MOV, 71 },
+ { 249, 249, .raddr_mask = OP_MASK(19), V3D_QPU_A_MOV, 71 },
+
+ { 250, 250, .raddr_mask = ANYOPMASK, V3D_QPU_A_V10PACK, 71 },
+ { 251, 251, .raddr_mask = ANYOPMASK, V3D_QPU_A_V11FPACK, 71 },
+
+ { 252, 252, .raddr_mask = ANYOPMASK, V3D_QPU_A_ROTQ, 71 },
+ { 253, 253, .raddr_mask = ANYOPMASK, V3D_QPU_A_ROT, 71 },
+ { 254, 254, .raddr_mask = ANYOPMASK, V3D_QPU_A_SHUFFLE, 71 },
};
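The three STVPM rows near the end of the table above intentionally share opcode 190 and an identical raddr mask, so the opcode alone cannot identify them. A minimal sketch (not part of the patch) of how the decoder further down resolves them by the write address:

/* Illustrative only: mirrors the waddr switch in v3d71_qpu_add_unpack. */
switch (waddr) {
case 0: instr->alu.add.op = V3D_QPU_A_STVPMV; break;
case 1: instr->alu.add.op = V3D_QPU_A_STVPMD; break;
case 2: instr->alu.add.op = V3D_QPU_A_STVPMP; break;
default: return false; /* no other waddr encodes an STVPM */
}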
-static const struct opcode_desc mul_ops[] = {
- { 1, 1, ANYMUX, ANYMUX, V3D_QPU_M_ADD },
- { 2, 2, ANYMUX, ANYMUX, V3D_QPU_M_SUB },
- { 3, 3, ANYMUX, ANYMUX, V3D_QPU_M_UMUL24 },
- { 4, 8, ANYMUX, ANYMUX, V3D_QPU_M_VFMUL },
- { 9, 9, ANYMUX, ANYMUX, V3D_QPU_M_SMUL24 },
- { 10, 10, ANYMUX, ANYMUX, V3D_QPU_M_MULTOP },
- { 14, 14, ANYMUX, ANYMUX, V3D_QPU_M_FMOV },
- { 15, 15, MUX_MASK(0, 3), ANYMUX, V3D_QPU_M_FMOV },
- { 15, 15, 1 << 4, 1 << 0, V3D_QPU_M_NOP, 0 },
- { 15, 15, 1 << 7, ANYMUX, V3D_QPU_M_MOV },
- { 16, 63, ANYMUX, ANYMUX, V3D_QPU_M_FMUL },
+static const struct opcode_desc mul_ops_v71[] = {
+ /* For V3D 7.1 only raddr_mask is used; the mux mask fields are ignored */
+ { 1, 1, .raddr_mask = ANYOPMASK, V3D_QPU_M_ADD, 71 },
+ { 2, 2, .raddr_mask = ANYOPMASK, V3D_QPU_M_SUB, 71 },
+ { 3, 3, .raddr_mask = ANYOPMASK, V3D_QPU_M_UMUL24, 71 },
+ { 4, 8, .raddr_mask = ANYOPMASK, V3D_QPU_M_VFMUL, 71 },
+ { 9, 9, .raddr_mask = ANYOPMASK, V3D_QPU_M_SMUL24, 71 },
+ { 10, 10, .raddr_mask = ANYOPMASK, V3D_QPU_M_MULTOP, 71 },
+
+ { 14, 14, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_M_FMOV, 71 },
+ { 14, 14, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_M_FMOV, 71 },
+ { 14, 14, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_M_FMOV, 71 },
+ { 14, 14, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_M_FMOV, 71 },
+ { 14, 14, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_M_FMOV, 71 },
+ { 14, 14, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_M_FMOV, 71 },
+
+ { 14, 14, .raddr_mask = OP_MASK(3), V3D_QPU_M_MOV, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(7), V3D_QPU_M_MOV, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(11), V3D_QPU_M_MOV, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(15), V3D_QPU_M_MOV, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(19), V3D_QPU_M_MOV, 71 },
+
+ { 14, 14, .raddr_mask = OP_MASK(32), V3D_QPU_M_FTOUNORM16, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(33), V3D_QPU_M_FTOSNORM16, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(34), V3D_QPU_M_VFTOUNORM8, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(35), V3D_QPU_M_VFTOSNORM8, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(48), V3D_QPU_M_VFTOUNORM10LO, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(49), V3D_QPU_M_VFTOUNORM10HI, 71 },
+
+ { 14, 14, .raddr_mask = OP_MASK(63), V3D_QPU_M_NOP, 71 },
+
+ { 16, 63, .raddr_mask = ANYOPMASK, V3D_QPU_M_FMUL },
};
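The v7.1 rows above (and in the add table) encode operand constraints through a single 64-bit raddr_mask instead of the two mux masks used on v3.3. The OP_MASK/OP_RANGE/ANYOPMASK helpers are defined earlier in the file; their exact spelling is outside this hunk, but they plausibly look like this (illustrative sketch, not the upstream definitions):

/* One specific raddr value, a contiguous range of values, or any value.
 * OP_RANGE is written so it stays well-defined for the full 0..63 span. */
#define OP_MASK(x)         (UINT64_C(1) << (x))
#define OP_RANGE(bot, top) (((UINT64_C(2) << (top)) - 1) & ~((UINT64_C(1) << (bot)) - 1))
#define ANYOPMASK          OP_RANGE(0, 63)

/* A v7.1 row then matches when the opcode falls inside
 * [opcode_first, opcode_last] and the instruction's raddr bit is set: */
static inline bool
row_matches_v71(const struct opcode_desc *d, uint32_t opcode, uint32_t raddr)
{
        return opcode >= d->opcode_first && opcode <= d->opcode_last &&
               (d->raddr_mask & (UINT64_C(1) << raddr));
}

This is exactly the test lookup_opcode_from_packed() applies on its devinfo->ver >= 71 path below.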
/* Returns true if op_desc should be filtered out based on devinfo->ver
@@ -591,17 +853,23 @@ static const struct opcode_desc mul_ops[] = {
*/
static bool
opcode_invalid_in_version(const struct v3d_device_info *devinfo,
- const struct opcode_desc *op_desc)
+ const uint8_t first_ver,
+ const uint8_t last_ver)
{
- return (op_desc->first_ver != 0 && devinfo->ver < op_desc->first_ver) ||
- (op_desc->last_ver != 0 && devinfo->ver > op_desc->last_ver);
+ return (first_ver != 0 && devinfo->ver < first_ver) ||
+ (last_ver != 0 && devinfo->ver > last_ver);
}
+/* Note that we pass mux_a, mux_b and raddr as parameters even though,
+ * depending on devinfo->ver, some of them are ignored. We do it this way to
+ * avoid having two nearly identical lookup_opcode methods.
+ */
static const struct opcode_desc *
lookup_opcode_from_packed(const struct v3d_device_info *devinfo,
const struct opcode_desc *opcodes,
size_t num_opcodes, uint32_t opcode,
- uint32_t mux_a, uint32_t mux_b)
+ uint32_t mux_a, uint32_t mux_b,
+ uint32_t raddr)
{
for (int i = 0; i < num_opcodes; i++) {
const struct opcode_desc *op_desc = &opcodes[i];
@@ -610,14 +878,19 @@ lookup_opcode_from_packed(const struct v3d_device_info *devinfo,
opcode > op_desc->opcode_last)
continue;
- if (opcode_invalid_in_version(devinfo, op_desc))
+ if (opcode_invalid_in_version(devinfo, op_desc->first_ver, op_desc->last_ver))
continue;
- if (!(op_desc->mux_b_mask & (1 << mux_b)))
- continue;
+ if (devinfo->ver < 71) {
+ if (!(op_desc->mux.b_mask & (1 << mux_b)))
+ continue;
- if (!(op_desc->mux_a_mask & (1 << mux_a)))
- continue;
+ if (!(op_desc->mux.a_mask & (1 << mux_a)))
+ continue;
+ } else {
+ if (!(op_desc->raddr_mask & ((uint64_t) 1 << raddr)))
+ continue;
+ }
return op_desc;
}
@@ -670,6 +943,56 @@ v3d_qpu_float32_unpack_pack(enum v3d_qpu_input_unpack unpacked,
}
static bool
+v3d_qpu_int32_unpack_unpack(uint32_t packed,
+ enum v3d_qpu_input_unpack *unpacked)
+{
+ switch (packed) {
+ case 0:
+ *unpacked = V3D_QPU_UNPACK_NONE;
+ return true;
+ case 1:
+ *unpacked = V3D_QPU_UNPACK_UL;
+ return true;
+ case 2:
+ *unpacked = V3D_QPU_UNPACK_UH;
+ return true;
+ case 3:
+ *unpacked = V3D_QPU_UNPACK_IL;
+ return true;
+ case 4:
+ *unpacked = V3D_QPU_UNPACK_IH;
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool
+v3d_qpu_int32_unpack_pack(enum v3d_qpu_input_unpack unpacked,
+ uint32_t *packed)
+{
+ switch (unpacked) {
+ case V3D_QPU_UNPACK_NONE:
+ *packed = 0;
+ return true;
+ case V3D_QPU_UNPACK_UL:
+ *packed = 1;
+ return true;
+ case V3D_QPU_UNPACK_UH:
+ *packed = 2;
+ return true;
+ case V3D_QPU_UNPACK_IL:
+ *packed = 3;
+ return true;
+ case V3D_QPU_UNPACK_IH:
+ *packed = 4;
+ return true;
+ default:
+ return false;
+ }
+}
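The int32 unpack helpers above are straight lookup tables; a minimal self-check sketch (not part of the patch, assuming ARRAY_SIZE from util/macros.h and <assert.h>) showing they are inverses of each other:

static void
check_int32_unpack_roundtrip(void)
{
        static const enum v3d_qpu_input_unpack modes[] = {
                V3D_QPU_UNPACK_NONE, V3D_QPU_UNPACK_UL, V3D_QPU_UNPACK_UH,
                V3D_QPU_UNPACK_IL, V3D_QPU_UNPACK_IH,
        };

        for (unsigned i = 0; i < ARRAY_SIZE(modes); i++) {
                uint32_t packed;
                enum v3d_qpu_input_unpack unpacked;

                assert(v3d_qpu_int32_unpack_pack(modes[i], &packed));
                assert(v3d_qpu_int32_unpack_unpack(packed, &unpacked));
                assert(unpacked == modes[i]);
        }
}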
+
+static bool
v3d_qpu_float16_unpack_unpack(uint32_t packed,
enum v3d_qpu_input_unpack *unpacked)
{
@@ -720,10 +1043,10 @@ v3d_qpu_float16_unpack_pack(enum v3d_qpu_input_unpack unpacked,
}
static bool
-v3d_qpu_float32_pack_pack(enum v3d_qpu_input_unpack unpacked,
+v3d_qpu_float32_pack_pack(enum v3d_qpu_output_pack pack,
uint32_t *packed)
{
- switch (unpacked) {
+ switch (pack) {
case V3D_QPU_PACK_NONE:
*packed = 0;
return true;
@@ -739,8 +1062,8 @@ v3d_qpu_float32_pack_pack(enum v3d_qpu_input_unpack unpacked,
}
static bool
-v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
- struct v3d_qpu_instr *instr)
+v3d33_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+ struct v3d_qpu_instr *instr)
{
uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_ADD);
uint32_t mux_a = QPU_GET_FIELD(packed_inst, V3D_QPU_ADD_A);
@@ -757,8 +1080,9 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
map_op = (map_op - 253 + 245);
const struct opcode_desc *desc =
- lookup_opcode_from_packed(devinfo, add_ops, ARRAY_SIZE(add_ops),
- map_op, mux_a, mux_b);
+ lookup_opcode_from_packed(devinfo, add_ops_v33,
+ ARRAY_SIZE(add_ops_v33),
+ map_op, mux_a, mux_b, 0);
if (!desc)
return false;
@@ -814,12 +1138,12 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
- &instr->alu.add.a_unpack)) {
+ &instr->alu.add.a.unpack)) {
return false;
}
if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3,
- &instr->alu.add.b_unpack)) {
+ &instr->alu.add.b.unpack)) {
return false;
}
break;
@@ -833,7 +1157,7 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
instr->alu.add.output_pack = mux_b & 0x3;
if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
- &instr->alu.add.a_unpack)) {
+ &instr->alu.add.a.unpack)) {
return false;
}
break;
@@ -845,7 +1169,7 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
- &instr->alu.add.a_unpack)) {
+ &instr->alu.add.a.unpack)) {
return false;
}
break;
@@ -853,23 +1177,23 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
case V3D_QPU_A_VFMIN:
case V3D_QPU_A_VFMAX:
if (!v3d_qpu_float16_unpack_unpack(op & 0x7,
- &instr->alu.add.a_unpack)) {
+ &instr->alu.add.a.unpack)) {
return false;
}
instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
- instr->alu.add.b_unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
break;
default:
instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
- instr->alu.add.a_unpack = V3D_QPU_UNPACK_NONE;
- instr->alu.add.b_unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
break;
}
- instr->alu.add.a = mux_a;
- instr->alu.add.b = mux_b;
+ instr->alu.add.a.mux = mux_a;
+ instr->alu.add.b.mux = mux_b;
instr->alu.add.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A);
instr->alu.add.magic_write = false;
@@ -894,18 +1218,205 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
}
static bool
-v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+v3d71_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+ struct v3d_qpu_instr *instr)
+{
+ uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_ADD);
+ uint32_t raddr_a = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_A);
+ uint32_t raddr_b = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_B);
+ uint32_t waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A);
+ uint32_t map_op = op;
+
+ const struct opcode_desc *desc =
+ lookup_opcode_from_packed(devinfo,
+ add_ops_v71,
+ ARRAY_SIZE(add_ops_v71),
+ map_op, 0, 0,
+ raddr_b);
+ if (!desc)
+ return false;
+
+ instr->alu.add.op = desc->op;
+
+ /* FADD/FADDNF and FMIN/FMAX are determined by the order of the
+ * operands.
+ */
+ if (instr->sig.small_imm_a * 256 + ((op >> 2) & 3) * 64 + raddr_a >
+ instr->sig.small_imm_b * 256 + (op & 3) * 64 + raddr_b) {
+ if (instr->alu.add.op == V3D_QPU_A_FMIN)
+ instr->alu.add.op = V3D_QPU_A_FMAX;
+ if (instr->alu.add.op == V3D_QPU_A_FADD)
+ instr->alu.add.op = V3D_QPU_A_FADDNF;
+ }
+
+ /* Some QPU ops require a bit more than the basic opcode and raddr
+ * comparisons to distinguish them.
+ */
+ switch (instr->alu.add.op) {
+ case V3D_QPU_A_STVPMV:
+ case V3D_QPU_A_STVPMD:
+ case V3D_QPU_A_STVPMP:
+ switch (waddr) {
+ case 0:
+ instr->alu.add.op = V3D_QPU_A_STVPMV;
+ break;
+ case 1:
+ instr->alu.add.op = V3D_QPU_A_STVPMD;
+ break;
+ case 2:
+ instr->alu.add.op = V3D_QPU_A_STVPMP;
+ break;
+ default:
+ return false;
+ }
+ break;
+ default:
+ break;
+ }
+
+ switch (instr->alu.add.op) {
+ case V3D_QPU_A_FADD:
+ case V3D_QPU_A_FADDNF:
+ case V3D_QPU_A_FSUB:
+ case V3D_QPU_A_FMIN:
+ case V3D_QPU_A_FMAX:
+ case V3D_QPU_A_FCMP:
+ case V3D_QPU_A_VFPACK:
+ if (instr->alu.add.op != V3D_QPU_A_VFPACK &&
+ instr->alu.add.op != V3D_QPU_A_FCMP) {
+ instr->alu.add.output_pack = (op >> 4) & 0x3;
+ } else {
+ instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
+ }
+
+ if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
+ &instr->alu.add.a.unpack)) {
+ return false;
+ }
+
+ if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3,
+ &instr->alu.add.b.unpack)) {
+ return false;
+ }
+ break;
+
+ case V3D_QPU_A_FFLOOR:
+ case V3D_QPU_A_FROUND:
+ case V3D_QPU_A_FTRUNC:
+ case V3D_QPU_A_FCEIL:
+ case V3D_QPU_A_FDX:
+ case V3D_QPU_A_FDY:
+ instr->alu.add.output_pack = raddr_b & 0x3;
+
+ if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
+ &instr->alu.add.a.unpack)) {
+ return false;
+ }
+ break;
+
+ case V3D_QPU_A_FTOIN:
+ case V3D_QPU_A_FTOIZ:
+ case V3D_QPU_A_FTOUZ:
+ case V3D_QPU_A_FTOC:
+ instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
+
+ if (!v3d_qpu_float32_unpack_unpack((raddr_b >> 2) & 0x3,
+ &instr->alu.add.a.unpack)) {
+ return false;
+ }
+ break;
+
+ case V3D_QPU_A_VFMIN:
+ case V3D_QPU_A_VFMAX:
+ unreachable("pending v71 update");
+ if (!v3d_qpu_float16_unpack_unpack(op & 0x7,
+ &instr->alu.add.a.unpack)) {
+ return false;
+ }
+
+ instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
+ instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
+ break;
+
+ case V3D_QPU_A_MOV:
+ instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
+
+ if (!v3d_qpu_int32_unpack_unpack((raddr_b >> 2) & 0x7,
+ &instr->alu.add.a.unpack)) {
+ return false;
+ }
+ break;
+
+ case V3D_QPU_A_FMOV:
+ instr->alu.add.output_pack = raddr_b & 0x3;
+
+ /* Mul alu FMOV has one additional variant */
+ int32_t unpack = (raddr_b >> 2) & 0x7;
+ if (unpack == 7)
+ return false;
+
+ if (!v3d_qpu_float32_unpack_unpack(unpack,
+ &instr->alu.add.a.unpack)) {
+ return false;
+ }
+ break;
+
+ default:
+ instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
+ instr->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
+ break;
+ }
+
+ instr->alu.add.a.raddr = raddr_a;
+ instr->alu.add.b.raddr = raddr_b;
+ instr->alu.add.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A);
+
+ instr->alu.add.magic_write = false;
+ if (packed_inst & V3D_QPU_MA) {
+ switch (instr->alu.add.op) {
+ case V3D_QPU_A_LDVPMV_IN:
+ instr->alu.add.op = V3D_QPU_A_LDVPMV_OUT;
+ break;
+ case V3D_QPU_A_LDVPMD_IN:
+ instr->alu.add.op = V3D_QPU_A_LDVPMD_OUT;
+ break;
+ case V3D_QPU_A_LDVPMG_IN:
+ instr->alu.add.op = V3D_QPU_A_LDVPMG_OUT;
+ break;
+ default:
+ instr->alu.add.magic_write = true;
+ break;
+ }
+ }
+
+ return true;
+}
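The operand-ordering comparison near the top of v3d71_qpu_add_unpack() above is what separates FADD from FADDNF and FMIN from FMAX, since each pair shares an opcode. An illustrative helper (not in the patch) making the key explicit:

/* small-imm flag above the 2-bit unpack field above the 6-bit raddr:
 * 256 = 4 * 64 and 64 = 2^6, so the three fields can never collide. */
static inline uint32_t
v71_operand_order_key(bool small_imm, uint32_t unpack2, uint32_t raddr6)
{
        return (small_imm ? 256 : 0) + unpack2 * 64 + raddr6;
}

With no small immediates and no unpack modifiers the key is just the raddr, so an instruction that hits the FMIN opcode with raddr_a = 5 and raddr_b = 2 decodes as FMAX; v3d71_qpu_add_pack() below performs the inverse swap when encoding.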
+
+static bool
+v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
struct v3d_qpu_instr *instr)
{
+ if (devinfo->ver < 71)
+ return v3d33_qpu_add_unpack(devinfo, packed_inst, instr);
+ else
+ return v3d71_qpu_add_unpack(devinfo, packed_inst, instr);
+}
+
+static bool
+v3d33_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+ struct v3d_qpu_instr *instr)
+{
uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_MUL);
uint32_t mux_a = QPU_GET_FIELD(packed_inst, V3D_QPU_MUL_A);
uint32_t mux_b = QPU_GET_FIELD(packed_inst, V3D_QPU_MUL_B);
{
const struct opcode_desc *desc =
- lookup_opcode_from_packed(devinfo, mul_ops,
- ARRAY_SIZE(mul_ops),
- op, mux_a, mux_b);
+ lookup_opcode_from_packed(devinfo,
+ mul_ops_v33,
+ ARRAY_SIZE(mul_ops_v33),
+ op, mux_a, mux_b, 0);
if (!desc)
return false;
@@ -917,12 +1428,12 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
instr->alu.mul.output_pack = ((op >> 4) & 0x3) - 1;
if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
- &instr->alu.mul.a_unpack)) {
+ &instr->alu.mul.a.unpack)) {
return false;
}
if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3,
- &instr->alu.mul.b_unpack)) {
+ &instr->alu.mul.b.unpack)) {
return false;
}
@@ -933,7 +1444,7 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
((mux_b >> 2) & 1));
if (!v3d_qpu_float32_unpack_unpack(mux_b & 0x3,
- &instr->alu.mul.a_unpack)) {
+ &instr->alu.mul.a.unpack)) {
return false;
}
@@ -943,29 +1454,123 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
if (!v3d_qpu_float16_unpack_unpack(((op & 0x7) - 4) & 7,
- &instr->alu.mul.a_unpack)) {
+ &instr->alu.mul.a.unpack)) {
return false;
}
- instr->alu.mul.b_unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
break;
default:
instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
- instr->alu.mul.a_unpack = V3D_QPU_UNPACK_NONE;
- instr->alu.mul.b_unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
break;
}
- instr->alu.mul.a = mux_a;
- instr->alu.mul.b = mux_b;
+ instr->alu.mul.a.mux = mux_a;
+ instr->alu.mul.b.mux = mux_b;
instr->alu.mul.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_M);
instr->alu.mul.magic_write = packed_inst & V3D_QPU_MM;
return true;
}
+static bool
+v3d71_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+ struct v3d_qpu_instr *instr)
+{
+ uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_MUL);
+ uint32_t raddr_c = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_C);
+ uint32_t raddr_d = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_D);
+
+ {
+ const struct opcode_desc *desc =
+ lookup_opcode_from_packed(devinfo,
+ mul_ops_v71,
+ ARRAY_SIZE(mul_ops_v71),
+ op, 0, 0,
+ raddr_d);
+ if (!desc)
+ return false;
+
+ instr->alu.mul.op = desc->op;
+ }
+
+ switch (instr->alu.mul.op) {
+ case V3D_QPU_M_FMUL:
+ instr->alu.mul.output_pack = ((op >> 4) & 0x3) - 1;
+
+ if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
+ &instr->alu.mul.a.unpack)) {
+ return false;
+ }
+
+ if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3,
+ &instr->alu.mul.b.unpack)) {
+ return false;
+ }
+
+ break;
+
+ case V3D_QPU_M_FMOV:
+ instr->alu.mul.output_pack = raddr_d & 0x3;
+
+ if (!v3d_qpu_float32_unpack_unpack((raddr_d >> 2) & 0x7,
+ &instr->alu.mul.a.unpack)) {
+ return false;
+ }
+
+ break;
+
+ case V3D_QPU_M_VFMUL:
+ unreachable("pending v71 update");
+ instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
+
+ if (!v3d_qpu_float16_unpack_unpack(((op & 0x7) - 4) & 7,
+ &instr->alu.mul.a.unpack)) {
+ return false;
+ }
+
+ instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
+
+ break;
+
+ case V3D_QPU_M_MOV:
+ instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
+
+ if (!v3d_qpu_int32_unpack_unpack((raddr_d >> 2) & 0x7,
+ &instr->alu.mul.a.unpack)) {
+ return false;
+ }
+ break;
+
+ default:
+ instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
+ instr->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
+ break;
+ }
+
+ instr->alu.mul.a.raddr = raddr_c;
+ instr->alu.mul.b.raddr = raddr_d;
+ instr->alu.mul.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_M);
+ instr->alu.mul.magic_write = packed_inst & V3D_QPU_MM;
+
+ return true;
+}
+
+static bool
+v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+ struct v3d_qpu_instr *instr)
+{
+ if (devinfo->ver < 71)
+ return v3d33_qpu_mul_unpack(devinfo, packed_inst, instr);
+ else
+ return v3d71_qpu_mul_unpack(devinfo, packed_inst, instr);
+}
+
static const struct opcode_desc *
lookup_opcode_from_instr(const struct v3d_device_info *devinfo,
const struct opcode_desc *opcodes, size_t num_opcodes,
@@ -977,7 +1582,7 @@ lookup_opcode_from_instr(const struct v3d_device_info *devinfo,
if (op_desc->op != op)
continue;
- if (opcode_invalid_in_version(devinfo, op_desc))
+ if (opcode_invalid_in_version(devinfo, op_desc->first_ver, op_desc->last_ver))
continue;
return op_desc;
@@ -987,15 +1592,16 @@ lookup_opcode_from_instr(const struct v3d_device_info *devinfo,
}
static bool
-v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
- const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
+v3d33_qpu_add_pack(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
{
uint32_t waddr = instr->alu.add.waddr;
- uint32_t mux_a = instr->alu.add.a;
- uint32_t mux_b = instr->alu.add.b;
+ uint32_t mux_a = instr->alu.add.a.mux;
+ uint32_t mux_b = instr->alu.add.b.mux;
int nsrc = v3d_qpu_add_op_num_src(instr->alu.add.op);
const struct opcode_desc *desc =
- lookup_opcode_from_instr(devinfo, add_ops, ARRAY_SIZE(add_ops),
+ lookup_opcode_from_instr(devinfo, add_ops_v33,
+ ARRAY_SIZE(add_ops_v33),
instr->alu.add.op);
if (!desc)
@@ -1007,10 +1613,10 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
* identify the operation type.
*/
if (nsrc < 2)
- mux_b = ffs(desc->mux_b_mask) - 1;
+ mux_b = ffs(desc->mux.b_mask) - 1;
if (nsrc < 1)
- mux_a = ffs(desc->mux_a_mask) - 1;
+ mux_a = ffs(desc->mux.a_mask) - 1;
bool no_magic_write = false;
@@ -1063,12 +1669,12 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
}
opcode |= output_pack << 4;
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
&a_unpack)) {
return false;
}
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack,
&b_unpack)) {
return false;
}
@@ -1102,23 +1708,23 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
uint32_t a_unpack;
uint32_t b_unpack;
- if (instr->alu.add.a_unpack == V3D_QPU_UNPACK_ABS ||
- instr->alu.add.b_unpack == V3D_QPU_UNPACK_ABS) {
+ if (instr->alu.add.a.unpack == V3D_QPU_UNPACK_ABS ||
+ instr->alu.add.b.unpack == V3D_QPU_UNPACK_ABS) {
return false;
}
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
&a_unpack)) {
return false;
}
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack,
&b_unpack)) {
return false;
}
- opcode = (opcode & ~(1 << 2)) | (a_unpack << 2);
- opcode = (opcode & ~(1 << 0)) | (b_unpack << 0);
+ opcode = (opcode & ~(0x3 << 2)) | (a_unpack << 2);
+ opcode = (opcode & ~(0x3 << 0)) | (b_unpack << 0);
break;
}
@@ -1137,13 +1743,13 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
}
mux_b |= packed;
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
&packed)) {
return false;
}
if (packed == 0)
return false;
- opcode = (opcode & ~(1 << 2)) | packed << 2;
+ opcode = (opcode & ~(0x3 << 2)) | packed << 2;
break;
}
@@ -1155,7 +1761,7 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
return false;
uint32_t packed;
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
&packed)) {
return false;
}
@@ -1168,11 +1774,11 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
case V3D_QPU_A_VFMIN:
case V3D_QPU_A_VFMAX:
if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE ||
- instr->alu.add.b_unpack != V3D_QPU_UNPACK_NONE) {
+ instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE) {
return false;
}
- if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a_unpack,
+ if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a.unpack,
&packed)) {
return false;
}
@@ -1182,8 +1788,8 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
default:
if (instr->alu.add.op != V3D_QPU_A_NOP &&
(instr->alu.add.output_pack != V3D_QPU_PACK_NONE ||
- instr->alu.add.a_unpack != V3D_QPU_UNPACK_NONE ||
- instr->alu.add.b_unpack != V3D_QPU_UNPACK_NONE)) {
+ instr->alu.add.a.unpack != V3D_QPU_UNPACK_NONE ||
+ instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE)) {
return false;
}
break;
@@ -1200,15 +1806,280 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
}
static bool
-v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
- const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
+v3d71_qpu_add_pack(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
{
- uint32_t mux_a = instr->alu.mul.a;
- uint32_t mux_b = instr->alu.mul.b;
+ uint32_t waddr = instr->alu.add.waddr;
+ uint32_t raddr_a = instr->alu.add.a.raddr;
+ uint32_t raddr_b = instr->alu.add.b.raddr;
+
+ int nsrc = v3d_qpu_add_op_num_src(instr->alu.add.op);
+ const struct opcode_desc *desc =
+ lookup_opcode_from_instr(devinfo, add_ops_v71,
+ ARRAY_SIZE(add_ops_v71),
+ instr->alu.add.op);
+ if (!desc)
+ return false;
+
+ uint32_t opcode = desc->opcode_first;
+
+ /* If an operation doesn't use an arg, its raddr values may be used to
+ * identify the operation type.
+ */
+ if (nsrc < 2)
+ raddr_b = ffsll(desc->raddr_mask) - 1;
+
+ bool no_magic_write = false;
+
+ switch (instr->alu.add.op) {
+ case V3D_QPU_A_STVPMV:
+ waddr = 0;
+ no_magic_write = true;
+ break;
+ case V3D_QPU_A_STVPMD:
+ waddr = 1;
+ no_magic_write = true;
+ break;
+ case V3D_QPU_A_STVPMP:
+ waddr = 2;
+ no_magic_write = true;
+ break;
+
+ case V3D_QPU_A_LDVPMV_IN:
+ case V3D_QPU_A_LDVPMD_IN:
+ case V3D_QPU_A_LDVPMP:
+ case V3D_QPU_A_LDVPMG_IN:
+ assert(!instr->alu.add.magic_write);
+ break;
+
+ case V3D_QPU_A_LDVPMV_OUT:
+ case V3D_QPU_A_LDVPMD_OUT:
+ case V3D_QPU_A_LDVPMG_OUT:
+ assert(!instr->alu.add.magic_write);
+ *packed_instr |= V3D_QPU_MA;
+ break;
+
+ default:
+ break;
+ }
+
+ switch (instr->alu.add.op) {
+ case V3D_QPU_A_FADD:
+ case V3D_QPU_A_FADDNF:
+ case V3D_QPU_A_FSUB:
+ case V3D_QPU_A_FMIN:
+ case V3D_QPU_A_FMAX:
+ case V3D_QPU_A_FCMP: {
+ uint32_t output_pack;
+ uint32_t a_unpack;
+ uint32_t b_unpack;
+
+ if (instr->alu.add.op != V3D_QPU_A_FCMP) {
+ if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack,
+ &output_pack)) {
+ return false;
+ }
+ opcode |= output_pack << 4;
+ }
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
+ &a_unpack)) {
+ return false;
+ }
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack,
+ &b_unpack)) {
+ return false;
+ }
+
+ /* These operations with commutative operands are
+ * distinguished by the order in which the operands come in.
+ */
+ bool ordering =
+ instr->sig.small_imm_a * 256 + a_unpack * 64 + raddr_a >
+ instr->sig.small_imm_b * 256 + b_unpack * 64 + raddr_b;
+ if (((instr->alu.add.op == V3D_QPU_A_FMIN ||
+ instr->alu.add.op == V3D_QPU_A_FADD) && ordering) ||
+ ((instr->alu.add.op == V3D_QPU_A_FMAX ||
+ instr->alu.add.op == V3D_QPU_A_FADDNF) && !ordering)) {
+ uint32_t temp;
+
+ temp = a_unpack;
+ a_unpack = b_unpack;
+ b_unpack = temp;
+
+ temp = raddr_a;
+ raddr_a = raddr_b;
+ raddr_b = temp;
+
+ /* If we are swapping raddr_a/b we also need to swap
+ * small_imm_a/b.
+ */
+ if (instr->sig.small_imm_a || instr->sig.small_imm_b) {
+ assert(instr->sig.small_imm_a !=
+ instr->sig.small_imm_b);
+ struct v3d_qpu_sig new_sig = instr->sig;
+ new_sig.small_imm_a = !instr->sig.small_imm_a;
+ new_sig.small_imm_b = !instr->sig.small_imm_b;
+ uint32_t sig;
+ if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig))
+ return false;
+ *packed_instr &= ~V3D_QPU_SIG_MASK;
+ *packed_instr |= QPU_SET_FIELD(sig, V3D_QPU_SIG);
+ }
+ }
+
+ opcode |= a_unpack << 2;
+ opcode |= b_unpack << 0;
+
+ break;
+ }
+
+ case V3D_QPU_A_VFPACK: {
+ uint32_t a_unpack;
+ uint32_t b_unpack;
+
+ if (instr->alu.add.a.unpack == V3D_QPU_UNPACK_ABS ||
+ instr->alu.add.b.unpack == V3D_QPU_UNPACK_ABS) {
+ return false;
+ }
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
+ &a_unpack)) {
+ return false;
+ }
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack,
+ &b_unpack)) {
+ return false;
+ }
+
+ opcode = (opcode & ~(0x3 << 2)) | (a_unpack << 2);
+ opcode = (opcode & ~(0x3 << 0)) | (b_unpack << 0);
+
+ break;
+ }
+
+ case V3D_QPU_A_FFLOOR:
+ case V3D_QPU_A_FROUND:
+ case V3D_QPU_A_FTRUNC:
+ case V3D_QPU_A_FCEIL:
+ case V3D_QPU_A_FDX:
+ case V3D_QPU_A_FDY: {
+ uint32_t packed;
+
+ if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack,
+ &packed)) {
+ return false;
+ }
+ raddr_b |= packed;
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
+ &packed)) {
+ return false;
+ }
+ if (packed == 0)
+ return false;
+ raddr_b = (raddr_b & ~(0x3 << 2)) | packed << 2;
+ break;
+ }
+
+ case V3D_QPU_A_FTOIN:
+ case V3D_QPU_A_FTOIZ:
+ case V3D_QPU_A_FTOUZ:
+ case V3D_QPU_A_FTOC:
+ if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE)
+ return false;
+
+ uint32_t packed;
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
+ &packed)) {
+ return false;
+ }
+ if (packed == 0)
+ return false;
+
+ raddr_b |= (raddr_b & ~(0x3 << 2)) | packed << 2;
+
+ break;
+
+ case V3D_QPU_A_VFMIN:
+ case V3D_QPU_A_VFMAX:
+ if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE ||
+ instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE) {
+ return false;
+ }
+
+ if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a.unpack,
+ &packed)) {
+ return false;
+ }
+ opcode |= packed;
+ break;
+
+ case V3D_QPU_A_MOV: {
+ uint32_t packed;
+
+ if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE)
+ return false;
+
+ if (!v3d_qpu_int32_unpack_pack(instr->alu.add.a.unpack,
+ &packed)) {
+ return false;
+ }
+
+ raddr_b |= packed << 2;
+ break;
+ }
+
+ case V3D_QPU_A_FMOV: {
+ uint32_t packed;
+
+ if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack,
+ &packed)) {
+ return false;
+ }
+ raddr_b = packed;
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
+ &packed)) {
+ return false;
+ }
+ raddr_b |= packed << 2;
+ break;
+ }
+
+ default:
+ if (instr->alu.add.op != V3D_QPU_A_NOP &&
+ (instr->alu.add.output_pack != V3D_QPU_PACK_NONE ||
+ instr->alu.add.a.unpack != V3D_QPU_UNPACK_NONE ||
+ instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE)) {
+ return false;
+ }
+ break;
+ }
+
+ *packed_instr |= QPU_SET_FIELD(raddr_a, V3D_QPU_RADDR_A);
+ *packed_instr |= QPU_SET_FIELD(raddr_b, V3D_QPU_RADDR_B);
+ *packed_instr |= QPU_SET_FIELD(opcode, V3D_QPU_OP_ADD);
+ *packed_instr |= QPU_SET_FIELD(waddr, V3D_QPU_WADDR_A);
+ if (instr->alu.add.magic_write && !no_magic_write)
+ *packed_instr |= V3D_QPU_MA;
+
+ return true;
+}
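A worked example (illustrative, not part of the patch) of the swap logic above, assuming plain register operands with no unpack modifiers or small immediates:

/*   fmax rf1, rf3, rf2   key_a = 3 > key_b = 2, so ordering is true and
 *                        neither swap clause fires; the larger raddr stays
 *                        first and decodes back as FMAX.
 *   fmin rf1, rf3, rf2   same keys, but (FMIN && ordering) fires, so the
 *                        raddrs (and unpacks) are swapped before encoding;
 *                        the decoder then sees key_a < key_b and keeps FMIN.
 *
 * When a small immediate is involved, small_imm_a/small_imm_b must follow
 * their raddrs, which is why the signal bits are re-packed in that branch. */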
+
+static bool
+v3d33_qpu_mul_pack(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
+{
+ uint32_t mux_a = instr->alu.mul.a.mux;
+ uint32_t mux_b = instr->alu.mul.b.mux;
int nsrc = v3d_qpu_mul_op_num_src(instr->alu.mul.op);
const struct opcode_desc *desc =
- lookup_opcode_from_instr(devinfo, mul_ops, ARRAY_SIZE(mul_ops),
+ lookup_opcode_from_instr(devinfo, mul_ops_v33,
+ ARRAY_SIZE(mul_ops_v33),
instr->alu.mul.op);
if (!desc)
@@ -1220,10 +2091,10 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
* that here. If mux a/b determine packing, it will be set below.
*/
if (nsrc < 2)
- mux_b = ffs(desc->mux_b_mask) - 1;
+ mux_b = ffs(desc->mux.b_mask) - 1;
if (nsrc < 1)
- mux_a = ffs(desc->mux_a_mask) - 1;
+ mux_a = ffs(desc->mux.a_mask) - 1;
switch (instr->alu.mul.op) {
case V3D_QPU_M_FMUL: {
@@ -1238,13 +2109,13 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
*/
opcode += packed << 4;
- if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack,
&packed)) {
return false;
}
opcode |= packed << 2;
- if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b.unpack,
&packed)) {
return false;
}
@@ -1262,7 +2133,7 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
opcode |= (packed >> 1) & 1;
mux_b = (packed & 1) << 2;
- if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack,
&packed)) {
return false;
}
@@ -1276,22 +2147,28 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE)
return false;
- if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a_unpack,
+ if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a.unpack,
&packed)) {
return false;
}
- if (instr->alu.mul.a_unpack == V3D_QPU_UNPACK_SWAP_16)
+ if (instr->alu.mul.a.unpack == V3D_QPU_UNPACK_SWAP_16)
opcode = 8;
else
opcode |= (packed + 4) & 7;
- if (instr->alu.mul.b_unpack != V3D_QPU_UNPACK_NONE)
+ if (instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)
return false;
break;
}
default:
+ if (instr->alu.mul.op != V3D_QPU_M_NOP &&
+ (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE ||
+ instr->alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
+ instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)) {
+ return false;
+ }
break;
}
@@ -1307,6 +2184,150 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
}
static bool
+v3d71_qpu_mul_pack(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
+{
+ uint32_t raddr_c = instr->alu.mul.a.raddr;
+ uint32_t raddr_d = instr->alu.mul.b.raddr;
+ int nsrc = v3d_qpu_mul_op_num_src(instr->alu.mul.op);
+
+ const struct opcode_desc *desc =
+ lookup_opcode_from_instr(devinfo, mul_ops_v71,
+ ARRAY_SIZE(mul_ops_v71),
+ instr->alu.mul.op);
+ if (!desc)
+ return false;
+
+ uint32_t opcode = desc->opcode_first;
+
+ /* Some opcodes have a single valid value for their raddr_d, so set
+ * that here. If raddr_d determines packing, it will be set below.
+ */
+ if (nsrc < 2)
+ raddr_d = ffsll(desc->raddr_mask) - 1;
+
+ switch (instr->alu.mul.op) {
+ case V3D_QPU_M_FMUL: {
+ uint32_t packed;
+
+ if (!v3d_qpu_float32_pack_pack(instr->alu.mul.output_pack,
+ &packed)) {
+ return false;
+ }
+ /* No need for a +1 because desc->opcode_first has a 1 in this
+ * field.
+ */
+ opcode += packed << 4;
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack,
+ &packed)) {
+ return false;
+ }
+ opcode |= packed << 2;
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b.unpack,
+ &packed)) {
+ return false;
+ }
+ opcode |= packed << 0;
+ break;
+ }
+
+ case V3D_QPU_M_FMOV: {
+ uint32_t packed;
+
+ if (!v3d_qpu_float32_pack_pack(instr->alu.mul.output_pack,
+ &packed)) {
+ return false;
+ }
+ raddr_d |= packed;
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack,
+ &packed)) {
+ return false;
+ }
+ raddr_d |= packed << 2;
+ break;
+ }
+
+ case V3D_QPU_M_VFMUL: {
+ unreachable("pending v71 update");
+ uint32_t packed;
+
+ if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE)
+ return false;
+
+ if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a.unpack,
+ &packed)) {
+ return false;
+ }
+ if (instr->alu.mul.a.unpack == V3D_QPU_UNPACK_SWAP_16)
+ opcode = 8;
+ else
+ opcode |= (packed + 4) & 7;
+
+ if (instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)
+ return false;
+
+ break;
+ }
+
+ case V3D_QPU_M_MOV: {
+ uint32_t packed;
+
+ if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE)
+ return false;
+
+ if (!v3d_qpu_int32_unpack_pack(instr->alu.mul.a.unpack,
+ &packed)) {
+ return false;
+ }
+
+ raddr_d |= packed << 2;
+ break;
+ }
+
+ default:
+ if (instr->alu.mul.op != V3D_QPU_M_NOP &&
+ (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE ||
+ instr->alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
+ instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)) {
+ return false;
+ }
+ break;
+ }
+
+ *packed_instr |= QPU_SET_FIELD(raddr_c, V3D_QPU_RADDR_C);
+ *packed_instr |= QPU_SET_FIELD(raddr_d, V3D_QPU_RADDR_D);
+ *packed_instr |= QPU_SET_FIELD(opcode, V3D_QPU_OP_MUL);
+ *packed_instr |= QPU_SET_FIELD(instr->alu.mul.waddr, V3D_QPU_WADDR_M);
+ if (instr->alu.mul.magic_write)
+ *packed_instr |= V3D_QPU_MM;
+
+ return true;
+}
+
+static bool
+v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
+{
+ if (devinfo->ver < 71)
+ return v3d33_qpu_add_pack(devinfo, instr, packed_instr);
+ else
+ return v3d71_qpu_add_pack(devinfo, instr, packed_instr);
+}
+
+static bool
+v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
+{
+ if (devinfo->ver < 71)
+ return v3d33_qpu_mul_pack(devinfo, instr, packed_instr);
+ else
+ return v3d71_qpu_mul_pack(devinfo, instr, packed_instr);
+}
+
+static bool
v3d_qpu_instr_unpack_alu(const struct v3d_device_info *devinfo,
uint64_t packed_instr,
struct v3d_qpu_instr *instr)
@@ -1334,8 +2355,14 @@ v3d_qpu_instr_unpack_alu(const struct v3d_device_info *devinfo,
return false;
}
- instr->raddr_a = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_A);
- instr->raddr_b = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_B);
+ if (devinfo->ver <= 71) {
+ /*
+ * For v71 these are set during add/mul unpack, as the raddrs are now
+ * part of v3d_qpu_input.
+ */
+ instr->raddr_a = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_A);
+ instr->raddr_b = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_B);
+ }
if (!v3d_qpu_add_unpack(devinfo, packed_instr, instr))
return false;
@@ -1421,8 +2448,14 @@ v3d_qpu_instr_pack_alu(const struct v3d_device_info *devinfo,
*packed_instr |= QPU_SET_FIELD(sig, V3D_QPU_SIG);
if (instr->type == V3D_QPU_INSTR_TYPE_ALU) {
- *packed_instr |= QPU_SET_FIELD(instr->raddr_a, V3D_QPU_RADDR_A);
- *packed_instr |= QPU_SET_FIELD(instr->raddr_b, V3D_QPU_RADDR_B);
+ if (devinfo->ver < 71) {
+ /*
+ * For v71 these are set during add/mul pack, as the raddrs are now
+ * part of v3d_qpu_input.
+ */
+ *packed_instr |= QPU_SET_FIELD(instr->raddr_a, V3D_QPU_RADDR_A);
+ *packed_instr |= QPU_SET_FIELD(instr->raddr_b, V3D_QPU_RADDR_B);
+ }
if (!v3d_qpu_add_pack(devinfo, instr, packed_instr))
return false;
diff --git a/src/broadcom/qpu/tests/qpu_disasm.c b/src/broadcom/qpu/tests/qpu_disasm.c
index e6b1918b8f0..be7b78d5ef0 100644
--- a/src/broadcom/qpu/tests/qpu_disasm.c
+++ b/src/broadcom/qpu/tests/qpu_disasm.c
@@ -34,29 +34,29 @@ static const struct {
uint64_t inst;
const char *expected;
} tests[] = {
- { 33, 0x3d003186bb800000ull, "nop ; nop ; ldvary" },
- { 33, 0x3c20318105829000ull, "fadd r1, r1, r5 ; nop ; thrsw" },
- { 33, 0x3c403186bb81d000ull, "vpmsetup -, r5 ; nop ; ldunif" },
- { 33, 0x3f003186bb800000ull, "nop ; nop ; ldvpm" },
- { 33, 0x3c002380b6edb000ull, "or rf0, r3, r3 ; mov vpm, r3" },
- { 33, 0x57403006bbb80000ull, "nop ; fmul r0, rf0, r5 ; ldvpm; ldunif" },
- { 33, 0x9c094adef634b000ull, "ffloor.ifb rf30.l, r3; fmul.pushz rf43.l, r5, r1.h" },
- { 33, 0xb0044c56ba326840ull, "flpop rf22, rf33 ; fmul.pushz rf49.l, r4.h, r1.abs" },
+ { 33, 0x3d003186bb800000ull, "nop ; nop ; ldvary" },
+ { 33, 0x3c20318105829000ull, "fadd r1, r1, r5 ; nop ; thrsw" },
+ { 33, 0x3c403186bb81d000ull, "vpmsetup -, r5 ; nop ; ldunif" },
+ { 33, 0x3f003186bb800000ull, "nop ; nop ; ldvpm" },
+ { 33, 0x3c002380b6edb000ull, "or rf0, r3, r3 ; mov vpm, r3" },
+ { 33, 0x57403006bbb80000ull, "nop ; fmul r0, rf0, r5 ; ldvpm; ldunif" },
+ { 33, 0x9c094adef634b000ull, "ffloor.ifb rf30.l, r3 ; fmul.pushz rf43.l, r5, r1.h" },
+ { 33, 0xb0044c56ba326840ull, "flpop rf22, rf33 ; fmul.pushz rf49.l, r4.h, r1.abs" },
/* vfmul input packing */
- { 33, 0x101e8b6e8aad4000ull, "fmax.nornn rf46, r4.l, r2.l; vfmul.ifnb rf45, r3, r5" },
- { 33, 0x1857d3c219825000ull, "faddnf.norc r2.l, r5.l, r4; vfmul.ifb rf15, r0.ll, r4; ldunif" },
- { 33, 0x1c0a0dfde2294000ull, "fcmp.ifna rf61.h, r4.abs, r2.l; vfmul rf55, r2.hh, r1" },
- { 33, 0x2011c89b402cc000ull, "fsub.norz rf27, r4.abs, r1.abs; vfmul.ifa rf34, r3.swp, r1" },
+ { 33, 0x101e8b6e8aad4000ull, "fmax.nornn rf46, r4.l, r2.l ; vfmul.ifnb rf45, r3, r5" },
+ { 33, 0x1857d3c219825000ull, "faddnf.norc r2.l, r5.l, r4 ; vfmul.ifb rf15, r0.ll, r4 ; ldunif" },
+ { 33, 0x1c0a0dfde2294000ull, "fcmp.ifna rf61.h, r4.abs, r2.l; vfmul rf55, r2.hh, r1" },
+ { 33, 0x2011c89b402cc000ull, "fsub.norz rf27, r4.abs, r1.abs; vfmul.ifa rf34, r3.swp, r1" },
- { 33, 0xe01b42ab3bb063c0ull, "vfpack.andnc rf43, rf15.l, r0.h; fmul.ifna rf10.h, r4.l, r5.abs" },
- { 33, 0x600b8b87fb4d1000ull, "fdx.ifnb rf7.h, r1.l; fmul.pushn rf46, r3.l, r2.abs" },
+ { 33, 0xe01b42ab3bb063c0ull, "vfpack.andnc rf43, rf15.l, r0.h; fmul.ifna rf10.h, r4.l, r5.abs" },
+ { 33, 0x600b8b87fb4d1000ull, "fdx.ifnb rf7.h, r1.l ; fmul.pushn rf46, r3.l, r2.abs" },
/* small immediates */
- { 33, 0x5de24398bbdc6218ull, "vflb.andnn rf24 ; fmul rf14, -8, rf8.h" },
- { 33, 0x25ef83d8b166f00full, "vfmin.pushn rf24, 15.ff, r5; smul24.ifnb rf15, r1, r3" },
- { 33, 0xadedcdf70839f990ull, "faddnf.pushc rf55, -16.l, r3.abs; fmul.ifb rf55.l, rf38.l, r1.h" },
- { 33, 0x7dff89fa6a01f020ull, "fsub.nornc rf58.h, 0x3b800000.l, r3.l; fmul.ifnb rf39, r0.h, r0.h" },
+ { 33, 0x5de24398bbdc6218ull, "vflb.andnn rf24 ; fmul rf14, -8, rf8.h" },
+ { 33, 0x25ef83d8b166f00full, "vfmin.pushn rf24, 15.ff, r5 ; smul24.ifnb rf15, r1, r3" },
+ { 33, 0xadedcdf70839f990ull, "faddnf.pushc rf55, -16.l, r3.abs; fmul.ifb rf55.l, rf38.l, r1.h" },
+ { 33, 0x7dff89fa6a01f020ull, "fsub.nornc rf58.h, 0x3b800000.l, r3.l; fmul.ifnb rf39, r0.h, r0.h" },
/* branch conditions */
{ 33, 0x02000006002034c0ull, "b.anyap rf19" },
@@ -68,36 +68,36 @@ static const struct {
{ 33, 0x0200000300006000ull, "bu.na0 lri, a:unif" },
/* Special waddr names */
- { 33, 0x3c00318735808000ull, "vfpack tlb, r0, r1 ; nop" },
- { 33, 0xe0571c938e8d5000ull, "fmax.andc recip, r5.h, r2.l; fmul.ifb rf50.h, r3.l, r4.abs; ldunif" },
- { 33, 0xc04098d4382c9000ull, "add.pushn rsqrt, r1, r1; fmul rf35.h, r3.abs, r1.abs; ldunif" },
- { 33, 0x481edcd6b3184500ull, "vfmin.norn log, r4.hh, r0; fmul.ifnb rf51, rf20.abs, r0.l" },
- { 33, 0x041618d57c453000ull, "shl.andn exp, r3, r2; add.ifb rf35, r1, r2" },
- { 33, 0x7048e5da49272800ull, "fsub.ifa rf26, r2.l, rf32; fmul.pushc sin, r1.h, r1.abs; ldunif" },
+ { 33, 0x3c00318735808000ull, "vfpack tlb, r0, r1 ; nop" },
+ { 33, 0xe0571c938e8d5000ull, "fmax.andc recip, r5.h, r2.l ; fmul.ifb rf50.h, r3.l, r4.abs; ldunif" },
+ { 33, 0xc04098d4382c9000ull, "add.pushn rsqrt, r1, r1 ; fmul rf35.h, r3.abs, r1.abs ; ldunif" },
+ { 33, 0x481edcd6b3184500ull, "vfmin.norn log, r4.hh, r0 ; fmul.ifnb rf51, rf20.abs, r0.l" },
+ { 33, 0x041618d57c453000ull, "shl.andn exp, r3, r2 ; add.ifb rf35, r1, r2" },
+ { 33, 0x7048e5da49272800ull, "fsub.ifa rf26, r2.l, rf32 ; fmul.pushc sin, r1.h, r1.abs; ldunif" },
/* v4.1 signals */
- { 41, 0x1f010520cf60a000ull, "fcmp.andz rf32, r2.h, r1.h; vfmul rf20, r0.hh, r3; ldunifa" },
- { 41, 0x932045e6c16ea000ull, "fcmp rf38, r2.abs, r5; fmul rf23.l, r3, r3.abs; ldunifarf.rf1" },
- { 41, 0xd72f0434e43ae5c0ull, "fcmp rf52.h, rf23, r5.abs; fmul rf16.h, rf23, r1; ldunifarf.rf60" },
- { 41, 0xdb3048eb9d533780ull, "fmax rf43.l, r3.h, rf30; fmul rf35.h, r4, r2.l; ldunifarf.r1" },
- { 41, 0x733620471e6ce700ull, "faddnf rf7.l, rf28.h, r1.l; fmul r1, r3.h, r3.abs; ldunifarf.rsqrt2" },
- { 41, 0x9c094adef634b000ull, "ffloor.ifb rf30.l, r3; fmul.pushz rf43.l, r5, r1.h" },
+ { 41, 0x1f010520cf60a000ull, "fcmp.andz rf32, r2.h, r1.h ; vfmul rf20, r0.hh, r3 ; ldunifa" },
+ { 41, 0x932045e6c16ea000ull, "fcmp rf38, r2.abs, r5 ; fmul rf23.l, r3, r3.abs ; ldunifarf.rf1" },
+ { 41, 0xd72f0434e43ae5c0ull, "fcmp rf52.h, rf23, r5.abs ; fmul rf16.h, rf23, r1 ; ldunifarf.rf60" },
+ { 41, 0xdb3048eb9d533780ull, "fmax rf43.l, r3.h, rf30 ; fmul rf35.h, r4, r2.l ; ldunifarf.r1" },
+ { 41, 0x733620471e6ce700ull, "faddnf rf7.l, rf28.h, r1.l ; fmul r1, r3.h, r3.abs ; ldunifarf.rsqrt2" },
+ { 41, 0x9c094adef634b000ull, "ffloor.ifb rf30.l, r3 ; fmul.pushz rf43.l, r5, r1.h" },
/* v4.1 opcodes */
- { 41, 0x3de020c7bdfd200dull, "ldvpmg_in rf7, r2, r2; mov r3, 13" },
- { 41, 0x3de02040f8ff7201ull, "stvpmv 1, rf8 ; mov r1, 1" },
- { 41, 0xd8000e50bb2d3000ull, "sampid rf16 ; fmul rf57.h, r3, r1.l" },
+ { 41, 0x3de020c7bdfd200dull, "ldvpmg_in rf7, r2, r2 ; mov r3, 13" },
+ { 41, 0x3de02040f8ff7201ull, "stvpmv 1, rf8 ; mov r1, 1" },
+ { 41, 0xd8000e50bb2d3000ull, "sampid rf16 ; fmul rf57.h, r3, r1.l" },
/* v4.1 SFU instructions. */
- { 41, 0xe98d60c1ba2aef80ull, "recip rf1, rf62 ; fmul r3.h, r2.l, r1.l; ldunifrf.rf53" },
- { 41, 0x7d87c2debc51c000ull, "rsqrt rf30, r4 ; fmul rf11, r4.h, r2.h; ldunifrf.rf31" },
- { 41, 0xb182475abc2bb000ull, "rsqrt2 rf26, r3 ; fmul rf29.l, r2.h, r1.abs; ldunifrf.rf9" },
- { 41, 0x79880808bc0b6900ull, "sin rf8, rf36 ; fmul rf32, r2.h, r0.l; ldunifrf.rf32" },
- { 41, 0x04092094bc5a28c0ull, "exp.ifb rf20, r2 ; add r2, rf35, r2" },
- { 41, 0xe00648bfbc32a000ull, "log rf63, r2 ; fmul.andnn rf34.h, r4.l, r1.abs" },
+ { 41, 0xe98d60c1ba2aef80ull, "recip rf1, rf62 ; fmul r3.h, r2.l, r1.l ; ldunifrf.rf53" },
+ { 41, 0x7d87c2debc51c000ull, "rsqrt rf30, r4 ; fmul rf11, r4.h, r2.h ; ldunifrf.rf31" },
+ { 41, 0xb182475abc2bb000ull, "rsqrt2 rf26, r3 ; fmul rf29.l, r2.h, r1.abs ; ldunifrf.rf9" },
+ { 41, 0x79880808bc0b6900ull, "sin rf8, rf36 ; fmul rf32, r2.h, r0.l ; ldunifrf.rf32" },
+ { 41, 0x04092094bc5a28c0ull, "exp.ifb rf20, r2 ; add r2, rf35, r2" },
+ { 41, 0xe00648bfbc32a000ull, "log rf63, r2 ; fmul.andnn rf34.h, r4.l, r1.abs" },
/* v4.2 changes */
- { 42, 0x3c203192bb814000ull, "barrierid syncb ; nop ; thrsw" },
+ { 42, 0x3c203192bb814000ull, "barrierid syncb ; nop ; thrsw" },
};
static void
@@ -133,6 +133,8 @@ main(int argc, char **argv)
const char *disasm_output = v3d_qpu_disasm(&devinfo,
tests[i].inst);
+ printf("%s\n", disasm_output);
+
if (strcmp(disasm_output, tests[i].expected) != 0) {
printf("FAIL\n");
printf(" Expected: \"%s\"\n", tests[i].expected);
@@ -158,10 +160,10 @@ main(int argc, char **argv)
/* Swap the operands to be sure that we test
* how the QPUs distinguish between these ops.
*/
- swap_mux(&instr.alu.add.a,
- &instr.alu.add.b);
- swap_pack(&instr.alu.add.a_unpack,
- &instr.alu.add.b_unpack);
+ swap_mux(&instr.alu.add.a.mux,
+ &instr.alu.add.b.mux);
+ swap_pack(&instr.alu.add.a.unpack,
+ &instr.alu.add.b.unpack);
break;
default:
break;
diff --git a/src/broadcom/simulator/meson.build b/src/broadcom/simulator/meson.build
index 51f311bb094..0432fa0e52c 100644
--- a/src/broadcom/simulator/meson.build
+++ b/src/broadcom/simulator/meson.build
@@ -1,4 +1,4 @@
-# Copyright © 2019 Raspberry Pi
+# Copyright © 2019 Raspberry Pi Ltd
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -29,8 +29,8 @@ files_per_version = files(
)
v3d_args = []
-dep_v3dv3 = dependency('v3dv3', required: false)
-if dep_v3dv3.found()
+dep_v3d_hw = dependency('v3d_hw', required: false)
+if dep_v3d_hw.found()
v3d_args += '-DUSE_V3D_SIMULATOR'
endif
@@ -40,22 +40,22 @@ foreach ver : v3d_versions
'v3d-simulator-v' + ver,
[files_per_version, v3d_xml_pack],
include_directories : [
- inc_src, inc_include, inc_gallium_aux, inc_broadcom,
+ inc_src, inc_include, inc_broadcom,
],
c_args : [v3d_args, '-DV3D_VERSION=' + ver],
gnu_symbol_visibility: 'hidden',
- dependencies : [dep_v3dv3, dep_libdrm, dep_valgrind],
+ dependencies : [dep_v3d_hw, dep_libdrm, dep_valgrind],
)
endforeach
libbroadcom_simulator = static_library(
'broadcom_simulator',
[libbroadcom_simulator_files],
- include_directories : [inc_src, inc_include, inc_gallium, inc_gallium_aux],
+ include_directories : [inc_src, inc_include],
c_args : [v3d_args, no_override_init_args],
cpp_args : [v3d_args],
gnu_symbol_visibility : 'hidden',
- dependencies : [dep_v3dv3, dep_libdrm, dep_valgrind],
+ dependencies : [dep_v3d_hw, dep_libdrm, dep_valgrind],
link_with : [per_version_libs],
build_by_default : false,
)
diff --git a/src/broadcom/simulator/v3d_simulator.c b/src/broadcom/simulator/v3d_simulator.c
index 494e5bb4475..1d78d7205f1 100644
--- a/src/broadcom/simulator/v3d_simulator.c
+++ b/src/broadcom/simulator/v3d_simulator.c
@@ -54,32 +54,32 @@
#include "util/hash_table.h"
#include "util/ralloc.h"
#include "util/set.h"
+#include "util/simple_mtx.h"
#include "util/u_dynarray.h"
#include "util/u_memory.h"
#include "util/u_mm.h"
#include "util/u_math.h"
#include <xf86drm.h>
+#include "drm-uapi/amdgpu_drm.h"
#include "drm-uapi/i915_drm.h"
#include "drm-uapi/v3d_drm.h"
#include "v3d_simulator.h"
#include "v3d_simulator_wrapper.h"
+#include "broadcom/common/v3d_csd.h"
+
/** Global (across GEM fds) state for the simulator */
static struct v3d_simulator_state {
- mtx_t mutex;
+ simple_mtx_t mutex;
mtx_t submit_lock;
struct v3d_hw *v3d;
int ver;
- /* Base virtual address of the heap. */
- void *mem;
- /* Base hardware address of the heap. */
- uint32_t mem_base;
/* Size of the heap. */
- uint32_t mem_size;
+ uint64_t mem_size;
struct mem_block *heap;
struct mem_block *overflow;
@@ -90,10 +90,19 @@ static struct v3d_simulator_state {
/** Last performance monitor ID. */
uint32_t last_perfid;
+ /** Total performance counters */
+ uint32_t perfcnt_total;
+
struct util_dynarray bin_oom;
int refcount;
} sim_state = {
- .mutex = _MTX_INITIALIZER_NP,
+ .mutex = SIMPLE_MTX_INITIALIZER,
+};
+
+enum gem_type {
+ GEM_I915,
+ GEM_AMDGPU,
+ GEM_DUMB
};
/** Per-GEM-fd state for the simulator. */
@@ -109,10 +118,10 @@ struct v3d_simulator_file {
uint32_t active_perfid;
struct mem_block *gmp;
- void *gmp_vaddr;
+ uint64_t gmp_addr;
- /** Actual GEM fd is i915, so we should use their create ioctl. */
- bool is_i915;
+ /** For specific GPUs, use their create ioctl; otherwise use a dumb BO. */
+ enum gem_type gem_type;
};
/** Wrapper for drm_v3d_bo tracking the simulator-specific state. */
@@ -123,7 +132,7 @@ struct v3d_simulator_bo {
struct mem_block *block;
uint32_t size;
uint64_t mmap_offset;
- void *sim_vaddr;
+ uint64_t sim_addr;
void *gem_vaddr;
int handle;
@@ -184,7 +193,8 @@ set_gmp_flags(struct v3d_simulator_file *file,
assert((offset & ((1 << GMP_ALIGN2) - 1)) == 0);
int gmp_offset = offset >> GMP_ALIGN2;
int gmp_count = align(size, 1 << GMP_ALIGN2) >> GMP_ALIGN2;
- uint32_t *gmp = file->gmp_vaddr;
+ uint32_t *gmp = malloc((gmp_count + gmp_offset)*sizeof(uint32_t));
+ v3d_hw_read_mem(sim_state.v3d, gmp, file->gmp_addr, (gmp_offset + gmp_count)*sizeof(uint32_t));
assert(flag <= 0x3);
@@ -193,6 +203,9 @@ set_gmp_flags(struct v3d_simulator_file *file,
gmp[i / 16] &= ~(0x3 << bitshift);
gmp[i / 16] |= flag << bitshift;
}
+
+ v3d_hw_write_mem(sim_state.v3d, file->gmp_addr, gmp, (gmp_offset + gmp_count)*sizeof(uint32_t));
+ free(gmp);
}
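With the GMP table now living only in simulator memory, set_gmp_flags() becomes a read-modify-write: copy the affected words out with v3d_hw_read_mem(), patch them, and write them back. A sketch of the per-region update; the two-bits-per-region layout is inferred from the i / 16 indexing, and the bitshift computation itself sits outside this hunk:

/* Assumption: 16 two-bit protection fields per 32-bit GMP word, so region i
 * lives in word i / 16 at bit offset (i % 16) * 2. */
uint32_t word     = i / 16;
uint32_t bitshift = (i % 16) * 2;
gmp[word] = (gmp[word] & ~(0x3u << bitshift)) | (flag << bitshift);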
/**
@@ -203,26 +216,25 @@ static struct v3d_simulator_bo *
v3d_create_simulator_bo(int fd, unsigned size)
{
struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd);
+
+ simple_mtx_lock(&sim_state.mutex);
struct v3d_simulator_bo *sim_bo = rzalloc(file,
struct v3d_simulator_bo);
- size = align(size, 4096);
-
- sim_bo->file = file;
-
- mtx_lock(&sim_state.mutex);
sim_bo->block = u_mmAllocMem(sim_state.heap, size + 4, GMP_ALIGN2, 0);
- mtx_unlock(&sim_state.mutex);
+ simple_mtx_unlock(&sim_state.mutex);
assert(sim_bo->block);
-
+ size = align(size, 4096);
+ sim_bo->file = file;
set_gmp_flags(file, sim_bo->block->ofs, size, 0x3);
sim_bo->size = size;
/* Allocate space for the buffer in simulator memory. */
- sim_bo->sim_vaddr = sim_state.mem + sim_bo->block->ofs - sim_state.mem_base;
- memset(sim_bo->sim_vaddr, 0xd0, size);
+ sim_bo->sim_addr = sim_bo->block->ofs;
+ v3d_hw_set_mem(sim_state.v3d, sim_bo->sim_addr, 0xd0, size);
- *(uint32_t *)(sim_bo->sim_vaddr + sim_bo->size) = BO_SENTINEL;
+ uint32_t sentinel = BO_SENTINEL;
+ v3d_hw_write_mem(sim_state.v3d, sim_bo->sim_addr + sim_bo->size, &sentinel, sizeof(sentinel));
return sim_bo;
}
@@ -241,7 +253,9 @@ v3d_create_simulator_bo_for_gem(int fd, int handle, unsigned size)
* one.
*/
int ret;
- if (file->is_i915) {
+ switch (file->gem_type) {
+ case GEM_I915:
+ {
struct drm_i915_gem_mmap_gtt map = {
.handle = handle,
};
@@ -252,14 +266,26 @@ v3d_create_simulator_bo_for_gem(int fd, int handle, unsigned size)
*/
ret = drmIoctl(fd, DRM_IOCTL_I915_GEM_MMAP_GTT, &map);
sim_bo->mmap_offset = map.offset;
- } else {
+ break;
+ }
+ case GEM_AMDGPU:
+ {
+ union drm_amdgpu_gem_mmap map = { 0 };
+ map.in.handle = handle;
+
+ ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_MMAP, &map);
+ sim_bo->mmap_offset = map.out.addr_ptr;
+ break;
+ }
+ default:
+ {
struct drm_mode_map_dumb map = {
.handle = handle,
};
-
ret = drmIoctl(fd, DRM_IOCTL_MODE_MAP_DUMB, &map);
sim_bo->mmap_offset = map.offset;
}
+ }
if (ret) {
fprintf(stderr, "Failed to get MMAP offset: %d\n", ret);
abort();
@@ -278,10 +304,10 @@ v3d_create_simulator_bo_for_gem(int fd, int handle, unsigned size)
* don't need to go in the lookup table.
*/
if (handle != 0) {
- mtx_lock(&sim_state.mutex);
+ simple_mtx_lock(&sim_state.mutex);
_mesa_hash_table_insert(file->bo_map, int_to_key(handle),
sim_bo);
- mtx_unlock(&sim_state.mutex);
+ simple_mtx_unlock(&sim_state.mutex);
}
return sim_bo;
@@ -311,14 +337,14 @@ v3d_free_simulator_bo(struct v3d_simulator_bo *sim_bo)
if (sim_bo->gem_vaddr)
munmap(sim_bo->gem_vaddr, sim_bo->size);
- mtx_lock(&sim_state.mutex);
+ simple_mtx_lock(&sim_state.mutex);
u_mmFreeMem(sim_bo->block);
if (sim_bo->handle) {
_mesa_hash_table_remove_key(sim_file->bo_map,
int_to_key(sim_bo->handle));
}
- mtx_unlock(&sim_state.mutex);
ralloc_free(sim_bo);
+ simple_mtx_unlock(&sim_state.mutex);
}
static struct v3d_simulator_bo *
@@ -327,10 +353,10 @@ v3d_get_simulator_bo(struct v3d_simulator_file *file, int gem_handle)
if (gem_handle == 0)
return NULL;
- mtx_lock(&sim_state.mutex);
+ simple_mtx_lock(&sim_state.mutex);
struct hash_entry *entry =
_mesa_hash_table_search(file->bo_map, int_to_key(gem_handle));
- mtx_unlock(&sim_state.mutex);
+ simple_mtx_unlock(&sim_state.mutex);
return entry ? entry->data : NULL;
}
@@ -343,7 +369,7 @@ v3d_simulator_copy_in_handle(struct v3d_simulator_file *file, int handle)
if (!sim_bo)
return;
- memcpy(sim_bo->sim_vaddr, sim_bo->gem_vaddr, sim_bo->size);
+ v3d_hw_write_mem(sim_state.v3d, sim_bo->sim_addr, sim_bo->gem_vaddr, sim_bo->size);
}
static void
@@ -354,10 +380,11 @@ v3d_simulator_copy_out_handle(struct v3d_simulator_file *file, int handle)
if (!sim_bo)
return;
- memcpy(sim_bo->gem_vaddr, sim_bo->sim_vaddr, sim_bo->size);
+ v3d_hw_read_mem(sim_state.v3d, sim_bo->gem_vaddr, sim_bo->sim_addr, sim_bo->size);
- if (*(uint32_t *)(sim_bo->sim_vaddr +
- sim_bo->size) != BO_SENTINEL) {
+ uint32_t sentinel;
+ v3d_hw_read_mem(sim_state.v3d, &sentinel, sim_bo->sim_addr + sim_bo->size, sizeof(sentinel));
+ if (sentinel != BO_SENTINEL) {
fprintf(stderr, "Buffer overflow in handle %d\n",
handle);
}
@@ -395,10 +422,10 @@ v3d_get_simulator_perfmon(int fd, uint32_t perfid)
struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd);
- mtx_lock(&sim_state.mutex);
+ simple_mtx_lock(&sim_state.mutex);
assert(perfid <= file->perfmons_size);
struct v3d_simulator_perfmon *perfmon = file->perfmons[perfid - 1];
- mtx_unlock(&sim_state.mutex);
+ simple_mtx_unlock(&sim_state.mutex);
return perfmon;
}
@@ -414,20 +441,46 @@ v3d_simulator_perfmon_switch(int fd, uint32_t perfid)
perfmon = v3d_get_simulator_perfmon(fd, file->active_perfid);
if (perfmon)
- v3d41_simulator_perfmon_stop(sim_state.v3d,
- perfmon->ncounters,
- perfmon->values);
+ v3d_X_simulator(perfmon_stop)(sim_state.v3d,
+ perfmon->ncounters,
+ perfmon->values);
perfmon = v3d_get_simulator_perfmon(fd, perfid);
if (perfmon)
- v3d41_simulator_perfmon_start(sim_state.v3d,
- perfmon->ncounters,
- perfmon->counters);
+ v3d_X_simulator(perfmon_start)(sim_state.v3d,
+ perfmon->ncounters,
+ perfmon->counters);
file->active_perfid = perfid;
}
static int
+v3d_simulator_signal_syncobjs(int fd, struct drm_v3d_multi_sync *ms)
+{
+ struct drm_v3d_sem *out_syncs = (void *)(uintptr_t)ms->out_syncs;
+ int n_syncobjs = ms->out_sync_count;
+ uint32_t syncobjs[n_syncobjs];
+
+ for (int i = 0; i < n_syncobjs; i++)
+ syncobjs[i] = out_syncs[i].handle;
+ return drmSyncobjSignal(fd, (uint32_t *) &syncobjs, n_syncobjs);
+}
+
+static int
+v3d_simulator_process_post_deps(int fd, struct drm_v3d_extension *ext)
+{
+ int ret = 0;
+ while (ext && ext->id != DRM_V3D_EXT_ID_MULTI_SYNC)
+ ext = (void *)(uintptr_t) ext->next;
+
+ if (ext) {
+ struct drm_v3d_multi_sync *ms = (struct drm_v3d_multi_sync *) ext;
+ ret = v3d_simulator_signal_syncobjs(fd, ms);
+ }
+ return ret;
+}
+
+static int
v3d_simulator_submit_cl_ioctl(int fd, struct drm_v3d_submit_cl *submit)
{
struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd);
@@ -441,11 +494,7 @@ v3d_simulator_submit_cl_ioctl(int fd, struct drm_v3d_submit_cl *submit)
bin_fd = fd;
v3d_simulator_perfmon_switch(fd, submit->perfmon_id);
-
- if (sim_state.ver >= 41)
- v3d41_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs);
- else
- v3d33_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs);
+ v3d_X_simulator(submit_cl_ioctl)(sim_state.v3d, submit, file->gmp->ofs);
util_dynarray_foreach(&sim_state.bin_oom, struct v3d_simulator_bo *,
sim_bo) {
@@ -459,7 +508,12 @@ v3d_simulator_submit_cl_ioctl(int fd, struct drm_v3d_submit_cl *submit)
if (ret)
return ret;
- return 0;
+ if (submit->flags & DRM_V3D_SUBMIT_EXTENSION) {
+ struct drm_v3d_extension *ext = (void *)(uintptr_t)submit->extensions;
+ ret = v3d_simulator_process_post_deps(fd, ext);
+ }
+
+ return ret;
}
/**
@@ -488,14 +542,30 @@ v3d_simulator_create_bo_ioctl(int fd, struct drm_v3d_create_bo *args)
* native ioctl in case we're on a render node.
*/
int ret;
- if (file->is_i915) {
+ switch (file->gem_type) {
+ case GEM_I915:
+ {
struct drm_i915_gem_create create = {
.size = args->size,
};
+
ret = drmIoctl(fd, DRM_IOCTL_I915_GEM_CREATE, &create);
args->handle = create.handle;
- } else {
+ break;
+ }
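+ /* When the simulator runs on top of an amdgpu device, create a plain
+ * amdgpu GEM BO of the requested size so we get a GEM handle to track,
+ * mirroring what the i915 and dumb-buffer paths do.
+ */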
+ case GEM_AMDGPU:
+ {
+ union drm_amdgpu_gem_create create = { 0 };
+ create.in.bo_size = args->size;
+
+ ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_CREATE, &create);
+
+ args->handle = create.out.handle;
+ break;
+ }
+ default:
+ {
struct drm_mode_create_dumb create = {
.width = 128,
.bpp = 8,
@@ -507,7 +577,7 @@ v3d_simulator_create_bo_ioctl(int fd, struct drm_v3d_create_bo *args)
args->handle = create.handle;
}
-
+ }
if (ret == 0) {
struct v3d_simulator_bo *sim_bo =
v3d_create_simulator_bo_for_gem(fd, args->handle,
@@ -564,15 +634,6 @@ v3d_simulator_gem_close_ioctl(int fd, struct drm_gem_close *args)
}
static int
-v3d_simulator_get_param_ioctl(int fd, struct drm_v3d_get_param *args)
-{
- if (sim_state.ver >= 41)
- return v3d41_simulator_get_param_ioctl(sim_state.v3d, args);
- else
- return v3d33_simulator_get_param_ioctl(sim_state.v3d, args);
-}
-
-static int
v3d_simulator_submit_tfu_ioctl(int fd, struct drm_v3d_submit_tfu *args)
{
struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd);
@@ -583,13 +644,18 @@ v3d_simulator_submit_tfu_ioctl(int fd, struct drm_v3d_submit_tfu *args)
v3d_simulator_copy_in_handle(file, args->bo_handles[2]);
v3d_simulator_copy_in_handle(file, args->bo_handles[3]);
- if (sim_state.ver >= 41)
- ret = v3d41_simulator_submit_tfu_ioctl(sim_state.v3d, args);
- else
- ret = v3d33_simulator_submit_tfu_ioctl(sim_state.v3d, args);
+ ret = v3d_X_simulator(submit_tfu_ioctl)(sim_state.v3d, args);
v3d_simulator_copy_out_handle(file, args->bo_handles[0]);
+ if (ret)
+ return ret;
+
+ if (args->flags & DRM_V3D_SUBMIT_EXTENSION) {
+ struct drm_v3d_extension *ext = (void *)(uintptr_t)args->extensions;
+ ret = v3d_simulator_process_post_deps(fd, ext);
+ }
+
return ret;
}
@@ -605,15 +671,311 @@ v3d_simulator_submit_csd_ioctl(int fd, struct drm_v3d_submit_csd *args)
v3d_simulator_perfmon_switch(fd, args->perfmon_id);
- if (sim_state.ver >= 41)
- ret = v3d41_simulator_submit_csd_ioctl(sim_state.v3d, args,
- file->gmp->ofs);
- else
- ret = -1;
+ ret = v3d_X_simulator(submit_csd_ioctl)(sim_state.v3d, args,
+ file->gmp->ofs);
for (int i = 0; i < args->bo_handle_count; i++)
v3d_simulator_copy_out_handle(file, bo_handles[i]);
+ if (ret < 0)
+ return ret;
+
+ if (args->flags & DRM_V3D_SUBMIT_EXTENSION) {
+ struct drm_v3d_extension *ext = (void *)(uintptr_t)args->extensions;
+ ret = v3d_simulator_process_post_deps(fd, ext);
+ }
+
+ return ret;
+}
+
+static void
+v3d_rewrite_csd_job_wg_counts_from_indirect(int fd,
+ struct drm_v3d_extension *ext,
+ struct drm_v3d_submit_cpu *args)
+{
+ struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd);
+ struct drm_v3d_indirect_csd *indirect_csd = (struct drm_v3d_indirect_csd *) ext;
+ uint32_t *bo_handles = (uint32_t *)(uintptr_t)args->bo_handles;
+
+ assert(args->bo_handle_count == 1);
+ struct v3d_simulator_bo *bo = v3d_get_simulator_bo(file, bo_handles[0]);
+ struct v3d_simulator_bo *indirect = v3d_get_simulator_bo(file, indirect_csd->indirect);
+ struct drm_v3d_submit_csd *submit = &indirect_csd->submit;
+
+ uint32_t *wg_counts = (uint32_t *) (bo->gem_vaddr + indirect_csd->offset);
+
+ if (wg_counts[0] == 0 || wg_counts[1] == 0 || wg_counts[2] == 0)
+ return;
+
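+ /* Rebuild the CSD config from the workgroup counts read out of the
+ * indirect buffer: cfg[0..2] take the per-dimension counts and cfg[4]
+ * the total number of 16-invocation batches minus one
+ * (DIV_ROUND_UP(wg_size, 16) batches per workgroup times the number of
+ * workgroups).
+ */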
+ submit->cfg[0] = wg_counts[0] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
+ submit->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
+ submit->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
+ submit->cfg[4] = DIV_ROUND_UP(indirect_csd->wg_size, 16) *
+ (wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1;
+
+ for (int i = 0; i < 3; i++) {
+ /* 0xffffffff indicates that the uniform rewrite is not needed */
+ if (indirect_csd->wg_uniform_offsets[i] != 0xffffffff) {
+ uint32_t uniform_idx = indirect_csd->wg_uniform_offsets[i];
+ ((uint32_t *) indirect->gem_vaddr)[uniform_idx] = wg_counts[i];
+ }
+ }
+
+ v3d_simulator_submit_csd_ioctl(fd, submit);
+}
+
+static void
+v3d_timestamp_query(int fd,
+ struct drm_v3d_extension *ext,
+ struct drm_v3d_submit_cpu *args)
+{
+ struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd);
+ struct drm_v3d_timestamp_query *timestamp_query = (struct drm_v3d_timestamp_query *) ext;
+ uint32_t *bo_handles = (uint32_t *)(uintptr_t)args->bo_handles;
+ struct v3d_simulator_bo *bo = v3d_get_simulator_bo(file, bo_handles[0]);
+ uint32_t *offsets = (void *)(uintptr_t) timestamp_query->offsets;
+ uint32_t *syncs = (void *)(uintptr_t) timestamp_query->syncs;
+
+ struct timespec t;
+ clock_gettime(CLOCK_MONOTONIC, &t);
+
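+ /* Write the current CPU time (in nanoseconds) to the first query in the
+ * batch, zero the remaining slots, and signal all the associated syncobjs
+ * so waiters see the queries as available.
+ */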
+ for (uint32_t i = 0; i < timestamp_query->count; i++) {
+ uint64_t value = (i == 0) ? t.tv_sec * 1000000000ull + t.tv_nsec : 0ull;
+ v3d_hw_write_mem(sim_state.v3d, bo->sim_addr + offsets[i], &value, sizeof(value));
+ }
+
+ drmSyncobjSignal(fd, syncs, timestamp_query->count);
+}
+
+static void
+v3d_reset_timestamp_queries(int fd,
+ struct drm_v3d_extension *ext,
+ struct drm_v3d_submit_cpu *args)
+{
+ struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd);
+ struct drm_v3d_reset_timestamp_query *reset = (struct drm_v3d_reset_timestamp_query *) ext;
+ uint32_t *bo_handles = (uint32_t *)(uintptr_t)args->bo_handles;
+ struct v3d_simulator_bo *bo = v3d_get_simulator_bo(file, bo_handles[0]);
+ uint32_t *syncs = (void *)(uintptr_t) reset->syncs;
+
+ v3d_hw_set_mem(sim_state.v3d, bo->sim_addr + reset->offset, 0, reset->count);
+
+ drmSyncobjReset(fd, syncs, reset->count);
+}
+
+static void
+write_to_buffer(void *dst, uint32_t idx, bool do_64bit, uint64_t value)
+{
+ if (do_64bit) {
+ uint64_t *dst64 = (uint64_t *) dst;
+ dst64[idx] = value;
+ } else {
+ uint32_t *dst32 = (uint32_t *) dst;
+ dst32[idx] = (uint32_t) value;
+ }
+}
+
+static void
+v3d_copy_query_results(int fd,
+ struct drm_v3d_extension *ext,
+ struct drm_v3d_submit_cpu *args)
+{
+ struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd);
+ struct drm_v3d_copy_timestamp_query *copy = (struct drm_v3d_copy_timestamp_query *) ext;
+ uint32_t *bo_handles = (uint32_t *)(uintptr_t)args->bo_handles;
+ struct v3d_simulator_bo *bo = v3d_get_simulator_bo(file, bo_handles[0]);
+ struct v3d_simulator_bo *timestamp = v3d_get_simulator_bo(file, bo_handles[1]);
+ uint32_t *offsets = (void *)(uintptr_t) copy->offsets;
+ uint32_t *syncs = (void *)(uintptr_t) copy->syncs;
+ bool available, write_result;
+ uint8_t *data = malloc(copy->count * copy->stride);
+ uint64_t query_val;
+
+ uint8_t *p = data;
+ for (uint32_t i = 0; i < copy->count; i++) {
+ available = (drmSyncobjWait(fd, &syncs[i], 1, 0, 0, NULL) == 0);
+
+ write_result = available || copy->do_partial;
+ if (write_result) {
+ v3d_hw_read_mem(sim_state.v3d, &query_val, timestamp->sim_addr + offsets[i], sizeof(uint64_t));
+ write_to_buffer(p, 0, copy->do_64bit, query_val);
+ }
+
+ if (copy->availability_bit)
+ write_to_buffer(p, 1, copy->do_64bit, available ? 1u : 0u);
+
+ p += copy->stride;
+ }
+
+ v3d_hw_write_mem(sim_state.v3d, bo->sim_addr + copy->offset, data, copy->count * copy->stride);
+ free(data);
+}
+
+static void
+v3d_reset_performance_queries(int fd,
+ struct drm_v3d_extension *ext,
+ struct drm_v3d_submit_cpu *args)
+{
+ struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd);
+ struct drm_v3d_reset_performance_query *reset = (struct drm_v3d_reset_performance_query *) ext;
+ uint64_t *kperfmon_ids = (void *)(uintptr_t) reset->kperfmon_ids;
+ uint32_t *syncs = (void *)(uintptr_t) reset->syncs;
+ struct v3d_simulator_perfmon *perfmon;
+
+ for (uint32_t i = 0; i < reset->count; i++) {
+ uint32_t *ids = (void *)(uintptr_t) kperfmon_ids[i];
+
+ for (uint32_t j = 0; j < reset->nperfmons; j++) {
+ mtx_lock(&sim_state.submit_lock);
+
+ /* Stop the perfmon if it is still active */
+ if (ids[j] == file->active_perfid)
+ v3d_simulator_perfmon_switch(fd, 0);
+
+ mtx_unlock(&sim_state.submit_lock);
+
+ perfmon = v3d_get_simulator_perfmon(fd, ids[j]);
+
+ if (!perfmon)
+ return;
+
+ memset(perfmon->values, 0, perfmon->ncounters * sizeof(uint64_t));
+ }
+ }
+
+ drmSyncobjReset(fd, syncs, reset->count);
+}
+
+static void
+v3d_write_performance_query_result(int fd,
+ struct drm_v3d_copy_performance_query *copy,
+ uint32_t *kperfmon_ids,
+ void *data)
+{
+ struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd);
+ struct v3d_simulator_perfmon *perfmon;
+ uint64_t counter_values[sim_state.perfcnt_total];
+
+ for (uint32_t i = 0; i < copy->nperfmons; i++) {
+ mtx_lock(&sim_state.submit_lock);
+
+ /* Stop the perfmon if it is still active */
+ if (kperfmon_ids[i] == file->active_perfid)
+ v3d_simulator_perfmon_switch(fd, 0);
+
+ mtx_unlock(&sim_state.submit_lock);
+
+ perfmon = v3d_get_simulator_perfmon(fd, kperfmon_ids[i]);
+
+ if (!perfmon)
+ return;
+
+ memcpy(&counter_values[i * DRM_V3D_MAX_PERF_COUNTERS], perfmon->values,
+ perfmon->ncounters * sizeof(uint64_t));
+ }
+
+ for (uint32_t i = 0; i < copy->ncounters; i++)
+ write_to_buffer(data, i, copy->do_64bit, counter_values[i]);
+}
+
+static void
+v3d_copy_performance_query(int fd,
+ struct drm_v3d_extension *ext,
+ struct drm_v3d_submit_cpu *args)
+{
+ struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd);
+ struct drm_v3d_copy_performance_query *copy = (struct drm_v3d_copy_performance_query *) ext;
+ uint32_t *bo_handles = (uint32_t *)(uintptr_t)args->bo_handles;
+ struct v3d_simulator_bo *bo = v3d_get_simulator_bo(file, bo_handles[0]);
+ uint64_t *kperfmon_ids = (void *)(uintptr_t) copy->kperfmon_ids;
+ uint32_t *syncs = (void *)(uintptr_t) copy->syncs;
+ bool available, write_result;
+ uint8_t *data = malloc(copy->count * copy->stride);
+
+ uint8_t *p = data;
+ for (uint32_t i = 0; i < copy->count; i++) {
+ /* We don't implement in_syncs in the simulator, but we don't need to
+ * wait for the syncobjs to become available either: they are signaled
+ * by CL and CSD jobs, which the simulator serializes.
+ */
+ available = (drmSyncobjWait(fd, &syncs[i], 1, 0, 0, NULL) == 0);
+
+ write_result = available || copy->do_partial;
+ if (write_result) {
+ v3d_write_performance_query_result(fd, copy,
+ (void *)(uintptr_t) kperfmon_ids[i],
+ p);
+ }
+
+ if (copy->availability_bit) {
+ write_to_buffer(p, copy->ncounters, copy->do_64bit,
+ available ? 1u : 0u);
+ }
+
+ p += copy->stride;
+ }
+
+ v3d_hw_write_mem(sim_state.v3d, bo->sim_addr + copy->offset, data, copy->count * copy->stride);
+ free(data);
+}
+
+static int
+v3d_simulator_submit_cpu_ioctl(int fd, struct drm_v3d_submit_cpu *args)
+{
+ struct drm_v3d_extension *ext = (void *)(uintptr_t)args->extensions;
+ struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd);
+ uint32_t *bo_handles = (uint32_t *)(uintptr_t)args->bo_handles;
+ int ret = 0;
+
+ for (int i = 0; i < args->bo_handle_count; i++)
+ v3d_simulator_copy_in_handle(file, bo_handles[i]);
+
+ while (ext) {
+ switch (ext->id) {
+ case DRM_V3D_EXT_ID_MULTI_SYNC:
+ /* As the simulator serializes the jobs, we don't need
+ * to handle the in_syncs here. The out_syncs are handled
+ * at the end of the ioctl in v3d_simulator_process_post_deps().
+ */
+ break;
+ case DRM_V3D_EXT_ID_CPU_INDIRECT_CSD:
+ v3d_rewrite_csd_job_wg_counts_from_indirect(fd, ext, args);
+ break;
+ case DRM_V3D_EXT_ID_CPU_TIMESTAMP_QUERY:
+ v3d_timestamp_query(fd, ext, args);
+ break;
+ case DRM_V3D_EXT_ID_CPU_RESET_TIMESTAMP_QUERY:
+ v3d_reset_timestamp_queries(fd, ext, args);
+ break;
+ case DRM_V3D_EXT_ID_CPU_COPY_TIMESTAMP_QUERY:
+ v3d_copy_query_results(fd, ext, args);
+ break;
+ case DRM_V3D_EXT_ID_CPU_RESET_PERFORMANCE_QUERY:
+ v3d_reset_performance_queries(fd, ext, args);
+ break;
+ case DRM_V3D_EXT_ID_CPU_COPY_PERFORMANCE_QUERY:
+ v3d_copy_performance_query(fd, ext, args);
+ break;
+ default:
+ fprintf(stderr, "Unknown CPU job 0x%08x\n", (int)ext->id);
+ break;
+ }
+
+ ext = (void *)(uintptr_t) ext->next;
+ }
+
+ for (int i = 0; i < args->bo_handle_count; i++)
+ v3d_simulator_copy_out_handle(file, bo_handles[i]);
+
+ if (ret < 0)
+ return ret;
+
+ if (args->flags & DRM_V3D_SUBMIT_EXTENSION) {
+ ext = (void *)(uintptr_t)args->extensions;
+ ret = v3d_simulator_process_post_deps(fd, ext);
+ }
+
return ret;
}
@@ -631,7 +993,7 @@ v3d_simulator_perfmon_create_ioctl(int fd, struct drm_v3d_perfmon_create *args)
perfmon->ncounters = args->ncounters;
for (int i = 0; i < args->ncounters; i++) {
- if (args->counters[i] >= V3D_PERFCNT_NUM) {
+ if (args->counters[i] >= sim_state.perfcnt_total) {
ralloc_free(perfmon);
return -EINVAL;
} else {
@@ -639,10 +1001,10 @@ v3d_simulator_perfmon_create_ioctl(int fd, struct drm_v3d_perfmon_create *args)
}
}
- mtx_lock(&sim_state.mutex);
+ simple_mtx_lock(&sim_state.mutex);
args->id = perfmons_next_id(file);
file->perfmons[args->id - 1] = perfmon;
- mtx_unlock(&sim_state.mutex);
+ simple_mtx_unlock(&sim_state.mutex);
return 0;
}
@@ -657,9 +1019,9 @@ v3d_simulator_perfmon_destroy_ioctl(int fd, struct drm_v3d_perfmon_destroy *args
if (!perfmon)
return -EINVAL;
- mtx_lock(&sim_state.mutex);
+ simple_mtx_lock(&sim_state.mutex);
file->perfmons[args->id - 1] = NULL;
- mtx_unlock(&sim_state.mutex);
+ simple_mtx_unlock(&sim_state.mutex);
ralloc_free(perfmon);
@@ -712,7 +1074,7 @@ v3d_simulator_ioctl(int fd, unsigned long request, void *args)
return 0;
case DRM_IOCTL_V3D_GET_PARAM:
- return v3d_simulator_get_param_ioctl(fd, args);
+ return v3d_X_simulator(get_param_ioctl)(sim_state.v3d, args);
case DRM_IOCTL_GEM_CLOSE:
return v3d_simulator_gem_close_ioctl(fd, args);
@@ -723,6 +1085,9 @@ v3d_simulator_ioctl(int fd, unsigned long request, void *args)
case DRM_IOCTL_V3D_SUBMIT_CSD:
return v3d_simulator_submit_csd_ioctl(fd, args);
+ case DRM_IOCTL_V3D_SUBMIT_CPU:
+ return v3d_simulator_submit_cpu_ioctl(fd, args);
+
case DRM_IOCTL_V3D_PERFMON_CREATE:
return v3d_simulator_perfmon_create_ioctl(fd, args);
@@ -747,20 +1112,28 @@ v3d_simulator_get_mem_size(void)
return sim_state.mem_size;
}
+uint32_t
+v3d_simulator_get_mem_free(void)
+{
+ uint32_t total_free = 0;
+ struct mem_block *p;
+ for (p = sim_state.heap->next_free; p != sim_state.heap; p = p->next_free)
+ total_free += p->size;
+ return total_free;
+}
+
static void
v3d_simulator_init_global()
{
- mtx_lock(&sim_state.mutex);
+ simple_mtx_lock(&sim_state.mutex);
if (sim_state.refcount++) {
- mtx_unlock(&sim_state.mutex);
+ simple_mtx_unlock(&sim_state.mutex);
return;
}
sim_state.v3d = v3d_hw_auto_new(NULL);
v3d_hw_alloc_mem(sim_state.v3d, 1024 * 1024 * 1024);
- sim_state.mem_base =
- v3d_hw_get_mem(sim_state.v3d, &sim_state.mem_size,
- &sim_state.mem);
+ v3d_hw_get_mem(sim_state.v3d, &sim_state.mem_size);
/* Allocate from anywhere from 4096 up. We don't allocate at 0,
* because for OQs and some other addresses in the HW, 0 means
@@ -772,11 +1145,11 @@ v3d_simulator_init_global()
* and land there.
*/
struct mem_block *b = u_mmAllocMem(sim_state.heap, 4096, GMP_ALIGN2, 0);
- memset(sim_state.mem + b->ofs - sim_state.mem_base, 0xd0, 4096);
+ v3d_hw_set_mem(sim_state.v3d, b->ofs, 0xd0, 4096);
sim_state.ver = v3d_hw_get_version(sim_state.v3d);
- mtx_unlock(&sim_state.mutex);
+ simple_mtx_unlock(&sim_state.mutex);
sim_state.fd_map =
_mesa_hash_table_create(NULL,
@@ -785,10 +1158,8 @@ v3d_simulator_init_global()
util_dynarray_init(&sim_state.bin_oom, NULL);
- if (sim_state.ver >= 41)
- v3d41_simulator_init_regs(sim_state.v3d);
- else
- v3d33_simulator_init_regs(sim_state.v3d);
+ v3d_X_simulator(init_regs)(sim_state.v3d);
+ v3d_X_simulator(get_perfcnt_total)(&sim_state.perfcnt_total);
}
struct v3d_simulator_file *
@@ -800,7 +1171,11 @@ v3d_simulator_init(int fd)
drmVersionPtr version = drmGetVersion(fd);
if (version && strncmp(version->name, "i915", version->name_len) == 0)
- sim_file->is_i915 = true;
+ sim_file->gem_type = GEM_I915;
+ else if (version && strncmp(version->name, "amdgpu", version->name_len) == 0)
+ sim_file->gem_type = GEM_AMDGPU;
+ else
+ sim_file->gem_type = GEM_DUMB;
drmFreeVersion(version);
sim_file->bo_map =
@@ -808,15 +1183,14 @@ v3d_simulator_init(int fd)
_mesa_hash_pointer,
_mesa_key_pointer_equal);
- mtx_lock(&sim_state.mutex);
+ simple_mtx_lock(&sim_state.mutex);
_mesa_hash_table_insert(sim_state.fd_map, int_to_key(fd + 1),
sim_file);
- mtx_unlock(&sim_state.mutex);
+ simple_mtx_unlock(&sim_state.mutex);
sim_file->gmp = u_mmAllocMem(sim_state.heap, 8096, GMP_ALIGN2, 0);
- sim_file->gmp_vaddr = (sim_state.mem + sim_file->gmp->ofs -
- sim_state.mem_base);
- memset(sim_file->gmp_vaddr, 0, 8096);
+ sim_file->gmp_addr = sim_file->gmp->ofs;
+ v3d_hw_set_mem(sim_state.v3d, sim_file->gmp_addr, 0, 8096);
return sim_file;
}
@@ -824,16 +1198,16 @@ v3d_simulator_init(int fd)
void
v3d_simulator_destroy(struct v3d_simulator_file *sim_file)
{
- mtx_lock(&sim_state.mutex);
+ simple_mtx_lock(&sim_state.mutex);
if (!--sim_state.refcount) {
_mesa_hash_table_destroy(sim_state.fd_map, NULL);
util_dynarray_fini(&sim_state.bin_oom);
u_mmDestroy(sim_state.heap);
- /* No memsetting the struct, because it contains the mutex. */
- sim_state.mem = NULL;
+ /* No memsetting the sim_state struct, because it contains the
+ * mutex. */
}
- mtx_unlock(&sim_state.mutex);
ralloc_free(sim_file);
+ simple_mtx_unlock(&sim_state.mutex);
}
#endif /* USE_V3D_SIMULATOR */
diff --git a/src/broadcom/simulator/v3d_simulator.h b/src/broadcom/simulator/v3d_simulator.h
index ef6bf44f19f..03575ae8951 100644
--- a/src/broadcom/simulator/v3d_simulator.h
+++ b/src/broadcom/simulator/v3d_simulator.h
@@ -1,5 +1,5 @@
/*
- * Copyright © 2019 Raspberry Pi
+ * Copyright © 2019 Raspberry Pi Ltd
* Copyright © 2014-2017 Broadcom
* Copyright (C) 2012 Rob Clark <robclark@freedesktop.org>
*
@@ -40,17 +40,35 @@ uint32_t v3d_simulator_get_spill(uint32_t spill_size);
int v3d_simulator_ioctl(int fd, unsigned long request, void *arg);
void v3d_simulator_open_from_handle(int fd, int handle, uint32_t size);
uint32_t v3d_simulator_get_mem_size(void);
+uint32_t v3d_simulator_get_mem_free(void);
#ifdef v3dX
# include "v3dx_simulator.h"
#else
-# define v3dX(x) v3d33_##x
+# define v3dX(x) v3d42_##x
# include "v3dx_simulator.h"
# undef v3dX
-# define v3dX(x) v3d41_##x
+# define v3dX(x) v3d71_##x
# include "v3dx_simulator.h"
# undef v3dX
+
#endif
+/* Helper to call simulator ver specific functions */
+#define v3d_X_simulator(thing) ({ \
+ __typeof(&v3d42_simulator_##thing) v3d_X_sim_thing;\
+ switch (sim_state.ver) { \
+ case 42: \
+ v3d_X_sim_thing = &v3d42_simulator_##thing; \
+ break; \
+ case 71: \
+ v3d_X_sim_thing = &v3d71_simulator_##thing; \
+ break; \
+ default: \
+ unreachable("Unsupported hardware generation"); \
+ } \
+ v3d_X_sim_thing; \
+})
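+
+/* Usage sketch, matching the call sites in v3d_simulator.c:
+ *
+ *    v3d_X_simulator(init_regs)(sim_state.v3d);
+ *
+ * resolves to v3d42_simulator_init_regs() or v3d71_simulator_init_regs()
+ * depending on sim_state.ver.
+ */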
+
#endif
diff --git a/src/broadcom/simulator/v3d_simulator_wrapper.cpp b/src/broadcom/simulator/v3d_simulator_wrapper.cpp
index 88e439255d3..ef9bec492ee 100644
--- a/src/broadcom/simulator/v3d_simulator_wrapper.cpp
+++ b/src/broadcom/simulator/v3d_simulator_wrapper.cpp
@@ -30,12 +30,6 @@
#ifdef USE_V3D_SIMULATOR
#include "v3d_simulator_wrapper.h"
-
-#define V3D_TECH_VERSION 3
-#define V3D_REVISION 3
-#define V3D_SUB_REV 0
-#define V3D_HIDDEN_REV 0
-#define V3D_COMPAT_REV 0
#include "v3d_hw_auto.h"
extern "C" {
@@ -45,13 +39,29 @@ struct v3d_hw *v3d_hw_auto_new(void *in_params)
return v3d_hw_auto_make_unique().release();
}
+uint64_t v3d_hw_get_mem(const struct v3d_hw *hw, uint64_t *size)
+{
+ uint64_t addr = 0;
+ /* Keep the call out of assert() so it isn't compiled out with NDEBUG */
+ bool ok = hw->get_mem(&addr, size);
+ assert(ok);
+ (void)ok;
+ return addr;
+}
+
+void v3d_hw_set_mem(struct v3d_hw *hw, uint64_t addr, uint8_t value, uint64_t size)
+{
+ hw->set_mem(addr, value, size);
+}
+
+void v3d_hw_write_mem(struct v3d_hw *hw, uint64_t addr, const void *p, uint64_t size)
+{
+ hw->write_mem(addr, p, size);
+}
-uint32_t v3d_hw_get_mem(const struct v3d_hw *hw, uint32_t *size, void **p)
+void v3d_hw_read_mem(struct v3d_hw *hw, void *p, uint64_t addr, uint64_t size)
{
- return hw->get_mem(size, p);
+ hw->read_mem(p, addr, size);
}
-bool v3d_hw_alloc_mem(struct v3d_hw *hw, size_t min_size)
+bool v3d_hw_alloc_mem(struct v3d_hw *hw, uint64_t min_size)
{
return hw->alloc_mem(min_size) == V3D_HW_ALLOC_SUCCESS;
}
diff --git a/src/broadcom/simulator/v3d_simulator_wrapper.h b/src/broadcom/simulator/v3d_simulator_wrapper.h
index 05b2a3361ac..7f2be57a3be 100644
--- a/src/broadcom/simulator/v3d_simulator_wrapper.h
+++ b/src/broadcom/simulator/v3d_simulator_wrapper.h
@@ -31,8 +31,11 @@ extern "C" {
#endif
struct v3d_hw *v3d_hw_auto_new(void *params);
-uint32_t v3d_hw_get_mem(const struct v3d_hw *hw, uint32_t *size, void **p);
-bool v3d_hw_alloc_mem(struct v3d_hw *hw, size_t min_size);
+uint64_t v3d_hw_get_mem(const struct v3d_hw *hw, uint64_t *size);
+void v3d_hw_set_mem(struct v3d_hw *hw, uint64_t addr, uint8_t value, uint64_t size);
+void v3d_hw_write_mem(struct v3d_hw *hw, uint64_t addr, const void *p, uint64_t size);
+void v3d_hw_read_mem(struct v3d_hw *hw, void *p, uint64_t addr, uint64_t size);
+bool v3d_hw_alloc_mem(struct v3d_hw *hw, uint64_t min_size);
uint32_t v3d_hw_read_reg(struct v3d_hw *hw, uint32_t reg);
void v3d_hw_write_reg(struct v3d_hw *hw, uint32_t reg, uint32_t val);
void v3d_hw_tick(struct v3d_hw *hw);
diff --git a/src/broadcom/simulator/v3dx_simulator.c b/src/broadcom/simulator/v3dx_simulator.c
index 07bbbe2f8c9..ea682955dca 100644
--- a/src/broadcom/simulator/v3dx_simulator.c
+++ b/src/broadcom/simulator/v3dx_simulator.c
@@ -40,32 +40,25 @@
#include "v3d_simulator.h"
#include "v3d_simulator_wrapper.h"
+#include "common/v3d_performance_counters.h"
+
#include "util/macros.h"
#include "util/bitscan.h"
#include "drm-uapi/v3d_drm.h"
#define HW_REGISTER_RO(x) (x)
#define HW_REGISTER_RW(x) (x)
-#if V3D_VERSION >= 41
-#include "libs/core/v3d/registers/4.1.35.0/v3d.h"
+#if V3D_VERSION == 71
+#include "libs/core/v3d/registers/7.1.7.0/v3d.h"
#else
-#include "libs/core/v3d/registers/3.3.0.0/v3d.h"
+#if V3D_VERSION == 42
+#include "libs/core/v3d/registers/4.2.14.0/v3d.h"
+#endif
#endif
#define V3D_WRITE(reg, val) v3d_hw_write_reg(v3d, reg, val)
#define V3D_READ(reg) v3d_hw_read_reg(v3d, reg)
-static void
-v3d_invalidate_l3(struct v3d_hw *v3d)
-{
-#if V3D_VERSION < 40
- uint32_t gca_ctrl = V3D_READ(V3D_GCA_CACHE_CTRL);
-
- V3D_WRITE(V3D_GCA_CACHE_CTRL, gca_ctrl | V3D_GCA_CACHE_CTRL_FLUSH_SET);
- V3D_WRITE(V3D_GCA_CACHE_CTRL, gca_ctrl & ~V3D_GCA_CACHE_CTRL_FLUSH_SET);
-#endif
-}
-
/* Invalidates the L2C cache. This is a read-only cache for uniforms and instructions. */
static void
v3d_invalidate_l2c(struct v3d_hw *v3d)
@@ -150,7 +143,6 @@ v3d_invalidate_slices(struct v3d_hw *v3d)
static void
v3d_invalidate_caches(struct v3d_hw *v3d)
{
- v3d_invalidate_l3(v3d);
v3d_invalidate_l2c(v3d);
v3d_invalidate_l2t(v3d);
v3d_invalidate_slices(v3d);
@@ -178,38 +170,48 @@ v3d_flush_caches(struct v3d_hw *v3d)
v3d_flush_l2t(v3d);
}
+#if V3D_VERSION < 71
+#define TFU_REG(NAME) V3D_TFU_ ## NAME
+#else
+#define TFU_REG(NAME) V3D_IFC_ ## NAME
+#endif
+
+
int
v3dX(simulator_submit_tfu_ioctl)(struct v3d_hw *v3d,
struct drm_v3d_submit_tfu *args)
{
- int last_vtct = V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET;
-
- V3D_WRITE(V3D_TFU_IIA, args->iia);
- V3D_WRITE(V3D_TFU_IIS, args->iis);
- V3D_WRITE(V3D_TFU_ICA, args->ica);
- V3D_WRITE(V3D_TFU_IUA, args->iua);
- V3D_WRITE(V3D_TFU_IOA, args->ioa);
- V3D_WRITE(V3D_TFU_IOS, args->ios);
- V3D_WRITE(V3D_TFU_COEF0, args->coef[0]);
- V3D_WRITE(V3D_TFU_COEF1, args->coef[1]);
- V3D_WRITE(V3D_TFU_COEF2, args->coef[2]);
- V3D_WRITE(V3D_TFU_COEF3, args->coef[3]);
-
- V3D_WRITE(V3D_TFU_ICFG, args->icfg);
-
- while ((V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET) == last_vtct) {
+ int last_vtct = V3D_READ(TFU_REG(CS)) & TFU_REG(CS_CVTCT_SET);
+
+ V3D_WRITE(TFU_REG(IIA), args->iia);
+ V3D_WRITE(TFU_REG(IIS), args->iis);
+ V3D_WRITE(TFU_REG(ICA), args->ica);
+ V3D_WRITE(TFU_REG(IUA), args->iua);
+ V3D_WRITE(TFU_REG(IOA), args->ioa);
+#if V3D_VERSION >= 71
+ V3D_WRITE(TFU_REG(IOC), args->v71.ioc);
+#endif
+ V3D_WRITE(TFU_REG(IOS), args->ios);
+ V3D_WRITE(TFU_REG(COEF0), args->coef[0]);
+ V3D_WRITE(TFU_REG(COEF1), args->coef[1]);
+ V3D_WRITE(TFU_REG(COEF2), args->coef[2]);
+ V3D_WRITE(TFU_REG(COEF3), args->coef[3]);
+
+ V3D_WRITE(TFU_REG(ICFG), args->icfg);
+
+ while ((V3D_READ(TFU_REG(CS)) & TFU_REG(CS_CVTCT_SET)) == last_vtct) {
v3d_hw_tick(v3d);
}
return 0;
}
-#if V3D_VERSION >= 41
int
v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d,
struct drm_v3d_submit_csd *args,
uint32_t gmp_ofs)
{
+#if V3D_VERSION >= 42
int last_completed_jobs = (V3D_READ(V3D_CSD_0_STATUS) &
V3D_CSD_0_STATUS_NUM_COMPLETED_JOBS_SET);
g_gmp_ofs = gmp_ofs;
@@ -223,6 +225,9 @@ v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d,
V3D_WRITE(V3D_CSD_0_QUEUED_CFG4, args->cfg[4]);
V3D_WRITE(V3D_CSD_0_QUEUED_CFG5, args->cfg[5]);
V3D_WRITE(V3D_CSD_0_QUEUED_CFG6, args->cfg[6]);
+#if V3D_VERSION >= 71
+ V3D_WRITE(V3D_CSD_0_QUEUED_CFG7, 0);
+#endif
/* CFG0 kicks off the job */
V3D_WRITE(V3D_CSD_0_QUEUED_CFG0, args->cfg[0]);
@@ -239,15 +244,21 @@ v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d,
v3d_flush_caches(v3d);
return 0;
-}
+#else
+ return -1;
#endif
+}
int
v3dX(simulator_get_param_ioctl)(struct v3d_hw *v3d,
struct drm_v3d_get_param *args)
{
static const uint32_t reg_map[] = {
+#if V3D_VERSION >= 71
+ [DRM_V3D_PARAM_V3D_UIFCFG] = V3D_HUB_CTL_IDENT0,
+#else
[DRM_V3D_PARAM_V3D_UIFCFG] = V3D_HUB_CTL_UIFCFG,
+#endif
[DRM_V3D_PARAM_V3D_HUB_IDENT1] = V3D_HUB_CTL_IDENT1,
[DRM_V3D_PARAM_V3D_HUB_IDENT2] = V3D_HUB_CTL_IDENT2,
[DRM_V3D_PARAM_V3D_HUB_IDENT3] = V3D_HUB_CTL_IDENT3,
@@ -261,14 +272,20 @@ v3dX(simulator_get_param_ioctl)(struct v3d_hw *v3d,
args->value = 1;
return 0;
case DRM_V3D_PARAM_SUPPORTS_CSD:
- args->value = V3D_VERSION >= 41;
+ args->value = V3D_VERSION >= 42;
return 0;
case DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH:
args->value = 1;
return 0;
case DRM_V3D_PARAM_SUPPORTS_PERFMON:
- args->value = V3D_VERSION >= 41;
+ args->value = V3D_VERSION >= 42;
return 0;
+ case DRM_V3D_PARAM_SUPPORTS_MULTISYNC_EXT:
+ args->value = 1;
+ return 0;
+ case DRM_V3D_PARAM_SUPPORTS_CPU_QUEUE:
+ args->value = 1;
+ return 0;
}
if (args->param < ARRAY_SIZE(reg_map) && reg_map[args->param]) {
@@ -307,16 +324,17 @@ v3d_isr_core(struct v3d_hw *v3d,
return;
}
+#if V3D_VERSION <= 42
if (core_status & V3D_CTL_0_INT_STS_INT_GMPV_SET) {
fprintf(stderr, "GMP violation at 0x%08x\n",
V3D_READ(V3D_GMP_VIO_ADDR));
- abort();
} else {
fprintf(stderr,
"Unexpected ISR with core status 0x%08x\n",
core_status);
}
abort();
+#endif
}
static void
@@ -331,11 +349,10 @@ handle_mmu_interruptions(struct v3d_hw *v3d,
return;
const char *client = "?";
- uint32_t axi_id = V3D_READ(V3D_MMU_VIO_ID);
+ uint32_t axi_id = V3D_READ(V3D_MMU0_VIO_ID);
uint32_t va_width = 30;
-#if V3D_VERSION >= 41
- static const char *const v3d41_axi_ids[] = {
+ static const char *const v3d42_axi_ids[] = {
"L2T",
"PTB",
"PSE",
@@ -347,21 +364,21 @@ handle_mmu_interruptions(struct v3d_hw *v3d,
};
axi_id = axi_id >> 5;
- if (axi_id < ARRAY_SIZE(v3d41_axi_ids))
- client = v3d41_axi_ids[axi_id];
+ if (axi_id < ARRAY_SIZE(v3d42_axi_ids))
+ client = v3d42_axi_ids[axi_id];
- uint32_t mmu_debug = V3D_READ(V3D_MMU_DEBUG_INFO);
+ uint32_t mmu_debug = V3D_READ(V3D_MMU0_DEBUG_INFO);
+
+ va_width += ((mmu_debug & V3D_MMU0_DEBUG_INFO_VA_WIDTH_SET)
+ >> V3D_MMU0_DEBUG_INFO_VA_WIDTH_LSB);
- va_width += ((mmu_debug & V3D_MMU_DEBUG_INFO_VA_WIDTH_SET)
- >> V3D_MMU_DEBUG_INFO_VA_WIDTH_LSB);
-#endif
/* Only the top bits (final number depends on the gen) of the virtual
* address are reported in the MMU VIO_ADDR register.
*/
- uint64_t vio_addr = ((uint64_t)V3D_READ(V3D_MMU_VIO_ADDR) <<
+ uint64_t vio_addr = ((uint64_t)V3D_READ(V3D_MMU0_VIO_ADDR) <<
(va_width - 32));
- /* Difference with the kernal: here were are going to abort after
+ /* Difference with the kernel: here we are going to abort after
* logging, so we don't bother with some stuff that the kernel does,
* like restoring the MMU ctrl bits
*/
@@ -393,6 +410,18 @@ v3d_isr_hub(struct v3d_hw *v3d)
}
handle_mmu_interruptions(v3d, hub_status);
+
+#if V3D_VERSION == 71
+ if (hub_status & V3D_HUB_CTL_INT_STS_INT_GMPV_SET) {
+ fprintf(stderr, "GMP violation at 0x%08x\n",
+ V3D_READ(V3D_GMP_VIO_ADDR));
+ } else {
+ fprintf(stderr,
+ "Unexpected ISR with status 0x%08x\n",
+ hub_status);
+ }
+ abort();
+#endif
}
static void
@@ -417,24 +446,15 @@ v3d_isr(uint32_t hub_status)
void
v3dX(simulator_init_regs)(struct v3d_hw *v3d)
{
-#if V3D_VERSION == 33
- /* Set OVRTMUOUT to match kernel behavior.
- *
- * This means that the texture sampler uniform configuration's tmu
- * output type field is used, instead of using the hardware default
- * behavior based on the texture type. If you want the default
- * behavior, you can still put "2" in the indirect texture state's
- * output_type field.
- */
- V3D_WRITE(V3D_CTL_0_MISCCFG, V3D_CTL_1_MISCCFG_OVRTMUOUT_SET);
-#endif
-
/* FIXME: the kernel captures some additional core interrupts here,
* for tracing. Perhaps we should evaluate to do the same here and add
* some debug options.
*/
- uint32_t core_interrupts = (V3D_CTL_0_INT_STS_INT_GMPV_SET |
- V3D_CTL_0_INT_STS_INT_OUTOMEM_SET);
+ uint32_t core_interrupts = V3D_CTL_0_INT_STS_INT_OUTOMEM_SET;
+#if V3D_VERSION <= 42
+ core_interrupts |= V3D_CTL_0_INT_STS_INT_GMPV_SET;
+#endif
+
V3D_WRITE(V3D_CTL_0_INT_MSK_SET, ~core_interrupts);
V3D_WRITE(V3D_CTL_0_INT_MSK_CLR, core_interrupts);
@@ -444,6 +464,9 @@ v3dX(simulator_init_regs)(struct v3d_hw *v3d)
V3D_HUB_CTL_INT_STS_INT_MMU_CAP_SET | /* CAP exceeded */
V3D_HUB_CTL_INT_STS_INT_TFUC_SET); /* TFU conversion */
+#if V3D_VERSION == 71
+ hub_interrupts |= V3D_HUB_CTL_INT_STS_INT_GMPV_SET;
+#endif
V3D_WRITE(V3D_HUB_CTL_INT_MSK_SET, ~hub_interrupts);
V3D_WRITE(V3D_HUB_CTL_INT_MSK_CLR, hub_interrupts);
@@ -471,13 +494,11 @@ v3dX(simulator_submit_cl_ioctl)(struct v3d_hw *v3d,
V3D_WRITE(V3D_CLE_0_CT0QMA, submit->qma);
V3D_WRITE(V3D_CLE_0_CT0QMS, submit->qms);
}
-#if V3D_VERSION >= 41
if (submit->qts) {
V3D_WRITE(V3D_CLE_0_CT0QTS,
V3D_CLE_0_CT0QTS_CTQTSEN_SET |
submit->qts);
}
-#endif
V3D_WRITE(V3D_CLE_0_CT0QBA, submit->bcl_start);
V3D_WRITE(V3D_CLE_0_CT0QEA, submit->bcl_end);
@@ -501,20 +522,18 @@ v3dX(simulator_submit_cl_ioctl)(struct v3d_hw *v3d,
}
}
-#if V3D_VERSION >= 41
#define V3D_PCTR_0_PCTR_N(x) (V3D_PCTR_0_PCTR0 + 4 * (x))
#define V3D_PCTR_0_SRC_N(x) (V3D_PCTR_0_SRC_0_3 + 4 * (x))
#define V3D_PCTR_0_SRC_N_SHIFT(x) ((x) * 8)
#define V3D_PCTR_0_SRC_N_MASK(x) (BITFIELD_RANGE(V3D_PCTR_0_SRC_N_SHIFT(x), \
- V3D_PCTR_0_SRC_N_SHIFT(x) + 6))
-#endif
+ V3D_PCTR_0_SRC_N_SHIFT(x) + \
+ V3D_PCTR_0_SRC_0_3_PCTRS0_MSB))
void
v3dX(simulator_perfmon_start)(struct v3d_hw *v3d,
uint32_t ncounters,
uint8_t *events)
{
-#if V3D_VERSION >= 41
int i, j;
uint32_t source;
uint32_t mask = BITFIELD_RANGE(0, ncounters);
@@ -529,21 +548,23 @@ v3dX(simulator_perfmon_start)(struct v3d_hw *v3d,
V3D_WRITE(V3D_PCTR_0_CLR, mask);
V3D_WRITE(V3D_PCTR_0_OVERFLOW, mask);
V3D_WRITE(V3D_PCTR_0_EN, mask);
-#endif
}
void v3dX(simulator_perfmon_stop)(struct v3d_hw *v3d,
uint32_t ncounters,
uint64_t *values)
{
-#if V3D_VERSION >= 41
int i;
for (i = 0; i < ncounters; i++)
values[i] += V3D_READ(V3D_PCTR_0_PCTR_N(i));
V3D_WRITE(V3D_PCTR_0_EN, 0);
-#endif
+}
+
+void v3dX(simulator_get_perfcnt_total)(uint32_t *count)
+{
+ *count = ARRAY_SIZE(v3d_performance_counters);
}
#endif /* USE_V3D_SIMULATOR */
diff --git a/src/broadcom/simulator/v3dx_simulator.h b/src/broadcom/simulator/v3dx_simulator.h
index 145ae59c21e..51fc2409d3e 100644
--- a/src/broadcom/simulator/v3dx_simulator.h
+++ b/src/broadcom/simulator/v3dx_simulator.h
@@ -1,5 +1,5 @@
/*
- * Copyright © 2019 Raspberry Pi
+ * Copyright © 2019 Raspberry Pi Ltd
* Copyright © 2014-2017 Broadcom
* Copyright (C) 2012 Rob Clark <robclark@freedesktop.org>
*
@@ -50,3 +50,4 @@ void v3dX(simulator_perfmon_start)(struct v3d_hw *v3d,
void v3dX(simulator_perfmon_stop)(struct v3d_hw *v3d,
uint32_t ncounters,
uint64_t *values);
+void v3dX(simulator_get_perfcnt_total)(uint32_t *count);
diff --git a/src/broadcom/vulkan/meson.build b/src/broadcom/vulkan/meson.build
index 9d2593cf6d2..3f04a4162dc 100644
--- a/src/broadcom/vulkan/meson.build
+++ b/src/broadcom/vulkan/meson.build
@@ -1,4 +1,4 @@
-# Copyright © 2019 Raspberry Pi
+# Copyright © 2019 Raspberry Pi Ltd
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -25,7 +25,9 @@ v3dv_entrypoints = custom_target(
command : [
prog_python, '@INPUT0@', '--xml', '@INPUT1@', '--proto', '--weak',
'--out-h', '@OUTPUT0@', '--out-c', '@OUTPUT1@', '--prefix', 'v3dv',
+ '--beta', with_vulkan_beta.to_string(),
'--device-prefix', 'ver42',
+ '--device-prefix', 'ver71',
],
depend_files : vk_entrypoints_gen_depend_files,
)
@@ -38,6 +40,7 @@ libv3dv_files = files(
'v3dv_debug.h',
'v3dv_descriptor_set.c',
'v3dv_device.c',
+ 'v3dv_event.c',
'v3dv_formats.c',
'v3dv_image.c',
'v3dv_limits.h',
@@ -50,9 +53,8 @@ libv3dv_files = files(
'v3dv_query.c',
'v3dv_queue.c',
'v3dv_uniforms.c',
- 'v3dv_util.c',
'v3dv_wsi.c',
-)
+) + [v3d_xml_pack]
files_per_version = files(
'v3dvx_cmd_buffer.c',
@@ -63,18 +65,16 @@ files_per_version = files(
'v3dvx_pipeline.c',
'v3dvx_meta_common.c',
'v3dvx_pipeline.c',
+ 'v3dvx_query.c',
'v3dvx_queue.c',
)
-# The vulkan driver only supports version >= 42, which is the version present in
-# Rpi4. We need to explicitly set it as we are reusing pieces from the GL v3d
-# driver.
-v3d_versions = ['42']
+v3d_versions = ['42', '71']
v3dv_flags = []
-dep_v3dv3 = dependency('v3dv3', required : false)
-if dep_v3dv3.found()
+dep_v3d_hw = dependency('v3d_hw', required : false)
+if dep_v3d_hw.found()
v3dv_flags += '-DUSE_V3D_SIMULATOR'
endif
@@ -82,31 +82,27 @@ v3dv_deps = [
dep_dl,
dep_libdrm,
dep_valgrind,
- dep_v3dv3,
+ dep_v3d_hw,
idep_nir,
idep_nir_headers,
idep_vulkan_util,
+ idep_vulkan_runtime,
+ idep_vulkan_wsi,
]
if with_platform_x11
v3dv_deps += dep_xcb_dri3
- v3dv_flags += [
- '-DVK_USE_PLATFORM_XCB_KHR',
- '-DVK_USE_PLATFORM_XLIB_KHR',
- ]
- libv3dv_files += files('v3dv_wsi_x11.c')
endif
if with_platform_wayland
- v3dv_deps += [dep_wayland_client, dep_wl_protocols]
- v3dv_flags += '-DVK_USE_PLATFORM_WAYLAND_KHR'
- libv3dv_files += files('v3dv_wsi_wayland.c')
+ v3dv_deps += dep_wayland_client
libv3dv_files += [wayland_drm_client_protocol_h, wayland_drm_protocol_c]
endif
-if system_has_kms_drm and not with_platform_android
- v3dv_flags += '-DVK_USE_PLATFORM_DISPLAY_KHR'
- libv3dv_files += files('v3dv_wsi_display.c')
+if with_platform_android
+ v3dv_deps += [dep_android, idep_u_gralloc]
+ v3dv_flags += '-DVK_USE_PLATFORM_ANDROID_KHR'
+ libv3dv_files += files('v3dv_android.c')
endif
per_version_libs = []
@@ -115,8 +111,8 @@ foreach ver : v3d_versions
'v3dv-v' + ver,
[files_per_version, v3d_xml_pack, v3dv_entrypoints[0]],
include_directories : [
- inc_src, inc_include, inc_gallium, inc_gallium_aux, inc_broadcom,
- inc_compiler, inc_util, inc_vulkan_wsi,
+ inc_src, inc_include, inc_broadcom,
+ inc_util,
],
c_args : [v3dv_flags, '-DV3D_VERSION=' + ver],
gnu_symbol_visibility : 'hidden',
@@ -128,17 +124,17 @@ libvulkan_broadcom = shared_library(
'vulkan_broadcom',
[libv3dv_files, v3dv_entrypoints, sha1_h],
include_directories : [
- inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom, inc_compiler, inc_util, inc_vulkan_wsi,
+ inc_include, inc_src, inc_broadcom, inc_util,
],
link_with : [
libbroadcom_cle,
libbroadcom_v3d,
- libvulkan_wsi,
per_version_libs,
],
dependencies : v3dv_deps,
c_args : v3dv_flags,
- link_args : ['-Wl,--build-id=sha1', ld_args_bsymbolic, ld_args_gc_sections],
+ link_args : [vulkan_icd_link_args, '-Wl,--build-id=sha1', ld_args_bsymbolic, ld_args_gc_sections],
+ link_depends : vulkan_icd_link_depends,
gnu_symbol_visibility : 'hidden',
install : true,
)
@@ -162,12 +158,31 @@ broadcom_icd = custom_target(
output : 'broadcom_icd.@0@.json'.format(host_machine.cpu()),
command : [
prog_python, '@INPUT0@',
- '--api-version', '1.0', '--xml', '@INPUT1@',
+ '--api-version', '1.2', '--xml', '@INPUT1@',
'--lib-path', join_paths(get_option('prefix'), get_option('libdir'),
'libvulkan_broadcom.so'),
'--out', '@OUTPUT@',
],
build_by_default : true,
install_dir : with_vulkan_icd_dir,
+ install_tag : 'runtime',
install : true,
)
+
+_dev_icdname = 'broadcom_devenv_icd.@0@.json'.format(host_machine.cpu())
+_dev_icd = custom_target(
+ 'broadcom_devenv_icd',
+ input : [vk_icd_gen, vk_api_xml],
+ output : _dev_icdname,
+ command : [
+ prog_python, '@INPUT0@',
+ '--api-version', '1.3', '--xml', '@INPUT1@',
+ '--lib-path', meson.current_build_dir() / 'libvulkan_broadcom.so',
+ '--out', '@OUTPUT@',
+ ],
+ build_by_default : true,
+)
+
+devenv.append('VK_DRIVER_FILES', _dev_icd.full_path())
+# Deprecated: replaced by VK_DRIVER_FILES above
+devenv.append('VK_ICD_FILENAMES', _dev_icd.full_path())
diff --git a/src/broadcom/vulkan/v3dv_android.c b/src/broadcom/vulkan/v3dv_android.c
new file mode 100644
index 00000000000..afb691e55d0
--- /dev/null
+++ b/src/broadcom/vulkan/v3dv_android.c
@@ -0,0 +1,544 @@
+/*
+ * Copyright © 2017, Google Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "v3dv_private.h"
+#include <hardware/gralloc.h>
+
+#if ANDROID_API_LEVEL >= 26
+#include <hardware/gralloc1.h>
+#endif
+
+#include "drm-uapi/drm_fourcc.h"
+#include <hardware/hardware.h>
+#include <hardware/hwvulkan.h>
+
+#include <vulkan/vk_android_native_buffer.h>
+#include <vulkan/vk_icd.h>
+
+#include "vk_android.h"
+#include "vk_enum_defines.h"
+
+#include "util/libsync.h"
+#include "util/log.h"
+#include "util/os_file.h"
+
+static int
+v3dv_hal_open(const struct hw_module_t *mod,
+ const char *id,
+ struct hw_device_t **dev);
+static int
+v3dv_hal_close(struct hw_device_t *dev);
+
+static_assert(HWVULKAN_DISPATCH_MAGIC == ICD_LOADER_MAGIC, "");
+
+PUBLIC struct hwvulkan_module_t HAL_MODULE_INFO_SYM = {
+ .common =
+ {
+ .tag = HARDWARE_MODULE_TAG,
+ .module_api_version = HWVULKAN_MODULE_API_VERSION_0_1,
+ .hal_api_version = HARDWARE_MAKE_API_VERSION(1, 0),
+ .id = HWVULKAN_HARDWARE_MODULE_ID,
+ .name = "Broadcom Vulkan HAL",
+ .author = "Mesa3D",
+ .methods =
+ &(hw_module_methods_t) {
+ .open = v3dv_hal_open,
+ },
+ },
+};
+
+/* If any bits in test_mask are set, then unset them and return true. */
+static inline bool
+unmask32(uint32_t *inout_mask, uint32_t test_mask)
+{
+ uint32_t orig_mask = *inout_mask;
+ *inout_mask &= ~test_mask;
+ return *inout_mask != orig_mask;
+}
+
+static int
+v3dv_hal_open(const struct hw_module_t *mod,
+ const char *id,
+ struct hw_device_t **dev)
+{
+ assert(mod == &HAL_MODULE_INFO_SYM.common);
+ assert(strcmp(id, HWVULKAN_DEVICE_0) == 0);
+
+ hwvulkan_device_t *hal_dev = malloc(sizeof(*hal_dev));
+ if (!hal_dev)
+ return -1;
+
+ *hal_dev = (hwvulkan_device_t){
+ .common =
+ {
+ .tag = HARDWARE_DEVICE_TAG,
+ .version = HWVULKAN_DEVICE_API_VERSION_0_1,
+ .module = &HAL_MODULE_INFO_SYM.common,
+ .close = v3dv_hal_close,
+ },
+ .EnumerateInstanceExtensionProperties =
+ v3dv_EnumerateInstanceExtensionProperties,
+ .CreateInstance = v3dv_CreateInstance,
+ .GetInstanceProcAddr = v3dv_GetInstanceProcAddr,
+ };
+
+ mesa_logi("v3dv: Warning: Android Vulkan implementation is experimental");
+
+ *dev = &hal_dev->common;
+ return 0;
+}
+
+static int
+v3dv_hal_close(struct hw_device_t *dev)
+{
+ /* hwvulkan.h claims that hw_device_t::close() is never called. */
+ return -1;
+}
+
+VkResult
+v3dv_gralloc_to_drm_explicit_layout(struct u_gralloc *gralloc,
+ struct u_gralloc_buffer_handle *in_hnd,
+ VkImageDrmFormatModifierExplicitCreateInfoEXT *out,
+ VkSubresourceLayout *out_layouts,
+ int max_planes)
+{
+ struct u_gralloc_buffer_basic_info info;
+
+ if (u_gralloc_get_buffer_basic_info(gralloc, in_hnd, &info) != 0)
+ return VK_ERROR_INVALID_EXTERNAL_HANDLE;
+
+ if (info.num_planes > max_planes)
+ return VK_ERROR_INVALID_EXTERNAL_HANDLE;
+
+ bool is_disjoint = false;
+ for (int i = 1; i < info.num_planes; i++) {
+ if (info.offsets[i] == 0) {
+ is_disjoint = true;
+ break;
+ }
+ }
+
+ if (is_disjoint) {
+ /* We don't support disjoint planes yet */
+ return VK_ERROR_INVALID_EXTERNAL_HANDLE;
+ }
+
+ memset(out_layouts, 0, sizeof(*out_layouts) * info.num_planes);
+ memset(out, 0, sizeof(*out));
+
+ out->sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT;
+ out->pPlaneLayouts = out_layouts;
+
+ out->drmFormatModifier = info.modifier;
+ out->drmFormatModifierPlaneCount = info.num_planes;
+ for (int i = 0; i < info.num_planes; i++) {
+ out_layouts[i].offset = info.offsets[i];
+ out_layouts[i].rowPitch = info.strides[i];
+ }
+
+ if (info.drm_fourcc == DRM_FORMAT_YVU420) {
+ /* Swap the U and V planes to match the VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM */
+ VkSubresourceLayout tmp = out_layouts[1];
+ out_layouts[1] = out_layouts[2];
+ out_layouts[2] = tmp;
+ }
+
+ return VK_SUCCESS;
+}
+
+VkResult
+v3dv_import_native_buffer_fd(VkDevice device_h,
+ int native_buffer_fd,
+ const VkAllocationCallbacks *alloc,
+ VkImage image_h)
+{
+ VkResult result;
+
+ VkDeviceMemory memory_h;
+
+ const VkMemoryDedicatedAllocateInfo ded_alloc = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO,
+ .pNext = NULL,
+ .buffer = VK_NULL_HANDLE,
+ .image = image_h
+ };
+
+ const VkImportMemoryFdInfoKHR import_info = {
+ .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR,
+ .pNext = &ded_alloc,
+ .handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT,
+ .fd = os_dupfd_cloexec(native_buffer_fd),
+ };
+
+ result =
+ v3dv_AllocateMemory(device_h,
+ &(VkMemoryAllocateInfo) {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+ .pNext = &import_info,
+ .allocationSize = lseek(native_buffer_fd, 0, SEEK_END),
+ .memoryTypeIndex = 0,
+ },
+ alloc, &memory_h);
+
+ if (result != VK_SUCCESS)
+ goto fail_create_image;
+
+ VkBindImageMemoryInfo bind_info = {
+ .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO,
+ .image = image_h,
+ .memory = memory_h,
+ .memoryOffset = 0,
+ };
+ v3dv_BindImageMemory2(device_h, 1, &bind_info);
+
+ return VK_SUCCESS;
+
+fail_create_image:
+ close(import_info.fd);
+
+ return result;
+}
+
+static VkResult
+format_supported_with_usage(VkDevice device_h,
+ VkFormat format,
+ VkImageUsageFlags imageUsage)
+{
+ V3DV_FROM_HANDLE(v3dv_device, device, device_h);
+ struct v3dv_physical_device *phys_dev = device->pdevice;
+ VkPhysicalDevice phys_dev_h = v3dv_physical_device_to_handle(phys_dev);
+ VkResult result;
+
+ const VkPhysicalDeviceImageFormatInfo2 image_format_info = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2,
+ .format = format,
+ .type = VK_IMAGE_TYPE_2D,
+ .tiling = VK_IMAGE_TILING_OPTIMAL,
+ .usage = imageUsage,
+ };
+
+ VkImageFormatProperties2 image_format_props = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2,
+ };
+
+ /* Check that requested format and usage are supported. */
+ result = v3dv_GetPhysicalDeviceImageFormatProperties2(
+ phys_dev_h, &image_format_info, &image_format_props);
+ if (result != VK_SUCCESS) {
+ return vk_errorf(device, result,
+ "v3dv_GetPhysicalDeviceImageFormatProperties2 failed "
+ "inside %s",
+ __func__);
+ }
+
+ return VK_SUCCESS;
+}
+
+static VkResult
+setup_gralloc0_usage(struct v3dv_device *device,
+ VkFormat format,
+ VkImageUsageFlags imageUsage,
+ int *grallocUsage)
+{
+ if (unmask32(&imageUsage, VK_IMAGE_USAGE_TRANSFER_DST_BIT |
+ VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT))
+ *grallocUsage |= GRALLOC_USAGE_HW_RENDER;
+
+ if (unmask32(&imageUsage, VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
+ VK_IMAGE_USAGE_SAMPLED_BIT |
+ VK_IMAGE_USAGE_STORAGE_BIT |
+ VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT))
+ *grallocUsage |= GRALLOC_USAGE_HW_TEXTURE;
+
+ /* All VkImageUsageFlags not explicitly checked here are unsupported for
+ * gralloc swapchains.
+ */
+ if (imageUsage != 0) {
+ return vk_errorf(device, VK_ERROR_FORMAT_NOT_SUPPORTED,
+ "unsupported VkImageUsageFlags(0x%x) for gralloc "
+ "swapchain",
+ imageUsage);
+ }
+
+ /* The swapchain assumes direct display, so enable the COMPOSER flag. If
+ * the format is not supported by the display controller, gralloc will
+ * drop this flag and still allocate the buffer in VRAM.
+ */
+ *grallocUsage |= GRALLOC_USAGE_HW_COMPOSER;
+
+ if (*grallocUsage == 0)
+ return VK_ERROR_FORMAT_NOT_SUPPORTED;
+
+ return VK_SUCCESS;
+}
+
+VKAPI_ATTR VkResult VKAPI_CALL
+v3dv_GetSwapchainGrallocUsageANDROID(VkDevice device_h,
+ VkFormat format,
+ VkImageUsageFlags imageUsage,
+ int *grallocUsage)
+{
+ V3DV_FROM_HANDLE(v3dv_device, device, device_h);
+ VkResult result;
+
+ result = format_supported_with_usage(device_h, format, imageUsage);
+ if (result != VK_SUCCESS)
+ return result;
+
+ *grallocUsage = 0;
+ return setup_gralloc0_usage(device, format, imageUsage, grallocUsage);
+}
+
+#if ANDROID_API_LEVEL >= 26
+VKAPI_ATTR VkResult VKAPI_CALL
+v3dv_GetSwapchainGrallocUsage2ANDROID(
+ VkDevice device_h,
+ VkFormat format,
+ VkImageUsageFlags imageUsage,
+ VkSwapchainImageUsageFlagsANDROID swapchainImageUsage,
+ uint64_t *grallocConsumerUsage,
+ uint64_t *grallocProducerUsage)
+{
+ V3DV_FROM_HANDLE(v3dv_device, device, device_h);
+ VkResult result;
+
+ *grallocConsumerUsage = 0;
+ *grallocProducerUsage = 0;
+ mesa_logd("%s: format=%d, usage=0x%x", __func__, format, imageUsage);
+
+ result = format_supported_with_usage(device_h, format, imageUsage);
+ if (result != VK_SUCCESS)
+ return result;
+
+ int32_t grallocUsage = 0;
+ result = setup_gralloc0_usage(device, format, imageUsage, &grallocUsage);
+ if (result != VK_SUCCESS)
+ return result;
+
+ /* Setup gralloc1 usage flags from gralloc0 flags. */
+
+ if (grallocUsage & GRALLOC_USAGE_HW_RENDER) {
+ *grallocProducerUsage |= GRALLOC1_PRODUCER_USAGE_GPU_RENDER_TARGET;
+ }
+
+ if (grallocUsage & GRALLOC_USAGE_HW_TEXTURE) {
+ *grallocConsumerUsage |= GRALLOC1_CONSUMER_USAGE_GPU_TEXTURE;
+ }
+
+ if (grallocUsage & GRALLOC_USAGE_HW_COMPOSER) {
+ /* GPU composing case */
+ *grallocConsumerUsage |= GRALLOC1_CONSUMER_USAGE_GPU_TEXTURE;
+ /* Hardware composing case */
+ *grallocConsumerUsage |= GRALLOC1_CONSUMER_USAGE_HWCOMPOSER;
+ }
+
+ if (swapchainImageUsage & VK_SWAPCHAIN_IMAGE_USAGE_SHARED_BIT_ANDROID) {
+ uint64_t front_rendering_usage = 0;
+ u_gralloc_get_front_rendering_usage(device->gralloc, &front_rendering_usage);
+ *grallocProducerUsage |= front_rendering_usage;
+ }
+
+ return VK_SUCCESS;
+}
+#endif
+
+/* ----------------------------- AHardwareBuffer --------------------------- */
+
+static VkResult
+get_ahb_buffer_format_properties2(VkDevice device_h, const struct AHardwareBuffer *buffer,
+ VkAndroidHardwareBufferFormatProperties2ANDROID *pProperties)
+{
+ V3DV_FROM_HANDLE(v3dv_device, device, device_h);
+
+ /* Get a description of the buffer contents. */
+ AHardwareBuffer_Desc desc;
+ AHardwareBuffer_describe(buffer, &desc);
+
+ /* Verify description. */
+ const uint64_t gpu_usage = AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE |
+ AHARDWAREBUFFER_USAGE_GPU_COLOR_OUTPUT |
+ AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER;
+
+ /* "Buffer must be a valid Android hardware buffer object with at least
+ * one of the AHARDWAREBUFFER_USAGE_GPU_* usage flags."
+ */
+ if (!(desc.usage & (gpu_usage)))
+ return VK_ERROR_INVALID_EXTERNAL_HANDLE;
+
+ /* Fill properties fields based on description. */
+ VkAndroidHardwareBufferFormatProperties2ANDROID *p = pProperties;
+
+ p->samplerYcbcrConversionComponents.r = VK_COMPONENT_SWIZZLE_IDENTITY;
+ p->samplerYcbcrConversionComponents.g = VK_COMPONENT_SWIZZLE_IDENTITY;
+ p->samplerYcbcrConversionComponents.b = VK_COMPONENT_SWIZZLE_IDENTITY;
+ p->samplerYcbcrConversionComponents.a = VK_COMPONENT_SWIZZLE_IDENTITY;
+
+ p->suggestedYcbcrModel = VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_601;
+ p->suggestedYcbcrRange = VK_SAMPLER_YCBCR_RANGE_ITU_FULL;
+
+ p->suggestedXChromaOffset = VK_CHROMA_LOCATION_MIDPOINT;
+ p->suggestedYChromaOffset = VK_CHROMA_LOCATION_MIDPOINT;
+
+ VkFormatProperties2 format_properties = {.sType = VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2};
+
+ p->format = vk_ahb_format_to_image_format(desc.format);
+
+ VkFormat external_format = p->format;
+
+ if (p->format != VK_FORMAT_UNDEFINED)
+ goto finish;
+
+ /* External format only case
+ *
+ * From vkGetAndroidHardwareBufferPropertiesANDROID spec:
+ * "If the Android hardware buffer has one of the formats listed in the Format
+ * Equivalence table (see spec.), then format must have the equivalent Vulkan
+ * format listed in the table. Otherwise, format may be VK_FORMAT_UNDEFINED,
+ * indicating the Android hardware buffer can only be used with an external format."
+ *
+ * From SKIA source code analysis: p->format MUST be VK_FORMAT_UNDEFINED, if the
+ * format is not in the Equivalence table.
+ */
+
+ struct u_gralloc_buffer_handle gr_handle = {
+ .handle = AHardwareBuffer_getNativeHandle(buffer),
+ .pixel_stride = desc.stride,
+ .hal_format = desc.format,
+ };
+
+ struct u_gralloc_buffer_basic_info info;
+
+ if (u_gralloc_get_buffer_basic_info(device->gralloc, &gr_handle, &info) != 0)
+ return VK_ERROR_INVALID_EXTERNAL_HANDLE;
+
+ switch (info.drm_fourcc) {
+ case DRM_FORMAT_YVU420:
+ /* Assuming that U and V planes are swapped earlier */
+ external_format = VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM;
+ break;
+ case DRM_FORMAT_NV12:
+ external_format = VK_FORMAT_G8_B8R8_2PLANE_420_UNORM;
+ break;
+ default:
+ mesa_loge("Unsupported external DRM format: %d", info.drm_fourcc);
+ return VK_ERROR_INVALID_EXTERNAL_HANDLE;
+ }
+
+ struct u_gralloc_buffer_color_info color_info;
+ if (u_gralloc_get_buffer_color_info(device->gralloc, &gr_handle, &color_info) == 0) {
+ switch (color_info.yuv_color_space) {
+ case __DRI_YUV_COLOR_SPACE_ITU_REC601:
+ p->suggestedYcbcrModel = VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_601;
+ break;
+ case __DRI_YUV_COLOR_SPACE_ITU_REC709:
+ p->suggestedYcbcrModel = VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_709;
+ break;
+ case __DRI_YUV_COLOR_SPACE_ITU_REC2020:
+ p->suggestedYcbcrModel = VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_2020;
+ break;
+ default:
+ break;
+ }
+
+ p->suggestedYcbcrRange = (color_info.sample_range == __DRI_YUV_NARROW_RANGE) ?
+ VK_SAMPLER_YCBCR_RANGE_ITU_NARROW : VK_SAMPLER_YCBCR_RANGE_ITU_FULL;
+ p->suggestedXChromaOffset = (color_info.horizontal_siting == __DRI_YUV_CHROMA_SITING_0_5) ?
+ VK_CHROMA_LOCATION_MIDPOINT : VK_CHROMA_LOCATION_COSITED_EVEN;
+ p->suggestedYChromaOffset = (color_info.vertical_siting == __DRI_YUV_CHROMA_SITING_0_5) ?
+ VK_CHROMA_LOCATION_MIDPOINT : VK_CHROMA_LOCATION_COSITED_EVEN;
+ }
+
+finish:
+
+ v3dv_GetPhysicalDeviceFormatProperties2(v3dv_physical_device_to_handle(device->pdevice),
+ external_format, &format_properties);
+
+ /* v3dv doesn't support direct sampling from linear images, but it has logic
+ * to implicitly copy from linear to tiled images before sampling. Therefore
+ * expose optimal features for both linear and optimal tiling.
+ */
+ p->formatFeatures = format_properties.formatProperties.optimalTilingFeatures;
+ p->externalFormat = external_format;
+
+ /* From vkGetAndroidHardwareBufferPropertiesANDROID spec:
+ * "The formatFeatures member *must* include
+ * VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT and at least one of
+ * VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT or
+ * VK_FORMAT_FEATURE_2_COSITED_CHROMA_SAMPLES_BIT"
+ */
+ p->formatFeatures |= VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT;
+
+ return VK_SUCCESS;
+}
+
+VkResult
+v3dv_GetAndroidHardwareBufferPropertiesANDROID(VkDevice device_h,
+ const struct AHardwareBuffer *buffer,
+ VkAndroidHardwareBufferPropertiesANDROID *pProperties)
+{
+ V3DV_FROM_HANDLE(v3dv_device, dev, device_h);
+ struct v3dv_physical_device *pdevice = dev->pdevice;
+
+ VkResult result;
+
+ VkAndroidHardwareBufferFormatPropertiesANDROID *format_prop =
+ vk_find_struct(pProperties->pNext, ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_ANDROID);
+
+ /* Fill format properties of an Android hardware buffer. */
+ if (format_prop) {
+ VkAndroidHardwareBufferFormatProperties2ANDROID format_prop2 = {
+ .sType = VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_2_ANDROID,
+ };
+ result = get_ahb_buffer_format_properties2(device_h, buffer, &format_prop2);
+ if (result != VK_SUCCESS)
+ return result;
+
+ format_prop->format = format_prop2.format;
+ format_prop->externalFormat = format_prop2.externalFormat;
+ format_prop->formatFeatures =
+ vk_format_features2_to_features(format_prop2.formatFeatures);
+ format_prop->samplerYcbcrConversionComponents =
+ format_prop2.samplerYcbcrConversionComponents;
+ format_prop->suggestedYcbcrModel = format_prop2.suggestedYcbcrModel;
+ format_prop->suggestedYcbcrRange = format_prop2.suggestedYcbcrRange;
+ format_prop->suggestedXChromaOffset = format_prop2.suggestedXChromaOffset;
+ format_prop->suggestedYChromaOffset = format_prop2.suggestedYChromaOffset;
+ }
+
+ VkAndroidHardwareBufferFormatProperties2ANDROID *format_prop2 =
+ vk_find_struct(pProperties->pNext, ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_2_ANDROID);
+ if (format_prop2) {
+ result = get_ahb_buffer_format_properties2(device_h, buffer, format_prop2);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
+ const native_handle_t *handle = AHardwareBuffer_getNativeHandle(buffer);
+ assert(handle && handle->numFds > 0);
+ pProperties->allocationSize = lseek(handle->data[0], 0, SEEK_END);
+
+ /* All memory types. */
+ pProperties->memoryTypeBits = (1u << pdevice->memory.memoryTypeCount) - 1;
+
+ return VK_SUCCESS;
+}
diff --git a/src/broadcom/vulkan/v3dv_bo.c b/src/broadcom/vulkan/v3dv_bo.c
index 71679ceec27..1b26abec325 100644
--- a/src/broadcom/vulkan/v3dv_bo.c
+++ b/src/broadcom/vulkan/v3dv_bo.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2019 Raspberry Pi
+ * Copyright © 2019 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -31,11 +31,12 @@
/* Default max size of the bo cache, in MB.
*
- * FIXME: we got this value when testing some apps using the rpi4 with 4GB,
- * but it should depend on the total amount of RAM. But for that we would need
- * to test on real hw with different amount of RAM. Using this value for now.
+ * This value comes from testing different Vulkan applications; larger
+ * values didn't bring any further performance benefit. It looks somewhat
+ * small, but in those tests the main consumers of the BO cache were the
+ * BOs used for CLs, which are usually small.
*/
-#define DEFAULT_MAX_BO_CACHE_SIZE 512
+#define DEFAULT_MAX_BO_CACHE_SIZE 64
/* We discarded using a V3D_DEBUG option for this, as it would mean adding a
* run-time check for most of the calls
@@ -67,8 +68,8 @@ bo_dump_stats(struct v3dv_device *device)
struct timespec time;
clock_gettime(CLOCK_MONOTONIC, &time);
- fprintf(stderr, " now: %ld\n",
- time.tv_sec);
+ fprintf(stderr, " now: %lld\n",
+ (long long)time.tv_sec);
}
if (cache->size_list_size) {
@@ -117,8 +118,8 @@ bo_from_cache(struct v3dv_device *device, uint32_t size, const char *name)
}
bo_remove_from_cache(cache, bo);
-
bo->name = name;
+ p_atomic_set(&bo->refcnt, 1);
}
mtx_unlock(&cache->lock);
return bo;
@@ -131,28 +132,39 @@ bo_free(struct v3dv_device *device,
if (!bo)
return true;
- if (bo->map)
- v3dv_bo_unmap(device, bo);
+ assert(p_atomic_read(&bo->refcnt) == 0);
+ assert(bo->map == NULL);
+
+ if (!bo->is_import) {
+ device->bo_count--;
+ device->bo_size -= bo->size;
+
+ if (dump_stats) {
+ fprintf(stderr, "Freed %s%s%dkb:\n",
+ bo->name ? bo->name : "",
+ bo->name ? " " : "",
+ bo->size / 1024);
+ bo_dump_stats(device);
+ }
+ }
+
+ uint32_t handle = bo->handle;
+ /* Our BO structs are stored in a sparse array in the physical device,
+ * so we don't want to free the BO pointer; instead, we reset it to 0 to
+ * mark that array entry as free.
+ *
+ * We must do the reset before we actually free the BO in the kernel, since
+ * otherwise there is a chance the application creates another BO in a
+ * different thread and gets the same array entry, causing a race.
+ */
+ memset(bo, 0, sizeof(*bo));
struct drm_gem_close c;
memset(&c, 0, sizeof(c));
- c.handle = bo->handle;
+ c.handle = handle;
int ret = v3dv_ioctl(device->pdevice->render_fd, DRM_IOCTL_GEM_CLOSE, &c);
if (ret != 0)
- fprintf(stderr, "close object %d: %s\n", bo->handle, strerror(errno));
-
- device->bo_count--;
- device->bo_size -= bo->size;
-
- if (dump_stats) {
- fprintf(stderr, "Freed %s%s%dkb:\n",
- bo->name ? bo->name : "",
- bo->name ? " " : "",
- bo->size / 1024);
- bo_dump_stats(device);
- }
-
- vk_free(&device->vk.alloc, bo);
+ fprintf(stderr, "close object %d: %s\n", handle, strerror(errno));
return ret == 0;
}
@@ -183,6 +195,7 @@ v3dv_bo_init(struct v3dv_bo *bo,
const char *name,
bool private)
{
+ p_atomic_set(&bo->refcnt, 1);
bo->handle = handle;
bo->handle_bit = 1ull << (handle % 64);
bo->size = size;
@@ -192,9 +205,22 @@ v3dv_bo_init(struct v3dv_bo *bo,
bo->name = name;
bo->private = private;
bo->dumb_handle = -1;
+ bo->is_import = false;
+ bo->cl_branch_offset = 0xffffffff;
list_inithead(&bo->list_link);
}
+void
+v3dv_bo_init_import(struct v3dv_bo *bo,
+ uint32_t handle,
+ uint32_t size,
+ uint32_t offset,
+ bool private)
+{
+ v3dv_bo_init(bo, handle, size, offset, "import", private);
+ bo->is_import = true;
+}
+
struct v3dv_bo *
v3dv_bo_alloc(struct v3dv_device *device,
uint32_t size,
@@ -218,14 +244,6 @@ v3dv_bo_alloc(struct v3dv_device *device,
}
}
- bo = vk_alloc(&device->vk.alloc, sizeof(struct v3dv_bo), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
-
- if (!bo) {
- fprintf(stderr, "Failed to allocate host memory for BO\n");
- return NULL;
- }
-
retry:
;
@@ -244,7 +262,6 @@ v3dv_bo_alloc(struct v3dv_device *device,
goto retry;
}
- vk_free(&device->vk.alloc, bo);
fprintf(stderr, "Failed to allocate device memory for BO\n");
return NULL;
}
@@ -252,6 +269,9 @@ v3dv_bo_alloc(struct v3dv_device *device,
assert(create.offset % page_align == 0);
assert((create.offset & 0xffffffff) == create.offset);
+ bo = v3dv_device_lookup_bo(device->pdevice, create.handle);
+ assert(bo && bo->handle == 0);
+
v3dv_bo_init(bo, create.handle, size, create.offset, name, private);
device->bo_count++;
@@ -320,7 +340,7 @@ v3dv_bo_map(struct v3dv_device *device, struct v3dv_bo *bo, uint32_t size)
if (!ok)
return false;
- ok = v3dv_bo_wait(device, bo, PIPE_TIMEOUT_INFINITE);
+ ok = v3dv_bo_wait(device, bo, OS_TIMEOUT_INFINITE);
if (!ok) {
fprintf(stderr, "memory wait for map failed\n");
return false;
@@ -340,7 +360,7 @@ v3dv_bo_unmap(struct v3dv_device *device, struct v3dv_bo *bo)
bo->map_size = 0;
}
-static boolean
+static bool
reallocate_size_list(struct v3dv_bo_cache *cache,
struct v3dv_device *device,
uint32_t size)
@@ -400,9 +420,11 @@ v3dv_bo_cache_init(struct v3dv_device *device)
fprintf(stderr, "MAX BO CACHE SIZE: %iMB\n", device->bo_cache.max_cache_size);
}
+ mtx_lock(&device->bo_cache.lock);
device->bo_cache.max_cache_size *= 1024 * 1024;
device->bo_cache.cache_count = 0;
device->bo_cache.cache_size = 0;
+ mtx_unlock(&device->bo_cache.lock);
}
void
@@ -455,6 +477,12 @@ v3dv_bo_free(struct v3dv_device *device,
if (!bo)
return true;
+ if (!p_atomic_dec_zero(&bo->refcnt))
+ return true;
+
+ if (bo->map)
+ v3dv_bo_unmap(device, bo);
+
struct timespec time;
struct v3dv_bo_cache *cache = &device->bo_cache;
uint32_t page_index = bo->size / 4096 - 1;
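v3dv_bo_free() above now only does real work when the atomic refcount drops to zero; the matching "take a reference" side is not part of this hunk, but a minimal sketch of such a helper (hypothetical name, using the same util/u_atomic.h helpers already used in this file) would look like:

static inline struct v3dv_bo *
v3dv_bo_ref(struct v3dv_bo *bo)
{
   /* Pairs with the p_atomic_dec_zero() in v3dv_bo_free(): every extra
    * user takes a reference so the BO outlives all of its users.
    */
   p_atomic_inc(&bo->refcnt);
   return bo;
}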
diff --git a/src/broadcom/vulkan/v3dv_bo.h b/src/broadcom/vulkan/v3dv_bo.h
index ab2b8c7356d..5e382817b37 100644
--- a/src/broadcom/vulkan/v3dv_bo.h
+++ b/src/broadcom/vulkan/v3dv_bo.h
@@ -1,5 +1,5 @@
/*
- * Copyright © 2019 Raspberry Pi
+ * Copyright © 2019 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -39,6 +39,11 @@ struct v3dv_bo {
const char *name;
+ /* In a CL where a BRANCH has been emitted, the offset of the BRANCH
+ * instruction in the BO.
+ */
+ uint32_t cl_branch_offset;
+
/** Entry in the linked list of buffers freed, by age. */
struct list_head time_list;
/** Entry in the per-page-count linked list of buffers freed (by age). */
@@ -52,14 +57,20 @@ struct v3dv_bo {
*/
bool private;
+ /** If this BO has been imported */
+ bool is_import;
+
/**
* If this BO was allocated for a swapchain on the display device, the
* handle of the dumb BO on that device.
*/
int32_t dumb_handle;
+
+ int32_t refcnt;
};
void v3dv_bo_init(struct v3dv_bo *bo, uint32_t handle, uint32_t size, uint32_t offset, const char *name, bool private);
+void v3dv_bo_init_import(struct v3dv_bo *bo, uint32_t handle, uint32_t size, uint32_t offset, bool private);
struct v3dv_bo *v3dv_bo_alloc(struct v3dv_device *device, uint32_t size, const char *name, bool private);
diff --git a/src/broadcom/vulkan/v3dv_cl.c b/src/broadcom/vulkan/v3dv_cl.c
index ed11f53c4bb..7d414999e9b 100644
--- a/src/broadcom/vulkan/v3dv_cl.c
+++ b/src/broadcom/vulkan/v3dv_cl.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2019 Raspberry Pi
+ * Copyright © 2019 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -27,7 +27,7 @@
* versions, so we just explicitly set the V3D_VERSION and include v3dx_pack
* here
*/
-#define V3D_VERSION 33
+#define V3D_VERSION 42
#include "broadcom/common/v3d_macros.h"
#include "broadcom/cle/v3dx_pack.h"
@@ -58,6 +58,14 @@ v3dv_cl_destroy(struct v3dv_cl *cl)
static bool
cl_alloc_bo(struct v3dv_cl *cl, uint32_t space, bool use_branch)
{
+ /* If we are growing, double the BO allocation size to reduce the number
+ * of allocations with large command buffers. This has a very significant
+ * impact on the number of draw calls per second reported by vkoverhead.
+ */
+ space = align(space, 4096);
+ if (cl->bo)
+ space = MAX2(cl->bo->size * 2, space);
+
struct v3dv_bo *bo = v3dv_bo_alloc(cl->job->device, space, "CL", true);
if (!bo) {
fprintf(stderr, "failed to allocate memory for command list\n");
@@ -76,6 +84,7 @@ cl_alloc_bo(struct v3dv_cl *cl, uint32_t space, bool use_branch)
/* Chain to the new BO from the old one if requested */
if (use_branch && cl->bo) {
+ cl->bo->cl_branch_offset = v3dv_cl_offset(cl);
cl_emit(cl, BRANCH, branch) {
branch.address = v3dv_cl_address(bo, 0);
}
@@ -114,14 +123,18 @@ v3dv_cl_ensure_space_with_branch(struct v3dv_cl *cl, uint32_t space)
* end with a 'return from sub list' command.
*/
bool needs_return_from_sub_list = false;
- if (cl->job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY) {
- if (cl->size > 0) {
+ if (cl->job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE && cl->size > 0)
needs_return_from_sub_list = true;
- space += cl_packet_length(RETURN_FROM_SUB_LIST);
- }
- } else {
- space += cl_packet_length(BRANCH);
- }
+
+ /*
+ * The CLE processor in the simulator tries to read V3D_CL_MAX_INSTR_SIZE
+ * bytes from the CL for each new instruction. If the last instruction in our
+ * CL is smaller than that, and there are not at least V3D_CL_MAX_INSTR_SIZE
+ * bytes until the end of the BO, it will read out of bounds and possibly
+ * cause a GMP violation interrupt to trigger. Ensure we always have at
+ * least that many bytes available to read with the last instruction.
+ */
+ space += V3D_CL_MAX_INSTR_SIZE;
if (v3dv_cl_offset(cl) + space <= cl->size)
return;
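As a purely illustrative reading of the padding above: if the last instruction emitted is only a couple of bytes long and lands a few bytes before the end of the BO, the CLE's fixed-size speculative fetch would run past the end of the mapping; reserving an extra V3D_CL_MAX_INSTR_SIZE bytes up front guarantees the fetch for the final instruction always stays inside the BO.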
diff --git a/src/broadcom/vulkan/v3dv_cl.h b/src/broadcom/vulkan/v3dv_cl.h
index 68d5acd455b..7e17ac395c4 100644
--- a/src/broadcom/vulkan/v3dv_cl.h
+++ b/src/broadcom/vulkan/v3dv_cl.h
@@ -1,5 +1,5 @@
/*
- * Copyright © 2019 Raspberry Pi
+ * Copyright © 2019 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -26,7 +26,8 @@
#include "broadcom/cle/v3d_packet_helpers.h"
-#include "list.h"
+#include "util/list.h"
+#include "util/macros.h"
struct v3dv_bo;
struct v3dv_job;
@@ -118,6 +119,13 @@ cl_advance(struct v3dv_cl_out **cl, uint32_t n)
}
static inline void
+cl_advance_and_end(struct v3dv_cl *cl, uint32_t n)
+{
+ cl->next = (struct v3dv_cl_out *)((char *)(cl->next) + n);
+ assert(v3dv_cl_offset(cl) <= cl->size);
+}
+
+static inline void
cl_aligned_u32(struct v3dv_cl_out **cl, uint32_t n)
{
*(uint32_t *)(*cl) = n;
@@ -143,15 +151,9 @@ cl_aligned_reloc(struct v3dv_cl *cl,
uint32_t v3dv_cl_ensure_space(struct v3dv_cl *cl, uint32_t space, uint32_t alignment);
void v3dv_cl_ensure_space_with_branch(struct v3dv_cl *cl, uint32_t space);
-/* We redefine ALIGN as a macro as we want to use cl_aligned_packet_length for
- * struct fields
- */
-#define ALIGN(value, alignment) \
- (((value) + (alignment) - 1) & ~((alignment) - 1))
-
#define cl_packet_header(packet) V3DX(packet ## _header)
#define cl_packet_length(packet) V3DX(packet ## _length)
-#define cl_aligned_packet_length(packet, alignment) ALIGN(cl_packet_length(packet), alignment)
+#define cl_aligned_packet_length(packet, alignment) ALIGN_POT(cl_packet_length(packet), alignment)
#define cl_packet_pack(packet) V3DX(packet ## _pack)
#define cl_packet_struct(packet) V3DX(packet)
@@ -178,8 +180,7 @@ void v3dv_cl_ensure_space_with_branch(struct v3dv_cl *cl, uint32_t space);
({ \
struct v3dv_cl_out *cl_out = cl_start(cl); \
cl_packet_pack(packet)(cl, (uint8_t *)cl_out, &name); \
- cl_advance(&cl_out, cl_packet_length(packet)); \
- cl_end(cl, cl_out); \
+ cl_advance_and_end(cl, cl_packet_length(packet)); \
_loop_terminate = NULL; \
})) \
@@ -195,8 +196,7 @@ void v3dv_cl_ensure_space_with_branch(struct v3dv_cl *cl, uint32_t space);
cl_packet_pack(packet)(cl, packed, &name); \
for (int _i = 0; _i < cl_packet_length(packet); _i++) \
((uint8_t *)cl_out)[_i] = packed[_i] | (prepacked)[_i]; \
- cl_advance(&cl_out, cl_packet_length(packet)); \
- cl_end(cl, cl_out); \
+ cl_advance_and_end(cl, cl_packet_length(packet)); \
_loop_terminate = NULL; \
})) \
diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c
index 0d6c393ee6e..96e83c657e6 100644
--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2019 Raspberry Pi
+ * Copyright © 2019 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -21,42 +21,26 @@
* IN THE SOFTWARE.
*/
+#include "broadcom/common/v3d_csd.h"
#include "v3dv_private.h"
#include "util/u_pack_color.h"
-#include "vk_format_info.h"
+#include "vk_common_entrypoints.h"
#include "vk_util.h"
-const struct v3dv_dynamic_state default_dynamic_state = {
- .viewport = {
- .count = 0,
- },
- .scissor = {
- .count = 0,
- },
- .stencil_compare_mask =
- {
- .front = ~0u,
- .back = ~0u,
- },
- .stencil_write_mask =
- {
- .front = ~0u,
- .back = ~0u,
- },
- .stencil_reference =
- {
- .front = 0u,
- .back = 0u,
- },
- .blend_constants = { 0.0f, 0.0f, 0.0f, 0.0f },
- .depth_bias = {
- .constant_factor = 0.0f,
- .depth_bias_clamp = 0.0f,
- .slope_factor = 0.0f,
- },
- .line_width = 1.0f,
- .color_write_enable = (1ull << (4 * V3D_MAX_DRAW_BUFFERS)) - 1,
-};
+float
+v3dv_get_aa_line_width(struct v3dv_pipeline *pipeline,
+ struct v3dv_cmd_buffer *buffer)
+{
+ float width = buffer->vk.dynamic_graphics_state.rs.line.width;
+
+ /* If line smoothing is enabled then we want to add some extra pixels to
+ * the width in order to have some semi-transparent edges.
+ */
+ if (pipeline->line_smooth)
+ width = floorf(M_SQRT2 * width) + 3;
+
+ return width;
+}
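Worked example for the smoothing padding above: with lineWidth = 1.0 the returned width is floorf(M_SQRT2 * 1.0) + 3 = 4 pixels, and with lineWidth = 2.0 it is floorf(2.828...) + 3 = 5 pixels, leaving room for the semi-transparent falloff on both edges of the line.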
void
v3dv_job_add_bo(struct v3dv_job *job, struct v3dv_bo *bo)
@@ -83,59 +67,22 @@ v3dv_job_add_bo_unchecked(struct v3dv_job *job, struct v3dv_bo *bo)
job->bo_handle_mask |= bo->handle_bit;
}
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_CreateCommandPool(VkDevice _device,
- const VkCommandPoolCreateInfo *pCreateInfo,
- const VkAllocationCallbacks *pAllocator,
- VkCommandPool *pCmdPool)
-{
- V3DV_FROM_HANDLE(v3dv_device, device, _device);
- struct v3dv_cmd_pool *pool;
-
- /* We only support one queue */
- assert(pCreateInfo->queueFamilyIndex == 0);
-
- pool = vk_object_zalloc(&device->vk, pAllocator, sizeof(*pool),
- VK_OBJECT_TYPE_COMMAND_POOL);
- if (pool == NULL)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
-
- if (pAllocator)
- pool->alloc = *pAllocator;
- else
- pool->alloc = device->vk.alloc;
-
- list_inithead(&pool->cmd_buffers);
-
- *pCmdPool = v3dv_cmd_pool_to_handle(pool);
-
- return VK_SUCCESS;
-}
-
static void
cmd_buffer_init(struct v3dv_cmd_buffer *cmd_buffer,
- struct v3dv_device *device,
- struct v3dv_cmd_pool *pool,
- VkCommandBufferLevel level)
+ struct v3dv_device *device)
{
/* Do not reset the base object! If we are calling this from a command
* buffer reset that would reset the loader's dispatch table for the
* command buffer, and any other relevant info from vk_object_base
*/
- const uint32_t base_size = sizeof(struct vk_object_base);
+ const uint32_t base_size = sizeof(struct vk_command_buffer);
uint8_t *cmd_buffer_driver_start = ((uint8_t *) cmd_buffer) + base_size;
memset(cmd_buffer_driver_start, 0, sizeof(*cmd_buffer) - base_size);
cmd_buffer->device = device;
- cmd_buffer->pool = pool;
- cmd_buffer->level = level;
list_inithead(&cmd_buffer->private_objs);
list_inithead(&cmd_buffer->jobs);
- list_inithead(&cmd_buffer->list_link);
-
- assert(pool);
- list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
cmd_buffer->state.subpass_idx = -1;
cmd_buffer->state.meta.subpass_idx = -1;
@@ -144,22 +91,35 @@ cmd_buffer_init(struct v3dv_cmd_buffer *cmd_buffer,
}
static VkResult
-cmd_buffer_create(struct v3dv_device *device,
- struct v3dv_cmd_pool *pool,
- VkCommandBufferLevel level,
- VkCommandBuffer *pCommandBuffer)
+cmd_buffer_create(struct vk_command_pool *pool, VkCommandBufferLevel level,
+ struct vk_command_buffer **cmd_buffer_out)
{
+ struct v3dv_device *device =
+ container_of(pool->base.device, struct v3dv_device, vk);
+
struct v3dv_cmd_buffer *cmd_buffer;
- cmd_buffer = vk_object_zalloc(&device->vk,
- &pool->alloc,
- sizeof(*cmd_buffer),
- VK_OBJECT_TYPE_COMMAND_BUFFER);
+ cmd_buffer = vk_zalloc(&pool->alloc,
+ sizeof(*cmd_buffer),
+ 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (cmd_buffer == NULL)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ /* Here we pass 0 as level because this callback hook doesn't have the level
+ * info, but that's fine, vk_common_AllocateCommandBuffers will fix it up
+ * after creation.
+ */
+ VkResult result;
+ result = vk_command_buffer_init(pool, &cmd_buffer->vk,
+ &v3dv_cmd_buffer_ops, level);
+ if (result != VK_SUCCESS) {
+ vk_free(&pool->alloc, cmd_buffer);
+ return result;
+ }
- cmd_buffer_init(cmd_buffer, device, pool, level);
+ cmd_buffer_init(cmd_buffer, device);
- *pCommandBuffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
+ *cmd_buffer_out = &cmd_buffer->vk;
return VK_SUCCESS;
}
@@ -168,7 +128,7 @@ static void
job_destroy_gpu_cl_resources(struct v3dv_job *job)
{
assert(job->type == V3DV_JOB_TYPE_GPU_CL ||
- job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY);
+ job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE);
v3dv_cl_destroy(&job->bcl);
v3dv_cl_destroy(&job->rcl);
@@ -189,9 +149,21 @@ job_destroy_cloned_gpu_cl_resources(struct v3dv_job *job)
{
assert(job->type == V3DV_JOB_TYPE_GPU_CL);
- list_for_each_entry_safe(struct v3dv_bo, bo, &job->bcl.bo_list, list_link) {
- list_del(&bo->list_link);
- vk_free(&job->device->vk.alloc, bo);
+ struct v3dv_cmd_buffer *cmd_buffer = job->cmd_buffer;
+ if (job->clone_owns_bcl) {
+ /* For suspending jobs in command buffers with the simultaneous use flag
+ * we allocate a real copy of the BCL.
+ */
+ assert(job->suspending &&
+ cmd_buffer &&
+ (cmd_buffer->usage_flags &
+ VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT));
+ v3dv_cl_destroy(&job->bcl);
+ } else {
+ list_for_each_entry_safe(struct v3dv_bo, bo, &job->bcl.bo_list, list_link) {
+ list_del(&bo->list_link);
+ vk_free(&job->device->vk.alloc, bo);
+ }
}
list_for_each_entry_safe(struct v3dv_bo, bo, &job->rcl.bo_list, list_link) {
@@ -219,22 +191,6 @@ job_destroy_gpu_csd_resources(struct v3dv_job *job)
v3dv_bo_free(job->device, job->csd.shared_memory);
}
-static void
-job_destroy_cpu_wait_events_resources(struct v3dv_job *job)
-{
- assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);
- assert(job->cmd_buffer);
- vk_free(&job->cmd_buffer->device->vk.alloc, job->cpu.event_wait.events);
-}
-
-static void
-job_destroy_cpu_csd_indirect_resources(struct v3dv_job *job)
-{
- assert(job->type == V3DV_JOB_TYPE_CPU_CSD_INDIRECT);
- assert(job->cmd_buffer);
- v3dv_job_destroy(job->cpu.csd_indirect.csd_job);
-}
-
void
v3dv_job_destroy(struct v3dv_job *job)
{
@@ -249,18 +205,12 @@ v3dv_job_destroy(struct v3dv_job *job)
if (!job->is_clone) {
switch (job->type) {
case V3DV_JOB_TYPE_GPU_CL:
- case V3DV_JOB_TYPE_GPU_CL_SECONDARY:
+ case V3DV_JOB_TYPE_GPU_CL_INCOMPLETE:
job_destroy_gpu_cl_resources(job);
break;
case V3DV_JOB_TYPE_GPU_CSD:
job_destroy_gpu_csd_resources(job);
break;
- case V3DV_JOB_TYPE_CPU_WAIT_EVENTS:
- job_destroy_cpu_wait_events_resources(job);
- break;
- case V3DV_JOB_TYPE_CPU_CSD_INDIRECT:
- job_destroy_cpu_csd_indirect_resources(job);
- break;
default:
break;
}
@@ -316,7 +266,7 @@ cmd_buffer_free_resources(struct v3dv_cmd_buffer *cmd_buffer)
v3dv_job_destroy(cmd_buffer->state.job);
if (cmd_buffer->state.attachments)
- vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
+ vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer->state.attachments);
if (cmd_buffer->state.query.end.alloc_count > 0)
vk_free(&cmd_buffer->device->vk.alloc, cmd_buffer->state.query.end.states);
@@ -333,38 +283,22 @@ cmd_buffer_free_resources(struct v3dv_cmd_buffer *cmd_buffer)
assert(cmd_buffer->state.meta.attachment_alloc_count > 0);
vk_free(&cmd_buffer->device->vk.alloc, cmd_buffer->state.meta.attachments);
}
+
+ v3dv_destroy_dynamic_framebuffer(cmd_buffer);
}
static void
-cmd_buffer_destroy(struct v3dv_cmd_buffer *cmd_buffer)
+cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
{
- list_del(&cmd_buffer->pool_link);
+ struct v3dv_cmd_buffer *cmd_buffer =
+ container_of(vk_cmd_buffer, struct v3dv_cmd_buffer, vk);
+
cmd_buffer_free_resources(cmd_buffer);
- vk_object_free(&cmd_buffer->device->vk, &cmd_buffer->pool->alloc, cmd_buffer);
+ vk_command_buffer_finish(&cmd_buffer->vk);
+ vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer);
}
static bool
-attachment_list_is_subset(struct v3dv_subpass_attachment *l1, uint32_t l1_count,
- struct v3dv_subpass_attachment *l2, uint32_t l2_count)
-{
- for (uint32_t i = 0; i < l1_count; i++) {
- uint32_t attachment_idx = l1[i].attachment;
- if (attachment_idx == VK_ATTACHMENT_UNUSED)
- continue;
-
- uint32_t j;
- for (j = 0; j < l2_count; j++) {
- if (l2[j].attachment == attachment_idx)
- break;
- }
- if (j == l2_count)
- return false;
- }
-
- return true;
- }
-
-static bool
cmd_buffer_can_merge_subpass(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t subpass_idx)
{
@@ -372,9 +306,9 @@ cmd_buffer_can_merge_subpass(struct v3dv_cmd_buffer *cmd_buffer,
assert(state->pass);
const struct v3dv_physical_device *physical_device =
- &cmd_buffer->device->instance->physicalDevice;
+ cmd_buffer->device->pdevice;
- if (cmd_buffer->level != VK_COMMAND_BUFFER_LEVEL_PRIMARY)
+ if (cmd_buffer->vk.level != VK_COMMAND_BUFFER_LEVEL_PRIMARY)
return false;
if (!cmd_buffer->state.job)
@@ -399,44 +333,37 @@ cmd_buffer_can_merge_subpass(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_subpass *prev_subpass = &state->pass->subpasses[state->subpass_idx];
struct v3dv_subpass *subpass = &state->pass->subpasses[subpass_idx];
- /* Don't merge if the subpasses have different view masks, since in that
- * case the framebuffer setup is different and we need to emit different
- * RCLs.
- */
- if (subpass->view_mask != prev_subpass->view_mask)
+ if (subpass->ds_attachment.attachment !=
+ prev_subpass->ds_attachment.attachment)
return false;
- /* Because the list of subpass attachments can include VK_ATTACHMENT_UNUSED,
- * we need to check that for each subpass all its used attachments are
- * used by the other subpass.
- */
- bool compatible =
- attachment_list_is_subset(prev_subpass->color_attachments,
- prev_subpass->color_count,
- subpass->color_attachments,
- subpass->color_count);
- if (!compatible)
+ if (subpass->color_count != prev_subpass->color_count)
return false;
- compatible =
- attachment_list_is_subset(subpass->color_attachments,
- subpass->color_count,
- prev_subpass->color_attachments,
- prev_subpass->color_count);
- if (!compatible)
- return false;
+ for (uint32_t i = 0; i < subpass->color_count; i++) {
+ if (subpass->color_attachments[i].attachment !=
+ prev_subpass->color_attachments[i].attachment) {
+ return false;
+ }
+ }
- if (subpass->ds_attachment.attachment !=
- prev_subpass->ds_attachment.attachment)
+ /* Don't merge if the subpasses have different view masks, since in that
+ * case the framebuffer setup is different and we need to emit different
+ * RCLs.
+ */
+ if (subpass->view_mask != prev_subpass->view_mask)
return false;
/* FIXME: Since some attachment formats can't be resolved using the TLB we
* need to emit separate resolve jobs for them and that would not be
* compatible with subpass merges. We could fix that by testing if any of
- * the attachments to resolve doesn't suppotr TLB resolves.
+ * the attachments to resolve doesn't support TLB resolves.
*/
- if (prev_subpass->resolve_attachments || subpass->resolve_attachments)
+ if (prev_subpass->resolve_attachments || subpass->resolve_attachments ||
+ prev_subpass->resolve_depth || prev_subpass->resolve_stencil ||
+ subpass->resolve_depth || subpass->resolve_stencil) {
return false;
+ }
return true;
}
@@ -452,18 +379,10 @@ job_compute_frame_tiling(struct v3dv_job *job,
uint32_t layers,
uint32_t render_target_count,
uint8_t max_internal_bpp,
- bool msaa)
-{
- static const uint8_t tile_sizes[] = {
- 64, 64,
- 64, 32,
- 32, 32,
- 32, 16,
- 16, 16,
- 16, 8,
- 8, 8
- };
-
+ uint8_t total_color_bpp,
+ bool msaa,
+ bool double_buffer)
+{
assert(job);
struct v3dv_frame_tiling *tiling = &job->frame_tiling;
@@ -472,23 +391,18 @@ job_compute_frame_tiling(struct v3dv_job *job,
tiling->layers = layers;
tiling->render_target_count = render_target_count;
tiling->msaa = msaa;
-
- uint32_t tile_size_index = 0;
-
- if (render_target_count > 2)
- tile_size_index += 2;
- else if (render_target_count > 1)
- tile_size_index += 1;
-
- if (msaa)
- tile_size_index += 2;
-
tiling->internal_bpp = max_internal_bpp;
- tile_size_index += tiling->internal_bpp;
- assert(tile_size_index < ARRAY_SIZE(tile_sizes) / 2);
+ tiling->total_color_bpp = total_color_bpp;
+ tiling->double_buffer = double_buffer;
- tiling->tile_width = tile_sizes[tile_size_index * 2];
- tiling->tile_height = tile_sizes[tile_size_index * 2 + 1];
+ /* Double-buffer is incompatible with MSAA */
+ assert(!tiling->msaa || !tiling->double_buffer);
+
+ v3d_choose_tile_size(&job->device->devinfo,
+ render_target_count,
+ max_internal_bpp, total_color_bpp, msaa,
+ tiling->double_buffer,
+ &tiling->tile_width, &tiling->tile_height);
tiling->draw_tiles_x = DIV_ROUND_UP(width, tiling->tile_width);
tiling->draw_tiles_y = DIV_ROUND_UP(height, tiling->tile_height);
@@ -516,41 +430,17 @@ job_compute_frame_tiling(struct v3dv_job *job,
return tiling;
}
-void
-v3dv_job_start_frame(struct v3dv_job *job,
- uint32_t width,
- uint32_t height,
- uint32_t layers,
- bool allocate_tile_state_for_all_layers,
- uint32_t render_target_count,
- uint8_t max_internal_bpp,
- bool msaa)
+bool
+v3dv_job_allocate_tile_state(struct v3dv_job *job)
{
- assert(job);
-
- /* Start by computing frame tiling spec for this job */
- const struct v3dv_frame_tiling *tiling =
- job_compute_frame_tiling(job,
- width, height, layers,
- render_target_count, max_internal_bpp, msaa);
-
- v3dv_cl_ensure_space_with_branch(&job->bcl, 256);
- v3dv_return_if_oom(NULL, job);
-
- /* We only need to allocate tile state for all layers if the binner
- * writes primitives to layers other than the first. This can only be
- * done using layered rendering (writing gl_Layer from a geometry shader),
- * so for other cases of multilayered framebuffers (typically with
- * meta copy/clear operations) that won't use layered rendering, we only
- * need one layer worth of of tile state for the binner.
- */
- if (!allocate_tile_state_for_all_layers)
- layers = 1;
+ struct v3dv_frame_tiling *tiling = &job->frame_tiling;
+ const uint32_t layers =
+ job->allocate_tile_state_for_all_layers ? tiling->layers : 1;
/* The PTB will request the tile alloc initial size per tile at start
* of tile binning.
*/
- uint32_t tile_alloc_size = 64 * tiling->layers *
+ uint32_t tile_alloc_size = 64 * layers *
tiling->draw_tiles_x *
tiling->draw_tiles_y;
@@ -573,47 +463,127 @@ v3dv_job_start_frame(struct v3dv_job *job,
"tile_alloc", true);
if (!job->tile_alloc) {
v3dv_flag_oom(NULL, job);
- return;
+ return false;
}
v3dv_job_add_bo_unchecked(job, job->tile_alloc);
const uint32_t tsda_per_tile_size = 256;
- const uint32_t tile_state_size = tiling->layers *
+ const uint32_t tile_state_size = layers *
tiling->draw_tiles_x *
tiling->draw_tiles_y *
tsda_per_tile_size;
job->tile_state = v3dv_bo_alloc(job->device, tile_state_size, "TSDA", true);
if (!job->tile_state) {
v3dv_flag_oom(NULL, job);
- return;
+ return false;
}
v3dv_job_add_bo_unchecked(job, job->tile_state);
+ return true;
+}
+
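To give a sense of the sizes involved in the allocation above (illustrative numbers, assuming a single layer and 64x64 tiles): a 1920x1080 frame bins into DIV_ROUND_UP(1920, 64) x DIV_ROUND_UP(1080, 64) = 30 x 17 tiles, so the initial per-tile allocation request comes to 64 * 30 * 17 = 32,640 bytes (before whatever extra space and alignment the rest of the function adds), and the tile state (TSDA) buffer is 256 * 30 * 17 = 130,560 bytes.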
+void
+v3dv_job_start_frame(struct v3dv_job *job,
+ uint32_t width,
+ uint32_t height,
+ uint32_t layers,
+ bool allocate_tile_state_for_all_layers,
+ bool allocate_tile_state_now,
+ uint32_t render_target_count,
+ uint8_t max_internal_bpp,
+ uint8_t total_color_bpp,
+ bool msaa)
+{
+ assert(job);
+
+ /* Start by computing frame tiling spec for this job assuming that
+ * double-buffer mode is disabled.
+ */
+ const struct v3dv_frame_tiling *tiling =
+ job_compute_frame_tiling(job, width, height, layers,
+ render_target_count, max_internal_bpp,
+ total_color_bpp, msaa, false);
+
+ v3dv_cl_ensure_space_with_branch(&job->bcl, 256);
+ v3dv_return_if_oom(NULL, job);
- v3dv_X(job->device, job_emit_binning_prolog)(job, tiling, layers);
+ job->allocate_tile_state_for_all_layers = allocate_tile_state_for_all_layers;
+
+ /* For subpass jobs we postpone tile state allocation until we are finishing
+ * the job and have made a decision about double-buffer.
+ */
+ if (allocate_tile_state_now) {
+ if (!v3dv_job_allocate_tile_state(job))
+ return;
+ }
+
+ v3dv_X(job->device, job_emit_binning_prolog)(job, tiling,
+ allocate_tile_state_for_all_layers ? tiling->layers : 1);
job->ez_state = V3D_EZ_UNDECIDED;
job->first_ez_state = V3D_EZ_UNDECIDED;
}
+static bool
+job_should_enable_double_buffer(struct v3dv_job *job)
+{
+ /* Incompatibility with double-buffer */
+ if (!job->can_use_double_buffer)
+ return false;
+
+ /* Too much geometry processing */
+ if (job->double_buffer_score.geom > 2000000)
+ return false;
+
+ /* Too little rendering to make up for tile store latency */
+ if (job->double_buffer_score.render < 100000)
+ return false;
+
+ return true;
+}
+
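As a concrete reading of the thresholds above (the scores themselves are driver-internal heuristics): a full-screen post-processing pass, which emits very little geometry but does a lot of per-tile rendering work, passes both checks and gets double-buffer enabled, while a geometry-heavy scene with millions of vertices is rejected by the first check, since for such workloads double-buffer tends to cost more than the tile-store latency it hides.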
static void
cmd_buffer_end_render_pass_frame(struct v3dv_cmd_buffer *cmd_buffer)
{
- assert(cmd_buffer->state.job);
+ struct v3dv_job *job = cmd_buffer->state.job;
+ assert(job);
+
+ /* For subpass jobs we always emit the RCL here */
+ assert(v3dv_cl_offset(&job->rcl) == 0);
+
+ /* Only emit RCL for the first job in a suspend/resume chain */
+ if (!job->resuming) {
+ /* Decide if we want to enable double-buffer for this job. If we do, then
+ * we need to rewrite the TILE_BINNING_MODE_CFG packet in the BCL.
+ */
+ if (job_should_enable_double_buffer(job)) {
+ assert(!job->frame_tiling.double_buffer);
+ job_compute_frame_tiling(job,
+ job->frame_tiling.width,
+ job->frame_tiling.height,
+ job->frame_tiling.layers,
+ job->frame_tiling.render_target_count,
+ job->frame_tiling.internal_bpp,
+ job->frame_tiling.total_color_bpp,
+ job->frame_tiling.msaa,
+ true);
+
+ v3dv_X(job->device, job_emit_enable_double_buffer)(job);
+ }
+
+ /* At this point we have decided whether we want to use double-buffer or
+ * not and the job's frame tiling represents that decision so we can
+ * allocate the tile state, which we need to do before we emit the RCL.
+ */
+ v3dv_job_allocate_tile_state(job);
- /* Typically, we have a single job for each subpass and we emit the job's RCL
- * here when we are ending the frame for the subpass. However, some commands
- * such as vkCmdClearAttachments need to run in their own separate job and
- * they emit their own RCL even if they execute inside a subpass. In this
- * scenario, we don't want to emit subpass RCL when we end the frame for
- * those jobs, so we only emit the subpass RCL if the job has not recorded
- * any RCL commands of its own.
- */
- if (v3dv_cl_offset(&cmd_buffer->state.job->rcl) == 0)
v3dv_X(cmd_buffer->device, cmd_buffer_emit_render_pass_rcl)(cmd_buffer);
+ }
- v3dv_X(cmd_buffer->device, job_emit_binning_flush)(cmd_buffer->state.job);
+ /* Only emit the binning flush for the last job in resume/suspend chain */
+ if (!job->suspending)
+ v3dv_X(cmd_buffer->device, job_emit_binning_flush)(job);
}
struct v3dv_job *
@@ -635,24 +605,47 @@ v3dv_cmd_buffer_create_cpu_job(struct v3dv_device *device,
}
static void
-cmd_buffer_add_cpu_jobs_for_pending_state(struct v3dv_cmd_buffer *cmd_buffer)
+cmd_buffer_emit_end_query_cpu(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_query_pool *pool,
+ uint32_t query, uint32_t count)
{
- struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
+ assert(pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
- if (state->query.end.used_count > 0) {
- const uint32_t query_count = state->query.end.used_count;
- for (uint32_t i = 0; i < query_count; i++) {
- assert(i < state->query.end.used_count);
- struct v3dv_job *job =
- v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
- V3DV_JOB_TYPE_CPU_END_QUERY,
- cmd_buffer, -1);
- v3dv_return_if_oom(cmd_buffer, NULL);
-
- job->cpu.query_end = state->query.end.states[i];
- list_addtail(&job->list_link, &cmd_buffer->jobs);
+ struct v3dv_job *job =
+ v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
+ V3DV_JOB_TYPE_CPU_END_QUERY,
+ cmd_buffer, -1);
+ v3dv_return_if_oom(cmd_buffer, NULL);
+
+ job->cpu.query_end.pool = pool;
+ job->cpu.query_end.query = query;
+ job->cpu.query_end.count = count;
+ list_addtail(&job->list_link, &cmd_buffer->jobs);
+}
+
+static inline bool
+cmd_buffer_has_pending_jobs(struct v3dv_cmd_buffer *cmd_buffer)
+{
+ return cmd_buffer->state.query.end.used_count > 0;
+}
+
+static void
+cmd_buffer_add_pending_jobs(struct v3dv_cmd_buffer *cmd_buffer)
+{
+ struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
+ const uint32_t count = state->query.end.used_count;
+ for (uint32_t i = 0; i < count; i++) {
+ assert(i < state->query.end.used_count);
+ struct v3dv_end_query_info *info = &state->query.end.states[i];
+ if (info->pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
+ v3dv_cmd_buffer_emit_set_query_availability(cmd_buffer, info->pool,
+ info->query, info->count, 1);
+ } else {
+ cmd_buffer_emit_end_query_cpu(cmd_buffer, info->pool,
+ info->query, info->count);
}
}
+ state->query.end.used_count = 0;
}
void
@@ -673,8 +666,17 @@ v3dv_cmd_buffer_finish_job(struct v3dv_cmd_buffer *cmd_buffer)
* should at least have the start frame commands, otherwise, it should have
* a transfer command. The only exception are secondary command buffers
* inside a render pass.
+ *
+ * With dynamic rendering there is also the possibility that we resume a
+ * suspended pass with an empty job. In that case, we need to ensure the
+ * empty job is still a valid command list, which we will ensure when we
+ * add the binning flush right below, which only happens if this is the
+ * last job in the resume/suspend chain. If it is not the last then we know
+ * it must at least have the BRANCH instruction to link with a follow-up
+ * resume job.
*/
- assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY ||
+ assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY ||
+ (job->resuming && !job->suspending) ||
v3dv_cl_offset(&job->bcl) > 0);
/* When we merge multiple subpasses into the same job we must only emit one
@@ -684,6 +686,11 @@ v3dv_cmd_buffer_finish_job(struct v3dv_cmd_buffer *cmd_buffer)
*/
assert(v3dv_cl_offset(&job->rcl) != 0 || cmd_buffer->state.pass);
+ if (!(cmd_buffer->state.barrier.dst_mask & V3DV_BARRIER_GRAPHICS_BIT)) {
+ cmd_buffer->state.barrier.bcl_buffer_access = 0;
+ cmd_buffer->state.barrier.bcl_image_access = 0;
+ }
+
/* If we are finishing a job inside a render pass we have two scenarios:
*
* 1. It is a regular CL, in which case we will submit the job to the GPU,
@@ -699,32 +706,36 @@ v3dv_cmd_buffer_finish_job(struct v3dv_cmd_buffer *cmd_buffer)
if (job->type == V3DV_JOB_TYPE_GPU_CL) {
cmd_buffer_end_render_pass_frame(cmd_buffer);
} else {
- assert(job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY);
+ assert(job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE);
v3dv_X(cmd_buffer->device, cmd_buffer_end_render_pass_secondary)(cmd_buffer);
}
}
+ bool suspending = job->suspending;
list_addtail(&job->list_link, &cmd_buffer->jobs);
cmd_buffer->state.job = NULL;
/* If we have recorded any state with this last GPU job that requires to
- * emit CPU jobs after the job is completed, add them now. The only
- * exception is secondary command buffers inside a render pass, because in
+ * emit jobs after the job is completed, add them now. The only exception
+ * is secondary command buffers inside a render pass, because in
* that case we want to defer this until we finish recording the primary
* job into which we execute the secondary.
*/
- if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY ||
- !cmd_buffer->state.pass) {
- cmd_buffer_add_cpu_jobs_for_pending_state(cmd_buffer);
+ if (!suspending) {
+ if (cmd_buffer_has_pending_jobs(cmd_buffer) &&
+ (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY ||
+ !cmd_buffer->state.pass)) {
+ cmd_buffer_add_pending_jobs(cmd_buffer);
+ }
}
}
-static bool
-job_type_is_gpu(struct v3dv_job *job)
+bool
+v3dv_job_type_is_gpu(struct v3dv_job *job)
{
switch (job->type) {
case V3DV_JOB_TYPE_GPU_CL:
- case V3DV_JOB_TYPE_GPU_CL_SECONDARY:
+ case V3DV_JOB_TYPE_GPU_CL_INCOMPLETE:
case V3DV_JOB_TYPE_GPU_TFU:
case V3DV_JOB_TYPE_GPU_CSD:
return true;
@@ -739,24 +750,40 @@ cmd_buffer_serialize_job_if_needed(struct v3dv_cmd_buffer *cmd_buffer,
{
assert(cmd_buffer && job);
- if (!cmd_buffer->state.has_barrier)
- return;
-
/* Serialization only affects GPU jobs, CPU jobs are always automatically
* serialized.
*/
- if (!job_type_is_gpu(job))
+ if (!v3dv_job_type_is_gpu(job))
return;
- job->serialize = true;
- if (cmd_buffer->state.has_bcl_barrier &&
- (job->type == V3DV_JOB_TYPE_GPU_CL ||
- job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY)) {
- job->needs_bcl_sync = true;
+ uint8_t barrier_mask = cmd_buffer->state.barrier.dst_mask;
+ if (barrier_mask == 0)
+ return;
+
+ uint8_t bit = 0;
+ uint8_t *src_mask;
+ if (job->type == V3DV_JOB_TYPE_GPU_CSD) {
+ assert(!job->is_transfer);
+ bit = V3DV_BARRIER_COMPUTE_BIT;
+ src_mask = &cmd_buffer->state.barrier.src_mask_compute;
+ } else if (job->is_transfer) {
+ assert(job->type == V3DV_JOB_TYPE_GPU_CL ||
+ job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE ||
+ job->type == V3DV_JOB_TYPE_GPU_TFU);
+ bit = V3DV_BARRIER_TRANSFER_BIT;
+ src_mask = &cmd_buffer->state.barrier.src_mask_transfer;
+ } else {
+ assert(job->type == V3DV_JOB_TYPE_GPU_CL ||
+ job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE);
+ bit = V3DV_BARRIER_GRAPHICS_BIT;
+ src_mask = &cmd_buffer->state.barrier.src_mask_graphics;
}
- cmd_buffer->state.has_barrier = false;
- cmd_buffer->state.has_bcl_barrier = false;
+ if (barrier_mask & bit) {
+ job->serialize = *src_mask;
+ *src_mask = 0;
+ cmd_buffer->state.barrier.dst_mask &= ~bit;
+ }
}
void
@@ -779,7 +806,7 @@ v3dv_job_init(struct v3dv_job *job,
list_inithead(&job->list_link);
if (type == V3DV_JOB_TYPE_GPU_CL ||
- type == V3DV_JOB_TYPE_GPU_CL_SECONDARY ||
+ type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE ||
type == V3DV_JOB_TYPE_GPU_CSD) {
job->bos =
_mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
@@ -787,12 +814,12 @@ v3dv_job_init(struct v3dv_job *job,
v3dv_cl_init(job, &job->indirect);
- if (V3D_DEBUG & V3D_DEBUG_ALWAYS_FLUSH)
+ if (V3D_DBG(ALWAYS_FLUSH))
job->always_flush = true;
}
if (type == V3DV_JOB_TYPE_GPU_CL ||
- type == V3DV_JOB_TYPE_GPU_CL_SECONDARY) {
+ type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE) {
v3dv_cl_init(job, &job->bcl);
v3dv_cl_init(job, &job->rcl);
}
@@ -806,9 +833,10 @@ v3dv_job_init(struct v3dv_job *job,
*/
cmd_buffer->state.dirty = ~0;
cmd_buffer->state.dirty_descriptor_stages = ~0;
+ vk_dynamic_graphics_state_dirty_all(&cmd_buffer->vk.dynamic_graphics_state);
- /* Honor inheritance of occlussion queries in secondaries if requested */
- if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
+ /* Honor inheritance of occlusion queries in secondaries if requested */
+ if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
cmd_buffer->state.inheritance.occlusion_query_enable) {
cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_OCCLUSION_QUERY;
}
@@ -820,7 +848,11 @@ v3dv_job_init(struct v3dv_job *job,
if (cmd_buffer->state.pass)
job->first_subpass = subpass_idx;
+ job->is_transfer = cmd_buffer->state.is_transfer;
+
cmd_buffer_serialize_job_if_needed(cmd_buffer, job);
+
+ job->perf = cmd_buffer->state.query.active_query.perf;
}
}
@@ -860,19 +892,16 @@ v3dv_cmd_buffer_start_job(struct v3dv_cmd_buffer *cmd_buffer,
return job;
}
-static VkResult
-cmd_buffer_reset(struct v3dv_cmd_buffer *cmd_buffer,
+static void
+cmd_buffer_reset(struct vk_command_buffer *vk_cmd_buffer,
VkCommandBufferResetFlags flags)
{
+ struct v3dv_cmd_buffer *cmd_buffer =
+ container_of(vk_cmd_buffer, struct v3dv_cmd_buffer, vk);
+
+ vk_command_buffer_reset(&cmd_buffer->vk);
if (cmd_buffer->status != V3DV_CMD_BUFFER_STATUS_INITIALIZED) {
struct v3dv_device *device = cmd_buffer->device;
- struct v3dv_cmd_pool *pool = cmd_buffer->pool;
- VkCommandBufferLevel level = cmd_buffer->level;
-
- /* cmd_buffer_init below will re-add the command buffer to the pool
- * so remove it here so we don't end up adding it again.
- */
- list_del(&cmd_buffer->pool_link);
/* FIXME: For now we always free all resources as if
* VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT was set.
@@ -880,87 +909,61 @@ cmd_buffer_reset(struct v3dv_cmd_buffer *cmd_buffer,
if (cmd_buffer->status != V3DV_CMD_BUFFER_STATUS_NEW)
cmd_buffer_free_resources(cmd_buffer);
- cmd_buffer_init(cmd_buffer, device, pool, level);
+ cmd_buffer_init(cmd_buffer, device);
}
assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_INITIALIZED);
- return VK_SUCCESS;
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_AllocateCommandBuffers(VkDevice _device,
- const VkCommandBufferAllocateInfo *pAllocateInfo,
- VkCommandBuffer *pCommandBuffers)
-{
- V3DV_FROM_HANDLE(v3dv_device, device, _device);
- V3DV_FROM_HANDLE(v3dv_cmd_pool, pool, pAllocateInfo->commandPool);
-
- VkResult result = VK_SUCCESS;
- uint32_t i;
-
- for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
- result = cmd_buffer_create(device, pool, pAllocateInfo->level,
- &pCommandBuffers[i]);
- if (result != VK_SUCCESS)
- break;
- }
-
- if (result != VK_SUCCESS) {
- v3dv_FreeCommandBuffers(_device, pAllocateInfo->commandPool,
- i, pCommandBuffers);
- for (i = 0; i < pAllocateInfo->commandBufferCount; i++)
- pCommandBuffers[i] = VK_NULL_HANDLE;
- }
-
- return result;
}
-VKAPI_ATTR void VKAPI_CALL
-v3dv_FreeCommandBuffers(VkDevice device,
- VkCommandPool commandPool,
- uint32_t commandBufferCount,
- const VkCommandBuffer *pCommandBuffers)
-{
- for (uint32_t i = 0; i < commandBufferCount; i++) {
- V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, pCommandBuffers[i]);
-
- if (!cmd_buffer)
- continue;
-
- cmd_buffer_destroy(cmd_buffer);
- }
-}
-VKAPI_ATTR void VKAPI_CALL
-v3dv_DestroyCommandPool(VkDevice _device,
- VkCommandPool commandPool,
- const VkAllocationCallbacks *pAllocator)
+static void
+cmd_buffer_emit_resolve(struct v3dv_cmd_buffer *cmd_buffer,
+ uint32_t dst_attachment_idx,
+ uint32_t src_attachment_idx,
+ VkImageAspectFlagBits aspect)
{
- V3DV_FROM_HANDLE(v3dv_device, device, _device);
- V3DV_FROM_HANDLE(v3dv_cmd_pool, pool, commandPool);
-
- if (!pool)
- return;
-
- list_for_each_entry_safe(struct v3dv_cmd_buffer, cmd_buffer,
- &pool->cmd_buffers, pool_link) {
- cmd_buffer_destroy(cmd_buffer);
- }
+ struct v3dv_image_view *src_iview =
+ cmd_buffer->state.attachments[src_attachment_idx].image_view;
+ struct v3dv_image_view *dst_iview =
+ cmd_buffer->state.attachments[dst_attachment_idx].image_view;
+
+ const VkRect2D *ra = &cmd_buffer->state.render_area;
+
+ VkImageResolve2 region = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_RESOLVE_2,
+ .srcSubresource = {
+ aspect,
+ src_iview->vk.base_mip_level,
+ src_iview->vk.base_array_layer,
+ src_iview->vk.layer_count,
+ },
+ .srcOffset = { ra->offset.x, ra->offset.y, 0 },
+ .dstSubresource = {
+ aspect,
+ dst_iview->vk.base_mip_level,
+ dst_iview->vk.base_array_layer,
+ dst_iview->vk.layer_count,
+ },
+ .dstOffset = { ra->offset.x, ra->offset.y, 0 },
+ .extent = { ra->extent.width, ra->extent.height, 1 },
+ };
- vk_object_free(&device->vk, pAllocator, pool);
-}
+ struct v3dv_image *src_image = (struct v3dv_image *) src_iview->vk.image;
+ struct v3dv_image *dst_image = (struct v3dv_image *) dst_iview->vk.image;
+ VkResolveImageInfo2 resolve_info = {
+ .sType = VK_STRUCTURE_TYPE_RESOLVE_IMAGE_INFO_2,
+ .srcImage = v3dv_image_to_handle(src_image),
+ .srcImageLayout = VK_IMAGE_LAYOUT_GENERAL,
+ .dstImage = v3dv_image_to_handle(dst_image),
+ .dstImageLayout = VK_IMAGE_LAYOUT_GENERAL,
+ .regionCount = 1,
+ .pRegions = &region,
+ };
-VKAPI_ATTR void VKAPI_CALL
-v3dv_TrimCommandPool(VkDevice device,
- VkCommandPool commandPool,
- VkCommandPoolTrimFlags flags)
-{
- /* We don't need to do anything here, our command pools never hold on to
- * any resources from command buffers that are freed or reset.
- */
+ VkCommandBuffer cmd_buffer_handle = v3dv_cmd_buffer_to_handle(cmd_buffer);
+ v3dv_CmdResolveImage2(cmd_buffer_handle, &resolve_info);
}
-
static void
cmd_buffer_subpass_handle_pending_resolves(struct v3dv_cmd_buffer *cmd_buffer)
{
@@ -972,8 +975,6 @@ cmd_buffer_subpass_handle_pending_resolves(struct v3dv_cmd_buffer *cmd_buffer)
if (!subpass->resolve_attachments)
return;
- struct v3dv_framebuffer *fb = cmd_buffer->state.framebuffer;
-
/* At this point we have already ended the current subpass and now we are
     * about to emit vkCmdResolveImage calls to get the resolves we can't
     * handle in the subpass RCL.
@@ -993,55 +994,42 @@ cmd_buffer_subpass_handle_pending_resolves(struct v3dv_cmd_buffer *cmd_buffer)
cmd_buffer->state.pass = NULL;
cmd_buffer->state.subpass_idx = -1;
- VkCommandBuffer cmd_buffer_handle = v3dv_cmd_buffer_to_handle(cmd_buffer);
for (uint32_t i = 0; i < subpass->color_count; i++) {
const uint32_t src_attachment_idx =
subpass->color_attachments[i].attachment;
if (src_attachment_idx == VK_ATTACHMENT_UNUSED)
continue;
- if (pass->attachments[src_attachment_idx].use_tlb_resolve)
+ /* Skip if this attachment doesn't have a resolve or if it was already
+ * implemented as a TLB resolve.
+ */
+ if (!cmd_buffer->state.attachments[src_attachment_idx].has_resolve ||
+ cmd_buffer->state.attachments[src_attachment_idx].use_tlb_resolve) {
continue;
+ }
const uint32_t dst_attachment_idx =
subpass->resolve_attachments[i].attachment;
- if (dst_attachment_idx == VK_ATTACHMENT_UNUSED)
- continue;
+ assert(dst_attachment_idx != VK_ATTACHMENT_UNUSED);
- struct v3dv_image_view *src_iview = fb->attachments[src_attachment_idx];
- struct v3dv_image_view *dst_iview = fb->attachments[dst_attachment_idx];
-
- VkImageResolve2KHR region = {
- .sType = VK_STRUCTURE_TYPE_IMAGE_RESOLVE_2_KHR,
- .srcSubresource = {
- VK_IMAGE_ASPECT_COLOR_BIT,
- src_iview->vk.base_mip_level,
- src_iview->vk.base_array_layer,
- src_iview->vk.layer_count,
- },
- .srcOffset = { 0, 0, 0 },
- .dstSubresource = {
- VK_IMAGE_ASPECT_COLOR_BIT,
- dst_iview->vk.base_mip_level,
- dst_iview->vk.base_array_layer,
- dst_iview->vk.layer_count,
- },
- .dstOffset = { 0, 0, 0 },
- .extent = src_iview->vk.image->extent,
- };
+ cmd_buffer_emit_resolve(cmd_buffer, dst_attachment_idx, src_attachment_idx,
+ VK_IMAGE_ASPECT_COLOR_BIT);
+ }
- struct v3dv_image *src_image = (struct v3dv_image *) src_iview->vk.image;
- struct v3dv_image *dst_image = (struct v3dv_image *) dst_iview->vk.image;
- VkResolveImageInfo2KHR resolve_info = {
- .sType = VK_STRUCTURE_TYPE_RESOLVE_IMAGE_INFO_2_KHR,
- .srcImage = v3dv_image_to_handle(src_image),
- .srcImageLayout = VK_IMAGE_LAYOUT_GENERAL,
- .dstImage = v3dv_image_to_handle(dst_image),
- .dstImageLayout = VK_IMAGE_LAYOUT_GENERAL,
- .regionCount = 1,
- .pRegions = &region,
- };
- v3dv_CmdResolveImage2KHR(cmd_buffer_handle, &resolve_info);
+ const uint32_t ds_src_attachment_idx =
+ subpass->ds_attachment.attachment;
+ if (ds_src_attachment_idx != VK_ATTACHMENT_UNUSED &&
+ cmd_buffer->state.attachments[ds_src_attachment_idx].has_resolve &&
+ !cmd_buffer->state.attachments[ds_src_attachment_idx].use_tlb_resolve) {
+ assert(subpass->resolve_depth || subpass->resolve_stencil);
+ const VkImageAspectFlags ds_aspects =
+ (subpass->resolve_depth ? VK_IMAGE_ASPECT_DEPTH_BIT : 0) |
+ (subpass->resolve_stencil ? VK_IMAGE_ASPECT_STENCIL_BIT : 0);
+ const uint32_t ds_dst_attachment_idx =
+ subpass->ds_resolve_attachment.attachment;
+ assert(ds_dst_attachment_idx != VK_ATTACHMENT_UNUSED);
+ cmd_buffer_emit_resolve(cmd_buffer, ds_dst_attachment_idx,
+ ds_src_attachment_idx, ds_aspects);
}
cmd_buffer->state.framebuffer = restore_fb;
@@ -1054,19 +1042,30 @@ cmd_buffer_begin_render_pass_secondary(
struct v3dv_cmd_buffer *cmd_buffer,
const VkCommandBufferInheritanceInfo *inheritance_info)
{
- assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
+ assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
assert(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT);
assert(inheritance_info);
- cmd_buffer->state.pass =
- v3dv_render_pass_from_handle(inheritance_info->renderPass);
- assert(cmd_buffer->state.pass);
+ const VkCommandBufferInheritanceRenderingInfo *rendering_info = NULL;
+ if (inheritance_info->renderPass == VK_NULL_HANDLE) {
+ rendering_info = vk_find_struct_const(inheritance_info,
+ COMMAND_BUFFER_INHERITANCE_RENDERING_INFO);
+ assert(rendering_info);
+ v3dv_setup_dynamic_render_pass_inheritance(cmd_buffer, rendering_info);
+ cmd_buffer->state.pass = &cmd_buffer->state.dynamic_pass;
+ cmd_buffer->state.subpass_idx = 0;
+ cmd_buffer->state.framebuffer = NULL;
+ } else {
+ cmd_buffer->state.pass =
+ v3dv_render_pass_from_handle(inheritance_info->renderPass);
- cmd_buffer->state.framebuffer =
- v3dv_framebuffer_from_handle(inheritance_info->framebuffer);
+ assert(inheritance_info->subpass < cmd_buffer->state.pass->subpass_count);
+ cmd_buffer->state.subpass_idx = inheritance_info->subpass;
- assert(inheritance_info->subpass < cmd_buffer->state.pass->subpass_count);
- cmd_buffer->state.subpass_idx = inheritance_info->subpass;
+ cmd_buffer->state.framebuffer =
+ v3dv_framebuffer_from_handle(inheritance_info->framebuffer);
+ }
+ assert(cmd_buffer->state.pass);
cmd_buffer->state.inheritance.occlusion_query_enable =
inheritance_info->occlusionQueryEnable;
@@ -1075,8 +1074,8 @@ cmd_buffer_begin_render_pass_secondary(
* so we want to create a job for them here.
*/
struct v3dv_job *job =
- v3dv_cmd_buffer_start_job(cmd_buffer, inheritance_info->subpass,
- V3DV_JOB_TYPE_GPU_CL_SECONDARY);
+ v3dv_cmd_buffer_start_job(cmd_buffer, cmd_buffer->state.subpass_idx,
+ V3DV_JOB_TYPE_GPU_CL_INCOMPLETE);
if (!job) {
v3dv_flag_oom(cmd_buffer, NULL);
return VK_ERROR_OUT_OF_HOST_MEMORY;
@@ -1089,21 +1088,31 @@ cmd_buffer_begin_render_pass_secondary(
*
* "The application must ensure (using scissor if necessary) that all
* rendering is contained within the render area."
- *
- * FIXME: setup constants for the max framebuffer dimensions and use them
- * here and when filling in VkPhysicalDeviceLimits.
*/
const struct v3dv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
cmd_buffer->state.render_area.offset.x = 0;
cmd_buffer->state.render_area.offset.y = 0;
cmd_buffer->state.render_area.extent.width =
- framebuffer ? framebuffer->width : 4096;
+ framebuffer ? framebuffer->width : V3D_MAX_IMAGE_DIMENSION;
cmd_buffer->state.render_area.extent.height =
- framebuffer ? framebuffer->height : 4096;
+ framebuffer ? framebuffer->height : V3D_MAX_IMAGE_DIMENSION;
+
+ /* We only really execute double-buffer mode in primary jobs, so we allow
+ * this mode in render pass secondaries just to keep track of the
+ * double-buffer score in them, and we update the primaries accordingly
+ * when the secondaries are executed into them.
+ */
+ job->can_use_double_buffer = true;
return VK_SUCCESS;
}
+const struct vk_command_buffer_ops v3dv_cmd_buffer_ops = {
+ .create = cmd_buffer_create,
+ .reset = cmd_buffer_reset,
+ .destroy = cmd_buffer_destroy,
+};
+
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_BeginCommandBuffer(VkCommandBuffer commandBuffer,
const VkCommandBufferBeginInfo *pBeginInfo)
@@ -1114,17 +1123,15 @@ v3dv_BeginCommandBuffer(VkCommandBuffer commandBuffer,
* command buffer's state. Otherwise, we must reset its state. In both
* cases we reset it.
*/
- VkResult result = cmd_buffer_reset(cmd_buffer, 0);
- if (result != VK_SUCCESS)
- return result;
+ cmd_buffer_reset(&cmd_buffer->vk, 0);
assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_INITIALIZED);
cmd_buffer->usage_flags = pBeginInfo->flags;
- if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
+ if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
- result =
+ VkResult result =
cmd_buffer_begin_render_pass_secondary(cmd_buffer,
pBeginInfo->pInheritanceInfo);
if (result != VK_SUCCESS)
@@ -1137,32 +1144,6 @@ v3dv_BeginCommandBuffer(VkCommandBuffer commandBuffer,
return VK_SUCCESS;
}
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_ResetCommandBuffer(VkCommandBuffer commandBuffer,
- VkCommandBufferResetFlags flags)
-{
- V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
- return cmd_buffer_reset(cmd_buffer, flags);
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_ResetCommandPool(VkDevice device,
- VkCommandPool commandPool,
- VkCommandPoolResetFlags flags)
-{
- V3DV_FROM_HANDLE(v3dv_cmd_pool, pool, commandPool);
-
- VkCommandBufferResetFlags reset_flags = 0;
- if (flags & VK_COMMAND_POOL_RESET_RELEASE_RESOURCES_BIT)
- reset_flags = VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT;
- list_for_each_entry_safe(struct v3dv_cmd_buffer, cmd_buffer,
- &pool->cmd_buffers, pool_link) {
- cmd_buffer_reset(cmd_buffer, reset_flags);
- }
-
- return VK_SUCCESS;
-}
-
static void
cmd_buffer_update_tile_alignment(struct v3dv_cmd_buffer *cmd_buffer)
{
@@ -1191,21 +1172,64 @@ cmd_buffer_update_tile_alignment(struct v3dv_cmd_buffer *cmd_buffer)
}
static void
+cmd_buffer_update_attachment_resolve_state(struct v3dv_cmd_buffer *cmd_buffer)
+{
+ /* NOTE: This should be called after cmd_buffer_update_tile_alignment()
+ * since it relies on up-to-date information about subpass tile alignment.
+ */
+ const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
+ const struct v3dv_render_pass *pass = state->pass;
+ const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
+
+ for (uint32_t i = 0; i < subpass->color_count; i++) {
+ const uint32_t attachment_idx = subpass->color_attachments[i].attachment;
+ if (attachment_idx == VK_ATTACHMENT_UNUSED)
+ continue;
+
+ state->attachments[attachment_idx].has_resolve =
+ subpass->resolve_attachments &&
+ subpass->resolve_attachments[i].attachment != VK_ATTACHMENT_UNUSED;
+
+ state->attachments[attachment_idx].use_tlb_resolve =
+ state->attachments[attachment_idx].has_resolve &&
+ state->tile_aligned_render_area &&
+ pass->attachments[attachment_idx].try_tlb_resolve;
+ }
+
+ uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
+ if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
+ uint32_t ds_resolve_attachment_idx =
+ subpass->ds_resolve_attachment.attachment;
+ state->attachments[ds_attachment_idx].has_resolve =
+ ds_resolve_attachment_idx != VK_ATTACHMENT_UNUSED;
+
+ assert(!state->attachments[ds_attachment_idx].has_resolve ||
+ (subpass->resolve_depth || subpass->resolve_stencil));
+
+ state->attachments[ds_attachment_idx].use_tlb_resolve =
+ state->attachments[ds_attachment_idx].has_resolve &&
+ state->tile_aligned_render_area &&
+ pass->attachments[ds_attachment_idx].try_tlb_resolve;
+ }
+}
+
+static void
cmd_buffer_state_set_attachment_clear_color(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t attachment_idx,
const VkClearColorValue *color)
{
assert(attachment_idx < cmd_buffer->state.pass->attachment_count);
-
const struct v3dv_render_pass_attachment *attachment =
&cmd_buffer->state.pass->attachments[attachment_idx];
uint32_t internal_type, internal_bpp;
const struct v3dv_format *format =
v3dv_X(cmd_buffer->device, get_format)(attachment->desc.format);
+ /* We don't allow multi-planar formats for render pass attachments */
+ assert(format->plane_count == 1);
v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_output_format)
- (format->rt_type, &internal_type, &internal_bpp);
+ (format->planes[0].rt_type, &internal_type, &internal_bpp);
uint32_t internal_size = 4 << internal_bpp;
@@ -1273,12 +1297,39 @@ cmd_buffer_state_set_clear_values(struct v3dv_cmd_buffer *cmd_buffer,
}
static void
+cmd_buffer_state_set_attachments(struct v3dv_cmd_buffer *cmd_buffer,
+ const VkRenderPassBeginInfo *pRenderPassBegin)
+{
+ V3DV_FROM_HANDLE(v3dv_render_pass, pass, pRenderPassBegin->renderPass);
+ V3DV_FROM_HANDLE(v3dv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
+
+ const VkRenderPassAttachmentBeginInfo *attach_begin =
+ vk_find_struct_const(pRenderPassBegin, RENDER_PASS_ATTACHMENT_BEGIN_INFO);
+
+ struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
+
+ for (uint32_t i = 0; i < pass->attachment_count; i++) {
+ if (attach_begin && attach_begin->attachmentCount != 0) {
+ state->attachments[i].image_view =
+ v3dv_image_view_from_handle(attach_begin->pAttachments[i]);
+ } else if (framebuffer) {
+ state->attachments[i].image_view = framebuffer->attachments[i];
+ } else {
+ assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
+ state->attachments[i].image_view = NULL;
+ }
+ }
+}
+
+static void
cmd_buffer_init_render_pass_attachment_state(struct v3dv_cmd_buffer *cmd_buffer,
const VkRenderPassBeginInfo *pRenderPassBegin)
{
cmd_buffer_state_set_clear_values(cmd_buffer,
pRenderPassBegin->clearValueCount,
pRenderPassBegin->pClearValues);
+
+ cmd_buffer_state_set_attachments(cmd_buffer, pRenderPassBegin);
}
static void
@@ -1307,10 +1358,33 @@ cmd_buffer_ensure_render_pass_attachment_state(struct v3dv_cmd_buffer *cmd_buffe
assert(state->attachment_alloc_count >= pass->attachment_count);
}
+/* If our render area is smaller than the current clip window we will have
+ * to emit a new clip window to constrain it to the render area.
+ */
+static void
+constraint_clip_window_to_render_area(struct v3dv_cmd_buffer *cmd_buffer)
+{
+ struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
+ struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state;
+
+ uint32_t min_render_x = state->render_area.offset.x;
+ uint32_t min_render_y = state->render_area.offset.y;
+ uint32_t max_render_x = min_render_x + state->render_area.extent.width - 1;
+ uint32_t max_render_y = min_render_y + state->render_area.extent.height - 1;
+ uint32_t min_clip_x = state->clip_window.offset.x;
+ uint32_t min_clip_y = state->clip_window.offset.y;
+ uint32_t max_clip_x = min_clip_x + state->clip_window.extent.width - 1;
+ uint32_t max_clip_y = min_clip_y + state->clip_window.extent.height - 1;
+ if (min_render_x > min_clip_x || min_render_y > min_clip_y ||
+ max_render_x < max_clip_x || max_render_y < max_clip_y) {
+ BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS);
+ }
+}
+
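For instance, if the render area passed to the render pass is a 512x512 region at offset (0, 0) while the currently programmed clip window covers the whole framebuffer, the max_render_* values fall below the max_clip_* values and the scissor state is marked dirty, so a tighter clip window is re-emitted before the next draw.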
VKAPI_ATTR void VKAPI_CALL
-v3dv_CmdBeginRenderPass(VkCommandBuffer commandBuffer,
- const VkRenderPassBeginInfo *pRenderPassBegin,
- VkSubpassContents contents)
+v3dv_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
+ const VkRenderPassBeginInfo *pRenderPassBegin,
+ const VkSubpassBeginInfo *pSubpassBeginInfo)
{
V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
V3DV_FROM_HANDLE(v3dv_render_pass, pass, pRenderPassBegin->renderPass);
@@ -1326,29 +1400,16 @@ v3dv_CmdBeginRenderPass(VkCommandBuffer commandBuffer,
cmd_buffer_init_render_pass_attachment_state(cmd_buffer, pRenderPassBegin);
state->render_area = pRenderPassBegin->renderArea;
-
- /* If our render area is smaller than the current clip window we will have
- * to emit a new clip window to constraint it to the render area.
- */
- uint32_t min_render_x = state->render_area.offset.x;
- uint32_t min_render_y = state->render_area.offset.y;
- uint32_t max_render_x = min_render_x + state->render_area.extent.width - 1;
- uint32_t max_render_y = min_render_y + state->render_area.extent.height - 1;
- uint32_t min_clip_x = state->clip_window.offset.x;
- uint32_t min_clip_y = state->clip_window.offset.y;
- uint32_t max_clip_x = min_clip_x + state->clip_window.extent.width - 1;
- uint32_t max_clip_y = min_clip_y + state->clip_window.extent.height - 1;
- if (min_render_x > min_clip_x || min_render_y > min_clip_y ||
- max_render_x < max_clip_x || max_render_y < max_clip_y) {
- state->dirty |= V3DV_CMD_DIRTY_SCISSOR;
- }
+ constraint_clip_window_to_render_area(cmd_buffer);
/* Setup for first subpass */
v3dv_cmd_buffer_subpass_start(cmd_buffer, 0);
}
VKAPI_ATTR void VKAPI_CALL
-v3dv_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents)
+v3dv_CmdNextSubpass2(VkCommandBuffer commandBuffer,
+ const VkSubpassBeginInfo *pSubpassBeginInfo,
+ const VkSubpassEndInfo *pSubpassEndInfo)
{
V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
@@ -1366,10 +1427,9 @@ v3dv_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents)
static void
cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer)
{
- assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
-
assert(cmd_buffer->state.pass);
assert(cmd_buffer->state.subpass_idx < cmd_buffer->state.pass->subpass_count);
+ assert(!cmd_buffer->state.resuming);
const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
const struct v3dv_render_pass *pass = state->pass;
const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
@@ -1384,7 +1444,7 @@ cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer)
}
uint32_t att_count = 0;
- VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* 4 color + D/S */
+ VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* +1 for D/S */
/* We only need to emit subpass clears as draw calls for color attachments
* if the render area is not aligned to tile boundaries.
@@ -1444,7 +1504,7 @@ cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer)
"VK_ATTACHMENT_LOAD_OP_CLEAR.\n");
} else if (subpass->do_depth_clear_with_draw ||
subpass->do_stencil_clear_with_draw) {
- perf_debug("Subpass clears DEPTH but loads STENCIL (or viceversa), "
+ perf_debug("Subpass clears DEPTH but loads STENCIL (or vice versa), "
"falling back to vkCmdClearAttachments for "
"VK_ATTACHMENT_LOAD_OP_CLEAR.\n");
}
@@ -1458,23 +1518,212 @@ cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer)
* So the clear is only constrained by the render area and not by pipeline
* state such as scissor or viewport, these are the semantics of
* vkCmdClearAttachments as well.
+ *
+ * Also:
+ *
+ * "If the render pass instance this is recorded in uses multiview, then
+ * baseArrayLayer must be zero and layerCount must be one."
*/
+ assert(state->framebuffer);
+ uint32_t layer_count = cmd_buffer->state.pass->multiview_enabled ?
+ 1 : state->framebuffer->layers;
VkCommandBuffer _cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
VkClearRect rect = {
.rect = state->render_area,
.baseArrayLayer = 0,
- .layerCount = 1,
+ .layerCount = layer_count,
};
v3dv_CmdClearAttachments(_cmd_buffer, att_count, atts, 1, &rect);
}
+bool
+v3dv_cmd_buffer_check_needs_load(const struct v3dv_cmd_buffer_state *state,
+ VkImageAspectFlags aspect,
+ uint32_t first_subpass_idx,
+ VkAttachmentLoadOp load_op,
+ uint32_t last_subpass_idx,
+ VkAttachmentStoreOp store_op)
+{
+ /* We call this with image->vk.aspects & aspect, so 0 means the aspect we are
+ * testing does not exist in the image.
+ */
+ if (!aspect)
+ return false;
+
+ /* Attachment (or view) load operations apply on the first subpass that
+ * uses the attachment (or view), otherwise we always need to load.
+ */
+ if (state->job->first_subpass > first_subpass_idx)
+ return true;
+
+ /* If the job is continuing a subpass started in another job, we always
+ * need to load.
+ */
+ if (state->job->is_subpass_continue)
+ return true;
+
+ /* If the area is not aligned to tile boundaries and we are going to store,
+ * then we need to load to preserve contents outside the render area.
+ */
+ if (!state->tile_aligned_render_area &&
+ v3dv_cmd_buffer_check_needs_store(state, aspect, last_subpass_idx,
+ store_op)) {
+ return true;
+ }
+
+ /* The attachment load operations must be LOAD */
+ return load_op == VK_ATTACHMENT_LOAD_OP_LOAD;
+}
+
+bool
+v3dv_cmd_buffer_check_needs_store(const struct v3dv_cmd_buffer_state *state,
+ VkImageAspectFlags aspect,
+ uint32_t last_subpass_idx,
+ VkAttachmentStoreOp store_op)
+{
+ /* We call this with image->vk.aspects & aspect, so 0 means the aspect we are
+ * testing does not exist in the image.
+ */
+ if (!aspect)
+ return false;
+
+ /* Attachment (or view) store operations only apply on the last subpass
+ * where the attachment (or view) is used, in other subpasses we always
+ * need to store.
+ */
+ if (state->subpass_idx < last_subpass_idx)
+ return true;
+
+ /* Attachment store operations only apply on the last job we emit on the
+ * last subpass where the attachment is used, otherwise we always need to
+ * store.
+ */
+ if (!state->job->is_subpass_finish)
+ return true;
+
+ /* The attachment store operation must be STORE */
+ return store_op == VK_ATTACHMENT_STORE_OP_STORE;
+}
+
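A worked example of the two helpers above (illustrative only, not part of the patch): consider a color attachment whose first and last use is subpass 0, declared with LOAD_OP_CLEAR and STORE_OP_STORE, recorded into a job with first_subpass == 0, is_subpass_continue == false and is_subpass_finish == true, over a tile-aligned render area:

/* v3dv_cmd_buffer_check_needs_load():  aspect present, first subpass, not a
 *    continuation, area tile-aligned, load op is CLEAR  -> false (no load)
 * v3dv_cmd_buffer_check_needs_store(): aspect present, last subpass, last
 *    job of the subpass, store op is STORE              -> true  (store)
 */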
+static void
+cmd_buffer_subpass_check_double_buffer_mode(struct v3dv_cmd_buffer *cmd_buffer,
+ bool msaa)
+{
+ const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
+ struct v3dv_job *job = cmd_buffer->state.job;
+ assert(job);
+
+ job->can_use_double_buffer = false;
+
+ /* Double-buffer can only be used if requested via V3D_DEBUG */
+ if (!V3D_DBG(DOUBLE_BUFFER))
+ return;
+
+ /* Double-buffer cannot be enabled for MSAA jobs */
+ if (msaa)
+ return;
+
+ const struct v3dv_render_pass *pass = state->pass;
+ const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
+
+ /* FIXME: For now we discard multiview jobs (which have an implicit geometry
+ * shader) for this optimization. If we want to enable this with multiview
+ * we would need to check if any view (layer) in any attachment used by the
+ * job has loads and/or stores as we do below for regular attachments. Also,
+ * we would want to have a heuristic that doesn't automatically disable
+ * double-buffer in the presence of geometry shaders.
+ */
+ if (state->pass->multiview_enabled)
+ return;
+
+ /* Tile loads are serialized against stores, in which case we don't get
+ * any benefits from enabling double-buffer and would just pay the price
+ * of a smaller tile size instead. Similarly, we only benefit from
+ * double-buffer if we have tile stores, as the point of this mode is
+ * to execute rendering of a new tile while we store the previous one to
+ * hide latency on the tile store operation.
+ */
+ bool has_stores = false;
+ for (uint32_t i = 0; i < subpass->color_count; i++) {
+ uint32_t attachment_idx = subpass->color_attachments[i].attachment;
+ if (attachment_idx == VK_ATTACHMENT_UNUSED)
+ continue;
+
+ const struct v3dv_render_pass_attachment *attachment =
+ &state->pass->attachments[attachment_idx];
+
+ /* FIXME: This will check 'tile_aligned_render_area' but that was
+ * computed with a tile size without double-buffer. That is okay
+ * because if the larger tile size is aligned then we know the smaller
+ * tile size for double-buffer will be as well. However, we might
+ * still benefit from doing this check with the smaller tile size
+ * because it can happen that the smaller size is aligned and the
+ * larger size is not.
+ */
+ if (v3dv_cmd_buffer_check_needs_load(state,
+ VK_IMAGE_ASPECT_COLOR_BIT,
+ attachment->first_subpass,
+ attachment->desc.loadOp,
+ attachment->last_subpass,
+ attachment->desc.storeOp)) {
+ return;
+ }
+
+ if (v3dv_cmd_buffer_check_needs_store(state,
+ VK_IMAGE_ASPECT_COLOR_BIT,
+ attachment->last_subpass,
+ attachment->desc.storeOp)) {
+ has_stores = true;
+ }
+ }
+
+ if (subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) {
+ uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
+ const struct v3dv_render_pass_attachment *ds_attachment =
+ &state->pass->attachments[ds_attachment_idx];
+
+ const VkImageAspectFlags ds_aspects =
+ vk_format_aspects(ds_attachment->desc.format);
+
+ if (v3dv_cmd_buffer_check_needs_load(state,
+ ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
+ ds_attachment->first_subpass,
+ ds_attachment->desc.loadOp,
+ ds_attachment->last_subpass,
+ ds_attachment->desc.storeOp)) {
+ return;
+ }
+
+ if (v3dv_cmd_buffer_check_needs_load(state,
+ ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
+ ds_attachment->first_subpass,
+ ds_attachment->desc.stencilLoadOp,
+ ds_attachment->last_subpass,
+ ds_attachment->desc.stencilStoreOp)) {
+ return;
+ }
+
+ has_stores |= v3dv_cmd_buffer_check_needs_store(state,
+ ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
+ ds_attachment->last_subpass,
+ ds_attachment->desc.storeOp);
+ has_stores |= v3dv_cmd_buffer_check_needs_store(state,
+ ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
+ ds_attachment->last_subpass,
+ ds_attachment->desc.stencilStoreOp);
+ }
+
+ job->can_use_double_buffer = has_stores;
+}
+
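For reference, a sketch of how the decision above plays out (illustrative only, not part of the patch):

/* Example: V3D_DBG(DOUBLE_BUFFER) set, non-MSAA, non-multiview subpass with
 * a single color attachment using LOAD_OP_CLEAR / STORE_OP_STORE:
 *   - needs_load  -> false (no serializing tile load)
 *   - needs_store -> true  (has_stores = true)
 *   => job->can_use_double_buffer = true
 * Switching the attachment to LOAD_OP_LOAD makes needs_load return true and
 * the function returns early with double-buffer left disabled.
 */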
static struct v3dv_job *
cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t subpass_idx,
- enum v3dv_job_type type)
+ enum v3dv_job_type type,
+ bool is_subpass_start)
{
assert(type == V3DV_JOB_TYPE_GPU_CL ||
- type == V3DV_JOB_TYPE_GPU_CL_SECONDARY);
+ type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE);
struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
assert(subpass_idx < state->pass->subpass_count);
@@ -1488,24 +1737,33 @@ cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer,
if (!job)
return NULL;
+ if (is_subpass_start && cmd_buffer->state.resuming) {
+ assert(subpass_idx == 0);
+ job->resuming = true;
+ }
+
state->subpass_idx = subpass_idx;
/* If we are starting a new job we need to setup binning. We only do this
- * for V3DV_JOB_TYPE_GPU_CL jobs because V3DV_JOB_TYPE_GPU_CL_SECONDARY
+ * for V3DV_JOB_TYPE_GPU_CL jobs because V3DV_JOB_TYPE_GPU_CL_INCOMPLETE
* jobs are not submitted to the GPU directly, and are instead meant to be
- * branched to from other V3DV_JOB_TYPE_GPU_CL jobs.
+ * branched to from other V3DV_JOB_TYPE_GPU_CL jobs. With dynamic rendering,
+ * all resuming jobs work similarly to secondary command buffers, so we
+ * apply the same approach.
*/
if (type == V3DV_JOB_TYPE_GPU_CL &&
- job->first_subpass == state->subpass_idx) {
+ job->first_subpass == state->subpass_idx &&
+ !job->resuming) {
const struct v3dv_subpass *subpass =
&state->pass->subpasses[state->subpass_idx];
const struct v3dv_framebuffer *framebuffer = state->framebuffer;
- uint8_t internal_bpp;
+ uint8_t max_internal_bpp, total_color_bpp;
bool msaa;
v3dv_X(job->device, framebuffer_compute_internal_bpp_msaa)
- (framebuffer, subpass, &internal_bpp, &msaa);
+ (framebuffer, state->attachments, subpass,
+ &max_internal_bpp, &total_color_bpp, &msaa);
/* From the Vulkan spec:
*
@@ -1527,9 +1785,10 @@ cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer,
framebuffer->width,
framebuffer->height,
layers,
- true,
+ true, false,
subpass->color_count,
- internal_bpp,
+ max_internal_bpp,
+ total_color_bpp,
msaa);
}
@@ -1545,28 +1804,29 @@ v3dv_cmd_buffer_subpass_start(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_job *job =
cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx,
- V3DV_JOB_TYPE_GPU_CL);
+ V3DV_JOB_TYPE_GPU_CL, true);
if (!job)
return NULL;
+ /* FIXME: do we need all this below for resuming jobs? */
+
/* Check if our render area is aligned to tile boundaries. We have to do
* this in each subpass because the subset of attachments used can change
* and with that the tile size selected by the hardware can change too.
*/
cmd_buffer_update_tile_alignment(cmd_buffer);
+ /* Decide if we can use double-buffer for this subpass job */
+ cmd_buffer_subpass_check_double_buffer_mode(cmd_buffer, job->frame_tiling.msaa);
+
+ cmd_buffer_update_attachment_resolve_state(cmd_buffer);
+
/* If we can't use TLB clears then we need to emit draw clears for any
* LOAD_OP_CLEAR attachments in this subpass now. We might also need to emit
- * Depth/Stencil clears if we hit GFXH-1461.
- *
- * Secondary command buffers don't start subpasses (and may not even have
- * framebuffer state), so we only care about this in primaries. The only
- * exception could be a secondary runnning inside a subpass that needs to
- * record a meta operation (with its own render pass) that relies on
- * attachment load clears, but we don't have any instances of that right
- * now.
- */
- if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
+ * Depth/Stencil clears if we hit GFXH-1461. With dynamic render passes this
+ * should only be called when starting the render pass, not when resuming.
+ */
+ if (!cmd_buffer->state.resuming)
cmd_buffer_emit_subpass_clears(cmd_buffer);
return job;
@@ -1580,13 +1840,13 @@ v3dv_cmd_buffer_subpass_resume(struct v3dv_cmd_buffer *cmd_buffer,
assert(subpass_idx < cmd_buffer->state.pass->subpass_count);
struct v3dv_job *job;
- if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
+ if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
job = cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx,
- V3DV_JOB_TYPE_GPU_CL);
+ V3DV_JOB_TYPE_GPU_CL, false);
} else {
- assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
+ assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
job = cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx,
- V3DV_JOB_TYPE_GPU_CL_SECONDARY);
+ V3DV_JOB_TYPE_GPU_CL_INCOMPLETE, false);
}
if (!job)
@@ -1611,7 +1871,8 @@ v3dv_cmd_buffer_subpass_finish(struct v3dv_cmd_buffer *cmd_buffer)
}
VKAPI_ATTR void VKAPI_CALL
-v3dv_CmdEndRenderPass(VkCommandBuffer commandBuffer)
+v3dv_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
+ const VkSubpassEndInfo *pSubpassEndInfo)
{
V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
@@ -1645,7 +1906,7 @@ v3dv_EndCommandBuffer(VkCommandBuffer commandBuffer)
* inside a render pass.
*/
if (cmd_buffer->state.job) {
- assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
+ assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
cmd_buffer->state.pass);
v3dv_cmd_buffer_finish_job(cmd_buffer);
}
@@ -1655,26 +1916,73 @@ v3dv_EndCommandBuffer(VkCommandBuffer commandBuffer)
return VK_SUCCESS;
}
-static void
-clone_bo_list(struct v3dv_cmd_buffer *cmd_buffer,
+static bool
+clone_bo_list(struct v3dv_device *device,
struct list_head *dst,
struct list_head *src)
{
- assert(cmd_buffer);
+ assert(device);
list_inithead(dst);
list_for_each_entry(struct v3dv_bo, bo, src, list_link) {
struct v3dv_bo *clone_bo =
- vk_alloc(&cmd_buffer->device->vk.alloc, sizeof(struct v3dv_bo), 8,
+ vk_alloc(&device->vk.alloc, sizeof(struct v3dv_bo), 8,
VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
- if (!clone_bo) {
- v3dv_flag_oom(cmd_buffer, NULL);
- return;
- }
+ if (!clone_bo)
+ return false;
*clone_bo = *bo;
list_addtail(&clone_bo->list_link, dst);
}
+
+ return true;
+}
+
+struct v3dv_job *
+v3dv_job_clone(struct v3dv_job *job, bool skip_bcl)
+{
+ struct v3dv_job *clone = vk_alloc(&job->device->vk.alloc,
+ sizeof(struct v3dv_job), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
+ if (!clone)
+ return NULL;
+
+ /* Cloned jobs don't duplicate resources; they share their CLs with the
+ * original job, since they are typically read-only. The exception to this
+ * is dynamic rendering suspension paired with
+ * VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT, since in that case we need
+ * to patch the BCL with the resume address and for that we need to create a
+ * copy of the job so we avoid rewriting the resume address for another copy
+ * of the same job that may be running on the GPU. When we create a job for
+ * this use case skip_bcl is set to true and the caller will be responsible
+ * for creating the BCL.
+ */
+ *clone = *job;
+ clone->is_clone = true;
+ clone->cmd_buffer = NULL;
+
+ /* We need to regen the BO lists so that they point to the BO list in the
+ * cloned job. Otherwise functions like list_length() will loop forever.
+ */
+ if (job->type == V3DV_JOB_TYPE_GPU_CL) {
+ assert(job->cmd_buffer);
+ struct v3dv_device *device = job->cmd_buffer->device;
+
+ clone->bcl.job = clone;
+ clone->rcl.job = clone;
+ clone->indirect.job = clone;
+
+ if (!skip_bcl &&
+ !clone_bo_list(device, &clone->bcl.bo_list, &job->bcl.bo_list)) {
+ return NULL;
+ }
+ if (!clone_bo_list(device, &clone->rcl.bo_list, &job->rcl.bo_list))
+ return NULL;
+ if (!clone_bo_list(device, &clone->indirect.bo_list, &job->indirect.bo_list))
+ return NULL;
+ }
+
+ return clone;
}
/* Clones a job for inclusion in the given command buffer. Note that this
@@ -1687,31 +1995,29 @@ struct v3dv_job *
v3dv_job_clone_in_cmd_buffer(struct v3dv_job *job,
struct v3dv_cmd_buffer *cmd_buffer)
{
- struct v3dv_job *clone_job = vk_alloc(&job->device->vk.alloc,
- sizeof(struct v3dv_job), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
- if (!clone_job) {
+ struct v3dv_job *clone = v3dv_job_clone(job, false);
+ if (!clone) {
v3dv_flag_oom(cmd_buffer, NULL);
return NULL;
}
- /* Cloned jobs don't duplicate resources! */
- *clone_job = *job;
- clone_job->is_clone = true;
- clone_job->cmd_buffer = cmd_buffer;
- list_addtail(&clone_job->list_link, &cmd_buffer->jobs);
+ clone->cmd_buffer = cmd_buffer;
+ list_addtail(&clone->list_link, &cmd_buffer->jobs);
+ return clone;
+}
- /* We need to regen the BO lists so that they point to the BO list in the
- * cloned job. Otherwise functions like list_length() will loop forever.
- */
- if (job->type == V3DV_JOB_TYPE_GPU_CL) {
- clone_bo_list(cmd_buffer, &clone_job->bcl.bo_list, &job->bcl.bo_list);
- clone_bo_list(cmd_buffer, &clone_job->rcl.bo_list, &job->rcl.bo_list);
- clone_bo_list(cmd_buffer, &clone_job->indirect.bo_list,
- &job->indirect.bo_list);
- }
+void
+v3dv_cmd_buffer_merge_barrier_state(struct v3dv_barrier_state *dst,
+ struct v3dv_barrier_state *src)
+{
+ dst->dst_mask |= src->dst_mask;
- return clone_job;
+ dst->src_mask_graphics |= src->src_mask_graphics;
+ dst->src_mask_compute |= src->src_mask_compute;
+ dst->src_mask_transfer |= src->src_mask_transfer;
+
+ dst->bcl_buffer_access |= src->bcl_buffer_access;
+ dst->bcl_image_access |= src->bcl_image_access;
}
static void
@@ -1719,8 +2025,7 @@ cmd_buffer_execute_outside_pass(struct v3dv_cmd_buffer *primary,
uint32_t cmd_buffer_count,
const VkCommandBuffer *cmd_buffers)
{
- bool pending_barrier = false;
- bool pending_bcl_barrier = false;
+ struct v3dv_barrier_state pending_barrier = { 0 };
for (uint32_t i = 0; i < cmd_buffer_count; i++) {
V3DV_FROM_HANDLE(v3dv_cmd_buffer, secondary, cmd_buffers[i]);
@@ -1743,17 +2048,23 @@ cmd_buffer_execute_outside_pass(struct v3dv_cmd_buffer *primary,
list_for_each_entry(struct v3dv_job, secondary_job,
&secondary->jobs, list_link) {
/* These can only happen inside a render pass */
- assert(secondary_job->type != V3DV_JOB_TYPE_GPU_CL_SECONDARY);
+ assert(secondary_job->type != V3DV_JOB_TYPE_GPU_CL_INCOMPLETE);
struct v3dv_job *job = v3dv_job_clone_in_cmd_buffer(secondary_job, primary);
if (!job)
return;
- if (pending_barrier) {
- job->serialize = true;
- if (pending_bcl_barrier)
+ if (pending_barrier.dst_mask) {
+ /* FIXME: do the same as we do for primaries and only choose the
+ * relevant src masks.
+ */
+ job->serialize = pending_barrier.src_mask_graphics |
+ pending_barrier.src_mask_transfer |
+ pending_barrier.src_mask_compute;
+ if (pending_barrier.bcl_buffer_access ||
+ pending_barrier.bcl_image_access) {
job->needs_bcl_sync = true;
- pending_barrier = false;
- pending_bcl_barrier = false;
+ }
+ memset(&pending_barrier, 0, sizeof(pending_barrier));
}
}
@@ -1761,14 +2072,15 @@ cmd_buffer_execute_outside_pass(struct v3dv_cmd_buffer *primary,
* barrier state consumed with whatever comes after it (first job in
* the next secondary or the primary, if this was the last secondary).
*/
- assert(secondary->state.has_barrier || !secondary->state.has_bcl_barrier);
- pending_barrier = secondary->state.has_barrier;
- pending_bcl_barrier = secondary->state.has_bcl_barrier;
+ assert(secondary->state.barrier.dst_mask ||
+ (!secondary->state.barrier.bcl_buffer_access &&
+ !secondary->state.barrier.bcl_image_access));
+ pending_barrier = secondary->state.barrier;
}
- if (pending_barrier) {
- primary->state.has_barrier = true;
- primary->state.has_bcl_barrier |= pending_bcl_barrier;
+ if (pending_barrier.dst_mask) {
+ v3dv_cmd_buffer_merge_barrier_state(&primary->state.barrier,
+ &pending_barrier);
}
}
@@ -1788,100 +2100,36 @@ v3dv_CmdExecuteCommands(VkCommandBuffer commandBuffer,
}
}
-/* This goes though the list of possible dynamic states in the pipeline and,
- * for those that are not configured as dynamic, copies relevant state into
- * the command buffer.
- */
static void
-cmd_buffer_bind_pipeline_static_state(struct v3dv_cmd_buffer *cmd_buffer,
- const struct v3dv_dynamic_state *src)
-{
- struct v3dv_dynamic_state *dest = &cmd_buffer->state.dynamic;
- uint32_t dynamic_mask = src->mask;
- uint32_t dirty = 0;
-
- if (!(dynamic_mask & V3DV_DYNAMIC_VIEWPORT)) {
- dest->viewport.count = src->viewport.count;
- if (memcmp(&dest->viewport.viewports, &src->viewport.viewports,
- src->viewport.count * sizeof(VkViewport))) {
- typed_memcpy(dest->viewport.viewports,
- src->viewport.viewports,
- src->viewport.count);
- typed_memcpy(dest->viewport.scale, src->viewport.scale,
- src->viewport.count);
- typed_memcpy(dest->viewport.translate, src->viewport.translate,
- src->viewport.count);
- dirty |= V3DV_CMD_DIRTY_VIEWPORT;
- }
- }
-
- if (!(dynamic_mask & V3DV_DYNAMIC_SCISSOR)) {
- dest->scissor.count = src->scissor.count;
- if (memcmp(&dest->scissor.scissors, &src->scissor.scissors,
- src->scissor.count * sizeof(VkRect2D))) {
- typed_memcpy(dest->scissor.scissors,
- src->scissor.scissors, src->scissor.count);
- dirty |= V3DV_CMD_DIRTY_SCISSOR;
- }
- }
-
- if (!(dynamic_mask & V3DV_DYNAMIC_STENCIL_COMPARE_MASK)) {
- if (memcmp(&dest->stencil_compare_mask, &src->stencil_compare_mask,
- sizeof(src->stencil_compare_mask))) {
- dest->stencil_compare_mask = src->stencil_compare_mask;
- dirty |= V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK;
- }
- }
-
- if (!(dynamic_mask & V3DV_DYNAMIC_STENCIL_WRITE_MASK)) {
- if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask,
- sizeof(src->stencil_write_mask))) {
- dest->stencil_write_mask = src->stencil_write_mask;
- dirty |= V3DV_CMD_DIRTY_STENCIL_WRITE_MASK;
- }
- }
-
- if (!(dynamic_mask & V3DV_DYNAMIC_STENCIL_REFERENCE)) {
- if (memcmp(&dest->stencil_reference, &src->stencil_reference,
- sizeof(src->stencil_reference))) {
- dest->stencil_reference = src->stencil_reference;
- dirty |= V3DV_CMD_DIRTY_STENCIL_REFERENCE;
- }
- }
-
- if (!(dynamic_mask & V3DV_DYNAMIC_BLEND_CONSTANTS)) {
- if (memcmp(dest->blend_constants, src->blend_constants,
- sizeof(src->blend_constants))) {
- memcpy(dest->blend_constants, src->blend_constants,
- sizeof(src->blend_constants));
- dirty |= V3DV_CMD_DIRTY_BLEND_CONSTANTS;
- }
- }
-
- if (!(dynamic_mask & V3DV_DYNAMIC_DEPTH_BIAS)) {
- if (memcmp(&dest->depth_bias, &src->depth_bias,
- sizeof(src->depth_bias))) {
- memcpy(&dest->depth_bias, &src->depth_bias, sizeof(src->depth_bias));
- dirty |= V3DV_CMD_DIRTY_DEPTH_BIAS;
- }
- }
-
- if (!(dynamic_mask & V3DV_DYNAMIC_LINE_WIDTH)) {
- if (dest->line_width != src->line_width) {
- dest->line_width = src->line_width;
- dirty |= V3DV_CMD_DIRTY_LINE_WIDTH;
- }
+cmd_buffer_copy_private_dynamic_state(struct v3dv_dynamic_state *dst,
+ struct v3dv_dynamic_state *src,
+ struct vk_dynamic_graphics_state *src_dyn)
+{
+ if (BITSET_TEST(src_dyn->set, MESA_VK_DYNAMIC_VP_VIEWPORTS)) {
+ typed_memcpy(dst->viewport.scale, src->viewport.scale,
+ MAX_VIEWPORTS);
+ typed_memcpy(dst->viewport.translate, src->viewport.translate,
+ MAX_VIEWPORTS);
}
+ if (BITSET_TEST(src_dyn->set, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES))
+ dst->color_write_enable = src->color_write_enable;
+}
- if (!(dynamic_mask & V3DV_DYNAMIC_COLOR_WRITE_ENABLE)) {
- if (dest->color_write_enable != src->color_write_enable) {
- dest->color_write_enable = src->color_write_enable;
- dirty |= V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE;
- }
- }
+/* This function copies relevant static state from the pipeline to the command
+ * buffer state.
+ *
+ * Notice the Vulkan runtime uses the term 'dynamic' to refer to all state
+ * that *could* be dynamic, even if it is not dynamic for a particular
+ * pipeline, so the terminology used in the runtime may be a bit misleading.
+ */
+static void
+cmd_buffer_bind_pipeline_static_state(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_pipeline *pipeline)
+{
+ vk_cmd_set_dynamic_graphics_state(&cmd_buffer->vk, &pipeline->dynamic_graphics_state);
+ cmd_buffer_copy_private_dynamic_state(&cmd_buffer->state.dynamic, &pipeline->dynamic,
+ &pipeline->dynamic_graphics_state);
- cmd_buffer->state.dynamic.mask = dynamic_mask;
- cmd_buffer->state.dirty |= dirty;
}
static void
@@ -1889,13 +2137,17 @@ bind_graphics_pipeline(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_pipeline *pipeline)
{
assert(pipeline && !(pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT));
+
+ /* We need to unconditionally bind the pipeline static state, as the state
+ * could have changed (through calls to vkCmdSetXXX) between bindings of
+ * the same pipeline.
+ */
+ cmd_buffer_bind_pipeline_static_state(cmd_buffer, pipeline);
+
if (cmd_buffer->state.gfx.pipeline == pipeline)
return;
cmd_buffer->state.gfx.pipeline = pipeline;
-
- cmd_buffer_bind_pipeline_static_state(cmd_buffer, &pipeline->dynamic_state);
-
cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PIPELINE;
}
@@ -1935,39 +2187,66 @@ v3dv_CmdBindPipeline(VkCommandBuffer commandBuffer,
}
}
-/* FIXME: C&P from radv. tu has similar code. Perhaps common place? */
+/* Considers the pipeline's negative_one_to_one state and applies it to the
+ * current viewport transform if needed to produce the resulting Z translate
+ * and scale parameters.
+ */
void
-v3dv_viewport_compute_xform(const VkViewport *viewport,
- float scale[3],
- float translate[3])
-{
- float x = viewport->x;
- float y = viewport->y;
- float half_width = 0.5f * viewport->width;
- float half_height = 0.5f * viewport->height;
- double n = viewport->minDepth;
- double f = viewport->maxDepth;
-
- scale[0] = half_width;
- translate[0] = half_width + x;
- scale[1] = half_height;
- translate[1] = half_height + y;
-
- scale[2] = (f - n);
- translate[2] = n;
-
- /* It seems that if the scale is small enough the hardware won't clip
- * correctly so we work around this my choosing the smallest scale that
- * seems to work.
- *
- * This case is exercised by CTS:
- * dEQP-VK.draw.inverted_depth_ranges.nodepthclamp_deltazero
+v3dv_cmd_buffer_state_get_viewport_z_xform(struct v3dv_cmd_buffer *cmd_buffer,
+ uint32_t vp_idx,
+ float *translate_z, float *scale_z)
+{
+ const struct v3dv_viewport_state *vp_state = &cmd_buffer->state.dynamic.viewport;
+ const struct vk_viewport_state *vk_vp_state = &cmd_buffer->vk.dynamic_graphics_state.vp;
+
+ float t = vp_state->translate[vp_idx][2];
+ float s = vp_state->scale[vp_idx][2];
+
+ assert(cmd_buffer->state.gfx.pipeline);
+ if (cmd_buffer->state.gfx.pipeline->negative_one_to_one) {
+ t = (t + vk_vp_state->viewports[vp_idx].maxDepth) * 0.5f;
+ s *= 0.5f;
+ }
+
+ if (translate_z)
+ *translate_z = t;
+
+ if (scale_z)
+ *scale_z = s;
+}
+
+VKAPI_ATTR void VKAPI_CALL
+v3dv_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer,
+ uint32_t attachmentCount,
+ const VkBool32 *pColorWriteEnables)
+{
+ V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+ struct v3dv_dynamic_state *v3dv_dyn = &cmd_buffer->state.dynamic;
+ struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state;
+ uint32_t color_write_enable = 0;
+
+ /* The Vulkan runtime computes color_write_enable as an 8-bit bitset, setting
+ * a bit per attachment. But when emitting, it is combined with the
+ * color_write_mask, which is stored as a 32-bit mask (one bit per channel,
+ * per attachment). So we store the color_write_enable as a 32-bit mask
+ * ourselves.
*/
- const float min_abs_scale = 0.000009f;
- if (fabs(scale[2]) < min_abs_scale)
- scale[2] = min_abs_scale * (scale[2] < 0 ? -1.0f : 1.0f);
+ for (uint32_t i = 0; i < attachmentCount; i++)
+ color_write_enable |= pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0;
+
+ if (v3dv_dyn->color_write_enable == color_write_enable)
+ return;
+
+ v3dv_dyn->color_write_enable = color_write_enable;
+ BITSET_SET(dyn->set, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
+ BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
}
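A minimal standalone sketch of the expansion above (not part of the patch; the helper name is illustrative):

#include <stdbool.h>
#include <stdint.h>

static uint32_t
expand_color_write_enable(uint32_t attachment_count, const bool *enables)
{
   uint32_t mask = 0;
   /* One RGBA nibble per attachment so the result lines up with the 32-bit
    * per-channel color_write_mask it is later combined with.
    */
   for (uint32_t i = 0; i < attachment_count; i++)
      mask |= enables[i] ? (0xfu << (i * 4)) : 0;
   return mask;
}

/* Example: {true, false, true} -> 0x00000f0f (attachments 0 and 2 write all
 * four channels, attachment 1 writes none).
 */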
+/* We keep a custom CmdSetViewport because we want to cache the outcome of
+ * viewport_compute_xform, and because we need to set the viewport count. This
+ * is especially relevant to our case because we are pushing/popping the
+ * dynamic state as part of the meta operations.
+ */
VKAPI_ATTR void VKAPI_CALL
v3dv_CmdSetViewport(VkCommandBuffer commandBuffer,
uint32_t firstViewport,
@@ -1975,63 +2254,55 @@ v3dv_CmdSetViewport(VkCommandBuffer commandBuffer,
const VkViewport *pViewports)
{
V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
- struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
- const uint32_t total_count = firstViewport + viewportCount;
+ struct v3dv_dynamic_state *v3dv_dyn = &cmd_buffer->state.dynamic;
+ struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state;
+ const uint32_t total_count = firstViewport + viewportCount;
assert(firstViewport < MAX_VIEWPORTS);
assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);
- if (state->dynamic.viewport.count < total_count)
- state->dynamic.viewport.count = total_count;
-
- if (!memcmp(state->dynamic.viewport.viewports + firstViewport,
- pViewports, viewportCount * sizeof(*pViewports))) {
- return;
- }
-
- memcpy(state->dynamic.viewport.viewports + firstViewport, pViewports,
- viewportCount * sizeof(*pViewports));
+ vk_common_CmdSetViewportWithCount(commandBuffer,
+ total_count,
+ pViewports);
for (uint32_t i = firstViewport; i < total_count; i++) {
- v3dv_viewport_compute_xform(&state->dynamic.viewport.viewports[i],
- state->dynamic.viewport.scale[i],
- state->dynamic.viewport.translate[i]);
+ v3dv_X(cmd_buffer->device, viewport_compute_xform)
+ (&dyn->vp.viewports[i], v3dv_dyn->viewport.scale[i],
+ v3dv_dyn->viewport.translate[i]);
}
+}
- cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEWPORT;
+VKAPI_ATTR void VKAPI_CALL
+v3dv_CmdSetViewportWithCount(VkCommandBuffer commandBuffer,
+ uint32_t viewportCount,
+ const VkViewport *pViewports)
+{
+ v3dv_CmdSetViewport(commandBuffer, 0, viewportCount, pViewports);
}
+/* We keep a custom CmdSetScissor because we need to set the scissor
+ * count. This is especially relevant to our case because we are
+ * pushing/popping the dynamic state as part of the meta operations.
+ */
VKAPI_ATTR void VKAPI_CALL
v3dv_CmdSetScissor(VkCommandBuffer commandBuffer,
uint32_t firstScissor,
uint32_t scissorCount,
const VkRect2D *pScissors)
{
- V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
- struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
-
assert(firstScissor < MAX_SCISSORS);
assert(firstScissor + scissorCount >= 1 &&
firstScissor + scissorCount <= MAX_SCISSORS);
- if (state->dynamic.scissor.count < firstScissor + scissorCount)
- state->dynamic.scissor.count = firstScissor + scissorCount;
-
- if (!memcmp(state->dynamic.scissor.scissors + firstScissor,
- pScissors, scissorCount * sizeof(*pScissors))) {
- return;
- }
-
- memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors,
- scissorCount * sizeof(*pScissors));
-
- cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_SCISSOR;
+ vk_common_CmdSetScissorWithCount(commandBuffer,
+ firstScissor + scissorCount,
+ pScissors);
}
static void
emit_scissor(struct v3dv_cmd_buffer *cmd_buffer)
{
- if (cmd_buffer->state.dynamic.viewport.count == 0)
+ if (cmd_buffer->vk.dynamic_graphics_state.vp.viewport_count == 0)
return;
struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
@@ -2041,11 +2312,14 @@ emit_scissor(struct v3dv_cmd_buffer *cmd_buffer)
*/
float *vptranslate = dynamic->viewport.translate[0];
float *vpscale = dynamic->viewport.scale[0];
+ assert(vpscale[0] >= 0);
- float vp_minx = -fabsf(vpscale[0]) + vptranslate[0];
- float vp_maxx = fabsf(vpscale[0]) + vptranslate[0];
- float vp_miny = -fabsf(vpscale[1]) + vptranslate[1];
- float vp_maxy = fabsf(vpscale[1]) + vptranslate[1];
+ float vp_minx = vptranslate[0] - vpscale[0];
+ float vp_maxx = vptranslate[0] + vpscale[0];
+
+ /* With VK_KHR_maintenance1 the viewport may have a negative Y scale */
+ float vp_miny = vptranslate[1] - fabsf(vpscale[1]);
+ float vp_maxy = vptranslate[1] + fabsf(vpscale[1]);
/* Quoting from v3dx_emit:
* "Clip to the scissor if it's enabled, but still clip to the
@@ -2074,18 +2348,15 @@ emit_scissor(struct v3dv_cmd_buffer *cmd_buffer)
maxy = MIN2(vp_maxy, cmd_buffer->state.render_area.offset.y +
cmd_buffer->state.render_area.extent.height);
- minx = vp_minx;
- miny = vp_miny;
- maxx = vp_maxx;
- maxy = vp_maxy;
-
/* Clip against user provided scissor if needed.
*
* FIXME: right now we only allow one scissor. Below would need to be
* updated if we support more
*/
- if (dynamic->scissor.count > 0) {
- VkRect2D *scissor = &dynamic->scissor.scissors[0];
+ struct vk_dynamic_graphics_state *vk_dyn =
+ &cmd_buffer->vk.dynamic_graphics_state;
+ if (vk_dyn->vp.scissor_count > 0) {
+ VkRect2D *scissor = &vk_dyn->vp.scissors[0];
minx = MAX2(minx, scissor->offset.x);
miny = MAX2(miny, scissor->offset.y);
maxx = MIN2(maxx, scissor->offset.x + scissor->extent.width);
@@ -2108,12 +2379,11 @@ emit_scissor(struct v3dv_cmd_buffer *cmd_buffer)
v3dv_X(cmd_buffer->device, job_emit_clip_window)
(cmd_buffer->state.job, &cmd_buffer->state.clip_window);
- cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_SCISSOR;
+ BITSET_CLEAR(vk_dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS);
}
-static void
-update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer,
- uint32_t dirty_uniform_state)
+static bool
+update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer)
{
/* We need to update uniform streams if any piece of state that is passed
* to the shader as a uniform may have changed.
@@ -2121,15 +2391,29 @@ update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer,
* If only descriptor sets are dirty then we can safely ignore updates
* for shader stages that don't access descriptors.
*/
-
struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
assert(pipeline);
+ uint32_t dirty = cmd_buffer->state.dirty;
+ struct vk_dynamic_graphics_state *dyn =
+ &cmd_buffer->vk.dynamic_graphics_state;
+
+ const bool dirty_uniform_state =
+ (dirty & (V3DV_CMD_DIRTY_PIPELINE |
+ V3DV_CMD_DIRTY_PUSH_CONSTANTS |
+ V3DV_CMD_DIRTY_DESCRIPTOR_SETS |
+ V3DV_CMD_DIRTY_VIEW_INDEX |
+ V3DV_CMD_DIRTY_DRAW_ID)) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS);
- const bool has_new_pipeline = dirty_uniform_state & V3DV_CMD_DIRTY_PIPELINE;
- const bool has_new_viewport = dirty_uniform_state & V3DV_CMD_DIRTY_VIEWPORT;
- const bool has_new_push_constants = dirty_uniform_state & V3DV_CMD_DIRTY_PUSH_CONSTANTS;
- const bool has_new_descriptors = dirty_uniform_state & V3DV_CMD_DIRTY_DESCRIPTOR_SETS;
- const bool has_new_view_index = dirty_uniform_state & V3DV_CMD_DIRTY_VIEW_INDEX;
+ if (!dirty_uniform_state)
+ return false;
+
+ const bool has_new_pipeline = dirty & V3DV_CMD_DIRTY_PIPELINE;
+ const bool has_new_viewport = BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS);
+ const bool has_new_push_constants = dirty & V3DV_CMD_DIRTY_PUSH_CONSTANTS;
+ const bool has_new_descriptors = dirty & V3DV_CMD_DIRTY_DESCRIPTOR_SETS;
+ const bool has_new_view_index = dirty & V3DV_CMD_DIRTY_VIEW_INDEX;
+ const bool has_new_draw_id = dirty & V3DV_CMD_DIRTY_DRAW_ID;
/* VK_SHADER_STAGE_FRAGMENT_BIT */
const bool has_new_descriptors_fs =
@@ -2143,8 +2427,7 @@ update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer,
const bool needs_fs_update = has_new_pipeline ||
has_new_view_index ||
has_new_push_constants_fs ||
- has_new_descriptors_fs ||
- has_new_view_index;
+ has_new_descriptors_fs;
if (needs_fs_update) {
struct v3dv_shader_variant *fs_variant =
@@ -2198,6 +2481,7 @@ update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer,
const bool needs_vs_update = has_new_viewport ||
has_new_view_index ||
+ has_new_draw_id ||
has_new_pipeline ||
has_new_push_constants_vs ||
has_new_descriptors_vs;
@@ -2217,6 +2501,9 @@ update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer,
}
cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_VIEW_INDEX;
+ cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DRAW_ID;
+
+ return true;
}
/* This stores command buffer state that we might be about to stomp for
@@ -2228,32 +2515,43 @@ v3dv_cmd_buffer_meta_state_push(struct v3dv_cmd_buffer *cmd_buffer,
{
struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
+ /* Attachment state.
+ *
+ * We store this state even if we are not currently in a subpass
+ * (subpass_idx != -1) because we may get here to implement subpass
+ * resolves via vkCmdResolveImage from
+ * cmd_buffer_subpass_handle_pending_resolves. In that scenario we pretend
+ * we are no longer in a subpass because Vulkan disallows image resolves
+ * via vkCmdResolveImage during subpasses, but we still need to preserve
+ * attachment state because we may have more subpasses to go through
+ * after processing resolves in the current subpass.
+ */
+ const uint32_t attachment_state_item_size =
+ sizeof(struct v3dv_cmd_buffer_attachment_state);
+ const uint32_t attachment_state_total_size =
+ attachment_state_item_size * state->attachment_alloc_count;
+ if (state->meta.attachment_alloc_count < state->attachment_alloc_count) {
+ if (state->meta.attachment_alloc_count > 0)
+ vk_free(&cmd_buffer->device->vk.alloc, state->meta.attachments);
+
+ state->meta.attachments = vk_zalloc(&cmd_buffer->device->vk.alloc,
+ attachment_state_total_size, 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
+ if (!state->meta.attachments) {
+ v3dv_flag_oom(cmd_buffer, NULL);
+ return;
+ }
+ state->meta.attachment_alloc_count = state->attachment_alloc_count;
+ }
+ state->meta.attachment_count = state->attachment_alloc_count;
+ memcpy(state->meta.attachments, state->attachments,
+ attachment_state_total_size);
+
if (state->subpass_idx != -1) {
state->meta.subpass_idx = state->subpass_idx;
state->meta.framebuffer = v3dv_framebuffer_to_handle(state->framebuffer);
state->meta.pass = v3dv_render_pass_to_handle(state->pass);
- const uint32_t attachment_state_item_size =
- sizeof(struct v3dv_cmd_buffer_attachment_state);
- const uint32_t attachment_state_total_size =
- attachment_state_item_size * state->attachment_alloc_count;
- if (state->meta.attachment_alloc_count < state->attachment_alloc_count) {
- if (state->meta.attachment_alloc_count > 0)
- vk_free(&cmd_buffer->device->vk.alloc, state->meta.attachments);
-
- state->meta.attachments = vk_zalloc(&cmd_buffer->device->vk.alloc,
- attachment_state_total_size, 8,
- VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
- if (!state->meta.attachments) {
- v3dv_flag_oom(cmd_buffer, NULL);
- return;
- }
- state->meta.attachment_alloc_count = state->attachment_alloc_count;
- }
- state->meta.attachment_count = state->attachment_alloc_count;
- memcpy(state->meta.attachments, state->attachments,
- attachment_state_total_size);
-
state->meta.tile_aligned_render_area = state->tile_aligned_render_area;
memcpy(&state->meta.render_area, &state->render_area, sizeof(VkRect2D));
}
@@ -2262,6 +2560,8 @@ v3dv_cmd_buffer_meta_state_push(struct v3dv_cmd_buffer *cmd_buffer,
* account the graphics pipeline, and the graphics state
*/
state->meta.gfx.pipeline = state->gfx.pipeline;
+ vk_dynamic_graphics_state_copy(&state->meta.dynamic_graphics_state,
+ &cmd_buffer->vk.dynamic_graphics_state);
memcpy(&state->meta.dynamic, &state->dynamic, sizeof(state->dynamic));
struct v3dv_descriptor_state *gfx_descriptor_state =
@@ -2277,35 +2577,35 @@ v3dv_cmd_buffer_meta_state_push(struct v3dv_cmd_buffer *cmd_buffer,
state->meta.has_descriptor_state = false;
}
- /* FIXME: if we keep track of wether we have bound any push constant state
- * at all we could restruct this only to cases where it is actually
- * necessary.
- */
- memcpy(state->meta.push_constants, cmd_buffer->push_constants_data,
- sizeof(state->meta.push_constants));
+ if (cmd_buffer->state.push_constants_size > 0) {
+ state->meta.push_constants_size = cmd_buffer->state.push_constants_size;
+ memcpy(state->meta.push_constants, cmd_buffer->state.push_constants_data,
+ cmd_buffer->state.push_constants_size);
+ cmd_buffer->state.push_constants_size = 0;
+ }
}
/* This restores command buffer state after a meta operation
*/
void
v3dv_cmd_buffer_meta_state_pop(struct v3dv_cmd_buffer *cmd_buffer,
- uint32_t dirty_dynamic_state,
bool needs_subpass_resume)
{
struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
+ /* Attachment state */
+ assert(state->meta.attachment_count <= state->attachment_alloc_count);
+ const uint32_t attachment_state_item_size =
+ sizeof(struct v3dv_cmd_buffer_attachment_state);
+ const uint32_t attachment_state_total_size =
+ attachment_state_item_size * state->meta.attachment_count;
+ memcpy(state->attachments, state->meta.attachments,
+ attachment_state_total_size);
+
if (state->meta.subpass_idx != -1) {
state->pass = v3dv_render_pass_from_handle(state->meta.pass);
state->framebuffer = v3dv_framebuffer_from_handle(state->meta.framebuffer);
- assert(state->meta.attachment_count <= state->attachment_alloc_count);
- const uint32_t attachment_state_item_size =
- sizeof(struct v3dv_cmd_buffer_attachment_state);
- const uint32_t attachment_state_total_size =
- attachment_state_item_size * state->meta.attachment_count;
- memcpy(state->attachments, state->meta.attachments,
- attachment_state_total_size);
-
state->tile_aligned_render_area = state->meta.tile_aligned_render_area;
memcpy(&state->render_area, &state->meta.render_area, sizeof(VkRect2D));
@@ -2331,10 +2631,11 @@ v3dv_cmd_buffer_meta_state_pop(struct v3dv_cmd_buffer *cmd_buffer,
state->gfx.pipeline = NULL;
}
- if (dirty_dynamic_state) {
- memcpy(&state->dynamic, &state->meta.dynamic, sizeof(state->dynamic));
- state->dirty |= dirty_dynamic_state;
- }
+ /* Restore dynamic state */
+ vk_dynamic_graphics_state_copy(&cmd_buffer->vk.dynamic_graphics_state,
+ &state->meta.dynamic_graphics_state);
+ memcpy(&state->dynamic, &state->meta.dynamic, sizeof(state->dynamic));
+ state->dirty = ~0;
if (state->meta.has_descriptor_state) {
if (state->meta.gfx.descriptor_state.valid != 0) {
@@ -2345,14 +2646,23 @@ v3dv_cmd_buffer_meta_state_pop(struct v3dv_cmd_buffer *cmd_buffer,
}
}
- memcpy(cmd_buffer->push_constants_data, state->meta.push_constants,
- sizeof(state->meta.push_constants));
+ /* We only need to restore push constant data if we had any data in the
+ * original command buffer and the meta operation wrote new push constant
+ * data.
+ */
+ if (state->meta.push_constants_size > 0 &&
+ cmd_buffer->state.push_constants_size > 0) {
+ memcpy(cmd_buffer->state.push_constants_data, state->meta.push_constants,
+ state->meta.push_constants_size);
+ }
+ cmd_buffer->state.push_constants_size = state->meta.push_constants_size;
state->meta.gfx.pipeline = NULL;
state->meta.framebuffer = VK_NULL_HANDLE;
state->meta.pass = VK_NULL_HANDLE;
state->meta.subpass_idx = -1;
state->meta.has_descriptor_state = false;
+ state->meta.push_constants_size = 0;
}
static struct v3dv_job *
@@ -2399,7 +2709,7 @@ cmd_buffer_pre_draw_split_job(struct v3dv_cmd_buffer *cmd_buffer)
* in rasterization."
*
* We need to enable MSAA in the TILE_BINNING_MODE_CFG packet, which we
- * emit when we start a new frame at the begining of a subpass. At that point,
+ * emit when we start a new frame at the beginning of a subpass. At that point,
* if the framebuffer doesn't have any attachments we won't enable MSAA and
* the job won't be valid in the scenario described by the spec.
*
@@ -2434,7 +2744,7 @@ cmd_buffer_restart_job_for_msaa_if_needed(struct v3dv_cmd_buffer *cmd_buffer)
* draw calls in them, and then using that info to decide if we need to
* restart the primary job into which they are being recorded.
*/
- if (cmd_buffer->level != VK_COMMAND_BUFFER_LEVEL_PRIMARY)
+ if (cmd_buffer->vk.level != VK_COMMAND_BUFFER_LEVEL_PRIMARY)
return;
/* Drop the current job and restart it with MSAA enabled */
@@ -2457,16 +2767,185 @@ cmd_buffer_restart_job_for_msaa_if_needed(struct v3dv_cmd_buffer *cmd_buffer)
old_job->frame_tiling.width,
old_job->frame_tiling.height,
old_job->frame_tiling.layers,
- true,
+ true, false,
old_job->frame_tiling.render_target_count,
old_job->frame_tiling.internal_bpp,
+ old_job->frame_tiling.total_color_bpp,
true /* msaa */);
v3dv_job_destroy(old_job);
}
+static bool
+cmd_buffer_binning_sync_required(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_pipeline *pipeline,
+ bool indexed, bool indirect)
+{
+ const struct v3dv_descriptor_maps *vs_bin_maps =
+ pipeline->shared_data->maps[BROADCOM_SHADER_VERTEX_BIN];
+
+ const struct v3dv_descriptor_maps *gs_bin_maps =
+ pipeline->shared_data->maps[BROADCOM_SHADER_GEOMETRY_BIN];
+
+ VkAccessFlags buffer_access =
+ cmd_buffer->state.barrier.bcl_buffer_access;
+ if (buffer_access) {
+ /* Index buffer read */
+ if (indexed && (buffer_access & (VK_ACCESS_2_INDEX_READ_BIT |
+ VK_ACCESS_2_MEMORY_READ_BIT))) {
+ return true;
+ }
+
+ /* Indirect buffer read */
+ if (indirect && (buffer_access & (VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT |
+ VK_ACCESS_2_MEMORY_READ_BIT))) {
+ return true;
+ }
+
+ /* Attribute read */
+ if (buffer_access & (VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT |
+ VK_ACCESS_2_MEMORY_READ_BIT)) {
+ const struct v3d_vs_prog_data *prog_data =
+ pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]->prog_data.vs;
+
+ for (int i = 0; i < ARRAY_SIZE(prog_data->vattr_sizes); i++) {
+ if (prog_data->vattr_sizes[i] > 0)
+ return true;
+ }
+ }
+
+ /* UBO / SSBO read */
+ if (buffer_access & (VK_ACCESS_2_UNIFORM_READ_BIT |
+ VK_ACCESS_2_SHADER_READ_BIT |
+ VK_ACCESS_2_MEMORY_READ_BIT |
+ VK_ACCESS_2_SHADER_STORAGE_READ_BIT)) {
+
+ if (vs_bin_maps->ubo_map.num_desc > 0 ||
+ vs_bin_maps->ssbo_map.num_desc > 0) {
+ return true;
+ }
+
+ if (gs_bin_maps && (gs_bin_maps->ubo_map.num_desc > 0 ||
+ gs_bin_maps->ssbo_map.num_desc > 0)) {
+ return true;
+ }
+ }
+
+ /* SSBO write */
+ if (buffer_access & (VK_ACCESS_2_SHADER_WRITE_BIT |
+ VK_ACCESS_2_MEMORY_WRITE_BIT |
+ VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT)) {
+ if (vs_bin_maps->ssbo_map.num_desc > 0)
+ return true;
+
+ if (gs_bin_maps && gs_bin_maps->ssbo_map.num_desc > 0)
+ return true;
+ }
+
+ /* Texel Buffer read */
+ if (buffer_access & (VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
+ VK_ACCESS_2_MEMORY_READ_BIT)) {
+ if (vs_bin_maps->texture_map.num_desc > 0)
+ return true;
+
+ if (gs_bin_maps && gs_bin_maps->texture_map.num_desc > 0)
+ return true;
+ }
+ }
+
+ VkAccessFlags image_access =
+ cmd_buffer->state.barrier.bcl_image_access;
+ if (image_access) {
+ /* Image load / store */
+ if (image_access & (VK_ACCESS_2_SHADER_READ_BIT |
+ VK_ACCESS_2_SHADER_WRITE_BIT |
+ VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
+ VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
+ VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT |
+ VK_ACCESS_2_MEMORY_READ_BIT |
+ VK_ACCESS_2_MEMORY_WRITE_BIT)) {
+ if (vs_bin_maps->texture_map.num_desc > 0 ||
+ vs_bin_maps->sampler_map.num_desc > 0) {
+ return true;
+ }
+
+ if (gs_bin_maps && (gs_bin_maps->texture_map.num_desc > 0 ||
+ gs_bin_maps->sampler_map.num_desc > 0)) {
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
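To make the intent above concrete (illustrative only, not part of the patch):

/* Example: if the pending BCL barrier state includes
 * VK_ACCESS_2_INDEX_READ_BIT in bcl_buffer_access, any subsequent
 * vkCmdDrawIndexed returns true here (the binner reads the index buffer)
 * and the job gets needs_bcl_sync. A non-indexed, non-indirect draw whose
 * binning stages read no attributes, UBOs, SSBOs, texel buffers or images
 * returns false and only needs render-stage serialization.
 */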
void
-v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer)
+v3dv_cmd_buffer_consume_bcl_sync(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_job *job)
+{
+ job->needs_bcl_sync = true;
+ cmd_buffer->state.barrier.bcl_buffer_access = 0;
+ cmd_buffer->state.barrier.bcl_image_access = 0;
+}
+
+static inline uint32_t
+compute_prog_score(struct v3dv_shader_variant *vs)
+{
+ const uint32_t inst_count = vs->qpu_insts_size / sizeof(uint64_t);
+ const uint32_t tmu_count = vs->prog_data.base->tmu_count +
+ vs->prog_data.base->tmu_spills +
+ vs->prog_data.base->tmu_fills;
+ return inst_count + 4 * tmu_count;
+}
+
+static void
+job_update_double_buffer_score(struct v3dv_job *job,
+ struct v3dv_pipeline *pipeline,
+ uint32_t vertex_count,
+ VkExtent2D *render_area)
+{
+ /* FIXME: assume anything with GS workloads is too expensive */
+ struct v3dv_shader_variant *gs_bin =
+ pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN];
+ if (gs_bin) {
+ job->can_use_double_buffer = false;
+ return;
+ }
+
+ /* Keep track of vertex processing: too much geometry processing would not
+ * be good for double-buffer.
+ */
+ struct v3dv_shader_variant *vs_bin =
+ pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN];
+ assert(vs_bin);
+ uint32_t geom_score = vertex_count * compute_prog_score(vs_bin);
+
+ struct v3dv_shader_variant *vs =
+ pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX];
+ assert(vs);
+ uint32_t vs_score = vertex_count * compute_prog_score(vs);
+ geom_score += vs_score;
+
+ job->double_buffer_score.geom += geom_score;
+
+ /* Compute pixel rendering cost.
+ *
+ * We estimate that on average a draw would render 0.2% of the pixels in
+ * the render area. That would be a 64x64 region in a 1920x1080 area.
+ */
+ struct v3dv_shader_variant *fs =
+ pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT];
+ assert(fs);
+ uint32_t pixel_count = 0.002f * render_area->width * render_area->height;
+ uint32_t render_score = vs_score + pixel_count * compute_prog_score(fs);
+
+ job->double_buffer_score.render += render_score;
+}
+
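A worked example of the scoring above with made-up shader sizes (illustrative only, not part of the patch):

/* compute_prog_score() = instructions + 4 * TMU accesses.
 * Say the binning VS scores 140 (100 insts, 10 TMU), the render VS 280 and
 * the FS 270, for a draw of 1000 vertices into a 1920x1080 render area:
 *   geom   += 1000 * 140 + 1000 * 280        = 420000
 *   pixels  = 0.002 * 1920 * 1080           ~= 4147
 *   render += 1000 * 280 + 4147 * 270       ~= 1399690
 * The per-draw scores accumulate so the cost of the smaller double-buffer
 * tile size can be weighed against its latency-hiding benefit.
 */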
+void
+v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer,
+ bool indexed, bool indirect,
+ uint32_t vertex_count)
{
assert(cmd_buffer->state.gfx.pipeline);
assert(!(cmd_buffer->state.gfx.pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT));
@@ -2489,6 +2968,23 @@ v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer)
struct v3dv_job *job = cmd_buffer_pre_draw_split_job(cmd_buffer);
job->draw_count++;
+ /* Track VK_KHR_buffer_device_address usage in the job */
+ struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
+ job->uses_buffer_device_address |= pipeline->uses_buffer_device_address;
+
+ /* If this job is serialized (has consumed a barrier) then check if we need
+ * to sync at the binning stage by testing if the binning shaders involved
+ * with the draw call require access to external resources.
+ */
+ if (job->serialize && (cmd_buffer->state.barrier.bcl_buffer_access ||
+ cmd_buffer->state.barrier.bcl_image_access)) {
+ assert(!job->needs_bcl_sync);
+ if (cmd_buffer_binning_sync_required(cmd_buffer, pipeline,
+ indexed, indirect)) {
+ v3dv_cmd_buffer_consume_bcl_sync(cmd_buffer, job);
+ }
+ }
+
/* GL shader state binds shaders, uniform and vertex attribute state. The
* compiler injects uniforms to handle some descriptor types (such as
* textures), so we need to regen that when descriptor state changes.
@@ -2497,62 +2993,84 @@ v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer)
* that will require that we new uniform state for QUNIFORM_VIEWPORT_*.
*/
uint32_t *dirty = &cmd_buffer->state.dirty;
+ struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state;
- const uint32_t dirty_uniform_state =
- *dirty & (V3DV_CMD_DIRTY_PIPELINE |
- V3DV_CMD_DIRTY_PUSH_CONSTANTS |
- V3DV_CMD_DIRTY_DESCRIPTOR_SETS |
- V3DV_CMD_DIRTY_VIEWPORT |
- V3DV_CMD_DIRTY_VIEW_INDEX);
-
- if (dirty_uniform_state)
- update_gfx_uniform_state(cmd_buffer, dirty_uniform_state);
+ const bool dirty_uniform_state =
+ update_gfx_uniform_state(cmd_buffer);
struct v3dv_device *device = cmd_buffer->device;
if (dirty_uniform_state || (*dirty & V3DV_CMD_DIRTY_VERTEX_BUFFER))
v3dv_X(device, cmd_buffer_emit_gl_shader_state)(cmd_buffer);
- if (*dirty & (V3DV_CMD_DIRTY_PIPELINE)) {
+ if (*dirty & (V3DV_CMD_DIRTY_PIPELINE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE)) {
v3dv_X(device, cmd_buffer_emit_configuration_bits)(cmd_buffer);
+ }
+
+ if (*dirty & (V3DV_CMD_DIRTY_PIPELINE)) {
v3dv_X(device, cmd_buffer_emit_varyings_state)(cmd_buffer);
}
- if (*dirty & (V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR)) {
+ if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS)) {
emit_scissor(cmd_buffer);
}
- if (*dirty & V3DV_CMD_DIRTY_VIEWPORT) {
+ if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS))
v3dv_X(device, cmd_buffer_emit_viewport)(cmd_buffer);
- }
if (*dirty & V3DV_CMD_DIRTY_INDEX_BUFFER)
v3dv_X(device, cmd_buffer_emit_index_buffer)(cmd_buffer);
- const uint32_t dynamic_stencil_dirty_flags =
- V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK |
- V3DV_CMD_DIRTY_STENCIL_WRITE_MASK |
- V3DV_CMD_DIRTY_STENCIL_REFERENCE;
- if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | dynamic_stencil_dirty_flags))
+ bool any_dynamic_stencil_dirty =
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP);
+
+ if (*dirty & V3DV_CMD_DIRTY_PIPELINE || any_dynamic_stencil_dirty)
v3dv_X(device, cmd_buffer_emit_stencil)(cmd_buffer);
- if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_DEPTH_BIAS))
+ if (*dirty & V3DV_CMD_DIRTY_PIPELINE ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS)) {
v3dv_X(device, cmd_buffer_emit_depth_bias)(cmd_buffer);
+ }
- if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_BLEND_CONSTANTS))
+ if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS))
+ v3dv_X(device, cmd_buffer_emit_depth_bounds)(cmd_buffer);
+
+ if (*dirty & V3DV_CMD_DIRTY_PIPELINE ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS)) {
v3dv_X(device, cmd_buffer_emit_blend)(cmd_buffer);
+ }
if (*dirty & V3DV_CMD_DIRTY_OCCLUSION_QUERY)
v3dv_X(device, cmd_buffer_emit_occlusion_query)(cmd_buffer);
- if (*dirty & V3DV_CMD_DIRTY_LINE_WIDTH)
+ if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH))
v3dv_X(device, cmd_buffer_emit_line_width)(cmd_buffer);
if (*dirty & V3DV_CMD_DIRTY_PIPELINE)
v3dv_X(device, cmd_buffer_emit_sample_state)(cmd_buffer);
- if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE))
+ if (*dirty & V3DV_CMD_DIRTY_PIPELINE ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES)) {
v3dv_X(device, cmd_buffer_emit_color_write_mask)(cmd_buffer);
+ }
+
+ /* We disable double-buffer mode if indirect draws are used because in that
+ * case we don't know the vertex count.
+ */
+ if (indirect) {
+ job->can_use_double_buffer = false;
+ } else if (job->can_use_double_buffer) {
+ job_update_double_buffer_score(job, pipeline, vertex_count,
+ &cmd_buffer->state.render_area.extent);
+ }
cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_PIPELINE;
}
@@ -2561,18 +3079,23 @@ static inline void
cmd_buffer_set_view_index(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t view_index)
{
- cmd_buffer->state.view_index = view_index;
- cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEW_INDEX;
+ if (view_index != cmd_buffer->state.view_index) {
+ cmd_buffer->state.view_index = view_index;
+ cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEW_INDEX;
+ }
}
static void
cmd_buffer_draw(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_draw_info *info)
{
+ uint32_t vertex_count =
+ info->vertex_count * info->instance_count;
struct v3dv_render_pass *pass = cmd_buffer->state.pass;
if (likely(!pass->multiview_enabled)) {
- v3dv_cmd_buffer_emit_pre_draw(cmd_buffer);
+ cmd_buffer_set_view_index(cmd_buffer, 0);
+ v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, false, vertex_count);
v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw)(cmd_buffer, info);
return;
}
@@ -2580,7 +3103,7 @@ cmd_buffer_draw(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask;
while (view_mask) {
cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask));
- v3dv_cmd_buffer_emit_pre_draw(cmd_buffer);
+ v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, false, vertex_count);
v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw)(cmd_buffer, info);
}
}
@@ -2606,6 +3129,35 @@ v3dv_CmdDraw(VkCommandBuffer commandBuffer,
}
VKAPI_ATTR void VKAPI_CALL
+v3dv_CmdDrawMultiEXT(VkCommandBuffer commandBuffer,
+ uint32_t drawCount,
+ const VkMultiDrawInfoEXT *pVertexInfo,
+ uint32_t instanceCount,
+ uint32_t firstInstance,
+ uint32_t stride)
+
+{
+ if (drawCount == 0 || instanceCount == 0)
+ return;
+
+ V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+
+ uint32_t i = 0;
+ vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
+ cmd_buffer->state.draw_id = i;
+ cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DRAW_ID;
+
+ struct v3dv_draw_info info = {};
+ info.vertex_count = draw->vertexCount;
+ info.instance_count = instanceCount;
+ info.first_instance = firstInstance;
+ info.first_vertex = draw->firstVertex;
+
+ cmd_buffer_draw(cmd_buffer, &info);
+ }
+}
+
+VKAPI_ATTR void VKAPI_CALL
v3dv_CmdDrawIndexed(VkCommandBuffer commandBuffer,
uint32_t indexCount,
uint32_t instanceCount,
@@ -2618,9 +3170,12 @@ v3dv_CmdDrawIndexed(VkCommandBuffer commandBuffer,
V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+ uint32_t vertex_count = indexCount * instanceCount;
+
struct v3dv_render_pass *pass = cmd_buffer->state.pass;
if (likely(!pass->multiview_enabled)) {
- v3dv_cmd_buffer_emit_pre_draw(cmd_buffer);
+ cmd_buffer_set_view_index(cmd_buffer, 0);
+ v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, false, vertex_count);
v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed)
(cmd_buffer, indexCount, instanceCount,
firstIndex, vertexOffset, firstInstance);
@@ -2630,7 +3185,7 @@ v3dv_CmdDrawIndexed(VkCommandBuffer commandBuffer,
uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask;
while (view_mask) {
cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask));
- v3dv_cmd_buffer_emit_pre_draw(cmd_buffer);
+ v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, false, vertex_count);
v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed)
(cmd_buffer, indexCount, instanceCount,
firstIndex, vertexOffset, firstInstance);
@@ -2638,6 +3193,48 @@ v3dv_CmdDrawIndexed(VkCommandBuffer commandBuffer,
}
VKAPI_ATTR void VKAPI_CALL
+v3dv_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer,
+ uint32_t drawCount,
+ const VkMultiDrawIndexedInfoEXT *pIndexInfo,
+ uint32_t instanceCount,
+ uint32_t firstInstance,
+ uint32_t stride,
+ const int32_t *pVertexOffset)
+{
+ if (drawCount == 0 || instanceCount == 0)
+ return;
+
+ V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+
+ uint32_t i = 0;
+ vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
+ uint32_t vertex_count = draw->indexCount * instanceCount;
+ int32_t vertexOffset = pVertexOffset ? *pVertexOffset : draw->vertexOffset;
+
+ cmd_buffer->state.draw_id = i;
+ cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DRAW_ID;
+
+ struct v3dv_render_pass *pass = cmd_buffer->state.pass;
+ if (likely(!pass->multiview_enabled)) {
+ cmd_buffer_set_view_index(cmd_buffer, 0);
+ v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, false, vertex_count);
+ v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed)
+ (cmd_buffer, draw->indexCount, instanceCount,
+ draw->firstIndex, vertexOffset, firstInstance);
+ continue;
+ }
+ uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask;
+ while (view_mask) {
+ cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask));
+ v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, false, vertex_count);
+ v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed)
+ (cmd_buffer, draw->indexCount, instanceCount,
+ draw->firstIndex, vertexOffset, firstInstance);
+ }
+ }
+}
+
+VKAPI_ATTR void VKAPI_CALL
v3dv_CmdDrawIndirect(VkCommandBuffer commandBuffer,
VkBuffer _buffer,
VkDeviceSize offset,
@@ -2653,7 +3250,8 @@ v3dv_CmdDrawIndirect(VkCommandBuffer commandBuffer,
struct v3dv_render_pass *pass = cmd_buffer->state.pass;
if (likely(!pass->multiview_enabled)) {
- v3dv_cmd_buffer_emit_pre_draw(cmd_buffer);
+ cmd_buffer_set_view_index(cmd_buffer, 0);
+ v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, true, 0);
v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indirect)
(cmd_buffer, buffer, offset, drawCount, stride);
return;
@@ -2662,7 +3260,7 @@ v3dv_CmdDrawIndirect(VkCommandBuffer commandBuffer,
uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask;
while (view_mask) {
cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask));
- v3dv_cmd_buffer_emit_pre_draw(cmd_buffer);
+ v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, true, 0);
v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indirect)
(cmd_buffer, buffer, offset, drawCount, stride);
}
@@ -2684,7 +3282,8 @@ v3dv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
struct v3dv_render_pass *pass = cmd_buffer->state.pass;
if (likely(!pass->multiview_enabled)) {
- v3dv_cmd_buffer_emit_pre_draw(cmd_buffer);
+ cmd_buffer_set_view_index(cmd_buffer, 0);
+ v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, true, 0);
v3dv_X(cmd_buffer->device, cmd_buffer_emit_indexed_indirect)
(cmd_buffer, buffer, offset, drawCount, stride);
return;
@@ -2693,64 +3292,173 @@ v3dv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask;
while (view_mask) {
cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask));
- v3dv_cmd_buffer_emit_pre_draw(cmd_buffer);
+ v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, true, 0);
v3dv_X(cmd_buffer->device, cmd_buffer_emit_indexed_indirect)
(cmd_buffer, buffer, offset, drawCount, stride);
}
}
-VKAPI_ATTR void VKAPI_CALL
-v3dv_CmdPipelineBarrier(VkCommandBuffer commandBuffer,
- VkPipelineStageFlags srcStageMask,
- VkPipelineStageFlags dstStageMask,
- VkDependencyFlags dependencyFlags,
- uint32_t memoryBarrierCount,
- const VkMemoryBarrier *pMemoryBarriers,
- uint32_t bufferBarrierCount,
- const VkBufferMemoryBarrier *pBufferBarriers,
- uint32_t imageBarrierCount,
- const VkImageMemoryBarrier *pImageBarriers)
+static void
+handle_barrier(VkPipelineStageFlags2 srcStageMask, VkAccessFlags2 srcAccessMask,
+ VkPipelineStageFlags2 dstStageMask, VkAccessFlags2 dstAccessMask,
+ bool is_image_barrier, bool is_buffer_barrier,
+ struct v3dv_barrier_state *state)
{
- V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
-
/* We only care about barriers between GPU jobs */
- if (srcStageMask == VK_PIPELINE_STAGE_HOST_BIT ||
- dstStageMask == VK_PIPELINE_STAGE_HOST_BIT) {
+ if (srcStageMask == VK_PIPELINE_STAGE_2_HOST_BIT ||
+ dstStageMask == VK_PIPELINE_STAGE_2_HOST_BIT) {
return;
}
+ /* Track source of the barrier */
+ uint8_t src_mask = 0;
+
+ const VkPipelineStageFlags2 compute_mask =
+ VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT;
+ if (srcStageMask & (compute_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
+ src_mask |= V3DV_BARRIER_COMPUTE_BIT;
+
+ const VkPipelineStageFlags2 transfer_mask =
+ VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT |
+ VK_PIPELINE_STAGE_2_COPY_BIT |
+ VK_PIPELINE_STAGE_2_BLIT_BIT |
+ VK_PIPELINE_STAGE_2_CLEAR_BIT;
+ if (srcStageMask & (transfer_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
+ src_mask |= V3DV_BARRIER_TRANSFER_BIT;
+
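+ /* Any stage that is not compute or transfer is treated as graphics
+ * for barrier tracking purposes.
+ */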
+ const VkPipelineStageFlags2 graphics_mask = ~(compute_mask | transfer_mask);
+ if (srcStageMask & (graphics_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
+ src_mask |= V3DV_BARRIER_GRAPHICS_BIT;
+
+ /* Track consumer of the barrier */
+ if (dstStageMask & (compute_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
+ state->dst_mask |= V3DV_BARRIER_COMPUTE_BIT;
+ state->src_mask_compute |= src_mask;
+ }
+
+ if (dstStageMask & (transfer_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
+ state->dst_mask |= V3DV_BARRIER_TRANSFER_BIT;
+ state->src_mask_transfer |= src_mask;
+ }
+
+ if (dstStageMask & (graphics_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
+ state->dst_mask |= V3DV_BARRIER_GRAPHICS_BIT;
+ state->src_mask_graphics |= src_mask;
+
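+ /* Record accesses that may be consumed by binning (BCL) stages; this
+ * is later used to decide whether BCL-level synchronization is needed.
+ */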
+ if (dstStageMask & (VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT |
+ VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT |
+ VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
+ VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT |
+ VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
+ VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
+ VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
+ VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT |
+ VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
+ VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT |
+ VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT |
+ VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
+ if (is_image_barrier)
+ state->bcl_image_access |= dstAccessMask;
+
+ if (is_buffer_barrier)
+ state->bcl_buffer_access |= dstAccessMask;
+ }
+ }
+}
+
+void
+v3dv_cmd_buffer_emit_pipeline_barrier(struct v3dv_cmd_buffer *cmd_buffer,
+ const VkDependencyInfo *info)
+{
+ uint32_t imageBarrierCount = info->imageMemoryBarrierCount;
+ const VkImageMemoryBarrier2 *pImageBarriers = info->pImageMemoryBarriers;
+
+ uint32_t bufferBarrierCount = info->bufferMemoryBarrierCount;
+ const VkBufferMemoryBarrier2 *pBufferBarriers = info->pBufferMemoryBarriers;
+
+ uint32_t memoryBarrierCount = info->memoryBarrierCount;
+ const VkMemoryBarrier2 *pMemoryBarriers = info->pMemoryBarriers;
+
+ struct v3dv_barrier_state state = { 0 };
+ for (uint32_t i = 0; i < imageBarrierCount; i++) {
+ /* We can safely skip barriers for image layout transitions from UNDEFINED
+ * layout.
+ *
+ * Notice that KHR_synchronization2 allows specifying barriers that don't
+ * involve a layout transition by making oldLayout and newLayout the same,
+ * including UNDEFINED.
+ */
+ if (pImageBarriers[i].oldLayout == VK_IMAGE_LAYOUT_UNDEFINED &&
+ pImageBarriers[i].oldLayout != pImageBarriers[i].newLayout) {
+ continue;
+ }
+
+ handle_barrier(pImageBarriers[i].srcStageMask,
+ pImageBarriers[i].srcAccessMask,
+ pImageBarriers[i].dstStageMask,
+ pImageBarriers[i].dstAccessMask,
+ true, false, &state);
+ }
+
+ for (uint32_t i = 0; i < bufferBarrierCount; i++) {
+ handle_barrier(pBufferBarriers[i].srcStageMask,
+ pBufferBarriers[i].srcAccessMask,
+ pBufferBarriers[i].dstStageMask,
+ pBufferBarriers[i].dstAccessMask,
+ false, true, &state);
+ }
+
+ for (uint32_t i = 0; i < memoryBarrierCount; i++) {
+ handle_barrier(pMemoryBarriers[i].srcStageMask,
+ pMemoryBarriers[i].srcAccessMask,
+ pMemoryBarriers[i].dstStageMask,
+ pMemoryBarriers[i].dstAccessMask,
+ true, true, &state);
+ }
+
+ /* Bail if we don't have any relevant barriers */
+ if (!state.dst_mask)
+ return;
+
/* If we have a recording job, finish it here */
- struct v3dv_job *job = cmd_buffer->state.job;
- if (job)
+ if (cmd_buffer->state.job)
v3dv_cmd_buffer_finish_job(cmd_buffer);
- cmd_buffer->state.has_barrier = true;
- if (dstStageMask & (VK_PIPELINE_STAGE_VERTEX_INPUT_BIT |
- VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
- VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT |
- VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT |
- VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT |
- VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT)) {
- cmd_buffer->state.has_bcl_barrier = true;
- }
+ /* Update barrier state in the command buffer */
+ v3dv_cmd_buffer_merge_barrier_state(&cmd_buffer->state.barrier, &state);
}
VKAPI_ATTR void VKAPI_CALL
-v3dv_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
- uint32_t firstBinding,
- uint32_t bindingCount,
- const VkBuffer *pBuffers,
- const VkDeviceSize *pOffsets)
+v3dv_CmdPipelineBarrier2(VkCommandBuffer commandBuffer,
+ const VkDependencyInfo *pDependencyInfo)
{
V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
- struct v3dv_vertex_binding *vb = cmd_buffer->state.vertex_bindings;
+ v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, pDependencyInfo);
+}
- /* We have to defer setting up vertex buffer since we need the buffer
- * stride from the pipeline.
- */
+VKAPI_ATTR void VKAPI_CALL
+v3dv_CmdBindVertexBuffers2(VkCommandBuffer commandBuffer,
+ uint32_t firstBinding,
+ uint32_t bindingCount,
+ const VkBuffer *pBuffers,
+ const VkDeviceSize *pOffsets,
+ const VkDeviceSize *pSizes,
+ const VkDeviceSize *pStrides)
+{
+ V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+ struct v3dv_vertex_binding *vb = cmd_buffer->state.vertex_bindings;
assert(firstBinding + bindingCount <= MAX_VBS);
bool vb_state_changed = false;
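+ /* Strides are tracked as common dynamic state; the dirty bit is only
+ * set when the values actually change.
+ */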
+ if (pStrides) {
+ vk_cmd_set_vertex_binding_strides(&cmd_buffer->vk,
+ firstBinding, bindingCount,
+ pStrides);
+ struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state;
+ if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDING_STRIDES))
+ vb_state_changed = true;
+ }
+ /* FIXME: at this moment we don't do anything with pSizes. */
for (uint32_t i = 0; i < bindingCount; i++) {
if (vb[firstBinding + i].buffer != v3dv_buffer_from_handle(pBuffers[i])) {
vb[firstBinding + i].buffer = v3dv_buffer_from_handle(pBuffers[i]);
@@ -2766,24 +3474,6 @@ v3dv_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VERTEX_BUFFER;
}
-static uint32_t
-get_index_size(VkIndexType index_type)
-{
- switch (index_type) {
- case VK_INDEX_TYPE_UINT8_EXT:
- return 1;
- break;
- case VK_INDEX_TYPE_UINT16:
- return 2;
- break;
- case VK_INDEX_TYPE_UINT32:
- return 4;
- break;
- default:
- unreachable("Unsupported index type");
- }
-}
-
VKAPI_ATTR void VKAPI_CALL
v3dv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
VkBuffer buffer,
@@ -2792,7 +3482,7 @@ v3dv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
{
V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
- const uint32_t index_size = get_index_size(indexType);
+ const uint32_t index_size = vk_index_type_to_bytes(indexType);
if (buffer == cmd_buffer->state.index_buffer.buffer &&
offset == cmd_buffer->state.index_buffer.offset &&
index_size == cmd_buffer->state.index_buffer.index_size) {
@@ -2806,82 +3496,309 @@ v3dv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
}
VKAPI_ATTR void VKAPI_CALL
-v3dv_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer,
- VkStencilFaceFlags faceMask,
- uint32_t compareMask)
+v3dv_CmdSetLineStippleEXT(VkCommandBuffer commandBuffer,
+ uint32_t lineStippleFactor,
+ uint16_t lineStipplePattern)
{
- V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
-
- if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
- cmd_buffer->state.dynamic.stencil_compare_mask.front = compareMask & 0xff;
- if (faceMask & VK_STENCIL_FACE_BACK_BIT)
- cmd_buffer->state.dynamic.stencil_compare_mask.back = compareMask & 0xff;
-
- cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK;
+ /* We do not support stippled line rasterization so we just ignore this. */
}
-VKAPI_ATTR void VKAPI_CALL
-v3dv_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer,
- VkStencilFaceFlags faceMask,
- uint32_t writeMask)
+/**
+ * This checks a descriptor set to see if we are binding any descriptors that would
+ * involve sampling from a linear image (the hardware only supports this for
+ * 1D images), and if so, attempts to create a tiled copy of the linear image
+ * and rewrite the descriptor set to use that instead.
+ *
+ * This was added to support a scenario with Android where some part of the UI
+ * wanted to show previews of linear swapchain images. For more details:
+ * https://gitlab.freedesktop.org/mesa/mesa/-/issues/9712
+ *
+ * Currently this only supports sampling from a simple linear 2D image, but
+ * it could be extended to support more cases if necessary.
+ */
+static void
+handle_sample_from_linear_image(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_descriptor_set *set,
+ bool is_compute)
{
- V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+ for (int32_t i = 0; i < set->layout->binding_count; i++) {
+ const struct v3dv_descriptor_set_binding_layout *blayout =
+ &set->layout->binding[i];
+ if (blayout->type != VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE &&
+ blayout->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
+ continue;
- if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
- cmd_buffer->state.dynamic.stencil_write_mask.front = writeMask & 0xff;
- if (faceMask & VK_STENCIL_FACE_BACK_BIT)
- cmd_buffer->state.dynamic.stencil_write_mask.back = writeMask & 0xff;
+ struct v3dv_descriptor *desc = &set->descriptors[blayout->descriptor_index];
+ if (!desc->image_view)
+ continue;
- cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_WRITE_MASK;
-}
+ struct v3dv_image *image = (struct v3dv_image *) desc->image_view->vk.image;
+ struct v3dv_image_view *view = (struct v3dv_image_view *) desc->image_view;
+ if (image->tiled || view->vk.view_type == VK_IMAGE_VIEW_TYPE_1D ||
+ view->vk.view_type == VK_IMAGE_VIEW_TYPE_1D_ARRAY) {
+ continue;
+ }
-VKAPI_ATTR void VKAPI_CALL
-v3dv_CmdSetStencilReference(VkCommandBuffer commandBuffer,
- VkStencilFaceFlags faceMask,
- uint32_t reference)
-{
- V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+ /* FIXME: we can probably handle most of these restrictions too with
+ * a bit of extra effort.
+ */
+ if (view->vk.view_type != VK_IMAGE_VIEW_TYPE_2D ||
+ view->vk.level_count != 1 || view->vk.layer_count != 1 ||
+ blayout->array_size != 1) {
+ fprintf(stderr, "Sampling from linear image is not supported. "
+ "Expect corruption.\n");
+ continue;
+ }
- if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
- cmd_buffer->state.dynamic.stencil_reference.front = reference & 0xff;
- if (faceMask & VK_STENCIL_FACE_BACK_BIT)
- cmd_buffer->state.dynamic.stencil_reference.back = reference & 0xff;
+ /* We are sampling from a linear image. V3D doesn't support this
+ * so we create a tiled copy of the image and rewrite the descriptor
+ * to read from it instead.
+ */
+ perf_debug("Sampling from linear image is not supported natively and "
+ "requires a copy.\n");
- cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_REFERENCE;
-}
+ struct v3dv_device *device = cmd_buffer->device;
+ VkDevice vk_device = v3dv_device_to_handle(device);
-VKAPI_ATTR void VKAPI_CALL
-v3dv_CmdSetDepthBias(VkCommandBuffer commandBuffer,
- float depthBiasConstantFactor,
- float depthBiasClamp,
- float depthBiasSlopeFactor)
-{
- V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+ /* Allocate shadow tiled image if needed, we only do this once for
+ * each image, on the first sampling attempt. We need to take a lock
+ * since we may be trying to do the same in another command buffer in
+ * a separate thread.
+ */
+ mtx_lock(&device->meta.mtx);
+ VkResult result;
+ VkImage tiled_image;
+ if (image->shadow) {
+ tiled_image = v3dv_image_to_handle(image->shadow);
+ } else {
+ VkImageCreateInfo image_info = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
+ .flags = image->vk.create_flags,
+ .imageType = image->vk.image_type,
+ .format = image->vk.format,
+ .extent = {
+ image->vk.extent.width,
+ image->vk.extent.height,
+ image->vk.extent.depth,
+ },
+ .mipLevels = image->vk.mip_levels,
+ .arrayLayers = image->vk.array_layers,
+ .samples = image->vk.samples,
+ .tiling = VK_IMAGE_TILING_OPTIMAL,
+ .usage = image->vk.usage,
+ .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
+ .queueFamilyIndexCount = 0,
+ .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
+ };
+ result = v3dv_CreateImage(vk_device, &image_info,
+ &device->vk.alloc, &tiled_image);
+ if (result != VK_SUCCESS) {
+ fprintf(stderr, "Failed to copy linear 2D image for sampling."
+ "Expect corruption.\n");
+ mtx_unlock(&device->meta.mtx);
+ continue;
+ }
- cmd_buffer->state.dynamic.depth_bias.constant_factor = depthBiasConstantFactor;
- cmd_buffer->state.dynamic.depth_bias.depth_bias_clamp = depthBiasClamp;
- cmd_buffer->state.dynamic.depth_bias.slope_factor = depthBiasSlopeFactor;
- cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DEPTH_BIAS;
-}
+ bool disjoint = image->vk.create_flags & VK_IMAGE_CREATE_DISJOINT_BIT;
+ VkImageMemoryRequirementsInfo2 reqs_info = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2,
+ .image = tiled_image,
+ };
+
+ assert(image->plane_count <= V3DV_MAX_PLANE_COUNT);
+ for (int p = 0; p < (disjoint ? image->plane_count : 1); p++) {
+ VkImageAspectFlagBits plane_aspect = VK_IMAGE_ASPECT_PLANE_0_BIT << p;
+ VkImagePlaneMemoryRequirementsInfo plane_info = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_PLANE_MEMORY_REQUIREMENTS_INFO,
+ .planeAspect = plane_aspect,
+ };
+ if (disjoint)
+ reqs_info.pNext = &plane_info;
+
+ VkMemoryRequirements2 reqs = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2,
+ };
+ v3dv_GetImageMemoryRequirements2(vk_device, &reqs_info, &reqs);
+
+ VkDeviceMemory mem;
+ VkMemoryAllocateInfo alloc_info = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+ .allocationSize = reqs.memoryRequirements.size,
+ .memoryTypeIndex = 0,
+ };
+ result = v3dv_AllocateMemory(vk_device, &alloc_info,
+ &device->vk.alloc, &mem);
+ if (result != VK_SUCCESS) {
+ fprintf(stderr, "Failed to copy linear 2D image for sampling."
+ "Expect corruption.\n");
+ v3dv_DestroyImage(vk_device, tiled_image, &device->vk.alloc);
+ mtx_unlock(&device->meta.mtx);
+ continue;
+ }
+
+ VkBindImageMemoryInfo bind_info = {
+ .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO,
+ .image = tiled_image,
+ .memory = mem,
+ .memoryOffset = 0,
+ };
+ VkBindImagePlaneMemoryInfo plane_bind_info = {
+ .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO,
+ .planeAspect = plane_aspect,
+ };
+ if (disjoint)
+ bind_info.pNext = &plane_bind_info;
+ result = v3dv_BindImageMemory2(vk_device, 1, &bind_info);
+ if (result != VK_SUCCESS) {
+ fprintf(stderr, "Failed to copy linear 2D image for sampling."
+ "Expect corruption.\n");
+ v3dv_DestroyImage(vk_device, tiled_image, &device->vk.alloc);
+ v3dv_FreeMemory(vk_device, mem, &device->vk.alloc);
+ mtx_unlock(&device->meta.mtx);
+ continue;
+ }
+ }
-VKAPI_ATTR void VKAPI_CALL
-v3dv_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
- float minDepthBounds,
- float maxDepthBounds)
-{
- /* We do not support depth bounds testing so we just ingore this. We are
- * already asserting that pipelines don't enable the feature anyway.
- */
-}
+ image->shadow = v3dv_image_from_handle(tiled_image);
+ }
-VKAPI_ATTR void VKAPI_CALL
-v3dv_CmdSetLineWidth(VkCommandBuffer commandBuffer,
- float lineWidth)
-{
- V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+ /* Create a shadow view that refers to the tiled image if needed */
+ VkImageView tiled_view;
+ if (view->shadow) {
+ tiled_view = v3dv_image_view_to_handle(view->shadow);
+ } else {
+ VkImageViewCreateInfo view_info = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+ .flags = view->vk.create_flags,
+ .image = tiled_image,
+ .viewType = view->vk.view_type,
+ .format = view->vk.format,
+ .components = view->vk.swizzle,
+ .subresourceRange = {
+ .aspectMask = view->vk.aspects,
+ .baseMipLevel = view->vk.base_mip_level,
+ .levelCount = view->vk.level_count,
+ .baseArrayLayer = view->vk.base_array_layer,
+ .layerCount = view->vk.layer_count,
+ },
+ };
+ result = v3dv_create_image_view(device, &view_info, &tiled_view);
+ if (result != VK_SUCCESS) {
+ fprintf(stderr, "Failed to copy linear 2D image for sampling."
+ "Expect corruption.\n");
+ mtx_unlock(&device->meta.mtx);
+ continue;
+ }
+ }
+
+ view->shadow = v3dv_image_view_from_handle(tiled_view);
+
+ mtx_unlock(&device->meta.mtx);
+
+ /* Rewrite the descriptor to use the shadow view */
+ VkDescriptorImageInfo desc_image_info = {
+ .sampler = v3dv_sampler_to_handle(desc->sampler),
+ .imageView = tiled_view,
+ .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
+ };
+ VkWriteDescriptorSet write = {
+ .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+ .dstSet = v3dv_descriptor_set_to_handle(set),
+ .dstBinding = i,
+ .dstArrayElement = 0, /* Assumes array_size is 1 */
+ .descriptorCount = 1,
+ .descriptorType = desc->type,
+ .pImageInfo = &desc_image_info,
+ };
+ v3dv_UpdateDescriptorSets(vk_device, 1, &write, 0, NULL);
+
+ /* Now we need to actually copy the pixel data from the linear image
+ * into the tiled image storage to ensure it is up-to-date.
+ *
+ * FIXME: ideally we would track if the linear image is dirty and skip
+ * this step otherwise, but that would be a bit of a pain.
+ *
+ * Note that we need to place the copy job *before* the current job in
+ * the command buffer state so we have the tiled image ready to process
+ * an upcoming draw call in the current job that samples from it.
+ *
+ * Also, we need to use the TFU path for this copy, as any other path
+ * will use the tile buffer and would require a new framebuffer setup,
+ * thus requiring extra work to stop and resume any in-flight render
+ * pass. Since we are converting a full 2D texture here the TFU should
+ * be able to handle this.
+ */
+ for (int p = 0; p < image->plane_count; p++) {
+ VkImageAspectFlagBits plane_aspect = VK_IMAGE_ASPECT_PLANE_0_BIT << p;
+ struct VkImageCopy2 copy_region = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_COPY_2,
+ .srcSubresource = {
+ .aspectMask = image->plane_count == 1 ?
+ view->vk.aspects : (view->vk.aspects & plane_aspect),
+ .mipLevel = view->vk.base_mip_level,
+ .baseArrayLayer = view->vk.base_array_layer,
+ .layerCount = view->vk.layer_count,
+ },
+ .srcOffset = { 0, 0, 0 },
+ .dstSubresource = {
+ .aspectMask = image->plane_count == 1 ?
+ view->vk.aspects : (view->vk.aspects & plane_aspect),
+ .mipLevel = view->vk.base_mip_level,
+ .baseArrayLayer = view->vk.base_array_layer,
+ .layerCount = view->vk.layer_count,
+ },
+ .dstOffset = { 0, 0, 0 },
+ .extent = {
+ image->planes[p].width,
+ image->planes[p].height,
+ 1,
+ },
+ };
+ struct v3dv_image *copy_src = image;
+ struct v3dv_image *copy_dst = v3dv_image_from_handle(tiled_image);
+ bool ok = v3dv_cmd_buffer_copy_image_tfu(cmd_buffer, copy_dst, copy_src,
+ &copy_region);
+ if (ok) {
+ /* This will emit the TFU job right before the current in-flight
+ * job (if any), since in-flight jobs are only added to the list
+ * when finished.
+ */
+ struct v3dv_job *tfu_job =
+ list_last_entry(&cmd_buffer->jobs, struct v3dv_job, list_link);
+ assert(tfu_job->type == V3DV_JOB_TYPE_GPU_TFU);
+ /* Serialize the copy since we don't know who is producing the linear
+ * image and we need the image to be ready by the time the copy
+ * executes.
+ */
+ tfu_job->serialize = V3DV_BARRIER_ALL;
- cmd_buffer->state.dynamic.line_width = lineWidth;
- cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_LINE_WIDTH;
+ /* Also, we need to ensure the TFU copy job completes before anything
+ * else coming after that may be using the tiled shadow copy.
+ */
+ if (cmd_buffer->state.job) {
+ /* If we already had an in-flight job (i.e. we are in a render
+ * pass) make sure the job waits for the TFU copy.
+ */
+ cmd_buffer->state.job->serialize |= V3DV_BARRIER_TRANSFER_BIT;
+ } else {
+ /* Otherwise, make sure the follow-up job syncs with the TFU
+ * job we just added when it is created by adding the
+ * corresponding barrier state.
+ */
+ if (!is_compute) {
+ cmd_buffer->state.barrier.dst_mask |= V3DV_BARRIER_GRAPHICS_BIT;
+ cmd_buffer->state.barrier.src_mask_graphics |= V3DV_BARRIER_TRANSFER_BIT;
+ } else {
+ cmd_buffer->state.barrier.dst_mask |= V3DV_BARRIER_COMPUTE_BIT;
+ cmd_buffer->state.barrier.src_mask_compute |= V3DV_BARRIER_TRANSFER_BIT;
+ }
+ }
+ } else {
+ fprintf(stderr, "Failed to copy linear 2D image for sampling."
+ "TFU doesn't support copy. Expect corruption.\n");
+ }
+ }
+ }
}
VKAPI_ATTR void VKAPI_CALL
@@ -2917,6 +3834,15 @@ v3dv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
descriptor_state->descriptor_sets[index] = set;
dirty_stages |= set->layout->shader_stages;
descriptor_state_changed = true;
+
+ /* Check if we are sampling from a linear 2D image. This is not
+ * supported in hardware, but may be required for some applications
+ * so we will transparently convert to tiled at the expense of
+ * performance.
+ */
+ handle_sample_from_linear_image(cmd_buffer, set,
+ pipelineBindPoint ==
+ VK_PIPELINE_BIND_POINT_COMPUTE);
}
for (uint32_t j = 0; j < set->layout->dynamic_offset_count; j++, dyn_index++) {
@@ -2951,79 +3877,19 @@ v3dv_CmdPushConstants(VkCommandBuffer commandBuffer,
{
V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
- if (!memcmp((uint8_t *) cmd_buffer->push_constants_data + offset, pValues, size))
- return;
-
- memcpy((uint8_t *) cmd_buffer->push_constants_data + offset, pValues, size);
-
- cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PUSH_CONSTANTS;
- cmd_buffer->state.dirty_push_constants_stages |= stageFlags;
-}
-
-VKAPI_ATTR void VKAPI_CALL
-v3dv_CmdSetBlendConstants(VkCommandBuffer commandBuffer,
- const float blendConstants[4])
-{
- V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
- struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
-
- if (!memcmp(state->dynamic.blend_constants, blendConstants,
- sizeof(state->dynamic.blend_constants))) {
+ if (!memcmp((uint8_t *) cmd_buffer->state.push_constants_data + offset,
+ pValues, size)) {
return;
}
- memcpy(state->dynamic.blend_constants, blendConstants,
- sizeof(state->dynamic.blend_constants));
-
- cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_BLEND_CONSTANTS;
-}
+ memcpy((uint8_t *) cmd_buffer->state.push_constants_data + offset,
+ pValues, size);
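+ /* Track the highest offset written so only the used range of push
+ * constant data needs to be uploaded.
+ */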
+ cmd_buffer->state.push_constants_size =
+ MAX2(offset + size, cmd_buffer->state.push_constants_size);
-VKAPI_ATTR void VKAPI_CALL
-v3dv_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer,
- uint32_t attachmentCount,
- const VkBool32 *pColorWriteEnables)
-{
- V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
- struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
- uint32_t color_write_enable = 0;
-
- for (uint32_t i = 0; i < attachmentCount; i++)
- color_write_enable |= pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0;
-
- if (state->dynamic.color_write_enable == color_write_enable)
- return;
-
- state->dynamic.color_write_enable = color_write_enable;
-
- state->dirty |= V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE;
-}
-
-void
-v3dv_cmd_buffer_reset_queries(struct v3dv_cmd_buffer *cmd_buffer,
- struct v3dv_query_pool *pool,
- uint32_t first,
- uint32_t count)
-{
- /* Resets can only happen outside a render pass instance so we should not
- * be in the middle of job recording.
- */
- assert(cmd_buffer->state.pass == NULL);
- assert(cmd_buffer->state.job == NULL);
-
- assert(first < pool->query_count);
- assert(first + count <= pool->query_count);
-
- struct v3dv_job *job =
- v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
- V3DV_JOB_TYPE_CPU_RESET_QUERIES,
- cmd_buffer, -1);
- v3dv_return_if_oom(cmd_buffer, NULL);
-
- job->cpu.query_reset.pool = pool;
- job->cpu.query_reset.first = first;
- job->cpu.query_reset.count = count;
-
- list_addtail(&job->list_link, &cmd_buffer->jobs);
+ cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PUSH_CONSTANTS |
+ V3DV_CMD_DIRTY_PUSH_CONSTANTS_UBO;
+ cmd_buffer->state.dirty_push_constants_stages |= stageFlags;
}
void
@@ -3059,37 +3925,87 @@ v3dv_cmd_buffer_begin_query(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t query,
VkQueryControlFlags flags)
{
- /* FIXME: we only support one active query for now */
- assert(cmd_buffer->state.query.active_query.bo == NULL);
assert(query < pool->query_count);
+ switch (pool->query_type) {
+ case VK_QUERY_TYPE_OCCLUSION:
+ /* FIXME: we only support one active occlusion query for now */
+ assert(cmd_buffer->state.query.active_query.bo == NULL);
+
+ cmd_buffer->state.query.active_query.bo = pool->occlusion.bo;
+ cmd_buffer->state.query.active_query.offset =
+ pool->queries[query].occlusion.offset;
+ cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
+ break;
+ case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
+ assert(cmd_buffer->state.query.active_query.perf == NULL);
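+ /* Performance counters are attached to GPU jobs, so split the current
+ * job here to ensure the counters only cover work recorded while the
+ * query is active.
+ */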
+ if (cmd_buffer->state.pass)
+ v3dv_cmd_buffer_subpass_finish(cmd_buffer);
- cmd_buffer->state.query.active_query.bo = pool->queries[query].bo;
- cmd_buffer->state.query.active_query.offset = pool->queries[query].offset;
- cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
+ cmd_buffer->state.query.active_query.perf =
+ &pool->queries[query].perf;
+
+ if (cmd_buffer->state.pass) {
+ v3dv_cmd_buffer_subpass_resume(cmd_buffer,
+ cmd_buffer->state.subpass_idx);
+ }
+ break;
+ }
+ default:
+ unreachable("Unsupported query type");
+ }
+}
+
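+/* Temporarily detach the active occlusion query so draws recorded while it
+ * is paused do not update it; v3dv_cmd_buffer_resume_occlusion_query
+ * restores it afterwards.
+ */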
+void
+v3dv_cmd_buffer_pause_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer)
+{
+ struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
+ struct v3dv_bo *occlusion_query_bo = state->query.active_query.bo;
+ if (occlusion_query_bo) {
+ assert(!state->query.active_query.paused_bo);
+ state->query.active_query.paused_bo = occlusion_query_bo;
+ state->query.active_query.bo = NULL;
+ state->dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
+ }
}
void
-v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer,
- struct v3dv_query_pool *pool,
- uint32_t query)
+v3dv_cmd_buffer_resume_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer)
+{
+ struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
+ struct v3dv_bo *occlusion_query_bo = state->query.active_query.paused_bo;
+ if (occlusion_query_bo) {
+ assert(!state->query.active_query.bo);
+ state->query.active_query.bo = occlusion_query_bo;
+ state->query.active_query.paused_bo = NULL;
+ state->dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
+ }
+}
+
+static void
+v3dv_cmd_buffer_schedule_end_query(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_query_pool *pool,
+ uint32_t query)
{
assert(query < pool->query_count);
- assert(cmd_buffer->state.query.active_query.bo != NULL);
+ assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION ||
+ pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
- if (cmd_buffer->state.pass) {
- /* Queue the EndQuery in the command buffer state, we will create a CPU
- * job to flag all of these queries as possibly available right after the
- * render pass job in which they have been recorded.
- */
+ /* For occlusion queries in the middle of a render pass we don't want to
+ * split the current job at the EndQuery just to emit query availability;
+ * instead we queue this state in the command buffer and we emit it when
+ * we finish the current job.
+ */
+ if (cmd_buffer->state.pass &&
+ pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
v3dv_cmd_buffer_ensure_array_state(cmd_buffer,
- sizeof(struct v3dv_end_query_cpu_job_info),
+ sizeof(struct v3dv_end_query_info),
state->query.end.used_count,
&state->query.end.alloc_count,
(void **) &state->query.end.states);
v3dv_return_if_oom(cmd_buffer, NULL);
- struct v3dv_end_query_cpu_job_info *info =
+ struct v3dv_end_query_info *info =
&state->query.end.states[state->query.end.used_count++];
info->pool = pool;
@@ -3106,7 +4022,7 @@ v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer,
*
* In our case, only the first query is used but this means we still need
* to flag the other queries as available so we don't emit errors when
- * the applications attempt to retrive values from them.
+ * the applications attempt to retrieve values from them.
*/
struct v3dv_render_pass *pass = cmd_buffer->state.pass;
if (!pass->multiview_enabled) {
@@ -3116,60 +4032,65 @@ v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer,
info->count = util_bitcount(subpass->view_mask);
}
} else {
- /* Otherwise, schedule the CPU job immediately */
- struct v3dv_job *job =
- v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
- V3DV_JOB_TYPE_CPU_END_QUERY,
- cmd_buffer, -1);
- v3dv_return_if_oom(cmd_buffer, NULL);
-
- job->cpu.query_end.pool = pool;
- job->cpu.query_end.query = query;
+ /* Otherwise, schedule the end query job immediately.
+ *
+ * Multiview queries cannot cross subpass boundaries, so query count is
+ * always 1.
+ */
+ if (pool->query_type == VK_QUERY_TYPE_OCCLUSION)
+ v3dv_cmd_buffer_emit_set_query_availability(cmd_buffer, pool, query, 1, 1);
+ else
+ cmd_buffer_emit_end_query_cpu(cmd_buffer, pool, query, 1);
+ }
+}
- /* Multiview queries cannot cross subpass boundaries */
- job->cpu.query_end.count = 1;
+static void
+v3dv_cmd_buffer_end_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_query_pool *pool,
+ uint32_t query)
+{
+ assert(query < pool->query_count);
+ assert(cmd_buffer->state.query.active_query.bo != NULL);
- list_addtail(&job->list_link, &cmd_buffer->jobs);
- }
+ v3dv_cmd_buffer_schedule_end_query(cmd_buffer, pool, query);
cmd_buffer->state.query.active_query.bo = NULL;
cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
}
-void
-v3dv_cmd_buffer_copy_query_results(struct v3dv_cmd_buffer *cmd_buffer,
- struct v3dv_query_pool *pool,
- uint32_t first,
- uint32_t count,
- struct v3dv_buffer *dst,
- uint32_t offset,
- uint32_t stride,
- VkQueryResultFlags flags)
+static void
+v3dv_cmd_buffer_end_performance_query(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_query_pool *pool,
+ uint32_t query)
{
- /* Copies can only happen outside a render pass instance so we should not
- * be in the middle of job recording.
- */
- assert(cmd_buffer->state.pass == NULL);
- assert(cmd_buffer->state.job == NULL);
+ assert(query < pool->query_count);
+ assert(cmd_buffer->state.query.active_query.perf != NULL);
- assert(first < pool->query_count);
- assert(first + count <= pool->query_count);
+ if (cmd_buffer->state.pass)
+ v3dv_cmd_buffer_subpass_finish(cmd_buffer);
- struct v3dv_job *job =
- v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
- V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS,
- cmd_buffer, -1);
- v3dv_return_if_oom(cmd_buffer, NULL);
+ v3dv_cmd_buffer_schedule_end_query(cmd_buffer, pool, query);
- job->cpu.query_copy_results.pool = pool;
- job->cpu.query_copy_results.first = first;
- job->cpu.query_copy_results.count = count;
- job->cpu.query_copy_results.dst = dst;
- job->cpu.query_copy_results.offset = offset;
- job->cpu.query_copy_results.stride = stride;
- job->cpu.query_copy_results.flags = flags;
+ cmd_buffer->state.query.active_query.perf = NULL;
- list_addtail(&job->list_link, &cmd_buffer->jobs);
+ if (cmd_buffer->state.pass)
+ v3dv_cmd_buffer_subpass_resume(cmd_buffer, cmd_buffer->state.subpass_idx);
+}
+
+void
+v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_query_pool *pool,
+ uint32_t query)
+{
+ switch (pool->query_type) {
+ case VK_QUERY_TYPE_OCCLUSION:
+ v3dv_cmd_buffer_end_occlusion_query(cmd_buffer, pool, query);
+ break;
+ case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
+ v3dv_cmd_buffer_end_performance_query(cmd_buffer, pool, query);
+ break;
+ default:
+ unreachable("Unsupported query type");
+ }
}
void
@@ -3191,115 +4112,10 @@ v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer,
}
VKAPI_ATTR void VKAPI_CALL
-v3dv_CmdSetEvent(VkCommandBuffer commandBuffer,
- VkEvent _event,
- VkPipelineStageFlags stageMask)
-{
- V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
- V3DV_FROM_HANDLE(v3dv_event, event, _event);
-
- /* Event (re)sets can only happen outside a render pass instance so we
- * should not be in the middle of job recording.
- */
- assert(cmd_buffer->state.pass == NULL);
- assert(cmd_buffer->state.job == NULL);
-
- struct v3dv_job *job =
- v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
- V3DV_JOB_TYPE_CPU_SET_EVENT,
- cmd_buffer, -1);
- v3dv_return_if_oom(cmd_buffer, NULL);
-
- job->cpu.event_set.event = event;
- job->cpu.event_set.state = 1;
-
- list_addtail(&job->list_link, &cmd_buffer->jobs);
-}
-
-VKAPI_ATTR void VKAPI_CALL
-v3dv_CmdResetEvent(VkCommandBuffer commandBuffer,
- VkEvent _event,
- VkPipelineStageFlags stageMask)
-{
- V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
- V3DV_FROM_HANDLE(v3dv_event, event, _event);
-
- /* Event (re)sets can only happen outside a render pass instance so we
- * should not be in the middle of job recording.
- */
- assert(cmd_buffer->state.pass == NULL);
- assert(cmd_buffer->state.job == NULL);
-
- struct v3dv_job *job =
- v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
- V3DV_JOB_TYPE_CPU_SET_EVENT,
- cmd_buffer, -1);
- v3dv_return_if_oom(cmd_buffer, NULL);
-
- job->cpu.event_set.event = event;
- job->cpu.event_set.state = 0;
-
- list_addtail(&job->list_link, &cmd_buffer->jobs);
-}
-
-VKAPI_ATTR void VKAPI_CALL
-v3dv_CmdWaitEvents(VkCommandBuffer commandBuffer,
- uint32_t eventCount,
- const VkEvent *pEvents,
- VkPipelineStageFlags srcStageMask,
- VkPipelineStageFlags dstStageMask,
- uint32_t memoryBarrierCount,
- const VkMemoryBarrier *pMemoryBarriers,
- uint32_t bufferMemoryBarrierCount,
- const VkBufferMemoryBarrier *pBufferMemoryBarriers,
- uint32_t imageMemoryBarrierCount,
- const VkImageMemoryBarrier *pImageMemoryBarriers)
-{
- V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
-
- assert(eventCount > 0);
-
- struct v3dv_job *job =
- v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
- V3DV_JOB_TYPE_CPU_WAIT_EVENTS,
- cmd_buffer, -1);
- v3dv_return_if_oom(cmd_buffer, NULL);
-
- const uint32_t event_list_size = sizeof(struct v3dv_event *) * eventCount;
-
- job->cpu.event_wait.events =
- vk_alloc(&cmd_buffer->device->vk.alloc, event_list_size, 8,
- VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
- if (!job->cpu.event_wait.events) {
- v3dv_flag_oom(cmd_buffer, NULL);
- return;
- }
- job->cpu.event_wait.event_count = eventCount;
-
- for (uint32_t i = 0; i < eventCount; i++)
- job->cpu.event_wait.events[i] = v3dv_event_from_handle(pEvents[i]);
-
- /* vkCmdWaitEvents can be recorded inside a render pass, so we might have
- * an active job.
- *
- * If we are inside a render pass, because we vkCmd(Re)SetEvent can't happen
- * inside a render pass, it is safe to move the wait job so it happens right
- * before the current job we are currently recording for the subpass, if any
- * (it would actually be safe to move it all the way back to right before
- * the start of the render pass).
- *
- * If we are outside a render pass then we should not have any on-going job
- * and we are free to just add the wait job without restrictions.
- */
- assert(cmd_buffer->state.pass || !cmd_buffer->state.job);
- list_addtail(&job->list_link, &cmd_buffer->jobs);
-}
-
-VKAPI_ATTR void VKAPI_CALL
-v3dv_CmdWriteTimestamp(VkCommandBuffer commandBuffer,
- VkPipelineStageFlagBits pipelineStage,
- VkQueryPool queryPool,
- uint32_t query)
+v3dv_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
+ VkPipelineStageFlags2 stage,
+ VkQueryPool queryPool,
+ uint32_t query)
{
V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
V3DV_FROM_HANDLE(v3dv_query_pool, query_pool, queryPool);
@@ -3349,24 +4165,9 @@ cmd_buffer_emit_pre_dispatch(struct v3dv_cmd_buffer *cmd_buffer)
cmd_buffer->state.dirty_push_constants_stages &= ~VK_SHADER_STAGE_COMPUTE_BIT;
}
-#define V3D_CSD_CFG012_WG_COUNT_SHIFT 16
-#define V3D_CSD_CFG012_WG_OFFSET_SHIFT 0
-/* Allow this dispatch to start while the last one is still running. */
-#define V3D_CSD_CFG3_OVERLAP_WITH_PREV (1 << 26)
-/* Maximum supergroup ID. 6 bits. */
-#define V3D_CSD_CFG3_MAX_SG_ID_SHIFT 20
-/* Batches per supergroup minus 1. 8 bits. */
-#define V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT 12
-/* Workgroups per supergroup, 0 means 16 */
-#define V3D_CSD_CFG3_WGS_PER_SG_SHIFT 8
-#define V3D_CSD_CFG3_WG_SIZE_SHIFT 0
-
-#define V3D_CSD_CFG5_PROPAGATE_NANS (1 << 2)
-#define V3D_CSD_CFG5_SINGLE_SEG (1 << 1)
-#define V3D_CSD_CFG5_THREADING (1 << 0)
-
void
v3dv_cmd_buffer_rewrite_indirect_csd_job(
+ struct v3dv_device *device,
struct v3dv_csd_indirect_cpu_job_info *info,
const uint32_t *wg_counts)
{
@@ -3386,15 +4187,22 @@ v3dv_cmd_buffer_rewrite_indirect_csd_job(
submit->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
submit->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
- submit->cfg[4] = DIV_ROUND_UP(info->wg_size, 16) *
- (wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1;
+ uint32_t num_batches = DIV_ROUND_UP(info->wg_size, 16) *
+ (wg_counts[0] * wg_counts[1] * wg_counts[2]);
+ /* V3D 7.1.6 and later don't subtract 1 from the number of batches */
+ if (device->devinfo.ver < 71 ||
+ (device->devinfo.ver == 71 && device->devinfo.rev < 6)) {
+ submit->cfg[4] = num_batches - 1;
+ } else {
+ submit->cfg[4] = num_batches;
+ }
assert(submit->cfg[4] != ~0);
if (info->needs_wg_uniform_rewrite) {
/* Make sure the GPU is not currently accessing the indirect CL for this
* job, since we are about to overwrite some of the uniform data.
*/
- v3dv_bo_wait(job->device, job->indirect.bo, PIPE_TIMEOUT_INFINITE);
+ v3dv_bo_wait(job->device, job->indirect.bo, OS_TIMEOUT_INFINITE);
for (uint32_t i = 0; i < 3; i++) {
if (info->wg_uniform_offsets[i]) {
@@ -3420,6 +4228,7 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t **wg_uniform_offsets_out,
uint32_t *wg_size_out)
{
+ struct v3dv_device *device = cmd_buffer->device;
struct v3dv_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
assert(pipeline && pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]);
struct v3dv_shader_variant *cs_variant =
@@ -3478,23 +4287,31 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
if (wg_size_out)
*wg_size_out = wg_size;
- submit->cfg[4] = num_batches - 1;
+ /* V3D 7.1.6 and later don't subtract 1 from the number of batches */
+ if (device->devinfo.ver < 71 ||
+ (device->devinfo.ver == 71 && device->devinfo.rev < 6)) {
+ submit->cfg[4] = num_batches - 1;
+ } else {
+ submit->cfg[4] = num_batches;
+ }
assert(submit->cfg[4] != ~0);
assert(pipeline->shared_data->assembly_bo);
struct v3dv_bo *cs_assembly_bo = pipeline->shared_data->assembly_bo;
submit->cfg[5] = cs_assembly_bo->offset + cs_variant->assembly_offset;
- submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;
if (cs_variant->prog_data.base->single_seg)
submit->cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG;
if (cs_variant->prog_data.base->threads == 4)
submit->cfg[5] |= V3D_CSD_CFG5_THREADING;
+ /* V3D 7.x has made the PROPAGATE_NANS bit in CFG5 reserved */
+ if (device->devinfo.ver < 71)
+ submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;
if (cs_variant->prog_data.cs->shared_size > 0) {
job->csd.shared_memory =
v3dv_bo_alloc(cmd_buffer->device,
- cs_variant->prog_data.cs->shared_size * wgs_per_sg,
+ cs_variant->prog_data.cs->shared_size * num_wgs,
"shared_vars", true);
if (!job->csd.shared_memory) {
v3dv_flag_oom(cmd_buffer, NULL);
@@ -3509,6 +4326,10 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
wg_uniform_offsets_out);
submit->cfg[6] = uniforms.bo->offset + uniforms.offset;
+
+ /* Track VK_KHR_buffer_device_address usage in the job */
+ job->uses_buffer_device_address |= pipeline->uses_buffer_device_address;
+
v3dv_job_add_bo(job, uniforms.bo);
return job;
@@ -3541,19 +4362,6 @@ cmd_buffer_dispatch(struct v3dv_cmd_buffer *cmd_buffer,
}
VKAPI_ATTR void VKAPI_CALL
-v3dv_CmdDispatch(VkCommandBuffer commandBuffer,
- uint32_t groupCountX,
- uint32_t groupCountY,
- uint32_t groupCountZ)
-{
- V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer_emit_pre_dispatch(cmd_buffer);
- cmd_buffer_dispatch(cmd_buffer, 0, 0, 0,
- groupCountX, groupCountY, groupCountZ);
-}
-
-VKAPI_ATTR void VKAPI_CALL
v3dv_CmdDispatchBase(VkCommandBuffer commandBuffer,
uint32_t baseGroupX,
uint32_t baseGroupY,
@@ -3615,6 +4423,16 @@ cmd_buffer_dispatch_indirect(struct v3dv_cmd_buffer *cmd_buffer,
job->cpu.csd_indirect.wg_uniform_offsets[2];
list_addtail(&job->list_link, &cmd_buffer->jobs);
+
+ /* If we have a CPU queue we submit the CPU job directly to the
+ * queue and the CSD job will be dispatched from within the kernel
+ * queue; otherwise we have to dispatch the CSD job manually
+ * right after the CPU job by adding it to the list of jobs in the
+ * command buffer.
+ */
+ if (!cmd_buffer->device->pdevice->caps.cpu_queue)
+ list_addtail(&csd_job->list_link, &cmd_buffer->jobs);
+
cmd_buffer->state.job = NULL;
}
@@ -3633,8 +4451,144 @@ v3dv_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
}
VKAPI_ATTR void VKAPI_CALL
-v3dv_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask)
+v3dv_CmdBeginRenderingKHR(VkCommandBuffer commandBuffer,
+ const VkRenderingInfoKHR *info)
{
- /* Nothing to do here since we only support a single device */
- assert(deviceMask == 0x1);
+ V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+
+ cmd_buffer->state.suspending = info->flags & VK_RENDERING_SUSPENDING_BIT;
+ cmd_buffer->state.resuming = info->flags & VK_RENDERING_RESUMING_BIT;
+
+ /* FIXME: for resuming passes we might not need all this setup below since
+ * we are only mostly recording draw calls like in secondaries.
+ */
+
+ v3dv_setup_dynamic_render_pass(cmd_buffer, info);
+ v3dv_return_if_oom(cmd_buffer, NULL);
+
+ v3dv_setup_dynamic_framebuffer(cmd_buffer, info);
+ v3dv_return_if_oom(cmd_buffer, NULL);
+
+ struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
+ state->pass = &state->dynamic_pass;
+ state->framebuffer = state->dynamic_framebuffer;
+
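+ /* Dynamic rendering reuses the render pass infrastructure: build a
+ * temporary VkRenderPassBeginInfo from the VkRenderingInfo so the
+ * regular attachment state setup below can be shared.
+ */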
+ VkRenderPassBeginInfo begin_info = {
+ .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
+ .pNext = NULL,
+ .renderPass = v3dv_render_pass_to_handle(state->pass),
+ .framebuffer = v3dv_framebuffer_to_handle(state->framebuffer),
+ .renderArea = info->renderArea,
+ };
+
+ VkClearValue *clear_values = NULL;
+ if (state->pass->attachment_count > 0) {
+ clear_values =
+ vk_alloc(&cmd_buffer->device->vk.alloc,
+ state->pass->attachment_count * sizeof(VkClearValue), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
+ if (!clear_values) {
+ v3dv_flag_oom(cmd_buffer, NULL);
+ return;
+ }
+ }
+
+ for (int i = 0; i < info->colorAttachmentCount; i++) {
+ if (!info->pColorAttachments[i].imageView)
+ continue;
+
+ uint32_t a = cmd_buffer->state.dynamic_subpass.color_attachments[i].attachment;
+ assert(a < state->pass->attachment_count);
+ clear_values[a] = info->pColorAttachments[i].clearValue;
+ }
+
+ if (info->pDepthAttachment &&
+ info->pDepthAttachment->imageView != VK_NULL_HANDLE) {
+ uint32_t a = cmd_buffer->state.dynamic_subpass.ds_attachment.attachment;
+ assert(a < state->pass->attachment_count);
+ clear_values[a].depthStencil.depth =
+ info->pDepthAttachment->clearValue.depthStencil.depth;
+ }
+
+ if (info->pStencilAttachment &&
+ info->pStencilAttachment->imageView != VK_NULL_HANDLE) {
+ uint32_t a = cmd_buffer->state.dynamic_subpass.ds_attachment.attachment;
+ assert(a < state->pass->attachment_count);
+ clear_values[a].depthStencil.stencil =
+ info->pStencilAttachment->clearValue.depthStencil.stencil;
+ }
+
+ begin_info.clearValueCount = state->pass->attachment_count;
+ begin_info.pClearValues = clear_values;
+
+ cmd_buffer_ensure_render_pass_attachment_state(cmd_buffer);
+ v3dv_return_if_oom(cmd_buffer, NULL);
+ cmd_buffer_init_render_pass_attachment_state(cmd_buffer, &begin_info);
+
+ if (clear_values)
+ vk_free(&cmd_buffer->vk.pool->alloc, clear_values);
+
+ state->render_area = info->renderArea;
+ constraint_clip_window_to_render_area(cmd_buffer);
+ v3dv_cmd_buffer_subpass_start(cmd_buffer, 0);
+}
+
+VKAPI_ATTR void VKAPI_CALL
+v3dv_CmdEndRenderingKHR(VkCommandBuffer commandBuffer)
+{
+ V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+
+ v3dv_return_if_oom(cmd_buffer, NULL);
+
+ struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
+ assert(state->subpass_idx == state->pass->subpass_count - 1);
+
+ /* If we have any pending jobs that were waiting for the current job
+ * to finish and we are suspending the pass here, we need to finish the
+ * job completely and ensure we emit the pending jobs immediately.
+ *
+ * FIXME: this is not optimal but since the resuming command buffer won't
+ * have the pending state we can't do it after the resuming chain completes
+ * without some extra work: we would have to generate the pending jobs
+ * now but not add them to this command buffer's job list; instead, they
+ * should be added to a separate list of "pending jobs" and at submit time
+ * we would accumulate these jobs during the suspend/resume chain and emit
+ * them all after the last job in the chain.
+ */
+ if (state->suspending && cmd_buffer_has_pending_jobs(cmd_buffer))
+ v3dv_cmd_buffer_finish_job(cmd_buffer);
+
+ /* If we don't have a job and we are suspending we will need to create one
+ * so we can link to a follow-up resume job. Because would be starting a new
+ * job, we should ensure the command buffer state is not flagged as resuming
+ * from a previous suspend. The new job will consume any pending barrier
+ * state if necessary.
+ */
+ struct v3dv_job *job = cmd_buffer->state.job;
+ if (!job && state->suspending) {
+ state->resuming = false;
+ job = v3dv_cmd_buffer_subpass_resume(cmd_buffer, state->subpass_idx);
+ if (!job)
+ return;
+ }
+
+ /* If this job is suspending it means it will continue execution in another
+ * job (with the same RCL spec). We implement this by branching the BCL and
+ * we will patch the branch address when we know the resuming job.
+ */
+ if (state->suspending)
+ v3dv_X(cmd_buffer->device, cmd_buffer_suspend)(cmd_buffer);
+
+ v3dv_cmd_buffer_subpass_finish(cmd_buffer);
+ v3dv_cmd_buffer_finish_job(cmd_buffer);
+
+ /* This must be done after the resume/suspend chain has completed. */
+ if (!state->suspending)
+ cmd_buffer_subpass_handle_pending_resolves(cmd_buffer);
+
+ state->framebuffer = NULL;
+ state->pass = NULL;
+ state->subpass_idx = -1;
+ state->suspending = false;
+ state->resuming = false;
}
diff --git a/src/broadcom/vulkan/v3dv_debug.c b/src/broadcom/vulkan/v3dv_debug.c
index 055300d05c9..065e8f66026 100644
--- a/src/broadcom/vulkan/v3dv_debug.c
+++ b/src/broadcom/vulkan/v3dv_debug.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2019 Raspberry Pi
+ * Copyright © 2019 Raspberry Pi Ltd
*
* based in part on radv_debug.h which is:
* Copyright © 2017 Google.
diff --git a/src/broadcom/vulkan/v3dv_debug.h b/src/broadcom/vulkan/v3dv_debug.h
index 75f253700ed..bab21eef2b8 100644
--- a/src/broadcom/vulkan/v3dv_debug.h
+++ b/src/broadcom/vulkan/v3dv_debug.h
@@ -1,5 +1,5 @@
/*
- * Copyright © 2019 Raspberry Pi
+ * Copyright © 2019 Raspberry Pi Ltd
*
* based in part on radv_debug.h which is:
* Copyright © 2017 Google.
diff --git a/src/broadcom/vulkan/v3dv_descriptor_set.c b/src/broadcom/vulkan/v3dv_descriptor_set.c
index fd9ec935611..1d777ba08d4 100644
--- a/src/broadcom/vulkan/v3dv_descriptor_set.c
+++ b/src/broadcom/vulkan/v3dv_descriptor_set.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2019 Raspberry Pi
+ * Copyright © 2019 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -28,19 +28,26 @@
/*
* For a given descriptor defined by the descriptor_set it belongs, its
- * binding layout, and array_index, it returns the map region assigned to it
- * from the descriptor pool bo.
+ * binding layout, array_index, and plane, it returns the map region assigned
+ * to it from the descriptor pool bo.
*/
-static void*
+static void *
descriptor_bo_map(struct v3dv_device *device,
struct v3dv_descriptor_set *set,
const struct v3dv_descriptor_set_binding_layout *binding_layout,
uint32_t array_index)
{
- assert(v3dv_X(device, descriptor_bo_size)(binding_layout->type) > 0);
+ /* Inline uniform blocks use BO memory to store UBO contents, not
+ * descriptor data, so their descriptor BO size is 0 even though they
+ * do use BO memory.
+ */
+ uint32_t bo_size = v3dv_X(device, descriptor_bo_size)(binding_layout->type);
+ assert(bo_size > 0 ||
+ binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK);
+
return set->pool->bo->map +
set->base_offset + binding_layout->descriptor_offset +
- array_index * v3dv_X(device, descriptor_bo_size)(binding_layout->type);
+ array_index * binding_layout->plane_stride * bo_size;
}
static bool
@@ -102,7 +109,7 @@ v3dv_descriptor_map_get_descriptor(struct v3dv_descriptor_state *descriptor_stat
* It also returns the descriptor type, so the caller could do extra
* validation or adding extra offsets if the bo contains more that one field.
*/
-static struct v3dv_cl_reloc
+struct v3dv_cl_reloc
v3dv_descriptor_map_get_descriptor_bo(struct v3dv_device *device,
struct v3dv_descriptor_state *descriptor_state,
struct v3dv_descriptor_map *map,
@@ -125,8 +132,13 @@ v3dv_descriptor_map_get_descriptor_bo(struct v3dv_device *device,
const struct v3dv_descriptor_set_binding_layout *binding_layout =
&set->layout->binding[binding_number];
- assert(v3dv_X(device, descriptor_bo_size)(binding_layout->type) > 0);
- *out_type = binding_layout->type;
+
+ uint32_t bo_size = v3dv_X(device, descriptor_bo_size)(binding_layout->type);
+
+ assert(binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK ||
+ bo_size > 0);
+ if (out_type)
+ *out_type = binding_layout->type;
uint32_t array_index = map->array_index[index];
assert(array_index < binding_layout->array_size);
@@ -134,7 +146,7 @@ v3dv_descriptor_map_get_descriptor_bo(struct v3dv_device *device,
struct v3dv_cl_reloc reloc = {
.bo = set->pool->bo,
.offset = set->base_offset + binding_layout->descriptor_offset +
- array_index * v3dv_X(device, descriptor_bo_size)(binding_layout->type),
+ array_index * binding_layout->plane_stride * bo_size,
};
return reloc;
@@ -213,40 +225,11 @@ v3dv_descriptor_map_get_sampler_state(struct v3dv_device *device,
type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER);
if (type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
- reloc.offset += v3dv_X(device, combined_image_sampler_sampler_state_offset)();
+ reloc.offset += v3dv_X(device, combined_image_sampler_sampler_state_offset)(map->plane[index]);
return reloc;
}
-const struct v3dv_format*
-v3dv_descriptor_map_get_texture_format(struct v3dv_descriptor_state *descriptor_state,
- struct v3dv_descriptor_map *map,
- struct v3dv_pipeline_layout *pipeline_layout,
- uint32_t index,
- VkFormat *out_vk_format)
-{
- struct v3dv_descriptor *descriptor =
- v3dv_descriptor_map_get_descriptor(descriptor_state, map,
- pipeline_layout, index, NULL);
-
- switch (descriptor->type) {
- case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
- case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
- assert(descriptor->buffer_view);
- *out_vk_format = descriptor->buffer_view->vk_format;
- return descriptor->buffer_view->format;
- case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
- case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
- case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
- case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
- assert(descriptor->image_view);
- *out_vk_format = descriptor->image_view->vk.format;
- return descriptor->image_view->format;
- default:
- unreachable("descriptor type doesn't has a texture format");
- }
-}
-
struct v3dv_bo*
v3dv_descriptor_map_get_texture_bo(struct v3dv_descriptor_state *descriptor_state,
struct v3dv_descriptor_map *map,
@@ -270,7 +253,8 @@ v3dv_descriptor_map_get_texture_bo(struct v3dv_descriptor_state *descriptor_stat
assert(descriptor->image_view);
struct v3dv_image *image =
(struct v3dv_image *) descriptor->image_view->vk.image;
- return image->mem->bo;
+ assert(map->plane[index] < image->plane_count);
+ return image->planes[map->plane[index]].mem->bo;
}
default:
unreachable("descriptor type doesn't has a texture bo");
@@ -299,11 +283,66 @@ v3dv_descriptor_map_get_texture_shader_state(struct v3dv_device *device,
type == VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER);
if (type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
- reloc.offset += v3dv_X(device, combined_image_sampler_texture_state_offset)();
+ reloc.offset += v3dv_X(device, combined_image_sampler_texture_state_offset)(map->plane[index]);
return reloc;
}
+#define SHA1_UPDATE_VALUE(ctx, x) _mesa_sha1_update(ctx, &(x), sizeof(x));
+
+static void
+sha1_update_ycbcr_conversion(struct mesa_sha1 *ctx,
+ const struct vk_ycbcr_conversion_state *conversion)
+{
+ SHA1_UPDATE_VALUE(ctx, conversion->format);
+ SHA1_UPDATE_VALUE(ctx, conversion->ycbcr_model);
+ SHA1_UPDATE_VALUE(ctx, conversion->ycbcr_range);
+ SHA1_UPDATE_VALUE(ctx, conversion->mapping);
+ SHA1_UPDATE_VALUE(ctx, conversion->chroma_offsets);
+ SHA1_UPDATE_VALUE(ctx, conversion->chroma_reconstruction);
+}
+
+static void
+sha1_update_descriptor_set_binding_layout(struct mesa_sha1 *ctx,
+ const struct v3dv_descriptor_set_binding_layout *layout,
+ const struct v3dv_descriptor_set_layout *set_layout)
+{
+ SHA1_UPDATE_VALUE(ctx, layout->type);
+ SHA1_UPDATE_VALUE(ctx, layout->array_size);
+ SHA1_UPDATE_VALUE(ctx, layout->descriptor_index);
+ SHA1_UPDATE_VALUE(ctx, layout->dynamic_offset_count);
+ SHA1_UPDATE_VALUE(ctx, layout->dynamic_offset_index);
+ SHA1_UPDATE_VALUE(ctx, layout->descriptor_offset);
+ SHA1_UPDATE_VALUE(ctx, layout->immutable_samplers_offset);
+ SHA1_UPDATE_VALUE(ctx, layout->plane_stride);
+
+ if (layout->immutable_samplers_offset) {
+ const struct v3dv_sampler *immutable_samplers =
+ v3dv_immutable_samplers(set_layout, layout);
+
+ for (unsigned i = 0; i < layout->array_size; i++) {
+ const struct v3dv_sampler *sampler = &immutable_samplers[i];
+ if (sampler->conversion)
+ sha1_update_ycbcr_conversion(ctx, &sampler->conversion->state);
+ }
+ }
+}
+
+static void
+sha1_update_descriptor_set_layout(struct mesa_sha1 *ctx,
+ const struct v3dv_descriptor_set_layout *layout)
+{
+ SHA1_UPDATE_VALUE(ctx, layout->flags);
+ SHA1_UPDATE_VALUE(ctx, layout->binding_count);
+ SHA1_UPDATE_VALUE(ctx, layout->shader_stages);
+ SHA1_UPDATE_VALUE(ctx, layout->descriptor_count);
+ SHA1_UPDATE_VALUE(ctx, layout->dynamic_offset_count);
+
+ for (uint16_t i = 0; i < layout->binding_count; i++)
+ sha1_update_descriptor_set_binding_layout(ctx, &layout->binding[i], layout);
+}
+
+
/*
 * As anv and tu already point out:
*
@@ -326,16 +365,17 @@ v3dv_CreatePipelineLayout(VkDevice _device,
layout = vk_object_zalloc(&device->vk, pAllocator, sizeof(*layout),
VK_OBJECT_TYPE_PIPELINE_LAYOUT);
if (layout == NULL)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
layout->num_sets = pCreateInfo->setLayoutCount;
+ layout->ref_cnt = 1;
uint32_t dynamic_offset_count = 0;
for (uint32_t set = 0; set < pCreateInfo->setLayoutCount; set++) {
V3DV_FROM_HANDLE(v3dv_descriptor_set_layout, set_layout,
pCreateInfo->pSetLayouts[set]);
+ v3dv_descriptor_set_layout_ref(set_layout);
layout->set[set].layout = set_layout;
-
layout->set[set].dynamic_offset_start = dynamic_offset_count;
for (uint32_t b = 0; b < set_layout->binding_count; b++) {
dynamic_offset_count += set_layout->binding[b].array_size *
@@ -356,11 +396,34 @@ v3dv_CreatePipelineLayout(VkDevice _device,
layout->dynamic_offset_count = dynamic_offset_count;
+ struct mesa_sha1 ctx;
+ _mesa_sha1_init(&ctx);
+ for (unsigned s = 0; s < layout->num_sets; s++) {
+ sha1_update_descriptor_set_layout(&ctx, layout->set[s].layout);
+ _mesa_sha1_update(&ctx, &layout->set[s].dynamic_offset_start,
+ sizeof(layout->set[s].dynamic_offset_start));
+ }
+ _mesa_sha1_update(&ctx, &layout->num_sets, sizeof(layout->num_sets));
+ _mesa_sha1_final(&ctx, layout->sha1);
+
*pPipelineLayout = v3dv_pipeline_layout_to_handle(layout);
return VK_SUCCESS;
}
+void
+v3dv_pipeline_layout_destroy(struct v3dv_device *device,
+ struct v3dv_pipeline_layout *layout,
+ const VkAllocationCallbacks *alloc)
+{
+ assert(layout);
+
+ for (uint32_t i = 0; i < layout->num_sets; i++)
+ v3dv_descriptor_set_layout_unref(device, layout->set[i].layout);
+
+ vk_object_free(&device->vk, alloc, layout);
+}
+
VKAPI_ATTR void VKAPI_CALL
v3dv_DestroyPipelineLayout(VkDevice _device,
VkPipelineLayout _pipelineLayout,
@@ -371,7 +434,8 @@ v3dv_DestroyPipelineLayout(VkDevice _device,
if (!pipeline_layout)
return;
- vk_object_free(&device->vk, pAllocator, pipeline_layout);
+
+ v3dv_pipeline_layout_unref(device, pipeline_layout, pAllocator);
}
VKAPI_ATTR VkResult VKAPI_CALL
@@ -393,7 +457,10 @@ v3dv_CreateDescriptorPool(VkDevice _device,
uint32_t bo_size = 0;
uint32_t descriptor_count = 0;
- assert(pCreateInfo->poolSizeCount > 0);
+ const VkDescriptorPoolInlineUniformBlockCreateInfo *inline_info =
+ vk_find_struct_const(pCreateInfo->pNext,
+ DESCRIPTOR_POOL_INLINE_UNIFORM_BLOCK_CREATE_INFO);
+
for (unsigned i = 0; i < pCreateInfo->poolSizeCount; ++i) {
/* Verify supported descriptor type */
switch(pCreateInfo->pPoolSizes[i].type) {
@@ -408,6 +475,7 @@ v3dv_CreateDescriptorPool(VkDevice _device,
case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
+ case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
break;
default:
unreachable("Unimplemented descriptor type");
@@ -415,9 +483,28 @@ v3dv_CreateDescriptorPool(VkDevice _device,
}
assert(pCreateInfo->pPoolSizes[i].descriptorCount > 0);
- descriptor_count += pCreateInfo->pPoolSizes[i].descriptorCount;
- bo_size += v3dv_X(device, descriptor_bo_size)(pCreateInfo->pPoolSizes[i].type) *
- pCreateInfo->pPoolSizes[i].descriptorCount;
+ if (pCreateInfo->pPoolSizes[i].type ==
+ VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
+ /* Inline uniform blocks are specified to use the descriptor array
+ * size as the size in bytes of the block.
+ */
+ assert(inline_info);
+ descriptor_count += inline_info->maxInlineUniformBlockBindings;
+ bo_size += pCreateInfo->pPoolSizes[i].descriptorCount;
+ } else {
+ descriptor_count += pCreateInfo->pPoolSizes[i].descriptorCount;
+ bo_size += v3dv_X(device, descriptor_bo_size)(pCreateInfo->pPoolSizes[i].type) *
+ pCreateInfo->pPoolSizes[i].descriptorCount;
+ }
+ }
+
+   /* We align all our buffers to V3D_NON_COHERENT_ATOM_SIZE, so make sure we
+ * allocate enough memory to honor that requirement for all our inline
+ * buffers too.
+ */
+ if (inline_info) {
+ bo_size += V3D_NON_COHERENT_ATOM_SIZE *
+ inline_info->maxInlineUniformBlockBindings;
}
if (!(pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT)) {
@@ -433,7 +520,7 @@ v3dv_CreateDescriptorPool(VkDevice _device,
VK_OBJECT_TYPE_DESCRIPTOR_POOL);
if (!pool)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
if (!(pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT)) {
pool->host_memory_base = (uint8_t*)pool + sizeof(struct v3dv_descriptor_pool);
@@ -457,13 +544,15 @@ v3dv_CreateDescriptorPool(VkDevice _device,
pool->bo = NULL;
}
+ list_inithead(&pool->set_list);
+
*pDescriptorPool = v3dv_descriptor_pool_to_handle(pool);
return VK_SUCCESS;
out_of_device_memory:
vk_object_free(&device->vk, pAllocator, pool);
- return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
}
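
As a rough standalone illustration of the pool sizing logic in the hunk above (the descriptor size below is a made-up constant, not the value returned by the per-version descriptor_bo_size helper):

#include <stdint.h>

#define ATOM_SIZE 256u /* stand-in for V3D_NON_COHERENT_ATOM_SIZE, assumed value */

/* Hedged sketch: regular pool sizes contribute descriptor_size * count bytes,
 * inline uniform block pool sizes contribute their descriptorCount directly
 * as bytes, and one alignment atom is reserved per potential inline binding.
 */
static uint32_t
pool_bo_size(uint32_t regular_descriptor_size, uint32_t regular_count,
             uint32_t inline_block_bytes, uint32_t max_inline_bindings)
{
   uint32_t size = regular_descriptor_size * regular_count;
   size += inline_block_bytes;
   size += ATOM_SIZE * max_inline_bindings;
   return size;
}
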
static void
@@ -498,6 +587,11 @@ v3dv_DestroyDescriptorPool(VkDevice _device,
if (!pool)
return;
+ list_for_each_entry_safe(struct v3dv_descriptor_set, set,
+ &pool->set_list, pool_link) {
+ v3dv_descriptor_set_layout_unref(device, set->layout);
+ }
+
if (!pool->host_memory_base) {
for(int i = 0; i < pool->entry_count; ++i) {
descriptor_set_destroy(device, pool, pool->entries[i].set, false);
@@ -520,6 +614,12 @@ v3dv_ResetDescriptorPool(VkDevice _device,
V3DV_FROM_HANDLE(v3dv_device, device, _device);
V3DV_FROM_HANDLE(v3dv_descriptor_pool, pool, descriptorPool);
+ list_for_each_entry_safe(struct v3dv_descriptor_set, set,
+ &pool->set_list, pool_link) {
+ v3dv_descriptor_set_layout_unref(device, set->layout);
+ }
+ list_inithead(&pool->set_list);
+
if (!pool->host_memory_base) {
for(int i = 0; i < pool->entry_count; ++i) {
descriptor_set_destroy(device, pool, pool->entries[i].set, false);
@@ -539,6 +639,15 @@ v3dv_ResetDescriptorPool(VkDevice _device,
return VK_SUCCESS;
}
+void
+v3dv_descriptor_set_layout_destroy(struct v3dv_device *device,
+ struct v3dv_descriptor_set_layout *set_layout)
+{
+ assert(set_layout->ref_cnt == 0);
+ vk_object_base_finish(&set_layout->base);
+ vk_free2(&device->vk.alloc, NULL, set_layout);
+}
+
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_CreateDescriptorSetLayout(VkDevice _device,
const VkDescriptorSetLayoutCreateInfo *pCreateInfo,
@@ -552,6 +661,13 @@ v3dv_CreateDescriptorSetLayout(VkDevice _device,
uint32_t num_bindings = 0;
uint32_t immutable_sampler_count = 0;
+
+   /* For immutable descriptors, the plane stride is the largest plane
+ * count of all combined image samplers. For mutable descriptors
+ * this is always 1 since multiplanar images are restricted to
+ * immutable combined image samplers.
+ */
+ uint8_t plane_stride = 1;
for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) {
num_bindings = MAX2(num_bindings, pCreateInfo->pBindings[j].binding + 1);
@@ -570,22 +686,40 @@ v3dv_CreateDescriptorSetLayout(VkDevice _device,
if ((desc_type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ||
desc_type == VK_DESCRIPTOR_TYPE_SAMPLER) &&
pCreateInfo->pBindings[j].pImmutableSamplers) {
- immutable_sampler_count += pCreateInfo->pBindings[j].descriptorCount;
+ uint32_t descriptor_count = pCreateInfo->pBindings[j].descriptorCount;
+ immutable_sampler_count += descriptor_count;
+
+ for (uint32_t i = 0; i < descriptor_count; i++) {
+ const VkSampler vk_sampler =
+ pCreateInfo->pBindings[j].pImmutableSamplers[i];
+ VK_FROM_HANDLE(v3dv_sampler, sampler, vk_sampler);
+ plane_stride = MAX2(plane_stride, sampler->plane_count);
+ }
}
}
- uint32_t samplers_offset = sizeof(struct v3dv_descriptor_set_layout) +
- num_bindings * sizeof(set_layout->binding[0]);
+ /* We place immutable samplers after the binding data. We want to use
+ * offsetof instead of any sizeof(struct v3dv_descriptor_set_layout)
+ * because the latter may include padding at the end of the struct.
+ */
+ uint32_t samplers_offset =
+ offsetof(struct v3dv_descriptor_set_layout, binding[num_bindings]);
+
uint32_t size = samplers_offset +
immutable_sampler_count * sizeof(struct v3dv_sampler);
- set_layout = vk_object_zalloc(&device->vk, pAllocator, size,
- VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT);
-
+ /* Descriptor set layouts are reference counted and therefore can survive
+    * vkDestroyDescriptorSetLayout, so they need to be allocated with a device
+ * scope.
+ */
+ set_layout =
+ vk_zalloc(&device->vk.alloc, size, 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
if (!set_layout)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ vk_object_base_init(&device->vk, &set_layout->base,
+ VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT);
- /* We just allocate all the immutable samplers at the end of the struct */
struct v3dv_sampler *samplers = (void*) &set_layout->binding[num_bindings];
assert(pCreateInfo->bindingCount == 0 || num_bindings > 0);
@@ -594,17 +728,15 @@ v3dv_CreateDescriptorSetLayout(VkDevice _device,
VkResult result = vk_create_sorted_bindings(pCreateInfo->pBindings,
pCreateInfo->bindingCount, &bindings);
if (result != VK_SUCCESS) {
- vk_object_free(&device->vk, pAllocator, set_layout);
- return vk_error(device->instance, result);
+ v3dv_descriptor_set_layout_destroy(device, set_layout);
+ return vk_error(device, result);
}
- memset(set_layout->binding, 0,
- size - sizeof(struct v3dv_descriptor_set_layout));
-
set_layout->binding_count = num_bindings;
set_layout->flags = pCreateInfo->flags;
set_layout->shader_stages = 0;
set_layout->bo_size = 0;
+ set_layout->ref_cnt = 1;
uint32_t descriptor_count = 0;
uint32_t dynamic_offset_count = 0;
@@ -628,6 +760,7 @@ v3dv_CreateDescriptorSetLayout(VkDevice _device,
case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
+ case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
/* Nothing here, just to keep the descriptor type filtering below */
break;
default:
@@ -639,6 +772,7 @@ v3dv_CreateDescriptorSetLayout(VkDevice _device,
set_layout->binding[binding_number].array_size = binding->descriptorCount;
set_layout->binding[binding_number].descriptor_index = descriptor_count;
set_layout->binding[binding_number].dynamic_offset_index = dynamic_offset_count;
+ set_layout->binding[binding_number].plane_stride = plane_stride;
if ((binding->descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ||
binding->descriptorType == VK_DESCRIPTOR_TYPE_SAMPLER) &&
@@ -651,18 +785,40 @@ v3dv_CreateDescriptorSetLayout(VkDevice _device,
samplers += binding->descriptorCount;
samplers_offset += sizeof(struct v3dv_sampler) * binding->descriptorCount;
- }
- descriptor_count += binding->descriptorCount;
- dynamic_offset_count += binding->descriptorCount *
- set_layout->binding[binding_number].dynamic_offset_count;
+ set_layout->binding[binding_number].plane_stride = plane_stride;
+ }
set_layout->shader_stages |= binding->stageFlags;
- set_layout->binding[binding_number].descriptor_offset = set_layout->bo_size;
- set_layout->bo_size +=
- v3dv_X(device, descriptor_bo_size)(set_layout->binding[binding_number].type) *
- binding->descriptorCount;
+ if (binding->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
+ dynamic_offset_count += binding->descriptorCount *
+ set_layout->binding[binding_number].dynamic_offset_count;
+
+ descriptor_count += binding->descriptorCount;
+
+ set_layout->binding[binding_number].descriptor_offset =
+ set_layout->bo_size;
+ set_layout->bo_size +=
+ v3dv_X(device, descriptor_bo_size)(set_layout->binding[binding_number].type) *
+ binding->descriptorCount * set_layout->binding[binding_number].plane_stride;
+ } else {
+ /* We align all our buffers, inline buffers too. We made sure to take
+       * this into account when calculating total BO size requirements at pool
+ * creation time.
+ */
+ set_layout->bo_size = align(set_layout->bo_size,
+ V3D_NON_COHERENT_ATOM_SIZE);
+
+ set_layout->binding[binding_number].descriptor_offset =
+ set_layout->bo_size;
+
+      /* Inline uniform blocks are not arrayed; instead, descriptorCount
+ * specifies the size of the buffer in bytes.
+ */
+ set_layout->bo_size += binding->descriptorCount;
+ descriptor_count++;
+ }
}
free(bindings);
@@ -686,7 +842,7 @@ v3dv_DestroyDescriptorSetLayout(VkDevice _device,
if (!set_layout)
return;
- vk_object_free(&device->vk, pAllocator, set_layout);
+ v3dv_descriptor_set_layout_unref(device, set_layout);
}
static inline VkResult
@@ -697,7 +853,7 @@ out_of_pool_memory(const struct v3dv_device *device,
* by allocating a new pool, so they don't point to real issues.
*/
if (!pool->is_driver_internal)
-      return vk_error(device->instance, VK_ERROR_OUT_OF_POOL_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_POOL_MEMORY);
else
return VK_ERROR_OUT_OF_POOL_MEMORY;
}
@@ -705,7 +861,7 @@ out_of_pool_memory(const struct v3dv_device *device,
static VkResult
descriptor_set_create(struct v3dv_device *device,
struct v3dv_descriptor_pool *pool,
- const struct v3dv_descriptor_set_layout *layout,
+ struct v3dv_descriptor_set_layout *layout,
struct v3dv_descriptor_set **out_set)
{
struct v3dv_descriptor_set *set;
@@ -726,7 +882,7 @@ descriptor_set_create(struct v3dv_device *device,
VK_OBJECT_TYPE_DESCRIPTOR_SET);
if (!set)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
}
set->pool = pool;
@@ -797,19 +953,24 @@ descriptor_set_create(struct v3dv_device *device,
layout->binding[b].immutable_samplers_offset);
for (uint32_t i = 0; i < layout->binding[b].array_size; i++) {
- uint32_t combined_offset =
- layout->binding[b].type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ?
- v3dv_X(device, combined_image_sampler_sampler_state_offset)() : 0;
-
- void *desc_map = descriptor_bo_map(device, set, &layout->binding[b], i);
- desc_map += combined_offset;
-
- memcpy(desc_map,
- samplers[i].sampler_state,
- sizeof(samplers[i].sampler_state));
+ assert(samplers[i].plane_count <= V3DV_MAX_PLANE_COUNT);
+ for (uint8_t plane = 0; plane < samplers[i].plane_count; plane++) {
+ uint32_t combined_offset =
+ layout->binding[b].type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ?
+ v3dv_X(device, combined_image_sampler_sampler_state_offset)(plane) : 0;
+ void *desc_map =
+ descriptor_bo_map(device, set, &layout->binding[b], i);
+ desc_map += combined_offset;
+
+ memcpy(desc_map, samplers[i].sampler_state,
+ sizeof(samplers[i].sampler_state));
+ }
}
}
+ v3dv_descriptor_set_layout_ref(layout);
+ list_addtail(&set->pool_link, &pool->set_list);
+
*out_set = set;
return VK_SUCCESS;
@@ -860,8 +1021,13 @@ v3dv_FreeDescriptorSets(VkDevice _device,
for (uint32_t i = 0; i < count; i++) {
V3DV_FROM_HANDLE(v3dv_descriptor_set, set, pDescriptorSets[i]);
- if (set && !pool->host_memory_base)
- descriptor_set_destroy(device, pool, set, true);
+
+ if (set) {
+ v3dv_descriptor_set_layout_unref(device, set->layout);
+ list_del(&set->pool_link);
+ if (!pool->host_memory_base)
+ descriptor_set_destroy(device, pool, set, true);
+ }
}
return VK_SUCCESS;
@@ -877,11 +1043,16 @@ descriptor_bo_copy(struct v3dv_device *device,
uint32_t src_array_index)
{
assert(dst_binding_layout->type == src_binding_layout->type);
+ assert(src_binding_layout->plane_stride == dst_binding_layout->plane_stride);
- void *dst_map = descriptor_bo_map(device, dst_set, dst_binding_layout, dst_array_index);
- void *src_map = descriptor_bo_map(device, src_set, src_binding_layout, src_array_index);
+ void *dst_map = descriptor_bo_map(device, dst_set, dst_binding_layout,
+ dst_array_index);
+ void *src_map = descriptor_bo_map(device, src_set, src_binding_layout,
+ src_array_index);
- memcpy(dst_map, src_map, v3dv_X(device, descriptor_bo_size)(src_binding_layout->type));
+ memcpy(dst_map, src_map,
+ v3dv_X(device, descriptor_bo_size)(src_binding_layout->type) *
+ src_binding_layout->plane_stride);
}
static void
@@ -916,26 +1087,39 @@ write_image_descriptor(struct v3dv_device *device,
descriptor->sampler = sampler;
descriptor->image_view = iview;
+ assert(iview || sampler);
+ uint8_t plane_count = iview ? iview->plane_count : sampler->plane_count;
+
void *desc_map = descriptor_bo_map(device, set,
binding_layout, array_index);
- if (iview) {
- const uint32_t tex_state_index =
- iview->vk.view_type != VK_IMAGE_VIEW_TYPE_CUBE_ARRAY ||
- desc_type != VK_DESCRIPTOR_TYPE_STORAGE_IMAGE ? 0 : 1;
- memcpy(desc_map,
- iview->texture_shader_state[tex_state_index],
- sizeof(iview->texture_shader_state[0]));
- desc_map += v3dv_X(device, combined_image_sampler_sampler_state_offset)();
- }
+ for (uint8_t plane = 0; plane < plane_count; plane++) {
+ if (iview) {
+ uint32_t offset = desc_type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ?
+ v3dv_X(device, combined_image_sampler_texture_state_offset)(plane) : 0;
- if (sampler && !binding_layout->immutable_samplers_offset) {
- /* For immutable samplers this was already done as part of the
- * descriptor set create, as that info can't change later
- */
- memcpy(desc_map,
- sampler->sampler_state,
- sizeof(sampler->sampler_state));
+ void *plane_desc_map = desc_map + offset;
+
+ const uint32_t tex_state_index =
+ iview->vk.view_type != VK_IMAGE_VIEW_TYPE_CUBE_ARRAY ||
+ desc_type != VK_DESCRIPTOR_TYPE_STORAGE_IMAGE ? 0 : 1;
+ memcpy(plane_desc_map,
+ iview->planes[plane].texture_shader_state[tex_state_index],
+ sizeof(iview->planes[plane].texture_shader_state[0]));
+ }
+
+ if (sampler && !binding_layout->immutable_samplers_offset) {
+ uint32_t offset = desc_type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ?
+ v3dv_X(device, combined_image_sampler_sampler_state_offset)(plane) : 0;
+
+ void *plane_desc_map = desc_map + offset;
+ /* For immutable samplers this was already done as part of the
+ * descriptor set create, as that info can't change later
+ */
+ memcpy(plane_desc_map,
+ sampler->sampler_state,
+ sizeof(sampler->sampler_state));
+ }
}
}
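
The per-plane loop above writes one texture shader state record per plane and, for combined image samplers, one sampler state record per plane. A hedged sketch of that iteration pattern is shown below; the interleaved layout and the 32/24-byte record sizes are assumptions made for illustration only, since the driver queries the real per-plane offsets from the version-specific combined_image_sampler_{texture,sampler}_state_offset helpers.

#include <stdint.h>
#include <string.h>

/* Hedged sketch: copy per-plane texture and sampler state records into a
 * descriptor map, one pair of records per plane.
 */
static void
write_combined_image_sampler(uint8_t *desc_map,
                             const uint8_t tex_state[][32],
                             const uint8_t sampler_state[][24],
                             unsigned plane_count)
{
   const unsigned per_plane = 32 + 24; /* assumed record sizes */
   for (unsigned p = 0; p < plane_count; p++) {
      memcpy(desc_map + p * per_plane, tex_state[p], 32);
      memcpy(desc_map + p * per_plane + 32, sampler_state[p], 24);
   }
}
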
@@ -960,6 +1144,31 @@ write_buffer_view_descriptor(struct v3dv_device *device,
sizeof(bview->texture_shader_state));
}
+static void
+write_inline_uniform_descriptor(struct v3dv_device *device,
+ struct v3dv_descriptor *descriptor,
+ struct v3dv_descriptor_set *set,
+ const struct v3dv_descriptor_set_binding_layout *binding_layout,
+ const void *data,
+ size_t offset,
+ size_t size)
+{
+ assert(binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK);
+ descriptor->type = VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK;
+ descriptor->buffer = NULL;
+
+ void *desc_map = descriptor_bo_map(device, set, binding_layout, 0);
+ memcpy(desc_map + offset, data, size);
+
+   /* The pool allocates BO space up front for all the inline buffers it may
+    * need, and this space is assigned to individual descriptors as they are
+    * written, so we define the range of an inline buffer as the largest
+    * range of data that the client has written to it.
+    */
+ descriptor->offset = 0;
+ descriptor->range = MAX2(descriptor->range, offset + size);
+}
+
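
A minimal standalone sketch of how inline uniform block writes reinterpret the Vulkan parameters (dstArrayElement as a byte offset, descriptorCount as a byte size) and grow the tracked range; the struct below is illustrative and not the driver's real descriptor type.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

struct inline_block {
   uint8_t *map;   /* hypothetical pointer into the descriptor pool BO */
   size_t range;   /* largest extent the client has written so far */
};

/* Hedged sketch of the update semantics used above. */
static void
inline_block_write(struct inline_block *block, size_t offset,
                   const void *data, size_t size)
{
   memcpy(block->map + offset, data, size);
   if (offset + size > block->range)
      block->range = offset + size;
}
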
VKAPI_ATTR void VKAPI_CALL
v3dv_UpdateDescriptorSets(VkDevice _device,
uint32_t descriptorWriteCount,
@@ -978,9 +1187,20 @@ v3dv_UpdateDescriptorSets(VkDevice _device,
struct v3dv_descriptor *descriptor = set->descriptors;
descriptor += binding_layout->descriptor_index;
- descriptor += writeset->dstArrayElement;
- for (uint32_t j = 0; j < writeset->descriptorCount; ++j) {
+      /* Inline uniform blocks are not arrayed; instead, they use dstArrayElement
+ * to specify the byte offset of the uniform update and descriptorCount
+ * to specify the size (in bytes) of the update.
+ */
+ uint32_t descriptor_count;
+ if (writeset->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
+ descriptor += writeset->dstArrayElement;
+ descriptor_count = writeset->descriptorCount;
+ } else {
+ descriptor_count = 1;
+ }
+
+ for (uint32_t j = 0; j < descriptor_count; ++j) {
switch(writeset->descriptorType) {
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
@@ -993,12 +1213,11 @@ v3dv_UpdateDescriptorSets(VkDevice _device,
break;
}
case VK_DESCRIPTOR_TYPE_SAMPLER: {
- /* If we are here we shouldn't be modifying a immutable sampler,
- * so we don't ensure that would work or not crash. But let the
- * validation layers check that
- */
+ /* If we are here we shouldn't be modifying an immutable sampler */
+ assert(!binding_layout->immutable_samplers_offset);
const VkDescriptorImageInfo *image_info = writeset->pImageInfo + j;
V3DV_FROM_HANDLE(v3dv_sampler, sampler, image_info->sampler);
+
write_image_descriptor(device, descriptor, writeset->descriptorType,
set, binding_layout, NULL, sampler,
writeset->dstArrayElement + j);
@@ -1010,6 +1229,7 @@ v3dv_UpdateDescriptorSets(VkDevice _device,
case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: {
const VkDescriptorImageInfo *image_info = writeset->pImageInfo + j;
V3DV_FROM_HANDLE(v3dv_image_view, iview, image_info->imageView);
+
write_image_descriptor(device, descriptor, writeset->descriptorType,
set, binding_layout, iview, NULL,
writeset->dstArrayElement + j);
@@ -1019,7 +1239,17 @@ v3dv_UpdateDescriptorSets(VkDevice _device,
case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: {
const VkDescriptorImageInfo *image_info = writeset->pImageInfo + j;
V3DV_FROM_HANDLE(v3dv_image_view, iview, image_info->imageView);
- V3DV_FROM_HANDLE(v3dv_sampler, sampler, image_info->sampler);
+ struct v3dv_sampler *sampler = NULL;
+ if (!binding_layout->immutable_samplers_offset) {
+ /* In general we ignore the sampler when updating a combined
+ * image sampler, but for YCbCr we kwnow that we must use
+ * immutable combined image samplers
+ */
+ assert(iview->plane_count == 1);
+ V3DV_FROM_HANDLE(v3dv_sampler, _sampler, image_info->sampler);
+ sampler = _sampler;
+ }
+
write_image_descriptor(device, descriptor, writeset->descriptorType,
set, binding_layout, iview, sampler,
writeset->dstArrayElement + j);
@@ -1035,6 +1265,18 @@ v3dv_UpdateDescriptorSets(VkDevice _device,
writeset->dstArrayElement + j);
break;
}
+ case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: {
+ const VkWriteDescriptorSetInlineUniformBlock *inline_write =
+ vk_find_struct_const(writeset->pNext,
+ WRITE_DESCRIPTOR_SET_INLINE_UNIFORM_BLOCK);
+ assert(inline_write->dataSize == writeset->descriptorCount);
+ write_inline_uniform_descriptor(device, descriptor, set,
+ binding_layout,
+ inline_write->pData,
+ writeset->dstArrayElement, /* offset */
+ inline_write->dataSize);
+ break;
+ }
default:
unreachable("unimplemented descriptor type");
break;
@@ -1061,9 +1303,25 @@ v3dv_UpdateDescriptorSets(VkDevice _device,
struct v3dv_descriptor *dst_descriptor = dst_set->descriptors;
src_descriptor += src_binding_layout->descriptor_index;
- src_descriptor += copyset->srcArrayElement;
-
dst_descriptor += dst_binding_layout->descriptor_index;
+
+ if (src_binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
+ /* {src,dst}ArrayElement specifies src/dst start offset and
+ * descriptorCount specifies size (in bytes) to copy.
+ */
+ const void *src_data = src_set->pool->bo->map +
+ src_set->base_offset +
+ src_binding_layout->descriptor_offset +
+ copyset->srcArrayElement;
+ write_inline_uniform_descriptor(device, dst_descriptor, dst_set,
+ dst_binding_layout,
+ src_data,
+ copyset->dstArrayElement,
+ copyset->descriptorCount);
+ continue;
+ }
+
+ src_descriptor += copyset->srcArrayElement;
dst_descriptor += copyset->dstArrayElement;
for (uint32_t j = 0; j < copyset->descriptorCount; j++) {
@@ -1127,66 +1385,6 @@ v3dv_GetDescriptorSetLayoutSupport(
pSupport->supported = supported;
}
-VkResult
-v3dv_CreateDescriptorUpdateTemplate(
- VkDevice _device,
- const VkDescriptorUpdateTemplateCreateInfo *pCreateInfo,
- const VkAllocationCallbacks *pAllocator,
- VkDescriptorUpdateTemplate *pDescriptorUpdateTemplate)
-{
- V3DV_FROM_HANDLE(v3dv_device, device, _device);
- struct v3dv_descriptor_update_template *template;
-
- size_t size = sizeof(*template) +
- pCreateInfo->descriptorUpdateEntryCount * sizeof(template->entries[0]);
- template = vk_object_alloc(&device->vk, pAllocator, size,
- VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE);
- if (template == NULL)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
-
- template->bind_point = pCreateInfo->pipelineBindPoint;
-
- assert(pCreateInfo->templateType ==
- VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET);
- template->set = pCreateInfo->set;
-
- template->entry_count = pCreateInfo->descriptorUpdateEntryCount;
- for (uint32_t i = 0; i < template->entry_count; i++) {
- const VkDescriptorUpdateTemplateEntry *pEntry =
- &pCreateInfo->pDescriptorUpdateEntries[i];
-
- template->entries[i] = (struct v3dv_descriptor_template_entry) {
- .type = pEntry->descriptorType,
- .binding = pEntry->dstBinding,
- .array_element = pEntry->dstArrayElement,
- .array_count = pEntry->descriptorCount,
- .offset = pEntry->offset,
- .stride = pEntry->stride,
- };
- }
-
- *pDescriptorUpdateTemplate =
- v3dv_descriptor_update_template_to_handle(template);
-
- return VK_SUCCESS;
-}
-
-void
-v3dv_DestroyDescriptorUpdateTemplate(
- VkDevice _device,
- VkDescriptorUpdateTemplate descriptorUpdateTemplate,
- const VkAllocationCallbacks *pAllocator)
-{
- V3DV_FROM_HANDLE(v3dv_device, device, _device);
- V3DV_FROM_HANDLE(v3dv_descriptor_update_template, template,
- descriptorUpdateTemplate);
-
- if (!template)
- return;
-
- vk_object_free(&device->vk, pAllocator, template);
-}
-
void
v3dv_UpdateDescriptorSetWithTemplate(
VkDevice _device,
@@ -1196,11 +1394,11 @@ v3dv_UpdateDescriptorSetWithTemplate(
{
V3DV_FROM_HANDLE(v3dv_device, device, _device);
V3DV_FROM_HANDLE(v3dv_descriptor_set, set, descriptorSet);
- V3DV_FROM_HANDLE(v3dv_descriptor_update_template, template,
+ V3DV_FROM_HANDLE(vk_descriptor_update_template, template,
descriptorUpdateTemplate);
for (int i = 0; i < template->entry_count; i++) {
- const struct v3dv_descriptor_template_entry *entry =
+ const struct vk_descriptor_template_entry *entry =
&template->entries[i];
const struct v3dv_descriptor_set_binding_layout *binding_layout =
@@ -1208,8 +1406,7 @@ v3dv_UpdateDescriptorSetWithTemplate(
struct v3dv_descriptor *descriptor =
set->descriptors +
- binding_layout->descriptor_index +
- entry->array_element;
+ binding_layout->descriptor_index;
switch (entry->type) {
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
@@ -1219,7 +1416,8 @@ v3dv_UpdateDescriptorSetWithTemplate(
for (uint32_t j = 0; j < entry->array_count; j++) {
const VkDescriptorBufferInfo *info =
pData + entry->offset + j * entry->stride;
- write_buffer_descriptor(descriptor + j, entry->type, info);
+ write_buffer_descriptor(descriptor + entry->array_element + j,
+ entry->type, info);
}
break;
@@ -1233,9 +1431,9 @@ v3dv_UpdateDescriptorSetWithTemplate(
pData + entry->offset + j * entry->stride;
V3DV_FROM_HANDLE(v3dv_image_view, iview, info->imageView);
V3DV_FROM_HANDLE(v3dv_sampler, sampler, info->sampler);
- write_image_descriptor(device, descriptor + j, entry->type,
- set, binding_layout, iview, sampler,
- entry->array_element + j);
+ write_image_descriptor(device, descriptor + entry->array_element + j,
+ entry->type, set, binding_layout, iview,
+ sampler, entry->array_element + j);
}
break;
@@ -1245,34 +1443,24 @@ v3dv_UpdateDescriptorSetWithTemplate(
const VkBufferView *_bview =
pData + entry->offset + j * entry->stride;
V3DV_FROM_HANDLE(v3dv_buffer_view, bview, *_bview);
- write_buffer_view_descriptor(device, descriptor + j, entry->type,
- set, binding_layout, bview,
+ write_buffer_view_descriptor(device,
+ descriptor + entry->array_element + j,
+ entry->type, set, binding_layout, bview,
entry->array_element + j);
}
break;
+ case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: {
+ write_inline_uniform_descriptor(device, descriptor, set,
+ binding_layout,
+ pData + entry->offset,
+ entry->array_element, /* offset */
+ entry->array_count); /* size */
+ break;
+ }
+
default:
unreachable("Unsupported descriptor type");
}
}
}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_CreateSamplerYcbcrConversion(
- VkDevice _device,
- const VkSamplerYcbcrConversionCreateInfo *pCreateInfo,
- const VkAllocationCallbacks *pAllocator,
- VkSamplerYcbcrConversion *pYcbcrConversion)
-{
- unreachable("Ycbcr sampler conversion is not supported");
- return VK_SUCCESS;
-}
-
-VKAPI_ATTR void VKAPI_CALL
-v3dv_DestroySamplerYcbcrConversion(
- VkDevice _device,
- VkSamplerYcbcrConversion YcbcrConversion,
- const VkAllocationCallbacks *pAllocator)
-{
- unreachable("Ycbcr sampler conversion is not supported");
-}
diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
index fec53ec38c5..7992cab59ff 100644
--- a/src/broadcom/vulkan/v3dv_device.c
+++ b/src/broadcom/vulkan/v3dv_device.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2019 Raspberry Pi
+ * Copyright © 2019 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -44,12 +44,18 @@
#include "compiler/v3d_compiler.h"
#include "drm-uapi/v3d_drm.h"
-#include "format/u_format.h"
+#include "vk_drm_syncobj.h"
#include "vk_util.h"
+#include "git_sha1.h"
#include "util/build_id.h"
-#include "util/debug.h"
-#include "util/u_cpu_detect.h"
+#include "util/os_file.h"
+#include "util/u_debug.h"
+#include "util/format/u_format.h"
+
+#if DETECT_OS_ANDROID
+#include "vk_android.h"
+#endif
#ifdef VK_USE_PLATFORM_XCB_KHR
#include <xcb/xcb.h>
@@ -62,11 +68,15 @@
#include "wayland-drm-client-protocol.h"
#endif
-#ifdef USE_V3D_SIMULATOR
-#include "drm-uapi/i915_drm.h"
-#endif
+#define V3DV_API_VERSION VK_MAKE_VERSION(1, 2, VK_HEADER_VERSION)
-#define V3DV_API_VERSION VK_MAKE_VERSION(1, 0, VK_HEADER_VERSION)
+#ifdef ANDROID_STRICT
+#if ANDROID_API_LEVEL <= 32
+/* Android 12.1 and lower support only Vulkan API v1.1 */
+#undef V3DV_API_VERSION
+#define V3DV_API_VERSION VK_MAKE_VERSION(1, 1, VK_HEADER_VERSION)
+#endif
+#endif
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_EnumerateInstanceVersion(uint32_t *pApiVersion)
@@ -75,25 +85,32 @@ v3dv_EnumerateInstanceVersion(uint32_t *pApiVersion)
return VK_SUCCESS;
}
-#define V3DV_HAS_SURFACE (VK_USE_PLATFORM_WIN32_KHR || \
- VK_USE_PLATFORM_WAYLAND_KHR || \
- VK_USE_PLATFORM_XCB_KHR || \
- VK_USE_PLATFORM_XLIB_KHR || \
- VK_USE_PLATFORM_DISPLAY_KHR)
+#if defined(VK_USE_PLATFORM_WIN32_KHR) || \
+ defined(VK_USE_PLATFORM_WAYLAND_KHR) || \
+ defined(VK_USE_PLATFORM_XCB_KHR) || \
+ defined(VK_USE_PLATFORM_XLIB_KHR) || \
+ defined(VK_USE_PLATFORM_DISPLAY_KHR)
+#define V3DV_USE_WSI_PLATFORM
+#endif
static const struct vk_instance_extension_table instance_extensions = {
.KHR_device_group_creation = true,
#ifdef VK_USE_PLATFORM_DISPLAY_KHR
.KHR_display = true,
+ .KHR_get_display_properties2 = true,
+ .EXT_direct_mode_display = true,
+ .EXT_acquire_drm_display = true,
#endif
.KHR_external_fence_capabilities = true,
.KHR_external_memory_capabilities = true,
.KHR_external_semaphore_capabilities = true,
- .KHR_get_display_properties2 = true,
.KHR_get_physical_device_properties2 = true,
-#ifdef V3DV_HAS_SURFACE
+#ifdef V3DV_USE_WSI_PLATFORM
.KHR_get_surface_capabilities2 = true,
.KHR_surface = true,
+ .KHR_surface_protected_capabilities = true,
+ .EXT_surface_maintenance1 = true,
+ .EXT_swapchain_colorspace = true,
#endif
#ifdef VK_USE_PLATFORM_WAYLAND_KHR
.KHR_wayland_surface = true,
@@ -104,7 +121,14 @@ static const struct vk_instance_extension_table instance_extensions = {
#ifdef VK_USE_PLATFORM_XLIB_KHR
.KHR_xlib_surface = true,
#endif
+#ifdef VK_USE_PLATFORM_XLIB_XRANDR_EXT
+ .EXT_acquire_xlib_display = true,
+#endif
+#ifndef VK_USE_PLATFORM_WIN32_KHR
+ .EXT_headless_surface = true,
+#endif
.EXT_debug_report = true,
+ .EXT_debug_utils = true,
};
static void
@@ -112,43 +136,354 @@ get_device_extensions(const struct v3dv_physical_device *device,
struct vk_device_extension_table *ext)
{
*ext = (struct vk_device_extension_table) {
- .KHR_bind_memory2 = true,
- .KHR_copy_commands2 = true,
- .KHR_dedicated_allocation = true,
- .KHR_device_group = true,
- .KHR_descriptor_update_template = true,
- .KHR_external_fence = true,
- .KHR_external_fence_fd = true,
- .KHR_external_memory = true,
- .KHR_external_memory_fd = true,
- .KHR_external_semaphore = true,
- .KHR_external_semaphore_fd = true,
- .KHR_get_memory_requirements2 = true,
- .KHR_image_format_list = true,
- .KHR_relaxed_block_layout = true,
- .KHR_maintenance1 = true,
- .KHR_maintenance2 = true,
- .KHR_maintenance3 = true,
- .KHR_multiview = true,
- .KHR_shader_non_semantic_info = true,
- .KHR_sampler_mirror_clamp_to_edge = true,
- .KHR_storage_buffer_storage_class = true,
- .KHR_uniform_buffer_standard_layout = true,
-#ifdef V3DV_HAS_SURFACE
- .KHR_swapchain = true,
- .KHR_incremental_present = true,
+ .KHR_8bit_storage = true,
+ .KHR_16bit_storage = true,
+ .KHR_bind_memory2 = true,
+ .KHR_buffer_device_address = true,
+ .KHR_copy_commands2 = true,
+ .KHR_create_renderpass2 = true,
+ .KHR_dedicated_allocation = true,
+ .KHR_device_group = true,
+ .KHR_driver_properties = true,
+ .KHR_descriptor_update_template = true,
+ .KHR_depth_stencil_resolve = true,
+ .KHR_dynamic_rendering = true,
+ .KHR_external_fence = true,
+ .KHR_external_fence_fd = true,
+ .KHR_external_memory = true,
+ .KHR_external_memory_fd = true,
+ .KHR_external_semaphore = true,
+ .KHR_external_semaphore_fd = true,
+ .KHR_format_feature_flags2 = true,
+ .KHR_get_memory_requirements2 = true,
+ .KHR_image_format_list = true,
+ .KHR_imageless_framebuffer = true,
+ .KHR_index_type_uint8 = true,
+ .KHR_line_rasterization = true,
+ .KHR_load_store_op_none = true,
+ .KHR_performance_query = device->caps.perfmon,
+ .KHR_relaxed_block_layout = true,
+ .KHR_maintenance1 = true,
+ .KHR_maintenance2 = true,
+ .KHR_maintenance3 = true,
+ .KHR_maintenance4 = true,
+ .KHR_multiview = true,
+ .KHR_pipeline_executable_properties = true,
+ .KHR_separate_depth_stencil_layouts = true,
+ .KHR_shader_expect_assume = true,
+ .KHR_shader_float_controls = true,
+ .KHR_shader_non_semantic_info = true,
+ .KHR_sampler_mirror_clamp_to_edge = true,
+ .KHR_sampler_ycbcr_conversion = true,
+ .KHR_spirv_1_4 = true,
+ .KHR_storage_buffer_storage_class = true,
+ .KHR_timeline_semaphore = true,
+ .KHR_uniform_buffer_standard_layout = true,
+ .KHR_shader_integer_dot_product = true,
+ .KHR_shader_terminate_invocation = true,
+ .KHR_synchronization2 = true,
+ .KHR_workgroup_memory_explicit_layout = true,
+#ifdef V3DV_USE_WSI_PLATFORM
+ .KHR_swapchain = true,
+ .KHR_swapchain_mutable_format = true,
+ .KHR_incremental_present = true,
+#endif
+ .KHR_variable_pointers = true,
+ .KHR_vertex_attribute_divisor = true,
+ .KHR_vulkan_memory_model = true,
+ .KHR_zero_initialize_workgroup_memory = true,
+ .EXT_4444_formats = true,
+ .EXT_attachment_feedback_loop_layout = true,
+ .EXT_border_color_swizzle = true,
+ .EXT_color_write_enable = true,
+ .EXT_custom_border_color = true,
+ .EXT_depth_clip_control = true,
+ .EXT_depth_clip_enable = device->devinfo.ver >= 71,
+ .EXT_load_store_op_none = true,
+ .EXT_inline_uniform_block = true,
+ .EXT_extended_dynamic_state = true,
+ .EXT_external_memory_dma_buf = true,
+ .EXT_host_query_reset = true,
+ .EXT_image_drm_format_modifier = true,
+ .EXT_image_robustness = true,
+ .EXT_index_type_uint8 = true,
+ .EXT_line_rasterization = true,
+ .EXT_memory_budget = true,
+ .EXT_multi_draw = true,
+ .EXT_physical_device_drm = true,
+ .EXT_pipeline_creation_cache_control = true,
+ .EXT_pipeline_creation_feedback = true,
+ .EXT_pipeline_robustness = true,
+ .EXT_primitive_topology_list_restart = true,
+ .EXT_private_data = true,
+ .EXT_provoking_vertex = true,
+ .EXT_separate_stencil_usage = true,
+ .EXT_shader_demote_to_helper_invocation = true,
+ .EXT_shader_module_identifier = true,
+ .EXT_subgroup_size_control = true,
+#ifdef V3DV_USE_WSI_PLATFORM
+ .EXT_swapchain_maintenance1 = true,
+#endif
+ .EXT_texel_buffer_alignment = true,
+ .EXT_tooling_info = true,
+ .EXT_vertex_attribute_divisor = true,
+#if DETECT_OS_ANDROID
+ .ANDROID_external_memory_android_hardware_buffer = true,
+ .ANDROID_native_buffer = true,
+ .EXT_queue_family_foreign = true,
+#endif
+ };
+}
+
+static void
+get_features(const struct v3dv_physical_device *physical_device,
+ struct vk_features *features)
+{
+ *features = (struct vk_features) {
+ /* Vulkan 1.0 */
+ .robustBufferAccess = true, /* This feature is mandatory */
+ .fullDrawIndexUint32 = physical_device->devinfo.ver >= 71,
+ .imageCubeArray = true,
+ .independentBlend = true,
+ .geometryShader = true,
+ .tessellationShader = false,
+ .sampleRateShading = true,
+ .dualSrcBlend = false,
+ .logicOp = true,
+ .multiDrawIndirect = false,
+ .drawIndirectFirstInstance = true,
+ .depthClamp = physical_device->devinfo.ver >= 71,
+ .depthBiasClamp = true,
+ .fillModeNonSolid = true,
+ .depthBounds = physical_device->devinfo.ver >= 71,
+ .wideLines = true,
+ .largePoints = true,
+ .alphaToOne = true,
+ .multiViewport = false,
+ .samplerAnisotropy = true,
+ .textureCompressionETC2 = true,
+ .textureCompressionASTC_LDR = true,
+ /* Note that textureCompressionBC requires that the driver support all
+       * the BC formats. V3D 4.2 only supports BC1-3, so we can't claim
+ * that we support it.
+ */
+ .textureCompressionBC = false,
+ .occlusionQueryPrecise = true,
+ .pipelineStatisticsQuery = false,
+ .vertexPipelineStoresAndAtomics = true,
+ .fragmentStoresAndAtomics = true,
+ .shaderTessellationAndGeometryPointSize = true,
+ .shaderImageGatherExtended = true,
+ .shaderStorageImageExtendedFormats = true,
+ .shaderStorageImageMultisample = false,
+ .shaderStorageImageReadWithoutFormat = true,
+ .shaderStorageImageWriteWithoutFormat = false,
+ .shaderUniformBufferArrayDynamicIndexing = false,
+ .shaderSampledImageArrayDynamicIndexing = false,
+ .shaderStorageBufferArrayDynamicIndexing = false,
+ .shaderStorageImageArrayDynamicIndexing = false,
+ .shaderClipDistance = true,
+ .shaderCullDistance = false,
+ .shaderFloat64 = false,
+ .shaderInt64 = false,
+ .shaderInt16 = false,
+ .shaderResourceResidency = false,
+ .shaderResourceMinLod = false,
+ .sparseBinding = false,
+ .sparseResidencyBuffer = false,
+ .sparseResidencyImage2D = false,
+ .sparseResidencyImage3D = false,
+ .sparseResidency2Samples = false,
+ .sparseResidency4Samples = false,
+ .sparseResidency8Samples = false,
+ .sparseResidency16Samples = false,
+ .sparseResidencyAliased = false,
+ .variableMultisampleRate = false,
+ .inheritedQueries = true,
+
+ /* Vulkan 1.1 */
+ .storageBuffer16BitAccess = true,
+ .uniformAndStorageBuffer16BitAccess = true,
+ .storagePushConstant16 = true,
+ .storageInputOutput16 = false,
+ .multiview = true,
+ .multiviewGeometryShader = false,
+ .multiviewTessellationShader = false,
+ .variablePointersStorageBuffer = true,
+ /* FIXME: this needs support for non-constant index on UBO/SSBO */
+ .variablePointers = false,
+ .protectedMemory = false,
+ .samplerYcbcrConversion = true,
+ .shaderDrawParameters = false,
+
+ /* Vulkan 1.2 */
+ .hostQueryReset = true,
+ .uniformAndStorageBuffer8BitAccess = true,
+ .uniformBufferStandardLayout = true,
+ /* V3D 4.2 wraps TMU vector accesses to 16-byte boundaries, so loads and
+ * stores of vectors that cross these boundaries would not work correctly
+ * with scalarBlockLayout and would need to be split into smaller vectors
+ * (and/or scalars) that don't cross these boundaries. For load/stores
+ * with dynamic offsets where we can't identify if the offset is
+ * problematic, we would always have to scalarize. Overall, this would
+ * not lead to best performance so let's just not support it.
+ */
+ .scalarBlockLayout = physical_device->devinfo.ver >= 71,
+ /* This tells applications 2 things:
+ *
+ * 1. If they can select just one aspect for barriers. For us barriers
+ * decide if we need to split a job and we don't care if it is only
+ * for one of the aspects of the image or both, so we don't really
+ * benefit from seeing barriers that select just one aspect.
+ *
+ * 2. If they can program different layouts for each aspect. We
+ * generally don't care about layouts, so again, we don't get any
+ * benefits from this to limit the scope of image layout transitions.
+ *
+ * Still, Vulkan 1.2 requires this feature to be supported so we
+ * advertise it even though we don't really take advantage of it.
+ */
+ .separateDepthStencilLayouts = true,
+ .storageBuffer8BitAccess = true,
+ .storagePushConstant8 = true,
+ .imagelessFramebuffer = true,
+ .timelineSemaphore = true,
+
+ .samplerMirrorClampToEdge = true,
+
+ /* Extended subgroup types is mandatory by Vulkan 1.2, however, it is
+ * only in effect if the implementation supports non 32-bit types, which
+ * we don't, so in practice setting it to true doesn't have any
+ * implications for us.
+ */
+ .shaderSubgroupExtendedTypes = true,
+ .subgroupBroadcastDynamicId = true,
+
+ .vulkanMemoryModel = true,
+ .vulkanMemoryModelDeviceScope = true,
+ .vulkanMemoryModelAvailabilityVisibilityChains = true,
+
+ .bufferDeviceAddress = true,
+ .bufferDeviceAddressCaptureReplay = false,
+ .bufferDeviceAddressMultiDevice = false,
+
+ /* Vulkan 1.3 */
+ .inlineUniformBlock = true,
+      /* Inline buffers work like push constants, so after they are bound
+ * some of their contents may be copied into the uniform stream as soon
+ * as the next draw/dispatch is recorded in the command buffer. This means
+ * that if the client updates the buffer contents after binding it to
+ * a command buffer, the next queue submit of that command buffer may
+ * not use the latest update to the buffer contents, but the data that
+ * was present in the buffer at the time it was bound to the command
+ * buffer.
+ */
+ .descriptorBindingInlineUniformBlockUpdateAfterBind = false,
+ .pipelineCreationCacheControl = true,
+ .privateData = true,
+ .maintenance4 = true,
+ .shaderZeroInitializeWorkgroupMemory = true,
+ .synchronization2 = true,
+ .robustImageAccess = true,
+ .shaderIntegerDotProduct = true,
+
+ /* VK_EXT_4444_formats */
+ .formatA4R4G4B4 = true,
+ .formatA4B4G4R4 = true,
+
+ /* VK_EXT_custom_border_color */
+ .customBorderColors = true,
+ .customBorderColorWithoutFormat = false,
+
+ /* VK_EXT_index_type_uint8 */
+ .indexTypeUint8 = true,
+
+ /* VK_EXT_line_rasterization */
+ .rectangularLines = true,
+ .bresenhamLines = true,
+ .smoothLines = true,
+ .stippledRectangularLines = false,
+ .stippledBresenhamLines = false,
+ .stippledSmoothLines = false,
+
+ /* VK_EXT_color_write_enable */
+ .colorWriteEnable = true,
+
+ /* VK_EXT_extended_dynamic_state */
+ .extendedDynamicState = true,
+
+ /* VK_KHR_pipeline_executable_properties */
+ .pipelineExecutableInfo = true,
+
+ /* VK_EXT_provoking_vertex */
+ .provokingVertexLast = true,
+ /* FIXME: update when supporting EXT_transform_feedback */
+ .transformFeedbackPreservesProvokingVertex = false,
+
+ /* VK_EXT_vertex_attribute_divisor */
+ .vertexAttributeInstanceRateDivisor = true,
+ .vertexAttributeInstanceRateZeroDivisor = false,
+
+ /* VK_KHR_performance_query */
+ .performanceCounterQueryPools = physical_device->caps.perfmon,
+ .performanceCounterMultipleQueryPools = false,
+
+ /* VK_EXT_texel_buffer_alignment */
+ .texelBufferAlignment = true,
+
+ /* VK_KHR_workgroup_memory_explicit_layout */
+ .workgroupMemoryExplicitLayout = true,
+ .workgroupMemoryExplicitLayoutScalarBlockLayout = false,
+ .workgroupMemoryExplicitLayout8BitAccess = true,
+ .workgroupMemoryExplicitLayout16BitAccess = true,
+
+ /* VK_EXT_border_color_swizzle */
+ .borderColorSwizzle = true,
+ .borderColorSwizzleFromImage = true,
+
+ /* VK_EXT_shader_module_identifier */
+ .shaderModuleIdentifier = true,
+
+ /* VK_EXT_depth_clip_control */
+ .depthClipControl = true,
+
+ /* VK_EXT_depth_clip_enable */
+ .depthClipEnable = physical_device->devinfo.ver >= 71,
+
+ /* VK_EXT_attachment_feedback_loop_layout */
+ .attachmentFeedbackLoopLayout = true,
+
+ /* VK_EXT_primitive_topology_list_restart */
+ .primitiveTopologyListRestart = true,
+ /* FIXME: we don't support tessellation shaders yet */
+ .primitiveTopologyPatchListRestart = false,
+
+ /* VK_EXT_pipeline_robustness */
+ .pipelineRobustness = true,
+
+ /* VK_EXT_multi_draw */
+ .multiDraw = true,
+
+ /* VK_KHR_shader_terminate_invocation */
+ .shaderTerminateInvocation = true,
+
+ /* VK_EXT_shader_demote_to_helper_invocation */
+ .shaderDemoteToHelperInvocation = true,
+
+ /* VK_EXT_subgroup_size_control */
+ .subgroupSizeControl = true,
+ .computeFullSubgroups = true,
+
+ /* VK_KHR_shader_expect_assume */
+ .shaderExpectAssume = true,
+
+ /* VK_KHR_dynamic_rendering */
+ .dynamicRendering = true,
+
+#ifdef V3DV_USE_WSI_PLATFORM
+ /* VK_EXT_swapchain_maintenance1 */
+ .swapchainMaintenance1 = true,
#endif
- .KHR_variable_pointers = true,
- .EXT_color_write_enable = true,
- .EXT_custom_border_color = true,
- .EXT_external_memory_dma_buf = true,
- .EXT_index_type_uint8 = true,
- .EXT_physical_device_drm = true,
- .EXT_pipeline_creation_cache_control = true,
- .EXT_pipeline_creation_feedback = true,
- .EXT_private_data = true,
- .EXT_provoking_vertex = true,
- .EXT_vertex_attribute_divisor = true,
};
}
@@ -165,6 +500,10 @@ v3dv_EnumerateInstanceExtensionProperties(const char *pLayerName,
&instance_extensions, pPropertyCount, pProperties);
}
+static VkResult enumerate_devices(struct vk_instance *vk_instance);
+
+static void destroy_physical_device(struct vk_physical_device *device);
+
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_CreateInstance(const VkInstanceCreateInfo *pCreateInfo,
const VkAllocationCallbacks *pAllocator,
@@ -186,6 +525,8 @@ v3dv_CreateInstance(const VkInstanceCreateInfo *pCreateInfo,
struct vk_instance_dispatch_table dispatch_table;
vk_instance_dispatch_table_from_entrypoints(
&dispatch_table, &v3dv_instance_entrypoints, true);
+ vk_instance_dispatch_table_from_entrypoints(
+ &dispatch_table, &wsi_instance_entrypoints, false);
result = vk_instance_init(&instance->vk,
&instance_extensions,
@@ -194,12 +535,13 @@ v3dv_CreateInstance(const VkInstanceCreateInfo *pCreateInfo,
if (result != VK_SUCCESS) {
vk_free(pAllocator, instance);
- return vk_error(instance, result);
+ return vk_error(NULL, result);
}
v3d_process_debug_variable();
- instance->physicalDeviceCount = -1;
+ instance->vk.physical_devices.enumerate = enumerate_devices;
+ instance->vk.physical_devices.destroy = destroy_physical_device;
/* We start with the default values for the pipeline_cache envvars */
instance->pipeline_cache_enabled = true;
@@ -229,8 +571,6 @@ v3dv_CreateInstance(const VkInstanceCreateInfo *pCreateInfo,
}
}
- util_cpu_detect();
-
VG(VALGRIND_CREATE_MEMPOOL(instance, 0, false));
*pInstance = v3dv_instance_to_handle(instance);
@@ -256,11 +596,11 @@ physical_device_finish(struct v3dv_physical_device *device)
v3dv_physical_device_free_disk_cache(device);
v3d_compiler_free(device->compiler);
+ util_sparse_array_finish(&device->bo_map);
+
close(device->render_fd);
if (device->display_fd >= 0)
close(device->display_fd);
- if (device->master_fd >= 0)
- close(device->master_fd);
free(device->name);
@@ -272,6 +612,13 @@ physical_device_finish(struct v3dv_physical_device *device)
mtx_destroy(&device->mutex);
}
+static void
+destroy_physical_device(struct vk_physical_device *device)
+{
+ physical_device_finish((struct v3dv_physical_device *)device);
+ vk_free(&device->instance->alloc, device);
+}
+
VKAPI_ATTR void VKAPI_CALL
v3dv_DestroyInstance(VkInstance _instance,
const VkAllocationCallbacks *pAllocator)
@@ -281,12 +628,6 @@ v3dv_DestroyInstance(VkInstance _instance,
if (!instance)
return;
- if (instance->physicalDeviceCount > 0) {
- /* We support at most one physical device. */
- assert(instance->physicalDeviceCount == 1);
- physical_device_finish(&instance->physicalDevice);
- }
-
VG(VALGRIND_DESTROY_MEMPOOL(instance));
vk_instance_finish(&instance->vk);
@@ -306,286 +647,39 @@ compute_heap_size()
uint64_t total_ram = (uint64_t) v3d_simulator_get_mem_size();
#endif
- /* We don't want to burn too much ram with the GPU. If the user has 4GiB
- * or less, we use at most half. If they have more than 4GiB, we use 3/4.
+ /* We don't want to burn too much ram with the GPU. If the user has 4GB
+ * or less, we use at most half. If they have more than 4GB we limit it
+ * to 3/4 with a max. of 4GB since the GPU cannot address more than that.
*/
- uint64_t available_ram;
- if (total_ram <= 4ull * 1024ull * 1024ull * 1024ull)
- available_ram = total_ram / 2;
+ const uint64_t MAX_HEAP_SIZE = 4ull * 1024ull * 1024ull * 1024ull;
+ uint64_t available;
+ if (total_ram <= MAX_HEAP_SIZE)
+ available = total_ram / 2;
else
- available_ram = total_ram * 3 / 4;
-
- return available_ram;
-}
-
-#if !using_v3d_simulator
-#ifdef VK_USE_PLATFORM_XCB_KHR
-static int
-create_display_fd_xcb(VkIcdSurfaceBase *surface)
-{
- int fd = -1;
-
- xcb_connection_t *conn;
- xcb_dri3_open_reply_t *reply = NULL;
- if (surface) {
- if (surface->platform == VK_ICD_WSI_PLATFORM_XLIB)
- conn = XGetXCBConnection(((VkIcdSurfaceXlib *)surface)->dpy);
- else
- conn = ((VkIcdSurfaceXcb *)surface)->connection;
- } else {
- conn = xcb_connect(NULL, NULL);
- }
-
- if (xcb_connection_has_error(conn))
- goto finish;
-
- const xcb_setup_t *setup = xcb_get_setup(conn);
- xcb_screen_iterator_t iter = xcb_setup_roots_iterator(setup);
- xcb_screen_t *screen = iter.data;
-
- xcb_dri3_open_cookie_t cookie;
- cookie = xcb_dri3_open(conn, screen->root, None);
- reply = xcb_dri3_open_reply(conn, cookie, NULL);
- if (!reply)
- goto finish;
-
- if (reply->nfd != 1)
- goto finish;
-
- fd = xcb_dri3_open_reply_fds(conn, reply)[0];
- fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
-
-finish:
- if (!surface)
- xcb_disconnect(conn);
- if (reply)
- free(reply);
-
- return fd;
-}
-#endif
-
-#ifdef VK_USE_PLATFORM_WAYLAND_KHR
-struct v3dv_wayland_info {
- struct wl_drm *wl_drm;
- int fd;
- bool is_set;
- bool authenticated;
-};
-
-static void
-v3dv_drm_handle_device(void *data, struct wl_drm *drm, const char *device)
-{
- struct v3dv_wayland_info *info = data;
- info->fd = open(device, O_RDWR | O_CLOEXEC);
- info->is_set = info->fd != -1;
- if (!info->is_set) {
- fprintf(stderr, "v3dv_drm_handle_device: could not open %s (%s)\n",
- device, strerror(errno));
- return;
- }
-
- drm_magic_t magic;
- if (drmGetMagic(info->fd, &magic)) {
- fprintf(stderr, "v3dv_drm_handle_device: drmGetMagic failed\n");
- close(info->fd);
- info->fd = -1;
- info->is_set = false;
- return;
- }
- wl_drm_authenticate(info->wl_drm, magic);
-}
-
-static void
-v3dv_drm_handle_format(void *data, struct wl_drm *drm, uint32_t format)
-{
-}
-
-static void
-v3dv_drm_handle_authenticated(void *data, struct wl_drm *drm)
-{
- struct v3dv_wayland_info *info = data;
- info->authenticated = true;
-}
-
-static void
-v3dv_drm_handle_capabilities(void *data, struct wl_drm *drm, uint32_t value)
-{
-}
-
-struct wl_drm_listener v3dv_drm_listener = {
- .device = v3dv_drm_handle_device,
- .format = v3dv_drm_handle_format,
- .authenticated = v3dv_drm_handle_authenticated,
- .capabilities = v3dv_drm_handle_capabilities
-};
-
-static void
-v3dv_registry_global(void *data,
- struct wl_registry *registry,
- uint32_t name,
- const char *interface,
- uint32_t version)
-{
- struct v3dv_wayland_info *info = data;
- if (strcmp(interface, "wl_drm") == 0) {
- info->wl_drm = wl_registry_bind(registry, name, &wl_drm_interface,
- MIN2(version, 2));
- wl_drm_add_listener(info->wl_drm, &v3dv_drm_listener, data);
- };
-}
-
-static void
-v3dv_registry_global_remove_cb(void *data,
- struct wl_registry *registry,
- uint32_t name)
-{
-}
-
-static int
-create_display_fd_wayland(VkIcdSurfaceBase *surface)
-{
- struct wl_display *display;
- struct wl_registry *registry = NULL;
-
- struct v3dv_wayland_info info = {
- .wl_drm = NULL,
- .fd = -1,
- .is_set = false,
- .authenticated = false
- };
-
- if (surface)
- display = ((VkIcdSurfaceWayland *) surface)->display;
- else
- display = wl_display_connect(NULL);
-
- if (!display)
- return -1;
-
- registry = wl_display_get_registry(display);
- if (!registry) {
- if (!surface)
- wl_display_disconnect(display);
- return -1;
- }
-
- static const struct wl_registry_listener registry_listener = {
- v3dv_registry_global,
- v3dv_registry_global_remove_cb
- };
- wl_registry_add_listener(registry, &registry_listener, &info);
-
- wl_display_roundtrip(display); /* For the registry advertisement */
- wl_display_roundtrip(display); /* For the DRM device event */
- wl_display_roundtrip(display); /* For the authentication event */
-
- wl_drm_destroy(info.wl_drm);
- wl_registry_destroy(registry);
-
- if (!surface)
- wl_display_disconnect(display);
-
- if (!info.is_set)
- return -1;
-
- if (!info.authenticated)
- return -1;
-
- return info.fd;
-}
-#endif
-
-/* Acquire an authenticated display fd without a surface reference. This is the
- * case where the application is making WSI allocations outside the Vulkan
- * swapchain context (only Zink, for now). Since we lack information about the
- * underlying surface we just try our best to figure out the correct display
- * and platform to use. It should work in most cases.
- */
-static void
-acquire_display_device_no_surface(struct v3dv_instance *instance,
- struct v3dv_physical_device *pdevice)
-{
-#ifdef VK_USE_PLATFORM_WAYLAND_KHR
- pdevice->display_fd = create_display_fd_wayland(NULL);
-#endif
-
-#ifdef VK_USE_PLATFORM_XCB_KHR
- if (pdevice->display_fd == -1)
- pdevice->display_fd = create_display_fd_xcb(NULL);
-#endif
-
-#ifdef VK_USE_PLATFORM_DISPLAY_KHR
- if (pdevice->display_fd == - 1 && pdevice->master_fd >= 0)
- pdevice->display_fd = dup(pdevice->master_fd);
-#endif
-}
+ available = MIN2(MAX_HEAP_SIZE, total_ram * 3 / 4);
-/* Acquire an authenticated display fd from the surface. This is the regular
- * case where the application is using swapchains to create WSI allocations.
- * In this case we use the surface information to figure out the correct
- * display and platform combination.
- */
-static void
-acquire_display_device_surface(struct v3dv_instance *instance,
- struct v3dv_physical_device *pdevice,
- VkIcdSurfaceBase *surface)
-{
- /* Mesa will set both of VK_USE_PLATFORM_{XCB,XLIB} when building with
- * platform X11, so only check for XCB and rely on XCB to get an
- * authenticated device also for Xlib.
- */
-#ifdef VK_USE_PLATFORM_XCB_KHR
- if (surface->platform == VK_ICD_WSI_PLATFORM_XCB ||
- surface->platform == VK_ICD_WSI_PLATFORM_XLIB) {
- pdevice->display_fd = create_display_fd_xcb(surface);
- }
-#endif
-
-#ifdef VK_USE_PLATFORM_WAYLAND_KHR
- if (surface->platform == VK_ICD_WSI_PLATFORM_WAYLAND)
- pdevice->display_fd = create_display_fd_wayland(surface);
-#endif
-
-#ifdef VK_USE_PLATFORM_DISPLAY_KHR
- if (surface->platform == VK_ICD_WSI_PLATFORM_DISPLAY &&
- pdevice->master_fd >= 0) {
- pdevice->display_fd = dup(pdevice->master_fd);
- }
-#endif
+ return available;
}
-#endif /* !using_v3d_simulator */
-/* Attempts to get an authenticated display fd from the display server that
- * we can use to allocate BOs for presentable images.
- */
-VkResult
-v3dv_physical_device_acquire_display(struct v3dv_instance *instance,
- struct v3dv_physical_device *pdevice,
- VkIcdSurfaceBase *surface)
+static uint64_t
+compute_memory_budget(struct v3dv_physical_device *device)
{
- VkResult result = VK_SUCCESS;
- mtx_lock(&pdevice->mutex);
-
- if (pdevice->display_fd != -1)
- goto done;
-
- /* When running on the simulator we do everything on a single render node so
- * we don't need to get an authenticated display fd from the display server.
- */
+ uint64_t heap_size = device->memory.memoryHeaps[0].size;
+ uint64_t heap_used = device->heap_used;
+ uint64_t sys_available;
#if !using_v3d_simulator
- if (surface)
- acquire_display_device_surface(instance, pdevice, surface);
- else
- acquire_display_device_no_surface(instance, pdevice);
-
- if (pdevice->display_fd == -1)
- result = VK_ERROR_INITIALIZATION_FAILED;
+ ASSERTED bool has_available_memory =
+ os_get_available_system_memory(&sys_available);
+ assert(has_available_memory);
+#else
+ sys_available = (uint64_t) v3d_simulator_get_mem_free();
#endif
-done:
- mtx_unlock(&pdevice->mutex);
- return result;
+ /* Let's not incite the app to starve the system: report at most 90% of
+ * available system memory.
+ */
+ uint64_t heap_available = sys_available * 9 / 10;
+ return MIN2(heap_size, heap_used + heap_available);
}
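/* Worked example (illustration only, not part of this change): with a
 * 4 GiB heap, 512 MiB already allocated by the driver and 2 GiB of free
 * system memory, the reported budget is
 *
 *    MIN2(4ull << 30, (512ull << 20) + ((2ull << 30) * 9) / 10)
 *      = MIN2(4 GiB, 512 MiB + ~1.8 GiB) ~= 2.3 GiB
 *
 * so the budget tracks what the system can actually back at the moment
 * rather than the full heap size.
 */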
static bool
@@ -604,7 +698,8 @@ device_has_expected_features(struct v3dv_physical_device *device)
{
return v3d_has_feature(device, DRM_V3D_PARAM_SUPPORTS_TFU) &&
v3d_has_feature(device, DRM_V3D_PARAM_SUPPORTS_CSD) &&
- v3d_has_feature(device, DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH);
+ v3d_has_feature(device, DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH) &&
+ device->caps.multisync;
}
@@ -614,14 +709,14 @@ init_uuids(struct v3dv_physical_device *device)
const struct build_id_note *note =
build_id_find_nhdr_for_addr(init_uuids);
if (!note) {
- return vk_errorf((struct v3dv_instance*) device->vk.instance,
+ return vk_errorf(device->vk.instance,
VK_ERROR_INITIALIZATION_FAILED,
"Failed to find build-id");
}
unsigned build_id_len = build_id_length(note);
if (build_id_len < 20) {
- return vk_errorf((struct v3dv_instance*) device->vk.instance,
+ return vk_errorf(device->vk.instance,
VK_ERROR_INITIALIZATION_FAILED,
"build-id too short. It needs to be a SHA");
}
@@ -672,38 +767,46 @@ v3dv_physical_device_init_disk_cache(struct v3dv_physical_device *device)
_mesa_sha1_format(timestamp, device->driver_build_sha1);
assert(device->name);
- device->disk_cache = disk_cache_create(device->name, timestamp, 0);
+ device->disk_cache = disk_cache_create(device->name, timestamp, v3d_mesa_debug);
#else
device->disk_cache = NULL;
#endif
}
static VkResult
-physical_device_init(struct v3dv_physical_device *device,
- struct v3dv_instance *instance,
- drmDevicePtr drm_render_device,
- drmDevicePtr drm_primary_device)
+create_physical_device(struct v3dv_instance *instance,
+ drmDevicePtr gpu_device,
+ drmDevicePtr display_device)
{
VkResult result = VK_SUCCESS;
- int32_t master_fd = -1;
+ int32_t display_fd = -1;
int32_t render_fd = -1;
+ struct v3dv_physical_device *device =
+ vk_zalloc(&instance->vk.alloc, sizeof(*device), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
+
+ if (!device)
+ return vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+
struct vk_physical_device_dispatch_table dispatch_table;
vk_physical_device_dispatch_table_from_entrypoints
(&dispatch_table, &v3dv_physical_device_entrypoints, true);
+ vk_physical_device_dispatch_table_from_entrypoints(
+ &dispatch_table, &wsi_physical_device_entrypoints, false);
- result = vk_physical_device_init(&device->vk, &instance->vk, NULL,
- &dispatch_table);
+ result = vk_physical_device_init(&device->vk, &instance->vk, NULL, NULL,
+ NULL, &dispatch_table);
if (result != VK_SUCCESS)
goto fail;
- assert(drm_render_device);
- const char *path = drm_render_device->nodes[DRM_NODE_RENDER];
+ assert(gpu_device);
+ const char *path = gpu_device->nodes[DRM_NODE_RENDER];
render_fd = open(path, O_RDWR | O_CLOEXEC);
if (render_fd < 0) {
fprintf(stderr, "Opening %s failed: %s\n", path, strerror(errno));
- result = VK_ERROR_INCOMPATIBLE_DRIVER;
+ result = VK_ERROR_INITIALIZATION_FAILED;
goto fail;
}
@@ -714,12 +817,12 @@ physical_device_init(struct v3dv_physical_device *device,
const char *primary_path;
#if !using_v3d_simulator
- if (drm_primary_device)
- primary_path = drm_primary_device->nodes[DRM_NODE_PRIMARY];
+ if (display_device)
+ primary_path = display_device->nodes[DRM_NODE_PRIMARY];
else
primary_path = NULL;
#else
- primary_path = drm_render_device->nodes[DRM_NODE_PRIMARY];
+ primary_path = gpu_device->nodes[DRM_NODE_PRIMARY];
#endif
struct stat primary_stat = {0}, render_stat = {0};
@@ -727,8 +830,7 @@ physical_device_init(struct v3dv_physical_device *device,
device->has_primary = primary_path;
if (device->has_primary) {
if (stat(primary_path, &primary_stat) != 0) {
- result = vk_errorf(instance,
- VK_ERROR_INITIALIZATION_FAILED,
+ result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
"failed to stat DRM primary node %s",
primary_path);
goto fail;
@@ -738,8 +840,7 @@ physical_device_init(struct v3dv_physical_device *device,
}
if (fstat(render_fd, &render_stat) != 0) {
- result = vk_errorf(instance,
- VK_ERROR_INITIALIZATION_FAILED,
+ result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
"failed to stat DRM render node %s",
path);
goto fail;
@@ -747,16 +848,24 @@ physical_device_init(struct v3dv_physical_device *device,
device->has_render = true;
device->render_devid = render_stat.st_rdev;
- if (instance->vk.enabled_extensions.KHR_display) {
+#if using_v3d_simulator
+ device->device_id = gpu_device->deviceinfo.pci->device_id;
+#endif
+
+ if (instance->vk.enabled_extensions.KHR_display ||
+ instance->vk.enabled_extensions.KHR_xcb_surface ||
+ instance->vk.enabled_extensions.KHR_xlib_surface ||
+ instance->vk.enabled_extensions.KHR_wayland_surface ||
+ instance->vk.enabled_extensions.EXT_acquire_drm_display) {
#if !using_v3d_simulator
/* Open the primary node on the vc4 display device */
- assert(drm_primary_device);
- master_fd = open(primary_path, O_RDWR | O_CLOEXEC);
+ assert(display_device);
+ display_fd = open(primary_path, O_RDWR | O_CLOEXEC);
#else
/* There is only one device with primary and render nodes.
* Open its primary node.
*/
- master_fd = open(primary_path, O_RDWR | O_CLOEXEC);
+ display_fd = open(primary_path, O_RDWR | O_CLOEXEC);
#endif
}
@@ -765,21 +874,32 @@ physical_device_init(struct v3dv_physical_device *device,
#endif
device->render_fd = render_fd; /* The v3d render node */
- device->display_fd = -1; /* Authenticated vc4 primary node */
- device->master_fd = master_fd; /* Master vc4 primary node */
+ device->display_fd = display_fd; /* Master vc4 primary node */
if (!v3d_get_device_info(device->render_fd, &device->devinfo, &v3dv_ioctl)) {
- result = VK_ERROR_INCOMPATIBLE_DRIVER;
+ result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
+ "Failed to get info from device.");
goto fail;
}
if (device->devinfo.ver < 42) {
- result = VK_ERROR_INCOMPATIBLE_DRIVER;
+ result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
+ "Device version < 42.");
goto fail;
}
+ device->caps.cpu_queue =
+ v3d_has_feature(device, DRM_V3D_PARAM_SUPPORTS_CPU_QUEUE);
+
+ device->caps.multisync =
+ v3d_has_feature(device, DRM_V3D_PARAM_SUPPORTS_MULTISYNC_EXT);
+
+ device->caps.perfmon =
+ v3d_has_feature(device, DRM_V3D_PARAM_SUPPORTS_PERFMON);
+
if (!device_has_expected_features(device)) {
- result = VK_ERROR_INCOMPATIBLE_DRIVER;
+ result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
+ "Kernel driver doesn't have required features.");
goto fail;
}
@@ -787,12 +907,15 @@ physical_device_init(struct v3dv_physical_device *device,
if (result != VK_SUCCESS)
goto fail;
- device->compiler = v3d_compiler_init(&device->devinfo);
+ device->compiler = v3d_compiler_init(&device->devinfo,
+ MAX_INLINE_UNIFORM_BUFFERS);
device->next_program_id = 0;
ASSERTED int len =
- asprintf(&device->name, "V3D %d.%d",
- device->devinfo.ver / 10, device->devinfo.ver % 10);
+ asprintf(&device->name, "V3D %d.%d.%d",
+ device->devinfo.ver / 10,
+ device->devinfo.ver % 10,
+ device->devinfo.rev);
assert(len != -1);
v3dv_physical_device_init_disk_cache(device);
@@ -811,7 +934,31 @@ physical_device_init(struct v3dv_physical_device *device,
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
mem->memoryTypes[0].heapIndex = 0;
- device->options.merge_jobs = getenv("V3DV_NO_MERGE_JOBS") == NULL;
+ /* Initialize sparse array for refcounting imported BOs */
+ util_sparse_array_init(&device->bo_map, sizeof(struct v3dv_bo), 512);
+
+ device->options.merge_jobs = !V3D_DBG(NO_MERGE_JOBS);
+
+ device->drm_syncobj_type = vk_drm_syncobj_get_type(device->render_fd);
+
+ /* We don't support timelines in the uAPI yet and we don't want it getting
+ * suddenly turned on by vk_drm_syncobj_get_type() without us adding v3dv
+ * code for it first.
+ */
+ device->drm_syncobj_type.features &= ~VK_SYNC_FEATURE_TIMELINE;
+
+ /* Multiwait is required for emulated timeline semaphores and is supported
+ * by the v3d kernel interface.
+ */
+ device->drm_syncobj_type.features |= VK_SYNC_FEATURE_GPU_MULTI_WAIT;
+
+ device->sync_timeline_type =
+ vk_sync_timeline_get_type(&device->drm_syncobj_type);
+
+ device->sync_types[0] = &device->drm_syncobj_type;
+ device->sync_types[1] = &device->sync_timeline_type.sync;
+ device->sync_types[2] = NULL;
+ device->vk.supported_sync_types = device->sync_types;
result = v3dv_wsi_init(device);
if (result != VK_SUCCESS) {
@@ -820,35 +967,46 @@ physical_device_init(struct v3dv_physical_device *device,
}
get_device_extensions(device, &device->vk.supported_extensions);
+ get_features(device, &device->vk.supported_features);
+
+ mtx_init(&device->mutex, mtx_plain);
- pthread_mutex_init(&device->mutex, NULL);
+ list_addtail(&device->vk.link, &instance->vk.physical_devices.list);
return VK_SUCCESS;
fail:
vk_physical_device_finish(&device->vk);
+ vk_free(&instance->vk.alloc, device);
if (render_fd >= 0)
close(render_fd);
- if (master_fd >= 0)
- close(master_fd);
+ if (display_fd >= 0)
+ close(display_fd);
return result;
}
+/* This driver hook is expected to return VK_SUCCESS (unless a memory
+ * allocation error happened) if no compatible device is found. If a
+ * compatible device is found, it may return an error code if device
+ * initialization failed.
+ */
static VkResult
-enumerate_devices(struct v3dv_instance *instance)
+enumerate_devices(struct vk_instance *vk_instance)
{
- /* TODO: Check for more devices? */
+ struct v3dv_instance *instance =
+ container_of(vk_instance, struct v3dv_instance, vk);
+
+ /* FIXME: Check for more devices? */
drmDevicePtr devices[8];
- VkResult result = VK_ERROR_INCOMPATIBLE_DRIVER;
int max_devices;
- instance->physicalDeviceCount = 0;
-
max_devices = drmGetDevices2(0, devices, ARRAY_SIZE(devices));
if (max_devices < 1)
- return VK_ERROR_INCOMPATIBLE_DRIVER;
+ return VK_SUCCESS;
+
+ VkResult result = VK_SUCCESS;
#if !using_v3d_simulator
int32_t v3d_idx = -1;
@@ -856,25 +1014,24 @@ enumerate_devices(struct v3dv_instance *instance)
#endif
for (unsigned i = 0; i < (unsigned)max_devices; i++) {
#if using_v3d_simulator
- /* In the simulator, we look for an Intel render node */
+ /* In the simulator, we look for an Intel/AMD render node */
const int required_nodes = (1 << DRM_NODE_RENDER) | (1 << DRM_NODE_PRIMARY);
if ((devices[i]->available_nodes & required_nodes) == required_nodes &&
devices[i]->bustype == DRM_BUS_PCI &&
- devices[i]->deviceinfo.pci->vendor_id == 0x8086) {
- result = physical_device_init(&instance->physicalDevice, instance,
- devices[i], NULL);
- if (result != VK_ERROR_INCOMPATIBLE_DRIVER)
+ (devices[i]->deviceinfo.pci->vendor_id == 0x8086 ||
+ devices[i]->deviceinfo.pci->vendor_id == 0x1002)) {
+ result = create_physical_device(instance, devices[i], NULL);
+ if (result == VK_SUCCESS)
break;
}
#else
- /* On actual hardware, we should have a render node (v3d)
- * and a primary node (vc4). We will need to use the primary
- * to allocate WSI buffers and share them with the render node
- * via prime, but that is a privileged operation so we need the
- * primary node to be authenticated, and for that we need the
- * display server to provide the device fd (with DRI3), so we
- * here we only check that the device is present but we don't
- * try to open it.
+ /* On actual hardware, we should have a gpu device (v3d) and a display
+ * device (vc4). We will need to use the display device to allocate WSI
+ * buffers and share them with the render node via prime, but that is a
+ * privileged operation so we need to have an authenticated display fd,
+ * and for that we need the display server to provide it (with DRI3),
+ * so here we only check that the device is present but we don't try to
+ * open it.
*/
if (devices[i]->bustype != DRM_BUS_PLATFORM)
continue;
@@ -882,7 +1039,8 @@ enumerate_devices(struct v3dv_instance *instance)
if (devices[i]->available_nodes & 1 << DRM_NODE_RENDER) {
char **compat = devices[i]->deviceinfo.platform->compatible;
while (*compat) {
- if (strncmp(*compat, "brcm,2711-v3d", 13) == 0) {
+ if (strncmp(*compat, "brcm,2711-v3d", 13) == 0 ||
+ strncmp(*compat, "brcm,2712-v3d", 13) == 0) {
v3d_idx = i;
break;
}
@@ -891,8 +1049,9 @@ enumerate_devices(struct v3dv_instance *instance)
} else if (devices[i]->available_nodes & 1 << DRM_NODE_PRIMARY) {
char **compat = devices[i]->deviceinfo.platform->compatible;
while (*compat) {
- if (strncmp(*compat, "brcm,bcm2711-vc5", 16) == 0 ||
- strncmp(*compat, "brcm,bcm2835-vc4", 16) == 0 ) {
+ if (strncmp(*compat, "brcm,bcm2712-vc6", 16) == 0 ||
+ strncmp(*compat, "brcm,bcm2711-vc5", 16) == 0 ||
+ strncmp(*compat, "brcm,bcm2835-vc4", 16) == 0) {
vc4_idx = i;
break;
}
@@ -903,345 +1062,35 @@ enumerate_devices(struct v3dv_instance *instance)
}
#if !using_v3d_simulator
- if (v3d_idx == -1 || vc4_idx == -1)
- result = VK_ERROR_INCOMPATIBLE_DRIVER;
- else
- result = physical_device_init(&instance->physicalDevice, instance,
- devices[v3d_idx], devices[vc4_idx]);
+ if (v3d_idx != -1) {
+ drmDevicePtr v3d_device = devices[v3d_idx];
+ drmDevicePtr vc4_device = vc4_idx != -1 ? devices[vc4_idx] : NULL;
+ result = create_physical_device(instance, v3d_device, vc4_device);
+ }
#endif
drmFreeDevices(devices, max_devices);
- if (result == VK_SUCCESS)
- instance->physicalDeviceCount = 1;
-
return result;
}
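/* Application-side sketch (illustration only, plain core Vulkan, not part
 * of this change): with the contract above, a machine without a v3d node
 * enumerates successfully with zero physical devices instead of failing
 * with VK_ERROR_INCOMPATIBLE_DRIVER.
 */
#include <stdbool.h>
#include <vulkan/vulkan.h>

static bool
instance_has_any_device(VkInstance instance)
{
   uint32_t count = 0;
   if (vkEnumeratePhysicalDevices(instance, &count, NULL) != VK_SUCCESS)
      return false;
   return count > 0; /* zero devices is now a normal, successful result */
}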
-static VkResult
-instance_ensure_physical_device(struct v3dv_instance *instance)
-{
- if (instance->physicalDeviceCount < 0) {
- VkResult result = enumerate_devices(instance);
- if (result != VK_SUCCESS &&
- result != VK_ERROR_INCOMPATIBLE_DRIVER)
- return result;
- }
-
- return VK_SUCCESS;
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_EnumeratePhysicalDevices(VkInstance _instance,
- uint32_t *pPhysicalDeviceCount,
- VkPhysicalDevice *pPhysicalDevices)
-{
- V3DV_FROM_HANDLE(v3dv_instance, instance, _instance);
- VK_OUTARRAY_MAKE(out, pPhysicalDevices, pPhysicalDeviceCount);
-
- VkResult result = instance_ensure_physical_device(instance);
- if (result != VK_SUCCESS)
- return result;
-
- if (instance->physicalDeviceCount == 0)
- return VK_SUCCESS;
-
- assert(instance->physicalDeviceCount == 1);
- vk_outarray_append(&out, i) {
- *i = v3dv_physical_device_to_handle(&instance->physicalDevice);
- }
-
- return vk_outarray_status(&out);
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_EnumeratePhysicalDeviceGroups(
- VkInstance _instance,
- uint32_t *pPhysicalDeviceGroupCount,
- VkPhysicalDeviceGroupProperties *pPhysicalDeviceGroupProperties)
-{
- V3DV_FROM_HANDLE(v3dv_instance, instance, _instance);
- VK_OUTARRAY_MAKE(out, pPhysicalDeviceGroupProperties,
- pPhysicalDeviceGroupCount);
-
- VkResult result = instance_ensure_physical_device(instance);
- if (result != VK_SUCCESS)
- return result;
-
- assert(instance->physicalDeviceCount == 1);
-
- vk_outarray_append(&out, p) {
- p->physicalDeviceCount = 1;
- memset(p->physicalDevices, 0, sizeof(p->physicalDevices));
- p->physicalDevices[0] =
- v3dv_physical_device_to_handle(&instance->physicalDevice);
- p->subsetAllocation = false;
-
- vk_foreach_struct(ext, p->pNext)
- v3dv_debug_ignored_stype(ext->sType);
- }
-
- return vk_outarray_status(&out);
-}
-
-VKAPI_ATTR void VKAPI_CALL
-v3dv_GetPhysicalDeviceFeatures(VkPhysicalDevice physicalDevice,
- VkPhysicalDeviceFeatures *pFeatures)
-{
- memset(pFeatures, 0, sizeof(*pFeatures));
-
- *pFeatures = (VkPhysicalDeviceFeatures) {
- .robustBufferAccess = true, /* This feature is mandatory */
- .fullDrawIndexUint32 = false, /* Only available since V3D 4.4.9.1 */
- .imageCubeArray = true,
- .independentBlend = true,
- .geometryShader = true,
- .tessellationShader = false,
- .sampleRateShading = true,
- .dualSrcBlend = false,
- .logicOp = true,
- .multiDrawIndirect = false,
- .drawIndirectFirstInstance = true,
- .depthClamp = false,
- .depthBiasClamp = true,
- .fillModeNonSolid = true,
- .depthBounds = false, /* Only available since V3D 4.3.16.2 */
- .wideLines = true,
- .largePoints = true,
- .alphaToOne = true,
- .multiViewport = false,
- .samplerAnisotropy = true,
- .textureCompressionETC2 = true,
- .textureCompressionASTC_LDR = true,
- /* Note that textureCompressionBC requires that the driver support all
- * the BC formats. V3D 4.2 only support the BC1-3, so we can't claim
- * that we support it.
- */
- .textureCompressionBC = false,
- .occlusionQueryPrecise = true,
- .pipelineStatisticsQuery = false,
- .vertexPipelineStoresAndAtomics = true,
- .fragmentStoresAndAtomics = true,
- .shaderTessellationAndGeometryPointSize = true,
- .shaderImageGatherExtended = false,
- .shaderStorageImageExtendedFormats = true,
- .shaderStorageImageMultisample = false,
- .shaderStorageImageReadWithoutFormat = false,
- .shaderStorageImageWriteWithoutFormat = false,
- .shaderUniformBufferArrayDynamicIndexing = false,
- .shaderSampledImageArrayDynamicIndexing = false,
- .shaderStorageBufferArrayDynamicIndexing = false,
- .shaderStorageImageArrayDynamicIndexing = false,
- .shaderClipDistance = true,
- .shaderCullDistance = false,
- .shaderFloat64 = false,
- .shaderInt64 = false,
- .shaderInt16 = false,
- .shaderResourceResidency = false,
- .shaderResourceMinLod = false,
- .sparseBinding = false,
- .sparseResidencyBuffer = false,
- .sparseResidencyImage2D = false,
- .sparseResidencyImage3D = false,
- .sparseResidency2Samples = false,
- .sparseResidency4Samples = false,
- .sparseResidency8Samples = false,
- .sparseResidency16Samples = false,
- .sparseResidencyAliased = false,
- .variableMultisampleRate = false,
- .inheritedQueries = true,
- };
-}
-
-VKAPI_ATTR void VKAPI_CALL
-v3dv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
- VkPhysicalDeviceFeatures2 *pFeatures)
-{
- v3dv_GetPhysicalDeviceFeatures(physicalDevice, &pFeatures->features);
-
- VkPhysicalDeviceVulkan11Features vk11 = {
- .storageBuffer16BitAccess = false,
- .uniformAndStorageBuffer16BitAccess = false,
- .storagePushConstant16 = false,
- .storageInputOutput16 = false,
- .multiview = true,
- .multiviewGeometryShader = false,
- .multiviewTessellationShader = false,
- .variablePointersStorageBuffer = true,
- /* FIXME: this needs support for non-constant index on UBO/SSBO */
- .variablePointers = false,
- .protectedMemory = false,
- .samplerYcbcrConversion = false,
- .shaderDrawParameters = false,
- };
-
- vk_foreach_struct(ext, pFeatures->pNext) {
- switch (ext->sType) {
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CUSTOM_BORDER_COLOR_FEATURES_EXT: {
- VkPhysicalDeviceCustomBorderColorFeaturesEXT *features =
- (VkPhysicalDeviceCustomBorderColorFeaturesEXT *)ext;
- features->customBorderColors = true;
- features->customBorderColorWithoutFormat = false;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_UNIFORM_BUFFER_STANDARD_LAYOUT_FEATURES_KHR: {
- VkPhysicalDeviceUniformBufferStandardLayoutFeaturesKHR *features =
- (VkPhysicalDeviceUniformBufferStandardLayoutFeaturesKHR *)ext;
- features->uniformBufferStandardLayout = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRIVATE_DATA_FEATURES_EXT: {
- VkPhysicalDevicePrivateDataFeaturesEXT *features =
- (VkPhysicalDevicePrivateDataFeaturesEXT *)ext;
- features->privateData = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INDEX_TYPE_UINT8_FEATURES_EXT: {
- VkPhysicalDeviceIndexTypeUint8FeaturesEXT *features =
- (VkPhysicalDeviceIndexTypeUint8FeaturesEXT *)ext;
- features->indexTypeUint8 = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COLOR_WRITE_ENABLE_FEATURES_EXT: {
- VkPhysicalDeviceColorWriteEnableFeaturesEXT *features = (void *) ext;
- features->colorWriteEnable = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_CREATION_CACHE_CONTROL_FEATURES_EXT: {
- VkPhysicalDevicePipelineCreationCacheControlFeaturesEXT *features = (void *) ext;
- features->pipelineCreationCacheControl = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROVOKING_VERTEX_FEATURES_EXT: {
- VkPhysicalDeviceProvokingVertexFeaturesEXT *features = (void *) ext;
- features->provokingVertexLast = true;
- /* FIXME: update when supporting EXT_transform_feedback */
- features->transformFeedbackPreservesProvokingVertex = false;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_FEATURES_EXT: {
- VkPhysicalDeviceVertexAttributeDivisorFeaturesEXT *features =
- (void *) ext;
- features->vertexAttributeInstanceRateDivisor = true;
- features->vertexAttributeInstanceRateZeroDivisor = false;
- break;
- }
-
- /* Vulkan 1.1 */
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES: {
- VkPhysicalDeviceVulkan11Features *features =
- (VkPhysicalDeviceVulkan11Features *)ext;
- memcpy(features, &vk11, sizeof(VkPhysicalDeviceVulkan11Features));
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES: {
- VkPhysicalDevice16BitStorageFeatures *features = (void *) ext;
- features->storageBuffer16BitAccess = vk11.storageBuffer16BitAccess;
- features->uniformAndStorageBuffer16BitAccess =
- vk11.uniformAndStorageBuffer16BitAccess;
- features->storagePushConstant16 = vk11.storagePushConstant16;
- features->storageInputOutput16 = vk11.storageInputOutput16;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_FEATURES: {
- VkPhysicalDeviceMultiviewFeatures *features = (void *) ext;
- features->multiview = vk11.multiview;
- features->multiviewGeometryShader = vk11.multiviewGeometryShader;
- features->multiviewTessellationShader = vk11.multiviewTessellationShader;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROTECTED_MEMORY_FEATURES: {
- VkPhysicalDeviceProtectedMemoryFeatures *features = (void *) ext;
- features->protectedMemory = vk11.protectedMemory;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES: {
- VkPhysicalDeviceSamplerYcbcrConversionFeatures *features = (void *) ext;
- features->samplerYcbcrConversion = vk11.samplerYcbcrConversion;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DRAW_PARAMETERS_FEATURES: {
- VkPhysicalDeviceShaderDrawParametersFeatures *features = (void *) ext;
- features->shaderDrawParameters = vk11.shaderDrawParameters;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTERS_FEATURES: {
- VkPhysicalDeviceVariablePointersFeatures *features = (void *) ext;
- features->variablePointersStorageBuffer =
- vk11.variablePointersStorageBuffer;
- features->variablePointers = vk11.variablePointers;
- break;
- }
-
- default:
- v3dv_debug_ignored_stype(ext->sType);
- break;
- }
- }
-}
-
-VKAPI_ATTR void VKAPI_CALL
-v3dv_GetDeviceGroupPeerMemoryFeatures(VkDevice device,
- uint32_t heapIndex,
- uint32_t localDeviceIndex,
- uint32_t remoteDeviceIndex,
- VkPeerMemoryFeatureFlags *pPeerMemoryFeatures)
-{
- assert(localDeviceIndex == 0 && remoteDeviceIndex == 0);
- *pPeerMemoryFeatures = VK_PEER_MEMORY_FEATURE_COPY_SRC_BIT |
- VK_PEER_MEMORY_FEATURE_COPY_DST_BIT |
- VK_PEER_MEMORY_FEATURE_GENERIC_SRC_BIT |
- VK_PEER_MEMORY_FEATURE_GENERIC_DST_BIT;
-}
-
uint32_t
v3dv_physical_device_vendor_id(struct v3dv_physical_device *dev)
{
return 0x14E4; /* Broadcom */
}
-
-#if using_v3d_simulator
-static bool
-get_i915_param(int fd, uint32_t param, int *value)
-{
- int tmp;
-
- struct drm_i915_getparam gp = {
- .param = param,
- .value = &tmp,
- };
-
- int ret = drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp);
- if (ret != 0)
- return false;
-
- *value = tmp;
- return true;
-}
-#endif
-
uint32_t
v3dv_physical_device_device_id(struct v3dv_physical_device *dev)
{
#if using_v3d_simulator
- int devid = 0;
-
- if (!get_i915_param(dev->render_fd, I915_PARAM_CHIPSET_ID, &devid))
- fprintf(stderr, "Error getting device_id\n");
-
- return devid;
+ return dev->device_id;
#else
switch (dev->devinfo.ver) {
case 42:
return 0xBE485FD3; /* Broadcom deviceID for 2711 */
+ case 71:
+ return 0x55701C33; /* Broadcom deviceID for 2712 */
default:
unreachable("Unsupported V3D version");
}
@@ -1260,18 +1109,18 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
STATIC_ASSERT(MAX_STORAGE_BUFFERS >= MAX_DYNAMIC_STORAGE_BUFFERS);
const uint32_t page_size = 4096;
- const uint32_t mem_size = compute_heap_size();
+ const uint64_t mem_size = compute_heap_size();
const uint32_t max_varying_components = 16 * 4;
- const uint32_t v3d_coord_shift = 6;
-
- const float v3d_point_line_granularity = 2.0f / (1 << v3d_coord_shift);
- const uint32_t max_fb_size = 4096;
+ const float v3d_point_line_granularity = 2.0f / (1 << V3D_COORD_SHIFT);
+ const uint32_t max_fb_size = V3D_MAX_IMAGE_DIMENSION;
const VkSampleCountFlags supported_sample_counts =
VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_4_BIT;
+ const uint8_t max_rts = V3D_MAX_RENDER_TARGETS(pdevice->devinfo.ver);
+
struct timespec clock_res;
clock_getres(CLOCK_MONOTONIC, &clock_res);
const float timestamp_period =
@@ -1279,18 +1128,18 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
/* FIXME: this will probably require an in-depth review */
VkPhysicalDeviceLimits limits = {
- .maxImageDimension1D = 4096,
- .maxImageDimension2D = 4096,
- .maxImageDimension3D = 4096,
- .maxImageDimensionCube = 4096,
- .maxImageArrayLayers = 2048,
+ .maxImageDimension1D = V3D_MAX_IMAGE_DIMENSION,
+ .maxImageDimension2D = V3D_MAX_IMAGE_DIMENSION,
+ .maxImageDimension3D = V3D_MAX_IMAGE_DIMENSION,
+ .maxImageDimensionCube = V3D_MAX_IMAGE_DIMENSION,
+ .maxImageArrayLayers = V3D_MAX_ARRAY_LAYERS,
.maxTexelBufferElements = (1ul << 28),
.maxUniformBufferRange = V3D_MAX_BUFFER_RANGE,
.maxStorageBufferRange = V3D_MAX_BUFFER_RANGE,
.maxPushConstantsSize = MAX_PUSH_CONSTANTS_SIZE,
.maxMemoryAllocationCount = mem_size / page_size,
.maxSamplerAllocationCount = 64 * 1024,
- .bufferImageGranularity = 256, /* A cache line */
+ .bufferImageGranularity = V3D_NON_COHERENT_ATOM_SIZE,
.sparseAddressSpaceSize = 0,
.maxBoundDescriptorSets = MAX_SETS,
.maxPerStageDescriptorSamplers = V3D_MAX_TEXTURE_SAMPLERS,
@@ -1342,7 +1191,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
.maxFragmentInputComponents = max_varying_components,
.maxFragmentOutputAttachments = 4,
.maxFragmentDualSrcAttachments = 0,
- .maxFragmentCombinedOutputResources = MAX_RENDER_TARGETS +
+ .maxFragmentCombinedOutputResources = max_rts +
MAX_STORAGE_BUFFERS +
MAX_STORAGE_IMAGES,
@@ -1352,10 +1201,11 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
.maxComputeWorkGroupInvocations = 256,
.maxComputeWorkGroupSize = { 256, 256, 256 },
- .subPixelPrecisionBits = v3d_coord_shift,
+ .subPixelPrecisionBits = V3D_COORD_SHIFT,
.subTexelPrecisionBits = 8,
.mipmapPrecisionBits = 8,
- .maxDrawIndexedIndexValue = 0x00ffffff,
+ .maxDrawIndexedIndexValue = pdevice->devinfo.ver >= 71 ?
+ 0xffffffff : 0x00ffffff,
.maxDrawIndirectCount = 0x7fffffff,
.maxSamplerLodBias = 14.0f,
.maxSamplerAnisotropy = 16.0f,
@@ -1365,7 +1215,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
2.0 * max_fb_size - 1 },
.viewportSubPixelBits = 0,
.minMemoryMapAlignment = page_size,
- .minTexelBufferOffsetAlignment = V3D_UIFBLOCK_SIZE,
+ .minTexelBufferOffsetAlignment = V3D_TMU_TEXEL_ALIGN,
.minUniformBufferOffsetAlignment = 32,
.minStorageBufferOffsetAlignment = 32,
.minTexelOffset = -8,
@@ -1374,7 +1224,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
.maxTexelGatherOffset = 7,
.minInterpolationOffset = -0.5,
.maxInterpolationOffset = 0.5,
- .subPixelInterpolationOffsetBits = v3d_coord_shift,
+ .subPixelInterpolationOffsetBits = V3D_COORD_SHIFT,
.maxFramebufferWidth = max_fb_size,
.maxFramebufferHeight = max_fb_size,
.maxFramebufferLayers = 256,
@@ -1382,7 +1232,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
.framebufferDepthSampleCounts = supported_sample_counts,
.framebufferStencilSampleCounts = supported_sample_counts,
.framebufferNoAttachmentsSampleCounts = supported_sample_counts,
- .maxColorAttachments = MAX_RENDER_TARGETS,
+ .maxColorAttachments = max_rts,
.sampledImageColorSampleCounts = supported_sample_counts,
.sampledImageIntegerSampleCounts = supported_sample_counts,
.sampledImageDepthSampleCounts = supported_sample_counts,
@@ -1404,7 +1254,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
.standardSampleLocations = false,
.optimalBufferCopyOffsetAlignment = 32,
.optimalBufferCopyRowPitchAlignment = 32,
- .nonCoherentAtomSize = 256,
+ .nonCoherentAtomSize = V3D_NON_COHERENT_ATOM_SIZE,
};
*pProperties = (VkPhysicalDeviceProperties) {
@@ -1431,7 +1281,166 @@ v3dv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
v3dv_GetPhysicalDeviceProperties(physicalDevice, &pProperties->properties);
+ /* We don't really have special restrictions for the maximum
+ * descriptors per set, other than maybe not exceeding the limits
+ * of addressable memory in a single allocation on either the host
+ * or the GPU. This will be a much larger limit than any of the
+ * per-stage limits already available in Vulkan though, so in practice,
+ * it is not expected to limit anything beyond what is already
+ * constrained through per-stage limits.
+ */
+ const uint32_t max_host_descriptors =
+ (UINT32_MAX - sizeof(struct v3dv_descriptor_set)) /
+ sizeof(struct v3dv_descriptor);
+ const uint32_t max_gpu_descriptors =
+ (UINT32_MAX / v3dv_X(pdevice, max_descriptor_bo_size)());
+
+ VkPhysicalDeviceVulkan13Properties vk13 = {
+ .maxInlineUniformBlockSize = 4096,
+ .maxPerStageDescriptorInlineUniformBlocks = MAX_INLINE_UNIFORM_BUFFERS,
+ .maxDescriptorSetInlineUniformBlocks = MAX_INLINE_UNIFORM_BUFFERS,
+ .maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks =
+ MAX_INLINE_UNIFORM_BUFFERS,
+ .maxDescriptorSetUpdateAfterBindInlineUniformBlocks =
+ MAX_INLINE_UNIFORM_BUFFERS,
+ .maxBufferSize = V3D_MAX_BUFFER_RANGE,
+ .storageTexelBufferOffsetAlignmentBytes = V3D_TMU_TEXEL_ALIGN,
+ .storageTexelBufferOffsetSingleTexelAlignment = false,
+ .uniformTexelBufferOffsetAlignmentBytes = V3D_TMU_TEXEL_ALIGN,
+ .uniformTexelBufferOffsetSingleTexelAlignment = false,
+ /* No native acceleration for integer dot product. We use NIR lowering. */
+ .integerDotProduct8BitUnsignedAccelerated = false,
+ .integerDotProduct8BitMixedSignednessAccelerated = false,
+ .integerDotProduct4x8BitPackedUnsignedAccelerated = false,
+ .integerDotProduct4x8BitPackedSignedAccelerated = false,
+ .integerDotProduct4x8BitPackedMixedSignednessAccelerated = false,
+ .integerDotProduct16BitUnsignedAccelerated = false,
+ .integerDotProduct16BitSignedAccelerated = false,
+ .integerDotProduct16BitMixedSignednessAccelerated = false,
+ .integerDotProduct32BitUnsignedAccelerated = false,
+ .integerDotProduct32BitSignedAccelerated = false,
+ .integerDotProduct32BitMixedSignednessAccelerated = false,
+ .integerDotProduct64BitUnsignedAccelerated = false,
+ .integerDotProduct64BitSignedAccelerated = false,
+ .integerDotProduct64BitMixedSignednessAccelerated = false,
+ .integerDotProductAccumulatingSaturating8BitUnsignedAccelerated = false,
+ .integerDotProductAccumulatingSaturating8BitSignedAccelerated = false,
+ .integerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated = false,
+ .integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated = false,
+ .integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated = false,
+ .integerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated = false,
+ .integerDotProductAccumulatingSaturating16BitUnsignedAccelerated = false,
+ .integerDotProductAccumulatingSaturating16BitSignedAccelerated = false,
+ .integerDotProductAccumulatingSaturating16BitMixedSignednessAccelerated = false,
+ .integerDotProductAccumulatingSaturating32BitUnsignedAccelerated = false,
+ .integerDotProductAccumulatingSaturating32BitSignedAccelerated = false,
+ .integerDotProductAccumulatingSaturating32BitMixedSignednessAccelerated = false,
+ .integerDotProductAccumulatingSaturating64BitUnsignedAccelerated = false,
+ .integerDotProductAccumulatingSaturating64BitSignedAccelerated = false,
+ .integerDotProductAccumulatingSaturating64BitMixedSignednessAccelerated = false,
+ /* VK_EXT_subgroup_size_control */
+ .minSubgroupSize = V3D_CHANNELS,
+ .maxSubgroupSize = V3D_CHANNELS,
+ .maxComputeWorkgroupSubgroups = 16, /* 256 / 16 */
+ .requiredSubgroupSizeStages = VK_SHADER_STAGE_COMPUTE_BIT,
+ };
+
+ VkPhysicalDeviceVulkan12Properties vk12 = {
+ .driverID = VK_DRIVER_ID_MESA_V3DV,
+ .conformanceVersion = {
+ .major = 1,
+ .minor = 3,
+ .subminor = 6,
+ .patch = 1,
+ },
+ .supportedDepthResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT,
+ .supportedStencilResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT,
+ /* FIXME: if we want to support independentResolveNone then we would
+ * need to honor attachment load operations on resolve attachments,
+ * which we currently ignore because the resolve makes them irrelevant,
+ * as it unconditionally writes all pixels in the render area. However,
+ * with independentResolveNone, it is possible to have one aspect of a
+ * D/S resolve attachment stay unresolved, in which case the attachment
+ * load operation is relevant.
+ *
+ * NOTE: implementing attachment load for resolve attachments isn't
+ * immediately trivial because these attachments are not part of the
+ * framebuffer and therefore we can't use the same mechanism we use
+ * for framebuffer attachments. Instead, we would probably have to
+ * emit a meta operation for that right at the start of the render
+ * pass (or subpass).
+ */
+ .independentResolveNone = false,
+ .independentResolve = false,
+ .maxTimelineSemaphoreValueDifference = UINT64_MAX,
+
+ .denormBehaviorIndependence = VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL,
+ .roundingModeIndependence = VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL,
+ .shaderSignedZeroInfNanPreserveFloat16 = true,
+ .shaderSignedZeroInfNanPreserveFloat32 = true,
+ .shaderSignedZeroInfNanPreserveFloat64 = false,
+ .shaderDenormPreserveFloat16 = true,
+ .shaderDenormPreserveFloat32 = true,
+ .shaderDenormPreserveFloat64 = false,
+ .shaderDenormFlushToZeroFloat16 = false,
+ .shaderDenormFlushToZeroFloat32 = false,
+ .shaderDenormFlushToZeroFloat64 = false,
+ .shaderRoundingModeRTEFloat16 = true,
+ .shaderRoundingModeRTEFloat32 = true,
+ .shaderRoundingModeRTEFloat64 = false,
+ .shaderRoundingModeRTZFloat16 = false,
+ .shaderRoundingModeRTZFloat32 = false,
+ .shaderRoundingModeRTZFloat64 = false,
+
+ /* V3D doesn't support min/max filtering */
+ .filterMinmaxSingleComponentFormats = false,
+ .filterMinmaxImageComponentMapping = false,
+
+ .framebufferIntegerColorSampleCounts =
+ VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_4_BIT,
+ };
+ memset(vk12.driverName, 0, VK_MAX_DRIVER_NAME_SIZE);
+ snprintf(vk12.driverName, VK_MAX_DRIVER_NAME_SIZE, "V3DV Mesa");
+ memset(vk12.driverInfo, 0, VK_MAX_DRIVER_INFO_SIZE);
+ snprintf(vk12.driverInfo, VK_MAX_DRIVER_INFO_SIZE,
+ "Mesa " PACKAGE_VERSION MESA_GIT_SHA1);
+
+ VkSubgroupFeatureFlags subgroup_ops = VK_SUBGROUP_FEATURE_BASIC_BIT;
+ if (pdevice->devinfo.ver >= 71) {
+ subgroup_ops |= VK_SUBGROUP_FEATURE_BALLOT_BIT |
+ VK_SUBGROUP_FEATURE_SHUFFLE_BIT |
+ VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT |
+ VK_SUBGROUP_FEATURE_VOTE_BIT |
+ VK_SUBGROUP_FEATURE_QUAD_BIT;
+ }
+
+ VkPhysicalDeviceVulkan11Properties vk11 = {
+ .deviceLUIDValid = false,
+ .subgroupSize = V3D_CHANNELS,
+ .subgroupSupportedStages = VK_SHADER_STAGE_COMPUTE_BIT |
+ VK_SHADER_STAGE_FRAGMENT_BIT,
+ .subgroupSupportedOperations = subgroup_ops,
+ .subgroupQuadOperationsInAllStages = false,
+ .pointClippingBehavior = VK_POINT_CLIPPING_BEHAVIOR_ALL_CLIP_PLANES,
+ .maxMultiviewViewCount = MAX_MULTIVIEW_VIEW_COUNT,
+ .maxMultiviewInstanceIndex = UINT32_MAX - 1,
+ .protectedNoFault = false,
+ .maxPerSetDescriptors = MIN2(max_host_descriptors, max_gpu_descriptors),
+ /* Minimum required by the spec */
+ .maxMemoryAllocationSize = MAX_MEMORY_ALLOCATION_SIZE,
+ };
+ memcpy(vk11.deviceUUID, pdevice->device_uuid, VK_UUID_SIZE);
+ memcpy(vk11.driverUUID, pdevice->driver_uuid, VK_UUID_SIZE);
+
+
vk_foreach_struct(ext, pProperties->pNext) {
+ if (vk_get_physical_device_core_1_1_property_ext(ext, &vk11))
+ continue;
+ if (vk_get_physical_device_core_1_2_property_ext(ext, &vk12))
+ continue;
+ if (vk_get_physical_device_core_1_3_property_ext(ext, &vk13))
+ continue;
+
switch (ext->sType) {
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CUSTOM_BORDER_COLOR_PROPERTIES_EXT: {
VkPhysicalDeviceCustomBorderColorPropertiesEXT *props =
@@ -1453,15 +1462,31 @@ v3dv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
props->maxVertexAttribDivisor = 0xffff;
break;
}
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES: {
- VkPhysicalDeviceIDProperties *id_props =
- (VkPhysicalDeviceIDProperties *)ext;
- memcpy(id_props->deviceUUID, pdevice->device_uuid, VK_UUID_SIZE);
- memcpy(id_props->driverUUID, pdevice->driver_uuid, VK_UUID_SIZE);
- /* The LUID is for Windows. */
- id_props->deviceLUIDValid = false;
+ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_PROPERTIES_KHR : {
+ VkPhysicalDevicePerformanceQueryPropertiesKHR *props =
+ (VkPhysicalDevicePerformanceQueryPropertiesKHR *)ext;
+
+ props->allowCommandBufferQueryCopies = true;
+ break;
+ }
+#if DETECT_OS_ANDROID
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wswitch"
+ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRESENTATION_PROPERTIES_ANDROID: {
+ VkPhysicalDevicePresentationPropertiesANDROID *props =
+ (VkPhysicalDevicePresentationPropertiesANDROID *)ext;
+ uint64_t front_rendering_usage = 0;
+ struct u_gralloc *gralloc = u_gralloc_create(U_GRALLOC_TYPE_AUTO);
+ if (gralloc != NULL) {
+ u_gralloc_get_front_rendering_usage(gralloc, &front_rendering_usage);
+ u_gralloc_destroy(&gralloc);
+ }
+ props->sharedImage = front_rendering_usage ? VK_TRUE
+ : VK_FALSE;
break;
}
+#pragma GCC diagnostic pop
+#endif
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRM_PROPERTIES_EXT: {
VkPhysicalDeviceDrmPropertiesEXT *props =
(VkPhysicalDeviceDrmPropertiesEXT *)ext;
@@ -1477,34 +1502,10 @@ v3dv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
}
break;
}
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_3_PROPERTIES: {
- VkPhysicalDeviceMaintenance3Properties *props =
- (VkPhysicalDeviceMaintenance3Properties *)ext;
- /* We don't really have special restrictions for the maximum
- * descriptors per set, other than maybe not exceeding the limits
- * of addressable memory in a single allocation on either the host
- * or the GPU. This will be a much larger limit than any of the
- * per-stage limits already available in Vulkan though, so in practice,
- * it is not expected to limit anything beyond what is already
- * constrained through per-stage limits.
- */
- uint32_t max_host_descriptors =
- (UINT32_MAX - sizeof(struct v3dv_descriptor_set)) /
- sizeof(struct v3dv_descriptor);
- uint32_t max_gpu_descriptors =
- (UINT32_MAX / v3dv_X(pdevice, max_descriptor_bo_size)());
- props->maxPerSetDescriptors =
- MIN2(max_host_descriptors, max_gpu_descriptors);
-
- /* Minimum required by the spec */
- props->maxMemoryAllocationSize = MAX_MEMORY_ALLOCATION_SIZE;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_PROPERTIES: {
- VkPhysicalDeviceMultiviewProperties *props =
- (VkPhysicalDeviceMultiviewProperties *)ext;
- props->maxMultiviewViewCount = MAX_MULTIVIEW_VIEW_COUNT;
- props->maxMultiviewInstanceIndex = UINT32_MAX - 1;
+ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_LINE_RASTERIZATION_PROPERTIES_EXT: {
+ VkPhysicalDeviceLineRasterizationPropertiesEXT *props =
+ (VkPhysicalDeviceLineRasterizationPropertiesEXT *)ext;
+ props->lineSubPixelPrecisionBits = V3D_COORD_SHIFT;
break;
}
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PCI_BUS_INFO_PROPERTIES_EXT:
@@ -1512,26 +1513,33 @@ v3dv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
* never provide this extension.
*/
break;
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_POINT_CLIPPING_PROPERTIES: {
- VkPhysicalDevicePointClippingProperties *props =
- (VkPhysicalDevicePointClippingProperties *)ext;
- props->pointClippingBehavior =
- VK_POINT_CLIPPING_BEHAVIOR_ALL_CLIP_PLANES;
+ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_MODULE_IDENTIFIER_PROPERTIES_EXT: {
+ VkPhysicalDeviceShaderModuleIdentifierPropertiesEXT *props =
+ (VkPhysicalDeviceShaderModuleIdentifierPropertiesEXT *)ext;
+ STATIC_ASSERT(sizeof(vk_shaderModuleIdentifierAlgorithmUUID) ==
+ sizeof(props->shaderModuleIdentifierAlgorithmUUID));
+ memcpy(props->shaderModuleIdentifierAlgorithmUUID,
+ vk_shaderModuleIdentifierAlgorithmUUID,
+ sizeof(props->shaderModuleIdentifierAlgorithmUUID));
break;
}
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROTECTED_MEMORY_PROPERTIES: {
- VkPhysicalDeviceProtectedMemoryProperties *props =
- (VkPhysicalDeviceProtectedMemoryProperties *)ext;
- props->protectedNoFault = false;
+ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_ROBUSTNESS_PROPERTIES_EXT: {
+ VkPhysicalDevicePipelineRobustnessPropertiesEXT *props =
+ (VkPhysicalDevicePipelineRobustnessPropertiesEXT *)ext;
+ props->defaultRobustnessStorageBuffers =
+ VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DEVICE_DEFAULT_EXT;
+ props->defaultRobustnessUniformBuffers =
+ VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DEVICE_DEFAULT_EXT;
+ props->defaultRobustnessVertexInputs =
+ VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DEVICE_DEFAULT_EXT;
+ props->defaultRobustnessImages =
+ VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_DEVICE_DEFAULT_EXT;
break;
}
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES: {
- VkPhysicalDeviceSubgroupProperties *props =
- (VkPhysicalDeviceSubgroupProperties *)ext;
- props->subgroupSize = V3D_CHANNELS;
- props->supportedStages = VK_SHADER_STAGE_COMPUTE_BIT;
- props->supportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT;
- props->quadOperationsInAllStages = false;
+ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTI_DRAW_PROPERTIES_EXT: {
+ VkPhysicalDeviceMultiDrawPropertiesEXT *properties =
+ (VkPhysicalDeviceMultiDrawPropertiesEXT *)ext;
+ properties->maxMultiDrawCount = 2048;
break;
}
default:
@@ -1553,25 +1561,14 @@ v3dv_queue_family_properties = {
};
VKAPI_ATTR void VKAPI_CALL
-v3dv_GetPhysicalDeviceQueueFamilyProperties(VkPhysicalDevice physicalDevice,
- uint32_t *pCount,
- VkQueueFamilyProperties *pQueueFamilyProperties)
-{
- VK_OUTARRAY_MAKE(out, pQueueFamilyProperties, pCount);
-
- vk_outarray_append(&out, p) {
- *p = v3dv_queue_family_properties;
- }
-}
-
-VKAPI_ATTR void VKAPI_CALL
v3dv_GetPhysicalDeviceQueueFamilyProperties2(VkPhysicalDevice physicalDevice,
uint32_t *pQueueFamilyPropertyCount,
VkQueueFamilyProperties2 *pQueueFamilyProperties)
{
- VK_OUTARRAY_MAKE(out, pQueueFamilyProperties, pQueueFamilyPropertyCount);
+ VK_OUTARRAY_MAKE_TYPED(VkQueueFamilyProperties2, out,
+ pQueueFamilyProperties, pQueueFamilyPropertyCount);
- vk_outarray_append(&out, p) {
+ vk_outarray_append_typed(VkQueueFamilyProperties2, &out, p) {
p->queueFamilyProperties = v3dv_queue_family_properties;
vk_foreach_struct(s, p->pNext) {
@@ -1592,11 +1589,28 @@ VKAPI_ATTR void VKAPI_CALL
v3dv_GetPhysicalDeviceMemoryProperties2(VkPhysicalDevice physicalDevice,
VkPhysicalDeviceMemoryProperties2 *pMemoryProperties)
{
+ V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice);
+
v3dv_GetPhysicalDeviceMemoryProperties(physicalDevice,
&pMemoryProperties->memoryProperties);
vk_foreach_struct(ext, pMemoryProperties->pNext) {
switch (ext->sType) {
+ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT: {
+ VkPhysicalDeviceMemoryBudgetPropertiesEXT *p =
+ (VkPhysicalDeviceMemoryBudgetPropertiesEXT *) ext;
+ p->heapUsage[0] = device->heap_used;
+ p->heapBudget[0] = compute_memory_budget(device);
+
+ /* The heapBudget and heapUsage values must be zero for array elements
+ * greater than or equal to VkPhysicalDeviceMemoryProperties::memoryHeapCount
+ */
+ for (unsigned i = 1; i < VK_MAX_MEMORY_HEAPS; i++) {
+ p->heapBudget[i] = 0u;
+ p->heapUsage[i] = 0u;
+ }
+ break;
+ }
default:
v3dv_debug_ignored_stype(ext->sType);
break;
@@ -1618,11 +1632,6 @@ v3dv_GetInstanceProcAddr(VkInstance _instance,
* vk_icdGetInstanceProcAddr to work around certain LD_PRELOAD issues seen in apps.
*/
PUBLIC
-VKAPI_ATTR PFN_vkVoidFunction
-VKAPI_CALL vk_icdGetInstanceProcAddr(VkInstance instance,
- const char *pName);
-
-PUBLIC
VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL
vk_icdGetInstanceProcAddr(VkInstance instance,
const char* pName)
@@ -1630,23 +1639,6 @@ vk_icdGetInstanceProcAddr(VkInstance instance,
return v3dv_GetInstanceProcAddr(instance, pName);
}
-/* With version 4+ of the loader interface the ICD should expose
- * vk_icdGetPhysicalDeviceProcAddr()
- */
-PUBLIC
-VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL
-vk_icdGetPhysicalDeviceProcAddr(VkInstance _instance,
- const char* pName);
-
-PFN_vkVoidFunction
-vk_icdGetPhysicalDeviceProcAddr(VkInstance _instance,
- const char* pName)
-{
- V3DV_FROM_HANDLE(v3dv_instance, instance, _instance);
-
- return vk_instance_get_physical_device_proc_addr(&instance->vk, pName);
-}
-
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_EnumerateInstanceLayerProperties(uint32_t *pPropertyCount,
VkLayerProperties *pProperties)
@@ -1671,30 +1663,66 @@ v3dv_EnumerateDeviceLayerProperties(VkPhysicalDevice physicalDevice,
return VK_SUCCESS;
}
- return vk_error((struct v3dv_instance*) physical_device->vk.instance,
- VK_ERROR_LAYER_NOT_PRESENT);
+ return vk_error(physical_device, VK_ERROR_LAYER_NOT_PRESENT);
+}
+
+static void
+destroy_queue_syncs(struct v3dv_queue *queue)
+{
+ for (int i = 0; i < V3DV_QUEUE_COUNT; i++) {
+ if (queue->last_job_syncs.syncs[i]) {
+ drmSyncobjDestroy(queue->device->pdevice->render_fd,
+ queue->last_job_syncs.syncs[i]);
+ }
+ }
}
static VkResult
-queue_init(struct v3dv_device *device, struct v3dv_queue *queue)
+queue_init(struct v3dv_device *device, struct v3dv_queue *queue,
+ const VkDeviceQueueCreateInfo *create_info,
+ uint32_t index_in_family)
{
- vk_object_base_init(&device->vk, &queue->base, VK_OBJECT_TYPE_QUEUE);
+ VkResult result = vk_queue_init(&queue->vk, &device->vk, create_info,
+ index_in_family);
+ if (result != VK_SUCCESS)
+ return result;
+
+ result = vk_queue_enable_submit_thread(&queue->vk);
+ if (result != VK_SUCCESS)
+ goto fail_submit_thread;
+
queue->device = device;
- queue->flags = 0;
+ queue->vk.driver_submit = v3dv_queue_driver_submit;
+
+ for (int i = 0; i < V3DV_QUEUE_COUNT; i++) {
+ queue->last_job_syncs.first[i] = true;
+ int ret = drmSyncobjCreate(device->pdevice->render_fd,
+ DRM_SYNCOBJ_CREATE_SIGNALED,
+ &queue->last_job_syncs.syncs[i]);
+ if (ret) {
+ result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
+ "syncobj create failed: %m");
+ goto fail_last_job_syncs;
+ }
+ }
+
queue->noop_job = NULL;
- list_inithead(&queue->submit_wait_list);
- pthread_mutex_init(&queue->mutex, NULL);
return VK_SUCCESS;
+
+fail_last_job_syncs:
+ destroy_queue_syncs(queue);
+fail_submit_thread:
+ vk_queue_finish(&queue->vk);
+ return result;
}
static void
queue_finish(struct v3dv_queue *queue)
{
- vk_object_base_finish(&queue->base);
- assert(list_is_empty(&queue->submit_wait_list));
if (queue->noop_job)
v3dv_job_destroy(queue->noop_job);
- pthread_mutex_destroy(&queue->mutex);
+ destroy_queue_syncs(queue);
+ vk_queue_finish(&queue->vk);
}
static void
@@ -1728,19 +1756,6 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice,
assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO);
- /* Check enabled features */
- if (pCreateInfo->pEnabledFeatures) {
- VkPhysicalDeviceFeatures supported_features;
- v3dv_GetPhysicalDeviceFeatures(physicalDevice, &supported_features);
- VkBool32 *supported_feature = (VkBool32 *)&supported_features;
- VkBool32 *enabled_feature = (VkBool32 *)pCreateInfo->pEnabledFeatures;
- unsigned num_features = sizeof(VkPhysicalDeviceFeatures) / sizeof(VkBool32);
- for (uint32_t i = 0; i < num_features; i++) {
- if (enabled_feature[i] && !supported_feature[i])
- return vk_error(instance, VK_ERROR_FEATURE_NOT_PRESENT);
- }
- }
-
/* Check requested queues (we only expose one queue ) */
assert(pCreateInfo->queueCreateInfoCount == 1);
for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
@@ -1759,56 +1774,46 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice,
struct vk_device_dispatch_table dispatch_table;
vk_device_dispatch_table_from_entrypoints(&dispatch_table,
&v3dv_device_entrypoints, true);
+ vk_device_dispatch_table_from_entrypoints(&dispatch_table,
+ &wsi_device_entrypoints, false);
result = vk_device_init(&device->vk, &physical_device->vk,
&dispatch_table, pCreateInfo, pAllocator);
if (result != VK_SUCCESS) {
vk_free(&device->vk.alloc, device);
- return vk_error(instance, result);
+ return vk_error(NULL, result);
}
+#if DETECT_OS_ANDROID
+ device->gralloc = u_gralloc_create(U_GRALLOC_TYPE_AUTO);
+ assert(device->gralloc);
+#endif
+
device->instance = instance;
device->pdevice = physical_device;
- if (pAllocator)
- device->vk.alloc = *pAllocator;
- else
- device->vk.alloc = physical_device->vk.instance->alloc;
+ mtx_init(&device->query_mutex, mtx_plain);
+ cnd_init(&device->query_ended);
+
+ device->vk.command_buffer_ops = &v3dv_cmd_buffer_ops;
- pthread_mutex_init(&device->mutex, NULL);
+ vk_device_set_drm_fd(&device->vk, physical_device->render_fd);
+ vk_device_enable_threaded_submit(&device->vk);
- result = queue_init(device, &device->queue);
+ result = queue_init(device, &device->queue,
+ pCreateInfo->pQueueCreateInfos, 0);
if (result != VK_SUCCESS)
goto fail;
device->devinfo = physical_device->devinfo;
- /* Vulkan 1.1 and VK_KHR_get_physical_device_properties2 added
- * VkPhysicalDeviceFeatures2 which can be used in the pNext chain of
- * vkDeviceCreateInfo, in which case it should be used instead of
- * pEnabledFeatures.
- */
- const VkPhysicalDeviceFeatures2 *features2 =
- vk_find_struct_const(pCreateInfo->pNext, PHYSICAL_DEVICE_FEATURES_2);
- if (features2) {
- memcpy(&device->features, &features2->features,
- sizeof(device->features));
- } else if (pCreateInfo->pEnabledFeatures) {
- memcpy(&device->features, pCreateInfo->pEnabledFeatures,
- sizeof(device->features));
- }
-
- if (device->features.robustBufferAccess)
+ if (device->vk.enabled_features.robustBufferAccess)
perf_debug("Device created with Robust Buffer Access enabled.\n");
- int ret = drmSyncobjCreate(physical_device->render_fd,
- DRM_SYNCOBJ_CREATE_SIGNALED,
- &device->last_job_sync);
- if (ret) {
- result = VK_ERROR_INITIALIZATION_FAILED;
- goto fail;
- }
+ if (device->vk.enabled_features.robustImageAccess)
+ perf_debug("Device created with Robust Image Access enabled.\n");
-#ifdef DEBUG
+
+#if MESA_DEBUG
v3dv_X(device, device_check_prepacked_sizes)();
#endif
init_device_meta(device);
@@ -1816,14 +1821,42 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice,
v3dv_pipeline_cache_init(&device->default_pipeline_cache, device, 0,
device->instance->default_pipeline_cache_enabled);
device->default_attribute_float =
- v3dv_pipeline_create_default_attribute_values(device, NULL);
+ v3dv_X(device, create_default_attribute_values)(device, NULL);
+
+ device->device_address_mem_ctx = ralloc_context(NULL);
+ util_dynarray_init(&device->device_address_bo_list,
+ device->device_address_mem_ctx);
+
+ mtx_init(&device->events.lock, mtx_plain);
+ result = v3dv_event_allocate_resources(device);
+ if (result != VK_SUCCESS)
+ goto fail;
+
+ if (list_is_empty(&device->events.free_list)) {
+ result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ goto fail;
+ }
+
+ result = v3dv_query_allocate_resources(device);
+ if (result != VK_SUCCESS)
+ goto fail;
*pDevice = v3dv_device_to_handle(device);
return VK_SUCCESS;
fail:
+ cnd_destroy(&device->query_ended);
+ mtx_destroy(&device->query_mutex);
+ queue_finish(&device->queue);
+ destroy_device_meta(device);
+ v3dv_pipeline_cache_finish(&device->default_pipeline_cache);
+ v3dv_event_free_resources(device);
+ v3dv_query_free_resources(device);
vk_device_finish(&device->vk);
+#if DETECT_OS_ANDROID
+ u_gralloc_destroy(&device->gralloc);
+#endif
vk_free(&device->vk.alloc, device);
return result;
@@ -1835,10 +1868,14 @@ v3dv_DestroyDevice(VkDevice _device,
{
V3DV_FROM_HANDLE(v3dv_device, device, _device);
- v3dv_DeviceWaitIdle(_device);
+ device->vk.dispatch_table.DeviceWaitIdle(_device);
queue_finish(&device->queue);
- pthread_mutex_destroy(&device->mutex);
- drmSyncobjDestroy(device->pdevice->render_fd, device->last_job_sync);
+
+ v3dv_event_free_resources(device);
+ mtx_destroy(&device->events.lock);
+
+ v3dv_query_free_resources(device);
+
destroy_device_meta(device);
v3dv_pipeline_cache_finish(&device->default_pipeline_cache);
@@ -1847,36 +1884,23 @@ v3dv_DestroyDevice(VkDevice _device,
device->default_attribute_float = NULL;
}
+ ralloc_free(device->device_address_mem_ctx);
+
/* Bo cache should be removed the last, as any other object could be
* freeing their private bos
*/
v3dv_bo_cache_destroy(device);
+ cnd_destroy(&device->query_ended);
+ mtx_destroy(&device->query_mutex);
+
vk_device_finish(&device->vk);
+#if DETECT_OS_ANDROID
+ u_gralloc_destroy(&device->gralloc);
+#endif
vk_free2(&device->vk.alloc, pAllocator, device);
}
-VKAPI_ATTR void VKAPI_CALL
-v3dv_GetDeviceQueue(VkDevice _device,
- uint32_t queueFamilyIndex,
- uint32_t queueIndex,
- VkQueue *pQueue)
-{
- V3DV_FROM_HANDLE(v3dv_device, device, _device);
-
- assert(queueIndex == 0);
- assert(queueFamilyIndex == 0);
-
- *pQueue = v3dv_queue_to_handle(&device->queue);
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_DeviceWaitIdle(VkDevice _device)
-{
- V3DV_FROM_HANDLE(v3dv_device, device, _device);
- return v3dv_QueueWaitIdle(v3dv_queue_to_handle(&device->queue));
-}
-
static VkResult
device_alloc(struct v3dv_device *device,
struct v3dv_device_memory *mem,
@@ -1914,15 +1938,12 @@ device_free(struct v3dv_device *device, struct v3dv_device_memory *mem)
* display device to free the allocated dumb BO.
*/
if (mem->is_for_wsi) {
- assert(mem->has_bo_ownership);
- device_free_wsi_dumb(device->instance->physicalDevice.display_fd,
- mem->bo->dumb_handle);
+ device_free_wsi_dumb(device->pdevice->display_fd, mem->bo->dumb_handle);
}
- if (mem->has_bo_ownership)
- v3dv_bo_free(device, mem->bo);
- else if (mem->bo)
- vk_free(&device->vk.alloc, mem->bo);
+ p_atomic_add(&device->pdevice->heap_used, -((int64_t)mem->bo->size));
+
+ v3dv_bo_free(device, mem->bo);
}
static void
@@ -1967,21 +1988,12 @@ device_import_bo(struct v3dv_device *device,
int fd, uint64_t size,
struct v3dv_bo **bo)
{
- VkResult result;
-
- *bo = vk_alloc2(&device->vk.alloc, pAllocator, sizeof(struct v3dv_bo), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
- if (*bo == NULL) {
- result = VK_ERROR_OUT_OF_HOST_MEMORY;
- goto fail;
- }
+ *bo = NULL;
off_t real_size = lseek(fd, 0, SEEK_END);
lseek(fd, 0, SEEK_SET);
- if (real_size < 0 || (uint64_t) real_size < size) {
- result = VK_ERROR_INVALID_EXTERNAL_HANDLE;
- goto fail;
- }
+ if (real_size < 0 || (uint64_t) real_size < size)
+ return VK_ERROR_INVALID_EXTERNAL_HANDLE;
int render_fd = device->pdevice->render_fd;
assert(render_fd >= 0);
@@ -1989,31 +2001,26 @@ device_import_bo(struct v3dv_device *device,
int ret;
uint32_t handle;
ret = drmPrimeFDToHandle(render_fd, fd, &handle);
- if (ret) {
- result = VK_ERROR_INVALID_EXTERNAL_HANDLE;
- goto fail;
- }
+ if (ret)
+ return VK_ERROR_INVALID_EXTERNAL_HANDLE;
struct drm_v3d_get_bo_offset get_offset = {
.handle = handle,
};
ret = v3dv_ioctl(render_fd, DRM_IOCTL_V3D_GET_BO_OFFSET, &get_offset);
- if (ret) {
- result = VK_ERROR_INVALID_EXTERNAL_HANDLE;
- goto fail;
- }
+ if (ret)
+ return VK_ERROR_INVALID_EXTERNAL_HANDLE;
assert(get_offset.offset != 0);
- v3dv_bo_init(*bo, handle, size, get_offset.offset, "import", false);
+ *bo = v3dv_device_lookup_bo(device->pdevice, handle);
+ assert(*bo);
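+
+   /* BOs are looked up from the physical device by GEM handle, so importing
+    * the same handle again reuses the same BO object and just bumps its
+    * refcount.
+    */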
- return VK_SUCCESS;
+ if ((*bo)->refcnt == 0)
+ v3dv_bo_init_import(*bo, handle, size, get_offset.offset, false);
+ else
+ p_atomic_inc(&(*bo)->refcnt);
-fail:
- if (*bo) {
- vk_free2(&device->vk.alloc, pAllocator, *bo);
- *bo = NULL;
- }
- return result;
+ return VK_SUCCESS;
}
static VkResult
@@ -2030,19 +2037,8 @@ device_alloc_for_wsi(struct v3dv_device *device,
#if using_v3d_simulator
return device_alloc(device, mem, size);
#else
- /* If we are allocating for WSI we should have a swapchain and thus,
- * we should've initialized the display device. However, Zink doesn't
- * use swapchains, so in that case we can get here without acquiring the
- * display device and we need to do it now.
- */
VkResult result;
- struct v3dv_instance *instance = device->instance;
- struct v3dv_physical_device *pdevice = &device->instance->physicalDevice;
- if (unlikely(pdevice->display_fd < 0)) {
- result = v3dv_physical_device_acquire_display(instance, pdevice, NULL);
- if (result != VK_SUCCESS)
- return result;
- }
+ struct v3dv_physical_device *pdevice = device->pdevice;
assert(pdevice->display_fd != -1);
mem->is_for_wsi = true;
@@ -2082,6 +2078,53 @@ fail_create:
#endif
}
+static void
+device_add_device_address_bo(struct v3dv_device *device,
+ struct v3dv_bo *bo)
+{
+ util_dynarray_append(&device->device_address_bo_list,
+ struct v3dv_bo *,
+ bo);
+}
+
+static void
+device_remove_device_address_bo(struct v3dv_device *device,
+ struct v3dv_bo *bo)
+{
+ util_dynarray_delete_unordered(&device->device_address_bo_list,
+ struct v3dv_bo *,
+ bo);
+}
+
+static void
+free_memory(struct v3dv_device *device,
+ struct v3dv_device_memory *mem,
+ const VkAllocationCallbacks *pAllocator)
+{
+ if (mem == NULL)
+ return;
+
+ if (mem->bo->map)
+ device_unmap(device, mem);
+
+ if (mem->is_for_device_address)
+ device_remove_device_address_bo(device, mem->bo);
+
+ device_free(device, mem);
+
+ vk_device_memory_destroy(&device->vk, pAllocator, &mem->vk);
+}
+
+VKAPI_ATTR void VKAPI_CALL
+v3dv_FreeMemory(VkDevice _device,
+ VkDeviceMemory _mem,
+ const VkAllocationCallbacks *pAllocator)
+{
+ V3DV_FROM_HANDLE(v3dv_device, device, _device);
+ V3DV_FROM_HANDLE(v3dv_device_memory, mem, _mem);
+ free_memory(device, mem, pAllocator);
+}
+
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_AllocateMemory(VkDevice _device,
const VkMemoryAllocateInfo *pAllocateInfo,
@@ -2090,25 +2133,34 @@ v3dv_AllocateMemory(VkDevice _device,
{
V3DV_FROM_HANDLE(v3dv_device, device, _device);
struct v3dv_device_memory *mem;
- struct v3dv_physical_device *pdevice = &device->instance->physicalDevice;
+ struct v3dv_physical_device *pdevice = device->pdevice;
assert(pAllocateInfo->sType == VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO);
- /* The Vulkan 1.0.33 spec says "allocationSize must be greater than 0". */
- assert(pAllocateInfo->allocationSize > 0);
+ /* We always allocate device memory in multiples of a page, so round up
+ * requested size to that.
+ */
+ const VkDeviceSize alloc_size = align64(pAllocateInfo->allocationSize, 4096);
+
+ if (unlikely(alloc_size > MAX_MEMORY_ALLOCATION_SIZE))
+ return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+
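+   /* This is only a best-effort early rejection; the authoritative accounting
+    * happens after the BO is allocated, where heap_used is atomically updated
+    * and checked again.
+    */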
+ uint64_t heap_used = p_atomic_read(&pdevice->heap_used);
+ if (unlikely(heap_used + alloc_size > pdevice->memory.memoryHeaps[0].size))
+ return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
- mem = vk_object_zalloc(&device->vk, pAllocator, sizeof(*mem),
- VK_OBJECT_TYPE_DEVICE_MEMORY);
+ mem = vk_device_memory_create(&device->vk, pAllocateInfo,
+ pAllocator, sizeof(*mem));
if (mem == NULL)
return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY);
assert(pAllocateInfo->memoryTypeIndex < pdevice->memory.memoryTypeCount);
mem->type = &pdevice->memory.memoryTypes[pAllocateInfo->memoryTypeIndex];
- mem->has_bo_ownership = true;
mem->is_for_wsi = false;
const struct wsi_memory_allocate_info *wsi_info = NULL;
const VkImportMemoryFdInfoKHR *fd_info = NULL;
+ const VkMemoryAllocateFlagsInfo *flags_info = NULL;
vk_foreach_struct_const(ext, pAllocateInfo->pNext) {
switch ((unsigned)ext->sType) {
case VK_STRUCTURE_TYPE_WSI_MEMORY_ALLOCATE_INFO_MESA:
@@ -2118,16 +2170,14 @@ v3dv_AllocateMemory(VkDevice _device,
fd_info = (void *)ext;
break;
case VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO:
- /* We don't support VK_KHR_buffer_device_address or multiple
- * devices per device group, so we can ignore this.
- */
+ flags_info = (void *)ext;
break;
- case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR:
+ case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO:
/* We don't have particular optimizations associated with memory
* allocations that won't be suballocated to multiple resources.
*/
break;
- case VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR:
+ case VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO:
/* The mask of handle types specified here must be supported
* according to VkExternalImageFormatProperties, so it must be
* fd or dmabuf, which don't have special requirements for us.
@@ -2139,57 +2189,58 @@ v3dv_AllocateMemory(VkDevice _device,
}
}
- VkResult result = VK_SUCCESS;
-
- /* We always allocate device memory in multiples of a page, so round up
- * requested size to that.
- */
- VkDeviceSize alloc_size = ALIGN(pAllocateInfo->allocationSize, 4096);
+ VkResult result;
- if (unlikely(alloc_size > MAX_MEMORY_ALLOCATION_SIZE)) {
- result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
+ if (wsi_info) {
+ result = device_alloc_for_wsi(device, pAllocator, mem, alloc_size);
+ } else if (fd_info && fd_info->handleType) {
+ assert(fd_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT ||
+ fd_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT);
+ result = device_import_bo(device, pAllocator,
+ fd_info->fd, alloc_size, &mem->bo);
+ if (result == VK_SUCCESS)
+ close(fd_info->fd);
+ } else if (mem->vk.ahardware_buffer) {
+#if DETECT_OS_ANDROID
+ const native_handle_t *handle = AHardwareBuffer_getNativeHandle(mem->vk.ahardware_buffer);
+ assert(handle->numFds > 0);
+ size_t size = lseek(handle->data[0], 0, SEEK_END);
+ result = device_import_bo(device, pAllocator,
+ handle->data[0], size, &mem->bo);
+#else
+ result = VK_ERROR_FEATURE_NOT_PRESENT;
+#endif
} else {
- if (wsi_info) {
- result = device_alloc_for_wsi(device, pAllocator, mem, alloc_size);
- } else if (fd_info && fd_info->handleType) {
- assert(fd_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT ||
- fd_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT);
- result = device_import_bo(device, pAllocator,
- fd_info->fd, alloc_size, &mem->bo);
- mem->has_bo_ownership = false;
- if (result == VK_SUCCESS)
- close(fd_info->fd);
- } else {
- result = device_alloc(device, mem, alloc_size);
- }
+ result = device_alloc(device, mem, alloc_size);
}
if (result != VK_SUCCESS) {
- vk_object_free(&device->vk, pAllocator, mem);
- return vk_error(device->instance, result);
+ vk_device_memory_destroy(&device->vk, pAllocator, &mem->vk);
+ return vk_error(device, result);
}
- *pMem = v3dv_device_memory_to_handle(mem);
- return result;
-}
-
-VKAPI_ATTR void VKAPI_CALL
-v3dv_FreeMemory(VkDevice _device,
- VkDeviceMemory _mem,
- const VkAllocationCallbacks *pAllocator)
-{
- V3DV_FROM_HANDLE(v3dv_device, device, _device);
- V3DV_FROM_HANDLE(v3dv_device_memory, mem, _mem);
-
- if (mem == NULL)
- return;
-
- if (mem->bo->map)
- v3dv_UnmapMemory(_device, _mem);
+ heap_used = p_atomic_add_return(&pdevice->heap_used, mem->bo->size);
+ if (heap_used > pdevice->memory.memoryHeaps[0].size) {
+ free_memory(device, mem, pAllocator);
+ return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ }
- device_free(device, mem);
+   /* If this memory can be used via VK_KHR_buffer_device_address, we will
+    * need to manually add its BO to any job submission that uses
+    * VK_KHR_buffer_device_address: such jobs may emit buffer load/store
+    * operations that can access any buffer memory allocated with this flag,
+    * and we have no way to tell which buffers will be accessed through this
+    * mechanism, since they don't even have to be bound through descriptor
+    * state.
+    */
+ if (flags_info &&
+ (flags_info->flags & VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT)) {
+ mem->is_for_device_address = true;
+ device_add_device_address_bo(device, mem->bo);
+ }
- vk_object_free(&device->vk, pAllocator, mem);
+ *pMem = v3dv_device_memory_to_handle(mem);
+ return result;
}
VKAPI_ATTR VkResult VKAPI_CALL
@@ -2217,7 +2268,7 @@ v3dv_MapMemory(VkDevice _device,
*/
VkResult result = device_map(device, mem);
if (result != VK_SUCCESS)
- return vk_error(device->instance, result);
+ return vk_error(device, result);
*ppData = ((uint8_t *) mem->bo->map) + offset;
return VK_SUCCESS;
@@ -2252,19 +2303,30 @@ v3dv_InvalidateMappedMemoryRanges(VkDevice _device,
return VK_SUCCESS;
}
-VKAPI_ATTR void VKAPI_CALL
-v3dv_GetImageMemoryRequirements2(VkDevice device,
- const VkImageMemoryRequirementsInfo2 *pInfo,
- VkMemoryRequirements2 *pMemoryRequirements)
+static void
+get_image_memory_requirements(struct v3dv_image *image,
+ VkImageAspectFlagBits planeAspect,
+ VkMemoryRequirements2 *pMemoryRequirements)
{
- V3DV_FROM_HANDLE(v3dv_image, image, pInfo->image);
-
pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) {
.memoryTypeBits = 0x1,
- .alignment = image->alignment,
- .size = image->size
+ .alignment = image->planes[0].alignment,
+ .size = image->non_disjoint_size
};
+ if (planeAspect != VK_IMAGE_ASPECT_NONE) {
+ assert(image->format->plane_count > 1);
+ /* Disjoint images should have a 0 non_disjoint_size */
+ assert(!pMemoryRequirements->memoryRequirements.size);
+
+ uint8_t plane = v3dv_image_aspect_to_plane(image, planeAspect);
+
+ VkMemoryRequirements *mem_reqs =
+ &pMemoryRequirements->memoryRequirements;
+ mem_reqs->alignment = image->planes[plane].alignment;
+ mem_reqs->size = image->planes[plane].size;
+ }
+
vk_foreach_struct(ext, pMemoryRequirements->pNext) {
switch (ext->sType) {
case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: {
@@ -2281,6 +2343,65 @@ v3dv_GetImageMemoryRequirements2(VkDevice device,
}
}
+VKAPI_ATTR void VKAPI_CALL
+v3dv_GetImageMemoryRequirements2(VkDevice device,
+ const VkImageMemoryRequirementsInfo2 *pInfo,
+ VkMemoryRequirements2 *pMemoryRequirements)
+{
+ V3DV_FROM_HANDLE(v3dv_image, image, pInfo->image);
+
+ VkImageAspectFlagBits planeAspect = VK_IMAGE_ASPECT_NONE;
+ vk_foreach_struct_const(ext, pInfo->pNext) {
+ switch (ext->sType) {
+ case VK_STRUCTURE_TYPE_IMAGE_PLANE_MEMORY_REQUIREMENTS_INFO: {
+ VkImagePlaneMemoryRequirementsInfo *req =
+ (VkImagePlaneMemoryRequirementsInfo *) ext;
+ planeAspect = req->planeAspect;
+ break;
+ }
+ default:
+ v3dv_debug_ignored_stype(ext->sType);
+ break;
+ }
+ }
+
+ get_image_memory_requirements(image, planeAspect, pMemoryRequirements);
+}
+
+VKAPI_ATTR void VKAPI_CALL
+v3dv_GetDeviceImageMemoryRequirements(
+ VkDevice _device,
+ const VkDeviceImageMemoryRequirements *pInfo,
+ VkMemoryRequirements2 *pMemoryRequirements)
+{
+ V3DV_FROM_HANDLE(v3dv_device, device, _device);
+
+ struct v3dv_image image = { 0 };
+ vk_image_init(&device->vk, &image.vk, pInfo->pCreateInfo);
+
+ ASSERTED VkResult result =
+ v3dv_image_init(device, pInfo->pCreateInfo, NULL, &image);
+ assert(result == VK_SUCCESS);
+
+ /* From VkDeviceImageMemoryRequirements spec:
+ *
+ * " planeAspect is a VkImageAspectFlagBits value specifying the aspect
+ * corresponding to the image plane to query. This parameter is ignored
+ * unless pCreateInfo::tiling is
+ * VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT, or pCreateInfo::flags has
+ * VK_IMAGE_CREATE_DISJOINT_BIT set"
+ *
+    * So we need to explicitly ignore planeAspect in the other cases, or the
+    * asserts below could be triggered.
+ */
+ VkImageAspectFlagBits planeAspect =
+ pInfo->pCreateInfo->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT ||
+ pInfo->pCreateInfo->flags & VK_IMAGE_CREATE_DISJOINT_BIT ?
+ pInfo->planeAspect : 0;
+
+ get_image_memory_requirements(&image, planeAspect, pMemoryRequirements);
+}
+
static void
bind_image_memory(const VkBindImageMemoryInfo *info)
{
@@ -2293,11 +2414,43 @@ bind_image_memory(const VkBindImageMemoryInfo *info)
* the VkMemoryRequirements structure returned from a call to
* vkGetImageMemoryRequirements with image"
*/
- assert(info->memoryOffset % image->alignment == 0);
assert(info->memoryOffset < mem->bo->size);
- image->mem = mem;
- image->mem_offset = info->memoryOffset;
+ uint64_t offset = info->memoryOffset;
+ if (image->non_disjoint_size) {
+ /* We only check for plane 0 as it is the only one that actually starts
+ * at that offset
+ */
+ assert(offset % image->planes[0].alignment == 0);
+ for (uint8_t plane = 0; plane < image->plane_count; plane++) {
+ image->planes[plane].mem = mem;
+ image->planes[plane].mem_offset = offset;
+ }
+ } else {
+ const VkBindImagePlaneMemoryInfo *plane_mem_info =
+ vk_find_struct_const(info->pNext, BIND_IMAGE_PLANE_MEMORY_INFO);
+ assert(plane_mem_info);
+
+ /*
+ * From VkBindImagePlaneMemoryInfo spec:
+ *
+ * "If the image’s tiling is VK_IMAGE_TILING_LINEAR or
+ * VK_IMAGE_TILING_OPTIMAL, then planeAspect must be a single valid
+ * format plane for the image"
+ *
+ * <skip>
+ *
+ * "If the image’s tiling is VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT,
+ * then planeAspect must be a single valid memory plane for the
+ * image"
+ *
+ * So planeAspect should only refer to one plane.
+ */
+ uint8_t plane = v3dv_plane_from_aspect(plane_mem_info->planeAspect);
+ assert(offset % image->planes[plane].alignment == 0);
+ image->planes[plane].mem = mem;
+ image->planes[plane].mem_offset = offset;
+ }
}
VKAPI_ATTR VkResult VKAPI_CALL
@@ -2306,21 +2459,59 @@ v3dv_BindImageMemory2(VkDevice _device,
const VkBindImageMemoryInfo *pBindInfos)
{
for (uint32_t i = 0; i < bindInfoCount; i++) {
+#if DETECT_OS_ANDROID
+ V3DV_FROM_HANDLE(v3dv_device_memory, mem, pBindInfos[i].memory);
+ V3DV_FROM_HANDLE(v3dv_device, device, _device);
+ if (mem != NULL && mem->vk.ahardware_buffer) {
+ AHardwareBuffer_Desc description;
+ const native_handle_t *handle = AHardwareBuffer_getNativeHandle(mem->vk.ahardware_buffer);
+
+ V3DV_FROM_HANDLE(v3dv_image, image, pBindInfos[i].image);
+ AHardwareBuffer_describe(mem->vk.ahardware_buffer, &description);
+
+ struct u_gralloc_buffer_handle gr_handle = {
+ .handle = handle,
+ .pixel_stride = description.stride,
+ .hal_format = description.format,
+ };
+
+ VkResult result = v3dv_gralloc_to_drm_explicit_layout(
+ device->gralloc,
+ &gr_handle,
+ image->android_explicit_layout,
+ image->android_plane_layouts,
+ V3DV_MAX_PLANE_COUNT);
+ if (result != VK_SUCCESS)
+ return result;
+
+ result = v3dv_update_image_layout(
+ device, image, image->android_explicit_layout->drmFormatModifier,
+ /* disjoint = */ false, image->android_explicit_layout);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+#endif
+
const VkBindImageMemorySwapchainInfoKHR *swapchain_info =
vk_find_struct_const(pBindInfos->pNext,
BIND_IMAGE_MEMORY_SWAPCHAIN_INFO_KHR);
if (swapchain_info && swapchain_info->swapchain) {
+#if !DETECT_OS_ANDROID
struct v3dv_image *swapchain_image =
v3dv_wsi_get_image_from_swapchain(swapchain_info->swapchain,
swapchain_info->imageIndex);
+         /* We assume swapchain images are single-plane */
+ assert(swapchain_image->plane_count == 1);
VkBindImageMemoryInfo swapchain_bind = {
.sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO,
.image = pBindInfos[i].image,
- .memory = v3dv_device_memory_to_handle(swapchain_image->mem),
- .memoryOffset = swapchain_image->mem_offset,
+ .memory = v3dv_device_memory_to_handle(swapchain_image->planes[0].mem),
+ .memoryOffset = swapchain_image->planes[0].mem_offset,
};
bind_image_memory(&swapchain_bind);
- } else {
+#endif
+      } else {
bind_image_memory(&pBindInfos[i]);
}
}
@@ -2328,19 +2519,39 @@ v3dv_BindImageMemory2(VkDevice _device,
return VK_SUCCESS;
}
-VKAPI_ATTR void VKAPI_CALL
-v3dv_GetBufferMemoryRequirements2(VkDevice device,
- const VkBufferMemoryRequirementsInfo2 *pInfo,
- VkMemoryRequirements2 *pMemoryRequirements)
+void
+v3dv_buffer_init(struct v3dv_device *device,
+ const VkBufferCreateInfo *pCreateInfo,
+ struct v3dv_buffer *buffer,
+ uint32_t alignment)
{
- V3DV_FROM_HANDLE(v3dv_buffer, buffer, pInfo->buffer);
+ buffer->size = pCreateInfo->size;
+ buffer->usage = pCreateInfo->usage;
+ buffer->alignment = alignment;
+}
+static void
+get_buffer_memory_requirements(struct v3dv_buffer *buffer,
+ VkMemoryRequirements2 *pMemoryRequirements)
+{
pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) {
.memoryTypeBits = 0x1,
.alignment = buffer->alignment,
.size = align64(buffer->size, buffer->alignment),
};
+ /* UBO and SSBO may be read using ldunifa, which prefetches the next
+ * 4 bytes after a read. If the buffer's size is exactly a multiple
+ * of a page size and the shader reads the last 4 bytes with ldunifa
+ * the prefetching would read out of bounds and cause an MMU error,
+ * so we allocate extra space to avoid kernel error spamming.
+ */
+ bool can_ldunifa = buffer->usage &
+ (VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
+ VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT);
+ if (can_ldunifa && (buffer->size % 4096 == 0))
+ pMemoryRequirements->memoryRequirements.size += buffer->alignment;
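+   /* For example, with the 256-byte non-coherent atom alignment used for
+    * buffers, a 4096-byte storage buffer would report 4096 + 256 = 4352 bytes.
+    */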
+
vk_foreach_struct(ext, pMemoryRequirements->pNext) {
switch (ext->sType) {
case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: {
@@ -2357,8 +2568,30 @@ v3dv_GetBufferMemoryRequirements2(VkDevice device,
}
}
-static void
-bind_buffer_memory(const VkBindBufferMemoryInfo *info)
+VKAPI_ATTR void VKAPI_CALL
+v3dv_GetBufferMemoryRequirements2(VkDevice device,
+ const VkBufferMemoryRequirementsInfo2 *pInfo,
+ VkMemoryRequirements2 *pMemoryRequirements)
+{
+ V3DV_FROM_HANDLE(v3dv_buffer, buffer, pInfo->buffer);
+ get_buffer_memory_requirements(buffer, pMemoryRequirements);
+}
+
+VKAPI_ATTR void VKAPI_CALL
+v3dv_GetDeviceBufferMemoryRequirements(
+ VkDevice _device,
+ const VkDeviceBufferMemoryRequirements *pInfo,
+ VkMemoryRequirements2 *pMemoryRequirements)
+{
+ V3DV_FROM_HANDLE(v3dv_device, device, _device);
+
+ struct v3dv_buffer buffer = { 0 };
+ v3dv_buffer_init(device, pInfo->pCreateInfo, &buffer, V3D_NON_COHERENT_ATOM_SIZE);
+ get_buffer_memory_requirements(&buffer, pMemoryRequirements);
+}
+
+void
+v3dv_buffer_bind_memory(const VkBindBufferMemoryInfo *info)
{
V3DV_FROM_HANDLE(v3dv_buffer, buffer, info->buffer);
V3DV_FROM_HANDLE(v3dv_device_memory, mem, info->memory);
@@ -2383,7 +2616,7 @@ v3dv_BindBufferMemory2(VkDevice device,
const VkBindBufferMemoryInfo *pBindInfos)
{
for (uint32_t i = 0; i < bindInfoCount; i++)
- bind_buffer_memory(&pBindInfos[i]);
+ v3dv_buffer_bind_memory(&pBindInfos[i]);
return VK_SUCCESS;
}
@@ -2406,16 +2639,16 @@ v3dv_CreateBuffer(VkDevice _device,
buffer = vk_object_zalloc(&device->vk, pAllocator, sizeof(*buffer),
VK_OBJECT_TYPE_BUFFER);
if (buffer == NULL)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
- buffer->size = pCreateInfo->size;
- buffer->usage = pCreateInfo->usage;
- buffer->alignment = 256; /* nonCoherentAtomSize */
+ v3dv_buffer_init(device, pCreateInfo, buffer, V3D_NON_COHERENT_ATOM_SIZE);
/* Limit allocations to 32-bit */
const VkDeviceSize aligned_size = align64(buffer->size, buffer->alignment);
- if (aligned_size > UINT32_MAX || aligned_size < buffer->size)
+ if (aligned_size > UINT32_MAX || aligned_size < buffer->size) {
+      vk_object_free(&device->vk, pAllocator, buffer);
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+ }
*pBuffer = v3dv_buffer_to_handle(buffer);
@@ -2452,20 +2685,32 @@ v3dv_CreateFramebuffer(VkDevice _device,
framebuffer = vk_object_zalloc(&device->vk, pAllocator, size,
VK_OBJECT_TYPE_FRAMEBUFFER);
if (framebuffer == NULL)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
framebuffer->width = pCreateInfo->width;
framebuffer->height = pCreateInfo->height;
framebuffer->layers = pCreateInfo->layers;
framebuffer->has_edge_padding = true;
+ const VkFramebufferAttachmentsCreateInfo *imageless =
+ vk_find_struct_const(pCreateInfo->pNext,
+ FRAMEBUFFER_ATTACHMENTS_CREATE_INFO);
+
framebuffer->attachment_count = pCreateInfo->attachmentCount;
framebuffer->color_attachment_count = 0;
- for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) {
- framebuffer->attachments[i] =
- v3dv_image_view_from_handle(pCreateInfo->pAttachments[i]);
- if (framebuffer->attachments[i]->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT)
- framebuffer->color_attachment_count++;
+ for (uint32_t i = 0; i < framebuffer->attachment_count; i++) {
+ if (!imageless) {
+ framebuffer->attachments[i] =
+ v3dv_image_view_from_handle(pCreateInfo->pAttachments[i]);
+ if (framebuffer->attachments[i]->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT)
+ framebuffer->color_attachment_count++;
+ } else {
+ assert(i < imageless->attachmentImageInfoCount);
+ if (imageless->pAttachmentImageInfos[i].usage &
+ VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) {
+ framebuffer->color_attachment_count++;
+ }
+ }
}
*pFramebuffer = v3dv_framebuffer_to_handle(framebuffer);
@@ -2487,6 +2732,105 @@ v3dv_DestroyFramebuffer(VkDevice _device,
vk_object_free(&device->vk, pAllocator, fb);
}
+void
+v3dv_setup_dynamic_framebuffer(struct v3dv_cmd_buffer *cmd_buffer,
+ const VkRenderingInfoKHR *info)
+{
+ struct v3dv_device *device = cmd_buffer->device;
+
+   /* The maximum number of framebuffer attachments is the maximum number of
+    * color RTs plus one D/S attachment, multiplied by two for MSAA resolves.
+    */
+ const uint32_t max_attachments =
+ 2 * (V3D_MAX_RENDER_TARGETS(device->devinfo.ver) + 1);
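+   /* For example, with 4 render targets that is 2 * (4 + 1) = 10 slots:
+    * 4 color + 4 color resolve + 1 depth/stencil + 1 depth/stencil resolve.
+    */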
+ const uint32_t attachments_alloc_size =
+ sizeof(struct v3dv_image_view *) * max_attachments;
+
+   /* We only allocate the dynamic framebuffer once; it remains valid for
+    * the lifetime of the command buffer.
+    */
+ struct v3dv_framebuffer *fb = cmd_buffer->state.dynamic_framebuffer;
+ if (!fb) {
+ uint32_t alloc_size = sizeof(struct v3dv_framebuffer) +
+ attachments_alloc_size;
+ fb = vk_object_zalloc(&cmd_buffer->device->vk, NULL, alloc_size,
+ VK_OBJECT_TYPE_FRAMEBUFFER);
+ if (fb == NULL) {
+ v3dv_flag_oom(cmd_buffer, NULL);
+ return;
+ }
+ cmd_buffer->state.dynamic_framebuffer = fb;
+ } else {
+ memset(fb->attachments, 0, attachments_alloc_size);
+ }
+
+ fb->width = info->renderArea.offset.x + info->renderArea.extent.width;
+ fb->height = info->renderArea.offset.y + info->renderArea.extent.height;
+
+ /* From the Vulkan spec for VkFramebufferCreateInfo:
+ *
+ * "If the render pass uses multiview, then layers must be one (...)"
+ */
+ fb->layers = info->viewMask == 0 ? info->layerCount : 1;
+
+ struct v3dv_render_pass *pass = &cmd_buffer->state.dynamic_pass;
+ assert(pass->subpass_count == 1 && pass->subpasses);
+ assert(pass->subpasses[0].color_count == info->colorAttachmentCount);
+ fb->color_attachment_count = info->colorAttachmentCount;
+
+ uint32_t a = 0;
+ for (int i = 0; i < info->colorAttachmentCount; i++) {
+ if (info->pColorAttachments[i].imageView == VK_NULL_HANDLE)
+ continue;
+ fb->attachments[a++] =
+ v3dv_image_view_from_handle(info->pColorAttachments[i].imageView);
+ if (info->pColorAttachments[i].resolveMode != VK_RESOLVE_MODE_NONE) {
+ fb->attachments[a++] =
+ v3dv_image_view_from_handle(info->pColorAttachments[i].resolveImageView);
+ }
+ }
+
+ if ((info->pDepthAttachment && info->pDepthAttachment->imageView) ||
+ (info->pStencilAttachment && info->pStencilAttachment->imageView)) {
+ const struct VkRenderingAttachmentInfo *common_ds_info =
+ (info->pDepthAttachment &&
+ info->pDepthAttachment->imageView != VK_NULL_HANDLE) ?
+ info->pDepthAttachment :
+ info->pStencilAttachment;
+
+ fb->attachments[a++] =
+ v3dv_image_view_from_handle(common_ds_info->imageView);
+
+ if (common_ds_info->resolveMode != VK_RESOLVE_MODE_NONE) {
+ fb->attachments[a++] =
+ v3dv_image_view_from_handle(common_ds_info->resolveImageView);
+ }
+ }
+
+ assert(a == pass->attachment_count);
+ fb->attachment_count = a;
+
+ /* Dynamic rendering doesn't provide the size of the underlying framebuffer
+ * so we estimate its size from the render area. This means it is possible
+ * the underlying attachments are larger and thus we cannot assume we have
+ * edge padding.
+ */
+ fb->has_edge_padding = false;
+}
+
+void
+v3dv_destroy_dynamic_framebuffer(struct v3dv_cmd_buffer *cmd_buffer)
+{
+ if (!cmd_buffer->state.dynamic_framebuffer)
+ return;
+
+ VkDevice vk_device = v3dv_device_to_handle(cmd_buffer->device);
+ VkFramebuffer vk_dynamic_fb =
+ v3dv_framebuffer_to_handle(cmd_buffer->state.dynamic_framebuffer);
+ v3dv_DestroyFramebuffer(vk_device, vk_dynamic_fb, NULL);
+ cmd_buffer->state.dynamic_framebuffer = NULL;
+}
+
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_GetMemoryFdPropertiesKHR(VkDevice _device,
VkExternalMemoryHandleTypeFlagBits handleType,
@@ -2494,7 +2838,7 @@ v3dv_GetMemoryFdPropertiesKHR(VkDevice _device,
VkMemoryFdPropertiesKHR *pMemoryFdProperties)
{
V3DV_FROM_HANDLE(v3dv_device, device, _device);
- struct v3dv_physical_device *pdevice = &device->instance->physicalDevice;
+ struct v3dv_physical_device *pdevice = device->pdevice;
switch (handleType) {
case VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT:
@@ -2502,7 +2846,7 @@ v3dv_GetMemoryFdPropertiesKHR(VkDevice _device,
(1 << pdevice->memory.memoryTypeCount) - 1;
return VK_SUCCESS;
default:
- return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE);
+ return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
}
}
@@ -2523,7 +2867,7 @@ v3dv_GetMemoryFdKHR(VkDevice _device,
mem->bo->handle,
DRM_CLOEXEC, &fd);
if (ret)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
*pFd = fd;
@@ -2531,63 +2875,6 @@ v3dv_GetMemoryFdKHR(VkDevice _device,
}
VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_CreateEvent(VkDevice _device,
- const VkEventCreateInfo *pCreateInfo,
- const VkAllocationCallbacks *pAllocator,
- VkEvent *pEvent)
-{
- V3DV_FROM_HANDLE(v3dv_device, device, _device);
- struct v3dv_event *event =
- vk_object_zalloc(&device->vk, pAllocator, sizeof(*event),
- VK_OBJECT_TYPE_EVENT);
- if (!event)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
-
- /* Events are created in the unsignaled state */
- event->state = false;
- *pEvent = v3dv_event_to_handle(event);
-
- return VK_SUCCESS;
-}
-
-VKAPI_ATTR void VKAPI_CALL
-v3dv_DestroyEvent(VkDevice _device,
- VkEvent _event,
- const VkAllocationCallbacks *pAllocator)
-{
- V3DV_FROM_HANDLE(v3dv_device, device, _device);
- V3DV_FROM_HANDLE(v3dv_event, event, _event);
-
- if (!event)
- return;
-
- vk_object_free(&device->vk, pAllocator, event);
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_GetEventStatus(VkDevice _device, VkEvent _event)
-{
- V3DV_FROM_HANDLE(v3dv_event, event, _event);
- return p_atomic_read(&event->state) ? VK_EVENT_SET : VK_EVENT_RESET;
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_SetEvent(VkDevice _device, VkEvent _event)
-{
- V3DV_FROM_HANDLE(v3dv_event, event, _event);
- p_atomic_set(&event->state, 1);
- return VK_SUCCESS;
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_ResetEvent(VkDevice _device, VkEvent _event)
-{
- V3DV_FROM_HANDLE(v3dv_event, event, _event);
- p_atomic_set(&event->state, 0);
- return VK_SUCCESS;
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
v3dv_CreateSampler(VkDevice _device,
const VkSamplerCreateInfo *pCreateInfo,
const VkAllocationCallbacks *pAllocator,
@@ -2601,7 +2888,9 @@ v3dv_CreateSampler(VkDevice _device,
sampler = vk_object_zalloc(&device->vk, pAllocator, sizeof(*sampler),
VK_OBJECT_TYPE_SAMPLER);
if (!sampler)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ sampler->plane_count = 1;
sampler->compare_enable = pCreateInfo->compareEnable;
sampler->unnormalized_coordinates = pCreateInfo->unnormalizedCoordinates;
@@ -2610,7 +2899,21 @@ v3dv_CreateSampler(VkDevice _device,
vk_find_struct_const(pCreateInfo->pNext,
SAMPLER_CUSTOM_BORDER_COLOR_CREATE_INFO_EXT);
- v3dv_X(device, pack_sampler_state)(sampler, pCreateInfo, bc_info);
+ const VkSamplerYcbcrConversionInfo *ycbcr_conv_info =
+ vk_find_struct_const(pCreateInfo->pNext, SAMPLER_YCBCR_CONVERSION_INFO);
+
+ const struct vk_format_ycbcr_info *ycbcr_info = NULL;
+
+ if (ycbcr_conv_info) {
+ VK_FROM_HANDLE(vk_ycbcr_conversion, conversion, ycbcr_conv_info->conversion);
+ ycbcr_info = vk_format_get_ycbcr_info(conversion->state.format);
+ if (ycbcr_info) {
+ sampler->plane_count = ycbcr_info->n_planes;
+ sampler->conversion = conversion;
+ }
+ }
+
+ v3dv_X(device, pack_sampler_state)(device, sampler, pCreateInfo, bc_info);
*pSampler = v3dv_sampler_to_handle(sampler);
@@ -2659,49 +2962,65 @@ v3dv_GetImageSparseMemoryRequirements2(
*pSparseMemoryRequirementCount = 0;
}
-/* vk_icd.h does not declare this function, so we declare it here to
- * suppress Wmissing-prototypes.
- */
-PUBLIC VKAPI_ATTR VkResult VKAPI_CALL
-vk_icdNegotiateLoaderICDInterfaceVersion(uint32_t* pSupportedVersion);
+VKAPI_ATTR void VKAPI_CALL
+v3dv_GetDeviceImageSparseMemoryRequirements(
+ VkDevice device,
+ const VkDeviceImageMemoryRequirements *pInfo,
+ uint32_t *pSparseMemoryRequirementCount,
+ VkSparseImageMemoryRequirements2 *pSparseMemoryRequirements)
+{
+ *pSparseMemoryRequirementCount = 0;
+}
-PUBLIC VKAPI_ATTR VkResult VKAPI_CALL
-vk_icdNegotiateLoaderICDInterfaceVersion(uint32_t* pSupportedVersion)
+VkDeviceAddress
+v3dv_GetBufferDeviceAddress(VkDevice device,
+ const VkBufferDeviceAddressInfo *pInfo)
{
- /* For the full details on loader interface versioning, see
- * <https://github.com/KhronosGroup/Vulkan-LoaderAndValidationLayers/blob/master/loader/LoaderAndLayerInterface.md>.
- * What follows is a condensed summary, to help you navigate the large and
- * confusing official doc.
- *
- * - Loader interface v0 is incompatible with later versions. We don't
- * support it.
- *
- * - In loader interface v1:
- * - The first ICD entrypoint called by the loader is
- * vk_icdGetInstanceProcAddr(). The ICD must statically expose this
- * entrypoint.
- * - The ICD must statically expose no other Vulkan symbol unless it is
- * linked with -Bsymbolic.
- * - Each dispatchable Vulkan handle created by the ICD must be
- * a pointer to a struct whose first member is VK_LOADER_DATA. The
- * ICD must initialize VK_LOADER_DATA.loadMagic to ICD_LOADER_MAGIC.
- * - The loader implements vkCreate{PLATFORM}SurfaceKHR() and
- * vkDestroySurfaceKHR(). The ICD must be capable of working with
- * such loader-managed surfaces.
- *
- * - Loader interface v2 differs from v1 in:
- * - The first ICD entrypoint called by the loader is
- * vk_icdNegotiateLoaderICDInterfaceVersion(). The ICD must
- * statically expose this entrypoint.
- *
- * - Loader interface v3 differs from v2 in:
- * - The ICD must implement vkCreate{PLATFORM}SurfaceKHR(),
- * vkDestroySurfaceKHR(), and other API which uses VKSurfaceKHR,
- * because the loader no longer does so.
- *
- * - Loader interface v4 differs from v3 in:
- * - The ICD must implement vk_icdGetPhysicalDeviceProcAddr().
- */
- *pSupportedVersion = MIN2(*pSupportedVersion, 3u);
- return VK_SUCCESS;
+ V3DV_FROM_HANDLE(v3dv_buffer, buffer, pInfo->buffer);
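+   /* The device address is simply the BO's GPU offset plus the buffer's
+    * offset within the memory object it is bound to.
+    */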
+ return buffer->mem_offset + buffer->mem->bo->offset;
+}
+
+uint64_t
+v3dv_GetBufferOpaqueCaptureAddress(VkDevice device,
+ const VkBufferDeviceAddressInfo *pInfo)
+{
+ /* Not implemented */
+ return 0;
+}
+
+uint64_t
+v3dv_GetDeviceMemoryOpaqueCaptureAddress(
+ VkDevice device,
+ const VkDeviceMemoryOpaqueCaptureAddressInfo *pInfo)
+{
+ /* Not implemented */
+ return 0;
+}
+
+VkResult
+v3dv_create_compute_pipeline_from_nir(struct v3dv_device *device,
+ nir_shader *nir,
+ VkPipelineLayout pipeline_layout,
+ VkPipeline *pipeline)
+{
+ struct vk_shader_module cs_m = vk_shader_module_from_nir(nir);
+
+ VkPipelineShaderStageCreateInfo set_event_cs_stage = {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+ .stage = VK_SHADER_STAGE_COMPUTE_BIT,
+ .module = vk_shader_module_to_handle(&cs_m),
+ .pName = "main",
+ };
+
+ VkComputePipelineCreateInfo info = {
+ .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
+ .stage = set_event_cs_stage,
+ .layout = pipeline_layout,
+ };
+
+ VkResult result =
+ v3dv_CreateComputePipelines(v3dv_device_to_handle(device), VK_NULL_HANDLE,
+ 1, &info, &device->vk.alloc, pipeline);
+
+ return result;
}
diff --git a/src/broadcom/vulkan/v3dv_event.c b/src/broadcom/vulkan/v3dv_event.c
new file mode 100644
index 00000000000..a3aad37d9c7
--- /dev/null
+++ b/src/broadcom/vulkan/v3dv_event.c
@@ -0,0 +1,712 @@
+/*
+ * Copyright © 2022 Raspberry Pi Ltd
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "v3dv_private.h"
+#include "compiler/nir/nir_builder.h"
+
+#include "vk_common_entrypoints.h"
+
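+/* The helpers below build two small compute shaders with the NIR builder.
+ * As an informal sketch (not literal shader code), the set-event shader does
+ *
+ *    events[push.offset] = push.value;   // 1-byte store into the events SSBO
+ *
+ * and the wait-event shader spins until events[push.offset] == 1. The events
+ * SSBO (one byte per event) is bound at set 0, binding 0, and the offset and
+ * value come from push constants.
+ */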
+static nir_shader *
+get_set_event_cs()
+{
+ const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
+ nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options,
+ "set event cs");
+
+ nir_def *buf =
+ nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0),
+ .desc_set = 0,
+ .binding = 0,
+ .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
+
+ nir_def *offset =
+ nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4);
+
+ nir_def *value =
+ nir_load_push_constant(&b, 1, 8, nir_imm_int(&b, 0), .base = 4, .range = 4);
+
+ nir_store_ssbo(&b, value, buf, offset,
+ .access = 0, .write_mask = 0x1, .align_mul = 4);
+
+ return b.shader;
+}
+
+static nir_shader *
+get_wait_event_cs()
+{
+ const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
+ nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options,
+ "wait event cs");
+
+ nir_def *buf =
+ nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0),
+ .desc_set = 0,
+ .binding = 0,
+ .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
+
+ nir_def *offset =
+ nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4);
+
+ nir_loop *loop = nir_push_loop(&b);
+ nir_def *load =
+ nir_load_ssbo(&b, 1, 8, buf, offset, .access = 0, .align_mul = 4);
+ nir_def *value = nir_i2i32(&b, load);
+
+ nir_if *if_stmt = nir_push_if(&b, nir_ieq_imm(&b, value, 1));
+ nir_jump(&b, nir_jump_break);
+ nir_pop_if(&b, if_stmt);
+ nir_pop_loop(&b, loop);
+
+ return b.shader;
+}
+
+static bool
+create_event_pipelines(struct v3dv_device *device)
+{
+ VkResult result;
+
+ if (!device->events.descriptor_set_layout) {
+ /* Pipeline layout:
+ * - 1 storage buffer for the BO with the events state.
+ * - 2 push constants:
+ * 0B: offset of the event in the buffer (4 bytes).
+ * 4B: value for the event (1 byte), only used with the set_event_pipeline.
+ */
+ VkDescriptorSetLayoutBinding descriptor_set_layout_binding = {
+ .binding = 0,
+ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ .descriptorCount = 1,
+ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+ };
+
+ VkDescriptorSetLayoutCreateInfo descriptor_set_layout_info = {
+ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
+ .bindingCount = 1,
+ .pBindings = &descriptor_set_layout_binding,
+ };
+
+ result =
+ v3dv_CreateDescriptorSetLayout(v3dv_device_to_handle(device),
+ &descriptor_set_layout_info,
+ &device->vk.alloc,
+ &device->events.descriptor_set_layout);
+
+ if (result != VK_SUCCESS)
+ return false;
+ }
+
+ if (!device->events.pipeline_layout) {
+ VkPipelineLayoutCreateInfo pipeline_layout_info = {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+ .setLayoutCount = 1,
+ .pSetLayouts = &device->events.descriptor_set_layout,
+ .pushConstantRangeCount = 1,
+ .pPushConstantRanges =
+ &(VkPushConstantRange) { VK_SHADER_STAGE_COMPUTE_BIT, 0, 5 },
+ };
+
+ result =
+ v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
+ &pipeline_layout_info,
+ &device->vk.alloc,
+ &device->events.pipeline_layout);
+
+ if (result != VK_SUCCESS)
+ return false;
+ }
+
+ VkPipeline pipeline;
+
+ if (!device->events.set_event_pipeline) {
+ nir_shader *set_event_cs_nir = get_set_event_cs();
+ result = v3dv_create_compute_pipeline_from_nir(device,
+ set_event_cs_nir,
+ device->events.pipeline_layout,
+ &pipeline);
+ ralloc_free(set_event_cs_nir);
+ if (result != VK_SUCCESS)
+ return false;
+
+ device->events.set_event_pipeline = pipeline;
+ }
+
+ if (!device->events.wait_event_pipeline) {
+ nir_shader *wait_event_cs_nir = get_wait_event_cs();
+ result = v3dv_create_compute_pipeline_from_nir(device,
+ wait_event_cs_nir,
+ device->events.pipeline_layout,
+ &pipeline);
+ ralloc_free(wait_event_cs_nir);
+ if (result != VK_SUCCESS)
+ return false;
+
+ device->events.wait_event_pipeline = pipeline;
+ }
+
+ return true;
+}
+
+static void
+destroy_event_pipelines(struct v3dv_device *device)
+{
+ VkDevice _device = v3dv_device_to_handle(device);
+
+ v3dv_DestroyPipeline(_device, device->events.set_event_pipeline,
+ &device->vk.alloc);
+ device->events.set_event_pipeline = VK_NULL_HANDLE;
+
+ v3dv_DestroyPipeline(_device, device->events.wait_event_pipeline,
+ &device->vk.alloc);
+ device->events.wait_event_pipeline = VK_NULL_HANDLE;
+
+ v3dv_DestroyPipelineLayout(_device, device->events.pipeline_layout,
+ &device->vk.alloc);
+ device->events.pipeline_layout = VK_NULL_HANDLE;
+
+ v3dv_DestroyDescriptorSetLayout(_device,
+ device->events.descriptor_set_layout,
+ &device->vk.alloc);
+ device->events.descriptor_set_layout = VK_NULL_HANDLE;
+}
+
+static void
+init_event(struct v3dv_device *device, struct v3dv_event *event, uint32_t index)
+{
+ vk_object_base_init(&device->vk, &event->base, VK_OBJECT_TYPE_EVENT);
+ event->index = index;
+ list_addtail(&event->link, &device->events.free_list);
+}
+
+VkResult
+v3dv_event_allocate_resources(struct v3dv_device *device)
+{
+ VkResult result = VK_SUCCESS;
+ VkDevice _device = v3dv_device_to_handle(device);
+
+ /* BO with event states. Make sure we always align to a page size (4096)
+ * to ensure we use all the memory the kernel will allocate for the BO.
+ *
+ * CTS has tests that require over 8192 active events (yes, really) so
+ * let's make sure we allow for that.
+ */
+ const uint32_t bo_size = 3 * 4096;
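+   /* That gives 3 * 4096 = 12288 one-byte event slots. */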
+ struct v3dv_bo *bo = v3dv_bo_alloc(device, bo_size, "events", true);
+ if (!bo) {
+ result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ goto fail;
+ }
+
+ device->events.bo = bo;
+
+ if (!v3dv_bo_map(device, bo, bo_size)) {
+ result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ goto fail;
+ }
+
+ /* Pre-allocate our events, each event requires 1 byte of BO storage */
+ device->events.event_count = bo_size;
+ device->events.events =
+ vk_zalloc2(&device->vk.alloc, NULL,
+ device->events.event_count * sizeof(struct v3dv_event), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+ if (!device->events.events) {
+ result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ goto fail;
+ }
+
+ list_inithead(&device->events.free_list);
+ for (int i = 0; i < device->events.event_count; i++)
+ init_event(device, &device->events.events[i], i);
+
+ /* Vulkan buffer for the event state BO */
+ VkBufferCreateInfo buf_info = {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+ .size = bo->size,
+ .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
+ };
+ result = v3dv_CreateBuffer(_device, &buf_info, NULL,
+ &device->events.buffer);
+ if (result != VK_SUCCESS)
+ goto fail;
+
+ struct v3dv_device_memory *mem =
+ vk_object_zalloc(&device->vk, NULL, sizeof(*mem),
+ VK_OBJECT_TYPE_DEVICE_MEMORY);
+ if (!mem) {
+ result = VK_ERROR_OUT_OF_HOST_MEMORY;
+ goto fail;
+ }
+
+ mem->bo = bo;
+ mem->type = &device->pdevice->memory.memoryTypes[0];
+
+ device->events.mem = v3dv_device_memory_to_handle(mem);
+ VkBindBufferMemoryInfo bind_info = {
+ .sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO,
+ .buffer = device->events.buffer,
+ .memory = device->events.mem,
+ .memoryOffset = 0,
+ };
+ v3dv_BindBufferMemory2(_device, 1, &bind_info);
+
+ /* Pipelines */
+ if (!create_event_pipelines(device)) {
+ result = VK_ERROR_OUT_OF_HOST_MEMORY;
+ goto fail;
+ }
+
+ /* Descriptor pool & set to access the buffer */
+ VkDescriptorPoolSize pool_size = {
+ .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ .descriptorCount = 1,
+ };
+ VkDescriptorPoolCreateInfo pool_info = {
+ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
+ .flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT,
+ .maxSets = 1,
+ .poolSizeCount = 1,
+ .pPoolSizes = &pool_size,
+ };
+ result =
+ v3dv_CreateDescriptorPool(_device, &pool_info, NULL,
+ &device->events.descriptor_pool);
+
+ if (result != VK_SUCCESS)
+ goto fail;
+
+ VkDescriptorSetAllocateInfo alloc_info = {
+ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
+ .descriptorPool = device->events.descriptor_pool,
+ .descriptorSetCount = 1,
+ .pSetLayouts = &device->events.descriptor_set_layout,
+ };
+ result = v3dv_AllocateDescriptorSets(_device, &alloc_info,
+ &device->events.descriptor_set);
+ if (result != VK_SUCCESS)
+ goto fail;
+
+ VkDescriptorBufferInfo desc_buf_info = {
+ .buffer = device->events.buffer,
+ .offset = 0,
+ .range = VK_WHOLE_SIZE,
+ };
+
+ VkWriteDescriptorSet write = {
+ .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+ .dstSet = device->events.descriptor_set,
+ .dstBinding = 0,
+ .dstArrayElement = 0,
+ .descriptorCount = 1,
+ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ .pBufferInfo = &desc_buf_info,
+ };
+ v3dv_UpdateDescriptorSets(_device, 1, &write, 0, NULL);
+
+ return VK_SUCCESS;
+
+fail:
+ v3dv_event_free_resources(device);
+ return result;
+}
+
+void
+v3dv_event_free_resources(struct v3dv_device *device)
+{
+ if (device->events.bo) {
+ v3dv_bo_free(device, device->events.bo);
+ device->events.bo = NULL;
+ }
+
+ if (device->events.events) {
+ vk_free2(&device->vk.alloc, NULL, device->events.events);
+ device->events.events = NULL;
+ }
+
+ if (device->events.mem) {
+ vk_object_free(&device->vk, NULL,
+ v3dv_device_memory_from_handle(device->events.mem));
+ device->events.mem = VK_NULL_HANDLE;
+ }
+
+ v3dv_DestroyBuffer(v3dv_device_to_handle(device),
+ device->events.buffer, NULL);
+ device->events.buffer = VK_NULL_HANDLE;
+
+ v3dv_FreeDescriptorSets(v3dv_device_to_handle(device),
+ device->events.descriptor_pool,
+ 1, &device->events.descriptor_set);
+ device->events.descriptor_set = VK_NULL_HANDLE;
+
+ v3dv_DestroyDescriptorPool(v3dv_device_to_handle(device),
+ device->events.descriptor_pool,
+ NULL);
+ device->events.descriptor_pool = VK_NULL_HANDLE;
+
+ destroy_event_pipelines(device);
+}
+
+static struct v3dv_event *
+allocate_event(struct v3dv_device *device)
+{
+ mtx_lock(&device->events.lock);
+ if (list_is_empty(&device->events.free_list)) {
+ mtx_unlock(&device->events.lock);
+ return NULL;
+ }
+
+ struct v3dv_event *event =
+ list_first_entry(&device->events.free_list, struct v3dv_event, link);
+ list_del(&event->link);
+ mtx_unlock(&device->events.lock);
+
+ return event;
+}
+
+static void
+free_event(struct v3dv_device *device, uint32_t index)
+{
+ assert(index < device->events.event_count);
+ mtx_lock(&device->events.lock);
+ list_addtail(&device->events.events[index].link, &device->events.free_list);
+ mtx_unlock(&device->events.lock);
+}
+
+static void
+event_set_value(struct v3dv_device *device,
+ struct v3dv_event *event,
+ uint8_t value)
+{
+ assert(value == 0 || value == 1);
+ uint8_t *data = (uint8_t *) device->events.bo->map;
+ data[event->index] = value;
+}
+
+static uint8_t
+event_get_value(struct v3dv_device *device, struct v3dv_event *event)
+{
+ uint8_t *data = (uint8_t *) device->events.bo->map;
+ return data[event->index];
+}
+
+VKAPI_ATTR VkResult VKAPI_CALL
+v3dv_CreateEvent(VkDevice _device,
+ const VkEventCreateInfo *pCreateInfo,
+ const VkAllocationCallbacks *pAllocator,
+ VkEvent *pEvent)
+{
+ V3DV_FROM_HANDLE(v3dv_device, device, _device);
+ VkResult result = VK_SUCCESS;
+
+ struct v3dv_event *event = allocate_event(device);
+ if (!event) {
+ result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ goto fail;
+ }
+
+ event_set_value(device, event, 0);
+ *pEvent = v3dv_event_to_handle(event);
+ return VK_SUCCESS;
+
+fail:
+ return result;
+}
+
+VKAPI_ATTR void VKAPI_CALL
+v3dv_DestroyEvent(VkDevice _device,
+ VkEvent _event,
+ const VkAllocationCallbacks *pAllocator)
+{
+ V3DV_FROM_HANDLE(v3dv_device, device, _device);
+ V3DV_FROM_HANDLE(v3dv_event, event, _event);
+
+ if (!event)
+ return;
+
+ free_event(device, event->index);
+}
+
+VKAPI_ATTR VkResult VKAPI_CALL
+v3dv_GetEventStatus(VkDevice _device, VkEvent _event)
+{
+ V3DV_FROM_HANDLE(v3dv_device, device, _device);
+ V3DV_FROM_HANDLE(v3dv_event, event, _event);
+ return event_get_value(device, event) ? VK_EVENT_SET : VK_EVENT_RESET;
+}
+
+VKAPI_ATTR VkResult VKAPI_CALL
+v3dv_SetEvent(VkDevice _device, VkEvent _event)
+{
+ V3DV_FROM_HANDLE(v3dv_device, device, _device);
+ V3DV_FROM_HANDLE(v3dv_event, event, _event);
+ event_set_value(device, event, 1);
+ return VK_SUCCESS;
+}
+
+VKAPI_ATTR VkResult VKAPI_CALL
+v3dv_ResetEvent(VkDevice _device, VkEvent _event)
+{
+ V3DV_FROM_HANDLE(v3dv_device, device, _device);
+ V3DV_FROM_HANDLE(v3dv_event, event, _event);
+ event_set_value(device, event, 0);
+ return VK_SUCCESS;
+}
+
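+/* The host-side entry points above access the event byte directly through the
+ * mapped events BO. The vkCmd* entry points below instead record a small
+ * compute dispatch so that the write (or the poll, for waits) happens in
+ * submission order on the GPU.
+ */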
+static void
+cmd_buffer_emit_set_event(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_event *event,
+ uint8_t value)
+{
+ assert(value == 0 || value == 1);
+
+ struct v3dv_device *device = cmd_buffer->device;
+ VkCommandBuffer commandBuffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
+
+ v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);
+
+ v3dv_CmdBindPipeline(commandBuffer,
+ VK_PIPELINE_BIND_POINT_COMPUTE,
+ device->events.set_event_pipeline);
+
+ v3dv_CmdBindDescriptorSets(commandBuffer,
+ VK_PIPELINE_BIND_POINT_COMPUTE,
+ device->events.pipeline_layout,
+ 0, 1, &device->events.descriptor_set, 0, NULL);
+
+ assert(event->index < device->events.event_count);
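+   /* Push constants: bytes 0-3 carry the event's byte offset within the
+    * events BO and byte 4 carries the value to write, matching the 5-byte
+    * push constant range declared in create_event_pipelines().
+    */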
+ uint32_t offset = event->index;
+ v3dv_CmdPushConstants(commandBuffer,
+ device->events.pipeline_layout,
+ VK_SHADER_STAGE_COMPUTE_BIT,
+ 0, 4, &offset);
+
+ v3dv_CmdPushConstants(commandBuffer,
+ device->events.pipeline_layout,
+ VK_SHADER_STAGE_COMPUTE_BIT,
+ 4, 1, &value);
+
+ vk_common_CmdDispatch(commandBuffer, 1, 1, 1);
+
+ v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false);
+}
+
+static void
+cmd_buffer_emit_wait_event(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_event *event)
+{
+ struct v3dv_device *device = cmd_buffer->device;
+ VkCommandBuffer commandBuffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
+
+ v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);
+
+ v3dv_CmdBindPipeline(commandBuffer,
+ VK_PIPELINE_BIND_POINT_COMPUTE,
+ device->events.wait_event_pipeline);
+
+ v3dv_CmdBindDescriptorSets(commandBuffer,
+ VK_PIPELINE_BIND_POINT_COMPUTE,
+ device->events.pipeline_layout,
+ 0, 1, &device->events.descriptor_set, 0, NULL);
+
+ assert(event->index < device->events.event_count);
+ uint32_t offset = event->index;
+ v3dv_CmdPushConstants(commandBuffer,
+ device->events.pipeline_layout,
+ VK_SHADER_STAGE_COMPUTE_BIT,
+ 0, 4, &offset);
+
+ vk_common_CmdDispatch(commandBuffer, 1, 1, 1);
+
+ v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false);
+}
+
+VKAPI_ATTR void VKAPI_CALL
+v3dv_CmdSetEvent2(VkCommandBuffer commandBuffer,
+ VkEvent _event,
+ const VkDependencyInfo *pDependencyInfo)
+{
+ V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+ V3DV_FROM_HANDLE(v3dv_event, event, _event);
+
+ /* Event (re)sets can only happen outside a render pass instance so we
+ * should not be in the middle of job recording.
+ */
+ assert(cmd_buffer->state.pass == NULL);
+ assert(cmd_buffer->state.job == NULL);
+
+ /* We need to add the compute stage to the dstStageMask of all dependencies,
+ * so let's go ahead and patch the dependency info we receive.
+ */
+ struct v3dv_device *device = cmd_buffer->device;
+
+ uint32_t memory_barrier_count = pDependencyInfo->memoryBarrierCount;
+ VkMemoryBarrier2 *memory_barriers = memory_barrier_count ?
+ vk_alloc2(&device->vk.alloc, NULL,
+ memory_barrier_count * sizeof(memory_barriers[0]), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_COMMAND) : NULL;
+ for (int i = 0; i < memory_barrier_count; i++) {
+ memory_barriers[i] = pDependencyInfo->pMemoryBarriers[i];
+ memory_barriers[i].dstStageMask |= VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT;
+ }
+
+ uint32_t buffer_barrier_count = pDependencyInfo->bufferMemoryBarrierCount;
+ VkBufferMemoryBarrier2 *buffer_barriers = buffer_barrier_count ?
+ vk_alloc2(&device->vk.alloc, NULL,
+ buffer_barrier_count * sizeof(buffer_barriers[0]), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_COMMAND) : NULL;
+ for (int i = 0; i < buffer_barrier_count; i++) {
+ buffer_barriers[i] = pDependencyInfo->pBufferMemoryBarriers[i];
+ buffer_barriers[i].dstStageMask |= VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT;
+ }
+
+ uint32_t image_barrier_count = pDependencyInfo->imageMemoryBarrierCount;
+ VkImageMemoryBarrier2 *image_barriers = image_barrier_count ?
+ vk_alloc2(&device->vk.alloc, NULL,
+ image_barrier_count * sizeof(image_barriers[0]), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_COMMAND) : NULL;
+ for (int i = 0; i < image_barrier_count; i++) {
+ image_barriers[i] = pDependencyInfo->pImageMemoryBarriers[i];
+ image_barriers[i].dstStageMask |= VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT;
+ }
+
+ VkDependencyInfo info = {
+ .sType = pDependencyInfo->sType,
+ .dependencyFlags = pDependencyInfo->dependencyFlags,
+ .memoryBarrierCount = memory_barrier_count,
+ .pMemoryBarriers = memory_barriers,
+ .bufferMemoryBarrierCount = buffer_barrier_count,
+ .pBufferMemoryBarriers = buffer_barriers,
+ .imageMemoryBarrierCount = image_barrier_count,
+ .pImageMemoryBarriers = image_barriers,
+ };
+
+ v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &info);
+
+ cmd_buffer_emit_set_event(cmd_buffer, event, 1);
+
+ if (memory_barriers)
+ vk_free2(&device->vk.alloc, NULL, memory_barriers);
+ if (buffer_barriers)
+ vk_free2(&device->vk.alloc, NULL, buffer_barriers);
+ if (image_barriers)
+ vk_free2(&device->vk.alloc, NULL, image_barriers);
+}
+
+VKAPI_ATTR void VKAPI_CALL
+v3dv_CmdResetEvent2(VkCommandBuffer commandBuffer,
+ VkEvent _event,
+ VkPipelineStageFlags2 stageMask)
+{
+ V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+ V3DV_FROM_HANDLE(v3dv_event, event, _event);
+
+ /* Event (re)sets can only happen outside a render pass instance so we
+ * should not be in the middle of job recording.
+ */
+ assert(cmd_buffer->state.pass == NULL);
+ assert(cmd_buffer->state.job == NULL);
+
+ VkMemoryBarrier2 barrier = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
+ .srcStageMask = stageMask,
+ .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+ };
+ VkDependencyInfo info = {
+ .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+ .memoryBarrierCount = 1,
+ .pMemoryBarriers = &barrier,
+ };
+ v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &info);
+
+ cmd_buffer_emit_set_event(cmd_buffer, event, 0);
+}
+
+VKAPI_ATTR void VKAPI_CALL
+v3dv_CmdWaitEvents2(VkCommandBuffer commandBuffer,
+ uint32_t eventCount,
+ const VkEvent *pEvents,
+ const VkDependencyInfo *pDependencyInfo)
+{
+ V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+ for (uint32_t i = 0; i < eventCount; i++) {
+      struct v3dv_event *event = v3dv_event_from_handle(pEvents[i]);
+ cmd_buffer_emit_wait_event(cmd_buffer, event);
+ }
+
+ /* We need to add the compute stage to the srcStageMask of all dependencies,
+ * so let's go ahead and patch the dependency info we receive.
+ */
+ struct v3dv_device *device = cmd_buffer->device;
+ for (int e = 0; e < eventCount; e++) {
+ const VkDependencyInfo *info = &pDependencyInfo[e];
+
+ uint32_t memory_barrier_count = info->memoryBarrierCount;
+ VkMemoryBarrier2 *memory_barriers = memory_barrier_count ?
+ vk_alloc2(&device->vk.alloc, NULL,
+ memory_barrier_count * sizeof(memory_barriers[0]), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_COMMAND) : NULL;
+ for (int i = 0; i < memory_barrier_count; i++) {
+ memory_barriers[i] = info->pMemoryBarriers[i];
+ memory_barriers[i].srcStageMask |= VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT;
+ }
+
+ uint32_t buffer_barrier_count = info->bufferMemoryBarrierCount;
+ VkBufferMemoryBarrier2 *buffer_barriers = buffer_barrier_count ?
+ vk_alloc2(&device->vk.alloc, NULL,
+ buffer_barrier_count * sizeof(buffer_barriers[0]), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_COMMAND) : NULL;
+ for (int i = 0; i < buffer_barrier_count; i++) {
+ buffer_barriers[i] = info->pBufferMemoryBarriers[i];
+ buffer_barriers[i].srcStageMask |= VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT;
+ }
+
+ uint32_t image_barrier_count = info->imageMemoryBarrierCount;
+ VkImageMemoryBarrier2 *image_barriers = image_barrier_count ?
+ vk_alloc2(&device->vk.alloc, NULL,
+ image_barrier_count * sizeof(image_barriers[0]), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_COMMAND) : NULL;
+ for (int i = 0; i < image_barrier_count; i++) {
+ image_barriers[i] = info->pImageMemoryBarriers[i];
+ image_barriers[i].srcStageMask |= VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT;
+ }
+
+ VkDependencyInfo new_info = {
+ .sType = info->sType,
+ .dependencyFlags = info->dependencyFlags,
+ .memoryBarrierCount = memory_barrier_count,
+ .pMemoryBarriers = memory_barriers,
+ .bufferMemoryBarrierCount = buffer_barrier_count,
+ .pBufferMemoryBarriers = buffer_barriers,
+ .imageMemoryBarrierCount = image_barrier_count,
+ .pImageMemoryBarriers = image_barriers,
+ };
+
+ v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &new_info);
+
+ if (memory_barriers)
+ vk_free2(&device->vk.alloc, NULL, memory_barriers);
+ if (buffer_barriers)
+ vk_free2(&device->vk.alloc, NULL, buffer_barriers);
+ if (image_barriers)
+ vk_free2(&device->vk.alloc, NULL, image_barriers);
+ }
+}
diff --git a/src/broadcom/vulkan/v3dv_formats.c b/src/broadcom/vulkan/v3dv_formats.c
index 6e32d341a25..4d8f648d26a 100644
--- a/src/broadcom/vulkan/v3dv_formats.c
+++ b/src/broadcom/vulkan/v3dv_formats.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2019 Raspberry Pi
+ * Copyright © 2019 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -22,15 +22,20 @@
*/
#include "v3dv_private.h"
+#if DETECT_OS_ANDROID
+#include "vk_android.h"
+#endif
+#include "vk_enum_defines.h"
#include "vk_util.h"
-#include "vk_format_info.h"
#include "drm-uapi/drm_fourcc.h"
#include "util/format/u_format.h"
#include "vulkan/wsi/wsi_common.h"
+#include <vulkan/vulkan_android.h>
+
const uint8_t *
-v3dv_get_format_swizzle(struct v3dv_device *device, VkFormat f)
+v3dv_get_format_swizzle(struct v3dv_device *device, VkFormat f, uint8_t plane)
{
const struct v3dv_format *vf = v3dv_X(device, get_format)(f);
static const uint8_t fallback[] = {0, 1, 2, 3};
@@ -38,23 +43,43 @@ v3dv_get_format_swizzle(struct v3dv_device *device, VkFormat f)
if (!vf)
return fallback;
- return vf->swizzle;
+ return vf->planes[plane].swizzle;
}
-uint8_t
-v3dv_get_tex_return_size(const struct v3dv_format *vf,
- bool compare_enable)
+bool
+v3dv_format_swizzle_needs_rb_swap(const uint8_t *swizzle)
{
- if (unlikely(V3D_DEBUG & V3D_DEBUG_TMU_16BIT))
- return 16;
+ /* Normal case */
+ if (swizzle[0] == PIPE_SWIZZLE_Z)
+ return swizzle[2] == PIPE_SWIZZLE_X;
+
+ /* Format uses reverse flag */
+ if (swizzle[0] == PIPE_SWIZZLE_Y)
+ return swizzle[2] == PIPE_SWIZZLE_W;
- if (unlikely(V3D_DEBUG & V3D_DEBUG_TMU_32BIT))
- return 32;
+ return false;
+}
+
+bool
+v3dv_format_swizzle_needs_reverse(const uint8_t *swizzle)
+{
+ /* Normal case */
+ if (swizzle[0] == PIPE_SWIZZLE_W &&
+ swizzle[1] == PIPE_SWIZZLE_Z &&
+ swizzle[2] == PIPE_SWIZZLE_Y &&
+ swizzle[3] == PIPE_SWIZZLE_X) {
+ return true;
+ }
- if (compare_enable)
- return 16;
+ /* Format uses RB swap flag */
+ if (swizzle[0] == PIPE_SWIZZLE_Y &&
+ swizzle[1] == PIPE_SWIZZLE_Z &&
+ swizzle[2] == PIPE_SWIZZLE_W &&
+ swizzle[3] == PIPE_SWIZZLE_X) {
+ return true;
+ }
- return vf->return_size;
+ return false;
}
/* Some cases of transfer operations are raw data copies that don't depend
@@ -62,6 +87,9 @@ v3dv_get_tex_return_size(const struct v3dv_format *vf,
* involved). In these cases, it is safe to choose any format supported by
* the TFU so long as it has the same texel size, which allows us to use the
* TFU paths with formats that are not TFU supported otherwise.
+ *
+ * Even when copying multi-plane images, we are copying per-plane, so the
+ * compatible TFU format will be single-plane.
*/
const struct v3dv_format *
v3dv_get_compatible_tfu_format(struct v3dv_device *device,
@@ -82,20 +110,18 @@ v3dv_get_compatible_tfu_format(struct v3dv_device *device,
*out_vk_format = vk_format;
const struct v3dv_format *format = v3dv_X(device, get_format)(vk_format);
- assert(v3dv_X(device, tfu_supports_tex_format)(format->tex_type));
+ assert(format->plane_count == 1);
+ assert(v3dv_X(device, tfu_supports_tex_format)(format->planes[0].tex_type));
return format;
}
-static VkFormatFeatureFlags
-image_format_features(struct v3dv_physical_device *pdevice,
- VkFormat vk_format,
- const struct v3dv_format *v3dv_format,
- VkImageTiling tiling)
+static VkFormatFeatureFlags2
+image_format_plane_features(struct v3dv_physical_device *pdevice,
+ VkFormat vk_format,
+ const struct v3dv_format_plane *v3dv_format,
+ VkImageTiling tiling)
{
- if (!v3dv_format || !v3dv_format->supported)
- return 0;
-
const VkImageAspectFlags aspects = vk_format_aspects(vk_format);
const VkImageAspectFlags zs_aspects = VK_IMAGE_ASPECT_DEPTH_BIT |
@@ -114,7 +140,7 @@ image_format_features(struct v3dv_physical_device *pdevice,
return 0;
}
- VkFormatFeatureFlags flags = 0;
+ VkFormatFeatureFlags2 flags = 0;
/* Raster format is only supported for 1D textures, so let's just
* always require optimal tiling for anything that requires sampling.
@@ -123,55 +149,127 @@ image_format_features(struct v3dv_physical_device *pdevice,
*/
if (v3dv_format->tex_type != TEXTURE_DATA_FORMAT_NO &&
tiling == VK_IMAGE_TILING_OPTIMAL) {
- flags |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT |
- VK_FORMAT_FEATURE_BLIT_SRC_BIT;
+ flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT |
+ VK_FORMAT_FEATURE_2_BLIT_SRC_BIT;
- if (v3dv_format->supports_filtering)
- flags |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT;
}
if (v3dv_format->rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO) {
if (aspects & VK_IMAGE_ASPECT_COLOR_BIT) {
- flags |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT |
- VK_FORMAT_FEATURE_BLIT_DST_BIT;
- if (v3dv_X(pdevice, format_supports_blending)(v3dv_format))
- flags |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT;
+ flags |= VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT |
+ VK_FORMAT_FEATURE_2_BLIT_DST_BIT;
} else if (aspects & zs_aspects) {
- flags |= VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT |
- VK_FORMAT_FEATURE_BLIT_DST_BIT;
+ flags |= VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT |
+ VK_FORMAT_FEATURE_2_BLIT_DST_BIT;
}
}
const struct util_format_description *desc =
vk_format_description(vk_format);
- assert(desc);
- if (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && desc->is_array) {
- flags |= VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT;
- if (desc->nr_channels == 1 && vk_format_is_int(vk_format))
- flags |= VK_FORMAT_FEATURE_STORAGE_IMAGE_ATOMIC_BIT;
- } else if (vk_format == VK_FORMAT_A2B10G10R10_UNORM_PACK32 ||
- vk_format == VK_FORMAT_A2B10G10R10_UINT_PACK32 ||
- vk_format == VK_FORMAT_B10G11R11_UFLOAT_PACK32) {
- /* To comply with shaderStorageImageExtendedFormats */
- flags |= VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT;
+ if (tiling != VK_IMAGE_TILING_LINEAR) {
+ if (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && desc->is_array) {
+ flags |= VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT |
+ VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT;
+ if (desc->nr_channels == 1 && vk_format_is_int(vk_format))
+ flags |= VK_FORMAT_FEATURE_2_STORAGE_IMAGE_ATOMIC_BIT;
+ } else if (vk_format == VK_FORMAT_A2B10G10R10_UNORM_PACK32 ||
+ vk_format == VK_FORMAT_A2R10G10B10_UNORM_PACK32 ||
+ vk_format == VK_FORMAT_A2B10G10R10_UINT_PACK32 ||
+ vk_format == VK_FORMAT_B10G11R11_UFLOAT_PACK32) {
+ /* To comply with shaderStorageImageExtendedFormats */
+ flags |= VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT |
+ VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT;
+ }
+ }
+
+ /* All our depth formats support shadow comparisons. */
+ if (vk_format_has_depth(vk_format) &&
+ (flags & VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT)) {
+ flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_DEPTH_COMPARISON_BIT;
}
if (flags) {
- flags |= VK_FORMAT_FEATURE_TRANSFER_SRC_BIT |
- VK_FORMAT_FEATURE_TRANSFER_DST_BIT;
+ flags |= VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT |
+ VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT;
}
return flags;
}
-static VkFormatFeatureFlags
+static VkFormatFeatureFlags2
+image_format_features(struct v3dv_physical_device *pdevice,
+ VkFormat vk_format,
+ const struct v3dv_format *v3dv_format,
+ VkImageTiling tiling)
+{
+ if (!v3dv_format || !v3dv_format->plane_count)
+ return 0;
+
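+ /* Start from all features and intersect the per-plane feature sets, so a
+ * multi-plane format only reports what every plane supports.
+ */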
+ VkFormatFeatureFlags2 flags = ~0ull;
+ for (uint8_t plane = 0;
+ flags && plane < v3dv_format->plane_count;
+ plane++) {
+ VkFormat plane_format = vk_format_get_plane_format(vk_format, plane);
+
+ flags &= image_format_plane_features(pdevice,
+ plane_format,
+ &v3dv_format->planes[plane],
+ tiling);
+ }
+
+ const struct vk_format_ycbcr_info *ycbcr_info =
+ vk_format_get_ycbcr_info(vk_format);
+
+ if (ycbcr_info) {
+ assert(v3dv_format->plane_count == ycbcr_info->n_planes);
+
+ flags |= VK_FORMAT_FEATURE_2_DISJOINT_BIT;
+
+ if (flags & VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT) {
+ flags |= VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT;
+ for (unsigned p = 0; p < ycbcr_info->n_planes; p++) {
+ if (ycbcr_info->planes[p].denominator_scales[0] > 1 ||
+ ycbcr_info->planes[p].denominator_scales[1] > 1) {
+ flags |= VK_FORMAT_FEATURE_2_COSITED_CHROMA_SAMPLES_BIT;
+ break;
+ }
+ }
+ }
+
+ /* FIXME: in the future we should be able to support BLIT_SRC via the
+ * blit_shader path
+ */
+ const VkFormatFeatureFlags2 disallowed_ycbcr_image_features =
+ VK_FORMAT_FEATURE_2_BLIT_SRC_BIT |
+ VK_FORMAT_FEATURE_2_BLIT_DST_BIT |
+ VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT |
+ VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BLEND_BIT |
+ VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT;
+
+ flags &= ~disallowed_ycbcr_image_features;
+ }
+
+ if (flags & VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT &&
+ v3dv_format->supports_filtering) {
+ flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT;
+ }
+
+ if (flags & VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT &&
+ v3dv_X(pdevice, format_supports_blending)(v3dv_format)) {
+ flags |= VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BLEND_BIT;
+ }
+
+ return flags;
+}
+
+static VkFormatFeatureFlags2
buffer_format_features(VkFormat vk_format, const struct v3dv_format *v3dv_format)
{
- if (!v3dv_format || !v3dv_format->supported)
+ if (!v3dv_format)
return 0;
- if (!v3dv_format->supported)
+ if (v3dv_format->plane_count != 1)
return 0;
/* We probably only want to support buffer formats that have a
@@ -182,32 +280,39 @@ buffer_format_features(VkFormat vk_format, const struct v3dv_format *v3dv_format
const struct util_format_description *desc =
vk_format_description(vk_format);
- assert(desc);
- VkFormatFeatureFlags flags = 0;
+ VkFormatFeatureFlags2 flags = 0;
if (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
desc->is_array) {
- flags |= VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT;
- if (v3dv_format->tex_type != TEXTURE_DATA_FORMAT_NO) {
- flags |= VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT |
- VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_BIT;
+ flags |= VK_FORMAT_FEATURE_2_VERTEX_BUFFER_BIT;
+ if (v3dv_format->planes[0].tex_type != TEXTURE_DATA_FORMAT_NO) {
+ /* STORAGE_READ_WITHOUT_FORMAT can also be applied for buffers. From spec:
+ * "VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT specifies
+ * that image views or buffer views created with this format can
+ * be used as storage images for read operations without
+ * specifying a format."
+ */
+ flags |= VK_FORMAT_FEATURE_2_UNIFORM_TEXEL_BUFFER_BIT |
+ VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_BIT |
+ VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT;
}
- } else if (vk_format == VK_FORMAT_A2B10G10R10_UNORM_PACK32) {
- flags |= VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT |
- VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT |
- VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_BIT;
+ } else if (vk_format == VK_FORMAT_A2B10G10R10_UNORM_PACK32 ||
+ vk_format == VK_FORMAT_A2R10G10B10_UNORM_PACK32) {
+ flags |= VK_FORMAT_FEATURE_2_VERTEX_BUFFER_BIT |
+ VK_FORMAT_FEATURE_2_UNIFORM_TEXEL_BUFFER_BIT |
+ VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_BIT;
} else if (vk_format == VK_FORMAT_A2B10G10R10_UINT_PACK32 ||
vk_format == VK_FORMAT_B10G11R11_UFLOAT_PACK32) {
- flags |= VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT |
- VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_BIT;
+ flags |= VK_FORMAT_FEATURE_2_UNIFORM_TEXEL_BUFFER_BIT |
+ VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_BIT;
}
if (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
desc->is_array &&
desc->nr_channels == 1 &&
vk_format_is_int(vk_format)) {
- flags |= VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_ATOMIC_BIT;
+ flags |= VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_ATOMIC_BIT;
}
return flags;
@@ -216,48 +321,44 @@ buffer_format_features(VkFormat vk_format, const struct v3dv_format *v3dv_format
bool
v3dv_buffer_format_supports_features(struct v3dv_device *device,
VkFormat vk_format,
- VkFormatFeatureFlags features)
+ VkFormatFeatureFlags2 features)
{
const struct v3dv_format *v3dv_format = v3dv_X(device, get_format)(vk_format);
- const VkFormatFeatureFlags supported =
+ const VkFormatFeatureFlags2 supported =
buffer_format_features(vk_format, v3dv_format);
return (supported & features) == features;
}
VKAPI_ATTR void VKAPI_CALL
-v3dv_GetPhysicalDeviceFormatProperties(VkPhysicalDevice physicalDevice,
- VkFormat format,
- VkFormatProperties* pFormatProperties)
+v3dv_GetPhysicalDeviceFormatProperties2(VkPhysicalDevice physicalDevice,
+ VkFormat format,
+ VkFormatProperties2 *pFormatProperties)
{
V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physicalDevice);
const struct v3dv_format *v3dv_format = v3dv_X(pdevice, get_format)(format);
- *pFormatProperties = (VkFormatProperties) {
- .linearTilingFeatures =
- image_format_features(pdevice, format, v3dv_format, VK_IMAGE_TILING_LINEAR),
- .optimalTilingFeatures =
- image_format_features(pdevice, format, v3dv_format, VK_IMAGE_TILING_OPTIMAL),
- .bufferFeatures =
- buffer_format_features(format, v3dv_format),
+ VkFormatFeatureFlags2 linear2, optimal2, buffer2;
+ linear2 = image_format_features(pdevice, format, v3dv_format,
+ VK_IMAGE_TILING_LINEAR);
+ optimal2 = image_format_features(pdevice, format, v3dv_format,
+ VK_IMAGE_TILING_OPTIMAL);
+ buffer2 = buffer_format_features(format, v3dv_format);
+ pFormatProperties->formatProperties = (VkFormatProperties) {
+ .linearTilingFeatures = vk_format_features2_to_features(linear2),
+ .optimalTilingFeatures = vk_format_features2_to_features(optimal2),
+ .bufferFeatures = vk_format_features2_to_features(buffer2),
};
-}
-
-VKAPI_ATTR void VKAPI_CALL
-v3dv_GetPhysicalDeviceFormatProperties2(VkPhysicalDevice physicalDevice,
- VkFormat format,
- VkFormatProperties2 *pFormatProperties)
-{
- v3dv_GetPhysicalDeviceFormatProperties(physicalDevice, format,
- &pFormatProperties->formatProperties);
vk_foreach_struct(ext, pFormatProperties->pNext) {
switch ((unsigned)ext->sType) {
case VK_STRUCTURE_TYPE_DRM_FORMAT_MODIFIER_PROPERTIES_LIST_EXT: {
struct VkDrmFormatModifierPropertiesListEXT *list = (void *)ext;
- VK_OUTARRAY_MAKE(out, list->pDrmFormatModifierProperties,
- &list->drmFormatModifierCount);
+ VK_OUTARRAY_MAKE_TYPED(VkDrmFormatModifierPropertiesEXT, out,
+ list->pDrmFormatModifierProperties,
+ &list->drmFormatModifierCount);
if (pFormatProperties->formatProperties.linearTilingFeatures) {
- vk_outarray_append(&out, mod_props) {
+ vk_outarray_append_typed(VkDrmFormatModifierPropertiesEXT,
+ &out, mod_props) {
mod_props->drmFormatModifier = DRM_FORMAT_MOD_LINEAR;
mod_props->drmFormatModifierPlaneCount = 1;
mod_props->drmFormatModifierTilingFeatures =
@@ -265,7 +366,8 @@ v3dv_GetPhysicalDeviceFormatProperties2(VkPhysicalDevice physicalDevice,
}
}
if (pFormatProperties->formatProperties.optimalTilingFeatures) {
- vk_outarray_append(&out, mod_props) {
+ vk_outarray_append_typed(VkDrmFormatModifierPropertiesEXT,
+ &out, mod_props) {
mod_props->drmFormatModifier = DRM_FORMAT_MOD_BROADCOM_UIF;
mod_props->drmFormatModifierPlaneCount = 1;
mod_props->drmFormatModifierTilingFeatures =
@@ -274,6 +376,36 @@ v3dv_GetPhysicalDeviceFormatProperties2(VkPhysicalDevice physicalDevice,
}
break;
}
+ case VK_STRUCTURE_TYPE_DRM_FORMAT_MODIFIER_PROPERTIES_LIST_2_EXT: {
+ struct VkDrmFormatModifierPropertiesList2EXT *list = (void *)ext;
+ VK_OUTARRAY_MAKE_TYPED(VkDrmFormatModifierProperties2EXT, out,
+ list->pDrmFormatModifierProperties,
+ &list->drmFormatModifierCount);
+ if (linear2) {
+ vk_outarray_append_typed(VkDrmFormatModifierProperties2EXT,
+ &out, mod_props) {
+ mod_props->drmFormatModifier = DRM_FORMAT_MOD_LINEAR;
+ mod_props->drmFormatModifierPlaneCount = 1;
+ mod_props->drmFormatModifierTilingFeatures = linear2;
+ }
+ }
+ if (optimal2) {
+ vk_outarray_append_typed(VkDrmFormatModifierProperties2EXT,
+ &out, mod_props) {
+ mod_props->drmFormatModifier = DRM_FORMAT_MOD_BROADCOM_UIF;
+ mod_props->drmFormatModifierPlaneCount = 1;
+ mod_props->drmFormatModifierTilingFeatures = optimal2;
+ }
+ }
+ break;
+ }
+ case VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_3: {
+ VkFormatProperties3 *props = (VkFormatProperties3 *)ext;
+ props->linearTilingFeatures = linear2;
+ props->optimalTilingFeatures = optimal2;
+ props->bufferFeatures = buffer2;
+ break;
+ }
default:
v3dv_debug_ignored_stype(ext->sType);
break;
@@ -290,7 +422,7 @@ get_image_format_properties(
VkSamplerYcbcrConversionImageFormatProperties *pYcbcrImageFormatProperties)
{
const struct v3dv_format *v3dv_format = v3dv_X(physical_device, get_format)(info->format);
- VkFormatFeatureFlags format_feature_flags =
+ VkFormatFeatureFlags2 format_feature_flags =
image_format_features(physical_device, info->format, v3dv_format, tiling);
if (!format_feature_flags)
goto unsupported;
@@ -307,8 +439,24 @@ get_image_format_properties(
if (info->flags & VK_IMAGE_CREATE_BLOCK_TEXEL_VIEW_COMPATIBLE_BIT)
goto unsupported;
- if (info->usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT) {
- if (!(format_feature_flags & VK_FORMAT_FEATURE_TRANSFER_SRC_BIT)) {
+ const VkImageStencilUsageCreateInfo *stencil_usage_info =
+ vk_find_struct_const(info->pNext, IMAGE_STENCIL_USAGE_CREATE_INFO);
+
+ VkImageUsageFlags image_usage =
+ info->usage | (stencil_usage_info ? stencil_usage_info->stencilUsage : 0);
+
+ /* If VK_IMAGE_CREATE_EXTENDED_USAGE_BIT is set it means the usage flags may
+ * not be supported for the image format but are supported for at least
+ * one compatible format from which an image view can be created for the
+ * image. This means we should not report the format as unsupported based
+ * on the usage flags when usage refers to how an image view may be used
+ * (i.e. as a framebuffer attachment, for sampling, etc).
+ */
+ VkImageUsageFlags view_usage =
+ info->flags & VK_IMAGE_CREATE_EXTENDED_USAGE_BIT ? 0 : image_usage;
+
+ if (image_usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT) {
+ if (!(format_feature_flags & VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT)) {
goto unsupported;
}
@@ -323,16 +471,16 @@ get_image_format_properties(
}
}
- if (info->usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT) {
- if (!(format_feature_flags & VK_FORMAT_FEATURE_TRANSFER_DST_BIT)) {
+ if (image_usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT) {
+ if (!(format_feature_flags & VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT)) {
goto unsupported;
}
}
- if (info->usage & VK_IMAGE_USAGE_SAMPLED_BIT) {
- if (!(format_feature_flags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT)) {
+ if (view_usage & (VK_IMAGE_USAGE_SAMPLED_BIT |
+ VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)) {
+ if (!(format_feature_flags & VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT))
goto unsupported;
- }
/* Sampling of raster depth/stencil images is not supported. Since 1D
* images are always raster, even if the user requested optimal tiling,
@@ -344,50 +492,47 @@ get_image_format_properties(
}
}
- if (info->usage & VK_IMAGE_USAGE_STORAGE_BIT) {
- if (!(format_feature_flags & VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT)) {
+ if (view_usage & VK_IMAGE_USAGE_STORAGE_BIT) {
+ if (!(format_feature_flags & VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT)) {
goto unsupported;
}
}
- if (info->usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) {
- if (!(format_feature_flags & VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT)) {
+ if (view_usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) {
+ if (!(format_feature_flags & VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT)) {
goto unsupported;
}
}
- if (info->usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) {
+ if (view_usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) {
if (!(format_feature_flags &
- VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT)) {
+ VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT)) {
goto unsupported;
}
}
- /* FIXME: these are taken from VkPhysicalDeviceLimits, we should just put
- * these limits available in the physical device and read them from there
- * wherever we need them.
- */
switch (info->type) {
case VK_IMAGE_TYPE_1D:
- pImageFormatProperties->maxExtent.width = 4096;
+ pImageFormatProperties->maxExtent.width = V3D_MAX_IMAGE_DIMENSION;
pImageFormatProperties->maxExtent.height = 1;
pImageFormatProperties->maxExtent.depth = 1;
- pImageFormatProperties->maxArrayLayers = 2048;
- pImageFormatProperties->maxMipLevels = 13; /* log2(maxWidth) + 1 */
+ pImageFormatProperties->maxArrayLayers = V3D_MAX_ARRAY_LAYERS;
+ pImageFormatProperties->maxMipLevels = V3D_MAX_MIP_LEVELS;
break;
case VK_IMAGE_TYPE_2D:
- pImageFormatProperties->maxExtent.width = 4096;
- pImageFormatProperties->maxExtent.height = 4096;
+ pImageFormatProperties->maxExtent.width = V3D_MAX_IMAGE_DIMENSION;
+ pImageFormatProperties->maxExtent.height = V3D_MAX_IMAGE_DIMENSION;
pImageFormatProperties->maxExtent.depth = 1;
- pImageFormatProperties->maxArrayLayers = 2048;
- pImageFormatProperties->maxMipLevels = 13; /* log2(maxWidth) + 1 */
+ pImageFormatProperties->maxArrayLayers =
+ v3dv_format->plane_count == 1 ? V3D_MAX_ARRAY_LAYERS : 1;
+ pImageFormatProperties->maxMipLevels = V3D_MAX_MIP_LEVELS;
break;
case VK_IMAGE_TYPE_3D:
- pImageFormatProperties->maxExtent.width = 4096;
- pImageFormatProperties->maxExtent.height = 4096;
- pImageFormatProperties->maxExtent.depth = 4096;
+ pImageFormatProperties->maxExtent.width = V3D_MAX_IMAGE_DIMENSION;
+ pImageFormatProperties->maxExtent.height = V3D_MAX_IMAGE_DIMENSION;
+ pImageFormatProperties->maxExtent.depth = V3D_MAX_IMAGE_DIMENSION;
pImageFormatProperties->maxArrayLayers = 1;
- pImageFormatProperties->maxMipLevels = 13; /* log2(maxWidth) + 1 */
+ pImageFormatProperties->maxMipLevels = V3D_MAX_MIP_LEVELS;
break;
default:
unreachable("bad VkImageType");
@@ -416,16 +561,50 @@ get_image_format_properties(
if (tiling != VK_IMAGE_TILING_LINEAR &&
info->type == VK_IMAGE_TYPE_2D &&
!(info->flags & VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT) &&
- (format_feature_flags & VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT ||
- format_feature_flags & VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT)) {
+ (format_feature_flags & VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT ||
+ format_feature_flags & VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT)) {
pImageFormatProperties->sampleCounts |= VK_SAMPLE_COUNT_4_BIT;
}
if (tiling == VK_IMAGE_TILING_LINEAR)
pImageFormatProperties->maxMipLevels = 1;
+ /* From the Vulkan 1.2 spec, section 12.3. Images, VkImageCreateInfo structure:
+ *
+ * "Images created with one of the formats that require a sampler Y′CBCR
+ * conversion, have further restrictions on their limits and
+ * capabilities compared to images created with other formats. Creation
+ * of images with a format requiring Y′CBCR conversion may not be
+ * supported unless other parameters meet all of the constraints:
+ *
+ * * imageType is VK_IMAGE_TYPE_2D
+ * * mipLevels is 1
+ * * arrayLayers is 1, unless the ycbcrImageArrays feature is enabled, or
+ * otherwise indicated by VkImageFormatProperties::maxArrayLayers, as
+ * returned by vkGetPhysicalDeviceImageFormatProperties
+ * * samples is VK_SAMPLE_COUNT_1_BIT
+ *
+ * Implementations may support additional limits and capabilities beyond
+ * those listed above."
+ *
+ * We don't provide any capabilities beyond those, so we either apply the
+ * limits above or just return unsupported.
+ */
+ if (vk_format_get_plane_count(info->format) > 1) {
+ if (info->type != VK_IMAGE_TYPE_2D)
+ goto unsupported;
+ pImageFormatProperties->maxMipLevels = 1;
+ pImageFormatProperties->maxArrayLayers = 1;
+ pImageFormatProperties->sampleCounts = VK_SAMPLE_COUNT_1_BIT;
+ }
+
pImageFormatProperties->maxResourceSize = 0xffffffff; /* 32-bit allocation */
+ if (pYcbcrImageFormatProperties) {
+ pYcbcrImageFormatProperties->combinedImageSamplerDescriptorCount =
+ vk_format_get_plane_count(info->format);
+ }
+
return VK_SUCCESS;
unsupported:
@@ -486,6 +665,8 @@ v3dv_GetPhysicalDeviceImageFormatProperties2(VkPhysicalDevice physicalDevice,
const VkPhysicalDeviceExternalImageFormatInfo *external_info = NULL;
const VkPhysicalDeviceImageDrmFormatModifierInfoEXT *drm_format_mod_info = NULL;
VkExternalImageFormatProperties *external_props = NULL;
+ UNUSED VkAndroidHardwareBufferUsageANDROID *android_usage = NULL;
+ VkSamplerYcbcrConversionImageFormatProperties *ycbcr_props = NULL;
VkImageTiling tiling = base_info->tiling;
/* Extract input structs */
@@ -494,6 +675,9 @@ v3dv_GetPhysicalDeviceImageFormatProperties2(VkPhysicalDevice physicalDevice,
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO:
external_info = (const void *) s;
break;
+ case VK_STRUCTURE_TYPE_IMAGE_STENCIL_USAGE_CREATE_INFO:
+ /* Do nothing, get_image_format_properties() below will handle it */;
+ break;
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT:
drm_format_mod_info = (const void *) s;
switch (drm_format_mod_info->drmFormatModifier) {
@@ -522,6 +706,12 @@ v3dv_GetPhysicalDeviceImageFormatProperties2(VkPhysicalDevice physicalDevice,
case VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES:
external_props = (void *) s;
break;
+ case VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_USAGE_ANDROID:
+ android_usage = (void *)s;
+ break;
+ case VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_IMAGE_FORMAT_PROPERTIES:
+ ycbcr_props = (void *) s;
+ break;
default:
v3dv_debug_ignored_stype(s->sType);
break;
@@ -530,7 +720,8 @@ v3dv_GetPhysicalDeviceImageFormatProperties2(VkPhysicalDevice physicalDevice,
VkResult result =
get_image_format_properties(physical_device, base_info, tiling,
- &base_props->imageFormatProperties, NULL);
+ &base_props->imageFormatProperties,
+ ycbcr_props);
if (result != VK_SUCCESS)
goto done;
@@ -541,12 +732,28 @@ v3dv_GetPhysicalDeviceImageFormatProperties2(VkPhysicalDevice physicalDevice,
if (external_props)
external_props->externalMemoryProperties = prime_fd_props;
break;
+#if DETECT_OS_ANDROID
+ case VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID:
+ if (external_props) {
+ external_props->externalMemoryProperties.exportFromImportedHandleTypes = 0;
+ external_props->externalMemoryProperties.compatibleHandleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID;
+ external_props->externalMemoryProperties.externalMemoryFeatures = VK_EXTERNAL_MEMORY_FEATURE_DEDICATED_ONLY_BIT | VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT | VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT;
+ }
+ break;
+#endif
default:
result = VK_ERROR_FORMAT_NOT_SUPPORTED;
break;
}
}
+ if (android_usage) {
+#if DETECT_OS_ANDROID
+ android_usage->androidHardwareBufferUsage =
+ vk_image_usage_to_ahb_usage(base_info->flags, base_info->usage);
+#endif
+ }
+
done:
return result;
}
diff --git a/src/broadcom/vulkan/v3dv_image.c b/src/broadcom/vulkan/v3dv_image.c
index c7ae05c4c22..358c03c555f 100644
--- a/src/broadcom/vulkan/v3dv_image.c
+++ b/src/broadcom/vulkan/v3dv_image.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2019 Raspberry Pi
+ * Copyright © 2019 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -26,9 +26,11 @@
#include "drm-uapi/drm_fourcc.h"
#include "util/format/u_format.h"
#include "util/u_math.h"
-#include "vk_format_info.h"
#include "vk_util.h"
#include "vulkan/wsi/wsi_common.h"
+#if DETECT_OS_ANDROID
+#include "vk_android.h"
+#endif
/**
* Computes the HW's UIFblock padding for a given height/cpp.
@@ -71,32 +73,61 @@ v3d_get_ub_pad(uint32_t cpp, uint32_t height)
return 0;
}
-static void
-v3d_setup_slices(struct v3dv_image *image)
+/**
+ * Computes the dimension with required padding for mip levels.
+ *
+ * This padding is required for width and height dimensions when the mip
+ * level is greater than 1, and for the depth dimension when the mip level
+ * is greater than 0. This function expects to be passed a mip level >= 1.
+ *
+ * Note: Hardware documentation seems to suggest that the third argument
+ * should be the utile dimensions, but through testing it was found that
+ * the block dimension should be used instead.
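+ *
+ * For example, with dimension = 9, level = 2 and block_dimension = 1:
+ * level 1 minifies to 4, which is already a power of two, so padded_dim
+ * is 4 and the function returns u_minify(4, 1) = 2.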
+ */
+static uint32_t
+v3d_get_dimension_mpad(uint32_t dimension, uint32_t level, uint32_t block_dimension)
{
- assert(image->cpp > 0);
+ assert(level >= 1);
+ uint32_t pot_dim = u_minify(dimension, 1);
+ pot_dim = util_next_power_of_two(DIV_ROUND_UP(pot_dim, block_dimension));
+ uint32_t padded_dim = block_dimension * pot_dim;
+ return u_minify(padded_dim, level - 1);
+}
- uint32_t width = image->vk.extent.width;
- uint32_t height = image->vk.extent.height;
- uint32_t depth = image->vk.extent.depth;
+static bool
+v3d_setup_plane_slices(struct v3dv_image *image, uint8_t plane,
+ uint32_t plane_offset,
+ const VkSubresourceLayout *plane_layouts)
+{
+ assert(image->planes[plane].cpp > 0);
- /* Note that power-of-two padding is based on level 1. These are not
- * equivalent to just util_next_power_of_two(dimension), because at a
- * level 0 dimension of 9, the level 1 power-of-two padded value is 4,
- * not 8.
- */
- uint32_t pot_width = 2 * util_next_power_of_two(u_minify(width, 1));
- uint32_t pot_height = 2 * util_next_power_of_two(u_minify(height, 1));
- uint32_t pot_depth = 2 * util_next_power_of_two(u_minify(depth, 1));
+ uint32_t width = image->planes[plane].width;
+ uint32_t height = image->planes[plane].height;
+ uint32_t depth = image->vk.extent.depth;
- uint32_t utile_w = v3d_utile_width(image->cpp);
- uint32_t utile_h = v3d_utile_height(image->cpp);
+ uint32_t utile_w = v3d_utile_width(image->planes[plane].cpp);
+ uint32_t utile_h = v3d_utile_height(image->planes[plane].cpp);
uint32_t uif_block_w = utile_w * 2;
uint32_t uif_block_h = utile_h * 2;
uint32_t block_width = vk_format_get_blockwidth(image->vk.format);
uint32_t block_height = vk_format_get_blockheight(image->vk.format);
+ /* Note that power-of-two padding is based on level 1. These are not
+ * equivalent to just util_next_power_of_two(dimension), because at a
+ * level 0 dimension of 9, the level 1 power-of-two padded value is 4,
+ * not 8. Additionally the pot padding is based on the block size.
+ */
+ uint32_t pot_width = 2 * v3d_get_dimension_mpad(width,
+ 1,
+ block_width);
+ uint32_t pot_height = 2 * v3d_get_dimension_mpad(height,
+ 1,
+ block_height);
+ uint32_t pot_depth = 2 * v3d_get_dimension_mpad(depth,
+ 1,
+ 1);
+
assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT ||
image->vk.samples == VK_SAMPLE_COUNT_4_BIT);
bool msaa = image->vk.samples != VK_SAMPLE_COUNT_1_BIT;
@@ -107,14 +138,30 @@ v3d_setup_slices(struct v3dv_image *image)
assert(depth > 0);
assert(image->vk.mip_levels >= 1);
- uint32_t offset = 0;
+ /* Texture Base Address needs to be 64-byte aligned. If we have an explicit
+ * plane layout we will return false to fail image creation with an
+ * appropriate error code.
+ */
+ uint32_t offset;
+ if (plane_layouts) {
+ offset = plane_layouts[plane].offset;
+ if (offset % 64 != 0)
+ return false;
+ } else {
+ offset = plane_offset;
+ }
+ assert(plane_offset % 64 == 0);
+
for (int32_t i = image->vk.mip_levels - 1; i >= 0; i--) {
- struct v3d_resource_slice *slice = &image->slices[i];
+ struct v3d_resource_slice *slice = &image->planes[plane].slices[i];
+
+ slice->width = u_minify(width, i);
+ slice->height = u_minify(height, i);
uint32_t level_width, level_height, level_depth;
if (i < 2) {
- level_width = u_minify(width, i);
- level_height = u_minify(height, i);
+ level_width = slice->width;
+ level_height = slice->height;
} else {
level_width = u_minify(pot_width, i);
level_height = u_minify(pot_height, i);
@@ -136,7 +183,7 @@ v3d_setup_slices(struct v3dv_image *image)
if (!image->tiled) {
slice->tiling = V3D_TILING_RASTER;
if (image->vk.image_type == VK_IMAGE_TYPE_1D)
- level_width = align(level_width, 64 / image->cpp);
+ level_width = align(level_width, 64 / image->planes[plane].cpp);
} else {
if ((i != 0 || !uif_top) &&
(level_width <= utile_w || level_height <= utile_h)) {
@@ -158,7 +205,8 @@ v3d_setup_slices(struct v3dv_image *image)
level_width = align(level_width, 4 * uif_block_w);
level_height = align(level_height, uif_block_h);
- slice->ub_pad = v3d_get_ub_pad(image->cpp, level_height);
+ slice->ub_pad = v3d_get_ub_pad(image->planes[plane].cpp,
+ level_height);
level_height += slice->ub_pad * uif_block_h;
/* If the padding set us to be aligned to the page cache size,
@@ -175,12 +223,25 @@ v3d_setup_slices(struct v3dv_image *image)
}
slice->offset = offset;
- slice->stride = level_width * image->cpp;
+ slice->stride = level_width * image->planes[plane].cpp;
+
+ /* We assume that rowPitch in the plane layout refers to level 0 */
+ if (plane_layouts && i == 0) {
+ if (plane_layouts[plane].rowPitch < slice->stride)
+ return false;
+ if (plane_layouts[plane].rowPitch % image->planes[plane].cpp)
+ return false;
+ if (image->tiled && (plane_layouts[plane].rowPitch % (4 * uif_block_w)))
+ return false;
+ slice->stride = plane_layouts[plane].rowPitch;
+ }
+
slice->padded_height = level_height;
if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
slice->tiling == V3D_TILING_UIF_XOR) {
slice->padded_height_of_output_image_in_uif_blocks =
- slice->padded_height / (2 * v3d_utile_height(image->cpp));
+ slice->padded_height /
+ (2 * v3d_utile_height(image->planes[plane].cpp));
}
slice->size = level_height * slice->stride;
@@ -188,7 +249,7 @@ v3d_setup_slices(struct v3dv_image *image)
/* The HW aligns level 1's base to a page if any of level 1 or
* below could be UIF XOR. The lower levels then inherit the
- * alignment for as long as necesary, thanks to being power of
+ * alignment for as long as necessary, thanks to being power of
* two aligned.
*/
if (i == 1 &&
@@ -200,7 +261,7 @@ v3d_setup_slices(struct v3dv_image *image)
offset += slice_total_size;
}
- image->size = offset;
+ image->planes[plane].size = offset - plane_offset;
/* UIF/UBLINEAR levels need to be aligned to UIF-blocks, and LT only
* needs to be aligned to utile boundaries. Since tiles are laid out
@@ -209,14 +270,27 @@ v3d_setup_slices(struct v3dv_image *image)
* slices.
*
* We additionally align to 4k, which improves UIF XOR performance.
+ *
+ * Finally, because the Texture Base Address field must be 64-byte aligned,
+ * we also need to align linear images to 64 if the image is going to be
+ * used for transfer.
*/
- image->alignment = image->tiled ? 4096 : image->cpp;
+ if (image->tiled) {
+ image->planes[plane].alignment = 4096;
+ } else {
+ image->planes[plane].alignment =
+ (image->vk.usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT) ?
+ 64 : image->planes[plane].cpp;
+ }
+
uint32_t align_offset =
- align(image->slices[0].offset, image->alignment) - image->slices[0].offset;
+ align(image->planes[plane].slices[0].offset,
+ image->planes[plane].alignment) -
+ image->planes[plane].slices[0].offset;
if (align_offset) {
- image->size += align_offset;
+ image->planes[plane].size += align_offset;
for (int i = 0; i < image->vk.mip_levels; i++)
- image->slices[i].offset += align_offset;
+ image->planes[plane].slices[i].offset += align_offset;
}
/* Arrays and cube textures have a stride which is the distance from
@@ -224,41 +298,112 @@ v3d_setup_slices(struct v3dv_image *image)
* we need to program the stride between slices of miplevel 0.
*/
if (image->vk.image_type != VK_IMAGE_TYPE_3D) {
- image->cube_map_stride =
- align(image->slices[0].offset + image->slices[0].size, 64);
- image->size += image->cube_map_stride * (image->vk.array_layers - 1);
+ image->planes[plane].cube_map_stride =
+ align(image->planes[plane].slices[0].offset +
+ image->planes[plane].slices[0].size, 64);
+
+ if (plane_layouts && image->vk.array_layers > 1) {
+ if (plane_layouts[plane].arrayPitch % 64 != 0)
+ return false;
+ if (plane_layouts[plane].arrayPitch <
+ image->planes[plane].cube_map_stride) {
+ return false;
+ }
+ image->planes[plane].cube_map_stride = plane_layouts[plane].arrayPitch;
+ }
+
+ image->planes[plane].size += image->planes[plane].cube_map_stride *
+ (image->vk.array_layers - 1);
} else {
- image->cube_map_stride = image->slices[0].size;
+ image->planes[plane].cube_map_stride = image->planes[plane].slices[0].size;
+ if (plane_layouts) {
+ /* We assume that depthPitch in the plane layout refers to level 0 */
+ if (plane_layouts[plane].depthPitch !=
+ image->planes[plane].slices[0].size) {
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+static VkResult
+v3d_setup_slices(struct v3dv_image *image, bool disjoint,
+ const VkSubresourceLayout *plane_layouts)
+{
+ if (disjoint && image->plane_count == 1)
+ disjoint = false;
+
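+ /* Disjoint images bind each plane to its own memory, so plane offsets
+ * restart at 0; otherwise planes are laid out consecutively with 64-byte
+ * alignment between them.
+ */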
+ uint64_t offset = 0;
+ for (uint8_t plane = 0; plane < image->plane_count; plane++) {
+ offset = disjoint ? 0 : offset;
+ if (!v3d_setup_plane_slices(image, plane, offset, plane_layouts)) {
+ assert(plane_layouts);
+ return VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT;
+ }
+ offset += align64(image->planes[plane].size, 64);
}
+
+ /* From the Vulkan spec:
+ *
+ * "If the size of the resultant image would exceed maxResourceSize, then
+ * vkCreateImage must fail and return VK_ERROR_OUT_OF_DEVICE_MEMORY. This
+ * failure may occur even when all image creation parameters satisfy their
+ * valid usage requirements."
+ */
+ if (offset > 0xffffffff)
+ return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+
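+ /* Disjoint images track sizes per plane, so there is no single
+ * non-disjoint size to record.
+ */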
+ image->non_disjoint_size = disjoint ? 0 : offset;
+ return VK_SUCCESS;
}
uint32_t
-v3dv_layer_offset(const struct v3dv_image *image, uint32_t level, uint32_t layer)
+v3dv_layer_offset(const struct v3dv_image *image, uint32_t level, uint32_t layer,
+ uint8_t plane)
{
- const struct v3d_resource_slice *slice = &image->slices[level];
+ const struct v3d_resource_slice *slice = &image->planes[plane].slices[level];
if (image->vk.image_type == VK_IMAGE_TYPE_3D)
- return image->mem_offset + slice->offset + layer * slice->size;
+ return image->planes[plane].mem_offset + slice->offset + layer * slice->size;
else
- return image->mem_offset + slice->offset + layer * image->cube_map_stride;
+ return image->planes[plane].mem_offset + slice->offset +
+ layer * image->planes[plane].cube_map_stride;
}
-static VkResult
-create_image(struct v3dv_device *device,
- const VkImageCreateInfo *pCreateInfo,
- const VkAllocationCallbacks *pAllocator,
- VkImage *pImage)
+VkResult
+v3dv_update_image_layout(struct v3dv_device *device,
+ struct v3dv_image *image,
+ uint64_t modifier,
+ bool disjoint,
+ const VkImageDrmFormatModifierExplicitCreateInfoEXT *explicit_mod_info)
{
- struct v3dv_image *image = NULL;
+ assert(!explicit_mod_info ||
+ image->plane_count == explicit_mod_info->drmFormatModifierPlaneCount);
- image = vk_image_create(&device->vk, pCreateInfo, pAllocator, sizeof(*image));
- if (image == NULL)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ assert(!explicit_mod_info ||
+ modifier == explicit_mod_info->drmFormatModifier);
+
+ image->tiled = modifier != DRM_FORMAT_MOD_LINEAR;
+
+ image->vk.drm_format_mod = modifier;
+
+ return v3d_setup_slices(image, disjoint,
+ explicit_mod_info ? explicit_mod_info->pPlaneLayouts :
+ NULL);
+}
+VkResult
+v3dv_image_init(struct v3dv_device *device,
+ const VkImageCreateInfo *pCreateInfo,
+ const VkAllocationCallbacks *pAllocator,
+ struct v3dv_image *image)
+{
/* When using the simulator the WSI common code will see that our
* driver wsi device doesn't match the display device and because of that
* it will not attempt to present directly from the swapchain images,
- * instead it will use the prime blit path (use_prime_blit flag in
+ * instead it will use the prime blit path (use_buffer_blit flag in
* struct wsi_swapchain), where it copies the contents of the swapchain
* images to a linear buffer with appropriate row stride for presentation.
* As a result, on that path, swapchain images do not have any special
@@ -266,11 +411,20 @@ create_image(struct v3dv_device *device,
*/
VkImageTiling tiling = pCreateInfo->tiling;
uint64_t modifier = DRM_FORMAT_MOD_INVALID;
+ const VkImageDrmFormatModifierListCreateInfoEXT *mod_info = NULL;
+ const VkImageDrmFormatModifierExplicitCreateInfoEXT *explicit_mod_info = NULL;
+#if DETECT_OS_ANDROID
+ if (image->is_native_buffer_memory) {
+ assert(image->android_explicit_layout);
+ explicit_mod_info = image->android_explicit_layout;
+ modifier = explicit_mod_info->drmFormatModifier;
+ }
+#endif
if (tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) {
- const VkImageDrmFormatModifierListCreateInfoEXT *mod_info =
+ mod_info =
vk_find_struct_const(pCreateInfo->pNext,
IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT);
- const VkImageDrmFormatModifierExplicitCreateInfoEXT *explicit_mod_info =
+ explicit_mod_info =
vk_find_struct_const(pCreateInfo->pNext,
IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT);
assert(mod_info || explicit_mod_info);
@@ -297,21 +451,42 @@ create_image(struct v3dv_device *device,
tiling = VK_IMAGE_TILING_LINEAR;
}
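+ /* If no modifier was specified, derive one from the requested tiling. */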
+ if (modifier == DRM_FORMAT_MOD_INVALID)
+ modifier = (tiling == VK_IMAGE_TILING_OPTIMAL) ? DRM_FORMAT_MOD_BROADCOM_UIF
+ : DRM_FORMAT_MOD_LINEAR;
+
const struct v3dv_format *format =
- v3dv_X(device, get_format)(pCreateInfo->format);
- v3dv_assert(format != NULL && format->supported);
+ v3dv_X(device, get_format)(image->vk.format);
+ v3dv_assert(format != NULL && format->plane_count);
assert(pCreateInfo->samples == VK_SAMPLE_COUNT_1_BIT ||
pCreateInfo->samples == VK_SAMPLE_COUNT_4_BIT);
image->format = format;
- image->cpp = vk_format_get_blocksize(image->vk.format);
- image->tiled = tiling == VK_IMAGE_TILING_OPTIMAL ||
- (tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT &&
- modifier != DRM_FORMAT_MOD_LINEAR);
- image->vk.tiling = tiling;
- image->vk.drm_format_mod = modifier;
+ image->plane_count = vk_format_get_plane_count(image->vk.format);
+
+ const struct vk_format_ycbcr_info *ycbcr_info =
+ vk_format_get_ycbcr_info(image->vk.format);
+
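+ /* Chroma planes of YCbCr formats are subsampled, so scale each plane's
+ * dimensions by its denominator scales.
+ */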
+ for (uint8_t plane = 0; plane < image->plane_count; plane++) {
+ VkFormat plane_format =
+ vk_format_get_plane_format(image->vk.format, plane);
+ image->planes[plane].cpp =
+ vk_format_get_blocksize(plane_format);
+ image->planes[plane].vk_format = plane_format;
+
+ image->planes[plane].width = image->vk.extent.width;
+ image->planes[plane].height = image->vk.extent.height;
+
+ if (ycbcr_info) {
+ image->planes[plane].width /=
+ ycbcr_info->planes[plane].denominator_scales[0];
+
+ image->planes[plane].height /=
+ ycbcr_info->planes[plane].denominator_scales[1];
+ }
+ }
/* Our meta paths can create image views with compatible formats for any
* image, so always set this flag to keep the common Vulkan image code
@@ -319,11 +494,112 @@ create_image(struct v3dv_device *device,
*/
image->vk.create_flags |= VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT;
- v3d_setup_slices(image);
+#if DETECT_OS_ANDROID
+ /* At this time, an AHB handle is not yet provided.
+ * The image layout will be filled in during vkBindImageMemory2.
+ */
+ if (image->is_ahb)
+ return VK_SUCCESS;
+#endif
+
+ bool disjoint = image->vk.create_flags & VK_IMAGE_CREATE_DISJOINT_BIT;
+
+ return v3dv_update_image_layout(device, image, modifier, disjoint,
+ explicit_mod_info);
+}
+
+static VkResult
+create_image(struct v3dv_device *device,
+ const VkImageCreateInfo *pCreateInfo,
+ const VkAllocationCallbacks *pAllocator,
+ VkImage *pImage)
+{
+ VkResult result;
+ struct v3dv_image *image = NULL;
+
+ image = vk_image_create(&device->vk, pCreateInfo, pAllocator, sizeof(*image));
+ if (image == NULL)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+#if DETECT_OS_ANDROID
+ const VkExternalMemoryImageCreateInfo *external_info =
+ vk_find_struct_const(pCreateInfo->pNext, EXTERNAL_MEMORY_IMAGE_CREATE_INFO);
+
+ const VkNativeBufferANDROID *native_buffer =
+ vk_find_struct_const(pCreateInfo->pNext, NATIVE_BUFFER_ANDROID);
+
+ if (native_buffer != NULL)
+ image->is_native_buffer_memory = true;
+
+ image->is_ahb = external_info && (external_info->handleTypes &
+ VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID);
+
+ assert(!(image->is_ahb && image->is_native_buffer_memory));
+
+ if (image->is_ahb || image->is_native_buffer_memory) {
+ image->android_explicit_layout = vk_alloc2(&device->vk.alloc, pAllocator,
+ sizeof(VkImageDrmFormatModifierExplicitCreateInfoEXT),
+ 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+ if (!image->android_explicit_layout) {
+ result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ goto fail;
+ }
+
+ image->android_plane_layouts = vk_alloc2(&device->vk.alloc, pAllocator,
+ sizeof(VkSubresourceLayout) * V3DV_MAX_PLANE_COUNT,
+ 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+ if (!image->android_plane_layouts) {
+ result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ goto fail;
+ }
+ }
+
+ if (image->is_native_buffer_memory) {
+ struct u_gralloc_buffer_handle gr_handle = {
+ .handle = native_buffer->handle,
+ .hal_format = native_buffer->format,
+ .pixel_stride = native_buffer->stride,
+ };
+
+ result = v3dv_gralloc_to_drm_explicit_layout(device->gralloc,
+ &gr_handle,
+ image->android_explicit_layout,
+ image->android_plane_layouts,
+ V3DV_MAX_PLANE_COUNT);
+ if (result != VK_SUCCESS)
+ goto fail;
+ }
+#endif
+
+ result = v3dv_image_init(device, pCreateInfo, pAllocator, image);
+ if (result != VK_SUCCESS)
+ goto fail;
+
+#if DETECT_OS_ANDROID
+ if (image->is_native_buffer_memory) {
+ result = v3dv_import_native_buffer_fd(v3dv_device_to_handle(device),
+ native_buffer->handle->data[0], pAllocator,
+ v3dv_image_to_handle(image));
+ if (result != VK_SUCCESS)
+ goto fail;
+ }
+#endif
*pImage = v3dv_image_to_handle(image);
return VK_SUCCESS;
+
+fail:
+#if DETECT_OS_ANDROID
+ if (image->android_explicit_layout)
+ vk_free2(&device->vk.alloc, pAllocator, image->android_explicit_layout);
+ if (image->android_plane_layouts)
+ vk_free2(&device->vk.alloc, pAllocator, image->android_plane_layouts);
+#endif
+
+ vk_image_destroy(&device->vk, pAllocator, &image->vk);
+ return result;
}
static VkResult
@@ -381,8 +657,14 @@ v3dv_CreateImage(VkDevice _device,
{
V3DV_FROM_HANDLE(v3dv_device, device, _device);
+#if DETECT_OS_ANDROID
+ /* VkImageSwapchainCreateInfoKHR is not useful at all */
+ const VkImageSwapchainCreateInfoKHR *swapchain_info = NULL;
+#else
const VkImageSwapchainCreateInfoKHR *swapchain_info =
vk_find_struct_const(pCreateInfo->pNext, IMAGE_SWAPCHAIN_CREATE_INFO_KHR);
+#endif
+
if (swapchain_info && swapchain_info->swapchain != VK_NULL_HANDLE)
return create_image_from_swapchain(device, pCreateInfo, swapchain_info,
pAllocator, pImage);
@@ -398,13 +680,30 @@ v3dv_GetImageSubresourceLayout(VkDevice device,
{
V3DV_FROM_HANDLE(v3dv_image, image, _image);
+ uint8_t plane = v3dv_plane_from_aspect(subresource->aspectMask);
const struct v3d_resource_slice *slice =
- &image->slices[subresource->mipLevel];
+ &image->planes[plane].slices[subresource->mipLevel];
+
+ /* About why the offset below works for both disjoint and non-disjoint
+ * cases, from the Vulkan spec:
+ *
+ * "If the image is disjoint, then the offset is relative to the base
+ * address of the plane."
+ *
+ * "If the image is non-disjoint, then the offset is relative to the base
+ * address of the image."
+ *
+ * In our case, the per-plane mem_offset for non-disjoint images is the
+ * same for all planes and matches the base address of the image.
+ */
layout->offset =
- v3dv_layer_offset(image, subresource->mipLevel, subresource->arrayLayer);
+ v3dv_layer_offset(image, subresource->mipLevel, subresource->arrayLayer,
+ plane) - image->planes[plane].mem_offset;
layout->rowPitch = slice->stride;
- layout->depthPitch = image->cube_map_stride;
- layout->arrayPitch = image->cube_map_stride;
+ layout->depthPitch = image->vk.image_type == VK_IMAGE_TYPE_3D ?
+ image->planes[plane].cube_map_stride : 0;
+ layout->arrayPitch = image->vk.array_layers > 1 ?
+ image->planes[plane].cube_map_stride : 0;
if (image->vk.image_type != VK_IMAGE_TYPE_3D) {
layout->size = slice->size;
@@ -419,7 +718,7 @@ v3dv_GetImageSubresourceLayout(VkDevice device,
layout->size = slice->size * image->vk.extent.depth;
} else {
const struct v3d_resource_slice *prev_slice =
- &image->slices[subresource->mipLevel - 1];
+ &image->planes[plane].slices[subresource->mipLevel - 1];
layout->size = prev_slice->offset - slice->offset;
}
}
@@ -436,6 +735,35 @@ v3dv_DestroyImage(VkDevice _device,
if (image == NULL)
return;
+ /* If we have created a shadow tiled image for this image we must also free
+ * it (along with its memory allocation).
+ */
+ if (image->shadow) {
+ bool disjoint = image->vk.create_flags & VK_IMAGE_CREATE_DISJOINT_BIT;
+ for (int i = 0; i < (disjoint ? image->plane_count : 1); i++) {
+ if (image->shadow->planes[i].mem) {
+ v3dv_FreeMemory(_device,
+ v3dv_device_memory_to_handle(image->shadow->planes[i].mem),
+ pAllocator);
+ }
+ }
+ v3dv_DestroyImage(_device, v3dv_image_to_handle(image->shadow),
+ pAllocator);
+ image->shadow = NULL;
+ }
+
+#if DETECT_OS_ANDROID
+ if (image->is_native_buffer_memory)
+ v3dv_FreeMemory(_device,
+ v3dv_device_memory_to_handle(image->planes[0].mem),
+ pAllocator);
+
+ if (image->android_explicit_layout)
+ vk_free2(&device->vk.alloc, pAllocator, image->android_explicit_layout);
+ if (image->android_plane_layouts)
+ vk_free2(&device->vk.alloc, pAllocator, image->android_plane_layouts);
+#endif
+
vk_image_destroy(&device->vk, pAllocator, &image->vk);
}
@@ -451,96 +779,102 @@ v3dv_image_type_to_view_type(VkImageType type)
}
}
-static enum pipe_swizzle
-vk_component_mapping_to_pipe_swizzle(VkComponentSwizzle swz)
-{
- assert(swz != VK_COMPONENT_SWIZZLE_IDENTITY);
-
- switch (swz) {
- case VK_COMPONENT_SWIZZLE_ZERO:
- return PIPE_SWIZZLE_0;
- case VK_COMPONENT_SWIZZLE_ONE:
- return PIPE_SWIZZLE_1;
- case VK_COMPONENT_SWIZZLE_R:
- return PIPE_SWIZZLE_X;
- case VK_COMPONENT_SWIZZLE_G:
- return PIPE_SWIZZLE_Y;
- case VK_COMPONENT_SWIZZLE_B:
- return PIPE_SWIZZLE_Z;
- case VK_COMPONENT_SWIZZLE_A:
- return PIPE_SWIZZLE_W;
- default:
- unreachable("Unknown VkComponentSwizzle");
- };
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_CreateImageView(VkDevice _device,
- const VkImageViewCreateInfo *pCreateInfo,
- const VkAllocationCallbacks *pAllocator,
- VkImageView *pView)
+static VkResult
+create_image_view(struct v3dv_device *device,
+ bool driver_internal,
+ const VkImageViewCreateInfo *pCreateInfo,
+ const VkAllocationCallbacks *pAllocator,
+ VkImageView *pView)
{
- V3DV_FROM_HANDLE(v3dv_device, device, _device);
V3DV_FROM_HANDLE(v3dv_image, image, pCreateInfo->image);
struct v3dv_image_view *iview;
- iview = vk_image_view_create(&device->vk, pCreateInfo, pAllocator,
- sizeof(*iview));
+ iview = vk_image_view_create(&device->vk, driver_internal, pCreateInfo,
+ pAllocator, sizeof(*iview));
if (iview == NULL)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ const VkImageAspectFlagBits any_plane_aspect =
+ VK_IMAGE_ASPECT_PLANE_0_BIT |
+ VK_IMAGE_ASPECT_PLANE_1_BIT |
+ VK_IMAGE_ASPECT_PLANE_2_BIT;
+
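+ /* Multi-plane images are viewed through one or more plane aspects; record
+ * which image plane backs each plane of the view.
+ */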
+ if (image->vk.aspects & any_plane_aspect) {
+ assert((image->vk.aspects & ~any_plane_aspect) == 0);
+ iview->plane_count = 0;
+ static const VkImageAspectFlagBits plane_aspects[] = {
+ VK_IMAGE_ASPECT_PLANE_0_BIT,
+ VK_IMAGE_ASPECT_PLANE_1_BIT,
+ VK_IMAGE_ASPECT_PLANE_2_BIT
+ };
+ for (uint8_t plane = 0; plane < V3DV_MAX_PLANE_COUNT; plane++) {
+ if (iview->vk.aspects & plane_aspects[plane])
+ iview->planes[iview->plane_count++].image_plane = plane;
+ }
+ } else {
+ iview->plane_count = 1;
+ iview->planes[0].image_plane = 0;
+ }
+ /* At this point we should have at least one plane */
+ assert(iview->plane_count > 0);
const VkImageSubresourceRange *range = &pCreateInfo->subresourceRange;
- iview->offset = v3dv_layer_offset(image, iview->vk.base_mip_level,
- iview->vk.base_array_layer);
-
/* If we have D24S8 format but the view only selects the stencil aspect
* we want to re-interpret the format as RGBA8_UINT, then map our stencil
* data reads to the R component and ignore the GBA channels that contain
* the depth aspect data.
+ *
+ * FIXME: the code below calls vk_component_mapping_to_pipe_swizzle
+ * only so it can then call util_format_compose_swizzles later. Maybe it
+ * makes sense to implement swizzle composition using VkSwizzle directly.
*/
VkFormat format;
- uint8_t image_view_swizzle[4];
- if (pCreateInfo->format == VK_FORMAT_D24_UNORM_S8_UINT &&
+ if (image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT &&
range->aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
format = VK_FORMAT_R8G8B8A8_UINT;
- image_view_swizzle[0] = PIPE_SWIZZLE_X;
- image_view_swizzle[1] = PIPE_SWIZZLE_0;
- image_view_swizzle[2] = PIPE_SWIZZLE_0;
- image_view_swizzle[3] = PIPE_SWIZZLE_1;
+ uint8_t stencil_aspect_swizzle[4] = {
+ PIPE_SWIZZLE_X, PIPE_SWIZZLE_0, PIPE_SWIZZLE_0, PIPE_SWIZZLE_1,
+ };
+ uint8_t view_swizzle[4];
+ vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle, view_swizzle);
+
+ util_format_compose_swizzles(stencil_aspect_swizzle, view_swizzle,
+ iview->view_swizzle);
} else {
- format = pCreateInfo->format;
-
- /* FIXME: we are doing this vk to pipe swizzle mapping just to call
- * util_format_compose_swizzles. Would be good to check if it would be
- * better to reimplement the latter using vk component
- */
- image_view_swizzle[0] =
- vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle.r);
- image_view_swizzle[1] =
- vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle.g);
- image_view_swizzle[2] =
- vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle.b);
- image_view_swizzle[3] =
- vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle.a);
+ format = iview->vk.format;
+ vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle,
+ iview->view_swizzle);
}
- iview->vk.format = format;
+ iview->vk.view_format = format;
iview->format = v3dv_X(device, get_format)(format);
- assert(iview->format && iview->format->supported);
+ assert(iview->format && iview->format->plane_count);
- if (vk_format_is_depth_or_stencil(iview->vk.format)) {
- iview->internal_type =
- v3dv_X(device, get_internal_depth_type)(iview->vk.format);
- } else {
- v3dv_X(device, get_internal_type_bpp_for_output_format)
- (iview->format->rt_type, &iview->internal_type, &iview->internal_bpp);
- }
+ for (uint8_t plane = 0; plane < iview->plane_count; plane++) {
+ iview->planes[plane].offset = v3dv_layer_offset(image,
+ iview->vk.base_mip_level,
+ iview->vk.base_array_layer,
+ plane);
+
+ if (vk_format_is_depth_or_stencil(iview->vk.view_format)) {
+ iview->planes[plane].internal_type =
+ v3dv_X(device, get_internal_depth_type)(iview->vk.view_format);
+ } else {
+ v3dv_X(device, get_internal_type_bpp_for_output_format)
+ (iview->format->planes[plane].rt_type,
+ &iview->planes[plane].internal_type,
+ &iview->planes[plane].internal_bpp);
+ }
- const uint8_t *format_swizzle = v3dv_get_format_swizzle(device, format);
- util_format_compose_swizzles(format_swizzle, image_view_swizzle,
- iview->swizzle);
- iview->swap_rb = iview->swizzle[0] == PIPE_SWIZZLE_Z;
+ const uint8_t *format_swizzle =
+ v3dv_get_format_swizzle(device, format, plane);
+ util_format_compose_swizzles(format_swizzle, iview->view_swizzle,
+ iview->planes[plane].swizzle);
+
+ iview->planes[plane].swap_rb = v3dv_format_swizzle_needs_rb_swap(format_swizzle);
+ iview->planes[plane].channel_reverse = v3dv_format_swizzle_needs_reverse(format_swizzle);
+ }
v3dv_X(device, pack_texture_shader_state)(device, iview);
@@ -549,6 +883,25 @@ v3dv_CreateImageView(VkDevice _device,
return VK_SUCCESS;
}
+VkResult
+v3dv_create_image_view(struct v3dv_device *device,
+ const VkImageViewCreateInfo *pCreateInfo,
+ VkImageView *pView)
+{
+ return create_image_view(device, true, pCreateInfo, NULL, pView);
+}
+
+VKAPI_ATTR VkResult VKAPI_CALL
+v3dv_CreateImageView(VkDevice _device,
+ const VkImageViewCreateInfo *pCreateInfo,
+ const VkAllocationCallbacks *pAllocator,
+ VkImageView *pView)
+{
+ V3DV_FROM_HANDLE(v3dv_device, device, _device);
+
+ return create_image_view(device, false, pCreateInfo, pAllocator, pView);
+}
+
VKAPI_ATTR void VKAPI_CALL
v3dv_DestroyImageView(VkDevice _device,
VkImageView imageView,
@@ -560,6 +913,13 @@ v3dv_DestroyImageView(VkDevice _device,
if (image_view == NULL)
return;
+ if (image_view->shadow) {
+ v3dv_DestroyImageView(_device,
+ v3dv_image_view_to_handle(image_view->shadow),
+ pAllocator);
+ image_view->shadow = NULL;
+ }
+
vk_image_view_destroy(&device->vk, pAllocator, &image_view->vk);
}
@@ -578,7 +938,7 @@ v3dv_CreateBufferView(VkDevice _device,
vk_object_zalloc(&device->vk, pAllocator, sizeof(*view),
VK_OBJECT_TYPE_BUFFER_VIEW);
if (!view)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
uint32_t range;
if (pCreateInfo->range == VK_WHOLE_SIZE)
@@ -596,8 +956,10 @@ v3dv_CreateBufferView(VkDevice _device,
view->vk_format = pCreateInfo->format;
view->format = v3dv_X(device, get_format)(view->vk_format);
+ /* We don't support multi-plane formats for buffer views */
+ assert(view->format->plane_count == 1);
v3dv_X(device, get_internal_type_bpp_for_output_format)
- (view->format->rt_type, &view->internal_type, &view->internal_bpp);
+ (view->format->planes[0].rt_type, &view->internal_type, &view->internal_bpp);
if (buffer->usage & VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT ||
buffer->usage & VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT)
diff --git a/src/broadcom/vulkan/v3dv_limits.h b/src/broadcom/vulkan/v3dv_limits.h
index aaab1ce03ac..4df172e6bf3 100644
--- a/src/broadcom/vulkan/v3dv_limits.h
+++ b/src/broadcom/vulkan/v3dv_limits.h
@@ -1,5 +1,5 @@
/*
- * Copyright © 2020 Raspberry Pi
+ * Copyright © 2020 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -23,8 +23,6 @@
#ifndef V3DV_LIMITS_H
#define V3DV_LIMITS_H
-#define NSEC_PER_SEC 1000000000ull
-
/* From vulkan spec "If the multiple viewports feature is not enabled,
* scissorCount must be 1", ditto for viewportCount. For now we don't support
* that feature.
@@ -43,7 +41,8 @@
#define MAX_STORAGE_IMAGES 4
#define MAX_INPUT_ATTACHMENTS 4
-#define MAX_UNIFORM_BUFFERS 12
+#define MAX_UNIFORM_BUFFERS 16
+#define MAX_INLINE_UNIFORM_BUFFERS 4
#define MAX_STORAGE_BUFFERS 8
#define MAX_DYNAMIC_UNIFORM_BUFFERS 8
@@ -51,8 +50,6 @@
#define MAX_DYNAMIC_BUFFERS (MAX_DYNAMIC_UNIFORM_BUFFERS + \
MAX_DYNAMIC_STORAGE_BUFFERS)
-#define MAX_RENDER_TARGETS 4
-
#define MAX_MULTIVIEW_VIEW_COUNT 16
/* These are tunable parameters in the HW design, but all the V3D
diff --git a/src/broadcom/vulkan/v3dv_meta_clear.c b/src/broadcom/vulkan/v3dv_meta_clear.c
index 5555c690bb3..d8868142329 100644
--- a/src/broadcom/vulkan/v3dv_meta_clear.c
+++ b/src/broadcom/vulkan/v3dv_meta_clear.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2020 Raspberry Pi
+ * Copyright © 2020 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -25,8 +25,8 @@
#include "v3dv_meta_common.h"
#include "compiler/nir/nir_builder.h"
-#include "vk_format_info.h"
#include "util/u_pack_color.h"
+#include "vk_common_entrypoints.h"
static void
get_hw_clear_color(struct v3dv_device *device,
@@ -68,7 +68,13 @@ clear_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
{
const VkOffset3D origin = { 0, 0, 0 };
VkFormat fb_format;
- if (!v3dv_meta_can_use_tlb(image, &origin, &fb_format))
+
+ /* From vkCmdClearColorImage spec:
+ * "image must not use any of the formats that require a sampler YCBCR
+ * conversion"
+ */
+ assert(image->plane_count == 1);
+ if (!v3dv_meta_can_use_tlb(image, 0, 0, &origin, NULL, &fb_format))
return false;
uint32_t internal_type, internal_bpp;
@@ -120,8 +126,9 @@ clear_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
if (!job)
return true;
- v3dv_job_start_frame(job, width, height, max_layer, false,
- 1, internal_bpp,
+ v3dv_job_start_frame(job, width, height, max_layer,
+ false, true, 1, internal_bpp,
+ 4 * v3d_internal_bpp_words(internal_bpp),
image->vk.samples > VK_SAMPLE_COUNT_1_BIT);
struct v3dv_meta_framebuffer framebuffer;
@@ -161,11 +168,15 @@ v3dv_CmdClearColorImage(VkCommandBuffer commandBuffer,
.color = *pColor,
};
+ cmd_buffer->state.is_transfer = true;
+
for (uint32_t i = 0; i < rangeCount; i++) {
if (clear_image_tlb(cmd_buffer, image, &clear_value, &pRanges[i]))
continue;
unreachable("Unsupported color clear.");
}
+
+ cmd_buffer->state.is_transfer = false;
}
VKAPI_ATTR void VKAPI_CALL
@@ -183,11 +194,15 @@ v3dv_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
.depthStencil = *pDepthStencil,
};
+ cmd_buffer->state.is_transfer = true;
+
for (uint32_t i = 0; i < rangeCount; i++) {
if (clear_image_tlb(cmd_buffer, image, &clear_value, &pRanges[i]))
continue;
unreachable("Unsupported depth/stencil clear.");
}
+
+ cmd_buffer->state.is_transfer = false;
}
static void
@@ -304,39 +319,6 @@ v3dv_meta_clear_finish(struct v3dv_device *device)
}
}
-static nir_ssa_def *
-gen_rect_vertices(nir_builder *b)
-{
- nir_ssa_def *vertex_id = nir_load_vertex_id(b);
-
- /* vertex 0: -1.0, -1.0
- * vertex 1: -1.0, 1.0
- * vertex 2: 1.0, -1.0
- * vertex 3: 1.0, 1.0
- *
- * so:
- *
- * channel 0 is vertex_id < 2 ? -1.0 : 1.0
- * channel 1 is vertex id & 1 ? 1.0 : -1.0
- */
-
- nir_ssa_def *one = nir_imm_int(b, 1);
- nir_ssa_def *c0cmp = nir_ilt(b, vertex_id, nir_imm_int(b, 2));
- nir_ssa_def *c1cmp = nir_ieq(b, nir_iand(b, vertex_id, one), one);
-
- nir_ssa_def *comp[4];
- comp[0] = nir_bcsel(b, c0cmp,
- nir_imm_float(b, -1.0f),
- nir_imm_float(b, 1.0f));
-
- comp[1] = nir_bcsel(b, c1cmp,
- nir_imm_float(b, 1.0f),
- nir_imm_float(b, -1.0f));
- comp[2] = nir_imm_float(b, 0.0f);
- comp[3] = nir_imm_float(b, 1.0f);
- return nir_vec(b, comp, 4);
-}
-
static nir_shader *
get_clear_rect_vs()
{
@@ -349,7 +331,7 @@ get_clear_rect_vs()
nir_variable_create(b.shader, nir_var_shader_out, vec4, "gl_Position");
vs_out_pos->data.location = VARYING_SLOT_POS;
- nir_ssa_def *pos = gen_rect_vertices(&b);
+ nir_def *pos = nir_gen_rect_vertices(&b, NULL, NULL);
nir_store_var(&b, vs_out_pos, pos, 0xf);
return b.shader;
@@ -372,8 +354,8 @@ get_clear_rect_gs(uint32_t push_constant_layer_base)
nir->info.inputs_read = 1ull << VARYING_SLOT_POS;
nir->info.outputs_written = (1ull << VARYING_SLOT_POS) |
(1ull << VARYING_SLOT_LAYER);
- nir->info.gs.input_primitive = GL_TRIANGLES;
- nir->info.gs.output_primitive = GL_TRIANGLE_STRIP;
+ nir->info.gs.input_primitive = MESA_PRIM_TRIANGLES;
+ nir->info.gs.output_primitive = MESA_PRIM_TRIANGLE_STRIP;
nir->info.gs.vertices_in = 3;
nir->info.gs.vertices_out = 3;
nir->info.gs.invocations = 1;
@@ -406,7 +388,7 @@ get_clear_rect_gs(uint32_t push_constant_layer_base)
nir_copy_deref(&b, nir_build_deref_var(&b, gs_out_pos), in_pos_i);
/* gl_Layer from push constants */
- nir_ssa_def *layer =
+ nir_def *layer =
nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0),
.base = push_constant_layer_base, .range = 4);
nir_store_var(&b, gs_out_layer, layer, 0x1);
@@ -434,7 +416,7 @@ get_color_clear_rect_fs(uint32_t rt_idx, VkFormat format)
nir_variable_create(b.shader, nir_var_shader_out, fs_out_type, "out_color");
fs_out_color->data.location = FRAG_RESULT_DATA0 + rt_idx;
- nir_ssa_def *color_load = nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 0, .range = 16);
+ nir_def *color_load = nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 0, .range = 16);
nir_store_var(&b, fs_out_color, color_load, 0xf);
return b.shader;
@@ -452,7 +434,7 @@ get_depth_clear_rect_fs()
"out_depth");
fs_out_depth->data.location = FRAG_RESULT_DEPTH;
- nir_ssa_def *depth_load =
+ nir_def *depth_load =
nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4);
nir_store_var(&b, fs_out_depth, depth_load, 0x1);
@@ -475,12 +457,11 @@ create_pipeline(struct v3dv_device *device,
VkPipeline *pipeline)
{
VkPipelineShaderStageCreateInfo stages[3] = { 0 };
- struct vk_shader_module vs_m;
+ struct vk_shader_module vs_m = vk_shader_module_from_nir(vs_nir);
struct vk_shader_module gs_m;
struct vk_shader_module fs_m;
uint32_t stage_count = 0;
- v3dv_shader_module_internal_init(device, &vs_m, vs_nir);
stages[stage_count].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
stages[stage_count].stage = VK_SHADER_STAGE_VERTEX_BIT;
stages[stage_count].module = vk_shader_module_to_handle(&vs_m);
@@ -488,7 +469,7 @@ create_pipeline(struct v3dv_device *device,
stage_count++;
if (gs_nir) {
- v3dv_shader_module_internal_init(device, &gs_m, gs_nir);
+ gs_m = vk_shader_module_from_nir(gs_nir);
stages[stage_count].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
stages[stage_count].stage = VK_SHADER_STAGE_GEOMETRY_BIT;
stages[stage_count].module = vk_shader_module_to_handle(&gs_m);
@@ -497,7 +478,7 @@ create_pipeline(struct v3dv_device *device,
}
if (fs_nir) {
- v3dv_shader_module_internal_init(device, &fs_m, fs_nir);
+ fs_m = vk_shader_module_from_nir(fs_nir);
stages[stage_count].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
stages[stage_count].stage = VK_SHADER_STAGE_FRAGMENT_BIT;
stages[stage_count].module = vk_shader_module_to_handle(&fs_m);
@@ -581,6 +562,7 @@ create_pipeline(struct v3dv_device *device,
pipeline);
ralloc_free(vs_nir);
+ ralloc_free(gs_nir);
ralloc_free(fs_nir);
return result;
@@ -592,7 +574,7 @@ create_color_clear_pipeline(struct v3dv_device *device,
uint32_t subpass_idx,
uint32_t rt_idx,
VkFormat format,
- uint32_t samples,
+ VkSampleCountFlagBits samples,
uint32_t components,
bool is_layered,
VkPipelineLayout pipeline_layout,
@@ -709,10 +691,11 @@ static VkResult
create_color_clear_render_pass(struct v3dv_device *device,
uint32_t rt_idx,
VkFormat format,
- uint32_t samples,
+ VkSampleCountFlagBits samples,
VkRenderPass *pass)
{
- VkAttachmentDescription att = {
+ VkAttachmentDescription2 att = {
+ .sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2,
.format = format,
.samples = samples,
.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD,
@@ -721,12 +704,14 @@ create_color_clear_render_pass(struct v3dv_device *device,
.finalLayout = VK_IMAGE_LAYOUT_GENERAL,
};
- VkAttachmentReference att_ref = {
+ VkAttachmentReference2 att_ref = {
+ .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
.attachment = rt_idx,
.layout = VK_IMAGE_LAYOUT_GENERAL,
};
- VkSubpassDescription subpass = {
+ VkSubpassDescription2 subpass = {
+ .sType = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2,
.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
.inputAttachmentCount = 0,
.colorAttachmentCount = 1,
@@ -737,8 +722,8 @@ create_color_clear_render_pass(struct v3dv_device *device,
.pPreserveAttachments = NULL,
};
- VkRenderPassCreateInfo info = {
- .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO,
+ VkRenderPassCreateInfo2 info = {
+ .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2,
.attachmentCount = 1,
.pAttachments = &att,
.subpassCount = 1,
@@ -747,14 +732,14 @@ create_color_clear_render_pass(struct v3dv_device *device,
.pDependencies = NULL,
};
- return v3dv_CreateRenderPass(v3dv_device_to_handle(device),
- &info, &device->vk.alloc, pass);
+ return v3dv_CreateRenderPass2(v3dv_device_to_handle(device),
+ &info, &device->vk.alloc, pass);
}
static inline uint64_t
get_color_clear_pipeline_cache_key(uint32_t rt_idx,
VkFormat format,
- uint32_t samples,
+ VkSampleCountFlagBits samples,
uint32_t components,
bool is_layered)
{
@@ -764,7 +749,7 @@ get_color_clear_pipeline_cache_key(uint32_t rt_idx,
uint32_t bit_offset = 0;
key |= rt_idx;
- bit_offset += 2;
+ bit_offset += 3;
key |= ((uint64_t) format) << bit_offset;
bit_offset += 32;
@@ -819,7 +804,7 @@ get_color_clear_pipeline(struct v3dv_device *device,
uint32_t rt_idx,
uint32_t attachment_idx,
VkFormat format,
- uint32_t samples,
+ VkSampleCountFlagBits samples,
uint32_t components,
bool is_layered,
struct v3dv_meta_color_clear_pipeline **pipeline)
@@ -1012,7 +997,7 @@ emit_subpass_color_clear_rects(struct v3dv_cmd_buffer *cmd_buffer,
assert(attachment_idx < cmd_buffer->state.pass->attachment_count);
const VkFormat format =
cmd_buffer->state.pass->attachments[attachment_idx].desc.format;
- const VkFormat samples =
+ const VkSampleCountFlagBits samples =
cmd_buffer->state.pass->attachments[attachment_idx].desc.samples;
const uint32_t components = VK_COLOR_COMPONENT_R_BIT |
VK_COLOR_COMPONENT_G_BIT |
@@ -1049,8 +1034,6 @@ emit_subpass_color_clear_rects(struct v3dv_cmd_buffer *cmd_buffer,
VK_PIPELINE_BIND_POINT_GRAPHICS,
pipeline->pipeline);
- uint32_t dynamic_states = V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR;
-
for (uint32_t i = 0; i < rect_count; i++) {
const VkViewport viewport = {
.x = rects[i].rect.offset.x,
@@ -1087,7 +1070,7 @@ emit_subpass_color_clear_rects(struct v3dv_cmd_buffer *cmd_buffer,
cmd_buffer, (uintptr_t)pipeline,
(v3dv_cmd_buffer_private_obj_destroy_cb) destroy_color_clear_pipeline);
- v3dv_cmd_buffer_meta_state_pop(cmd_buffer, dynamic_states, false);
+ v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false);
}
/* Emits a scissored quad, clearing the depth aspect by writing to gl_FragDepth
@@ -1139,18 +1122,14 @@ emit_subpass_ds_clear_rects(struct v3dv_cmd_buffer *cmd_buffer,
VK_PIPELINE_BIND_POINT_GRAPHICS,
pipeline->pipeline);
- uint32_t dynamic_states = V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR;
if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
- v3dv_CmdSetStencilReference(cmd_buffer_handle,
- VK_STENCIL_FACE_FRONT_AND_BACK,
- clear_ds->stencil);
- v3dv_CmdSetStencilWriteMask(cmd_buffer_handle,
- VK_STENCIL_FACE_FRONT_AND_BACK, 0xff);
- v3dv_CmdSetStencilCompareMask(cmd_buffer_handle,
- VK_STENCIL_FACE_FRONT_AND_BACK, 0xff);
- dynamic_states |= VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK |
- VK_DYNAMIC_STATE_STENCIL_WRITE_MASK |
- VK_DYNAMIC_STATE_STENCIL_REFERENCE;
+ vk_common_CmdSetStencilReference(cmd_buffer_handle,
+ VK_STENCIL_FACE_FRONT_AND_BACK,
+ clear_ds->stencil);
+ vk_common_CmdSetStencilWriteMask(cmd_buffer_handle,
+ VK_STENCIL_FACE_FRONT_AND_BACK, 0xff);
+ vk_common_CmdSetStencilCompareMask(cmd_buffer_handle,
+ VK_STENCIL_FACE_FRONT_AND_BACK, 0xff);
}
for (uint32_t i = 0; i < rect_count; i++) {
@@ -1179,7 +1158,7 @@ emit_subpass_ds_clear_rects(struct v3dv_cmd_buffer *cmd_buffer,
}
}
- v3dv_cmd_buffer_meta_state_pop(cmd_buffer, dynamic_states, false);
+ v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false);
}
static void
@@ -1212,9 +1191,11 @@ v3dv_CmdClearAttachments(VkCommandBuffer commandBuffer,
{
V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
- /* We can only clear attachments in the current subpass */
- assert(attachmentCount <= 5); /* 4 color + D/S */
+ /* We can have at most max_color_RTs + 1 D/S attachments */
+ assert(attachmentCount <=
+ V3D_MAX_RENDER_TARGETS(cmd_buffer->device->devinfo.ver) + 1);
+ /* We can only clear attachments in the current subpass */
struct v3dv_render_pass *pass = cmd_buffer->state.pass;
assert(cmd_buffer->state.subpass_idx < pass->subpass_count);
@@ -1225,6 +1206,9 @@ v3dv_CmdClearAttachments(VkCommandBuffer commandBuffer,
* framebuffers, we use a geometry shader to redirect clears to the
* appropriate layers.
*/
+
+ v3dv_cmd_buffer_pause_occlusion_query(cmd_buffer);
+
bool is_layered, all_rects_same_layers;
gather_layering_info(rectCount, pRects, &is_layered, &all_rects_same_layers);
for (uint32_t i = 0; i < attachmentCount; i++) {
@@ -1242,4 +1226,6 @@ v3dv_CmdClearAttachments(VkCommandBuffer commandBuffer,
rectCount, pRects);
}
}
+
+ v3dv_cmd_buffer_resume_occlusion_query(cmd_buffer);
}
diff --git a/src/broadcom/vulkan/v3dv_meta_common.h b/src/broadcom/vulkan/v3dv_meta_common.h
index 555b55f90b7..3be51b56a1f 100644
--- a/src/broadcom/vulkan/v3dv_meta_common.h
+++ b/src/broadcom/vulkan/v3dv_meta_common.h
@@ -1,5 +1,5 @@
/*
- * Copyright © 2021 Raspberry Pi
+ * Copyright © 2021 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -23,30 +23,6 @@
#ifndef V3DV_META_COMMON_H
#define V3DV_META_COMMON_H
-/* Disable level 0 write, just write following mipmaps */
-#define V3D_TFU_IOA_DIMTW (1 << 0)
-#define V3D_TFU_IOA_FORMAT_SHIFT 3
-#define V3D_TFU_IOA_FORMAT_LINEARTILE 3
-#define V3D_TFU_IOA_FORMAT_UBLINEAR_1_COLUMN 4
-#define V3D_TFU_IOA_FORMAT_UBLINEAR_2_COLUMN 5
-#define V3D_TFU_IOA_FORMAT_UIF_NO_XOR 6
-#define V3D_TFU_IOA_FORMAT_UIF_XOR 7
-
-#define V3D_TFU_ICFG_NUMMM_SHIFT 5
-#define V3D_TFU_ICFG_TTYPE_SHIFT 9
-
-#define V3D_TFU_ICFG_OPAD_SHIFT 22
-
-#define V3D_TFU_ICFG_FORMAT_SHIFT 18
-#define V3D_TFU_ICFG_FORMAT_RASTER 0
-#define V3D_TFU_ICFG_FORMAT_SAND_128 1
-#define V3D_TFU_ICFG_FORMAT_SAND_256 2
-#define V3D_TFU_ICFG_FORMAT_LINEARTILE 11
-#define V3D_TFU_ICFG_FORMAT_UBLINEAR_1_COLUMN 12
-#define V3D_TFU_ICFG_FORMAT_UBLINEAR_2_COLUMN 13
-#define V3D_TFU_ICFG_FORMAT_UIF_NO_XOR 14
-#define V3D_TFU_ICFG_FORMAT_UIF_XOR 15
-
/**
* Copy/Clear operations implemented in v3dv_meta_*.c that use the TLB hardware
* need to figure out TLB programming from the target image data instead of an
diff --git a/src/broadcom/vulkan/v3dv_meta_copy.c b/src/broadcom/vulkan/v3dv_meta_copy.c
index 85cd8e06638..0713b1b4084 100644
--- a/src/broadcom/vulkan/v3dv_meta_copy.c
+++ b/src/broadcom/vulkan/v3dv_meta_copy.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2019 Raspberry Pi
+ * Copyright © 2019 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -25,9 +25,8 @@
#include "v3dv_meta_common.h"
#include "compiler/nir/nir_builder.h"
-#include "vk_format_info.h"
#include "util/u_pack_color.h"
-#include "vulkan/util/vk_common_entrypoints.h"
+#include "vk_common_entrypoints.h"
static uint32_t
meta_blit_key_hash(const void *key)
@@ -42,6 +41,19 @@ meta_blit_key_compare(const void *key1, const void *key2)
}
static bool
+texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer,
+ VkImageAspectFlags aspect,
+ struct v3dv_image *image,
+ VkFormat dst_format,
+ VkFormat src_format,
+ struct v3dv_buffer *buffer,
+ uint32_t buffer_bpp,
+ VkColorComponentFlags cmask,
+ VkComponentMapping *cswizzle,
+ uint32_t region_count,
+ const VkBufferImageCopy2 *regions);
+
+static bool
create_blit_pipeline_layout(struct v3dv_device *device,
VkDescriptorSetLayout *descriptor_set_layout,
VkPipelineLayout *pipeline_layout)
@@ -338,18 +350,41 @@ get_compatible_tlb_format(VkFormat format)
/**
* Checks if we can implement an image copy or clear operation using the TLB
* hardware.
+ *
+ * The extent and miplevel are only used to validate tile stores (to match the
+ * region to store against the miplevel dimensions to avoid cases where
+ * the region to store is not aligned to tile boundaries). If extent is
+ * NULL, no checks are done (which is fine if the image will only be used for a
+ * TLB load or when we know in advance that the store will be for the entire
+ * size of the image miplevel).
+ *
+ * For TLB copies we are doing a per-plane copy, so for multi-plane formats,
+ * the compatible format will be single-plane.
*/
bool
v3dv_meta_can_use_tlb(struct v3dv_image *image,
+ uint8_t plane,
+ uint8_t miplevel,
const VkOffset3D *offset,
+ const VkExtent3D *extent,
VkFormat *compat_format)
{
if (offset->x != 0 || offset->y != 0)
return false;
- if (image->format->rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO) {
+ /* FIXME: this is suboptimal; what we really want to check is that the
+ * extent of the region to copy is the full slice or a multiple of the
+ * tile size.
+ */
+ if (extent) {
+ struct v3d_resource_slice *slice = &image->planes[plane].slices[miplevel];
+ if (slice->width != extent->width || slice->height != extent->height)
+ return false;
+ }
+
+ if (image->format->planes[plane].rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO) {
if (compat_format)
- *compat_format = image->vk.format;
+ *compat_format = image->planes[plane].vk_format;
return true;
}
@@ -357,9 +392,11 @@ v3dv_meta_can_use_tlb(struct v3dv_image *image,
* a compatible format instead.
*/
if (compat_format) {
- *compat_format = get_compatible_tlb_format(image->vk.format);
- if (*compat_format != VK_FORMAT_UNDEFINED)
+ *compat_format = get_compatible_tlb_format(image->planes[plane].vk_format);
+ if (*compat_format != VK_FORMAT_UNDEFINED) {
+ assert(vk_format_get_plane_count(*compat_format) == 1);
return true;
+ }
}
return false;
@@ -379,11 +416,17 @@ static bool
copy_image_to_buffer_tlb(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_buffer *buffer,
struct v3dv_image *image,
- const VkBufferImageCopy2KHR *region)
+ const VkBufferImageCopy2 *region)
{
VkFormat fb_format;
- if (!v3dv_meta_can_use_tlb(image, &region->imageOffset, &fb_format))
+ uint8_t plane = v3dv_plane_from_aspect(region->imageSubresource.aspectMask);
+ assert(plane < image->plane_count);
+
+ if (!v3dv_meta_can_use_tlb(image, plane, region->imageSubresource.mipLevel,
+ &region->imageOffset, &region->imageExtent,
+ &fb_format)) {
return false;
+ }
uint32_t internal_type, internal_bpp;
v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects)
@@ -403,13 +446,16 @@ copy_image_to_buffer_tlb(struct v3dv_cmd_buffer *cmd_buffer,
return true;
/* Handle copy from compressed format using a compatible format */
- const uint32_t block_w = vk_format_get_blockwidth(image->vk.format);
- const uint32_t block_h = vk_format_get_blockheight(image->vk.format);
+ const uint32_t block_w =
+ vk_format_get_blockwidth(image->planes[plane].vk_format);
+ const uint32_t block_h =
+ vk_format_get_blockheight(image->planes[plane].vk_format);
const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w);
const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h);
- v3dv_job_start_frame(job, width, height, num_layers, false,
- 1, internal_bpp, false);
+ v3dv_job_start_frame(job, width, height, num_layers, false, true, 1,
+ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
+ false);
struct v3dv_meta_framebuffer framebuffer;
v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
@@ -432,29 +478,110 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
VkFormat src_format,
VkColorComponentFlags cmask,
VkComponentMapping *cswizzle,
- const VkImageBlit2KHR *region,
+ const VkImageBlit2 *region,
VkFilter filter,
bool dst_is_padded_image);
+
/**
- * Returns true if the implementation supports the requested operation (even if
- * it failed to process it, for example, due to an out-of-memory error).
+ * A structure that contains all the information we may need in various
+ * processes involving image to buffer copies implemented with blit paths.
+ */
+struct image_to_buffer_info {
+ /* Source image info */
+ VkFormat src_format;
+ uint8_t plane;
+ VkColorComponentFlags cmask;
+ VkComponentMapping cswizzle;
+ VkImageAspectFlags src_copy_aspect;
+ uint32_t block_width;
+ uint32_t block_height;
+
+ /* Destination buffer info */
+ VkFormat dst_format;
+ uint32_t buf_width;
+ uint32_t buf_height;
+ uint32_t buf_bpp;
+ VkImageAspectFlags dst_copy_aspect;
+};
+
+static VkImageBlit2
+blit_region_for_image_to_buffer(const VkOffset3D *offset,
+ const VkExtent3D *extent,
+ uint32_t mip_level,
+ uint32_t base_layer,
+ uint32_t layer_offset,
+ struct image_to_buffer_info *info)
+{
+ VkImageBlit2 output = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2,
+ .srcSubresource = {
+ .aspectMask = info->src_copy_aspect,
+ .mipLevel = mip_level,
+ .baseArrayLayer = base_layer + layer_offset,
+ .layerCount = 1,
+ },
+ .srcOffsets = {
+ {
+ DIV_ROUND_UP(offset->x, info->block_width),
+ DIV_ROUND_UP(offset->y, info->block_height),
+ offset->z + layer_offset,
+ },
+ {
+ DIV_ROUND_UP(offset->x + extent->width, info->block_width),
+ DIV_ROUND_UP(offset->y + extent->height, info->block_height),
+ offset->z + layer_offset + 1,
+ },
+ },
+ .dstSubresource = {
+ .aspectMask = info->dst_copy_aspect,
+ .mipLevel = 0,
+ .baseArrayLayer = 0,
+ .layerCount = 1,
+ },
+ .dstOffsets = {
+ { 0, 0, 0 },
+ {
+ DIV_ROUND_UP(extent->width, info->block_width),
+ DIV_ROUND_UP(extent->height, info->block_height),
+ 1
+ },
+ },
+ };
+
+ return output;
+}
+
+/**
+ * Produces an image_to_buffer_info struct from a VkBufferImageCopy2 that we can
+ * use to implement image to buffer copies with blit paths.
+ *
+ * Returns false if the copy operation can't be implemented with a blit.
*/
static bool
-copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer,
- struct v3dv_buffer *buffer,
- struct v3dv_image *image,
- const VkBufferImageCopy2KHR *region)
+gather_image_to_buffer_info(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_image *image,
+ const VkBufferImageCopy2 *region,
+ struct image_to_buffer_info *out_info)
{
- bool handled = false;
+ bool supported = false;
+
+ VkImageAspectFlags dst_copy_aspect = region->imageSubresource.aspectMask;
+ /* For multi-planar images we copy one plane at a time using an image alias
+ * with a color aspect for each plane.
+ */
+ if (image->plane_count > 1)
+ dst_copy_aspect = VK_IMAGE_ASPECT_COLOR_BIT;
+
+ VkImageAspectFlags src_copy_aspect = region->imageSubresource.aspectMask;
+ uint8_t plane = v3dv_plane_from_aspect(src_copy_aspect);
+ assert(plane < image->plane_count);
/* Generally, the bpp of the data in the buffer matches that of the
* source image. The exception is the case where we are copying
* stencil (8bpp) to a combined d24s8 image (32bpp).
*/
- uint32_t buffer_bpp = image->cpp;
-
- VkImageAspectFlags copy_aspect = region->imageSubresource.aspectMask;
+ uint32_t buffer_bpp = image->planes[plane].cpp;
/* Because we are going to implement the copy as a blit, we need to create
* a linear image from the destination buffer and we also want our blit
@@ -477,22 +604,23 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer,
};
switch (buffer_bpp) {
case 16:
- assert(copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT);
+ assert(dst_copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT);
dst_format = VK_FORMAT_R32G32B32A32_UINT;
src_format = dst_format;
break;
case 8:
- assert(copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT);
+ assert(dst_copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT);
dst_format = VK_FORMAT_R16G16B16A16_UINT;
src_format = dst_format;
break;
case 4:
- switch (copy_aspect) {
+ switch (dst_copy_aspect) {
case VK_IMAGE_ASPECT_COLOR_BIT:
src_format = VK_FORMAT_R8G8B8A8_UINT;
dst_format = VK_FORMAT_R8G8B8A8_UINT;
break;
case VK_IMAGE_ASPECT_DEPTH_BIT:
+ assert(image->plane_count == 1);
assert(image->vk.format == VK_FORMAT_D32_SFLOAT ||
image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT ||
image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32);
@@ -517,7 +645,8 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer,
}
break;
case VK_IMAGE_ASPECT_STENCIL_BIT:
- assert(copy_aspect == VK_IMAGE_ASPECT_STENCIL_BIT);
+ assert(image->plane_count == 1);
+ assert(dst_copy_aspect == VK_IMAGE_ASPECT_STENCIL_BIT);
assert(image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT);
/* Copying from S8D24. We want to write 8-bit stencil values only,
* so adjust the buffer bpp for that. Since the hardware stores stencil
@@ -529,23 +658,23 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer,
break;
default:
unreachable("unsupported aspect");
- return handled;
+ return supported;
};
break;
case 2:
- assert(copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT ||
- copy_aspect == VK_IMAGE_ASPECT_DEPTH_BIT);
+ assert(dst_copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT ||
+ dst_copy_aspect == VK_IMAGE_ASPECT_DEPTH_BIT);
dst_format = VK_FORMAT_R16_UINT;
src_format = dst_format;
break;
case 1:
- assert(copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT);
+ assert(dst_copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT);
dst_format = VK_FORMAT_R8_UINT;
src_format = dst_format;
break;
default:
unreachable("unsupported bit-size");
- return handled;
+ return supported;
};
/* The hardware doesn't support linear depth/stencil stores, so we
@@ -554,10 +683,10 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer,
*/
assert(vk_format_is_color(src_format));
assert(vk_format_is_color(dst_format));
- copy_aspect = VK_IMAGE_ASPECT_COLOR_BIT;
+ dst_copy_aspect = VK_IMAGE_ASPECT_COLOR_BIT;
/* We should be able to handle the blit if we got this far */
- handled = true;
+ supported = true;
/* Obtain the 2D buffer region spec */
uint32_t buf_width, buf_height;
@@ -572,99 +701,250 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer,
buf_height = region->bufferImageHeight;
/* If the image is compressed, the bpp refers to blocks, not pixels */
- uint32_t block_width = vk_format_get_blockwidth(image->vk.format);
- uint32_t block_height = vk_format_get_blockheight(image->vk.format);
- buf_width = buf_width / block_width;
- buf_height = buf_height / block_height;
+ uint32_t block_width =
+ vk_format_get_blockwidth(image->planes[plane].vk_format);
+ uint32_t block_height =
+ vk_format_get_blockheight(image->planes[plane].vk_format);
+ buf_width = DIV_ROUND_UP(buf_width, block_width);
+ buf_height = DIV_ROUND_UP(buf_height, block_height);
+
+ out_info->src_format = src_format;
+ out_info->dst_format = dst_format;
+ out_info->src_copy_aspect = src_copy_aspect;
+ out_info->dst_copy_aspect = dst_copy_aspect;
+ out_info->buf_width = buf_width;
+ out_info->buf_height = buf_height;
+ out_info->buf_bpp = buffer_bpp;
+ out_info->block_width = block_width;
+ out_info->block_height = block_height;
+ out_info->cmask = cmask;
+ out_info->cswizzle = cswizzle;
+ out_info->plane = plane;
+
+ return supported;
+}
- /* Compute layers to copy */
- uint32_t num_layers;
- if (image->vk.image_type != VK_IMAGE_TYPE_3D)
- num_layers = region->imageSubresource.layerCount;
- else
- num_layers = region->imageExtent.depth;
- assert(num_layers > 0);
+/* Creates a linear image to alias buffer memory. It also includes that image
+ * as a private object in the cmd_buffer.
+ *
+ * This is used for cases where we want to implement an image to buffer copy,
+ * but we need to rely on a mechanism that uses an image as destination, like
+ * blitting.
+ */
+static VkResult
+create_image_from_buffer(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_buffer *buffer,
+ const VkBufferImageCopy2 *region,
+ struct image_to_buffer_info *info,
+ uint32_t layer,
+ VkImage *out_image)
+{
+ VkImageCreateInfo image_info = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
+ .imageType = VK_IMAGE_TYPE_2D,
+ .format = info->dst_format,
+ .extent = { info->buf_width, info->buf_height, 1 },
+ .mipLevels = 1,
+ .arrayLayers = 1,
+ .samples = VK_SAMPLE_COUNT_1_BIT,
+ .tiling = VK_IMAGE_TILING_LINEAR,
+ .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT,
+ .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
+ .queueFamilyIndexCount = 0,
+ .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
+ };
- /* Our blit interface can see the real format of the images to detect
- * copies between compressed and uncompressed images and adapt the
- * blit region accordingly. Here we are just doing a raw copy of
- * compressed data, but we are passing an uncompressed view of the
- * buffer for the blit destination image (since compressed formats are
- * not renderable), so we also want to provide an uncompressed view of
- * the source image.
- */
VkResult result;
struct v3dv_device *device = cmd_buffer->device;
VkDevice _device = v3dv_device_to_handle(device);
- if (vk_format_is_compressed(image->vk.format)) {
- VkImage uiview;
- VkImageCreateInfo uiview_info = {
- .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
- .imageType = VK_IMAGE_TYPE_3D,
- .format = dst_format,
- .extent = { buf_width, buf_height, image->vk.extent.depth },
- .mipLevels = image->vk.mip_levels,
- .arrayLayers = image->vk.array_layers,
- .samples = image->vk.samples,
- .tiling = image->vk.tiling,
- .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT,
- .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
- .queueFamilyIndexCount = 0,
- .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
- };
- result = v3dv_CreateImage(_device, &uiview_info, &device->vk.alloc, &uiview);
- if (result != VK_SUCCESS)
- return handled;
- v3dv_cmd_buffer_add_private_obj(
- cmd_buffer, (uintptr_t)uiview,
- (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage);
+ VkImage buffer_image;
+ result =
+ v3dv_CreateImage(_device, &image_info, &device->vk.alloc, &buffer_image);
+ if (result != VK_SUCCESS)
+ return result;
- result =
- vk_common_BindImageMemory(_device, uiview,
- v3dv_device_memory_to_handle(image->mem),
- image->mem_offset);
- if (result != VK_SUCCESS)
- return handled;
+ *out_image = buffer_image;
+
+ v3dv_cmd_buffer_add_private_obj(
+ cmd_buffer, (uintptr_t)buffer_image,
+ (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage);
+
+ /* Bind the buffer memory to the image */
+ VkDeviceSize buffer_offset = buffer->mem_offset + region->bufferOffset +
+ layer * info->buf_width * info->buf_height * info->buf_bpp;
+
+ result =
+ vk_common_BindImageMemory(_device, buffer_image,
+ v3dv_device_memory_to_handle(buffer->mem),
+ buffer_offset);
+ return result;
+}
- image = v3dv_image_from_handle(uiview);
+/**
+ * Creates an image with a single mip level that aliases the memory of a
+ * mip level in another image, re-interpreting the memory with an uncompressed
+ * format. The image is added to the command buffer as a private object for
+ * disposal.
+ */
+static bool
+create_image_mip_level_alias(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_image *image,
+ VkFormat format,
+ uint32_t plane,
+ uint32_t mip_level,
+ uint32_t layer,
+ VkImage *alias)
+{
+ VkResult result;
+ assert(!vk_format_is_compressed(format));
+
+ struct v3dv_device *device = cmd_buffer->device;
+ VkDevice vk_device = v3dv_device_to_handle(device);
+ uint32_t mip_width = image->planes[plane].slices[mip_level].width;
+ uint32_t mip_height = image->planes[plane].slices[mip_level].height;
+
+ uint32_t block_width =
+ vk_format_get_blockwidth(image->planes[plane].vk_format);
+ uint32_t block_height =
+ vk_format_get_blockheight(image->planes[plane].vk_format);
+
+ VkImageCreateInfo info = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
+ .imageType = image->vk.image_type,
+ .format = format,
+ .extent = { DIV_ROUND_UP(mip_width, block_width),
+ DIV_ROUND_UP(mip_height, block_height),
+ 1 },
+ .mipLevels = 1,
+ .arrayLayers = 1,
+ .samples = image->vk.samples,
+ .tiling = image->tiled ? VK_IMAGE_TILING_OPTIMAL : VK_IMAGE_TILING_LINEAR,
+ .usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
+ .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
+ .queueFamilyIndexCount = 0,
+ .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
+ };
+ result = v3dv_CreateImage(vk_device, &info, &device->vk.alloc, alias);
+ if (result != VK_SUCCESS)
+ return false;
+
+ /* The alias we have just created has just one mip, but we may be aliasing
+ * any mip in the original image. Because the slice setup changes based on
+ * the mip (particularly, for mips >= 2 it uses power of 2 sizes internally)
+ * and this can influence the tiling layout selected for the slice, we want
+ * to make sure we copy the slice description from the actual mip level in
+ * the original image, and then rewrite any fields that we need for the
+ * alias. Particularly, we want to make the offset 0 because we are going to
+ * bind the underlying image memory exactly at the start of the selected mip.
+ * We also want to relax the image alignment requirements to the minimum
+ * (the one imposed by the Texture Base Address field) since we may not be
+ * aliasing a level 0 (for which we typically want a page alignment for
+ * optimal performance).
+ */
+ V3DV_FROM_HANDLE(v3dv_image, v3dv_alias, *alias);
+ v3dv_alias->planes[plane].slices[0] = image->planes[plane].slices[mip_level];
+ v3dv_alias->planes[plane].slices[0].width = info.extent.width;
+ v3dv_alias->planes[plane].slices[0].height = info.extent.height;
+ v3dv_alias->planes[plane].slices[0].offset = 0;
+ v3dv_alias->planes[plane].alignment = 64;
+
+ v3dv_cmd_buffer_add_private_obj(
+ cmd_buffer, (uintptr_t)*alias,
+ (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage);
+
+ result =
+ vk_common_BindImageMemory(vk_device, *alias,
+ v3dv_device_memory_to_handle(image->planes[plane].mem),
+ v3dv_layer_offset(image, mip_level, layer, plane));
+ return result == VK_SUCCESS;
+}
+
+/**
+ * Returns true if the implementation supports the requested operation (even if
+ * it failed to process it, for example, due to an out-of-memory error).
+ */
+static bool
+copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_buffer *buffer,
+ struct v3dv_image *image,
+ const VkBufferImageCopy2 *region)
+{
+ bool handled = false;
+ struct image_to_buffer_info info;
+
+ /* This path uses a shader blit which doesn't support linear images. Return
+ * early to avoid all the heavy lifting in preparation for the
+ * blit_shader() call that is bound to fail in that scenario.
+ */
+ if (!image->tiled && image->vk.image_type != VK_IMAGE_TYPE_1D) {
+ return handled;
}
+ handled = gather_image_to_buffer_info(cmd_buffer, image, region,
+ &info);
+
+ if (!handled)
+ return handled;
+
+ /* We should be able to handle the blit if we got this far */
+ handled = true;
+
+ /* Compute layers to copy */
+ uint32_t num_layers;
+ if (image->vk.image_type != VK_IMAGE_TYPE_3D)
+ num_layers = region->imageSubresource.layerCount;
+ else
+ num_layers = region->imageExtent.depth;
+ assert(num_layers > 0);
+
/* Copy requested layers */
+ VkResult result;
+ VkImageBlit2 blit_region;
+ uint32_t mip_level = region->imageSubresource.mipLevel;
+ uint32_t base_layer = region->imageSubresource.baseArrayLayer;
for (uint32_t i = 0; i < num_layers; i++) {
- /* Create the destination blit image from the destination buffer */
- VkImageCreateInfo image_info = {
- .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
- .imageType = VK_IMAGE_TYPE_2D,
- .format = dst_format,
- .extent = { buf_width, buf_height, 1 },
- .mipLevels = 1,
- .arrayLayers = 1,
- .samples = VK_SAMPLE_COUNT_1_BIT,
- .tiling = VK_IMAGE_TILING_LINEAR,
- .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT,
- .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
- .queueFamilyIndexCount = 0,
- .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
- };
-
- VkImage buffer_image;
- result =
- v3dv_CreateImage(_device, &image_info, &device->vk.alloc, &buffer_image);
- if (result != VK_SUCCESS)
- return handled;
+ uint32_t layer_offset = i;
+
+ if (vk_format_is_compressed(image->vk.format)) {
+ /* Our blit interface can see the real format of the images to detect
+ * copies between compressed and uncompressed images and adapt the
+ * blit region accordingly. Here we are just doing a raw copy of
+ * compressed data, but we are passing an uncompressed view of the
+ * buffer for the blit destination image (since compressed formats are
+ * not renderable), so we also want to provide an uncompressed view of
+ * the source image.
+ *
+ * It is important that we create the alias over the selected mip
+ * level (instead of aliasing the entire image) because an uncompressed
+ * view of the image won't have the same number of mip levels as the
+ * original image and the implicit mip size calculations the hw will
+ * do to sample from a non-zero mip level may not match exactly between
+ * compressed and uncompressed views.
+ */
+ VkImage alias;
+ if (!create_image_mip_level_alias(cmd_buffer, image, info.dst_format,
+ info.plane, mip_level,
+ base_layer + layer_offset,
+ &alias)) {
+ return handled;
+ }
- v3dv_cmd_buffer_add_private_obj(
- cmd_buffer, (uintptr_t)buffer_image,
- (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage);
+ /* We are aliasing the selected mip level and layer with a
+ * single-mip and single-layer image.
+ */
+ image = v3dv_image_from_handle(alias);
+ mip_level = 0;
+ base_layer = 0;
+ layer_offset = 0;
+ }
- /* Bind the buffer memory to the image */
- VkDeviceSize buffer_offset = buffer->mem_offset + region->bufferOffset +
- i * buf_width * buf_height * buffer_bpp;
+ /* Create the destination blit image from the destination buffer */
+ VkImage buffer_image;
result =
- vk_common_BindImageMemory(_device, buffer_image,
- v3dv_device_memory_to_handle(buffer->mem),
- buffer_offset);
+ create_image_from_buffer(cmd_buffer, buffer, region, &info,
+ i, &buffer_image);
if (result != VK_SUCCESS)
return handled;
@@ -676,48 +956,17 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer,
* image, but that we need to blit to a S8D24 destination (the only
* stencil format we support).
*/
- const VkImageBlit2KHR blit_region = {
- .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2_KHR,
- .srcSubresource = {
- .aspectMask = copy_aspect,
- .mipLevel = region->imageSubresource.mipLevel,
- .baseArrayLayer = region->imageSubresource.baseArrayLayer + i,
- .layerCount = 1,
- },
- .srcOffsets = {
- {
- DIV_ROUND_UP(region->imageOffset.x, block_width),
- DIV_ROUND_UP(region->imageOffset.y, block_height),
- region->imageOffset.z + i,
- },
- {
- DIV_ROUND_UP(region->imageOffset.x + region->imageExtent.width,
- block_width),
- DIV_ROUND_UP(region->imageOffset.y + region->imageExtent.height,
- block_height),
- region->imageOffset.z + i + 1,
- },
- },
- .dstSubresource = {
- .aspectMask = copy_aspect,
- .mipLevel = 0,
- .baseArrayLayer = 0,
- .layerCount = 1,
- },
- .dstOffsets = {
- { 0, 0, 0 },
- {
- DIV_ROUND_UP(region->imageExtent.width, block_width),
- DIV_ROUND_UP(region->imageExtent.height, block_height),
- 1
- },
- },
- };
+ blit_region =
+ blit_region_for_image_to_buffer(&region->imageOffset,
+ &region->imageExtent,
+ mip_level, base_layer, layer_offset,
+ &info);
handled = blit_shader(cmd_buffer,
- v3dv_image_from_handle(buffer_image), dst_format,
- image, src_format,
- cmask, &cswizzle,
+ v3dv_image_from_handle(buffer_image),
+ info.dst_format,
+ image, info.src_format,
+ info.cmask, &info.cswizzle,
&blit_region, VK_FILTER_NEAREST, false);
if (!handled) {
/* This is unexpected, we should have a supported blit spec */
@@ -730,9 +979,110 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer,
return true;
}
+static bool
+copy_image_linear_texel_buffer(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_image *dst,
+ struct v3dv_image *src,
+ const VkImageCopy2 *region);
+
+static VkImageCopy2
+image_copy_region_for_image_to_buffer(const VkBufferImageCopy2 *region,
+ struct image_to_buffer_info *info,
+ uint32_t layer)
+{
+ VkImageCopy2 output = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_COPY_2,
+ .srcSubresource = {
+ .aspectMask = info->src_copy_aspect,
+ .mipLevel = region->imageSubresource.mipLevel,
+ .baseArrayLayer = region->imageSubresource.baseArrayLayer + layer,
+ .layerCount = 1,
+ },
+ .srcOffset = {
+ DIV_ROUND_UP(region->imageOffset.x, info->block_width),
+ DIV_ROUND_UP(region->imageOffset.y, info->block_height),
+ region->imageOffset.z,
+ },
+ .dstSubresource = {
+ .aspectMask = info->dst_copy_aspect,
+ .mipLevel = 0,
+ .baseArrayLayer = 0,
+ .layerCount = 1,
+ },
+ .dstOffset = { 0, 0, 0 },
+ .extent = {
+ DIV_ROUND_UP(region->imageExtent.width, info->block_width),
+ DIV_ROUND_UP(region->imageExtent.height, info->block_height),
+ 1
+ },
+ };
+
+ return output;
+}
+
+/**
+ * Returns true if the implementation supports the requested operation (even if
+ * it failed to process it, for example, due to an out-of-memory error).
+ */
+static bool
+copy_image_to_buffer_texel_buffer(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_buffer *dst_buffer,
+ struct v3dv_image *src_image,
+ const VkBufferImageCopy2 *region)
+{
+ bool handled = false;
+ VkImage dst_buffer_image;
+ struct image_to_buffer_info info;
+
+ /* This is a requirement for copy_image_linear_texel_buffer below. We check
+ * it in advance in order to do an early return.
+ */
+ if (src_image->tiled)
+ return false;
+
+ handled =
+ gather_image_to_buffer_info(cmd_buffer, src_image, region,
+ &info);
+ if (!handled)
+ return handled;
+
+ /* At this point the implementation should support the copy; any possible
+ * errors below are for different reasons, like an out-of-memory error.
+ */
+ handled = true;
+
+ uint32_t num_layers;
+ if (src_image->vk.image_type != VK_IMAGE_TYPE_3D)
+ num_layers = region->imageSubresource.layerCount;
+ else
+ num_layers = region->imageExtent.depth;
+ assert(num_layers > 0);
+
+ VkResult result;
+ VkImageCopy2 image_region;
+ for (uint32_t layer = 0; layer < num_layers; layer++) {
+ /* Create the destination image from the destination buffer */
+ result =
+ create_image_from_buffer(cmd_buffer, dst_buffer, region, &info,
+ layer, &dst_buffer_image);
+ if (result != VK_SUCCESS)
+ return handled;
+
+ image_region =
+ image_copy_region_for_image_to_buffer(region, &info, layer);
+
+ handled =
+ copy_image_linear_texel_buffer(cmd_buffer,
+ v3dv_image_from_handle(dst_buffer_image),
+ src_image, &image_region);
+ }
+
+ return handled;
+}
+
VKAPI_ATTR void VKAPI_CALL
-v3dv_CmdCopyImageToBuffer2KHR(VkCommandBuffer commandBuffer,
- const VkCopyImageToBufferInfo2KHR *info)
+v3dv_CmdCopyImageToBuffer2(VkCommandBuffer commandBuffer,
+ const VkCopyImageToBufferInfo2 *info)
{
V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
@@ -741,13 +1091,23 @@ v3dv_CmdCopyImageToBuffer2KHR(VkCommandBuffer commandBuffer,
assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT);
+ cmd_buffer->state.is_transfer = true;
+
for (uint32_t i = 0; i < info->regionCount; i++) {
- if (copy_image_to_buffer_tlb(cmd_buffer, buffer, image, &info->pRegions[i]))
+ const VkBufferImageCopy2 *region = &info->pRegions[i];
+
+ if (copy_image_to_buffer_tlb(cmd_buffer, buffer, image, region))
continue;
- if (copy_image_to_buffer_blit(cmd_buffer, buffer, image, &info->pRegions[i]))
+
+ if (copy_image_to_buffer_blit(cmd_buffer, buffer, image, region))
continue;
+
+ if (copy_image_to_buffer_texel_buffer(cmd_buffer, buffer, image, region))
+ continue;
+
unreachable("Unsupported image to buffer copy.");
}
+ cmd_buffer->state.is_transfer = false;
}
/**
@@ -758,10 +1118,15 @@ static bool
copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_image *dst,
struct v3dv_image *src,
- const VkImageCopy2KHR *region)
+ const VkImageCopy2 *region)
{
+ if (V3D_DBG(DISABLE_TFU)) {
+ perf_debug("Copy images: TFU disabled, fallbacks could be slower.\n");
+ return false;
+ }
+
/* Destination can't be raster format */
- if (dst->vk.tiling == VK_IMAGE_TILING_LINEAR)
+ if (!dst->tiled)
return false;
/* We can only do full copies, so if the format is D24S8 both aspects need
@@ -772,7 +1137,7 @@ copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
const VkImageAspectFlags ds_aspects = VK_IMAGE_ASPECT_DEPTH_BIT |
VK_IMAGE_ASPECT_STENCIL_BIT;
if (region->dstSubresource.aspectMask != ds_aspects)
- return false;
+ return false;
}
/* Don't handle copies between uncompressed and compressed formats for now.
@@ -797,9 +1162,14 @@ copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
if (region->dstOffset.x != 0 || region->dstOffset.y != 0)
return false;
+ uint8_t src_plane =
+ v3dv_plane_from_aspect(region->srcSubresource.aspectMask);
+ uint8_t dst_plane =
+ v3dv_plane_from_aspect(region->dstSubresource.aspectMask);
+
const uint32_t dst_mip_level = region->dstSubresource.mipLevel;
- uint32_t dst_width = u_minify(dst->vk.extent.width, dst_mip_level);
- uint32_t dst_height = u_minify(dst->vk.extent.height, dst_mip_level);
+ uint32_t dst_width = u_minify(dst->planes[dst_plane].width, dst_mip_level);
+ uint32_t dst_height = u_minify(dst->planes[dst_plane].height, dst_mip_level);
if (region->extent.width != dst_width || region->extent.height != dst_height)
return false;
@@ -809,8 +1179,10 @@ copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
* members represent the texel dimensions of the source image and not
* the destination."
*/
- const uint32_t block_w = vk_format_get_blockwidth(src->vk.format);
- const uint32_t block_h = vk_format_get_blockheight(src->vk.format);
+ const uint32_t block_w =
+ vk_format_get_blockwidth(src->planes[src_plane].vk_format);
+ const uint32_t block_h =
+ vk_format_get_blockheight(src->planes[src_plane].vk_format);
uint32_t width = DIV_ROUND_UP(region->extent.width, block_w);
uint32_t height = DIV_ROUND_UP(region->extent.height, block_h);
@@ -834,10 +1206,10 @@ copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
* the underlying pixel data according to its format, we can always choose
* to use compatible formats that are supported with the TFU unit.
*/
- assert(dst->cpp == src->cpp);
+ assert(dst->planes[dst_plane].cpp == src->planes[src_plane].cpp);
const struct v3dv_format *format =
v3dv_get_compatible_tfu_format(cmd_buffer->device,
- dst->cpp, NULL);
+ dst->planes[dst_plane].cpp, NULL);
/* Emit a TFU job for each layer to blit */
const uint32_t layer_count = dst->vk.image_type != VK_IMAGE_TYPE_3D ?
@@ -850,15 +1222,47 @@ copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
const uint32_t base_dst_layer = dst->vk.image_type != VK_IMAGE_TYPE_3D ?
region->dstSubresource.baseArrayLayer : region->dstOffset.z;
for (uint32_t i = 0; i < layer_count; i++) {
- v3dv_X(cmd_buffer->device, meta_emit_tfu_job)
- (cmd_buffer, dst, dst_mip_level, base_dst_layer + i,
- src, src_mip_level, base_src_layer + i,
- width, height, format);
+ const uint32_t dst_offset =
+ dst->planes[dst_plane].mem->bo->offset +
+ v3dv_layer_offset(dst, dst_mip_level, base_dst_layer + i, dst_plane);
+ const uint32_t src_offset =
+ src->planes[src_plane].mem->bo->offset +
+ v3dv_layer_offset(src, src_mip_level, base_src_layer + i, src_plane);
+
+ const struct v3d_resource_slice *dst_slice =
+ &dst->planes[dst_plane].slices[dst_mip_level];
+ const struct v3d_resource_slice *src_slice =
+ &src->planes[src_plane].slices[src_mip_level];
+
+ v3dv_X(cmd_buffer->device, meta_emit_tfu_job)(
+ cmd_buffer,
+ dst->planes[dst_plane].mem->bo->handle,
+ dst_offset,
+ dst_slice->tiling,
+ dst_slice->padded_height,
+ dst->planes[dst_plane].cpp,
+ src->planes[src_plane].mem->bo->handle,
+ src_offset,
+ src_slice->tiling,
+ src_slice->tiling == V3D_TILING_RASTER ?
+ src_slice->stride : src_slice->padded_height,
+ src->planes[src_plane].cpp,
+ /* All compatible TFU formats are single-plane */
+ width, height, &format->planes[0]);
}
return true;
}
+inline bool
+v3dv_cmd_buffer_copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_image *dst,
+ struct v3dv_image *src,
+ const VkImageCopy2 *region)
+{
+ return copy_image_tfu(cmd_buffer, dst, src, region);
+}
+
/**
* Returns true if the implementation supports the requested operation (even if
* it failed to process it, for example, due to an out-of-memory error).
@@ -867,11 +1271,20 @@ static bool
copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_image *dst,
struct v3dv_image *src,
- const VkImageCopy2KHR *region)
+ const VkImageCopy2 *region)
{
+ uint8_t src_plane =
+ v3dv_plane_from_aspect(region->srcSubresource.aspectMask);
+ assert(src_plane < src->plane_count);
+ uint8_t dst_plane =
+ v3dv_plane_from_aspect(region->dstSubresource.aspectMask);
+ assert(dst_plane < dst->plane_count);
+
VkFormat fb_format;
- if (!v3dv_meta_can_use_tlb(src, &region->srcOffset, &fb_format) ||
- !v3dv_meta_can_use_tlb(dst, &region->dstOffset, &fb_format)) {
+ if (!v3dv_meta_can_use_tlb(src, src_plane, region->srcSubresource.mipLevel,
+ &region->srcOffset, NULL, &fb_format) ||
+ !v3dv_meta_can_use_tlb(dst, dst_plane, region->dstSubresource.mipLevel,
+ &region->dstOffset, &region->extent, &fb_format)) {
return false;
}
@@ -881,7 +1294,8 @@ copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
* dstImage has a multi-planar image format then the aspectMask member
* of srcSubresource and dstSubresource must match."
*/
- assert(region->dstSubresource.aspectMask ==
+ assert(src->plane_count != 1 || dst->plane_count != 1 ||
+ region->dstSubresource.aspectMask ==
region->srcSubresource.aspectMask);
uint32_t internal_type, internal_bpp;
v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects)
@@ -911,12 +1325,15 @@ copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
return true;
/* Handle copy to compressed image using compatible format */
- const uint32_t block_w = vk_format_get_blockwidth(dst->vk.format);
- const uint32_t block_h = vk_format_get_blockheight(dst->vk.format);
+ const uint32_t block_w =
+ vk_format_get_blockwidth(dst->planes[dst_plane].vk_format);
+ const uint32_t block_h =
+ vk_format_get_blockheight(dst->planes[dst_plane].vk_format);
const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w);
const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h);
- v3dv_job_start_frame(job, width, height, num_layers, false, 1, internal_bpp,
+ v3dv_job_start_frame(job, width, height, num_layers, false, true, 1,
+ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
src->vk.samples > VK_SAMPLE_COUNT_1_BIT);
struct v3dv_meta_framebuffer framebuffer;
@@ -951,6 +1368,8 @@ create_image_alias(struct v3dv_cmd_buffer *cmd_buffer,
VkFormat format)
{
assert(!vk_format_is_compressed(format));
+ /* We don't support YCbCr compressed formats */
+ assert(src->plane_count == 1);
VkDevice _device = v3dv_device_to_handle(cmd_buffer->device);
@@ -966,7 +1385,7 @@ create_image_alias(struct v3dv_cmd_buffer *cmd_buffer,
.mipLevels = src->vk.mip_levels,
.arrayLayers = src->vk.array_layers,
.samples = src->vk.samples,
- .tiling = src->vk.tiling,
+ .tiling = src->tiled ? VK_IMAGE_TILING_OPTIMAL : VK_IMAGE_TILING_LINEAR,
.usage = src->vk.usage,
};
@@ -979,8 +1398,8 @@ create_image_alias(struct v3dv_cmd_buffer *cmd_buffer,
}
struct v3dv_image *image = v3dv_image_from_handle(_image);
- image->mem = src->mem;
- image->mem_offset = src->mem_offset;
+ image->planes[0].mem = src->planes[0].mem;
+ image->planes[0].mem_offset = src->planes[0].mem_offset;
return image;
}
@@ -992,12 +1411,26 @@ static bool
copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_image *dst,
struct v3dv_image *src,
- const VkImageCopy2KHR *region)
+ const VkImageCopy2 *region)
{
- const uint32_t src_block_w = vk_format_get_blockwidth(src->vk.format);
- const uint32_t src_block_h = vk_format_get_blockheight(src->vk.format);
- const uint32_t dst_block_w = vk_format_get_blockwidth(dst->vk.format);
- const uint32_t dst_block_h = vk_format_get_blockheight(dst->vk.format);
+ if (!src->tiled && src->vk.image_type != VK_IMAGE_TYPE_1D)
+ return false;
+
+ uint8_t src_plane =
+ v3dv_plane_from_aspect(region->srcSubresource.aspectMask);
+ assert(src_plane < src->plane_count);
+ uint8_t dst_plane =
+ v3dv_plane_from_aspect(region->dstSubresource.aspectMask);
+ assert(dst_plane < dst->plane_count);
+
+ const uint32_t src_block_w =
+ vk_format_get_blockwidth(src->planes[src_plane].vk_format);
+ const uint32_t src_block_h =
+ vk_format_get_blockheight(src->planes[src_plane].vk_format);
+ const uint32_t dst_block_w =
+ vk_format_get_blockwidth(dst->planes[dst_plane].vk_format);
+ const uint32_t dst_block_h =
+ vk_format_get_blockheight(dst->planes[dst_plane].vk_format);
const float block_scale_w = (float)src_block_w / (float)dst_block_w;
const float block_scale_h = (float)src_block_h / (float)dst_block_h;
@@ -1033,10 +1466,10 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
* divisors for the width and height depending on the source image's
* bpp.
*/
- assert(src->cpp == dst->cpp);
+ assert(src->planes[src_plane].cpp == dst->planes[dst_plane].cpp);
format = VK_FORMAT_R32G32_UINT;
- switch (src->cpp) {
+ switch (src->planes[src_plane].cpp) {
case 16:
format = VK_FORMAT_R32G32B32A32_UINT;
break;
@@ -1061,13 +1494,15 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
dst = create_image_alias(cmd_buffer, dst,
dst_scale_w, dst_scale_h, format);
} else {
- format = src->format->rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO ?
- src->vk.format : get_compatible_tlb_format(src->vk.format);
+ format = src->format->planes[src_plane].rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO ?
+ src->planes[src_plane].vk_format :
+ get_compatible_tlb_format(src->planes[src_plane].vk_format);
if (format == VK_FORMAT_UNDEFINED)
return false;
const struct v3dv_format *f = v3dv_X(cmd_buffer->device, get_format)(format);
- if (!f->supported || f->tex_type == TEXTURE_DATA_FORMAT_NO)
+ assert(f->plane_count < 2);
+ if (!f->plane_count || f->planes[0].tex_type == TEXTURE_DATA_FORMAT_NO)
return false;
}
@@ -1090,14 +1525,21 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
* (since the region dimensions are already specified in terms of the source
* image).
*/
+ uint32_t region_width = region->extent.width * src_scale_w;
+ uint32_t region_height = region->extent.height * src_scale_h;
+ if (src_block_w > 1)
+ region_width = util_next_power_of_two(region_width);
+ if (src_block_h > 1)
+ region_height = util_next_power_of_two(region_height);
+
const VkOffset3D src_start = {
region->srcOffset.x * src_scale_w,
region->srcOffset.y * src_scale_h,
region->srcOffset.z,
};
const VkOffset3D src_end = {
- src_start.x + region->extent.width * src_scale_w,
- src_start.y + region->extent.height * src_scale_h,
+ src_start.x + region_width,
+ src_start.y + region_height,
src_start.z + region->extent.depth,
};
@@ -1107,13 +1549,13 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
region->dstOffset.z,
};
const VkOffset3D dst_end = {
- dst_start.x + region->extent.width * src_scale_w,
- dst_start.y + region->extent.height * src_scale_h,
+ dst_start.x + region_width,
+ dst_start.y + region_height,
dst_start.z + region->extent.depth,
};
- const VkImageBlit2KHR blit_region = {
- .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2_KHR,
+ const VkImageBlit2 blit_region = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2,
.srcSubresource = region->srcSubresource,
.srcOffsets = { src_start, src_end },
.dstSubresource = region->dstSubresource,
@@ -1130,9 +1572,113 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
return handled;
}
+static bool
+copy_image_linear_texel_buffer(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_image *dst,
+ struct v3dv_image *src,
+ const VkImageCopy2 *region)
+{
+ if (src->tiled)
+ return false;
+
+ /* Implementations are allowed to restrict linear images like this */
+ assert(region->srcOffset.z == 0);
+ assert(region->dstOffset.z == 0);
+ assert(region->srcSubresource.mipLevel == 0);
+ assert(region->srcSubresource.baseArrayLayer == 0);
+ assert(region->srcSubresource.layerCount == 1);
+ assert(region->dstSubresource.mipLevel == 0);
+ assert(region->dstSubresource.baseArrayLayer == 0);
+ assert(region->dstSubresource.layerCount == 1);
+
+ uint8_t src_plane =
+ v3dv_plane_from_aspect(region->srcSubresource.aspectMask);
+ uint8_t dst_plane =
+ v3dv_plane_from_aspect(region->dstSubresource.aspectMask);
+
+ assert(src->planes[src_plane].cpp == dst->planes[dst_plane].cpp);
+ const uint32_t bpp = src->planes[src_plane].cpp;
+
+ VkFormat format;
+ switch (bpp) {
+ case 16:
+ format = VK_FORMAT_R32G32B32A32_UINT;
+ break;
+ case 8:
+ format = VK_FORMAT_R16G16B16A16_UINT;
+ break;
+ case 4:
+ format = VK_FORMAT_R8G8B8A8_UINT;
+ break;
+ case 2:
+ format = VK_FORMAT_R16_UINT;
+ break;
+ case 1:
+ format = VK_FORMAT_R8_UINT;
+ break;
+ default:
+ unreachable("unsupported bit-size");
+ return false;
+ }
+
+ VkComponentMapping ident_swizzle = {
+ .r = VK_COMPONENT_SWIZZLE_IDENTITY,
+ .g = VK_COMPONENT_SWIZZLE_IDENTITY,
+ .b = VK_COMPONENT_SWIZZLE_IDENTITY,
+ .a = VK_COMPONENT_SWIZZLE_IDENTITY,
+ };
+
+ const uint32_t buf_stride = src->planes[src_plane].slices[0].stride;
+ const VkDeviceSize buf_offset =
+ region->srcOffset.y * buf_stride + region->srcOffset.x * bpp;
+
+ struct v3dv_buffer src_buffer;
+ vk_object_base_init(&cmd_buffer->device->vk, &src_buffer.base,
+ VK_OBJECT_TYPE_BUFFER);
+
+ const struct VkBufferCreateInfo buf_create_info = {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+ .size = src->planes[src_plane].size,
+ .usage = VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT,
+ .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
+ };
+ v3dv_buffer_init(cmd_buffer->device, &buf_create_info, &src_buffer,
+ src->planes[src_plane].alignment);
+
+ const VkBindBufferMemoryInfo buf_bind_info = {
+ .sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO,
+ .buffer = v3dv_buffer_to_handle(&src_buffer),
+ .memory = v3dv_device_memory_to_handle(src->planes[src_plane].mem),
+ .memoryOffset = src->planes[src_plane].mem_offset +
+ v3dv_layer_offset(src, 0, 0, src_plane),
+ };
+ v3dv_buffer_bind_memory(&buf_bind_info);
+
+ const VkBufferImageCopy2 copy_region = {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_IMAGE_COPY_2,
+ .pNext = NULL,
+ .bufferOffset = buf_offset,
+ .bufferRowLength = buf_stride / bpp,
+ .bufferImageHeight = src->vk.extent.height,
+ .imageSubresource = region->dstSubresource,
+ .imageOffset = region->dstOffset,
+ .imageExtent = region->extent,
+ };
+
+ return texel_buffer_shader_copy(cmd_buffer,
+ region->dstSubresource.aspectMask,
+ dst,
+ format,
+ format,
+ &src_buffer,
+ src->planes[src_plane].cpp,
+ 0 /* color mask: full */, &ident_swizzle,
+ 1, &copy_region);
+}
+
VKAPI_ATTR void VKAPI_CALL
-v3dv_CmdCopyImage2KHR(VkCommandBuffer commandBuffer,
- const VkCopyImageInfo2KHR *info)
+v3dv_CmdCopyImage2(VkCommandBuffer commandBuffer,
+ const VkCopyImageInfo2 *info)
{
V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
@@ -1141,25 +1687,34 @@ v3dv_CmdCopyImage2KHR(VkCommandBuffer commandBuffer,
assert(src->vk.samples == dst->vk.samples);
+ cmd_buffer->state.is_transfer = true;
+
for (uint32_t i = 0; i < info->regionCount; i++) {
- if (copy_image_tfu(cmd_buffer, dst, src, &info->pRegions[i]))
+ const VkImageCopy2 *region = &info->pRegions[i];
+ if (copy_image_tfu(cmd_buffer, dst, src, region))
+ continue;
+ if (copy_image_tlb(cmd_buffer, dst, src, region))
continue;
- if (copy_image_tlb(cmd_buffer, dst, src, &info->pRegions[i]))
+ if (copy_image_blit(cmd_buffer, dst, src, region))
continue;
- if (copy_image_blit(cmd_buffer, dst, src, &info->pRegions[i]))
+ if (copy_image_linear_texel_buffer(cmd_buffer, dst, src, region))
continue;
unreachable("Image copy not supported");
}
+
+ cmd_buffer->state.is_transfer = false;
}
VKAPI_ATTR void VKAPI_CALL
-v3dv_CmdCopyBuffer2KHR(VkCommandBuffer commandBuffer,
- const VkCopyBufferInfo2KHR *pCopyBufferInfo)
+v3dv_CmdCopyBuffer2(VkCommandBuffer commandBuffer,
+ const VkCopyBufferInfo2 *pCopyBufferInfo)
{
V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
V3DV_FROM_HANDLE(v3dv_buffer, src_buffer, pCopyBufferInfo->srcBuffer);
V3DV_FROM_HANDLE(v3dv_buffer, dst_buffer, pCopyBufferInfo->dstBuffer);
+ cmd_buffer->state.is_transfer = true;
+
for (uint32_t i = 0; i < pCopyBufferInfo->regionCount; i++) {
v3dv_X(cmd_buffer->device, meta_copy_buffer)
(cmd_buffer,
@@ -1167,6 +1722,8 @@ v3dv_CmdCopyBuffer2KHR(VkCommandBuffer commandBuffer,
src_buffer->mem->bo, src_buffer->mem_offset,
&pCopyBufferInfo->pRegions[i]);
}
+
+ cmd_buffer->state.is_transfer = false;
}
static void
@@ -1202,12 +1759,14 @@ v3dv_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
return;
}
+ cmd_buffer->state.is_transfer = true;
+
memcpy(src_bo->map, pData, dataSize);
v3dv_bo_unmap(cmd_buffer->device, src_bo);
- VkBufferCopy2KHR region = {
- .sType = VK_STRUCTURE_TYPE_BUFFER_COPY_2_KHR,
+ VkBufferCopy2 region = {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_COPY_2,
.srcOffset = 0,
.dstOffset = dstOffset,
.size = dataSize,
@@ -1217,11 +1776,12 @@ v3dv_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
(cmd_buffer, dst_buffer->mem->bo, dst_buffer->mem_offset,
src_bo, 0, &region);
- if (!copy_job)
- return;
+ if (copy_job) {
+ v3dv_cmd_buffer_add_private_obj(
+ cmd_buffer, (uint64_t)(uintptr_t)src_bo, destroy_update_buffer_cb);
+ }
- v3dv_cmd_buffer_add_private_obj(
- cmd_buffer, (uint64_t)(uintptr_t)src_bo, destroy_update_buffer_cb);
+ cmd_buffer->state.is_transfer = false;
}
VKAPI_ATTR void VKAPI_CALL
@@ -1234,6 +1794,8 @@ v3dv_CmdFillBuffer(VkCommandBuffer commandBuffer,
V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
V3DV_FROM_HANDLE(v3dv_buffer, dst_buffer, dstBuffer);
+ cmd_buffer->state.is_transfer = true;
+
struct v3dv_bo *bo = dst_buffer->mem->bo;
/* From the Vulkan spec:
@@ -1248,6 +1810,8 @@ v3dv_CmdFillBuffer(VkCommandBuffer commandBuffer,
v3dv_X(cmd_buffer->device, meta_fill_buffer)
(cmd_buffer, bo, dstOffset, size, data);
+
+ cmd_buffer->state.is_transfer = false;
}
/**
@@ -1258,19 +1822,24 @@ static bool
copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_image *image,
struct v3dv_buffer *buffer,
- const VkBufferImageCopy2KHR *region)
+ const VkBufferImageCopy2 *region)
{
+ if (V3D_DBG(DISABLE_TFU)) {
+ perf_debug("Copy buffer to image: TFU disabled, fallbacks could be slower.\n");
+ return false;
+ }
+
assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT);
/* Destination can't be raster format */
- if (image->vk.tiling == VK_IMAGE_TILING_LINEAR)
+ if (!image->tiled)
return false;
/* We can't copy D24S8 because buffer to image copies only copy one aspect
* at a time, and the TFU copies full images. Also, V3D depth bits for
 * both D24S8 and D24X8 are stored in the 24-bit MSB of each 32-bit word, but
* the Vulkan spec has the buffer data specified the other way around, so it
- * is not a straight copy, we would havew to swizzle the channels, which the
+ * is not a straight copy, we would have to swizzle the channels, which the
* TFU can't do.
*/
if (image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT ||
@@ -1295,12 +1864,20 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
else
height = region->bufferImageHeight;
- if (width != image->vk.extent.width || height != image->vk.extent.height)
+ const uint8_t plane =
+ v3dv_plane_from_aspect(region->imageSubresource.aspectMask);
+
+ const uint32_t mip_level = region->imageSubresource.mipLevel;
+ const struct v3d_resource_slice *slice = &image->planes[plane].slices[mip_level];
+
+ if (width != slice->width || height != slice->height)
return false;
/* Handle region semantics for compressed images */
- const uint32_t block_w = vk_format_get_blockwidth(image->vk.format);
- const uint32_t block_h = vk_format_get_blockheight(image->vk.format);
+ const uint32_t block_w =
+ vk_format_get_blockwidth(image->planes[plane].vk_format);
+ const uint32_t block_h =
+ vk_format_get_blockheight(image->planes[plane].vk_format);
width = DIV_ROUND_UP(width, block_w);
height = DIV_ROUND_UP(height, block_h);
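/* Illustrative example (not part of this change): for a compressed format with
 * 4x4 blocks, a 16x8 texel region is programmed as a 4x2 copy in block units:
 *   width  = DIV_ROUND_UP(16, 4) = 4
 *   height = DIV_ROUND_UP(8, 4)  = 2
 */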
@@ -1311,10 +1888,10 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
*/
const struct v3dv_format *format =
v3dv_get_compatible_tfu_format(cmd_buffer->device,
- image->cpp, NULL);
-
- const uint32_t mip_level = region->imageSubresource.mipLevel;
- const struct v3d_resource_slice *slice = &image->slices[mip_level];
+ image->planes[plane].cpp, NULL);
+ /* We only use single-plane formats with the TFU */
+ assert(format->plane_count == 1);
+ const struct v3dv_format_plane *format_plane = &format->planes[0];
uint32_t num_layers;
if (image->vk.image_type != VK_IMAGE_TYPE_3D)
@@ -1323,14 +1900,14 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
num_layers = region->imageExtent.depth;
assert(num_layers > 0);
- assert(image->mem && image->mem->bo);
- const struct v3dv_bo *dst_bo = image->mem->bo;
+ assert(image->planes[plane].mem && image->planes[plane].mem->bo);
+ const struct v3dv_bo *dst_bo = image->planes[plane].mem->bo;
assert(buffer->mem && buffer->mem->bo);
const struct v3dv_bo *src_bo = buffer->mem->bo;
/* Emit a TFU job per layer to copy */
- const uint32_t buffer_stride = width * image->cpp;
+ const uint32_t buffer_stride = width * image->planes[plane].cpp;
for (int i = 0; i < num_layers; i++) {
uint32_t layer;
if (image->vk.image_type != VK_IMAGE_TYPE_3D)
@@ -1338,46 +1915,27 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
else
layer = region->imageOffset.z + i;
- struct drm_v3d_submit_tfu tfu = {
- .ios = (height << 16) | width,
- .bo_handles = {
- dst_bo->handle,
- src_bo->handle != dst_bo->handle ? src_bo->handle : 0
- },
- };
-
const uint32_t buffer_offset =
buffer->mem_offset + region->bufferOffset +
height * buffer_stride * i;
-
const uint32_t src_offset = src_bo->offset + buffer_offset;
- tfu.iia |= src_offset;
- tfu.icfg |= V3D_TFU_ICFG_FORMAT_RASTER << V3D_TFU_ICFG_FORMAT_SHIFT;
- tfu.iis |= width;
const uint32_t dst_offset =
- dst_bo->offset + v3dv_layer_offset(image, mip_level, layer);
- tfu.ioa |= dst_offset;
-
- tfu.ioa |= (V3D_TFU_IOA_FORMAT_LINEARTILE +
- (slice->tiling - V3D_TILING_LINEARTILE)) <<
- V3D_TFU_IOA_FORMAT_SHIFT;
- tfu.icfg |= format->tex_type << V3D_TFU_ICFG_TTYPE_SHIFT;
-
- /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the
- * OPAD field for the destination (how many extra UIF blocks beyond
- * those necessary to cover the height).
- */
- if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
- slice->tiling == V3D_TILING_UIF_XOR) {
- uint32_t uif_block_h = 2 * v3d_utile_height(image->cpp);
- uint32_t implicit_padded_height = align(height, uif_block_h);
- uint32_t icfg =
- (slice->padded_height - implicit_padded_height) / uif_block_h;
- tfu.icfg |= icfg << V3D_TFU_ICFG_OPAD_SHIFT;
- }
-
- v3dv_cmd_buffer_add_tfu_job(cmd_buffer, &tfu);
+ dst_bo->offset + v3dv_layer_offset(image, mip_level, layer, plane);
+
+ v3dv_X(cmd_buffer->device, meta_emit_tfu_job)(
+ cmd_buffer,
+ dst_bo->handle,
+ dst_offset,
+ slice->tiling,
+ slice->padded_height,
+ image->planes[plane].cpp,
+ src_bo->handle,
+ src_offset,
+ V3D_TILING_RASTER,
+ width,
+ 1,
+ width, height, format_plane);
}
return true;
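/* Illustrative sketch (not part of this change): the D24 layout mismatch noted
 * earlier in copy_buffer_to_image_tfu(). Vulkan buffer data keeps the 24-bit
 * depth value in the low bits of each 32-bit word, while V3D expects it in the
 * high bits, so a raw copy would need a per-word shuffle along these lines,
 * which the TFU cannot do:
 */
static inline uint32_t
d24_buffer_word_to_v3d_layout(uint32_t buffer_word)
{
   /* Depth moves from bits 23:0 to bits 31:8; the top byte of the buffer word
    * is undefined for a depth-aspect copy, so it can be dropped.
    */
   return buffer_word << 8;
}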
@@ -1391,11 +1949,17 @@ static bool
copy_buffer_to_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_image *image,
struct v3dv_buffer *buffer,
- const VkBufferImageCopy2KHR *region)
+ const VkBufferImageCopy2 *region)
{
VkFormat fb_format;
- if (!v3dv_meta_can_use_tlb(image, &region->imageOffset, &fb_format))
+ uint8_t plane = v3dv_plane_from_aspect(region->imageSubresource.aspectMask);
+ assert(plane < image->plane_count);
+
+ if (!v3dv_meta_can_use_tlb(image, plane, region->imageSubresource.mipLevel,
+ &region->imageOffset, &region->imageExtent,
+ &fb_format)) {
return false;
+ }
uint32_t internal_type, internal_bpp;
v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects)
@@ -1415,13 +1979,16 @@ copy_buffer_to_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
return true;
/* Handle copy to compressed format using a compatible format */
- const uint32_t block_w = vk_format_get_blockwidth(image->vk.format);
- const uint32_t block_h = vk_format_get_blockheight(image->vk.format);
+ const uint32_t block_w =
+ vk_format_get_blockwidth(image->planes[plane].vk_format);
+ const uint32_t block_h =
+ vk_format_get_blockheight(image->planes[plane].vk_format);
const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w);
const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h);
- v3dv_job_start_frame(job, width, height, num_layers, false,
- 1, internal_bpp, false);
+ v3dv_job_start_frame(job, width, height, num_layers, false, true, 1,
+ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
+ false);
struct v3dv_meta_framebuffer framebuffer;
v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
@@ -1440,7 +2007,7 @@ static bool
create_tiled_image_from_buffer(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_image *image,
struct v3dv_buffer *buffer,
- const VkBufferImageCopy2KHR *region)
+ const VkBufferImageCopy2 *region)
{
if (copy_buffer_to_image_tfu(cmd_buffer, image, buffer, region))
return true;
@@ -1569,8 +2136,6 @@ create_blit_render_pass(struct v3dv_device *device,
VkRenderPass *pass_load,
VkRenderPass *pass_no_load);
-static nir_ssa_def *gen_rect_vertices(nir_builder *b);
-
static bool
create_pipeline(struct v3dv_device *device,
struct v3dv_render_pass *pass,
@@ -1595,7 +2160,7 @@ get_texel_buffer_copy_vs()
glsl_vec4_type(), "gl_Position");
vs_out_pos->data.location = VARYING_SLOT_POS;
- nir_ssa_def *pos = gen_rect_vertices(&b);
+ nir_def *pos = nir_gen_rect_vertices(&b, NULL, NULL);
nir_store_var(&b, vs_out_pos, pos, 0xf);
return b.shader;
@@ -1618,8 +2183,8 @@ get_texel_buffer_copy_gs()
nir->info.inputs_read = 1ull << VARYING_SLOT_POS;
nir->info.outputs_written = (1ull << VARYING_SLOT_POS) |
(1ull << VARYING_SLOT_LAYER);
- nir->info.gs.input_primitive = GL_TRIANGLES;
- nir->info.gs.output_primitive = GL_TRIANGLE_STRIP;
+ nir->info.gs.input_primitive = MESA_PRIM_TRIANGLES;
+ nir->info.gs.output_primitive = MESA_PRIM_TRIANGLE_STRIP;
nir->info.gs.vertices_in = 3;
nir->info.gs.vertices_out = 3;
nir->info.gs.invocations = 1;
@@ -1652,7 +2217,7 @@ get_texel_buffer_copy_gs()
nir_copy_deref(&b, nir_build_deref_var(&b, gs_out_pos), in_pos_i);
/* gl_Layer from push constants */
- nir_ssa_def *layer =
+ nir_def *layer =
nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0),
.base = TEXEL_BUFFER_COPY_GS_LAYER_PC_OFFSET,
.range = 4);
@@ -1666,7 +2231,7 @@ get_texel_buffer_copy_gs()
return nir;
}
-static nir_ssa_def *
+static nir_def *
load_frag_coord(nir_builder *b)
{
nir_foreach_shader_in_variable(var, b->shader) {
@@ -1730,24 +2295,24 @@ get_texel_buffer_copy_fs(struct v3dv_device *device, VkFormat format,
/* Load the box describing the pixel region we want to copy from the
* texel buffer.
*/
- nir_ssa_def *box =
+ nir_def *box =
nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0),
.base = TEXEL_BUFFER_COPY_FS_BOX_PC_OFFSET,
.range = 16);
/* Load the buffer stride (this comes in texel units) */
- nir_ssa_def *stride =
+ nir_def *stride =
nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0),
.base = TEXEL_BUFFER_COPY_FS_STRIDE_PC_OFFSET,
.range = 4);
/* Load the buffer offset (this comes in texel units) */
- nir_ssa_def *offset =
+ nir_def *offset =
nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0),
.base = TEXEL_BUFFER_COPY_FS_OFFSET_PC_OFFSET,
.range = 4);
- nir_ssa_def *coord = nir_f2i32(&b, load_frag_coord(&b));
+ nir_def *coord = nir_f2i32(&b, load_frag_coord(&b));
/* Load pixel data from texel buffer based on the x,y offset of the pixel
* within the box. Texel buffers are 1D arrays of texels.
@@ -1757,28 +2322,26 @@ get_texel_buffer_copy_fs(struct v3dv_device *device, VkFormat format,
 * texel buffer should always be within its bounds and we don't need
* to add a check for that here.
*/
- nir_ssa_def *x_offset =
+ nir_def *x_offset =
nir_isub(&b, nir_channel(&b, coord, 0),
nir_channel(&b, box, 0));
- nir_ssa_def *y_offset =
+ nir_def *y_offset =
nir_isub(&b, nir_channel(&b, coord, 1),
nir_channel(&b, box, 1));
- nir_ssa_def *texel_offset =
+ nir_def *texel_offset =
nir_iadd(&b, nir_iadd(&b, offset, x_offset),
nir_imul(&b, y_offset, stride));
- nir_ssa_def *tex_deref = &nir_build_deref_var(&b, sampler)->dest.ssa;
+ nir_def *tex_deref = &nir_build_deref_var(&b, sampler)->def;
nir_tex_instr *tex = nir_tex_instr_create(b.shader, 2);
tex->sampler_dim = GLSL_SAMPLER_DIM_BUF;
tex->op = nir_texop_txf;
- tex->src[0].src_type = nir_tex_src_coord;
- tex->src[0].src = nir_src_for_ssa(texel_offset);
- tex->src[1].src_type = nir_tex_src_texture_deref;
- tex->src[1].src = nir_src_for_ssa(tex_deref);
+ tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, texel_offset);
+ tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_texture_deref, tex_deref);
tex->dest_type = nir_type_uint32;
tex->is_array = false;
tex->coord_components = 1;
- nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "texel buffer result");
+ nir_def_init(&tex->instr, &tex->def, 4, 32);
nir_builder_instr_insert(&b, &tex->instr);
uint32_t swiz[4];
@@ -1790,7 +2353,7 @@ get_texel_buffer_copy_fs(struct v3dv_device *device, VkFormat format,
component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_B, cswizzle->b);
swiz[3] =
component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_A, cswizzle->a);
- nir_ssa_def *s = nir_swizzle(&b, &tex->dest.ssa, swiz, 4);
+ nir_def *s = nir_swizzle(&b, &tex->def, swiz, 4);
nir_store_var(&b, fs_out_color, s, 0xf);
return b.shader;
@@ -1876,7 +2439,7 @@ get_copy_texel_buffer_pipeline(
mtx_lock(&device->meta.mtx);
struct hash_entry *entry =
_mesa_hash_table_search(device->meta.texel_buffer_copy.cache[image_type],
- &key);
+ key);
if (entry) {
mtx_unlock(&device->meta.mtx);
*pipeline = entry->data;
@@ -1905,8 +2468,10 @@ get_copy_texel_buffer_pipeline(
if (!ok)
goto fail;
+ uint8_t *dupkey = malloc(V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE);
+ memcpy(dupkey, key, V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE);
_mesa_hash_table_insert(device->meta.texel_buffer_copy.cache[image_type],
- &key, *pipeline);
+ dupkey, *pipeline);
mtx_unlock(&device->meta.mtx);
return true;
@@ -1938,7 +2503,7 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer,
VkColorComponentFlags cmask,
VkComponentMapping *cswizzle,
uint32_t region_count,
- const VkBufferImageCopy2KHR *regions)
+ const VkBufferImageCopy2 *regions)
{
VkResult result;
bool handled = false;
@@ -1957,7 +2522,7 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer,
/* We only handle color copies. Callers can copy D/S aspects by using
* a compatible color format and maybe a cmask/cswizzle for D24 formats.
*/
- if (aspect != VK_IMAGE_ASPECT_COLOR_BIT)
+ if (!vk_format_is_color(dst_format) || !vk_format_is_color(src_format))
return handled;
/* FIXME: we only handle uncompressed images for now. */
@@ -1978,7 +2543,7 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer,
if (!(buffer->usage & VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT)) {
if (v3dv_buffer_format_supports_features(
cmd_buffer->device, src_format,
- VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT)) {
+ VK_FORMAT_FEATURE_2_UNIFORM_TEXEL_BUFFER_BIT)) {
buffer->usage |= VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT;
} else {
return handled;
@@ -2027,13 +2592,10 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer,
if (result != VK_SUCCESS)
return handled;
- /* FIXME: for some reason passing region->bufferOffset here for the
- * offset field doesn't work, making the following CTS tests fail:
- *
- * dEQP-VK.api.copy_and_blit.core.buffer_to_image.*buffer_offset*
- *
- * So instead we pass 0 here and we pass the offset in texels as a push
- * constant to the shader, which seems to work correctly.
+ /* We can't pass region->bufferOffset here for the offset field because
+ * the texture base pointer in the texture shader state must be a 64-byte
+ * aligned value. Instead, we use 0 here and we pass the offset in texels
+ * as a push constant to the shader.
*/
VkDevice _device = v3dv_device_to_handle(cmd_buffer->device);
VkBufferViewCreateInfo buffer_view_info = {
@@ -2068,7 +2630,6 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer,
/* Push command buffer state before starting meta operation */
v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);
- uint32_t dirty_dynamic_state = 0;
/* Bind common state for all layers and regions */
VkCommandBuffer _cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
@@ -2087,8 +2648,10 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer,
* For 3D images, this creates a layered framebuffer with a number of
* layers matching the depth extent of the 3D image.
*/
- uint32_t fb_width = u_minify(image->vk.extent.width, resource->mipLevel);
- uint32_t fb_height = u_minify(image->vk.extent.height, resource->mipLevel);
+ uint8_t plane = v3dv_plane_from_aspect(aspect);
+ uint32_t fb_width = u_minify(image->planes[plane].width, resource->mipLevel);
+ uint32_t fb_height = u_minify(image->planes[plane].height, resource->mipLevel);
+
VkImageViewCreateInfo image_view_info = {
.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
.image = v3dv_image_to_handle(image),
@@ -2103,8 +2666,8 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer,
},
};
VkImageView image_view;
- result = v3dv_CreateImageView(_device, &image_view_info,
- &cmd_buffer->device->vk.alloc, &image_view);
+ result = v3dv_create_image_view(cmd_buffer->device,
+ &image_view_info, &image_view);
if (result != VK_SUCCESS)
goto fail;
@@ -2173,7 +2736,12 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer,
.clearValueCount = 0,
};
- v3dv_CmdBeginRenderPass(_cmd_buffer, &rp_info, VK_SUBPASS_CONTENTS_INLINE);
+ VkSubpassBeginInfo sp_info = {
+ .sType = VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO,
+ .contents = VK_SUBPASS_CONTENTS_INLINE,
+ };
+
+ v3dv_CmdBeginRenderPass2(_cmd_buffer, &rp_info, &sp_info);
struct v3dv_job *job = cmd_buffer->state.job;
if (!job)
goto fail;
@@ -2190,9 +2758,8 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer,
}
/* For each region */
- dirty_dynamic_state = V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR;
for (uint32_t r = 0; r < region_count; r++) {
- const VkBufferImageCopy2KHR *region = &regions[r];
+ const VkBufferImageCopy2 *region = &regions[r];
/* Obtain the 2D buffer region spec */
uint32_t buf_width, buf_height;
@@ -2240,11 +2807,15 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer,
v3dv_CmdDraw(_cmd_buffer, 4, 1, 0, 0);
} /* For each region */
- v3dv_CmdEndRenderPass(_cmd_buffer);
+ VkSubpassEndInfo sp_end_info = {
+ .sType = VK_STRUCTURE_TYPE_SUBPASS_END_INFO,
+ };
+
+ v3dv_CmdEndRenderPass2(_cmd_buffer, &sp_end_info);
} /* For each layer */
fail:
- v3dv_cmd_buffer_meta_state_pop(cmd_buffer, dirty_dynamic_state, true);
+ v3dv_cmd_buffer_meta_state_pop(cmd_buffer, true);
return handled;
}
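/* Illustrative sketch (not part of this change): the addressing implemented by
 * the texel-buffer copy fragment shader above. "box", "stride" and "offset" are
 * the push constants set up by texel_buffer_shader_copy(), all in texel units;
 * the buffer view itself is created with offset 0 because the texture base
 * pointer must be 64-byte aligned. Helper name is hypothetical:
 */
static inline uint32_t
texel_copy_fetch_index(int32_t frag_x, int32_t frag_y,
                       const int32_t box[4] /* x, y, w, h */,
                       uint32_t stride, uint32_t offset)
{
   const int32_t x_offset = frag_x - box[0];
   const int32_t y_offset = frag_y - box[1];
   return offset + x_offset + y_offset * stride;
}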
@@ -2263,7 +2834,7 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
VkColorComponentFlags cmask,
VkComponentMapping *cswizzle,
uint32_t region_count,
- const VkBufferImageCopy2KHR *regions)
+ const VkBufferImageCopy2 *regions)
{
/* Since we can't sample linear images we need to upload the linear
* buffer to a tiled image that we can use as a blit source, which
@@ -2338,14 +2909,19 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
*/
assert(num_layers == 1 || region_count == 1);
- const uint32_t block_width = vk_format_get_blockwidth(image->vk.format);
- const uint32_t block_height = vk_format_get_blockheight(image->vk.format);
+ uint8_t plane = v3dv_plane_from_aspect(aspect);
+ assert(plane < image->plane_count);
+
+ const uint32_t block_width =
+ vk_format_get_blockwidth(image->planes[plane].vk_format);
+ const uint32_t block_height =
+ vk_format_get_blockheight(image->planes[plane].vk_format);
/* Copy regions by uploading each region to a temporary tiled image using
* the memory we have just allocated as storage.
*/
for (uint32_t r = 0; r < region_count; r++) {
- const VkBufferImageCopy2KHR *region = &regions[r];
+ const VkBufferImageCopy2 *region = &regions[r];
/* Obtain the 2D buffer region spec */
uint32_t buf_width, buf_height;
@@ -2396,16 +2972,23 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
if (result != VK_SUCCESS)
return handled;
+ /* When copying a multi-plane image the aspect indicates the plane to
+ * copy. For these, we only copy one plane at a time, which is always
+ * a color plane.
+ */
+ VkImageAspectFlags copy_aspect =
+ image->plane_count == 1 ? aspect : VK_IMAGE_ASPECT_COLOR_BIT;
+
/* Upload buffer contents for the selected layer */
const VkDeviceSize buf_offset_bytes =
region->bufferOffset + i * buf_height * buf_width * buffer_bpp;
- const VkBufferImageCopy2KHR buffer_image_copy = {
- .sType = VK_STRUCTURE_TYPE_BUFFER_IMAGE_COPY_2_KHR,
+ const VkBufferImageCopy2 buffer_image_copy = {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_IMAGE_COPY_2,
.bufferOffset = buf_offset_bytes,
.bufferRowLength = region->bufferRowLength / block_width,
.bufferImageHeight = region->bufferImageHeight / block_height,
.imageSubresource = {
- .aspectMask = aspect,
+ .aspectMask = copy_aspect,
.mipLevel = 0,
.baseArrayLayer = 0,
.layerCount = 1,
@@ -2434,10 +3017,10 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
* image, but that we need to blit to a S8D24 destination (the only
* stencil format we support).
*/
- const VkImageBlit2KHR blit_region = {
- .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2_KHR,
+ const VkImageBlit2 blit_region = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2,
.srcSubresource = {
- .aspectMask = aspect,
+ .aspectMask = copy_aspect,
.mipLevel = 0,
.baseArrayLayer = 0,
.layerCount = 1,
@@ -2493,7 +3076,7 @@ copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_image *image,
struct v3dv_buffer *buffer,
uint32_t region_count,
- const VkBufferImageCopy2KHR *regions,
+ const VkBufferImageCopy2 *regions,
bool use_texel_buffer)
{
/* We can only call this with region_count > 1 if we can batch the regions
@@ -2501,12 +3084,20 @@ copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer,
* the same aspect.
*/
VkImageAspectFlags aspect = regions[0].imageSubresource.aspectMask;
+ const VkImageAspectFlagBits any_plane_aspect =
+ VK_IMAGE_ASPECT_PLANE_0_BIT |
+ VK_IMAGE_ASPECT_PLANE_1_BIT |
+ VK_IMAGE_ASPECT_PLANE_2_BIT;
+
+ bool is_plane_aspect = aspect & any_plane_aspect;
/* Generally, the bpp of the data in the buffer matches that of the
* destination image. The exception is the case where we are uploading
* stencil (8bpp) to a combined d24s8 image (32bpp).
*/
- uint32_t buf_bpp = image->cpp;
+ uint8_t plane = v3dv_plane_from_aspect(aspect);
+ assert(plane < image->plane_count);
+ uint32_t buf_bpp = image->planes[plane].cpp;
/* We are about to upload the buffer data to an image so we can then
* blit that to our destination region. Because we are going to implement
@@ -2539,6 +3130,9 @@ copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer,
case 4:
switch (aspect) {
case VK_IMAGE_ASPECT_COLOR_BIT:
+ case VK_IMAGE_ASPECT_PLANE_0_BIT:
+ case VK_IMAGE_ASPECT_PLANE_1_BIT:
+ case VK_IMAGE_ASPECT_PLANE_2_BIT:
src_format = VK_FORMAT_R8G8B8A8_UINT;
dst_format = src_format;
break;
@@ -2548,7 +3142,6 @@ copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer,
image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32);
src_format = VK_FORMAT_R8G8B8A8_UINT;
dst_format = src_format;
- aspect = VK_IMAGE_ASPECT_COLOR_BIT;
/* For D24 formats, the Vulkan spec states that the depth component
* in the buffer is stored in the 24-LSB, but V3D wants it in the
@@ -2578,7 +3171,6 @@ copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer,
src_format = VK_FORMAT_R8_UINT;
dst_format = VK_FORMAT_R8G8B8A8_UINT;
cmask = VK_COLOR_COMPONENT_R_BIT;
- aspect = VK_IMAGE_ASPECT_COLOR_BIT;
break;
default:
unreachable("unsupported aspect");
@@ -2586,12 +3178,14 @@ copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer,
};
break;
case 2:
- aspect = VK_IMAGE_ASPECT_COLOR_BIT;
+ assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT ||
+ aspect == VK_IMAGE_ASPECT_DEPTH_BIT ||
+ is_plane_aspect);
src_format = VK_FORMAT_R16_UINT;
dst_format = src_format;
break;
case 1:
- assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
+ assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT || is_plane_aspect);
src_format = VK_FORMAT_R8_UINT;
dst_format = src_format;
break;
@@ -2615,75 +3209,9 @@ copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer,
}
}
-/**
- * Returns true if the implementation supports the requested operation (even if
- * it failed to process it, for example, due to an out-of-memory error).
- */
-static bool
-copy_buffer_to_image_cpu(struct v3dv_cmd_buffer *cmd_buffer,
- struct v3dv_image *image,
- struct v3dv_buffer *buffer,
- const VkBufferImageCopy2KHR *region)
-{
- /* FIXME */
- if (vk_format_is_depth_or_stencil(image->vk.format))
- return false;
-
- if (vk_format_is_compressed(image->vk.format))
- return false;
-
- if (image->vk.tiling == VK_IMAGE_TILING_LINEAR)
- return false;
-
- uint32_t buffer_width, buffer_height;
- if (region->bufferRowLength == 0)
- buffer_width = region->imageExtent.width;
- else
- buffer_width = region->bufferRowLength;
-
- if (region->bufferImageHeight == 0)
- buffer_height = region->imageExtent.height;
- else
- buffer_height = region->bufferImageHeight;
-
- uint32_t buffer_stride = buffer_width * image->cpp;
- uint32_t buffer_layer_stride = buffer_stride * buffer_height;
-
- uint32_t num_layers;
- if (image->vk.image_type != VK_IMAGE_TYPE_3D)
- num_layers = region->imageSubresource.layerCount;
- else
- num_layers = region->imageExtent.depth;
- assert(num_layers > 0);
-
- struct v3dv_job *job =
- v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
- V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE,
- cmd_buffer, -1);
- if (!job)
- return true;
-
- job->cpu.copy_buffer_to_image.image = image;
- job->cpu.copy_buffer_to_image.buffer = buffer;
- job->cpu.copy_buffer_to_image.buffer_stride = buffer_stride;
- job->cpu.copy_buffer_to_image.buffer_layer_stride = buffer_layer_stride;
- job->cpu.copy_buffer_to_image.buffer_offset = region->bufferOffset;
- job->cpu.copy_buffer_to_image.image_extent = region->imageExtent;
- job->cpu.copy_buffer_to_image.image_offset = region->imageOffset;
- job->cpu.copy_buffer_to_image.mip_level =
- region->imageSubresource.mipLevel;
- job->cpu.copy_buffer_to_image.base_layer =
- region->imageSubresource.baseArrayLayer;
- job->cpu.copy_buffer_to_image.layer_count = num_layers;
-
- list_addtail(&job->list_link, &cmd_buffer->jobs);
-
- return true;
-}
-
VKAPI_ATTR void VKAPI_CALL
-v3dv_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer,
- const VkCopyBufferToImageInfo2KHR *info)
+v3dv_CmdCopyBufferToImage2(VkCommandBuffer commandBuffer,
+ const VkCopyBufferToImageInfo2 *info)
{
V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
V3DV_FROM_HANDLE(v3dv_buffer, buffer, info->srcBuffer);
@@ -2691,6 +3219,8 @@ v3dv_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer,
assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT);
+ cmd_buffer->state.is_transfer = true;
+
uint32_t r = 0;
while (r < info->regionCount) {
/* The TFU and TLB paths can only copy one region at a time and the region
@@ -2739,12 +3269,6 @@ v3dv_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer,
* slow it might not be worth it and we should instead put more effort
* in handling more cases with the other paths.
*/
- if (copy_buffer_to_image_cpu(cmd_buffer, image, buffer,
- &info->pRegions[r])) {
- batch_size = 1;
- goto handled;
- }
-
if (copy_buffer_to_image_shader(cmd_buffer, image, buffer,
batch_size, &info->pRegions[r], false)) {
goto handled;
@@ -2755,6 +3279,8 @@ v3dv_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer,
handled:
r += batch_size;
}
+
+ cmd_buffer->state.is_transfer = false;
}
static void
@@ -2773,17 +3299,31 @@ static bool
blit_tfu(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_image *dst,
struct v3dv_image *src,
- const VkImageBlit2KHR *region)
+ const VkImageBlit2 *region)
{
+ if (V3D_DBG(DISABLE_TFU)) {
+ perf_debug("Blit: TFU disabled, fallbacks could be slower.");
+ return false;
+ }
+
assert(dst->vk.samples == VK_SAMPLE_COUNT_1_BIT);
assert(src->vk.samples == VK_SAMPLE_COUNT_1_BIT);
+ /* From vkCmdBlitImage:
+ * "srcImage must not use a format that requires a sampler YCBCR
+ * conversion"
+ * "dstImage must not use a format that requires a sampler YCBCR
+ * conversion"
+ */
+ assert(dst->plane_count == 1);
+ assert(src->plane_count == 1);
+
/* Format must match */
if (src->vk.format != dst->vk.format)
return false;
/* Destination can't be raster format */
- if (dst->vk.tiling == VK_IMAGE_TILING_LINEAR)
+ if (!dst->tiled)
return false;
/* Source region must start at (0,0) */
@@ -2825,7 +3365,7 @@ blit_tfu(struct v3dv_cmd_buffer *cmd_buffer,
*/
const struct v3dv_format *format =
v3dv_get_compatible_tfu_format(cmd_buffer->device,
- dst->cpp, NULL);
+ dst->planes[0].cpp, NULL);
/* Emit a TFU job for each layer to blit */
assert(region->dstSubresource.layerCount ==
@@ -2871,10 +3411,31 @@ blit_tfu(struct v3dv_cmd_buffer *cmd_buffer,
dst_mirror_z ? max_dst_layer - i - 1: min_dst_layer + i;
const uint32_t src_layer =
src_mirror_z ? max_src_layer - i - 1: min_src_layer + i;
- v3dv_X(cmd_buffer->device, meta_emit_tfu_job)
- (cmd_buffer, dst, dst_mip_level, dst_layer,
- src, src_mip_level, src_layer,
- dst_width, dst_height, format);
+
+ const uint32_t dst_offset =
+ dst->planes[0].mem->bo->offset + v3dv_layer_offset(dst, dst_mip_level,
+ dst_layer, 0);
+ const uint32_t src_offset =
+ src->planes[0].mem->bo->offset + v3dv_layer_offset(src, src_mip_level,
+ src_layer, 0);
+
+ const struct v3d_resource_slice *dst_slice = &dst->planes[0].slices[dst_mip_level];
+ const struct v3d_resource_slice *src_slice = &src->planes[0].slices[src_mip_level];
+
+ v3dv_X(cmd_buffer->device, meta_emit_tfu_job)(
+ cmd_buffer,
+ dst->planes[0].mem->bo->handle,
+ dst_offset,
+ dst_slice->tiling,
+ dst_slice->padded_height,
+ dst->planes[0].cpp,
+ src->planes[0].mem->bo->handle,
+ src_offset,
+ src_slice->tiling,
+ src_slice->tiling == V3D_TILING_RASTER ?
+ src_slice->stride : src_slice->padded_height,
+ src->planes[0].cpp,
+ dst_width, dst_height, &format->planes[0]);
}
return true;
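/* Illustrative sketch (not part of this change): the source pitch argument
 * passed to meta_emit_tfu_job() above depends on the source layout -- raster
 * sources are described by their byte stride, tiled sources by their padded
 * height. The destination is always tiled here, so it always uses
 * padded_height. Helper name is hypothetical:
 */
static inline uint32_t
tfu_src_pitch(const struct v3d_resource_slice *slice)
{
   return slice->tiling == V3D_TILING_RASTER ? slice->stride
                                             : slice->padded_height;
}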
@@ -2941,7 +3502,8 @@ create_blit_render_pass(struct v3dv_device *device,
const bool is_color_blit = vk_format_is_color(dst_format);
/* Attachment load operation is specified below */
- VkAttachmentDescription att = {
+ VkAttachmentDescription2 att = {
+ .sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2,
.format = dst_format,
.samples = VK_SAMPLE_COUNT_1_BIT,
.storeOp = VK_ATTACHMENT_STORE_OP_STORE,
@@ -2949,12 +3511,14 @@ create_blit_render_pass(struct v3dv_device *device,
.finalLayout = VK_IMAGE_LAYOUT_GENERAL,
};
- VkAttachmentReference att_ref = {
+ VkAttachmentReference2 att_ref = {
+ .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
.attachment = 0,
.layout = VK_IMAGE_LAYOUT_GENERAL,
};
- VkSubpassDescription subpass = {
+ VkSubpassDescription2 subpass = {
+ .sType = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2,
.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
.inputAttachmentCount = 0,
.colorAttachmentCount = is_color_blit ? 1 : 0,
@@ -2965,8 +3529,8 @@ create_blit_render_pass(struct v3dv_device *device,
.pPreserveAttachments = NULL,
};
- VkRenderPassCreateInfo info = {
- .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO,
+ VkRenderPassCreateInfo2 info = {
+ .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2,
.attachmentCount = 1,
.pAttachments = &att,
.subpassCount = 1,
@@ -2977,60 +3541,27 @@ create_blit_render_pass(struct v3dv_device *device,
VkResult result;
att.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
- result = v3dv_CreateRenderPass(v3dv_device_to_handle(device),
- &info, &device->vk.alloc, pass_load);
+ result = v3dv_CreateRenderPass2(v3dv_device_to_handle(device),
+ &info, &device->vk.alloc, pass_load);
if (result != VK_SUCCESS)
return false;
att.loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
- result = v3dv_CreateRenderPass(v3dv_device_to_handle(device),
- &info, &device->vk.alloc, pass_no_load);
+ result = v3dv_CreateRenderPass2(v3dv_device_to_handle(device),
+ &info, &device->vk.alloc, pass_no_load);
return result == VK_SUCCESS;
}
-static nir_ssa_def *
-gen_rect_vertices(nir_builder *b)
-{
- nir_ssa_def *vertex_id = nir_load_vertex_id(b);
-
- /* vertex 0: -1.0, -1.0
- * vertex 1: -1.0, 1.0
- * vertex 2: 1.0, -1.0
- * vertex 3: 1.0, 1.0
- *
- * so:
- *
- * channel 0 is vertex_id < 2 ? -1.0 : 1.0
- * channel 1 is vertex id & 1 ? 1.0 : -1.0
- */
-
- nir_ssa_def *one = nir_imm_int(b, 1);
- nir_ssa_def *c0cmp = nir_ilt(b, vertex_id, nir_imm_int(b, 2));
- nir_ssa_def *c1cmp = nir_ieq(b, nir_iand(b, vertex_id, one), one);
-
- nir_ssa_def *comp[4];
- comp[0] = nir_bcsel(b, c0cmp,
- nir_imm_float(b, -1.0f),
- nir_imm_float(b, 1.0f));
-
- comp[1] = nir_bcsel(b, c1cmp,
- nir_imm_float(b, 1.0f),
- nir_imm_float(b, -1.0f));
- comp[2] = nir_imm_float(b, 0.0f);
- comp[3] = nir_imm_float(b, 1.0f);
- return nir_vec(b, comp, 4);
-}
-
-static nir_ssa_def *
+static nir_def *
gen_tex_coords(nir_builder *b)
{
- nir_ssa_def *tex_box =
+ nir_def *tex_box =
nir_load_push_constant(b, 4, 32, nir_imm_int(b, 0), .base = 0, .range = 16);
- nir_ssa_def *tex_z =
+ nir_def *tex_z =
nir_load_push_constant(b, 1, 32, nir_imm_int(b, 0), .base = 16, .range = 4);
- nir_ssa_def *vertex_id = nir_load_vertex_id(b);
+ nir_def *vertex_id = nir_load_vertex_id(b);
/* vertex 0: src0_x, src0_y
* vertex 1: src0_x, src1_y
@@ -3043,11 +3574,11 @@ gen_tex_coords(nir_builder *b)
* channel 1 is vertex id & 1 ? src1_y : src0_y
*/
- nir_ssa_def *one = nir_imm_int(b, 1);
- nir_ssa_def *c0cmp = nir_ilt(b, vertex_id, nir_imm_int(b, 2));
- nir_ssa_def *c1cmp = nir_ieq(b, nir_iand(b, vertex_id, one), one);
+ nir_def *one = nir_imm_int(b, 1);
+ nir_def *c0cmp = nir_ilt_imm(b, vertex_id, 2);
+ nir_def *c1cmp = nir_ieq(b, nir_iand(b, vertex_id, one), one);
- nir_ssa_def *comp[4];
+ nir_def *comp[4];
comp[0] = nir_bcsel(b, c0cmp,
nir_channel(b, tex_box, 0),
nir_channel(b, tex_box, 2));
@@ -3060,9 +3591,9 @@ gen_tex_coords(nir_builder *b)
return nir_vec(b, comp, 4);
}
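/* Illustrative sketch (not part of this change): the vertex_id -> texture
 * coordinate mapping that gen_tex_coords() builds in NIR, written as the
 * equivalent CPU-side computation. tex_box is the {src0_x, src0_y, src1_x,
 * src1_y} push constant and tex_z the layer coordinate loaded at offset 16;
 * the z/w channels are assumed to carry tex_z and 1.0:
 */
static inline void
blit_tex_coord_for_vertex(const float tex_box[4], float tex_z,
                          int vertex_id, float out[4])
{
   out[0] = vertex_id < 2 ? tex_box[0] : tex_box[2];    /* src0_x : src1_x */
   out[1] = (vertex_id & 1) ? tex_box[3] : tex_box[1];  /* src1_y : src0_y */
   out[2] = tex_z;
   out[3] = 1.0f;
}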
-static nir_ssa_def *
+static nir_def *
build_nir_tex_op_read(struct nir_builder *b,
- nir_ssa_def *tex_pos,
+ nir_def *tex_pos,
enum glsl_base_type tex_type,
enum glsl_sampler_dim dim)
{
@@ -3075,57 +3606,49 @@ build_nir_tex_op_read(struct nir_builder *b,
sampler->data.descriptor_set = 0;
sampler->data.binding = 0;
- nir_ssa_def *tex_deref = &nir_build_deref_var(b, sampler)->dest.ssa;
+ nir_def *tex_deref = &nir_build_deref_var(b, sampler)->def;
nir_tex_instr *tex = nir_tex_instr_create(b->shader, 3);
tex->sampler_dim = dim;
tex->op = nir_texop_tex;
- tex->src[0].src_type = nir_tex_src_coord;
- tex->src[0].src = nir_src_for_ssa(tex_pos);
- tex->src[1].src_type = nir_tex_src_texture_deref;
- tex->src[1].src = nir_src_for_ssa(tex_deref);
- tex->src[2].src_type = nir_tex_src_sampler_deref;
- tex->src[2].src = nir_src_for_ssa(tex_deref);
+ tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, tex_pos);
+ tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_texture_deref, tex_deref);
+ tex->src[2] = nir_tex_src_for_ssa(nir_tex_src_sampler_deref, tex_deref);
tex->dest_type = nir_get_nir_type_for_glsl_base_type(tex_type);
tex->is_array = glsl_sampler_type_is_array(sampler_type);
tex->coord_components = tex_pos->num_components;
- nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex");
+ nir_def_init(&tex->instr, &tex->def, 4, 32);
nir_builder_instr_insert(b, &tex->instr);
- return &tex->dest.ssa;
+ return &tex->def;
}
-static nir_ssa_def *
+static nir_def *
build_nir_tex_op_ms_fetch_sample(struct nir_builder *b,
nir_variable *sampler,
- nir_ssa_def *tex_deref,
+ nir_def *tex_deref,
enum glsl_base_type tex_type,
- nir_ssa_def *tex_pos,
- nir_ssa_def *sample_idx)
+ nir_def *tex_pos,
+ nir_def *sample_idx)
{
- nir_tex_instr *tex = nir_tex_instr_create(b->shader, 4);
+ nir_tex_instr *tex = nir_tex_instr_create(b->shader, 3);
tex->sampler_dim = GLSL_SAMPLER_DIM_MS;
tex->op = nir_texop_txf_ms;
- tex->src[0].src_type = nir_tex_src_coord;
- tex->src[0].src = nir_src_for_ssa(tex_pos);
- tex->src[1].src_type = nir_tex_src_texture_deref;
- tex->src[1].src = nir_src_for_ssa(tex_deref);
- tex->src[2].src_type = nir_tex_src_sampler_deref;
- tex->src[2].src = nir_src_for_ssa(tex_deref);
- tex->src[3].src_type = nir_tex_src_ms_index;
- tex->src[3].src = nir_src_for_ssa(sample_idx);
+ tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, tex_pos);
+ tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_texture_deref, tex_deref);
+ tex->src[2] = nir_tex_src_for_ssa(nir_tex_src_ms_index, sample_idx);
tex->dest_type = nir_get_nir_type_for_glsl_base_type(tex_type);
tex->is_array = false;
tex->coord_components = tex_pos->num_components;
- nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex");
+ nir_def_init(&tex->instr, &tex->def, 4, 32);
nir_builder_instr_insert(b, &tex->instr);
- return &tex->dest.ssa;
+ return &tex->def;
}
/* Fetches all samples at the given position and averages them */
-static nir_ssa_def *
+static nir_def *
build_nir_tex_op_ms_resolve(struct nir_builder *b,
- nir_ssa_def *tex_pos,
+ nir_def *tex_pos,
enum glsl_base_type tex_type,
VkSampleCountFlagBits src_samples)
{
@@ -3139,10 +3662,10 @@ build_nir_tex_op_ms_resolve(struct nir_builder *b,
const bool is_int = glsl_base_type_is_integer(tex_type);
- nir_ssa_def *tmp = NULL;
- nir_ssa_def *tex_deref = &nir_build_deref_var(b, sampler)->dest.ssa;
+ nir_def *tmp = NULL;
+ nir_def *tex_deref = &nir_build_deref_var(b, sampler)->def;
for (uint32_t i = 0; i < src_samples; i++) {
- nir_ssa_def *s =
+ nir_def *s =
build_nir_tex_op_ms_fetch_sample(b, sampler, tex_deref,
tex_type, tex_pos,
nir_imm_int(b, i));
@@ -3157,13 +3680,13 @@ build_nir_tex_op_ms_resolve(struct nir_builder *b,
}
assert(!is_int);
- return nir_fmul(b, tmp, nir_imm_float(b, 1.0f / src_samples));
+ return nir_fmul_imm(b, tmp, 1.0f / src_samples);
}
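/* Illustrative sketch (not part of this change): what the resolve path above
 * computes per pixel for a float source, written on the CPU:
 */
static inline float
resolve_average(const float *samples, uint32_t src_samples)
{
   float sum = 0.0f;
   for (uint32_t i = 0; i < src_samples; i++)
      sum += samples[i];
   return sum * (1.0f / src_samples);
}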
/* Fetches the current sample (gl_SampleID) at the given position */
-static nir_ssa_def *
+static nir_def *
build_nir_tex_op_ms_read(struct nir_builder *b,
- nir_ssa_def *tex_pos,
+ nir_def *tex_pos,
enum glsl_base_type tex_type)
{
const struct glsl_type *sampler_type =
@@ -3173,17 +3696,17 @@ build_nir_tex_op_ms_read(struct nir_builder *b,
sampler->data.descriptor_set = 0;
sampler->data.binding = 0;
- nir_ssa_def *tex_deref = &nir_build_deref_var(b, sampler)->dest.ssa;
+ nir_def *tex_deref = &nir_build_deref_var(b, sampler)->def;
return build_nir_tex_op_ms_fetch_sample(b, sampler, tex_deref,
tex_type, tex_pos,
nir_load_sample_id(b));
}
-static nir_ssa_def *
+static nir_def *
build_nir_tex_op(struct nir_builder *b,
struct v3dv_device *device,
- nir_ssa_def *tex_pos,
+ nir_def *tex_pos,
enum glsl_base_type tex_type,
VkSampleCountFlagBits dst_samples,
VkSampleCountFlagBits src_samples,
@@ -3227,10 +3750,10 @@ get_blit_vs()
vs_out_tex_coord->data.location = VARYING_SLOT_VAR0;
vs_out_tex_coord->data.interpolation = INTERP_MODE_SMOOTH;
- nir_ssa_def *pos = gen_rect_vertices(&b);
+ nir_def *pos = nir_gen_rect_vertices(&b, NULL, NULL);
nir_store_var(&b, vs_out_pos, pos, 0xf);
- nir_ssa_def *tex_coord = gen_tex_coords(&b);
+ nir_def *tex_coord = gen_tex_coords(&b);
nir_store_var(&b, vs_out_tex_coord, tex_coord, 0xf);
return b.shader;
@@ -3281,11 +3804,11 @@ get_color_blit_fs(struct v3dv_device *device,
nir_variable_create(b.shader, nir_var_shader_out, fs_out_type, "out_color");
fs_out_color->data.location = FRAG_RESULT_DATA0;
- nir_ssa_def *tex_coord = nir_load_var(&b, fs_in_tex_coord);
+ nir_def *tex_coord = nir_load_var(&b, fs_in_tex_coord);
const uint32_t channel_mask = get_channel_mask_for_sampler_dim(sampler_dim);
tex_coord = nir_channels(&b, tex_coord, channel_mask);
- nir_ssa_def *color = build_nir_tex_op(&b, device, tex_coord, src_base_type,
+ nir_def *color = build_nir_tex_op(&b, device, tex_coord, src_base_type,
dst_samples, src_samples, sampler_dim);
/* For integer textures, if the bit-size of the destination is too small to
@@ -3300,7 +3823,7 @@ get_color_blit_fs(struct v3dv_device *device,
enum pipe_format src_pformat = vk_format_to_pipe_format(src_format);
enum pipe_format dst_pformat = vk_format_to_pipe_format(dst_format);
- nir_ssa_def *c[4];
+ nir_def *c[4];
for (uint32_t i = 0; i < 4; i++) {
c[i] = nir_channel(&b, color, i);
@@ -3318,11 +3841,11 @@ get_color_blit_fs(struct v3dv_device *device,
assert(dst_bit_size > 0);
if (util_format_is_pure_uint(dst_pformat)) {
- nir_ssa_def *max = nir_imm_int(&b, (1 << dst_bit_size) - 1);
+ nir_def *max = nir_imm_int(&b, (1 << dst_bit_size) - 1);
c[i] = nir_umin(&b, c[i], max);
} else {
- nir_ssa_def *max = nir_imm_int(&b, (1 << (dst_bit_size - 1)) - 1);
- nir_ssa_def *min = nir_imm_int(&b, -(1 << (dst_bit_size - 1)));
+ nir_def *max = nir_imm_int(&b, (1 << (dst_bit_size - 1)) - 1);
+ nir_def *min = nir_imm_int(&b, -(1 << (dst_bit_size - 1)));
c[i] = nir_imax(&b, nir_imin(&b, c[i], max), min);
}
}
@@ -3348,14 +3871,12 @@ create_pipeline(struct v3dv_device *device,
const VkPipelineLayout layout,
VkPipeline *pipeline)
{
- struct vk_shader_module vs_m;
+ struct vk_shader_module vs_m = vk_shader_module_from_nir(vs_nir);
+ struct vk_shader_module fs_m = vk_shader_module_from_nir(fs_nir);
struct vk_shader_module gs_m;
- struct vk_shader_module fs_m;
uint32_t num_stages = gs_nir ? 3 : 2;
- v3dv_shader_module_internal_init(device, &vs_m, vs_nir);
- v3dv_shader_module_internal_init(device, &fs_m, fs_nir);
VkPipelineShaderStageCreateInfo stages[3] = {
{
@@ -3379,7 +3900,7 @@ create_pipeline(struct v3dv_device *device,
};
if (gs_nir) {
- v3dv_shader_module_internal_init(device, &gs_m, gs_nir);
+ gs_m = vk_shader_module_from_nir(gs_nir);
stages[2].module = vk_shader_module_to_handle(&gs_m);
}
@@ -3452,6 +3973,7 @@ create_pipeline(struct v3dv_device *device,
pipeline);
ralloc_free(vs_nir);
+ ralloc_free(gs_nir);
ralloc_free(fs_nir);
return result == VK_SUCCESS;
@@ -3762,6 +4284,8 @@ allocate_blit_source_descriptor_set(struct v3dv_cmd_buffer *cmd_buffer,
* cmask parameter (which can be 0 to default to all channels), as well as a
* swizzle to apply to the source via the cswizzle parameter (which can be NULL
* to use the default identity swizzle).
+ *
+ * Supports multi-plane formats too.
*/
static bool
blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
@@ -3771,25 +4295,23 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
VkFormat src_format,
VkColorComponentFlags cmask,
VkComponentMapping *cswizzle,
- const VkImageBlit2KHR *_region,
+ const VkImageBlit2 *region,
VkFilter filter,
bool dst_is_padded_image)
{
bool handled = true;
VkResult result;
- uint32_t dirty_dynamic_state = 0;
/* We don't support rendering to linear depth/stencil, this should have
* been rewritten to a compatible color blit by the caller.
*/
- assert(dst->vk.tiling != VK_IMAGE_TILING_LINEAR ||
- !vk_format_is_depth_or_stencil(dst_format));
+ assert(dst->tiled || !vk_format_is_depth_or_stencil(dst_format));
/* Can't sample from linear images */
- if (src->vk.tiling == VK_IMAGE_TILING_LINEAR && src->vk.image_type != VK_IMAGE_TYPE_1D)
+ if (!src->tiled && src->vk.image_type != VK_IMAGE_TYPE_1D) {
return false;
+ }
- VkImageBlit2KHR region = *_region;
/* Rewrite combined D/S blits to compatible color blits */
if (vk_format_is_depth_or_stencil(dst_format)) {
assert(src_format == dst_format);
@@ -3803,12 +4325,12 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
break;
case VK_FORMAT_X8_D24_UNORM_PACK32:
case VK_FORMAT_D24_UNORM_S8_UINT:
- if (region.srcSubresource.aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
+ if (region->srcSubresource.aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
cmask |= VK_COLOR_COMPONENT_G_BIT |
VK_COLOR_COMPONENT_B_BIT |
VK_COLOR_COMPONENT_A_BIT;
}
- if (region.srcSubresource.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
+ if (region->srcSubresource.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
assert(dst_format == VK_FORMAT_D24_UNORM_S8_UINT);
cmask |= VK_COLOR_COMPONENT_R_BIT;
}
@@ -3818,10 +4340,15 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
unreachable("Unsupported depth/stencil format");
};
src_format = dst_format;
- region.srcSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
- region.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
}
+ uint8_t src_plane =
+ v3dv_plane_from_aspect(region->srcSubresource.aspectMask);
+ assert(src_plane < src->plane_count);
+ uint8_t dst_plane =
+ v3dv_plane_from_aspect(region->dstSubresource.aspectMask);
+ assert(dst_plane < dst->plane_count);
+
const VkColorComponentFlags full_cmask = VK_COLOR_COMPONENT_R_BIT |
VK_COLOR_COMPONENT_G_BIT |
VK_COLOR_COMPONENT_B_BIT |
@@ -3844,34 +4371,40 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
* need to apply those same semantics here when we compute the size of the
* destination image level.
*/
- const uint32_t dst_block_w = vk_format_get_blockwidth(dst->vk.format);
- const uint32_t dst_block_h = vk_format_get_blockheight(dst->vk.format);
- const uint32_t src_block_w = vk_format_get_blockwidth(src->vk.format);
- const uint32_t src_block_h = vk_format_get_blockheight(src->vk.format);
+ const uint32_t dst_block_w =
+ vk_format_get_blockwidth(dst->planes[dst_plane].vk_format);
+ const uint32_t dst_block_h =
+ vk_format_get_blockheight(dst->planes[dst_plane].vk_format);
+ const uint32_t src_block_w =
+ vk_format_get_blockwidth(src->planes[src_plane].vk_format);
+ const uint32_t src_block_h =
+ vk_format_get_blockheight(src->planes[src_plane].vk_format);
const uint32_t dst_level_w =
u_minify(DIV_ROUND_UP(dst->vk.extent.width * src_block_w, dst_block_w),
- region.dstSubresource.mipLevel);
+ region->dstSubresource.mipLevel);
const uint32_t dst_level_h =
u_minify(DIV_ROUND_UP(dst->vk.extent.height * src_block_h, dst_block_h),
- region.dstSubresource.mipLevel);
+ region->dstSubresource.mipLevel);
const uint32_t src_level_w =
- u_minify(src->vk.extent.width, region.srcSubresource.mipLevel);
+ u_minify(src->planes[src_plane].width, region->srcSubresource.mipLevel);
const uint32_t src_level_h =
- u_minify(src->vk.extent.height, region.srcSubresource.mipLevel);
+ u_minify(src->planes[src_plane].height, region->srcSubresource.mipLevel);
+
+ assert(src->plane_count == 1 || src->vk.image_type != VK_IMAGE_TYPE_3D);
const uint32_t src_level_d =
- u_minify(src->vk.extent.depth, region.srcSubresource.mipLevel);
+ u_minify(src->vk.extent.depth, region->srcSubresource.mipLevel);
uint32_t dst_x, dst_y, dst_w, dst_h;
bool dst_mirror_x, dst_mirror_y;
- compute_blit_box(region.dstOffsets,
+ compute_blit_box(region->dstOffsets,
dst_level_w, dst_level_h,
&dst_x, &dst_y, &dst_w, &dst_h,
&dst_mirror_x, &dst_mirror_y);
uint32_t src_x, src_y, src_w, src_h;
bool src_mirror_x, src_mirror_y;
- compute_blit_box(region.srcOffsets,
+ compute_blit_box(region->srcOffsets,
src_level_w, src_level_h,
&src_x, &src_y, &src_w, &src_h,
&src_mirror_x, &src_mirror_y);
@@ -3880,10 +4413,10 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t max_dst_layer;
bool dst_mirror_z = false;
if (dst->vk.image_type != VK_IMAGE_TYPE_3D) {
- min_dst_layer = region.dstSubresource.baseArrayLayer;
- max_dst_layer = min_dst_layer + region.dstSubresource.layerCount;
+ min_dst_layer = region->dstSubresource.baseArrayLayer;
+ max_dst_layer = min_dst_layer + region->dstSubresource.layerCount;
} else {
- compute_blit_3d_layers(region.dstOffsets,
+ compute_blit_3d_layers(region->dstOffsets,
&min_dst_layer, &max_dst_layer,
&dst_mirror_z);
}
@@ -3892,10 +4425,10 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t max_src_layer;
bool src_mirror_z = false;
if (src->vk.image_type != VK_IMAGE_TYPE_3D) {
- min_src_layer = region.srcSubresource.baseArrayLayer;
- max_src_layer = min_src_layer + region.srcSubresource.layerCount;
+ min_src_layer = region->srcSubresource.baseArrayLayer;
+ max_src_layer = min_src_layer + region->srcSubresource.layerCount;
} else {
- compute_blit_3d_layers(region.srcOffsets,
+ compute_blit_3d_layers(region->srcOffsets,
&min_src_layer, &max_src_layer,
&src_mirror_z);
}
@@ -4010,7 +4543,6 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
};
/* Record per-layer commands */
- VkImageAspectFlags aspects = region.dstSubresource.aspectMask;
for (uint32_t i = 0; i < layer_count; i++) {
/* Setup framebuffer */
VkImageViewCreateInfo dst_image_view_info = {
@@ -4019,16 +4551,16 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
.viewType = v3dv_image_type_to_view_type(dst->vk.image_type),
.format = dst_format,
.subresourceRange = {
- .aspectMask = aspects,
- .baseMipLevel = region.dstSubresource.mipLevel,
+ .aspectMask = region->dstSubresource.aspectMask,
+ .baseMipLevel = region->dstSubresource.mipLevel,
.levelCount = 1,
.baseArrayLayer = min_dst_layer + i,
.layerCount = 1
},
};
VkImageView dst_image_view;
- result = v3dv_CreateImageView(_device, &dst_image_view_info,
- &device->vk.alloc, &dst_image_view);
+ result = v3dv_create_image_view(device, &dst_image_view_info,
+ &dst_image_view);
if (result != VK_SUCCESS)
goto fail;
@@ -4078,8 +4610,8 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
.format = src_format,
.components = *cswizzle,
.subresourceRange = {
- .aspectMask = aspects,
- .baseMipLevel = region.srcSubresource.mipLevel,
+ .aspectMask = region->srcSubresource.aspectMask,
+ .baseMipLevel = region->srcSubresource.mipLevel,
.levelCount = 1,
.baseArrayLayer =
src->vk.image_type == VK_IMAGE_TYPE_3D ? 0 : min_src_layer + i,
@@ -4087,8 +4619,8 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
},
};
VkImageView src_image_view;
- result = v3dv_CreateImageView(_device, &src_image_view_info,
- &device->vk.alloc, &src_image_view);
+ result = v3dv_create_image_view(device, &src_image_view_info,
+ &src_image_view);
if (result != VK_SUCCESS)
goto fail;
@@ -4146,7 +4678,12 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
.clearValueCount = 0,
};
- v3dv_CmdBeginRenderPass(_cmd_buffer, &rp_info, VK_SUBPASS_CONTENTS_INLINE);
+ VkSubpassBeginInfo sp_info = {
+ .sType = VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO,
+ .contents = VK_SUBPASS_CONTENTS_INLINE,
+ };
+
+ v3dv_CmdBeginRenderPass2(_cmd_buffer, &rp_info, &sp_info);
struct v3dv_job *job = cmd_buffer->state.job;
if (!job)
goto fail;
@@ -4170,25 +4707,37 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
v3dv_CmdDraw(_cmd_buffer, 4, 1, 0, 0);
- v3dv_CmdEndRenderPass(_cmd_buffer);
- dirty_dynamic_state = V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR;
+ VkSubpassEndInfo sp_end_info = {
+ .sType = VK_STRUCTURE_TYPE_SUBPASS_END_INFO,
+ };
+
+ v3dv_CmdEndRenderPass2(_cmd_buffer, &sp_end_info);
}
fail:
- v3dv_cmd_buffer_meta_state_pop(cmd_buffer, dirty_dynamic_state, true);
+ v3dv_cmd_buffer_meta_state_pop(cmd_buffer, true);
return handled;
}
VKAPI_ATTR void VKAPI_CALL
-v3dv_CmdBlitImage2KHR(VkCommandBuffer commandBuffer,
- const VkBlitImageInfo2KHR *pBlitImageInfo)
+v3dv_CmdBlitImage2(VkCommandBuffer commandBuffer,
+ const VkBlitImageInfo2 *pBlitImageInfo)
{
V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
V3DV_FROM_HANDLE(v3dv_image, src, pBlitImageInfo->srcImage);
V3DV_FROM_HANDLE(v3dv_image, dst, pBlitImageInfo->dstImage);
- /* This command can only happen outside a render pass */
+ /* From vkCmdBlitImage:
+ * "srcImage must not use a format that requires a sampler YCBCR
+ * conversion"
+ * "dstImage must not use a format that requires a sampler YCBCR
+ * conversion"
+ */
+ assert(src->plane_count == 1);
+ assert(dst->plane_count == 1);
+
+ /* This command can only happen outside a render pass */
assert(cmd_buffer->state.pass == NULL);
assert(cmd_buffer->state.job == NULL);
@@ -4199,29 +4748,41 @@ v3dv_CmdBlitImage2KHR(VkCommandBuffer commandBuffer,
/* We don't export VK_FORMAT_FEATURE_BLIT_DST_BIT on compressed formats */
assert(!vk_format_is_compressed(dst->vk.format));
+ cmd_buffer->state.is_transfer = true;
+
for (uint32_t i = 0; i < pBlitImageInfo->regionCount; i++) {
- if (blit_tfu(cmd_buffer, dst, src, &pBlitImageInfo->pRegions[i]))
+ const VkImageBlit2 *region = &pBlitImageInfo->pRegions[i];
+
+ if (blit_tfu(cmd_buffer, dst, src, region))
continue;
if (blit_shader(cmd_buffer,
dst, dst->vk.format,
src, src->vk.format,
0, NULL,
- &pBlitImageInfo->pRegions[i],
+ region,
pBlitImageInfo->filter, true)) {
continue;
}
unreachable("Unsupported blit operation");
}
+
+ cmd_buffer->state.is_transfer = false;
}
static bool
resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_image *dst,
struct v3dv_image *src,
- const VkImageResolve2KHR *region)
+ const VkImageResolve2 *region)
{
- if (!v3dv_meta_can_use_tlb(src, &region->srcOffset, NULL) ||
- !v3dv_meta_can_use_tlb(dst, &region->dstOffset, NULL)) {
+ /* No resolve for multi-planar images. Using plane 0 */
+ assert(dst->plane_count == 1);
+ assert(src->plane_count == 1);
+
+ if (!v3dv_meta_can_use_tlb(src, 0, region->srcSubresource.mipLevel,
+ &region->srcOffset, NULL, NULL) ||
+ !v3dv_meta_can_use_tlb(dst, 0, region->dstSubresource.mipLevel,
+ &region->dstOffset, &region->extent, NULL)) {
return false;
}
@@ -4242,8 +4803,10 @@ resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
if (!job)
return true;
- const uint32_t block_w = vk_format_get_blockwidth(dst->vk.format);
- const uint32_t block_h = vk_format_get_blockheight(dst->vk.format);
+ const uint32_t block_w =
+ vk_format_get_blockwidth(dst->planes[0].vk_format);
+ const uint32_t block_h =
+ vk_format_get_blockheight(dst->planes[0].vk_format);
const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w);
const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h);
@@ -4252,8 +4815,9 @@ resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
(fb_format, region->srcSubresource.aspectMask,
&internal_type, &internal_bpp);
- v3dv_job_start_frame(job, width, height, num_layers, false,
- 1, internal_bpp, true);
+ v3dv_job_start_frame(job, width, height, num_layers, false, true, 1,
+ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
+ true);
struct v3dv_meta_framebuffer framebuffer;
v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
@@ -4271,10 +4835,10 @@ static bool
resolve_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_image *dst,
struct v3dv_image *src,
- const VkImageResolve2KHR *region)
+ const VkImageResolve2 *region)
{
- const VkImageBlit2KHR blit_region = {
- .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2_KHR,
+ const VkImageBlit2 blit_region = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2,
.srcSubresource = region->srcSubresource,
.srcOffsets = {
region->srcOffset,
@@ -4300,8 +4864,8 @@ resolve_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
}
VKAPI_ATTR void VKAPI_CALL
-v3dv_CmdResolveImage2KHR(VkCommandBuffer commandBuffer,
- const VkResolveImageInfo2KHR *info)
+v3dv_CmdResolveImage2(VkCommandBuffer commandBuffer,
+ const VkResolveImageInfo2 *info)
{
V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
@@ -4315,6 +4879,12 @@ v3dv_CmdResolveImage2KHR(VkCommandBuffer commandBuffer,
assert(src->vk.samples == VK_SAMPLE_COUNT_4_BIT);
assert(dst->vk.samples == VK_SAMPLE_COUNT_1_BIT);
+ /* We don't support multi-sampled multi-plane images */
+ assert(src->plane_count == 1);
+ assert(dst->plane_count == 1);
+
+ cmd_buffer->state.is_transfer = true;
+
for (uint32_t i = 0; i < info->regionCount; i++) {
if (resolve_image_tlb(cmd_buffer, dst, src, &info->pRegions[i]))
continue;
@@ -4322,4 +4892,6 @@ v3dv_CmdResolveImage2KHR(VkCommandBuffer commandBuffer,
continue;
unreachable("Unsupported multisample resolve operation");
}
+
+ cmd_buffer->state.is_transfer = false;
}
diff --git a/src/broadcom/vulkan/v3dv_pass.c b/src/broadcom/vulkan/v3dv_pass.c
index 464703e42a4..ae6e37159d4 100644
--- a/src/broadcom/vulkan/v3dv_pass.c
+++ b/src/broadcom/vulkan/v3dv_pass.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2019 Raspberry Pi
+ * Copyright © 2019 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -24,7 +24,7 @@
#include "v3dv_private.h"
static uint32_t
-num_subpass_attachments(const VkSubpassDescription *desc)
+num_subpass_attachments(const VkSubpassDescription2 *desc)
{
return desc->inputAttachmentCount +
desc->colorAttachmentCount +
@@ -33,11 +33,11 @@ num_subpass_attachments(const VkSubpassDescription *desc)
}
static void
-set_use_tlb_resolve(struct v3dv_device *device,
+set_try_tlb_resolve(struct v3dv_device *device,
struct v3dv_render_pass_attachment *att)
{
const struct v3dv_format *format = v3dv_X(device, get_format)(att->desc.format);
- att->use_tlb_resolve = v3dv_X(device, format_supports_tlb_resolve)(format);
+ att->try_tlb_resolve = v3dv_X(device, format_supports_tlb_resolve)(format);
}
static void
@@ -82,7 +82,7 @@ pass_find_subpass_range_for_attachments(struct v3dv_device *device,
if (subpass->resolve_attachments &&
subpass->resolve_attachments[j].attachment != VK_ATTACHMENT_UNUSED) {
- set_use_tlb_resolve(device, att);
+ set_try_tlb_resolve(device, att);
}
}
@@ -92,6 +92,9 @@ pass_find_subpass_range_for_attachments(struct v3dv_device *device,
pass->attachments[ds_attachment_idx].first_subpass = i;
if (i > pass->attachments[ds_attachment_idx].last_subpass)
pass->attachments[ds_attachment_idx].last_subpass = i;
+
+ if (subpass->ds_resolve_attachment.attachment != VK_ATTACHMENT_UNUSED)
+ set_try_tlb_resolve(device, &pass->attachments[ds_attachment_idx]);
}
for (uint32_t j = 0; j < subpass->input_count; j++) {
@@ -118,21 +121,57 @@ pass_find_subpass_range_for_attachments(struct v3dv_device *device,
}
}
+/* GFXH-1461: if depth is cleared but stencil is loaded (or vice versa),
+ * the clear might get lost. If a subpass hits this case we can't emit the
+ * clear using the TLB and we have to do it with a draw call instead. This
+ * issue has been fixed since V3D 4.3.18.
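+ *
+ * Illustrative attachment state that hits this case (matching the checks
+ * below):
+ *   format        = VK_FORMAT_D24_UNORM_S8_UINT
+ *   loadOp        = VK_ATTACHMENT_LOAD_OP_CLEAR
+ *   stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD
+ * which makes us set do_depth_clear_with_draw for the subpass.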
+ *
+ * FIXME: separate stencil.
+ */
+static void
+check_do_depth_stencil_clear_with_draw(struct v3dv_device *device,
+ struct v3dv_render_pass *pass,
+ struct v3dv_subpass *subpass)
+{
+ if (device->devinfo.ver > 42 ||
+ subpass->ds_attachment.attachment == VK_ATTACHMENT_UNUSED) {
+ return;
+ }
+
+ struct v3dv_render_pass_attachment *att =
+ &pass->attachments[subpass->ds_attachment.attachment];
+ if (att->desc.format != VK_FORMAT_D24_UNORM_S8_UINT)
+ return;
+
+ if (att->desc.loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
+ att->desc.stencilLoadOp == VK_ATTACHMENT_LOAD_OP_LOAD) {
+ subpass->do_depth_clear_with_draw = true;
+ } else if (att->desc.loadOp == VK_ATTACHMENT_LOAD_OP_LOAD &&
+ att->desc.stencilLoadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
+ subpass->do_stencil_clear_with_draw = true;
+ }
+}
VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_CreateRenderPass(VkDevice _device,
- const VkRenderPassCreateInfo *pCreateInfo,
- const VkAllocationCallbacks *pAllocator,
- VkRenderPass *pRenderPass)
+v3dv_CreateRenderPass2(VkDevice _device,
+ const VkRenderPassCreateInfo2 *pCreateInfo,
+ const VkAllocationCallbacks *pAllocator,
+ VkRenderPass *pRenderPass)
{
V3DV_FROM_HANDLE(v3dv_device, device, _device);
struct v3dv_render_pass *pass;
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO);
+ assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2);
- const VkRenderPassMultiviewCreateInfo *multiview_info =
- vk_find_struct_const(pCreateInfo->pNext, RENDER_PASS_MULTIVIEW_CREATE_INFO);
- bool multiview_enabled = multiview_info && multiview_info->subpassCount > 0;
+ /* From the VK_KHR_multiview spec:
+ *
+ * When a subpass uses a non-zero view mask, multiview functionality is
+ * considered to be enabled. Multiview is all-or-nothing for a render
+ * pass - that is, either all subpasses must have a non-zero view mask
+ * (though some subpasses may have only one view) or all must be zero.
+ */
+ bool multiview_enabled = pCreateInfo->subpassCount &&
+ pCreateInfo->pSubpasses[0].viewMask;
size_t size = sizeof(*pass);
size_t subpasses_offset = size;
@@ -143,7 +182,7 @@ v3dv_CreateRenderPass(VkDevice _device,
pass = vk_object_zalloc(&device->vk, pAllocator, size,
VK_OBJECT_TYPE_RENDER_PASS);
if (pass == NULL)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
pass->multiview_enabled = multiview_enabled;
pass->attachment_count = pCreateInfo->attachmentCount;
@@ -156,7 +195,7 @@ v3dv_CreateRenderPass(VkDevice _device,
uint32_t subpass_attachment_count = 0;
for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) {
- const VkSubpassDescription *desc = &pCreateInfo->pSubpasses[i];
+ const VkSubpassDescription2 *desc = &pCreateInfo->pSubpasses[i];
subpass_attachment_count += num_subpass_attachments(desc);
}
@@ -168,7 +207,7 @@ v3dv_CreateRenderPass(VkDevice _device,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (pass->subpass_attachments == NULL) {
vk_object_free(&device->vk, pAllocator, pass);
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
}
} else {
pass->subpass_attachments = NULL;
@@ -176,13 +215,12 @@ v3dv_CreateRenderPass(VkDevice _device,
struct v3dv_subpass_attachment *p = pass->subpass_attachments;
for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) {
- const VkSubpassDescription *desc = &pCreateInfo->pSubpasses[i];
+ const VkSubpassDescription2 *desc = &pCreateInfo->pSubpasses[i];
struct v3dv_subpass *subpass = &pass->subpasses[i];
subpass->input_count = desc->inputAttachmentCount;
subpass->color_count = desc->colorAttachmentCount;
- if (multiview_enabled)
- subpass->view_mask = multiview_info->pViewMasks[i];
+ subpass->view_mask = desc->viewMask;
if (desc->inputAttachmentCount > 0) {
subpass->input_attachments = p;
@@ -226,27 +264,38 @@ v3dv_CreateRenderPass(VkDevice _device,
.layout = desc->pDepthStencilAttachment->layout,
};
- /* GFXH-1461: if depth is cleared but stencil is loaded (or viceversa),
- * the clear might get lost. If a subpass has this then we can't emit
- * the clear using the TLB and we have to do it as a draw call.
- *
- * FIXME: separate stencil.
- */
- if (subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) {
- struct v3dv_render_pass_attachment *att =
- &pass->attachments[subpass->ds_attachment.attachment];
- if (att->desc.format == VK_FORMAT_D24_UNORM_S8_UINT) {
- if (att->desc.loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
- att->desc.stencilLoadOp == VK_ATTACHMENT_LOAD_OP_LOAD) {
- subpass->do_depth_clear_with_draw = true;
- } else if (att->desc.loadOp == VK_ATTACHMENT_LOAD_OP_LOAD &&
- att->desc.stencilLoadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
- subpass->do_stencil_clear_with_draw = true;
- }
- }
+ check_do_depth_stencil_clear_with_draw(device, pass, subpass);
+
+ /* VK_KHR_depth_stencil_resolve */
+ const VkSubpassDescriptionDepthStencilResolve *resolve_desc =
+ vk_find_struct_const(desc->pNext, SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE);
+ const VkAttachmentReference2 *resolve_att =
+ resolve_desc && resolve_desc->pDepthStencilResolveAttachment &&
+ resolve_desc->pDepthStencilResolveAttachment->attachment != VK_ATTACHMENT_UNUSED ?
+ resolve_desc->pDepthStencilResolveAttachment : NULL;
+ if (resolve_att) {
+ subpass->ds_resolve_attachment = (struct v3dv_subpass_attachment) {
+ .attachment = resolve_att->attachment,
+ .layout = resolve_att->layout,
+ };
+ assert(resolve_desc->depthResolveMode == VK_RESOLVE_MODE_SAMPLE_ZERO_BIT ||
+ resolve_desc->stencilResolveMode == VK_RESOLVE_MODE_SAMPLE_ZERO_BIT);
+ subpass->resolve_depth =
+ resolve_desc->depthResolveMode != VK_RESOLVE_MODE_NONE &&
+ resolve_att->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT;
+ subpass->resolve_stencil =
+ resolve_desc->stencilResolveMode != VK_RESOLVE_MODE_NONE &&
+ resolve_att->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT;
+ } else {
+ subpass->ds_resolve_attachment.attachment = VK_ATTACHMENT_UNUSED;
+ subpass->resolve_depth = false;
+ subpass->resolve_stencil = false;
}
} else {
subpass->ds_attachment.attachment = VK_ATTACHMENT_UNUSED;
+ subpass->ds_resolve_attachment.attachment = VK_ATTACHMENT_UNUSED;
+ subpass->resolve_depth = false;
+ subpass->resolve_stencil = false;
}
}
@@ -280,50 +329,44 @@ subpass_get_granularity(struct v3dv_device *device,
uint32_t subpass_idx,
VkExtent2D *granularity)
{
- static const uint8_t tile_sizes[] = {
- 64, 64,
- 64, 32,
- 32, 32,
- 32, 16,
- 16, 16,
- 16, 8,
- 8, 8
- };
-
- /* Our tile size depends on the number of color attachments and the maximum
- * bpp across them.
- */
+ /* Granularity is defined by the tile size */
assert(subpass_idx < pass->subpass_count);
struct v3dv_subpass *subpass = &pass->subpasses[subpass_idx];
- const uint32_t color_attachment_count = subpass->color_count;
+ const uint32_t color_count = subpass->color_count;
+ bool msaa = false;
uint32_t max_internal_bpp = 0;
- for (uint32_t i = 0; i < color_attachment_count; i++) {
+ uint32_t total_color_bpp = 0;
+ for (uint32_t i = 0; i < color_count; i++) {
uint32_t attachment_idx = subpass->color_attachments[i].attachment;
if (attachment_idx == VK_ATTACHMENT_UNUSED)
continue;
- const VkAttachmentDescription *desc =
+ const VkAttachmentDescription2 *desc =
&pass->attachments[attachment_idx].desc;
const struct v3dv_format *format = v3dv_X(device, get_format)(desc->format);
uint32_t internal_type, internal_bpp;
+ /* We don't support rendering to YCbCr images */
+ assert(format->plane_count == 1);
v3dv_X(device, get_internal_type_bpp_for_output_format)
- (format->rt_type, &internal_type, &internal_bpp);
+ (format->planes[0].rt_type, &internal_type, &internal_bpp);
max_internal_bpp = MAX2(max_internal_bpp, internal_bpp);
- }
-
- uint32_t idx = 0;
- if (color_attachment_count > 2)
- idx += 2;
- else if (color_attachment_count > 1)
- idx += 1;
+ total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp);
- idx += max_internal_bpp;
+ if (desc->samples > VK_SAMPLE_COUNT_1_BIT)
+ msaa = true;
+ }
- assert(idx < ARRAY_SIZE(tile_sizes));
+ /* Even if double-buffer is requested, it may or may not end up enabled
+ * depending on run-time heuristics, so we report a conservative granularity
+ * here, with it disabled.
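+ *
+ * Illustrative example (assuming the v3d_choose_tile_size heuristics): a
+ * single 32 bpp color attachment without MSAA should yield a 64x64
+ * granularity, matching the first entry of the old tile_sizes table
+ * removed above.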
+ */
+ uint32_t width, height;
+ v3d_choose_tile_size(&device->devinfo, color_count,
+ max_internal_bpp, total_color_bpp, msaa,
+ false /* double-buffer */, &width, &height);
*granularity = (VkExtent2D) {
- .width = tile_sizes[idx * 2],
- .height = tile_sizes[idx * 2 + 1]
+ .width = width,
+ .height = height
};
}
@@ -390,3 +433,264 @@ v3dv_subpass_area_is_tile_aligned(struct v3dv_device *device,
(fb->has_edge_padding &&
area->offset.y + area->extent.height >= fb->height));
}
+
+static void
+setup_dynamic_attachment(struct v3dv_device *device,
+ struct v3dv_render_pass_attachment *att,
+ const VkRenderingAttachmentInfo *info,
+ bool is_stencil,
+ bool is_resolve)
+{
+ struct v3dv_image_view *view = v3dv_image_view_from_handle(info->imageView);
+
+ VkAttachmentLoadOp load_op, stencil_load_op;
+ VkAttachmentStoreOp store_op, stencil_store_op;
+
+ if (!is_stencil) {
+ stencil_load_op = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
+ stencil_store_op = VK_ATTACHMENT_STORE_OP_DONT_CARE;
+ if (!is_resolve) {
+ load_op = info->loadOp;
+ store_op = info->storeOp;
+ } else {
+ load_op = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
+ store_op = VK_ATTACHMENT_STORE_OP_STORE;
+ }
+ } else {
+ load_op = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
+ store_op = VK_ATTACHMENT_STORE_OP_DONT_CARE;
+ if (!is_resolve) {
+ stencil_load_op = info->loadOp;
+ stencil_store_op = info->storeOp;
+ } else {
+ stencil_load_op = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
+ stencil_store_op = VK_ATTACHMENT_STORE_OP_STORE;
+ }
+ }
+
+ att->desc = (VkAttachmentDescription2) {
+ .sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2,
+ .flags = 0,
+ .format = view->vk.format,
+ .samples = view->vk.image->samples,
+ .loadOp = load_op,
+ .storeOp = store_op,
+ .stencilLoadOp = stencil_load_op,
+ .stencilStoreOp = stencil_store_op,
+ .initialLayout = info->imageLayout,
+ .finalLayout = info->imageLayout,
+ };
+
+ if (is_resolve)
+ set_try_tlb_resolve(device, att);
+}
+
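+/* Builds a transient single-subpass render pass from the VkRenderingInfoKHR
+ * state so the driver can reuse its regular render pass code for dynamic
+ * rendering.
+ */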
+void
+v3dv_setup_dynamic_render_pass(struct v3dv_cmd_buffer *cmd_buffer,
+ const VkRenderingInfoKHR *info)
+{
+ struct v3dv_device *device = cmd_buffer->device;
+ struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
+
+ struct v3dv_render_pass *pass = &state->dynamic_pass;
+ struct v3dv_subpass *subpass = &state->dynamic_subpass;
+ struct v3dv_render_pass_attachment *pass_attachments =
+ &state->dynamic_attachments[0];
+ struct v3dv_subpass_attachment *subpass_attachments =
+ &state->dynamic_subpass_attachments[0];
+
+ memset(pass, 0, sizeof(*pass));
+ memset(subpass, 0, sizeof(*subpass));
+ memset(pass_attachments, 0, sizeof(state->dynamic_subpass_attachments));
+ memset(subpass_attachments, 0, sizeof(state->dynamic_subpass_attachments));
+
+ vk_object_base_init(&device->vk, (struct vk_object_base *) pass,
+ VK_OBJECT_TYPE_RENDER_PASS);
+
+ pass->attachments = pass_attachments;
+ pass->subpass_attachments = subpass_attachments;
+
+ subpass->view_mask = info->viewMask;
+ subpass->color_count = info->colorAttachmentCount;
+ subpass->color_attachments = &subpass_attachments[0];
+ subpass->resolve_attachments = &subpass_attachments[subpass->color_count];
+
+ pass->multiview_enabled = info->viewMask != 0;
+ pass->subpass_count = 1;
+ pass->subpasses = subpass;
+
+ int a = 0;
+ for (int i = 0; i < info->colorAttachmentCount; i++) {
+ struct v3dv_render_pass_attachment *att = &pass->attachments[a];
+ const VkRenderingAttachmentInfo *att_info = &info->pColorAttachments[i];
+
+ if (att_info->imageView == VK_NULL_HANDLE) {
+ subpass->color_attachments[i].attachment = VK_ATTACHMENT_UNUSED;
+ subpass->resolve_attachments[i].attachment = VK_ATTACHMENT_UNUSED;
+ continue;
+ }
+
+ setup_dynamic_attachment(device, att, att_info, false, false);
+ subpass->color_attachments[i].attachment = a++;
+ subpass->color_attachments[i].layout = att_info->imageLayout;
+
+ if (att_info->resolveMode != VK_RESOLVE_MODE_NONE) {
+ struct v3dv_render_pass_attachment *resolve_att = &pass->attachments[a];
+ setup_dynamic_attachment(device, resolve_att, att_info, false, true);
+ subpass->resolve_attachments[i].attachment = a++;
+ subpass->resolve_attachments[i].layout = att_info->resolveImageLayout;
+ } else {
+ subpass->resolve_attachments[i].attachment = VK_ATTACHMENT_UNUSED;
+ }
+ }
+
+ bool has_depth = info->pDepthAttachment &&
+ info->pDepthAttachment->imageView != VK_NULL_HANDLE;
+ bool has_stencil = info->pStencilAttachment &&
+ info->pStencilAttachment->imageView != VK_NULL_HANDLE;
+ if (has_depth || has_stencil) {
+ struct v3dv_render_pass_attachment *att = &pass->attachments[a];
+ subpass->ds_attachment.attachment = a++;
+
+ bool has_depth_resolve = false;
+ bool has_stencil_resolve = false;
+
+ if (has_depth) {
+ setup_dynamic_attachment(device, att, info->pDepthAttachment,
+ false, false);
+ subpass->ds_attachment.layout = info->pDepthAttachment->imageLayout;
+ has_depth_resolve =
+ info->pDepthAttachment->resolveMode != VK_RESOLVE_MODE_NONE;
+ }
+
+ if (has_stencil) {
+ if (has_depth) {
+ att->desc.stencilLoadOp = info->pStencilAttachment->loadOp;
+ att->desc.stencilStoreOp = info->pStencilAttachment->storeOp;
+ } else {
+ setup_dynamic_attachment(device, att, info->pStencilAttachment,
+ true, false);
+ subpass->ds_attachment.layout =
+ info->pStencilAttachment->imageLayout;
+ }
+ has_stencil_resolve =
+ info->pStencilAttachment->resolveMode != VK_RESOLVE_MODE_NONE;
+ }
+
+ if (has_depth_resolve || has_stencil_resolve) {
+ struct v3dv_render_pass_attachment *att = &pass->attachments[a];
+ subpass->ds_resolve_attachment.attachment = a++;
+ if (has_depth_resolve) {
+ setup_dynamic_attachment(device, att, info->pDepthAttachment,
+ false, true);
+ subpass->ds_resolve_attachment.layout =
+ info->pDepthAttachment->resolveImageLayout;
+ subpass->resolve_depth = true;
+ }
+ if (has_stencil_resolve) {
+ if (has_depth_resolve) {
+ att->desc.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
+ att->desc.stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE;
+ } else {
+ setup_dynamic_attachment(device, att, info->pStencilAttachment,
+ true, true);
+ subpass->ds_resolve_attachment.layout =
+ info->pStencilAttachment->resolveImageLayout;
+ }
+ subpass->resolve_stencil = true;
+ }
+ } else {
+ subpass->ds_resolve_attachment.attachment = VK_ATTACHMENT_UNUSED;
+ }
+ } else {
+ subpass->ds_attachment.attachment = VK_ATTACHMENT_UNUSED;
+ }
+
+ check_do_depth_stencil_clear_with_draw(device, pass, subpass);
+
+ pass->attachment_count = a;
+}
+
+void
+v3dv_setup_dynamic_render_pass_inheritance(struct v3dv_cmd_buffer *cmd_buffer,
+ const VkCommandBufferInheritanceRenderingInfo *info)
+{
+ struct v3dv_device *device = cmd_buffer->device;
+ struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
+
+ struct v3dv_render_pass *pass = &state->dynamic_pass;
+ struct v3dv_subpass *subpass = &state->dynamic_subpass;
+ struct v3dv_render_pass_attachment *pass_attachments =
+ &state->dynamic_attachments[0];
+ struct v3dv_subpass_attachment *subpass_attachments =
+ &state->dynamic_subpass_attachments[0];
+
+ memset(pass, 0, sizeof(*pass));
+ memset(subpass, 0, sizeof(*subpass));
+ memset(pass_attachments, 0, sizeof(state->dynamic_subpass_attachments));
+ memset(subpass_attachments, 0, sizeof(state->dynamic_subpass_attachments));
+
+ vk_object_base_init(&device->vk, (struct vk_object_base *) pass,
+ VK_OBJECT_TYPE_RENDER_PASS);
+
+ pass->attachments = pass_attachments;
+ pass->subpass_attachments = subpass_attachments;
+
+ subpass->view_mask = info->viewMask;
+ subpass->color_count = info->colorAttachmentCount;
+ subpass->color_attachments = &subpass_attachments[0];
+ subpass->resolve_attachments = NULL;
+
+ pass->multiview_enabled = info->viewMask != 0;
+ pass->subpass_count = 1;
+ pass->subpasses = subpass;
+
+ int a = 0;
+ for (int i = 0; i < info->colorAttachmentCount; i++) {
+ struct v3dv_render_pass_attachment *att = &pass->attachments[a];
+ const VkFormat format = info->pColorAttachmentFormats[i];
+
+ if (format == VK_FORMAT_UNDEFINED) {
+ subpass->color_attachments[i].attachment = VK_ATTACHMENT_UNUSED;
+ continue;
+ }
+
+ /* We don't have info about load/store, so we assume we load and we
+ * store.
+ */
+ att->desc.format = format;
+ att->desc.samples = info->rasterizationSamples;
+ att->desc.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
+ att->desc.storeOp = VK_ATTACHMENT_STORE_OP_STORE;
+ att->desc.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
+ att->desc.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE;
+ subpass->color_attachments[i].attachment = a++;
+ }
+
+ if (info->depthAttachmentFormat != VK_FORMAT_UNDEFINED ||
+ info->stencilAttachmentFormat != VK_FORMAT_UNDEFINED) {
+ struct v3dv_render_pass_attachment *att = &pass->attachments[a];
+ att->desc.format = info->depthAttachmentFormat != VK_FORMAT_UNDEFINED ?
+ info->depthAttachmentFormat : info->stencilAttachmentFormat;
+ att->desc.samples = info->rasterizationSamples;
+ if (vk_format_has_depth(att->desc.format)) {
+ att->desc.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
+ att->desc.storeOp = VK_ATTACHMENT_STORE_OP_STORE;
+ } else {
+ att->desc.loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
+ att->desc.storeOp = VK_ATTACHMENT_STORE_OP_DONT_CARE;
+ }
+ if (vk_format_has_stencil(att->desc.format)) {
+ att->desc.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
+ att->desc.stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE;
+ } else {
+ att->desc.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
+ att->desc.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE;
+ }
+ subpass->ds_attachment.attachment = a++;
+ } else {
+ subpass->ds_attachment.attachment = VK_ATTACHMENT_UNUSED;
+ }
+
+ pass->attachment_count = a;
+}
diff --git a/src/broadcom/vulkan/v3dv_pipeline.c b/src/broadcom/vulkan/v3dv_pipeline.c
index 44962c50508..9851a24c2cd 100644
--- a/src/broadcom/vulkan/v3dv_pipeline.c
+++ b/src/broadcom/vulkan/v3dv_pipeline.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2019 Raspberry Pi
+ * Copyright © 2019 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -26,18 +26,18 @@
#include "v3dv_debug.h"
#include "v3dv_private.h"
-#include "vk_format_info.h"
-
#include "common/v3d_debug.h"
+#include "qpu/qpu_disasm.h"
#include "compiler/nir/nir_builder.h"
#include "nir/nir_serialize.h"
#include "util/u_atomic.h"
-#include "util/u_prim.h"
#include "util/os_time.h"
-#include "vulkan/util/vk_format.h"
+#include "vk_format.h"
+#include "vk_nir_convert_ycbcr.h"
+#include "vk_pipeline.h"
static VkResult
compute_vpm_config(struct v3dv_pipeline *pipeline);
@@ -61,31 +61,15 @@ v3dv_print_v3d_key(struct v3d_key *key,
}
static void
-pipeline_compute_sha1_from_nir(nir_shader *nir,
- unsigned char sha1[20])
-{
- assert(nir);
- struct blob blob;
- blob_init(&blob);
-
- nir_serialize(&blob, nir, false);
- if (!blob.out_of_memory)
- _mesa_sha1_compute(blob.data, blob.size, sha1);
-
- blob_finish(&blob);
-}
-
-void
-v3dv_shader_module_internal_init(struct v3dv_device *device,
- struct vk_shader_module *module,
- nir_shader *nir)
+pipeline_compute_sha1_from_nir(struct v3dv_pipeline_stage *p_stage)
{
- vk_object_base_init(&device->vk, &module->base,
- VK_OBJECT_TYPE_SHADER_MODULE);
- module->nir = nir;
- module->size = 0;
+ VkPipelineShaderStageCreateInfo info = {
+ .module = vk_shader_module_handle_from_nir(p_stage->nir),
+ .pName = p_stage->entrypoint,
+ .stage = mesa_to_vk_shader_stage(p_stage->nir->info.stage),
+ };
- pipeline_compute_sha1_from_nir(nir, module->sha1);
+ vk_pipeline_hash_shader_stage(&info, NULL, p_stage->shader_sha1);
}
void
@@ -95,6 +79,10 @@ v3dv_shader_variant_destroy(struct v3dv_device *device,
/* The assembly BO is shared by all variants in the pipeline, so it can't
* be freed here and should be freed with the pipeline
*/
+ if (variant->qpu_insts) {
+ free(variant->qpu_insts);
+ variant->qpu_insts = NULL;
+ }
ralloc_free(variant->prog_data.base);
vk_free(&device->vk.alloc, variant);
}
@@ -118,22 +106,10 @@ pipeline_free_stages(struct v3dv_device *device,
{
assert(pipeline);
- /* FIXME: we can't just use a loop over mesa stage due the bin, would be
- * good to find an alternative.
- */
- destroy_pipeline_stage(device, pipeline->vs, pAllocator);
- destroy_pipeline_stage(device, pipeline->vs_bin, pAllocator);
- destroy_pipeline_stage(device, pipeline->gs, pAllocator);
- destroy_pipeline_stage(device, pipeline->gs_bin, pAllocator);
- destroy_pipeline_stage(device, pipeline->fs, pAllocator);
- destroy_pipeline_stage(device, pipeline->cs, pAllocator);
-
- pipeline->vs = NULL;
- pipeline->vs_bin = NULL;
- pipeline->gs = NULL;
- pipeline->gs_bin = NULL;
- pipeline->fs = NULL;
- pipeline->cs = NULL;
+ for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
+ destroy_pipeline_stage(device, pipeline->stages[stage], pAllocator);
+ pipeline->stages[stage] = NULL;
+ }
}
static void
@@ -161,6 +137,12 @@ v3dv_destroy_pipeline(struct v3dv_pipeline *pipeline,
pipeline->default_attribute_values = NULL;
}
+ if (pipeline->executables.mem_ctx)
+ ralloc_free(pipeline->executables.mem_ctx);
+
+ if (pipeline->layout)
+ v3dv_pipeline_layout_unref(device, pipeline->layout, pAllocator);
+
vk_object_free(&device->vk, pAllocator, pipeline);
}
@@ -181,31 +163,44 @@ v3dv_DestroyPipeline(VkDevice _device,
static const struct spirv_to_nir_options default_spirv_options = {
.caps = {
.device_group = true,
+ .float_controls = true,
.multiview = true,
+ .storage_8bit = true,
+ .storage_16bit = true,
+ .subgroup_ballot = true,
.subgroup_basic = true,
+ .subgroup_quad = true,
+ .subgroup_shuffle = true,
+ .subgroup_vote = true,
.variable_pointers = true,
+ .vk_memory_model = true,
+ .vk_memory_model_device_scope = true,
+ .physical_storage_buffer_address = true,
+ .workgroup_memory_explicit_layout = true,
+ .image_read_without_format = true,
+ .demote_to_helper_invocation = true,
},
.ubo_addr_format = nir_address_format_32bit_index_offset,
.ssbo_addr_format = nir_address_format_32bit_index_offset,
- .phys_ssbo_addr_format = nir_address_format_64bit_global,
+ .phys_ssbo_addr_format = nir_address_format_2x32bit_global,
.push_const_addr_format = nir_address_format_logical,
.shared_addr_format = nir_address_format_32bit_offset,
- .frag_coord_is_sysval = false,
};
const nir_shader_compiler_options v3dv_nir_options = {
.lower_uadd_sat = true,
+ .lower_usub_sat = true,
.lower_iadd_sat = true,
.lower_all_io_to_temps = true,
.lower_extract_byte = true,
.lower_extract_word = true,
.lower_insert_byte = true,
.lower_insert_word = true,
- .lower_bitfield_insert_to_shifts = true,
- .lower_bitfield_extract_to_shifts = true,
+ .lower_bitfield_insert = true,
+ .lower_bitfield_extract = true,
.lower_bitfield_reverse = true,
.lower_bit_count = true,
- .lower_cs_local_id_from_index = true,
+ .lower_cs_local_id_to_index = true,
.lower_ffract = true,
.lower_fmod = true,
.lower_pack_unorm_2x16 = true,
@@ -218,14 +213,9 @@ const nir_shader_compiler_options v3dv_nir_options = {
.lower_unpack_snorm_4x8 = true,
.lower_pack_half_2x16 = true,
.lower_unpack_half_2x16 = true,
- /* FIXME: see if we can avoid the uadd_carry and usub_borrow lowering and
- * get the tests to pass since it might produce slightly better code.
- */
- .lower_uadd_carry = true,
- .lower_usub_borrow = true,
- /* FIXME: check if we can use multop + umul24 to implement mul2x32_64
- * without lowering.
- */
+ .lower_pack_32_2x16 = true,
+ .lower_pack_32_2x16_split = true,
+ .lower_unpack_32_2x16_split = true,
.lower_mul_2x32_64 = true,
.lower_fdiv = true,
.lower_find_lsb = true,
@@ -240,10 +230,10 @@ const nir_shader_compiler_options v3dv_nir_options = {
.lower_isign = true,
.lower_ldexp = true,
.lower_mul_high = true,
- .lower_wpos_pntc = true,
- .lower_rotate = true,
+ .lower_wpos_pntc = false,
.lower_to_scalar = true,
.lower_device_index_to_zero = true,
+ .lower_fquantize2f16 = true,
.has_fsub = true,
.has_isub = true,
.vertex_id_zero_based = false, /* FIXME: to set this to true, the intrinsic
@@ -252,7 +242,7 @@ const nir_shader_compiler_options v3dv_nir_options = {
.max_unroll_iterations = 16,
.force_indirect_unrolling = (nir_var_shader_in | nir_var_function_temp),
.divergence_analysis_options =
- nir_divergence_multiple_workgroup_per_compute_subgroup
+ nir_divergence_multiple_workgroup_per_compute_subgroup,
};
const nir_shader_compiler_options *
@@ -261,95 +251,39 @@ v3dv_pipeline_get_nir_options(void)
return &v3dv_nir_options;
}
-#define OPT(pass, ...) ({ \
- bool this_progress = false; \
- NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \
- if (this_progress) \
- progress = true; \
- this_progress; \
-})
-
-static void
-nir_optimize(nir_shader *nir, bool allow_copies)
-{
- bool progress;
-
- do {
- progress = false;
- OPT(nir_split_array_vars, nir_var_function_temp);
- OPT(nir_shrink_vec_array_vars, nir_var_function_temp);
- OPT(nir_opt_deref);
- OPT(nir_lower_vars_to_ssa);
- if (allow_copies) {
- /* Only run this pass in the first call to nir_optimize. Later calls
- * assume that we've lowered away any copy_deref instructions and we
- * don't want to introduce any more.
- */
- OPT(nir_opt_find_array_copies);
- }
- OPT(nir_opt_copy_prop_vars);
- OPT(nir_opt_dead_write_vars);
- OPT(nir_opt_combine_stores, nir_var_all);
-
- OPT(nir_lower_alu_to_scalar, NULL, NULL);
-
- OPT(nir_copy_prop);
- OPT(nir_lower_phis_to_scalar, false);
-
- OPT(nir_copy_prop);
- OPT(nir_opt_dce);
- OPT(nir_opt_cse);
- OPT(nir_opt_combine_stores, nir_var_all);
-
- /* Passing 0 to the peephole select pass causes it to convert
- * if-statements that contain only move instructions in the branches
- * regardless of the count.
- *
- * Passing 1 to the peephole select pass causes it to convert
- * if-statements that contain at most a single ALU instruction (total)
- * in both branches.
- */
- OPT(nir_opt_peephole_select, 0, false, false);
- OPT(nir_opt_peephole_select, 8, false, true);
-
- OPT(nir_opt_intrinsics);
- OPT(nir_opt_idiv_const, 32);
- OPT(nir_opt_algebraic);
- OPT(nir_opt_constant_folding);
-
- OPT(nir_opt_dead_cf);
+static const struct vk_ycbcr_conversion_state *
+lookup_ycbcr_conversion(const void *_pipeline_layout, uint32_t set,
+ uint32_t binding, uint32_t array_index)
+{
+ struct v3dv_pipeline_layout *pipeline_layout =
+ (struct v3dv_pipeline_layout *) _pipeline_layout;
- OPT(nir_opt_if, false);
- OPT(nir_opt_conditional_discard);
+ assert(set < pipeline_layout->num_sets);
+ struct v3dv_descriptor_set_layout *set_layout =
+ pipeline_layout->set[set].layout;
- OPT(nir_opt_remove_phis);
- OPT(nir_opt_undef);
- OPT(nir_lower_pack);
- } while (progress);
+ assert(binding < set_layout->binding_count);
+ struct v3dv_descriptor_set_binding_layout *bind_layout =
+ &set_layout->binding[binding];
- OPT(nir_remove_dead_variables, nir_var_function_temp, NULL);
+ if (bind_layout->immutable_samplers_offset) {
+ const struct v3dv_sampler *immutable_samplers =
+ v3dv_immutable_samplers(set_layout, bind_layout);
+ const struct v3dv_sampler *sampler = &immutable_samplers[array_index];
+ return sampler->conversion ? &sampler->conversion->state : NULL;
+ } else {
+ return NULL;
+ }
}
static void
preprocess_nir(nir_shader *nir)
{
- /* We have to lower away local variable initializers right before we
- * inline functions. That way they get properly initialized at the top
- * of the function and not at the top of its caller.
- */
- NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp);
- NIR_PASS_V(nir, nir_lower_returns);
- NIR_PASS_V(nir, nir_inline_functions);
- NIR_PASS_V(nir, nir_opt_deref);
-
- /* Pick off the single entrypoint that we want */
- foreach_list_typed_safe(nir_function, func, node, &nir->functions) {
- if (func->is_entrypoint)
- func->name = ralloc_strdup(func, "main");
- else
- exec_node_remove(&func->node);
- }
- assert(exec_list_length(&nir->functions) == 1);
+ const struct nir_lower_sysvals_to_varyings_options sysvals_to_varyings = {
+ .frag_coord = true,
+ .point_coord = true,
+ };
+ NIR_PASS(_, nir, nir_lower_sysvals_to_varyings, &sysvals_to_varyings);
/* Vulkan uses the separate-shader linking model */
nir->info.separate_shader = true;
@@ -357,76 +291,63 @@ preprocess_nir(nir_shader *nir)
/* Make sure we lower variable initializers on output variables so that
* nir_remove_dead_variables below sees the corresponding stores
*/
- NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_shader_out);
-
- /* Now that we've deleted all but the main function, we can go ahead and
- * lower the rest of the variable initializers.
- */
- NIR_PASS_V(nir, nir_lower_variable_initializers, ~0);
-
- /* Split member structs. We do this before lower_io_to_temporaries so that
- * it doesn't lower system values to temporaries by accident.
- */
- NIR_PASS_V(nir, nir_split_var_copies);
- NIR_PASS_V(nir, nir_split_per_member_structs);
+ NIR_PASS(_, nir, nir_lower_variable_initializers, nir_var_shader_out);
if (nir->info.stage == MESA_SHADER_FRAGMENT)
- NIR_PASS_V(nir, nir_lower_io_to_vector, nir_var_shader_out);
+ NIR_PASS(_, nir, nir_lower_io_to_vector, nir_var_shader_out);
if (nir->info.stage == MESA_SHADER_FRAGMENT) {
- NIR_PASS_V(nir, nir_lower_input_attachments,
+ NIR_PASS(_, nir, nir_lower_input_attachments,
&(nir_input_attachment_options) {
.use_fragcoord_sysval = false,
});
}
- NIR_PASS_V(nir, nir_lower_explicit_io,
- nir_var_mem_push_const,
- nir_address_format_32bit_offset);
+ NIR_PASS_V(nir, nir_lower_io_to_temporaries,
+ nir_shader_get_entrypoint(nir), true, false);
- NIR_PASS_V(nir, nir_lower_explicit_io,
- nir_var_mem_ubo | nir_var_mem_ssbo,
- nir_address_format_32bit_index_offset);
+ NIR_PASS(_, nir, nir_lower_system_values);
- NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_shader_in |
- nir_var_shader_out | nir_var_system_value | nir_var_mem_shared,
- NULL);
+ NIR_PASS(_, nir, nir_lower_alu_to_scalar, NULL, NULL);
- NIR_PASS_V(nir, nir_propagate_invariant, false);
- NIR_PASS_V(nir, nir_lower_io_to_temporaries,
- nir_shader_get_entrypoint(nir), true, false);
+ NIR_PASS(_, nir, nir_normalize_cubemap_coords);
- NIR_PASS_V(nir, nir_lower_system_values);
- NIR_PASS_V(nir, nir_lower_clip_cull_distance_arrays);
+ NIR_PASS(_, nir, nir_lower_global_vars_to_local);
- NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);
+ NIR_PASS(_, nir, nir_split_var_copies);
+ NIR_PASS(_, nir, nir_split_struct_vars, nir_var_function_temp);
- NIR_PASS_V(nir, nir_normalize_cubemap_coords);
+ v3d_optimize_nir(NULL, nir);
- NIR_PASS_V(nir, nir_lower_global_vars_to_local);
+ NIR_PASS(_, nir, nir_lower_explicit_io,
+ nir_var_mem_push_const,
+ nir_address_format_32bit_offset);
- NIR_PASS_V(nir, nir_split_var_copies);
- NIR_PASS_V(nir, nir_split_struct_vars, nir_var_function_temp);
+ NIR_PASS(_, nir, nir_lower_explicit_io,
+ nir_var_mem_ubo | nir_var_mem_ssbo,
+ nir_address_format_32bit_index_offset);
- nir_optimize(nir, true);
+ NIR_PASS(_, nir, nir_lower_explicit_io,
+ nir_var_mem_global,
+ nir_address_format_2x32bit_global);
- NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
+ NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
/* Lower a bunch of stuff */
- NIR_PASS_V(nir, nir_lower_var_copies);
+ NIR_PASS(_, nir, nir_lower_var_copies);
- NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_shader_in, UINT32_MAX);
+ NIR_PASS(_, nir, nir_lower_indirect_derefs, nir_var_shader_in, UINT32_MAX);
- NIR_PASS_V(nir, nir_lower_indirect_derefs,
- nir_var_function_temp, 2);
+ NIR_PASS(_, nir, nir_lower_indirect_derefs,
+ nir_var_function_temp, 2);
- NIR_PASS_V(nir, nir_lower_array_deref_of_vec,
- nir_var_mem_ubo | nir_var_mem_ssbo,
- nir_lower_direct_array_deref_of_vec_load);
+ NIR_PASS(_, nir, nir_lower_array_deref_of_vec,
+ nir_var_mem_ubo | nir_var_mem_ssbo,
+ nir_lower_direct_array_deref_of_vec_load);
- NIR_PASS_V(nir, nir_lower_frexp);
+ NIR_PASS(_, nir, nir_lower_frexp);
/* Get rid of split copies */
- nir_optimize(nir, false);
+ v3d_optimize_nir(NULL, nir);
}
static nir_shader *
@@ -435,42 +356,35 @@ shader_module_compile_to_nir(struct v3dv_device *device,
{
nir_shader *nir;
const nir_shader_compiler_options *nir_options = &v3dv_nir_options;
+ gl_shader_stage gl_stage = broadcom_shader_stage_to_gl(stage->stage);
- if (!stage->module->nir) {
- uint32_t *spirv = (uint32_t *) stage->module->data;
- assert(stage->module->size % 4 == 0);
-
- if (V3D_DEBUG & V3D_DEBUG_DUMP_SPIRV)
- v3dv_print_spirv(stage->module->data, stage->module->size, stderr);
-
- uint32_t num_spec_entries = 0;
- struct nir_spirv_specialization *spec_entries =
- vk_spec_info_to_nir_spirv(stage->spec_info, &num_spec_entries);
- const struct spirv_to_nir_options spirv_options = default_spirv_options;
- nir = spirv_to_nir(spirv, stage->module->size / 4,
- spec_entries, num_spec_entries,
- broadcom_shader_stage_to_gl(stage->stage),
- stage->entrypoint,
- &spirv_options, nir_options);
- assert(nir);
- nir_validate_shader(nir, "after spirv_to_nir");
- free(spec_entries);
- } else {
- /* For NIR modules created by the driver we can't consume the NIR
- * directly, we need to clone it first, since ownership of the NIR code
- * (as with SPIR-V code for SPIR-V shaders), belongs to the creator
- * of the module and modules can be destroyed immediately after been used
- * to create pipelines.
- */
- nir = nir_shader_clone(NULL, stage->module->nir);
- nir_validate_shader(nir, "nir module");
+
+ if (V3D_DBG(DUMP_SPIRV) && stage->module->nir == NULL)
+ v3dv_print_spirv(stage->module->data, stage->module->size, stderr);
+
+ /* vk_shader_module_to_nir also handles internal shaders, when module->nir
+ * != NULL. It also calls nir_validate_shader in both cases, so we don't
+ * call it again here.
+ */
+ VkResult result = vk_shader_module_to_nir(&device->vk, stage->module,
+ gl_stage,
+ stage->entrypoint,
+ stage->spec_info,
+ &default_spirv_options,
+ nir_options,
+ NULL, &nir);
+ if (result != VK_SUCCESS)
+ return NULL;
+ assert(nir->info.stage == gl_stage);
+
+ if (V3D_DBG(SHADERDB) && stage->module->nir == NULL) {
+ char sha1buf[41];
+ _mesa_sha1_format(sha1buf, stage->pipeline->sha1);
+ nir->info.name = ralloc_strdup(nir, sha1buf);
}
- assert(nir->info.stage == broadcom_shader_stage_to_gl(stage->stage));
- if (V3D_DEBUG & (V3D_DEBUG_NIR |
- v3d_debug_flag_for_shader_stage(
- broadcom_shader_stage_to_gl(stage->stage)))) {
- fprintf(stderr, "Initial form: %s prog %d NIR:\n",
+ if (V3D_DBG(NIR) || v3d_debug_flag_for_shader_stage(gl_stage)) {
+ fprintf(stderr, "NIR after vk_shader_module_to_nir: %s prog %d NIR:\n",
broadcom_shader_stage_name(stage->stage),
stage->program_id);
nir_print_shader(nir, stderr);
@@ -497,17 +411,21 @@ descriptor_map_add(struct v3dv_descriptor_map *map,
int binding,
int array_index,
int array_size,
- uint8_t return_size)
+ int start_index,
+ uint8_t return_size,
+ uint8_t plane)
{
assert(array_index < array_size);
assert(return_size == 16 || return_size == 32);
- unsigned index = 0;
- for (unsigned i = 0; i < map->num_desc; i++) {
- if (set == map->set[i] &&
- binding == map->binding[i] &&
- array_index == map->array_index[i]) {
- assert(array_size == map->array_size[i]);
+ unsigned index = start_index;
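+ /* Scan from start_index: reuse an existing entry that matches, or stop at
+ * the first unused slot and claim it below.
+ */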
+ for (; index < map->num_desc; index++) {
+ if (map->used[index] &&
+ set == map->set[index] &&
+ binding == map->binding[index] &&
+ array_index == map->array_index[index] &&
+ plane == map->plane[index]) {
+ assert(array_size == map->array_size[index]);
if (return_size != map->return_size[index]) {
/* If the return_size is different it means that the same sampler
* was used for operations with different precision
@@ -517,26 +435,36 @@ descriptor_map_add(struct v3dv_descriptor_map *map,
map->return_size[index] = 32;
}
return index;
+ } else if (!map->used[index]) {
+ break;
}
- index++;
}
- assert(index == map->num_desc);
+ assert(index < DESCRIPTOR_MAP_SIZE);
+ assert(!map->used[index]);
- map->set[map->num_desc] = set;
- map->binding[map->num_desc] = binding;
- map->array_index[map->num_desc] = array_index;
- map->array_size[map->num_desc] = array_size;
- map->return_size[map->num_desc] = return_size;
- map->num_desc++;
+ map->used[index] = true;
+ map->set[index] = set;
+ map->binding[index] = binding;
+ map->array_index[index] = array_index;
+ map->array_size[index] = array_size;
+ map->return_size[index] = return_size;
+ map->plane[index] = plane;
+ map->num_desc = MAX2(map->num_desc, index + 1);
return index;
}
+struct lower_pipeline_layout_state {
+ struct v3dv_pipeline *pipeline;
+ const struct v3dv_pipeline_layout *layout;
+ bool needs_default_sampler_state;
+};
+
static void
lower_load_push_constant(nir_builder *b, nir_intrinsic_instr *instr,
- struct v3dv_pipeline *pipeline)
+ struct lower_pipeline_layout_state *state)
{
assert(instr->intrinsic == nir_intrinsic_load_push_constant);
instr->intrinsic = nir_intrinsic_load_uniform;
@@ -568,8 +496,11 @@ pipeline_get_descriptor_map(struct v3dv_pipeline *pipeline,
&pipeline->shared_data->maps[broadcom_stage]->sampler_map :
&pipeline->shared_data->maps[broadcom_stage]->texture_map;
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
+ case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
+ case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
return &pipeline->shared_data->maps[broadcom_stage]->ubo_map;
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
+ case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
return &pipeline->shared_data->maps[broadcom_stage]->ssbo_map;
default:
unreachable("Descriptor type unknown or not having a descriptor map");
@@ -581,9 +512,7 @@ pipeline_get_descriptor_map(struct v3dv_pipeline *pipeline,
static void
lower_vulkan_resource_index(nir_builder *b,
nir_intrinsic_instr *instr,
- nir_shader *shader,
- struct v3dv_pipeline *pipeline,
- const struct v3dv_pipeline_layout *layout)
+ struct lower_pipeline_layout_state *state)
{
assert(instr->intrinsic == nir_intrinsic_vulkan_resource_index);
@@ -591,35 +520,50 @@ lower_vulkan_resource_index(nir_builder *b,
unsigned set = nir_intrinsic_desc_set(instr);
unsigned binding = nir_intrinsic_binding(instr);
- struct v3dv_descriptor_set_layout *set_layout = layout->set[set].layout;
+ struct v3dv_descriptor_set_layout *set_layout = state->layout->set[set].layout;
struct v3dv_descriptor_set_binding_layout *binding_layout =
&set_layout->binding[binding];
unsigned index = 0;
- const VkDescriptorType desc_type = nir_intrinsic_desc_type(instr);
- switch (desc_type) {
+ switch (binding_layout->type) {
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
- case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: {
+ case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
+ case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
+ case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
+ case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: {
struct v3dv_descriptor_map *descriptor_map =
- pipeline_get_descriptor_map(pipeline, desc_type, shader->info.stage, false);
+ pipeline_get_descriptor_map(state->pipeline, binding_layout->type,
+ b->shader->info.stage, false);
if (!const_val)
unreachable("non-constant vulkan_resource_index array index");
+ /* At compile-time we will need to know if we are processing a UBO load
+ * for an inline or a regular UBO so we can handle inline loads like
+ * push constants. At the NIR level, however, the inline
+ * information is gone, so we rely on the index to make this distinction.
+ * Particularly, we reserve indices 1..MAX_INLINE_UNIFORM_BUFFERS for
+ * inline buffers. This means that at the descriptor map level
+ * we store inline buffers at slots 0..MAX_INLINE_UNIFORM_BUFFERS - 1,
+ * and regular UBOs at indices starting from MAX_INLINE_UNIFORM_BUFFERS.
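+ *
+ * Illustrative example (if MAX_INLINE_UNIFORM_BUFFERS were 4): inline
+ * uniform blocks would occupy map slots 0..3 and the first regular UBO
+ * would land at slot 4, via the start_index passed to descriptor_map_add.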
+ */
+ uint32_t start_index = 0;
+ if (binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ||
+ binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) {
+ start_index += MAX_INLINE_UNIFORM_BUFFERS;
+ }
+
index = descriptor_map_add(descriptor_map, set, binding,
const_val->u32,
binding_layout->array_size,
- 32 /* return_size: doesn't really apply for this case */);
-
- if (desc_type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
- /* skip index 0 which is used for push constants */
- index++;
- }
+ start_index,
+ 32 /* return_size: doesn't really apply for this case */,
+ 0);
break;
}
default:
- unreachable("unsupported desc_type for vulkan_resource_index");
+ unreachable("unsupported descriptor type for vulkan_resource_index");
break;
}
@@ -627,30 +571,43 @@ lower_vulkan_resource_index(nir_builder *b,
* vulkan_load_descriptor returns a vec2 providing an index and
* offset. Our backend compiler only cares about the index part.
*/
- nir_ssa_def_rewrite_uses(&instr->dest.ssa,
+ nir_def_rewrite_uses(&instr->def,
nir_imm_ivec2(b, index, 0));
nir_instr_remove(&instr->instr);
}
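+/* The YCbCr lowering (vk_nir_convert_ycbcr) can tag texture instructions
+ * with a nir_tex_src_plane source; read and strip it here so the plane can
+ * be used to select the per-plane descriptor entry.
+ */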
+static uint8_t
+tex_instr_get_and_remove_plane_src(nir_tex_instr *tex)
+{
+ int plane_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_plane);
+ if (plane_src_idx < 0)
+ return 0;
+
+ uint8_t plane = nir_src_as_uint(tex->src[plane_src_idx].src);
+ nir_tex_instr_remove_src(tex, plane_src_idx);
+ return plane;
+}
+
/* Returns return_size, so it could be used for the case of not having a
* sampler object
*/
static uint8_t
-lower_tex_src_to_offset(nir_builder *b, nir_tex_instr *instr, unsigned src_idx,
- nir_shader *shader,
- struct v3dv_pipeline *pipeline,
- const struct v3dv_pipeline_layout *layout)
+lower_tex_src(nir_builder *b,
+ nir_tex_instr *instr,
+ unsigned src_idx,
+ struct lower_pipeline_layout_state *state)
{
- nir_ssa_def *index = NULL;
+ nir_def *index = NULL;
unsigned base_index = 0;
unsigned array_elements = 1;
nir_tex_src *src = &instr->src[src_idx];
bool is_sampler = src->src_type == nir_tex_src_sampler_deref;
+ uint8_t plane = tex_instr_get_and_remove_plane_src(instr);
+
/* We compute first the offsets */
nir_deref_instr *deref = nir_instr_as_deref(src->src.ssa->parent_instr);
while (deref->deref_type != nir_deref_type_var) {
- assert(deref->parent.is_ssa);
nir_deref_instr *parent =
nir_instr_as_deref(deref->parent.ssa->parent_instr);
@@ -667,8 +624,8 @@ lower_tex_src_to_offset(nir_builder *b, nir_tex_instr *instr, unsigned src_idx,
}
index = nir_iadd(b, index,
- nir_imul(b, nir_imm_int(b, array_elements),
- nir_ssa_for_src(b, deref->arr.index, 1)));
+ nir_imul_imm(b, deref->arr.index.ssa,
+ array_elements));
}
array_elements *= glsl_get_length(parent->type);
@@ -683,8 +640,7 @@ lower_tex_src_to_offset(nir_builder *b, nir_tex_instr *instr, unsigned src_idx,
* instr if needed
*/
if (index) {
- nir_instr_rewrite_src(&instr->instr, &src->src,
- nir_src_for_ssa(index));
+ nir_src_rewrite(&src->src, index);
src->src_type = is_sampler ?
nir_tex_src_sampler_offset :
@@ -696,13 +652,13 @@ lower_tex_src_to_offset(nir_builder *b, nir_tex_instr *instr, unsigned src_idx,
uint32_t set = deref->var->data.descriptor_set;
uint32_t binding = deref->var->data.binding;
/* FIXME: this is a really simplified check for the precision to be used
- * for the sampling. Right now we are ony checking for the variables used
+ * for the sampling. Right now we are only checking for the variables used
* on the operation itself, but there are other cases that we could use to
* infer the precision requirement.
*/
bool relaxed_precision = deref->var->data.precision == GLSL_PRECISION_MEDIUM ||
deref->var->data.precision == GLSL_PRECISION_LOW;
- struct v3dv_descriptor_set_layout *set_layout = layout->set[set].layout;
+ struct v3dv_descriptor_set_layout *set_layout = state->layout->set[set].layout;
struct v3dv_descriptor_set_binding_layout *binding_layout =
&set_layout->binding[binding];
@@ -714,23 +670,25 @@ lower_tex_src_to_offset(nir_builder *b, nir_tex_instr *instr, unsigned src_idx,
base_index;
uint8_t return_size;
- if (unlikely(V3D_DEBUG & V3D_DEBUG_TMU_16BIT))
+ if (V3D_DBG(TMU_16BIT))
return_size = 16;
- else if (unlikely(V3D_DEBUG & V3D_DEBUG_TMU_32BIT))
+ else if (V3D_DBG(TMU_32BIT))
return_size = 32;
else
- return_size = relaxed_precision || instr->is_shadow ? 16 : 32;
+ return_size = relaxed_precision ? 16 : 32;
struct v3dv_descriptor_map *map =
- pipeline_get_descriptor_map(pipeline, binding_layout->type,
- shader->info.stage, is_sampler);
+ pipeline_get_descriptor_map(state->pipeline, binding_layout->type,
+ b->shader->info.stage, is_sampler);
int desc_index =
descriptor_map_add(map,
deref->var->data.descriptor_set,
deref->var->data.binding,
array_index,
binding_layout->array_size,
- return_size);
+ 0,
+ return_size,
+ plane);
if (is_sampler)
instr->sampler_index = desc_index;
@@ -741,10 +699,9 @@ lower_tex_src_to_offset(nir_builder *b, nir_tex_instr *instr, unsigned src_idx,
}
static bool
-lower_sampler(nir_builder *b, nir_tex_instr *instr,
- nir_shader *shader,
- struct v3dv_pipeline *pipeline,
- const struct v3dv_pipeline_layout *layout)
+lower_sampler(nir_builder *b,
+ nir_tex_instr *instr,
+ struct lower_pipeline_layout_state *state)
{
uint8_t return_size = 0;
@@ -752,44 +709,43 @@ lower_sampler(nir_builder *b, nir_tex_instr *instr,
nir_tex_instr_src_index(instr, nir_tex_src_texture_deref);
if (texture_idx >= 0)
- return_size = lower_tex_src_to_offset(b, instr, texture_idx, shader,
- pipeline, layout);
+ return_size = lower_tex_src(b, instr, texture_idx, state);
int sampler_idx =
nir_tex_instr_src_index(instr, nir_tex_src_sampler_deref);
- if (sampler_idx >= 0)
- lower_tex_src_to_offset(b, instr, sampler_idx, shader, pipeline, layout);
+ if (sampler_idx >= 0) {
+ assert(nir_tex_instr_need_sampler(instr));
+ lower_tex_src(b, instr, sampler_idx, state);
+ }
if (texture_idx < 0 && sampler_idx < 0)
return false;
- /* If we don't have a sampler, we assign it the idx we reserve for this
- * case, and we ensure that it is using the correct return size.
+ /* If the instruction doesn't have a sampler (e.g. txf) we use backend_flags
+ * to bind a default sampler state to configure precision.
*/
if (sampler_idx < 0) {
- instr->sampler_index = return_size == 16 ?
+ state->needs_default_sampler_state = true;
+ instr->backend_flags = return_size == 16 ?
V3DV_NO_SAMPLER_16BIT_IDX : V3DV_NO_SAMPLER_32BIT_IDX;
}
return true;
}
-/* FIXME: really similar to lower_tex_src_to_offset, perhaps refactor? */
+/* FIXME: really similar to lower_tex_src, perhaps refactor? */
static void
lower_image_deref(nir_builder *b,
nir_intrinsic_instr *instr,
- nir_shader *shader,
- struct v3dv_pipeline *pipeline,
- const struct v3dv_pipeline_layout *layout)
+ struct lower_pipeline_layout_state *state)
{
nir_deref_instr *deref = nir_src_as_deref(instr->src[0]);
- nir_ssa_def *index = NULL;
+ nir_def *index = NULL;
unsigned array_elements = 1;
unsigned base_index = 0;
while (deref->deref_type != nir_deref_type_var) {
- assert(deref->parent.is_ssa);
nir_deref_instr *parent =
nir_instr_as_deref(deref->parent.ssa->parent_instr);
@@ -806,8 +762,8 @@ lower_image_deref(nir_builder *b,
}
index = nir_iadd(b, index,
- nir_imul(b, nir_imm_int(b, array_elements),
- nir_ssa_for_src(b, deref->arr.index, 1)));
+ nir_imul_imm(b, deref->arr.index.ssa,
+ array_elements));
}
array_elements *= glsl_get_length(parent->type);
@@ -820,7 +776,7 @@ lower_image_deref(nir_builder *b,
uint32_t set = deref->var->data.descriptor_set;
uint32_t binding = deref->var->data.binding;
- struct v3dv_descriptor_set_layout *set_layout = layout->set[set].layout;
+ struct v3dv_descriptor_set_layout *set_layout = state->layout->set[set].layout;
struct v3dv_descriptor_set_binding_layout *binding_layout =
&set_layout->binding[binding];
@@ -830,8 +786,8 @@ lower_image_deref(nir_builder *b,
binding_layout->type == VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER);
struct v3dv_descriptor_map *map =
- pipeline_get_descriptor_map(pipeline, binding_layout->type,
- shader->info.stage, false);
+ pipeline_get_descriptor_map(state->pipeline, binding_layout->type,
+ b->shader->info.stage, false);
int desc_index =
descriptor_map_add(map,
@@ -839,7 +795,9 @@ lower_image_deref(nir_builder *b,
deref->var->data.binding,
array_index,
binding_layout->array_size,
- 32 /* return_size: doesn't apply for textures */);
+ 0,
+ 32 /* return_size: doesn't apply for textures */,
+ 0);
/* Note: we don't need to do anything here in relation to the precision and
* the output size because for images we can infer that info from the image
@@ -853,53 +811,35 @@ lower_image_deref(nir_builder *b,
}
static bool
-lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr,
- nir_shader *shader,
- struct v3dv_pipeline *pipeline,
- const struct v3dv_pipeline_layout *layout)
+lower_intrinsic(nir_builder *b,
+ nir_intrinsic_instr *instr,
+ struct lower_pipeline_layout_state *state)
{
switch (instr->intrinsic) {
- case nir_intrinsic_load_layer_id:
- /* FIXME: if layered rendering gets supported, this would need a real
- * lowering
- */
- nir_ssa_def_rewrite_uses(&instr->dest.ssa,
- nir_imm_int(b, 0));
- nir_instr_remove(&instr->instr);
- return true;
-
case nir_intrinsic_load_push_constant:
- lower_load_push_constant(b, instr, pipeline);
+ lower_load_push_constant(b, instr, state);
return true;
case nir_intrinsic_vulkan_resource_index:
- lower_vulkan_resource_index(b, instr, shader, pipeline, layout);
+ lower_vulkan_resource_index(b, instr, state);
return true;
case nir_intrinsic_load_vulkan_descriptor: {
/* Loading the descriptor happens as part of load/store instructions,
* so for us this is a no-op.
*/
- nir_ssa_def_rewrite_uses(&instr->dest.ssa, instr->src[0].ssa);
+ nir_def_rewrite_uses(&instr->def, instr->src[0].ssa);
nir_instr_remove(&instr->instr);
return true;
}
case nir_intrinsic_image_deref_load:
case nir_intrinsic_image_deref_store:
- case nir_intrinsic_image_deref_atomic_add:
- case nir_intrinsic_image_deref_atomic_imin:
- case nir_intrinsic_image_deref_atomic_umin:
- case nir_intrinsic_image_deref_atomic_imax:
- case nir_intrinsic_image_deref_atomic_umax:
- case nir_intrinsic_image_deref_atomic_and:
- case nir_intrinsic_image_deref_atomic_or:
- case nir_intrinsic_image_deref_atomic_xor:
- case nir_intrinsic_image_deref_atomic_exchange:
- case nir_intrinsic_image_deref_atomic_comp_swap:
+ case nir_intrinsic_image_deref_atomic:
+ case nir_intrinsic_image_deref_atomic_swap:
case nir_intrinsic_image_deref_size:
case nir_intrinsic_image_deref_samples:
- lower_image_deref(b, instr, shader, pipeline, layout);
+ lower_image_deref(b, instr, state);
return true;
default:
@@ -908,32 +848,23 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr,
}
static bool
-lower_impl(nir_function_impl *impl,
- nir_shader *shader,
- struct v3dv_pipeline *pipeline,
- const struct v3dv_pipeline_layout *layout)
+lower_pipeline_layout_cb(nir_builder *b,
+ nir_instr *instr,
+ void *_state)
{
- nir_builder b;
- nir_builder_init(&b, impl);
bool progress = false;
+ struct lower_pipeline_layout_state *state = _state;
- nir_foreach_block(block, impl) {
- nir_foreach_instr_safe(instr, block) {
- b.cursor = nir_before_instr(instr);
- switch (instr->type) {
- case nir_instr_type_tex:
- progress |=
- lower_sampler(&b, nir_instr_as_tex(instr), shader, pipeline, layout);
- break;
- case nir_instr_type_intrinsic:
- progress |=
- lower_intrinsic(&b, nir_instr_as_intrinsic(instr), shader,
- pipeline, layout);
- break;
- default:
- break;
- }
- }
+ b->cursor = nir_before_instr(instr);
+ switch (instr->type) {
+ case nir_instr_type_tex:
+ progress |= lower_sampler(b, nir_instr_as_tex(instr), state);
+ break;
+ case nir_instr_type_intrinsic:
+ progress |= lower_intrinsic(b, nir_instr_as_intrinsic(instr), state);
+ break;
+ default:
+ break;
}
return progress;
@@ -942,25 +873,62 @@ lower_impl(nir_function_impl *impl,
static bool
lower_pipeline_layout_info(nir_shader *shader,
struct v3dv_pipeline *pipeline,
- const struct v3dv_pipeline_layout *layout)
+ const struct v3dv_pipeline_layout *layout,
+ bool *needs_default_sampler_state)
{
bool progress = false;
- nir_foreach_function(function, shader) {
- if (function->impl)
- progress |= lower_impl(function->impl, shader, pipeline, layout);
- }
+ struct lower_pipeline_layout_state state = {
+ .pipeline = pipeline,
+ .layout = layout,
+ .needs_default_sampler_state = false,
+ };
+
+ progress = nir_shader_instructions_pass(shader, lower_pipeline_layout_cb,
+ nir_metadata_block_index |
+ nir_metadata_dominance,
+ &state);
+
+ *needs_default_sampler_state = state.needs_default_sampler_state;
return progress;
}
+/* This flips gl_PointCoord.y to match Vulkan requirements */
+static bool
+lower_point_coord_cb(nir_builder *b, nir_intrinsic_instr *intr, void *_state)
+{
+ if (intr->intrinsic != nir_intrinsic_load_input)
+ return false;
+
+ if (nir_intrinsic_io_semantics(intr).location != VARYING_SLOT_PNTC)
+ return false;
+
+ b->cursor = nir_after_instr(&intr->instr);
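+ /* Rewrite channel 1 (y) of the loaded point coord as 1.0 - y. */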
+ nir_def *result = &intr->def;
+ result =
+ nir_vector_insert_imm(b, result,
+ nir_fsub_imm(b, 1.0, nir_channel(b, result, 1)), 1);
+ nir_def_rewrite_uses_after(&intr->def,
+ result, result->parent_instr);
+ return true;
+}
+
+static bool
+v3d_nir_lower_point_coord(nir_shader *s)
+{
+ assert(s->info.stage == MESA_SHADER_FRAGMENT);
+ return nir_shader_intrinsics_pass(s, lower_point_coord_cb,
+ nir_metadata_block_index |
+ nir_metadata_dominance, NULL);
+}
static void
lower_fs_io(nir_shader *nir)
{
/* Our backend doesn't handle array fragment shader outputs */
NIR_PASS_V(nir, nir_lower_io_arrays_to_elements_no_indirects, false);
- NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_shader_out, NULL);
+ NIR_PASS(_, nir, nir_remove_dead_variables, nir_var_shader_out, NULL);
nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs,
MESA_SHADER_FRAGMENT);
@@ -968,8 +936,8 @@ lower_fs_io(nir_shader *nir)
nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs,
MESA_SHADER_FRAGMENT);
- NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
- type_size_vec4, 0);
+ NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
+ type_size_vec4, 0);
}
static void
@@ -1014,8 +982,7 @@ shader_debug_output(const char *message, void *data)
static void
pipeline_populate_v3d_key(struct v3d_key *key,
const struct v3dv_pipeline_stage *p_stage,
- uint32_t ucp_enables,
- bool robust_buffer_access)
+ uint32_t ucp_enables)
{
assert(p_stage->pipeline->shared_data &&
p_stage->pipeline->shared_data->maps[p_stage->stage]);
@@ -1051,7 +1018,8 @@ pipeline_populate_v3d_key(struct v3d_key *key,
switch (p_stage->stage) {
case BROADCOM_SHADER_VERTEX:
case BROADCOM_SHADER_VERTEX_BIN:
- key->is_last_geometry_stage = p_stage->pipeline->gs == NULL;
+ key->is_last_geometry_stage =
+ p_stage->pipeline->stages[BROADCOM_SHADER_GEOMETRY] == NULL;
break;
case BROADCOM_SHADER_GEOMETRY:
case BROADCOM_SHADER_GEOMETRY_BIN:
@@ -1078,27 +1046,42 @@ pipeline_populate_v3d_key(struct v3d_key *key,
*/
key->ucp_enables = ucp_enables;
- key->robust_buffer_access = robust_buffer_access;
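+ /* Map the per-stage VK_EXT_pipeline_robustness state into the compiler key;
+ * this replaces the robust_buffer_access parameter removed above.
+ */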
+ const VkPipelineRobustnessBufferBehaviorEXT robust_buffer_enabled =
+ VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_EXT;
- key->environment = V3D_ENVIRONMENT_VULKAN;
+ const VkPipelineRobustnessImageBehaviorEXT robust_image_enabled =
+ VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_ROBUST_IMAGE_ACCESS_EXT;
+
+ key->robust_uniform_access =
+ p_stage->robustness.uniform_buffers == robust_buffer_enabled;
+ key->robust_storage_access =
+ p_stage->robustness.storage_buffers == robust_buffer_enabled;
+ key->robust_image_access =
+ p_stage->robustness.images == robust_image_enabled;
}
 /* FIXME: anv maps to the hw primitive type. Perhaps eventually we would do
  * the same. For now we use prim_mode, which is what is already used on v3d.
*/
-static const enum pipe_prim_type vk_to_pipe_prim_type[] = {
- [VK_PRIMITIVE_TOPOLOGY_POINT_LIST] = PIPE_PRIM_POINTS,
- [VK_PRIMITIVE_TOPOLOGY_LINE_LIST] = PIPE_PRIM_LINES,
- [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP] = PIPE_PRIM_LINE_STRIP,
- [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST] = PIPE_PRIM_TRIANGLES,
- [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP] = PIPE_PRIM_TRIANGLE_STRIP,
- [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN] = PIPE_PRIM_TRIANGLE_FAN,
- [VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY] = PIPE_PRIM_LINES_ADJACENCY,
- [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY] = PIPE_PRIM_LINE_STRIP_ADJACENCY,
- [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY] = PIPE_PRIM_TRIANGLES_ADJACENCY,
- [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY,
+static const enum mesa_prim vk_to_mesa_prim[] = {
+ [VK_PRIMITIVE_TOPOLOGY_POINT_LIST] = MESA_PRIM_POINTS,
+ [VK_PRIMITIVE_TOPOLOGY_LINE_LIST] = MESA_PRIM_LINES,
+ [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP] = MESA_PRIM_LINE_STRIP,
+ [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST] = MESA_PRIM_TRIANGLES,
+ [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP] = MESA_PRIM_TRIANGLE_STRIP,
+ [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN] = MESA_PRIM_TRIANGLE_FAN,
+ [VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY] = MESA_PRIM_LINES_ADJACENCY,
+ [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY] = MESA_PRIM_LINE_STRIP_ADJACENCY,
+ [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY] = MESA_PRIM_TRIANGLES_ADJACENCY,
+ [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = MESA_PRIM_TRIANGLE_STRIP_ADJACENCY,
};
+uint32_t
+v3dv_pipeline_primitive(VkPrimitiveTopology vk_prim)
+{
+ return v3d_hw_prim_type(vk_to_mesa_prim[vk_prim]);
+}
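+
+/* A usage sketch (hypothetical caller, for illustration only): the helper
+ * above simply composes the two mappings, so
+ *
+ *    uint32_t hw_prim =
+ *       v3dv_pipeline_primitive(VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP);
+ *
+ * is equivalent to v3d_hw_prim_type(MESA_PRIM_TRIANGLE_STRIP).
+ */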
+
static const enum pipe_logicop vk_to_pipe_logicop[] = {
[VK_LOGIC_OP_CLEAR] = PIPE_LOGICOP_CLEAR,
[VK_LOGIC_OP_AND] = PIPE_LOGICOP_AND,
@@ -1118,9 +1101,74 @@ static const enum pipe_logicop vk_to_pipe_logicop[] = {
[VK_LOGIC_OP_SET] = PIPE_LOGICOP_SET,
};
+static bool
+enable_line_smooth(uint8_t topology,
+ const VkPipelineRasterizationStateCreateInfo *rs_info)
+{
+ if (!rs_info || rs_info->rasterizerDiscardEnable)
+ return false;
+
+ const VkPipelineRasterizationLineStateCreateInfoKHR *ls_info =
+ vk_find_struct_const(rs_info->pNext,
+ PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_KHR);
+
+ if (!ls_info)
+ return false;
+
+ switch(topology) {
+ case MESA_PRIM_LINES:
+ case MESA_PRIM_LINE_LOOP:
+ case MESA_PRIM_LINE_STRIP:
+ case MESA_PRIM_LINES_ADJACENCY:
+ case MESA_PRIM_LINE_STRIP_ADJACENCY:
+ return ls_info->lineRasterizationMode == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_KHR;
+ default:
+ return false;
+ }
+}
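+
+/* For reference, a minimal sketch of how an application would request smooth
+ * line rasterization (application-side code, not part of the driver), which
+ * is what the helper above detects through the pNext chain:
+ *
+ *    VkPipelineRasterizationLineStateCreateInfoKHR line_info = {
+ *       .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_KHR,
+ *       .lineRasterizationMode =
+ *          VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_KHR,
+ *    };
+ *    VkPipelineRasterizationStateCreateInfo rs_info = {
+ *       .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
+ *       .pNext = &line_info,
+ *       ...
+ *    };
+ *
+ * It only takes effect for line topologies and when rasterization is enabled.
+ */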
+
+static void
+v3d_fs_key_set_color_attachment(struct v3d_fs_key *key,
+ const struct v3dv_pipeline_stage *p_stage,
+ uint32_t index,
+ VkFormat fb_format)
+{
+ key->cbufs |= 1 << index;
+
+ enum pipe_format fb_pipe_format = vk_format_to_pipe_format(fb_format);
+
+ /* If logic operations are enabled then we might emit color reads and we
+ * need to know the color buffer format and swizzle for that
+ */
+ if (key->logicop_func != PIPE_LOGICOP_COPY) {
+ /* Framebuffer formats should be single plane */
+ assert(vk_format_get_plane_count(fb_format) == 1);
+ key->color_fmt[index].format = fb_pipe_format;
+ memcpy(key->color_fmt[index].swizzle,
+ v3dv_get_format_swizzle(p_stage->pipeline->device, fb_format, 0),
+ sizeof(key->color_fmt[index].swizzle));
+ }
+
+ const struct util_format_description *desc =
+ vk_format_description(fb_format);
+
+ if (desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
+ desc->channel[0].size == 32) {
+ key->f32_color_rb |= 1 << index;
+ }
+
+ if (p_stage->nir->info.fs.untyped_color_outputs) {
+ if (util_format_is_pure_uint(fb_pipe_format))
+ key->uint_color_rb |= 1 << index;
+ else if (util_format_is_pure_sint(fb_pipe_format))
+ key->int_color_rb |= 1 << index;
+ }
+}
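+
+/* As an illustration of the classification above (example formats only): a
+ * color attachment using VK_FORMAT_R32G32B32A32_SFLOAT has 32-bit float
+ * channels and therefore sets its bit in f32_color_rb, while with untyped
+ * fragment shader color outputs a VK_FORMAT_R8G8B8A8_UINT attachment sets
+ * uint_color_rb and a VK_FORMAT_R8G8B8A8_SINT one sets int_color_rb.
+ */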
+
static void
pipeline_populate_v3d_fs_key(struct v3d_fs_key *key,
const VkGraphicsPipelineCreateInfo *pCreateInfo,
+ const struct vk_render_pass_state *rendering_info,
const struct v3dv_pipeline_stage *p_stage,
bool has_geometry_shader,
uint32_t ucp_enables)
@@ -1129,16 +1177,29 @@ pipeline_populate_v3d_fs_key(struct v3d_fs_key *key,
memset(key, 0, sizeof(*key));
- const bool rba = p_stage->pipeline->device->features.robustBufferAccess;
- pipeline_populate_v3d_key(&key->base, p_stage, ucp_enables, rba);
+ struct v3dv_device *device = p_stage->pipeline->device;
+ assert(device);
+
+ pipeline_populate_v3d_key(&key->base, p_stage, ucp_enables);
const VkPipelineInputAssemblyStateCreateInfo *ia_info =
pCreateInfo->pInputAssemblyState;
- uint8_t topology = vk_to_pipe_prim_type[ia_info->topology];
+ uint8_t topology = vk_to_mesa_prim[ia_info->topology];
+
+ key->is_points = (topology == MESA_PRIM_POINTS);
+ key->is_lines = (topology >= MESA_PRIM_LINES &&
+ topology <= MESA_PRIM_LINE_STRIP);
+
+ if (key->is_points) {
+ /* This mask represents state for GL_ARB_point_sprite which is not
+ * relevant to Vulkan.
+ */
+ key->point_sprite_mask = 0;
+
+ /* Vulkan mandates upper left. */
+ key->point_coord_upper_left = true;
+ }
- key->is_points = (topology == PIPE_PRIM_POINTS);
- key->is_lines = (topology >= PIPE_PRIM_LINES &&
- topology <= PIPE_PRIM_LINE_STRIP);
key->has_gs = has_geometry_shader;
const VkPipelineColorBlendStateCreateInfo *cb_info =
@@ -1150,6 +1211,7 @@ pipeline_populate_v3d_fs_key(struct v3d_fs_key *key,
PIPE_LOGICOP_COPY;
const bool raster_enabled =
+ pCreateInfo->pRasterizationState &&
!pCreateInfo->pRasterizationState->rasterizerDiscardEnable;
/* Multisample rasterization state must be ignored if rasterization
@@ -1162,68 +1224,24 @@ pipeline_populate_v3d_fs_key(struct v3d_fs_key *key,
ms_info->rasterizationSamples == VK_SAMPLE_COUNT_4_BIT);
key->msaa = ms_info->rasterizationSamples > VK_SAMPLE_COUNT_1_BIT;
- if (key->msaa) {
- key->sample_coverage =
- p_stage->pipeline->sample_mask != (1 << V3D_MAX_SAMPLES) - 1;
+ if (key->msaa)
key->sample_alpha_to_coverage = ms_info->alphaToCoverageEnable;
- key->sample_alpha_to_one = ms_info->alphaToOneEnable;
- }
+
+ key->sample_alpha_to_one = ms_info->alphaToOneEnable;
}
+ key->line_smoothing = enable_line_smooth(topology, pCreateInfo->pRasterizationState);
+
/* This is intended for V3D versions before 4.1, otherwise we just use the
* tile buffer load/store swap R/B bit.
*/
key->swap_color_rb = 0;
- const struct v3dv_render_pass *pass =
- v3dv_render_pass_from_handle(pCreateInfo->renderPass);
- const struct v3dv_subpass *subpass = p_stage->pipeline->subpass;
- for (uint32_t i = 0; i < subpass->color_count; i++) {
- const uint32_t att_idx = subpass->color_attachments[i].attachment;
- if (att_idx == VK_ATTACHMENT_UNUSED)
+ for (uint32_t i = 0; i < rendering_info->color_attachment_count; i++) {
+ if (rendering_info->color_attachment_formats[i] == VK_FORMAT_UNDEFINED)
continue;
-
- key->cbufs |= 1 << i;
-
- VkFormat fb_format = pass->attachments[att_idx].desc.format;
- enum pipe_format fb_pipe_format = vk_format_to_pipe_format(fb_format);
-
- /* If logic operations are enabled then we might emit color reads and we
- * need to know the color buffer format and swizzle for that
- */
- if (key->logicop_func != PIPE_LOGICOP_COPY) {
- key->color_fmt[i].format = fb_pipe_format;
- key->color_fmt[i].swizzle =
- v3dv_get_format_swizzle(p_stage->pipeline->device, fb_format);
- }
-
- const struct util_format_description *desc =
- vk_format_description(fb_format);
-
- if (desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
- desc->channel[0].size == 32) {
- key->f32_color_rb |= 1 << i;
- }
-
- if (p_stage->nir->info.fs.untyped_color_outputs) {
- if (util_format_is_pure_uint(fb_pipe_format))
- key->uint_color_rb |= 1 << i;
- else if (util_format_is_pure_sint(fb_pipe_format))
- key->int_color_rb |= 1 << i;
- }
-
- if (key->is_points) {
- /* FIXME: The mask would need to be computed based on the shader
- * inputs. On gallium it is done at st_atom_rasterizer
- * (sprite_coord_enable). anv seems (need to confirm) to do that on
- * genX_pipeline (PointSpriteTextureCoordinateEnable). Would be also
- * better to have tests to guide filling the mask.
- */
- key->point_sprite_mask = 0;
-
- /* Vulkan mandates upper left. */
- key->point_coord_upper_left = true;
- }
+ v3d_fs_key_set_color_attachment(key, p_stage, i,
+ rendering_info->color_attachment_formats[i]);
}
}
@@ -1247,10 +1265,12 @@ pipeline_populate_v3d_gs_key(struct v3d_gs_key *key,
assert(p_stage->stage == BROADCOM_SHADER_GEOMETRY ||
p_stage->stage == BROADCOM_SHADER_GEOMETRY_BIN);
+ struct v3dv_device *device = p_stage->pipeline->device;
+ assert(device);
+
memset(key, 0, sizeof(*key));
- const bool rba = p_stage->pipeline->device->features.robustBufferAccess;
- pipeline_populate_v3d_key(&key->base, p_stage, 0, rba);
+ pipeline_populate_v3d_key(&key->base, p_stage, 0);
struct v3dv_pipeline *pipeline = p_stage->pipeline;
@@ -1289,10 +1309,11 @@ pipeline_populate_v3d_vs_key(struct v3d_vs_key *key,
assert(p_stage->stage == BROADCOM_SHADER_VERTEX ||
p_stage->stage == BROADCOM_SHADER_VERTEX_BIN);
- memset(key, 0, sizeof(*key));
+ struct v3dv_device *device = p_stage->pipeline->device;
+ assert(device);
- const bool rba = p_stage->pipeline->device->features.robustBufferAccess;
- pipeline_populate_v3d_key(&key->base, p_stage, 0, rba);
+ memset(key, 0, sizeof(*key));
+ pipeline_populate_v3d_key(&key->base, p_stage, 0);
struct v3dv_pipeline *pipeline = p_stage->pipeline;
@@ -1301,11 +1322,11 @@ pipeline_populate_v3d_vs_key(struct v3d_vs_key *key,
*/
const VkPipelineInputAssemblyStateCreateInfo *ia_info =
pCreateInfo->pInputAssemblyState;
- uint8_t topology = vk_to_pipe_prim_type[ia_info->topology];
+ uint8_t topology = vk_to_mesa_prim[ia_info->topology];
/* FIXME: PRIM_POINTS is not enough, in gallium the full check is
- * PIPE_PRIM_POINTS && v3d->rasterizer->base.point_size_per_vertex */
- key->per_vertex_point_size = (topology == PIPE_PRIM_POINTS);
+ * MESA_PRIM_POINTS && v3d->rasterizer->base.point_size_per_vertex */
+ key->per_vertex_point_size = (topology == MESA_PRIM_POINTS);
key->is_coord = broadcom_shader_stage_is_binning(p_stage->stage);
@@ -1318,7 +1339,7 @@ pipeline_populate_v3d_vs_key(struct v3d_vs_key *key,
key->num_used_outputs = 0;
} else {
/* Linking against GS binning program */
- assert(pipeline->gs);
+ assert(pipeline->stages[BROADCOM_SHADER_GEOMETRY]);
struct v3dv_shader_variant *gs_bin_variant =
pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN];
@@ -1333,7 +1354,7 @@ pipeline_populate_v3d_vs_key(struct v3d_vs_key *key,
sizeof(key->used_outputs));
}
} else { /* Render VS */
- if (pipeline->gs) {
+ if (pipeline->stages[BROADCOM_SHADER_GEOMETRY]) {
/* Linking against GS render program */
struct v3dv_shader_variant *gs_variant =
pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY];
@@ -1370,8 +1391,10 @@ pipeline_populate_v3d_vs_key(struct v3d_vs_key *key,
const VkVertexInputAttributeDescription *desc =
&vi_info->pVertexAttributeDescriptions[i];
assert(desc->location < MAX_VERTEX_ATTRIBS);
- if (desc->format == VK_FORMAT_B8G8R8A8_UNORM)
+ if (desc->format == VK_FORMAT_B8G8R8A8_UNORM ||
+ desc->format == VK_FORMAT_A2R10G10B10_UNORM_PACK32) {
key->va_swap_rb_mask |= 1 << (VERT_ATTRIB_GENERIC0 + desc->location);
+ }
}
}
@@ -1407,14 +1430,33 @@ pipeline_stage_create_binning(const struct v3dv_pipeline_stage *src,
p_stage->stage = bin_stage;
p_stage->entrypoint = src->entrypoint;
p_stage->module = src->module;
- p_stage->nir = src->nir ? nir_shader_clone(NULL, src->nir) : NULL;
+ /* For binning shaders we will clone the NIR code from the corresponding
+ * render shader later, when we call pipeline_compile_xxx_shader. This way
+ * we only have to run the relevant NIR lowerings once, on the render shaders.
+ */
+ p_stage->nir = NULL;
+ p_stage->program_id = src->program_id;
p_stage->spec_info = src->spec_info;
- p_stage->feedback = (VkPipelineCreationFeedbackEXT) { 0 };
+ p_stage->feedback = (VkPipelineCreationFeedback) { 0 };
+ p_stage->robustness = src->robustness;
memcpy(p_stage->shader_sha1, src->shader_sha1, 20);
return p_stage;
}
+/*
+ * Based on some creation flags we assume that the QPU instructions will be
+ * needed later to gather further info. In that case we just keep the
+ * qpu_insts around, instead of mapping/unmapping the BO later.
+ */
+static bool
+pipeline_keep_qpu(struct v3dv_pipeline *pipeline)
+{
+ return pipeline->flags &
+ (VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR |
+ VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR);
+}
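+
+/* Both flags above come from VK_KHR_pipeline_executable_properties: if the
+ * application sets them it may later query the pipeline with, for example,
+ * vkGetPipelineExecutableInternalRepresentationsKHR() or
+ * vkGetPipelineExecutableStatisticsKHR(), and those queries are presumably
+ * why the QPU instructions are kept instead of being freed after upload.
+ */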
+
/**
* Returns false if it was not able to allocate or map the assembly bo memory.
*/
@@ -1454,9 +1496,10 @@ upload_assembly(struct v3dv_pipeline *pipeline)
memcpy(bo->map + offset, variant->qpu_insts, variant->qpu_insts_size);
offset += variant->qpu_insts_size;
- /* We dont need qpu_insts anymore. */
- free(variant->qpu_insts);
- variant->qpu_insts = NULL;
+ if (!pipeline_keep_qpu(pipeline)) {
+ free(variant->qpu_insts);
+ variant->qpu_insts = NULL;
+ }
}
}
assert(total_size == offset);
@@ -1474,20 +1517,27 @@ pipeline_hash_graphics(const struct v3dv_pipeline *pipeline,
struct mesa_sha1 ctx;
_mesa_sha1_init(&ctx);
- /* We need to include all shader stages in the sha1 key as linking may modify
- * the shader code in any stage. An alternative would be to use the
+ if (pipeline->layout) {
+ _mesa_sha1_update(&ctx, &pipeline->layout->sha1,
+ sizeof(pipeline->layout->sha1));
+ }
+
+ /* We need to include all shader stages in the sha1 key as linking may
+ * modify the shader code in any stage. An alternative would be to use the
 * serialized NIR, but that seems like overkill.
*/
- _mesa_sha1_update(&ctx, pipeline->vs->shader_sha1,
- sizeof(pipeline->vs->shader_sha1));
+ for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
+ if (broadcom_shader_stage_is_binning(stage))
+ continue;
- if (pipeline->gs) {
- _mesa_sha1_update(&ctx, pipeline->gs->shader_sha1,
- sizeof(pipeline->gs->shader_sha1));
- }
+ struct v3dv_pipeline_stage *p_stage = pipeline->stages[stage];
+ if (p_stage == NULL)
+ continue;
- _mesa_sha1_update(&ctx, pipeline->fs->shader_sha1,
- sizeof(pipeline->fs->shader_sha1));
+ assert(stage != BROADCOM_SHADER_COMPUTE);
+
+ _mesa_sha1_update(&ctx, p_stage->shader_sha1, sizeof(p_stage->shader_sha1));
+ }
_mesa_sha1_update(&ctx, key, sizeof(struct v3dv_pipeline_key));
@@ -1502,8 +1552,15 @@ pipeline_hash_compute(const struct v3dv_pipeline *pipeline,
struct mesa_sha1 ctx;
_mesa_sha1_init(&ctx);
- _mesa_sha1_update(&ctx, pipeline->cs->shader_sha1,
- sizeof(pipeline->cs->shader_sha1));
+ if (pipeline->layout) {
+ _mesa_sha1_update(&ctx, &pipeline->layout->sha1,
+ sizeof(pipeline->layout->sha1));
+ }
+
+ struct v3dv_pipeline_stage *p_stage =
+ pipeline->stages[BROADCOM_SHADER_COMPUTE];
+
+ _mesa_sha1_update(&ctx, p_stage->shader_sha1, sizeof(p_stage->shader_sha1));
_mesa_sha1_update(&ctx, key, sizeof(struct v3dv_pipeline_key));
@@ -1553,7 +1610,7 @@ pipeline_check_spill_size(struct v3dv_pipeline *pipeline)
 * so it is assumed that the caller will provide a pointer that the
* shader_variant will own.
*
- * Creation doesn't include allocate a BD to store the content of qpu_insts,
+ * Creation doesn't include allocating a BO to store the content of qpu_insts,
* as we will try to share the same bo for several shader variants. Also note
 * that qpu_insts being NULL is valid, for example if we are creating the
* shader_variants from the cache, so we can just upload the assembly of all
@@ -1615,13 +1672,11 @@ pipeline_compile_shader_variant(struct v3dv_pipeline_stage *p_stage,
int64_t stage_start = os_time_get_nano();
struct v3dv_pipeline *pipeline = p_stage->pipeline;
- struct v3dv_physical_device *physical_device =
- &pipeline->device->instance->physicalDevice;
+ struct v3dv_physical_device *physical_device = pipeline->device->pdevice;
const struct v3d_compiler *compiler = physical_device->compiler;
+ gl_shader_stage gl_stage = broadcom_shader_stage_to_gl(p_stage->stage);
- if (V3D_DEBUG & (V3D_DEBUG_NIR |
- v3d_debug_flag_for_shader_stage
- (broadcom_shader_stage_to_gl(p_stage->stage)))) {
+ if (V3D_DBG(NIR) || v3d_debug_flag_for_shader_stage(gl_stage)) {
fprintf(stderr, "Just before v3d_compile: %s prog %d NIR:\n",
broadcom_shader_stage_name(p_stage->stage),
p_stage->program_id);
@@ -1632,8 +1687,7 @@ pipeline_compile_shader_variant(struct v3dv_pipeline_stage *p_stage,
uint64_t *qpu_insts;
uint32_t qpu_insts_size;
struct v3d_prog_data *prog_data;
- uint32_t prog_data_size =
- v3d_prog_data_size(broadcom_shader_stage_to_gl(p_stage->stage));
+ uint32_t prog_data_size = v3d_prog_data_size(gl_stage);
qpu_insts = v3d_compile(compiler,
key, &prog_data,
@@ -1646,7 +1700,7 @@ pipeline_compile_shader_variant(struct v3dv_pipeline_stage *p_stage,
if (!qpu_insts) {
fprintf(stderr, "Failed to compile %s prog %d NIR to VIR\n",
- gl_shader_stage_name(p_stage->stage),
+ broadcom_shader_stage_name(p_stage->stage),
p_stage->program_id);
*out_vk_result = VK_ERROR_UNKNOWN;
} else {
@@ -1667,59 +1721,6 @@ pipeline_compile_shader_variant(struct v3dv_pipeline_stage *p_stage,
return variant;
}
-/* FIXME: C&P from st, common place? */
-static void
-st_nir_opts(nir_shader *nir)
-{
- bool progress;
-
- do {
- progress = false;
-
- NIR_PASS_V(nir, nir_lower_vars_to_ssa);
-
- /* Linking deals with unused inputs/outputs, but here we can remove
- * things local to the shader in the hopes that we can cleanup other
- * things. This pass will also remove variables with only stores, so we
- * might be able to make progress after it.
- */
- NIR_PASS(progress, nir, nir_remove_dead_variables,
- (nir_variable_mode)(nir_var_function_temp |
- nir_var_shader_temp |
- nir_var_mem_shared),
- NULL);
-
- NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
- NIR_PASS(progress, nir, nir_opt_dead_write_vars);
-
- if (nir->options->lower_to_scalar) {
- NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);
- NIR_PASS_V(nir, nir_lower_phis_to_scalar, false);
- }
-
- NIR_PASS_V(nir, nir_lower_alu);
- NIR_PASS_V(nir, nir_lower_pack);
- NIR_PASS(progress, nir, nir_copy_prop);
- NIR_PASS(progress, nir, nir_opt_remove_phis);
- NIR_PASS(progress, nir, nir_opt_dce);
- if (nir_opt_trivial_continues(nir)) {
- progress = true;
- NIR_PASS(progress, nir, nir_copy_prop);
- NIR_PASS(progress, nir, nir_opt_dce);
- }
- NIR_PASS(progress, nir, nir_opt_if, false);
- NIR_PASS(progress, nir, nir_opt_dead_cf);
- NIR_PASS(progress, nir, nir_opt_cse);
- NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true);
-
- NIR_PASS(progress, nir, nir_opt_algebraic);
- NIR_PASS(progress, nir, nir_opt_constant_folding);
-
- NIR_PASS(progress, nir, nir_opt_undef);
- NIR_PASS(progress, nir, nir_opt_conditional_discard);
- } while (progress);
-}
-
static void
link_shaders(nir_shader *producer, nir_shader *consumer)
{
@@ -1727,34 +1728,34 @@ link_shaders(nir_shader *producer, nir_shader *consumer)
assert(consumer);
if (producer->options->lower_to_scalar) {
- NIR_PASS_V(producer, nir_lower_io_to_scalar_early, nir_var_shader_out);
- NIR_PASS_V(consumer, nir_lower_io_to_scalar_early, nir_var_shader_in);
+ NIR_PASS(_, producer, nir_lower_io_to_scalar_early, nir_var_shader_out);
+ NIR_PASS(_, consumer, nir_lower_io_to_scalar_early, nir_var_shader_in);
}
nir_lower_io_arrays_to_elements(producer, consumer);
- st_nir_opts(producer);
- st_nir_opts(consumer);
+ v3d_optimize_nir(NULL, producer);
+ v3d_optimize_nir(NULL, consumer);
if (nir_link_opt_varyings(producer, consumer))
- st_nir_opts(consumer);
+ v3d_optimize_nir(NULL, consumer);
- NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_out, NULL);
- NIR_PASS_V(consumer, nir_remove_dead_variables, nir_var_shader_in, NULL);
+ NIR_PASS(_, producer, nir_remove_dead_variables, nir_var_shader_out, NULL);
+ NIR_PASS(_, consumer, nir_remove_dead_variables, nir_var_shader_in, NULL);
if (nir_remove_unused_varyings(producer, consumer)) {
- NIR_PASS_V(producer, nir_lower_global_vars_to_local);
- NIR_PASS_V(consumer, nir_lower_global_vars_to_local);
+ NIR_PASS(_, producer, nir_lower_global_vars_to_local);
+ NIR_PASS(_, consumer, nir_lower_global_vars_to_local);
- st_nir_opts(producer);
- st_nir_opts(consumer);
+ v3d_optimize_nir(NULL, producer);
+ v3d_optimize_nir(NULL, consumer);
/* Optimizations can cause varyings to become unused.
* nir_compact_varyings() depends on all dead varyings being removed so
* we need to call nir_remove_dead_variables() again here.
*/
- NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_out, NULL);
- NIR_PASS_V(consumer, nir_remove_dead_variables, nir_var_shader_in, NULL);
+ NIR_PASS(_, producer, nir_remove_dead_variables, nir_var_shader_out, NULL);
+ NIR_PASS(_, consumer, nir_remove_dead_variables, nir_var_shader_in, NULL);
}
}
@@ -1768,6 +1769,9 @@ pipeline_lower_nir(struct v3dv_pipeline *pipeline,
assert(pipeline->shared_data &&
pipeline->shared_data->maps[p_stage->stage]);
+ NIR_PASS_V(p_stage->nir, nir_vk_lower_ycbcr_tex,
+ lookup_ycbcr_conversion, layout);
+
nir_shader_gather_info(p_stage->nir, nir_shader_get_entrypoint(p_stage->nir));
/* We add this because we need a valid sampler for nir_lower_tex to do
@@ -1777,18 +1781,27 @@ pipeline_lower_nir(struct v3dv_pipeline *pipeline,
* We add two of those, one for the case we need a 16bit return_size, and
* another for the case we need a 32bit return size.
*/
- UNUSED unsigned index =
- descriptor_map_add(&pipeline->shared_data->maps[p_stage->stage]->sampler_map,
- -1, -1, -1, 0, 16);
+ struct v3dv_descriptor_maps *maps =
+ pipeline->shared_data->maps[p_stage->stage];
+
+ UNUSED unsigned index;
+ index = descriptor_map_add(&maps->sampler_map, -1, -1, -1, 0, 0, 16, 0);
assert(index == V3DV_NO_SAMPLER_16BIT_IDX);
- index =
- descriptor_map_add(&pipeline->shared_data->maps[p_stage->stage]->sampler_map,
- -2, -2, -2, 0, 32);
+ index = descriptor_map_add(&maps->sampler_map, -2, -2, -2, 0, 0, 32, 0);
assert(index == V3DV_NO_SAMPLER_32BIT_IDX);
/* Apply the actual pipeline layout to UBOs, SSBOs, and textures */
- NIR_PASS_V(p_stage->nir, lower_pipeline_layout_info, pipeline, layout);
+ bool needs_default_sampler_state = false;
+ NIR_PASS(_, p_stage->nir, lower_pipeline_layout_info, pipeline, layout,
+ &needs_default_sampler_state);
+
+ /* If in the end we didn't need to use the default sampler states and the
+ * shader doesn't need any other samplers, get rid of them so we can
+ * recognize that this program doesn't use any samplers at all.
+ */
+ if (!needs_default_sampler_state && maps->sampler_map.num_desc == 2)
+ maps->sampler_map.num_desc = 0;
p_stage->feedback.duration += os_time_get_nano() - stage_start;
}
@@ -1830,7 +1843,7 @@ pipeline_stage_get_nir(struct v3dv_pipeline_stage *p_stage,
if (nir) {
assert(nir->info.stage == broadcom_shader_stage_to_gl(p_stage->stage));
- /* A NIR cach hit doesn't avoid the large majority of pipeline stage
+ /* A NIR cache hit doesn't avoid the large majority of pipeline stage
* creation so the cache hit is not recorded in the pipeline feedback
* flags
*/
@@ -1866,53 +1879,34 @@ pipeline_stage_get_nir(struct v3dv_pipeline_stage *p_stage,
return NULL;
}
-static void
-pipeline_hash_shader(const struct vk_shader_module *module,
- const char *entrypoint,
- gl_shader_stage stage,
- const VkSpecializationInfo *spec_info,
- unsigned char *sha1_out)
-{
- struct mesa_sha1 ctx;
- _mesa_sha1_init(&ctx);
-
- _mesa_sha1_update(&ctx, module->sha1, sizeof(module->sha1));
- _mesa_sha1_update(&ctx, entrypoint, strlen(entrypoint));
- _mesa_sha1_update(&ctx, &stage, sizeof(stage));
- if (spec_info) {
- _mesa_sha1_update(&ctx, spec_info->pMapEntries,
- spec_info->mapEntryCount *
- sizeof(*spec_info->pMapEntries));
- _mesa_sha1_update(&ctx, spec_info->pData,
- spec_info->dataSize);
- }
-
- _mesa_sha1_final(&ctx, sha1_out);
-}
-
static VkResult
pipeline_compile_vertex_shader(struct v3dv_pipeline *pipeline,
const VkAllocationCallbacks *pAllocator,
const VkGraphicsPipelineCreateInfo *pCreateInfo)
{
- assert(pipeline->vs_bin != NULL);
- if (pipeline->vs_bin->nir == NULL) {
- assert(pipeline->vs->nir);
- pipeline->vs_bin->nir = nir_shader_clone(NULL, pipeline->vs->nir);
+ struct v3dv_pipeline_stage *p_stage_vs =
+ pipeline->stages[BROADCOM_SHADER_VERTEX];
+ struct v3dv_pipeline_stage *p_stage_vs_bin =
+ pipeline->stages[BROADCOM_SHADER_VERTEX_BIN];
+
+ assert(p_stage_vs_bin != NULL);
+ if (p_stage_vs_bin->nir == NULL) {
+ assert(p_stage_vs->nir);
+ p_stage_vs_bin->nir = nir_shader_clone(NULL, p_stage_vs->nir);
}
VkResult vk_result;
struct v3d_vs_key key;
- pipeline_populate_v3d_vs_key(&key, pCreateInfo, pipeline->vs);
+ pipeline_populate_v3d_vs_key(&key, pCreateInfo, p_stage_vs);
pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX] =
- pipeline_compile_shader_variant(pipeline->vs, &key.base, sizeof(key),
+ pipeline_compile_shader_variant(p_stage_vs, &key.base, sizeof(key),
pAllocator, &vk_result);
if (vk_result != VK_SUCCESS)
return vk_result;
- pipeline_populate_v3d_vs_key(&key, pCreateInfo, pipeline->vs_bin);
+ pipeline_populate_v3d_vs_key(&key, pCreateInfo, p_stage_vs_bin);
pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN] =
- pipeline_compile_shader_variant(pipeline->vs_bin, &key.base, sizeof(key),
+ pipeline_compile_shader_variant(p_stage_vs_bin, &key.base, sizeof(key),
pAllocator, &vk_result);
return vk_result;
@@ -1923,26 +1917,30 @@ pipeline_compile_geometry_shader(struct v3dv_pipeline *pipeline,
const VkAllocationCallbacks *pAllocator,
const VkGraphicsPipelineCreateInfo *pCreateInfo)
{
- assert(pipeline->gs);
+ struct v3dv_pipeline_stage *p_stage_gs =
+ pipeline->stages[BROADCOM_SHADER_GEOMETRY];
+ struct v3dv_pipeline_stage *p_stage_gs_bin =
+ pipeline->stages[BROADCOM_SHADER_GEOMETRY_BIN];
- assert(pipeline->gs_bin != NULL);
- if (pipeline->gs_bin->nir == NULL) {
- assert(pipeline->gs->nir);
- pipeline->gs_bin->nir = nir_shader_clone(NULL, pipeline->gs->nir);
+ assert(p_stage_gs);
+ assert(p_stage_gs_bin != NULL);
+ if (p_stage_gs_bin->nir == NULL) {
+ assert(p_stage_gs->nir);
+ p_stage_gs_bin->nir = nir_shader_clone(NULL, p_stage_gs->nir);
}
VkResult vk_result;
struct v3d_gs_key key;
- pipeline_populate_v3d_gs_key(&key, pCreateInfo, pipeline->gs);
+ pipeline_populate_v3d_gs_key(&key, pCreateInfo, p_stage_gs);
pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY] =
- pipeline_compile_shader_variant(pipeline->gs, &key.base, sizeof(key),
+ pipeline_compile_shader_variant(p_stage_gs, &key.base, sizeof(key),
pAllocator, &vk_result);
if (vk_result != VK_SUCCESS)
return vk_result;
- pipeline_populate_v3d_gs_key(&key, pCreateInfo, pipeline->gs_bin);
+ pipeline_populate_v3d_gs_key(&key, pCreateInfo, p_stage_gs_bin);
pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN] =
- pipeline_compile_shader_variant(pipeline->gs_bin, &key.base, sizeof(key),
+ pipeline_compile_shader_variant(p_stage_gs_bin, &key.base, sizeof(key),
pAllocator, &vk_result);
return vk_result;
@@ -1953,19 +1951,26 @@ pipeline_compile_fragment_shader(struct v3dv_pipeline *pipeline,
const VkAllocationCallbacks *pAllocator,
const VkGraphicsPipelineCreateInfo *pCreateInfo)
{
- struct v3dv_pipeline_stage *p_stage = pipeline->vs;
-
- p_stage = pipeline->fs;
+ struct v3dv_pipeline_stage *p_stage_vs =
+ pipeline->stages[BROADCOM_SHADER_VERTEX];
+ struct v3dv_pipeline_stage *p_stage_fs =
+ pipeline->stages[BROADCOM_SHADER_FRAGMENT];
+ struct v3dv_pipeline_stage *p_stage_gs =
+ pipeline->stages[BROADCOM_SHADER_GEOMETRY];
struct v3d_fs_key key;
+ pipeline_populate_v3d_fs_key(&key, pCreateInfo, &pipeline->rendering_info,
+ p_stage_fs, p_stage_gs != NULL,
+ get_ucp_enable_mask(p_stage_vs));
- pipeline_populate_v3d_fs_key(&key, pCreateInfo, p_stage,
- pipeline->gs != NULL,
- get_ucp_enable_mask(pipeline->vs));
+ if (key.is_points) {
+ assert(key.point_coord_upper_left);
+ NIR_PASS(_, p_stage_fs->nir, v3d_nir_lower_point_coord);
+ }
VkResult vk_result;
pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT] =
- pipeline_compile_shader_variant(p_stage, &key.base, sizeof(key),
+ pipeline_compile_shader_variant(p_stage_fs, &key.base, sizeof(key),
pAllocator, &vk_result);
return vk_result;
@@ -1976,16 +1981,20 @@ pipeline_populate_graphics_key(struct v3dv_pipeline *pipeline,
struct v3dv_pipeline_key *key,
const VkGraphicsPipelineCreateInfo *pCreateInfo)
{
+ struct v3dv_device *device = pipeline->device;
+ assert(device);
+
memset(key, 0, sizeof(*key));
- key->robust_buffer_access =
- pipeline->device->features.robustBufferAccess;
+
+ key->line_smooth = pipeline->line_smooth;
const bool raster_enabled =
+ pCreateInfo->pRasterizationState &&
!pCreateInfo->pRasterizationState->rasterizerDiscardEnable;
const VkPipelineInputAssemblyStateCreateInfo *ia_info =
pCreateInfo->pInputAssemblyState;
- key->topology = vk_to_pipe_prim_type[ia_info->topology];
+ key->topology = vk_to_mesa_prim[ia_info->topology];
const VkPipelineColorBlendStateCreateInfo *cb_info =
raster_enabled ? pCreateInfo->pColorBlendState : NULL;
@@ -2004,34 +2013,32 @@ pipeline_populate_graphics_key(struct v3dv_pipeline *pipeline,
ms_info->rasterizationSamples == VK_SAMPLE_COUNT_4_BIT);
key->msaa = ms_info->rasterizationSamples > VK_SAMPLE_COUNT_1_BIT;
- if (key->msaa) {
- key->sample_coverage =
- pipeline->sample_mask != (1 << V3D_MAX_SAMPLES) - 1;
+ if (key->msaa)
key->sample_alpha_to_coverage = ms_info->alphaToCoverageEnable;
- key->sample_alpha_to_one = ms_info->alphaToOneEnable;
- }
+
+ key->sample_alpha_to_one = ms_info->alphaToOneEnable;
}
- const struct v3dv_render_pass *pass =
- v3dv_render_pass_from_handle(pCreateInfo->renderPass);
- const struct v3dv_subpass *subpass = pipeline->subpass;
- for (uint32_t i = 0; i < subpass->color_count; i++) {
- const uint32_t att_idx = subpass->color_attachments[i].attachment;
- if (att_idx == VK_ATTACHMENT_UNUSED)
+ struct vk_render_pass_state *ri = &pipeline->rendering_info;
+ for (uint32_t i = 0; i < ri->color_attachment_count; i++) {
+ if (ri->color_attachment_formats[i] == VK_FORMAT_UNDEFINED)
continue;
key->cbufs |= 1 << i;
- VkFormat fb_format = pass->attachments[att_idx].desc.format;
+ VkFormat fb_format = ri->color_attachment_formats[i];
enum pipe_format fb_pipe_format = vk_format_to_pipe_format(fb_format);
/* If logic operations are enabled then we might emit color reads and we
* need to know the color buffer format and swizzle for that
*/
if (key->logicop_func != PIPE_LOGICOP_COPY) {
+ /* Framebuffer formats should be single plane */
+ assert(vk_format_get_plane_count(fb_format) == 1);
key->color_fmt[i].format = fb_pipe_format;
- key->color_fmt[i].swizzle = v3dv_get_format_swizzle(pipeline->device,
- fb_format);
+ memcpy(key->color_fmt[i].swizzle,
+ v3dv_get_format_swizzle(pipeline->device, fb_format, 0),
+ sizeof(key->color_fmt[i].swizzle));
}
const struct util_format_description *desc =
@@ -2049,12 +2056,13 @@ pipeline_populate_graphics_key(struct v3dv_pipeline *pipeline,
const VkVertexInputAttributeDescription *desc =
&vi_info->pVertexAttributeDescriptions[i];
assert(desc->location < MAX_VERTEX_ATTRIBS);
- if (desc->format == VK_FORMAT_B8G8R8A8_UNORM)
+ if (desc->format == VK_FORMAT_B8G8R8A8_UNORM ||
+ desc->format == VK_FORMAT_A2R10G10B10_UNORM_PACK32) {
key->va_swap_rb_mask |= 1 << (VERT_ATTRIB_GENERIC0 + desc->location);
+ }
}
- assert(pipeline->subpass);
- key->has_multiview = pipeline->subpass->view_mask != 0;
+ key->has_multiview = ri->view_mask != 0;
}
static void
@@ -2062,14 +2070,15 @@ pipeline_populate_compute_key(struct v3dv_pipeline *pipeline,
struct v3dv_pipeline_key *key,
const VkComputePipelineCreateInfo *pCreateInfo)
{
+ struct v3dv_device *device = pipeline->device;
+ assert(device);
+
/* We use the same pipeline key for graphics and compute, but we don't need
* to add a field to flag compute keys because this key is not used alone
* to search in the cache, we also use the SPIR-V or the serialized NIR for
* example, which already flags compute shaders.
*/
memset(key, 0, sizeof(*key));
- key->robust_buffer_access =
- pipeline->device->features.robustBufferAccess;
}
static struct v3dv_pipeline_shared_data *
@@ -2102,9 +2111,10 @@ v3dv_pipeline_shared_data_new_empty(const unsigned char sha1_key[20],
continue;
}
- if (stage == BROADCOM_SHADER_GEOMETRY && !pipeline->gs) {
+ if (stage == BROADCOM_SHADER_GEOMETRY &&
+ !pipeline->stages[BROADCOM_SHADER_GEOMETRY]) {
/* We always inject a custom GS if we have multiview */
- if (!pipeline->subpass->view_mask)
+ if (!pipeline->rendering_info.view_mask)
continue;
}
@@ -2146,69 +2156,52 @@ fail:
static void
write_creation_feedback(struct v3dv_pipeline *pipeline,
const void *next,
- const VkPipelineCreationFeedbackEXT *pipeline_feedback,
+ const VkPipelineCreationFeedback *pipeline_feedback,
uint32_t stage_count,
const VkPipelineShaderStageCreateInfo *stages)
{
- const VkPipelineCreationFeedbackCreateInfoEXT *create_feedback =
- vk_find_struct_const(next, PIPELINE_CREATION_FEEDBACK_CREATE_INFO_EXT);
+ const VkPipelineCreationFeedbackCreateInfo *create_feedback =
+ vk_find_struct_const(next, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
if (create_feedback) {
typed_memcpy(create_feedback->pPipelineCreationFeedback,
pipeline_feedback,
1);
- assert(stage_count == create_feedback->pipelineStageCreationFeedbackCount);
+ const uint32_t feedback_stage_count =
+ create_feedback->pipelineStageCreationFeedbackCount;
+ assert(feedback_stage_count <= stage_count);
- for (uint32_t i = 0; i < stage_count; i++) {
+ for (uint32_t i = 0; i < feedback_stage_count; i++) {
gl_shader_stage s = vk_to_mesa_shader_stage(stages[i].stage);
- switch (s) {
- case MESA_SHADER_VERTEX:
- create_feedback->pPipelineStageCreationFeedbacks[i] =
- pipeline->vs->feedback;
-
- create_feedback->pPipelineStageCreationFeedbacks[i].duration +=
- pipeline->vs_bin->feedback.duration;
- break;
+ enum broadcom_shader_stage bs = gl_shader_stage_to_broadcom(s);
- case MESA_SHADER_GEOMETRY:
- create_feedback->pPipelineStageCreationFeedbacks[i] =
- pipeline->gs->feedback;
+ create_feedback->pPipelineStageCreationFeedbacks[i] =
+ pipeline->stages[bs]->feedback;
+ if (broadcom_shader_stage_is_render_with_binning(bs)) {
+ enum broadcom_shader_stage bs_bin =
+ broadcom_binning_shader_stage_for_render_stage(bs);
create_feedback->pPipelineStageCreationFeedbacks[i].duration +=
- pipeline->gs_bin->feedback.duration;
- break;
-
- case MESA_SHADER_FRAGMENT:
- create_feedback->pPipelineStageCreationFeedbacks[i] =
- pipeline->fs->feedback;
- break;
-
- case MESA_SHADER_COMPUTE:
- create_feedback->pPipelineStageCreationFeedbacks[i] =
- pipeline->cs->feedback;
- break;
-
- default:
- unreachable("not supported shader stage");
+ pipeline->stages[bs_bin]->feedback.duration;
}
}
}
}
-static uint32_t
+static enum mesa_prim
multiview_gs_input_primitive_from_pipeline(struct v3dv_pipeline *pipeline)
{
switch (pipeline->topology) {
- case PIPE_PRIM_POINTS:
- return GL_POINTS;
- case PIPE_PRIM_LINES:
- case PIPE_PRIM_LINE_STRIP:
- return GL_LINES;
- case PIPE_PRIM_TRIANGLES:
- case PIPE_PRIM_TRIANGLE_STRIP:
- case PIPE_PRIM_TRIANGLE_FAN:
- return GL_TRIANGLES;
+ case MESA_PRIM_POINTS:
+ return MESA_PRIM_POINTS;
+ case MESA_PRIM_LINES:
+ case MESA_PRIM_LINE_STRIP:
+ return MESA_PRIM_LINES;
+ case MESA_PRIM_TRIANGLES:
+ case MESA_PRIM_TRIANGLE_STRIP:
+ case MESA_PRIM_TRIANGLE_FAN:
+ return MESA_PRIM_TRIANGLES;
default:
/* Since we don't allow GS with multiview, we can only see non-adjacency
* primitives.
@@ -2217,19 +2210,19 @@ multiview_gs_input_primitive_from_pipeline(struct v3dv_pipeline *pipeline)
}
}
-static uint32_t
+static enum mesa_prim
multiview_gs_output_primitive_from_pipeline(struct v3dv_pipeline *pipeline)
{
switch (pipeline->topology) {
- case PIPE_PRIM_POINTS:
- return GL_POINTS;
- case PIPE_PRIM_LINES:
- case PIPE_PRIM_LINE_STRIP:
- return GL_LINE_STRIP;
- case PIPE_PRIM_TRIANGLES:
- case PIPE_PRIM_TRIANGLE_STRIP:
- case PIPE_PRIM_TRIANGLE_FAN:
- return GL_TRIANGLE_STRIP;
+ case MESA_PRIM_POINTS:
+ return MESA_PRIM_POINTS;
+ case MESA_PRIM_LINES:
+ case MESA_PRIM_LINE_STRIP:
+ return MESA_PRIM_LINE_STRIP;
+ case MESA_PRIM_TRIANGLES:
+ case MESA_PRIM_TRIANGLE_STRIP:
+ case MESA_PRIM_TRIANGLE_FAN:
+ return MESA_PRIM_TRIANGLE_STRIP;
default:
/* Since we don't allow GS with multiview, we can only see non-adjacency
* primitives.
@@ -2244,8 +2237,9 @@ pipeline_add_multiview_gs(struct v3dv_pipeline *pipeline,
const VkAllocationCallbacks *pAllocator)
{
/* Create the passthrough GS from the VS output interface */
- pipeline->vs->nir = pipeline_stage_get_nir(pipeline->vs, pipeline, cache);
- nir_shader *vs_nir = pipeline->vs->nir;
+ struct v3dv_pipeline_stage *p_stage_vs = pipeline->stages[BROADCOM_SHADER_VERTEX];
+ p_stage_vs->nir = pipeline_stage_get_nir(p_stage_vs, pipeline, cache);
+ nir_shader *vs_nir = p_stage_vs->nir;
const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_GEOMETRY, options,
@@ -2255,7 +2249,7 @@ pipeline_add_multiview_gs(struct v3dv_pipeline *pipeline,
nir->info.outputs_written = vs_nir->info.outputs_written |
(1ull << VARYING_SLOT_LAYER);
- uint32_t vertex_count = u_vertices_per_prim(pipeline->topology);
+ uint32_t vertex_count = mesa_vertices_per_prim(pipeline->topology);
nir->info.gs.input_primitive =
multiview_gs_input_primitive_from_pipeline(pipeline);
nir->info.gs.output_primitive =
@@ -2297,7 +2291,7 @@ pipeline_add_multiview_gs(struct v3dv_pipeline *pipeline,
out_layer->data.location = VARYING_SLOT_LAYER;
/* Get the view index value that we will write to gl_Layer */
- nir_ssa_def *layer =
+ nir_def *layer =
nir_load_system_value(&b, nir_intrinsic_load_view_index, 0, 1, 32);
/* Emit all output vertices */
@@ -2323,8 +2317,7 @@ pipeline_add_multiview_gs(struct v3dv_pipeline *pipeline,
/* Attach the geometry shader to the pipeline */
struct v3dv_device *device = pipeline->device;
- struct v3dv_physical_device *physical_device =
- &device->instance->physicalDevice;
+ struct v3dv_physical_device *physical_device = device->pdevice;
struct v3dv_pipeline_stage *p_stage =
vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*p_stage), 8,
@@ -2340,21 +2333,36 @@ pipeline_add_multiview_gs(struct v3dv_pipeline *pipeline,
p_stage->entrypoint = "main";
p_stage->module = 0;
p_stage->nir = nir;
- pipeline_compute_sha1_from_nir(p_stage->nir, p_stage->shader_sha1);
+ pipeline_compute_sha1_from_nir(p_stage);
p_stage->program_id = p_atomic_inc_return(&physical_device->next_program_id);
+ p_stage->robustness = pipeline->stages[BROADCOM_SHADER_VERTEX]->robustness;
pipeline->has_gs = true;
- pipeline->gs = p_stage;
+ pipeline->stages[BROADCOM_SHADER_GEOMETRY] = p_stage;
pipeline->active_stages |= MESA_SHADER_GEOMETRY;
- pipeline->gs_bin =
- pipeline_stage_create_binning(pipeline->gs, pAllocator);
- if (pipeline->gs_bin == NULL)
- return false;
+ pipeline->stages[BROADCOM_SHADER_GEOMETRY_BIN] =
+ pipeline_stage_create_binning(p_stage, pAllocator);
+ if (pipeline->stages[BROADCOM_SHADER_GEOMETRY_BIN] == NULL)
+ return false;
return true;
}
+static void
+pipeline_check_buffer_device_address(struct v3dv_pipeline *pipeline)
+{
+ for (int i = BROADCOM_SHADER_VERTEX; i < BROADCOM_SHADER_STAGES; i++) {
+ struct v3dv_shader_variant *variant = pipeline->shared_data->variants[i];
+ if (variant && variant->prog_data.base->has_global_address) {
+ pipeline->uses_buffer_device_address = true;
+ return;
+ }
+ }
+
+ pipeline->uses_buffer_device_address = false;
+}
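+
+/* has_global_address is reported by the compiler when a shader variant uses
+ * global memory addressing (VK_KHR_buffer_device_address). Recording it per
+ * pipeline here presumably lets later command buffer logic know that jobs
+ * using this pipeline may reference buffers through raw device addresses
+ * obtained with vkGetBufferDeviceAddress() rather than through descriptors.
+ */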
+
/*
 * It compiles a pipeline. Note that it also allocates internal objects, but
 * if some allocations succeed while others fail, the method does not free the
@@ -2371,14 +2379,13 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline,
const VkGraphicsPipelineCreateInfo *pCreateInfo,
const VkAllocationCallbacks *pAllocator)
{
- VkPipelineCreationFeedbackEXT pipeline_feedback = {
- .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT,
+ VkPipelineCreationFeedback pipeline_feedback = {
+ .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
};
int64_t pipeline_start = os_time_get_nano();
struct v3dv_device *device = pipeline->device;
- struct v3dv_physical_device *physical_device =
- &device->instance->physicalDevice;
+ struct v3dv_physical_device *physical_device = device->pdevice;
/* First pass to get some common info from the shader, and create the
* individual pipeline_stage objects
@@ -2394,26 +2401,24 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline,
if (p_stage == NULL)
return VK_ERROR_OUT_OF_HOST_MEMORY;
- /* Note that we are assigning program_id slightly differently that
- * v3d. Here we are assigning one per pipeline stage, so vs and vs_bin
- * would have a different program_id, while v3d would have the same for
- * both. For the case of v3dv, it is more natural to have an id this way,
- * as right now we are using it for debugging, not for shader-db.
- */
p_stage->program_id =
p_atomic_inc_return(&physical_device->next_program_id);
+ enum broadcom_shader_stage broadcom_stage =
+ gl_shader_stage_to_broadcom(stage);
+
p_stage->pipeline = pipeline;
- p_stage->stage = gl_shader_stage_to_broadcom(stage);
+ p_stage->stage = broadcom_stage;
p_stage->entrypoint = sinfo->pName;
p_stage->module = vk_shader_module_from_handle(sinfo->module);
p_stage->spec_info = sinfo->pSpecializationInfo;
- pipeline_hash_shader(p_stage->module,
- p_stage->entrypoint,
- stage,
- p_stage->spec_info,
- p_stage->shader_sha1);
+ vk_pipeline_robustness_state_fill(&device->vk, &p_stage->robustness,
+ pCreateInfo->pNext, sinfo->pNext);
+
+ vk_pipeline_hash_shader_stage(&pCreateInfo->pStages[i],
+ &p_stage->robustness,
+ p_stage->shader_sha1);
pipeline->active_stages |= sinfo->stage;
@@ -2421,36 +2426,24 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline,
* worry about getting the nir shader for now.
*/
p_stage->nir = NULL;
-
- switch(stage) {
- case MESA_SHADER_VERTEX:
- pipeline->vs = p_stage;
- pipeline->vs_bin =
- pipeline_stage_create_binning(pipeline->vs, pAllocator);
- if (pipeline->vs_bin == NULL)
- return VK_ERROR_OUT_OF_HOST_MEMORY;
- break;
-
- case MESA_SHADER_GEOMETRY:
+ pipeline->stages[broadcom_stage] = p_stage;
+ if (broadcom_stage == BROADCOM_SHADER_GEOMETRY)
pipeline->has_gs = true;
- pipeline->gs = p_stage;
- pipeline->gs_bin =
- pipeline_stage_create_binning(pipeline->gs, pAllocator);
- if (pipeline->gs_bin == NULL)
- return VK_ERROR_OUT_OF_HOST_MEMORY;
- break;
- case MESA_SHADER_FRAGMENT:
- pipeline->fs = p_stage;
- break;
+ if (broadcom_shader_stage_is_render_with_binning(broadcom_stage)) {
+ enum broadcom_shader_stage broadcom_stage_bin =
+ broadcom_binning_shader_stage_for_render_stage(broadcom_stage);
- default:
- unreachable("not supported shader stage");
+ pipeline->stages[broadcom_stage_bin] =
+ pipeline_stage_create_binning(p_stage, pAllocator);
+
+ if (pipeline->stages[broadcom_stage_bin] == NULL)
+ return VK_ERROR_OUT_OF_HOST_MEMORY;
}
}
/* Add a no-op fragment shader if needed */
- if (!pipeline->fs) {
+ if (!pipeline->stages[BROADCOM_SHADER_FRAGMENT]) {
nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT,
&v3dv_nir_options,
"noop_fs");
@@ -2467,109 +2460,126 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline,
p_stage->entrypoint = "main";
p_stage->module = 0;
p_stage->nir = b.shader;
- pipeline_compute_sha1_from_nir(p_stage->nir, p_stage->shader_sha1);
+ vk_pipeline_robustness_state_fill(&device->vk, &p_stage->robustness,
+ NULL, NULL);
+ pipeline_compute_sha1_from_nir(p_stage);
p_stage->program_id =
p_atomic_inc_return(&physical_device->next_program_id);
- pipeline->fs = p_stage;
+ pipeline->stages[BROADCOM_SHADER_FRAGMENT] = p_stage;
pipeline->active_stages |= MESA_SHADER_FRAGMENT;
}
/* If multiview is enabled, we inject a custom passthrough geometry shader
* to broadcast draw calls to the appropriate views.
*/
- assert(!pipeline->subpass->view_mask || (!pipeline->has_gs && !pipeline->gs));
- if (pipeline->subpass->view_mask) {
+ const uint32_t view_mask = pipeline->rendering_info.view_mask;
+ assert(!view_mask ||
+ (!pipeline->has_gs && !pipeline->stages[BROADCOM_SHADER_GEOMETRY]));
+ if (view_mask) {
if (!pipeline_add_multiview_gs(pipeline, cache, pAllocator))
return VK_ERROR_OUT_OF_HOST_MEMORY;
}
- /* First we try to get the variants from the pipeline cache */
- struct v3dv_pipeline_key pipeline_key;
- pipeline_populate_graphics_key(pipeline, &pipeline_key, pCreateInfo);
- unsigned char pipeline_sha1[20];
- pipeline_hash_graphics(pipeline, &pipeline_key, pipeline_sha1);
-
- bool cache_hit = false;
-
- pipeline->shared_data =
- v3dv_pipeline_cache_search_for_pipeline(cache,
- pipeline_sha1,
- &cache_hit);
-
- if (pipeline->shared_data != NULL) {
- /* A correct pipeline must have at least a VS and FS */
- assert(pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]);
- assert(pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]);
- assert(pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]);
- assert(!pipeline->gs ||
- pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY]);
- assert(!pipeline->gs ||
- pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN]);
-
- if (cache_hit && cache != &pipeline->device->default_pipeline_cache)
- pipeline_feedback.flags |=
- VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT;
-
- goto success;
+ /* First we try to get the variants from the pipeline cache (unless we are
+ * required to capture internal representations, since in that case we need
+ * to compile).
+ */
+ bool needs_executable_info =
+ pCreateInfo->flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;
+ if (!needs_executable_info) {
+ struct v3dv_pipeline_key pipeline_key;
+ pipeline_populate_graphics_key(pipeline, &pipeline_key, pCreateInfo);
+ pipeline_hash_graphics(pipeline, &pipeline_key, pipeline->sha1);
+
+ bool cache_hit = false;
+
+ pipeline->shared_data =
+ v3dv_pipeline_cache_search_for_pipeline(cache,
+ pipeline->sha1,
+ &cache_hit);
+
+ if (pipeline->shared_data != NULL) {
+ /* A correct pipeline must have at least a VS and FS */
+ assert(pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]);
+ assert(pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]);
+ assert(pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]);
+ assert(!pipeline->stages[BROADCOM_SHADER_GEOMETRY] ||
+ pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY]);
+ assert(!pipeline->stages[BROADCOM_SHADER_GEOMETRY] ||
+ pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN]);
+
+ if (cache_hit && cache != &pipeline->device->default_pipeline_cache)
+ pipeline_feedback.flags |=
+ VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
+
+ goto success;
+ }
}
- if (pCreateInfo->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT)
- return VK_PIPELINE_COMPILE_REQUIRED_EXT;
+ if (pCreateInfo->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT)
+ return VK_PIPELINE_COMPILE_REQUIRED;
/* Otherwise we try to get the NIR shaders (either from the original SPIR-V
* shader or the pipeline cache) and compile.
*/
pipeline->shared_data =
- v3dv_pipeline_shared_data_new_empty(pipeline_sha1, pipeline, true);
-
- pipeline->vs->feedback.flags |=
- VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT;
- if (pipeline->gs)
- pipeline->gs->feedback.flags |=
- VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT;
- pipeline->fs->feedback.flags |=
- VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT;
-
- if (!pipeline->vs->nir)
- pipeline->vs->nir = pipeline_stage_get_nir(pipeline->vs, pipeline, cache);
- if (pipeline->gs && !pipeline->gs->nir)
- pipeline->gs->nir = pipeline_stage_get_nir(pipeline->gs, pipeline, cache);
- if (!pipeline->fs->nir)
- pipeline->fs->nir = pipeline_stage_get_nir(pipeline->fs, pipeline, cache);
+ v3dv_pipeline_shared_data_new_empty(pipeline->sha1, pipeline, true);
+ if (!pipeline->shared_data)
+ return VK_ERROR_OUT_OF_HOST_MEMORY;
+
+ struct v3dv_pipeline_stage *p_stage_vs = pipeline->stages[BROADCOM_SHADER_VERTEX];
+ struct v3dv_pipeline_stage *p_stage_fs = pipeline->stages[BROADCOM_SHADER_FRAGMENT];
+ struct v3dv_pipeline_stage *p_stage_gs = pipeline->stages[BROADCOM_SHADER_GEOMETRY];
+
+ p_stage_vs->feedback.flags |=
+ VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT;
+ if (p_stage_gs)
+ p_stage_gs->feedback.flags |=
+ VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT;
+ p_stage_fs->feedback.flags |=
+ VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT;
+
+ if (!p_stage_vs->nir)
+ p_stage_vs->nir = pipeline_stage_get_nir(p_stage_vs, pipeline, cache);
+ if (p_stage_gs && !p_stage_gs->nir)
+ p_stage_gs->nir = pipeline_stage_get_nir(p_stage_gs, pipeline, cache);
+ if (!p_stage_fs->nir)
+ p_stage_fs->nir = pipeline_stage_get_nir(p_stage_fs, pipeline, cache);
/* Linking + pipeline lowerings */
- if (pipeline->gs) {
- link_shaders(pipeline->gs->nir, pipeline->fs->nir);
- link_shaders(pipeline->vs->nir, pipeline->gs->nir);
+ if (p_stage_gs) {
+ link_shaders(p_stage_gs->nir, p_stage_fs->nir);
+ link_shaders(p_stage_vs->nir, p_stage_gs->nir);
} else {
- link_shaders(pipeline->vs->nir, pipeline->fs->nir);
+ link_shaders(p_stage_vs->nir, p_stage_fs->nir);
}
- pipeline_lower_nir(pipeline, pipeline->fs, pipeline->layout);
- lower_fs_io(pipeline->fs->nir);
+ pipeline_lower_nir(pipeline, p_stage_fs, pipeline->layout);
+ lower_fs_io(p_stage_fs->nir);
- if (pipeline->gs) {
- pipeline_lower_nir(pipeline, pipeline->gs, pipeline->layout);
- lower_gs_io(pipeline->gs->nir);
+ if (p_stage_gs) {
+ pipeline_lower_nir(pipeline, p_stage_gs, pipeline->layout);
+ lower_gs_io(p_stage_gs->nir);
}
- pipeline_lower_nir(pipeline, pipeline->vs, pipeline->layout);
- lower_vs_io(pipeline->vs->nir);
+ pipeline_lower_nir(pipeline, p_stage_vs, pipeline->layout);
+ lower_vs_io(p_stage_vs->nir);
/* Compiling to vir */
VkResult vk_result;
/* We should have got all the variants or no variants from the cache */
assert(!pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]);
- vk_result = pipeline_compile_fragment_shader(pipeline, pAllocator, pCreateInfo);
+ vk_result = pipeline_compile_fragment_shader(pipeline, pAllocator,
+ pCreateInfo);
if (vk_result != VK_SUCCESS)
return vk_result;
assert(!pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY] &&
!pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN]);
- if (pipeline->gs) {
+ if (p_stage_gs) {
vk_result =
pipeline_compile_geometry_shader(pipeline, pAllocator, pCreateInfo);
if (vk_result != VK_SUCCESS)
@@ -2590,6 +2600,8 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline,
success:
+ pipeline_check_buffer_device_address(pipeline);
+
pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
write_creation_feedback(pipeline,
pCreateInfo->pNext,
@@ -2600,7 +2612,8 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline,
/* Since we have the variants in the pipeline shared data we can now free
* the pipeline stages.
*/
- pipeline_free_stages(device, pipeline, pAllocator);
+ if (!needs_executable_info)
+ pipeline_free_stages(device, pipeline, pAllocator);
pipeline_check_spill_size(pipeline);
@@ -2638,139 +2651,11 @@ compute_vpm_config(struct v3dv_pipeline *pipeline)
return VK_SUCCESS;
}
-static unsigned
-v3dv_dynamic_state_mask(VkDynamicState state)
-{
- switch(state) {
- case VK_DYNAMIC_STATE_VIEWPORT:
- return V3DV_DYNAMIC_VIEWPORT;
- case VK_DYNAMIC_STATE_SCISSOR:
- return V3DV_DYNAMIC_SCISSOR;
- case VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK:
- return V3DV_DYNAMIC_STENCIL_COMPARE_MASK;
- case VK_DYNAMIC_STATE_STENCIL_WRITE_MASK:
- return V3DV_DYNAMIC_STENCIL_WRITE_MASK;
- case VK_DYNAMIC_STATE_STENCIL_REFERENCE:
- return V3DV_DYNAMIC_STENCIL_REFERENCE;
- case VK_DYNAMIC_STATE_BLEND_CONSTANTS:
- return V3DV_DYNAMIC_BLEND_CONSTANTS;
- case VK_DYNAMIC_STATE_DEPTH_BIAS:
- return V3DV_DYNAMIC_DEPTH_BIAS;
- case VK_DYNAMIC_STATE_LINE_WIDTH:
- return V3DV_DYNAMIC_LINE_WIDTH;
- case VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT:
- return V3DV_DYNAMIC_COLOR_WRITE_ENABLE;
-
- /* Depth bounds testing is not available in in V3D 4.2 so here we are just
- * ignoring this dynamic state. We are already asserting at pipeline creation
- * time that depth bounds testing is not enabled.
- */
- case VK_DYNAMIC_STATE_DEPTH_BOUNDS:
- return 0;
-
- default:
- unreachable("Unhandled dynamic state");
- }
-}
-
-static void
-pipeline_init_dynamic_state(
- struct v3dv_pipeline *pipeline,
- const VkPipelineDynamicStateCreateInfo *pDynamicState,
- const VkPipelineViewportStateCreateInfo *pViewportState,
- const VkPipelineDepthStencilStateCreateInfo *pDepthStencilState,
- const VkPipelineColorBlendStateCreateInfo *pColorBlendState,
- const VkPipelineRasterizationStateCreateInfo *pRasterizationState,
- const VkPipelineColorWriteCreateInfoEXT *pColorWriteState)
-{
- pipeline->dynamic_state = default_dynamic_state;
- struct v3dv_dynamic_state *dynamic = &pipeline->dynamic_state;
-
- /* Create a mask of enabled dynamic states */
- uint32_t dynamic_states = 0;
- if (pDynamicState) {
- uint32_t count = pDynamicState->dynamicStateCount;
- for (uint32_t s = 0; s < count; s++) {
- dynamic_states |=
- v3dv_dynamic_state_mask(pDynamicState->pDynamicStates[s]);
- }
- }
-
- /* For any pipeline states that are not dynamic, set the dynamic state
- * from the static pipeline state.
- */
- if (pViewportState) {
- if (!(dynamic_states & V3DV_DYNAMIC_VIEWPORT)) {
- dynamic->viewport.count = pViewportState->viewportCount;
- typed_memcpy(dynamic->viewport.viewports, pViewportState->pViewports,
- pViewportState->viewportCount);
-
- for (uint32_t i = 0; i < dynamic->viewport.count; i++) {
- v3dv_viewport_compute_xform(&dynamic->viewport.viewports[i],
- dynamic->viewport.scale[i],
- dynamic->viewport.translate[i]);
- }
- }
-
- if (!(dynamic_states & V3DV_DYNAMIC_SCISSOR)) {
- dynamic->scissor.count = pViewportState->scissorCount;
- typed_memcpy(dynamic->scissor.scissors, pViewportState->pScissors,
- pViewportState->scissorCount);
- }
- }
-
- if (pDepthStencilState) {
- if (!(dynamic_states & V3DV_DYNAMIC_STENCIL_COMPARE_MASK)) {
- dynamic->stencil_compare_mask.front =
- pDepthStencilState->front.compareMask;
- dynamic->stencil_compare_mask.back =
- pDepthStencilState->back.compareMask;
- }
-
- if (!(dynamic_states & V3DV_DYNAMIC_STENCIL_WRITE_MASK)) {
- dynamic->stencil_write_mask.front = pDepthStencilState->front.writeMask;
- dynamic->stencil_write_mask.back = pDepthStencilState->back.writeMask;
- }
-
- if (!(dynamic_states & V3DV_DYNAMIC_STENCIL_REFERENCE)) {
- dynamic->stencil_reference.front = pDepthStencilState->front.reference;
- dynamic->stencil_reference.back = pDepthStencilState->back.reference;
- }
- }
-
- if (pColorBlendState && !(dynamic_states & V3DV_DYNAMIC_BLEND_CONSTANTS)) {
- memcpy(dynamic->blend_constants, pColorBlendState->blendConstants,
- sizeof(dynamic->blend_constants));
- }
-
- if (pRasterizationState) {
- if (pRasterizationState->depthBiasEnable &&
- !(dynamic_states & V3DV_DYNAMIC_DEPTH_BIAS)) {
- dynamic->depth_bias.constant_factor =
- pRasterizationState->depthBiasConstantFactor;
- dynamic->depth_bias.depth_bias_clamp =
- pRasterizationState->depthBiasClamp;
- dynamic->depth_bias.slope_factor =
- pRasterizationState->depthBiasSlopeFactor;
- }
- if (!(dynamic_states & V3DV_DYNAMIC_LINE_WIDTH))
- dynamic->line_width = pRasterizationState->lineWidth;
- }
-
- if (pColorWriteState && !(dynamic_states & V3DV_DYNAMIC_COLOR_WRITE_ENABLE)) {
- dynamic->color_write_enable = 0;
- for (uint32_t i = 0; i < pColorWriteState->attachmentCount; i++)
- dynamic->color_write_enable |= pColorWriteState->pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0;
- }
-
- pipeline->dynamic_state.mask = dynamic_states;
-}
-
static bool
-stencil_op_is_no_op(const VkStencilOpState *stencil)
+stencil_op_is_no_op(struct vk_stencil_test_face_state *stencil)
{
- return stencil->depthFailOp == VK_STENCIL_OP_KEEP &&
- stencil->compareOp == VK_COMPARE_OP_ALWAYS;
+ return stencil->op.depth_fail == VK_STENCIL_OP_KEEP &&
+ stencil->op.compare == VK_COMPARE_OP_ALWAYS;
}
static void
@@ -2786,113 +2671,63 @@ enable_depth_bias(struct v3dv_pipeline *pipeline,
/* Check the depth/stencil attachment description for the subpass used with
* this pipeline.
*/
- assert(pipeline->pass && pipeline->subpass);
- struct v3dv_render_pass *pass = pipeline->pass;
- struct v3dv_subpass *subpass = pipeline->subpass;
-
- if (subpass->ds_attachment.attachment == VK_ATTACHMENT_UNUSED)
- return;
-
- assert(subpass->ds_attachment.attachment < pass->attachment_count);
- struct v3dv_render_pass_attachment *att =
- &pass->attachments[subpass->ds_attachment.attachment];
-
- if (att->desc.format == VK_FORMAT_D16_UNORM)
+ VkFormat ds_format = pipeline->rendering_info.depth_attachment_format;
+ if (ds_format == VK_FORMAT_D16_UNORM)
pipeline->depth_bias.is_z16 = true;
pipeline->depth_bias.enabled = true;
}
-static void
-pipeline_set_ez_state(struct v3dv_pipeline *pipeline,
- const VkPipelineDepthStencilStateCreateInfo *ds_info)
+/* Computes the ez_state based on a given vk_dynamic_graphics_state. Note
+ * that the dyn parameter doesn't need to be pipeline->dynamic_graphics_state,
+ * as this helper is also used by the command buffer.
+ */
+void
+v3dv_compute_ez_state(struct vk_dynamic_graphics_state *dyn,
+ struct v3dv_pipeline *pipeline,
+ enum v3dv_ez_state *ez_state,
+ bool *incompatible_ez_test)
{
- if (!ds_info || !ds_info->depthTestEnable) {
- pipeline->ez_state = V3D_EZ_DISABLED;
+ if (!dyn->ds.depth.test_enable) {
+ *ez_state = V3D_EZ_DISABLED;
return;
}
- switch (ds_info->depthCompareOp) {
+ switch (dyn->ds.depth.compare_op) {
case VK_COMPARE_OP_LESS:
case VK_COMPARE_OP_LESS_OR_EQUAL:
- pipeline->ez_state = V3D_EZ_LT_LE;
+ *ez_state = V3D_EZ_LT_LE;
break;
case VK_COMPARE_OP_GREATER:
case VK_COMPARE_OP_GREATER_OR_EQUAL:
- pipeline->ez_state = V3D_EZ_GT_GE;
+ *ez_state = V3D_EZ_GT_GE;
break;
case VK_COMPARE_OP_NEVER:
case VK_COMPARE_OP_EQUAL:
- pipeline->ez_state = V3D_EZ_UNDECIDED;
+ *ez_state = V3D_EZ_UNDECIDED;
break;
default:
- pipeline->ez_state = V3D_EZ_DISABLED;
+ *ez_state = V3D_EZ_DISABLED;
+ *incompatible_ez_test = true;
break;
}
/* If stencil is enabled and is not a no-op, we need to disable EZ */
- if (ds_info->stencilTestEnable &&
- (!stencil_op_is_no_op(&ds_info->front) ||
- !stencil_op_is_no_op(&ds_info->back))) {
- pipeline->ez_state = V3D_EZ_DISABLED;
+ if (dyn->ds.stencil.test_enable &&
+ (!stencil_op_is_no_op(&dyn->ds.stencil.front) ||
+ !stencil_op_is_no_op(&dyn->ds.stencil.back))) {
+ *ez_state = V3D_EZ_DISABLED;
}
-}
-static bool
-pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline)
-{
- for (uint8_t i = 0; i < pipeline->va_count; i++) {
- if (vk_format_is_int(pipeline->va[i].vk_format))
- return true;
+ /* If the FS writes Z, then it may update against the chosen EZ direction */
+ struct v3dv_shader_variant *fs_variant =
+ pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT];
+ if (fs_variant && fs_variant->prog_data.fs->writes_z &&
+ !fs_variant->prog_data.fs->writes_z_from_fep) {
+ *ez_state = V3D_EZ_DISABLED;
}
- return false;
}
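/* Illustrative usage sketch (editorial, not part of this change). The names
 * dyn_state and bound_pipeline are placeholders for whatever dynamic state
 * and pipeline a caller has at hand; only v3dv_compute_ez_state and
 * enum v3dv_ez_state come from the driver.
 */
#if 0
enum v3dv_ez_state ez = V3D_EZ_UNDECIDED;
bool incompatible_ez_test = false;
v3dv_compute_ez_state(dyn_state, bound_pipeline, &ez, &incompatible_ez_test);
if (ez == V3D_EZ_DISABLED) {
   /* Early-Z cannot be used for this draw: e.g. the FS writes Z, the depth
    * compare op has no EZ direction, or stencil is not an EZ no-op.
    */
}
#endif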
-/* @pipeline can be NULL. We assume in that case that all the attributes have
- * a float format (we only create an all-float BO once and we reuse it with
- * all float pipelines), otherwise we look at the actual type of each
- * attribute used with the specific pipeline passed in.
- */
-struct v3dv_bo *
-v3dv_pipeline_create_default_attribute_values(struct v3dv_device *device,
- struct v3dv_pipeline *pipeline)
-{
- uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4;
- struct v3dv_bo *bo;
-
- bo = v3dv_bo_alloc(device, size, "default_vi_attributes", true);
-
- if (!bo) {
- fprintf(stderr, "failed to allocate memory for the default "
- "attribute values\n");
- return NULL;
- }
-
- bool ok = v3dv_bo_map(device, bo, size);
- if (!ok) {
- fprintf(stderr, "failed to map default attribute values buffer\n");
- return false;
- }
-
- uint32_t *attrs = bo->map;
- uint8_t va_count = pipeline != NULL ? pipeline->va_count : 0;
- for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) {
- attrs[i * 4 + 0] = 0;
- attrs[i * 4 + 1] = 0;
- attrs[i * 4 + 2] = 0;
- VkFormat attr_format =
- pipeline != NULL ? pipeline->va[i].vk_format : VK_FORMAT_UNDEFINED;
- if (i < va_count && vk_format_is_int(attr_format)) {
- attrs[i * 4 + 3] = 1;
- } else {
- attrs[i * 4 + 3] = fui(1.0);
- }
- }
-
- v3dv_bo_unmap(device, bo);
-
- return bo;
-}
static void
pipeline_set_sample_mask(struct v3dv_pipeline *pipeline,
@@ -2918,6 +2753,135 @@ pipeline_set_sample_rate_shading(struct v3dv_pipeline *pipeline,
ms_info->sampleShadingEnable;
}
+static void
+pipeline_setup_rendering_info(struct v3dv_device *device,
+ struct v3dv_pipeline *pipeline,
+ const VkGraphicsPipelineCreateInfo *pCreateInfo,
+ const VkAllocationCallbacks *alloc)
+{
+ struct vk_render_pass_state *rp = &pipeline->rendering_info;
+
+ if (pipeline->pass) {
+ assert(pipeline->subpass);
+ struct v3dv_render_pass *pass = pipeline->pass;
+ struct v3dv_subpass *subpass = pipeline->subpass;
+ const uint32_t attachment_idx = subpass->ds_attachment.attachment;
+
+ rp->view_mask = subpass->view_mask;
+
+ rp->depth_attachment_format = VK_FORMAT_UNDEFINED;
+ rp->stencil_attachment_format = VK_FORMAT_UNDEFINED;
+ rp->attachments = MESA_VK_RP_ATTACHMENT_NONE;
+ if (attachment_idx != VK_ATTACHMENT_UNUSED) {
+ VkFormat ds_format = pass->attachments[attachment_idx].desc.format;
+ if (vk_format_has_depth(ds_format)) {
+ rp->depth_attachment_format = ds_format;
+ rp->attachments |= MESA_VK_RP_ATTACHMENT_DEPTH_BIT;
+ }
+ if (vk_format_has_stencil(ds_format)) {
+ rp->stencil_attachment_format = ds_format;
+ rp->attachments |= MESA_VK_RP_ATTACHMENT_STENCIL_BIT;
+ }
+ }
+
+ rp->color_attachment_count = subpass->color_count;
+ for (uint32_t i = 0; i < subpass->color_count; i++) {
+ const uint32_t attachment_idx = subpass->color_attachments[i].attachment;
+ if (attachment_idx == VK_ATTACHMENT_UNUSED) {
+ rp->color_attachment_formats[i] = VK_FORMAT_UNDEFINED;
+ continue;
+ }
+ rp->color_attachment_formats[i] =
+ pass->attachments[attachment_idx].desc.format;
+ rp->attachments |= MESA_VK_RP_ATTACHMENT_COLOR_BIT(i);
+ }
+ return;
+ }
+
+ const VkPipelineRenderingCreateInfo *ri =
+ vk_find_struct_const(pCreateInfo->pNext,
+ PIPELINE_RENDERING_CREATE_INFO);
+ if (ri) {
+ rp->view_mask = ri->viewMask;
+
+ rp->color_attachment_count = ri->colorAttachmentCount;
+ for (int i = 0; i < ri->colorAttachmentCount; i++) {
+ rp->color_attachment_formats[i] = ri->pColorAttachmentFormats[i];
+ if (rp->color_attachment_formats[i] != VK_FORMAT_UNDEFINED) {
+ rp->attachments |= MESA_VK_RP_ATTACHMENT_COLOR_BIT(i);
+ }
+ }
+
+ rp->depth_attachment_format = ri->depthAttachmentFormat;
+ if (ri->depthAttachmentFormat != VK_FORMAT_UNDEFINED)
+ rp->attachments |= MESA_VK_RP_ATTACHMENT_DEPTH_BIT;
+
+ rp->stencil_attachment_format = ri->stencilAttachmentFormat;
+ if (ri->stencilAttachmentFormat != VK_FORMAT_UNDEFINED)
+ rp->attachments |= MESA_VK_RP_ATTACHMENT_STENCIL_BIT;
+
+ return;
+ }
+
+ /* From the Vulkan spec for VkPipelineRenderingCreateInfo:
+ *
+ * "if this structure is not specified, and the pipeline does not include
+ * a VkRenderPass, viewMask and colorAttachmentCount are 0, and
+ * depthAttachmentFormat and stencilAttachmentFormat are
+ * VK_FORMAT_UNDEFINED."
+ */
+ pipeline->rendering_info = (struct vk_render_pass_state) {
+ .view_mask = 0,
+ .attachments = 0,
+ .color_attachment_count = 0,
+ .depth_attachment_format = VK_FORMAT_UNDEFINED,
+ .stencil_attachment_format = VK_FORMAT_UNDEFINED,
+ };
+}
+
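/* Editorial example (not part of this change): for a dynamic-rendering
 * pipeline created with a VkPipelineRenderingCreateInfo such as
 *
 *   colorAttachmentCount    = 2
 *   pColorAttachmentFormats = { VK_FORMAT_R8G8B8A8_UNORM,
 *                               VK_FORMAT_UNDEFINED }
 *   depthAttachmentFormat   = VK_FORMAT_D24_UNORM_S8_UINT
 *   stencilAttachmentFormat = VK_FORMAT_D24_UNORM_S8_UINT
 *
 * the code above leaves rendering_info with color attachment 0 set and
 * attachment 1 undefined, and sets MESA_VK_RP_ATTACHMENT_COLOR_BIT(0) plus
 * the depth and stencil bits in rp->attachments.
 */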
+static VkResult
+pipeline_init_dynamic_state(struct v3dv_device *device,
+ struct v3dv_pipeline *pipeline,
+ struct vk_graphics_pipeline_state *pipeline_state,
+ const VkGraphicsPipelineCreateInfo *pCreateInfo,
+ const VkPipelineColorWriteCreateInfoEXT *cw_info)
+{
+ VkResult result = VK_SUCCESS;
+ struct vk_graphics_pipeline_all_state all;
+ result = vk_graphics_pipeline_state_fill(&pipeline->device->vk, pipeline_state,
+ pCreateInfo, &pipeline->rendering_info, 0,
+ &all, NULL, 0, NULL);
+ if (result != VK_SUCCESS)
+ return result;
+
+ vk_dynamic_graphics_state_fill(&pipeline->dynamic_graphics_state, pipeline_state);
+
+ struct v3dv_dynamic_state *v3dv_dyn = &pipeline->dynamic;
+ struct vk_dynamic_graphics_state *dyn = &pipeline->dynamic_graphics_state;
+
+ if (BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
+ BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_VP_SCISSORS)) {
+ /* FIXME: right now we don't support multiViewport, so viewports[0] is
+ * enough, but this would need to change if we ever allow multiple
+ * viewports.
+ */
+ v3dv_X(device, viewport_compute_xform)(&dyn->vp.viewports[0],
+ v3dv_dyn->viewport.scale[0],
+ v3dv_dyn->viewport.translate[0]);
+
+ }
+
+ v3dv_dyn->color_write_enable =
+ (1ull << (4 * V3D_MAX_RENDER_TARGETS(device->devinfo.ver))) - 1;
+ if (cw_info && BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES)) {
+ v3dv_dyn->color_write_enable = 0;
+ for (uint32_t i = 0; i < cw_info->attachmentCount; i++)
+ v3dv_dyn->color_write_enable |=
+ cw_info->pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0;
+ }
+
+ return result;
+}
+
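/* Editorial example (not part of this change): color_write_enable packs one
 * 4-bit per-channel mask per render target. With three attachments where
 * only 0 and 2 have writes enabled, the loop above produces:
 *
 *   attachment 0 enabled  -> bits  0..3  = 0xf
 *   attachment 1 disabled -> bits  4..7  = 0x0
 *   attachment 2 enabled  -> bits  8..11 = 0xf
 *
 * i.e. color_write_enable == 0xf0f.
 */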
static VkResult
pipeline_init(struct v3dv_pipeline *pipeline,
struct v3dv_device *device,
@@ -2928,25 +2892,34 @@ pipeline_init(struct v3dv_pipeline *pipeline,
VkResult result = VK_SUCCESS;
pipeline->device = device;
+ pipeline->flags = pCreateInfo->flags;
V3DV_FROM_HANDLE(v3dv_pipeline_layout, layout, pCreateInfo->layout);
pipeline->layout = layout;
+ v3dv_pipeline_layout_ref(pipeline->layout);
V3DV_FROM_HANDLE(v3dv_render_pass, render_pass, pCreateInfo->renderPass);
- assert(pCreateInfo->subpass < render_pass->subpass_count);
- pipeline->pass = render_pass;
- pipeline->subpass = &render_pass->subpasses[pCreateInfo->subpass];
+ if (render_pass) {
+ assert(pCreateInfo->subpass < render_pass->subpass_count);
+ pipeline->pass = render_pass;
+ pipeline->subpass = &render_pass->subpasses[pCreateInfo->subpass];
+ }
+
+ pipeline_setup_rendering_info(device, pipeline, pCreateInfo, pAllocator);
const VkPipelineInputAssemblyStateCreateInfo *ia_info =
pCreateInfo->pInputAssemblyState;
- pipeline->topology = vk_to_pipe_prim_type[ia_info->topology];
+ pipeline->topology = vk_to_mesa_prim[ia_info->topology];
/* If rasterization is not enabled, various CreateInfo structs must be
* ignored.
*/
const bool raster_enabled =
+ pCreateInfo->pRasterizationState &&
!pCreateInfo->pRasterizationState->rasterizerDiscardEnable;
+ pipeline->rasterization_enabled = raster_enabled;
+
const VkPipelineViewportStateCreateInfo *vp_info =
raster_enabled ? pCreateInfo->pViewportState : NULL;
@@ -2957,11 +2930,17 @@ pipeline_init(struct v3dv_pipeline *pipeline,
raster_enabled ? pCreateInfo->pRasterizationState : NULL;
const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *pv_info =
- rs_info ? vk_find_struct_const(
+ raster_enabled ? vk_find_struct_const(
rs_info->pNext,
PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT) :
NULL;
+ const VkPipelineRasterizationLineStateCreateInfoEXT *ls_info =
+ raster_enabled ? vk_find_struct_const(
+ rs_info->pNext,
+ PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT) :
+ NULL;
+
const VkPipelineColorBlendStateCreateInfo *cb_info =
raster_enabled ? pCreateInfo->pColorBlendState : NULL;
@@ -2973,22 +2952,35 @@ pipeline_init(struct v3dv_pipeline *pipeline,
PIPELINE_COLOR_WRITE_CREATE_INFO_EXT) :
NULL;
- pipeline_init_dynamic_state(pipeline,
- pCreateInfo->pDynamicState,
- vp_info, ds_info, cb_info, rs_info, cw_info);
+ struct vk_graphics_pipeline_state pipeline_state = { };
+ result = pipeline_init_dynamic_state(device, pipeline, &pipeline_state,
+ pCreateInfo, cw_info);
- /* V3D 4.2 doesn't support depth bounds testing so we don't advertise that
- * feature and it shouldn't be used by any pipeline.
- */
- assert(!ds_info || !ds_info->depthBoundsTestEnable);
+ if (result != VK_SUCCESS) {
+ /* The caller will destroy the pipeline, and we didn't allocate any
+ * extra data, so there is nothing else to clean up here.
+ */
+ return result;
+ }
- v3dv_X(device, pipeline_pack_state)(pipeline, cb_info, ds_info,
- rs_info, pv_info, ms_info);
+ const VkPipelineViewportDepthClipControlCreateInfoEXT *depth_clip_control =
+ vp_info ? vk_find_struct_const(vp_info->pNext,
+ PIPELINE_VIEWPORT_DEPTH_CLIP_CONTROL_CREATE_INFO_EXT) :
+ NULL;
+
+ if (depth_clip_control)
+ pipeline->negative_one_to_one = depth_clip_control->negativeOneToOne;
- pipeline_set_ez_state(pipeline, ds_info);
enable_depth_bias(pipeline, rs_info);
+
+ v3dv_X(device, pipeline_pack_state)(pipeline, cb_info, ds_info,
+ rs_info, pv_info, ls_info,
+ ms_info,
+ &pipeline_state);
+
pipeline_set_sample_mask(pipeline, ms_info);
pipeline_set_sample_rate_shading(pipeline, ms_info);
+ pipeline->line_smooth = enable_line_smooth(pipeline->topology, rs_info);
pipeline->primitive_restart =
pCreateInfo->pInputAssemblyState->primitiveRestartEnable;
@@ -3011,15 +3003,22 @@ pipeline_init(struct v3dv_pipeline *pipeline,
v3dv_X(device, pipeline_pack_compile_state)(pipeline, vi_info, vd_info);
- if (pipeline_has_integer_vertex_attrib(pipeline)) {
+ if (v3dv_X(device, pipeline_needs_default_attribute_values)(pipeline)) {
pipeline->default_attribute_values =
- v3dv_pipeline_create_default_attribute_values(pipeline->device, pipeline);
+ v3dv_X(pipeline->device, create_default_attribute_values)(pipeline->device, pipeline);
+
if (!pipeline->default_attribute_values)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
} else {
pipeline->default_attribute_values = NULL;
}
+ /* This must be done after the pipeline has been compiled */
+ v3dv_compute_ez_state(&pipeline->dynamic_graphics_state,
+ pipeline,
+ &pipeline->ez_state,
+ &pipeline->incompatible_ez_test);
+
return result;
}
@@ -3044,15 +3043,13 @@ graphics_pipeline_create(VkDevice _device,
VK_OBJECT_TYPE_PIPELINE);
if (pipeline == NULL)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
- result = pipeline_init(pipeline, device, cache,
- pCreateInfo,
- pAllocator);
+ result = pipeline_init(pipeline, device, cache, pCreateInfo, pAllocator);
if (result != VK_SUCCESS) {
v3dv_destroy_pipeline(pipeline, device, pAllocator);
- if (result == VK_PIPELINE_COMPILE_REQUIRED_EXT)
+ if (result == VK_PIPELINE_COMPILE_REQUIRED)
*pPipeline = VK_NULL_HANDLE;
return result;
}
@@ -3073,7 +3070,7 @@ v3dv_CreateGraphicsPipelines(VkDevice _device,
V3DV_FROM_HANDLE(v3dv_device, device, _device);
VkResult result = VK_SUCCESS;
- if (unlikely(V3D_DEBUG & V3D_DEBUG_SHADERS))
+ if (V3D_DBG(SHADERS))
mtx_lock(&device->pdevice->mutex);
uint32_t i = 0;
@@ -3091,7 +3088,7 @@ v3dv_CreateGraphicsPipelines(VkDevice _device,
pPipelines[i] = VK_NULL_HANDLE;
if (pCreateInfos[i].flags &
- VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT)
+ VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT)
break;
}
}
@@ -3099,7 +3096,7 @@ v3dv_CreateGraphicsPipelines(VkDevice _device,
for (; i < count; i++)
pPipelines[i] = VK_NULL_HANDLE;
- if (unlikely(V3D_DEBUG & V3D_DEBUG_SHADERS))
+ if (V3D_DBG(SHADERS))
mtx_unlock(&device->pdevice->mutex);
return result;
@@ -3118,12 +3115,20 @@ shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
}
static void
-lower_cs_shared(struct nir_shader *nir)
+lower_compute(struct nir_shader *nir)
{
- NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
- nir_var_mem_shared, shared_type_info);
- NIR_PASS_V(nir, nir_lower_explicit_io,
- nir_var_mem_shared, nir_address_format_32bit_offset);
+ if (!nir->info.shared_memory_explicit_layout) {
+ NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
+ nir_var_mem_shared, shared_type_info);
+ }
+
+ NIR_PASS(_, nir, nir_lower_explicit_io,
+ nir_var_mem_shared, nir_address_format_32bit_offset);
+
+ struct nir_lower_compute_system_values_options sysval_options = {
+ .has_base_workgroup_id = true,
+ };
+ NIR_PASS_V(nir, nir_lower_compute_system_values, &sysval_options);
}
static VkResult
@@ -3132,14 +3137,13 @@ pipeline_compile_compute(struct v3dv_pipeline *pipeline,
const VkComputePipelineCreateInfo *info,
const VkAllocationCallbacks *alloc)
{
- VkPipelineCreationFeedbackEXT pipeline_feedback = {
- .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT,
+ VkPipelineCreationFeedback pipeline_feedback = {
+ .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
};
int64_t pipeline_start = os_time_get_nano();
struct v3dv_device *device = pipeline->device;
- struct v3dv_physical_device *physical_device =
- &device->instance->physicalDevice;
+ struct v3dv_physical_device *physical_device = device->pdevice;
const VkPipelineShaderStageCreateInfo *sinfo = &info->stage;
gl_shader_stage stage = vk_to_mesa_shader_stage(sinfo->stage);
@@ -3156,61 +3160,69 @@ pipeline_compile_compute(struct v3dv_pipeline *pipeline,
p_stage->entrypoint = sinfo->pName;
p_stage->module = vk_shader_module_from_handle(sinfo->module);
p_stage->spec_info = sinfo->pSpecializationInfo;
- p_stage->feedback = (VkPipelineCreationFeedbackEXT) { 0 };
+ p_stage->feedback = (VkPipelineCreationFeedback) { 0 };
- pipeline_hash_shader(p_stage->module,
- p_stage->entrypoint,
- stage,
- p_stage->spec_info,
- p_stage->shader_sha1);
+ vk_pipeline_robustness_state_fill(&device->vk, &p_stage->robustness,
+ info->pNext, sinfo->pNext);
+
+ vk_pipeline_hash_shader_stage(&info->stage,
+ &p_stage->robustness,
+ p_stage->shader_sha1);
- /* We try to get directly the variant first from the cache */
p_stage->nir = NULL;
- pipeline->cs = p_stage;
+ pipeline->stages[BROADCOM_SHADER_COMPUTE] = p_stage;
pipeline->active_stages |= sinfo->stage;
- struct v3dv_pipeline_key pipeline_key;
- pipeline_populate_compute_key(pipeline, &pipeline_key, info);
- unsigned char pipeline_sha1[20];
- pipeline_hash_compute(pipeline, &pipeline_key, pipeline_sha1);
-
- bool cache_hit = false;
- pipeline->shared_data =
- v3dv_pipeline_cache_search_for_pipeline(cache, pipeline_sha1, &cache_hit);
-
- if (pipeline->shared_data != NULL) {
- assert(pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]);
- if (cache_hit && cache != &pipeline->device->default_pipeline_cache)
- pipeline_feedback.flags |=
- VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT;
-
- goto success;
+ /* First we try to get the variants from the pipeline cache (unless we are
+ * required to capture internal representations, since in that case we need
+ * to compile).
+ */
+ bool needs_executable_info =
+ info->flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;
+ if (!needs_executable_info) {
+ struct v3dv_pipeline_key pipeline_key;
+ pipeline_populate_compute_key(pipeline, &pipeline_key, info);
+ pipeline_hash_compute(pipeline, &pipeline_key, pipeline->sha1);
+
+ bool cache_hit = false;
+ pipeline->shared_data =
+ v3dv_pipeline_cache_search_for_pipeline(cache, pipeline->sha1, &cache_hit);
+
+ if (pipeline->shared_data != NULL) {
+ assert(pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]);
+ if (cache_hit && cache != &pipeline->device->default_pipeline_cache)
+ pipeline_feedback.flags |=
+ VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
+
+ goto success;
+ }
}
- if (info->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT)
- return VK_PIPELINE_COMPILE_REQUIRED_EXT;
+ if (info->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT)
+ return VK_PIPELINE_COMPILE_REQUIRED;
- pipeline->shared_data = v3dv_pipeline_shared_data_new_empty(pipeline_sha1,
+ pipeline->shared_data = v3dv_pipeline_shared_data_new_empty(pipeline->sha1,
pipeline,
false);
+ if (!pipeline->shared_data)
+ return VK_ERROR_OUT_OF_HOST_MEMORY;
- p_stage->feedback.flags |= VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT;
+ p_stage->feedback.flags |= VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT;
/* If not found on cache, compile it */
p_stage->nir = pipeline_stage_get_nir(p_stage, pipeline, cache);
assert(p_stage->nir);
- st_nir_opts(p_stage->nir);
+ v3d_optimize_nir(NULL, p_stage->nir);
pipeline_lower_nir(pipeline, p_stage, pipeline->layout);
- lower_cs_shared(p_stage->nir);
+ lower_compute(p_stage->nir);
VkResult result = VK_SUCCESS;
struct v3d_key key;
memset(&key, 0, sizeof(key));
- pipeline_populate_v3d_key(&key, p_stage, 0,
- pipeline->device->features.robustBufferAccess);
+ pipeline_populate_v3d_key(&key, p_stage, 0);
pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE] =
pipeline_compile_shader_variant(p_stage, &key, sizeof(key),
alloc, &result);
@@ -3225,6 +3237,8 @@ pipeline_compile_compute(struct v3dv_pipeline *pipeline,
success:
+ pipeline_check_buffer_device_address(pipeline);
+
pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
write_creation_feedback(pipeline,
info->pNext,
@@ -3233,9 +3247,10 @@ success:
&info->stage);
/* As we got the variants in pipeline->shared_data, after compiling we
- * don't need the pipeline_stages
+ * don't need the pipeline_stages.
*/
- pipeline_free_stages(device, pipeline, alloc);
+ if (!needs_executable_info)
+ pipeline_free_stages(device, pipeline, alloc);
pipeline_check_spill_size(pipeline);
@@ -3253,8 +3268,11 @@ compute_pipeline_init(struct v3dv_pipeline *pipeline,
pipeline->device = device;
pipeline->layout = layout;
+ v3dv_pipeline_layout_ref(pipeline->layout);
VkResult result = pipeline_compile_compute(pipeline, cache, info, alloc);
+ if (result != VK_SUCCESS)
+ return result;
return result;
}
@@ -3279,13 +3297,13 @@ compute_pipeline_create(VkDevice _device,
pipeline = vk_object_zalloc(&device->vk, pAllocator, sizeof(*pipeline),
VK_OBJECT_TYPE_PIPELINE);
if (pipeline == NULL)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
result = compute_pipeline_init(pipeline, device, cache,
pCreateInfo, pAllocator);
if (result != VK_SUCCESS) {
v3dv_destroy_pipeline(pipeline, device, pAllocator);
- if (result == VK_PIPELINE_COMPILE_REQUIRED_EXT)
+ if (result == VK_PIPELINE_COMPILE_REQUIRED)
*pPipeline = VK_NULL_HANDLE;
return result;
}
@@ -3306,7 +3324,7 @@ v3dv_CreateComputePipelines(VkDevice _device,
V3DV_FROM_HANDLE(v3dv_device, device, _device);
VkResult result = VK_SUCCESS;
- if (unlikely(V3D_DEBUG & V3D_DEBUG_SHADERS))
+ if (V3D_DBG(SHADERS))
mtx_lock(&device->pdevice->mutex);
uint32_t i = 0;
@@ -3323,7 +3341,7 @@ v3dv_CreateComputePipelines(VkDevice _device,
pPipelines[i] = VK_NULL_HANDLE;
if (pCreateInfos[i].flags &
- VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT)
+ VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT)
break;
}
}
@@ -3331,8 +3349,303 @@ v3dv_CreateComputePipelines(VkDevice _device,
for (; i < createInfoCount; i++)
pPipelines[i] = VK_NULL_HANDLE;
- if (unlikely(V3D_DEBUG & V3D_DEBUG_SHADERS))
+ if (V3D_DBG(SHADERS))
mtx_unlock(&device->pdevice->mutex);
return result;
}
+
+static nir_shader *
+pipeline_get_nir(struct v3dv_pipeline *pipeline,
+ enum broadcom_shader_stage stage)
+{
+ assert(stage >= 0 && stage < BROADCOM_SHADER_STAGES);
+ if (pipeline->stages[stage])
+ return pipeline->stages[stage]->nir;
+
+ return NULL;
+}
+
+static struct v3d_prog_data *
+pipeline_get_prog_data(struct v3dv_pipeline *pipeline,
+ enum broadcom_shader_stage stage)
+{
+ if (pipeline->shared_data->variants[stage])
+ return pipeline->shared_data->variants[stage]->prog_data.base;
+ return NULL;
+}
+
+static uint64_t *
+pipeline_get_qpu(struct v3dv_pipeline *pipeline,
+ enum broadcom_shader_stage stage,
+ uint32_t *qpu_size)
+{
+ struct v3dv_shader_variant *variant =
+ pipeline->shared_data->variants[stage];
+ if (!variant) {
+ *qpu_size = 0;
+ return NULL;
+ }
+
+ *qpu_size = variant->qpu_insts_size;
+ return variant->qpu_insts;
+}
+
+/* FIXME: we use the same macro in various drivers, maybe move it to
+ * the common vk_util.h?
+ */
+#define WRITE_STR(field, ...) ({ \
+ memset(field, 0, sizeof(field)); \
+ UNUSED int _i = snprintf(field, sizeof(field), __VA_ARGS__); \
+ assert(_i > 0 && _i < sizeof(field)); \
+})
+
+static bool
+write_ir_text(VkPipelineExecutableInternalRepresentationKHR* ir,
+ const char *data)
+{
+ ir->isText = VK_TRUE;
+
+ size_t data_len = strlen(data) + 1;
+
+ if (ir->pData == NULL) {
+ ir->dataSize = data_len;
+ return true;
+ }
+
+ strncpy(ir->pData, data, ir->dataSize);
+ if (ir->dataSize < data_len)
+ return false;
+
+ ir->dataSize = data_len;
+ return true;
+}
+
+static void
+append(char **str, size_t *offset, const char *fmt, ...)
+{
+ va_list args;
+ va_start(args, fmt);
+ ralloc_vasprintf_rewrite_tail(str, offset, fmt, args);
+ va_end(args);
+}
+
+static void
+pipeline_collect_executable_data(struct v3dv_pipeline *pipeline)
+{
+ if (pipeline->executables.mem_ctx)
+ return;
+
+ pipeline->executables.mem_ctx = ralloc_context(NULL);
+ util_dynarray_init(&pipeline->executables.data,
+ pipeline->executables.mem_ctx);
+
+ /* Don't crash for failed/bogus pipelines */
+ if (!pipeline->shared_data)
+ return;
+
+ for (int s = BROADCOM_SHADER_VERTEX; s <= BROADCOM_SHADER_COMPUTE; s++) {
+ VkShaderStageFlags vk_stage =
+ mesa_to_vk_shader_stage(broadcom_shader_stage_to_gl(s));
+ if (!(vk_stage & pipeline->active_stages))
+ continue;
+
+ char *nir_str = NULL;
+ char *qpu_str = NULL;
+
+ if (pipeline_keep_qpu(pipeline)) {
+ nir_shader *nir = pipeline_get_nir(pipeline, s);
+ nir_str = nir ?
+ nir_shader_as_str(nir, pipeline->executables.mem_ctx) : NULL;
+
+ uint32_t qpu_size;
+ uint64_t *qpu = pipeline_get_qpu(pipeline, s, &qpu_size);
+ if (qpu) {
+ uint32_t qpu_inst_count = qpu_size / sizeof(uint64_t);
+ qpu_str = rzalloc_size(pipeline->executables.mem_ctx,
+ qpu_inst_count * 96);
+ size_t offset = 0;
+ for (int i = 0; i < qpu_inst_count; i++) {
+ const char *str = v3d_qpu_disasm(&pipeline->device->devinfo, qpu[i]);
+ append(&qpu_str, &offset, "%s\n", str);
+ ralloc_free((void *)str);
+ }
+ }
+ }
+
+ struct v3dv_pipeline_executable_data data = {
+ .stage = s,
+ .nir_str = nir_str,
+ .qpu_str = qpu_str,
+ };
+ util_dynarray_append(&pipeline->executables.data,
+ struct v3dv_pipeline_executable_data, data);
+ }
+}
+
+static const struct v3dv_pipeline_executable_data *
+pipeline_get_executable(struct v3dv_pipeline *pipeline, uint32_t index)
+{
+ assert(index < util_dynarray_num_elements(&pipeline->executables.data,
+ struct v3dv_pipeline_executable_data));
+ return util_dynarray_element(&pipeline->executables.data,
+ struct v3dv_pipeline_executable_data,
+ index);
+}
+
+VKAPI_ATTR VkResult VKAPI_CALL
+v3dv_GetPipelineExecutableInternalRepresentationsKHR(
+ VkDevice device,
+ const VkPipelineExecutableInfoKHR *pExecutableInfo,
+ uint32_t *pInternalRepresentationCount,
+ VkPipelineExecutableInternalRepresentationKHR *pInternalRepresentations)
+{
+ V3DV_FROM_HANDLE(v3dv_pipeline, pipeline, pExecutableInfo->pipeline);
+
+ pipeline_collect_executable_data(pipeline);
+
+ VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out,
+ pInternalRepresentations, pInternalRepresentationCount);
+
+ bool incomplete = false;
+ const struct v3dv_pipeline_executable_data *exe =
+ pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
+
+ if (exe->nir_str) {
+ vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR,
+ &out, ir) {
+ WRITE_STR(ir->name, "NIR (%s)", broadcom_shader_stage_name(exe->stage));
+ WRITE_STR(ir->description, "Final NIR form");
+ if (!write_ir_text(ir, exe->nir_str))
+ incomplete = true;
+ }
+ }
+
+ if (exe->qpu_str) {
+ vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR,
+ &out, ir) {
+ WRITE_STR(ir->name, "QPU (%s)", broadcom_shader_stage_name(exe->stage));
+ WRITE_STR(ir->description, "Final QPU assembly");
+ if (!write_ir_text(ir, exe->qpu_str))
+ incomplete = true;
+ }
+ }
+
+ return incomplete ? VK_INCOMPLETE : vk_outarray_status(&out);
+}
+
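/* Editorial sketch (not part of this change): applications consume this with
 * the usual Vulkan two-call idiom, which is what the pData == NULL path in
 * write_ir_text() above supports. Variable names here are illustrative.
 */
#if 0
VkPipelineExecutableInfoKHR exe_info = {
   .sType = VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_INFO_KHR,
   .pipeline = pipeline,
   .executableIndex = 0,
};
uint32_t ir_count = 0;
vkGetPipelineExecutableInternalRepresentationsKHR(device, &exe_info,
                                                  &ir_count, NULL);
/* ...allocate ir_count structs, set each element's sType, dataSize and
 * pData, then call again to receive the NIR/QPU text produced above...
 */
#endif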
+VKAPI_ATTR VkResult VKAPI_CALL
+v3dv_GetPipelineExecutablePropertiesKHR(
+ VkDevice device,
+ const VkPipelineInfoKHR *pPipelineInfo,
+ uint32_t *pExecutableCount,
+ VkPipelineExecutablePropertiesKHR *pProperties)
+{
+ V3DV_FROM_HANDLE(v3dv_pipeline, pipeline, pPipelineInfo->pipeline);
+
+ pipeline_collect_executable_data(pipeline);
+
+ VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out,
+ pProperties, pExecutableCount);
+
+ util_dynarray_foreach(&pipeline->executables.data,
+ struct v3dv_pipeline_executable_data, exe) {
+ vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) {
+ gl_shader_stage mesa_stage = broadcom_shader_stage_to_gl(exe->stage);
+ props->stages = mesa_to_vk_shader_stage(mesa_stage);
+
+ WRITE_STR(props->name, "%s (%s)",
+ _mesa_shader_stage_to_abbrev(mesa_stage),
+ broadcom_shader_stage_is_binning(exe->stage) ?
+ "Binning" : "Render");
+
+ WRITE_STR(props->description, "%s",
+ _mesa_shader_stage_to_string(mesa_stage));
+
+ props->subgroupSize = V3D_CHANNELS;
+ }
+ }
+
+ return vk_outarray_status(&out);
+}
+
+VKAPI_ATTR VkResult VKAPI_CALL
+v3dv_GetPipelineExecutableStatisticsKHR(
+ VkDevice device,
+ const VkPipelineExecutableInfoKHR *pExecutableInfo,
+ uint32_t *pStatisticCount,
+ VkPipelineExecutableStatisticKHR *pStatistics)
+{
+ V3DV_FROM_HANDLE(v3dv_pipeline, pipeline, pExecutableInfo->pipeline);
+
+ pipeline_collect_executable_data(pipeline);
+
+ const struct v3dv_pipeline_executable_data *exe =
+ pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
+
+ struct v3d_prog_data *prog_data =
+ pipeline_get_prog_data(pipeline, exe->stage);
+
+ struct v3dv_shader_variant *variant =
+ pipeline->shared_data->variants[exe->stage];
+ uint32_t qpu_inst_count = variant->qpu_insts_size / sizeof(uint64_t);
+
+ VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out,
+ pStatistics, pStatisticCount);
+
+ if (qpu_inst_count > 0) {
+ vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
+ WRITE_STR(stat->name, "Compile Strategy");
+ WRITE_STR(stat->description, "Chosen compile strategy index");
+ stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
+ stat->value.u64 = prog_data->compile_strategy_idx;
+ }
+
+ vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
+ WRITE_STR(stat->name, "Instruction Count");
+ WRITE_STR(stat->description, "Number of QPU instructions");
+ stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
+ stat->value.u64 = qpu_inst_count;
+ }
+
+ vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
+ WRITE_STR(stat->name, "Thread Count");
+ WRITE_STR(stat->description, "Number of QPU threads dispatched");
+ stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
+ stat->value.u64 = prog_data->threads;
+ }
+
+ vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
+ WRITE_STR(stat->name, "Spill Size");
+ WRITE_STR(stat->description, "Size of the spill buffer in bytes");
+ stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
+ stat->value.u64 = prog_data->spill_size;
+ }
+
+ vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
+ WRITE_STR(stat->name, "TMU Spills");
+ WRITE_STR(stat->description, "Number of times a register was spilled "
+ "to memory");
+ stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
+ stat->value.u64 = prog_data->tmu_spills;
+ }
+
+ vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
+ WRITE_STR(stat->name, "TMU Fills");
+ WRITE_STR(stat->description, "Number of times a register was filled "
+ "from memory");
+ stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
+ stat->value.u64 = prog_data->tmu_fills;
+ }
+
+ vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
+ WRITE_STR(stat->name, "QPU Read Stalls");
+ WRITE_STR(stat->description, "Number of cycles the QPU stalls for a "
+ "register read dependency");
+ stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
+ stat->value.u64 = prog_data->qpu_read_stalls;
+ }
+ }
+
+ return vk_outarray_status(&out);
+}
diff --git a/src/broadcom/vulkan/v3dv_pipeline_cache.c b/src/broadcom/vulkan/v3dv_pipeline_cache.c
index 02721ec1d79..d2124ee0b08 100644
--- a/src/broadcom/vulkan/v3dv_pipeline_cache.c
+++ b/src/broadcom/vulkan/v3dv_pipeline_cache.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2019 Raspberry Pi
+ * Copyright © 2019 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -22,7 +22,7 @@
*/
#include "v3dv_private.h"
-#include "vulkan/util/vk_util.h"
+#include "vk_util.h"
#include "util/blob.h"
#include "nir/nir_serialize.h"
@@ -61,20 +61,22 @@ cache_dump_stats(struct v3dv_pipeline_cache *cache)
fprintf(stderr, " cache entries: %d\n", cache->stats.count);
fprintf(stderr, " cache miss count: %d\n", cache->stats.miss);
fprintf(stderr, " cache hit count: %d\n", cache->stats.hit);
+
+ fprintf(stderr, " on-disk cache hit count: %d\n", cache->stats.on_disk_hit);
}
static void
pipeline_cache_lock(struct v3dv_pipeline_cache *cache)
{
if (!cache->externally_synchronized)
- pthread_mutex_lock(&cache->mutex);
+ mtx_lock(&cache->mutex);
}
static void
pipeline_cache_unlock(struct v3dv_pipeline_cache *cache)
{
if (!cache->externally_synchronized)
- pthread_mutex_unlock(&cache->mutex);
+ mtx_unlock(&cache->mutex);
}
void
@@ -178,7 +180,7 @@ v3dv_pipeline_cache_search_for_nir(struct v3dv_pipeline *pipeline,
} else {
cache->nir_stats.hit++;
if (debug_cache) {
- fprintf(stderr, "\tnir cache hit: %p\n", nir);
+ fprintf(stderr, "[v3dv nir cache] hit: %p\n", nir);
if (dump_stats)
cache_dump_stats(cache);
}
@@ -188,7 +190,7 @@ v3dv_pipeline_cache_search_for_nir(struct v3dv_pipeline *pipeline,
cache->nir_stats.miss++;
if (debug_cache) {
- fprintf(stderr, "\tnir cache miss\n");
+ fprintf(stderr, "[v3dv nir cache] miss\n");
if (dump_stats)
cache_dump_stats(cache);
}
@@ -203,7 +205,7 @@ v3dv_pipeline_cache_init(struct v3dv_pipeline_cache *cache,
bool cache_enabled)
{
cache->device = device;
- pthread_mutex_init(&cache->mutex, NULL);
+ mtx_init(&cache->mutex, mtx_plain);
if (cache_enabled) {
cache->nir_cache = _mesa_hash_table_create(NULL, sha1_hash_func,
@@ -219,7 +221,7 @@ v3dv_pipeline_cache_init(struct v3dv_pipeline_cache *cache,
cache->stats.count = 0;
cache->externally_synchronized = flags &
- VK_PIPELINE_CACHE_CREATE_EXTERNALLY_SYNCHRONIZED_BIT_EXT;
+ VK_PIPELINE_CACHE_CREATE_EXTERNALLY_SYNCHRONIZED_BIT;
} else {
cache->nir_cache = NULL;
cache->cache = NULL;
@@ -241,7 +243,7 @@ v3dv_pipeline_shared_data_write_to_blob(const struct v3dv_pipeline_shared_data *
struct blob *blob);
/**
- * It searchs for pipeline cached data, and returns a v3dv_pipeline_shared_data with
+ * It searches for pipeline cached data, and returns a v3dv_pipeline_shared_data with
 * it, or NULL if it is not cached. In the former case it increases the
 * ref_count, so the caller is responsible for unreffing it.
*/
@@ -273,7 +275,7 @@ v3dv_pipeline_cache_search_for_pipeline(struct v3dv_pipeline_cache *cache,
cache->stats.hit++;
*cache_hit = true;
if (debug_cache) {
- fprintf(stderr, "\tcache hit: %p\n", cache_entry);
+ fprintf(stderr, "[v3dv cache] hit: %p\n", cache_entry);
if (dump_stats)
cache_dump_stats(cache);
}
@@ -288,7 +290,7 @@ v3dv_pipeline_cache_search_for_pipeline(struct v3dv_pipeline_cache *cache,
cache->stats.miss++;
if (debug_cache) {
- fprintf(stderr, "\tcache miss\n");
+ fprintf(stderr, "[v3dv cache] miss\n");
if (dump_stats)
cache_dump_stats(cache);
}
@@ -300,7 +302,7 @@ v3dv_pipeline_cache_search_for_pipeline(struct v3dv_pipeline_cache *cache,
struct disk_cache *disk_cache = device->pdevice->disk_cache;
/* Note that the on-disk-cache can be independently disabled, while keeping
* the pipeline cache working, by using the environment variable
- * MESA_GLSL_CACHE_DISABLE. In that case the calls to disk_cache_put/get
+ * MESA_SHADER_CACHE_DISABLE. In that case the calls to disk_cache_put/get
* will not do anything.
*/
if (disk_cache && device->instance->pipeline_cache_enabled) {
@@ -309,25 +311,32 @@ v3dv_pipeline_cache_search_for_pipeline(struct v3dv_pipeline_cache *cache,
size_t buffer_size;
uint8_t *buffer = disk_cache_get(disk_cache, cache_key, &buffer_size);
+ if (V3D_DBG(CACHE)) {
+ char sha1buf[41];
+ _mesa_sha1_format(sha1buf, cache_key);
+ fprintf(stderr, "[v3dv on-disk cache] %s %s\n",
+ buffer ? "hit" : "miss",
+ sha1buf);
+ }
+
if (buffer) {
struct blob_reader blob;
struct v3dv_pipeline_shared_data *shared_data;
- if (debug_cache)
- fprintf(stderr, "\ton-disk-cache hit\n");
-
blob_reader_init(&blob, buffer, buffer_size);
shared_data = v3dv_pipeline_shared_data_create_from_blob(cache, &blob);
free(buffer);
if (shared_data) {
+ /* Technically we could increase on_disk_hit as soon as we have a
+ * buffer, but we are more interested in hits that produce a valid
+ * shared_data
+ */
+ cache->stats.on_disk_hit++;
if (cache)
pipeline_cache_upload_shared_data(cache, shared_data, true);
return shared_data;
}
- } else {
- if (debug_cache)
- fprintf(stderr, "\ton-disk-cache miss\n");
}
}
#endif
@@ -393,15 +402,13 @@ v3dv_pipeline_shared_data_new(struct v3dv_pipeline_cache *cache,
"pipeline shader assembly", true);
if (!bo) {
fprintf(stderr, "failed to allocate memory for shaders assembly\n");
- v3dv_pipeline_shared_data_unref(cache->device, new_entry);
- return NULL;
+ goto fail;
}
bool ok = v3dv_bo_map(cache->device, bo, total_assembly_size);
if (!ok) {
fprintf(stderr, "failed to map source shader buffer\n");
- v3dv_pipeline_shared_data_unref(cache->device, new_entry);
- return NULL;
+ goto fail;
}
memcpy(bo->map, total_assembly, total_assembly_size);
@@ -409,6 +416,10 @@ v3dv_pipeline_shared_data_new(struct v3dv_pipeline_cache *cache,
new_entry->assembly_bo = bo;
return new_entry;
+
+fail:
+ v3dv_pipeline_shared_data_unref(cache->device, new_entry);
+ return NULL;
}
static void
@@ -425,8 +436,13 @@ pipeline_cache_upload_shared_data(struct v3dv_pipeline_cache *cache,
return;
pipeline_cache_lock(cache);
- struct hash_entry *entry =
- _mesa_hash_table_search(cache->cache, shared_data->sha1_key);
+ struct hash_entry *entry = NULL;
+
+ /* If this is being called from the disk cache, we already know that the
+ * entry is not in the hash table
+ */
+ if (!from_disk_cache)
+ entry = _mesa_hash_table_search(cache->cache, shared_data->sha1_key);
if (entry) {
pipeline_cache_unlock(cache);
@@ -464,14 +480,12 @@ pipeline_cache_upload_shared_data(struct v3dv_pipeline_cache *cache,
cache_key cache_key;
disk_cache_compute_key(disk_cache, shared_data->sha1_key, 20, cache_key);
- disk_cache_put(disk_cache, cache_key, binary.data, binary.size, NULL);
- if (debug_cache) {
+ if (V3D_DBG(CACHE)) {
char sha1buf[41];
_mesa_sha1_format(sha1buf, shared_data->sha1_key);
-
- fprintf(stderr, "on-disk-cache, new cache entry with sha1 key %s:%p\n\n",
- sha1buf, shared_data);
+ fprintf(stderr, "[v3dv on-disk cache] storing %s\n", sha1buf);
}
+ disk_cache_put(disk_cache, cache_key, binary.data, binary.size, NULL);
}
blob_finish(&binary);
@@ -528,7 +542,7 @@ shader_variant_create_from_blob(struct v3dv_device *device,
if (blob->overrun)
return NULL;
- uint ulist_data_size = sizeof(uint32_t) * ulist_count;
+ size_t ulist_data_size = sizeof(uint32_t) * ulist_count;
const void *ulist_data_data = blob_read_bytes(blob, ulist_data_size);
if (blob->overrun)
return NULL;
@@ -564,6 +578,7 @@ v3dv_pipeline_shared_data_create_from_blob(struct v3dv_pipeline_cache *cache,
const unsigned char *sha1_key = blob_read_bytes(blob, 20);
struct v3dv_descriptor_maps *maps[BROADCOM_SHADER_STAGES] = { 0 };
+ struct v3dv_shader_variant *variants[BROADCOM_SHADER_STAGES] = { 0 };
uint8_t descriptor_maps_count = blob_read_uint8(blob);
for (uint8_t count = 0; count < descriptor_maps_count; count++) {
@@ -573,14 +588,14 @@ v3dv_pipeline_shared_data_create_from_blob(struct v3dv_pipeline_cache *cache,
blob_read_bytes(blob, sizeof(struct v3dv_descriptor_maps));
if (blob->overrun)
- return NULL;
+ goto fail;
maps[stage] = vk_zalloc2(&cache->device->vk.alloc, NULL,
sizeof(struct v3dv_descriptor_maps), 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (maps[stage] == NULL)
- return NULL;
+ goto fail;
memcpy(maps[stage], current_maps, sizeof(struct v3dv_descriptor_maps));
if (broadcom_shader_stage_is_render_with_binning(stage)) {
@@ -592,8 +607,6 @@ v3dv_pipeline_shared_data_create_from_blob(struct v3dv_pipeline_cache *cache,
uint8_t variant_count = blob_read_uint8(blob);
- struct v3dv_shader_variant *variants[BROADCOM_SHADER_STAGES] = { 0 };
-
for (uint8_t count = 0; count < variant_count; count++) {
uint8_t stage = blob_read_uint8(blob);
struct v3dv_shader_variant *variant =
@@ -606,10 +619,25 @@ v3dv_pipeline_shared_data_create_from_blob(struct v3dv_pipeline_cache *cache,
blob_read_bytes(blob, total_assembly_size);
if (blob->overrun)
- return NULL;
+ goto fail;
+
+ struct v3dv_pipeline_shared_data *data =
+ v3dv_pipeline_shared_data_new(cache, sha1_key, maps, variants,
+ total_assembly, total_assembly_size);
+
+ if (!data)
+ goto fail;
- return v3dv_pipeline_shared_data_new(cache, sha1_key, maps, variants,
- total_assembly, total_assembly_size);
+ return data;
+
+fail:
+ for (int i = 0; i < BROADCOM_SHADER_STAGES; i++) {
+ if (maps[i])
+ vk_free2(&cache->device->vk.alloc, NULL, maps[i]);
+ if (variants[i])
+ v3dv_shader_variant_destroy(cache->device, variants[i]);
+ }
+ return NULL;
}
static void
@@ -618,7 +646,7 @@ pipeline_cache_load(struct v3dv_pipeline_cache *cache,
const void *data)
{
struct v3dv_device *device = cache->device;
- struct v3dv_physical_device *pdevice = &device->instance->physicalDevice;
+ struct v3dv_physical_device *pdevice = device->pdevice;
struct vk_pipeline_cache_header header;
if (cache->cache == NULL || cache->nir_cache == NULL)
@@ -695,7 +723,7 @@ v3dv_CreatePipelineCache(VkDevice _device,
VK_OBJECT_TYPE_PIPELINE_CACHE);
if (cache == NULL)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
v3dv_pipeline_cache_init(cache, device, pCreateInfo->flags,
device->instance->pipeline_cache_enabled);
@@ -714,7 +742,7 @@ v3dv_CreatePipelineCache(VkDevice _device,
void
v3dv_pipeline_cache_finish(struct v3dv_pipeline_cache *cache)
{
- pthread_mutex_destroy(&cache->mutex);
+ mtx_destroy(&cache->mutex);
if (dump_stats_on_destroy)
cache_dump_stats(cache);
@@ -934,7 +962,7 @@ v3dv_GetPipelineCacheData(VkDevice _device,
blob_init_fixed(&blob, NULL, SIZE_MAX);
}
- struct v3dv_physical_device *pdevice = &device->instance->physicalDevice;
+ struct v3dv_physical_device *pdevice = device->pdevice;
VkResult result = VK_INCOMPLETE;
pipeline_cache_lock(cache);
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
index b5ab7ed2c59..892afcf3ab8 100644
--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@@ -1,5 +1,5 @@
/*
- * Copyright © 2019 Raspberry Pi
+ * Copyright © 2019 Raspberry Pi Ltd
*
* based in part on anv driver which is:
* Copyright © 2015 Intel Corporation
@@ -36,12 +36,24 @@
#include <vulkan/vk_icd.h>
#include <vk_enum_to_str.h>
+#include "vk_descriptor_update_template.h"
#include "vk_device.h"
+#include "vk_device_memory.h"
+#include "vk_format.h"
#include "vk_instance.h"
#include "vk_image.h"
+#include "vk_log.h"
#include "vk_physical_device.h"
#include "vk_shader_module.h"
+#include "vk_sync.h"
+#include "vk_sync_timeline.h"
#include "vk_util.h"
+#include "vk_ycbcr_conversion.h"
+
+#include "vk_command_buffer.h"
+#include "vk_command_pool.h"
+#include "vk_queue.h"
+#include "vk_pipeline.h"
#include <xf86drm.h>
@@ -53,6 +65,13 @@
#define VG(x) ((void)0)
#endif
+#include "util/detect_os.h"
+
+#if DETECT_OS_ANDROID
+#include <vndk/hardware_buffer.h>
+#include "util/u_gralloc/u_gralloc.h"
+#endif
+
#include "v3dv_limits.h"
#include "common/v3d_device_info.h"
@@ -68,8 +87,9 @@
#include "vk_debug_report.h"
#include "util/set.h"
#include "util/hash_table.h"
+#include "util/sparse_array.h"
#include "util/xmlconfig.h"
-#include "u_atomic.h"
+#include "util/u_atomic.h"
#include "v3dv_entrypoints.h"
#include "v3dv_bo.h"
@@ -84,7 +104,7 @@
#include "wsi_common.h"
/* A non-fatal assert. Useful for debugging. */
-#ifdef DEBUG
+#if MESA_DEBUG
#define v3dv_assert(x) ({ \
if (unlikely(!(x))) \
fprintf(stderr, "%s:%d ASSERT: %s", __FILE__, __LINE__, #x); \
@@ -94,7 +114,7 @@
#endif
#define perf_debug(...) do { \
- if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF)) \
+ if (V3D_DBG(PERF)) \
fprintf(stderr, __VA_ARGS__); \
} while (0)
@@ -111,13 +131,15 @@ struct v3d_simulator_file;
/* Minimum required by the Vulkan 1.1 spec */
#define MAX_MEMORY_ALLOCATION_SIZE (1ull << 30)
+/* Maximum performance counters number */
+#define V3D_MAX_PERFCNT 93
+
struct v3dv_physical_device {
struct vk_physical_device vk;
char *name;
int32_t render_fd;
int32_t display_fd;
- int32_t master_fd;
/* We need these because it is not clear how to detect
* valid devids in a portable way
@@ -128,11 +150,19 @@ struct v3dv_physical_device {
dev_t primary_devid;
dev_t render_devid;
+#if using_v3d_simulator
+ uint32_t device_id;
+#endif
+
uint8_t driver_build_sha1[20];
uint8_t pipeline_cache_uuid[VK_UUID_SIZE];
uint8_t device_uuid[VK_UUID_SIZE];
uint8_t driver_uuid[VK_UUID_SIZE];
+ struct vk_sync_type drm_syncobj_type;
+ struct vk_sync_timeline_type sync_timeline_type;
+ const struct vk_sync_type *sync_types[3];
+
struct disk_cache *disk_cache;
mtx_t mutex;
@@ -148,14 +178,41 @@ struct v3dv_physical_device {
const struct v3d_compiler *compiler;
uint32_t next_program_id;
+ alignas(8) uint64_t heap_used;
+
+ /* This array holds all our 'struct v3dv_bo' allocations. We use this
+ * so we can add a refcount to our BOs and check if a particular BO
+ * was already allocated in this device using its GEM handle. This is
+ * necessary to properly manage BO imports, because the kernel doesn't
+ * refcount the underlying BO memory.
+ *
+ * Specifically, when self-importing (i.e. importing a BO into the same
+ * device that created it), the kernel will give us the same BO handle
+ * for both BOs and we must only free it once when both references are
+ * freed. Otherwise, if we are not self-importing, we get two different BO
+ * handles, and we want to free each one individually.
+ *
+ * The BOs in this map all carry a refcnt holding their reference count,
+ * and only self-imported BOs will ever have a refcnt > 1.
+ */
+ struct util_sparse_array bo_map;
+
struct {
bool merge_jobs;
} options;
+
+ struct {
+ bool cpu_queue;
+ bool multisync;
+ bool perfmon;
+ } caps;
};
-VkResult v3dv_physical_device_acquire_display(struct v3dv_instance *instance,
- struct v3dv_physical_device *pdevice,
- VkIcdSurfaceBase *surface);
+static inline struct v3dv_bo *
+v3dv_device_lookup_bo(struct v3dv_physical_device *device, uint32_t handle)
+{
+ return (struct v3dv_bo *) util_sparse_array_get(&device->bo_map, handle);
+}
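/* Editorial example (not part of this change): if an application imports a
 * dma-buf that was exported from this same device, the kernel returns the
 * GEM handle of the existing BO, so v3dv_device_lookup_bo() finds the same
 * struct v3dv_bo and the import only bumps its refcnt (to 2); freeing either
 * reference just drops the count, and the BO is destroyed only when it
 * reaches 0. A BO imported from a different device gets a fresh handle and
 * therefore its own entry in bo_map.
 */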
VkResult v3dv_wsi_init(struct v3dv_physical_device *physical_device);
void v3dv_wsi_finish(struct v3dv_physical_device *physical_device);
@@ -172,64 +229,72 @@ void v3dv_meta_texel_buffer_copy_init(struct v3dv_device *device);
void v3dv_meta_texel_buffer_copy_finish(struct v3dv_device *device);
bool v3dv_meta_can_use_tlb(struct v3dv_image *image,
+ uint8_t plane,
+ uint8_t miplevel,
const VkOffset3D *offset,
+ const VkExtent3D *extent,
VkFormat *compat_format);
struct v3dv_instance {
struct vk_instance vk;
- int physicalDeviceCount;
- struct v3dv_physical_device physicalDevice;
-
bool pipeline_cache_enabled;
bool default_pipeline_cache_enabled;
};
-/* Tracks wait threads spawned from a single vkQueueSubmit call */
-struct v3dv_queue_submit_wait_info {
- /* struct vk_object_base base; ?*/
- struct list_head list_link;
-
- struct v3dv_device *device;
-
- /* List of wait threads spawned for any command buffers in a particular
- * call to vkQueueSubmit.
- */
- uint32_t wait_thread_count;
- struct {
- pthread_t thread;
- bool finished;
- } wait_threads[16];
-
- /* The master wait thread for the entire submit. This will wait for all
- * other threads in this submit to complete before processing signal
- * semaphores and fences.
+/* FIXME: In addition to tracking the last job submitted by GPU queue (cl, csd,
+ * tfu), we still need a syncobj to track the last overall job submitted
+ * (V3DV_QUEUE_ANY) for the case we don't support multisync. Someday we can
+ * start expecting multisync to be present and drop the legacy implementation
+ * together with this V3DV_QUEUE_ANY tracker.
+ */
+enum v3dv_queue_type {
+ V3DV_QUEUE_CL = 0,
+ V3DV_QUEUE_CSD,
+ V3DV_QUEUE_TFU,
+ V3DV_QUEUE_CPU,
+ V3DV_QUEUE_ANY,
+ V3DV_QUEUE_COUNT,
+};
+
+/* For each GPU queue, we use a syncobj to track the last job submitted. We
+ * set the flag `first` to determine when we are starting a new cmd buffer
+ * batch and therefore a job submitted to a given queue will be the first in a
+ * cmd buf batch.
+ */
+struct v3dv_last_job_sync {
+ /* If the job is the first submitted to a GPU queue in a cmd buffer batch.
+ *
+ * We use V3DV_QUEUE_{CL,CSD,TFU} both with and without multisync.
*/
- pthread_t master_wait_thread;
-
- /* List of semaphores (and fence) to signal after all wait threads completed
- * and all command buffer jobs in the submission have been sent to the GPU.
+ bool first[V3DV_QUEUE_COUNT];
+ /* Array of syncobj to track the last job submitted to a GPU queue.
+ *
+ * With multisync we use V3DV_QUEUE_{CL,CSD,TFU} to track syncobjs for each
+ * queue, but without multisync we only track the last job submitted to any
+ * queue in V3DV_QUEUE_ANY.
*/
- uint32_t signal_semaphore_count;
- VkSemaphore *signal_semaphores;
- VkFence fence;
+ uint32_t syncs[V3DV_QUEUE_COUNT];
};
struct v3dv_queue {
- struct vk_object_base base;
+ struct vk_queue vk;
struct v3dv_device *device;
- VkDeviceQueueCreateFlags flags;
- /* A list of active v3dv_queue_submit_wait_info */
- struct list_head submit_wait_list;
-
- /* A mutex to prevent concurrent access to the list of wait threads */
- mtx_t mutex;
+ struct v3dv_last_job_sync last_job_syncs;
struct v3dv_job *noop_job;
+
+ /* The last active perfmon ID to prevent mixing of counter results when a
+ * job is submitted with a different perfmon id.
+ */
+ uint32_t last_perfmon_id;
};
+VkResult v3dv_queue_driver_submit(struct vk_queue *vk_queue,
+ struct vk_queue_submit *submit);
+
#define V3DV_META_BLIT_CACHE_KEY_SIZE (4 * sizeof(uint32_t))
#define V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE (3 * sizeof(uint32_t) + \
sizeof(VkComponentMapping))
@@ -261,27 +326,27 @@ struct v3dv_meta_texel_buffer_copy_pipeline {
};
struct v3dv_pipeline_key {
- bool robust_buffer_access;
uint8_t topology;
uint8_t logicop_func;
bool msaa;
- bool sample_coverage;
bool sample_alpha_to_coverage;
bool sample_alpha_to_one;
uint8_t cbufs;
struct {
enum pipe_format format;
- const uint8_t *swizzle;
+ uint8_t swizzle[4];
} color_fmt[V3D_MAX_DRAW_BUFFERS];
uint8_t f32_color_rb;
uint32_t va_swap_rb_mask;
bool has_multiview;
+ bool line_smooth;
};
struct v3dv_pipeline_cache_stats {
uint32_t miss;
uint32_t hit;
uint32_t count;
+ uint32_t on_disk_hit;
};
/* Equivalent to gl_shader_stage, but including the coordinate shaders
@@ -411,11 +476,11 @@ struct v3dv_device {
struct v3d_device_info devinfo;
struct v3dv_queue queue;
- /* A sync object to track the last job submitted to the GPU. */
- uint32_t last_job_sync;
+ /* Guards query->maybe_available and value for timestamps */
+ mtx_t query_mutex;
- /* A mutex to prevent concurrent access to last_job_sync from the queue */
- mtx_t mutex;
+ /* Signaled whenever a query is ended */
+ cnd_t query_ended;
/* Resources used for meta operations */
struct {
@@ -457,37 +522,107 @@ struct v3dv_device {
uint32_t bo_size;
uint32_t bo_count;
+ /* Event handling resources.
+ *
+ * Our implementation of events uses a BO to store event state (signaled vs
+ * reset) and dispatches compute shaders to handle GPU event functions
+ * (signal, reset, wait). This struct holds all the resources required
+ * by the implementation.
+ */
+ struct {
+ mtx_t lock;
+
+ /* BO for the event states: signaled (1) or reset (0) */
+ struct v3dv_bo *bo;
+
+ /* We pre-allocate as many events as fit in the BO we create to track
+ * their states; each event has an index that is essentially the offset
+ * of its state in that BO. We keep a free list with the pre-allocated
+ * events that are still available.
+ */
+ uint32_t event_count;
+ struct v3dv_event *events;
+ struct list_head free_list;
+
+ /* Vulkan resources to access the event BO from shaders. We have a
+ * pipeline that sets the state of an event and another that waits on
+ * a single event. Both pipelines require access to the event state BO,
+ * for which we need to allocate a single descriptor set.
+ */
+ VkBuffer buffer;
+ VkDeviceMemory mem;
+ VkDescriptorSetLayout descriptor_set_layout;
+ VkPipelineLayout pipeline_layout;
+ VkDescriptorPool descriptor_pool;
+ VkDescriptorSet descriptor_set;
+ VkPipeline set_event_pipeline;
+ VkPipeline wait_event_pipeline;
+ } events;
+
+ /* Query handling resources.
+ *
+ * Our implementation of occlusion queries uses a BO per pool to keep track
+ * of the per-query availability state and dispatches compute shaders to
+ * handle GPU query functions that read and write that state. This struct
+ * holds Vulkan resources that can be shared across all query pools to
+ * implement this. This framework may be extended in the future to handle
+ * more query types.
+ */
+ struct {
+ VkDescriptorSetLayout buf_descriptor_set_layout;
+
+ /* Set query availability */
+ VkPipelineLayout avail_pipeline_layout;
+ VkPipeline avail_pipeline;
+
+ /* Reset query availability and clear occlusion counters */
+ VkPipelineLayout reset_occlusion_pipeline_layout;
+ VkPipeline reset_occlusion_pipeline;
+
+ /* Copy query results */
+ VkPipelineLayout copy_pipeline_layout;
+ VkPipeline copy_pipeline[8];
+ } queries;
+
struct v3dv_pipeline_cache default_pipeline_cache;
- /* GL_SHADER_STATE_RECORD needs to speficy default attribute values. The
+ /* GL_SHADER_STATE_RECORD needs to specify default attribute values. The
 * following covers the most common case, that is all attribute formats
 * being float, allowing us to reuse the same BO for all
* pipelines matching this requirement. Pipelines that need integer
* attributes will create their own BO.
+ *
+ * Note that since v71 the default attribute values are not needed, so this
+ * can be NULL.
*/
struct v3dv_bo *default_attribute_float;
- VkPhysicalDeviceFeatures features;
+
+ void *device_address_mem_ctx;
+ struct util_dynarray device_address_bo_list; /* Array of struct v3dv_bo * */
+
+#if DETECT_OS_ANDROID
+ struct u_gralloc *gralloc;
+#endif
};
struct v3dv_device_memory {
- struct vk_object_base base;
+ struct vk_device_memory vk;
struct v3dv_bo *bo;
const VkMemoryType *type;
- bool has_bo_ownership;
bool is_for_wsi;
+ bool is_for_device_address;
};
#define V3D_OUTPUT_IMAGE_FORMAT_NO 255
#define TEXTURE_DATA_FORMAT_NO 255
-struct v3dv_format {
- bool supported;
-
- /* One of V3D33_OUTPUT_IMAGE_FORMAT_*, or OUTPUT_IMAGE_FORMAT_NO */
+#define V3DV_MAX_PLANE_COUNT 3
+struct v3dv_format_plane {
+ /* One of V3D42_OUTPUT_IMAGE_FORMAT_*, or OUTPUT_IMAGE_FORMAT_NO */
uint8_t rt_type;
- /* One of V3D33_TEXTURE_DATA_FORMAT_*. */
+ /* One of V3D42_TEXTURE_DATA_FORMAT_*. */
uint8_t tex_type;
/* Swizzle to apply to the RGBA shader output for storing to the tile
@@ -499,15 +634,54 @@ struct v3dv_format {
/* Whether the return value is 16F/I/UI or 32F/I/UI. */
uint8_t return_size;
+};
+
+struct v3dv_format {
+ /* Non 0 plane count implies supported */
+ uint8_t plane_count;
+
+ struct v3dv_format_plane planes[V3DV_MAX_PLANE_COUNT];
/* If the format supports (linear) filtering when texturing. */
bool supports_filtering;
};
+/* Note that although VkImageAspectFlags would allow combining more than one
+ * PLANE bit, for all the use cases we implement that use VkImageAspectFlags,
+ * only one plane is allowed, like for example vkCmdCopyImage:
+ *
+ * "If srcImage has a VkFormat with two planes then for each element of
+ * pRegions, srcSubresource.aspectMask must be VK_IMAGE_ASPECT_PLANE_0_BIT
+ * or VK_IMAGE_ASPECT_PLANE_1_BIT"
+ *
+ */
+static uint8_t v3dv_plane_from_aspect(VkImageAspectFlags aspect)
+{
+ switch (aspect) {
+ case VK_IMAGE_ASPECT_COLOR_BIT:
+ case VK_IMAGE_ASPECT_DEPTH_BIT:
+ case VK_IMAGE_ASPECT_STENCIL_BIT:
+ case VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT:
+ case VK_IMAGE_ASPECT_PLANE_0_BIT:
+ case VK_IMAGE_ASPECT_MEMORY_PLANE_0_BIT_EXT:
+ return 0;
+ case VK_IMAGE_ASPECT_PLANE_1_BIT:
+ case VK_IMAGE_ASPECT_MEMORY_PLANE_1_BIT_EXT:
+ return 1;
+ case VK_IMAGE_ASPECT_PLANE_2_BIT:
+ case VK_IMAGE_ASPECT_MEMORY_PLANE_2_BIT_EXT:
+ return 2;
+ default:
+ unreachable("invalid image aspect");
+ }
+}
+
struct v3d_resource_slice {
uint32_t offset;
uint32_t stride;
uint32_t padded_height;
+ uint32_t width;
+ uint32_t height;
/* Size of a single pane of the slice. For 3D textures, there will be
* a number of panes equal to the minified, power-of-two-aligned
* depth.
@@ -518,24 +692,85 @@ struct v3d_resource_slice {
uint32_t padded_height_of_output_image_in_uif_blocks;
};
+bool v3dv_format_swizzle_needs_rb_swap(const uint8_t *swizzle);
+bool v3dv_format_swizzle_needs_reverse(const uint8_t *swizzle);
+
struct v3dv_image {
struct vk_image vk;
const struct v3dv_format *format;
- uint32_t cpp;
bool tiled;
- struct v3d_resource_slice slices[V3D_MAX_MIP_LEVELS];
- uint64_t size; /* Total size in bytes */
- uint32_t cube_map_stride;
+ uint8_t plane_count;
- struct v3dv_device_memory *mem;
- VkDeviceSize mem_offset;
- uint32_t alignment;
+ /* If 0, this is a multi-plane image using disjoint memory, where each
+ * plane binds a different device memory. Otherwise, all the planes share
+ * the same device memory and this stores the total size of the image in
+ * bytes.
+ */
+ uint32_t non_disjoint_size;
+
+ struct {
+ uint32_t cpp;
+
+ struct v3d_resource_slice slices[V3D_MAX_MIP_LEVELS];
+ /* Total size of the plane in bytes. */
+ uint64_t size;
+ uint32_t cube_map_stride;
+
+ /* If not using disjoint memory, mem and mem_offset is the same for all
+ * planes, in which case mem_offset is the offset of plane 0.
+ */
+ struct v3dv_device_memory *mem;
+ VkDeviceSize mem_offset;
+ uint32_t alignment;
+
+ /* Pre-subsampled per plane width and height
+ */
+ uint32_t width;
+ uint32_t height;
+
+ /* Even if we can get it from the parent image format, we keep the
+ * format here for convenience
+ */
+ VkFormat vk_format;
+ } planes[V3DV_MAX_PLANE_COUNT];
+
+ /* Used only when sampling a linear texture (which V3D doesn't support).
+ * This holds a tiled copy of the image we can use for that purpose.
+ */
+ struct v3dv_image *shadow;
+
+#if DETECT_OS_ANDROID
+   /* Image is backed by VK_ANDROID_native_buffer. */
+ bool is_native_buffer_memory;
+ /* Image is backed by VK_ANDROID_external_memory_android_hardware_buffer */
+ bool is_ahb;
+ VkImageDrmFormatModifierExplicitCreateInfoEXT *android_explicit_layout;
+ VkSubresourceLayout *android_plane_layouts;
+#endif
};
+VkResult
+v3dv_image_init(struct v3dv_device *device,
+ const VkImageCreateInfo *pCreateInfo,
+ const VkAllocationCallbacks *pAllocator,
+ struct v3dv_image *image);
+
VkImageViewType v3dv_image_type_to_view_type(VkImageType type);
+static uint32_t
+v3dv_image_aspect_to_plane(const struct v3dv_image *image,
+ VkImageAspectFlagBits aspect)
+{
+ assert(util_bitcount(aspect) == 1 && (aspect & image->vk.aspects));
+
+ /* Because we always put image and view planes in aspect-bit-order, the
+ * plane index is the number of bits in the image aspect before aspect.
+ */
+ return util_bitcount(image->vk.aspects & (aspect - 1));
+}
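A worked example of the bit-counting trick above, using the standard Vulkan aspect values (illustrative only):

/* VK_IMAGE_ASPECT_PLANE_0_BIT = 0x10, PLANE_1_BIT = 0x20, PLANE_2_BIT = 0x40.
 * For a 3-plane image vk.aspects = 0x70; for aspect = PLANE_2_BIT:
 *    util_bitcount(0x70 & (0x40 - 1)) = util_bitcount(0x30) = 2
 * i.e. the third plane, as expected.
 */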
+
/* Pre-generating packets needs to consider changes in packet sizes across hw
* versions. Keep things simple and allocate enough space for any supported
* version. We ensure the size is large enough through static asserts.
@@ -553,31 +788,50 @@ struct v3dv_image_view {
struct vk_image_view vk;
const struct v3dv_format *format;
- bool swap_rb;
- uint32_t internal_bpp;
- uint32_t internal_type;
- uint32_t offset;
- /* Precomputed (composed from createinfo->components and formar swizzle)
- * swizzles to pass in to the shader key.
- *
- * This could be also included on the descriptor bo, but the shader state
- * packet doesn't need it on a bo, so we can just avoid a memory copy
- */
- uint8_t swizzle[4];
+ uint8_t view_swizzle[4];
- /* Prepacked TEXTURE_SHADER_STATE. It will be copied to the descriptor info
- * during UpdateDescriptorSets.
- *
- * Empirical tests show that cube arrays need a different shader state
- * depending on whether they are used with a sampler or not, so for these
- * we generate two states and select the one to use based on the descriptor
- * type.
+ uint8_t plane_count;
+ struct {
+ uint8_t image_plane;
+
+ bool swap_rb;
+ bool channel_reverse;
+ uint32_t internal_bpp;
+ uint32_t internal_type;
+ uint32_t offset;
+
+ /* Precomputed swizzle (composed from the view swizzle and the format
+ * swizzle).
+ *
+       * This could also be included in the descriptor bo, but the shader state
+       * packet doesn't need it there, so we can just avoid a memory copy.
+ */
+ uint8_t swizzle[4];
+
+ /* Prepacked TEXTURE_SHADER_STATE. It will be copied to the descriptor info
+ * during UpdateDescriptorSets.
+ *
+ * Empirical tests show that cube arrays need a different shader state
+ * depending on whether they are used with a sampler or not, so for these
+ * we generate two states and select the one to use based on the descriptor
+ * type.
+ */
+ uint8_t texture_shader_state[2][V3DV_TEXTURE_SHADER_STATE_LENGTH];
+ } planes[V3DV_MAX_PLANE_COUNT];
+
+ /* Used only when sampling a linear texture (which V3D doesn't support).
+ * This would represent a view over the tiled shadow image.
*/
- uint8_t texture_shader_state[2][V3DV_TEXTURE_SHADER_STATE_LENGTH];
+ struct v3dv_image_view *shadow;
};
-uint32_t v3dv_layer_offset(const struct v3dv_image *image, uint32_t level, uint32_t layer);
+VkResult v3dv_create_image_view(struct v3dv_device *device,
+ const VkImageViewCreateInfo *pCreateInfo,
+ VkImageView *pView);
+
+uint32_t v3dv_layer_offset(const struct v3dv_image *image, uint32_t level, uint32_t layer,
+ uint8_t plane);
struct v3dv_buffer {
struct vk_object_base base;
@@ -590,6 +844,15 @@ struct v3dv_buffer {
VkDeviceSize mem_offset;
};
+void
+v3dv_buffer_init(struct v3dv_device *device,
+ const VkBufferCreateInfo *pCreateInfo,
+ struct v3dv_buffer *buffer,
+ uint32_t alignment);
+
+void
+v3dv_buffer_bind_memory(const VkBindBufferMemoryInfo *info);
+
struct v3dv_buffer_view {
struct vk_object_base base;
@@ -622,6 +885,8 @@ struct v3dv_subpass {
struct v3dv_subpass_attachment *resolve_attachments;
struct v3dv_subpass_attachment ds_attachment;
+ struct v3dv_subpass_attachment ds_resolve_attachment;
+ bool resolve_depth, resolve_stencil;
/* If we need to emit the clear of the depth/stencil attachment using a
* a draw call instead of using the TLB (GFXH-1461).
@@ -634,7 +899,7 @@ struct v3dv_subpass {
};
struct v3dv_render_pass_attachment {
- VkAttachmentDescription desc;
+ VkAttachmentDescription2 desc;
uint32_t first_subpass;
uint32_t last_subpass;
@@ -650,10 +915,11 @@ struct v3dv_render_pass_attachment {
uint32_t last_subpass;
} views[MAX_MULTIVIEW_VIEW_COUNT];
- /* If this is a multismapled attachment that is going to be resolved,
- * whether we can use the TLB resolve on store.
+ /* If this is a multisampled attachment that is going to be resolved,
+ * whether we may be able to use the TLB hardware resolve based on the
+ * attachment format.
*/
- bool use_tlb_resolve;
+ bool try_tlb_resolve;
};
struct v3dv_render_pass {
@@ -678,7 +944,7 @@ struct v3dv_framebuffer {
uint32_t layers;
/* Typically, edge tiles in the framebuffer have padding depending on the
- * underlying tiling layout. One consequnce of this is that when the
+ * underlying tiling layout. One consequence of this is that when the
* framebuffer dimensions are not aligned to tile boundaries, tile stores
* would still write full tiles on the edges and write to the padded area.
* If the framebuffer is aliasing a smaller region of a larger image, then
@@ -690,6 +956,11 @@ struct v3dv_framebuffer {
uint32_t attachment_count;
uint32_t color_attachment_count;
+
+ /* Notice that elements in 'attachments' will be NULL if the framebuffer
+ * was created imageless. The driver is expected to access attachment info
+ * from the command buffer state instead.
+ */
struct v3dv_image_view *attachments[0];
};
@@ -699,7 +970,9 @@ struct v3dv_frame_tiling {
uint32_t layers;
uint32_t render_target_count;
uint32_t internal_bpp;
+ uint32_t total_color_bpp;
bool msaa;
+ bool double_buffer;
uint32_t tile_width;
uint32_t tile_height;
uint32_t draw_tiles_x;
@@ -710,22 +983,26 @@ struct v3dv_frame_tiling {
uint32_t frame_height_in_supertiles;
};
-void v3dv_framebuffer_compute_internal_bpp_msaa(const struct v3dv_framebuffer *framebuffer,
- const struct v3dv_subpass *subpass,
- uint8_t *max_bpp, bool *msaa);
-
bool v3dv_subpass_area_is_tile_aligned(struct v3dv_device *device,
const VkRect2D *area,
struct v3dv_framebuffer *fb,
struct v3dv_render_pass *pass,
uint32_t subpass_idx);
-struct v3dv_cmd_pool {
- struct vk_object_base base;
-
- VkAllocationCallbacks alloc;
- struct list_head cmd_buffers;
-};
+/* Checks if we need to emit 2 initial tile clears for double buffer mode.
+ * This happens when we render at least 2 tiles, because in this mode each
+ * tile uses a different half of the tile buffer memory so we can have 2 tiles
+ * in flight (one being stored to memory and the next being rendered). In this
+ * scenario, if we emit a single initial tile clear we would only clear the
+ * first half of the tile buffer.
+ */
+static inline bool
+v3dv_do_double_initial_tile_clear(const struct v3dv_frame_tiling *tiling)
+{
+ return tiling->double_buffer &&
+ (tiling->draw_tiles_x > 1 || tiling->draw_tiles_y > 1 ||
+ tiling->layers > 1);
+}
enum v3dv_cmd_buffer_status {
V3DV_CMD_BUFFER_STATUS_NEW = 0,
@@ -748,100 +1025,67 @@ struct v3dv_cmd_buffer_attachment_state {
/* The hardware clear value */
union v3dv_clear_value clear_value;
+
+ /* The underlying image view (from the framebuffer or, if imageless
+    * framebuffer is used, from VkRenderPassAttachmentBeginInfo).
+ */
+ struct v3dv_image_view *image_view;
+
+ /* If this is a multisampled attachment with a resolve operation. */
+ bool has_resolve;
+
+ /* If this is a multisampled attachment with a resolve operation,
+ * whether we can use the TLB for the resolve.
+ */
+ bool use_tlb_resolve;
};
+/* Cached values derived from Vulkan viewport/count */
struct v3dv_viewport_state {
- uint32_t count;
- VkViewport viewports[MAX_VIEWPORTS];
float translate[MAX_VIEWPORTS][3];
float scale[MAX_VIEWPORTS][3];
};
-struct v3dv_scissor_state {
- uint32_t count;
- VkRect2D scissors[MAX_SCISSORS];
-};
-
-/* Mostly a v3dv mapping of VkDynamicState, used to track which data as
- * defined as dynamic
- */
-enum v3dv_dynamic_state_bits {
- V3DV_DYNAMIC_VIEWPORT = 1 << 0,
- V3DV_DYNAMIC_SCISSOR = 1 << 1,
- V3DV_DYNAMIC_STENCIL_COMPARE_MASK = 1 << 2,
- V3DV_DYNAMIC_STENCIL_WRITE_MASK = 1 << 3,
- V3DV_DYNAMIC_STENCIL_REFERENCE = 1 << 4,
- V3DV_DYNAMIC_BLEND_CONSTANTS = 1 << 5,
- V3DV_DYNAMIC_DEPTH_BIAS = 1 << 6,
- V3DV_DYNAMIC_LINE_WIDTH = 1 << 7,
- V3DV_DYNAMIC_COLOR_WRITE_ENABLE = 1 << 8,
- V3DV_DYNAMIC_ALL = (1 << 9) - 1,
-};
-
-/* Flags for dirty pipeline state.
+/* Flags for custom dirty state that could lead to packet emission.
+ *
+ * Note *custom*: for all the dynamic state tracking coming from the Vulkan
+ * API we use the Mesa runtime framework and its predefined flags
+ * (MESA_VK_DYNAMIC_XXX).
+ *
+ * Here we define additional flags used to track dirty state.
*/
enum v3dv_cmd_dirty_bits {
- V3DV_CMD_DIRTY_VIEWPORT = 1 << 0,
- V3DV_CMD_DIRTY_SCISSOR = 1 << 1,
- V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK = 1 << 2,
- V3DV_CMD_DIRTY_STENCIL_WRITE_MASK = 1 << 3,
- V3DV_CMD_DIRTY_STENCIL_REFERENCE = 1 << 4,
- V3DV_CMD_DIRTY_PIPELINE = 1 << 5,
- V3DV_CMD_DIRTY_COMPUTE_PIPELINE = 1 << 6,
- V3DV_CMD_DIRTY_VERTEX_BUFFER = 1 << 7,
- V3DV_CMD_DIRTY_INDEX_BUFFER = 1 << 8,
- V3DV_CMD_DIRTY_DESCRIPTOR_SETS = 1 << 9,
- V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS = 1 << 10,
- V3DV_CMD_DIRTY_PUSH_CONSTANTS = 1 << 11,
- V3DV_CMD_DIRTY_BLEND_CONSTANTS = 1 << 12,
- V3DV_CMD_DIRTY_OCCLUSION_QUERY = 1 << 13,
- V3DV_CMD_DIRTY_DEPTH_BIAS = 1 << 14,
- V3DV_CMD_DIRTY_LINE_WIDTH = 1 << 15,
- V3DV_CMD_DIRTY_VIEW_INDEX = 1 << 16,
- V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE = 1 << 17,
+ V3DV_CMD_DIRTY_PIPELINE = 1 << 0,
+ V3DV_CMD_DIRTY_COMPUTE_PIPELINE = 1 << 1,
+ V3DV_CMD_DIRTY_VERTEX_BUFFER = 1 << 2,
+ V3DV_CMD_DIRTY_INDEX_BUFFER = 1 << 3,
+ V3DV_CMD_DIRTY_DESCRIPTOR_SETS = 1 << 4,
+ V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS = 1 << 5,
+ V3DV_CMD_DIRTY_PUSH_CONSTANTS = 1 << 6,
+ V3DV_CMD_DIRTY_PUSH_CONSTANTS_UBO = 1 << 7,
+ V3DV_CMD_DIRTY_OCCLUSION_QUERY = 1 << 8,
+ V3DV_CMD_DIRTY_VIEW_INDEX = 1 << 9,
+ V3DV_CMD_DIRTY_DRAW_ID = 1 << 10,
+ V3DV_CMD_DIRTY_ALL = (1 << 10) - 1,
};
struct v3dv_dynamic_state {
- /**
- * Bitmask of (1 << VK_DYNAMIC_STATE_*).
- * Defines the set of saved dynamic state.
+   /* FIXME: we keep some viewport info cached (translate, scale) because we
+    * use it in more than one place. But note that translate_z and scale_z
+    * are also used in several places, and we recompute them based on
+    * scissor/viewport info all the time. So perhaps we could do the same with
+    * the x and y components.
*/
- uint32_t mask;
-
struct v3dv_viewport_state viewport;
- struct v3dv_scissor_state scissor;
-
- struct {
- uint32_t front;
- uint32_t back;
- } stencil_compare_mask;
-
- struct {
- uint32_t front;
- uint32_t back;
- } stencil_write_mask;
-
- struct {
- uint32_t front;
- uint32_t back;
- } stencil_reference;
-
- float blend_constants[4];
-
- struct {
- float constant_factor;
- float depth_bias_clamp;
- float slope_factor;
- } depth_bias;
-
- float line_width;
-
+   /* We cache the color_write_enable because the Vulkan runtime keeps an
+    * 8-bit bitset with a bit per attachment, but in order to combine it with
+    * the color_write_masks it is easier to cache a 32-bit bitset with 4 bits
+    * per attachment.
+ */
uint32_t color_write_enable;
};
-extern const struct v3dv_dynamic_state default_dynamic_state;
-
void v3dv_viewport_compute_xform(const VkViewport *viewport,
float scale[3],
float translate[3]);
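For reference, a minimal sketch of the standard Vulkan viewport transform such a helper computes (assumed shape; the driver's actual implementation may differ, e.g. to avoid a zero depth scale):

static void
example_viewport_compute_xform(const VkViewport *vp,
                               float scale[3], float translate[3])
{
   /* Standard Vulkan mapping from NDC to framebuffer coordinates. */
   scale[0] = vp->width * 0.5f;
   scale[1] = vp->height * 0.5f;
   scale[2] = vp->maxDepth - vp->minDepth;
   translate[0] = vp->x + vp->width * 0.5f;
   translate[1] = vp->y + vp->height * 0.5f;
   translate[2] = vp->minDepth;
}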
@@ -855,15 +1099,12 @@ enum v3dv_ez_state {
enum v3dv_job_type {
V3DV_JOB_TYPE_GPU_CL = 0,
- V3DV_JOB_TYPE_GPU_CL_SECONDARY,
+ V3DV_JOB_TYPE_GPU_CL_INCOMPLETE,
V3DV_JOB_TYPE_GPU_TFU,
V3DV_JOB_TYPE_GPU_CSD,
V3DV_JOB_TYPE_CPU_RESET_QUERIES,
V3DV_JOB_TYPE_CPU_END_QUERY,
V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS,
- V3DV_JOB_TYPE_CPU_SET_EVENT,
- V3DV_JOB_TYPE_CPU_WAIT_EVENTS,
- V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE,
V3DV_JOB_TYPE_CPU_CSD_INDIRECT,
V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY,
};
@@ -874,7 +1115,7 @@ struct v3dv_reset_query_cpu_job_info {
uint32_t count;
};
-struct v3dv_end_query_cpu_job_info {
+struct v3dv_end_query_info {
struct v3dv_query_pool *pool;
uint32_t query;
@@ -892,31 +1133,14 @@ struct v3dv_copy_query_results_cpu_job_info {
VkQueryResultFlags flags;
};
-struct v3dv_event_set_cpu_job_info {
- struct v3dv_event *event;
- int state;
-};
-
-struct v3dv_event_wait_cpu_job_info {
- /* List of events to wait on */
- uint32_t event_count;
- struct v3dv_event **events;
-
- /* Whether any postponed jobs after the wait should wait on semaphores */
- bool sem_wait;
-};
+struct v3dv_submit_sync_info {
+ /* List of syncs to wait before running a job */
+ uint32_t wait_count;
+ struct vk_sync_wait *waits;
-struct v3dv_copy_buffer_to_image_cpu_job_info {
- struct v3dv_image *image;
- struct v3dv_buffer *buffer;
- uint32_t buffer_offset;
- uint32_t buffer_stride;
- uint32_t buffer_layer_stride;
- VkOffset3D image_offset;
- VkExtent3D image_extent;
- uint32_t mip_level;
- uint32_t base_layer;
- uint32_t layer_count;
+ /* List of syncs to signal when all jobs complete */
+ uint32_t signal_count;
+ struct vk_sync_signal *signals;
};
struct v3dv_csd_indirect_cpu_job_info {
@@ -936,6 +1160,19 @@ struct v3dv_timestamp_query_cpu_job_info {
uint32_t count;
};
+/* Number of perfmons required to handle all supported performance counters */
+#define V3DV_MAX_PERFMONS DIV_ROUND_UP(V3D_MAX_PERFCNT, \
+ DRM_V3D_MAX_PERF_COUNTERS)
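A quick arithmetic illustration of the macro above (the numbers are hypothetical; the real values come from the performance counter headers and the DRM uAPI):

/* Example: with V3D_MAX_PERFCNT = 87 and DRM_V3D_MAX_PERF_COUNTERS = 32,
 * DIV_ROUND_UP(87, 32) = 3, so up to 3 kernel perfmons per query.
 */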
+
+struct v3dv_perf_query {
+ uint32_t kperfmon_ids[V3DV_MAX_PERFMONS];
+
+ /* A DRM syncobj to wait on the GPU jobs for which we are collecting
+ * performance data.
+ */
+ struct vk_sync *last_job_sync;
+};
+
struct v3dv_job {
struct list_head list_link;
@@ -945,6 +1182,61 @@ struct v3dv_job {
*/
bool is_clone;
+   /* If this is a cloned job, whether it has its own BCL resource. This
+    * happens when we suspend jobs in command buffers with the
+ * VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT flag.
+ */
+ bool clone_owns_bcl;
+
+ /* VK_KHR_dynamic_rendering */
+ bool suspending;
+ bool resuming;
+ struct v3dv_cl_out *suspend_branch_inst_ptr;
+ uint32_t suspended_bcl_end;
+
+ /* If the job executes on the transfer stage of the pipeline */
+ bool is_transfer;
+
+ /* VK_KHR_buffer_device_address allows shaders to use pointers that can
+ * dereference memory in any buffer that has been flagged with
+ * VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT. These buffers may not
+ * be bound via descriptor sets, so we need to make sure that a job that
+ * uses this functionality includes all these buffers in its kernel
+ * submission.
+ */
+ bool uses_buffer_device_address;
+
+ /* True if we have not identified anything that would be incompatible
+ * with double-buffer (like MSAA) or that would make double-buffer mode
+ * not efficient (like tile loads or not having any stores).
+ */
+ bool can_use_double_buffer;
+
+ /* This structure keeps track of various scores to inform a heuristic
+ * for double-buffer mode.
+ */
+ struct {
+ /* Cost of geometry shading */
+ uint32_t geom;
+ /* Cost of shader rendering */
+ uint32_t render;
+ } double_buffer_score;
+
+ /* We only need to allocate tile state for all layers if the binner
+ * writes primitives to layers other than the first. This can only be
+ * done using layered rendering (writing gl_Layer from a geometry shader),
+ * so for other cases of multilayered framebuffers (typically with
+ * meta copy/clear operations) that won't use layered rendering, we only
+    * need one layer's worth of tile state for the binner.
+ */
+ bool allocate_tile_state_for_all_layers;
+
+ /* A pointer to the location of the TILE_BINNING_MODE_CFG packet so we can
+ * rewrite it to enable double-buffer mode by the time we have enough info
+ * about the job to make that decision.
+ */
+ struct v3dv_cl_out *bcl_tile_binning_mode_ptr;
+
enum v3dv_job_type type;
struct v3dv_device *device;
@@ -988,6 +1280,9 @@ struct v3dv_job {
*/
bool decided_global_ez_enable;
+ /* If the job emitted any draw calls with Early Z/S enabled */
+ bool has_ez_draws;
+
/* If this job has been configured to use early Z/S clear */
bool early_zs_clear;
@@ -1000,8 +1295,10 @@ struct v3dv_job {
*/
bool always_flush;
- /* Whether we need to serialize this job in our command stream */
- bool serialize;
+ /* A mask of V3DV_BARRIER_* indicating the source(s) of the barrier. We
+ * can use this to select the hw queues where we need to serialize the job.
+ */
+ uint8_t serialize;
/* If this is a CL job, whether we should sync before binning */
bool needs_bcl_sync;
@@ -1009,11 +1306,8 @@ struct v3dv_job {
/* Job specs for CPU jobs */
union {
struct v3dv_reset_query_cpu_job_info query_reset;
- struct v3dv_end_query_cpu_job_info query_end;
+ struct v3dv_end_query_info query_end;
struct v3dv_copy_query_results_cpu_job_info query_copy_results;
- struct v3dv_event_set_cpu_job_info event_set;
- struct v3dv_event_wait_cpu_job_info event_wait;
- struct v3dv_copy_buffer_to_image_cpu_job_info copy_buffer_to_image;
struct v3dv_csd_indirect_cpu_job_info csd_indirect;
struct v3dv_timestamp_query_cpu_job_info query_timestamp;
} cpu;
@@ -1028,6 +1322,9 @@ struct v3dv_job {
uint32_t wg_base[3];
struct drm_v3d_submit_csd submit;
} csd;
+
+ /* Perfmons with last job sync for CSD and CL jobs */
+ struct v3dv_perf_query *perf;
};
void v3dv_job_init(struct v3dv_job *job,
@@ -1045,10 +1342,17 @@ void v3dv_job_start_frame(struct v3dv_job *job,
uint32_t height,
uint32_t layers,
bool allocate_tile_state_for_all_layers,
+ bool allocate_tile_state_now,
uint32_t render_target_count,
uint8_t max_internal_bpp,
+ uint8_t total_color_bpp,
bool msaa);
+bool v3dv_job_type_is_gpu(struct v3dv_job *job);
+
+struct v3dv_job *
+v3dv_job_clone(struct v3dv_job *job, bool skip_bcl);
+
struct v3dv_job *
v3dv_job_clone_in_cmd_buffer(struct v3dv_job *job,
struct v3dv_cmd_buffer *cmd_buffer);
@@ -1065,7 +1369,26 @@ v3dv_cmd_buffer_ensure_array_state(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t *alloc_count,
void **ptr);
-void v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer);
+void v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer,
+ bool indexed, bool indirect,
+ uint32_t vertex_count);
+
+bool v3dv_job_allocate_tile_state(struct v3dv_job *job);
+
+void
+v3dv_setup_dynamic_framebuffer(struct v3dv_cmd_buffer *cmd_buffer,
+ const VkRenderingInfoKHR *pRenderingInfo);
+
+void
+v3dv_destroy_dynamic_framebuffer(struct v3dv_cmd_buffer *cmd_buffer);
+
+void
+v3dv_setup_dynamic_render_pass(struct v3dv_cmd_buffer *cmd_buffer,
+ const VkRenderingInfoKHR *pRenderingInfo);
+
+void
+v3dv_setup_dynamic_render_pass_inheritance(struct v3dv_cmd_buffer *cmd_buffer,
+ const VkCommandBufferInheritanceRenderingInfo *info);
/* FIXME: only used on v3dv_cmd_buffer and v3dvx_cmd_buffer, perhaps move to a
* cmd_buffer specific header?
@@ -1094,9 +1417,46 @@ struct v3dv_cmd_pipeline_state {
struct v3dv_descriptor_state descriptor_state;
};
+enum {
+ V3DV_BARRIER_GRAPHICS_BIT = (1 << 0),
+ V3DV_BARRIER_COMPUTE_BIT = (1 << 1),
+ V3DV_BARRIER_TRANSFER_BIT = (1 << 2),
+ V3DV_BARRIER_CPU_BIT = (1 << 3),
+};
+#define V3DV_BARRIER_ALL (V3DV_BARRIER_GRAPHICS_BIT | \
+ V3DV_BARRIER_TRANSFER_BIT | \
+ V3DV_BARRIER_COMPUTE_BIT | \
+ V3DV_BARRIER_CPU_BIT);
+
+struct v3dv_barrier_state {
+ /* Mask of V3DV_BARRIER_* indicating where we consume a barrier. */
+ uint8_t dst_mask;
+
+ /* For each possible consumer of a barrier, a mask of V3DV_BARRIER_*
+ * indicating the sources of the dependency.
+ */
+ uint8_t src_mask_graphics;
+ uint8_t src_mask_transfer;
+ uint8_t src_mask_compute;
+
+ /* For graphics barriers, access masks involved. Used to decide if we need
+ * to execute a binning or render barrier.
+ */
+ VkAccessFlags2 bcl_buffer_access;
+ VkAccessFlags2 bcl_image_access;
+};
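As a hedged sketch (not the driver's actual code) of how a synchronization2 stage mask could be folded into the V3DV_BARRIER_* bits above:

static uint8_t
example_barrier_mask_from_stages(VkPipelineStageFlags2 stages)
{
   uint8_t mask = 0;
   /* Transfer-type stages map to the transfer bit. */
   if (stages & (VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT |
                 VK_PIPELINE_STAGE_2_COPY_BIT))
      mask |= V3DV_BARRIER_TRANSFER_BIT;
   /* Compute dispatches map to the compute (CSD) bit. */
   if (stages & VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT)
      mask |= V3DV_BARRIER_COMPUTE_BIT;
   /* Any graphics stage maps to the graphics (CL) bit. */
   if (stages & (VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT |
                 VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT))
      mask |= V3DV_BARRIER_GRAPHICS_BIT;
   /* Host access maps to the CPU bit. */
   if (stages & VK_PIPELINE_STAGE_2_HOST_BIT)
      mask |= V3DV_BARRIER_CPU_BIT;
   return mask;
}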
+
struct v3dv_cmd_buffer_state {
struct v3dv_render_pass *pass;
struct v3dv_framebuffer *framebuffer;
+
+ /* VK_KHR_dynamic_rendering */
+ struct v3dv_render_pass dynamic_pass;
+ struct v3dv_subpass dynamic_subpass;
+ struct v3dv_render_pass_attachment dynamic_attachments[18 /* (8 color + D/S) x 2 (for resolves) */];
+ struct v3dv_subpass_attachment dynamic_subpass_attachments[18];
+ struct v3dv_framebuffer *dynamic_framebuffer;
+
VkRect2D render_area;
/* Current job being recorded */
@@ -1107,8 +1467,16 @@ struct v3dv_cmd_buffer_state {
struct v3dv_cmd_pipeline_state gfx;
struct v3dv_cmd_pipeline_state compute;
+ /* For most state tracking we rely on vk_dynamic_graphics_state, but we
+ * maintain a custom structure for some state-related data that we want to
+ * cache.
+ */
struct v3dv_dynamic_state dynamic;
+   /* This dirty mask is for v3dv_cmd_dirty_bits (FIXME: perhaps we should be
+    * more explicit about it). For dirty flags coming from Vulkan dynamic
+    * state, use the vk_dynamic_graphics_state handled by the vk_cmd_buffer.
+ */
uint32_t dirty;
VkShaderStageFlagBits dirty_descriptor_stages;
VkShaderStageFlagBits dirty_push_constants_stages;
@@ -1128,6 +1496,14 @@ struct v3dv_cmd_buffer_state {
*/
bool tile_aligned_render_area;
+ /* FIXME: we have just one client-side BO for the push constants,
+ * independently of the stageFlags in vkCmdPushConstants, and the
+ * pipelineBindPoint in vkCmdBindPipeline. We could probably do more stage
+ * tuning in the future if it makes sense.
+ */
+ uint32_t push_constants_size;
+ uint32_t push_constants_data[MAX_PUSH_CONSTANTS_SIZE / 4];
+
uint32_t attachment_alloc_count;
struct v3dv_cmd_buffer_attachment_state *attachments;
@@ -1151,14 +1527,21 @@ struct v3dv_cmd_buffer_state {
/* Current view index for multiview rendering */
uint32_t view_index;
+ /* Current draw ID for multidraw */
+ uint32_t draw_id;
+
/* Used to flag OOM conditions during command buffer recording */
bool oom;
- /* Whether we have recorded a pipeline barrier that we still need to
- * process.
- */
- bool has_barrier;
- bool has_bcl_barrier;
+ /* If we are currently recording job(s) for a transfer operation */
+ bool is_transfer;
+
+ /* VK_KHR_dynamic_rendering */
+ bool suspending;
+ bool resuming;
+
+ /* Barrier state tracking */
+ struct v3dv_barrier_state barrier;
/* Secondary command buffer state */
struct {
@@ -1178,12 +1561,14 @@ struct v3dv_cmd_buffer_state {
bool tile_aligned_render_area;
VkRect2D render_area;
+ struct vk_dynamic_graphics_state dynamic_graphics_state;
struct v3dv_dynamic_state dynamic;
struct v3dv_cmd_pipeline_state gfx;
bool has_descriptor_state;
uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4];
+ uint32_t push_constants_size;
} meta;
/* Command buffer state for queries */
@@ -1196,19 +1581,49 @@ struct v3dv_cmd_buffer_state {
struct {
uint32_t used_count;
uint32_t alloc_count;
- struct v3dv_end_query_cpu_job_info *states;
+ struct v3dv_end_query_info *states;
} end;
- /* This BO is not NULL if we have an active query, that is, we have
- * called vkCmdBeginQuery but not vkCmdEndQuery.
- */
struct {
+ /* This BO is not NULL if we have an active occlusion query, that is,
+ * we have called vkCmdBeginQuery but not vkCmdEndQuery.
+ */
struct v3dv_bo *bo;
uint32_t offset;
+ /* When the driver emits draw calls to implement other operations in
+ * the middle of a render pass (such as an attachment clear), we need
+ * to pause occlusion query recording and resume it later so that
+       * these draw calls don't register in occlusion counters. We use
+ * this to store the BO reference in which we should resume occlusion
+ * query counters after the driver is done emitting its draw calls.
+ */
+ struct v3dv_bo *paused_bo;
+
+ /* This pointer is not NULL if we have an active performance query */
+ struct v3dv_perf_query *perf;
} active_query;
} query;
+
+ /* This is dynamic state since VK_EXT_extended_dynamic_state. */
+ bool z_updates_enable;
+
+ /* ez_state can be dynamic since VK_EXT_extended_dynamic_state so we need
+ * to keep track of it in the cmd_buffer state
+ */
+ enum v3dv_ez_state ez_state;
+
+ /* incompatible_ez_test can be dynamic since VK_EXT_extended_dynamic_state
+ * so we need to keep track of it in the cmd_buffer state
+ */
+ bool incompatible_ez_test;
+
};
+void
+v3dv_cmd_buffer_state_get_viewport_z_xform(struct v3dv_cmd_buffer *cmd_buffer,
+ uint32_t vp_idx,
+ float *translate_z, float *scale_z);
+
/* The following struct represents the info from a descriptor that we store on
* the host memory. They are mostly links to other existing vulkan objects,
* like the image_view in order to access to swizzle info, or the buffer used
@@ -1228,8 +1643,8 @@ struct v3dv_descriptor {
struct {
struct v3dv_buffer *buffer;
- uint32_t offset;
- uint32_t range;
+ size_t offset;
+ size_t range;
};
struct v3dv_buffer_view *buffer_view;
@@ -1237,28 +1652,90 @@ struct v3dv_descriptor {
};
struct v3dv_query {
+   /* Used by queries where we implement result copying on the CPU so we can
+ * tell if the relevant jobs have been submitted for execution. Currently
+ * these are all but occlusion queries.
+ */
bool maybe_available;
+
union {
- /* Used by GPU queries (occlusion) */
+ /* Used by occlusion queries */
struct {
- struct v3dv_bo *bo;
+ /* Offset of this query in the occlusion query counter BO */
uint32_t offset;
- };
- /* Used by CPU queries (timestamp) */
- uint64_t value;
+ } occlusion;
+
+ /* Used by timestamp queries */
+ struct {
+ /* Offset of this query in the timestamp BO for its value */
+ uint32_t offset;
+
+ /* Syncobj to signal timestamp query availability */
+ struct vk_sync *sync;
+ } timestamp;
+
+ /* Used by performance queries */
+ struct v3dv_perf_query perf;
};
};
struct v3dv_query_pool {
struct vk_object_base base;
- struct v3dv_bo *bo; /* Only used with GPU queries (occlusion) */
+ /* Per-pool Vulkan resources required to implement GPU-side query
+ * functions (only occlusion queries for now).
+ */
+ struct {
+ /* Buffer to access the BO with the occlusion query results and
+ * availability info.
+ */
+ VkBuffer buf;
+ VkDeviceMemory mem;
+
+ /* Descriptor set for accessing the buffer from a pipeline. */
+ VkDescriptorPool descriptor_pool;
+ VkDescriptorSet descriptor_set;
+ } meta;
+
+ /* Only used with occlusion queries */
+ struct {
+ /* BO with the occlusion counters and query availability */
+ struct v3dv_bo *bo;
+ /* Offset of the availability info in the BO */
+ uint32_t avail_offset;
+ } occlusion;
+
+ /* Only used with timestamp queries */
+ struct {
+ /* BO with the query timestamp values */
+ struct v3dv_bo *bo;
+ } timestamp;
+
+ /* Only used with performance queries */
+ struct {
+ uint32_t ncounters;
+ uint8_t counters[V3D_MAX_PERFCNT];
+
+ /* V3D has a limit on the number of counters we can track in a
+ * single performance monitor, so if too many counters are requested
+ * we need to create multiple monitors to record all of them. This
+ * field represents the number of monitors required for the number
+ * of counters requested.
+ */
+ uint8_t nperfmons;
+ } perfmon;
VkQueryType query_type;
uint32_t query_count;
struct v3dv_query *queries;
};
+VkResult
+v3dv_query_allocate_resources(struct v3dv_device *device);
+
+void
+v3dv_query_free_resources(struct v3dv_device *device);
+
VkResult v3dv_get_query_pool_results_cpu(struct v3dv_device *device,
struct v3dv_query_pool *pool,
uint32_t first,
@@ -1267,6 +1744,16 @@ VkResult v3dv_get_query_pool_results_cpu(struct v3dv_device *device,
VkDeviceSize stride,
VkQueryResultFlags flags);
+void v3dv_reset_query_pool_cpu(struct v3dv_device *device,
+ struct v3dv_query_pool *query_pool,
+ uint32_t first,
+ uint32_t last);
+
+void v3dv_cmd_buffer_emit_set_query_availability(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_query_pool *pool,
+ uint32_t query, uint32_t count,
+ uint8_t availability);
+
typedef void (*v3dv_cmd_buffer_private_obj_destroy_cb)(VkDevice device,
uint64_t pobj,
VkAllocationCallbacks *alloc);
@@ -1276,33 +1763,20 @@ struct v3dv_cmd_buffer_private_obj {
v3dv_cmd_buffer_private_obj_destroy_cb destroy_cb;
};
+extern const struct vk_command_buffer_ops v3dv_cmd_buffer_ops;
+
struct v3dv_cmd_buffer {
- struct vk_object_base base;
+ struct vk_command_buffer vk;
struct v3dv_device *device;
- struct v3dv_cmd_pool *pool;
- struct list_head pool_link;
-
- /* Used at submit time to link command buffers in the submission that have
- * spawned wait threads, so we can then wait on all of them to complete
- * before we process any signal sempahores or fences.
- */
- struct list_head list_link;
-
VkCommandBufferUsageFlags usage_flags;
- VkCommandBufferLevel level;
enum v3dv_cmd_buffer_status status;
struct v3dv_cmd_buffer_state state;
- /* FIXME: we have just one client-side and bo for the push constants,
- * independently of the stageFlags in vkCmdPushConstants, and the
- * pipelineBindPoint in vkCmdBindPipeline. We could probably do more stage
- * tunning in the future if it makes sense.
- */
- uint32_t push_constants_data[MAX_PUSH_CONSTANTS_SIZE / 4];
+ /* Buffer where we upload push constant data to resolve indirect indexing */
struct v3dv_cl_reloc push_constants_resource;
/* Collection of Vulkan objects created internally by the driver (typically
@@ -1321,6 +1795,10 @@ struct v3dv_cmd_buffer {
/* The current descriptor pool for texel buffer copy sources */
VkDescriptorPool dspool;
} texel_buffer_copy;
+ struct {
+ /* The current descriptor pool for the copy query results output buffer */
+ VkDescriptorPool dspool;
+ } query;
} meta;
/* List of jobs in the command buffer. For primary command buffers it
@@ -1346,19 +1824,16 @@ void v3dv_cmd_buffer_subpass_finish(struct v3dv_cmd_buffer *cmd_buffer);
void v3dv_cmd_buffer_meta_state_push(struct v3dv_cmd_buffer *cmd_buffer,
bool push_descriptor_state);
void v3dv_cmd_buffer_meta_state_pop(struct v3dv_cmd_buffer *cmd_buffer,
- uint32_t dirty_dynamic_state,
bool needs_subpass_resume);
-void v3dv_cmd_buffer_reset_queries(struct v3dv_cmd_buffer *cmd_buffer,
- struct v3dv_query_pool *pool,
- uint32_t first,
- uint32_t count);
-
void v3dv_cmd_buffer_begin_query(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_query_pool *pool,
uint32_t query,
VkQueryControlFlags flags);
+void v3dv_cmd_buffer_pause_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer);
+void v3dv_cmd_buffer_resume_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer);
+
void v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_query_pool *pool,
uint32_t query);
@@ -1375,38 +1850,58 @@ void v3dv_cmd_buffer_copy_query_results(struct v3dv_cmd_buffer *cmd_buffer,
void v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer,
struct drm_v3d_submit_tfu *tfu);
-void v3dv_cmd_buffer_rewrite_indirect_csd_job(struct v3dv_csd_indirect_cpu_job_info *info,
+void v3dv_cmd_buffer_rewrite_indirect_csd_job(struct v3dv_device *device,
+ struct v3dv_csd_indirect_cpu_job_info *info,
const uint32_t *wg_counts);
void v3dv_cmd_buffer_add_private_obj(struct v3dv_cmd_buffer *cmd_buffer,
uint64_t obj,
v3dv_cmd_buffer_private_obj_destroy_cb destroy_cb);
-struct v3dv_semaphore {
- struct vk_object_base base;
+void v3dv_cmd_buffer_merge_barrier_state(struct v3dv_barrier_state *dst,
+ struct v3dv_barrier_state *src);
- /* A syncobject handle associated with this semaphore */
- uint32_t sync;
+void v3dv_cmd_buffer_consume_bcl_sync(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_job *job);
- /* A temporary syncobject handle produced from a vkImportSemaphoreFd. */
- uint32_t temp_sync;
-};
+bool v3dv_cmd_buffer_check_needs_load(const struct v3dv_cmd_buffer_state *state,
+ VkImageAspectFlags aspect,
+ uint32_t first_subpass_idx,
+ VkAttachmentLoadOp load_op,
+ uint32_t last_subpass_idx,
+ VkAttachmentStoreOp store_op);
-struct v3dv_fence {
- struct vk_object_base base;
+bool v3dv_cmd_buffer_check_needs_store(const struct v3dv_cmd_buffer_state *state,
+ VkImageAspectFlags aspect,
+ uint32_t last_subpass_idx,
+ VkAttachmentStoreOp store_op);
- /* A syncobject handle associated with this fence */
- uint32_t sync;
+void v3dv_cmd_buffer_emit_pipeline_barrier(struct v3dv_cmd_buffer *cmd_buffer,
+ const VkDependencyInfo *info);
- /* A temporary syncobject handle produced from a vkImportFenceFd. */
- uint32_t temp_sync;
-};
+bool v3dv_cmd_buffer_copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_image *dst,
+ struct v3dv_image *src,
+ const VkImageCopy2 *region);
struct v3dv_event {
struct vk_object_base base;
- int state;
+
+ /* Link in the device list of pre-allocated free events */
+ struct list_head link;
+
+ /* Each event gets a different index, which we use to compute the offset
+ * in the BO we use to track their state (signaled vs reset).
+ */
+ uint32_t index;
};
+VkResult
+v3dv_event_allocate_resources(struct v3dv_device *device);
+
+void
+v3dv_event_free_resources(struct v3dv_device *device);
+
struct v3dv_shader_variant {
enum broadcom_shader_stage stage;
@@ -1428,9 +1923,11 @@ struct v3dv_shader_variant {
*/
uint32_t assembly_offset;
- /* Note: it is really likely that qpu_insts would be NULL, as it will be
- * used only temporarily, to upload it to the shared bo, as we compile the
- * different stages individually.
+   /* Note: don't assume qpu_insts is always NULL or non-NULL. In general
+ * we will try to free it as soon as we upload it to the shared bo while we
+ * compile the different stages. But we can decide to keep it around based
+ * on some pipeline creation flags, like
+ * VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT.
*/
uint64_t *qpu_insts;
uint32_t qpu_insts_size;
@@ -1462,7 +1959,9 @@ struct v3dv_pipeline_stage {
/** A name for this program, so you can track it in shader-db output. */
uint32_t program_id;
- VkPipelineCreationFeedbackEXT feedback;
+ VkPipelineCreationFeedback feedback;
+
+ struct vk_pipeline_robustness_state robustness;
};
/* We are using the descriptor pool entry for two things:
@@ -1486,6 +1985,9 @@ struct v3dv_descriptor_pool_entry
struct v3dv_descriptor_pool {
struct vk_object_base base;
+ /* A list with all descriptor sets allocated from the pool. */
+ struct list_head set_list;
+
/* If this descriptor pool has been allocated for the driver for internal
* use, typically to implement meta operations.
*/
@@ -1515,9 +2017,12 @@ struct v3dv_descriptor_pool {
struct v3dv_descriptor_set {
struct vk_object_base base;
+ /* List link into the list of all sets allocated from the pool */
+ struct list_head pool_link;
+
struct v3dv_descriptor_pool *pool;
- const struct v3dv_descriptor_set_layout *layout;
+ struct v3dv_descriptor_set_layout *layout;
/* Offset relative to the descriptor pool bo for this set */
uint32_t base_offset;
@@ -1533,7 +2038,7 @@ struct v3dv_descriptor_set_binding_layout {
/* Number of array elements in this binding */
uint32_t array_size;
- /* Index into the flattend descriptor set */
+ /* Index into the flattened descriptor set */
uint32_t descriptor_index;
uint32_t dynamic_offset_count;
@@ -1548,6 +2053,11 @@ struct v3dv_descriptor_set_binding_layout {
* if there are no immutable samplers.
*/
uint32_t immutable_samplers_offset;
+
+ /* Descriptors for multiplanar combined image samplers are larger.
+ * For mutable descriptors, this is always 1.
+ */
+ uint8_t plane_stride;
};
struct v3dv_descriptor_set_layout {
@@ -1571,10 +2081,35 @@ struct v3dv_descriptor_set_layout {
/* Number of dynamic offsets used by this descriptor set */
uint16_t dynamic_offset_count;
+ /* Descriptor set layouts can be destroyed even if they are still being
+ * used.
+ */
+ uint32_t ref_cnt;
+
/* Bindings in this descriptor set */
struct v3dv_descriptor_set_binding_layout binding[0];
};
+void
+v3dv_descriptor_set_layout_destroy(struct v3dv_device *device,
+ struct v3dv_descriptor_set_layout *set_layout);
+
+static inline void
+v3dv_descriptor_set_layout_ref(struct v3dv_descriptor_set_layout *set_layout)
+{
+ assert(set_layout && set_layout->ref_cnt >= 1);
+ p_atomic_inc(&set_layout->ref_cnt);
+}
+
+static inline void
+v3dv_descriptor_set_layout_unref(struct v3dv_device *device,
+ struct v3dv_descriptor_set_layout *set_layout)
+{
+ assert(set_layout && set_layout->ref_cnt >= 1);
+ if (p_atomic_dec_zero(&set_layout->ref_cnt))
+ v3dv_descriptor_set_layout_destroy(device, set_layout);
+}
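A usage sketch of the ref/unref pair above (assumed pattern, not verbatim driver code): a descriptor set takes a reference on its layout so the layout outlives vkDestroyDescriptorSetLayout while the set is alive:

static inline void
example_set_bind_layout(struct v3dv_descriptor_set *set,
                        struct v3dv_descriptor_set_layout *layout)
{
   /* Keep the layout alive for as long as the set references it. */
   v3dv_descriptor_set_layout_ref(layout);
   set->layout = layout;
}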
+
struct v3dv_pipeline_layout {
struct vk_object_base base;
@@ -1590,8 +2125,37 @@ struct v3dv_pipeline_layout {
uint32_t dynamic_offset_count;
uint32_t push_constant_size;
+
+ /* Pipeline layouts can be destroyed after creating pipelines since
+ * maintenance4.
+ */
+ uint32_t ref_cnt;
+
+ unsigned char sha1[20];
};
+void
+v3dv_pipeline_layout_destroy(struct v3dv_device *device,
+ struct v3dv_pipeline_layout *layout,
+ const VkAllocationCallbacks *alloc);
+
+static inline void
+v3dv_pipeline_layout_ref(struct v3dv_pipeline_layout *layout)
+{
+ assert(layout && layout->ref_cnt >= 1);
+ p_atomic_inc(&layout->ref_cnt);
+}
+
+static inline void
+v3dv_pipeline_layout_unref(struct v3dv_device *device,
+ struct v3dv_pipeline_layout *layout,
+ const VkAllocationCallbacks *alloc)
+{
+ assert(layout && layout->ref_cnt >= 1);
+ if (p_atomic_dec_zero(&layout->ref_cnt))
+ v3dv_pipeline_layout_destroy(device, layout, alloc);
+}
+
/*
* We are using descriptor maps for ubo/ssbo and texture/samplers, so we need
* it to be big enough to include the max value for all of them.
@@ -1599,18 +2163,20 @@ struct v3dv_pipeline_layout {
* FIXME: one alternative would be to allocate the map as big as you need for
  * each descriptor type. That would mean more individual allocations.
*/
-#define DESCRIPTOR_MAP_SIZE MAX3(V3D_MAX_TEXTURE_SAMPLERS, \
- MAX_UNIFORM_BUFFERS, \
+#define DESCRIPTOR_MAP_SIZE MAX3(V3D_MAX_TEXTURE_SAMPLERS, \
+ MAX_UNIFORM_BUFFERS + MAX_INLINE_UNIFORM_BUFFERS, \
MAX_STORAGE_BUFFERS)
struct v3dv_descriptor_map {
- /* TODO: avoid fixed size array/justify the size */
+ /* FIXME: avoid fixed size array/justify the size */
unsigned num_desc; /* Number of descriptors */
int set[DESCRIPTOR_MAP_SIZE];
int binding[DESCRIPTOR_MAP_SIZE];
int array_index[DESCRIPTOR_MAP_SIZE];
int array_size[DESCRIPTOR_MAP_SIZE];
+ uint8_t plane[DESCRIPTOR_MAP_SIZE];
+ bool used[DESCRIPTOR_MAP_SIZE];
/* NOTE: the following is only for sampler, but this is the easier place to
* put it.
@@ -1620,57 +2186,19 @@ struct v3dv_descriptor_map {
struct v3dv_sampler {
struct vk_object_base base;
+ struct vk_ycbcr_conversion *conversion;
bool compare_enable;
bool unnormalized_coordinates;
- bool clamp_to_transparent_black_border;
- /* Prepacked SAMPLER_STATE, that is referenced as part of the tmu
+   /* Prepacked per-plane SAMPLER_STATE, which is referenced as part of the tmu
* configuration. If needed it will be copied to the descriptor info during
* UpdateDescriptorSets
*/
+ uint8_t plane_count;
uint8_t sampler_state[V3DV_SAMPLER_STATE_LENGTH];
};
-struct v3dv_descriptor_template_entry {
- /* The type of descriptor in this entry */
- VkDescriptorType type;
-
- /* Binding in the descriptor set */
- uint32_t binding;
-
- /* Offset at which to write into the descriptor set binding */
- uint32_t array_element;
-
- /* Number of elements to write into the descriptor set binding */
- uint32_t array_count;
-
- /* Offset into the user provided data */
- size_t offset;
-
- /* Stride between elements into the user provided data */
- size_t stride;
-};
-
-struct v3dv_descriptor_update_template {
- struct vk_object_base base;
-
- VkPipelineBindPoint bind_point;
-
- /* The descriptor set this template corresponds to. This value is only
- * valid if the template was created with the templateType
- * VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET.
- */
- uint8_t set;
-
- /* Number of entries in this template */
- uint32_t entry_count;
-
- /* Entries of the template */
- struct v3dv_descriptor_template_entry entries[0];
-};
-
-
/* We keep two special values for the sampler idx that represents exactly when a
* sampler is not needed/provided. The main use is that even if we don't have
* sampler, we still need to do the output unpacking (through
@@ -1685,32 +2213,6 @@ struct v3dv_descriptor_update_template {
#define V3DV_NO_SAMPLER_16BIT_IDX 0
#define V3DV_NO_SAMPLER_32BIT_IDX 1
-/*
- * Following two methods are using on the combined to/from texture/sampler
- * indices maps at v3dv_pipeline.
- */
-static inline uint32_t
-v3dv_pipeline_combined_index_key_create(uint32_t texture_index,
- uint32_t sampler_index)
-{
- return texture_index << 24 | sampler_index;
-}
-
-static inline void
-v3dv_pipeline_combined_index_key_unpack(uint32_t combined_index_key,
- uint32_t *texture_index,
- uint32_t *sampler_index)
-{
- uint32_t texture = combined_index_key >> 24;
- uint32_t sampler = combined_index_key & 0xffffff;
-
- if (texture_index)
- *texture_index = texture;
-
- if (sampler_index)
- *sampler_index = sampler;
-}
-
struct v3dv_descriptor_maps {
struct v3dv_descriptor_map ubo_map;
struct v3dv_descriptor_map ssbo_map;
@@ -1733,50 +2235,59 @@ struct v3dv_pipeline_shared_data {
struct v3dv_bo *assembly_bo;
};
+struct v3dv_pipeline_executable_data {
+ enum broadcom_shader_stage stage;
+ char *nir_str;
+ char *qpu_str;
+};
+
struct v3dv_pipeline {
struct vk_object_base base;
struct v3dv_device *device;
VkShaderStageFlags active_stages;
+ VkPipelineCreateFlags flags;
struct v3dv_render_pass *pass;
struct v3dv_subpass *subpass;
- /* Note: We can't use just a MESA_SHADER_STAGES array because we also need
- * to track binning shaders. Note these will be freed once the pipeline
- * has been compiled.
- */
- struct v3dv_pipeline_stage *vs;
- struct v3dv_pipeline_stage *vs_bin;
- struct v3dv_pipeline_stage *gs;
- struct v3dv_pipeline_stage *gs_bin;
- struct v3dv_pipeline_stage *fs;
- struct v3dv_pipeline_stage *cs;
+ struct v3dv_pipeline_stage *stages[BROADCOM_SHADER_STAGES];
+
+ /* For VK_KHR_dynamic_rendering */
+ struct vk_render_pass_state rendering_info;
/* Flags for whether optional pipeline stages are present, for convenience */
bool has_gs;
+ /* Whether any stage in this pipeline uses VK_KHR_buffer_device_address */
+ bool uses_buffer_device_address;
+
/* Spilling memory requirements */
struct {
struct v3dv_bo *bo;
uint32_t size_per_thread;
} spill;
- struct v3dv_dynamic_state dynamic_state;
+ struct vk_dynamic_graphics_state dynamic_graphics_state;
+ struct v3dv_dynamic_state dynamic;
struct v3dv_pipeline_layout *layout;
- /* Whether this pipeline enables depth writes */
- bool z_updates_enable;
-
enum v3dv_ez_state ez_state;
+   /* If ez_state is V3D_EZ_DISABLED, whether the reason for disabling is that the
+ * pipeline selects an incompatible depth test function.
+ */
+ bool incompatible_ez_test;
+
+ bool rasterization_enabled;
bool msaa;
bool sample_rate_shading;
uint32_t sample_mask;
bool primitive_restart;
+ bool negative_one_to_one;
/* Accessed by binding. So vb[binding]->stride is the stride of the vertex
* array with such binding
@@ -1799,12 +2310,18 @@ struct v3dv_pipeline {
} va[MAX_VERTEX_ATTRIBS];
uint32_t va_count;
- enum pipe_prim_type topology;
+ enum mesa_prim topology;
+
+ bool line_smooth;
struct v3dv_pipeline_shared_data *shared_data;
+   /* Combined sha1 of the shader stages, the pipeline layout and the pipeline key. */
+ unsigned char sha1[20];
+
/* In general we can reuse v3dv_device->default_attribute_float, so note
- * that the following can be NULL.
+ * that the following can be NULL. In 7.x this is not used, so it will
+ * always be NULL.
*
* FIXME: the content of this BO will be small, so it could be improved to
* be uploaded to a common BO. But as in most cases it will be NULL, it is
@@ -1838,6 +2355,11 @@ struct v3dv_pipeline {
bool is_z16;
} depth_bias;
+ struct {
+ void *mem_ctx;
+ struct util_dynarray data; /* Array of v3dv_pipeline_executable_data */
+ } executables;
+
/* Packets prepacked during pipeline creation
*/
uint8_t cfg_bits[V3DV_CFG_BITS_LENGTH];
@@ -1848,6 +2370,13 @@ struct v3dv_pipeline {
uint8_t stencil_cfg[2][V3DV_STENCIL_CFG_LENGTH];
};
+static inline bool
+v3dv_texture_shader_state_has_rb_swap_reverse_bits(const struct v3dv_device *device)
+{
+ return device->devinfo.ver > 71 ||
+ (device->devinfo.ver == 71 && device->devinfo.rev >= 5);
+}
+
static inline VkPipelineBindPoint
v3dv_pipeline_get_binding_point(struct v3dv_pipeline *pipeline)
{
@@ -1872,28 +2401,17 @@ const nir_shader_compiler_options *v3dv_pipeline_get_nir_options(void);
uint32_t v3dv_physical_device_vendor_id(struct v3dv_physical_device *dev);
uint32_t v3dv_physical_device_device_id(struct v3dv_physical_device *dev);
-VkResult __vk_errorf(struct v3dv_instance *instance, VkResult error,
- const char *file, int line,
- const char *format, ...);
-
-#define vk_error(instance, error) __vk_errorf(instance, error, __FILE__, __LINE__, NULL);
-#define vk_errorf(instance, error, format, ...) __vk_errorf(instance, error, __FILE__, __LINE__, format, ## __VA_ARGS__);
-
-#ifdef DEBUG
#define v3dv_debug_ignored_stype(sType) \
- fprintf(stderr, "%s: ignored VkStructureType %u:%s\n\n", __func__, (sType), vk_StructureType_to_str(sType))
-#else
-#define v3dv_debug_ignored_stype(sType)
-#endif
+ mesa_logd("%s: ignored VkStructureType %u:%s\n\n", __func__, (sType), vk_StructureType_to_str(sType))
-const uint8_t *v3dv_get_format_swizzle(struct v3dv_device *device, VkFormat f);
-uint8_t v3dv_get_tex_return_size(const struct v3dv_format *vf, bool compare_enable);
+const uint8_t *v3dv_get_format_swizzle(struct v3dv_device *device, VkFormat f,
+ uint8_t plane);
const struct v3dv_format *
v3dv_get_compatible_tfu_format(struct v3dv_device *device,
uint32_t bpp, VkFormat *out_vk_format);
bool v3dv_buffer_format_supports_features(struct v3dv_device *device,
VkFormat vk_format,
- VkFormatFeatureFlags features);
+ VkFormatFeatureFlags2 features);
struct v3dv_cl_reloc v3dv_write_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_pipeline *pipeline,
@@ -1953,6 +2471,14 @@ v3dv_descriptor_map_get_descriptor(struct v3dv_descriptor_state *descriptor_stat
uint32_t index,
uint32_t *dynamic_offset);
+struct v3dv_cl_reloc
+v3dv_descriptor_map_get_descriptor_bo(struct v3dv_device *device,
+ struct v3dv_descriptor_state *descriptor_state,
+ struct v3dv_descriptor_map *map,
+ struct v3dv_pipeline_layout *pipeline_layout,
+ uint32_t index,
+ VkDescriptorType *out_type);
+
const struct v3dv_sampler *
v3dv_descriptor_map_get_sampler(struct v3dv_descriptor_state *descriptor_state,
struct v3dv_descriptor_map *map,
@@ -1973,13 +2499,6 @@ v3dv_descriptor_map_get_texture_shader_state(struct v3dv_device *device,
struct v3dv_pipeline_layout *pipeline_layout,
uint32_t index);
-const struct v3dv_format*
-v3dv_descriptor_map_get_texture_format(struct v3dv_descriptor_state *descriptor_state,
- struct v3dv_descriptor_map *map,
- struct v3dv_pipeline_layout *pipeline_layout,
- uint32_t index,
- VkFormat *out_vk_format);
-
struct v3dv_bo*
v3dv_descriptor_map_get_texture_bo(struct v3dv_descriptor_state *descriptor_state,
struct v3dv_descriptor_map *map,
@@ -2020,71 +2539,56 @@ void
v3dv_pipeline_cache_upload_pipeline(struct v3dv_pipeline *pipeline,
struct v3dv_pipeline_cache *cache);
-struct v3dv_bo *
-v3dv_pipeline_create_default_attribute_values(struct v3dv_device *device,
- struct v3dv_pipeline *pipeline);
-
-void v3dv_shader_module_internal_init(struct v3dv_device *device,
- struct vk_shader_module *module,
- nir_shader *nir);
-
-#define V3DV_DEFINE_HANDLE_CASTS(__v3dv_type, __VkType) \
- \
- static inline struct __v3dv_type * \
- __v3dv_type ## _from_handle(__VkType _handle) \
- { \
- return (struct __v3dv_type *) _handle; \
- } \
- \
- static inline __VkType \
- __v3dv_type ## _to_handle(struct __v3dv_type *_obj) \
- { \
- return (__VkType) _obj; \
- }
-
-#define V3DV_DEFINE_NONDISP_HANDLE_CASTS(__v3dv_type, __VkType) \
- \
- static inline struct __v3dv_type * \
- __v3dv_type ## _from_handle(__VkType _handle) \
- { \
- return (struct __v3dv_type *)(uintptr_t) _handle; \
- } \
- \
- static inline __VkType \
- __v3dv_type ## _to_handle(struct __v3dv_type *_obj) \
- { \
- return (__VkType)(uintptr_t) _obj; \
- }
+VkResult
+v3dv_create_compute_pipeline_from_nir(struct v3dv_device *device,
+ nir_shader *nir,
+ VkPipelineLayout pipeline_layout,
+ VkPipeline *pipeline);
#define V3DV_FROM_HANDLE(__v3dv_type, __name, __handle) \
- struct __v3dv_type *__name = __v3dv_type ## _from_handle(__handle)
-
-V3DV_DEFINE_HANDLE_CASTS(v3dv_cmd_buffer, VkCommandBuffer)
-V3DV_DEFINE_HANDLE_CASTS(v3dv_device, VkDevice)
-V3DV_DEFINE_HANDLE_CASTS(v3dv_instance, VkInstance)
-V3DV_DEFINE_HANDLE_CASTS(v3dv_physical_device, VkPhysicalDevice)
-V3DV_DEFINE_HANDLE_CASTS(v3dv_queue, VkQueue)
-
-V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_cmd_pool, VkCommandPool)
-V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_buffer, VkBuffer)
-V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_buffer_view, VkBufferView)
-V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_device_memory, VkDeviceMemory)
-V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_pool, VkDescriptorPool)
-V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_set, VkDescriptorSet)
-V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_set_layout, VkDescriptorSetLayout)
-V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_update_template, VkDescriptorUpdateTemplate)
-V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_event, VkEvent)
-V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_fence, VkFence)
-V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_framebuffer, VkFramebuffer)
-V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_image, VkImage)
-V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_image_view, VkImageView)
-V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_pipeline, VkPipeline)
-V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_pipeline_cache, VkPipelineCache)
-V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_pipeline_layout, VkPipelineLayout)
-V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_query_pool, VkQueryPool)
-V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_render_pass, VkRenderPass)
-V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_sampler, VkSampler)
-V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_semaphore, VkSemaphore)
+ VK_FROM_HANDLE(__v3dv_type, __name, __handle)
+
+VK_DEFINE_HANDLE_CASTS(v3dv_cmd_buffer, vk.base, VkCommandBuffer,
+ VK_OBJECT_TYPE_COMMAND_BUFFER)
+VK_DEFINE_HANDLE_CASTS(v3dv_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)
+VK_DEFINE_HANDLE_CASTS(v3dv_instance, vk.base, VkInstance,
+ VK_OBJECT_TYPE_INSTANCE)
+VK_DEFINE_HANDLE_CASTS(v3dv_physical_device, vk.base, VkPhysicalDevice,
+ VK_OBJECT_TYPE_PHYSICAL_DEVICE)
+VK_DEFINE_HANDLE_CASTS(v3dv_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE)
+
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_buffer, base, VkBuffer,
+ VK_OBJECT_TYPE_BUFFER)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_buffer_view, base, VkBufferView,
+ VK_OBJECT_TYPE_BUFFER_VIEW)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_device_memory, vk.base, VkDeviceMemory,
+ VK_OBJECT_TYPE_DEVICE_MEMORY)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_pool, base, VkDescriptorPool,
+ VK_OBJECT_TYPE_DESCRIPTOR_POOL)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_set, base, VkDescriptorSet,
+ VK_OBJECT_TYPE_DESCRIPTOR_SET)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_set_layout, base,
+ VkDescriptorSetLayout,
+ VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_event, base, VkEvent, VK_OBJECT_TYPE_EVENT)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_framebuffer, base, VkFramebuffer,
+ VK_OBJECT_TYPE_FRAMEBUFFER)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_image, vk.base, VkImage,
+ VK_OBJECT_TYPE_IMAGE)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_image_view, vk.base, VkImageView,
+ VK_OBJECT_TYPE_IMAGE_VIEW)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_pipeline, base, VkPipeline,
+ VK_OBJECT_TYPE_PIPELINE)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_pipeline_cache, base, VkPipelineCache,
+ VK_OBJECT_TYPE_PIPELINE_CACHE)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_pipeline_layout, base, VkPipelineLayout,
+ VK_OBJECT_TYPE_PIPELINE_LAYOUT)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_query_pool, base, VkQueryPool,
+ VK_OBJECT_TYPE_QUERY_POOL)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_render_pass, base, VkRenderPass,
+ VK_OBJECT_TYPE_RENDER_PASS)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_sampler, base, VkSampler,
+ VK_OBJECT_TYPE_SAMPLER)
static inline int
v3dv_ioctl(int fd, unsigned long request, void *arg)
@@ -2132,19 +2636,39 @@ u64_compare(const void *key1, const void *key2)
return memcmp(key1, key2, sizeof(uint64_t)) == 0;
}
-/* Helper to call hw ver speficic functions */
+/* Helper to call hw ver specific functions */
#define v3dv_X(device, thing) ({ \
__typeof(&v3d42_##thing) v3d_X_thing; \
switch (device->devinfo.ver) { \
case 42: \
v3d_X_thing = &v3d42_##thing; \
break; \
+ case 71: \
+ v3d_X_thing = &v3d71_##thing; \
+ break; \
default: \
unreachable("Unsupported hardware generation"); \
} \
v3d_X_thing; \
})
+/* Helper to get hw-specific macro values */
+#define V3DV_X(device, thing) ({ \
+ __typeof(V3D42_##thing) V3D_X_THING; \
+ switch (device->devinfo.ver) { \
+ case 42: \
+ V3D_X_THING = V3D42_##thing; \
+ break; \
+ case 71: \
+ V3D_X_THING = V3D71_##thing; \
+ break; \
+ default: \
+ unreachable("Unsupported hardware generation"); \
+ } \
+ V3D_X_THING; \
+})
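Usage sketch for the two dispatch helpers above; the pack_foo/FOO_LENGTH names are hypothetical placeholders, illustrative only:

/* v3dv_X(device, pack_foo)(...)  -> calls v3d42_pack_foo(...) or v3d71_pack_foo(...)
 * V3DV_X(device, FOO_LENGTH)     -> evaluates to V3D42_FOO_LENGTH or V3D71_FOO_LENGTH
 * with the version selected at run time from device->devinfo.ver.
 */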
+
+
/* v3d_macros from common requires v3dX and V3DX definitions. Below we need to
* define v3dX for each version supported, because when we compile code that
@@ -2157,6 +2681,45 @@ u64_compare(const void *key1, const void *key2)
# define v3dX(x) v3d42_##x
# include "v3dvx_private.h"
# undef v3dX
+
+# define v3dX(x) v3d71_##x
+# include "v3dvx_private.h"
+# undef v3dX
#endif
+VkResult
+v3dv_update_image_layout(struct v3dv_device *device,
+ struct v3dv_image *image,
+ uint64_t modifier,
+ bool disjoint,
+ const VkImageDrmFormatModifierExplicitCreateInfoEXT *explicit_mod_info);
+
+float
+v3dv_get_aa_line_width(struct v3dv_pipeline *pipeline,
+ struct v3dv_cmd_buffer *buffer);
+
+
+void
+v3dv_compute_ez_state(struct vk_dynamic_graphics_state *dyn,
+ struct v3dv_pipeline *pipeline,
+ enum v3dv_ez_state *ez_state,
+ bool *incompatible_ez_test);
+
+uint32_t v3dv_pipeline_primitive(VkPrimitiveTopology vk_prim);
+
+#if DETECT_OS_ANDROID
+VkResult
+v3dv_gralloc_to_drm_explicit_layout(struct u_gralloc *gralloc,
+ struct u_gralloc_buffer_handle *in_hnd,
+ VkImageDrmFormatModifierExplicitCreateInfoEXT *out,
+ VkSubresourceLayout *out_layouts,
+ int max_planes);
+
+VkResult
+v3dv_import_native_buffer_fd(VkDevice device_h,
+ int dma_buf,
+ const VkAllocationCallbacks *alloc,
+ VkImage image_h);
+#endif /* DETECT_OS_ANDROID */
+
#endif /* V3DV_PRIVATE_H */
diff --git a/src/broadcom/vulkan/v3dv_query.c b/src/broadcom/vulkan/v3dv_query.c
index 0deb430fc16..7231c694fff 100644
--- a/src/broadcom/vulkan/v3dv_query.c
+++ b/src/broadcom/vulkan/v3dv_query.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2020 Raspberry Pi
+ * Copyright © 2020 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -23,6 +23,224 @@
#include "v3dv_private.h"
+#include "util/timespec.h"
+#include "compiler/nir/nir_builder.h"
+
+static void
+kperfmon_create(struct v3dv_device *device,
+ struct v3dv_query_pool *pool,
+ uint32_t query)
+{
+ for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
+ assert(i * DRM_V3D_MAX_PERF_COUNTERS < pool->perfmon.ncounters);
+
+ struct drm_v3d_perfmon_create req = {
+ .ncounters = MIN2(pool->perfmon.ncounters -
+ i * DRM_V3D_MAX_PERF_COUNTERS,
+ DRM_V3D_MAX_PERF_COUNTERS),
+ };
+ memcpy(req.counters,
+ &pool->perfmon.counters[i * DRM_V3D_MAX_PERF_COUNTERS],
+ req.ncounters);
+
+ int ret = v3dv_ioctl(device->pdevice->render_fd,
+ DRM_IOCTL_V3D_PERFMON_CREATE,
+ &req);
+ if (ret)
+ fprintf(stderr, "Failed to create perfmon for query %d: %s\n", query, strerror(ret));
+
+ pool->queries[query].perf.kperfmon_ids[i] = req.id;
+ }
+}
+
+static void
+kperfmon_destroy(struct v3dv_device *device,
+ struct v3dv_query_pool *pool,
+ uint32_t query)
+{
+ /* Skip destroying if never created */
+ if (!pool->queries[query].perf.kperfmon_ids[0])
+ return;
+
+ for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
+ struct drm_v3d_perfmon_destroy req = {
+ .id = pool->queries[query].perf.kperfmon_ids[i]
+ };
+
+ int ret = v3dv_ioctl(device->pdevice->render_fd,
+ DRM_IOCTL_V3D_PERFMON_DESTROY,
+ &req);
+
+ if (ret) {
+ fprintf(stderr, "Failed to destroy perfmon %u: %s\n",
+ req.id, strerror(ret));
+ }
+ }
+}
+
+/**
+ * Creates a VkBuffer (and VkDeviceMemory) to access a BO.
+ */
+static VkResult
+create_vk_storage_buffer(struct v3dv_device *device,
+ struct v3dv_bo *bo,
+ VkBuffer *vk_buf,
+ VkDeviceMemory *vk_mem)
+{
+ VkDevice vk_device = v3dv_device_to_handle(device);
+
+ VkBufferCreateInfo buf_info = {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+ .size = bo->size,
+ .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
+ };
+ VkResult result = v3dv_CreateBuffer(vk_device, &buf_info, NULL, vk_buf);
+ if (result != VK_SUCCESS)
+ return result;
+
+ struct v3dv_device_memory *mem =
+ vk_object_zalloc(&device->vk, NULL, sizeof(*mem),
+ VK_OBJECT_TYPE_DEVICE_MEMORY);
+ if (!mem)
+ return VK_ERROR_OUT_OF_HOST_MEMORY;
+
+ mem->bo = bo;
+ mem->type = &device->pdevice->memory.memoryTypes[0];
+
+ *vk_mem = v3dv_device_memory_to_handle(mem);
+ VkBindBufferMemoryInfo bind_info = {
+ .sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO,
+ .buffer = *vk_buf,
+ .memory = *vk_mem,
+ .memoryOffset = 0,
+ };
+ v3dv_BindBufferMemory2(vk_device, 1, &bind_info);
+
+ return VK_SUCCESS;
+}
+
+static void
+destroy_vk_storage_buffer(struct v3dv_device *device,
+ VkBuffer *vk_buf,
+ VkDeviceMemory *vk_mem)
+{
+ if (*vk_mem) {
+ vk_object_free(&device->vk, NULL, v3dv_device_memory_from_handle(*vk_mem));
+ *vk_mem = VK_NULL_HANDLE;
+ }
+
+ v3dv_DestroyBuffer(v3dv_device_to_handle(device), *vk_buf, NULL);
+ *vk_buf = VK_NULL_HANDLE;
+}
+
+/**
+ * Allocates descriptor sets to access query pool BO (availability and
+ * occlusion query results) from Vulkan pipelines.
+ */
+static VkResult
+create_pool_descriptors(struct v3dv_device *device,
+ struct v3dv_query_pool *pool)
+{
+ assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION);
+ VkDevice vk_device = v3dv_device_to_handle(device);
+
+ VkDescriptorPoolSize pool_size = {
+ .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ .descriptorCount = 1,
+ };
+ VkDescriptorPoolCreateInfo pool_info = {
+ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
+ .flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT,
+ .maxSets = 1,
+ .poolSizeCount = 1,
+ .pPoolSizes = &pool_size,
+ };
+ VkResult result =
+ v3dv_CreateDescriptorPool(vk_device, &pool_info, NULL,
+ &pool->meta.descriptor_pool);
+
+ if (result != VK_SUCCESS)
+ return result;
+
+ VkDescriptorSetAllocateInfo alloc_info = {
+ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
+ .descriptorPool = pool->meta.descriptor_pool,
+ .descriptorSetCount = 1,
+ .pSetLayouts = &device->queries.buf_descriptor_set_layout,
+ };
+ result = v3dv_AllocateDescriptorSets(vk_device, &alloc_info,
+ &pool->meta.descriptor_set);
+ if (result != VK_SUCCESS)
+ return result;
+
+ VkDescriptorBufferInfo desc_buf_info = {
+ .buffer = pool->meta.buf,
+ .offset = 0,
+ .range = VK_WHOLE_SIZE,
+ };
+
+ VkWriteDescriptorSet write = {
+ .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+ .dstSet = pool->meta.descriptor_set,
+ .dstBinding = 0,
+ .dstArrayElement = 0,
+ .descriptorCount = 1,
+ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ .pBufferInfo = &desc_buf_info,
+ };
+ v3dv_UpdateDescriptorSets(vk_device, 1, &write, 0, NULL);
+
+ return VK_SUCCESS;
+}
+
+static void
+destroy_pool_descriptors(struct v3dv_device *device,
+ struct v3dv_query_pool *pool)
+{
+ assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION);
+
+ v3dv_FreeDescriptorSets(v3dv_device_to_handle(device),
+ pool->meta.descriptor_pool,
+ 1, &pool->meta.descriptor_set);
+ pool->meta.descriptor_set = VK_NULL_HANDLE;
+
+ v3dv_DestroyDescriptorPool(v3dv_device_to_handle(device),
+ pool->meta.descriptor_pool, NULL);
+ pool->meta.descriptor_pool = VK_NULL_HANDLE;
+}
+
+static VkResult
+pool_create_meta_resources(struct v3dv_device *device,
+ struct v3dv_query_pool *pool)
+{
+ VkResult result;
+
+ if (pool->query_type != VK_QUERY_TYPE_OCCLUSION)
+ return VK_SUCCESS;
+
+ result = create_vk_storage_buffer(device, pool->occlusion.bo,
+ &pool->meta.buf, &pool->meta.mem);
+ if (result != VK_SUCCESS)
+ return result;
+
+ result = create_pool_descriptors(device, pool);
+ if (result != VK_SUCCESS)
+ return result;
+
+ return VK_SUCCESS;
+}
+
+static void
+pool_destroy_meta_resources(struct v3dv_device *device,
+ struct v3dv_query_pool *pool)
+{
+ if (pool->query_type != VK_QUERY_TYPE_OCCLUSION)
+ return;
+
+ destroy_pool_descriptors(device, pool);
+ destroy_vk_storage_buffer(device, &pool->meta.buf, &pool->meta.mem);
+}
+
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_CreateQueryPool(VkDevice _device,
const VkQueryPoolCreateInfo *pCreateInfo,
@@ -32,74 +250,149 @@ v3dv_CreateQueryPool(VkDevice _device,
V3DV_FROM_HANDLE(v3dv_device, device, _device);
assert(pCreateInfo->queryType == VK_QUERY_TYPE_OCCLUSION ||
- pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP);
+ pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP ||
+ pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
assert(pCreateInfo->queryCount > 0);
struct v3dv_query_pool *pool =
vk_object_zalloc(&device->vk, pAllocator, sizeof(*pool),
VK_OBJECT_TYPE_QUERY_POOL);
if (pool == NULL)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
pool->query_type = pCreateInfo->queryType;
pool->query_count = pCreateInfo->queryCount;
+ uint32_t query_idx = 0;
VkResult result;
const uint32_t pool_bytes = sizeof(struct v3dv_query) * pool->query_count;
pool->queries = vk_alloc2(&device->vk.alloc, pAllocator, pool_bytes, 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (pool->queries == NULL) {
- result = vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
goto fail;
}
- if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
+ switch (pool->query_type) {
+ case VK_QUERY_TYPE_OCCLUSION: {
/* The hardware allows us to setup groups of 16 queries in consecutive
* 4-byte addresses, requiring only that each group of 16 queries is
* aligned to a 1024 byte boundary.
*/
const uint32_t query_groups = DIV_ROUND_UP(pool->query_count, 16);
- const uint32_t bo_size = query_groups * 1024;
- pool->bo = v3dv_bo_alloc(device, bo_size, "query", true);
- if (!pool->bo) {
- result = vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ uint32_t bo_size = query_groups * 1024;
+ /* After the counters we store availability data, 1 byte/query */
+ pool->occlusion.avail_offset = bo_size;
+ bo_size += pool->query_count;
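      /* Worked example (illustrative): query_count = 100 gives query_groups =
       * DIV_ROUND_UP(100, 16) = 7, a 7 * 1024 = 7168 byte counter area,
       * avail_offset = 7168 and a final bo_size of 7168 + 100 = 7268 bytes.
       */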
+ pool->occlusion.bo = v3dv_bo_alloc(device, bo_size, "query:o", true);
+ if (!pool->occlusion.bo) {
+ result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
goto fail;
}
- if (!v3dv_bo_map(device, pool->bo, bo_size)) {
- result = vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ if (!v3dv_bo_map(device, pool->occlusion.bo, bo_size)) {
+ result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
goto fail;
}
+ break;
}
+ case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
+ const VkQueryPoolPerformanceCreateInfoKHR *pq_info =
+ vk_find_struct_const(pCreateInfo->pNext,
+ QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
+
+ assert(pq_info);
+
+ pool->perfmon.ncounters = pq_info->counterIndexCount;
+ for (uint32_t i = 0; i < pq_info->counterIndexCount; i++)
+ pool->perfmon.counters[i] = pq_info->pCounterIndices[i];
- uint32_t i;
- for (i = 0; i < pool->query_count; i++) {
- pool->queries[i].maybe_available = false;
+ pool->perfmon.nperfmons = DIV_ROUND_UP(pool->perfmon.ncounters,
+ DRM_V3D_MAX_PERF_COUNTERS);
+
+ assert(pool->perfmon.nperfmons <= V3DV_MAX_PERFMONS);
+ break;
+ }
+ case VK_QUERY_TYPE_TIMESTAMP: {
+ /* 8 bytes per query are used for the timestamp value. All timestamps are
+ * tightly packed at the start of the buffer.
+ */
+ const uint32_t bo_size = pool->query_count * 8;
+ pool->timestamp.bo = v3dv_bo_alloc(device, bo_size, "query:t", true);
+ if (!pool->timestamp.bo) {
+ result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ goto fail;
+ }
+ if (!v3dv_bo_map(device, pool->timestamp.bo, bo_size)) {
+ result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ goto fail;
+ }
+ break;
+ }
+ default:
+ unreachable("Unsupported query type");
+ }
+
+ /* Initialize queries in the pool */
+ for (; query_idx < pool->query_count; query_idx++) {
+ pool->queries[query_idx].maybe_available = false;
switch (pool->query_type) {
case VK_QUERY_TYPE_OCCLUSION: {
- const uint32_t query_group = i / 16;
- const uint32_t query_offset = query_group * 1024 + (i % 16) * 4;
- pool->queries[i].bo = pool->bo;
- pool->queries[i].offset = query_offset;
+ const uint32_t query_group = query_idx / 16;
+ const uint32_t query_offset = query_group * 1024 + (query_idx % 16) * 4;
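         /* E.g. (illustrative): query_idx = 37 is in group 2, lane 5, so its
          * counter lives at offset 2 * 1024 + 5 * 4 = 2068 in the occlusion BO.
          */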
+ pool->queries[query_idx].occlusion.offset = query_offset;
break;
}
case VK_QUERY_TYPE_TIMESTAMP:
- pool->queries[i].value = 0;
+ pool->queries[query_idx].timestamp.offset = query_idx * 8;
+ result = vk_sync_create(&device->vk,
+ &device->pdevice->drm_syncobj_type, 0, 0,
+ &pool->queries[query_idx].timestamp.sync);
+ if (result != VK_SUCCESS)
+ goto fail;
+ break;
+ case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
+ result = vk_sync_create(&device->vk,
+ &device->pdevice->drm_syncobj_type, 0, 0,
+ &pool->queries[query_idx].perf.last_job_sync);
+ if (result != VK_SUCCESS)
+ goto fail;
+
+ kperfmon_create(device, pool, query_idx);
break;
+ }
default:
unreachable("Unsupported query type");
}
}
+ /* Create meta resources */
+ result = pool_create_meta_resources(device, pool);
+ if (result != VK_SUCCESS)
+ goto fail;
+
*pQueryPool = v3dv_query_pool_to_handle(pool);
return VK_SUCCESS;
fail:
- if (pool->bo)
- v3dv_bo_free(device, pool->bo);
+ if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
+ for (uint32_t j = 0; j < query_idx; j++)
+ vk_sync_destroy(&device->vk, pool->queries[j].timestamp.sync);
+ }
+
+ if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+ for (uint32_t j = 0; j < query_idx; j++)
+ vk_sync_destroy(&device->vk, pool->queries[j].perf.last_job_sync);
+ }
+
+ if (pool->occlusion.bo)
+ v3dv_bo_free(device, pool->occlusion.bo);
+ if (pool->timestamp.bo)
+ v3dv_bo_free(device, pool->timestamp.bo);
if (pool->queries)
vk_free2(&device->vk.alloc, pAllocator, pool->queries);
+ pool_destroy_meta_resources(device, pool);
vk_object_free(&device->vk, pAllocator, pool);
return result;
@@ -116,17 +409,34 @@ v3dv_DestroyQueryPool(VkDevice _device,
if (!pool)
return;
- if (pool->bo)
- v3dv_bo_free(device, pool->bo);
+ if (pool->occlusion.bo)
+ v3dv_bo_free(device, pool->occlusion.bo);
+
+ if (pool->timestamp.bo)
+ v3dv_bo_free(device, pool->timestamp.bo);
+
+ if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
+ for (uint32_t i = 0; i < pool->query_count; i++)
+ vk_sync_destroy(&device->vk, pool->queries[i].timestamp.sync);
+ }
+
+ if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+ for (uint32_t i = 0; i < pool->query_count; i++) {
+ kperfmon_destroy(device, pool, i);
+ vk_sync_destroy(&device->vk, pool->queries[i].perf.last_job_sync);
+ }
+ }
if (pool->queries)
vk_free2(&device->vk.alloc, pAllocator, pool->queries);
+ pool_destroy_meta_resources(device, pool);
+
vk_object_free(&device->vk, pAllocator, pool);
}
static void
-write_query_result(void *dst, uint32_t idx, bool do_64bit, uint64_t value)
+write_to_buffer(void *dst, uint32_t idx, bool do_64bit, uint64_t value)
{
if (do_64bit) {
uint64_t *dst64 = (uint64_t *) dst;
@@ -138,89 +448,255 @@ write_query_result(void *dst, uint32_t idx, bool do_64bit, uint64_t value)
}
static VkResult
-get_occlusion_query_result(struct v3dv_device *device,
- struct v3dv_query_pool *pool,
- uint32_t query,
- bool do_wait,
- bool *available,
- uint64_t *value)
+query_wait_available(struct v3dv_device *device,
+ struct v3dv_query_pool *pool,
+ struct v3dv_query *q,
+ uint32_t query_idx)
{
- assert(pool && pool->query_type == VK_QUERY_TYPE_OCCLUSION);
+ /* For occlusion queries we prefer to poll the availability BO in a loop
+ * rather than waiting on the query results BO, because the latter would
+ * make us wait for any job running queries from the pool, even if those
+ * queries do not involve the one we want to wait on.
+ */
+ if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
+ uint8_t *q_addr = ((uint8_t *) pool->occlusion.bo->map) +
+ pool->occlusion.avail_offset + query_idx;
+ while (*q_addr == 0)
+ usleep(250);
+ return VK_SUCCESS;
+ }
- struct v3dv_query *q = &pool->queries[query];
- assert(q->bo && q->bo->map);
+ if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
+ if (vk_sync_wait(&device->vk, q->timestamp.sync,
+ 0, VK_SYNC_WAIT_COMPLETE, UINT64_MAX) != VK_SUCCESS) {
+ return vk_device_set_lost(&device->vk, "Query job wait failed");
+ }
+ return VK_SUCCESS;
+ }
- if (do_wait) {
- /* From the Vulkan 1.0 spec:
- *
- * "If VK_QUERY_RESULT_WAIT_BIT is set, (...) If the query does not
- * become available in a finite amount of time (e.g. due to not
- * issuing a query since the last reset), a VK_ERROR_DEVICE_LOST
- * error may occur."
+ assert(pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
+
+ /* For performance queries we need to wait for the queue to signal that
+ * the query has been submitted for execution before anything else.
+ */
+ VkResult result = VK_SUCCESS;
+ if (!q->maybe_available) {
+ struct timespec timeout;
+ timespec_get(&timeout, TIME_UTC);
+ timespec_add_msec(&timeout, &timeout, 2000);
+
+ mtx_lock(&device->query_mutex);
+ while (!q->maybe_available) {
+ if (vk_device_is_lost(&device->vk)) {
+ result = VK_ERROR_DEVICE_LOST;
+ break;
+ }
+
+ int ret = cnd_timedwait(&device->query_ended,
+ &device->query_mutex,
+ &timeout);
+ if (ret != thrd_success) {
+ mtx_unlock(&device->query_mutex);
+ result = vk_device_set_lost(&device->vk, "Query wait failed");
+ break;
+ }
+ }
+ mtx_unlock(&device->query_mutex);
+
+ if (result != VK_SUCCESS)
+ return result;
+
+ /* For performance queries, we also need to wait for the relevant syncobj
+ * to be signaled to ensure completion of the GPU work.
*/
- if (!q->maybe_available)
- return vk_error(device->instance, VK_ERROR_DEVICE_LOST);
+ if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR &&
+ vk_sync_wait(&device->vk, q->perf.last_job_sync,
+ 0, VK_SYNC_WAIT_COMPLETE, UINT64_MAX) != VK_SUCCESS) {
+ return vk_device_set_lost(&device->vk, "Query job wait failed");
+ }
+ }
+
+ return result;
+}
+
+static VkResult
+query_check_available(struct v3dv_device *device,
+ struct v3dv_query_pool *pool,
+ struct v3dv_query *q,
+ uint32_t query_idx)
+{
+ /* For occlusion we check the availability BO */
+ if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
+ const uint8_t *q_addr = ((uint8_t *) pool->occlusion.bo->map) +
+ pool->occlusion.avail_offset + query_idx;
+ return (*q_addr != 0) ? VK_SUCCESS : VK_NOT_READY;
+ }
- if (!v3dv_bo_wait(device, q->bo, 0xffffffffffffffffull))
- return vk_error(device->instance, VK_ERROR_DEVICE_LOST);
+ /* For timestamp queries, we need to check if the relevant job
+ * has completed.
+ */
+ if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
+ if (vk_sync_wait(&device->vk, q->timestamp.sync,
+ 0, VK_SYNC_WAIT_COMPLETE, 0) != VK_SUCCESS) {
+ return VK_NOT_READY;
+ }
+ return VK_SUCCESS;
+ }
+
+ /* For other queries we need to check if the queue has submitted the query
+ * for execution at all.
+ */
+ assert(pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
+ if (!q->maybe_available)
+ return VK_NOT_READY;
+
+ /* For performance queries, we also need to check if the relevant GPU job
+ * has completed.
+ */
+ if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR &&
+ vk_sync_wait(&device->vk, q->perf.last_job_sync,
+ 0, VK_SYNC_WAIT_COMPLETE, 0) != VK_SUCCESS) {
+ return VK_NOT_READY;
+ }
+
+ return VK_SUCCESS;
+}
+
+static VkResult
+query_is_available(struct v3dv_device *device,
+ struct v3dv_query_pool *pool,
+ uint32_t query,
+ bool do_wait,
+ bool *available)
+{
+ struct v3dv_query *q = &pool->queries[query];
+
+ if (do_wait) {
+ VkResult result = query_wait_available(device, pool, q, query);
+ if (result != VK_SUCCESS) {
+ *available = false;
+ return result;
+ }
*available = true;
} else {
- *available = q->maybe_available && v3dv_bo_wait(device, q->bo, 0);
+ VkResult result = query_check_available(device, pool, q, query);
+ assert(result == VK_SUCCESS || result == VK_NOT_READY);
+ *available = (result == VK_SUCCESS);
}
- const uint8_t *query_addr = ((uint8_t *) q->bo->map) + q->offset;
- *value = (uint64_t) *((uint32_t *)query_addr);
return VK_SUCCESS;
}
static VkResult
-get_timestamp_query_result(struct v3dv_device *device,
- struct v3dv_query_pool *pool,
- uint32_t query,
- bool do_wait,
- bool *available,
- uint64_t *value)
+write_occlusion_query_result(struct v3dv_device *device,
+ struct v3dv_query_pool *pool,
+ uint32_t query,
+ bool do_64bit,
+ void *data,
+ uint32_t slot)
+{
+ assert(pool && pool->query_type == VK_QUERY_TYPE_OCCLUSION);
+
+ if (vk_device_is_lost(&device->vk))
+ return VK_ERROR_DEVICE_LOST;
+
+ struct v3dv_query *q = &pool->queries[query];
+ assert(pool->occlusion.bo && pool->occlusion.bo->map);
+
+ const uint8_t *query_addr =
+ ((uint8_t *) pool->occlusion.bo->map) + q->occlusion.offset;
+ write_to_buffer(data, slot, do_64bit, (uint64_t) *((uint32_t *)query_addr));
+ return VK_SUCCESS;
+}
+
+static VkResult
+write_timestamp_query_result(struct v3dv_device *device,
+ struct v3dv_query_pool *pool,
+ uint32_t query,
+ bool do_64bit,
+ void *data,
+ uint32_t slot)
{
assert(pool && pool->query_type == VK_QUERY_TYPE_TIMESTAMP);
struct v3dv_query *q = &pool->queries[query];
- if (do_wait) {
- /* From the Vulkan 1.0 spec:
- *
- * "If VK_QUERY_RESULT_WAIT_BIT is set, (...) If the query does not
- * become available in a finite amount of time (e.g. due to not
- * issuing a query since the last reset), a VK_ERROR_DEVICE_LOST
- * error may occur."
- */
- if (!q->maybe_available)
- return vk_error(device->instance, VK_ERROR_DEVICE_LOST);
+ const uint8_t *query_addr =
+ ((uint8_t *) pool->timestamp.bo->map) + q->timestamp.offset;
- *available = true;
- } else {
- *available = q->maybe_available;
+ write_to_buffer(data, slot, do_64bit, *((uint64_t *)query_addr));
+ return VK_SUCCESS;
+}
+
+static VkResult
+write_performance_query_result(struct v3dv_device *device,
+ struct v3dv_query_pool *pool,
+ uint32_t query,
+ bool do_64bit,
+ void *data,
+ uint32_t slot)
+{
+ assert(pool && pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
+
+ struct v3dv_query *q = &pool->queries[query];
+ uint64_t counter_values[V3D_MAX_PERFCNT];
+
+ for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
+ struct drm_v3d_perfmon_get_values req = {
+ .id = q->perf.kperfmon_ids[i],
+ .values_ptr = (uintptr_t)(&counter_values[i *
+ DRM_V3D_MAX_PERF_COUNTERS])
+ };
+
+ int ret = v3dv_ioctl(device->pdevice->render_fd,
+ DRM_IOCTL_V3D_PERFMON_GET_VALUES,
+ &req);
+
+ if (ret) {
+ fprintf(stderr, "failed to get perfmon values: %s\n", strerror(ret));
+ return vk_error(device, VK_ERROR_DEVICE_LOST);
+ }
}
- *value = q->value;
+ for (uint32_t i = 0; i < pool->perfmon.ncounters; i++)
+ write_to_buffer(data, slot + i, do_64bit, counter_values[i]);
+
return VK_SUCCESS;
}
static VkResult
-get_query_result(struct v3dv_device *device,
- struct v3dv_query_pool *pool,
- uint32_t query,
- bool do_wait,
- bool *available,
- uint64_t *value)
+write_query_result(struct v3dv_device *device,
+ struct v3dv_query_pool *pool,
+ uint32_t query,
+ bool do_64bit,
+ void *data,
+ uint32_t slot)
+{
+ switch (pool->query_type) {
+ case VK_QUERY_TYPE_OCCLUSION:
+ return write_occlusion_query_result(device, pool, query, do_64bit,
+ data, slot);
+ case VK_QUERY_TYPE_TIMESTAMP:
+ return write_timestamp_query_result(device, pool, query, do_64bit,
+ data, slot);
+ case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
+ return write_performance_query_result(device, pool, query, do_64bit,
+ data, slot);
+ default:
+ unreachable("Unsupported query type");
+ }
+}
+
+static uint32_t
+get_query_result_count(struct v3dv_query_pool *pool)
{
switch (pool->query_type) {
case VK_QUERY_TYPE_OCCLUSION:
- return get_occlusion_query_result(device, pool, query, do_wait,
- available, value);
case VK_QUERY_TYPE_TIMESTAMP:
- return get_timestamp_query_result(device, pool, query, do_wait,
- available, value);
+ return 1;
+ case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
+ return pool->perfmon.ncounters;
default:
unreachable("Unsupported query type");
}
@@ -239,16 +715,18 @@ v3dv_get_query_pool_results_cpu(struct v3dv_device *device,
assert(first + count <= pool->query_count);
assert(data);
- const bool do_64bit = flags & VK_QUERY_RESULT_64_BIT;
+ const bool do_64bit = flags & VK_QUERY_RESULT_64_BIT ||
+ pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR;
const bool do_wait = flags & VK_QUERY_RESULT_WAIT_BIT;
const bool do_partial = flags & VK_QUERY_RESULT_PARTIAL_BIT;
+ uint32_t result_count = get_query_result_count(pool);
+
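   /* E.g. (illustrative): a performance query pool with 10 enabled counters
    * has result_count = 10, so each query writes 10 consecutive result slots,
    * plus one more when VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
    */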
VkResult result = VK_SUCCESS;
for (uint32_t i = first; i < first + count; i++) {
bool available = false;
- uint64_t value = 0;
VkResult query_result =
- get_query_result(device, pool, i, do_wait, &available, &value);
+ query_is_available(device, pool, i, do_wait, &available);
if (query_result == VK_ERROR_DEVICE_LOST)
result = VK_ERROR_DEVICE_LOST;
@@ -266,11 +744,11 @@ v3dv_get_query_pool_results_cpu(struct v3dv_device *device,
const bool write_result = available || do_partial;
if (write_result)
- write_query_result(data, slot, do_64bit, value);
- slot++;
+ write_query_result(device, pool, i, do_64bit, data, slot);
+ slot += result_count;
if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
- write_query_result(data, slot++, do_64bit, available ? 1u : 0u);
+ write_to_buffer(data, slot++, do_64bit, available ? 1u : 0u);
if (!write_result && result != VK_ERROR_DEVICE_LOST)
result = VK_NOT_READY;
@@ -298,6 +776,170 @@ v3dv_GetQueryPoolResults(VkDevice _device,
pData, stride, flags);
}
+/* Emits a series of vkCmdDispatchBase calls to execute all the workgroups
+ * required to handle a number of queries considering per-dispatch limits.
+ */
+static void
+cmd_buffer_emit_dispatch_queries(struct v3dv_cmd_buffer *cmd_buffer,
+ uint32_t query_count)
+{
+ VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
+
+ uint32_t dispatched = 0;
+ const uint32_t max_batch_size = 65535;
+ while (dispatched < query_count) {
+ uint32_t batch_size = MIN2(query_count - dispatched, max_batch_size);
+ v3dv_CmdDispatchBase(vk_cmd_buffer, dispatched, 0, 0, batch_size, 1, 1);
+ dispatched += batch_size;
+ }
+}
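/* Illustrative: with the 65535 cap above, handling 150000 queries emits three
 * dispatches of 65535, 65535 and 18930 workgroups respectively.
 */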
+
+void
+v3dv_cmd_buffer_emit_set_query_availability(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_query_pool *pool,
+ uint32_t query, uint32_t count,
+ uint8_t availability)
+{
+ assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION ||
+ pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
+
+ struct v3dv_device *device = cmd_buffer->device;
+ VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
+
+ /* We are about to emit a compute job to set query availability and we need
+ * to ensure this executes after the graphics work using the queries has
+ * completed.
+ */
+ VkMemoryBarrier2 barrier = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
+ .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
+ .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+ };
+ VkDependencyInfo barrier_info = {
+ .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+ .memoryBarrierCount = 1,
+ .pMemoryBarriers = &barrier,
+ };
+ v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info);
+
+ /* Dispatch queries */
+ v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);
+
+ v3dv_CmdBindPipeline(vk_cmd_buffer,
+ VK_PIPELINE_BIND_POINT_COMPUTE,
+ device->queries.avail_pipeline);
+
+ v3dv_CmdBindDescriptorSets(vk_cmd_buffer,
+ VK_PIPELINE_BIND_POINT_COMPUTE,
+ device->queries.avail_pipeline_layout,
+ 0, 1, &pool->meta.descriptor_set,
+ 0, NULL);
+
+ struct {
+ uint32_t offset;
+ uint32_t query;
+ uint8_t availability;
+ } push_data = { pool->occlusion.avail_offset, query, availability };
+ v3dv_CmdPushConstants(vk_cmd_buffer,
+ device->queries.avail_pipeline_layout,
+ VK_SHADER_STAGE_COMPUTE_BIT,
+ 0, sizeof(push_data), &push_data);
+ cmd_buffer_emit_dispatch_queries(cmd_buffer, count);
+
+ v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false);
+}
+
+static void
+cmd_buffer_emit_reset_occlusion_query_pool(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_query_pool *pool,
+ uint32_t query, uint32_t count)
+{
+ struct v3dv_device *device = cmd_buffer->device;
+ VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
+
+ /* Ensure the GPU is done with the queries in the graphics queue before
+ * we reset them in the compute queue.
+ */
+ VkMemoryBarrier2 barrier = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
+ .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
+ .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+ };
+ VkDependencyInfo barrier_info = {
+ .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+ .memoryBarrierCount = 1,
+ .pMemoryBarriers = &barrier,
+ };
+ v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info);
+
+ /* Emit compute reset */
+ v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);
+
+ v3dv_CmdBindPipeline(vk_cmd_buffer,
+ VK_PIPELINE_BIND_POINT_COMPUTE,
+ device->queries.reset_occlusion_pipeline);
+
+ v3dv_CmdBindDescriptorSets(vk_cmd_buffer,
+ VK_PIPELINE_BIND_POINT_COMPUTE,
+ device->queries.reset_occlusion_pipeline_layout,
+ 0, 1, &pool->meta.descriptor_set,
+ 0, NULL);
+ struct {
+ uint32_t offset;
+ uint32_t query;
+ } push_data = { pool->occlusion.avail_offset, query };
+ v3dv_CmdPushConstants(vk_cmd_buffer,
+ device->queries.reset_occlusion_pipeline_layout,
+ VK_SHADER_STAGE_COMPUTE_BIT,
+ 0, sizeof(push_data), &push_data);
+
+ cmd_buffer_emit_dispatch_queries(cmd_buffer, count);
+
+ v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false);
+
+ /* Ensure future work in the graphics queue using the queries doesn't start
+ * before the reset has completed.
+ */
+ barrier = (VkMemoryBarrier2) {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
+ .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+ .dstStageMask = VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT,
+ };
+ barrier_info = (VkDependencyInfo) {
+ .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+ .memoryBarrierCount = 1,
+ .pMemoryBarriers = &barrier,
+ };
+ v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info);
+}
+
+static void
+cmd_buffer_emit_reset_query_pool(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_query_pool *pool,
+ uint32_t first, uint32_t count)
+{
+ assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION);
+ cmd_buffer_emit_reset_occlusion_query_pool(cmd_buffer, pool, first, count);
+}
+
+static void
+cmd_buffer_emit_reset_query_pool_cpu(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_query_pool *pool,
+ uint32_t first, uint32_t count)
+{
+ assert(pool->query_type != VK_QUERY_TYPE_OCCLUSION);
+
+ struct v3dv_job *job =
+ v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
+ V3DV_JOB_TYPE_CPU_RESET_QUERIES,
+ cmd_buffer, -1);
+ v3dv_return_if_oom(cmd_buffer, NULL);
+ job->cpu.query_reset.pool = pool;
+ job->cpu.query_reset.first = first;
+ job->cpu.query_reset.count = count;
+ list_addtail(&job->list_link, &cmd_buffer->jobs);
+}
+
VKAPI_ATTR void VKAPI_CALL
v3dv_CmdResetQueryPool(VkCommandBuffer commandBuffer,
VkQueryPool queryPool,
@@ -307,7 +949,261 @@ v3dv_CmdResetQueryPool(VkCommandBuffer commandBuffer,
V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
- v3dv_cmd_buffer_reset_queries(cmd_buffer, pool, firstQuery, queryCount);
+ /* Resets can only happen outside a render pass instance so we should not
+ * be in the middle of job recording.
+ */
+ assert(cmd_buffer->state.pass == NULL);
+ assert(cmd_buffer->state.job == NULL);
+
+ assert(firstQuery < pool->query_count);
+ assert(firstQuery + queryCount <= pool->query_count);
+
+ /* We can reset occlusion queries in the GPU, but for other query types
+ * we emit a CPU job that will call v3dv_reset_query_pool_cpu when executed
+ * in the queue.
+ */
+ if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
+ cmd_buffer_emit_reset_query_pool(cmd_buffer, pool, firstQuery, queryCount);
+ } else {
+ cmd_buffer_emit_reset_query_pool_cpu(cmd_buffer, pool,
+ firstQuery, queryCount);
+ }
+}
+
+/**
+ * Creates a descriptor pool so we can create descriptors for the destination
+ * buffers of vkCmdCopyQueryResults for queries where this is implemented in
+ * the GPU.
+ */
+static VkResult
+create_storage_buffer_descriptor_pool(struct v3dv_cmd_buffer *cmd_buffer)
+{
+ /* If this is not the first pool we create for this command buffer, size it
+ * based on the size of the currently exhausted pool.
+ */
+ uint32_t descriptor_count = 32;
+ if (cmd_buffer->meta.query.dspool != VK_NULL_HANDLE) {
+ struct v3dv_descriptor_pool *exhausted_pool =
+ v3dv_descriptor_pool_from_handle(cmd_buffer->meta.query.dspool);
+ descriptor_count = MIN2(exhausted_pool->max_entry_count * 2, 1024);
+ }
+
+ /* Create the descriptor pool */
+ cmd_buffer->meta.query.dspool = VK_NULL_HANDLE;
+ VkDescriptorPoolSize pool_size = {
+ .type = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
+ .descriptorCount = descriptor_count,
+ };
+ VkDescriptorPoolCreateInfo info = {
+ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
+ .maxSets = descriptor_count,
+ .poolSizeCount = 1,
+ .pPoolSizes = &pool_size,
+ .flags = 0,
+ };
+ VkResult result =
+ v3dv_CreateDescriptorPool(v3dv_device_to_handle(cmd_buffer->device),
+ &info,
+ &cmd_buffer->device->vk.alloc,
+ &cmd_buffer->meta.query.dspool);
+
+ if (result == VK_SUCCESS) {
+ assert(cmd_buffer->meta.query.dspool != VK_NULL_HANDLE);
+ const VkDescriptorPool vk_pool = cmd_buffer->meta.query.dspool;
+
+ v3dv_cmd_buffer_add_private_obj(
+ cmd_buffer, (uintptr_t) vk_pool,
+ (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyDescriptorPool);
+
+ struct v3dv_descriptor_pool *pool =
+ v3dv_descriptor_pool_from_handle(vk_pool);
+ pool->is_driver_internal = true;
+ }
+
+ return result;
+}
+
+static VkResult
+allocate_storage_buffer_descriptor_set(struct v3dv_cmd_buffer *cmd_buffer,
+ VkDescriptorSet *set)
+{
+ /* Make sure we have a descriptor pool */
+ VkResult result;
+ if (cmd_buffer->meta.query.dspool == VK_NULL_HANDLE) {
+ result = create_storage_buffer_descriptor_pool(cmd_buffer);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+ assert(cmd_buffer->meta.query.dspool != VK_NULL_HANDLE);
+
+ /* Allocate descriptor set */
+ struct v3dv_device *device = cmd_buffer->device;
+ VkDevice vk_device = v3dv_device_to_handle(device);
+ VkDescriptorSetAllocateInfo info = {
+ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
+ .descriptorPool = cmd_buffer->meta.query.dspool,
+ .descriptorSetCount = 1,
+ .pSetLayouts = &device->queries.buf_descriptor_set_layout,
+ };
+ result = v3dv_AllocateDescriptorSets(vk_device, &info, set);
+
+ /* If we ran out of pool space, grow the pool and try again */
+ if (result == VK_ERROR_OUT_OF_POOL_MEMORY) {
+ result = create_storage_buffer_descriptor_pool(cmd_buffer);
+ if (result == VK_SUCCESS) {
+ info.descriptorPool = cmd_buffer->meta.query.dspool;
+ result = v3dv_AllocateDescriptorSets(vk_device, &info, set);
+ }
+ }
+
+ return result;
+}
+
+static uint32_t
+copy_pipeline_index_from_flags(VkQueryResultFlags flags)
+{
+ uint32_t index = 0;
+ if (flags & VK_QUERY_RESULT_64_BIT)
+ index |= 1;
+ if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
+ index |= 2;
+ if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
+ index |= 4;
+ assert(index < 8);
+ return index;
+}
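/* Example (illustrative): flags = VK_QUERY_RESULT_64_BIT |
 * VK_QUERY_RESULT_WITH_AVAILABILITY_BIT maps to index 1 | 2 = 3, so at most
 * eight specialized copy pipelines exist, one per flag combination.
 */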
+
+static nir_shader *
+get_copy_query_results_cs(VkQueryResultFlags flags);
+
+static void
+cmd_buffer_emit_copy_query_pool_results(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_query_pool *pool,
+ uint32_t first, uint32_t count,
+ struct v3dv_buffer *buf,
+ uint32_t offset, uint32_t stride,
+ VkQueryResultFlags flags)
+{
+ struct v3dv_device *device = cmd_buffer->device;
+ VkDevice vk_device = v3dv_device_to_handle(device);
+ VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
+
+ /* Create the required copy pipeline if not yet created */
+ uint32_t pipeline_idx = copy_pipeline_index_from_flags(flags);
+ if (!device->queries.copy_pipeline[pipeline_idx]) {
+ nir_shader *copy_query_results_cs_nir = get_copy_query_results_cs(flags);
+ VkResult result =
+ v3dv_create_compute_pipeline_from_nir(
+ device, copy_query_results_cs_nir,
+ device->queries.copy_pipeline_layout,
+ &device->queries.copy_pipeline[pipeline_idx]);
+ ralloc_free(copy_query_results_cs_nir);
+ if (result != VK_SUCCESS) {
+ fprintf(stderr, "Failed to create copy query results pipeline\n");
+ return;
+ }
+ }
+
+ /* FIXME: do we need this barrier? Since vkCmdEndQuery should've been called
+ * by now and that already waits, maybe we don't need it (this is serialized
+ * in the compute queue with EndQuery anyway).
+ */
+ if (flags & VK_QUERY_RESULT_WAIT_BIT) {
+ VkMemoryBarrier2 barrier = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
+ .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
+ .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+ };
+ VkDependencyInfo barrier_info = {
+ .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+ .memoryBarrierCount = 1,
+ .pMemoryBarriers = &barrier,
+ };
+ v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info);
+ }
+
+ /* Allocate and setup descriptor set for output buffer */
+ VkDescriptorSet out_buf_descriptor_set;
+ VkResult result =
+ allocate_storage_buffer_descriptor_set(cmd_buffer,
+ &out_buf_descriptor_set);
+ if (result != VK_SUCCESS) {
+ fprintf(stderr, "vkCmdCopyQueryPoolResults failed: "
+ "could not allocate descriptor.\n");
+ return;
+ }
+
+ VkDescriptorBufferInfo desc_buf_info = {
+ .buffer = v3dv_buffer_to_handle(buf),
+ .offset = 0,
+ .range = VK_WHOLE_SIZE,
+ };
+ VkWriteDescriptorSet write = {
+ .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+ .dstSet = out_buf_descriptor_set,
+ .dstBinding = 0,
+ .dstArrayElement = 0,
+ .descriptorCount = 1,
+ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ .pBufferInfo = &desc_buf_info,
+ };
+ v3dv_UpdateDescriptorSets(vk_device, 1, &write, 0, NULL);
+
+ /* Dispatch copy */
+ v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);
+
+ assert(device->queries.copy_pipeline[pipeline_idx]);
+ v3dv_CmdBindPipeline(vk_cmd_buffer,
+ VK_PIPELINE_BIND_POINT_COMPUTE,
+ device->queries.copy_pipeline[pipeline_idx]);
+
+ VkDescriptorSet sets[2] = {
+ pool->meta.descriptor_set,
+ out_buf_descriptor_set,
+ };
+ v3dv_CmdBindDescriptorSets(vk_cmd_buffer,
+ VK_PIPELINE_BIND_POINT_COMPUTE,
+ device->queries.copy_pipeline_layout,
+ 0, 2, sets, 0, NULL);
+
+ struct {
+ uint32_t avail_offset, first, offset, stride, flags;
+ } push_data = { pool->occlusion.avail_offset, first, offset, stride, flags };
+ v3dv_CmdPushConstants(vk_cmd_buffer,
+ device->queries.copy_pipeline_layout,
+ VK_SHADER_STAGE_COMPUTE_BIT,
+ 0, sizeof(push_data), &push_data);
+
+ cmd_buffer_emit_dispatch_queries(cmd_buffer, count);
+
+ v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false);
+}
+
+static void
+cmd_buffer_emit_copy_query_pool_results_cpu(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_query_pool *pool,
+ uint32_t first,
+ uint32_t count,
+ struct v3dv_buffer *dst,
+ uint32_t offset,
+ uint32_t stride,
+ VkQueryResultFlags flags)
+{
+ struct v3dv_job *job =
+ v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
+ V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS,
+ cmd_buffer, -1);
+ v3dv_return_if_oom(cmd_buffer, NULL);
+
+ job->cpu.query_copy_results.pool = pool;
+ job->cpu.query_copy_results.first = first;
+ job->cpu.query_copy_results.count = count;
+ job->cpu.query_copy_results.dst = dst;
+ job->cpu.query_copy_results.offset = offset;
+ job->cpu.query_copy_results.stride = stride;
+ job->cpu.query_copy_results.flags = flags;
+
+ list_addtail(&job->list_link, &cmd_buffer->jobs);
}
VKAPI_ATTR void VKAPI_CALL
@@ -324,9 +1220,30 @@ v3dv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
V3DV_FROM_HANDLE(v3dv_buffer, dst, dstBuffer);
- v3dv_cmd_buffer_copy_query_results(cmd_buffer, pool,
- firstQuery, queryCount,
- dst, dstOffset, stride, flags);
+ /* Copies can only happen outside a render pass instance so we should not
+ * be in the middle of job recording.
+ */
+ assert(cmd_buffer->state.pass == NULL);
+ assert(cmd_buffer->state.job == NULL);
+
+ assert(firstQuery < pool->query_count);
+ assert(firstQuery + queryCount <= pool->query_count);
+
+ /* For occlusion queries we implement the copy in the GPU but for other
+ * queries we emit a CPU job that will call v3dv_get_query_pool_results_cpu
+ * when executed in the queue.
+ */
+ if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
+ cmd_buffer_emit_copy_query_pool_results(cmd_buffer, pool,
+ firstQuery, queryCount,
+ dst, (uint32_t) dstOffset,
+ (uint32_t) stride, flags);
+ } else {
+ cmd_buffer_emit_copy_query_pool_results_cpu(cmd_buffer, pool,
+ firstQuery, queryCount,
+ dst, (uint32_t)dstOffset,
+ (uint32_t) stride, flags);
+ }
}
VKAPI_ATTR void VKAPI_CALL
@@ -351,3 +1268,537 @@ v3dv_CmdEndQuery(VkCommandBuffer commandBuffer,
v3dv_cmd_buffer_end_query(cmd_buffer, pool, query);
}
+
+void
+v3dv_reset_query_pool_cpu(struct v3dv_device *device,
+ struct v3dv_query_pool *pool,
+ uint32_t first,
+ uint32_t count)
+{
+ mtx_lock(&device->query_mutex);
+
+ if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
+ assert(first + count <= pool->query_count);
+
+ /* Reset timestamp */
+ uint8_t *base_addr;
+ base_addr = ((uint8_t *) pool->timestamp.bo->map) +
+ pool->queries[first].timestamp.offset;
+ memset(base_addr, 0, 8 * count);
+
+ for (uint32_t i = first; i < first + count; i++) {
+ if (vk_sync_reset(&device->vk, pool->queries[i].timestamp.sync) != VK_SUCCESS)
+ fprintf(stderr, "Failed to reset sync");
+ }
+
+ mtx_unlock(&device->query_mutex);
+ return;
+ }
+
+ for (uint32_t i = first; i < first + count; i++) {
+ assert(i < pool->query_count);
+ struct v3dv_query *q = &pool->queries[i];
+ q->maybe_available = false;
+ switch (pool->query_type) {
+ case VK_QUERY_TYPE_OCCLUSION: {
+ /* Reset availability */
+ uint8_t *base_addr = ((uint8_t *) pool->occlusion.bo->map) +
+ pool->occlusion.avail_offset + first;
+ memset(base_addr, 0, count);
+
+ /* Reset occlusion counter */
+ const uint8_t *q_addr =
+ ((uint8_t *) pool->occlusion.bo->map) + q->occlusion.offset;
+ uint32_t *counter = (uint32_t *) q_addr;
+ *counter = 0;
+ break;
+ }
+ case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
+ kperfmon_destroy(device, pool, i);
+ kperfmon_create(device, pool, i);
+ if (vk_sync_reset(&device->vk, q->perf.last_job_sync) != VK_SUCCESS)
+ fprintf(stderr, "Failed to reset sync");
+ break;
+ default:
+ unreachable("Unsupported query type");
+ }
+ }
+
+ mtx_unlock(&device->query_mutex);
+}
+
+VKAPI_ATTR void VKAPI_CALL
+v3dv_ResetQueryPool(VkDevice _device,
+ VkQueryPool queryPool,
+ uint32_t firstQuery,
+ uint32_t queryCount)
+{
+ V3DV_FROM_HANDLE(v3dv_device, device, _device);
+ V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
+
+ v3dv_reset_query_pool_cpu(device, pool, firstQuery, queryCount);
+}
+
+VKAPI_ATTR VkResult VKAPI_CALL
+v3dv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
+ VkPhysicalDevice physicalDevice,
+ uint32_t queueFamilyIndex,
+ uint32_t *pCounterCount,
+ VkPerformanceCounterKHR *pCounters,
+ VkPerformanceCounterDescriptionKHR *pCounterDescriptions)
+{
+ V3DV_FROM_HANDLE(v3dv_physical_device, pDevice, physicalDevice);
+
+ return v3dv_X(pDevice, enumerate_performance_query_counters)(pCounterCount,
+ pCounters,
+ pCounterDescriptions);
+}
+
+VKAPI_ATTR void VKAPI_CALL
+v3dv_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
+ VkPhysicalDevice physicalDevice,
+ const VkQueryPoolPerformanceCreateInfoKHR *pPerformanceQueryCreateInfo,
+ uint32_t *pNumPasses)
+{
+ *pNumPasses = DIV_ROUND_UP(pPerformanceQueryCreateInfo->counterIndexCount,
+ DRM_V3D_MAX_PERF_COUNTERS);
+}
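/* Worked example (illustrative): assuming DRM_V3D_MAX_PERF_COUNTERS is 32, a
 * configuration enabling 100 counters reports DIV_ROUND_UP(100, 32) = 4 passes.
 */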
+
+VKAPI_ATTR VkResult VKAPI_CALL
+v3dv_AcquireProfilingLockKHR(
+ VkDevice _device,
+ const VkAcquireProfilingLockInfoKHR *pInfo)
+{
+ return VK_SUCCESS;
+}
+
+VKAPI_ATTR void VKAPI_CALL
+v3dv_ReleaseProfilingLockKHR(VkDevice device)
+{
+}
+
+static inline void
+nir_set_query_availability(nir_builder *b,
+ nir_def *buf,
+ nir_def *offset,
+ nir_def *query_idx,
+ nir_def *avail)
+{
+ offset = nir_iadd(b, offset, query_idx); /* we use 1B per query */
+ nir_store_ssbo(b, avail, buf, offset, .write_mask = 0x1, .align_mul = 1);
+}
+
+static inline nir_def *
+nir_get_query_availability(nir_builder *b,
+ nir_def *buf,
+ nir_def *offset,
+ nir_def *query_idx)
+{
+ offset = nir_iadd(b, offset, query_idx); /* we use 1B per query */
+ nir_def *avail = nir_load_ssbo(b, 1, 8, buf, offset, .align_mul = 1);
+ return nir_i2i32(b, avail);
+}
+
+static nir_shader *
+get_set_query_availability_cs()
+{
+ const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
+ nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options,
+ "set query availability cs");
+
+ nir_def *buf =
+ nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0),
+ .desc_set = 0,
+ .binding = 0,
+ .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
+
+ /* This assumes a local size of 1 and a horizontal-only dispatch. If we
+ * ever change any of these parameters we need to update how we compute the
+ * query index here.
+ */
+ nir_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b), 0);
+
+ nir_def *offset =
+ nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4);
+
+ nir_def *query_idx =
+ nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 4, .range = 4);
+
+ nir_def *avail =
+ nir_load_push_constant(&b, 1, 8, nir_imm_int(&b, 0), .base = 8, .range = 1);
+
+ query_idx = nir_iadd(&b, query_idx, wg_id);
+ nir_set_query_availability(&b, buf, offset, query_idx, avail);
+
+ return b.shader;
+}
+
+static inline nir_def *
+nir_get_occlusion_counter_offset(nir_builder *b, nir_def *query_idx)
+{
+ nir_def *query_group = nir_udiv_imm(b, query_idx, 16);
+ nir_def *query_group_offset = nir_umod_imm(b, query_idx, 16);
+ nir_def *offset =
+ nir_iadd(b, nir_imul_imm(b, query_group, 1024),
+ nir_imul_imm(b, query_group_offset, 4));
+ return offset;
+}
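/* Note: this mirrors the CPU-side layout used at pool creation time, e.g.
 * query_idx = 20 -> group 1, lane 4 -> byte offset 1024 + 4 * 4 = 1040.
 */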
+
+static inline void
+nir_reset_occlusion_counter(nir_builder *b,
+ nir_def *buf,
+ nir_def *query_idx)
+{
+ nir_def *offset = nir_get_occlusion_counter_offset(b, query_idx);
+ nir_def *zero = nir_imm_int(b, 0);
+ nir_store_ssbo(b, zero, buf, offset, .write_mask = 0x1, .align_mul = 4);
+}
+
+static inline nir_def *
+nir_read_occlusion_counter(nir_builder *b,
+ nir_def *buf,
+ nir_def *query_idx)
+{
+ nir_def *offset = nir_get_occlusion_counter_offset(b, query_idx);
+ return nir_load_ssbo(b, 1, 32, buf, offset, .access = 0, .align_mul = 4);
+}
+
+static nir_shader *
+get_reset_occlusion_query_cs()
+{
+ const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
+ nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options,
+ "reset occlusion query cs");
+
+ nir_def *buf =
+ nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0),
+ .desc_set = 0,
+ .binding = 0,
+ .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
+
+ /* This assumes a local size of 1 and a horizontal-only dispatch. If we
+ * ever change any of these parameters we need to update how we compute the
+ * query index here.
+ */
+ nir_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b), 0);
+
+ nir_def *avail_offset =
+ nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4);
+
+ nir_def *base_query_idx =
+ nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 4, .range = 4);
+
+ nir_def *query_idx = nir_iadd(&b, base_query_idx, wg_id);
+
+ nir_set_query_availability(&b, buf, avail_offset, query_idx,
+ nir_imm_intN_t(&b, 0, 8));
+ nir_reset_occlusion_counter(&b, buf, query_idx);
+
+ return b.shader;
+}
+
+static void
+write_query_buffer(nir_builder *b,
+ nir_def *buf,
+ nir_def **offset,
+ nir_def *value,
+ bool flag_64bit)
+{
+ if (flag_64bit) {
+ /* Create a 64-bit value using a vec2 with the .Y component set to 0
+ * so we can write a 64-bit value in a single store.
+ */
+ nir_def *value64 = nir_vec2(b, value, nir_imm_int(b, 0));
+ nir_store_ssbo(b, value64, buf, *offset, .write_mask = 0x3, .align_mul = 8);
+ *offset = nir_iadd_imm(b, *offset, 8);
+ } else {
+ nir_store_ssbo(b, value, buf, *offset, .write_mask = 0x1, .align_mul = 4);
+ *offset = nir_iadd_imm(b, *offset, 4);
+ }
+}
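/* Note: the occlusion counter itself is only 32-bit, so for 64-bit results the
 * vec2 above stores the value in .x and zero in .y, i.e. a zero-extended
 * 64-bit result written with a single SSBO store.
 */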
+
+static nir_shader *
+get_copy_query_results_cs(VkQueryResultFlags flags)
+{
+ bool flag_64bit = flags & VK_QUERY_RESULT_64_BIT;
+ bool flag_avail = flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT;
+ bool flag_partial = flags & VK_QUERY_RESULT_PARTIAL_BIT;
+
+ const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
+ nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options,
+ "copy query results cs");
+
+ nir_def *buf =
+ nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0),
+ .desc_set = 0,
+ .binding = 0,
+ .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
+
+ nir_def *buf_out =
+ nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0),
+ .desc_set = 1,
+ .binding = 0,
+ .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
+
+ /* Read push constants */
+ nir_def *avail_offset =
+ nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4);
+
+ nir_def *base_query_idx =
+ nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 4, .range = 4);
+
+ nir_def *base_offset_out =
+ nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 8, .range = 4);
+
+ nir_def *stride =
+ nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 12, .range = 4);
+
+ /* This assumes a local size of 1 and a horizontal-only dispatch. If we
+ * ever change any of these parameters we need to update how we compute the
+ * query index here.
+ */
+ nir_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b), 0);
+ nir_def *query_idx = nir_iadd(&b, base_query_idx, wg_id);
+
+ /* Read query availability if needed */
+ nir_def *avail = NULL;
+ if (flag_avail || !flag_partial)
+ avail = nir_get_query_availability(&b, buf, avail_offset, query_idx);
+
+ /* Write occlusion query result... */
+ nir_def *offset =
+ nir_iadd(&b, base_offset_out, nir_imul(&b, wg_id, stride));
+
+ /* ...if partial is requested, we always write */
+ if (flag_partial) {
+ nir_def *query_res = nir_read_occlusion_counter(&b, buf, query_idx);
+ write_query_buffer(&b, buf_out, &offset, query_res, flag_64bit);
+ } else {
+ /* ...otherwise, we only write if the query is available */
+ nir_if *if_stmt = nir_push_if(&b, nir_ine_imm(&b, avail, 0));
+ nir_def *query_res = nir_read_occlusion_counter(&b, buf, query_idx);
+ write_query_buffer(&b, buf_out, &offset, query_res, flag_64bit);
+ nir_pop_if(&b, if_stmt);
+ }
+
+ /* Write query availability */
+ if (flag_avail)
+ write_query_buffer(&b, buf_out, &offset, avail, flag_64bit);
+
+ return b.shader;
+}
+
+static bool
+create_query_pipelines(struct v3dv_device *device)
+{
+ VkResult result;
+ VkPipeline pipeline;
+
+ /* Set layout: single storage buffer */
+ if (!device->queries.buf_descriptor_set_layout) {
+ VkDescriptorSetLayoutBinding descriptor_set_layout_binding = {
+ .binding = 0,
+ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ .descriptorCount = 1,
+ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+ };
+ VkDescriptorSetLayoutCreateInfo descriptor_set_layout_info = {
+ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
+ .bindingCount = 1,
+ .pBindings = &descriptor_set_layout_binding,
+ };
+ result =
+ v3dv_CreateDescriptorSetLayout(v3dv_device_to_handle(device),
+ &descriptor_set_layout_info,
+ &device->vk.alloc,
+ &device->queries.buf_descriptor_set_layout);
+ if (result != VK_SUCCESS)
+ return false;
+ }
+
+ /* Set query availability pipeline.
+ *
+ * Pipeline layout:
+ * - 1 storage buffer for the BO with the query availability.
+ * - Push constants:
+ * 0B: offset of the availability info in the buffer (4B)
+ * 4B: base query index (4B)
+ * 8B: availability value (1B)
+ */
+ if (!device->queries.avail_pipeline_layout) {
+ VkPipelineLayoutCreateInfo pipeline_layout_info = {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+ .setLayoutCount = 1,
+ .pSetLayouts = &device->queries.buf_descriptor_set_layout,
+ .pushConstantRangeCount = 1,
+ .pPushConstantRanges =
+ &(VkPushConstantRange) { VK_SHADER_STAGE_COMPUTE_BIT, 0, 9 },
+ };
+
+ result =
+ v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
+ &pipeline_layout_info,
+ &device->vk.alloc,
+ &device->queries.avail_pipeline_layout);
+
+ if (result != VK_SUCCESS)
+ return false;
+ }
+
+ if (!device->queries.avail_pipeline) {
+ nir_shader *set_query_availability_cs_nir = get_set_query_availability_cs();
+ result = v3dv_create_compute_pipeline_from_nir(device,
+ set_query_availability_cs_nir,
+ device->queries.avail_pipeline_layout,
+ &pipeline);
+ ralloc_free(set_query_availability_cs_nir);
+ if (result != VK_SUCCESS)
+ return false;
+
+ device->queries.avail_pipeline = pipeline;
+ }
+
+ /* Reset occlusion query pipeline.
+ *
+ * Pipeline layout:
+ * - 1 storage buffer for the BO with the occlusion and availability data.
+ * - Push constants:
+ * 0B: offset of the availability info in the buffer (4B)
+ * 4B: base query index (4B)
+ */
+ if (!device->queries.reset_occlusion_pipeline_layout) {
+ VkPipelineLayoutCreateInfo pipeline_layout_info = {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+ .setLayoutCount = 1,
+ .pSetLayouts = &device->queries.buf_descriptor_set_layout,
+ .pushConstantRangeCount = 1,
+ .pPushConstantRanges =
+ &(VkPushConstantRange) { VK_SHADER_STAGE_COMPUTE_BIT, 0, 8 },
+ };
+
+ result =
+ v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
+ &pipeline_layout_info,
+ &device->vk.alloc,
+ &device->queries.reset_occlusion_pipeline_layout);
+
+ if (result != VK_SUCCESS)
+ return false;
+ }
+
+ if (!device->queries.reset_occlusion_pipeline) {
+ nir_shader *reset_occlusion_query_cs_nir = get_reset_occlusion_query_cs();
+ result = v3dv_create_compute_pipeline_from_nir(
+ device,
+ reset_occlusion_query_cs_nir,
+ device->queries.reset_occlusion_pipeline_layout,
+ &pipeline);
+ ralloc_free(reset_occlusion_query_cs_nir);
+ if (result != VK_SUCCESS)
+ return false;
+
+ device->queries.reset_occlusion_pipeline = pipeline;
+ }
+
+ /* Copy query results pipelines.
+ *
+ * Pipeline layout:
+ * - 1 storage buffer for the BO with the query availability and occlusion.
+ * - 1 storage buffer for the output.
+ * - Push constants:
+ * 0B: offset of the availability info in the buffer (4B)
+ * 4B: base query index (4B)
+ * 8B: offset into output buffer (4B)
+ * 12B: stride (4B)
+ *
+ * We create multiple specialized pipelines depending on the copy flags
+ * to remove conditionals from the copy shader and get more optimized
+ * pipelines.
+ */
+ if (!device->queries.copy_pipeline_layout) {
+ VkDescriptorSetLayout set_layouts[2] = {
+ device->queries.buf_descriptor_set_layout,
+ device->queries.buf_descriptor_set_layout
+ };
+ VkPipelineLayoutCreateInfo pipeline_layout_info = {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+ .setLayoutCount = 2,
+ .pSetLayouts = set_layouts,
+ .pushConstantRangeCount = 1,
+ .pPushConstantRanges =
+ &(VkPushConstantRange) { VK_SHADER_STAGE_COMPUTE_BIT, 0, 16 },
+ };
+
+ result =
+ v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
+ &pipeline_layout_info,
+ &device->vk.alloc,
+ &device->queries.copy_pipeline_layout);
+
+ if (result != VK_SUCCESS)
+ return false;
+ }
+
+ /* Actual copy pipelines are created lazily on demand since there can be up
+ * to 8 depending on the flags used; however, applications will likely use
+ * the same flags every time, so typically only one pipeline is required.
+ */
+
+ return true;
+}
+
+static void
+destroy_query_pipelines(struct v3dv_device *device)
+{
+ VkDevice _device = v3dv_device_to_handle(device);
+
+ /* Availability pipeline */
+ v3dv_DestroyPipeline(_device, device->queries.avail_pipeline,
+ &device->vk.alloc);
+ device->queries.avail_pipeline = VK_NULL_HANDLE;
+ v3dv_DestroyPipelineLayout(_device, device->queries.avail_pipeline_layout,
+ &device->vk.alloc);
+ device->queries.avail_pipeline_layout = VK_NULL_HANDLE;
+
+ /* Reset occlusion pipeline */
+ v3dv_DestroyPipeline(_device, device->queries.reset_occlusion_pipeline,
+ &device->vk.alloc);
+ device->queries.reset_occlusion_pipeline = VK_NULL_HANDLE;
+ v3dv_DestroyPipelineLayout(_device,
+ device->queries.reset_occlusion_pipeline_layout,
+ &device->vk.alloc);
+ device->queries.reset_occlusion_pipeline_layout = VK_NULL_HANDLE;
+
+ /* Copy pipelines */
+ for (int i = 0; i < 8; i++) {
+ v3dv_DestroyPipeline(_device, device->queries.copy_pipeline[i],
+ &device->vk.alloc);
+ device->queries.copy_pipeline[i] = VK_NULL_HANDLE;
+ }
+ v3dv_DestroyPipelineLayout(_device, device->queries.copy_pipeline_layout,
+ &device->vk.alloc);
+ device->queries.copy_pipeline_layout = VK_NULL_HANDLE;
+
+ v3dv_DestroyDescriptorSetLayout(_device,
+ device->queries.buf_descriptor_set_layout,
+ &device->vk.alloc);
+ device->queries.buf_descriptor_set_layout = VK_NULL_HANDLE;
+}
+
+/**
+ * Allocates device resources for implementing certain types of queries.
+ */
+VkResult
+v3dv_query_allocate_resources(struct v3dv_device *device)
+{
+ if (!create_query_pipelines(device))
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ return VK_SUCCESS;
+}
+
+void
+v3dv_query_free_resources(struct v3dv_device *device)
+{
+ destroy_query_pipelines(device);
+}
diff --git a/src/broadcom/vulkan/v3dv_queue.c b/src/broadcom/vulkan/v3dv_queue.c
index 05343b0a24c..ac981984c4f 100644
--- a/src/broadcom/vulkan/v3dv_queue.c
+++ b/src/broadcom/vulkan/v3dv_queue.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2019 Raspberry Pi
+ * Copyright © 2019 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -25,6 +25,9 @@
#include "drm-uapi/v3d_drm.h"
#include "broadcom/clif/clif_dump.h"
+#include "util/libsync.h"
+#include "util/os_time.h"
+#include "vk_drm_syncobj.h"
#include <errno.h>
#include <time.h>
@@ -34,16 +37,16 @@ v3dv_clif_dump(struct v3dv_device *device,
struct v3dv_job *job,
struct drm_v3d_submit_cl *submit)
{
- if (!(V3D_DEBUG & (V3D_DEBUG_CL |
- V3D_DEBUG_CL_NO_BIN |
- V3D_DEBUG_CLIF)))
+ if (!(V3D_DBG(CL) ||
+ V3D_DBG(CL_NO_BIN) ||
+ V3D_DBG(CLIF)))
return;
struct clif_dump *clif = clif_dump_init(&device->devinfo,
stderr,
- V3D_DEBUG & (V3D_DEBUG_CL |
- V3D_DEBUG_CL_NO_BIN),
- V3D_DEBUG & V3D_DEBUG_CL_NO_BIN);
+ V3D_DBG(CL) ||
+ V3D_DBG(CL_NO_BIN),
+ V3D_DBG(CL_NO_BIN));
set_foreach(job->bos, entry) {
struct v3dv_bo *bo = (void *)entry->key;
@@ -67,131 +70,415 @@ v3dv_clif_dump(struct v3dv_device *device,
clif_dump_destroy(clif);
}
-static uint64_t
-gettime_ns()
+static VkResult
+queue_wait_idle(struct v3dv_queue *queue,
+ struct v3dv_submit_sync_info *sync_info)
{
- struct timespec current;
- clock_gettime(CLOCK_MONOTONIC, &current);
- return (uint64_t)current.tv_sec * NSEC_PER_SEC + current.tv_nsec;
-}
+ int ret = drmSyncobjWait(queue->device->pdevice->render_fd,
+ queue->last_job_syncs.syncs, 4,
+ INT64_MAX, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL,
+ NULL);
+ if (ret)
+ return vk_errorf(queue, VK_ERROR_DEVICE_LOST, "syncobj wait failed: %m");
-static uint64_t
-get_absolute_timeout(uint64_t timeout)
-{
- uint64_t current_time = gettime_ns();
- uint64_t max_timeout = (uint64_t) INT64_MAX - current_time;
+ bool first = true;
+ for (int i = 0; i < 4; i++) {
+ if (!queue->last_job_syncs.first[i])
+ first = false;
+ }
- timeout = MIN2(max_timeout, timeout);
+ /* If we're not the first job, that means we're waiting on some
+ * per-queue-type syncobj which transitively waited on the semaphores,
+ * so we can skip the semaphore wait.
+ */
+ if (first) {
+ VkResult result = vk_sync_wait_many(&queue->device->vk,
+ sync_info->wait_count,
+ sync_info->waits,
+ VK_SYNC_WAIT_COMPLETE,
+ UINT64_MAX);
+ if (result != VK_SUCCESS)
+ return result;
+ }
- return (current_time + timeout);
-}
+ for (int i = 0; i < 4; i++)
+ queue->last_job_syncs.first[i] = false;
-static VkResult
-queue_submit_job(struct v3dv_queue *queue,
- struct v3dv_job *job,
- bool do_sem_wait,
- pthread_t *wait_thread);
+ return VK_SUCCESS;
+}
-/* Waits for active CPU wait threads spawned before the current thread to
- * complete and submit all their GPU jobs.
- */
static void
-cpu_queue_wait_idle(struct v3dv_queue *queue)
+multisync_free(struct v3dv_device *device,
+ struct drm_v3d_multi_sync *ms)
{
- const pthread_t this_thread = pthread_self();
-
-retry:
- mtx_lock(&queue->mutex);
- list_for_each_entry(struct v3dv_queue_submit_wait_info, info,
- &queue->submit_wait_list, list_link) {
- for (uint32_t i = 0; i < info->wait_thread_count; i++) {
- if (info->wait_threads[i].finished)
- continue;
-
- /* Because we are testing this against the list of spawned threads
- * it will never match for the main thread, so when we call this from
- * the main thread we are effectively waiting for all active threads
- * to complete, and otherwise we are only waiting for work submitted
- * before the wait thread that called this (a wait thread should never
- * be waiting for work submitted after it).
- */
- if (info->wait_threads[i].thread == this_thread)
- goto done;
+ vk_free(&device->vk.alloc, (void *)(uintptr_t)ms->out_syncs);
+ vk_free(&device->vk.alloc, (void *)(uintptr_t)ms->in_syncs);
+}
- /* Wait and try again */
- mtx_unlock(&queue->mutex);
- usleep(500); /* 0.5 ms */
- goto retry;
- }
+static struct drm_v3d_sem *
+set_in_syncs(struct v3dv_queue *queue,
+ struct v3dv_job *job,
+ enum v3dv_queue_type queue_sync,
+ uint32_t *count,
+ struct vk_sync_wait *waits,
+ unsigned wait_count,
+ struct v3dv_submit_sync_info *sync_info)
+{
+ struct v3dv_device *device = queue->device;
+ uint32_t n_syncs = 0;
+
+ /* If this is the first job submitted to a given GPU queue in this cmd buf
+ * batch, it has to wait on wait semaphores (if any) before running.
+ */
+ if (queue->last_job_syncs.first[queue_sync])
+ n_syncs = sync_info->wait_count;
+
+   /* If the serialize flag is set, the job needs to be serialized in the
+    * corresponding queues. Notice that we may implement transfer operations
+    * as either CL or TFU jobs.
+ *
+ * FIXME: maybe we could track more precisely if the source of a transfer
+ * barrier is a CL and/or a TFU job.
+ */
+ bool sync_csd = job->serialize & V3DV_BARRIER_COMPUTE_BIT;
+ bool sync_tfu = job->serialize & V3DV_BARRIER_TRANSFER_BIT;
+ bool sync_cl = job->serialize & (V3DV_BARRIER_GRAPHICS_BIT |
+ V3DV_BARRIER_TRANSFER_BIT);
+ bool sync_cpu = job->serialize & V3DV_BARRIER_CPU_BIT;
+
+ *count = n_syncs;
+ if (sync_cl)
+ (*count)++;
+ if (sync_tfu)
+ (*count)++;
+ if (sync_csd)
+ (*count)++;
+ if (sync_cpu)
+ (*count)++;
+
+ *count += wait_count;
+
+ if (!*count)
+ return NULL;
+
+ struct drm_v3d_sem *syncs =
+ vk_zalloc(&device->vk.alloc, *count * sizeof(struct drm_v3d_sem),
+ 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
+
+ if (!syncs)
+ return NULL;
+
+ for (int i = 0; i < n_syncs; i++) {
+ syncs[i].handle =
+ vk_sync_as_drm_syncobj(sync_info->waits[i].sync)->syncobj;
}
-done:
- mtx_unlock(&queue->mutex);
+ for (int i = 0; i < wait_count; i++) {
+ syncs[n_syncs++].handle =
+ vk_sync_as_drm_syncobj(waits[i].sync)->syncobj;
+ }
+
+ if (sync_cl)
+ syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CL];
+
+ if (sync_csd)
+ syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CSD];
+
+ if (sync_tfu)
+ syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_TFU];
+
+ if (sync_cpu)
+ syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CPU];
+
+ assert(n_syncs == *count);
+ return syncs;
}
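
As a worked illustration of the sync counting above (names are those used in this
function; this is an editorial sketch, not part of the patch): a job that is the
first of its batch on the queue and carries a transfer barrier waits on the
submit's semaphores plus both the CL and TFU last-job syncobjs, since transfers
may have been implemented as either kind of job:

   /* job->serialize == V3DV_BARRIER_TRANSFER_BIT, first job in the batch */
   uint32_t count = sync_info->wait_count   /* wait semaphores                */
                  + wait_count              /* extra handler-provided waits   */
                  + 1                       /* last_job_syncs[V3DV_QUEUE_CL]  */
                  + 1;                      /* last_job_syncs[V3DV_QUEUE_TFU] */
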
-static VkResult
-gpu_queue_wait_idle(struct v3dv_queue *queue)
+static struct drm_v3d_sem *
+set_out_syncs(struct v3dv_queue *queue,
+ struct v3dv_job *job,
+ enum v3dv_queue_type queue_sync,
+ uint32_t *count,
+ struct v3dv_submit_sync_info *sync_info,
+ bool signal_syncs)
{
struct v3dv_device *device = queue->device;
- mtx_lock(&device->mutex);
- uint32_t last_job_sync = device->last_job_sync;
- mtx_unlock(&device->mutex);
+ uint32_t n_vk_syncs = signal_syncs ? sync_info->signal_count : 0;
- int ret = drmSyncobjWait(device->pdevice->render_fd,
- &last_job_sync, 1, INT64_MAX, 0, NULL);
- if (ret)
- return VK_ERROR_DEVICE_LOST;
+ /* We always signal the syncobj from `device->last_job_syncs` related to
+ * this v3dv_queue_type to track the last job submitted to this queue.
+ */
+ (*count) = n_vk_syncs + 1;
- return VK_SUCCESS;
+ struct drm_v3d_sem *syncs =
+ vk_zalloc(&device->vk.alloc, *count * sizeof(struct drm_v3d_sem),
+ 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
+
+ if (!syncs)
+ return NULL;
+
+ if (n_vk_syncs) {
+ for (unsigned i = 0; i < n_vk_syncs; i++) {
+ syncs[i].handle =
+ vk_sync_as_drm_syncobj(sync_info->signals[i].sync)->syncobj;
+ }
+ }
+
+ syncs[n_vk_syncs].handle = queue->last_job_syncs.syncs[queue_sync];
+
+ return syncs;
}
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_QueueWaitIdle(VkQueue _queue)
+static void
+set_ext(struct drm_v3d_extension *ext,
+ struct drm_v3d_extension *next,
+ uint32_t id,
+ uintptr_t flags)
{
- V3DV_FROM_HANDLE(v3dv_queue, queue, _queue);
+ ext->next = (uintptr_t)(void *)next;
+ ext->id = id;
+ ext->flags = flags;
+}
- /* Check that we don't have any wait threads running in the CPU first,
- * as these can spawn new GPU jobs.
- */
- cpu_queue_wait_idle(queue);
+/* This function sets the extension for multiple in/out syncobjs. When it is
+ * successful, it sets the extension id to DRM_V3D_EXT_ID_MULTI_SYNC.
+ * Otherwise, the extension id is 0, which means an out-of-memory error.
+ */
+static void
+set_multisync(struct drm_v3d_multi_sync *ms,
+ struct v3dv_submit_sync_info *sync_info,
+ struct vk_sync_wait *waits,
+ unsigned wait_count,
+ struct drm_v3d_extension *next,
+ struct v3dv_device *device,
+ struct v3dv_job *job,
+ enum v3dv_queue_type in_queue_sync,
+ enum v3dv_queue_type out_queue_sync,
+ enum v3d_queue wait_stage,
+ bool signal_syncs)
+{
+ struct v3dv_queue *queue = &device->queue;
+ uint32_t out_sync_count = 0, in_sync_count = 0;
+ struct drm_v3d_sem *out_syncs = NULL, *in_syncs = NULL;
- /* Check we don't have any GPU jobs running */
- return gpu_queue_wait_idle(queue);
+ in_syncs = set_in_syncs(queue, job, in_queue_sync,
+ &in_sync_count, waits, wait_count, sync_info);
+ if (!in_syncs && in_sync_count)
+ goto fail;
+
+ out_syncs = set_out_syncs(queue, job, out_queue_sync,
+ &out_sync_count, sync_info, signal_syncs);
+
+ assert(out_sync_count > 0);
+
+ if (!out_syncs)
+ goto fail;
+
+ set_ext(&ms->base, next, DRM_V3D_EXT_ID_MULTI_SYNC, 0);
+ ms->wait_stage = wait_stage;
+ ms->out_sync_count = out_sync_count;
+ ms->out_syncs = (uintptr_t)(void *)out_syncs;
+ ms->in_sync_count = in_sync_count;
+ ms->in_syncs = (uintptr_t)(void *)in_syncs;
+
+ return;
+
+fail:
+ if (in_syncs)
+ vk_free(&device->vk.alloc, in_syncs);
+ assert(!out_syncs);
+
+ return;
}
static VkResult
-handle_reset_query_cpu_job(struct v3dv_job *job)
+handle_reset_query_cpu_job(struct v3dv_queue *queue,
+ struct v3dv_job *job,
+ struct v3dv_submit_sync_info *sync_info,
+ bool signal_syncs)
{
+ struct v3dv_device *device = queue->device;
struct v3dv_reset_query_cpu_job_info *info = &job->cpu.query_reset;
assert(info->pool);
- /* We are about to reset query counters so we need to make sure that
- * The GPU is not using them. The exception is timestamp queries, since
- * we handle those in the CPU.
- *
- * FIXME: we could avoid blocking the main thread for this if we use
- * submission thread.
+ assert(info->pool->query_type != VK_QUERY_TYPE_OCCLUSION);
+
+ if (device->pdevice->caps.cpu_queue) {
+ assert(info->first + info->count <= info->pool->query_count);
+
+ struct drm_v3d_submit_cpu submit = {0};
+ struct drm_v3d_multi_sync ms = {0};
+
+ uint32_t *syncs = (uint32_t *) malloc(sizeof(uint32_t) * info->count);
+ uintptr_t *kperfmon_ids = NULL;
+
+ if (info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
+ submit.bo_handle_count = 1;
+ submit.bo_handles = (uintptr_t)(void *)&info->pool->timestamp.bo->handle;
+
+ struct drm_v3d_reset_timestamp_query reset = {0};
+
+ set_ext(&reset.base, NULL, DRM_V3D_EXT_ID_CPU_RESET_TIMESTAMP_QUERY, 0);
+
+ reset.count = info->count;
+ reset.offset = info->pool->queries[info->first].timestamp.offset;
+
+ for (uint32_t i = 0; i < info->count; i++) {
+ struct v3dv_query *query = &info->pool->queries[info->first + i];
+ syncs[i] = vk_sync_as_drm_syncobj(query->timestamp.sync)->syncobj;
+ }
+
+ reset.syncs = (uintptr_t)(void *)syncs;
+
+ set_multisync(&ms, sync_info, NULL, 0, (void *)&reset, device, job,
+ V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
+ if (!ms.base.id)
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ } else {
+ assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
+ struct drm_v3d_reset_performance_query reset = {0};
+
+ set_ext(&reset.base, NULL, DRM_V3D_EXT_ID_CPU_RESET_PERFORMANCE_QUERY, 0);
+
+ struct vk_sync_wait waits[info->count];
+ unsigned wait_count = 0;
+ for (int i = 0; i < info->count; i++) {
+ struct v3dv_query *query = &info->pool->queries[info->first + i];
+         /* Only wait for a query if we've used it; otherwise we will be
+          * waiting forever for the fence to become signaled.
+ */
+ if (query->maybe_available) {
+ waits[wait_count] = (struct vk_sync_wait){
+ .sync = query->perf.last_job_sync
+ };
+ wait_count++;
+ };
+ }
+
+ reset.count = info->count;
+ reset.nperfmons = info->pool->perfmon.nperfmons;
+
+ kperfmon_ids = (uintptr_t *) malloc(sizeof(uintptr_t) * info->count);
+
+ for (uint32_t i = 0; i < info->count; i++) {
+ struct v3dv_query *query = &info->pool->queries[info->first + i];
+
+ syncs[i] = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
+ kperfmon_ids[i] = (uintptr_t)(void *)query->perf.kperfmon_ids;
+ }
+
+ reset.syncs = (uintptr_t)(void *)syncs;
+ reset.kperfmon_ids = (uintptr_t)(void *)kperfmon_ids;
+
+ set_multisync(&ms, sync_info, waits, wait_count, (void *)&reset, device, job,
+ V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
+ if (!ms.base.id)
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ }
+
+ submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
+ submit.extensions = (uintptr_t)(void *)&ms;
+
+ /* From the Vulkan spec for vkCmdResetQueryPool:
+ *
+ * "This command defines an execution dependency between other query commands
+ * that reference the same query.
+ * ...
+ * The second synchronization scope includes all commands which reference the
+ * queries in queryPool indicated by firstQuery and queryCount that occur later
+ * in submission order."
+ *
+ * This means we should ensure that any timestamps after a reset don't execute before
+    * the reset; however, for timestamp queries in particular we don't have to do
+ * anything special because timestamp queries have to wait for all previously
+ * submitted work to complete before executing (which we accomplish by using
+ * V3DV_BARRIER_ALL on them) and that includes reset jobs submitted to the CPU queue.
+ */
+ int ret = v3dv_ioctl(device->pdevice->render_fd,
+ DRM_IOCTL_V3D_SUBMIT_CPU, &submit);
+
+ free(syncs);
+ free(kperfmon_ids);
+ multisync_free(device, &ms);
+
+ queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;
+
+ if (ret)
+ return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");
+
+ return VK_SUCCESS;
+ }
+
+ /* We are about to reset query counters in user-space so we need to make
+ * sure that the GPU is not using them.
*/
- if (info->pool->query_type == VK_QUERY_TYPE_OCCLUSION)
- v3dv_bo_wait(job->device, info->pool->bo, PIPE_TIMEOUT_INFINITE);
-
- for (uint32_t i = info->first; i < info->first + info->count; i++) {
- assert(i < info->pool->query_count);
- struct v3dv_query *q = &info->pool->queries[i];
- q->maybe_available = false;
- switch (info->pool->query_type) {
- case VK_QUERY_TYPE_OCCLUSION: {
- const uint8_t *q_addr = ((uint8_t *) q->bo->map) + q->offset;
- uint32_t *counter = (uint32_t *) q_addr;
- *counter = 0;
- break;
+ if (info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
+ VkResult result = queue_wait_idle(queue, sync_info);
+ if (result != VK_SUCCESS)
+ return result;
+
+ v3dv_bo_wait(job->device, info->pool->timestamp.bo, OS_TIMEOUT_INFINITE);
+ }
+
+ if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+ struct vk_sync_wait waits[info->count];
+ unsigned wait_count = 0;
+ for (int i = 0; i < info->count; i++) {
+ struct v3dv_query *query = &info->pool->queries[info->first + i];
+      /* Only wait for a query if we've used it; otherwise we will be
+       * waiting forever for the fence to become signaled.
+ */
+ if (query->maybe_available) {
+ waits[wait_count] = (struct vk_sync_wait){
+ .sync = query->perf.last_job_sync
+ };
+ wait_count++;
+ };
}
- case VK_QUERY_TYPE_TIMESTAMP:
- q->value = 0;
- break;
- default:
- unreachable("Unsupported query type");
+
+ VkResult result = vk_sync_wait_many(&job->device->vk, wait_count, waits,
+ VK_SYNC_WAIT_COMPLETE, UINT64_MAX);
+
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
+ v3dv_reset_query_pool_cpu(job->device, info->pool, info->first, info->count);
+
+ return VK_SUCCESS;
+}
+
+static VkResult
+export_perfmon_last_job_sync(struct v3dv_queue *queue, struct v3dv_job *job, int *fd)
+{
+ int err;
+ static const enum v3dv_queue_type queues_to_sync[] = {
+ V3DV_QUEUE_CL,
+ V3DV_QUEUE_CSD,
+ };
+
+ for (uint32_t i = 0; i < ARRAY_SIZE(queues_to_sync); i++) {
+ enum v3dv_queue_type queue_type = queues_to_sync[i];
+ int tmp_fd = -1;
+
+ err = drmSyncobjExportSyncFile(job->device->pdevice->render_fd,
+ queue->last_job_syncs.syncs[queue_type],
+ &tmp_fd);
+
+ if (err) {
+ close(*fd);
+ return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
+ "sync file export failed: %m");
+ }
+
+ err = sync_accumulate("v3dv", fd, tmp_fd);
+
+ if (err) {
+ close(tmp_fd);
+ close(*fd);
+ return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
+ "failed to accumulate sync files: %m");
}
}
@@ -199,36 +486,200 @@ handle_reset_query_cpu_job(struct v3dv_job *job)
}
static VkResult
-handle_end_query_cpu_job(struct v3dv_job *job)
+handle_end_query_cpu_job(struct v3dv_job *job, uint32_t counter_pass_idx)
{
- struct v3dv_end_query_cpu_job_info *info = &job->cpu.query_end;
+ VkResult result = VK_SUCCESS;
+
+ mtx_lock(&job->device->query_mutex);
+
+ struct v3dv_end_query_info *info = &job->cpu.query_end;
+ struct v3dv_queue *queue = &job->device->queue;
+
+ int err = 0;
+ int fd = -1;
+
+ assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
+
+ if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+ result = export_perfmon_last_job_sync(queue, job, &fd);
+
+ if (result != VK_SUCCESS)
+ goto fail;
+
+ assert(fd >= 0);
+ }
+
for (uint32_t i = 0; i < info->count; i++) {
assert(info->query + i < info->pool->query_count);
struct v3dv_query *query = &info->pool->queries[info->query + i];
+
+ if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+ uint32_t syncobj = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
+ err = drmSyncobjImportSyncFile(job->device->pdevice->render_fd,
+ syncobj, fd);
+
+ if (err) {
+ result = vk_errorf(queue, VK_ERROR_UNKNOWN,
+ "sync file import failed: %m");
+ goto fail;
+ }
+ }
+
query->maybe_available = true;
}
- return VK_SUCCESS;
+fail:
+ if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR)
+ close(fd);
+
+ cnd_broadcast(&job->device->query_ended);
+ mtx_unlock(&job->device->query_mutex);
+
+ return result;
}
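
For reference, a simplified sketch of the sync-file round trip the two functions
above implement (error handling and the loops are elided; 'query' stands for one
performance query in the pool): the last CL/CSD job syncobjs are exported to a
sync file, merged with sync_accumulate(), and the merged fence is imported into
the query's last_job_sync so that later waits on the query cover all of that work:

   int fd = -1, tmp_fd = -1;
   int render_fd = job->device->pdevice->render_fd;

   /* export_perfmon_last_job_sync(): export and accumulate per queue type */
   drmSyncobjExportSyncFile(render_fd,
                            queue->last_job_syncs.syncs[V3DV_QUEUE_CL], &tmp_fd);
   sync_accumulate("v3dv", &fd, tmp_fd);

   /* handle_end_query_cpu_job(): import the merged fence into the query */
   uint32_t syncobj = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
   drmSyncobjImportSyncFile(render_fd, syncobj, fd);
   close(fd);
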
static VkResult
-handle_copy_query_results_cpu_job(struct v3dv_job *job)
+handle_copy_query_results_cpu_job(struct v3dv_queue *queue,
+ struct v3dv_job *job,
+ struct v3dv_submit_sync_info *sync_info,
+ bool signal_syncs)
{
+ struct v3dv_device *device = queue->device;
struct v3dv_copy_query_results_cpu_job_info *info =
&job->cpu.query_copy_results;
+ assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
+ info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP);
+
assert(info->dst && info->dst->mem && info->dst->mem->bo);
struct v3dv_bo *bo = info->dst->mem->bo;
+ if (device->pdevice->caps.cpu_queue) {
+ struct drm_v3d_submit_cpu submit = {0};
+ struct drm_v3d_multi_sync ms = {0};
+
+ uint32_t *offsets = (uint32_t *) malloc(sizeof(uint32_t) * info->count);
+ uint32_t *syncs = (uint32_t *) malloc(sizeof(uint32_t) * info->count);
+ uint32_t *bo_handles = NULL;
+ uintptr_t *kperfmon_ids = NULL;
+
+ if (info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
+ submit.bo_handle_count = 2;
+
+ bo_handles = (uint32_t *)
+ malloc(sizeof(uint32_t) * submit.bo_handle_count);
+
+ bo_handles[0] = bo->handle;
+ bo_handles[1] = info->pool->timestamp.bo->handle;
+ submit.bo_handles = (uintptr_t)(void *)bo_handles;
+
+ struct drm_v3d_copy_timestamp_query copy = {0};
+
+ set_ext(&copy.base, NULL, DRM_V3D_EXT_ID_CPU_COPY_TIMESTAMP_QUERY, 0);
+
+ copy.do_64bit = info->flags & VK_QUERY_RESULT_64_BIT;
+ copy.do_partial = info->flags & VK_QUERY_RESULT_PARTIAL_BIT;
+ copy.availability_bit = info->flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT;
+ copy.offset = info->offset + info->dst->mem_offset;
+ copy.stride = info->stride;
+ copy.count = info->count;
+
+ for (uint32_t i = 0; i < info->count; i++) {
+ assert(info->first < info->pool->query_count);
+ assert(info->first + info->count <= info->pool->query_count);
+ struct v3dv_query *query = &info->pool->queries[info->first + i];
+
+ offsets[i] = query->timestamp.offset;
+ syncs[i] = vk_sync_as_drm_syncobj(query->timestamp.sync)->syncobj;
+ }
+
+ copy.offsets = (uintptr_t)(void *)offsets;
+ copy.syncs = (uintptr_t)(void *)syncs;
+
+ set_multisync(&ms, sync_info, NULL, 0, (void *)&copy, device, job,
+ V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
+ if (!ms.base.id)
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ } else {
+ assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
+
+ submit.bo_handle_count = 1;
+ submit.bo_handles = (uintptr_t)(void *)&bo->handle;
+
+ struct drm_v3d_copy_performance_query copy = {0};
+
+ set_ext(&copy.base, NULL, DRM_V3D_EXT_ID_CPU_COPY_PERFORMANCE_QUERY, 0);
+
+ /* If the queryPool was created with VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR,
+ * results for each query are written as an array of the type indicated
+ * by VkPerformanceCounterKHR::storage for the counter being queried.
+ * For v3dv, VkPerformanceCounterKHR::storage is
+ * VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR.
+ */
+ copy.do_64bit = true;
+ copy.do_partial = info->flags & VK_QUERY_RESULT_PARTIAL_BIT;
+ copy.availability_bit = info->flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT;
+ copy.offset = info->offset + info->dst->mem_offset;
+ copy.stride = info->stride;
+ copy.count = info->count;
+ copy.nperfmons = info->pool->perfmon.nperfmons;
+ copy.ncounters = info->pool->perfmon.ncounters;
+
+ kperfmon_ids = (uintptr_t *) malloc(sizeof(uintptr_t) * info->count);
+
+ struct vk_sync_wait waits[info->count];
+ unsigned wait_count = 0;
+
+ for (uint32_t i = 0; i < info->count; i++) {
+ assert(info->first < info->pool->query_count);
+ assert(info->first + info->count <= info->pool->query_count);
+ struct v3dv_query *query = &info->pool->queries[info->first + i];
+
+ syncs[i] = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
+ kperfmon_ids[i] = (uintptr_t)(void *)query->perf.kperfmon_ids;
+
+ if (info->flags & VK_QUERY_RESULT_WAIT_BIT) {
+ waits[wait_count] = (struct vk_sync_wait){
+ .sync = query->perf.last_job_sync
+ };
+ wait_count++;
+ }
+ }
+
+ copy.syncs = (uintptr_t)(void *)syncs;
+ copy.kperfmon_ids = (uintptr_t)(void *)kperfmon_ids;
+
+ set_multisync(&ms, sync_info, waits, wait_count, (void *)&copy, device, job,
+ V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
+ if (!ms.base.id)
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ }
+
+ submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
+ submit.extensions = (uintptr_t)(void *)&ms;
+
+ int ret = v3dv_ioctl(device->pdevice->render_fd,
+ DRM_IOCTL_V3D_SUBMIT_CPU, &submit);
+
+ free(kperfmon_ids);
+ free(bo_handles);
+ free(offsets);
+ free(syncs);
+ multisync_free(device, &ms);
+
+ queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;
+
+ if (ret)
+ return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");
+
+ return VK_SUCCESS;
+ }
+
/* Map the entire dst buffer for the CPU copy if needed */
assert(!bo->map || bo->map_size == bo->size);
if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size))
- return vk_error(job->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);
- /* FIXME: if flags includes VK_QUERY_RESULT_WAIT_BIT this could trigger a
- * sync wait on the CPU for the corresponding GPU jobs to finish. We might
- * want to use a submission thread to avoid blocking on the main thread.
- */
uint8_t *offset = ((uint8_t *) bo->map) +
info->offset + info->dst->mem_offset;
v3dv_get_query_pool_results_cpu(job->device,
@@ -243,344 +694,213 @@ handle_copy_query_results_cpu_job(struct v3dv_job *job)
}
static VkResult
-handle_set_event_cpu_job(struct v3dv_job *job, bool is_wait_thread)
+handle_timestamp_query_cpu_job(struct v3dv_queue *queue,
+ struct v3dv_job *job,
+ struct v3dv_submit_sync_info *sync_info,
+ bool signal_syncs)
{
- /* From the Vulkan 1.0 spec:
- *
- * "When vkCmdSetEvent is submitted to a queue, it defines an execution
- * dependency on commands that were submitted before it, and defines an
- * event signal operation which sets the event to the signaled state.
- * The first synchronization scope includes every command previously
- * submitted to the same queue, including those in the same command
- * buffer and batch".
- *
- * So we should wait for all prior work to be completed before signaling
- * the event, this includes all active CPU wait threads spawned for any
- * command buffer submitted *before* this.
- *
- * FIXME: we could avoid blocking the main thread for this if we use a
- * submission thread.
- */
+ struct v3dv_device *device = queue->device;
- /* If we are calling this from a wait thread it will only wait
- * wait threads sspawned before it, otherwise it will wait for
- * all active threads to complete.
- */
- cpu_queue_wait_idle(&job->device->queue);
+ assert(job->type == V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY);
+ struct v3dv_timestamp_query_cpu_job_info *info = &job->cpu.query_timestamp;
- VkResult result = gpu_queue_wait_idle(&job->device->queue);
- if (result != VK_SUCCESS)
- return result;
+ if (!device->pdevice->caps.cpu_queue) {
+ /* Wait for completion of all work queued before the timestamp query */
+ VkResult result = queue_wait_idle(queue, sync_info);
+ if (result != VK_SUCCESS)
+ return result;
- struct v3dv_event_set_cpu_job_info *info = &job->cpu.event_set;
- p_atomic_set(&info->event->state, info->state);
+ mtx_lock(&job->device->query_mutex);
- return VK_SUCCESS;
-}
+ /* Compute timestamp */
+ struct timespec t;
+ clock_gettime(CLOCK_MONOTONIC, &t);
-static bool
-check_wait_events_complete(struct v3dv_job *job)
-{
- assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);
+ for (uint32_t i = 0; i < info->count; i++) {
+ assert(info->query + i < info->pool->query_count);
+ struct v3dv_query *query = &info->pool->queries[info->query + i];
+ query->maybe_available = true;
- struct v3dv_event_wait_cpu_job_info *info = &job->cpu.event_wait;
- for (uint32_t i = 0; i < info->event_count; i++) {
- if (!p_atomic_read(&info->events[i]->state))
- return false;
- }
- return true;
-}
+ /* Value */
+ uint8_t *value_addr =
+ ((uint8_t *) info->pool->timestamp.bo->map) + query->timestamp.offset;
+ *((uint64_t*)value_addr) = (i == 0) ? t.tv_sec * 1000000000ull + t.tv_nsec : 0ull;
-static void
-wait_thread_finish(struct v3dv_queue *queue, pthread_t thread)
-{
- mtx_lock(&queue->mutex);
- list_for_each_entry(struct v3dv_queue_submit_wait_info, info,
- &queue->submit_wait_list, list_link) {
- for (uint32_t i = 0; i < info->wait_thread_count; i++) {
- if (info->wait_threads[i].thread == thread) {
- info->wait_threads[i].finished = true;
- goto done;
- }
+ /* Availability */
+ result = vk_sync_signal(&job->device->vk, query->timestamp.sync, 0);
}
- }
- unreachable(!"Failed to finish wait thread: not found");
+ cnd_broadcast(&job->device->query_ended);
+ mtx_unlock(&job->device->query_mutex);
-done:
- mtx_unlock(&queue->mutex);
-}
-
-static void *
-event_wait_thread_func(void *_job)
-{
- struct v3dv_job *job = (struct v3dv_job *) _job;
- assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);
- struct v3dv_event_wait_cpu_job_info *info = &job->cpu.event_wait;
-
- /* Wait for events to be signaled */
- const useconds_t wait_interval_ms = 1;
- while (!check_wait_events_complete(job))
- usleep(wait_interval_ms * 1000);
-
- /* Now continue submitting pending jobs for the same command buffer after
- * the wait job.
- */
- struct v3dv_queue *queue = &job->device->queue;
- list_for_each_entry_from(struct v3dv_job, pjob, job->list_link.next,
- &job->cmd_buffer->jobs, list_link) {
- /* We don't want to spawn more than one wait thread per command buffer.
- * If this job also requires a wait for events, we will do the wait here.
- */
- VkResult result = queue_submit_job(queue, pjob, info->sem_wait, NULL);
- if (result == VK_NOT_READY) {
- while (!check_wait_events_complete(pjob)) {
- usleep(wait_interval_ms * 1000);
- }
- result = VK_SUCCESS;
- }
-
- if (result != VK_SUCCESS) {
- fprintf(stderr, "Wait thread job execution failed.\n");
- goto done;
- }
+ return result;
}
-done:
- wait_thread_finish(queue, pthread_self());
- return NULL;
-}
+ struct drm_v3d_submit_cpu submit = {0};
-static VkResult
-spawn_event_wait_thread(struct v3dv_job *job, pthread_t *wait_thread)
+ submit.bo_handle_count = 1;
+ submit.bo_handles = (uintptr_t)(void *)&info->pool->timestamp.bo->handle;
-{
- assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);
- assert(job->cmd_buffer);
- assert(wait_thread != NULL);
+ struct drm_v3d_timestamp_query timestamp = {0};
- if (pthread_create(wait_thread, NULL, event_wait_thread_func, job))
- return vk_error(job->device->instance, VK_ERROR_DEVICE_LOST);
+ set_ext(&timestamp.base, NULL, DRM_V3D_EXT_ID_CPU_TIMESTAMP_QUERY, 0);
- return VK_NOT_READY;
-}
+ timestamp.count = info->count;
-static VkResult
-handle_wait_events_cpu_job(struct v3dv_job *job,
- bool sem_wait,
- pthread_t *wait_thread)
-{
- assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);
- struct v3dv_event_wait_cpu_job_info *info = &job->cpu.event_wait;
+ uint32_t *offsets =
+ (uint32_t *) malloc(sizeof(uint32_t) * info->count);
+ uint32_t *syncs =
+ (uint32_t *) malloc(sizeof(uint32_t) * info->count);
- /* If all events are signaled then we are done and can continue submitting
- * the rest of the command buffer normally.
- */
- if (check_wait_events_complete(job))
- return VK_SUCCESS;
+ for (uint32_t i = 0; i < info->count; i++) {
+ assert(info->query + i < info->pool->query_count);
+ struct v3dv_query *query = &info->pool->queries[info->query + i];
+ query->maybe_available = true;
- /* Otherwise, we put the rest of the command buffer on a wait thread until
- * all events are signaled. We only spawn a new thread on the first
- * wait job we see for a command buffer, any additional wait jobs in the
- * same command buffer will run in that same wait thread and will get here
- * with a NULL wait_thread pointer.
- *
- * Also, whether we spawn a wait thread or not, we always return
- * VK_NOT_READY (unless an error happened), so we stop trying to submit
- * any jobs in the same command buffer after the wait job. The wait thread
- * will attempt to submit them after the wait completes.
- */
- info->sem_wait = sem_wait;
- if (wait_thread)
- return spawn_event_wait_thread(job, wait_thread);
- else
- return VK_NOT_READY;
-}
+ offsets[i] = query->timestamp.offset;
+ syncs[i] = vk_sync_as_drm_syncobj(query->timestamp.sync)->syncobj;
+ }
-static VkResult
-handle_copy_buffer_to_image_cpu_job(struct v3dv_job *job)
-{
- assert(job->type == V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE);
- struct v3dv_copy_buffer_to_image_cpu_job_info *info =
- &job->cpu.copy_buffer_to_image;
+ timestamp.offsets = (uintptr_t)(void *)offsets;
+ timestamp.syncs = (uintptr_t)(void *)syncs;
- /* Wait for all GPU work to finish first, since we may be accessing
- * the BOs involved in the operation.
+ struct drm_v3d_multi_sync ms = {0};
+
+ /* The CPU job should be serialized so it only executes after all previously
+ * submitted work has completed
*/
- v3dv_QueueWaitIdle(v3dv_queue_to_handle(&job->device->queue));
-
- /* Map BOs */
- struct v3dv_bo *dst_bo = info->image->mem->bo;
- assert(!dst_bo->map || dst_bo->map_size == dst_bo->size);
- if (!dst_bo->map && !v3dv_bo_map(job->device, dst_bo, dst_bo->size))
- return vk_error(job->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
- void *dst_ptr = dst_bo->map;
-
- struct v3dv_bo *src_bo = info->buffer->mem->bo;
- assert(!src_bo->map || src_bo->map_size == src_bo->size);
- if (!src_bo->map && !v3dv_bo_map(job->device, src_bo, src_bo->size))
- return vk_error(job->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
- void *src_ptr = src_bo->map;
-
- const struct v3d_resource_slice *slice =
- &info->image->slices[info->mip_level];
-
- const struct pipe_box box = {
- info->image_offset.x, info->image_offset.y, info->base_layer,
- info->image_extent.width, info->image_extent.height, info->layer_count,
- };
+ job->serialize = V3DV_BARRIER_ALL;
- /* Copy each layer */
- for (uint32_t i = 0; i < info->layer_count; i++) {
- const uint32_t dst_offset =
- v3dv_layer_offset(info->image, info->mip_level, info->base_layer + i);
- const uint32_t src_offset =
- info->buffer->mem_offset + info->buffer_offset +
- info->buffer_layer_stride * i;
- v3d_store_tiled_image(
- dst_ptr + dst_offset, slice->stride,
- src_ptr + src_offset, info->buffer_stride,
- slice->tiling, info->image->cpp, slice->padded_height, &box);
- }
+ set_multisync(&ms, sync_info, NULL, 0, (void *)&timestamp, device, job,
+ V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
+ if (!ms.base.id)
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
- return VK_SUCCESS;
-}
+ submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
+ submit.extensions = (uintptr_t)(void *)&ms;
-static VkResult
-handle_timestamp_query_cpu_job(struct v3dv_job *job)
-{
- assert(job->type == V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY);
- struct v3dv_timestamp_query_cpu_job_info *info = &job->cpu.query_timestamp;
+ int ret = v3dv_ioctl(device->pdevice->render_fd,
+ DRM_IOCTL_V3D_SUBMIT_CPU, &submit);
- /* Wait for completion of all work queued before the timestamp query */
- v3dv_QueueWaitIdle(v3dv_queue_to_handle(&job->device->queue));
+ free(offsets);
+ free(syncs);
+ multisync_free(device, &ms);
- /* Compute timestamp */
- struct timespec t;
- clock_gettime(CLOCK_MONOTONIC, &t);
+ queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;
- for (uint32_t i = 0; i < info->count; i++) {
- assert(info->query + i < info->pool->query_count);
- struct v3dv_query *query = &info->pool->queries[info->query + i];
- query->maybe_available = true;
- if (i == 0)
- query->value = t.tv_sec * 1000000000ull + t.tv_nsec;
- }
+ if (ret)
+ return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");
return VK_SUCCESS;
}
static VkResult
-handle_csd_job(struct v3dv_queue *queue,
- struct v3dv_job *job,
- bool do_sem_wait);
-
-static VkResult
handle_csd_indirect_cpu_job(struct v3dv_queue *queue,
struct v3dv_job *job,
- bool do_sem_wait)
+ struct v3dv_submit_sync_info *sync_info,
+ bool signal_syncs)
{
+ struct v3dv_device *device = queue->device;
+
assert(job->type == V3DV_JOB_TYPE_CPU_CSD_INDIRECT);
struct v3dv_csd_indirect_cpu_job_info *info = &job->cpu.csd_indirect;
assert(info->csd_job);
- /* Make sure the GPU is no longer using the indirect buffer*/
- assert(info->buffer && info->buffer->mem && info->buffer->mem->bo);
- v3dv_bo_wait(queue->device, info->buffer->mem->bo, PIPE_TIMEOUT_INFINITE);
-
- /* Map the indirect buffer and read the dispatch parameters */
assert(info->buffer && info->buffer->mem && info->buffer->mem->bo);
struct v3dv_bo *bo = info->buffer->mem->bo;
- if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size))
- return vk_error(job->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
- assert(bo->map);
- const uint32_t offset = info->buffer->mem_offset + info->offset;
- const uint32_t *group_counts = (uint32_t *) (bo->map + offset);
- if (group_counts[0] == 0 || group_counts[1] == 0|| group_counts[2] == 0)
- return VK_SUCCESS;
+ if (!device->pdevice->caps.cpu_queue) {
+      /* Make sure the GPU is no longer using the indirect buffer */
+ v3dv_bo_wait(queue->device, bo, OS_TIMEOUT_INFINITE);
- if (memcmp(group_counts, info->csd_job->csd.wg_count,
- sizeof(info->csd_job->csd.wg_count)) != 0) {
- v3dv_cmd_buffer_rewrite_indirect_csd_job(info, group_counts);
- }
+ /* Map the indirect buffer and read the dispatch parameters */
+ if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size))
+ return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ assert(bo->map);
- handle_csd_job(queue, info->csd_job, do_sem_wait);
+ const uint32_t offset = info->buffer->mem_offset + info->offset;
+ const uint32_t *group_counts = (uint32_t *) (bo->map + offset);
+      if (group_counts[0] == 0 || group_counts[1] == 0 || group_counts[2] == 0)
+ return VK_SUCCESS;
- return VK_SUCCESS;
-}
+ if (memcmp(group_counts, info->csd_job->csd.wg_count,
+ sizeof(info->csd_job->csd.wg_count)) != 0) {
+ v3dv_cmd_buffer_rewrite_indirect_csd_job(queue->device, info, group_counts);
+ }
-static VkResult
-process_semaphores_to_signal(struct v3dv_device *device,
- uint32_t count, const VkSemaphore *sems)
-{
- if (count == 0)
return VK_SUCCESS;
+ }
- int render_fd = device->pdevice->render_fd;
+ struct v3dv_job *csd_job = info->csd_job;
- int fd;
- mtx_lock(&device->mutex);
- drmSyncobjExportSyncFile(render_fd, device->last_job_sync, &fd);
- mtx_unlock(&device->mutex);
- if (fd == -1)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ struct drm_v3d_submit_cpu submit = {0};
- VkResult result = VK_SUCCESS;
- for (uint32_t i = 0; i < count; i++) {
- struct v3dv_semaphore *sem = v3dv_semaphore_from_handle(sems[i]);
-
- int ret;
- if (!sem->temp_sync)
- ret = drmSyncobjImportSyncFile(render_fd, sem->sync, fd);
- else
- ret = drmSyncobjImportSyncFile(render_fd, sem->temp_sync, fd);
-
- if (ret) {
- result = VK_ERROR_OUT_OF_HOST_MEMORY;
- break;
- }
+ submit.bo_handle_count = 1;
+ submit.bo_handles = (uintptr_t)(void *)&bo->handle;
+
+ csd_job->csd.submit.bo_handle_count = csd_job->bo_count;
+ uint32_t *bo_handles = (uint32_t *) malloc(sizeof(uint32_t) * csd_job->bo_count);
+ uint32_t bo_idx = 0;
+ set_foreach (csd_job->bos, entry) {
+ struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
+ bo_handles[bo_idx++] = bo->handle;
}
+ csd_job->csd.submit.bo_handles = (uintptr_t)(void *)bo_handles;
- assert(fd >= 0);
- close(fd);
+ struct drm_v3d_indirect_csd indirect = {0};
- return result;
-}
+ set_ext(&indirect.base, NULL, DRM_V3D_EXT_ID_CPU_INDIRECT_CSD, 0);
-static VkResult
-process_fence_to_signal(struct v3dv_device *device, VkFence _fence)
-{
- if (_fence == VK_NULL_HANDLE)
- return VK_SUCCESS;
+ indirect.submit = csd_job->csd.submit;
+ indirect.offset = info->buffer->mem_offset + info->offset;
+ indirect.wg_size = info->wg_size;
+
+ for (int i = 0; i < 3; i++) {
+ if (info->wg_uniform_offsets[i]) {
+ assert(info->wg_uniform_offsets[i] >= (uint32_t *) csd_job->indirect.base);
+ indirect.wg_uniform_offsets[i] = info->wg_uniform_offsets[i] - (uint32_t *) csd_job->indirect.base;
+ } else {
+ indirect.wg_uniform_offsets[i] = 0xffffffff; /* No rewrite */
+ }
+ }
- struct v3dv_fence *fence = v3dv_fence_from_handle(_fence);
+ indirect.indirect = csd_job->indirect.bo->handle;
- int render_fd = device->pdevice->render_fd;
+ struct drm_v3d_multi_sync ms = {0};
- int fd;
- mtx_lock(&device->mutex);
- drmSyncobjExportSyncFile(render_fd, device->last_job_sync, &fd);
- mtx_unlock(&device->mutex);
- if (fd == -1)
+ /* We need to configure the semaphores of this job with the indirect
+    * CSD job, as the CPU job must obey the CSD job's synchronization
+ * demands, such as barriers.
+ */
+ set_multisync(&ms, sync_info, NULL, 0, (void *)&indirect, device, csd_job,
+ V3DV_QUEUE_CPU, V3DV_QUEUE_CSD, V3D_CPU, signal_syncs);
+ if (!ms.base.id)
return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
- int ret;
- if (!fence->temp_sync)
- ret = drmSyncobjImportSyncFile(render_fd, fence->sync, fd);
- else
- ret = drmSyncobjImportSyncFile(render_fd, fence->temp_sync, fd);
+ submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
+ submit.extensions = (uintptr_t)(void *)&ms;
+
+ int ret = v3dv_ioctl(device->pdevice->render_fd,
+ DRM_IOCTL_V3D_SUBMIT_CPU, &submit);
+
+ free(bo_handles);
+ multisync_free(device, &ms);
- assert(fd >= 0);
- close(fd);
+ queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;
+ queue->last_job_syncs.first[V3DV_QUEUE_CSD] = false;
- return ret ? VK_ERROR_OUT_OF_HOST_MEMORY : VK_SUCCESS;
+ if (ret)
+ return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");
+
+ return VK_SUCCESS;
}
static VkResult
handle_cl_job(struct v3dv_queue *queue,
struct v3dv_job *job,
- bool do_sem_wait)
+ uint32_t counter_pass_idx,
+ struct v3dv_submit_sync_info *sync_info,
+ bool signal_syncs)
{
struct v3dv_device *device = queue->device;
@@ -599,7 +919,8 @@ handle_cl_job(struct v3dv_queue *queue,
struct v3dv_bo *bcl_fist_bo =
list_first_entry(&job->bcl.bo_list, struct v3dv_bo, list_link);
submit.bcl_start = bcl_fist_bo->offset;
- submit.bcl_end = job->bcl.bo->offset + v3dv_cl_offset(&job->bcl);
+ submit.bcl_end = job->suspending ? job->suspended_bcl_end :
+ job->bcl.bo->offset + v3dv_cl_offset(&job->bcl);
submit.rcl_start = job->rcl.bo->offset;
submit.rcl_end = job->rcl.bo->offset + v3dv_cl_offset(&job->rcl);
@@ -611,6 +932,17 @@ handle_cl_job(struct v3dv_queue *queue,
if (job->tmu_dirty_rcl)
submit.flags |= DRM_V3D_SUBMIT_CL_FLUSH_CACHE;
+ /* If the job uses VK_KHR_buffer_device_address we need to ensure all
+ * buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT
+ * are included.
+ */
+ if (job->uses_buffer_device_address) {
+ util_dynarray_foreach(&queue->device->device_address_bo_list,
+ struct v3dv_bo *, bo) {
+ v3dv_job_add_bo(job, *bo);
+ }
+ }
+
submit.bo_handle_count = job->bo_count;
uint32_t *bo_handles =
(uint32_t *) malloc(sizeof(uint32_t) * submit.bo_handle_count);
@@ -622,34 +954,64 @@ handle_cl_job(struct v3dv_queue *queue,
assert(bo_idx == submit.bo_handle_count);
submit.bo_handles = (uintptr_t)(void *)bo_handles;
- /* We need a binning sync if we are waiting on a sempahore (do_sem_wait) or
- * if the job comes after a pipeline barrier than involves geometry stages
- * (needs_bcl_sync).
+ submit.perfmon_id = job->perf ?
+ job->perf->kperfmon_ids[counter_pass_idx] : 0;
+ const bool needs_perf_sync = queue->last_perfmon_id != submit.perfmon_id;
+ queue->last_perfmon_id = submit.perfmon_id;
+
+ /* We need a binning sync if we are the first CL job waiting on a semaphore
+ * with a wait stage that involves the geometry pipeline, or if the job
+ * comes after a pipeline barrier that involves geometry stages
+ * (needs_bcl_sync) or when performance queries are in use.
*
* We need a render sync if the job doesn't need a binning sync but has
* still been flagged for serialization. It should be noted that RCL jobs
* don't start until the previous RCL job has finished so we don't really
* need to add a fence for those, however, we might need to wait on a CSD or
* TFU job, which are not automatically serialized with CL jobs.
- *
- * FIXME: for now, if we are asked to wait on any semaphores, we just wait
- * on the last job we submitted. In the future we might want to pass the
- * actual syncobj of the wait semaphores so we don't block on the last RCL
- * if we only need to wait for a previous CSD or TFU, for example, but
- * we would have to extend our kernel interface to support the case where
- * we have more than one semaphore to wait on.
*/
- const bool needs_bcl_sync = do_sem_wait || job->needs_bcl_sync;
- const bool needs_rcl_sync = job->serialize && !needs_bcl_sync;
+ bool needs_bcl_sync = job->needs_bcl_sync || needs_perf_sync;
+ if (queue->last_job_syncs.first[V3DV_QUEUE_CL]) {
+ for (int i = 0; !needs_bcl_sync && i < sync_info->wait_count; i++) {
+ needs_bcl_sync = sync_info->waits[i].stage_mask &
+ (VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT |
+ VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT |
+ VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
+ VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
+ VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT |
+ VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
+ VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT |
+ VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
+ VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
+ VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT |
+ VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
+ VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT);
+ }
+ }
+
+ bool needs_rcl_sync = job->serialize && !needs_bcl_sync;
+
+   /* Replace the legacy single-semaphore settings whenever our kernel driver
+    * supports the multi-sync extension.
+ */
+ struct drm_v3d_multi_sync ms = { 0 };
+ enum v3d_queue wait_stage = needs_rcl_sync ? V3D_RENDER : V3D_BIN;
+ set_multisync(&ms, sync_info, NULL, 0, NULL, device, job,
+ V3DV_QUEUE_CL, V3DV_QUEUE_CL, wait_stage, signal_syncs);
+ if (!ms.base.id)
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
+ submit.extensions = (uintptr_t)(void *)&ms;
+
+ /* We are using multisync so disable legacy single-sync interface */
+ submit.in_sync_rcl = 0;
+ submit.in_sync_bcl = 0;
+ submit.out_sync = 0;
- mtx_lock(&queue->device->mutex);
- submit.in_sync_bcl = needs_bcl_sync ? device->last_job_sync : 0;
- submit.in_sync_rcl = needs_rcl_sync ? device->last_job_sync : 0;
- submit.out_sync = device->last_job_sync;
v3dv_clif_dump(device, job, &submit);
int ret = v3dv_ioctl(device->pdevice->render_fd,
DRM_IOCTL_V3D_SUBMIT_CL, &submit);
- mtx_unlock(&queue->device->mutex);
static bool warned = false;
if (ret && !warned) {
@@ -659,9 +1021,12 @@ handle_cl_job(struct v3dv_queue *queue,
}
free(bo_handles);
+ multisync_free(device, &ms);
+
+ queue->last_job_syncs.first[V3DV_QUEUE_CL] = false;
if (ret)
- return vk_error(device->instance, VK_ERROR_DEVICE_LOST);
+ return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CL failed: %m");
return VK_SUCCESS;
}
@@ -669,23 +1034,37 @@ handle_cl_job(struct v3dv_queue *queue,
static VkResult
handle_tfu_job(struct v3dv_queue *queue,
struct v3dv_job *job,
- bool do_sem_wait)
+ struct v3dv_submit_sync_info *sync_info,
+ bool signal_syncs)
{
+ assert(!V3D_DBG(DISABLE_TFU));
+
struct v3dv_device *device = queue->device;
- const bool needs_sync = do_sem_wait || job->serialize;
+   /* Replace the legacy single-semaphore settings whenever our kernel driver
+    * supports the multi-sync extension.
+ */
+ struct drm_v3d_multi_sync ms = { 0 };
+ set_multisync(&ms, sync_info, NULL, 0, NULL, device, job,
+ V3DV_QUEUE_TFU, V3DV_QUEUE_TFU, V3D_TFU, signal_syncs);
+ if (!ms.base.id)
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ job->tfu.flags |= DRM_V3D_SUBMIT_EXTENSION;
+ job->tfu.extensions = (uintptr_t)(void *)&ms;
+
+ /* We are using multisync so disable legacy single-sync interface */
+ job->tfu.in_sync = 0;
+ job->tfu.out_sync = 0;
- mtx_lock(&device->mutex);
- job->tfu.in_sync = needs_sync ? device->last_job_sync : 0;
- job->tfu.out_sync = device->last_job_sync;
int ret = v3dv_ioctl(device->pdevice->render_fd,
DRM_IOCTL_V3D_SUBMIT_TFU, &job->tfu);
- mtx_unlock(&device->mutex);
- if (ret != 0) {
- fprintf(stderr, "Failed to submit TFU job: %d\n", ret);
- return vk_error(device->instance, VK_ERROR_DEVICE_LOST);
- }
+ multisync_free(device, &ms);
+ queue->last_job_syncs.first[V3DV_QUEUE_TFU] = false;
+
+ if (ret != 0)
+ return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_TFU failed: %m");
return VK_SUCCESS;
}
@@ -693,12 +1072,25 @@ handle_tfu_job(struct v3dv_queue *queue,
static VkResult
handle_csd_job(struct v3dv_queue *queue,
struct v3dv_job *job,
- bool do_sem_wait)
+ uint32_t counter_pass_idx,
+ struct v3dv_submit_sync_info *sync_info,
+ bool signal_syncs)
{
struct v3dv_device *device = queue->device;
struct drm_v3d_submit_csd *submit = &job->csd.submit;
+ /* If the job uses VK_KHR_buffer_device_address we need to ensure all
+ * buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT
+ * are included.
+ */
+ if (job->uses_buffer_device_address) {
+ util_dynarray_foreach(&queue->device->device_address_bo_list,
+ struct v3dv_bo *, bo) {
+ v3dv_job_add_bo(job, *bo);
+ }
+ }
+
submit->bo_handle_count = job->bo_count;
uint32_t *bo_handles =
(uint32_t *) malloc(sizeof(uint32_t) * MAX2(4, submit->bo_handle_count * 2));
@@ -710,14 +1102,28 @@ handle_csd_job(struct v3dv_queue *queue,
assert(bo_idx == submit->bo_handle_count);
submit->bo_handles = (uintptr_t)(void *)bo_handles;
- const bool needs_sync = do_sem_wait || job->serialize;
+   /* Replace the legacy single-semaphore settings whenever our kernel driver
+    * supports the multi-sync extension.
+ */
+ struct drm_v3d_multi_sync ms = { 0 };
+ set_multisync(&ms, sync_info, NULL, 0, NULL, device, job,
+ V3DV_QUEUE_CSD, V3DV_QUEUE_CSD, V3D_CSD, signal_syncs);
+ if (!ms.base.id)
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ submit->flags |= DRM_V3D_SUBMIT_EXTENSION;
+ submit->extensions = (uintptr_t)(void *)&ms;
+
+ /* We are using multisync so disable legacy single-sync interface */
+ submit->in_sync = 0;
+ submit->out_sync = 0;
+
+ submit->perfmon_id = job->perf ?
+ job->perf->kperfmon_ids[counter_pass_idx] : 0;
+ queue->last_perfmon_id = submit->perfmon_id;
- mtx_lock(&queue->device->mutex);
- submit->in_sync = needs_sync ? device->last_job_sync : 0;
- submit->out_sync = device->last_job_sync;
int ret = v3dv_ioctl(device->pdevice->render_fd,
DRM_IOCTL_V3D_SUBMIT_CSD, submit);
- mtx_unlock(&queue->device->mutex);
static bool warned = false;
if (ret && !warned) {
@@ -728,43 +1134,39 @@ handle_csd_job(struct v3dv_queue *queue,
free(bo_handles);
+ multisync_free(device, &ms);
+ queue->last_job_syncs.first[V3DV_QUEUE_CSD] = false;
+
if (ret)
- return vk_error(device->instance, VK_ERROR_DEVICE_LOST);
+ return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CSD failed: %m");
return VK_SUCCESS;
}
static VkResult
-queue_submit_job(struct v3dv_queue *queue,
+queue_handle_job(struct v3dv_queue *queue,
struct v3dv_job *job,
- bool do_sem_wait,
- pthread_t *wait_thread)
+ uint32_t counter_pass_idx,
+ struct v3dv_submit_sync_info *sync_info,
+ bool signal_syncs)
{
- assert(job);
-
switch (job->type) {
case V3DV_JOB_TYPE_GPU_CL:
- return handle_cl_job(queue, job, do_sem_wait);
+ return handle_cl_job(queue, job, counter_pass_idx, sync_info, signal_syncs);
case V3DV_JOB_TYPE_GPU_TFU:
- return handle_tfu_job(queue, job, do_sem_wait);
+ return handle_tfu_job(queue, job, sync_info, signal_syncs);
case V3DV_JOB_TYPE_GPU_CSD:
- return handle_csd_job(queue, job, do_sem_wait);
+ return handle_csd_job(queue, job, counter_pass_idx, sync_info, signal_syncs);
case V3DV_JOB_TYPE_CPU_RESET_QUERIES:
- return handle_reset_query_cpu_job(job);
+ return handle_reset_query_cpu_job(queue, job, sync_info, signal_syncs);
case V3DV_JOB_TYPE_CPU_END_QUERY:
- return handle_end_query_cpu_job(job);
+ return handle_end_query_cpu_job(job, counter_pass_idx);
case V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS:
- return handle_copy_query_results_cpu_job(job);
- case V3DV_JOB_TYPE_CPU_SET_EVENT:
- return handle_set_event_cpu_job(job, wait_thread != NULL);
- case V3DV_JOB_TYPE_CPU_WAIT_EVENTS:
- return handle_wait_events_cpu_job(job, do_sem_wait, wait_thread);
- case V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE:
- return handle_copy_buffer_to_image_cpu_job(job);
+ return handle_copy_query_results_cpu_job(queue, job, sync_info, signal_syncs);
case V3DV_JOB_TYPE_CPU_CSD_INDIRECT:
- return handle_csd_indirect_cpu_job(queue, job, do_sem_wait);
+ return handle_csd_indirect_cpu_job(queue, job, sync_info, signal_syncs);
case V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY:
- return handle_timestamp_query_cpu_job(job);
+ return handle_timestamp_query_cpu_job(queue, job, sync_info, signal_syncs);
default:
unreachable("Unhandled job type");
}
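
To place the dispatcher in context, a rough sketch of how it might be driven from
v3dv_queue_driver_submit() further below; the actual iteration lies outside this
hunk, so the loop shape and the use of vk_queue_submit::command_buffers and
::perf_pass_index here are assumptions, not a quote of the patch:

   for (uint32_t i = 0; i < submit->command_buffer_count; i++) {
      struct v3dv_cmd_buffer *cmd_buffer =
         container_of(submit->command_buffers[i], struct v3dv_cmd_buffer, vk);
      list_for_each_entry_safe(struct v3dv_job, job,
                               &cmd_buffer->jobs, list_link) {
         VkResult result = queue_handle_job(queue, job, submit->perf_pass_index,
                                            &sync_info, false);
         if (result != VK_SUCCESS)
            return result;
      }
   }
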
@@ -777,772 +1179,128 @@ queue_create_noop_job(struct v3dv_queue *queue)
queue->noop_job = vk_zalloc(&device->vk.alloc, sizeof(struct v3dv_job), 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!queue->noop_job)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
v3dv_job_init(queue->noop_job, V3DV_JOB_TYPE_GPU_CL, device, NULL, -1);
v3dv_X(device, job_emit_noop)(queue->noop_job);
+   /* We use no-op jobs to signal semaphores/fences. These jobs need to be
+ * serialized across all hw queues to comply with Vulkan's signal operation
+ * order requirements, which basically require that signal operations occur
+ * in submission order.
+ */
+ queue->noop_job->serialize = V3DV_BARRIER_ALL;
+
return VK_SUCCESS;
}
static VkResult
-queue_submit_noop_job(struct v3dv_queue *queue, const VkSubmitInfo *pSubmit)
+queue_submit_noop_job(struct v3dv_queue *queue,
+ uint32_t counter_pass_idx,
+ struct v3dv_submit_sync_info *sync_info,
+ bool signal_syncs)
{
- /* VkQueue host access is externally synchronized so we don't need to lock
- * here for the static variable.
- */
if (!queue->noop_job) {
VkResult result = queue_create_noop_job(queue);
if (result != VK_SUCCESS)
return result;
}
- return queue_submit_job(queue, queue->noop_job,
- pSubmit->waitSemaphoreCount > 0, NULL);
-}
-
-static VkResult
-queue_submit_cmd_buffer(struct v3dv_queue *queue,
- struct v3dv_cmd_buffer *cmd_buffer,
- const VkSubmitInfo *pSubmit,
- pthread_t *wait_thread)
-{
- assert(cmd_buffer);
- assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_EXECUTABLE);
-
- if (list_is_empty(&cmd_buffer->jobs))
- return queue_submit_noop_job(queue, pSubmit);
-
- list_for_each_entry_safe(struct v3dv_job, job,
- &cmd_buffer->jobs, list_link) {
- VkResult result = queue_submit_job(queue, job,
- pSubmit->waitSemaphoreCount > 0,
- wait_thread);
- if (result != VK_SUCCESS)
- return result;
- }
-
- return VK_SUCCESS;
-}
-
-static void
-add_wait_thread_to_list(struct v3dv_device *device,
- pthread_t thread,
- struct v3dv_queue_submit_wait_info **wait_info)
-{
- /* If this is the first time we spawn a wait thread for this queue
- * submission create a v3dv_queue_submit_wait_info to track this and
- * any other threads in the same submission and add it to the global list
- * in the queue.
- */
- if (*wait_info == NULL) {
- *wait_info =
- vk_zalloc(&device->vk.alloc, sizeof(struct v3dv_queue_submit_wait_info), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
- (*wait_info)->device = device;
- }
-
- /* And add the thread to the list of wait threads for this submission */
- const uint32_t thread_idx = (*wait_info)->wait_thread_count;
- assert(thread_idx < 16);
- (*wait_info)->wait_threads[thread_idx].thread = thread;
- (*wait_info)->wait_threads[thread_idx].finished = false;
- (*wait_info)->wait_thread_count++;
-}
-
-static void
-add_signal_semaphores_to_wait_list(struct v3dv_device *device,
- const VkSubmitInfo *pSubmit,
- struct v3dv_queue_submit_wait_info *wait_info)
-{
- assert(wait_info);
-
- if (pSubmit->signalSemaphoreCount == 0)
- return;
-
- /* FIXME: We put all the semaphores in a list and we signal all of them
- * together from the submit master thread when the last wait thread in the
- * submit completes. We could do better though: group the semaphores per
- * submit and signal them as soon as all wait threads for a particular
- * submit completes. Not sure if the extra work would be worth it though,
- * since we only spawn waith threads for event waits and only when the
- * event if set from the host after the queue submission.
- */
-
- /* Check the size of the current semaphore list */
- const uint32_t prev_count = wait_info->signal_semaphore_count;
- const uint32_t prev_alloc_size = prev_count * sizeof(VkSemaphore);
- VkSemaphore *prev_list = wait_info->signal_semaphores;
-
- /* Resize the list to hold the additional semaphores */
- const uint32_t extra_alloc_size =
- pSubmit->signalSemaphoreCount * sizeof(VkSemaphore);
- wait_info->signal_semaphore_count += pSubmit->signalSemaphoreCount;
- wait_info->signal_semaphores =
- vk_alloc(&device->vk.alloc, prev_alloc_size + extra_alloc_size, 8,
- VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
-
- /* Copy the old list to the new allocation and free the old list */
- if (prev_count > 0) {
- memcpy(wait_info->signal_semaphores, prev_list, prev_alloc_size);
- vk_free(&device->vk.alloc, prev_list);
- }
-
- /* Add the new semaphores to the list */
- memcpy(wait_info->signal_semaphores + prev_count,
- pSubmit->pSignalSemaphores, extra_alloc_size);
-}
-
-static VkResult
-queue_submit_cmd_buffer_batch(struct v3dv_queue *queue,
- const VkSubmitInfo *pSubmit,
- struct v3dv_queue_submit_wait_info **wait_info)
-{
- VkResult result = VK_SUCCESS;
- bool has_wait_threads = false;
-
- /* Even if we don't have any actual work to submit we still need to wait
- * on the wait semaphores and signal the signal semaphores and fence, so
- * in this scenario we just submit a trivial no-op job so we don't have
- * to do anything special, it should not be a common case anyway.
- */
- if (pSubmit->commandBufferCount == 0) {
- result = queue_submit_noop_job(queue, pSubmit);
- } else {
- for (uint32_t i = 0; i < pSubmit->commandBufferCount; i++) {
- pthread_t wait_thread;
- struct v3dv_cmd_buffer *cmd_buffer =
- v3dv_cmd_buffer_from_handle(pSubmit->pCommandBuffers[i]);
- result = queue_submit_cmd_buffer(queue, cmd_buffer, pSubmit,
- &wait_thread);
-
- /* We get VK_NOT_READY if we had to spawn a wait thread for the
- * command buffer. In that scenario, we want to continue submitting
- * any pending command buffers in the batch, but we don't want to
- * process any signal semaphores for the batch until we know we have
- * submitted every job for every command buffer in the batch.
- */
- if (result == VK_NOT_READY) {
- result = VK_SUCCESS;
- add_wait_thread_to_list(queue->device, wait_thread, wait_info);
- has_wait_threads = true;
- }
-
- if (result != VK_SUCCESS)
- break;
- }
- }
-
- if (result != VK_SUCCESS)
- return result;
-
- /* If had to emit any wait threads in this submit we need to wait for all
- * of them to complete before we can signal any semaphores.
- */
- if (!has_wait_threads) {
- return process_semaphores_to_signal(queue->device,
- pSubmit->signalSemaphoreCount,
- pSubmit->pSignalSemaphores);
- } else {
- assert(*wait_info);
- add_signal_semaphores_to_wait_list(queue->device, pSubmit, *wait_info);
- return VK_NOT_READY;
- }
+ assert(queue->noop_job);
+ return queue_handle_job(queue, queue->noop_job, counter_pass_idx,
+ sync_info, signal_syncs);
}
-static void *
-master_wait_thread_func(void *_wait_info)
+VkResult
+v3dv_queue_driver_submit(struct vk_queue *vk_queue,
+ struct vk_queue_submit *submit)
{
- struct v3dv_queue_submit_wait_info *wait_info =
- (struct v3dv_queue_submit_wait_info *) _wait_info;
-
- struct v3dv_queue *queue = &wait_info->device->queue;
-
- /* Wait for all command buffer wait threads to complete */
- for (uint32_t i = 0; i < wait_info->wait_thread_count; i++) {
- int res = pthread_join(wait_info->wait_threads[i].thread, NULL);
- if (res != 0)
- fprintf(stderr, "Wait thread failed to join.\n");
- }
-
- /* Signal semaphores and fences */
+ struct v3dv_queue *queue = container_of(vk_queue, struct v3dv_queue, vk);
VkResult result;
- result = process_semaphores_to_signal(wait_info->device,
- wait_info->signal_semaphore_count,
- wait_info->signal_semaphores);
- if (result != VK_SUCCESS)
- fprintf(stderr, "Wait thread semaphore signaling failed.");
-
- result = process_fence_to_signal(wait_info->device, wait_info->fence);
- if (result != VK_SUCCESS)
- fprintf(stderr, "Wait thread fence signaling failed.");
-
- /* Release wait_info */
- mtx_lock(&queue->mutex);
- list_del(&wait_info->list_link);
- mtx_unlock(&queue->mutex);
-
- vk_free(&wait_info->device->vk.alloc, wait_info->signal_semaphores);
- vk_free(&wait_info->device->vk.alloc, wait_info);
-
- return NULL;
-}
-
-
-static VkResult
-spawn_master_wait_thread(struct v3dv_queue *queue,
- struct v3dv_queue_submit_wait_info *wait_info)
-
-{
- VkResult result = VK_SUCCESS;
-
- mtx_lock(&queue->mutex);
- if (pthread_create(&wait_info->master_wait_thread, NULL,
- master_wait_thread_func, wait_info)) {
- result = vk_error(queue->device->instance, VK_ERROR_DEVICE_LOST);
- goto done;
- }
-
- list_addtail(&wait_info->list_link, &queue->submit_wait_list);
-
-done:
- mtx_unlock(&queue->mutex);
- return result;
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_QueueSubmit(VkQueue _queue,
- uint32_t submitCount,
- const VkSubmitInfo* pSubmits,
- VkFence fence)
-{
- V3DV_FROM_HANDLE(v3dv_queue, queue, _queue);
-
- struct v3dv_queue_submit_wait_info *wait_info = NULL;
-
- VkResult result = VK_SUCCESS;
- for (uint32_t i = 0; i < submitCount; i++) {
- result = queue_submit_cmd_buffer_batch(queue, &pSubmits[i], &wait_info);
- if (result != VK_SUCCESS && result != VK_NOT_READY)
- goto done;
- }
-
- if (!wait_info) {
- assert(result != VK_NOT_READY);
- result = process_fence_to_signal(queue->device, fence);
- goto done;
- }
-
- /* We emitted wait threads, so we have to spwan a master thread for this
- * queue submission that waits for all other threads to complete and then
- * will signal any semaphores and fences.
- */
- assert(wait_info);
- wait_info->fence = fence;
- result = spawn_master_wait_thread(queue, wait_info);
-
-done:
- return result;
-}
-
-static void
-destroy_syncobj(uint32_t device_fd, uint32_t *sync)
-{
- assert(sync);
- drmSyncobjDestroy(device_fd, *sync);
- *sync = 0;
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_CreateSemaphore(VkDevice _device,
- const VkSemaphoreCreateInfo *pCreateInfo,
- const VkAllocationCallbacks *pAllocator,
- VkSemaphore *pSemaphore)
-{
- V3DV_FROM_HANDLE(v3dv_device, device, _device);
-
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO);
-
- struct v3dv_semaphore *sem =
- vk_object_zalloc(&device->vk, pAllocator, sizeof(struct v3dv_semaphore),
- VK_OBJECT_TYPE_SEMAPHORE);
- if (sem == NULL)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
-
- int ret = drmSyncobjCreate(device->pdevice->render_fd, 0, &sem->sync);
- if (ret) {
- vk_object_free(&device->vk, pAllocator, sem);
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
- }
-
- *pSemaphore = v3dv_semaphore_to_handle(sem);
- return VK_SUCCESS;
-}
-
-VKAPI_ATTR void VKAPI_CALL
-v3dv_GetPhysicalDeviceExternalSemaphoreProperties(
- VkPhysicalDevice physicalDevice,
- const VkPhysicalDeviceExternalSemaphoreInfo *pExternalSemaphoreInfo,
- VkExternalSemaphoreProperties *pExternalSemaphoreProperties)
-{
- switch (pExternalSemaphoreInfo->handleType) {
- case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT:
- case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT:
- pExternalSemaphoreProperties->exportFromImportedHandleTypes =
- VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT |
- VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT;
- pExternalSemaphoreProperties->compatibleHandleTypes =
- VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT |
- VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT;
-
- /* FIXME: we can't import external semaphores until we improve the kernel
- * submit interface to handle multiple in syncobjs, because once we have
- * an imported semaphore in our list of semaphores to wait on, we can no
- * longer use the workaround of waiting on the last syncobj fence produced
- * from the device, since the imported semaphore may not (and in fact, it
- * would typically not) have been produced from same device.
- *
- * This behavior is exercised via dEQP-VK.synchronization.cross_instance.*.
- * Particularly, this test:
- * dEQP-VK.synchronization.cross_instance.dedicated.
- * write_ssbo_compute_read_vertex_input.buffer_16384_binary_semaphore_fd
- * fails consistently because of this, so it'll be a good reference to
- * verify the implementation when the kernel bits are in place.
- */
- pExternalSemaphoreProperties->externalSemaphoreFeatures = 0;
-
- /* FIXME: See comment in GetPhysicalDeviceExternalFenceProperties
- * for details on why we can't export to SYNC_FD.
- */
- if (pExternalSemaphoreInfo->handleType !=
- VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT) {
- pExternalSemaphoreProperties->externalSemaphoreFeatures |=
- VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT;
- }
- break;
- default:
- pExternalSemaphoreProperties->exportFromImportedHandleTypes = 0;
- pExternalSemaphoreProperties->compatibleHandleTypes = 0;
- pExternalSemaphoreProperties->externalSemaphoreFeatures = 0;
- break;
- }
-}
+ struct v3dv_submit_sync_info sync_info = {
+ .wait_count = submit->wait_count,
+ .waits = submit->waits,
+ .signal_count = submit->signal_count,
+ .signals = submit->signals,
+ };
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_ImportSemaphoreFdKHR(
- VkDevice _device,
- const VkImportSemaphoreFdInfoKHR *pImportSemaphoreFdInfo)
-{
- V3DV_FROM_HANDLE(v3dv_device, device, _device);
- V3DV_FROM_HANDLE(v3dv_semaphore, sem, pImportSemaphoreFdInfo->semaphore);
-
- assert(pImportSemaphoreFdInfo->sType ==
- VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_FD_INFO_KHR);
-
- int fd = pImportSemaphoreFdInfo->fd;
- int render_fd = device->pdevice->render_fd;
-
- bool is_temporary =
- pImportSemaphoreFdInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT ||
- (pImportSemaphoreFdInfo->flags & VK_SEMAPHORE_IMPORT_TEMPORARY_BIT);
-
- uint32_t new_sync;
- switch (pImportSemaphoreFdInfo->handleType) {
- case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT: {
- /* "If handleType is VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT, the
- * special value -1 for fd is treated like a valid sync file descriptor
- * referring to an object that has already signaled. The import
- * operation will succeed and the VkSemaphore will have a temporarily
- * imported payload as if a valid file descriptor had been provided."
- */
- unsigned flags = fd == -1 ? DRM_SYNCOBJ_CREATE_SIGNALED : 0;
- if (drmSyncobjCreate(render_fd, flags, &new_sync))
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
-
- if (fd != -1) {
- if (drmSyncobjImportSyncFile(render_fd, new_sync, fd)) {
- drmSyncobjDestroy(render_fd, new_sync);
- return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE);
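+ /* Start each hw queue with its 'first job in this submit' flag set. */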
+ for (int i = 0; i < V3DV_QUEUE_COUNT; i++)
+ queue->last_job_syncs.first[i] = true;
+
+ struct v3dv_job *first_suspend_job = NULL;
+ struct v3dv_job *current_suspend_job = NULL;
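+
+ /* Walk every job in every command buffer. Suspending/resuming jobs are
+ * chained together: the first suspending job opens the chain, each
+ * resuming job has its resume address patched, and the whole chain is
+ * submitted as a single job (starting at the first suspending job) once a
+ * job that does not suspend closes it.
+ */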
+ for (uint32_t i = 0; i < submit->command_buffer_count; i++) {
+ struct v3dv_cmd_buffer *cmd_buffer =
+ container_of(submit->command_buffers[i], struct v3dv_cmd_buffer, vk);
+ list_for_each_entry_safe(struct v3dv_job, job,
+ &cmd_buffer->jobs, list_link) {
+ if (job->suspending) {
+ job = v3dv_X(job->device,
+ cmd_buffer_prepare_suspend_job_for_submit)(job);
+ if (!job)
+ return VK_ERROR_OUT_OF_DEVICE_MEMORY;
}
- }
- break;
- }
- case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT: {
- if (drmSyncobjFDToHandle(render_fd, fd, &new_sync))
- return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE);
- break;
- }
- default:
- return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE);
- }
- destroy_syncobj(render_fd, &sem->temp_sync);
- if (is_temporary) {
- sem->temp_sync = new_sync;
- } else {
- destroy_syncobj(render_fd, &sem->sync);
- sem->sync = new_sync;
- }
-
- /* From the Vulkan 1.0.53 spec:
- *
- * "Importing a semaphore payload from a file descriptor transfers
- * ownership of the file descriptor from the application to the
- * Vulkan implementation. The application must not perform any
- * operations on the file descriptor after a successful import."
- *
- * If the import fails, we leave the file descriptor open.
- */
- if (fd != -1)
- close(fd);
-
- return VK_SUCCESS;
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_GetSemaphoreFdKHR(VkDevice _device,
- const VkSemaphoreGetFdInfoKHR *pGetFdInfo,
- int *pFd)
-{
- V3DV_FROM_HANDLE(v3dv_device, device, _device);
- V3DV_FROM_HANDLE(v3dv_semaphore, sem, pGetFdInfo->semaphore);
-
- assert(pGetFdInfo->sType == VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR);
-
- *pFd = -1;
- int render_fd = device->pdevice->render_fd;
- switch (pGetFdInfo->handleType) {
- case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT: {
- drmSyncobjExportSyncFile(render_fd, sem->sync, pFd);
- if (*pFd == -1)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
- break;
- case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT:
- drmSyncobjHandleToFD(render_fd, sem->sync, pFd);
- if (*pFd == -1)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
- break;
- }
- default:
- unreachable("Unsupported external semaphore handle type");
- }
-
- return VK_SUCCESS;
-}
-
-VKAPI_ATTR void VKAPI_CALL
-v3dv_DestroySemaphore(VkDevice _device,
- VkSemaphore semaphore,
- const VkAllocationCallbacks *pAllocator)
-{
- V3DV_FROM_HANDLE(v3dv_device, device, _device);
- V3DV_FROM_HANDLE(v3dv_semaphore, sem, semaphore);
-
- if (sem == NULL)
- return;
-
- destroy_syncobj(device->pdevice->render_fd, &sem->sync);
- destroy_syncobj(device->pdevice->render_fd, &sem->temp_sync);
-
- vk_object_free(&device->vk, pAllocator, sem);
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_CreateFence(VkDevice _device,
- const VkFenceCreateInfo *pCreateInfo,
- const VkAllocationCallbacks *pAllocator,
- VkFence *pFence)
-{
- V3DV_FROM_HANDLE(v3dv_device, device, _device);
-
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_FENCE_CREATE_INFO);
-
- struct v3dv_fence *fence =
- vk_object_zalloc(&device->vk, pAllocator, sizeof(struct v3dv_fence),
- VK_OBJECT_TYPE_FENCE);
- if (fence == NULL)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
-
- unsigned flags = 0;
- if (pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT)
- flags |= DRM_SYNCOBJ_CREATE_SIGNALED;
- int ret = drmSyncobjCreate(device->pdevice->render_fd, flags, &fence->sync);
- if (ret) {
- vk_object_free(&device->vk, pAllocator, fence);
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
- }
-
- *pFence = v3dv_fence_to_handle(fence);
+ if (job->suspending && !job->resuming) {
+ assert(!first_suspend_job);
+ assert(!current_suspend_job);
+ first_suspend_job = job;
+ }
- return VK_SUCCESS;
-}
+ if (job->resuming) {
+ assert(first_suspend_job);
+ assert(current_suspend_job);
+ v3dv_X(job->device, job_patch_resume_address)(first_suspend_job,
+ current_suspend_job,
+ job);
+ current_suspend_job = NULL;
+ }
-VKAPI_ATTR void VKAPI_CALL
-v3dv_GetPhysicalDeviceExternalFenceProperties(
- VkPhysicalDevice physicalDevice,
- const VkPhysicalDeviceExternalFenceInfo *pExternalFenceInfo,
- VkExternalFenceProperties *pExternalFenceProperties)
+ if (job->suspending) {
+ current_suspend_job = job;
+ } else {
+ assert(!current_suspend_job);
+ struct v3dv_job *submit_job = first_suspend_job ?
+ first_suspend_job : job;
+ result =
+ queue_handle_job(queue, submit_job, submit->perf_pass_index,
+ &sync_info, false);
-{
- switch (pExternalFenceInfo->handleType) {
- case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT:
- case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT:
- pExternalFenceProperties->exportFromImportedHandleTypes =
- VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT |
- VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT;
- pExternalFenceProperties->compatibleHandleTypes =
- VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT |
- VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT;
- pExternalFenceProperties->externalFenceFeatures =
- VK_EXTERNAL_FENCE_FEATURE_IMPORTABLE_BIT;
-
- /* FIXME: SYNC_FD exports the actual fence referenced by the syncobj, not
- * the syncobj itself, and that fence is only created after we have
- * submitted to the kernel and updated the syncobj for the fence to import
- * the actual DRM fence created with the submission. Unfortunately, if the
- * queue submission has a 'wait for events' we may hold any jobs after the
- * wait in a user-space thread until the events are signaled, and in that
- * case we don't update the out fence of the submit until the events are
- * signaled and we can submit all the jobs involved with the vkQueueSubmit
- * call. This means that if the applications submits with an out fence and
- * a wait for events, trying to export the out fence to a SYNC_FD rigth
- * after the submission and before the events are signaled will fail,
- * because the actual DRM fence won't exist yet. This is not a problem
- * with OPAQUE_FD because in this case we export the entire syncobj, not
- * the underlying DRM fence. To fix this we need to rework our kernel
- * interface to be more flexible and accept multiple in/out syncobjs so
- * we can implement event waits as regular fence waits on the kernel side,
- * until then, we can only reliably export OPAQUE_FD.
- */
- if (pExternalFenceInfo->handleType !=
- VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT) {
- pExternalFenceProperties->externalFenceFeatures |=
- VK_EXTERNAL_FENCE_FEATURE_EXPORTABLE_BIT;
- }
- break;
- default:
- pExternalFenceProperties->exportFromImportedHandleTypes = 0;
- pExternalFenceProperties->compatibleHandleTypes = 0;
- pExternalFenceProperties->externalFenceFeatures = 0;
- break;
- }
-}
+ if (result != VK_SUCCESS)
+ return result;
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_ImportFenceFdKHR(VkDevice _device,
- const VkImportFenceFdInfoKHR *pImportFenceFdInfo)
-{
- V3DV_FROM_HANDLE(v3dv_device, device, _device);
- V3DV_FROM_HANDLE(v3dv_fence, fence, pImportFenceFdInfo->fence);
-
- assert(pImportFenceFdInfo->sType ==
- VK_STRUCTURE_TYPE_IMPORT_FENCE_FD_INFO_KHR);
-
- int fd = pImportFenceFdInfo->fd;
- int render_fd = device->pdevice->render_fd;
-
- bool is_temporary =
- pImportFenceFdInfo->handleType == VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT ||
- (pImportFenceFdInfo->flags & VK_FENCE_IMPORT_TEMPORARY_BIT);
-
- uint32_t new_sync;
- switch (pImportFenceFdInfo->handleType) {
- case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT: {
- /* "If handleType is VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT, the
- * special value -1 for fd is treated like a valid sync file descriptor
- * referring to an object that has already signaled. The import
- * operation will succeed and the VkFence will have a temporarily
- * imported payload as if a valid file descriptor had been provided."
- */
- unsigned flags = fd == -1 ? DRM_SYNCOBJ_CREATE_SIGNALED : 0;
- if (drmSyncobjCreate(render_fd, flags, &new_sync))
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
-
- if (fd != -1) {
- if (drmSyncobjImportSyncFile(render_fd, new_sync, fd)) {
- drmSyncobjDestroy(render_fd, new_sync);
- return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE);
+ first_suspend_job = NULL;
}
}
- break;
- }
- case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT: {
- if (drmSyncobjFDToHandle(render_fd, fd, &new_sync))
- return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE);
- break;
- }
- default:
- return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE);
- }
-
- destroy_syncobj(render_fd, &fence->temp_sync);
- if (is_temporary) {
- fence->temp_sync = new_sync;
- } else {
- destroy_syncobj(render_fd, &fence->sync);
- fence->sync = new_sync;
- }
-
- /* From the Vulkan 1.0.53 spec:
- *
- * "Importing a fence payload from a file descriptor transfers
- * ownership of the file descriptor from the application to the
- * Vulkan implementation. The application must not perform any
- * operations on the file descriptor after a successful import."
- *
- * If the import fails, we leave the file descriptor open.
- */
- if (fd != -1)
- close(fd);
-
- return VK_SUCCESS;
-}
-
-VKAPI_ATTR void VKAPI_CALL
-v3dv_DestroyFence(VkDevice _device,
- VkFence _fence,
- const VkAllocationCallbacks *pAllocator)
-{
- V3DV_FROM_HANDLE(v3dv_device, device, _device);
- V3DV_FROM_HANDLE(v3dv_fence, fence, _fence);
-
- if (fence == NULL)
- return;
-
- destroy_syncobj(device->pdevice->render_fd, &fence->sync);
- destroy_syncobj(device->pdevice->render_fd, &fence->temp_sync);
- vk_object_free(&device->vk, pAllocator, fence);
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_GetFenceStatus(VkDevice _device, VkFence _fence)
-{
- V3DV_FROM_HANDLE(v3dv_device, device, _device);
- V3DV_FROM_HANDLE(v3dv_fence, fence, _fence);
-
- int ret = drmSyncobjWait(device->pdevice->render_fd, &fence->sync, 1,
- 0, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT, NULL);
- if (ret == -ETIME)
- return VK_NOT_READY;
- else if (ret)
- return vk_error(device->instance, VK_ERROR_DEVICE_LOST);
- return VK_SUCCESS;
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_GetFenceFdKHR(VkDevice _device,
- const VkFenceGetFdInfoKHR *pGetFdInfo,
- int *pFd)
-{
- V3DV_FROM_HANDLE(v3dv_device, device, _device);
- V3DV_FROM_HANDLE(v3dv_fence, fence, pGetFdInfo->fence);
-
- assert(pGetFdInfo->sType == VK_STRUCTURE_TYPE_FENCE_GET_FD_INFO_KHR);
-
- *pFd = -1;
- int render_fd = device->pdevice->render_fd;
- switch (pGetFdInfo->handleType) {
- case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT: {
- drmSyncobjExportSyncFile(render_fd, fence->sync, pFd);
- if (*pFd == -1)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
- break;
- case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT:
- drmSyncobjHandleToFD(render_fd, fence->sync, pFd);
- if (*pFd == -1)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
- break;
- }
- default:
- unreachable("Unsupported external fence handle type");
- }
-
- return VK_SUCCESS;
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_ResetFences(VkDevice _device, uint32_t fenceCount, const VkFence *pFences)
-{
- V3DV_FROM_HANDLE(v3dv_device, device, _device);
-
- uint32_t *syncobjs = vk_alloc(&device->vk.alloc,
- sizeof(*syncobjs) * fenceCount, 8,
- VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
- if (!syncobjs)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
-
- int render_fd = device->pdevice->render_fd;
- uint32_t reset_count = 0;
- for (uint32_t i = 0; i < fenceCount; i++) {
- struct v3dv_fence *fence = v3dv_fence_from_handle(pFences[i]);
- /* From the Vulkan spec, section 'Importing Fence Payloads':
+ /* If the command buffer ends with a barrier, we need to consume it now.
*
- * "If the import is temporary, the fence will be restored to its
- * permanent state the next time that fence is passed to
- * vkResetFences.
- *
- * Note: Restoring a fence to its prior permanent payload is a
- * distinct operation from resetting a fence payload."
- *
- * To restore the previous state, we just need to destroy the temporary.
+ * FIXME: this will drain all hw queues. Instead, we could use the pending
+ * barrier state to limit the queues we serialize against.
*/
- if (fence->temp_sync)
- destroy_syncobj(render_fd, &fence->temp_sync);
- else
- syncobjs[reset_count++] = fence->sync;
+ if (cmd_buffer->state.barrier.dst_mask) {
+ result = queue_submit_noop_job(queue, submit->perf_pass_index,
+ &sync_info, false);
+ if (result != VK_SUCCESS)
+ return result;
+ }
}
- int ret = 0;
- if (reset_count > 0)
- ret = drmSyncobjReset(render_fd, syncobjs, reset_count);
+ assert(!first_suspend_job);
+ assert(!current_suspend_job);
- vk_free(&device->vk.alloc, syncobjs);
-
- if (ret)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
- return VK_SUCCESS;
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_WaitForFences(VkDevice _device,
- uint32_t fenceCount,
- const VkFence *pFences,
- VkBool32 waitAll,
- uint64_t timeout)
-{
- V3DV_FROM_HANDLE(v3dv_device, device, _device);
-
- const uint64_t abs_timeout = get_absolute_timeout(timeout);
-
- uint32_t *syncobjs = vk_alloc(&device->vk.alloc,
- sizeof(*syncobjs) * fenceCount, 8,
- VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
- if (!syncobjs)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
-
- for (uint32_t i = 0; i < fenceCount; i++) {
- struct v3dv_fence *fence = v3dv_fence_from_handle(pFences[i]);
- syncobjs[i] = fence->temp_sync ? fence->temp_sync : fence->sync;
+ /* Handle signaling now */
+ if (submit->signal_count > 0) {
+ /* Finish by submitting a no-op job that synchronizes across all queues.
+ * This will ensure that the signal semaphores don't get triggered until
+ * all work on any queue completes. See Vulkan's signal operation order
+ * requirements.
+ */
+ return queue_submit_noop_job(queue, submit->perf_pass_index,
+ &sync_info, true);
}
- unsigned flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT;
- if (waitAll)
- flags |= DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL;
-
- int ret;
- do {
- ret = drmSyncobjWait(device->pdevice->render_fd, syncobjs, fenceCount,
- timeout, flags, NULL);
- } while (ret == -ETIME && gettime_ns() < abs_timeout);
-
- vk_free(&device->vk.alloc, syncobjs);
-
- if (ret == -ETIME)
- return VK_TIMEOUT;
- else if (ret)
- return vk_error(device->instance, VK_ERROR_DEVICE_LOST);
return VK_SUCCESS;
}
@@ -1553,5 +1311,5 @@ v3dv_QueueBindSparse(VkQueue _queue,
VkFence fence)
{
V3DV_FROM_HANDLE(v3dv_queue, queue, _queue);
- return vk_error(queue->device->instance, VK_ERROR_FEATURE_NOT_PRESENT);
+ return vk_error(queue, VK_ERROR_FEATURE_NOT_PRESENT);
}
diff --git a/src/broadcom/vulkan/v3dv_uniforms.c b/src/broadcom/vulkan/v3dv_uniforms.c
index 47bc3a0b17c..eab8c0f0840 100644
--- a/src/broadcom/vulkan/v3dv_uniforms.c
+++ b/src/broadcom/vulkan/v3dv_uniforms.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2019 Raspberry Pi
+ * Copyright © 2019 Raspberry Pi Ltd
*
* Based in part on v3d driver which is:
*
@@ -26,16 +26,6 @@
*/
#include "v3dv_private.h"
-#include "vk_format_info.h"
-
-/* The only version specific structure that we need is
- * TMU_CONFIG_PARAMETER_1. This didn't seem to change significantly from
- * previous V3D versions and we don't expect that to change, so for now let's
- * just hardcode the V3D version here.
- */
-#define V3D_VERSION 41
-#include "broadcom/common/v3d_macros.h"
-#include "broadcom/cle/v3dx_pack.h"
/* Our Vulkan resource indices represent indices in descriptor maps which
* include all shader stages, so we need to size the arrays below
@@ -57,7 +47,8 @@ struct state_bo_list {
struct v3dv_bo *states[MAX_TOTAL_STATES];
};
-#define MAX_TOTAL_UNIFORM_BUFFERS (1 + MAX_UNIFORM_BUFFERS * MAX_STAGES)
+#define MAX_TOTAL_UNIFORM_BUFFERS ((MAX_UNIFORM_BUFFERS + \
+ MAX_INLINE_UNIFORM_BUFFERS) * MAX_STAGES)
#define MAX_TOTAL_STORAGE_BUFFERS (MAX_STORAGE_BUFFERS * MAX_STAGES)
struct buffer_bo_list {
struct v3dv_bo *ubo[MAX_TOTAL_UNIFORM_BUFFERS];
@@ -74,29 +65,36 @@ state_bo_in_list(struct state_bo_list *list, struct v3dv_bo *bo)
return false;
}
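+/* Destroy callback used with the command buffer private object machinery:
+ * the push constants BO is stored as a uint64_t handle and freed here at
+ * the end of the command buffer's lifetime.
+ */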
+static void
+push_constants_bo_free(VkDevice _device,
+ uint64_t bo_ptr,
+ VkAllocationCallbacks *alloc)
+{
+ V3DV_FROM_HANDLE(v3dv_device, device, _device);
+ v3dv_bo_free(device, (struct v3dv_bo *)(uintptr_t) bo_ptr);
+}
+
/*
* This method checks if the ubo used for push constants is needed to be
* updated or not.
*
- * push contants ubo is only used for push constants accessed by a non-const
+ * push constants ubo is only used for push constants accessed by a non-const
* index.
- *
- * FIXME: right now for this cases we are uploading the full
- * push_constants_data. An improvement would be to upload only the data that
- * we need to rely on a UBO.
*/
static void
check_push_constants_ubo(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_pipeline *pipeline)
{
- if (!(cmd_buffer->state.dirty & V3DV_CMD_DIRTY_PUSH_CONSTANTS) ||
+ if (!(cmd_buffer->state.dirty & V3DV_CMD_DIRTY_PUSH_CONSTANTS_UBO) ||
pipeline->layout->push_constant_size == 0)
return;
if (cmd_buffer->push_constants_resource.bo == NULL) {
cmd_buffer->push_constants_resource.bo =
- v3dv_bo_alloc(cmd_buffer->device, MAX_PUSH_CONSTANTS_SIZE,
- "push constants", true);
+ v3dv_bo_alloc(cmd_buffer->device, 4096, "push constants", true);
+
+ v3dv_job_add_bo(cmd_buffer->state.job,
+ cmd_buffer->push_constants_resource.bo);
if (!cmd_buffer->push_constants_resource.bo) {
fprintf(stderr, "Failed to allocate memory for push constants\n");
@@ -105,28 +103,41 @@ check_push_constants_ubo(struct v3dv_cmd_buffer *cmd_buffer,
bool ok = v3dv_bo_map(cmd_buffer->device,
cmd_buffer->push_constants_resource.bo,
- MAX_PUSH_CONSTANTS_SIZE);
+ cmd_buffer->push_constants_resource.bo->size);
if (!ok) {
fprintf(stderr, "failed to map push constants buffer\n");
abort();
}
} else {
- if (cmd_buffer->push_constants_resource.offset + MAX_PUSH_CONSTANTS_SIZE <=
+ if (cmd_buffer->push_constants_resource.offset +
+ cmd_buffer->state.push_constants_size <=
cmd_buffer->push_constants_resource.bo->size) {
- cmd_buffer->push_constants_resource.offset += MAX_PUSH_CONSTANTS_SIZE;
+ cmd_buffer->push_constants_resource.offset +=
+ cmd_buffer->state.push_constants_size;
} else {
- /* FIXME: we got out of space for push descriptors. Should we create
- * a new bo? This could be easier with a uploader
+ /* We ran out of space, so we'll have to allocate a new buffer, but we
+ * need to ensure the old one is preserved until the end of the command
+ * buffer's lifetime and is eventually freed. We use the
+ * private object machinery in the command buffer for this.
*/
+ v3dv_cmd_buffer_add_private_obj(
+ cmd_buffer, (uintptr_t) cmd_buffer->push_constants_resource.bo,
+ (v3dv_cmd_buffer_private_obj_destroy_cb) push_constants_bo_free);
+
+ /* Now call this function again so a new BO gets created */
+ cmd_buffer->push_constants_resource.bo = NULL;
+ check_push_constants_ubo(cmd_buffer, pipeline);
+ return;
}
}
+ assert(cmd_buffer->state.push_constants_size <= MAX_PUSH_CONSTANTS_SIZE);
memcpy(cmd_buffer->push_constants_resource.bo->map +
cmd_buffer->push_constants_resource.offset,
- cmd_buffer->push_constants_data,
- MAX_PUSH_CONSTANTS_SIZE);
+ cmd_buffer->state.push_constants_data,
+ cmd_buffer->state.push_constants_size);
- cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_PUSH_CONSTANTS;
+ cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_PUSH_CONSTANTS_UBO;
}
/** V3D 4.x TMU configuration parameter 0 (texture) */
@@ -203,11 +214,8 @@ write_tmu_p1(struct v3dv_cmd_buffer *cmd_buffer,
/* Set unnormalized coordinates flag from sampler object */
uint32_t p1_packed = v3d_unit_data_get_offset(data);
if (sampler->unnormalized_coordinates) {
- struct V3DX(TMU_CONFIG_PARAMETER_1) p1_unpacked;
- V3DX(TMU_CONFIG_PARAMETER_1_unpack)((uint8_t *)&p1_packed, &p1_unpacked);
- p1_unpacked.unnormalized_coordinates = true;
- V3DX(TMU_CONFIG_PARAMETER_1_pack)(NULL, (uint8_t *)&p1_packed,
- &p1_unpacked);
+ v3d_pack_unnormalized_coordinates(&cmd_buffer->device->devinfo, &p1_packed,
+ sampler->unnormalized_coordinates);
}
cl_aligned_u32(uniforms, sampler_state_reloc.bo->offset +
@@ -248,13 +256,14 @@ write_ubo_ssbo_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t dynamic_offset = 0;
- /* For ubos, index is shifted, as 0 is reserved for push constants.
+ /* For ubos, index is shifted, as 0 is reserved for push constants
+ * and 1..MAX_INLINE_UNIFORM_BUFFERS are reserved for inline uniform
+ * buffers.
*/
- if (content == QUNIFORM_UBO_ADDR &&
- v3d_unit_data_get_unit(data) == 0) {
- /* This calls is to ensure that the push_constant_ubo is
- * updated. It already take into account it is should do the
- * update or not
+ uint32_t index = v3d_unit_data_get_unit(data);
+ if (content == QUNIFORM_UBO_ADDR && index == 0) {
+ /* Ensure the push constants UBO is created and updated. This also
+ * adds the BO to the job so we don't need to track it in buffer_bos.
*/
check_push_constants_ubo(cmd_buffer, pipeline);
@@ -265,42 +274,99 @@ write_ubo_ssbo_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
cl_aligned_u32(uniforms, resource->bo->offset +
resource->offset +
offset + dynamic_offset);
- buffer_bos->ubo[0] = resource->bo;
} else {
- uint32_t index =
- content == QUNIFORM_UBO_ADDR ?
- v3d_unit_data_get_unit(data) - 1 :
- data;
+ if (content == QUNIFORM_UBO_ADDR) {
+ /* We reserve UBO index 0 for push constants in Vulkan (and for the
+ * constant buffer in GL), so the compiler always adds one to all UBO
+ * indices; fix that up before we access the descriptor map, since
+ * indices start from 0 there.
+ */
+ assert(index > 0);
+ index--;
+ } else {
+ index = data;
+ }
struct v3dv_descriptor *descriptor =
v3dv_descriptor_map_get_descriptor(descriptor_state, map,
pipeline->layout,
index, &dynamic_offset);
+
+ /* Inline UBO descriptors store UBO data in descriptor pool memory,
+ * instead of an external buffer.
+ */
assert(descriptor);
- assert(descriptor->buffer);
- assert(descriptor->buffer->mem);
- assert(descriptor->buffer->mem->bo);
if (content == QUNIFORM_GET_SSBO_SIZE ||
content == QUNIFORM_GET_UBO_SIZE) {
cl_aligned_u32(uniforms, descriptor->range);
} else {
- cl_aligned_u32(uniforms, descriptor->buffer->mem->bo->offset +
- descriptor->buffer->mem_offset +
- descriptor->offset +
- offset + dynamic_offset);
+ /* Inline uniform buffers store their contents in pool memory instead
+ * of an external buffer.
+ */
+ struct v3dv_bo *bo;
+ uint32_t addr;
+ if (descriptor->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
+ assert(dynamic_offset == 0);
+ struct v3dv_cl_reloc reloc =
+ v3dv_descriptor_map_get_descriptor_bo(cmd_buffer->device,
+ descriptor_state, map,
+ pipeline->layout, index,
+ NULL);
+ bo = reloc.bo;
+ addr = reloc.bo->offset + reloc.offset + offset;
+ } else {
+ assert(descriptor->buffer);
+ assert(descriptor->buffer->mem);
+ assert(descriptor->buffer->mem->bo);
+
+ bo = descriptor->buffer->mem->bo;
+ addr = bo->offset +
+ descriptor->buffer->mem_offset +
+ descriptor->offset +
+ offset + dynamic_offset;
+ }
+
+ cl_aligned_u32(uniforms, addr);
if (content == QUNIFORM_UBO_ADDR) {
- assert(index + 1 < MAX_TOTAL_UNIFORM_BUFFERS);
- buffer_bos->ubo[index + 1] = descriptor->buffer->mem->bo;
+ assert(index < MAX_TOTAL_UNIFORM_BUFFERS);
+ buffer_bos->ubo[index] = bo;
} else {
assert(index < MAX_TOTAL_STORAGE_BUFFERS);
- buffer_bos->ssbo[index] = descriptor->buffer->mem->bo;
+ buffer_bos->ssbo[index] = bo;
}
}
}
}
+static void
+write_inline_uniform(struct v3dv_cl_out **uniforms,
+ uint32_t index,
+ uint32_t offset,
+ struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_pipeline *pipeline,
+ enum broadcom_shader_stage stage)
+{
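+ /* Inline uniform buffer contents live in descriptor pool memory, so we
+ * read the requested 32-bit word straight from the pool BO's CPU mapping
+ * and emit it as an immediate uniform value.
+ */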
+ assert(index < MAX_INLINE_UNIFORM_BUFFERS);
+
+ struct v3dv_descriptor_state *descriptor_state =
+ v3dv_cmd_buffer_get_descriptor_state(cmd_buffer, pipeline);
+
+ struct v3dv_descriptor_map *map =
+ &pipeline->shared_data->maps[stage]->ubo_map;
+
+ struct v3dv_cl_reloc reloc =
+ v3dv_descriptor_map_get_descriptor_bo(cmd_buffer->device,
+ descriptor_state, map,
+ pipeline->layout, index,
+ NULL);
+
+ /* Offset comes in 32-bit units */
+ uint32_t *addr = reloc.bo->map + reloc.offset + 4 * offset;
+ cl_aligned_u32(uniforms, *addr);
+}
+
static uint32_t
get_texture_size_from_image_view(struct v3dv_image_view *image_view,
enum quniform_contents contents,
@@ -420,7 +486,6 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_cl_reloc uniform_stream = v3dv_cl_get_address(&job->indirect);
struct v3dv_cl_out *uniforms = cl_start(&job->indirect);
-
for (int i = 0; i < uinfo->count; i++) {
uint32_t data = uinfo->data[i];
@@ -430,24 +495,45 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer,
break;
case QUNIFORM_UNIFORM:
- cl_aligned_u32(&uniforms, cmd_buffer->push_constants_data[data]);
+ cl_aligned_u32(&uniforms, cmd_buffer->state.push_constants_data[data]);
break;
- case QUNIFORM_VIEWPORT_X_SCALE:
- cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * 256.0f);
+ case QUNIFORM_INLINE_UBO_0:
+ case QUNIFORM_INLINE_UBO_1:
+ case QUNIFORM_INLINE_UBO_2:
+ case QUNIFORM_INLINE_UBO_3:
+ write_inline_uniform(&uniforms,
+ uinfo->contents[i] - QUNIFORM_INLINE_UBO_0, data,
+ cmd_buffer, pipeline, variant->stage);
break;
- case QUNIFORM_VIEWPORT_Y_SCALE:
- cl_aligned_f(&uniforms, dynamic->viewport.scale[0][1] * 256.0f);
+ case QUNIFORM_VIEWPORT_X_SCALE: {
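+ /* The clipper XY granularity was previously hardcoded to 256.0f; it is
+ * now queried per V3D version.
+ */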
+ float clipper_xy_granularity = V3DV_X(cmd_buffer->device, CLIPPER_XY_GRANULARITY);
+ cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * clipper_xy_granularity);
break;
+ }
- case QUNIFORM_VIEWPORT_Z_OFFSET:
- cl_aligned_f(&uniforms, dynamic->viewport.translate[0][2]);
+ case QUNIFORM_VIEWPORT_Y_SCALE: {
+ float clipper_xy_granularity = V3DV_X(cmd_buffer->device, CLIPPER_XY_GRANULARITY);
+ cl_aligned_f(&uniforms, dynamic->viewport.scale[0][1] * clipper_xy_granularity);
break;
+ }
- case QUNIFORM_VIEWPORT_Z_SCALE:
- cl_aligned_f(&uniforms, dynamic->viewport.scale[0][2]);
+ case QUNIFORM_VIEWPORT_Z_OFFSET: {
+ float translate_z;
+ v3dv_cmd_buffer_state_get_viewport_z_xform(cmd_buffer, 0,
+ &translate_z, NULL);
+ cl_aligned_f(&uniforms, translate_z);
break;
+ }
+
+ case QUNIFORM_VIEWPORT_Z_SCALE: {
+ float scale_z;
+ v3dv_cmd_buffer_state_get_viewport_z_xform(cmd_buffer, 0,
+ NULL, &scale_z);
+ cl_aligned_f(&uniforms, scale_z);
+ break;
+ }
case QUNIFORM_SSBO_OFFSET:
case QUNIFORM_UBO_ADDR:
@@ -527,9 +613,9 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer,
} else if (cmd_buffer->state.framebuffer) {
num_layers = cmd_buffer->state.framebuffer->layers;
} else {
- assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
+ assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
num_layers = 2048;
-#if DEBUG
+#if MESA_DEBUG
fprintf(stderr, "Skipping gl_LayerID shader sanity check for "
"secondary command buffer\n");
#endif
@@ -571,6 +657,20 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer,
cl_aligned_u32(&uniforms, pipeline->spill.size_per_thread);
break;
+ case QUNIFORM_DRAW_ID:
+ cl_aligned_u32(&uniforms, job->cmd_buffer->state.draw_id);
+ break;
+
+ case QUNIFORM_LINE_WIDTH:
+ cl_aligned_u32(&uniforms,
+ job->cmd_buffer->vk.dynamic_graphics_state.rs.line.width);
+ break;
+
+ case QUNIFORM_AA_LINE_WIDTH:
+ cl_aligned_u32(&uniforms,
+ v3dv_get_aa_line_width(pipeline, job->cmd_buffer));
+ break;
+
default:
unreachable("unsupported quniform_contents uniform type\n");
}
diff --git a/src/broadcom/vulkan/v3dv_wsi.c b/src/broadcom/vulkan/v3dv_wsi.c
index 23c542cbc05..78af39448ce 100644
--- a/src/broadcom/vulkan/v3dv_wsi.c
+++ b/src/broadcom/vulkan/v3dv_wsi.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2020 Raspberry Pi
+ * Copyright © 2020 Raspberry Pi Ltd
* based on intel anv code:
* Copyright © 2015 Intel Corporation
@@ -24,123 +24,40 @@
*/
#include "v3dv_private.h"
-#include "drm-uapi/drm_fourcc.h"
-#include "vk_format_info.h"
#include "vk_util.h"
#include "wsi_common.h"
+#include "wsi_common_drm.h"
+#include "wsi_common_entrypoints.h"
static VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL
v3dv_wsi_proc_addr(VkPhysicalDevice physicalDevice, const char *pName)
{
V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physicalDevice);
- PFN_vkVoidFunction func;
-
- func = vk_instance_dispatch_table_get(&pdevice->vk.instance->dispatch_table, pName);
- if (func != NULL)
- return func;
-
- func = vk_physical_device_dispatch_table_get(&pdevice->vk.dispatch_table, pName);
- if (func != NULL)
- return func;
-
- return vk_device_dispatch_table_get(&vk_device_trampolines, pName);
+ return vk_instance_get_proc_addr_unchecked(pdevice->vk.instance, pName);
}
static bool
v3dv_wsi_can_present_on_device(VkPhysicalDevice _pdevice, int fd)
{
V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, _pdevice);
-
- drmDevicePtr fd_devinfo, display_devinfo;
- int ret;
-
- ret = drmGetDevice2(fd, 0, &fd_devinfo);
- if (ret)
- return false;
-
- ret = drmGetDevice2(pdevice->display_fd, 0, &display_devinfo);
- if (ret) {
- drmFreeDevice(&fd_devinfo);
- return false;
- }
-
- bool result = drmDevicesEqual(fd_devinfo, display_devinfo);
-
- drmFreeDevice(&fd_devinfo);
- drmFreeDevice(&display_devinfo);
- return result;
+ assert(pdevice->display_fd != -1);
+ return wsi_common_drm_devices_equal(fd, pdevice->display_fd);
}
-VkResult
-v3dv_wsi_init(struct v3dv_physical_device *physical_device)
-{
- VkResult result;
-
- result = wsi_device_init(&physical_device->wsi_device,
- v3dv_physical_device_to_handle(physical_device),
- v3dv_wsi_proc_addr,
- &physical_device->vk.instance->alloc,
- physical_device->master_fd, NULL, false);
-
- if (result != VK_SUCCESS)
- return result;
- physical_device->wsi_device.supports_modifiers = true;
- physical_device->wsi_device.can_present_on_device =
- v3dv_wsi_can_present_on_device;
-
- return VK_SUCCESS;
-}
-
-void
-v3dv_wsi_finish(struct v3dv_physical_device *physical_device)
-{
- wsi_device_finish(&physical_device->wsi_device,
- &physical_device->vk.instance->alloc);
-}
-
-VKAPI_ATTR void VKAPI_CALL
-v3dv_DestroySurfaceKHR(
- VkInstance _instance,
- VkSurfaceKHR _surface,
- const VkAllocationCallbacks* pAllocator)
+static void
+filter_surface_capabilities(VkSurfaceKHR _surface,
+ VkSurfaceCapabilitiesKHR *caps)
{
- V3DV_FROM_HANDLE(v3dv_instance, instance, _instance);
ICD_FROM_HANDLE(VkIcdSurfaceBase, surface, _surface);
- if (!surface)
- return;
-
- vk_free2(&instance->vk.alloc, pAllocator, surface);
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_GetPhysicalDeviceSurfaceSupportKHR(
- VkPhysicalDevice physicalDevice,
- uint32_t queueFamilyIndex,
- VkSurfaceKHR surface,
- VkBool32* pSupported)
-{
- V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice);
-
- return wsi_common_get_surface_support(&device->wsi_device,
- queueFamilyIndex,
- surface,
- pSupported);
-}
-
-static void
-constraint_surface_capabilities(VkSurfaceCapabilitiesKHR *caps)
-{
- /* Our display pipeline requires that images are linear, so we cannot
- * ensure that our swapchain images can be sampled. If we are running under
- * a compositor in windowed mode, the DRM modifier negotiation should
- * probably end up selecting an UIF layout for the swapchain images but it
- * may still choose linear and send images directly for scanout if the
- * surface is in fullscreen mode for example. If we are not running under
- * a compositor, then we would always need them to be linear anyway.
+ /* Display images must be linear, so their supported usage is restricted.
+ * This would affect sampling usage too, but we don't restrict that since
+ * we support on-the-fly conversion to UIF when sampling simple 2D images,
+ * at a performance penalty.
*/
- caps->supportedUsageFlags &= ~VK_IMAGE_USAGE_SAMPLED_BIT;
+ if (surface->platform == VK_ICD_WSI_PLATFORM_DISPLAY)
+ caps->supportedUsageFlags &= ~VK_IMAGE_USAGE_STORAGE_BIT;
}
VKAPI_ATTR VkResult VKAPI_CALL
@@ -149,13 +66,11 @@ v3dv_GetPhysicalDeviceSurfaceCapabilitiesKHR(
VkSurfaceKHR surface,
VkSurfaceCapabilitiesKHR* pSurfaceCapabilities)
{
- V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice);
-
VkResult result;
- result = wsi_common_get_surface_capabilities(&device->wsi_device,
- surface,
- pSurfaceCapabilities);
- constraint_surface_capabilities(pSurfaceCapabilities);
+ result = wsi_GetPhysicalDeviceSurfaceCapabilitiesKHR(physicalDevice,
+ surface,
+ pSurfaceCapabilities);
+ filter_surface_capabilities(surface, pSurfaceCapabilities);
return result;
}
@@ -165,227 +80,50 @@ v3dv_GetPhysicalDeviceSurfaceCapabilities2KHR(
const VkPhysicalDeviceSurfaceInfo2KHR* pSurfaceInfo,
VkSurfaceCapabilities2KHR* pSurfaceCapabilities)
{
- V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice);
-
VkResult result;
- result = wsi_common_get_surface_capabilities2(&device->wsi_device,
- pSurfaceInfo,
- pSurfaceCapabilities);
- constraint_surface_capabilities(&pSurfaceCapabilities->surfaceCapabilities);
+ result = wsi_GetPhysicalDeviceSurfaceCapabilities2KHR(physicalDevice,
+ pSurfaceInfo,
+ pSurfaceCapabilities);
+ filter_surface_capabilities(pSurfaceInfo->surface,
+ &pSurfaceCapabilities->surfaceCapabilities);
return result;
}
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_GetPhysicalDeviceSurfaceFormatsKHR(
- VkPhysicalDevice physicalDevice,
- VkSurfaceKHR surface,
- uint32_t* pSurfaceFormatCount,
- VkSurfaceFormatKHR* pSurfaceFormats)
-{
- V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice);
-
- return wsi_common_get_surface_formats(&device->wsi_device, surface,
- pSurfaceFormatCount, pSurfaceFormats);
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_GetPhysicalDeviceSurfaceFormats2KHR(
- VkPhysicalDevice physicalDevice,
- const VkPhysicalDeviceSurfaceInfo2KHR* pSurfaceInfo,
- uint32_t* pSurfaceFormatCount,
- VkSurfaceFormat2KHR* pSurfaceFormats)
-{
- V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice);
-
- return wsi_common_get_surface_formats2(&device->wsi_device, pSurfaceInfo,
- pSurfaceFormatCount, pSurfaceFormats);
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_GetPhysicalDeviceSurfacePresentModesKHR(
- VkPhysicalDevice physicalDevice,
- VkSurfaceKHR surface,
- uint32_t* pPresentModeCount,
- VkPresentModeKHR* pPresentModes)
+VkResult
+v3dv_wsi_init(struct v3dv_physical_device *physical_device)
{
- V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice);
-
- return wsi_common_get_surface_present_modes(&device->wsi_device, surface,
- pPresentModeCount,
- pPresentModes);
-}
+ VkResult result;
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_CreateSwapchainKHR(
- VkDevice _device,
- const VkSwapchainCreateInfoKHR* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkSwapchainKHR* pSwapchain)
-{
- V3DV_FROM_HANDLE(v3dv_device, device, _device);
- struct v3dv_instance *instance = device->instance;
- struct v3dv_physical_device *pdevice = &instance->physicalDevice;
- struct wsi_device *wsi_device = &pdevice->wsi_device;
+ result = wsi_device_init(&physical_device->wsi_device,
+ v3dv_physical_device_to_handle(physical_device),
+ v3dv_wsi_proc_addr,
+ &physical_device->vk.instance->alloc,
+ physical_device->display_fd, NULL,
+ &(struct wsi_device_options){.sw_device = false});
- ICD_FROM_HANDLE(VkIcdSurfaceBase, surface, pCreateInfo->surface);
- VkResult result =
- v3dv_physical_device_acquire_display(instance, pdevice, surface);
if (result != VK_SUCCESS)
return result;
- const VkAllocationCallbacks *alloc;
- if (pAllocator)
- alloc = pAllocator;
- else
- alloc = &device->vk.alloc;
-
- return wsi_common_create_swapchain(wsi_device, _device,
- pCreateInfo, alloc, pSwapchain);
-}
-
-VKAPI_ATTR void VKAPI_CALL
-v3dv_DestroySwapchainKHR(
- VkDevice _device,
- VkSwapchainKHR swapchain,
- const VkAllocationCallbacks* pAllocator)
-{
- V3DV_FROM_HANDLE(v3dv_device, device, _device);
- const VkAllocationCallbacks *alloc;
+ physical_device->wsi_device.supports_modifiers = true;
+ physical_device->wsi_device.can_present_on_device =
+ v3dv_wsi_can_present_on_device;
- if (pAllocator)
- alloc = pAllocator;
- else
- alloc = &device->vk.alloc;
+ physical_device->vk.wsi_device = &physical_device->wsi_device;
- wsi_common_destroy_swapchain(_device, swapchain, alloc);
+ return VK_SUCCESS;
}
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_GetSwapchainImagesKHR(
- VkDevice device,
- VkSwapchainKHR swapchain,
- uint32_t* pSwapchainImageCount,
- VkImage* pSwapchainImages)
+void
+v3dv_wsi_finish(struct v3dv_physical_device *physical_device)
{
- return wsi_common_get_images(swapchain,
- pSwapchainImageCount,
- pSwapchainImages);
+ physical_device->vk.wsi_device = NULL;
+ wsi_device_finish(&physical_device->wsi_device,
+ &physical_device->vk.instance->alloc);
}
struct v3dv_image *
v3dv_wsi_get_image_from_swapchain(VkSwapchainKHR swapchain, uint32_t index)
{
- uint32_t n_images = index + 1;
- VkImage *images = malloc(sizeof(*images) * n_images);
- VkResult result = wsi_common_get_images(swapchain, &n_images, images);
-
- if (result != VK_SUCCESS && result != VK_INCOMPLETE) {
- free(images);
- return NULL;
- }
-
- V3DV_FROM_HANDLE(v3dv_image, image, images[index]);
- free(images);
-
- return image;
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_AcquireNextImageKHR(
- VkDevice device,
- VkSwapchainKHR swapchain,
- uint64_t timeout,
- VkSemaphore semaphore,
- VkFence fence,
- uint32_t* pImageIndex)
-{
- VkAcquireNextImageInfoKHR acquire_info = {
- .sType = VK_STRUCTURE_TYPE_ACQUIRE_NEXT_IMAGE_INFO_KHR,
- .swapchain = swapchain,
- .timeout = timeout,
- .semaphore = semaphore,
- .fence = fence,
- .deviceMask = 0,
- };
-
- return v3dv_AcquireNextImage2KHR(device, &acquire_info, pImageIndex);
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_AcquireNextImage2KHR(
- VkDevice _device,
- const VkAcquireNextImageInfoKHR* pAcquireInfo,
- uint32_t* pImageIndex)
-{
- V3DV_FROM_HANDLE(v3dv_device, device, _device);
- V3DV_FROM_HANDLE(v3dv_fence, fence, pAcquireInfo->fence);
- V3DV_FROM_HANDLE(v3dv_semaphore, semaphore, pAcquireInfo->semaphore);
-
- struct v3dv_physical_device *pdevice = &device->instance->physicalDevice;
-
- VkResult result;
- result = wsi_common_acquire_next_image2(&pdevice->wsi_device, _device,
- pAcquireInfo, pImageIndex);
-
- if (result == VK_SUCCESS || result == VK_SUBOPTIMAL_KHR) {
- if (fence)
- drmSyncobjSignal(pdevice->render_fd, &fence->sync, 1);
- if (semaphore)
- drmSyncobjSignal(pdevice->render_fd, &semaphore->sync, 1);
- }
-
- return result;
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_QueuePresentKHR(
- VkQueue _queue,
- const VkPresentInfoKHR* pPresentInfo)
-{
- V3DV_FROM_HANDLE(v3dv_queue, queue, _queue);
- struct v3dv_physical_device *pdevice =
- &queue->device->instance->physicalDevice;
-
- return wsi_common_queue_present(&pdevice->wsi_device,
- v3dv_device_to_handle(queue->device),
- _queue, 0,
- pPresentInfo);
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_GetDeviceGroupPresentCapabilitiesKHR(
- VkDevice device,
- VkDeviceGroupPresentCapabilitiesKHR* pCapabilities)
-{
- memset(pCapabilities->presentMask, 0,
- sizeof(pCapabilities->presentMask));
- pCapabilities->presentMask[0] = 0x1;
- pCapabilities->modes = VK_DEVICE_GROUP_PRESENT_MODE_LOCAL_BIT_KHR;
-
- return VK_SUCCESS;
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_GetDeviceGroupSurfacePresentModesKHR(
- VkDevice device,
- VkSurfaceKHR surface,
- VkDeviceGroupPresentModeFlagsKHR* pModes)
-{
- *pModes = VK_DEVICE_GROUP_PRESENT_MODE_LOCAL_BIT_KHR;
-
- return VK_SUCCESS;
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_GetPhysicalDevicePresentRectanglesKHR(
- VkPhysicalDevice physicalDevice,
- VkSurfaceKHR surface,
- uint32_t* pRectCount,
- VkRect2D* pRects)
-{
- V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice);
-
- return wsi_common_get_present_rectangles(&device->wsi_device,
- surface,
- pRectCount, pRects);
+ VkImage image = wsi_common_get_image(swapchain, index);
+ return v3dv_image_from_handle(image);
}
diff --git a/src/broadcom/vulkan/v3dv_wsi_display.c b/src/broadcom/vulkan/v3dv_wsi_display.c
deleted file mode 100644
index 3d1cf91ecbe..00000000000
--- a/src/broadcom/vulkan/v3dv_wsi_display.c
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- * Copyright © 2020 Raspberry Pi
- * based on KHR_display extension code:
- * Copyright © 2017 Keith Packard
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that copyright
- * notice and this permission notice appear in supporting documentation, and
- * that the name of the copyright holders not be used in advertising or
- * publicity pertaining to distribution of the software without specific,
- * written prior permission. The copyright holders make no representations
- * about the suitability of this software for any purpose. It is provided "as
- * is" without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
- * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
- * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
- * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
- * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
- * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
- * OF THIS SOFTWARE.
- */
-#include "v3dv_private.h"
-#include "wsi_common_display.h"
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_GetPhysicalDeviceDisplayPropertiesKHR(VkPhysicalDevice physical_device,
- uint32_t *property_count,
- VkDisplayPropertiesKHR *properties)
-{
- V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physical_device);
-
- return wsi_display_get_physical_device_display_properties(
- physical_device,
- &pdevice->wsi_device,
- property_count,
- properties);
-}
-
-VkResult
-v3dv_GetPhysicalDeviceDisplayProperties2KHR(
- VkPhysicalDevice physical_device,
- uint32_t *pPropertyCount,
- VkDisplayProperties2KHR *pProperties)
-{
- V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physical_device);
-
- return wsi_display_get_physical_device_display_properties2(
- physical_device,
- &pdevice->wsi_device,
- pPropertyCount,
- pProperties);
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_GetPhysicalDeviceDisplayPlanePropertiesKHR(
- VkPhysicalDevice physical_device,
- uint32_t *property_count,
- VkDisplayPlanePropertiesKHR *properties)
-{
- V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physical_device);
-
- return wsi_display_get_physical_device_display_plane_properties(
- physical_device,
- &pdevice->wsi_device,
- property_count,
- properties);
-}
-
-VkResult
-v3dv_GetPhysicalDeviceDisplayPlaneProperties2KHR(
- VkPhysicalDevice physical_device,
- uint32_t *pPropertyCount,
- VkDisplayPlaneProperties2KHR *pProperties)
-{
- V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physical_device);
-
- return wsi_display_get_physical_device_display_plane_properties2(
- physical_device,
- &pdevice->wsi_device,
- pPropertyCount,
- pProperties);
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_GetDisplayPlaneSupportedDisplaysKHR(VkPhysicalDevice physical_device,
- uint32_t plane_index,
- uint32_t *display_count,
- VkDisplayKHR *displays)
-{
- V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physical_device);
-
- return wsi_display_get_display_plane_supported_displays(
- physical_device,
- &pdevice->wsi_device,
- plane_index,
- display_count,
- displays);
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_GetDisplayModePropertiesKHR(VkPhysicalDevice physical_device,
- VkDisplayKHR display,
- uint32_t *property_count,
- VkDisplayModePropertiesKHR *properties)
-{
- V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physical_device);
-
- return wsi_display_get_display_mode_properties(physical_device,
- &pdevice->wsi_device,
- display,
- property_count,
- properties);
-}
-
-VkResult
-v3dv_GetDisplayModeProperties2KHR(VkPhysicalDevice physical_device,
- VkDisplayKHR display,
- uint32_t *pPropertyCount,
- VkDisplayModeProperties2KHR *pProperties)
-{
- V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physical_device);
-
- return wsi_display_get_display_mode_properties2(physical_device,
- &pdevice->wsi_device,
- display,
- pPropertyCount,
- pProperties);
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_CreateDisplayModeKHR(VkPhysicalDevice physical_device,
- VkDisplayKHR display,
- const VkDisplayModeCreateInfoKHR *create_info,
- const VkAllocationCallbacks *allocator,
- VkDisplayModeKHR *mode)
-{
- V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physical_device);
-
- return wsi_display_create_display_mode(physical_device,
- &pdevice->wsi_device,
- display,
- create_info,
- allocator,
- mode);
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_GetDisplayPlaneCapabilitiesKHR(VkPhysicalDevice physical_device,
- VkDisplayModeKHR mode_khr,
- uint32_t plane_index,
- VkDisplayPlaneCapabilitiesKHR *capabilities)
-{
- V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physical_device);
-
- return wsi_get_display_plane_capabilities(physical_device,
- &pdevice->wsi_device,
- mode_khr,
- plane_index,
- capabilities);
-}
-
-VkResult
-v3dv_GetDisplayPlaneCapabilities2KHR(
- VkPhysicalDevice physical_device,
- const VkDisplayPlaneInfo2KHR *pDisplayPlaneInfo,
- VkDisplayPlaneCapabilities2KHR *pCapabilities)
-{
- V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physical_device);
-
- return wsi_get_display_plane_capabilities2(physical_device,
- &pdevice->wsi_device,
- pDisplayPlaneInfo,
- pCapabilities);
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_CreateDisplayPlaneSurfaceKHR(
- VkInstance _instance,
- const VkDisplaySurfaceCreateInfoKHR *create_info,
- const VkAllocationCallbacks *allocator,
- VkSurfaceKHR *surface)
-{
- V3DV_FROM_HANDLE(v3dv_instance, instance, _instance);
- const VkAllocationCallbacks *alloc;
-
- if (allocator)
- alloc = allocator;
- else
- alloc = &instance->vk.alloc;
-
- return wsi_create_display_surface(_instance, alloc,
- create_info, surface);
-}
diff --git a/src/broadcom/vulkan/v3dv_wsi_wayland.c b/src/broadcom/vulkan/v3dv_wsi_wayland.c
deleted file mode 100644
index e61abf3c724..00000000000
--- a/src/broadcom/vulkan/v3dv_wsi_wayland.c
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright © 2020 Ella Stanforth
- * based on intel anv code:
- * Copyright © 2015 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "wsi_common_wayland.h"
-#include "v3dv_private.h"
-
-VKAPI_ATTR VkBool32 VKAPI_CALL
-v3dv_GetPhysicalDeviceWaylandPresentationSupportKHR(
- VkPhysicalDevice physicalDevice,
- uint32_t queueFamilyIndex,
- struct wl_display* display)
-{
- V3DV_FROM_HANDLE(v3dv_physical_device, physical_device, physicalDevice);
-
- return wsi_wl_get_presentation_support(&physical_device->wsi_device, display);
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_CreateWaylandSurfaceKHR(
- VkInstance _instance,
- const VkWaylandSurfaceCreateInfoKHR* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkSurfaceKHR* pSurface)
-{
- V3DV_FROM_HANDLE(v3dv_instance, instance, _instance);
- const VkAllocationCallbacks *alloc;
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_WAYLAND_SURFACE_CREATE_INFO_KHR);
-
- if (pAllocator)
- alloc = pAllocator;
- else
- alloc = &instance->vk.alloc;
-
- return wsi_create_wl_surface(alloc, pCreateInfo, pSurface);
-}
diff --git a/src/broadcom/vulkan/v3dv_wsi_x11.c b/src/broadcom/vulkan/v3dv_wsi_x11.c
deleted file mode 100644
index 4fa99ccd5ab..00000000000
--- a/src/broadcom/vulkan/v3dv_wsi_x11.c
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright © 2020 Raspberry Pi
- *
- * based mostly on anv driver which is:
- * Copyright © 2015 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include <X11/Xlib-xcb.h>
-#include <X11/xshmfence.h>
-#include <xcb/xcb.h>
-#include <xcb/dri3.h>
-#include <xcb/present.h>
-
-#include "wsi_common_x11.h"
-#include "v3dv_private.h"
-
-VKAPI_ATTR VkBool32 VKAPI_CALL
-v3dv_GetPhysicalDeviceXcbPresentationSupportKHR(
- VkPhysicalDevice physicalDevice,
- uint32_t queueFamilyIndex,
- xcb_connection_t* connection,
- xcb_visualid_t visual_id)
-{
- V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice);
-
- return wsi_get_physical_device_xcb_presentation_support(
- &device->wsi_device,
- queueFamilyIndex,
- connection, visual_id);
-}
-
-VKAPI_ATTR VkBool32 VKAPI_CALL
-v3dv_GetPhysicalDeviceXlibPresentationSupportKHR(
- VkPhysicalDevice physicalDevice,
- uint32_t queueFamilyIndex,
- Display* dpy,
- VisualID visualID)
-{
- V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice);
-
- return wsi_get_physical_device_xcb_presentation_support(
- &device->wsi_device,
- queueFamilyIndex,
- XGetXCBConnection(dpy), visualID);
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_CreateXcbSurfaceKHR(
- VkInstance _instance,
- const VkXcbSurfaceCreateInfoKHR* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkSurfaceKHR* pSurface)
-{
- V3DV_FROM_HANDLE(v3dv_instance, instance, _instance);
- const VkAllocationCallbacks *alloc;
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_XCB_SURFACE_CREATE_INFO_KHR);
-
- if (pAllocator)
- alloc = pAllocator;
- else
- alloc = &instance->vk.alloc;
-
- return wsi_create_xcb_surface(alloc, pCreateInfo, pSurface);
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_CreateXlibSurfaceKHR(
- VkInstance _instance,
- const VkXlibSurfaceCreateInfoKHR* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkSurfaceKHR* pSurface)
-{
- V3DV_FROM_HANDLE(v3dv_instance, instance, _instance);
- const VkAllocationCallbacks *alloc;
-
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_XLIB_SURFACE_CREATE_INFO_KHR);
-
- if (pAllocator)
- alloc = pAllocator;
- else
- alloc = &instance->vk.alloc;
-
- return wsi_create_xlib_surface(alloc, pCreateInfo, pSurface);
-}
diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
index c2f2c77864b..d7fb087d9a8 100644
--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2021 Raspberry Pi
+ * Copyright © 2021 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -23,14 +23,13 @@
#include "v3dv_private.h"
#include "broadcom/common/v3d_macros.h"
+#include "broadcom/common/v3d_util.h"
#include "broadcom/cle/v3dx_pack.h"
#include "broadcom/compiler/v3d_compiler.h"
#include "util/half_float.h"
-#include "vulkan/util/vk_format.h"
#include "util/u_pack_color.h"
-
-#include "vk_format_info.h"
+#include "vk_format.h"
void
v3dX(job_emit_binning_flush)(struct v3dv_job *job)
@@ -44,6 +43,34 @@ v3dX(job_emit_binning_flush)(struct v3dv_job *job)
}
void
+v3dX(job_emit_enable_double_buffer)(struct v3dv_job *job)
+{
+ assert(job->can_use_double_buffer);
+ assert(job->frame_tiling.double_buffer);
+ assert(!job->frame_tiling.msaa);
+ assert(job->bcl_tile_binning_mode_ptr);
+
+ const struct v3dv_frame_tiling *tiling = &job->frame_tiling;
+ struct cl_packet_struct(TILE_BINNING_MODE_CFG) config = {
+ cl_packet_header(TILE_BINNING_MODE_CFG),
+ };
+ config.width_in_pixels = tiling->width;
+ config.height_in_pixels = tiling->height;
+#if V3D_VERSION == 42
+ config.number_of_render_targets = MAX2(tiling->render_target_count, 1);
+ config.multisample_mode_4x = tiling->msaa;
+ config.double_buffer_in_non_ms_mode = tiling->double_buffer;
+ config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
+#endif
+#if V3D_VERSION >= 71
+ unreachable("HW generation 71 not supported yet.");
+#endif
+
+ uint8_t *rewrite_addr = (uint8_t *)job->bcl_tile_binning_mode_ptr;
+ cl_packet_pack(TILE_BINNING_MODE_CFG)(NULL, rewrite_addr, &config);
+}
+
+void
v3dX(job_emit_binning_prolog)(struct v3dv_job *job,
const struct v3dv_frame_tiling *tiling,
uint32_t layers)
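
The job_emit_enable_double_buffer hunk above depends on the binning prolog having recorded bcl_tile_binning_mode_ptr, so the already-emitted TILE_BINNING_MODE_CFG packet can be re-packed in place once the job is known to be double-buffer safe. A standalone sketch of that record-then-rewrite pattern (illustration only, not driver code; the struct and values are made up):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Stand-in for the packed TILE_BINNING_MODE_CFG packet. */
    struct fake_binning_cfg { uint32_t width, height; uint8_t double_buffer; };

    int main(void)
    {
       uint8_t cl[64];
       size_t cursor = 0;

       /* Emit with conservative settings and remember the offset, like
        * job->bcl_tile_binning_mode_ptr in the prolog. */
       size_t cfg_offset = cursor;
       struct fake_binning_cfg cfg = { 1920, 1080, 0 };
       memcpy(cl + cursor, &cfg, sizeof(cfg));
       cursor += sizeof(cfg);

       /* Later, once double buffering is known to be safe, re-pack in place. */
       cfg.double_buffer = 1;
       memcpy(cl + cfg_offset, &cfg, sizeof(cfg));

       struct fake_binning_cfg readback;
       memcpy(&readback, cl + cfg_offset, sizeof(readback));
       printf("double_buffer = %u\n", readback.double_buffer);
       return 0;
    }
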
@@ -55,12 +82,27 @@ v3dX(job_emit_binning_prolog)(struct v3dv_job *job,
config.number_of_layers = layers;
}
+ assert(!tiling->double_buffer || !tiling->msaa);
+ job->bcl_tile_binning_mode_ptr = cl_start(&job->bcl);
cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
config.width_in_pixels = tiling->width;
config.height_in_pixels = tiling->height;
+#if V3D_VERSION == 42
config.number_of_render_targets = MAX2(tiling->render_target_count, 1);
config.multisample_mode_4x = tiling->msaa;
+ config.double_buffer_in_non_ms_mode = tiling->double_buffer;
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
+#endif
+#if V3D_VERSION >= 71
+ config.log2_tile_width = log2_tile_size(tiling->tile_width);
+ config.log2_tile_height = log2_tile_size(tiling->tile_height);
+ /* FIXME: ideally we would like the following assert to be on the packet
+ * header (it is generic, so it also applies to GL). We would need to
+ * expand gen_pack_header for that.
+ */
+ assert(config.log2_tile_width == config.log2_tile_height ||
+ config.log2_tile_width == config.log2_tile_height + 1);
+#endif
}
/* There's definitely nothing in the VCD cache we want. */
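
The log2_tile_width/log2_tile_height assert added above appears to encode the constraint that the tile width is either equal to the tile height or exactly one power-of-two step larger. A quick standalone check of that relationship (illustration only; log2_tile_size() is assumed to simply take the log2 of a power-of-two dimension):

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t log2_u32(uint32_t v)
    {
       uint32_t l = 0;
       while (v > 1) {
          v >>= 1;
          l++;
       }
       return l;
    }

    int main(void)
    {
       const uint32_t sizes[][2] = { {64, 64}, {128, 64}, {32, 64} };
       for (unsigned i = 0; i < 3; i++) {
          uint32_t lw = log2_u32(sizes[i][0]);
          uint32_t lh = log2_u32(sizes[i][1]);
          int ok = lw == lh || lw == lh + 1;
          printf("%ux%u -> log2 %u/%u: %s\n", sizes[i][0], sizes[i][1],
                 lw, lh, ok ? "valid" : "invalid");
       }
       return 0;
    }
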
@@ -106,18 +148,45 @@ cmd_buffer_render_pass_emit_load(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t buffer)
{
const struct v3dv_image *image = (struct v3dv_image *) iview->vk.image;
+
+ /* We don't support rendering to ycbcr images, so the image view should be
+ * single-plane, and use a single-plane format. But note that the underlying
+ * image can be a ycbcr format, as we support rendering to a specific plane
+ * of an image. This is used for example on some meta_copy code paths, in
+ * order to copy from/to a plane of a ycbcr image.
+ */
+ assert(iview->plane_count == 1);
+ assert(iview->format->plane_count == 1);
+
+ uint8_t image_plane = v3dv_plane_from_aspect(iview->vk.aspects);
const struct v3d_resource_slice *slice =
- &image->slices[iview->vk.base_mip_level];
+ &image->planes[image_plane].slices[iview->vk.base_mip_level];
+
uint32_t layer_offset =
v3dv_layer_offset(image, iview->vk.base_mip_level,
- iview->vk.base_array_layer + layer);
+ iview->vk.base_array_layer + layer, image_plane);
cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) {
load.buffer_to_load = buffer;
- load.address = v3dv_cl_address(image->mem->bo, layer_offset);
+ load.address = v3dv_cl_address(image->planes[image_plane].mem->bo, layer_offset);
+
+ load.input_image_format = iview->format->planes[0].rt_type;
+
+ /* If we create an image view with only the stencil format, we
+ * re-interpret the format as RGBA8_UINT, as it is what we want in
+ * general (see CreateImageView).
+ *
+ * However, when we are loading/storing tiles from the ZSTENCIL tile
+ * buffer, we need to use the underlying DS format.
+ */
+ if (buffer == ZSTENCIL &&
+ iview->format->planes[0].rt_type == V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI) {
+ assert(image->format->planes[image_plane].rt_type == V3D_OUTPUT_IMAGE_FORMAT_D24S8);
+ load.input_image_format = image->format->planes[image_plane].rt_type;
+ }
- load.input_image_format = iview->format->rt_type;
- load.r_b_swap = iview->swap_rb;
+ load.r_b_swap = iview->planes[0].swap_rb;
+ load.channel_reverse = iview->planes[0].channel_reverse;
load.memory_format = slice->tiling;
if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
@@ -135,38 +204,6 @@ cmd_buffer_render_pass_emit_load(struct v3dv_cmd_buffer *cmd_buffer,
}
}
-static bool
-check_needs_load(const struct v3dv_cmd_buffer_state *state,
- VkImageAspectFlags aspect,
- uint32_t first_subpass_idx,
- VkAttachmentLoadOp load_op)
-{
- /* We call this with image->vk.aspects & aspect, so 0 means the aspect we are
- * testing does not exist in the image.
- */
- if (!aspect)
- return false;
-
- /* Attachment (or view) load operations apply on the first subpass that
- * uses the attachment (or view), otherwise we always need to load.
- */
- if (state->job->first_subpass > first_subpass_idx)
- return true;
-
- /* If the job is continuing a subpass started in another job, we always
- * need to load.
- */
- if (state->job->is_subpass_continue)
- return true;
-
- /* If the area is not aligned to tile boundaries, we always need to load */
- if (!state->tile_aligned_render_area)
- return true;
-
- /* The attachment load operations must be LOAD */
- return load_op == VK_ATTACHMENT_LOAD_OP_LOAD;
-}
-
static inline uint32_t
v3dv_zs_buffer(bool depth, bool stencil)
{
@@ -185,7 +222,6 @@ cmd_buffer_render_pass_emit_loads(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t layer)
{
const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
- const struct v3dv_framebuffer *framebuffer = state->framebuffer;
const struct v3dv_render_pass *pass = state->pass;
const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
@@ -222,12 +258,20 @@ cmd_buffer_render_pass_emit_loads(struct v3dv_cmd_buffer *cmd_buffer,
attachment->first_subpass :
attachment->views[layer].first_subpass;
- bool needs_load = check_needs_load(state,
- VK_IMAGE_ASPECT_COLOR_BIT,
- first_subpass,
- attachment->desc.loadOp);
+ uint32_t last_subpass = !pass->multiview_enabled ?
+ attachment->last_subpass :
+ attachment->views[layer].last_subpass;
+
+ bool needs_load =
+ v3dv_cmd_buffer_check_needs_load(state,
+ VK_IMAGE_ASPECT_COLOR_BIT,
+ first_subpass,
+ attachment->desc.loadOp,
+ last_subpass,
+ attachment->desc.storeOp);
if (needs_load) {
- struct v3dv_image_view *iview = framebuffer->attachments[attachment_idx];
+ struct v3dv_image_view *iview =
+ state->attachments[attachment_idx].image_view;
cmd_buffer_render_pass_emit_load(cmd_buffer, cl, iview,
layer, RENDER_TARGET_0 + i);
}
@@ -245,21 +289,29 @@ cmd_buffer_render_pass_emit_loads(struct v3dv_cmd_buffer *cmd_buffer,
ds_attachment->first_subpass :
ds_attachment->views[layer].first_subpass;
+ uint32_t ds_last_subpass = !pass->multiview_enabled ?
+ ds_attachment->last_subpass :
+ ds_attachment->views[layer].last_subpass;
+
const bool needs_depth_load =
- check_needs_load(state,
- ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
- ds_first_subpass,
- ds_attachment->desc.loadOp);
+ v3dv_cmd_buffer_check_needs_load(state,
+ ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
+ ds_first_subpass,
+ ds_attachment->desc.loadOp,
+ ds_last_subpass,
+ ds_attachment->desc.storeOp);
const bool needs_stencil_load =
- check_needs_load(state,
- ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
- ds_first_subpass,
- ds_attachment->desc.stencilLoadOp);
+ v3dv_cmd_buffer_check_needs_load(state,
+ ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
+ ds_first_subpass,
+ ds_attachment->desc.stencilLoadOp,
+ ds_last_subpass,
+ ds_attachment->desc.stencilStoreOp);
if (needs_depth_load || needs_stencil_load) {
struct v3dv_image_view *iview =
- framebuffer->attachments[ds_attachment_idx];
+ state->attachments[ds_attachment_idx].image_view;
/* From the Vulkan spec:
*
* "When an image view of a depth/stencil image is used as a
@@ -290,21 +342,53 @@ cmd_buffer_render_pass_emit_store(struct v3dv_cmd_buffer *cmd_buffer,
bool is_multisample_resolve)
{
const struct v3dv_image_view *iview =
- cmd_buffer->state.framebuffer->attachments[attachment_idx];
+ cmd_buffer->state.attachments[attachment_idx].image_view;
const struct v3dv_image *image = (struct v3dv_image *) iview->vk.image;
+
+ /* We don't support rendering to ycbcr images, so the image view should be
+ * single-plane, and use a single-plane format. But note that the underlying
+ * image can be a ycbcr format, as we support rendering to a specific plane
+ * of an image. This is used for example on some meta_copy code paths, in
+ * order to copy from/to a plane of a ycbcr image.
+ */
+ assert(iview->plane_count == 1);
+ assert(iview->format->plane_count == 1);
+
+ uint8_t image_plane = v3dv_plane_from_aspect(iview->vk.aspects);
const struct v3d_resource_slice *slice =
- &image->slices[iview->vk.base_mip_level];
+ &image->planes[image_plane].slices[iview->vk.base_mip_level];
uint32_t layer_offset = v3dv_layer_offset(image,
iview->vk.base_mip_level,
- iview->vk.base_array_layer + layer);
+ iview->vk.base_array_layer + layer,
+ image_plane);
+
+ /* The Clear Buffer bit is not supported for Z/Stencil stores in 7.x and it
+ * is broken in earlier V3D versions.
+ */
+ assert((buffer != Z && buffer != STENCIL && buffer != ZSTENCIL) || !clear);
cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
store.buffer_to_store = buffer;
- store.address = v3dv_cl_address(image->mem->bo, layer_offset);
+ store.address = v3dv_cl_address(image->planes[image_plane].mem->bo, layer_offset);
store.clear_buffer_being_stored = clear;
- store.output_image_format = iview->format->rt_type;
- store.r_b_swap = iview->swap_rb;
+ store.output_image_format = iview->format->planes[0].rt_type;
+
+ /* If we create an image view with only the stencil format, we
+ * re-interpret the format as RGBA8_UINT, as it is what we want in
+ * general (see CreateImageView).
+ *
+ * However, when we are loading/storing tiles from the ZSTENCIL tile
+ * buffer, we need to use the underlying DS format.
+ */
+ if (buffer == ZSTENCIL &&
+ iview->format->planes[0].rt_type == V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI) {
+ assert(image->format->planes[image_plane].rt_type == V3D_OUTPUT_IMAGE_FORMAT_D24S8);
+ store.output_image_format = image->format->planes[image_plane].rt_type;
+ }
+
+ store.r_b_swap = iview->planes[0].swap_rb;
+ store.channel_reverse = iview->planes[0].channel_reverse;
store.memory_format = slice->tiling;
if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
@@ -349,7 +433,7 @@ check_needs_clear(const struct v3dv_cmd_buffer_state *state,
if (state->job->is_subpass_continue)
return false;
- /* If the render area is not aligned to tile boudaries we can't use the
+ /* If the render area is not aligned to tile boundaries we can't use the
* TLB for a clear.
*/
if (!state->tile_aligned_render_area)
@@ -366,36 +450,6 @@ check_needs_clear(const struct v3dv_cmd_buffer_state *state,
return load_op == VK_ATTACHMENT_LOAD_OP_CLEAR;
}
-static bool
-check_needs_store(const struct v3dv_cmd_buffer_state *state,
- VkImageAspectFlags aspect,
- uint32_t last_subpass_idx,
- VkAttachmentStoreOp store_op)
-{
- /* We call this with image->vk.aspects & aspect, so 0 means the aspect we are
- * testing does not exist in the image.
- */
- if (!aspect)
- return false;
-
- /* Attachment (or view) store operations only apply on the last subpass
- * where the attachment (or view) is used, in other subpasses we always
- * need to store.
- */
- if (state->subpass_idx < last_subpass_idx)
- return true;
-
- /* Attachment store operations only apply on the last job we emit on the the
- * last subpass where the attachment is used, otherwise we always need to
- * store.
- */
- if (!state->job->is_subpass_finish)
- return true;
-
- /* The attachment store operation must be STORE */
- return store_op == VK_ATTACHMENT_STORE_OP_STORE;
-}
-
static void
cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_cl *cl,
@@ -435,6 +489,30 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
const VkImageAspectFlags aspects =
vk_format_aspects(ds_attachment->desc.format);
+#if V3D_VERSION <= 42
+ /* GFXH-1689: The per-buffer store command's clear buffer bit is broken
+ * for depth/stencil.
+ *
+ * There used to be some confusion regarding the Clear Tile Buffers
+ * Z/S bit also being broken, but we confirmed with Broadcom that this
+ * is not the case, it was just that some other hardware bugs (that we
+ * need to work around, such as GFXH-1461) could cause this bit to behave
+ * incorrectly.
+ *
+ * There used to be another issue where the RTs bit in the Clear Tile
+ * Buffers packet also cleared Z/S, but Broadcom confirmed this is
+ * fixed since V3D 4.1.
+ *
+ * So if we have to emit a clear of depth or stencil we don't use
+ * the per-buffer store clear bit, even if we need to store the buffers,
+ * instead we always have to use the Clear Tile Buffers Z/S bit.
+ * If we have configured the job to do early Z/S clearing, then we
+ * don't want to emit any Clear Tile Buffers command at all here.
+ *
+ * Note that GFXH-1689 is not reproduced in the simulator, where
+ * using the clear buffer bit in depth/stencil stores works fine.
+ */
+
/* Only clear once on the first subpass that uses the attachment */
uint32_t ds_first_subpass = !state->pass->multiview_enabled ?
ds_attachment->first_subpass :
@@ -454,47 +532,59 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
ds_attachment->desc.stencilLoadOp,
subpass->do_stencil_clear_with_draw);
+ use_global_zs_clear = !state->job->early_zs_clear &&
+ (needs_depth_clear || needs_stencil_clear);
+#endif
+#if V3D_VERSION >= 71
+ /* The store command's clear buffer bit cannot be used for Z/S: since
+ * V3D 4.5.6, Z/S buffers are automatically cleared between tiles, so we
+ * don't want to emit redundant clears here.
+ */
+ use_global_zs_clear = false;
+#endif
+
/* Skip the last store if it is not required */
uint32_t ds_last_subpass = !pass->multiview_enabled ?
ds_attachment->last_subpass :
ds_attachment->views[layer].last_subpass;
bool needs_depth_store =
- check_needs_store(state,
- aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
- ds_last_subpass,
- ds_attachment->desc.storeOp);
+ v3dv_cmd_buffer_check_needs_store(state,
+ aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
+ ds_last_subpass,
+ ds_attachment->desc.storeOp);
bool needs_stencil_store =
- check_needs_store(state,
- aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
- ds_last_subpass,
- ds_attachment->desc.stencilStoreOp);
+ v3dv_cmd_buffer_check_needs_store(state,
+ aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
+ ds_last_subpass,
+ ds_attachment->desc.stencilStoreOp);
+
+ /* If we have a resolve, handle it before storing the tile */
+ const struct v3dv_cmd_buffer_attachment_state *ds_att_state =
+ &state->attachments[ds_attachment_idx];
+ if (ds_att_state->use_tlb_resolve) {
+ assert(ds_att_state->has_resolve);
+ assert(subpass->resolve_depth || subpass->resolve_stencil);
+ const uint32_t resolve_attachment_idx =
+ subpass->ds_resolve_attachment.attachment;
+ assert(resolve_attachment_idx != VK_ATTACHMENT_UNUSED);
+
+ const uint32_t zs_buffer =
+ v3dv_zs_buffer(subpass->resolve_depth, subpass->resolve_stencil);
+ cmd_buffer_render_pass_emit_store(cmd_buffer, cl,
+ resolve_attachment_idx, layer,
+ zs_buffer,
+ false, false);
+ has_stores = true;
+ } else if (ds_att_state->has_resolve) {
+ /* If we can't use the TLB to implement the resolve we will need to
+ * store the attachment so we can implement it later using a blit.
+ */
+ needs_depth_store = subpass->resolve_depth;
+ needs_stencil_store = subpass->resolve_stencil;
+ }
- /* GFXH-1689: The per-buffer store command's clear buffer bit is broken
- * for depth/stencil.
- *
- * There used to be some confusion regarding the Clear Tile Buffers
- * Z/S bit also being broken, but we confirmed with Broadcom that this
- * is not the case, it was just that some other hardware bugs (that we
- * need to work around, such as GFXH-1461) could cause this bit to behave
- * incorrectly.
- *
- * There used to be another issue where the RTs bit in the Clear Tile
- * Buffers packet also cleared Z/S, but Broadcom confirmed this is
- * fixed since V3D 4.1.
- *
- * So if we have to emit a clear of depth or stencil we don't use
- * the per-buffer store clear bit, even if we need to store the buffers,
- * instead we always have to use the Clear Tile Buffers Z/S bit.
- * If we have configured the job to do early Z/S clearing, then we
- * don't want to emit any Clear Tile Buffers command at all here.
- *
- * Note that GFXH-1689 is not reproduced in the simulator, where
- * using the clear buffer bit in depth/stencil stores works fine.
- */
- use_global_zs_clear = !state->job->early_zs_clear &&
- (needs_depth_clear || needs_stencil_clear);
if (needs_depth_store || needs_stencil_store) {
const uint32_t zs_buffer =
v3dv_zs_buffer(needs_depth_store, needs_stencil_store);
@@ -536,10 +626,10 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
attachment->views[layer].last_subpass;
bool needs_store =
- check_needs_store(state,
- VK_IMAGE_ASPECT_COLOR_BIT,
- last_subpass,
- attachment->desc.storeOp);
+ v3dv_cmd_buffer_check_needs_store(state,
+ VK_IMAGE_ASPECT_COLOR_BIT,
+ last_subpass,
+ attachment->desc.storeOp);
/* If we need to resolve this attachment emit that store first. Notice
* that we must not request a tile buffer clear here in that case, since
@@ -547,15 +637,16 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
* color attachment store below, since the clear happens after the
* store is completed.
*
- * If the attachment doesn't support TLB resolves then we will have to
- * fallback to doing the resolve in a shader separately after this
- * job, so we will need to store the multisampled sttachment even if that
- * wansn't requested by the client.
+ * If the attachment doesn't support TLB resolves (or the render area
+ * is not aligned to tile boundaries) then we will have to fallback to
+ * doing the resolve in a shader separately after this job, so we will
+ * need to store the multisampled attachment even if that wasn't
+ * requested by the client.
*/
- const bool needs_resolve =
- subpass->resolve_attachments &&
- subpass->resolve_attachments[i].attachment != VK_ATTACHMENT_UNUSED;
- if (needs_resolve && attachment->use_tlb_resolve) {
+ const struct v3dv_cmd_buffer_attachment_state *att_state =
+ &state->attachments[attachment_idx];
+ if (att_state->use_tlb_resolve) {
+ assert(att_state->has_resolve);
const uint32_t resolve_attachment_idx =
subpass->resolve_attachments[i].attachment;
cmd_buffer_render_pass_emit_store(cmd_buffer, cl,
@@ -563,7 +654,7 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
RENDER_TARGET_0 + i,
false, true);
has_stores = true;
- } else if (needs_resolve) {
+ } else if (att_state->has_resolve) {
needs_store = true;
}
@@ -591,10 +682,15 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
* bit and instead we have to emit a single clear of all tile buffers.
*/
if (use_global_zs_clear || use_global_rt_clear) {
+#if V3D_VERSION == 42
cl_emit(cl, CLEAR_TILE_BUFFERS, clear) {
clear.clear_z_stencil_buffer = use_global_zs_clear;
clear.clear_all_render_targets = use_global_rt_clear;
}
+#endif
+#if V3D_VERSION >= 71
+ cl_emit(cl, CLEAR_RENDER_TARGETS, clear);
+#endif
}
}
@@ -698,11 +794,8 @@ set_rcl_early_z_config(struct v3dv_job *job,
bool *early_z_disable,
uint32_t *early_z_test_and_update_direction)
{
- /* If this is true then we have not emitted any draw calls in this job
- * and we don't get any benefits form early Z.
- */
- if (!job->decided_global_ez_enable) {
- assert(job->draw_count == 0);
+ /* Disable if none of the draw calls in this job enabled EZ */
+ if (!job->has_ez_draws) {
*early_z_disable = true;
return;
}
@@ -723,6 +816,103 @@ set_rcl_early_z_config(struct v3dv_job *job,
}
}
+/* Note that for v71, render target cfg packets have just one field that
+ * combines the internal type and clamp mode. For simplicity we keep just one
+ * helper.
+ *
+ * Note: rt_type is in fact an "enum V3DX(Internal_Type)".
+ *
+ * FIXME: for v71 we are not returning all the possible combinations for
+ * render target internal type and clamp. For example, for int types we are
+ * always using clamp int, and for 16f we are using clamp none or pos (which
+ * seems to be the equivalent of no-clamp on 4.2), but not pq or hlg. In
+ * summary, right now we are just porting what we were doing on 4.2.
+ */
+uint32_t
+v3dX(clamp_for_format_and_type)(uint32_t rt_type,
+ VkFormat vk_format)
+{
+#if V3D_VERSION == 42
+ if (vk_format_is_int(vk_format))
+ return V3D_RENDER_TARGET_CLAMP_INT;
+ else if (vk_format_is_srgb(vk_format))
+ return V3D_RENDER_TARGET_CLAMP_NORM;
+ else
+ return V3D_RENDER_TARGET_CLAMP_NONE;
+#endif
+#if V3D_VERSION >= 71
+ switch (rt_type) {
+ case V3D_INTERNAL_TYPE_8I:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_8I_CLAMPED;
+ case V3D_INTERNAL_TYPE_8UI:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_8UI_CLAMPED;
+ case V3D_INTERNAL_TYPE_8:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_8;
+ case V3D_INTERNAL_TYPE_16I:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_16I_CLAMPED;
+ case V3D_INTERNAL_TYPE_16UI:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_16UI_CLAMPED;
+ case V3D_INTERNAL_TYPE_16F:
+ return vk_format_is_srgb(vk_format) ?
+ V3D_RENDER_TARGET_TYPE_CLAMP_16F_CLAMP_NORM :
+ V3D_RENDER_TARGET_TYPE_CLAMP_16F;
+ case V3D_INTERNAL_TYPE_32I:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_32I_CLAMPED;
+ case V3D_INTERNAL_TYPE_32UI:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_32UI_CLAMPED;
+ case V3D_INTERNAL_TYPE_32F:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_32F;
+ default:
+ unreachable("Unknown internal render target type");
+ }
+
+ return V3D_RENDER_TARGET_TYPE_CLAMP_INVALID;
+#endif
+}
+
+static void
+cmd_buffer_render_pass_setup_render_target(struct v3dv_cmd_buffer *cmd_buffer,
+ int rt,
+ uint32_t *rt_bpp,
+#if V3D_VERSION == 42
+ uint32_t *rt_type,
+ uint32_t *rt_clamp)
+#else
+ uint32_t *rt_type_clamp)
+#endif
+{
+ const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
+
+ assert(state->subpass_idx < state->pass->subpass_count);
+ const struct v3dv_subpass *subpass =
+ &state->pass->subpasses[state->subpass_idx];
+
+ if (rt >= subpass->color_count)
+ return;
+
+ struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt];
+ const uint32_t attachment_idx = attachment->attachment;
+ if (attachment_idx == VK_ATTACHMENT_UNUSED)
+ return;
+
+ assert(attachment_idx < state->framebuffer->attachment_count &&
+ attachment_idx < state->attachment_alloc_count);
+ struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view;
+ assert(vk_format_is_color(iview->vk.format));
+
+ assert(iview->plane_count == 1);
+ *rt_bpp = iview->planes[0].internal_bpp;
+#if V3D_VERSION == 42
+ *rt_type = iview->planes[0].internal_type;
+ *rt_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type,
+ iview->vk.format);
+#endif
+#if V3D_VERSION >= 71
+ *rt_type_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type,
+ iview->vk.format);
+#endif
+}
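
v3dX(clamp_for_format_and_type) above keeps the 4.2 behaviour of picking a clamp mode purely from the format class, while on 7.1 it folds the internal type and clamp into a single value. A minimal sketch of the 4.2 classification (illustration only, not driver code):

    #include <stdio.h>

    enum clamp { CLAMP_NONE, CLAMP_NORM, CLAMP_INT };

    /* Mirrors the 4.2 branch above: integer formats clamp as int, sRGB
     * formats clamp to the normalized range, everything else is unclamped. */
    static enum clamp classify(int is_int, int is_srgb)
    {
       if (is_int)
          return CLAMP_INT;
       if (is_srgb)
          return CLAMP_NORM;
       return CLAMP_NONE;
    }

    int main(void)
    {
       printf("SINT  -> %d (CLAMP_INT)\n",  classify(1, 0));
       printf("SRGB  -> %d (CLAMP_NORM)\n", classify(0, 1));
       printf("FLOAT -> %d (CLAMP_NONE)\n", classify(0, 0));
       return 0;
    }
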
+
void
v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
{
@@ -738,7 +928,7 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
* buffer.
*/
if (!framebuffer) {
- assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
+ assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
return;
}
@@ -756,23 +946,44 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
struct v3dv_cl *rcl = &job->rcl;
- /* Comon config must be the first TILE_RENDERING_MODE_CFG and
+ /* Common config must be the first TILE_RENDERING_MODE_CFG and
* Z_STENCIL_CLEAR_VALUES must be last. The ones in between are optional
* updates to the previous HW state.
*/
bool do_early_zs_clear = false;
const uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
+ assert(!tiling->msaa || !tiling->double_buffer);
cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) {
config.image_width_pixels = framebuffer->width;
config.image_height_pixels = framebuffer->height;
config.number_of_render_targets = MAX2(subpass->color_count, 1);
config.multisample_mode_4x = tiling->msaa;
+ config.double_buffer_in_non_ms_mode = tiling->double_buffer;
+#if V3D_VERSION == 42
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
+#endif
+#if V3D_VERSION >= 71
+ config.log2_tile_width = log2_tile_size(tiling->tile_width);
+ config.log2_tile_height = log2_tile_size(tiling->tile_height);
+ /* FIXME: ideally we would like the following assert to be on the packet
+ * header (it is generic, so it also applies to GL). We would need to
+ * expand gen_pack_header for that.
+ */
+ assert(config.log2_tile_width == config.log2_tile_height ||
+ config.log2_tile_width == config.log2_tile_height + 1);
+#endif
if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
const struct v3dv_image_view *iview =
- framebuffer->attachments[ds_attachment_idx];
- config.internal_depth_type = iview->internal_type;
+ state->attachments[ds_attachment_idx].image_view;
+
+ /* At this point the image view should be single-plane. But note that
+ * the underlying image can be multi-plane, and the image view refers
+ * to one specific plane.
+ */
+ assert(iview->plane_count == 1);
+ assert(iview->format->plane_count == 1);
+ config.internal_depth_type = iview->planes[0].internal_type;
set_rcl_early_z_config(job,
&config.early_z_disable,
@@ -787,6 +998,10 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
* Early-Z/S clearing is independent of Early Z/S testing, so it is
* possible to enable one but not the other so long as their
* respective requirements are met.
+ *
+ * From V3D 4.5.6, Z/S buffers are always cleared automatically
+ * between tiles, but we still want to enable early ZS clears
+ * when Z/S are not loaded or stored.
*/
struct v3dv_render_pass_attachment *ds_attachment =
&pass->attachments[ds_attachment_idx];
@@ -794,6 +1009,13 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
const VkImageAspectFlags ds_aspects =
vk_format_aspects(ds_attachment->desc.format);
+ bool needs_depth_store =
+ v3dv_cmd_buffer_check_needs_store(state,
+ ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
+ ds_attachment->last_subpass,
+ ds_attachment->desc.storeOp) ||
+ subpass->resolve_depth;
+#if V3D_VERSION <= 42
bool needs_depth_clear =
check_needs_clear(state,
ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
@@ -801,26 +1023,35 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
ds_attachment->desc.loadOp,
subpass->do_depth_clear_with_draw);
- bool needs_depth_store =
- check_needs_store(state,
- ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
- ds_attachment->last_subpass,
- ds_attachment->desc.storeOp);
-
do_early_zs_clear = needs_depth_clear && !needs_depth_store;
+#endif
+#if V3D_VERSION >= 71
+ bool needs_depth_load =
+ v3dv_cmd_buffer_check_needs_load(state,
+ ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
+ ds_attachment->first_subpass,
+ ds_attachment->desc.loadOp,
+ ds_attachment->last_subpass,
+ ds_attachment->desc.storeOp);
+ do_early_zs_clear = !needs_depth_load && !needs_depth_store;
+#endif
+
if (do_early_zs_clear &&
vk_format_has_stencil(ds_attachment->desc.format)) {
bool needs_stencil_load =
- check_needs_load(state,
- ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
- ds_attachment->first_subpass,
- ds_attachment->desc.stencilLoadOp);
+ v3dv_cmd_buffer_check_needs_load(state,
+ ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
+ ds_attachment->first_subpass,
+ ds_attachment->desc.stencilLoadOp,
+ ds_attachment->last_subpass,
+ ds_attachment->desc.stencilStoreOp);
bool needs_stencil_store =
- check_needs_store(state,
- ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
- ds_attachment->last_subpass,
- ds_attachment->desc.stencilStoreOp);
+ v3dv_cmd_buffer_check_needs_store(state,
+ ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
+ ds_attachment->last_subpass,
+ ds_attachment->desc.stencilStoreOp) ||
+ subpass->resolve_stencil;
do_early_zs_clear = !needs_stencil_load && !needs_stencil_store;
}
@@ -837,25 +1068,38 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
*/
job->early_zs_clear = do_early_zs_clear;
+#if V3D_VERSION >= 71
+ uint32_t base_addr = 0;
+#endif
for (uint32_t i = 0; i < subpass->color_count; i++) {
uint32_t attachment_idx = subpass->color_attachments[i].attachment;
- if (attachment_idx == VK_ATTACHMENT_UNUSED)
+ if (attachment_idx == VK_ATTACHMENT_UNUSED) {
+#if V3D_VERSION >= 71
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ rt.render_target_number = i;
+ rt.stride = 1; /* Unused */
+ }
+#endif
continue;
+ }
struct v3dv_image_view *iview =
- state->framebuffer->attachments[attachment_idx];
+ state->attachments[attachment_idx].image_view;
+ assert(iview->plane_count == 1);
const struct v3dv_image *image = (struct v3dv_image *) iview->vk.image;
+
+ uint8_t plane = v3dv_plane_from_aspect(iview->vk.aspects);
const struct v3d_resource_slice *slice =
- &image->slices[iview->vk.base_mip_level];
+ &image->planes[plane].slices[iview->vk.base_mip_level];
- const uint32_t *clear_color =
+ UNUSED const uint32_t *clear_color =
&state->attachments[attachment_idx].clear_value.color[0];
- uint32_t clear_pad = 0;
+ UNUSED uint32_t clear_pad = 0;
if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
slice->tiling == V3D_TILING_UIF_XOR) {
- int uif_block_height = v3d_utile_height(image->cpp) * 2;
+ int uif_block_height = v3d_utile_height(image->planes[plane].cpp) * 2;
uint32_t implicit_padded_height =
align(framebuffer->height, uif_block_height) / uif_block_height;
@@ -866,13 +1110,14 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
}
}
+#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) {
clear.clear_color_low_32_bits = clear_color[0];
clear.clear_color_next_24_bits = clear_color[1] & 0xffffff;
clear.render_target_number = i;
};
- if (iview->internal_bpp >= V3D_INTERNAL_BPP_64) {
+ if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) {
cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) {
clear.clear_color_mid_low_32_bits =
((clear_color[1] >> 24) | (clear_color[2] << 8));
@@ -882,29 +1127,81 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
};
}
- if (iview->internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) {
+ if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) {
cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) {
clear.uif_padded_height_in_uif_blocks = clear_pad;
clear.clear_color_high_16_bits = clear_color[3] >> 16;
clear.render_target_number = i;
};
}
+#endif
+
+#if V3D_VERSION >= 71
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ rt.clear_color_low_bits = clear_color[0];
+ cmd_buffer_render_pass_setup_render_target(cmd_buffer, i, &rt.internal_bpp,
+ &rt.internal_type_and_clamping);
+ rt.stride =
+ v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width,
+ v3d_internal_bpp_words(rt.internal_bpp));
+ rt.base_address = base_addr;
+ rt.render_target_number = i;
+
+ /* base_addr in multiples of 512 bits. We divide by 8 because stride
+ * is in 128-bit units, but it is packing 2 rows worth of data, so we
+ * need to divide it by 2 so it is only 1 row, and then again by 4 so
+ * it is in 512-bit units.
+ */
+ base_addr += (tiling->tile_height * rt.stride) / 8;
+ }
+
+ if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) {
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) {
+ rt.clear_color_mid_bits = /* 40 bits (32 + 8) */
+ ((uint64_t) clear_color[1]) |
+ (((uint64_t) (clear_color[2] & 0xff)) << 32);
+ rt.render_target_number = i;
+ }
+ }
+
+ if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_128) {
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) {
+ rt.clear_color_top_bits = /* 56 bits (24 + 32) */
+ (((uint64_t) (clear_color[2] & 0xffffff00)) >> 8) |
+ (((uint64_t) (clear_color[3])) << 24);
+ rt.render_target_number = i;
+ }
+ }
+#endif
+ }
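
The base_addr arithmetic in the per-render-target loop above converts between the units the hardware expects: the stride is in 128-bit units but packs two rows of data, while base_address advances in 512-bit units, hence the division by 8. A worked example with made-up numbers (illustration only):

    #include <stdio.h>

    int main(void)
    {
       unsigned tile_height = 32;  /* rows per tile (made-up value)          */
       unsigned stride_128  = 16;  /* 128-bit units, covering 2 rows of data */

       /* Divide by 2 to get one row's worth, then by 4 to turn 128-bit units
        * into 512-bit units: (32 * 16) / 8 = 64. */
       unsigned advance_512 = (tile_height * stride_128) / 8;

       printf("next render target base advances by %u x 512-bit words\n",
              advance_512);
       return 0;
    }
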
+
+#if V3D_VERSION >= 71
+ /* If we don't have any color RTs, we still need to emit one and flag
+ * it as not used using stride = 1.
+ */
+ if (subpass->color_count == 0) {
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ rt.stride = 1;
+ }
}
+#endif
+#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
- v3dX(cmd_buffer_render_pass_setup_render_target)
+ cmd_buffer_render_pass_setup_render_target
(cmd_buffer, 0, &rt.render_target_0_internal_bpp,
&rt.render_target_0_internal_type, &rt.render_target_0_clamp);
- v3dX(cmd_buffer_render_pass_setup_render_target)
+ cmd_buffer_render_pass_setup_render_target
(cmd_buffer, 1, &rt.render_target_1_internal_bpp,
&rt.render_target_1_internal_type, &rt.render_target_1_clamp);
- v3dX(cmd_buffer_render_pass_setup_render_target)
+ cmd_buffer_render_pass_setup_render_target
(cmd_buffer, 2, &rt.render_target_2_internal_bpp,
&rt.render_target_2_internal_type, &rt.render_target_2_clamp);
- v3dX(cmd_buffer_render_pass_setup_render_target)
+ cmd_buffer_render_pass_setup_render_target
(cmd_buffer, 3, &rt.render_target_3_internal_bpp,
&rt.render_target_3_internal_type, &rt.render_target_3_clamp);
}
+#endif
/* Ends rendering mode config. */
if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
@@ -944,12 +1241,6 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
tiling->frame_height_in_supertiles;
}
- /* Start by clearing the tile buffer. */
- cl_emit(rcl, TILE_COORDINATES, coords) {
- coords.tile_column_number = 0;
- coords.tile_row_number = 0;
- }
-
/* Emit an initial clear of the tile buffers. This is necessary
* for any buffers that should be cleared (since clearing
* normally happens at the *end* of the generic tile list), but
@@ -964,17 +1255,22 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
* changes on V3D 3.x, and 2 dummy stores on 4.x.
*/
for (int i = 0; i < 2; i++) {
- if (i > 0)
- cl_emit(rcl, TILE_COORDINATES, coords);
+ cl_emit(rcl, TILE_COORDINATES, coords);
cl_emit(rcl, END_OF_LOADS, end);
cl_emit(rcl, STORE_TILE_BUFFER_GENERAL, store) {
store.buffer_to_store = NONE;
}
- if (i == 0 && cmd_buffer->state.tile_aligned_render_area) {
+ if (cmd_buffer->state.tile_aligned_render_area &&
+ (i == 0 || v3dv_do_double_initial_tile_clear(tiling))) {
+#if V3D_VERSION == 42
cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) {
clear.clear_z_stencil_buffer = !job->early_zs_clear;
clear.clear_all_render_targets = true;
}
+#endif
+#if V3D_VERSION >= 71
+ cl_emit(rcl, CLEAR_RENDER_TARGETS, clear_rt);
+#endif
}
cl_emit(rcl, END_OF_TILE_MARKER, end);
}
@@ -990,11 +1286,51 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
}
void
+v3dX(viewport_compute_xform)(const VkViewport *viewport,
+ float scale[3],
+ float translate[3])
+{
+ float x = viewport->x;
+ float y = viewport->y;
+ float half_width = 0.5f * viewport->width;
+ float half_height = 0.5f * viewport->height;
+ double n = viewport->minDepth;
+ double f = viewport->maxDepth;
+
+ scale[0] = half_width;
+ translate[0] = half_width + x;
+ scale[1] = half_height;
+ translate[1] = half_height + y;
+
+ scale[2] = (f - n);
+ translate[2] = n;
+
+ /* It seems that if the scale is small enough the hardware won't clip
+ * correctly, so we work around this by choosing the smallest scale that
+ * seems to work.
+ *
+ * This case is exercised by CTS:
+ * dEQP-VK.draw.renderpass.inverted_depth_ranges.nodepthclamp_deltazero
+ *
+ * V3D 7.x fixes this by using the new
+ * CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND.
+ */
+#if V3D_VERSION <= 42
+ const float min_abs_scale = 0.0005f;
+ if (fabs(scale[2]) < min_abs_scale)
+ scale[2] = scale[2] < 0 ? -min_abs_scale : min_abs_scale;
+#endif
+}
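
The new v3dX(viewport_compute_xform) helper mirrors the usual viewport transform: scale holds the half extents and the depth span, translate holds the viewport centre and minDepth, with a minimum |Z scale| enforced on V3D <= 4.2. A worked example for a 1920x1080 viewport with depth range [0, 1] (illustration only, not driver code):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
       float  x = 0.0f, y = 0.0f, w = 1920.0f, h = 1080.0f;
       double n = 0.0, f = 1.0;

       float scale[3]     = { 0.5f * w, 0.5f * h, (float)(f - n) };
       float translate[3] = { 0.5f * w + x, 0.5f * h + y, (float)n };

       /* V3D <= 4.2 workaround from the function above: keep |scale[2]| away
        * from zero so the hardware still clips correctly. */
       const float min_abs_scale = 0.0005f;
       if (fabsf(scale[2]) < min_abs_scale)
          scale[2] = scale[2] < 0.0f ? -min_abs_scale : min_abs_scale;

       printf("scale = (%.1f, %.1f, %.4f)\n", scale[0], scale[1], scale[2]);
       printf("translate = (%.1f, %.1f, %.4f)\n",
              translate[0], translate[1], translate[2]);
       return 0;
    }
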
+
+void
v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer)
{
struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
- /* FIXME: right now we only support one viewport. viewporst[0] would work
- * now, would need to change if we allow multiple viewports
+ struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
+ assert(pipeline);
+
+ /* FIXME: right now we don't support multiViewport so viewports[0] would
+ * work now, but would need to change if we allow multiple viewports.
*/
float *vptranslate = dynamic->viewport.translate[0];
float *vpscale = dynamic->viewport.scale[0];
@@ -1010,29 +1346,83 @@ v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer)
v3dv_cl_ensure_space_with_branch(&job->bcl, required_cl_size);
v3dv_return_if_oom(cmd_buffer, NULL);
+#if V3D_VERSION == 42
cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
clip.viewport_half_width_in_1_256th_of_pixel = vpscale[0] * 256.0f;
clip.viewport_half_height_in_1_256th_of_pixel = vpscale[1] * 256.0f;
}
+#endif
+#if V3D_VERSION >= 71
+ cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
+ clip.viewport_half_width_in_1_64th_of_pixel = vpscale[0] * 64.0f;
+ clip.viewport_half_height_in_1_64th_of_pixel = vpscale[1] * 64.0f;
+ }
+#endif
+ float translate_z, scale_z;
+ v3dv_cmd_buffer_state_get_viewport_z_xform(cmd_buffer, 0,
+ &translate_z, &scale_z);
+
+#if V3D_VERSION == 42
cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
- clip.viewport_z_offset_zc_to_zs = vptranslate[2];
- clip.viewport_z_scale_zc_to_zs = vpscale[2];
+ clip.viewport_z_offset_zc_to_zs = translate_z;
+ clip.viewport_z_scale_zc_to_zs = scale_z;
+ }
+#endif
+
+#if V3D_VERSION >= 71
+ /* If the Z scale is too small guardband clipping may not clip correctly */
+ if (fabsf(scale_z) < 0.01f) {
+ cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND, clip) {
+ clip.viewport_z_offset_zc_to_zs = translate_z;
+ clip.viewport_z_scale_zc_to_zs = scale_z;
+ }
+ } else {
+ cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
+ clip.viewport_z_offset_zc_to_zs = translate_z;
+ clip.viewport_z_scale_zc_to_zs = scale_z;
+ }
}
+#endif
+
cl_emit(&job->bcl, CLIPPER_Z_MIN_MAX_CLIPPING_PLANES, clip) {
- /* Vulkan's Z NDC is [0..1], unlile OpenGL which is [-1, 1] */
- float z1 = vptranslate[2];
- float z2 = vptranslate[2] + vpscale[2];
+ /* Vulkan's default Z NDC is [0..1]. If 'negative_one_to_one' is enabled,
+ * we are using OpenGL's [-1, 1] instead.
+ */
+ float z1 = pipeline->negative_one_to_one ? translate_z - scale_z :
+ translate_z;
+ float z2 = translate_z + scale_z;
clip.minimum_zw = MIN2(z1, z2);
clip.maximum_zw = MAX2(z1, z2);
}
cl_emit(&job->bcl, VIEWPORT_OFFSET, vp) {
- vp.viewport_centre_x_coordinate = vptranslate[0];
- vp.viewport_centre_y_coordinate = vptranslate[1];
+ float vp_fine_x = vptranslate[0];
+ float vp_fine_y = vptranslate[1];
+ int32_t vp_coarse_x = 0;
+ int32_t vp_coarse_y = 0;
+
+ /* The fine coordinates must be unsigned, but coarse can be signed */
+ if (unlikely(vp_fine_x < 0)) {
+ int32_t blocks_64 = DIV_ROUND_UP(fabsf(vp_fine_x), 64);
+ vp_fine_x += 64.0f * blocks_64;
+ vp_coarse_x -= blocks_64;
+ }
+
+ if (unlikely(vp_fine_y < 0)) {
+ int32_t blocks_64 = DIV_ROUND_UP(fabsf(vp_fine_y), 64);
+ vp_fine_y += 64.0f * blocks_64;
+ vp_coarse_y -= blocks_64;
+ }
+
+ vp.fine_x = vp_fine_x;
+ vp.fine_y = vp_fine_y;
+ vp.coarse_x = vp_coarse_x;
+ vp.coarse_y = vp_coarse_y;
}
- cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_VIEWPORT;
+ BITSET_CLEAR(cmd_buffer->vk.dynamic_graphics_state.dirty,
+ MESA_VK_DYNAMIC_VP_VIEWPORTS);
}
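
The VIEWPORT_OFFSET change above splits the viewport centre into a non-negative fine part and a signed coarse part in 64-pixel blocks. A worked example for a negative centre coordinate (illustration only; ceilf stands in for DIV_ROUND_UP here):

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
       float   vp_fine_x   = -100.5f;
       int32_t vp_coarse_x = 0;

       /* Move the fine offset up by whole 64-pixel blocks and let the coarse
        * offset absorb the difference, as in the emit code above. */
       if (vp_fine_x < 0.0f) {
          int32_t blocks_64 = (int32_t)ceilf(fabsf(vp_fine_x) / 64.0f);
          vp_fine_x   += 64.0f * blocks_64;
          vp_coarse_x -= blocks_64;
       }

       /* -100.5 -> fine 27.5, coarse -2 (since -2 * 64 + 27.5 == -100.5) */
       printf("fine = %.1f, coarse = %d\n", vp_fine_x, vp_coarse_x);
       return 0;
    }
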
void
@@ -1042,52 +1432,62 @@ v3dX(cmd_buffer_emit_stencil)(struct v3dv_cmd_buffer *cmd_buffer)
assert(job);
struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
- struct v3dv_dynamic_state *dynamic_state = &cmd_buffer->state.dynamic;
-
- const uint32_t dynamic_stencil_states = V3DV_DYNAMIC_STENCIL_COMPARE_MASK |
- V3DV_DYNAMIC_STENCIL_WRITE_MASK |
- V3DV_DYNAMIC_STENCIL_REFERENCE;
+ struct vk_dynamic_graphics_state *dyn =
+ &cmd_buffer->vk.dynamic_graphics_state;
v3dv_cl_ensure_space_with_branch(&job->bcl,
2 * cl_packet_length(STENCIL_CFG));
v3dv_return_if_oom(cmd_buffer, NULL);
+ bool any_dynamic_stencil_state =
+ BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) ||
+ BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) ||
+ BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) ||
+ BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_OP);
+
bool emitted_stencil = false;
- for (uint32_t i = 0; i < 2; i++) {
+ const struct vk_stencil_test_face_state *front = &dyn->ds.stencil.front;
+ const struct vk_stencil_test_face_state *back = &dyn->ds.stencil.back;
+
+ const bool needs_front_and_back = any_dynamic_stencil_state ?
+ memcmp(front, back, sizeof(*front)) != 0 :
+ pipeline->emit_stencil_cfg[1] == true;
+ const unsigned stencil_packets = needs_front_and_back ? 2 : 1;
+
+ for (uint32_t i = 0; i < stencil_packets; i++) {
if (pipeline->emit_stencil_cfg[i]) {
- if (dynamic_state->mask & dynamic_stencil_states) {
- cl_emit_with_prepacked(&job->bcl, STENCIL_CFG,
- pipeline->stencil_cfg[i], config) {
- if (dynamic_state->mask & V3DV_DYNAMIC_STENCIL_COMPARE_MASK) {
- config.stencil_test_mask =
- i == 0 ? dynamic_state->stencil_compare_mask.front :
- dynamic_state->stencil_compare_mask.back;
- }
- if (dynamic_state->mask & V3DV_DYNAMIC_STENCIL_WRITE_MASK) {
- config.stencil_write_mask =
- i == 0 ? dynamic_state->stencil_write_mask.front :
- dynamic_state->stencil_write_mask.back;
- }
- if (dynamic_state->mask & V3DV_DYNAMIC_STENCIL_REFERENCE) {
- config.stencil_ref_value =
- i == 0 ? dynamic_state->stencil_reference.front :
- dynamic_state->stencil_reference.back;
- }
+ if (any_dynamic_stencil_state) {
+ const struct vk_stencil_test_face_state *stencil_state =
+ i == 0 ? front : back;
+
+ /* If we have any dynamic stencil state we just emit the entire
+ * packet for simplicity.
+ */
+ cl_emit(&job->bcl, STENCIL_CFG, config) {
+ config.front_config = !needs_front_and_back || i == 0;
+ config.back_config = !needs_front_and_back || i == 1;
+ config.stencil_test_mask = stencil_state->compare_mask & 0xff;
+ config.stencil_write_mask = stencil_state->write_mask & 0xff;
+ config.stencil_ref_value = stencil_state->reference & 0xff;
+ config.stencil_test_function = stencil_state->op.compare;
+ config.stencil_pass_op =
+ v3dX(translate_stencil_op)(stencil_state->op.pass);
+ config.depth_test_fail_op =
+ v3dX(translate_stencil_op)(stencil_state->op.depth_fail);
+ config.stencil_test_fail_op =
+ v3dX(translate_stencil_op)(stencil_state->op.fail);
}
} else {
cl_emit_prepacked(&job->bcl, &pipeline->stencil_cfg[i]);
}
-
emitted_stencil = true;
}
}
-
if (emitted_stencil) {
- const uint32_t dynamic_stencil_dirty_flags =
- V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK |
- V3DV_CMD_DIRTY_STENCIL_WRITE_MASK |
- V3DV_CMD_DIRTY_STENCIL_REFERENCE;
- cmd_buffer->state.dirty &= ~dynamic_stencil_dirty_flags;
+ BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK);
+ BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE);
+ BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK);
+ BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP);
}
}
@@ -1103,19 +1503,51 @@ v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer)
struct v3dv_job *job = cmd_buffer->state.job;
assert(job);
+ struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state;
+
v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(DEPTH_OFFSET));
v3dv_return_if_oom(cmd_buffer, NULL);
- struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
cl_emit(&job->bcl, DEPTH_OFFSET, bias) {
- bias.depth_offset_factor = dynamic->depth_bias.slope_factor;
- bias.depth_offset_units = dynamic->depth_bias.constant_factor;
+ bias.depth_offset_factor = dyn->rs.depth_bias.slope;
+ bias.depth_offset_units = dyn->rs.depth_bias.constant;
+#if V3D_VERSION <= 42
if (pipeline->depth_bias.is_z16)
bias.depth_offset_units *= 256.0f;
- bias.limit = dynamic->depth_bias.depth_bias_clamp;
+#endif
+ bias.limit = dyn->rs.depth_bias.clamp;
}
- cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DEPTH_BIAS;
+ BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS);
+}
+
+void
+v3dX(cmd_buffer_emit_depth_bounds)(struct v3dv_cmd_buffer *cmd_buffer)
+{
+ /* No depthBounds support for v42, so this method is empty in that case.
+ *
+ * Note that this method is still called because v3dv_job_init flags all
+ * state as dirty. See the FIXME note in v3dv_job_init.
+ */
+#if V3D_VERSION >= 71
+ struct vk_dynamic_graphics_state *dyn =
+ &cmd_buffer->vk.dynamic_graphics_state;
+
+ if (!dyn->ds.depth.bounds_test.enable)
+ return;
+
+ struct v3dv_job *job = cmd_buffer->state.job;
+ assert(job);
+
+ v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(DEPTH_BOUNDS_TEST_LIMITS));
+ v3dv_return_if_oom(cmd_buffer, NULL);
+
+ cl_emit(&job->bcl, DEPTH_BOUNDS_TEST_LIMITS, bounds) {
+ bounds.lower_test_limit = dyn->ds.depth.bounds_test.min;
+ bounds.upper_test_limit = dyn->ds.depth.bounds_test.max;
+ }
+ BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS);
+#endif
}
void
@@ -1124,14 +1556,17 @@ v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer)
struct v3dv_job *job = cmd_buffer->state.job;
assert(job);
+ struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state;
+
v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(LINE_WIDTH));
v3dv_return_if_oom(cmd_buffer, NULL);
cl_emit(&job->bcl, LINE_WIDTH, line) {
- line.line_width = cmd_buffer->state.dynamic.line_width;
+ line.line_width = v3dv_get_aa_line_width(cmd_buffer->state.gfx.pipeline,
+ cmd_buffer);
}
- cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_LINE_WIDTH;
+ BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH);
}
void
@@ -1161,10 +1596,13 @@ v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer)
struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
assert(pipeline);
+ const struct v3d_device_info *devinfo = &cmd_buffer->device->devinfo;
+ const uint32_t max_color_rts = V3D_MAX_RENDER_TARGETS(devinfo->ver);
+
const uint32_t blend_packets_size =
cl_packet_length(BLEND_ENABLES) +
cl_packet_length(BLEND_CONSTANT_COLOR) +
- cl_packet_length(BLEND_CFG) * V3D_MAX_DRAW_BUFFERS;
+ cl_packet_length(BLEND_CFG) * max_color_rts;
v3dv_cl_ensure_space_with_branch(&job->bcl, blend_packets_size);
v3dv_return_if_oom(cmd_buffer, NULL);
@@ -1176,23 +1614,26 @@ v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer)
}
}
- for (uint32_t i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) {
+ for (uint32_t i = 0; i < max_color_rts; i++) {
if (pipeline->blend.enables & (1 << i))
cl_emit_prepacked(&job->bcl, &pipeline->blend.cfg[i]);
}
}
- if (pipeline->blend.needs_color_constants &&
- cmd_buffer->state.dirty & V3DV_CMD_DIRTY_BLEND_CONSTANTS) {
- struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
+ if (pipeline->blend.needs_color_constants) {
+ const struct vk_dynamic_graphics_state *dyn =
+ &cmd_buffer->vk.dynamic_graphics_state;
+
cl_emit(&job->bcl, BLEND_CONSTANT_COLOR, color) {
- color.red_f16 = _mesa_float_to_half(dynamic->blend_constants[0]);
- color.green_f16 = _mesa_float_to_half(dynamic->blend_constants[1]);
- color.blue_f16 = _mesa_float_to_half(dynamic->blend_constants[2]);
- color.alpha_f16 = _mesa_float_to_half(dynamic->blend_constants[3]);
+ color.red_f16 = _mesa_float_to_half(dyn->cb.blend_constants[0]);
+ color.green_f16 = _mesa_float_to_half(dyn->cb.blend_constants[1]);
+ color.blue_f16 = _mesa_float_to_half(dyn->cb.blend_constants[2]);
+ color.alpha_f16 = _mesa_float_to_half(dyn->cb.blend_constants[3]);
}
- cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_BLEND_CONSTANTS;
}
+
+ BITSET_CLEAR(cmd_buffer->vk.dynamic_graphics_state.dirty,
+ MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS);
}
void
@@ -1202,13 +1643,21 @@ v3dX(cmd_buffer_emit_color_write_mask)(struct v3dv_cmd_buffer *cmd_buffer)
v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(COLOR_WRITE_MASKS));
struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
- struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
+ struct v3dv_dynamic_state *v3dv_dyn = &cmd_buffer->state.dynamic;
+ uint32_t color_write_mask = ~v3dv_dyn->color_write_enable |
+ pipeline->blend.color_write_masks;
+
+#if V3D_VERSION <= 42
+ /* Only 4 RTs */
+ color_write_mask &= 0xffff;
+#endif
+
cl_emit(&job->bcl, COLOR_WRITE_MASKS, mask) {
- mask.mask = (~dynamic->color_write_enable |
- pipeline->blend.color_write_masks) & 0xffff;
+ mask.mask = color_write_mask;
}
- cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE;
+ BITSET_CLEAR(cmd_buffer->vk.dynamic_graphics_state.dirty,
+ MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
}
static void
@@ -1346,11 +1795,33 @@ v3dX(cmd_buffer_emit_varyings_state)(struct v3dv_cmd_buffer *cmd_buffer)
}
}
-static void
-job_update_ez_state(struct v3dv_job *job,
- struct v3dv_pipeline *pipeline,
- struct v3dv_cmd_buffer *cmd_buffer)
+#if V3D_VERSION == 42
+/* Updates early Z state tracking for the cmd_buffer and its job. Returns false if
+ * EZ must be disabled for the current draw call.
+ */
+static bool
+cmd_buffer_update_ez_state(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_pipeline *pipeline)
{
+ struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state;
+ /* First update the cmd_buffer's ez_state tracking. If possible we reuse
+ * the values from the pipeline.
+ */
+ if (!BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_OP) &&
+ !BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE) &&
+ !BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE) &&
+ !BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP)) {
+ cmd_buffer->state.ez_state = pipeline->ez_state;
+ cmd_buffer->state.incompatible_ez_test =
+ pipeline->incompatible_ez_test;
+ } else {
+ v3dv_compute_ez_state(dyn, pipeline,
+ &cmd_buffer->state.ez_state,
+ &cmd_buffer->state.incompatible_ez_test);
+ }
+
+ struct v3dv_job *job = cmd_buffer->state.job;
+ assert(job);
/* If first_ez_state is V3D_EZ_DISABLED it means that we have already
* determined that we should disable EZ completely for all draw calls in
* this job. This will cause us to disable EZ for the entire job in the
@@ -1360,9 +1831,15 @@ job_update_ez_state(struct v3dv_job *job,
*/
if (job->first_ez_state == V3D_EZ_DISABLED) {
assert(job->ez_state == V3D_EZ_DISABLED);
- return;
+ return false;
}
+ /* If ez_state is V3D_EZ_DISABLED it means that we have already decided
+ * that EZ must be disabled for the remainder of the frame.
+ */
+ if (job->ez_state == V3D_EZ_DISABLED)
+ return false;
+
/* This is part of the pre draw call handling, so we should be inside a
* render pass.
*/
@@ -1371,7 +1848,7 @@ job_update_ez_state(struct v3dv_job *job,
/* If this is the first time we update EZ state for this job we first check
* if there is anything that requires disabling it completely for the entire
* job (based on state that is not related to the current draw call and
- * pipeline state).
+ * pipeline/cmd_buffer state).
*/
if (!job->decided_global_ez_enable) {
job->decided_global_ez_enable = true;
@@ -1382,13 +1859,14 @@ job_update_ez_state(struct v3dv_job *job,
if (subpass->ds_attachment.attachment == VK_ATTACHMENT_UNUSED) {
job->first_ez_state = V3D_EZ_DISABLED;
job->ez_state = V3D_EZ_DISABLED;
- return;
+ return false;
}
- /* GFXH-1918: the early-z buffer may load incorrect depth values
- * if the frame has odd width or height.
+ /* GFXH-1918: the early-z buffer may load incorrect depth values if the
+ * frame has odd width or height, or if the buffer is 16-bit and
+ * multisampled.
*
- * So we need to disable EZ in this case.
+ * So we need to disable EZ in these cases.
*/
const struct v3dv_render_pass_attachment *ds_attachment =
&state->pass->attachments[subpass->ds_attachment.attachment];
@@ -1397,21 +1875,32 @@ job_update_ez_state(struct v3dv_job *job,
vk_format_aspects(ds_attachment->desc.format);
bool needs_depth_load =
- check_needs_load(state,
- ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
- ds_attachment->first_subpass,
- ds_attachment->desc.loadOp);
+ v3dv_cmd_buffer_check_needs_load(state,
+ ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
+ ds_attachment->first_subpass,
+ ds_attachment->desc.loadOp,
+ ds_attachment->last_subpass,
+ ds_attachment->desc.storeOp);
if (needs_depth_load) {
+ if (ds_attachment->desc.format == VK_FORMAT_D16_UNORM &&
+ ds_attachment->desc.samples != VK_SAMPLE_COUNT_1_BIT) {
+ perf_debug("Loading depth aspect from a multisampled 16-bit "
+ "depth buffer disables early-Z tests.\n");
+ job->first_ez_state = V3D_EZ_DISABLED;
+ job->ez_state = V3D_EZ_DISABLED;
+ return false;
+ }
+
struct v3dv_framebuffer *fb = state->framebuffer;
if (!fb) {
- assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
+ assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
perf_debug("Loading depth aspect in a secondary command buffer "
"without framebuffer info disables early-z tests.\n");
job->first_ez_state = V3D_EZ_DISABLED;
job->ez_state = V3D_EZ_DISABLED;
- return;
+ return false;
}
if (((fb->width % 2) != 0 || (fb->height % 2) != 0)) {
@@ -1419,24 +1908,18 @@ job_update_ez_state(struct v3dv_job *job,
"or height disables early-Z tests.\n");
job->first_ez_state = V3D_EZ_DISABLED;
job->ez_state = V3D_EZ_DISABLED;
- return;
+ return false;
}
}
}
/* Otherwise, we can decide to selectively enable or disable EZ for draw
- * calls using the CFG_BITS packet based on the bound pipeline state.
+ * calls using the CFG_BITS packet based on the bound pipeline state, or
+ * cmd_buffer state if some stencil/depth flags were dynamic.
*/
-
- /* If the FS writes Z, then it may update against the chosen EZ direction */
- struct v3dv_shader_variant *fs_variant =
- pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT];
- if (fs_variant->prog_data.fs->writes_z) {
- job->ez_state = V3D_EZ_DISABLED;
- return;
- }
-
- switch (pipeline->ez_state) {
+ bool disable_ez = false;
+ bool incompatible_test = false;
+ switch (cmd_buffer->state.ez_state) {
case V3D_EZ_UNDECIDED:
/* If the pipeline didn't pick a direction but didn't disable, then go
* along with the current EZ state. This allows EZ optimization for Z
@@ -1449,25 +1932,40 @@ job_update_ez_state(struct v3dv_job *job,
/* If the pipeline picked a direction, then it needs to match the current
* direction if we've decided on one.
*/
- if (job->ez_state == V3D_EZ_UNDECIDED)
- job->ez_state = pipeline->ez_state;
- else if (job->ez_state != pipeline->ez_state)
- job->ez_state = V3D_EZ_DISABLED;
+ if (job->ez_state == V3D_EZ_UNDECIDED) {
+ job->ez_state = cmd_buffer->state.ez_state;
+ } else if (job->ez_state != pipeline->ez_state) {
+ disable_ez = true;
+ incompatible_test = true;
+ }
break;
case V3D_EZ_DISABLED:
- /* If the pipeline disables EZ because of a bad Z func or stencil
- * operation, then we can't do any more EZ in this frame.
- */
- job->ez_state = V3D_EZ_DISABLED;
+ disable_ez = true;
+ incompatible_test = cmd_buffer->state.incompatible_ez_test;
break;
}
- if (job->first_ez_state == V3D_EZ_UNDECIDED &&
- job->ez_state != V3D_EZ_DISABLED) {
+ if (job->first_ez_state == V3D_EZ_UNDECIDED && !disable_ez) {
+ assert(job->ez_state != V3D_EZ_DISABLED);
job->first_ez_state = job->ez_state;
}
+
+ /* If we had to disable EZ because of an incompatible test direction and
+ * the cmd buffer writes depth, then we need to disable EZ for the rest
+ * of the frame.
+ */
+ if (incompatible_test && cmd_buffer->state.z_updates_enable) {
+ assert(disable_ez);
+ job->ez_state = V3D_EZ_DISABLED;
+ }
+
+ if (!disable_ez)
+ job->has_ez_draws = true;
+
+ return !disable_ez;
}
+#endif
void
v3dX(cmd_buffer_emit_configuration_bits)(struct v3dv_cmd_buffer *cmd_buffer)
@@ -1478,16 +1976,60 @@ v3dX(cmd_buffer_emit_configuration_bits)(struct v3dv_cmd_buffer *cmd_buffer)
struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
assert(pipeline);
- job_update_ez_state(job, pipeline, cmd_buffer);
-
v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CFG_BITS));
v3dv_return_if_oom(cmd_buffer, NULL);
+ struct vk_dynamic_graphics_state *dyn =
+ &cmd_buffer->vk.dynamic_graphics_state;
+
+ /* Disable depth/stencil if we don't have a D/S attachment */
+ bool has_depth =
+ pipeline->rendering_info.depth_attachment_format != VK_FORMAT_UNDEFINED;
+ bool has_stencil =
+ pipeline->rendering_info.stencil_attachment_format != VK_FORMAT_UNDEFINED;
+
cl_emit_with_prepacked(&job->bcl, CFG_BITS, pipeline->cfg_bits, config) {
- config.early_z_enable = job->ez_state != V3D_EZ_DISABLED;
+ if (dyn->ds.depth.test_enable && has_depth) {
+ config.z_updates_enable = dyn->ds.depth.write_enable;
+ config.depth_test_function = dyn->ds.depth.compare_op;
+ } else {
+ config.depth_test_function = VK_COMPARE_OP_ALWAYS;
+ }
+
+ config.stencil_enable = dyn->ds.stencil.test_enable && has_stencil;
+
+ cmd_buffer->state.z_updates_enable = config.z_updates_enable;
+#if V3D_VERSION == 42
+ bool enable_ez = cmd_buffer_update_ez_state(cmd_buffer, pipeline);
+ config.early_z_enable = enable_ez;
config.early_z_updates_enable = config.early_z_enable &&
- pipeline->z_updates_enable;
- }
+ cmd_buffer->state.z_updates_enable;
+#endif
+
+ if (pipeline->rasterization_enabled) {
+ assert(BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_RS_CULL_MODE));
+ assert(BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_RS_FRONT_FACE));
+ config.enable_forward_facing_primitive = !(dyn->rs.cull_mode & VK_CULL_MODE_FRONT_BIT);
+ config.enable_reverse_facing_primitive = !(dyn->rs.cull_mode & VK_CULL_MODE_BACK_BIT);
+ /* Seems like the hardware is backwards regarding this setting... */
+ config.clockwise_primitives = dyn->rs.front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE;
+ }
+
+ /* V3D 4.2 doesn't support depth bounds testing so we don't advertise that
+ * feature and it shouldn't be used by any pipeline.
+ */
+ assert(cmd_buffer->device->devinfo.ver >= 71 ||
+ !dyn->ds.depth.bounds_test.enable);
+#if V3D_VERSION >= 71
+ config.depth_bounds_test_enable =
+ dyn->ds.depth.bounds_test.enable && has_depth;
+#endif
+ }
+
+ BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE);
+ BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE);
+ BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE);
+ BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE);
}
void
@@ -1523,7 +2065,8 @@ cmd_buffer_subpass_split_for_barrier(struct v3dv_cmd_buffer *cmd_buffer,
if (!job)
return NULL;
- job->serialize = true;
+ /* FIXME: we can do better than all barriers */
+ job->serialize = V3DV_BARRIER_ALL;
job->needs_bcl_sync = is_bcl_barrier;
return job;
}
@@ -1538,21 +2081,20 @@ cmd_buffer_copy_secondary_end_query_state(struct v3dv_cmd_buffer *primary,
const uint32_t total_state_count =
p_state->query.end.used_count + s_state->query.end.used_count;
v3dv_cmd_buffer_ensure_array_state(primary,
- sizeof(struct v3dv_end_query_cpu_job_info),
+ sizeof(struct v3dv_end_query_info),
total_state_count,
&p_state->query.end.alloc_count,
(void **) &p_state->query.end.states);
v3dv_return_if_oom(primary, NULL);
for (uint32_t i = 0; i < s_state->query.end.used_count; i++) {
- const struct v3dv_end_query_cpu_job_info *s_qstate =
+ const struct v3dv_end_query_info *s_qstate =
&secondary->state.query.end.states[i];
- struct v3dv_end_query_cpu_job_info *p_qstate =
+ struct v3dv_end_query_info *p_qstate =
&p_state->query.end.states[p_state->query.end.used_count++];
- p_qstate->pool = s_qstate->pool;
- p_qstate->query = s_qstate->query;
+ memcpy(p_qstate, s_qstate, sizeof(struct v3dv_end_query_info));
}
}
@@ -1563,6 +2105,20 @@ v3dX(cmd_buffer_execute_inside_pass)(struct v3dv_cmd_buffer *primary,
{
assert(primary->state.job);
+ /* Typically we postpone applying binning syncs until we see a draw call
+ * that may actually access protected resources in the binning stage. However,
+ * if the draw calls are recorded in a secondary command buffer and the
+ * barriers were recorded in a primary command buffer, that won't work
+ * and we will have to check if we need a binning sync when executing the
+ * secondary.
+ */
+ struct v3dv_job *primary_job = primary->state.job;
+ if (primary_job->serialize &&
+ (primary->state.barrier.bcl_buffer_access ||
+ primary->state.barrier.bcl_image_access)) {
+ v3dv_cmd_buffer_consume_bcl_sync(primary, primary_job);
+ }
+
/* Emit occlusion query state if needed so the draw calls inside our
* secondaries update the counters.
*/
@@ -1575,8 +2131,7 @@ v3dX(cmd_buffer_execute_inside_pass)(struct v3dv_cmd_buffer *primary,
* pipelines used by the secondaries do, we need to re-start the primary
* job to enable MSAA. See cmd_buffer_restart_job_for_msaa_if_needed.
*/
- bool pending_barrier = false;
- bool pending_bcl_barrier = false;
+ struct v3dv_barrier_state pending_barrier = { 0 };
for (uint32_t i = 0; i < cmd_buffer_count; i++) {
V3DV_FROM_HANDLE(v3dv_cmd_buffer, secondary, cmd_buffers[i]);
@@ -1585,7 +2140,7 @@ v3dX(cmd_buffer_execute_inside_pass)(struct v3dv_cmd_buffer *primary,
list_for_each_entry(struct v3dv_job, secondary_job,
&secondary->jobs, list_link) {
- if (secondary_job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY) {
+ if (secondary_job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE) {
/* If the job is a CL, then we branch to it from the primary BCL.
* In this case the secondary's BCL is finished with a
* RETURN_FROM_SUB_LIST command to return back to the primary BCL
@@ -1609,10 +2164,14 @@ v3dX(cmd_buffer_execute_inside_pass)(struct v3dv_cmd_buffer *primary,
* the RETURN_FROM_SUB_LIST into the primary job to skip the
* branch?
*/
- struct v3dv_job *primary_job = primary->state.job;
- if (!primary_job || secondary_job->serialize || pending_barrier) {
+ primary_job = primary->state.job;
+ if (!primary_job || secondary_job->serialize ||
+ pending_barrier.dst_mask) {
const bool needs_bcl_barrier =
- secondary_job->needs_bcl_sync || pending_bcl_barrier;
+ secondary_job->needs_bcl_sync ||
+ pending_barrier.bcl_buffer_access ||
+ pending_barrier.bcl_image_access;
+
primary_job =
cmd_buffer_subpass_split_for_barrier(primary,
needs_bcl_barrier);
@@ -1644,6 +2203,14 @@ v3dX(cmd_buffer_execute_inside_pass)(struct v3dv_cmd_buffer *primary,
}
}
+ if (!secondary_job->can_use_double_buffer) {
+ primary_job->can_use_double_buffer = false;
+ } else {
+ primary_job->double_buffer_score.geom +=
+ secondary_job->double_buffer_score.geom;
+ primary_job->double_buffer_score.render +=
+ secondary_job->double_buffer_score.render;
+ }
primary_job->tmu_dirty_rcl |= secondary_job->tmu_dirty_rcl;
} else {
/* This is a regular job (CPU or GPU), so just finish the current
@@ -1652,15 +2219,21 @@ v3dX(cmd_buffer_execute_inside_pass)(struct v3dv_cmd_buffer *primary,
*/
v3dv_cmd_buffer_finish_job(primary);
v3dv_job_clone_in_cmd_buffer(secondary_job, primary);
- if (pending_barrier) {
- secondary_job->serialize = true;
- if (pending_bcl_barrier)
+ if (pending_barrier.dst_mask) {
+ /* FIXME: do the same as we do for primaries and only choose the
+ * relevant src masks.
+ */
+ secondary_job->serialize = pending_barrier.src_mask_graphics |
+ pending_barrier.src_mask_transfer |
+ pending_barrier.src_mask_compute;
+ if (pending_barrier.bcl_buffer_access ||
+ pending_barrier.bcl_image_access) {
secondary_job->needs_bcl_sync = true;
+ }
}
}
- pending_barrier = false;
- pending_bcl_barrier = false;
+ memset(&pending_barrier, 0, sizeof(pending_barrier));
}
/* If the secondary has recorded any vkCmdEndQuery commands, we need to
@@ -1672,14 +2245,16 @@ v3dX(cmd_buffer_execute_inside_pass)(struct v3dv_cmd_buffer *primary,
/* If this secondary had any pending barrier state we will need that
* barrier state consumed with whatever comes next in the primary.
*/
- assert(secondary->state.has_barrier || !secondary->state.has_bcl_barrier);
- pending_barrier = secondary->state.has_barrier;
- pending_bcl_barrier = secondary->state.has_bcl_barrier;
+ assert(secondary->state.barrier.dst_mask ||
+ (!secondary->state.barrier.bcl_buffer_access &&
+ !secondary->state.barrier.bcl_image_access));
+
+ pending_barrier = secondary->state.barrier;
}
- if (pending_barrier) {
- primary->state.has_barrier = true;
- primary->state.has_bcl_barrier |= pending_bcl_barrier;
+ if (pending_barrier.dst_mask) {
+ v3dv_cmd_buffer_merge_barrier_state(&primary->state.barrier,
+ &pending_barrier);
}
}
@@ -1698,7 +2273,9 @@ emit_gs_shader_state_record(struct v3dv_job *job,
gs_bin->prog_data.gs->base.threads == 4;
shader.geometry_bin_mode_shader_start_in_final_thread_section =
gs_bin->prog_data.gs->base.single_seg;
+#if V3D_VERSION <= 42
shader.geometry_bin_mode_shader_propagate_nans = true;
+#endif
shader.geometry_bin_mode_shader_uniforms_address =
gs_bin_uniforms;
@@ -1708,21 +2285,23 @@ emit_gs_shader_state_record(struct v3dv_job *job,
gs->prog_data.gs->base.threads == 4;
shader.geometry_render_mode_shader_start_in_final_thread_section =
gs->prog_data.gs->base.single_seg;
+#if V3D_VERSION <= 42
shader.geometry_render_mode_shader_propagate_nans = true;
+#endif
shader.geometry_render_mode_shader_uniforms_address =
gs_render_uniforms;
}
}
static uint8_t
-v3d_gs_output_primitive(uint32_t prim_type)
+v3d_gs_output_primitive(enum mesa_prim prim_type)
{
switch (prim_type) {
- case GL_POINTS:
+ case MESA_PRIM_POINTS:
return GEOMETRY_SHADER_POINTS;
- case GL_LINE_STRIP:
+ case MESA_PRIM_LINE_STRIP:
return GEOMETRY_SHADER_LINE_STRIP;
- case GL_TRIANGLE_STRIP:
+ case MESA_PRIM_TRIANGLE_STRIP:
return GEOMETRY_SHADER_TRI_STRIP;
default:
unreachable("Unsupported primitive type");
@@ -1884,10 +2463,12 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer)
pipeline->vpm_cfg.Gv);
}
+#if V3D_VERSION == 42
struct v3dv_bo *default_attribute_values =
pipeline->default_attribute_values != NULL ?
pipeline->default_attribute_values :
pipeline->device->default_attribute_float;
+#endif
cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_RECORD,
pipeline->shader_state_record, shader) {
@@ -1913,8 +2494,10 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer)
shader.vertex_shader_uniforms_address = cmd_buffer->state.uniforms.vs;
shader.fragment_shader_uniforms_address = cmd_buffer->state.uniforms.fs;
+#if V3D_VERSION == 42
shader.address_of_default_attribute_values =
v3dv_cl_address(default_attribute_values, 0);
+#endif
shader.any_shader_reads_hardware_written_primitive_id =
(pipeline->has_gs && prog_data_gs->uses_pid) || prog_data_fs->uses_pid;
@@ -1979,6 +2562,8 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer)
cs_loaded_any = true;
}
+ attr.stride =
+ cmd_buffer->vk.dynamic_graphics_state.vi_binding_strides[binding];
attr.maximum_index = 0xffffff;
}
@@ -2027,6 +2612,11 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer)
}
}
+ /* Clearing push constants and descriptor sets for all stages is not quite
+ * correct (some shader stages may not be used at all or they may not be
+ * consuming push constants); however, this is not relevant because if we
+ * bind a different pipeline we always have to rebuild the uniform streams.
+ */
cmd_buffer->state.dirty &= ~(V3DV_CMD_DIRTY_VERTEX_BUFFER |
V3DV_CMD_DIRTY_DESCRIPTOR_SETS |
V3DV_CMD_DIRTY_PUSH_CONSTANTS);
@@ -2034,44 +2624,15 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer)
cmd_buffer->state.dirty_push_constants_stages &= ~VK_SHADER_STAGE_ALL_GRAPHICS;
}
-/* FIXME: C&P from v3dx_draw. Refactor to common place? */
-static uint32_t
-v3d_hw_prim_type(enum pipe_prim_type prim_type)
-{
- switch (prim_type) {
- case PIPE_PRIM_POINTS:
- case PIPE_PRIM_LINES:
- case PIPE_PRIM_LINE_LOOP:
- case PIPE_PRIM_LINE_STRIP:
- case PIPE_PRIM_TRIANGLES:
- case PIPE_PRIM_TRIANGLE_STRIP:
- case PIPE_PRIM_TRIANGLE_FAN:
- return prim_type;
-
- case PIPE_PRIM_LINES_ADJACENCY:
- case PIPE_PRIM_LINE_STRIP_ADJACENCY:
- case PIPE_PRIM_TRIANGLES_ADJACENCY:
- case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
- return 8 + (prim_type - PIPE_PRIM_LINES_ADJACENCY);
-
- default:
- unreachable("Unsupported primitive type");
- }
-}
-
void
v3dX(cmd_buffer_emit_draw)(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_draw_info *info)
{
struct v3dv_job *job = cmd_buffer->state.job;
assert(job);
-
- struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
- struct v3dv_pipeline *pipeline = state->gfx.pipeline;
-
- assert(pipeline);
-
- uint32_t hw_prim_type = v3d_hw_prim_type(pipeline->topology);
+ const struct vk_dynamic_graphics_state *dyn =
+ &cmd_buffer->vk.dynamic_graphics_state;
+ uint32_t hw_prim_type = v3dv_pipeline_primitive(dyn->ia.primitive_topology);
if (info->first_instance > 0) {
v3dv_cl_ensure_space_with_branch(
@@ -2226,7 +2787,9 @@ v3dX(cmd_buffer_emit_indexed_indirect)(struct v3dv_cmd_buffer *cmd_buffer,
assert(job);
const struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
- uint32_t hw_prim_type = v3d_hw_prim_type(pipeline->topology);
+ const struct vk_dynamic_graphics_state *dyn =
+ &cmd_buffer->vk.dynamic_graphics_state;
+ uint32_t hw_prim_type = v3dv_pipeline_primitive(dyn->ia.primitive_topology);
uint8_t index_type = ffs(cmd_buffer->state.index_buffer.index_size) - 1;
v3dv_cl_ensure_space_with_branch(
@@ -2245,37 +2808,159 @@ v3dX(cmd_buffer_emit_indexed_indirect)(struct v3dv_cmd_buffer *cmd_buffer,
}
void
-v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer,
- int rt,
- uint32_t *rt_bpp,
- uint32_t *rt_type,
- uint32_t *rt_clamp)
+v3dX(cmd_buffer_suspend)(struct v3dv_cmd_buffer *cmd_buffer)
{
- const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
+ struct v3dv_job *job = cmd_buffer->state.job;
+ assert(job);
- assert(state->subpass_idx < state->pass->subpass_count);
- const struct v3dv_subpass *subpass =
- &state->pass->subpasses[state->subpass_idx];
+ job->suspending = true;
- if (rt >= subpass->color_count)
- return;
+ v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(BRANCH));
- struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt];
- const uint32_t attachment_idx = attachment->attachment;
- if (attachment_idx == VK_ATTACHMENT_UNUSED)
- return;
+ job->suspend_branch_inst_ptr = cl_start(&job->bcl);
+ cl_emit(&job->bcl, BRANCH, branch) {
+ branch.address = v3dv_cl_address(NULL, 0);
+ }
- const struct v3dv_framebuffer *framebuffer = state->framebuffer;
- assert(attachment_idx < framebuffer->attachment_count);
- struct v3dv_image_view *iview = framebuffer->attachments[attachment_idx];
- assert(iview->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT);
-
- *rt_bpp = iview->internal_bpp;
- *rt_type = iview->internal_type;
- if (vk_format_is_int(iview->vk.format))
- *rt_clamp = V3D_RENDER_TARGET_CLAMP_INT;
- else if (vk_format_is_srgb(iview->vk.format))
- *rt_clamp = V3D_RENDER_TARGET_CLAMP_NORM;
- else
- *rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
+ /* The sim complains if the command list ends with a branch */
+ cl_emit(&job->bcl, NOP, nop);
+}
+
+void
+v3dX(job_patch_resume_address)(struct v3dv_job *first_suspend,
+ struct v3dv_job *suspend,
+ struct v3dv_job *resume)
+{
+ assert(resume && resume->resuming);
+ assert(first_suspend && first_suspend->suspending);
+ assert(suspend && suspend->suspending);
+ assert(suspend->suspend_branch_inst_ptr != NULL);
+
+ struct v3dv_bo *resume_bo =
+ list_first_entry(&resume->bcl.bo_list, struct v3dv_bo, list_link);
+ struct cl_packet_struct(BRANCH) branch = {
+ cl_packet_header(BRANCH),
+ };
+ branch.address = v3dv_cl_address(NULL, resume_bo->offset);
+
+ uint8_t *rewrite_addr = (uint8_t *) suspend->suspend_branch_inst_ptr;
+ cl_packet_pack(BRANCH)(NULL, rewrite_addr, &branch);
+
+ if (resume != first_suspend) {
+ set_foreach(resume->bos, entry) {
+ struct v3dv_bo *bo = (void *)entry->key;
+ v3dv_job_add_bo(first_suspend, bo);
+ }
+ }
+
+ first_suspend->suspended_bcl_end = resume->bcl.bo->offset +
+ v3dv_cl_offset(&resume->bcl);
+}
+
+static void
+job_destroy_cb(VkDevice device, uint64_t pobj, VkAllocationCallbacks *allocb)
+{
+ struct v3dv_job *clone = (struct v3dv_job *) (uintptr_t) pobj;
+ v3dv_job_destroy(clone);
+}
+
+/**
+ * This checks if the command buffer has been created with
+ * VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT, in which case we won't be
+ * able to safely patch the resume address into the job (since we could have
+ * another instance of this job running on the GPU, potentially resuming at a
+ * different address). In that case, we clone the job and make the clone have
+ * its own BCL copied from the original job so we can later patch the resume
+ * address into it safely.
+ */
+struct v3dv_job *
+v3dX(cmd_buffer_prepare_suspend_job_for_submit)(struct v3dv_job *job)
+{
+ assert(job->suspending);
+ assert(job->cmd_buffer);
+ assert(job->type == V3DV_JOB_TYPE_GPU_CL);
+
+ if (!(job->cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT))
+ return job;
+
+ /* Create the clone job, but skip the BCL since we are going to create
+ * our own below.
+ */
+ struct v3dv_job *clone = v3dv_job_clone(job, true);
+ if (!clone)
+ return NULL;
+
+ /* Compute total size of BCL we need to copy */
+ uint32_t bcl_size = 0;
+ list_for_each_entry(struct v3dv_bo, bo, &job->bcl.bo_list, list_link)
+ bcl_size += bo->size;
+
+ /* Prepare the BCL for the cloned job. For this we go over the BOs in the
+ * BCL of the original job and we copy their contents into the single BO
+ * in the BCL of the cloned job.
+ */
+ clone->clone_owns_bcl = true;
+ v3dv_cl_init(clone, &clone->bcl);
+ v3dv_cl_ensure_space(&clone->bcl, bcl_size, 4);
+ if (!clone->bcl.bo)
+ return NULL;
+
+ assert(clone->bcl.base);
+ assert(clone->bcl.base == clone->bcl.next);
+
+ /* Unlink this job from the command buffer's execution list */
+ list_inithead(&clone->list_link);
+
+ /* Copy the contents of each BO in the original job's BCL into the single
+ * BO we have in the clone's BCL.
+ *
+ * If the BO is the last in the BCL (which we can tell because it wouldn't
+ * have emitted a BRANCH instruction to link to another BO) we need to copy
+ * up to the current BCL offset, otherwise we need to copy up to the BRANCH
+ * instruction (excluded, since we are putting everything together into a
+ * single BO here).
+ */
+ list_for_each_entry(struct v3dv_bo, bo, &job->bcl.bo_list, list_link) {
+ assert(bo->map);
+ uint32_t copy_size;
+ if (bo->cl_branch_offset == 0xffffffff) { /* Last BO in BCL */
+ assert(bo == list_last_entry(&job->bcl.bo_list, struct v3dv_bo, list_link));
+ copy_size = v3dv_cl_offset(&job->bcl);
+ } else {
+ assert(bo->cl_branch_offset >= cl_packet_length(BRANCH));
+ copy_size = bo->cl_branch_offset - cl_packet_length(BRANCH);
+ }
+
+ assert(v3dv_cl_offset(&job->bcl) + copy_size < bcl_size);
+ memcpy(cl_start(&clone->bcl), bo->map, copy_size);
+ cl_advance_and_end(&clone->bcl, copy_size);
+ }
+
+ /* Now we need to fix up the pointer to the suspend BRANCH instruction at the
+ * end of the BCL so it points to the address in the new BCL. We know that
+ * to suspend a command buffer we always emit a BRANCH+NOP combo, so we just
+ * need to go back that many bytes into the BCL to find the instruction.
+ */
+ uint32_t suspend_terminator_size =
+ cl_packet_length(BRANCH) + cl_packet_length(NOP);
+ clone->suspend_branch_inst_ptr = (struct v3dv_cl_out *)
+ (((uint8_t *)cl_start(&clone->bcl)) - suspend_terminator_size);
+ assert(*(((uint8_t *)clone->suspend_branch_inst_ptr)) == V3DX(BRANCH_opcode));
+
+ /* This job is not in the execution list of the command buffer so it
+ * won't be destroyed with it; add it as a private object to get it freed.
+ *
+ * FIXME: every time this job is submitted we clone the job and we only
+ * destroy it when the command buffer is destroyed. If the user keeps the
+ * command buffer for the entire lifetime of the application, this command
+ * buffer could grow significantly, so maybe we want to do something smarter
+ * like having a syncobj bound to these jobs and every time we submit the
+ * command buffer again we first check these syncobjs to see if we can free
+ * some of these clones so we avoid blowing up memory.
+ */
+ v3dv_cmd_buffer_add_private_obj(
+ job->cmd_buffer, (uintptr_t)clone,
+ (v3dv_cmd_buffer_private_obj_destroy_cb)job_destroy_cb);
+
+ return clone;
}
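A small worked example of the copy-size logic above, with hypothetical offsets, for a BCL split across two BOs:

/* BO 0: cl_branch_offset = 4096 (points just past the BRANCH linking to BO 1)
 *   -> copy_size = 4096 - cl_packet_length(BRANCH), i.e. everything before the BRANCH
 * BO 1: cl_branch_offset = 0xffffffff (last BO in the BCL)
 *   -> copy_size = v3dv_cl_offset(&job->bcl), which ends right after the
 *      BRANCH+NOP combo emitted by v3dX(cmd_buffer_suspend)()
 */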
diff --git a/src/broadcom/vulkan/v3dvx_descriptor_set.c b/src/broadcom/vulkan/v3dvx_descriptor_set.c
index 2c28ce46aa5..ced7b7e8c85 100644
--- a/src/broadcom/vulkan/v3dvx_descriptor_set.c
+++ b/src/broadcom/vulkan/v3dvx_descriptor_set.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2021 Raspberry Pi
+ * Copyright © 2021 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -51,7 +51,7 @@ v3dX(descriptor_bo_size)(VkDescriptorType type)
}
/* To compute the max_bo_size we want to iterate through the descriptor
- * types. Unfourtunately we can't just use the descriptor type enum values, as
+ * types. Unfortunately we can't just use the descriptor type enum values, as
* the values are not defined consecutively (so extensions could add new
* descriptor types), and VK_DESCRIPTOR_TYPE_MAX_ENUM is also a really big
* number.
@@ -86,13 +86,15 @@ v3dX(max_descriptor_bo_size)(void)
uint32_t
-v3dX(combined_image_sampler_texture_state_offset)(void)
+v3dX(combined_image_sampler_texture_state_offset)(uint8_t plane)
{
- return 0;
+ return v3dX(descriptor_bo_size)(VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) *
+ plane;
}
uint32_t
-v3dX(combined_image_sampler_sampler_state_offset)(void)
+v3dX(combined_image_sampler_sampler_state_offset)(uint8_t plane)
{
- return cl_aligned_packet_length(TEXTURE_SHADER_STATE, 32);
+ return v3dX(combined_image_sampler_texture_state_offset)(plane) +
+ cl_aligned_packet_length(TEXTURE_SHADER_STATE, 32);
}
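For illustration, the per-plane layout these two helpers imply for a combined image sampler descriptor, writing S for v3dX(descriptor_bo_size)(VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) and T for cl_aligned_packet_length(TEXTURE_SHADER_STATE, 32):

/* plane 0: texture state at 0,     sampler state at T
 * plane 1: texture state at S,     sampler state at S + T
 * plane 2: texture state at 2 * S, sampler state at 2 * S + T
 */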
diff --git a/src/broadcom/vulkan/v3dvx_device.c b/src/broadcom/vulkan/v3dvx_device.c
index a48738aec42..a27d65cfd23 100644
--- a/src/broadcom/vulkan/v3dvx_device.c
+++ b/src/broadcom/vulkan/v3dvx_device.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2021 Raspberry Pi
+ * Copyright © 2021 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -26,7 +26,6 @@
#include "broadcom/common/v3d_macros.h"
#include "broadcom/cle/v3dx_pack.h"
#include "broadcom/compiler/v3d_compiler.h"
-#include "vk_format_info.h"
#include "util/u_pack_color.h"
#include "util/half_float.h"
@@ -50,8 +49,8 @@ vk_to_v3d_compare_func[] = {
[VK_COMPARE_OP_ALWAYS] = V3D_COMPARE_FUNC_ALWAYS,
};
-
static union pipe_color_union encode_border_color(
+ const struct v3dv_device *device,
const VkSamplerCustomBorderColorCreateInfoEXT *bc_info)
{
const struct util_format_description *desc =
@@ -59,10 +58,55 @@ static union pipe_color_union encode_border_color(
const struct v3dv_format *format = v3dX(get_format)(bc_info->format);
+ /* YCbCr doesn't interact with border color at all. From spec:
+ *
+ * "If sampler YCBCR conversion is enabled, addressModeU, addressModeV,
+ * and addressModeW must be VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
+ * anisotropyEnable must be VK_FALSE, and unnormalizedCoordinates must
+ * be VK_FALSE"
+ */
+ assert(format->plane_count == 1);
+
+ /* We use the swizzle in our format table to determine swizzle configuration
+ * for sampling as well as to decide if we need to use the Swap R/B and
+ * Reverse Channels bits for Tile Load/Store operations. The order of the
+ * R/B swap and Reverse operations matters and gives different swizzles.
+ * Our format table assumes that Reverse happens first and R/B Swap second.
+ * This seems to match semantics for texture sampling and Tile load/store;
+ * however, it seems that the semantics are reversed for custom border
+ * colors so we need to fix up the swizzle manually for this case.
+ */
+ uint8_t swizzle[4];
+ const bool v3d_has_reverse_swap_rb_bits =
+ v3dv_texture_shader_state_has_rb_swap_reverse_bits(device);
+ if (!v3d_has_reverse_swap_rb_bits &&
+ v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle) &&
+ v3dv_format_swizzle_needs_rb_swap(format->planes[0].swizzle)) {
+ swizzle[0] = PIPE_SWIZZLE_W;
+ swizzle[1] = PIPE_SWIZZLE_X;
+ swizzle[2] = PIPE_SWIZZLE_Y;
+ swizzle[3] = PIPE_SWIZZLE_Z;
+ }
+ /* In v3d 7.x we no longer have a reverse flag for the border color. Instead
+ * we have to use the new reverse and swap_r/b flags in the texture shader
+ * state which will apply the format swizzle automatically when sampling
+ * the border color too and we should not apply it manually here.
+ */
+ else if (v3d_has_reverse_swap_rb_bits &&
+ (v3dv_format_swizzle_needs_rb_swap(format->planes[0].swizzle) ||
+ v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle))) {
+ swizzle[0] = PIPE_SWIZZLE_X;
+ swizzle[1] = PIPE_SWIZZLE_Y;
+ swizzle[2] = PIPE_SWIZZLE_Z;
+ swizzle[3] = PIPE_SWIZZLE_W;
+ } else {
+ memcpy(swizzle, format->planes[0].swizzle, sizeof (swizzle));
+ }
+
union pipe_color_union border;
for (int i = 0; i < 4; i++) {
- if (format->swizzle[i] <= 3)
- border.ui[i] = bc_info->customBorderColor.uint32[format->swizzle[i]];
+ if (format->planes[0].swizzle[i] <= 3)
+ border.ui[i] = bc_info->customBorderColor.uint32[swizzle[i]];
else
border.ui[i] = 0;
}
@@ -90,7 +134,11 @@ static union pipe_color_union encode_border_color(
(1 << (desc->channel[i].size - 1)) - 1);
}
- /* convert from float to expected format */
+#if V3D_VERSION <= 42
+ /* The TMU in V3D 7.x always takes 32-bit floats and handles conversions
+ * for us. In V3D 4.x we need to manually convert floating point color
+ * values to the expected format.
+ */
if (vk_format_is_srgb(bc_info->format) ||
vk_format_is_compressed(bc_info->format)) {
for (int i = 0; i < 4; i++)
@@ -142,12 +190,14 @@ static union pipe_color_union encode_border_color(
}
}
}
+#endif
return border;
}
void
-v3dX(pack_sampler_state)(struct v3dv_sampler *sampler,
+v3dX(pack_sampler_state)(const struct v3dv_device *device,
+ struct v3dv_sampler *sampler,
const VkSamplerCreateInfo *pCreateInfo,
const VkSamplerCustomBorderColorCreateInfoEXT *bc_info)
{
@@ -175,21 +225,6 @@ v3dX(pack_sampler_state)(struct v3dv_sampler *sampler,
break;
}
- /* For some texture formats, when clamping to transparent black border the
- * CTS expects alpha to be set to 1 instead of 0, but the border color mode
- * will take priority over the texture state swizzle, so the only way to
- * fix that is to apply a swizzle in the shader. Here we keep track of
- * whether we are activating that mode and we will decide if we need to
- * activate the texture swizzle lowering in the shader key at compile time
- * depending on the actual texture format.
- */
- if ((pCreateInfo->addressModeU == VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER ||
- pCreateInfo->addressModeV == VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER ||
- pCreateInfo->addressModeW == VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER) &&
- border_color_mode == V3D_BORDER_COLOR_0000) {
- sampler->clamp_to_transparent_black_border = true;
- }
-
v3dvx_pack(sampler->sampler_state, SAMPLER_STATE, s) {
if (pCreateInfo->anisotropyEnable) {
s.anisotropy_enable = true;
@@ -204,7 +239,7 @@ v3dX(pack_sampler_state)(struct v3dv_sampler *sampler,
s.border_color_mode = border_color_mode;
if (s.border_color_mode == V3D_BORDER_COLOR_FOLLOWS) {
- union pipe_color_union border = encode_border_color(bc_info);
+ union pipe_color_union border = encode_border_color(device, bc_info);
s.border_color_word_0 = border.ui[0];
s.border_color_word_1 = border.ui[1];
@@ -238,12 +273,15 @@ v3dX(pack_sampler_state)(struct v3dv_sampler *sampler,
void
v3dX(framebuffer_compute_internal_bpp_msaa)(
const struct v3dv_framebuffer *framebuffer,
+ const struct v3dv_cmd_buffer_attachment_state *attachments,
const struct v3dv_subpass *subpass,
- uint8_t *max_bpp,
+ uint8_t *max_internal_bpp,
+ uint8_t *total_color_bpp,
bool *msaa)
{
STATIC_ASSERT(V3D_INTERNAL_BPP_32 == 0);
- *max_bpp = V3D_INTERNAL_BPP_32;
+ *max_internal_bpp = V3D_INTERNAL_BPP_32;
+ *total_color_bpp = 0;
*msaa = false;
if (subpass) {
@@ -252,11 +290,15 @@ v3dX(framebuffer_compute_internal_bpp_msaa)(
if (att_idx == VK_ATTACHMENT_UNUSED)
continue;
- const struct v3dv_image_view *att = framebuffer->attachments[att_idx];
+ const struct v3dv_image_view *att = attachments[att_idx].image_view;
assert(att);
+ assert(att->plane_count == 1);
- if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT)
- *max_bpp = MAX2(*max_bpp, att->internal_bpp);
+ if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) {
+ const uint32_t internal_bpp = att->planes[0].internal_bpp;
+ *max_internal_bpp = MAX2(*max_internal_bpp, internal_bpp);
+ *total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp);
+ }
if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT)
*msaa = true;
@@ -264,23 +306,26 @@ v3dX(framebuffer_compute_internal_bpp_msaa)(
if (!*msaa && subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) {
const struct v3dv_image_view *att =
- framebuffer->attachments[subpass->ds_attachment.attachment];
+ attachments[subpass->ds_attachment.attachment].image_view;
assert(att);
if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT)
*msaa = true;
}
-
return;
}
assert(framebuffer->attachment_count <= 4);
for (uint32_t i = 0; i < framebuffer->attachment_count; i++) {
- const struct v3dv_image_view *att = framebuffer->attachments[i];
+ const struct v3dv_image_view *att = attachments[i].image_view;
assert(att);
+ assert(att->plane_count == 1);
- if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT)
- *max_bpp = MAX2(*max_bpp, att->internal_bpp);
+ if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) {
+ const uint32_t internal_bpp = att->planes[0].internal_bpp;
+ *max_internal_bpp = MAX2(*max_internal_bpp, internal_bpp);
+ *total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp);
+ }
if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT)
*msaa = true;
@@ -342,7 +387,7 @@ v3dX(get_hw_clear_color)(const VkClearColorValue *color,
}
}
-#ifdef DEBUG
+#if MESA_DEBUG
void
v3dX(device_check_prepacked_sizes)(void)
{
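A quick worked example of the new bpp bookkeeping, assuming v3d_internal_bpp_words() maps V3D_INTERNAL_BPP_32/64/128 to 1/2/4 32-bit words and that an RGBA16F attachment uses a 64-bit internal bpp:

/* Subpass with two color attachments, RGBA8 (internal bpp 32) and RGBA16F (internal bpp 64):
 *   max_internal_bpp = MAX2(V3D_INTERNAL_BPP_32, V3D_INTERNAL_BPP_64) = V3D_INTERNAL_BPP_64
 *   total_color_bpp  = 4 * 1 + 4 * 2 = 12
 */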
diff --git a/src/broadcom/vulkan/v3dvx_formats.c b/src/broadcom/vulkan/v3dvx_formats.c
index 4f77dd0086a..4fe548faee0 100644
--- a/src/broadcom/vulkan/v3dvx_formats.c
+++ b/src/broadcom/vulkan/v3dvx_formats.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2021 Raspberry Pi
+ * Copyright © 2021 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -26,6 +26,9 @@
#include "broadcom/cle/v3dx_pack.h"
#include "util/format/u_format.h"
+#include "vk_enum_to_str.h"
+#include "vk_enum_defines.h"
+#include "vk_util.h"
#define SWIZ(x,y,z,w) { \
PIPE_SWIZZLE_##x, \
@@ -35,15 +38,34 @@
}
#define FORMAT(vk, rt, tex, swiz, return_size, supports_filtering) \
- [VK_FORMAT_##vk] = { \
- true, \
- V3D_OUTPUT_IMAGE_FORMAT_##rt, \
- TEXTURE_DATA_FORMAT_##tex, \
- swiz, \
- return_size, \
+ [VK_ENUM_OFFSET(VK_FORMAT_##vk)] = { \
+ 1, \
+ {{ \
+ V3D_OUTPUT_IMAGE_FORMAT_##rt, \
+ TEXTURE_DATA_FORMAT_##tex, \
+ swiz, \
+ return_size, \
+ }}, \
supports_filtering, \
}
+#define PLANE(rt, tex, swiz, return_size) \
+ { \
+ V3D_OUTPUT_IMAGE_FORMAT_##rt, \
+ TEXTURE_DATA_FORMAT_##tex, \
+ swiz, \
+ return_size \
+ }
+
+#define YCBCR_FORMAT(vk, supports_filtering, plane_count, ...) \
+ [VK_ENUM_OFFSET(VK_FORMAT_##vk)] = { \
+ plane_count, \
+ { \
+ __VA_ARGS__, \
+ }, \
+ supports_filtering, \
+ }
+
#define SWIZ_X001 SWIZ(X, 0, 0, 1)
#define SWIZ_XY01 SWIZ(X, Y, 0, 1)
#define SWIZ_XYZ1 SWIZ(X, Y, Z, 1)
@@ -57,6 +79,7 @@
#define SWIZ_XXXX SWIZ(X, X, X, X)
#define SWIZ_000X SWIZ(0, 0, 0, X)
#define SWIZ_WXYZ SWIZ(W, X, Y, Z)
+#define SWIZ_WZYX SWIZ(W, Z, Y, X)
/* FIXME: expand format table to describe whether the format is supported
* for buffer surfaces (texel buffers, vertex buffers, etc).
@@ -132,6 +155,7 @@ static const struct v3dv_format format_table[] = {
FORMAT(A8B8G8R8_SRGB_PACK32, SRGB8_ALPHA8, RGBA8, SWIZ_XYZW, 16, true), /* RGBA8 sRGB */
FORMAT(A2B10G10R10_UNORM_PACK32,RGB10_A2, RGB10_A2, SWIZ_XYZW, 16, true),
FORMAT(A2B10G10R10_UINT_PACK32, RGB10_A2UI, RGB10_A2UI, SWIZ_XYZW, 16, false),
+ FORMAT(A2R10G10B10_UNORM_PACK32,RGB10_A2, RGB10_A2, SWIZ_ZYXW, 16, true),
FORMAT(E5B9G9R9_UFLOAT_PACK32, NO, RGB9_E5, SWIZ_XYZ1, 16, true),
FORMAT(B10G11R11_UFLOAT_PACK32, R11F_G11F_B10F,R11F_G11F_B10F, SWIZ_XYZ1, 16, true),
@@ -196,13 +220,61 @@ static const struct v3dv_format format_table[] = {
FORMAT(ASTC_12x12_SRGB_BLOCK, NO, ASTC_12X12, SWIZ_XYZW, 16, true),
};
+/**
+ * Vulkan layout for 4444 formats is defined like this:
+ *
+ * Vulkan ABGR4: (LSB) R | G | B | A (MSB)
+ * Vulkan ARGB4: (LSB) B | G | R | A (MSB)
+ *
+ * We map this to the V3D RGBA4 texture format, which really is ABGR4 with
+ * R in the MSB, so:
+ *
+ * V3D ABGR4 : (LSB) A | B | G | R (MSB)
+ *
+ * Which is reversed from Vulkan's ABGR4 layout. So in order to match Vulkan
+ * semantics we need to apply the following swizzles:
+ *
+ * ABGR4: WZYX (reverse)
+ * ARGB4: YZWX (reverse + swap R/B)
+ */
+static const struct v3dv_format format_table_4444[] = {
+ FORMAT(A4B4G4R4_UNORM_PACK16, ABGR4444, RGBA4, SWIZ_WZYX, 16, true), /* Reverse */
+ FORMAT(A4R4G4B4_UNORM_PACK16, ABGR4444, RGBA4, SWIZ_YZWX, 16, true), /* Reverse + RB swap */
+};
+
+static const struct v3dv_format format_table_ycbcr[] = {
+ YCBCR_FORMAT(G8_B8R8_2PLANE_420_UNORM, false, 2,
+ PLANE(R8, R8, SWIZ(X, 0, 0, 1), 16),
+ PLANE(RG8, RG8, SWIZ(X, Y, 0, 1), 16)
+ ),
+ YCBCR_FORMAT(G8_B8_R8_3PLANE_420_UNORM, false, 3,
+ PLANE(R8, R8, SWIZ(X, 0, 0, 1), 16),
+ PLANE(R8, R8, SWIZ(X, 0, 0, 1), 16),
+ PLANE(R8, R8, SWIZ(X, 0, 0, 1), 16)
+ ),
+};
+
const struct v3dv_format *
v3dX(get_format)(VkFormat format)
{
- if (format < ARRAY_SIZE(format_table) && format_table[format].supported)
+ /* Core formats */
+ if (format < ARRAY_SIZE(format_table) && format_table[format].plane_count)
return &format_table[format];
- else
+
+ uint32_t ext_number = VK_ENUM_EXTENSION(format);
+ uint32_t enum_offset = VK_ENUM_OFFSET(format);
+
+ switch (ext_number) {
+ case _VK_EXT_4444_formats_number:
+ return &format_table_4444[enum_offset];
+ case _VK_KHR_sampler_ycbcr_conversion_number:
+ if (enum_offset < ARRAY_SIZE(format_table_ycbcr))
+ return &format_table_ycbcr[enum_offset];
+ else
+ return NULL;
+ default:
return NULL;
+ }
}
void
@@ -339,18 +411,32 @@ bool
v3dX(format_supports_tlb_resolve)(const struct v3dv_format *format)
{
uint32_t type, bpp;
- v3dX(get_internal_type_bpp_for_output_format)(format->rt_type, &type, &bpp);
+
+ /* Multiplanar images cannot be multisampled:
+ *
+ * "sampleCounts will be set to VK_SAMPLE_COUNT_1_BIT if at least one of
+ * the following conditions is true: (...) format is one of the formats
+ * that require a sampler Y′CBCR conversion (...)"
+ */
+ if (!format->plane_count || format->plane_count > 1)
+ return false;
+
+ v3dX(get_internal_type_bpp_for_output_format)(format->planes[0].rt_type, &type, &bpp);
return type == V3D_INTERNAL_TYPE_8 || type == V3D_INTERNAL_TYPE_16F;
}
bool
v3dX(format_supports_blending)(const struct v3dv_format *format)
{
+ /* ycbcr formats don't support blending */
+ if (!format->plane_count || format->plane_count > 1)
+ return false;
+
/* Hardware blending is only supported on render targets that are configured
* 4x8-bit unorm, 2x16-bit float or 4x16-bit float.
*/
uint32_t type, bpp;
- v3dX(get_internal_type_bpp_for_output_format)(format->rt_type, &type, &bpp);
+ v3dX(get_internal_type_bpp_for_output_format)(format->planes[0].rt_type, &type, &bpp);
switch (type) {
case V3D_INTERNAL_TYPE_8:
return bpp == V3D_INTERNAL_BPP_32;
@@ -426,23 +512,17 @@ v3dX(get_internal_type_bpp_for_image_aspects)(VkFormat vk_format,
uint32_t *internal_type,
uint32_t *internal_bpp)
{
- const VkImageAspectFlags ds_aspects = VK_IMAGE_ASPECT_DEPTH_BIT |
- VK_IMAGE_ASPECT_STENCIL_BIT;
-
/* We can't store depth/stencil pixel formats to a raster format, so
- * so instead we load our depth/stencil aspects to a compatible color
- * format.
+ * instead we load our depth/stencil aspects to a compatible color format.
*/
- /* FIXME: pre-compute this at image creation time? */
- if (aspect_mask & ds_aspects) {
+ if (aspect_mask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
+ *internal_bpp = V3D_INTERNAL_BPP_32;
switch (vk_format) {
case VK_FORMAT_D16_UNORM:
*internal_type = V3D_INTERNAL_TYPE_16UI;
- *internal_bpp = V3D_INTERNAL_BPP_64;
break;
case VK_FORMAT_D32_SFLOAT:
*internal_type = V3D_INTERNAL_TYPE_32F;
- *internal_bpp = V3D_INTERNAL_BPP_128;
break;
case VK_FORMAT_X8_D24_UNORM_PACK32:
case VK_FORMAT_D24_UNORM_S8_UINT:
@@ -451,7 +531,6 @@ v3dX(get_internal_type_bpp_for_image_aspects)(VkFormat vk_format,
* load command for more details.
*/
*internal_type = V3D_INTERNAL_TYPE_8UI;
- *internal_bpp = V3D_INTERNAL_BPP_32;
break;
default:
assert(!"unsupported format");
@@ -459,7 +538,9 @@ v3dX(get_internal_type_bpp_for_image_aspects)(VkFormat vk_format,
}
} else {
const struct v3dv_format *format = v3dX(get_format)(vk_format);
- v3dX(get_internal_type_bpp_for_output_format)(format->rt_type,
+ /* We only expect this to be called for single-plane formats */
+ assert(format->plane_count == 1);
+ v3dX(get_internal_type_bpp_for_output_format)(format->planes[0].rt_type,
internal_type, internal_bpp);
}
}
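To illustrate the new dispatch, a hedged decoding example, assuming the usual vk_util.h encoding of extension enums as 1000000000 + 1000 * (extension_number - 1) + offset:

/* VK_FORMAT_A4R4G4B4_UNORM_PACK16    = 1000340000
 *   -> VK_ENUM_EXTENSION() = 341 (_VK_EXT_4444_formats_number), VK_ENUM_OFFSET() = 0
 * VK_FORMAT_G8_B8R8_2PLANE_420_UNORM = 1000156003
 *   -> VK_ENUM_EXTENSION() = 157 (_VK_KHR_sampler_ycbcr_conversion_number), VK_ENUM_OFFSET() = 3
 */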
diff --git a/src/broadcom/vulkan/v3dvx_image.c b/src/broadcom/vulkan/v3dvx_image.c
index a9aa0fb9797..de984e81220 100644
--- a/src/broadcom/vulkan/v3dvx_image.c
+++ b/src/broadcom/vulkan/v3dvx_image.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2021 Raspberry Pi
+ * Copyright © 2021 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -26,32 +26,6 @@
#include "broadcom/cle/v3dx_pack.h"
#include "broadcom/compiler/v3d_compiler.h"
-#include "vk_format_info.h"
-
-/*
- * This method translates pipe_swizzle to the swizzle values used at the
- * packet TEXTURE_SHADER_STATE
- *
- * FIXME: C&P from v3d, common place?
- */
-static uint32_t
-translate_swizzle(unsigned char pipe_swizzle)
-{
- switch (pipe_swizzle) {
- case PIPE_SWIZZLE_0:
- return 0;
- case PIPE_SWIZZLE_1:
- return 1;
- case PIPE_SWIZZLE_X:
- case PIPE_SWIZZLE_Y:
- case PIPE_SWIZZLE_Z:
- case PIPE_SWIZZLE_W:
- return 2 + pipe_swizzle;
- default:
- unreachable("unknown swizzle");
- }
-}
-
/*
* Packs and ensure bo for the shader state (the latter can be temporal).
*/
@@ -71,78 +45,125 @@ pack_texture_shader_state_helper(struct v3dv_device *device,
image->vk.samples == VK_SAMPLE_COUNT_4_BIT);
const uint32_t msaa_scale = image->vk.samples == VK_SAMPLE_COUNT_1_BIT ? 1 : 2;
- v3dvx_pack(image_view->texture_shader_state[index], TEXTURE_SHADER_STATE, tex) {
-
- tex.level_0_is_strictly_uif =
- (image->slices[0].tiling == V3D_TILING_UIF_XOR ||
- image->slices[0].tiling == V3D_TILING_UIF_NO_XOR);
-
- tex.level_0_xor_enable = (image->slices[0].tiling == V3D_TILING_UIF_XOR);
-
- if (tex.level_0_is_strictly_uif)
- tex.level_0_ub_pad = image->slices[0].ub_pad;
-
- /* FIXME: v3d never sets uif_xor_disable, but uses it on the following
- * check so let's set the default value
- */
- tex.uif_xor_disable = false;
- if (tex.uif_xor_disable ||
- tex.level_0_is_strictly_uif) {
- tex.extended = true;
- }
-
- tex.base_level = image_view->vk.base_mip_level;
- tex.max_level = image_view->vk.base_mip_level +
- image_view->vk.level_count - 1;
-
- tex.swizzle_r = translate_swizzle(image_view->swizzle[0]);
- tex.swizzle_g = translate_swizzle(image_view->swizzle[1]);
- tex.swizzle_b = translate_swizzle(image_view->swizzle[2]);
- tex.swizzle_a = translate_swizzle(image_view->swizzle[3]);
-
- tex.texture_type = image_view->format->tex_type;
-
- if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
- tex.image_depth = image->vk.extent.depth;
- } else {
- tex.image_depth = image_view->vk.layer_count;
+ for (uint8_t plane = 0; plane < image_view->plane_count; plane++) {
+ uint8_t iplane = image_view->planes[plane].image_plane;
+ v3dvx_pack(image_view->planes[plane].texture_shader_state[index], TEXTURE_SHADER_STATE, tex) {
+
+ tex.level_0_is_strictly_uif =
+ (image->planes[iplane].slices[0].tiling == V3D_TILING_UIF_XOR ||
+ image->planes[iplane].slices[0].tiling == V3D_TILING_UIF_NO_XOR);
+
+ tex.level_0_xor_enable = (image->planes[iplane].slices[0].tiling == V3D_TILING_UIF_XOR);
+
+ if (tex.level_0_is_strictly_uif)
+ tex.level_0_ub_pad = image->planes[iplane].slices[0].ub_pad;
+
+ /* FIXME: v3d never sets uif_xor_disable, but uses it on the following
+ * check so let's set the default value
+ */
+ tex.uif_xor_disable = false;
+ if (tex.uif_xor_disable ||
+ tex.level_0_is_strictly_uif) {
+ tex.extended = true;
+ }
+
+ tex.base_level = image_view->vk.base_mip_level;
+ tex.max_level = image_view->vk.base_mip_level +
+ image_view->vk.level_count - 1;
+
+ tex.swizzle_r = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[0]);
+ tex.swizzle_g = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[1]);
+ tex.swizzle_b = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[2]);
+ tex.swizzle_a = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[3]);
+
+ tex.texture_type = image_view->format->planes[plane].tex_type;
+
+ if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
+ tex.image_depth = image->vk.extent.depth;
+ } else {
+ tex.image_depth = image_view->vk.layer_count;
+ }
+
+ /* Empirical testing with CTS shows that when we are sampling from cube
+ * arrays we want to set image depth to layers / 6, but not when doing
+ * image load/store.
+ */
+ if (image_view->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY &&
+ !for_cube_map_array_storage) {
+ assert(tex.image_depth % 6 == 0);
+ tex.image_depth /= 6;
+ }
+
+ tex.image_height = image->planes[iplane].height * msaa_scale;
+ tex.image_width = image->planes[iplane].width * msaa_scale;
+
+ /* On 4.x, the height of a 1D texture is redefined to be the
+ * upper 14 bits of the width (which is only usable with txf).
+ */
+ if (image->vk.image_type == VK_IMAGE_TYPE_1D)
+ tex.image_height = tex.image_width >> 14;
+
+ tex.image_width &= (1 << 14) - 1;
+ tex.image_height &= (1 << 14) - 1;
+
+ tex.array_stride_64_byte_aligned = image->planes[iplane].cube_map_stride / 64;
+
+ /* At this point we don't have the job. That's the reason the first
+ * parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to
+ * add the bo to the job. This also means that we need to manually add
+ * the image bo to the job using the texture.
+ */
+ const uint32_t base_offset =
+ image->planes[iplane].mem->bo->offset +
+ v3dv_layer_offset(image, 0, image_view->vk.base_array_layer,
+ iplane);
+ tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset);
+
+ bool is_srgb = vk_format_is_srgb(image_view->vk.format);
+
+ /* V3D 4.x doesn't have the reverse and swap_r/b bits, so we compose
+ * the reverse and/or swap_r/b swizzle from the format table with the
+ * image view swizzle. This, however, doesn't work for border colors,
+ * for that there is the reverse_standard_border_color.
+ *
+ * In v3d 7.x, however, there is no reverse_standard_border_color bit,
+ * since the reverse and swap_r/b bits also affect border colors. It is
+ * because of this that we absolutely need to use these bits with
+ * reversed and swapped formats, since that's the only way to ensure
+ * correct border colors. In that case we don't want to program the
+ * swizzle to the composition of the format swizzle and the view
+ * swizzle like we do in v3d 4.x, since the format swizzle is applied
+ * via the reverse and swap_r/b bits.
+ */
+#if V3D_VERSION == 42
+ tex.srgb = is_srgb;
+ tex.reverse_standard_border_color =
+ image_view->planes[plane].channel_reverse;
+#endif
+#if V3D_VERSION >= 71
+ tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE;
+
+ tex.reverse = image_view->planes[plane].channel_reverse;
+ tex.r_b_swap = image_view->planes[plane].swap_rb;
+
+ if (tex.reverse || tex.r_b_swap) {
+ tex.swizzle_r =
+ v3d_translate_pipe_swizzle(image_view->view_swizzle[0]);
+ tex.swizzle_g =
+ v3d_translate_pipe_swizzle(image_view->view_swizzle[1]);
+ tex.swizzle_b =
+ v3d_translate_pipe_swizzle(image_view->view_swizzle[2]);
+ tex.swizzle_a =
+ v3d_translate_pipe_swizzle(image_view->view_swizzle[3]);
+ }
+
+ tex.chroma_offset_x = 1;
+ tex.chroma_offset_y = 1;
+ /* See comment in XML field definition for rationale of the shifts */
+ tex.texture_base_pointer_cb = base_offset >> 6;
+ tex.texture_base_pointer_cr = base_offset >> 6;
+#endif
}
-
- /* Empirical testing with CTS shows that when we are sampling from cube
- * arrays we want to set image depth to layers / 6, but not when doing
- * image load/store.
- */
- if (image_view->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY &&
- !for_cube_map_array_storage) {
- assert(tex.image_depth % 6 == 0);
- tex.image_depth /= 6;
- }
-
- tex.image_height = image->vk.extent.height * msaa_scale;
- tex.image_width = image->vk.extent.width * msaa_scale;
-
- /* On 4.x, the height of a 1D texture is redefined to be the
- * upper 14 bits of the width (which is only usable with txf).
- */
- if (image->vk.image_type == VK_IMAGE_TYPE_1D) {
- tex.image_height = tex.image_width >> 14;
- }
- tex.image_width &= (1 << 14) - 1;
- tex.image_height &= (1 << 14) - 1;
-
- tex.array_stride_64_byte_aligned = image->cube_map_stride / 64;
-
- tex.srgb = vk_format_is_srgb(image_view->vk.format);
-
- /* At this point we don't have the job. That's the reason the first
- * parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to
- * add the bo to the job. This also means that we need to add manually
- * the image bo to the job using the texture.
- */
- const uint32_t base_offset =
- image->mem->bo->offset +
- v3dv_layer_offset(image, 0, image_view->vk.base_array_layer);
- tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset);
}
}
@@ -163,10 +184,14 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device,
const struct v3dv_buffer *buffer = buffer_view->buffer;
v3dvx_pack(buffer_view->texture_shader_state, TEXTURE_SHADER_STATE, tex) {
- tex.swizzle_r = translate_swizzle(PIPE_SWIZZLE_X);
- tex.swizzle_g = translate_swizzle(PIPE_SWIZZLE_Y);
- tex.swizzle_b = translate_swizzle(PIPE_SWIZZLE_Z);
- tex.swizzle_a = translate_swizzle(PIPE_SWIZZLE_W);
+ tex.swizzle_r =
+ v3d_translate_pipe_swizzle(buffer_view->format->planes[0].swizzle[0]);
+ tex.swizzle_g =
+ v3d_translate_pipe_swizzle(buffer_view->format->planes[0].swizzle[1]);
+ tex.swizzle_b =
+ v3d_translate_pipe_swizzle(buffer_view->format->planes[0].swizzle[2]);
+ tex.swizzle_a =
+ v3d_translate_pipe_swizzle(buffer_view->format->planes[0].swizzle[3]);
tex.image_depth = 1;
@@ -180,8 +205,16 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device,
tex.image_width &= (1 << 14) - 1;
tex.image_height &= (1 << 14) - 1;
- tex.texture_type = buffer_view->format->tex_type;
- tex.srgb = vk_format_is_srgb(buffer_view->vk_format);
+ assert(buffer_view->format->plane_count == 1);
+ tex.texture_type = buffer_view->format->planes[0].tex_type;
+
+ bool is_srgb = vk_format_is_srgb(buffer_view->vk_format);
+#if V3D_VERSION == 42
+ tex.srgb = is_srgb;
+#endif
+#if V3D_VERSION >= 71
+ tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE;
+#endif
/* At this point we don't have the job. That's the reason the first
* parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to
@@ -194,5 +227,13 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device,
buffer_view->offset;
tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset);
+
+#if V3D_VERSION >= 71
+ tex.chroma_offset_x = 1;
+ tex.chroma_offset_y = 1;
+ /* See comment in XML field definition for rationale of the shifts */
+ tex.texture_base_pointer_cb = base_offset >> 6;
+ tex.texture_base_pointer_cr = base_offset >> 6;
+#endif
}
}
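A quick worked example of the 14-bit width/height split applied to 1D textures above, using a hypothetical 20000-texel 1D image:

/* width = 20000 (0x4e20):
 *   image_height = 20000 >> 14             = 1    (upper bits of the width)
 *   image_width  = 20000 & ((1 << 14) - 1) = 3616 (lower 14 bits)
 * effective width seen by the TMU: 1 * 16384 + 3616 = 20000
 */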
diff --git a/src/broadcom/vulkan/v3dvx_meta_common.c b/src/broadcom/vulkan/v3dvx_meta_common.c
index 2f79e4e9c32..858096f9e4b 100644
--- a/src/broadcom/vulkan/v3dvx_meta_common.c
+++ b/src/broadcom/vulkan/v3dvx_meta_common.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2021 Raspberry Pi
+ * Copyright © 2021 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -25,11 +25,11 @@
#include "v3dv_meta_common.h"
#include "broadcom/common/v3d_macros.h"
+#include "broadcom/common/v3d_tfu.h"
+#include "broadcom/common/v3d_util.h"
#include "broadcom/cle/v3dx_pack.h"
#include "broadcom/compiler/v3d_compiler.h"
-#include "vk_format_info.h"
-
struct rcl_clear_info {
const union v3dv_clear_value *clear_value;
struct v3dv_image *image;
@@ -51,25 +51,46 @@ emit_rcl_prologue(struct v3dv_job *job,
if (job->cmd_buffer->state.oom)
return NULL;
+ assert(!tiling->msaa || !tiling->double_buffer);
cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) {
config.early_z_disable = true;
config.image_width_pixels = tiling->width;
config.image_height_pixels = tiling->height;
config.number_of_render_targets = 1;
config.multisample_mode_4x = tiling->msaa;
+ config.double_buffer_in_non_ms_mode = tiling->double_buffer;
+#if V3D_VERSION == 42
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
+#endif
+#if V3D_VERSION >= 71
+ config.log2_tile_width = log2_tile_size(tiling->tile_width);
+ config.log2_tile_height = log2_tile_size(tiling->tile_height);
+ /* FIXME: ideally we would like the next assert to be on the packet header
+ * (as that is generic, so it would also apply to GL). We would need to
+ * expand gen_pack_header for that.
+ */
+ assert(config.log2_tile_width == config.log2_tile_height ||
+ config.log2_tile_width == config.log2_tile_height + 1);
+#endif
config.internal_depth_type = fb->internal_depth_type;
}
+ const uint32_t *color = NULL;
if (clear_info && (clear_info->aspects & VK_IMAGE_ASPECT_COLOR_BIT)) {
- uint32_t clear_pad = 0;
+ UNUSED uint32_t clear_pad = 0;
if (clear_info->image) {
const struct v3dv_image *image = clear_info->image;
+
+ /* From vkCmdClearColorImage:
+ * "image must not use any of the formats that require a sampler
+ * YCBCR conversion"
+ */
+ assert(image->plane_count == 1);
const struct v3d_resource_slice *slice =
- &image->slices[clear_info->level];
+ &image->planes[0].slices[clear_info->level];
if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
slice->tiling == V3D_TILING_UIF_XOR) {
- int uif_block_height = v3d_utile_height(image->cpp) * 2;
+ int uif_block_height = v3d_utile_height(image->planes[0].cpp) * 2;
uint32_t implicit_padded_height =
align(tiling->height, uif_block_height) / uif_block_height;
@@ -81,7 +102,9 @@ emit_rcl_prologue(struct v3dv_job *job,
}
}
- const uint32_t *color = &clear_info->clear_value->color[0];
+ color = &clear_info->clear_value->color[0];
+
+#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) {
clear.clear_color_low_32_bits = color[0];
clear.clear_color_next_24_bits = color[1] & 0x00ffffff;
@@ -105,13 +128,49 @@ emit_rcl_prologue(struct v3dv_job *job,
clear.render_target_number = 0;
};
}
+#endif
}
+#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
rt.render_target_0_internal_bpp = tiling->internal_bpp;
rt.render_target_0_internal_type = fb->internal_type;
rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
}
+#endif
+
+#if V3D_VERSION >= 71
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ if (color)
+ rt.clear_color_low_bits = color[0];
+ rt.internal_bpp = tiling->internal_bpp;
+ rt.internal_type_and_clamping = v3dX(clamp_for_format_and_type)(fb->internal_type,
+ fb->vk_format);
+ rt.stride =
+ v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width,
+ v3d_internal_bpp_words(rt.internal_bpp));
+ rt.base_address = 0;
+ rt.render_target_number = 0;
+ }
+
+ if (color && tiling->internal_bpp >= V3D_INTERNAL_BPP_64) {
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) {
+ rt.clear_color_mid_bits = /* 40 bits (32 + 8) */
+ ((uint64_t) color[1]) |
+ (((uint64_t) (color[2] & 0xff)) << 32);
+ rt.render_target_number = 0;
+ }
+ }
+
+ if (color && tiling->internal_bpp >= V3D_INTERNAL_BPP_128) {
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) {
+ rt.clear_color_top_bits = /* 56 bits (24 + 32) */
+ (((uint64_t) (color[2] & 0xffffff00)) >> 8) |
+ (((uint64_t) (color[3])) << 24);
+ rt.render_target_number = 0;
+ }
+ }
+#endif
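For reference, a worked example of how a 128-bit clear color is split across the three packets above, using hypothetical color words:

/* color[] = { 0x00112233, 0x44556677, 0x8899aabb, 0xccddeeff }
 *   PART1.clear_color_low_bits = 0x00112233         (bits   0..31)
 *   PART2.clear_color_mid_bits = 0xbb44556677       (bits  32..71, 40 bits)
 *   PART3.clear_color_top_bits = 0xccddeeff8899aa   (bits 72..127, 56 bits)
 */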
cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
clear.z_clear_value = clear_info ? clear_info->clear_value->z : 1.0f;
@@ -167,11 +226,20 @@ emit_frame_setup(struct v3dv_job *job,
cl_emit(rcl, STORE_TILE_BUFFER_GENERAL, store) {
store.buffer_to_store = NONE;
}
- if (clear_value && i == 0) {
+ /* When using double-buffering, we need to clear both buffers (unless
+ * we only have a single tile to render).
+ */
+ if (clear_value &&
+ (i == 0 || v3dv_do_double_initial_tile_clear(tiling))) {
+#if V3D_VERSION == 42
cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) {
clear.clear_z_stencil_buffer = true;
clear.clear_all_render_targets = true;
}
+#endif
+#if V3D_VERSION >= 71
+ cl_emit(rcl, CLEAR_RENDER_TARGETS, clear);
+#endif
}
cl_emit(rcl, END_OF_TILE_MARKER, end);
}
@@ -254,6 +322,9 @@ choose_tlb_format(struct v3dv_meta_framebuffer *framebuffer,
bool is_copy_to_buffer,
bool is_copy_from_buffer)
{
+ /* At this point the framebuffer was already lowered to single-plane */
+ assert(framebuffer->format->plane_count == 1);
+
if (is_copy_to_buffer || is_copy_from_buffer) {
switch (framebuffer->vk_format) {
case VK_FORMAT_D16_UNORM:
@@ -295,11 +366,11 @@ choose_tlb_format(struct v3dv_meta_framebuffer *framebuffer,
}
}
default: /* Color formats */
- return framebuffer->format->rt_type;
+ return framebuffer->format->planes[0].rt_type;
break;
}
} else {
- return framebuffer->format->rt_type;
+ return framebuffer->format->planes[0].rt_type;
}
}
@@ -307,8 +378,24 @@ static inline bool
format_needs_rb_swap(struct v3dv_device *device,
VkFormat format)
{
- const uint8_t *swizzle = v3dv_get_format_swizzle(device, format);
- return swizzle[0] == PIPE_SWIZZLE_Z;
+ /* We are calling these methods for framebuffer formats, that at this point
+ * should be single-plane
+ */
+ assert(vk_format_get_plane_count(format) == 1);
+ const uint8_t *swizzle = v3dv_get_format_swizzle(device, format, 0);
+ return v3dv_format_swizzle_needs_rb_swap(swizzle);
+}
+
+static inline bool
+format_needs_reverse(struct v3dv_device *device,
+ VkFormat format)
+{
+ /* We are calling these methods for framebuffer formats, that at this point
+ * should be single-plane
+ */
+ assert(vk_format_get_plane_count(format) == 1);
+ const uint8_t *swizzle = v3dv_get_format_swizzle(device, format, 0);
+ return v3dv_format_swizzle_needs_reverse(swizzle);
}
static void
@@ -322,22 +409,29 @@ emit_image_load(struct v3dv_device *device,
bool is_copy_to_buffer,
bool is_copy_from_buffer)
{
- uint32_t layer_offset = v3dv_layer_offset(image, mip_level, layer);
+ uint8_t plane = v3dv_plane_from_aspect(aspect);
+ uint32_t layer_offset = v3dv_layer_offset(image, mip_level, layer, plane);
+ /* For multi-plane formats we are copying plane by plane to the color
+ * tlb. Framebuffer format was already selected to be a tlb single-plane
+ * compatible format. We still need to use the real plane to get the
+ * address etc from the source image.
+ */
+ assert(framebuffer->format->plane_count == 1);
/* For image to/from buffer copies we always load to and store from RT0,
* even for depth/stencil aspects, because the hardware can't do raster
* stores or loads from/to the depth/stencil tile buffers.
*/
bool load_to_color_tlb = is_copy_to_buffer || is_copy_from_buffer ||
+ image->format->plane_count > 1 ||
aspect == VK_IMAGE_ASPECT_COLOR_BIT;
- const struct v3d_resource_slice *slice = &image->slices[mip_level];
+ const struct v3d_resource_slice *slice = &image->planes[plane].slices[mip_level];
cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) {
load.buffer_to_load = load_to_color_tlb ?
RENDER_TARGET_0 : v3dX(zs_buffer_from_aspect_bits)(aspect);
- load.address = v3dv_cl_address(image->mem->bo, layer_offset);
-
+ load.address = v3dv_cl_address(image->planes[plane].mem->bo, layer_offset);
load.input_image_format = choose_tlb_format(framebuffer, aspect, false,
is_copy_to_buffer,
is_copy_from_buffer);
@@ -374,6 +468,7 @@ emit_image_load(struct v3dv_device *device,
* so we need to make sure we respect the format swizzle.
*/
needs_rb_swap = format_needs_rb_swap(device, framebuffer->vk_format);
+ needs_chan_reverse = format_needs_reverse(device, framebuffer->vk_format);
}
load.r_b_swap = needs_rb_swap;
@@ -406,17 +501,28 @@ emit_image_store(struct v3dv_device *device,
bool is_copy_from_buffer,
bool is_multisample_resolve)
{
- uint32_t layer_offset = v3dv_layer_offset(image, mip_level, layer);
+ uint8_t plane = v3dv_plane_from_aspect(aspect);
+ uint32_t layer_offset = v3dv_layer_offset(image, mip_level, layer, plane);
+
+ /*
+ * For multi-plane formats we are copying plane by plane to the color
+ * tlb. Framebuffer format was already selected to be a tlb single-plane
+ * compatible format. We still need to use the real plane to get the
+ * address etc.
+ */
+ assert(framebuffer->format->plane_count == 1);
bool store_from_color_tlb = is_copy_to_buffer || is_copy_from_buffer ||
+ image->format->plane_count > 1 ||
aspect == VK_IMAGE_ASPECT_COLOR_BIT;
- const struct v3d_resource_slice *slice = &image->slices[mip_level];
+ const struct v3d_resource_slice *slice = &image->planes[plane].slices[mip_level];
cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
store.buffer_to_store = store_from_color_tlb ?
RENDER_TARGET_0 : v3dX(zs_buffer_from_aspect_bits)(aspect);
- store.address = v3dv_cl_address(image->mem->bo, layer_offset);
+ store.address = v3dv_cl_address(image->planes[plane].mem->bo, layer_offset);
+
store.clear_buffer_being_stored = false;
/* See rationale in emit_image_load() */
@@ -431,6 +537,7 @@ emit_image_store(struct v3dv_device *device,
} else if (!is_copy_from_buffer && !is_copy_to_buffer &&
(aspect & VK_IMAGE_ASPECT_COLOR_BIT)) {
needs_rb_swap = format_needs_rb_swap(device, framebuffer->vk_format);
+ needs_chan_reverse = format_needs_reverse(device, framebuffer->vk_format);
}
store.r_b_swap = needs_rb_swap;
@@ -463,7 +570,7 @@ emit_copy_layer_to_buffer_per_tile_list(struct v3dv_job *job,
struct v3dv_buffer *buffer,
struct v3dv_image *image,
uint32_t layer_offset,
- const VkBufferImageCopy2KHR *region)
+ const VkBufferImageCopy2 *region)
{
struct v3dv_cl *cl = &job->indirect;
v3dv_cl_ensure_space(cl, 200, 1);
@@ -512,9 +619,10 @@ emit_copy_layer_to_buffer_per_tile_list(struct v3dv_job *job,
* Vulkan spec states that the output buffer must have packed stencil
* values, where each stencil value is 1 byte.
*/
+ uint8_t plane = v3dv_plane_from_aspect(region->imageSubresource.aspectMask);
uint32_t cpp =
region->imageSubresource.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT ?
- 1 : image->cpp;
+ 1 : image->planes[plane].cpp;
uint32_t buffer_stride = width * cpp;
uint32_t buffer_offset = buffer->mem_offset + region->bufferOffset +
height * buffer_stride * layer_offset;
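For clarity, an illustrative numeric instance of the stride/offset computation above (all values are made up, not taken from the driver):

/* Stencil-aspect copy (cpp forced to 1) of a 256x128 region into layer 2
 * of the destination buffer, with mem_offset and bufferOffset both 0.
 */
uint32_t ex_width = 256, ex_height = 128, ex_layer = 2;
uint32_t ex_cpp = 1;                              /* packed stencil, 1 byte */
uint32_t ex_stride = ex_width * ex_cpp;           /* 256 bytes per row */
uint32_t ex_offset = ex_height * ex_stride * ex_layer;  /* 128 * 256 * 2 = 65536 */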
@@ -543,7 +651,7 @@ emit_copy_layer_to_buffer(struct v3dv_job *job,
struct v3dv_image *image,
struct v3dv_meta_framebuffer *framebuffer,
uint32_t layer,
- const VkBufferImageCopy2KHR *region)
+ const VkBufferImageCopy2 *region)
{
emit_copy_layer_to_buffer_per_tile_list(job, framebuffer, buffer,
image, layer, region);
@@ -555,7 +663,7 @@ v3dX(meta_emit_copy_image_to_buffer_rcl)(struct v3dv_job *job,
struct v3dv_buffer *buffer,
struct v3dv_image *image,
struct v3dv_meta_framebuffer *framebuffer,
- const VkBufferImageCopy2KHR *region)
+ const VkBufferImageCopy2 *region)
{
struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL);
v3dv_return_if_oom(NULL, job);
@@ -572,7 +680,7 @@ emit_resolve_image_layer_per_tile_list(struct v3dv_job *job,
struct v3dv_image *dst,
struct v3dv_image *src,
uint32_t layer_offset,
- const VkImageResolve2KHR *region)
+ const VkImageResolve2 *region)
{
struct v3dv_cl *cl = &job->indirect;
v3dv_cl_ensure_space(cl, 200, 1);
@@ -608,11 +716,14 @@ emit_resolve_image_layer_per_tile_list(struct v3dv_job *job,
region->dstSubresource.baseArrayLayer + layer_offset :
region->dstOffset.z + layer_offset;
+ bool is_depth_or_stencil =
+ region->dstSubresource.aspectMask &
+ (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT);
emit_image_store(job->device, cl, framebuffer, dst,
region->dstSubresource.aspectMask,
dst_layer,
region->dstSubresource.mipLevel,
- false, false, true);
+ false, false, !is_depth_or_stencil);
cl_emit(cl, END_OF_TILE_MARKER, end);
@@ -630,7 +741,7 @@ emit_resolve_image_layer(struct v3dv_job *job,
struct v3dv_image *src,
struct v3dv_meta_framebuffer *framebuffer,
uint32_t layer,
- const VkImageResolve2KHR *region)
+ const VkImageResolve2 *region)
{
emit_resolve_image_layer_per_tile_list(job, framebuffer,
dst, src, layer, region);
@@ -642,7 +753,7 @@ v3dX(meta_emit_resolve_image_rcl)(struct v3dv_job *job,
struct v3dv_image *dst,
struct v3dv_image *src,
struct v3dv_meta_framebuffer *framebuffer,
- const VkImageResolve2KHR *region)
+ const VkImageResolve2 *region)
{
struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL);
v3dv_return_if_oom(NULL, job);
@@ -733,7 +844,7 @@ emit_copy_image_layer_per_tile_list(struct v3dv_job *job,
struct v3dv_image *dst,
struct v3dv_image *src,
uint32_t layer_offset,
- const VkImageCopy2KHR *region)
+ const VkImageCopy2 *region)
{
struct v3dv_cl *cl = &job->indirect;
v3dv_cl_ensure_space(cl, 200, 1);
@@ -791,7 +902,7 @@ emit_copy_image_layer(struct v3dv_job *job,
struct v3dv_image *src,
struct v3dv_meta_framebuffer *framebuffer,
uint32_t layer,
- const VkImageCopy2KHR *region)
+ const VkImageCopy2 *region)
{
emit_copy_image_layer_per_tile_list(job, framebuffer, dst, src, layer, region);
emit_supertile_coordinates(job, framebuffer);
@@ -802,7 +913,7 @@ v3dX(meta_emit_copy_image_rcl)(struct v3dv_job *job,
struct v3dv_image *dst,
struct v3dv_image *src,
struct v3dv_meta_framebuffer *framebuffer,
- const VkImageCopy2KHR *region)
+ const VkImageCopy2 *region)
{
struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL);
v3dv_return_if_oom(NULL, job);
@@ -815,79 +926,108 @@ v3dX(meta_emit_copy_image_rcl)(struct v3dv_job *job,
void
v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer,
- struct v3dv_image *dst,
- uint32_t dst_mip_level,
- uint32_t dst_layer,
- struct v3dv_image *src,
- uint32_t src_mip_level,
- uint32_t src_layer,
+ uint32_t dst_bo_handle,
+ uint32_t dst_offset,
+ enum v3d_tiling_mode dst_tiling,
+ uint32_t dst_padded_height_or_stride,
+ uint32_t dst_cpp,
+ uint32_t src_bo_handle,
+ uint32_t src_offset,
+ enum v3d_tiling_mode src_tiling,
+ uint32_t src_padded_height_or_stride,
+ uint32_t src_cpp,
uint32_t width,
uint32_t height,
- const struct v3dv_format *format)
+ const struct v3dv_format_plane *format_plane)
{
- const struct v3d_resource_slice *src_slice = &src->slices[src_mip_level];
- const struct v3d_resource_slice *dst_slice = &dst->slices[dst_mip_level];
-
- assert(dst->mem && dst->mem->bo);
- const struct v3dv_bo *dst_bo = dst->mem->bo;
-
- assert(src->mem && src->mem->bo);
- const struct v3dv_bo *src_bo = src->mem->bo;
-
struct drm_v3d_submit_tfu tfu = {
.ios = (height << 16) | width,
.bo_handles = {
- dst_bo->handle,
- src_bo->handle != dst_bo->handle ? src_bo->handle : 0
+ dst_bo_handle,
+ src_bo_handle != dst_bo_handle ? src_bo_handle : 0
},
};
- const uint32_t src_offset =
- src_bo->offset + v3dv_layer_offset(src, src_mip_level, src_layer);
tfu.iia |= src_offset;
- uint32_t icfg;
- if (src_slice->tiling == V3D_TILING_RASTER) {
- icfg = V3D_TFU_ICFG_FORMAT_RASTER;
+#if V3D_VERSION <= 42
+ if (src_tiling == V3D_TILING_RASTER) {
+ tfu.icfg = V3D33_TFU_ICFG_FORMAT_RASTER << V3D33_TFU_ICFG_FORMAT_SHIFT;
+ } else {
+ tfu.icfg = (V3D33_TFU_ICFG_FORMAT_LINEARTILE +
+ (src_tiling - V3D_TILING_LINEARTILE)) <<
+ V3D33_TFU_ICFG_FORMAT_SHIFT;
+ }
+ tfu.icfg |= format_plane->tex_type << V3D33_TFU_ICFG_TTYPE_SHIFT;
+#endif
+#if V3D_VERSION >= 71
+ if (src_tiling == V3D_TILING_RASTER) {
+ tfu.icfg = V3D71_TFU_ICFG_FORMAT_RASTER << V3D71_TFU_ICFG_IFORMAT_SHIFT;
} else {
- icfg = V3D_TFU_ICFG_FORMAT_LINEARTILE +
- (src_slice->tiling - V3D_TILING_LINEARTILE);
+ tfu.icfg = (V3D71_TFU_ICFG_FORMAT_LINEARTILE +
+ (src_tiling - V3D_TILING_LINEARTILE)) <<
+ V3D71_TFU_ICFG_IFORMAT_SHIFT;
}
- tfu.icfg |= icfg << V3D_TFU_ICFG_FORMAT_SHIFT;
+ tfu.icfg |= format_plane->tex_type << V3D71_TFU_ICFG_OTYPE_SHIFT;
+#endif
- const uint32_t dst_offset =
- dst_bo->offset + v3dv_layer_offset(dst, dst_mip_level, dst_layer);
- tfu.ioa |= dst_offset;
+ tfu.ioa = dst_offset;
- tfu.ioa |= (V3D_TFU_IOA_FORMAT_LINEARTILE +
- (dst_slice->tiling - V3D_TILING_LINEARTILE)) <<
- V3D_TFU_IOA_FORMAT_SHIFT;
- tfu.icfg |= format->tex_type << V3D_TFU_ICFG_TTYPE_SHIFT;
+#if V3D_VERSION <= 42
+ tfu.ioa |= (V3D33_TFU_IOA_FORMAT_LINEARTILE +
+ (dst_tiling - V3D_TILING_LINEARTILE)) <<
+ V3D33_TFU_IOA_FORMAT_SHIFT;
+#endif
- switch (src_slice->tiling) {
+#if V3D_VERSION >= 71
+ tfu.v71.ioc = (V3D71_TFU_IOC_FORMAT_LINEARTILE +
+ (dst_tiling - V3D_TILING_LINEARTILE)) <<
+ V3D71_TFU_IOC_FORMAT_SHIFT;
+
+ switch (dst_tiling) {
case V3D_TILING_UIF_NO_XOR:
case V3D_TILING_UIF_XOR:
- tfu.iis |= src_slice->padded_height / (2 * v3d_utile_height(src->cpp));
+ tfu.v71.ioc |=
+ (dst_padded_height_or_stride / (2 * v3d_utile_height(dst_cpp))) <<
+ V3D71_TFU_IOC_STRIDE_SHIFT;
break;
case V3D_TILING_RASTER:
- tfu.iis |= src_slice->stride / src->cpp;
+ tfu.v71.ioc |= (dst_padded_height_or_stride / dst_cpp) <<
+ V3D71_TFU_IOC_STRIDE_SHIFT;
break;
default:
break;
}
+#endif
+ switch (src_tiling) {
+ case V3D_TILING_UIF_NO_XOR:
+ case V3D_TILING_UIF_XOR:
+ tfu.iis |= src_padded_height_or_stride / (2 * v3d_utile_height(src_cpp));
+ break;
+ case V3D_TILING_RASTER:
+ tfu.iis |= src_padded_height_or_stride / src_cpp;
+ break;
+ default:
+ break;
+ }
+
+ /* The TFU can handle raster sources but always produces UIF results */
+ assert(dst_tiling != V3D_TILING_RASTER);
+
+#if V3D_VERSION <= 42
/* If we're writing level 0 (!IOA_DIMTW), then we need to supply the
* OPAD field for the destination (how many extra UIF blocks beyond
* those necessary to cover the height).
*/
- if (dst_slice->tiling == V3D_TILING_UIF_NO_XOR ||
- dst_slice->tiling == V3D_TILING_UIF_XOR) {
- uint32_t uif_block_h = 2 * v3d_utile_height(dst->cpp);
+ if (dst_tiling == V3D_TILING_UIF_NO_XOR || dst_tiling == V3D_TILING_UIF_XOR) {
+ uint32_t uif_block_h = 2 * v3d_utile_height(dst_cpp);
uint32_t implicit_padded_height = align(height, uif_block_h);
- uint32_t icfg =
- (dst_slice->padded_height - implicit_padded_height) / uif_block_h;
- tfu.icfg |= icfg << V3D_TFU_ICFG_OPAD_SHIFT;
+ uint32_t icfg = (dst_padded_height_or_stride - implicit_padded_height) /
+ uif_block_h;
+ tfu.icfg |= icfg << V3D33_TFU_ICFG_OPAD_SHIFT;
}
+#endif
v3dv_cmd_buffer_add_tfu_job(cmd_buffer, &tfu);
}
@@ -1042,7 +1182,7 @@ emit_copy_buffer_to_layer_per_tile_list(struct v3dv_job *job,
struct v3dv_image *image,
struct v3dv_buffer *buffer,
uint32_t layer,
- const VkBufferImageCopy2KHR *region)
+ const VkBufferImageCopy2 *region)
{
struct v3dv_cl *cl = &job->indirect;
v3dv_cl_ensure_space(cl, 200, 1);
@@ -1072,8 +1212,9 @@ emit_copy_buffer_to_layer_per_tile_list(struct v3dv_job *job,
width = DIV_ROUND_UP(width, vk_format_get_blockwidth(image->vk.format));
height = DIV_ROUND_UP(height, vk_format_get_blockheight(image->vk.format));
+ uint8_t plane = v3dv_plane_from_aspect(imgrsc->aspectMask);
uint32_t cpp = imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT ?
- 1 : image->cpp;
+ 1 : image->planes[plane].cpp;
uint32_t buffer_stride = width * cpp;
uint32_t buffer_offset =
buffer->mem_offset + region->bufferOffset + height * buffer_stride * layer;
@@ -1081,6 +1222,9 @@ emit_copy_buffer_to_layer_per_tile_list(struct v3dv_job *job,
uint32_t format = choose_tlb_format(framebuffer, imgrsc->aspectMask,
false, false, true);
+ uint32_t image_layer = layer + (image->vk.image_type != VK_IMAGE_TYPE_3D ?
+ imgrsc->baseArrayLayer : region->imageOffset.z);
+
emit_linear_load(cl, RENDER_TARGET_0, buffer->mem->bo,
buffer_offset, buffer_stride, format);
@@ -1100,13 +1244,13 @@ emit_copy_buffer_to_layer_per_tile_list(struct v3dv_job *job,
if (imgrsc->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
emit_image_load(job->device, cl, framebuffer, image,
VK_IMAGE_ASPECT_STENCIL_BIT,
- imgrsc->baseArrayLayer + layer, imgrsc->mipLevel,
+ image_layer, imgrsc->mipLevel,
false, false);
} else {
assert(imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT);
emit_image_load(job->device, cl, framebuffer, image,
VK_IMAGE_ASPECT_DEPTH_BIT,
- imgrsc->baseArrayLayer + layer, imgrsc->mipLevel,
+ image_layer, imgrsc->mipLevel,
false, false);
}
}
@@ -1117,20 +1261,20 @@ emit_copy_buffer_to_layer_per_tile_list(struct v3dv_job *job,
/* Store TLB to image */
emit_image_store(job->device, cl, framebuffer, image, imgrsc->aspectMask,
- imgrsc->baseArrayLayer + layer, imgrsc->mipLevel,
+ image_layer, imgrsc->mipLevel,
false, true, false);
if (framebuffer->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
if (imgrsc->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
emit_image_store(job->device, cl, framebuffer, image,
VK_IMAGE_ASPECT_STENCIL_BIT,
- imgrsc->baseArrayLayer + layer, imgrsc->mipLevel,
+ image_layer, imgrsc->mipLevel,
false, false, false);
} else {
assert(imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT);
emit_image_store(job->device, cl, framebuffer, image,
VK_IMAGE_ASPECT_DEPTH_BIT,
- imgrsc->baseArrayLayer + layer, imgrsc->mipLevel,
+ image_layer, imgrsc->mipLevel,
false, false, false);
}
}
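A short illustration of the image_layer selection above: for array images the destination layer comes from the subresource, while for 3D images it comes from the copy offset.

/* 2D array image: baseArrayLayer = 3, iteration layer = 2  -> image_layer = 5 */
/* 3D image:       imageOffset.z  = 4, iteration layer = 2  -> image_layer = 6 */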
@@ -1151,7 +1295,7 @@ emit_copy_buffer_to_layer(struct v3dv_job *job,
struct v3dv_buffer *buffer,
struct v3dv_meta_framebuffer *framebuffer,
uint32_t layer,
- const VkBufferImageCopy2KHR *region)
+ const VkBufferImageCopy2 *region)
{
emit_copy_buffer_to_layer_per_tile_list(job, framebuffer, image, buffer,
layer, region);
@@ -1163,7 +1307,7 @@ v3dX(meta_emit_copy_buffer_to_image_rcl)(struct v3dv_job *job,
struct v3dv_image *image,
struct v3dv_buffer *buffer,
struct v3dv_meta_framebuffer *framebuffer,
- const VkBufferImageCopy2KHR *region)
+ const VkBufferImageCopy2 *region)
{
struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL);
v3dv_return_if_oom(NULL, job);
@@ -1175,8 +1319,8 @@ v3dX(meta_emit_copy_buffer_to_image_rcl)(struct v3dv_job *job,
}
/* Figure out a TLB size configuration for a number of pixels to process.
- * Beware that we can't "render" more than 4096x4096 pixels in a single job,
- * if the pixel count is larger than this, the caller might need to split
+ * Beware that we can't "render" more than MAX_DIM x MAX_DIM pixels in a
+ * single job; if the pixel count is larger than that, the caller might need to split
* the job and call this function multiple times.
*/
static void
@@ -1186,7 +1330,7 @@ framebuffer_size_for_pixel_count(uint32_t num_pixels,
{
assert(num_pixels > 0);
- const uint32_t max_dim_pixels = 4096;
+ const uint32_t max_dim_pixels = V3D_MAX_IMAGE_DIMENSION;
const uint32_t max_pixels = max_dim_pixels * max_dim_pixels;
uint32_t w, h;
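A minimal sketch of one way the (unchanged) remainder of this helper could pick a width/height pair under the new limit; this is an assumption for illustration, not necessarily the driver's exact heuristic:

if (num_pixels > max_pixels) {
   w = h = max_dim_pixels;               /* caller must split the job */
} else {
   w = MIN2(num_pixels, max_dim_pixels);
   h = DIV_ROUND_UP(num_pixels, w);
}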
@@ -1215,7 +1359,7 @@ v3dX(meta_copy_buffer)(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t dst_offset,
struct v3dv_bo *src,
uint32_t src_offset,
- const VkBufferCopy2KHR *region)
+ const VkBufferCopy2 *region)
{
const uint32_t internal_bpp = V3D_INTERNAL_BPP_32;
const uint32_t internal_type = V3D_INTERNAL_TYPE_8UI;
@@ -1264,7 +1408,9 @@ v3dX(meta_copy_buffer)(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t width, height;
framebuffer_size_for_pixel_count(num_items, &width, &height);
- v3dv_job_start_frame(job, width, height, 1, true, 1, internal_bpp, false);
+ v3dv_job_start_frame(job, width, height, 1, true, true, 1,
+ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
+ false);
struct v3dv_meta_framebuffer framebuffer;
v3dX(meta_framebuffer_init)(&framebuffer, vk_format, internal_type,
@@ -1310,7 +1456,9 @@ v3dX(meta_fill_buffer)(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t width, height;
framebuffer_size_for_pixel_count(num_items, &width, &height);
- v3dv_job_start_frame(job, width, height, 1, true, 1, internal_bpp, false);
+ v3dv_job_start_frame(job, width, height, 1, true, true, 1,
+ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
+ false);
struct v3dv_meta_framebuffer framebuffer;
v3dX(meta_framebuffer_init)(&framebuffer, VK_FORMAT_R8G8B8A8_UINT,
diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c
index 8623a453701..616a7730cd4 100644
--- a/src/broadcom/vulkan/v3dvx_pipeline.c
+++ b/src/broadcom/vulkan/v3dvx_pipeline.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2021 Raspberry Pi
+ * Copyright © 2021 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -26,8 +26,6 @@
#include "broadcom/cle/v3dx_pack.h"
#include "broadcom/compiler/v3d_compiler.h"
-#include "vk_format_info.h"
-
static uint8_t
blend_factor(VkBlendFactor factor, bool dst_alpha_one, bool *needs_constants)
{
@@ -58,15 +56,10 @@ blend_factor(VkBlendFactor factor, bool dst_alpha_one, bool *needs_constants)
case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR:
case VK_BLEND_FACTOR_SRC1_ALPHA:
case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA:
- assert(!"Invalid blend factor: dual source blending not supported.");
+ unreachable("Invalid blend factor: dual source blending not supported.");
default:
- assert(!"Unknown blend factor.");
+ unreachable("Unknown blend factor.");
}
-
- /* Should be handled by the switch, added to avoid a "end of non-void
- * function" error
- */
- unreachable("Unknown blend factor.");
}
static void
@@ -86,21 +79,19 @@ pack_blend(struct v3dv_pipeline *pipeline,
if (!cb_info)
return;
- assert(pipeline->subpass);
- if (pipeline->subpass->color_count == 0)
+ const struct vk_render_pass_state *ri = &pipeline->rendering_info;
+ if (ri->color_attachment_count == 0)
return;
- assert(pipeline->subpass->color_count == cb_info->attachmentCount);
-
+ assert(ri->color_attachment_count == cb_info->attachmentCount);
pipeline->blend.needs_color_constants = false;
uint32_t color_write_masks = 0;
- for (uint32_t i = 0; i < pipeline->subpass->color_count; i++) {
+ for (uint32_t i = 0; i < ri->color_attachment_count; i++) {
const VkPipelineColorBlendAttachmentState *b_state =
&cb_info->pAttachments[i];
- uint32_t attachment_idx =
- pipeline->subpass->color_attachments[i].attachment;
- if (attachment_idx == VK_ATTACHMENT_UNUSED)
+ const VkFormat vk_format = ri->color_attachment_formats[i];
+ if (vk_format == VK_FORMAT_UNDEFINED)
continue;
color_write_masks |= (~b_state->colorWriteMask & 0xf) << (4 * i);
@@ -108,10 +99,13 @@ pack_blend(struct v3dv_pipeline *pipeline,
if (!b_state->blendEnable)
continue;
- VkAttachmentDescription *desc =
- &pipeline->pass->attachments[attachment_idx].desc;
- const struct v3dv_format *format = v3dX(get_format)(desc->format);
- bool dst_alpha_one = (format->swizzle[3] == PIPE_SWIZZLE_1);
+ const struct v3dv_format *format = v3dX(get_format)(vk_format);
+
+ /* We only do blending with render pass attachments, so we should not have
+ * multi-plane images here.
+ */
+ assert(format->plane_count == 1);
+ bool dst_alpha_one = (format->planes[0].swizzle[3] == PIPE_SWIZZLE_1);
uint8_t rt_mask = 1 << i;
pipeline->blend.enables |= rt_mask;
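An illustrative example of the write-mask packing above (values made up): Vulkan write masks enable channels, while the packed field stores the disabled channels, four bits per render target.

/* RT1 with colorWriteMask = R | B (0x5): disabled channels = ~0x5 & 0xf = 0xa
 * (G and A), shifted into the RT1 nibble: 0xa << 4 = 0xa0.
 */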
@@ -148,6 +142,7 @@ pack_cfg_bits(struct v3dv_pipeline *pipeline,
const VkPipelineDepthStencilStateCreateInfo *ds_info,
const VkPipelineRasterizationStateCreateInfo *rs_info,
const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *pv_info,
+ const VkPipelineRasterizationLineStateCreateInfoEXT *ls_info,
const VkPipelineMultisampleStateCreateInfo *ms_info)
{
assert(sizeof(pipeline->cfg_bits) == cl_packet_length(CFG_BITS));
@@ -156,23 +151,21 @@ pack_cfg_bits(struct v3dv_pipeline *pipeline,
ms_info && ms_info->rasterizationSamples > VK_SAMPLE_COUNT_1_BIT;
v3dvx_pack(pipeline->cfg_bits, CFG_BITS, config) {
- config.enable_forward_facing_primitive =
- rs_info ? !(rs_info->cullMode & VK_CULL_MODE_FRONT_BIT) : false;
-
- config.enable_reverse_facing_primitive =
- rs_info ? !(rs_info->cullMode & VK_CULL_MODE_BACK_BIT) : false;
-
- /* Seems like the hardware is backwards regarding this setting... */
- config.clockwise_primitives =
- rs_info ? rs_info->frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE : false;
-
- config.enable_depth_offset = rs_info ? rs_info->depthBiasEnable: false;
+ /* Even if rs_info->depthBiasEnable is true, we can decide not to
+ * enable it, e.g. if there isn't a depth/stencil attachment bound to
+ * the pipeline.
+ */
+ config.enable_depth_offset = pipeline->depth_bias.enabled;
/* This is required to pass line rasterization tests in CTS while
* exposing, at least, a minimum of 4-bits of subpixel precision
* (the minimum requirement).
*/
- config.line_rasterization = 1; /* perp end caps */
+ if (ls_info &&
+ ls_info->lineRasterizationMode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT)
+ config.line_rasterization = V3D_LINE_RASTERIZATION_DIAMOND_EXIT;
+ else
+ config.line_rasterization = V3D_LINE_RASTERIZATION_PERP_END_CAPS;
if (rs_info && rs_info->polygonMode != VK_POLYGON_MODE_FILL) {
config.direct3d_wireframe_triangles_mode = true;
@@ -180,7 +173,10 @@ pack_cfg_bits(struct v3dv_pipeline *pipeline,
rs_info->polygonMode == VK_POLYGON_MODE_POINT;
}
- config.rasterizer_oversample_mode = pipeline->msaa ? 1 : 0;
+ /* Diamond-exit rasterization does not support oversampling. */
+ config.rasterizer_oversample_mode =
+ (config.line_rasterization == V3D_LINE_RASTERIZATION_PERP_END_CAPS &&
+ pipeline->msaa) ? 1 : 0;
/* From the Vulkan spec:
*
@@ -203,30 +199,42 @@ pack_cfg_bits(struct v3dv_pipeline *pipeline,
config.blend_enable = pipeline->blend.enables != 0;
- /* Disable depth/stencil if we don't have a D/S attachment */
- bool has_ds_attachment =
- pipeline->subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED;
-
- if (ds_info && ds_info->depthTestEnable && has_ds_attachment) {
- config.z_updates_enable = ds_info->depthWriteEnable;
- config.depth_test_function = ds_info->depthCompareOp;
+#if V3D_VERSION >= 71
+ /* From the Vulkan spec:
+ *
+ * "depthClampEnable controls whether to clamp the fragment’s depth
+ * values as described in Depth Test. If the pipeline is not created
+ * with VkPipelineRasterizationDepthClipStateCreateInfoEXT present
+ * then enabling depth clamp will also disable clipping primitives to
+ * the z planes of the frustum as described in Primitive Clipping.
+ * Otherwise depth clipping is controlled by the state set in
+ * VkPipelineRasterizationDepthClipStateCreateInfoEXT."
+ */
+ bool z_clamp_enable = rs_info && rs_info->depthClampEnable;
+ bool z_clip_enable = false;
+ const VkPipelineRasterizationDepthClipStateCreateInfoEXT *clip_info =
+ rs_info ? vk_find_struct_const(rs_info->pNext,
+ PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT) :
+ NULL;
+ if (clip_info)
+ z_clip_enable = clip_info->depthClipEnable;
+ else if (!z_clamp_enable)
+ z_clip_enable = true;
+
+ if (z_clip_enable) {
+ config.z_clipping_mode = pipeline->negative_one_to_one ?
+ V3D_Z_CLIP_MODE_MIN_ONE_TO_ONE : V3D_Z_CLIP_MODE_ZERO_TO_ONE;
} else {
- config.depth_test_function = VK_COMPARE_OP_ALWAYS;
+ config.z_clipping_mode = V3D_Z_CLIP_MODE_NONE;
}
- /* EZ state will be updated at draw time based on bound pipeline state */
- config.early_z_updates_enable = false;
- config.early_z_enable = false;
-
- config.stencil_enable =
- ds_info ? ds_info->stencilTestEnable && has_ds_attachment: false;
-
- pipeline->z_updates_enable = config.z_updates_enable;
+ config.z_clamp_mode = z_clamp_enable;
+#endif
};
}
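For reference, a small summary of how the new V3D 7.1 clip/clamp state is derived by the code above:

/* depthClampEnable   depth-clip struct          z_clip_enable   z_clamp_mode
 * false              absent                     true            false
 * true               absent                     false           true
 * either             present (clipEnable = X)   X               depthClampEnable
 */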
-static uint32_t
-translate_stencil_op(enum pipe_stencil_op op)
+uint32_t
+v3dX(translate_stencil_op)(VkStencilOp op)
{
switch (op) {
case VK_STENCIL_OP_KEEP:
@@ -255,7 +263,8 @@ pack_single_stencil_cfg(struct v3dv_pipeline *pipeline,
uint8_t *stencil_cfg,
bool is_front,
bool is_back,
- const VkStencilOpState *stencil_state)
+ const VkStencilOpState *stencil_state,
+ const struct vk_graphics_pipeline_state *state)
{
/* From the Vulkan spec:
*
@@ -267,60 +276,54 @@ pack_single_stencil_cfg(struct v3dv_pipeline *pipeline,
*
* In our case, 's' is always 8, so we clamp to that to prevent our packing
* functions to assert in debug mode if they see larger values.
- *
- * If we have dynamic state we need to make sure we set the corresponding
- * state bits to 0, since cl_emit_with_prepacked ORs the new value with
- * the old.
*/
- const uint8_t write_mask =
- pipeline->dynamic_state.mask & V3DV_DYNAMIC_STENCIL_WRITE_MASK ?
- 0 : stencil_state->writeMask & 0xff;
-
- const uint8_t compare_mask =
- pipeline->dynamic_state.mask & V3DV_DYNAMIC_STENCIL_COMPARE_MASK ?
- 0 : stencil_state->compareMask & 0xff;
-
- const uint8_t reference =
- pipeline->dynamic_state.mask & V3DV_DYNAMIC_STENCIL_COMPARE_MASK ?
- 0 : stencil_state->reference & 0xff;
-
v3dvx_pack(stencil_cfg, STENCIL_CFG, config) {
config.front_config = is_front;
config.back_config = is_back;
- config.stencil_write_mask = write_mask;
- config.stencil_test_mask = compare_mask;
+ config.stencil_write_mask = stencil_state->writeMask & 0xff;
+ config.stencil_test_mask = stencil_state->compareMask & 0xff;
config.stencil_test_function = stencil_state->compareOp;
- config.stencil_pass_op = translate_stencil_op(stencil_state->passOp);
- config.depth_test_fail_op = translate_stencil_op(stencil_state->depthFailOp);
- config.stencil_test_fail_op = translate_stencil_op(stencil_state->failOp);
- config.stencil_ref_value = reference;
+ config.stencil_pass_op =
+ v3dX(translate_stencil_op)(stencil_state->passOp);
+ config.depth_test_fail_op =
+ v3dX(translate_stencil_op)(stencil_state->depthFailOp);
+ config.stencil_test_fail_op =
+ v3dX(translate_stencil_op)(stencil_state->failOp);
+ config.stencil_ref_value = stencil_state->reference & 0xff;
}
}
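A trivial illustration of the 8-bit clamp described above (values made up):

/* stencil_state->writeMask = 0x1ff  ->  stencil_write_mask = 0x1ff & 0xff = 0xff */
/* stencil_state->reference = 0x102  ->  stencil_ref_value  = 0x102 & 0xff = 0x02 */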
static void
pack_stencil_cfg(struct v3dv_pipeline *pipeline,
- const VkPipelineDepthStencilStateCreateInfo *ds_info)
+ const VkPipelineDepthStencilStateCreateInfo *ds_info,
+ const struct vk_graphics_pipeline_state *state)
{
assert(sizeof(pipeline->stencil_cfg) == 2 * cl_packet_length(STENCIL_CFG));
- if (!ds_info || !ds_info->stencilTestEnable)
+ if ((!ds_info || !ds_info->stencilTestEnable) &&
+ (!BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE))) {
return;
+ }
- if (pipeline->subpass->ds_attachment.attachment == VK_ATTACHMENT_UNUSED)
+ const struct vk_render_pass_state *ri = &pipeline->rendering_info;
+ if (ri->stencil_attachment_format == VK_FORMAT_UNDEFINED)
return;
- const uint32_t dynamic_stencil_states = V3DV_DYNAMIC_STENCIL_COMPARE_MASK |
- V3DV_DYNAMIC_STENCIL_WRITE_MASK |
- V3DV_DYNAMIC_STENCIL_REFERENCE;
-
+ const bool any_dynamic_stencil_states =
+ BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) ||
+ BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) ||
+ BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) ||
+ BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE) ||
+ BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_DS_STENCIL_OP);
/* If front != back or we have dynamic stencil state we can't emit a single
* packet for both faces.
*/
bool needs_front_and_back = false;
- if ((pipeline->dynamic_state.mask & dynamic_stencil_states) ||
- memcmp(&ds_info->front, &ds_info->back, sizeof(ds_info->front)))
+ if ((any_dynamic_stencil_states) ||
+ memcmp(&ds_info->front, &ds_info->back, sizeof(ds_info->front))) {
needs_front_and_back = true;
+ }
/* If the front and back configurations are the same we can emit both with
* a single packet.
@@ -328,33 +331,41 @@ pack_stencil_cfg(struct v3dv_pipeline *pipeline,
pipeline->emit_stencil_cfg[0] = true;
if (!needs_front_and_back) {
pack_single_stencil_cfg(pipeline, pipeline->stencil_cfg[0],
- true, true, &ds_info->front);
+ true, true, &ds_info->front, state);
} else {
pipeline->emit_stencil_cfg[1] = true;
pack_single_stencil_cfg(pipeline, pipeline->stencil_cfg[0],
- true, false, &ds_info->front);
+ true, false, &ds_info->front, state);
pack_single_stencil_cfg(pipeline, pipeline->stencil_cfg[1],
- false, true, &ds_info->back);
+ false, true, &ds_info->back, state);
}
}
+
+/* FIXME: Now that we are passing the vk_graphics_pipeline_state we could
+ * avoid passing all those parameters. But doing that would require changing
+ * all the code that uses the VkXXX structures to use the equivalent vk_xxx
+ * ones instead.
+ */
void
v3dX(pipeline_pack_state)(struct v3dv_pipeline *pipeline,
const VkPipelineColorBlendStateCreateInfo *cb_info,
const VkPipelineDepthStencilStateCreateInfo *ds_info,
const VkPipelineRasterizationStateCreateInfo *rs_info,
const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *pv_info,
- const VkPipelineMultisampleStateCreateInfo *ms_info)
+ const VkPipelineRasterizationLineStateCreateInfoEXT *ls_info,
+ const VkPipelineMultisampleStateCreateInfo *ms_info,
+ const struct vk_graphics_pipeline_state *state)
{
pack_blend(pipeline, cb_info);
- pack_cfg_bits(pipeline, ds_info, rs_info, pv_info, ms_info);
- pack_stencil_cfg(pipeline, ds_info);
+ pack_cfg_bits(pipeline, ds_info, rs_info, pv_info, ls_info, ms_info);
+ pack_stencil_cfg(pipeline, ds_info, state);
}
static void
pack_shader_state_record(struct v3dv_pipeline *pipeline)
{
- assert(sizeof(pipeline->shader_state_record) ==
+ assert(sizeof(pipeline->shader_state_record) >=
cl_packet_length(GL_SHADER_STATE_RECORD));
struct v3d_fs_prog_data *prog_data_fs =
@@ -378,7 +389,7 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline)
if (!pipeline->has_gs) {
shader.point_size_in_shaded_vertex_data =
- pipeline->topology == PIPE_PRIM_POINTS;
+ pipeline->topology == MESA_PRIM_POINTS;
} else {
struct v3d_gs_prog_data *prog_data_gs =
pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY]->prog_data.gs;
@@ -390,6 +401,7 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline)
* shader needs to write the Z value (even just discards).
*/
shader.fragment_shader_does_z_writes = prog_data_fs->writes_z;
+
/* Set if the EZ test must be disabled (due to shader side
* effects and the early_z flag not being present in the
* shader).
@@ -428,15 +440,16 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline)
shader.number_of_varyings_in_fragment_shader =
prog_data_fs->num_inputs;
- shader.coordinate_shader_propagate_nans = true;
- shader.vertex_shader_propagate_nans = true;
- shader.fragment_shader_propagate_nans = true;
-
- /* Note: see previous note about adresses */
+ /* Note: see previous note about addresses */
/* shader.coordinate_shader_code_address */
/* shader.vertex_shader_code_address */
/* shader.fragment_shader_code_address */
+#if V3D_VERSION == 42
+ shader.coordinate_shader_propagate_nans = true;
+ shader.vertex_shader_propagate_nans = true;
+ shader.fragment_shader_propagate_nans = true;
+
/* FIXME: Use combined input/output size flag in the common case (also
* on v3d, see v3dx_draw).
*/
@@ -444,20 +457,32 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline)
prog_data_vs_bin->separate_segments;
shader.vertex_shader_has_separate_input_and_output_vpm_blocks =
prog_data_vs->separate_segments;
-
shader.coordinate_shader_input_vpm_segment_size =
prog_data_vs_bin->separate_segments ?
prog_data_vs_bin->vpm_input_size : 1;
shader.vertex_shader_input_vpm_segment_size =
prog_data_vs->separate_segments ?
prog_data_vs->vpm_input_size : 1;
+#endif
+
+ /* On V3D 7.1 there isn't a specific flag to select whether we are using
+ * shared or separate segments. We just set vpm_input_size to 0 and the
+ * output size to the maximum needed; that should already be set properly
+ * on prog_data_vs_bin.
+ */
+#if V3D_VERSION == 71
+ shader.coordinate_shader_input_vpm_segment_size =
+ prog_data_vs_bin->vpm_input_size;
+ shader.vertex_shader_input_vpm_segment_size =
+ prog_data_vs->vpm_input_size;
+#endif
shader.coordinate_shader_output_vpm_segment_size =
prog_data_vs_bin->vpm_output_size;
shader.vertex_shader_output_vpm_segment_size =
prog_data_vs->vpm_output_size;
- /* Note: see previous note about adresses */
+ /* Note: see previous note about addresses */
/* shader.coordinate_shader_uniforms_address */
/* shader.vertex_shader_uniforms_address */
/* shader.fragment_shader_uniforms_address */
@@ -499,7 +524,7 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline)
shader.instance_id_read_by_vertex_shader =
prog_data_vs->uses_iid;
- /* Note: see previous note about adresses */
+ /* Note: see previous note about addresses */
/* shader.address_of_default_attribute_values */
}
}
@@ -592,7 +617,6 @@ pack_shader_state_attribute_record(struct v3dv_pipeline *pipeline,
attr.instance_divisor = MIN2(pipeline->vb[binding].instance_divisor,
0xffff);
- attr.stride = pipeline->vb[binding].stride;
attr.type = get_attr_type(desc);
}
}
@@ -652,3 +676,76 @@ v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline,
}
}
}
+
+#if V3D_VERSION == 42
+static bool
+pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline)
+{
+ for (uint8_t i = 0; i < pipeline->va_count; i++) {
+ if (vk_format_is_int(pipeline->va[i].vk_format))
+ return true;
+ }
+ return false;
+}
+#endif
+
+bool
+v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline)
+{
+#if V3D_VERSION == 42
+ return pipeline_has_integer_vertex_attrib(pipeline);
+#endif
+
+ return false;
+}
+
+/* @pipeline can be NULL. In that case we assume the most common case: for
+ * v42 that means assuming all attributes have a float format (we only create
+ * an all-float BO once and reuse it for every such pipeline). Otherwise we
+ * look at the actual type of each attribute used with the specific pipeline
+ * passed in.
+ */
+struct v3dv_bo *
+v3dX(create_default_attribute_values)(struct v3dv_device *device,
+ struct v3dv_pipeline *pipeline)
+{
+#if V3D_VERSION >= 71
+ return NULL;
+#endif
+
+ uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4;
+ struct v3dv_bo *bo;
+
+ bo = v3dv_bo_alloc(device, size, "default_vi_attributes", true);
+
+ if (!bo) {
+ fprintf(stderr, "failed to allocate memory for the default "
+ "attribute values\n");
+ return NULL;
+ }
+
+ bool ok = v3dv_bo_map(device, bo, size);
+ if (!ok) {
+ fprintf(stderr, "failed to map default attribute values buffer\n");
+ return NULL;
+ }
+
+ uint32_t *attrs = bo->map;
+ uint8_t va_count = pipeline != NULL ? pipeline->va_count : 0;
+ for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) {
+ attrs[i * 4 + 0] = 0;
+ attrs[i * 4 + 1] = 0;
+ attrs[i * 4 + 2] = 0;
+ VkFormat attr_format =
+ pipeline != NULL ? pipeline->va[i].vk_format : VK_FORMAT_UNDEFINED;
+ if (i < va_count && vk_format_is_int(attr_format)) {
+ attrs[i * 4 + 3] = 1;
+ } else {
+ attrs[i * 4 + 3] = fui(1.0);
+ }
+ }
+
+ v3dv_bo_unmap(device, bo);
+
+ return bo;
+}
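A note on the default values written above: fui() (from Mesa's util/u_math.h) reinterprets a float's bits as a uint32_t, so only the last component differs between integer and float attributes.

/* Integer attribute default: (0, 0, 0, 1)    -> {0, 0, 0, 0x00000001} */
/* Float attribute default:   (0, 0, 0, 1.0f) -> {0, 0, 0, 0x3f800000} (fui(1.0)) */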
diff --git a/src/broadcom/vulkan/v3dvx_private.h b/src/broadcom/vulkan/v3dvx_private.h
index ab134225a3a..68df5db74ad 100644
--- a/src/broadcom/vulkan/v3dvx_private.h
+++ b/src/broadcom/vulkan/v3dvx_private.h
@@ -1,5 +1,5 @@
/*
- * Copyright © 2021 Raspberry Pi
+ * Copyright © 2021 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -55,6 +55,9 @@ void
v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer);
void
+v3dX(cmd_buffer_emit_depth_bounds)(struct v3dv_cmd_buffer *cmd_buffer);
+
+void
v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer);
void
@@ -75,6 +78,14 @@ v3dX(job_emit_binning_prolog)(struct v3dv_job *job,
uint32_t layers);
void
+v3dX(job_emit_enable_double_buffer)(struct v3dv_job *job);
+
+void
+v3dX(job_patch_resume_address)(struct v3dv_job *first_suspend,
+ struct v3dv_job *suspend,
+ struct v3dv_job *resume);
+
+void
v3dX(cmd_buffer_execute_inside_pass)(struct v3dv_cmd_buffer *primary,
uint32_t cmd_buffer_count,
const VkCommandBuffer *cmd_buffers);
@@ -117,31 +128,34 @@ v3dX(cmd_buffer_emit_indexed_indirect)(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t stride);
void
+v3dX(cmd_buffer_suspend)(struct v3dv_cmd_buffer *cmd_buffer);
+
+struct v3dv_job *
+v3dX(cmd_buffer_prepare_suspend_job_for_submit)(struct v3dv_job *job);
+
+void
v3dX(get_hw_clear_color)(const VkClearColorValue *color,
uint32_t internal_type,
uint32_t internal_size,
uint32_t *hw_color);
-void
-v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer,
- int rt,
- uint32_t *rt_bpp,
- uint32_t *rt_type,
- uint32_t *rt_clamp);
-
/* Used at v3dv_device */
void
-v3dX(pack_sampler_state)(struct v3dv_sampler *sampler,
+v3dX(pack_sampler_state)(const struct v3dv_device *device,
+ struct v3dv_sampler *sampler,
const VkSamplerCreateInfo *pCreateInfo,
const VkSamplerCustomBorderColorCreateInfoEXT *bc_info);
void
v3dX(framebuffer_compute_internal_bpp_msaa)(const struct v3dv_framebuffer *framebuffer,
+ const struct v3dv_cmd_buffer_attachment_state *attachments,
const struct v3dv_subpass *subpass,
- uint8_t *max_bpp, bool *msaa);
+ uint8_t *max_internal_bpp,
+ uint8_t *total_color_bpp,
+ bool *msaa);
-#ifdef DEBUG
+#if MESA_DEBUG
void
v3dX(device_check_prepacked_sizes)(void);
#endif
@@ -161,6 +175,10 @@ v3dX(format_supports_tlb_resolve)(const struct v3dv_format *format);
bool
v3dX(format_supports_blending)(const struct v3dv_format *format);
+/* FIXME: tex_format should be `enum V3DX(Texture_Data_Formats)`, but using
+ * that enum type in the header requires including v3dx_pack.h, which triggers
+ * circular include dependencies issues, so we're using a `uint32_t` for now.
+ */
bool
v3dX(tfu_supports_tex_format)(uint32_t tex_format);
@@ -189,14 +207,14 @@ v3dX(meta_emit_copy_image_to_buffer_rcl)(struct v3dv_job *job,
struct v3dv_buffer *buffer,
struct v3dv_image *image,
struct v3dv_meta_framebuffer *framebuffer,
- const VkBufferImageCopy2KHR *region);
+ const VkBufferImageCopy2 *region);
void
v3dX(meta_emit_resolve_image_rcl)(struct v3dv_job *job,
struct v3dv_image *dst,
struct v3dv_image *src,
struct v3dv_meta_framebuffer *framebuffer,
- const VkImageResolve2KHR *region);
+ const VkImageResolve2 *region);
void
v3dX(meta_emit_copy_buffer)(struct v3dv_job *job,
@@ -223,19 +241,23 @@ v3dX(meta_emit_copy_image_rcl)(struct v3dv_job *job,
struct v3dv_image *dst,
struct v3dv_image *src,
struct v3dv_meta_framebuffer *framebuffer,
- const VkImageCopy2KHR *region);
+ const VkImageCopy2 *region);
void
v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer,
- struct v3dv_image *dst,
- uint32_t dst_mip_level,
- uint32_t dst_layer,
- struct v3dv_image *src,
- uint32_t src_mip_level,
- uint32_t src_layer,
+ uint32_t dst_bo_handle,
+ uint32_t dst_offset,
+ enum v3d_tiling_mode dst_tiling,
+ uint32_t dst_padded_height_or_stride,
+ uint32_t dst_cpp,
+ uint32_t src_bo_handle,
+ uint32_t src_offset,
+ enum v3d_tiling_mode src_tiling,
+ uint32_t src_padded_height_or_stride,
+ uint32_t src_cpp,
uint32_t width,
uint32_t height,
- const struct v3dv_format *format);
+ const struct v3dv_format_plane *format_plane);
void
v3dX(meta_emit_clear_image_rcl)(struct v3dv_job *job,
@@ -259,7 +281,7 @@ v3dX(meta_emit_copy_buffer_to_image_rcl)(struct v3dv_job *job,
struct v3dv_image *image,
struct v3dv_buffer *buffer,
struct v3dv_meta_framebuffer *framebuffer,
- const VkBufferImageCopy2KHR *region);
+ const VkBufferImageCopy2 *region);
void
v3dX(get_internal_type_bpp_for_image_aspects)(VkFormat vk_format,
@@ -273,7 +295,7 @@ v3dX(meta_copy_buffer)(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t dst_offset,
struct v3dv_bo *src,
uint32_t src_offset,
- const VkBufferCopy2KHR *region);
+ const VkBufferCopy2 *region);
void
v3dX(meta_fill_buffer)(struct v3dv_cmd_buffer *cmd_buffer,
@@ -295,20 +317,57 @@ v3dX(pipeline_pack_state)(struct v3dv_pipeline *pipeline,
const VkPipelineDepthStencilStateCreateInfo *ds_info,
const VkPipelineRasterizationStateCreateInfo *rs_info,
const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *pv_info,
- const VkPipelineMultisampleStateCreateInfo *ms_info);
+ const VkPipelineRasterizationLineStateCreateInfoEXT *ls_info,
+ const VkPipelineMultisampleStateCreateInfo *ms_info,
+ const struct vk_graphics_pipeline_state *state);
void
v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline,
const VkPipelineVertexInputStateCreateInfo *vi_info,
const VkPipelineVertexInputDivisorStateCreateInfoEXT *vd_info);
+
+bool
+v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline);
+
+struct v3dv_bo *
+v3dX(create_default_attribute_values)(struct v3dv_device *device,
+ struct v3dv_pipeline *pipeline);
+
/* Used at v3dv_queue */
void
v3dX(job_emit_noop)(struct v3dv_job *job);
+/* Used at v3dv_query */
+VkResult
+v3dX(enumerate_performance_query_counters)(uint32_t *pCounterCount,
+ VkPerformanceCounterKHR *pCounters,
+ VkPerformanceCounterDescriptionKHR *pCounterDescriptions);
+
/* Used at v3dv_descriptor_set, and other descriptor set utils */
uint32_t v3dX(descriptor_bo_size)(VkDescriptorType type);
uint32_t v3dX(max_descriptor_bo_size)(void);
-uint32_t v3dX(combined_image_sampler_texture_state_offset)(void);
+uint32_t v3dX(combined_image_sampler_texture_state_offset)(uint8_t plane);
+
+uint32_t v3dX(combined_image_sampler_sampler_state_offset)(uint8_t plane);
+
+/* General utils */
+
+uint32_t
+v3dX(clamp_for_format_and_type)(uint32_t rt_type,
+ VkFormat vk_format);
+
+#define V3D42_CLIPPER_XY_GRANULARITY 256.0f
+#define V3D71_CLIPPER_XY_GRANULARITY 64.0f
+
-uint32_t v3dX(combined_image_sampler_sampler_state_offset)(void);
+void
+v3dX(viewport_compute_xform)(const VkViewport *viewport,
+ float scale[3],
+ float translate[3]);
+
+uint32_t
+v3dX(translate_stencil_op)(VkStencilOp op);
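As a rough sketch (an assumption for illustration, not necessarily v3dv's exact implementation), v3dX(viewport_compute_xform) presumably follows the conventional Vulkan viewport transform:

scale[0] = viewport->width * 0.5f;
scale[1] = viewport->height * 0.5f;
scale[2] = viewport->maxDepth - viewport->minDepth;
translate[0] = viewport->x + viewport->width * 0.5f;
translate[1] = viewport->y + viewport->height * 0.5f;
translate[2] = viewport->minDepth;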
diff --git a/src/broadcom/vulkan/v3dvx_query.c b/src/broadcom/vulkan/v3dvx_query.c
new file mode 100644
index 00000000000..e59a1e84ff6
--- /dev/null
+++ b/src/broadcom/vulkan/v3dvx_query.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright © 2023 Raspberry Pi Ltd
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "v3dv_private.h"
+
+#include "common/v3d_performance_counters.h"
+
+VkResult
+v3dX(enumerate_performance_query_counters)(uint32_t *pCounterCount,
+ VkPerformanceCounterKHR *pCounters,
+ VkPerformanceCounterDescriptionKHR *pCounterDescriptions)
+{
+ uint32_t desc_count = *pCounterCount;
+
+ VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR,
+ out, pCounters, pCounterCount);
+ VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR,
+ out_desc, pCounterDescriptions, &desc_count);
+
+ for (int i = 0; i < ARRAY_SIZE(v3d_performance_counters); i++) {
+ vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
+ counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR;
+ counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR;
+ counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR;
+
+ unsigned char sha1_result[20];
+ _mesa_sha1_compute(v3d_performance_counters[i][V3D_PERFCNT_NAME],
+ strlen(v3d_performance_counters[i][V3D_PERFCNT_NAME]),
+ sha1_result);
+
+ memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
+ }
+
+ vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR,
+ &out_desc, desc) {
+ desc->flags = 0;
+ snprintf(desc->name, sizeof(desc->name), "%s",
+ v3d_performance_counters[i][V3D_PERFCNT_NAME]);
+ snprintf(desc->category, sizeof(desc->category), "%s",
+ v3d_performance_counters[i][V3D_PERFCNT_CATEGORY]);
+ snprintf(desc->description, sizeof(desc->description), "%s",
+ v3d_performance_counters[i][V3D_PERFCNT_DESCRIPTION]);
+ }
+ }
+
+ return vk_outarray_status(&out);
+}
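This new entry point backs counter enumeration for VK_KHR_performance_query. A hedged usage sketch from the application side, using the standard two-call pattern (error checking omitted, variable names illustrative):

uint32_t count = 0;
vkEnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
   physical_device, 0 /* queueFamilyIndex */, &count, NULL, NULL);

VkPerformanceCounterKHR *counters = calloc(count, sizeof(*counters));
VkPerformanceCounterDescriptionKHR *descs = calloc(count, sizeof(*descs));
for (uint32_t i = 0; i < count; i++) {
   counters[i].sType = VK_STRUCTURE_TYPE_PERFORMANCE_COUNTER_KHR;
   descs[i].sType = VK_STRUCTURE_TYPE_PERFORMANCE_COUNTER_DESCRIPTION_KHR;
}
vkEnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
   physical_device, 0, &count, counters, descs);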
diff --git a/src/broadcom/vulkan/v3dvx_queue.c b/src/broadcom/vulkan/v3dvx_queue.c
index 38f9efbfa5d..6eed2de9d54 100644
--- a/src/broadcom/vulkan/v3dvx_queue.c
+++ b/src/broadcom/vulkan/v3dvx_queue.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2021 Raspberry Pi
+ * Copyright © 2021 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -29,7 +29,8 @@
void
v3dX(job_emit_noop)(struct v3dv_job *job)
{
- v3dv_job_start_frame(job, 1, 1, 1, true, 1, V3D_INTERNAL_BPP_32, false);
+ v3dv_job_start_frame(job, 1, 1, 1, true, true, 1,
+ V3D_INTERNAL_BPP_32, 4, false);
v3dX(job_emit_binning_flush)(job);
struct v3dv_cl *rcl = &job->rcl;
@@ -42,14 +43,29 @@ v3dX(job_emit_noop)(struct v3dv_job *job)
config.image_height_pixels = 1;
config.number_of_render_targets = 1;
config.multisample_mode_4x = false;
+#if V3D_VERSION == 42
config.maximum_bpp_of_all_render_targets = V3D_INTERNAL_BPP_32;
+#endif
+#if V3D_VERSION >= 71
+ config.log2_tile_width = 3; /* Tile size 64 */
+ config.log2_tile_height = 3; /* Tile size 64 */
+#endif
}
+#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
rt.render_target_0_internal_bpp = V3D_INTERNAL_BPP_32;
rt.render_target_0_internal_type = V3D_INTERNAL_TYPE_8;
rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
}
+#endif
+#if V3D_VERSION >= 71
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ rt.internal_bpp = V3D_INTERNAL_BPP_32;
+ rt.internal_type_and_clamping = V3D_RENDER_TARGET_TYPE_CLAMP_8;
+ rt.stride = 1; /* Unused RT */
+ }
+#endif
cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
clear.z_clear_value = 1.0f;
diff --git a/src/broadcom/vulkan/vk_format_info.h b/src/broadcom/vulkan/vk_format_info.h
deleted file mode 100644
index da85cb5b5dd..00000000000
--- a/src/broadcom/vulkan/vk_format_info.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright © 2016 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#ifndef VK_FORMAT_INFO_H
-#define VK_FORMAT_INFO_H
-
-#include <stdbool.h>
-#include <vulkan/vulkan.h>
-
-#include "util/format/u_format.h"
-#include "vulkan/util/vk_format.h"
-
-/* FIXME: from freedreno vk_format.h, common place?*/
-static inline bool
-vk_format_is_int(VkFormat format)
-{
- return util_format_is_pure_integer(vk_format_to_pipe_format(format));
-}
-
-static inline bool
-vk_format_is_sint(VkFormat format)
-{
- return util_format_is_pure_sint(vk_format_to_pipe_format(format));
-}
-
-static inline bool
-vk_format_is_uint(VkFormat format)
-{
- return util_format_is_pure_uint(vk_format_to_pipe_format(format));
-}
-
-static inline bool
-vk_format_is_unorm(VkFormat format)
-{
- return util_format_is_unorm(vk_format_to_pipe_format(format));
-}
-
-static inline bool
-vk_format_is_snorm(VkFormat format)
-{
- return util_format_is_snorm(vk_format_to_pipe_format(format));
-}
-
-static inline bool
-vk_format_is_float(VkFormat format)
-{
- return util_format_is_float(vk_format_to_pipe_format(format));
-}
-
-static inline bool
-vk_format_is_srgb(VkFormat format)
-{
- return util_format_is_srgb(vk_format_to_pipe_format(format));
-}
-
-static inline unsigned
-vk_format_get_blocksize(VkFormat format)
-{
- return util_format_get_blocksize(vk_format_to_pipe_format(format));
-}
-
-static inline unsigned
-vk_format_get_blockwidth(VkFormat format)
-{
- return util_format_get_blockwidth(vk_format_to_pipe_format(format));
-}
-
-static inline unsigned
-vk_format_get_blockheight(VkFormat format)
-{
- return util_format_get_blockheight(vk_format_to_pipe_format(format));
-}
-
-static inline bool
-vk_format_is_compressed(VkFormat format)
-{
- return util_format_is_compressed(vk_format_to_pipe_format(format));
-}
-
-static inline const struct util_format_description *
-vk_format_description(VkFormat format)
-{
- return util_format_description(vk_format_to_pipe_format(format));
-}
-
-#endif /* VK_FORMAT_INFO_H */