Diffstat (limited to 'src/broadcom')
141 files changed, 25515 insertions, 13827 deletions
diff --git a/src/broadcom/ci/piglit-vc4-rpi3-fails.txt b/src/broadcom/ci/broadcom-rpi3-fails.txt
index cb9dfaa6eb6..fdcf09f1fef 100644
--- a/src/broadcom/ci/piglit-vc4-rpi3-fails.txt
+++ b/src/broadcom/ci/broadcom-rpi3-fails.txt
@@ -1,35 +1,116 @@
-glx@glx-copy-sub-buffer samples=2,Crash
-glx@glx-copy-sub-buffer samples=4,Crash
-glx@glx-make-current,Crash
-glx@glx-multithread-buffer,Fail
-glx@glx-query-drawable-glx_fbconfig_id-window,Fail
+# Test expects red instead of luminance, contra OES_depth_texture spec.
+# https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/3815
+KHR-GLES2.core.internalformat.texture2d.depth_component_unsigned_int_depth_component16,Fail
+KHR-GLES2.core.internalformat.texture2d.depth_component_unsigned_int_depth_component24,Fail
+KHR-GLES2.core.internalformat.texture2d.depth_component_unsigned_short_depth_component16,Fail
+
+# Creating OpenGL ES 3 context
+# Fail, context: 0x00000000, error: EGL_BAD_MATCH
+# https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/3816
+x11-dEQP-EGL.functional.create_context.no_config,Fail
+wayland-dEQP-EGL.functional.create_context.no_config,Fail
+
+# A wide line outside the viewport is incorrectly clipped out when ES wants
+# it rendered as a quad and clipped appropriately. I think that by expanding
+# CLIPPER_XY_SCALING to have a guard band we might get these to work.
+dEQP-GLES2.functional.clipping.line.wide_line_clip_viewport_center,Fail
+dEQP-GLES2.functional.clipping.line.wide_line_clip_viewport_corner,Fail
+
+dEQP-GLES2.functional.depth_stencil_clear.depth_stencil_masked,Fail
+
+dEQP-GLES2.functional.uniform_api.random.3,Fail
+dEQP-GLES2.functional.uniform_api.random.79,Fail
+
+# Sampling grid slightly off in test 2?
+dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_linear_linear_mirror_rgba8888,Fail
+dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_linear_linear_repeat_rgba8888,Fail
+dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_nearest_linear_mirror_rgba8888,Fail
+dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_nearest_linear_repeat_rgba8888,Fail
+
+# "Warning: High precision not supported in fragment shaders.
+# ERROR: Image verification failed, found 2048 invalid pixels!"
+# One of the magnified pixels is (0xff,0x29,0xd6) instead of (0xff,0x2d,0xd2).
+# We do support highp, so we should fix glGetShaderPrecisionFormat reporting.
+dEQP-GLES2.functional.texture.mipmap.2d.basic.linear_linear_repeat_non_square,Fail
+dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_clamp_non_square,Fail
+dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_mirror_non_square,Fail
+dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_repeat_non_square,Fail
+
+# One of the pixels on the left edge near the bottom is wrong for both min and
+# mag. Also a line of pixels through the image in minification.
+dEQP-GLES2.functional.texture.wrap.clamp_clamp_nearest_npot_etc1,Fail
+
+# Despite exposing GL 2.1, the HW doesn't actually support 3D textures, so we
+# set 0 max levels. These tests fail (or hit assertion failures) as a result.
+spec@!opengl 1.1@max-texture-size,Crash
+spec@!opengl 1.2@copyteximage 3d,Fail
+spec@!opengl 1.2@getteximage-targets 3d,Fail
+spec@!opengl 1.2@tex3d-maxsize,Fail
+spec@!opengl 1.2@tex3d,Fail
+spec@!opengl 1.2@texture-packed-formats,Fail
+spec@!opengl 1.2@texwrap 3d bordercolor,Fail
+spec@!opengl 1.2@texwrap 3d proj bordercolor,Fail
+spec@!opengl 1.2@texwrap 3d proj,Fail
+spec@!opengl 1.2@texwrap 3d proj@GL_RGBA8- NPOT- projected,Fail
+spec@!opengl 1.2@texwrap 3d proj@GL_RGBA8- projected,Fail
+spec@!opengl 1.2@texwrap 3d proj@GL_RGBA8- swizzled- projected,Fail
+spec@!opengl 1.2@texwrap 3d,Fail
+spec@!opengl 1.2@texwrap 3d@GL_RGBA8- NPOT,Fail
+spec@!opengl 1.2@texwrap 3d@GL_RGBA8- swizzled,Fail
+spec@!opengl 1.2@texwrap 3d@GL_RGBA8,Fail
+spec@!opengl 1.3@tex3d-depth1,Fail
+spec@!opengl 2.0@tex3d-npot,Fail
+spec@!opengl 2.1@minmax,Fail
+spec@arb_framebuffer_object@fbo-generatemipmap-3d,Fail
+spec@arb_framebuffer_object@fbo-incomplete,Fail
+spec@arb_framebuffer_object@fbo-incomplete@invalid slice of 3D texture,Fail
+spec@arb_get_texture_sub_image@arb_get_texture_sub_image-get,Fail
+spec@arb_robustness@arb_robustness_client-mem-bounds,Fail
+spec@arb_texture_multisample@arb_texture_multisample-teximage-3d-multisample,Fail
+spec@arb_texture_storage@texture-storage,Crash
+spec@arb_texture_storage@texture-storage@3D mipmapped ,Fail
+spec@arb_texture_storage@texture-storage@3D mipmapped (EXT_dsa),Fail
+spec@arb_texture_storage@texture-storage@3D non-mipmapped ,Fail
+spec@arb_texture_storage@texture-storage@3D non-mipmapped (EXT_dsa),Fail
+spec@ext_direct_state_access@multi-texture@MultiTexImage3DEXT,Fail
+spec@ext_direct_state_access@textures,Crash
+spec@ext_direct_state_access@textures@CopyTextureSubImage3DEXT + display list GL_COMPILE_AND_EXECUTE,Fail
+spec@ext_direct_state_access@textures@CopyTextureSubImage3DEXT + display list GL_COMPILE,Fail
+spec@ext_direct_state_access@textures@CopyTextureSubImage3DEXT,Fail
+spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_3D + glTex* + display list GL_COMPILE_AND_EXECUTE,Fail
+spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_3D + glTex* + display list GL_COMPILE,Fail
+spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_3D + glTex*,Fail
+spec@ext_direct_state_access@textures@TextureImage3DEXT + display list GL_COMPILE_AND_EXECUTE,Fail
+spec@ext_direct_state_access@textures@TextureImage3DEXT + display list GL_COMPILE,Fail
+spec@ext_direct_state_access@textures@TextureImage3DEXT,Fail
+spec@ext_direct_state_access@textures@TextureSubImage3DEXT + display list GL_COMPILE_AND_EXECUTE,Fail
+spec@ext_direct_state_access@textures@TextureSubImage3DEXT + display list GL_COMPILE,Fail
+spec@ext_direct_state_access@textures@TextureSubImage3DEXT,Fail
+spec@ext_framebuffer_object@fbo-3d,Fail
+spec@glsl-1.10@execution@texture3d-computed-coord,Fail
+spec@glsl-1.10@execution@texture3d,Fail
+spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture() 3d,Fail
+spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture(bias) 3d,Fail
+spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj 3d,Fail
+spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj(bias) 3d,Fail
+spec@khr_texture_compression_astc@basic-gl,Fail
+
+glx@glx-make-current,Fail
glx@glx-swap-pixmap-bad,Fail
-glx@glx-visuals-depth -pixmap,Crash
-glx@glx-visuals-depth,Crash
-glx@glx-visuals-stencil -pixmap,Crash
-glx@glx-visuals-stencil,Crash
glx@glx_arb_create_context_es2_profile@invalid opengl es version,Fail
glx@glx_arb_create_context_no_error@no error,Fail
-glx@glx_ext_import_context@free context,Fail
-glx@glx_ext_import_context@get context id,Fail
-glx@glx_ext_import_context@get current display,Fail
-glx@glx_ext_import_context@import context- multi process,Fail
-glx@glx_ext_import_context@import context- single process,Fail
-glx@glx_ext_import_context@imported context has same context id,Fail
-glx@glx_ext_import_context@make current- multi process,Fail
-glx@glx_ext_import_context@make current- single process,Fail
-glx@glx_ext_import_context@query context info,Fail
+
+# piglit: error: Test timed out.
+glx@glx_arb_sync_control@waitformsc,Fail
+
+glslparsertest@glsl2@gst-gl-text-download-i420-yv12.frag,Fail
shaders@glsl-arb-fragment-coord-conventions,Fail
shaders@glsl-bug-110796,Fail
shaders@glsl-max-vertex-attrib,Fail
-shaders@glsl-predication-on-large-array,Fail
-spec@!opengl 1.0@gl-1.0-bitmap-heart-dance,Fail
-spec@!opengl 1.0@gl-1.0-dlist-bitmap,Crash
spec@!opengl 1.0@gl-1.0-drawbuffer-modes,Fail
spec@!opengl 1.0@gl-1.0-edgeflag,Fail
spec@!opengl 1.0@gl-1.0-edgeflag-const,Fail
spec@!opengl 1.0@gl-1.0-edgeflag-quads,Fail
-spec@!opengl 1.0@gl-1.0-logicop,Crash
spec@!opengl 1.0@gl-1.0-no-op-paths,Fail
spec@!opengl 1.0@gl-1.0-scissor-offscreen,Fail
spec@!opengl 1.0@gl-1.0-user-clip-all-planes,Fail
@@ -682,33 +763,53 @@ spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)-
spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail
spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail
spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail
-spec@!opengl 1.1@depthstencil-default_fb-blit samples=2,Crash
-spec@!opengl 1.1@depthstencil-default_fb-blit samples=4,Crash
-spec@!opengl 1.1@depthstencil-default_fb-clear samples=2,Crash
-spec@!opengl 1.1@depthstencil-default_fb-clear samples=4,Crash
+spec@arb_clear_texture@arb_clear_texture-3d,Fail
+spec@arb_clear_texture@arb_clear_texture-sized-formats,Fail
+spec@arb_clear_texture@arb_clear_texture-supported-formats,Fail
+spec@glsl-1.10@execution@glsl-fs-inline-explosion,Crash
+spec@glsl-1.10@execution@glsl-vs-inline-explosion,Crash
+spec@glsl-1.20@compiler@invalid-vec4-array-to-vec3-array-conversion.vert,Fail
+
+# fails on arm64, passes on armhf
+spec@arb_depth_buffer_float@depthstencil-render-miplevels 1024 s=z24_s8_d=z32f,Fail
+
+# Crashes in this group are CMA allocation fails
+spec@!opengl 1.1@depthstencil-default_fb-clear samples=2,Fail
+spec@!opengl 1.1@depthstencil-default_fb-clear samples=4,Fail
spec@!opengl 1.1@depthstencil-default_fb-clear,Fail
-spec@!opengl 1.1@depthstencil-default_fb-copypixels samples=2,Crash
-spec@!opengl 1.1@depthstencil-default_fb-copypixels samples=4,Crash
-spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=2,Crash
-spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=4,Crash
+spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=2,Fail
+spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=4,Fail
spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=2,Crash
spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=4,Crash
-spec@!opengl 1.1@depthstencil-default_fb-readpixels-24_8 samples=2,Crash
-spec@!opengl 1.1@depthstencil-default_fb-readpixels-24_8 samples=4,Crash
-spec@!opengl 1.1@depthstencil-default_fb-readpixels-float-and-ushort samples=2,Crash
-spec@!opengl 1.1@depthstencil-default_fb-readpixels-float-and-ushort samples=4,Crash
-spec@!opengl 1.1@draw-pixels,Fail
-spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 varray gl_line_loop,Fail
+
+# These unsupported primitive draws are converted by Mesa into indexed
+# draws with supported primitives. But these indexed draws require
+# 4-byte indices due to the number of vertices to draw, while our
+# hardware is limited to 2-byte indices at most.
spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 varray gl_polygon,Crash
spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 varray gl_quad_strip,Crash
spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 varray gl_quads,Crash
-spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 varray gl_triangle_fan,Fail
-spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 vbo gl_line_loop,Fail
spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 vbo gl_polygon,Crash
spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 vbo gl_quad_strip,Crash
spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 vbo gl_quads,Crash
+
+# GFXH-515 / SW-5891: the binner uses a 16-bit index for drawarrays, so
+# the draw is split into multiple calls. This is not supported for
+# triangle fans or line loops because the first vertex must always be
+# included, which would require creating a new vertex buffer holding
+# the remaining vertices plus the first one.
+spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 varray gl_line_loop,Fail
+spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 varray gl_triangle_fan,Fail
+spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 vbo gl_line_loop,Fail
spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 vbo gl_triangle_fan,Fail
+
+spec@!opengl 1.1@draw-pixels,Fail
spec@!opengl 1.1@line-flat-clip-color,Fail
+
+# The hardware does not support line/polygon stipple. In fact, this
+# feature was deprecated/removed in newer OpenGL spec versions. It
+# could be emulated using shaders.
+spec@!opengl 1.1@line-smooth-stipple,Fail
spec@!opengl 1.1@linestipple,Fail
spec@!opengl 1.1@linestipple@Baseline,Fail
spec@!opengl 1.1@linestipple@Factor 2x,Fail
@@ -716,6 +817,10 @@ spec@!opengl 1.1@linestipple@Factor 3x,Fail
spec@!opengl 1.1@linestipple@Line loop,Fail
spec@!opengl 1.1@linestipple@Line strip,Fail
spec@!opengl 1.1@linestipple@Restarting lines within a single Begin-End block,Fail
+spec@!opengl 2.1@pbo,Fail
+spec@!opengl 2.1@pbo@test_polygon_stip,Fail
+spec@!opengl 2.1@polygon-stipple-fs,Fail
+
spec@!opengl 1.1@polygon-mode,Fail
spec@!opengl 1.1@polygon-mode-offset,Fail
spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on bottom edge,Fail
@@ -742,11 +847,6 @@ spec@!opengl 1.1@polygon-mode-offset@config 5: Expected white pixel on top edge,
spec@!opengl 1.1@polygon-mode-offset@config 6: Expected blue pixel in center,Fail
spec@!opengl 1.1@polygon-mode-offset@config 6: Expected white pixel on right edge,Fail
spec@!opengl 1.1@polygon-mode-offset@config 6: Expected white pixel on top edge,Fail
-spec@!opengl 1.1@read-front clear-front-first samples=2,Crash
-spec@!opengl 1.1@read-front clear-front-first samples=4,Crash
-spec@!opengl 1.1@read-front samples=2,Crash
-spec@!opengl 1.1@read-front samples=4,Crash
-spec@!opengl 1.1@tex-upside-down-miptree,Fail
spec@!opengl 1.1@texsubimage-unpack,Fail
spec@!opengl 1.1@texwrap 2d proj,Fail
spec@!opengl 1.1@texwrap 2d proj@GL_RGBA8- NPOT- projected,Fail
@@ -787,25 +887,10 @@ spec@!opengl 1.1@texwrap formats@GL_RGBA16- swizzled,Fail
spec@!opengl 1.1@texwrap formats@GL_RGBA8,Fail
spec@!opengl 1.1@texwrap formats@GL_RGBA8- NPOT,Fail
spec@!opengl 1.1@texwrap formats@GL_RGBA8- swizzled,Fail
-spec@!opengl 1.1@windowoverlap,Fail
-spec@!opengl 1.2@copyteximage 3d,Fail
-spec@!opengl 1.2@getteximage-targets 3d,Fail
spec@!opengl 1.2@lodclamp,Fail
spec@!opengl 1.2@lodclamp-between,Fail
spec@!opengl 1.2@lodclamp-between-max,Fail
spec@!opengl 1.2@mipmap-setup,Fail
-spec@!opengl 1.2@tex3d,Fail
-spec@!opengl 1.2@tex3d-maxsize,Fail
-spec@!opengl 1.2@teximage-errors,Fail
-spec@!opengl 1.2@texwrap 3d proj,Fail
-spec@!opengl 1.2@texwrap 3d proj@GL_RGBA8- NPOT- projected,Fail
-spec@!opengl 1.2@texwrap 3d proj@GL_RGBA8- projected,Fail
-spec@!opengl 1.2@texwrap 3d proj@GL_RGBA8- swizzled- projected,Fail
-spec@!opengl 1.2@texwrap 3d,Fail
-spec@!opengl 1.2@texwrap 3d@GL_RGBA8,Fail
-spec@!opengl 1.2@texwrap 3d@GL_RGBA8- NPOT,Fail
-spec@!opengl 1.2@texwrap 3d@GL_RGBA8- swizzled,Fail
-spec@!opengl 1.3@tex3d-depth1,Fail
spec@!opengl 1.4@gl-1.4-polygon-offset,Fail
spec@!opengl 1.4@tex-miplevel-selection,Fail
spec@!opengl 1.4@tex-miplevel-selection-lod,Fail
@@ -814,14 +899,6 @@ spec@!opengl 1.5@depth-tex-compare,Fail
spec@!opengl 2.0@attrib-assignments,Fail
spec@!opengl 2.0@gl-2.0-edgeflag,Fail
spec@!opengl 2.0@gl-2.0-edgeflag-immediate,Fail
-spec@!opengl 2.0@occlusion-query-discard,Fail
-spec@!opengl 2.0@tex3d-npot,Fail
-spec@!opengl 2.1@minmax,Fail
-spec@!opengl 2.1@pbo,Fail
-spec@!opengl 2.1@pbo@test_polygon_stip,Fail
-spec@!opengl 2.1@polygon-stipple-fs,Fail
-spec@!opengl es 2.0@draw_buffers_gles2,Fail
-spec@arb_arrays_of_arrays@execution@glsl-arrays-copy-size-mismatch,Fail
spec@arb_depth_texture@depth-level-clamp,Fail
spec@arb_depth_texture@texwrap formats,Fail
spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT16,Fail
@@ -835,7 +912,6 @@ spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT32- NPOT,Fail
spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT32- swizzled,Fail
spec@arb_draw_elements_base_vertex@arb_draw_elements_base_vertex-negative-index,Crash
spec@arb_draw_elements_base_vertex@arb_draw_elements_base_vertex-negative-index-user_varrays,Crash
-spec@arb_es2_compatibility@arb_es2_compatibility-drawbuffers,Fail
spec@arb_es2_compatibility@texwrap formats,Fail
spec@arb_es2_compatibility@texwrap formats@GL_RGB565,Fail
spec@arb_es2_compatibility@texwrap formats@GL_RGB565- NPOT,Fail
@@ -844,58 +920,24 @@ spec@arb_fragment_coord_conventions@fp-arb-fragment-coord-conventions-integer,Fa
spec@arb_fragment_coord_conventions@fp-arb-fragment-coord-conventions-none,Fail
spec@arb_fragment_program@fp-indirections2,Fail
spec@arb_fragment_program@minmax,Fail
-spec@arb_framebuffer_object@arb_framebuffer_object-depth-stencil-blit stencil gl_depth24_stencil8,Fail
-spec@arb_framebuffer_object@arb_framebuffer_object-depth-stencil-blit stencil gl_stencil_index1,Fail
-spec@arb_framebuffer_object@arb_framebuffer_object-depth-stencil-blit stencil gl_stencil_index16,Fail
-spec@arb_framebuffer_object@arb_framebuffer_object-depth-stencil-blit stencil gl_stencil_index4,Fail
-spec@arb_framebuffer_object@arb_framebuffer_object-depth-stencil-blit stencil gl_stencil_index8,Fail
spec@arb_framebuffer_object@fbo-attachments-blit-scaled-linear,Fail
spec@arb_framebuffer_object@fbo-blit-stretch,Fail
-spec@arb_framebuffer_object@fbo-generatemipmap-3d,Fail
spec@arb_framebuffer_object@fbo-mipmap-copypix,Fail
-spec@arb_framebuffer_object@framebuffer-blit-levels draw stencil,Fail
-spec@arb_framebuffer_object@framebuffer-blit-levels read stencil,Fail
spec@arb_framebuffer_object@mixed-buffer-sizes,Fail
-spec@arb_framebuffer_object@same-attachment-glframebuffertexture2d-gl_depth_stencil_attachment,Fail
+spec@arb_framebuffer_object@same-attachment-tex2d-depth_stencil,Fail
spec@arb_framebuffer_srgb@arb_framebuffer_srgb-srgb_conformance,Fail
-spec@arb_framebuffer_srgb@blit renderbuffer linear_to_srgb downsample disabled clear,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer linear_to_srgb downsample disabled render,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer linear_to_srgb downsample enabled clear,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer linear_to_srgb downsample enabled render,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer linear_to_srgb msaa disabled clear,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer linear_to_srgb msaa disabled render,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer linear_to_srgb msaa enabled clear,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer linear_to_srgb msaa enabled render,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer srgb_to_linear downsample disabled clear,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer srgb_to_linear downsample disabled render,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer srgb_to_linear downsample enabled clear,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer srgb_to_linear downsample enabled render,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer srgb_to_linear msaa disabled clear,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer srgb_to_linear msaa disabled render,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer srgb_to_linear msaa enabled clear,Crash
-spec@arb_framebuffer_srgb@blit renderbuffer srgb_to_linear msaa enabled render,Crash
-spec@arb_framebuffer_srgb@blit texture linear_to_srgb downsample disabled clear,Crash
-spec@arb_framebuffer_srgb@blit texture linear_to_srgb downsample disabled render,Crash
-spec@arb_framebuffer_srgb@blit texture linear_to_srgb downsample enabled clear,Crash
-spec@arb_framebuffer_srgb@blit texture linear_to_srgb downsample enabled render,Crash
-spec@arb_framebuffer_srgb@blit texture linear_to_srgb msaa disabled clear,Crash
-spec@arb_framebuffer_srgb@blit texture linear_to_srgb msaa disabled render,Crash
-spec@arb_framebuffer_srgb@blit texture linear_to_srgb msaa enabled clear,Crash
-spec@arb_framebuffer_srgb@blit texture linear_to_srgb msaa enabled render,Crash
-spec@arb_framebuffer_srgb@blit texture srgb_to_linear downsample disabled clear,Crash
-spec@arb_framebuffer_srgb@blit texture srgb_to_linear downsample disabled render,Crash
-spec@arb_framebuffer_srgb@blit texture srgb_to_linear downsample enabled clear,Crash
-spec@arb_framebuffer_srgb@blit texture srgb_to_linear downsample enabled render,Crash
-spec@arb_framebuffer_srgb@blit texture srgb_to_linear msaa disabled clear,Crash
-spec@arb_framebuffer_srgb@blit texture srgb_to_linear msaa disabled render,Crash
-spec@arb_framebuffer_srgb@blit texture srgb_to_linear msaa enabled clear,Crash
-spec@arb_framebuffer_srgb@blit texture srgb_to_linear msaa enabled render,Crash
spec@arb_internalformat_query2@all internalformat_<x>_size pname checks,Fail
spec@arb_internalformat_query2@all internalformat_<x>_size pname checks@GL_INTERNALFORMAT_ALPHA_SIZE,Fail
spec@arb_internalformat_query2@all internalformat_<x>_size pname checks@GL_INTERNALFORMAT_BLUE_SIZE,Fail
spec@arb_internalformat_query2@all internalformat_<x>_size pname checks@GL_INTERNALFORMAT_DEPTH_SIZE,Fail
spec@arb_internalformat_query2@all internalformat_<x>_size pname checks@GL_INTERNALFORMAT_GREEN_SIZE,Fail
spec@arb_internalformat_query2@all internalformat_<x>_size pname checks@GL_INTERNALFORMAT_RED_SIZE,Fail
+spec@arb_internalformat_query2@all internalformat_<x>_type pname checks,Fail
+spec@arb_internalformat_query2@all internalformat_<x>_type pname checks@GL_INTERNALFORMAT_ALPHA_TYPE,Fail
+spec@arb_internalformat_query2@all internalformat_<x>_type pname checks@GL_INTERNALFORMAT_BLUE_TYPE,Fail
+spec@arb_internalformat_query2@all internalformat_<x>_type pname checks@GL_INTERNALFORMAT_DEPTH_TYPE,Fail
+spec@arb_internalformat_query2@all internalformat_<x>_type pname checks@GL_INTERNALFORMAT_GREEN_TYPE,Fail
+spec@arb_internalformat_query2@all internalformat_<x>_type pname checks@GL_INTERNALFORMAT_RED_TYPE,Fail
spec@arb_internalformat_query2@api error checks,Fail
spec@arb_internalformat_query2@max dimensions related pname checks,Fail
@@ -903,147 +945,47 @@ spec@arb_internalformat_query2@max dimensions related pname checks@GL_MAX_DEPTH,
spec@arb_internalformat_query2@max dimensions related pname checks@GL_MAX_HEIGHT,Fail
spec@arb_internalformat_query2@max dimensions related pname checks@GL_MAX_WIDTH,Fail
spec@arb_occlusion_query2@render,Fail
-spec@arb_occlusion_query@occlusion_query,Fail
spec@arb_occlusion_query@occlusion_query_conform,Fail
-spec@arb_occlusion_query@occlusion_query_meta_fragments,Fail
-spec@arb_occlusion_query@occlusion_query_meta_save,Fail
+spec@arb_occlusion_query@occlusion_query_conform@GetObjivAval_multi2,Fail
spec@arb_pixel_buffer_object@fbo-pbo-readpixels-small,Fail
spec@arb_pixel_buffer_object@pbo-getteximage,Fail
spec@arb_pixel_buffer_object@texsubimage-unpack pbo,Fail
spec@arb_point_sprite@arb_point_sprite-mipmap,Fail
spec@arb_provoking_vertex@arb-provoking-vertex-render,Fail
spec@arb_sampler_objects@sampler-objects,Fail
-spec@arb_shader_texture_lod@execution@glsl-fs-texturelod-01,Fail
-spec@arb_texture_multisample@arb_texture_multisample-teximage-3d-multisample,Fail
spec@arb_texture_rectangle@1-1-linear-texture,Fail
-spec@arb_texture_rectangle@copyteximage rect samples=2,Crash
-spec@arb_texture_rectangle@copyteximage rect samples=4,Crash
spec@arb_texture_rectangle@texrect-many,Crash
-spec@arb_texture_storage@texture-storage,Fail
-spec@arb_texture_storage@texture-storage@3D mipmapped ,Fail
-spec@arb_texture_storage@texture-storage@3D non-mipmapped ,Fail
spec@arb_vertex_program@minmax,Fail
-spec@egl 1.4@egl-copy-buffers,Crash
spec@egl 1.4@eglterminate then unbind context,Fail
spec@egl 1.4@largest possible eglcreatepbuffersurface and then glclear,Fail
-spec@egl_ext_protected_content@conformance,Fail
spec@egl_khr_gl_image@egl_khr_gl_renderbuffer_image-clear-shared-image gl_depth_component24,Fail
spec@egl_khr_gl_image@egl_khr_gl_renderbuffer_image-clear-shared-image gl_rgba,Fail
spec@egl_khr_surfaceless_context@viewport,Fail
spec@egl_mesa_configless_context@basic,Fail
-spec@ext_direct_state_access@indexed-state-queries 12,Fail
-spec@ext_direct_state_access@indexed-state-queries 12@GetIntegerIndexedvEXT,Fail
spec@ext_direct_state_access@multi-texture,Crash
-spec@ext_direct_state_access@multi-texture@MultiTexImage3DEXT,Fail
spec@ext_direct_state_access@multi-texture@MultiTexSubImage1DEXT,Fail
-spec@ext_direct_state_access@textures,Fail
-spec@ext_direct_state_access@textures@CopyTextureSubImage3DEXT + display list GL_COMPILE,Fail
-spec@ext_direct_state_access@textures@CopyTextureSubImage3DEXT + display list GL_COMPILE_AND_EXECUTE,Fail
-spec@ext_direct_state_access@textures@CopyTextureSubImage3DEXT,Fail
spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_1D + glTex* + display list GL_COMPILE,Fail
spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_1D + glTex* + display list GL_COMPILE_AND_EXECUTE,Fail
spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_1D + glTex*,Fail
-spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_3D + glTex* + display list GL_COMPILE,Fail
-spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_3D + glTex* + display list GL_COMPILE_AND_EXECUTE,Fail
-spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_3D + glTex*,Fail
-spec@ext_direct_state_access@textures@TextureImage2DEXT + display list GL_COMPILE,Fail
-spec@ext_direct_state_access@textures@TextureImage2DEXT + display list GL_COMPILE_AND_EXECUTE,Fail
-spec@ext_direct_state_access@textures@TextureImage2DEXT,Fail
-spec@ext_direct_state_access@textures@TextureImage3DEXT + display list GL_COMPILE,Fail
-spec@ext_direct_state_access@textures@TextureImage3DEXT + display list GL_COMPILE_AND_EXECUTE,Fail
-spec@ext_direct_state_access@textures@TextureImage3DEXT,Fail
-spec@ext_direct_state_access@textures@TextureParameterfEXT + display list GL_COMPILE,Fail
-spec@ext_direct_state_access@textures@TextureParameterfEXT + display list GL_COMPILE_AND_EXECUTE,Fail
-spec@ext_direct_state_access@textures@TextureParameterfEXT,Fail
-spec@ext_direct_state_access@textures@TextureParameteriEXT + display list GL_COMPILE,Fail
-spec@ext_direct_state_access@textures@TextureParameteriEXT + display list GL_COMPILE_AND_EXECUTE,Fail
-spec@ext_direct_state_access@textures@TextureParameteriEXT,Fail
-spec@ext_direct_state_access@textures@TextureParameterivEXT + display list GL_COMPILE,Fail
-spec@ext_direct_state_access@textures@TextureParameterivEXT + display list GL_COMPILE_AND_EXECUTE,Fail
-spec@ext_direct_state_access@textures@TextureParameterivEXT,Fail
spec@ext_direct_state_access@textures@TextureSubImage2DEXT + display list GL_COMPILE,Fail
spec@ext_direct_state_access@textures@TextureSubImage2DEXT + display list GL_COMPILE_AND_EXECUTE,Fail
spec@ext_direct_state_access@textures@TextureSubImage2DEXT,Fail
-spec@ext_direct_state_access@textures@TextureSubImage3DEXT + display list GL_COMPILE,Fail
-spec@ext_direct_state_access@textures@TextureSubImage3DEXT + display list GL_COMPILE_AND_EXECUTE,Fail
-spec@ext_direct_state_access@textures@TextureSubImage3DEXT,Fail
spec@ext_framebuffer_blit@fbo-blit-check-limits,Fail
-spec@ext_framebuffer_multisample@blit-flipped 2 x,Crash
-spec@ext_framebuffer_multisample@blit-flipped 2 y,Crash
-spec@ext_framebuffer_multisample@blit-flipped 4 x,Crash
-spec@ext_framebuffer_multisample@blit-flipped 4 y,Crash
+
+# Remaining crashes are CMA allocation failures.
spec@ext_framebuffer_multisample@blit-mismatched-formats,Fail
-spec@ext_framebuffer_multisample@clip-and-scissor-blit 2 downsample,Crash
-spec@ext_framebuffer_multisample@clip-and-scissor-blit 2 msaa,Crash
-spec@ext_framebuffer_multisample@clip-and-scissor-blit 2 upsample,Crash
-spec@ext_framebuffer_multisample@clip-and-scissor-blit 4 downsample,Crash
-spec@ext_framebuffer_multisample@clip-and-scissor-blit 4 msaa,Crash
-spec@ext_framebuffer_multisample@clip-and-scissor-blit 4 upsample,Crash
-spec@ext_framebuffer_multisample@enable-flag,Crash
+spec@ext_framebuffer_multisample@clip-and-scissor-blit 2 msaa,Fail
+spec@ext_framebuffer_multisample@clip-and-scissor-blit 4 msaa,Fail
spec@ext_framebuffer_multisample@interpolation 2 centroid-edges,Fail
spec@ext_framebuffer_multisample@interpolation 4 centroid-edges,Fail
-spec@ext_framebuffer_multisample@line-smooth 2,Crash
-spec@ext_framebuffer_multisample@line-smooth 4,Crash
-spec@ext_framebuffer_multisample@multisample-blit 2 color linear,Crash
-spec@ext_framebuffer_multisample@multisample-blit 2 color,Crash
-spec@ext_framebuffer_multisample@multisample-blit 2 depth,Crash
-spec@ext_framebuffer_multisample@multisample-blit 2 stencil,Crash
-spec@ext_framebuffer_multisample@multisample-blit 4 color linear,Crash
-spec@ext_framebuffer_multisample@multisample-blit 4 color,Crash
-spec@ext_framebuffer_multisample@multisample-blit 4 depth,Crash
-spec@ext_framebuffer_multisample@multisample-blit 4 stencil,Crash
-spec@ext_framebuffer_multisample@no-color 2 depth combined,Crash
-spec@ext_framebuffer_multisample@no-color 2 depth single,Crash
-spec@ext_framebuffer_multisample@no-color 2 depth-computed combined,Crash
-spec@ext_framebuffer_multisample@no-color 2 depth-computed single,Crash
-spec@ext_framebuffer_multisample@no-color 2 stencil combined,Crash
-spec@ext_framebuffer_multisample@no-color 2 stencil single,Crash
-spec@ext_framebuffer_multisample@no-color 4 depth combined,Crash
-spec@ext_framebuffer_multisample@no-color 4 depth single,Crash
-spec@ext_framebuffer_multisample@no-color 4 depth-computed combined,Crash
-spec@ext_framebuffer_multisample@no-color 4 depth-computed single,Crash
-spec@ext_framebuffer_multisample@no-color 4 stencil combined,Crash
-spec@ext_framebuffer_multisample@no-color 4 stencil single,Crash
-spec@ext_framebuffer_multisample@point-smooth 2,Crash
-spec@ext_framebuffer_multisample@point-smooth 4,Crash
-spec@ext_framebuffer_multisample@polygon-smooth 2,Crash
-spec@ext_framebuffer_multisample@polygon-smooth 4,Crash
spec@ext_framebuffer_multisample@sample-alpha-to-coverage 2 color,Fail
-spec@ext_framebuffer_multisample@sample-alpha-to-coverage 2 depth,Crash
spec@ext_framebuffer_multisample@sample-alpha-to-coverage 4 color,Fail
-spec@ext_framebuffer_multisample@sample-alpha-to-coverage 4 depth,Crash
-spec@ext_framebuffer_multisample@sample-coverage 2 inverted,Crash
-spec@ext_framebuffer_multisample@sample-coverage 2 non-inverted,Crash
-spec@ext_framebuffer_multisample@sample-coverage 4 inverted,Crash
-spec@ext_framebuffer_multisample@sample-coverage 4 non-inverted,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 2 color downsample,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 2 color msaa,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 2 color upsample,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 2 depth downsample,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 2 depth msaa,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 2 depth upsample,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 4 color downsample,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 4 color msaa,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 4 color upsample,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 4 depth downsample,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 4 depth msaa,Crash
-spec@ext_framebuffer_multisample@unaligned-blit 4 depth upsample,Crash
-spec@ext_framebuffer_multisample@upsample 2 color linear,Crash
-spec@ext_framebuffer_multisample@upsample 2 color,Crash
-spec@ext_framebuffer_multisample@upsample 2 depth,Crash
-spec@ext_framebuffer_multisample@upsample 2 stencil,Crash
-spec@ext_framebuffer_multisample@upsample 4 color linear,Crash
-spec@ext_framebuffer_multisample@upsample 4 color,Crash
-spec@ext_framebuffer_multisample@upsample 4 depth,Crash
-spec@ext_framebuffer_multisample@upsample 4 stencil,Crash
-spec@ext_framebuffer_multisample_blit_scaled@negative-blit-scaled,Crash
-spec@ext_framebuffer_object@fbo-3d,Fail
-spec@ext_framebuffer_object@fbo-blending-format-quirks,Fail
+spec@ext_framebuffer_multisample@sample-coverage 2 inverted,Fail
+spec@ext_framebuffer_multisample@sample-coverage 2 non-inverted,Fail
+spec@ext_framebuffer_multisample@sample-coverage 4 inverted,Fail
+spec@ext_framebuffer_multisample@sample-coverage 4 non-inverted,Fail
+
spec@ext_framebuffer_object@fbo-depth-sample-compare,Fail
-spec@ext_framebuffer_object@fbo-stencil-gl_stencil_index1-blit,Fail
-spec@ext_framebuffer_object@fbo-stencil-gl_stencil_index16-blit,Fail
-spec@ext_framebuffer_object@fbo-stencil-gl_stencil_index4-blit,Fail
-spec@ext_framebuffer_object@fbo-stencil-gl_stencil_index8-blit,Fail
spec@ext_image_dma_buf_import@ext_image_dma_buf_import-export,Fail
spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_p010,Fail
spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_p012,Fail
@@ -1054,10 +996,8 @@ spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y216,Fail
spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y410,Fail
spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y412,Fail
spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y416,Fail
-spec@ext_occlusion_query_boolean@any-samples,Fail
spec@ext_packed_depth_stencil@depth_stencil texture,Fail
spec@ext_packed_depth_stencil@fbo-depthstencil-gl_depth24_stencil8-clear,Fail
-spec@ext_packed_depth_stencil@fbo-stencil-gl_depth24_stencil8-blit,Fail
spec@ext_packed_depth_stencil@texwrap formats,Fail
spec@ext_packed_depth_stencil@texwrap formats@GL_DEPTH24_STENCIL8,Fail
spec@ext_packed_depth_stencil@texwrap formats@GL_DEPTH24_STENCIL8- NPOT,Fail
@@ -1087,6 +1027,24 @@ spec@ext_texture_srgb@texwrap formats@GL_SRGB8- swizzled,Fail
spec@ext_texture_srgb@texwrap formats@GL_SRGB8_ALPHA8,Fail
spec@ext_texture_srgb@texwrap formats@GL_SRGB8_ALPHA8- NPOT,Fail
spec@ext_texture_srgb@texwrap formats@GL_SRGB8_ALPHA8- swizzled,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled@GL_COMPRESSED_SLUMINANCE- swizzled- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled@GL_COMPRESSED_SLUMINANCE_ALPHA- swizzled- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled@GL_COMPRESSED_SRGB- swizzled- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled@GL_COMPRESSED_SRGB_ALPHA- swizzled- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT- swizzled- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT- swizzled- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT- swizzled- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled@GL_COMPRESSED_SRGB_S3TC_DXT1_EXT- swizzled- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor@GL_COMPRESSED_SLUMINANCE- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor@GL_COMPRESSED_SLUMINANCE_ALPHA- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor@GL_COMPRESSED_SRGB- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor@GL_COMPRESSED_SRGB_ALPHA- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT- border color only,Fail
+spec@ext_texture_srgb@texwrap formats-s3tc bordercolor@GL_COMPRESSED_SRGB_S3TC_DXT1_EXT- border color only,Fail
spec@glsl-1.10@built-in constants,Fail
spec@glsl-1.10@built-in constants@gl_MaxVertexAttribs,Fail
spec@glsl-1.10@execution@built-in-functions@fs-cos-float,Fail
@@ -1153,12 +1111,7 @@ spec@glsl-1.10@execution@built-in-functions@vs-tan-float,Fail
spec@glsl-1.10@execution@built-in-functions@vs-tan-vec2,Fail
spec@glsl-1.10@execution@built-in-functions@vs-tan-vec3,Fail
spec@glsl-1.10@execution@built-in-functions@vs-tan-vec4,Fail
-spec@glsl-1.10@execution@fs-texture-select,Fail
spec@glsl-1.10@execution@glsl-fs-convolution-2,Fail
-spec@glsl-1.10@execution@samplers@glsl-fs-sampler-numbering-2,Fail
-spec@glsl-1.10@execution@samplers@glsl-fs-sampler-numbering-3,Fail
-spec@glsl-1.10@execution@samplers@in-parameter-array,Fail
-spec@glsl-1.10@execution@texture3d,Fail
spec@glsl-1.20@built-in constants,Fail
spec@glsl-1.20@built-in constants@gl_MaxVertexAttribs,Fail
spec@glsl-1.20@execution@fs-nan-builtin-max,Fail
@@ -1167,13 +1120,11 @@ spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture() 1d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture() 1dshadow,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture() 2d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture() 2dshadow,Fail
-spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture() 3d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture() cube,Crash
spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture(bias) 1d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture(bias) 1dshadow,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture(bias) 2d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture(bias) 2dshadow,Fail
-spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture(bias) 3d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture(bias) cube,Crash
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj 1d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj 1d_projvec4,Fail
@@ -1181,19 +1132,15 @@ spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj 1dshadow,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj 2d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj 2d_projvec4,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj 2dshadow,Fail
-spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj 3d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj(bias) 1d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj(bias) 1d_projvec4,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj(bias) 1dshadow,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj(bias) 2d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj(bias) 2d_projvec4,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj(bias) 2dshadow,Fail
-spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj(bias) 3d,Fail
-spec@glsl-1.20@execution@variable-indexing@fs-temp-array-mat4-index-col-row-wr,Fail
-spec@glsl-1.20@execution@variable-indexing@vs-temp-array-mat4-index-col-row-wr,Fail
+
spec@glsl-1.20@execution@vs-nan-builtin-max,Fail
spec@glsl-1.20@execution@vs-nan-builtin-min,Fail
-spec@intel_performance_query@intel_performance_query-issue_2235,Fail
spec@khr_texture_compression_astc@basic-gles,Fail
spec@khr_texture_compression_astc@miptree-gl ldr,Fail
spec@khr_texture_compression_astc@miptree-gl ldr@LDR Profile,Fail
@@ -1208,3 +1155,48 @@ spec@khr_texture_compression_astc@miptree-gles ldr@LDR Profile,Fail
spec@khr_texture_compression_astc@miptree-gles srgb,Fail
spec@khr_texture_compression_astc@miptree-gles srgb-fp,Fail
spec@oes_compressed_etc1_rgb8_texture@miptree,Fail
+spec@!opengl 1.0@depth-clear-precision-check,Fail
+spec@!opengl 1.0@depth-clear-precision-check@depth16,Fail
+spec@!opengl 1.0@depth-clear-precision-check@depth32,Fail
+
+spec@glsl-1.10@execution@variable-indexing@vs-output-array-vec2-index-wr-no-unroll,Fail
+
+spec@ext_framebuffer_multisample@accuracy 2 depth_draw depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy 2 depth_draw small depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy 2 depth_resolve depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy 2 depth_resolve small depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy 4 depth_draw depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy 4 depth_draw small depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy 4 depth_resolve depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy 4 depth_resolve small depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy 4 srgb depthstencil linear,Fail
+spec@ext_framebuffer_multisample@accuracy 4 srgb depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy 4 srgb small depthstencil linear,Fail
+spec@ext_framebuffer_multisample@accuracy 4 srgb small depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy all_samples depth_draw depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy all_samples depth_draw small depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy all_samples depth_resolve depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy all_samples depth_resolve small depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy all_samples srgb depthstencil linear,Fail
+spec@ext_framebuffer_multisample@accuracy all_samples srgb depthstencil,Fail
+spec@ext_framebuffer_multisample@accuracy all_samples srgb small depthstencil linear,Fail
+spec@ext_framebuffer_multisample@accuracy all_samples srgb small depthstencil,Fail
+spec@ext_framebuffer_multisample@multisample-blit 2 depth,Fail
+spec@ext_framebuffer_multisample@multisample-blit 4 depth,Fail
+spec@ext_framebuffer_multisample@no-color 2 depth combined,Fail
+spec@ext_framebuffer_multisample@no-color 2 depth-computed combined,Fail
+spec@ext_framebuffer_multisample@no-color 4 depth combined,Fail
+spec@ext_framebuffer_multisample@no-color 4 depth-computed combined,Fail
+spec@ext_framebuffer_multisample@unaligned-blit 2 depth msaa,Fail
+spec@ext_framebuffer_multisample@unaligned-blit 2 stencil msaa,Fail
+spec@ext_framebuffer_multisample@unaligned-blit 4 depth msaa,Fail
+spec@ext_framebuffer_multisample@unaligned-blit 4 stencil msaa,Fail
+
+# https://gitlab.freedesktop.org/mesa/piglit/-/merge_requests/817
+spec@intel_performance_query@intel_performance_query-issue_2235,Fail
+
+# Bisected to 35ae5dce39c ("mesa: don't pass Infs to the shader via gl_Fog.scale")
+spec@glsl-1.10@execution@glsl-1.10-built-in-uniform-state,Fail
+
+# Couldn't reproduce locally
+spec@oes_packed_depth_stencil@depth_stencil texture gles2,Fail
diff --git a/src/broadcom/ci/broadcom-rpi3-flakes.txt b/src/broadcom/ci/broadcom-rpi3-flakes.txt
new file mode 100644
index 00000000000..7e11d7da34e
--- /dev/null
+++ b/src/broadcom/ci/broadcom-rpi3-flakes.txt
@@ -0,0 +1,52 @@
+dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_neg_x_and_pos_x_and_neg_x_neg_y_neg_z
+dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_neg_x_and_pos_x_and_pos_y_pos_z
+dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_neg_x_and_pos_x_neg_y_pos_z_and_neg_x_pos_y_neg_z
+dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_pos_x_and_neg_x_neg_y_pos_z_and_neg_x_pos_y_neg_z
+
+glx@glx-multi-window-single-context
+glx@glx-visuals-stencil
+shaders@glsl-vs-loop
+shaders@glsl-vs-loop-nested
+spec@ext_framebuffer_blit@fbo-sys-blit
+spec@ext_framebuffer_blit@fbo-sys-sub-blit
+spec@egl_chromium_sync_control@conformance
+
+# CMA allocations that may sometimes succeed
+spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=2
+spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=4
+spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=2
+spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=4
+spec@!opengl 1.1@depthstencil-default_fb-clear samples=2
+
+# https://gitlab.freedesktop.org/mesa/mesa/-/issues/7186
+spec@!opengl 1.0@rasterpos
+
+# Sometimes fail when run along with other tests, never when run by themselves
+spec@!opengl 1.1@copypixels-sync
+spec@!opengl 1.1@copypixels-draw-sync
+spec@!opengl 1.1@draw-copypixels-sync
+spec@!opengl 1.1@draw-sync
+
+# flaky on wayland, was stable on x11
+spec@ext_image_dma_buf_import@ext_image_dma_buf_import
+
+# fails on arm64, passes on armhf
+spec@arb_depth_buffer_float@depthstencil-render-miplevels 1024 s=z24_s8_d=z32f
+
+# Sometimes goes into an infinite loop and times out
+spec@arb_depth_buffer_float@depthstencil-render-miplevels 146 s=z24_s8_d=z32f_s8
+
+spec@arb_depth_texture@depthstencil-render-miplevels 273 d=z24
+spec@arb_shader_texture_lod@execution@tex-miplevel-selection *lod 1d
+spec@arb_occlusion_query2@render
+
+# Updated by ci-collate, found in this job run: https://gitlab.freedesktop.org/mesa/mesa/-/jobs/56164970
+glx@glx-multithread-clearbuffer
+
+spec@arb_vertex_buffer_object@vbo-subdata-many drawarrays
+spec@arb_vertex_buffer_object@vbo-subdata-many drawelements
+spec@arb_vertex_buffer_object@vbo-subdata-many drawrangeelements
+
+# Nightly run expectations update
+spec@glsl-1.20@execution@variable-indexing@fs-uniform-mat2-rd
+
diff --git a/src/broadcom/ci/deqp-vc4-rpi3-skips.txt b/src/broadcom/ci/broadcom-rpi3-skips.txt
index 62d4d939d2d..6da79a463a7 100644
--- a/src/broadcom/ci/deqp-vc4-rpi3-skips.txt
+++ b/src/broadcom/ci/broadcom-rpi3-skips.txt
@@ -5,10 +5,6 @@
# This is causing a binning memory overflow problem
dEQP-GLES2.functional.fragment_ops.scissor.outside_render_line

-# These are very slow
-dEQP-GLES2.functional.uniform_api.random.3
-dEQP-GLES2.functional.uniform_api.random.79
-
# Conformance issue: VC4 needs dynamic loops in the VS to cause a
# shader link failure.
#
@@ -20,6 +16,21 @@ dEQP-GLES2.functional.uniform_api.random.79
# list for tracking.
dEQP-GLES2.functional.shaders.loops.*dynamic.*vertex

-# Timeout tests (> 1 minute to run)
-KHR-GLES2.texture_3d.filtering.sizes.3x7x5_linear_mipmap_linear
-KHR-GLES2.texture_3d.filtering.sizes.4x8x8_linear_mipmap_linear
+# Slow tests (> 1 minute to run)
+spec@!opengl 1.1@streaming-texture-leak
+
+# Versions / Extensions not supported
+spec@!opengl 3.*
+spec@!opengl 4.*
+spec@!opengl es 3.*
+spec@arb_gpu_shader5.*
+spec@arb_gpu_shader_fp64.*
+spec@arb_gpu_shader_int64.*
+spec@arb_tessellation_shader.*
+spec@arb_texture_cube_map.*
+spec@glsl-1.30.*
+spec@glsl-1.40.*
+spec@glsl-1.50.*
+spec@glsl-3.*
+spec@glsl-4.*
+spec@glsl-es-3.*
diff --git a/src/broadcom/ci/broadcom-rpi4-fails.txt b/src/broadcom/ci/broadcom-rpi4-fails.txt
new file mode 100644
index 00000000000..bac3d618634
--- /dev/null
+++ b/src/broadcom/ci/broadcom-rpi4-fails.txt
@@ -0,0 +1,602 @@
+glx@glx-make-current,Fail
+glx@glx-multi-window-single-context,Fail
+glx@glx-swap-pixmap-bad,Fail
+glx@glx-visuals-depth -pixmap,Fail
+glx@glx-visuals-stencil -pixmap,Fail
+glx@glx_arb_create_context_es2_profile@invalid opengl es version,Fail
+glx@glx_arb_create_context_no_error@no error,Fail
+shaders@glsl-bug-110796,Fail
+shaders@point-vertex-id divisor,Fail
+shaders@point-vertex-id gl_instanceid divisor,Fail
+shaders@point-vertex-id gl_instanceid,Fail
+shaders@point-vertex-id gl_vertexid divisor,Fail
+shaders@point-vertex-id gl_vertexid gl_instanceid divisor,Fail
+shaders@point-vertex-id gl_vertexid gl_instanceid,Fail
+shaders@point-vertex-id gl_vertexid,Fail
+spec@!opengl 1.0@gl-1.0-edgeflag,Fail
+spec@!opengl 1.0@gl-1.0-edgeflag-quads,Fail
+spec@!opengl 1.0@gl-1.0-no-op-paths,Fail
+spec@!opengl 1.0@gl-1.0-user-clip-all-planes,Fail
+spec@!opengl 1.1@point-line-no-cull,Fail
+spec@!opengl 1.1@teximage-colors gl_alpha16@Exact upload-download of GL_ALPHA16,Fail
+spec@!opengl 1.1@texwrap formats bordercolor,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_INTENSITY12- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_INTENSITY16- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE12- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE12_ALPHA12- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE12_ALPHA4- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE16- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE16_ALPHA16- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_RGB12- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_RGB16- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_RGBA12- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_RGBA16- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_INTENSITY12- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_INTENSITY16- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE12- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE12_ALPHA12- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE12_ALPHA4- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE16- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE16_ALPHA16- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_RGB12- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_RGB16- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_RGBA12- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_RGBA16- border color only,Fail
+spec@!opengl 1.4@gl-1.4-polygon-offset,Fail
+spec@!opengl 2.0@gl-2.0-edgeflag,Fail
+spec@!opengl 2.0@gl-2.0-edgeflag-immediate,Fail
+spec@arb_color_buffer_float@gl_rgba32f-render,Fail
+spec@arb_color_buffer_float@gl_rgba32f-render-fog,Fail
+spec@arb_color_buffer_float@gl_rgba32f-render-sanity,Fail
+spec@arb_color_buffer_float@gl_rgba32f-render-sanity-fog,Fail
+spec@arb_copy_image@arb_copy_image-formats,Fail
+spec@arb_copy_image@arb_copy_image-formats@Source: GL_ALPHA16/Destination: GL_ALPHA16,Fail
+spec@arb_depth_buffer_float@fbo-generatemipmap-formats,Fail
+spec@arb_depth_buffer_float@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT32F NPOT,Fail
+spec@arb_depth_buffer_float@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT32F,Fail
+spec@arb_depth_buffer_float@texwrap formats bordercolor,Fail
+spec@arb_depth_buffer_float@texwrap formats bordercolor-swizzled,Fail
+spec@arb_depth_buffer_float@texwrap formats bordercolor-swizzled@GL_DEPTH32F_STENCIL8- swizzled- border color only,Fail
+spec@arb_depth_buffer_float@texwrap formats bordercolor-swizzled@GL_DEPTH_COMPONENT32F- swizzled- border color only,Fail
+spec@arb_depth_buffer_float@texwrap formats bordercolor@GL_DEPTH32F_STENCIL8- border color only,Fail
+spec@arb_depth_buffer_float@texwrap formats bordercolor@GL_DEPTH_COMPONENT32F- border color only,Fail
+spec@arb_depth_buffer_float@texwrap formats offset,Fail
+spec@arb_depth_buffer_float@texwrap formats offset@GL_DEPTH32F_STENCIL8- NPOT,Fail
+spec@arb_depth_buffer_float@texwrap formats offset@GL_DEPTH_COMPONENT32F- NPOT,Fail
+spec@arb_depth_buffer_float@texwrap formats,Fail
+spec@arb_depth_buffer_float@texwrap formats@GL_DEPTH32F_STENCIL8- NPOT,Fail
+spec@arb_depth_buffer_float@texwrap formats@GL_DEPTH_COMPONENT32F- NPOT,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT NPOT,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT16 NPOT,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT16,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT24 NPOT,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT24,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT32 NPOT,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT32,Fail
+spec@arb_depth_texture@texwrap formats bordercolor,Fail
+spec@arb_depth_texture@texwrap formats bordercolor-swizzled,Fail
+spec@arb_depth_texture@texwrap formats bordercolor-swizzled@GL_DEPTH_COMPONENT16- swizzled- border color only,Fail
+spec@arb_depth_texture@texwrap formats bordercolor-swizzled@GL_DEPTH_COMPONENT24- swizzled- border color only,Fail
+spec@arb_depth_texture@texwrap formats bordercolor-swizzled@GL_DEPTH_COMPONENT32- swizzled- border color only,Fail
+spec@arb_depth_texture@texwrap formats bordercolor@GL_DEPTH_COMPONENT16- border color only,Fail
+spec@arb_depth_texture@texwrap formats bordercolor@GL_DEPTH_COMPONENT24- border color only,Fail
+spec@arb_depth_texture@texwrap formats bordercolor@GL_DEPTH_COMPONENT32- border color only,Fail
+spec@arb_depth_texture@texwrap formats offset,Fail
+spec@arb_depth_texture@texwrap formats offset@GL_DEPTH_COMPONENT16- NPOT,Fail
+spec@arb_depth_texture@texwrap formats offset@GL_DEPTH_COMPONENT24- NPOT,Fail
+spec@arb_depth_texture@texwrap formats offset@GL_DEPTH_COMPONENT32- NPOT,Fail
+spec@arb_depth_texture@texwrap formats,Fail
+spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT16- NPOT,Fail
+spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT24- NPOT,Fail
+spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT32- NPOT,Fail
+spec@arb_direct_state_access@gettextureimage-formats init-by-rendering,Fail
+spec@arb_direct_state_access@gettextureimage-formats,Fail
+spec@arb_framebuffer_object@fbo-blit-scaled-linear,Fail
+spec@arb_point_sprite@arb_point_sprite-checkerboard,Fail
+spec@arb_point_sprite@arb_point_sprite-mipmap,Fail
+spec@arb_shader_storage_buffer_object@compiler@atomicmin-swizzle.vert,Fail
+spec@arb_shader_texture_lod@execution@arb_shader_texture_lod-texgrad,Fail
+spec@arb_shader_texture_lod@execution@arb_shader_texture_lod-texgradcube,Fail
+spec@arb_texture_buffer_object@formats (fs- arb),Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_ALPHA16F_ARB,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_ALPHA32F_ARB,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_INTENSITY16F_ARB,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_INTENSITY16I_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_INTENSITY16UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_INTENSITY32F_ARB,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_INTENSITY32I_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_INTENSITY32UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_INTENSITY8I_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_INTENSITY8UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE16F_ARB,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE16I_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE16UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE32F_ARB,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE32I_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE32UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE8I_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE8UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE_ALPHA16F_ARB,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE_ALPHA16I_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE_ALPHA16UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE_ALPHA32F_ARB,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE_ALPHA32I_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE_ALPHA32UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE_ALPHA8I_EXT,Fail
+spec@arb_texture_buffer_object@formats (fs- arb)@GL_LUMINANCE_ALPHA8UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb),Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_ALPHA16F_ARB,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_ALPHA32F_ARB,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_INTENSITY16F_ARB,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_INTENSITY16I_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_INTENSITY16UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_INTENSITY32F_ARB,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_INTENSITY32I_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_INTENSITY32UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_INTENSITY8I_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_INTENSITY8UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE16F_ARB,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE16I_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE16UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE32F_ARB,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE32I_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE32UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE8I_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE8UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE_ALPHA16F_ARB,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE_ALPHA16I_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE_ALPHA16UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE_ALPHA32F_ARB,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE_ALPHA32I_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE_ALPHA32UI_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE_ALPHA8I_EXT,Fail
+spec@arb_texture_buffer_object@formats (vs- arb)@GL_LUMINANCE_ALPHA8UI_EXT,Fail
+spec@arb_texture_buffer_object@texture-buffer-size-clamp,Fail
+spec@arb_texture_buffer_object@texture-buffer-size-clamp@r8ui_texture_buffer_size_via_sampler,Fail
+spec@arb_texture_buffer_object@texture-buffer-size-clamp@rg8ui_texture_buffer_size_via_sampler,Fail
+spec@arb_texture_buffer_object@texture-buffer-size-clamp@rgba8ui_texture_buffer_size_via_sampler,Fail +spec@arb_texture_float@fbo-blending-formats,Fail +spec@arb_texture_float@fbo-blending-formats@GL_ALPHA32F_ARB,Fail +spec@arb_texture_float@fbo-blending-formats@GL_INTENSITY32F_ARB,Fail +spec@arb_texture_float@fbo-blending-formats@GL_LUMINANCE32F_ARB,Fail +spec@arb_texture_float@fbo-blending-formats@GL_LUMINANCE_ALPHA32F_ARB,Fail +spec@arb_texture_float@fbo-blending-formats@GL_RGB32F,Fail +spec@arb_texture_float@fbo-blending-formats@GL_RGBA32F,Fail +spec@arb_texture_float@texwrap formats bordercolor,Fail +spec@arb_texture_float@texwrap formats bordercolor-swizzled,Fail +spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_ALPHA32F_ARB- swizzled- border color only,Fail +spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_INTENSITY32F_ARB- swizzled- border color only,Fail +spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_LUMINANCE32F_ARB- swizzled- border color only,Fail +spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_LUMINANCE_ALPHA32F_ARB- swizzled- border color only,Fail +spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_RGB32F- swizzled- border color only,Fail +spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_RGBA32F- swizzled- border color only,Fail +spec@arb_texture_float@texwrap formats bordercolor@GL_ALPHA32F_ARB- border color only,Fail +spec@arb_texture_float@texwrap formats bordercolor@GL_INTENSITY32F_ARB- border color only,Fail +spec@arb_texture_float@texwrap formats bordercolor@GL_LUMINANCE32F_ARB- border color only,Fail +spec@arb_texture_float@texwrap formats bordercolor@GL_LUMINANCE_ALPHA32F_ARB- border color only,Fail +spec@arb_texture_float@texwrap formats bordercolor@GL_RGB32F- border color only,Fail +spec@arb_texture_float@texwrap formats bordercolor@GL_RGBA32F- border color only,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch.*,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_R16I,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_R16_SNORM,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_R32I,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_R8I,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_R8_SNORM,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RG16F,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RG16I,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RG16_SNORM,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RG32F,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RG32I,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RG8I,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RG8_SNORM,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGB10,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGB10_A2,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGB10_A2UI,Fail 
+spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGB16I,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGB16_SNORM,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGB32I,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGB4,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGB8,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGB8I,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGB9_E5,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGBA16,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGBA16F,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGBA16I,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGBA32F,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGBA32I,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGBA4,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGBA8,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_RGBA8I,Fail +spec@arb_texture_multisample@arb_texture_multisample-dsa-texelfetch@Texture type: GL_SRGB8_ALPHA8,Fail +spec@arb_texture_rectangle@1-1-linear-texture,Fail +spec@arb_texture_rg@fbo-blending-formats-float,Fail +spec@arb_texture_rg@fbo-blending-formats-float@GL_R32F,Fail +spec@arb_texture_rg@fbo-blending-formats-float@GL_RG32F,Fail +spec@arb_texture_rg@texwrap formats bordercolor,Fail +spec@arb_texture_rg@texwrap formats bordercolor-swizzled,Fail +spec@arb_texture_rg@texwrap formats bordercolor-swizzled@GL_R16- swizzled- border color only,Fail +spec@arb_texture_rg@texwrap formats bordercolor-swizzled@GL_RG16- swizzled- border color only,Fail +spec@arb_texture_rg@texwrap formats bordercolor@GL_R16- border color only,Fail +spec@arb_texture_rg@texwrap formats bordercolor@GL_RG16- border color only,Fail +spec@arb_texture_rg@texwrap formats-float bordercolor,Fail +spec@arb_texture_rg@texwrap formats-float bordercolor-swizzled,Fail +spec@arb_texture_rg@texwrap formats-float bordercolor-swizzled@GL_R32F- swizzled- border color only,Fail +spec@arb_texture_rg@texwrap formats-float bordercolor-swizzled@GL_RG32F- swizzled- border color only,Fail +spec@arb_texture_rg@texwrap formats-float bordercolor@GL_R32F- border color only,Fail +spec@arb_texture_rg@texwrap formats-float bordercolor@GL_RG32F- border color only,Fail +spec@arb_texture_rg@texwrap formats-float offset,Fail +spec@arb_texture_rg@texwrap formats-float offset@GL_R32F- NPOT,Fail +spec@arb_texture_rg@texwrap formats-float offset@GL_RG32F- NPOT,Fail +spec@arb_texture_rg@texwrap formats-float,Fail +spec@arb_texture_rg@texwrap formats-float@GL_R32F- NPOT,Fail +spec@arb_texture_rg@texwrap formats-float@GL_RG32F- NPOT,Fail +spec@arb_texture_storage@texture-storage@cube array texture,Fail +spec@egl 1.4@eglterminate then unbind context,Fail +spec@egl_khr_gl_image@egl_khr_gl_renderbuffer_image-clear-shared-image gl_depth_component24,Fail +spec@egl_khr_gl_image@egl_khr_gl_renderbuffer_image-clear-shared-image gl_rgba,Fail +spec@egl_khr_surfaceless_context@viewport,Fail 
+spec@egl_mesa_configless_context@basic,Fail +spec@ext_framebuffer_multisample@blit-mismatched-formats,Fail +spec@ext_framebuffer_multisample@interpolation 2 centroid-edges,Fail +spec@ext_framebuffer_multisample@interpolation 4 centroid-edges,Fail +spec@ext_framebuffer_object@getteximage-formats init-by-clear-and-render,Fail +spec@ext_framebuffer_object@getteximage-formats init-by-rendering,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-export,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-modifiers,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-modifiers@autogen-R16-DRM_FORMAT_MOD_LINEAR-clear_reimport,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-refcount,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_argb8888,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_nv12,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_nv21,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_p010,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_p012,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_p016,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_uyvy,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_vyuy,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_xrgb8888,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y210,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y212,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y216,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y412,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y416,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_yuv420,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_yuyv,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_yvu420,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_yvyu,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-transcode-nv12-as-r8-gr88,Fail +spec@ext_packed_depth_stencil@texwrap formats bordercolor,Fail +spec@ext_packed_depth_stencil@texwrap formats bordercolor-swizzled,Fail +spec@ext_packed_depth_stencil@texwrap formats bordercolor-swizzled@GL_DEPTH24_STENCIL8- swizzled- border color only,Fail +spec@ext_packed_depth_stencil@texwrap formats bordercolor@GL_DEPTH24_STENCIL8- border color only,Fail +spec@ext_packed_depth_stencil@texwrap formats offset,Fail +spec@ext_packed_depth_stencil@texwrap formats offset@GL_DEPTH24_STENCIL8- NPOT,Fail +spec@ext_packed_depth_stencil@texwrap formats,Fail +spec@ext_packed_depth_stencil@texwrap formats@GL_DEPTH24_STENCIL8- NPOT,Fail +spec@ext_packed_float@query-rgba-signed-components,Fail +spec@ext_texture_integer@fbo-blending,Fail +spec@ext_texture_integer@getteximage-clamping gl_arb_texture_rg,Fail +spec@ext_texture_integer@getteximage-clamping,Fail +spec@ext_texture_integer@multisample-formats 2 gl_ext_texture_integer,Fail +spec@ext_texture_integer@multisample-formats 4 gl_ext_texture_integer,Fail +spec@ext_texture_integer@texwrap formats bordercolor,Fail +spec@ext_texture_integer@texwrap formats bordercolor-swizzled,Fail +spec@ext_texture_integer@texwrap formats bordercolor-swizzled@GL_ALPHA16I_EXT- swizzled- border color only,Fail +spec@ext_texture_integer@texwrap formats bordercolor-swizzled@GL_ALPHA16UI_EXT- swizzled- border color only,Fail +spec@ext_texture_integer@texwrap formats 
bordercolor-swizzled@GL_ALPHA32I_EXT- swizzled- border color only,Fail +spec@ext_texture_integer@texwrap formats bordercolor-swizzled@GL_ALPHA32UI_EXT- swizzled- border color only,Fail +spec@ext_texture_integer@texwrap formats bordercolor-swizzled@GL_ALPHA8I_EXT- swizzled- border color only,Fail +spec@ext_texture_integer@texwrap formats bordercolor-swizzled@GL_ALPHA8UI_EXT- swizzled- border color only,Fail +spec@ext_texture_integer@texwrap formats bordercolor@GL_ALPHA16I_EXT- border color only,Fail +spec@ext_texture_integer@texwrap formats bordercolor@GL_ALPHA16UI_EXT- border color only,Fail +spec@ext_texture_integer@texwrap formats bordercolor@GL_ALPHA32I_EXT- border color only,Fail +spec@ext_texture_integer@texwrap formats bordercolor@GL_ALPHA32UI_EXT- border color only,Fail +spec@ext_texture_integer@texwrap formats bordercolor@GL_ALPHA8I_EXT- border color only,Fail +spec@ext_texture_integer@texwrap formats bordercolor@GL_ALPHA8UI_EXT- border color only,Fail +spec@ext_texture_integer@texwrap formats offset,Fail +spec@ext_texture_integer@texwrap formats offset@GL_ALPHA8I_EXT,Fail +spec@ext_texture_integer@texwrap formats offset@GL_ALPHA8I_EXT- NPOT,Fail +spec@ext_texture_integer@texwrap formats offset@GL_ALPHA8I_EXT- swizzled,Fail +spec@ext_texture_integer@texwrap formats,Fail +spec@ext_texture_integer@texwrap formats@GL_ALPHA8I_EXT,Fail +spec@ext_texture_integer@texwrap formats@GL_ALPHA8I_EXT- NPOT,Fail +spec@ext_texture_integer@texwrap formats@GL_ALPHA8I_EXT- swizzled,Fail +spec@ext_texture_lod_bias@lodbias,Fail +spec@ext_texture_snorm@texwrap formats bordercolor,Fail +spec@ext_texture_snorm@texwrap formats bordercolor-swizzled,Fail +spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_ALPHA16_SNORM- swizzled- border color only,Fail +spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_INTENSITY16_SNORM- swizzled- border color only,Fail +spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_LUMINANCE16_ALPHA16_SNORM- swizzled- border color only,Fail +spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_LUMINANCE16_SNORM- swizzled- border color only,Fail +spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_R16_SNORM- swizzled- border color only,Fail +spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_RG16_SNORM- swizzled- border color only,Fail +spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_RGB16_SNORM- swizzled- border color only,Fail +spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_RGBA16_SNORM- swizzled- border color only,Fail +spec@ext_texture_snorm@texwrap formats bordercolor@GL_ALPHA16_SNORM- border color only,Fail +spec@ext_texture_snorm@texwrap formats bordercolor@GL_INTENSITY16_SNORM- border color only,Fail +spec@ext_texture_snorm@texwrap formats bordercolor@GL_LUMINANCE16_ALPHA16_SNORM- border color only,Fail +spec@ext_texture_snorm@texwrap formats bordercolor@GL_LUMINANCE16_SNORM- border color only,Fail +spec@ext_texture_snorm@texwrap formats bordercolor@GL_R16_SNORM- border color only,Fail +spec@ext_texture_snorm@texwrap formats bordercolor@GL_RG16_SNORM- border color only,Fail +spec@ext_texture_snorm@texwrap formats bordercolor@GL_RGB16_SNORM- border color only,Fail +spec@ext_texture_snorm@texwrap formats bordercolor@GL_RGBA16_SNORM- border color only,Fail +spec@ext_transform_feedback@tessellation line_loop flat_first,Fail +spec@ext_transform_feedback@tessellation line_loop flat_last,Fail +spec@ext_transform_feedback@tessellation line_loop monochrome,Fail 
+spec@ext_transform_feedback@tessellation line_loop smooth,Fail +spec@ext_transform_feedback@tessellation triangle_fan flat_first,Fail +spec@ext_transform_feedback@tessellation triangle_strip flat_first,Fail +spec@glsl-1.10@execution@glsl-fs-inline-explosion,Crash +spec@glsl-1.10@execution@glsl-vs-inline-explosion,Crash +spec@glsl-1.20@compiler@invalid-vec4-array-to-vec3-array-conversion.vert,Fail +spec@glsl-1.20@execution@clipping@vs-clip-vertex-primitives,Fail +spec@glsl-1.20@execution@fs-underflow-mul-compare-zero,Fail +spec@khr_texture_compression_astc@miptree-gles srgb-fp,Fail +spec@khr_texture_compression_astc@miptree-gles srgb-fp@sRGB decode full precision,Fail +spec@khr_texture_compression_astc@sliced-3d-miptree-gles srgb-fp,Fail +spec@khr_texture_compression_astc@sliced-3d-miptree-gles srgb-fp@sRGB decode full precision,Fail +spec@nv_copy_image@nv_copy_image-formats,Fail +spec@nv_copy_image@nv_copy_image-formats@Source: GL_ALPHA16/Destination: GL_ALPHA16,Fail +spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-2d-array.frag,Fail +spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-2d-array.vert,Fail +spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-2d.frag,Fail +spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-2d.vert,Fail +spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-3d.frag,Fail +spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-3d.vert,Fail +spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-cube.frag,Fail +spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-cube.vert,Fail +spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-2d-array.frag,Fail +spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-2d-array.vert,Fail +spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-2d.frag,Fail +spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-2d.vert,Fail +spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-3d.frag,Fail +spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-3d.vert,Fail +spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-cube.frag,Fail +spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-cube.vert,Fail +spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-2d-array.frag,Fail +spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-2d-array.vert,Fail +spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-2d.frag,Fail +spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-2d.vert,Fail +spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-3d.frag,Fail +spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-3d.vert,Fail +spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-cube.frag,Fail +spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-cube.vert,Fail +spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-2d-array.frag,Fail +spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-2d-array.vert,Fail +spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-2d.frag,Fail +spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-2d.vert,Fail +spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-3d.frag,Fail +spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-3d.vert,Fail +spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-cube.frag,Fail +spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-cube.vert,Fail 
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-2d-array.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-2d-array.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-2d.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-2d.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-3d.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-3d.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-cube.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-cube.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-2d-array.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-2d-array.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-2d.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-2d.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-3d.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-3d.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-cube.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-cube.vert,Fail
+spec@nv_read_depth@read_depth_gles3,Fail
+spec@oes_point_sprite@arb_point_sprite-checkerboard_gles1,Fail
+spec@oes_shader_io_blocks@compiler@layout-location-aliasing.vert,Fail
+
+# This crashes only when LLVM is not enabled. This is because the Gallium
+# backend uses a TGSI path for some tasks that does not contain a sampler;
+# when LLVM is enabled, it uses LLVM instead, which is complete.
+spec@!opengl 1.0@rasterpos,Crash
+
+# https://gitlab.freedesktop.org/mesa/piglit/-/merge_requests/899
+spec@!opengl 1.0@depth-clear-precision-check,Fail
+
+# There are two problems here. First, the hardware does not support
+# different polygon modes for the front and back faces. By default we
+# choose the mode set for the front face, unless we are culling it; in
+# that case we choose the mode set for the back face. Second, we do not
+# support rendering quads, so Gallium decomposes them into triangles.
+# The drawback is that when the polygon mode is set to lines, we render
+# an extra edge, as the sketch below illustrates.
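+#
+# For illustration (an assumed reproduction, not code from the tests): a
+# quad drawn with
+#
+#   glPolygonMode(GL_FRONT_AND_BACK, GL_LINE);
+#   glBegin(GL_QUADS); /* vertices A, B, C, D */ glEnd();
+#
+# reaches the hardware as the triangles ABC and ACD, so the shared
+# diagonal A-C is rasterized as an extra line.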
+spec@!opengl 1.1@polygon-mode,Fail
+spec@!opengl 1.1@polygon-mode-facing,Fail
+spec@!opengl 1.1@polygon-mode-offset,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on bottom edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on left edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on right edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on top edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 1: Expected blue pixel in center,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 1: Expected white pixel on right edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 1: Expected white pixel on top edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 2: Expected blue pixel in center,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 2: Expected white pixel on right edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 2: Expected white pixel on top edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on bottom edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on left edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on right edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on top edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on bottom edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on left edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on right edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on top edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 5: Expected blue pixel in center,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 5: Expected white pixel on right edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 5: Expected white pixel on top edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 6: Expected blue pixel in center,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 6: Expected white pixel on right edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 6: Expected white pixel on top edge,Fail
+
+# V3D does not support PIPE_FORMAT_{R16,R16G16,R16G16B16A16}_UNORM for
+# rendering
+spec@!opengl 3.0@required-texture-attachment-formats,Fail
+spec@!opengl 3.1@required-texture-attachment-formats,Fail
+spec@arb_texture_view@rendering-formats,Crash
+
+# V3D does not support blending for GL_R{GBA}32F
+spec@!opengl 1.1@getteximage-formats,Fail
+
+# OpenGL 3.x requires 8 render targets (MAX_DRAW_BUFFERS) / color attachments (MAX_COLOR_ATTACHMENTS)
+spec@!opengl 3.0@bindfragdata-link-error,Fail
+spec@!opengl 3.0@bindfragdata-nonexistent-variable,Fail
+spec@!opengl 3.0@clearbuffer-mixed-format,Fail
+spec@!opengl 3.0@getfragdatalocation,Fail
+spec@!opengl 3.0@minmax,Fail
+spec@!opengl 3.1@minmax,Fail
+spec@glsl-1.30@built-in constants,Fail
+spec@glsl-1.30@built-in constants@gl_MaxDrawBuffers,Fail
+spec@glsl-1.40@built-in constants,Fail
+spec@glsl-1.40@built-in constants@gl_MaxDrawBuffers,Fail
+
+# OpenGL 3.x applies non-seamless cubemap texturing, while our
+# driver/GLES uses seamless cubemap texturing.
+spec@!opengl 3.0@sampler-cube-shadow,Fail
+spec@arb_texture_cube_map_array@arb_texture_cube_map_array-sampler-cube-array-shadow,Fail
+
+# Precision differences between the expected and obtained results; these
+# pass when exporting V3D_DEBUG=tmu32, as in the example below.
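+# For example (a hypothetical piglit invocation; only the V3D_DEBUG=tmu32
+# knob comes from the note above):
+#   V3D_DEBUG=tmu32 piglit run quick results/ -t oes_texture_view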
+spec@oes_texture_view@rendering-formats,Fail
+spec@oes_texture_view@rendering-formats@clear GL_R8 as GL_R8I,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RG8 as GL_R16F,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RG8 as GL_R16I,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RG8 as GL_RG8I,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RGBA8 as GL_R32F,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RGBA8 as GL_R32I,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RGBA8 as GL_RG16F,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RGBA8 as GL_RG16I,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RGBA8 as GL_RGBA8I,Fail
+
+# Also related to precision issues
+spec@oes_texture_view@rendering-formats@clear GL_RGB10_A2 as GL_R32F,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RGB10_A2 as GL_R32I,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RGB10_A2 as GL_RG16F,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RGB10_A2 as GL_RG16I,Fail
+spec@oes_texture_view@rendering-formats@clear GL_RGB10_A2 as GL_RGBA8I,Fail
+
+spec@!opengl 1.0@depth-clear-precision-check@depth16,Fail
+spec@!opengl 1.0@depth-clear-precision-check@depth24,Fail
+
+# This fails the GL_ALPHA16 subtest because we don't support a 16-bit unorm format for rendering,
+# so Gallium falls back to an 8-bit unorm format and we lose some precision in the result.
+spec@arb_clear_texture@arb_clear_texture-sized-formats,Fail
+
+# These fail because the shaders use indirect indexing on samplers, which we
+# don't support (the GLSL linker fails to link the shaders because of this).
+# If loop unrolling kicks in for these tests, it removes the indirect indexing
+# and the tests pass, but that would just work around an issue in the
+# tests.
+spec@!opengl 2.0@max-samplers,Fail
+spec@!opengl 2.0@max-samplers border,Fail
+
+# The hardware does not support line/polygon stipple. This feature was
+# deprecated and removed in newer OpenGL spec versions; it could be
+# emulated using shaders, as sketched below.
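+# A minimal sketch of that idea (hypothetical, not what the driver does):
+#   /* fragment shader: 'pattern' and 'factor' mirror the
+#      glLineStipple(factor, pattern) state; 'stipple_pos' is an assumed
+#      counter of fragments along the line */
+#   if ((pattern & (1 << ((stipple_pos / factor) & 15))) == 0)
+#       discard;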
+spec@!opengl 1.1@line-smooth-stipple,Fail
+spec@!opengl 1.1@linestipple,Fail
+spec@!opengl 1.1@linestipple@Factor 2x,Fail
+spec@!opengl 1.1@linestipple@Factor 3x,Fail
+spec@!opengl 1.1@linestipple@Line loop,Fail
+spec@!opengl 1.1@linestipple@Line strip,Fail
+spec@!opengl 1.1@linestipple@Restarting lines within a single Begin-End block,Fail
+spec@!opengl 2.1@pbo,Fail
+spec@!opengl 2.1@pbo@test_polygon_stip,Fail
+spec@!opengl 2.1@polygon-stipple-fs,Fail
+
+# Works when run individually, but fails consistently in the CI
+dEQP-GLES3.functional.texture.specification.teximage2d_pbo.rgba32i_cube,Fail
+
+# https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/4422
+KHR-GL31.texture_size_promotion.functional,Fail
+
+# uprev Piglit in Mesa
+spec@glsl-1.40@uniform_buffer@two-stages,Fail
+
+# The RPi4 only supports 4 RTs, so these tests, which use 8 RTs, will fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 128 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 128 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 128 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 128 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 128 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 128 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 64 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 64 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 64 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 64 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 64 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 64 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 8 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 8 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 8 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 8 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 8 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 1 8 8 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 128 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 128 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 128 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 128 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 128 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 128 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 64 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 64 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 64 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 64 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 64 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 64 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 8 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 8 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 8 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 8 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 8 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 32 42 8 8 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 128 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 128 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 128 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 128 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 128 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 128 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 64 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 64 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 64 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 64 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 64 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 64 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 8 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 8 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 8 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 8 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 8 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 1 8 8 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 128 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 128 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 128 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 128 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 128 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 128 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 64 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 64 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 64 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 64 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 64 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 64 8,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 8 1,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 8 2,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 8 3,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 8 4,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 8 7,Fail
+spec@arb_texture_barrier@arb_texture_barrier-blending-in-shader 512 42 8 8 8,Fail
+
+# This seems to be a Vulkan Loader issue. It can be fixed by compiling the loader from the GitHub repo.
+dEQP-VK.api.get_device_proc_addr.non_enabled,Fail +# This is a bug in CTS: https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/5096 +dEQP-VK.api.command_buffers.many_indirect_draws_on_secondary,Fail + +# New CTS failures in 1.3.8.2 +dEQP-VK.api.info.vulkan1p2_limits_validation.khr_vertex_attribute_divisor,Fail diff --git a/src/broadcom/ci/broadcom-rpi4-flakes.txt b/src/broadcom/ci/broadcom-rpi4-flakes.txt new file mode 100644 index 00000000000..c1a2cd94b04 --- /dev/null +++ b/src/broadcom/ci/broadcom-rpi4-flakes.txt @@ -0,0 +1,48 @@ +KHR-GLES31.core.shader_image_load_store.basic-glsl-earlyFragTests +dEQP-GLES31.functional.ssbo.layout.instance_array_basic_type.std430.ivec4 + +glx@glx_arb_sync_control@waitformsc +spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=2 +spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=4 +spec@!opengl 1.1@depthstencil-default_fb-drawpixels-32f_24_8_rev samples=2 +spec@!opengl 1.1@depthstencil-default_fb-drawpixels-32f_24_8_rev samples=4 +spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=2 +spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=4 +spec@!opengl 1.1@masked-clear +spec@arb_occlusion_query@occlusion_query_order +spec@arb_texture_multisample@large-float-texture +spec@egl_chromium_sync_control@conformance +spec@ext_packed_depth_stencil@depthstencil-render-miplevels 585 ds=z24_s8 + +# Seen this one flake a few times already +spec@egl 1.4@largest possible eglcreatepbuffersurface and then glclear + +# This test works alone, but fails when executing all the tests together +# https://gitlab.freedesktop.org/mesa/mesa/-/issues/8684 +dEQP-GLES3.functional.texture.specification.teximage2d_pbo.rgba32f_cube +dEQP-GLES3.functional.texture.specification.teximage2d_pbo.rgba32i_cube + +# Seem reliable on arm64, but they flake on armhf +dEQP-VK.glsl.builtin.function.integer.findMSB.ivec2_mediump_geometry +dEQP-VK.glsl.builtin.function.integer.findMSB.ivec2_highp_geometry + +# Failed twice one day with two different bad renders, and never since: +# https://gitlab.freedesktop.org/eric/mesa/-/jobs/37556931 +# https://gitlab.freedesktop.org/eric/mesa/-/jobs/37596148 +dEQP-VK.renderpass2.suballocation.load_store_op_none.depthstencil_d24_unorm_s8_uint_load_op_depth_load_stencil_none_store_op_depth_store_stencil_none_stencil_write_off + +# first encounter 01/04/2023 +spec@ext_framebuffer_blit@fbo-sys-blit +spec@ext_framebuffer_blit@fbo-sys-sub-blit + +dEQP-VK.fragment_operations.occlusion_query.precise_test_scissors_depth_write_stencil_clear_stencil_write +dEQP-VK.memory.pipeline_barrier.transfer_dst_uniform_texel_buffer.1024 +dEQP-VK.memory.pipeline_barrier.transfer_dst_uniform_texel_buffer.1048576 +dEQP-VK.memory_model.message_passing.ext.u32.noncoherent.atomic_atomic.atomicrmw.device.payload_local.image.guard_local.image.frag +dEQP-VK.memory_model.message_passing.ext.u32.noncoherent.fence_fence.atomicwrite.device.payload_local.image.guard_local.buffer.frag +dEQP-VK.pipeline.monolithic.image.suballocation.sampling_type.combined.view_type.1d_array.format.r8_unorm.count_1.size.443x1_array_of_6 +dEQP-VK.renderpass.suballocation.load_store_op_none.depthstencil_d24_unorm_s8_uint_load_op_depth_load_stencil_none_store_op_depth_store_stencil_none_stencil_write_off +dEQP-VK.synchronization.basic.timeline_semaphore.one_queue +dEQP-VK.synchronization2.basic.timeline_semaphore.one_queue 
+dEQP-VK.synchronization2.signal_order.shared_binary_semaphore.write_ssbo_compute_indirect_read_ssbo_geometry.buffer_262144_opaque_fd +dEQP-VK.texture.shadow.cube.linear.less_d24_unorm_s8_uint diff --git a/src/broadcom/ci/broadcom-rpi4-skips.txt b/src/broadcom/ci/broadcom-rpi4-skips.txt new file mode 100644 index 00000000000..66d371eaae2 --- /dev/null +++ b/src/broadcom/ci/broadcom-rpi4-skips.txt @@ -0,0 +1,293 @@ +# Slow tests (> 1 minute to run) +spec@!opengl 1.1@streaming-texture-leak +spec@!opengl 1.2@tex3d-maxsize +spec@arb_texture_multisample@texelfetch fs sampler2dms 4 1x130-501x130 +spec@arb_texture_multisample@texelfetch fs sampler2dms 4 1x71-501x71 +spec@arb_texture_multisample@texelfetch fs sampler2dmsarray 4 98x1x9-98x129x9 +spec@glsl-1.30@execution@texelfetch fs sampler2d 1x281-501x281 + +# Versions / Extensions not supported +spec@!opengl 3.2@.* +spec@!opengl 3.3@.* +spec@!opengl 4.2@.* +spec@!opengl 4.3@.* +spec@!opengl 4.4@.* +spec@!opengl 4.5@.* +spec@arb_gpu_shader5.* +spec@arb_gpu_shader_fp64.* +spec@arb_gpu_shader_int64.* +spec@arb_tessellation_shader.* +spec@glsl-1.50.* +spec@glsl-3.* +spec@glsl-4.* +spec@glsl-es-3.20.* + +# Broadcom waivers +dEQP-VK.draw.renderpass.inverted_depth_ranges.nodepthclamp_deltazero +dEQP-VK.rasterization.depth_bias.d32_sfloat + +# Kernel blocks (probably GMP violations) +spec@arb_shading_language_420pack@active sampler conflict +spec@arb_texture_buffer_object@render-no-bo + +# Slow tests (> 1 minute to run) +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.multi.std140.comp +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.multi.std140.frag +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.multi.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.single.std140.comp +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.single.std140.comp_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.single.std140.frag +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.single.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.single.std140.vert_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.multi.std140.comp +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.multi.std140.frag +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.multi.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.single.std140.comp +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.single.std140.comp_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.single.std140.frag +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.single.std140.frag_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.single.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.single.std140.vert_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertuvec2.nostore.multi.std140.vert 
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertuvec2.nostore.single.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertuvec2.nostore.single.std140.vert_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertuvec2.store.multi.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertuvec2.store.single.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertuvec2.store.single.std140.vert_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.load.nostore.multi.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.load.nostore.single.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.load.nostore.single.std140.vert_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.load.store.multi.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.load.store.single.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.load.store.single.std140.vert_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.nostore.multi.std140.comp +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.nostore.multi.std140.frag +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.nostore.multi.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.nostore.single.std140.comp +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.nostore.single.std140.frag +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.nostore.single.std140.frag_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.nostore.single.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.nostore.single.std140.vert_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.store.multi.std140.comp +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.store.multi.std140.frag +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.store.multi.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.store.single.std140.comp +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.store.single.std140.frag +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.store.single.std140.frag_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.store.single.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.store.single.std140.vert_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertuvec2.nostore.multi.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertuvec2.nostore.single.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertuvec2.nostore.single.std140.vert_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertuvec2.store.multi.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertuvec2.store.single.std140.vert 
+dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertuvec2.store.single.std140.vert_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.load.nostore.multi.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.load.nostore.single.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.load.nostore.single.std140.vert_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.load.store.multi.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.load.store.single.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.load.store.single.std140.vert_offset_nonzero +dEQP-VK.graphicsfuzz.spv-stable-maze-flatten-copy-composite +dEQP-VK.memory.mapping.dedicated_alloc.buffer.full.variable.implicit_unmap +dEQP-VK.memory.mapping.dedicated_alloc.image.full.variable.implicit_unmap +dEQP-VK.memory.mapping.suballocation.full.variable.implicit_unmap +dEQP-VK.spirv_assembly.instruction.graphics.spirv_ids_abuse.lots_ids_geom +dEQP-VK.spirv_assembly.instruction.graphics.spirv_ids_abuse.lots_ids_vert +dEQP-VK.ssbo.layout.3_level_array.std140.column_major_mat4 +dEQP-VK.ssbo.layout.3_level_array.std140.column_major_mat4_comp_access +dEQP-VK.ssbo.layout.3_level_array.std140.column_major_mat4_comp_access_store_cols +dEQP-VK.ssbo.layout.3_level_array.std140.column_major_mat4_store_cols +dEQP-VK.ssbo.layout.3_level_array.std140.mat4 +dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4 +dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4_comp_access +dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4_comp_access_store_cols +dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4_store_cols +dEQP-VK.ssbo.layout.3_level_array.std430.column_major_mat4 +dEQP-VK.ssbo.layout.3_level_array.std430.column_major_mat4_comp_access +dEQP-VK.ssbo.layout.3_level_array.std430.column_major_mat4_comp_access_store_cols +dEQP-VK.ssbo.layout.3_level_array.std430.column_major_mat4_store_cols +dEQP-VK.ssbo.layout.3_level_array.std430.mat4 +dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4 +dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4_comp_access +dEQP-VK.ssbo.layout.3_level_unsized_array.std140.column_major_mat4 +dEQP-VK.ssbo.layout.3_level_unsized_array.std140.mat4 +dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4 +dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4_comp_access +dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4_store_cols +dEQP-VK.ssbo.layout.3_level_unsized_array.std430.column_major_mat4 +dEQP-VK.ssbo.layout.3_level_unsized_array.std430.column_major_mat4_comp_access +dEQP-VK.ssbo.layout.3_level_unsized_array.std430.column_major_mat4_comp_access_store_cols +dEQP-VK.ssbo.layout.3_level_unsized_array.std430.column_major_mat4_store_cols +dEQP-VK.ssbo.layout.3_level_unsized_array.std430.mat4 +dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4 +dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4_comp_access +dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4_store_cols +dEQP-VK.ssbo.layout.random.16bit.all_per_block_buffers.47 +dEQP-VK.ssbo.layout.random.16bit.all_per_block_buffers.5 +dEQP-VK.ssbo.layout.random.8bit.all_per_block_buffers.5 +dEQP-VK.ssbo.layout.random.8bit.all_per_block_buffers.6 +dEQP-VK.ssbo.layout.random.8bit.nested_structs_arrays_instance_arrays.15 +dEQP-VK.ssbo.layout.random.8bit.nested_structs_arrays_instance_arrays.9 
+dEQP-VK.ssbo.layout.random.all_shared_buffer.3 +dEQP-VK.ssbo.layout.random.arrays_of_arrays.13 +dEQP-VK.ssbo.layout.random.nested_structs_arrays.17 +dEQP-VK.ssbo.phys.layout.2_level_array.std140.row_major_mat4 +dEQP-VK.ssbo.phys.layout.2_level_array.std430.row_major_mat4 +dEQP-VK.ssbo.phys.layout.3_level_array.std140.column_major_mat4 +dEQP-VK.ssbo.phys.layout.3_level_array.std140.column_major_mat4_comp_access +dEQP-VK.ssbo.phys.layout.3_level_array.std140.column_major_mat4_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.3_level_array.std140.column_major_mat4_store_cols +dEQP-VK.ssbo.phys.layout.3_level_array.std140.column_major_mat4x3_comp_access +dEQP-VK.ssbo.phys.layout.3_level_array.std140.mat4 +dEQP-VK.ssbo.phys.layout.3_level_array.std140.mat4x3 +dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat2x4 +dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat3 +dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat3x4 +dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat4 +dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat4x2 +dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat4x3 +dEQP-VK.ssbo.phys.layout.3_level_array.std430.column_major_mat4 +dEQP-VK.ssbo.phys.layout.3_level_array.std430.column_major_mat4_comp_access +dEQP-VK.ssbo.phys.layout.3_level_array.std430.column_major_mat4_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.3_level_array.std430.column_major_mat4_store_cols +dEQP-VK.ssbo.phys.layout.3_level_array.std430.column_major_mat4x3 +dEQP-VK.ssbo.phys.layout.3_level_array.std430.column_major_mat4x3_comp_access +dEQP-VK.ssbo.phys.layout.3_level_array.std430.column_major_mat4x3_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.3_level_array.std430.mat4 +dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat2x4 +dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat3 +dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat3x4 +dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat4 +dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat4x2 +dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat4x3 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.column_major_mat4 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.column_major_mat4_comp_access +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.column_major_mat4_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.column_major_mat4_store_cols +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.column_major_mat4x3 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.mat4 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat2x4 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat3 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat3x4 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat4 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat4x2 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat4x3 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.column_major_mat4 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.column_major_mat4_comp_access +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.column_major_mat4_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.column_major_mat4_store_cols +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.column_major_mat4x3_comp_access +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.column_major_mat4x3_comp_access_store_cols 
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.column_major_mat4x3_store_cols +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.mat4 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.mat4x3 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat2x4 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat2x4_comp_access +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat2x4_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat2x4_store_cols +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat3 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat3x4 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat4 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat4x2 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat4x3 +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std140.column_major_mat4 +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std140.column_major_mat4_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std140.mat4 +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std140.row_major_mat3 +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std140.row_major_mat3_store_cols +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std140.row_major_mat3x4 +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std140.row_major_mat4 +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std140.row_major_mat4x3 +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.column_major_mat4 +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.column_major_mat4_comp_access +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.column_major_mat4_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.column_major_mat4_store_cols +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.mat4 +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat3 +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat3_comp_access +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat3_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat3_store_cols +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat3x4 +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat4 +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat4x3 +dEQP-VK.ssbo.phys.layout.random.16bit.all_per_block_buffers.45 +dEQP-VK.ssbo.phys.layout.random.16bit.all_shared_buffer.23 +dEQP-VK.ssbo.phys.layout.random.16bit.all_shared_buffer.36 +dEQP-VK.ssbo.phys.layout.random.16bit.all_shared_buffer.40 +dEQP-VK.ssbo.phys.layout.random.16bit.nested_structs_arrays.23 +dEQP-VK.ssbo.phys.layout.random.8bit.all_per_block_buffers.17 +dEQP-VK.ssbo.phys.layout.random.8bit.all_per_block_buffers.38 +dEQP-VK.ssbo.phys.layout.random.8bit.all_per_block_buffers.46 +dEQP-VK.ssbo.phys.layout.random.8bit.all_per_block_buffers.49 +dEQP-VK.ssbo.phys.layout.random.8bit.all_shared_buffer.19 +dEQP-VK.ssbo.phys.layout.random.8bit.nested_structs_arrays.17 +dEQP-VK.ssbo.phys.layout.random.8bit.nested_structs_arrays.20 +dEQP-VK.ssbo.phys.layout.random.8bit.nested_structs_arrays_instance_arrays.12 +dEQP-VK.ssbo.phys.layout.random.8bit.unsized_arrays.0 +dEQP-VK.ssbo.phys.layout.random.all_per_block_buffers.14 +dEQP-VK.ssbo.phys.layout.random.all_per_block_buffers.18 +dEQP-VK.ssbo.phys.layout.random.all_per_block_buffers.22 +dEQP-VK.ssbo.phys.layout.random.all_per_block_buffers.46 +dEQP-VK.ssbo.phys.layout.random.all_shared_buffer.20 
+dEQP-VK.ssbo.phys.layout.random.all_shared_buffer.3 +dEQP-VK.ssbo.phys.layout.random.all_shared_buffer.8 +dEQP-VK.ssbo.phys.layout.random.nested_structs_arrays.13 +dEQP-VK.ssbo.phys.layout.random.nested_structs_arrays_instance_arrays.23 +dEQP-VK.ssbo.phys.layout.random.nested_structs_arrays_instance_arrays.3 +dEQP-VK.ssbo.phys.layout.single_struct_array.per_block_buffer.std140_instance_array +dEQP-VK.ssbo.phys.layout.single_struct_array.per_block_buffer.std430_instance_array +dEQP-VK.ssbo.phys.layout.single_struct_array.per_block_buffer.std430_instance_array_comp_access +dEQP-VK.ssbo.phys.layout.single_struct_array.per_block_buffer.std430_instance_array_store_cols +dEQP-VK.ssbo.phys.layout.single_struct_array.single_buffer.std140_instance_array +dEQP-VK.ssbo.phys.layout.single_struct_array.single_buffer.std140_instance_array_comp_access +dEQP-VK.ssbo.phys.layout.single_struct_array.single_buffer.std140_instance_array_store_cols +dEQP-VK.ssbo.phys.layout.single_struct_array.single_buffer.std430_instance_array +dEQP-VK.ssbo.phys.layout.single_struct_array.single_buffer.std430_instance_array_comp_access +dEQP-VK.ssbo.phys.layout.single_struct_array.single_buffer.std430_instance_array_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.single_struct_array.single_buffer.std430_instance_array_store_cols +dEQP-VK.ssbo.phys.layout.unsized_struct_array.per_block_buffer.std140_instance_array +dEQP-VK.ssbo.phys.layout.unsized_struct_array.per_block_buffer.std140_instance_array_comp_access +dEQP-VK.ssbo.phys.layout.unsized_struct_array.per_block_buffer.std140_instance_array_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.unsized_struct_array.per_block_buffer.std140_instance_array_store_cols +dEQP-VK.ssbo.phys.layout.unsized_struct_array.per_block_buffer.std430_instance_array +dEQP-VK.ssbo.phys.layout.unsized_struct_array.per_block_buffer.std430_instance_array_comp_access +dEQP-VK.ssbo.phys.layout.unsized_struct_array.per_block_buffer.std430_instance_array_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.unsized_struct_array.per_block_buffer.std430_instance_array_store_cols +dEQP-VK.ssbo.phys.layout.unsized_struct_array.single_buffer.std140_instance_array +dEQP-VK.ssbo.phys.layout.unsized_struct_array.single_buffer.std140_instance_array_comp_access +dEQP-VK.ssbo.phys.layout.unsized_struct_array.single_buffer.std140_instance_array_store_cols +dEQP-VK.ssbo.phys.layout.unsized_struct_array.single_buffer.std430_instance_array +dEQP-VK.ssbo.phys.layout.unsized_struct_array.single_buffer.std430_instance_array_comp_access +dEQP-VK.ssbo.phys.layout.unsized_struct_array.single_buffer.std430_instance_array_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.unsized_struct_array.single_buffer.std430_instance_array_store_cols +dEQP-VK.synchronization.basic.timeline_semaphore.chain +dEQP-VK.synchronization2.basic.timeline_semaphore.chain +dEQP-VK.texture.explicit_lod.2d.sizes.128x128_linear_linear_mipmap_linear_clamp +dEQP-VK.texture.explicit_lod.2d.sizes.128x128_linear_linear_mipmap_linear_repeat +dEQP-VK.texture.explicit_lod.2d.sizes.128x128_nearest_linear_mipmap_linear_clamp +dEQP-VK.texture.explicit_lod.2d.sizes.128x128_nearest_linear_mipmap_linear_repeat + +# WSI tests are too flaky to be useful +dEQP-VK.image.swapchain_mutable.* +dEQP-VK.wsi.* + +# These require VK_KHR_shader_draw_parameters but they don't check for it +# (Seems to be fixed in some later release of CTS 1.3.7). 
+dEQP-VK.draw.dynamic_rendering.primary_cmd_buff.multi_draw.* + +# Skip tests for unsupported features so we can increase the number of tests +# that are actually useful in the limited CI time we have per job. +dEQP-VK.pipeline.monolithic.multisample_with_fragment_shading_rate.* +dEQP-VK.pipeline.monolithic.bind_point.graphics_raytracing.* +dEQP-VK.pipeline.monolithic.bind_point.compute_raytracing.* +dEQP-VK.pipeline.pipeline_library.* +dEQP-VK.pipeline.fast_linked_library.* +dEQP-VK.pipeline.shader_object* +dEQP-VK.protected_memory.* +dEQP-VK.transform_feedback.* +dEQP-VK.ray_tracing_pipeline.* +dEQP-VK.ray_query.* +dEQP-VK.fragment_shading_rate.* +dEQP-VK.mesh_shader.* +dEQP-VK.shader_object.rendering.* diff --git a/src/broadcom/ci/broadcom-rpi5-fails.txt b/src/broadcom/ci/broadcom-rpi5-fails.txt new file mode 100644 index 00000000000..3241bf827dc --- /dev/null +++ b/src/broadcom/ci/broadcom-rpi5-fails.txt @@ -0,0 +1,11 @@ +# New CTS failures in 1.3.8.0 +dEQP-VK.query_pool.performance_query.query_compute,Fail +dEQP-VK.query_pool.performance_query.query_compute_copy,Fail +dEQP-VK.query_pool.performance_query.query_graphic,Fail +dEQP-VK.query_pool.performance_query.query_graphic_copy,Fail +# This seems to be a Vulkan Loader issue. Can be fixed by compiling the loader from the Github repo. +dEQP-VK.api.get_device_proc_addr.non_enabled,Fail +# This is a bug in CTS: https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/5096 +dEQP-VK.api.command_buffers.many_indirect_draws_on_secondary,Fail + +dEQP-VK.api.info.vulkan1p2_limits_validation.khr_vertex_attribute_divisor,Fail diff --git a/src/broadcom/ci/broadcom-rpi5-flakes.txt b/src/broadcom/ci/broadcom-rpi5-flakes.txt new file mode 100644 index 00000000000..35a53c59666 --- /dev/null +++ b/src/broadcom/ci/broadcom-rpi5-flakes.txt @@ -0,0 +1,15 @@ +dEQP-VK.draw.dynamic_rendering.primary_cmd_buff.multiple_interpolation.structured.no_sample_decoration.4_samples +dEQP-VK.draw.dynamic_rendering.primary_cmd_buff.multiple_interpolation.structured.no_sample_decoration.4_samples +dEQP-VK.memory.pipeline_barrier.transfer_dst_uniform_texel_buffer.1024 +dEQP-VK.memory.pipeline_barrier.transfer_dst_uniform_texel_buffer.1024 +dEQP-VK.memory.pipeline_barrier.transfer_dst_uniform_texel_buffer.1048576 +dEQP-VK.memory.pipeline_barrier.transfer_dst_uniform_texel_buffer.8192 +dEQP-VK.memory_model.message_passing.ext.u32.coherent.atomic_fence.atomicwrite.queuefamily.payload_local.image.guard_local.image.frag +dEQP-VK.memory_model.message_passing.ext.u32.coherent.atomic_fence.atomicwrite.subgroup.payload_local.image.guard_local.buffer.frag +dEQP-VK.memory_model.message_passing.ext.u32.coherent.atomic_fence.atomicwrite.subgroup.payload_local.image.guard_local.buffer.frag +dEQP-VK.memory_model.message_passing.ext.u32.noncoherent.fence_fence.atomicwrite.device.payload_local.image.guard_local.buffer.frag +dEQP-VK.memory_model.message_passing.ext.u32.noncoherent.fence_fence.atomicwrite.queuefamily.payload_local.image.guard_local.image.frag +dEQP-VK.pipeline.monolithic.image.suballocation.sampling_type.combined.view_type.1d_array.format.r8_unorm.count_1.size.443x1_array_of_6 +dEQP-VK.spirv_assembly.type.scalar.i8.shift_left_logical_shift16_tesse +dEQP-VK.synchronization2.cross_instance.suballocated.write_blit_image_read_image_tess_eval.image_128x128_r32g32b32a32_sfloat_binary_semaphore_fence_fd +dEQP-VK.texture.shadow.cube.linear.less_d24_unorm_s8_uint diff --git a/src/broadcom/ci/broadcom-rpi5-skips.txt b/src/broadcom/ci/broadcom-rpi5-skips.txt new file mode 100644 index 
00000000000..17110a448da --- /dev/null +++ b/src/broadcom/ci/broadcom-rpi5-skips.txt @@ -0,0 +1,96 @@ +# Slow tests (> 1 minute to run) +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.multi.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.single.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.nostore.single.std140.vert_offset_nonzero +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.multi.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertcheckuv2.store.single.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.nostore.multi.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.nostore.single.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.store.multi.std140.vert +dEQP-VK.binding_model.buffer_device_address.set3.depth3.baseubo.convertcheckuv2.store.single.std140.vert +dEQP-VK.spirv_assembly.instruction.graphics.spirv_ids_abuse.lots_ids_geom +dEQP-VK.ssbo.layout.random.8bit.all_per_block_buffers.5 +dEQP-VK.ssbo.layout.random.8bit.all_per_block_buffers.6 +dEQP-VK.ssbo.layout.random.8bit.scalar.78 +dEQP-VK.ssbo.layout.random.nested_structs_arrays.17 +dEQP-VK.ssbo.layout.random.scalar.75 +dEQP-VK.ssbo.phys.layout.3_level_array.scalar.row_major_mat3x4 +dEQP-VK.ssbo.phys.layout.3_level_array.scalar.row_major_mat3x4_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.3_level_array.scalar.row_major_mat4 +dEQP-VK.ssbo.phys.layout.3_level_array.scalar.row_major_mat4_comp_access +dEQP-VK.ssbo.phys.layout.3_level_array.scalar.row_major_mat4x3 +dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat3x4 +dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat3x4_comp_access +dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat4 +dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat4_comp_access +dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat4_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.3_level_array.std140.row_major_mat4x3_comp_access +dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat3x4 +dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat4 +dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat4_comp_access +dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat4_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat4_store_cols +dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat4x3_comp_access +dEQP-VK.ssbo.phys.layout.3_level_array.std430.row_major_mat4x3_store_cols +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.scalar.row_major_mat3x4 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.scalar.row_major_mat3x4_comp_access +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.scalar.row_major_mat3x4_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.scalar.row_major_mat3x4_store_cols +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.scalar.row_major_mat4 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.scalar.row_major_mat4_comp_access +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.scalar.row_major_mat4_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.scalar.row_major_mat4_store_cols +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.scalar.row_major_mat4x3 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.scalar.row_major_mat4x3_store_cols 
+dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat3x4 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat3x4_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat3x4_store_cols +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat4 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat4_comp_access +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat4_store_cols +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std140.row_major_mat4x3_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat3x4 +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat3x4_comp_access +dEQP-VK.ssbo.phys.layout.3_level_unsized_array.std430.row_major_mat4 +dEQP-VK.ssbo.phys.layout.basic_unsized_array.scalar.row_major_mat4 +dEQP-VK.ssbo.phys.layout.basic_unsized_array.scalar.row_major_mat4_comp_access +dEQP-VK.ssbo.phys.layout.basic_unsized_array.scalar.row_major_mat4_store_cols +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std140.row_major_mat4 +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat4 +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat4_comp_access +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat4_comp_access_store_cols +dEQP-VK.ssbo.phys.layout.basic_unsized_array.std430.row_major_mat4_store_cols +dEQP-VK.ssbo.phys.layout.random.16bit.scalar.78 +dEQP-VK.ssbo.phys.layout.random.8bit.all_per_block_buffers.46 +dEQP-VK.ssbo.phys.layout.random.8bit.nested_structs_arrays.17 +dEQP-VK.ssbo.phys.layout.random.8bit.nested_structs_arrays_instance_arrays.12 +dEQP-VK.ssbo.phys.layout.random.8bit.scalar.78 +dEQP-VK.ssbo.phys.layout.random.8bit.scalar.96 +dEQP-VK.ssbo.phys.layout.random.all_per_block_buffers.22 +dEQP-VK.ssbo.phys.layout.random.all_shared_buffer.3 +dEQP-VK.ssbo.phys.layout.random.scalar.3 +dEQP-VK.ssbo.phys.layout.random.scalar.93 + +# WSI tests are too flaky to be useful +dEQP-VK.image.swapchain_mutable.* +dEQP-VK.wsi.* + +# These require VK_KHR_shader_draw_parameters but they don't check for it +# (Seems to be fixed in some later release of CTS 1.3.7). +dEQP-VK.draw.dynamic_rendering.primary_cmd_buff.multi_draw.* + +# Skip tests for unsupported features so we can increase the number of tests +# that are actually useful in the limited CI time we have per job. 
+dEQP-VK.pipeline.monolithic.multisample_with_fragment_shading_rate.* +dEQP-VK.pipeline.monolithic.bind_point.graphics_raytracing.* +dEQP-VK.pipeline.monolithic.bind_point.compute_raytracing.* +dEQP-VK.pipeline.pipeline_library.* +dEQP-VK.pipeline.fast_linked_library.* +dEQP-VK.pipeline.shader_object* +dEQP-VK.protected_memory.* +dEQP-VK.transform_feedback.* +dEQP-VK.ray_tracing_pipeline.* +dEQP-VK.ray_query.* +dEQP-VK.fragment_shading_rate.* +dEQP-VK.mesh_shader.* +dEQP-VK.shader_object.rendering.* diff --git a/src/broadcom/ci/deqp-broadcom-rpi3-piglit-full.toml b/src/broadcom/ci/deqp-broadcom-rpi3-piglit-full.toml new file mode 100644 index 00000000000..a9649cbe516 --- /dev/null +++ b/src/broadcom/ci/deqp-broadcom-rpi3-piglit-full.toml @@ -0,0 +1,6 @@ +[[piglit]] +piglit_folder = "/piglit" +profile = "gpu" +process_isolation = true + [piglit.env] + PIGLIT_PLATFORM = "mixed_glx_egl" diff --git a/src/broadcom/ci/deqp-broadcom-rpi3.toml b/src/broadcom/ci/deqp-broadcom-rpi3.toml new file mode 100644 index 00000000000..1b7293b7c5c --- /dev/null +++ b/src/broadcom/ci/deqp-broadcom-rpi3.toml @@ -0,0 +1,61 @@ +[[deqp]] +deqp = "/deqp/modules/gles2/deqp-gles2" +caselists = ["/deqp/mustpass/gles2-main.txt"] +tests_per_group = 250 +deqp_args = [ + "--deqp-gl-config-name=rgba8888d24s8ms0", + "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", + "--deqp-surface-width=256", + "--deqp-visibility=hidden", +] +version_check = "GL ES 2.0.*git" +renderer_check = "VC4" + +[[deqp]] +deqp = "/deqp/external/openglcts/modules/glcts" +caselists = ["/deqp/mustpass/gles2-khr-main.txt"] +tests_per_group = 250 +deqp_args = [ + "--deqp-gl-config-name=rgba8888d24s8ms0", + "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", + "--deqp-surface-width=256", + "--deqp-visibility=hidden", +] + +# We are getting frequent GPU hangs with piglit, but still haven't identified +# the cause. So let's disable it for now. 
+# [[piglit]] +# piglit_folder = "/piglit" +# profile = "quick_gl" +# process_isolation = true +# [piglit.env] +# PIGLIT_PLATFORM = "mixed_glx_egl" + +[[piglit]] +piglit_folder = "/piglit" +profile = "quick_shader" +process_isolation = true + +# wayland +[[deqp]] +deqp = "/deqp/modules/egl/deqp-egl-wayland" +caselists = ["/deqp/mustpass/egl-main.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] +prefix = "wayland-" + +# x11 +[[deqp]] +deqp = "/deqp/modules/egl/deqp-egl-x11" +caselists = ["/deqp/mustpass/egl-main.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] +prefix = "x11-" diff --git a/src/broadcom/ci/deqp-broadcom-rpi4.toml b/src/broadcom/ci/deqp-broadcom-rpi4.toml new file mode 100644 index 00000000000..930077f31f2 --- /dev/null +++ b/src/broadcom/ci/deqp-broadcom-rpi4.toml @@ -0,0 +1,89 @@ +[[deqp]] +deqp = "/deqp/modules/gles31/deqp-gles31" +caselists = ["/deqp/mustpass/gles31-main.txt"] +deqp_args = [ + "--deqp-gl-config-name=rgba8888d24s8ms0", + "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", + "--deqp-surface-width=256", + "--deqp-visibility=hidden", +] +version_check = "GL ES 3.1.*git" +renderer_check = "V3D" + +[[deqp]] +deqp = "/deqp/modules/gles3/deqp-gles3" +caselists = ["/deqp/mustpass/gles3-main.txt"] +deqp_args = [ + "--deqp-gl-config-name=rgba8888d24s8ms0", + "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", + "--deqp-surface-width=256", + "--deqp-visibility=hidden", +] + +[[deqp]] +deqp = "/deqp/modules/gles2/deqp-gles2" +caselists = ["/deqp/mustpass/gles2-main.txt"] +deqp_args = [ + "--deqp-gl-config-name=rgba8888d24s8ms0", + "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", + "--deqp-surface-width=256", + "--deqp-visibility=hidden", +] + +[[deqp]] +deqp = "/deqp/external/openglcts/modules/glcts" +caselists = [ + "/deqp/mustpass/gles31-khr-main.txt", + "/deqp/mustpass/gles3-khr-main.txt", + "/deqp/mustpass/gles2-khr-main.txt", +] +deqp_args = [ + "--deqp-gl-config-name=rgba8888d24s8ms0", + "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", + "--deqp-surface-width=256", + "--deqp-visibility=hidden", +] + +[[deqp]] +deqp = "/deqp/external/openglcts/modules/glcts" +caselists = ["/deqp/mustpass/gl31-main.txt"] +deqp_args = [ + "--deqp-gl-config-name=rgba8888d24s8ms0", + "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", + "--deqp-surface-width=256", + "--deqp-visibility=hidden", +] + +[[piglit]] +piglit_folder = "/piglit" +profile = "gpu" +process_isolation = true + [piglit.env] + PIGLIT_PLATFORM = "mixed_glx_egl" + +# wayland +[[deqp]] +deqp = "/deqp/modules/egl/deqp-egl-wayland" +caselists = ["/deqp/mustpass/egl-main.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] +prefix = "wayland-" + +# x11 +[[deqp]] +deqp = "/deqp/modules/egl/deqp-egl-x11" +caselists = ["/deqp/mustpass/egl-main.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] +prefix = "x11-" diff --git a/src/broadcom/ci/deqp-v3d-rpi4-fails.txt b/src/broadcom/ci/deqp-v3d-rpi4-fails.txt deleted file mode 100644 
index 10ab688613d..00000000000 --- a/src/broadcom/ci/deqp-v3d-rpi4-fails.txt +++ /dev/null @@ -1,4 +0,0 @@ -dEQP-GLES31.functional.geometry_shading.query.primitives_generated_amplification,Fail -dEQP-GLES31.functional.geometry_shading.query.primitives_generated_instanced,Fail -dEQP-GLES31.functional.geometry_shading.query.primitives_generated_no_amplification,Fail -dEQP-GLES31.functional.geometry_shading.query.primitives_generated_partial_primitives,Fail diff --git a/src/broadcom/ci/deqp-v3d-rpi4-flakes.txt b/src/broadcom/ci/deqp-v3d-rpi4-flakes.txt deleted file mode 100644 index 673cc5b0941..00000000000 --- a/src/broadcom/ci/deqp-v3d-rpi4-flakes.txt +++ /dev/null @@ -1,3 +0,0 @@ -dEQP-GLES31.functional.compute.shared_var.basic_type.ivec3_highp -dEQP-GLES31.functional.ssbo.layout.single_basic_type.packed.highp_mat2 -KHR-GLES31.core.shader_image_load_store.basic-glsl-earlyFragTests diff --git a/src/broadcom/ci/deqp-v3d-rpi4-gles.toml b/src/broadcom/ci/deqp-v3d-rpi4-gles.toml deleted file mode 100644 index 32a569344d2..00000000000 --- a/src/broadcom/ci/deqp-v3d-rpi4-gles.toml +++ /dev/null @@ -1,47 +0,0 @@ -[[deqp]] -deqp = "/deqp/modules/gles31/deqp-gles31" -caselists = [ "/deqp/mustpass/gles31-master.txt" ] -deqp_args = [ - "--deqp-gl-config-name=rgba8888d24s8ms0", - "--deqp-surface-height=256", - "--deqp-surface-type=pbuffer", - "--deqp-surface-width=256", - "--deqp-visibility=hidden", -] - -[[deqp]] -deqp = "/deqp/modules/gles3/deqp-gles3" -caselists = [ "/deqp/mustpass/gles3-master.txt" ] -deqp_args = [ - "--deqp-gl-config-name=rgba8888d24s8ms0", - "--deqp-surface-height=256", - "--deqp-surface-type=pbuffer", - "--deqp-surface-width=256", - "--deqp-visibility=hidden", -] - -[[deqp]] -deqp = "/deqp/modules/gles2/deqp-gles2" -caselists = [ "/deqp/mustpass/gles2-master.txt" ] -deqp_args = [ - "--deqp-gl-config-name=rgba8888d24s8ms0", - "--deqp-surface-height=256", - "--deqp-surface-type=pbuffer", - "--deqp-surface-width=256", - "--deqp-visibility=hidden", -] - -[[deqp]] -deqp = "/deqp/external/openglcts/modules/glcts" -caselists = [ - "/deqp/mustpass/gles31-khr-master.txt", - "/deqp/mustpass/gles3-khr-master.txt", - "/deqp/mustpass/gles2-khr-master.txt", -] -deqp_args = [ - "--deqp-gl-config-name=rgba8888d24s8ms0", - "--deqp-surface-height=256", - "--deqp-surface-type=pbuffer", - "--deqp-surface-width=256", - "--deqp-visibility=hidden", -] diff --git a/src/broadcom/ci/deqp-v3dv-rpi4-fails.txt b/src/broadcom/ci/deqp-v3dv-rpi4-fails.txt deleted file mode 100644 index 7898bc2a2d1..00000000000 --- a/src/broadcom/ci/deqp-v3dv-rpi4-fails.txt +++ /dev/null @@ -1,32 +0,0 @@ -# This seems to fail due to the test error threshold being insufficient -dEQP-VK.geometry.input.basic_primitive.line_strip_adjacency,Fail - -# CTS bug; fix submitted -dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_single_buffer_geom,Fail - -# Multiview doesn't work with points -dEQP-VK.multiview.point_size.15,Fail -dEQP-VK.multiview.point_size.8,Fail -dEQP-VK.multiview.point_size.1_2_4_8,Fail -dEQP-VK.multiview.point_size.15_15_15_15,Fail -dEQP-VK.multiview.point_size.8_1_1_8,Fail -dEQP-VK.multiview.point_size.5_10_5_10,Fail -dEQP-VK.multiview.point_size.1_2_4_8_16_32,Fail -dEQP-VK.multiview.point_size.max_multi_view_view_count,Fail - -dEQP-VK.draw.instanced.draw_vk_primitive_topology_point_list_attrib_divisor_1_multiview,Fail -dEQP-VK.draw.instanced.draw_vk_primitive_topology_point_list_attrib_divisor_2_multiview,Fail 
-dEQP-VK.draw.instanced.draw_vk_primitive_topology_point_list_attrib_divisor_4_multiview,Fail -dEQP-VK.draw.instanced.draw_vk_primitive_topology_point_list_attrib_divisor_20_multiview,Fail -dEQP-VK.draw.instanced.draw_indexed_vk_primitive_topology_point_list_attrib_divisor_1_multiview,Fail -dEQP-VK.draw.instanced.draw_indexed_vk_primitive_topology_point_list_attrib_divisor_2_multiview,Fail -dEQP-VK.draw.instanced.draw_indexed_vk_primitive_topology_point_list_attrib_divisor_4_multiview,Fail -dEQP-VK.draw.instanced.draw_indexed_vk_primitive_topology_point_list_attrib_divisor_20_multiview,Fail -dEQP-VK.draw.instanced.draw_indirect_vk_primitive_topology_point_list_attrib_divisor_1_multiview,Fail -dEQP-VK.draw.instanced.draw_indirect_vk_primitive_topology_point_list_attrib_divisor_2_multiview,Fail -dEQP-VK.draw.instanced.draw_indirect_vk_primitive_topology_point_list_attrib_divisor_4_multiview,Fail -dEQP-VK.draw.instanced.draw_indirect_vk_primitive_topology_point_list_attrib_divisor_20_multiview,Fail -dEQP-VK.draw.instanced.draw_indexed_indirect_vk_primitive_topology_point_list_attrib_divisor_1_multiview,Fail -dEQP-VK.draw.instanced.draw_indexed_indirect_vk_primitive_topology_point_list_attrib_divisor_2_multiview,Fail -dEQP-VK.draw.instanced.draw_indexed_indirect_vk_primitive_topology_point_list_attrib_divisor_4_multiview,Fail -dEQP-VK.draw.instanced.draw_indexed_indirect_vk_primitive_topology_point_list_attrib_divisor_20_multiview,Fail diff --git a/src/broadcom/ci/deqp-v3dv-rpi4-flakes.txt b/src/broadcom/ci/deqp-v3dv-rpi4-flakes.txt deleted file mode 100644 index 0d22f002dbd..00000000000 --- a/src/broadcom/ci/deqp-v3dv-rpi4-flakes.txt +++ /dev/null @@ -1,5 +0,0 @@ -dEQP-VK.api.external.fence.opaque_fd.reset_permanent -dEQP-VK.api.external.fence.opaque_fd.reset_temporary -dEQP-VK.api.external.fence.opaque_fd.signal_export_import_wait_permanent -dEQP-VK.ssbo.layout.instance_array_basic_type.std430.uvec4 -dEQP-VK.wsi.display.get_display_plane_capabilities diff --git a/src/broadcom/ci/deqp-v3dv-rpi4-skips.txt b/src/broadcom/ci/deqp-v3dv-rpi4-skips.txt deleted file mode 100644 index bf6a82c19bf..00000000000 --- a/src/broadcom/ci/deqp-v3dv-rpi4-skips.txt +++ /dev/null @@ -1,21 +0,0 @@ -# Broadcom waivers -dEQP-VK.draw.inverted_depth_ranges.nodepthclamp_deltazero -dEQP-VK.rasterization.depth_bias.d32_sfloat - -# Timeout tests (> 1 minute to run) -dEQP-VK.api.object_management.max_concurrent.query_pool -dEQP-VK.graphicsfuzz.spv-stable-maze-flatten-copy-composite -dEQP-VK.memory.mapping.dedicated_alloc.buffer.full.variable.implicit_unmap -dEQP-VK.memory.mapping.dedicated_alloc.image.full.variable.implicit_unmap -dEQP-VK.memory.mapping.suballocation.full.variable.implicit_unmap -dEQP-VK.spirv_assembly.instruction.graphics.spirv_ids_abuse.lots_ids_geom -dEQP-VK.spirv_assembly.instruction.graphics.spirv_ids_abuse.lots_ids_vert -dEQP-VK.ssbo.layout.random.all_shared_buffer.5 -dEQP-VK.ssbo.layout.random.arrays_of_arrays.13 -dEQP-VK.ssbo.layout.random.nested_structs_arrays.0 -dEQP-VK.texture.explicit_lod.2d.sizes.128x128_linear_linear_mipmap_linear_clamp -dEQP-VK.texture.explicit_lod.2d.sizes.128x128_linear_linear_mipmap_linear_repeat -dEQP-VK.texture.explicit_lod.2d.sizes.128x128_nearest_linear_mipmap_linear_clamp -dEQP-VK.texture.explicit_lod.2d.sizes.128x128_nearest_linear_mipmap_linear_repeat -dEQP-VK.ubo.random.all_out_of_order_offsets.45 -dEQP-VK.ubo.random.all_shared_buffer.48 diff --git a/src/broadcom/ci/deqp-vc4-rpi3-fails.txt b/src/broadcom/ci/deqp-vc4-rpi3-fails.txt deleted file mode 100644 index 
d0722563e60..00000000000 --- a/src/broadcom/ci/deqp-vc4-rpi3-fails.txt +++ /dev/null @@ -1,420 +0,0 @@ -KHR-GLES2.core.internalformat.copy_tex_image.alpha8_oes,Fail -KHR-GLES2.core.internalformat.copy_tex_image.luminance4_alpha4_oes,Fail -KHR-GLES2.core.internalformat.copy_tex_image.luminance8_alpha8_oes,Fail -KHR-GLES2.core.internalformat.copy_tex_image.luminance8_oes,Fail -KHR-GLES2.core.internalformat.copy_tex_image.rgb565,Fail -KHR-GLES2.core.internalformat.copy_tex_image.rgb5_a1,Fail -KHR-GLES2.core.internalformat.copy_tex_image.rgba4,Fail -KHR-GLES2.core.internalformat.texture2d.depth_component_unsigned_int_depth_component16,Fail -KHR-GLES2.core.internalformat.texture2d.depth_component_unsigned_int_depth_component24,Fail -KHR-GLES2.core.internalformat.texture2d.depth_component_unsigned_short_depth_component16,Fail -KHR-GLES2.texture_3d.copy_sub_image.negative,Fail -KHR-GLES2.texture_3d.copy_sub_image.rgba,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_clamp_repeat,Fail 
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_clamp_repeat,Fail 
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_clamp_mirror,Fail 
-KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_clamp_clamp,Fail 
-KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_mirror_repeat,Fail 
-KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_repeat_clamp,Fail 
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_mirror_repeat,Fail 
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_mirror_mirror,Fail 
-KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_repeat_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_clamp_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_clamp_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_clamp_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_mirror_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_mirror_mirror,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_mirror_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_repeat_clamp,Fail -KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_repeat_mirror,Fail 
-KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_repeat_repeat,Fail -KHR-GLES2.texture_3d.filtering.combinations.negative,Fail -KHR-GLES2.texture_3d.filtering.formats.rgba8_linear,Fail -KHR-GLES2.texture_3d.filtering.formats.rgba8_linear_mipmap_linear,Fail -KHR-GLES2.texture_3d.filtering.formats.rgba8_linear_mipmap_nearest,Fail -KHR-GLES2.texture_3d.filtering.formats.rgba8_nearest,Fail -KHR-GLES2.texture_3d.filtering.formats.rgba8_nearest_mipmap_linear,Fail -KHR-GLES2.texture_3d.filtering.formats.rgba8_nearest_mipmap_nearest,Fail -KHR-GLES2.texture_3d.filtering.sizes.128x32x64_linear,Fail -KHR-GLES2.texture_3d.filtering.sizes.128x32x64_linear_mipmap_linear,Fail -KHR-GLES2.texture_3d.filtering.sizes.128x32x64_linear_mipmap_nearest,Fail -KHR-GLES2.texture_3d.filtering.sizes.128x32x64_nearest,Fail -KHR-GLES2.texture_3d.filtering.sizes.128x32x64_nearest_mipmap_linear,Fail -KHR-GLES2.texture_3d.filtering.sizes.128x32x64_nearest_mipmap_nearest,Fail -KHR-GLES2.texture_3d.filtering.sizes.32x64x16_linear,Fail -KHR-GLES2.texture_3d.filtering.sizes.32x64x16_linear_mipmap_linear,Fail -KHR-GLES2.texture_3d.filtering.sizes.32x64x16_linear_mipmap_nearest,Fail -KHR-GLES2.texture_3d.filtering.sizes.32x64x16_nearest,Fail -KHR-GLES2.texture_3d.filtering.sizes.32x64x16_nearest_mipmap_linear,Fail -KHR-GLES2.texture_3d.filtering.sizes.32x64x16_nearest_mipmap_nearest,Fail -KHR-GLES2.texture_3d.filtering.sizes.3x7x5_linear,Fail -KHR-GLES2.texture_3d.filtering.sizes.3x7x5_linear_mipmap_nearest,Fail -KHR-GLES2.texture_3d.filtering.sizes.3x7x5_nearest,Fail -KHR-GLES2.texture_3d.filtering.sizes.3x7x5_nearest_mipmap_linear,Fail -KHR-GLES2.texture_3d.filtering.sizes.3x7x5_nearest_mipmap_nearest,Fail -KHR-GLES2.texture_3d.filtering.sizes.4x8x8_linear,Fail -KHR-GLES2.texture_3d.filtering.sizes.4x8x8_linear_mipmap_nearest,Fail -KHR-GLES2.texture_3d.filtering.sizes.4x8x8_nearest,Fail -KHR-GLES2.texture_3d.filtering.sizes.4x8x8_nearest_mipmap_linear,Fail -KHR-GLES2.texture_3d.filtering.sizes.4x8x8_nearest_mipmap_nearest,Fail -KHR-GLES2.texture_3d.filtering.sizes.63x63x63_linear,Fail -KHR-GLES2.texture_3d.filtering.sizes.63x63x63_linear_mipmap_linear,Fail -KHR-GLES2.texture_3d.filtering.sizes.63x63x63_linear_mipmap_nearest,Fail -KHR-GLES2.texture_3d.filtering.sizes.63x63x63_nearest,Fail -KHR-GLES2.texture_3d.filtering.sizes.63x63x63_nearest_mipmap_linear,Fail -KHR-GLES2.texture_3d.filtering.sizes.63x63x63_nearest_mipmap_nearest,Fail -KHR-GLES2.texture_3d.framebuffer_texture.rgba,Fail -KHR-GLES2.texture_3d.sub_image.rgba8,Fail -dEQP-EGL.functional.color_clears.multi_context.gles2.rgb888_pbuffer,Crash -dEQP-EGL.functional.color_clears.multi_context.gles2.rgb888_window,Crash -dEQP-EGL.functional.color_clears.multi_context.gles2.rgba8888_pbuffer,Crash -dEQP-EGL.functional.color_clears.multi_context.gles2.rgba8888_window,Crash -dEQP-EGL.functional.color_clears.multi_thread.gles2.rgb888_pbuffer,Crash -dEQP-EGL.functional.color_clears.multi_thread.gles2.rgb888_window,Crash -dEQP-EGL.functional.color_clears.multi_thread.gles2.rgba8888_pbuffer,Crash -dEQP-EGL.functional.color_clears.multi_thread.gles2.rgba8888_window,Crash -dEQP-EGL.functional.color_clears.single_context.gles2.rgb888_pbuffer,Crash -dEQP-EGL.functional.color_clears.single_context.gles2.rgb888_window,Crash -dEQP-EGL.functional.color_clears.single_context.gles2.rgba8888_pbuffer,Crash -dEQP-EGL.functional.color_clears.single_context.gles2.rgba8888_window,Crash -dEQP-EGL.functional.create_context.no_config,Fail 
-dEQP-EGL.functional.render.multi_context.gles2.rgb888_pbuffer,Crash -dEQP-EGL.functional.render.multi_context.gles2.rgb888_window,Crash -dEQP-EGL.functional.render.multi_context.gles2.rgba8888_pbuffer,Crash -dEQP-EGL.functional.render.multi_context.gles2.rgba8888_window,Crash -dEQP-EGL.functional.render.multi_thread.gles2.rgb888_pbuffer,Crash -dEQP-EGL.functional.render.multi_thread.gles2.rgb888_window,Crash -dEQP-EGL.functional.render.multi_thread.gles2.rgba8888_pbuffer,Crash -dEQP-EGL.functional.render.multi_thread.gles2.rgba8888_window,Crash -dEQP-EGL.functional.render.single_context.gles2.rgb888_pbuffer,Crash -dEQP-EGL.functional.render.single_context.gles2.rgb888_window,Crash -dEQP-EGL.functional.render.single_context.gles2.rgba8888_pbuffer,Crash -dEQP-EGL.functional.render.single_context.gles2.rgba8888_window,Crash -dEQP-GLES2.functional.clipping.line.wide_line_clip_viewport_center,Fail -dEQP-GLES2.functional.clipping.line.wide_line_clip_viewport_corner,Fail -dEQP-GLES2.functional.depth_stencil_clear.depth_stencil_masked,Fail -dEQP-GLES2.functional.draw.draw_arrays.line_loop.multiple_attributes,Fail -dEQP-GLES2.functional.draw.draw_arrays.line_loop.single_attribute,Fail -dEQP-GLES2.functional.fbo.render.texsubimage.after_render_tex2d_rgba,Fail -dEQP-GLES2.functional.fbo.render.texsubimage.between_render_tex2d_rgba,Fail -dEQP-GLES2.functional.negative_api.shader.uniform_matrixfv_invalid_transpose,Fail -dEQP-GLES2.functional.negative_api.texture.generatemipmap_zero_level_array_compressed,Fail -dEQP-GLES2.functional.negative_api.vertex_array.vertex_attrib,Fail -dEQP-GLES2.functional.negative_api.vertex_array.vertex_attribv,Fail -dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_linear_linear_mirror_rgba8888,Fail -dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_linear_linear_repeat_rgba8888,Fail -dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_nearest_linear_mirror_rgba8888,Fail -dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_nearest_linear_repeat_rgba8888,Fail -dEQP-GLES2.functional.texture.mipmap.2d.basic.linear_linear_repeat_non_square,Fail -dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_clamp_non_square,Fail -dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_mirror_non_square,Fail -dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_repeat_non_square,Fail -dEQP-GLES2.functional.texture.specification.basic_copytexsubimage2d.2d_rgba,Fail -dEQP-GLES2.functional.texture.specification.basic_copytexsubimage2d.cube_rgba,Fail -dEQP-GLES2.functional.texture.wrap.clamp_clamp_nearest_npot_etc1,Fail diff --git a/src/broadcom/ci/deqp-vc4-rpi3-flakes.txt b/src/broadcom/ci/deqp-vc4-rpi3-flakes.txt deleted file mode 100644 index 497be959096..00000000000 --- a/src/broadcom/ci/deqp-vc4-rpi3-flakes.txt +++ /dev/null @@ -1,30 +0,0 @@ -dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_neg_x_and_pos_x_and_neg_x_neg_y_neg_z -dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_neg_x_and_pos_x_and_pos_y_pos_z -dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_neg_x_and_pos_x_neg_y_pos_z_and_neg_x_pos_y_neg_z -dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_pos_x_and_neg_x_neg_y_pos_z_and_neg_x_pos_y_neg_z -dEQP-GLES2.functional.draw.random.51 -dEQP-GLES2.functional.fragment_ops.blend.rgb_func_alpha_func.src.one_minus_src_alpha_constant_color -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec4_direct_write_dynamic_loop_subscript_read_vertex 
-dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.basic_mediump_int_vertex -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.conditional_continue_vertex -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.function_call_inout_vertex -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.function_call_return_vertex -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.nested_sequence_vertex -dEQP-GLES2.functional.shaders.loops.while_constant_iterations.select_iteration_count_vertex -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.function_call_return_vertex -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.infinite_with_conditional_break_vertex -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.post_increment_vertex -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.single_iteration_vertex -dEQP-GLES2.functional.shaders.operator.unary_operator.pre_decrement_result.mediump_vec3_fragment -dEQP-GLES2.functional.shaders.random.exponential.fragment.51 -dEQP-GLES2.functional.shaders.random.texture.fragment.129 -dEQP-GLES2.functional.shaders.return.output_write_in_func_never_vertex -dEQP-GLES2.functional.texture.filtering.2d.linear_linear_clamp_rgb888_pot -dEQP-GLES2.functional.texture.filtering.cube.linear_mipmap_linear_nearest_mirror_rgba8888 -dEQP-GLES2.functional.texture.filtering.cube.nearest_linear_mirror_rgba8888_pot -dEQP-GLES2.functional.texture.filtering.cube.nearest_mipmap_linear_linear_clamp_rgba8888 -dEQP-GLES2.functional.texture.filtering.cube.nearest_mipmap_linear_nearest_repeat_l8 -dEQP-GLES2.functional.texture.filtering.cube.nearest_mipmap_nearest_linear_clamp_rgba8888 -dEQP-GLES2.functional.texture.filtering.cube.nearest_mipmap_nearest_linear_mirror_rgba8888 -dEQP-GLES2.functional.texture.mipmap.cube.generate.rgb565_fastest -dEQP-GLES2.functional.texture.size.cube.256x256_rgb888 diff --git a/src/broadcom/ci/deqp-vc4-rpi3-gles.toml b/src/broadcom/ci/deqp-vc4-rpi3-gles.toml deleted file mode 100644 index 4ca3ab03231..00000000000 --- a/src/broadcom/ci/deqp-vc4-rpi3-gles.toml +++ /dev/null @@ -1,23 +0,0 @@ -[[deqp]] -deqp = "/deqp/modules/gles2/deqp-gles2" -caselists = [ "/deqp/mustpass/gles2-master.txt" ] -tests_per_group = 250 -deqp_args = [ - "--deqp-gl-config-name=rgba8888d24s8ms0", - "--deqp-surface-height=256", - "--deqp-surface-type=pbuffer", - "--deqp-surface-width=256", - "--deqp-visibility=hidden", -] - -[[deqp]] -deqp = "/deqp/external/openglcts/modules/glcts" -caselists = [ "/deqp/mustpass/gles2-khr-master.txt" ] -tests_per_group = 250 -deqp_args = [ - "--deqp-gl-config-name=rgba8888d24s8ms0", - "--deqp-surface-height=256", - "--deqp-surface-type=pbuffer", - "--deqp-surface-width=256", - "--deqp-visibility=hidden", -] diff --git a/src/broadcom/ci/gitlab-ci-inc.yml b/src/broadcom/ci/gitlab-ci-inc.yml new file mode 100644 index 00000000000..4a106db4af2 --- /dev/null +++ b/src/broadcom/ci/gitlab-ci-inc.yml @@ -0,0 +1,156 @@ +.broadcom-common-rules: + rules: + - changes: &broadcom_file_list + - src/broadcom/meson.build + - src/broadcom/ci/gitlab-ci.yml + - src/broadcom/ci/gitlab-ci-inc.yml + - src/broadcom/ci/deqp-$DEQP_SUITE.toml + - src/broadcom/ci/$GPU_VERSION-fails.txt + - src/broadcom/ci/$GPU_VERSION-flakes.txt + - src/broadcom/ci/$GPU_VERSION-skips.txt + - src/broadcom/ci/$PIGLIT_TRACES_FILE + - src/broadcom/cle/**/* + - src/broadcom/clif/**/* + - src/broadcom/common/**/* + - src/broadcom/compiler/**/* + - src/broadcom/drm-shim/**/* + - src/broadcom/qpu/**/* + - 
src/broadcom/simulator/**/* + when: on_success + +.broadcom-common-manual-rules: + rules: + - changes: *broadcom_file_list + when: manual + +.vc4-rules: + stage: broadcom + rules: + - if: $FORCE_KERNEL_TAG != null + when: never + - !reference [.test, rules] + - !reference [.igalia-farm-rules, rules] + - !reference [.gl-rules, rules] + - !reference [.broadcom-common-rules, rules] + - changes: &vc4_file_list + - src/gallium/drivers/vc4/**/* + - src/gallium/winsys/vc4/**/* + - src/gallium/auxiliary/renderonly/**/* + - src/gallium/winsys/kmsro/**/* + when: on_success + +.vc4-manual-rules: + stage: broadcom + rules: + - !reference [.test, rules] + - !reference [.igalia-farm-manual-rules, rules] + - !reference [.gl-manual-rules, rules] + - !reference [.broadcom-common-manual-rules, rules] + - changes: *vc4_file_list + when: manual + +.v3d-rules: + stage: broadcom + rules: + - if: $FORCE_KERNEL_TAG != null + when: never + - !reference [.test, rules] + - !reference [.igalia-farm-rules, rules] + - !reference [.gl-rules, rules] + - !reference [.broadcom-common-rules, rules] + - changes: &v3d_file_list + - src/gallium/drivers/v3d/**/* + - src/gallium/winsys/v3d/**/* + - src/gallium/auxiliary/renderonly/**/* + - src/gallium/winsys/kmsro/**/* + when: on_success + +.v3d-manual-rules: + stage: broadcom + retry: !reference [.scheduled_pipeline-rules, retry] + rules: + - !reference [.test, rules] + - !reference [.igalia-farm-manual-rules, rules] + - !reference [.gl-manual-rules, rules] + - !reference [.broadcom-common-manual-rules, rules] + - changes: + *v3d_file_list + when: manual + +.v3dv-rules: + stage: broadcom + rules: + - if: $FORCE_KERNEL_TAG != null + when: never + - !reference [.test, rules] + - !reference [.igalia-farm-rules, rules] + - !reference [.vulkan-rules, rules] + - !reference [.broadcom-common-rules, rules] + - changes: &v3dv_file_list + - src/broadcom/vulkan/**/* + when: on_success + +.v3dv-manual-rules: + stage: broadcom + rules: + - !reference [.test, rules] + - !reference [.igalia-farm-manual-rules, rules] + - !reference [.vulkan-manual-rules, rules] + - !reference [.broadcom-common-manual-rules, rules] + - changes: *v3dv_file_list + when: manual + +# 8 devices (2023-12-18) +.igalia-bcm2837-rpi-3-b:arm64: + variables: + DEVICE_TYPE: rpi3 + GPU_VERSION: broadcom-rpi3 + script: + - ./install/bare-metal/poe-powered.sh + tags: + - igalia-rpi3 + +# 21 devices (2023-12-18) +.igalia-bcm2711-rpi-4:arm64: + variables: + DEVICE_TYPE: rpi4 + GPU_VERSION: broadcom-rpi4 + VK_DRIVER: broadcom + script: + - ./install/bare-metal/poe-powered.sh + tags: + - igalia-rpi4 + +# 1 device (2024-01-02) +.igalia-bcm2712-rpi-5:arm64: + variables: + DEVICE_TYPE: rpi5 + GPU_VERSION: broadcom-rpi5 + VK_DRIVER: broadcom + script: + - ./install/bare-metal/poe-powered.sh + tags: + - igalia-rpi5 + +.broadcom-test: + script: + - ./install/bare-metal/poe-powered.sh + variables: + HWCI_TEST_SCRIPT: "/install/deqp-runner.sh" + FLAKES_CHANNEL: "#videocore-ci" + FARM: igalia + timeout: 20m + +.broadcom-test:arm64: + extends: + - .broadcom-test + - .baremetal-test-arm64 + variables: + BM_BOOTFS: /boot/raspberrypi_arm64 + +.broadcom-test:arm32: + extends: + - .broadcom-test + - .baremetal-test-arm32 + variables: + BM_BOOTFS: /boot/raspberrypi_armhf diff --git a/src/broadcom/ci/gitlab-ci.yml b/src/broadcom/ci/gitlab-ci.yml index 165f9959936..32ef88554fc 100644 --- a/src/broadcom/ci/gitlab-ci.yml +++ b/src/broadcom/ci/gitlab-ci.yml @@ -1,141 +1,113 @@ -.vc4-rpi3-test:armhf: +include: + - local: 
'src/broadcom/ci/gitlab-ci-inc.yml' + +vc4-rpi3-gl:arm32: extends: - - .baremetal-test-armhf + - .igalia-bcm2837-rpi-3-b:arm64 + - .broadcom-test:arm32 - .vc4-rules - - .use-debian/arm_test + parallel: 4 variables: - BM_BOOTFS: /boot/raspberrypi_armhf - BM_ROOTFS: /rootfs-armhf - DEQP_EXPECTED_RENDERER: VC4 - GPU_VERSION: vc4-rpi3 - HWCI_KERNEL_MODULES: vc4 - FLAKES_CHANNEL: "#videocore-ci" - script: - - ./install/bare-metal/poe-powered.sh - needs: - - job: debian/arm_test - artifacts: false - - debian-armhf - tags: - - igalia-rpi3 + DEQP_SUITE: broadcom-rpi3 + HWCI_START_WESTON: 1 -vc4-rpi3-gles:armhf: +vc4-rpi3-gl-piglit-full:arm32: extends: - - .vc4-rpi3-test:armhf - parallel: 2 + - vc4-rpi3-gl:arm32 + - .vc4-manual-rules + tags: + - igalia-rpi3 + - igalia-fullrun variables: - HWCI_TEST_SCRIPT: "/install/deqp-runner.sh" - DEQP_SUITE: vc4-rpi3-gles - DEQP_VER: gles2 + DEQP_SUITE: broadcom-rpi3-piglit-full -vc4-rpi3-egl:armhf: - extends: - - .vc4-rpi3-test:armhf - variables: - HWCI_TEST_SCRIPT: "/install/deqp-runner.sh" - HWCI_START_XORG: 1 - DEQP_RUNNER_OPTIONS: "--tests-per-group 250" - DEQP_VER: egl -.vc4-rpi3-piglit:armhf: +v3d-rpi4-gl:arm64: extends: - - .piglit-test - - .vc4-rpi3-test:armhf - - .test-manual + - .igalia-bcm2711-rpi-4:arm64 + - .broadcom-test:arm64 + - .v3d-rules + parallel: 8 variables: - HWCI_TEST_SCRIPT: "/install/piglit/piglit-runner.sh" - BM_POE_TIMEOUT: 180 - HWCI_START_XORG: 1 - PIGLIT_PLATFORM: mixed_glx_egl + HWCI_START_WESTON: 1 + DEQP_SUITE: broadcom-rpi4 + DEQP_FRACTION: 2 -vc4-rpi3-piglit-quick_gl:armhf: +v3d-rpi4-gl-full:arm64: extends: - - .vc4-rpi3-piglit:armhf - parallel: 4 + - v3d-rpi4-gl:arm64 + - .v3d-manual-rules + tags: + - igalia-rpi4 + - igalia-fullrun + parallel: 6 + timeout: 45m variables: - FDO_CI_CONCURRENT: 1 - PIGLIT_PROFILES: quick_gl + TEST_PHASE_TIMEOUT: 40 + DEQP_FRACTION: 1 -vc4-rpi3-piglit-quick_shader:armhf: - extends: - - .vc4-rpi3-piglit:armhf - parallel: 2 - variables: - FDO_CI_CONCURRENT: 2 - PIGLIT_PROFILES: quick_shader -.v3d-rpi4-test:armhf: +v3d-rpi4-traces:arm64: extends: - - .baremetal-test-armhf + - .igalia-bcm2711-rpi-4:arm64 + - .piglit-traces-test + - .broadcom-test:arm64 - .v3d-rules - - .use-debian/arm_test variables: - HWCI_TEST_SCRIPT: "/install/deqp-runner.sh" - BM_BOOTFS: /boot/raspberrypi_armhf - BM_POE_TIMEOUT: 300 - BM_ROOTFS: /rootfs-armhf - DEQP_EXPECTED_RENDERER: V3D - FLAKES_CHANNEL: "#videocore-ci" - GPU_VERSION: v3d-rpi4 - HWCI_KERNEL_MODULES: v3d,vc4 - script: - - ./install/bare-metal/poe-powered.sh - needs: - - debian/arm_test - - debian-armhf - tags: - - igalia-rpi4 + HWCI_TEST_SCRIPT: "/install/piglit/piglit-traces.sh" + PIGLIT_TRACES_FILE: traces-broadcom.yml + PIGLIT_REPLAY_DEVICE_NAME: "broadcom-rpi4" + PIGLIT_RESULTS: "broadcom-rpi4-replay" -v3d-rpi4-gles:armhf: - extends: - - .v3d-rpi4-test:armhf - parallel: 8 - variables: - DEQP_SUITE: v3d-rpi4-gles - DEQP_VER: gles31 -v3d-rpi4-egl:armhf: +v3dv-rpi4-vk:arm64: extends: - - .v3d-rpi4-test:armhf + - .igalia-bcm2711-rpi-4:arm64 + - .broadcom-test:arm64 + - .v3dv-rules + parallel: 10 variables: - HWCI_START_XORG: 1 - DEQP_VER: egl + HWCI_TEST_SCRIPT: "/install/deqp-runner.sh" + HWCI_START_WESTON: 1 + DEQP_EXPECTED_RENDERER: "V3D.4.2" + DEQP_FRACTION: 3 + DEQP_VER: vk + FLAKES_CHANNEL: "#videocore-ci" -v3d-rpi4-piglit:armhf: +v3dv-rpi4-vk-full:arm64: extends: - - .piglit-test - - .v3d-rpi4-test:armhf - parallel: 4 + - v3dv-rpi4-vk:arm64 + - .v3dv-manual-rules + tags: + - igalia-rpi4 + - igalia-fullrun + parallel: 6 + timeout: 2h variables: - 
HWCI_TEST_SCRIPT: "/install/piglit/piglit-runner.sh" - HWCI_START_XORG: 1 - PIGLIT_PLATFORM: mixed_glx_egl - PIGLIT_PROFILES: all + # Keep 10 minutes for boot + setup + uploading the artifacts at the end + TEST_PHASE_TIMEOUT: 110 + DEQP_FRACTION: 1 -v3dv-rpi4-vk:arm64: + +.v3dv-rpi5-vk:arm64: extends: - - .baremetal-test - - .use-debian/arm_test + - .igalia-bcm2712-rpi-5:arm64 + - .broadcom-test:arm64 - .v3dv-rules - parallel: 8 variables: HWCI_TEST_SCRIPT: "/install/deqp-runner.sh" - BM_BOOTFS: /boot/raspberrypi_arm64 - BM_POE_TIMEOUT: 300 - BM_ROOTFS: /rootfs-arm64 - DEQP_EXPECTED_RENDERER: "V3D 4.2" - DEQP_FRACTION: 5 + HWCI_START_WESTON: 1 + DEQP_EXPECTED_RENDERER: "V3D.7.1" + DEQP_FRACTION: 15 DEQP_VER: vk FLAKES_CHANNEL: "#videocore-ci" - GPU_VERSION: v3dv-rpi4 - HWCI_KERNEL_MODULES: v3d,vc4 - MINIO_ARTIFACT_NAME: mesa-arm64 - VK_DRIVER: broadcom - script: - - ./install/bare-metal/poe-powered.sh - needs: - - debian/arm_test - - job: debian-arm64 - artifacts: false - tags: - - igalia-rpi4 + +v3dv-rpi5-vk-full:arm64: + extends: + - .v3dv-rpi5-vk:arm64 + - .v3dv-manual-rules + timeout: 3h + variables: + # Keep 10 minutes for boot + setup + uploading the artifacts at the end + TEST_PHASE_TIMEOUT: 170 + DEQP_FRACTION: 1 diff --git a/src/broadcom/ci/piglit-v3d-rpi4-fails.txt b/src/broadcom/ci/piglit-v3d-rpi4-fails.txt deleted file mode 100644 index 4557a55562f..00000000000 --- a/src/broadcom/ci/piglit-v3d-rpi4-fails.txt +++ /dev/null @@ -1,337 +0,0 @@ -glx@glx-make-current,Crash -glx@glx-multi-window-single-context,Fail -glx@glx-multithread-buffer,Fail -glx@glx-query-drawable-glx_fbconfig_id-window,Fail -glx@glx-swap-pixmap-bad,Fail -glx@glx-visuals-depth -pixmap,Crash -glx@glx-visuals-stencil -pixmap,Crash -glx@glx_arb_create_context_es2_profile@invalid opengl es version,Fail -glx@glx_arb_create_context_no_error@no error,Fail -glx@glx_ext_import_context@free context,Fail -glx@glx_ext_import_context@get context id,Fail -glx@glx_ext_import_context@get current display,Fail -glx@glx_ext_import_context@import context- multi process,Fail -glx@glx_ext_import_context@import context- single process,Fail -glx@glx_ext_import_context@imported context has same context id,Fail -glx@glx_ext_import_context@make current- multi process,Fail -glx@glx_ext_import_context@make current- single process,Fail -glx@glx_ext_import_context@query context info,Fail -shaders@glsl-bug-110796,Fail -spec@!opengl 1.0@gl-1.0-bitmap-heart-dance,Fail -spec@!opengl 1.0@gl-1.0-dlist-bitmap,Fail -spec@!opengl 1.0@gl-1.0-edgeflag,Fail -spec@!opengl 1.0@gl-1.0-edgeflag-const,Fail -spec@!opengl 1.0@gl-1.0-edgeflag-quads,Fail -spec@!opengl 1.0@gl-1.0-no-op-paths,Fail -spec@!opengl 1.0@gl-1.0-spot-light,Fail -spec@!opengl 1.0@gl-1.0-user-clip-all-planes,Fail -spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=2,Fail -spec@!opengl 1.1@depthstencil-default_fb-drawpixels-32f_24_8_rev samples=2,Fail -spec@!opengl 1.1@depthstencil-default_fb-drawpixels-32f_24_8_rev samples=4,Fail -spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=2,Fail -spec@!opengl 1.1@getteximage-depth,Fail -spec@!opengl 1.1@getteximage-depth@GL_TEXTURE_1D_ARRAY-GL_DEPTH_COMPONENT16,Fail -spec@!opengl 1.1@getteximage-depth@GL_TEXTURE_1D_ARRAY-GL_DEPTH_COMPONENT24,Fail -spec@!opengl 1.1@getteximage-depth@GL_TEXTURE_1D_ARRAY-GL_DEPTH_COMPONENT32,Fail -spec@!opengl 1.1@getteximage-depth@GL_TEXTURE_1D_ARRAY-GL_DEPTH_COMPONENT,Fail -spec@!opengl 1.1@getteximage-formats,Fail -spec@!opengl 1.1@linestipple,Fail -spec@!opengl 
1.1@linestipple@Factor 2x,Fail -spec@!opengl 1.1@linestipple@Factor 3x,Fail -spec@!opengl 1.1@linestipple@Line loop,Fail -spec@!opengl 1.1@linestipple@Line strip,Fail -spec@!opengl 1.1@linestipple@Restarting lines within a single Begin-End block,Fail -spec@!opengl 1.1@point-line-no-cull,Fail -spec@!opengl 1.1@polygon-mode,Fail -spec@!opengl 1.1@polygon-mode-offset,Fail -spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on bottom edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on left edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on right edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on top edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 1: Expected blue pixel in center,Fail -spec@!opengl 1.1@polygon-mode-offset@config 1: Expected white pixel on right edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 1: Expected white pixel on top edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 2: Expected blue pixel in center,Fail -spec@!opengl 1.1@polygon-mode-offset@config 2: Expected white pixel on right edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 2: Expected white pixel on top edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on bottom edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on left edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on right edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on top edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on bottom edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on left edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on right edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on top edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 5: Expected blue pixel in center,Fail -spec@!opengl 1.1@polygon-mode-offset@config 5: Expected white pixel on right edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 5: Expected white pixel on top edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 6: Expected blue pixel in center,Fail -spec@!opengl 1.1@polygon-mode-offset@config 6: Expected white pixel on right edge,Fail -spec@!opengl 1.1@polygon-mode-offset@config 6: Expected white pixel on top edge,Fail -spec@!opengl 1.1@texwrap formats bordercolor,Fail -spec@!opengl 1.1@texwrap formats bordercolor@GL_INTENSITY12- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor@GL_INTENSITY16- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE12- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE12_ALPHA12- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE12_ALPHA4- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE16- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE16_ALPHA16- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor@GL_RGB12- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor@GL_RGB16- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor@GL_RGBA12- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor@GL_RGBA16- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor-swizzled,Fail -spec@!opengl 1.1@texwrap formats 
bordercolor-swizzled@GL_INTENSITY12- swizzled- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_INTENSITY16- swizzled- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE12- swizzled- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE12_ALPHA12- swizzled- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE12_ALPHA4- swizzled- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE16- swizzled- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE16_ALPHA16- swizzled- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_RGB12- swizzled- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_RGB16- swizzled- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_RGBA12- swizzled- border color only,Fail -spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_RGBA16- swizzled- border color only,Fail -spec@!opengl 1.1@windowoverlap,Fail -spec@!opengl 1.4@gl-1.4-polygon-offset,Fail -spec@!opengl 2.0@gl-2.0-edgeflag,Fail -spec@!opengl 2.0@gl-2.0-edgeflag-immediate,Fail -spec@!opengl 2.0@max-samplers,Fail -spec@!opengl 2.0@max-samplers border,Fail -spec@!opengl 2.1@pbo,Fail -spec@!opengl 2.1@pbo@test_polygon_stip,Fail -spec@!opengl 2.1@polygon-stipple-fs,Fail -spec@!opengl es 3.0@gles-3.0-transform-feedback-uniform-buffer-object,Fail -spec@arb_color_buffer_float@gl_rgba32f-render,Fail -spec@arb_color_buffer_float@gl_rgba32f-render-fog,Fail -spec@arb_color_buffer_float@gl_rgba32f-render-sanity,Fail -spec@arb_color_buffer_float@gl_rgba32f-render-sanity-fog,Fail -spec@arb_compute_shader@minmax,Fail -spec@arb_copy_buffer@targets,Fail -spec@arb_depth_buffer_float@fbo-generatemipmap-formats,Fail -spec@arb_depth_buffer_float@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT32F,Fail -spec@arb_depth_buffer_float@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT32F NPOT,Fail -spec@arb_depth_buffer_float@texwrap formats bordercolor,Fail -spec@arb_depth_buffer_float@texwrap formats bordercolor@GL_DEPTH32F_STENCIL8- border color only,Fail -spec@arb_depth_buffer_float@texwrap formats bordercolor@GL_DEPTH_COMPONENT32F- border color only,Fail -spec@arb_depth_buffer_float@texwrap formats bordercolor-swizzled,Fail -spec@arb_depth_buffer_float@texwrap formats bordercolor-swizzled@GL_DEPTH32F_STENCIL8- swizzled- border color only,Fail -spec@arb_depth_buffer_float@texwrap formats bordercolor-swizzled@GL_DEPTH_COMPONENT32F- swizzled- border color only,Fail -spec@arb_depth_buffer_float@texwrap formats,Fail -spec@arb_depth_buffer_float@texwrap formats@GL_DEPTH32F_STENCIL8- NPOT,Fail -spec@arb_depth_buffer_float@texwrap formats@GL_DEPTH_COMPONENT32F- NPOT,Fail -spec@arb_depth_texture@fbo-generatemipmap-formats,Fail -spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT16,Fail -spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT16 NPOT,Fail -spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT24,Fail -spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT24 NPOT,Fail -spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT32,Fail -spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT32 NPOT,Fail -spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT,Fail -spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT 
NPOT,Fail -spec@arb_depth_texture@texwrap formats bordercolor,Fail -spec@arb_depth_texture@texwrap formats bordercolor@GL_DEPTH_COMPONENT16- border color only,Fail -spec@arb_depth_texture@texwrap formats bordercolor@GL_DEPTH_COMPONENT24- border color only,Fail -spec@arb_depth_texture@texwrap formats bordercolor@GL_DEPTH_COMPONENT32- border color only,Fail -spec@arb_depth_texture@texwrap formats bordercolor-swizzled,Fail -spec@arb_depth_texture@texwrap formats bordercolor-swizzled@GL_DEPTH_COMPONENT16- swizzled- border color only,Fail -spec@arb_depth_texture@texwrap formats bordercolor-swizzled@GL_DEPTH_COMPONENT24- swizzled- border color only,Fail -spec@arb_depth_texture@texwrap formats bordercolor-swizzled@GL_DEPTH_COMPONENT32- swizzled- border color only,Fail -spec@arb_depth_texture@texwrap formats,Fail -spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT16- NPOT,Fail -spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT24- NPOT,Fail -spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT32- NPOT,Fail -spec@arb_framebuffer_object@fbo-drawbuffers-none use_frag_out,Fail -spec@arb_pixel_buffer_object@pbo-getteximage,Fail -spec@arb_pixel_buffer_object@texsubimage array pbo,Fail -spec@arb_point_sprite@arb_point_sprite-checkerboard,Fail -spec@arb_point_sprite@arb_point_sprite-mipmap,Fail -spec@arb_shader_storage_buffer_object@compiler@atomicmin-swizzle.vert,Fail -spec@arb_shader_texture_lod@execution@arb_shader_texture_lod-texgrad,Fail -spec@arb_shader_texture_lod@execution@arb_shader_texture_lod-texgradcube,Fail -spec@arb_texture_float@fbo-blending-formats,Fail -spec@arb_texture_float@fbo-blending-formats@GL_ALPHA32F_ARB,Fail -spec@arb_texture_float@fbo-blending-formats@GL_INTENSITY16F_ARB,Fail -spec@arb_texture_float@fbo-blending-formats@GL_INTENSITY32F_ARB,Fail -spec@arb_texture_float@fbo-blending-formats@GL_LUMINANCE16F_ARB,Fail -spec@arb_texture_float@fbo-blending-formats@GL_LUMINANCE32F_ARB,Fail -spec@arb_texture_float@fbo-blending-formats@GL_LUMINANCE_ALPHA32F_ARB,Fail -spec@arb_texture_float@fbo-blending-formats@GL_RGB16F,Fail -spec@arb_texture_float@fbo-blending-formats@GL_RGB32F,Fail -spec@arb_texture_float@fbo-blending-formats@GL_RGBA32F,Fail -spec@arb_texture_float@texwrap formats bordercolor,Fail -spec@arb_texture_float@texwrap formats bordercolor@GL_ALPHA32F_ARB- border color only,Fail -spec@arb_texture_float@texwrap formats bordercolor@GL_INTENSITY32F_ARB- border color only,Fail -spec@arb_texture_float@texwrap formats bordercolor@GL_LUMINANCE32F_ARB- border color only,Fail -spec@arb_texture_float@texwrap formats bordercolor@GL_LUMINANCE_ALPHA32F_ARB- border color only,Fail -spec@arb_texture_float@texwrap formats bordercolor@GL_RGB32F- border color only,Fail -spec@arb_texture_float@texwrap formats bordercolor@GL_RGBA32F- border color only,Fail -spec@arb_texture_float@texwrap formats bordercolor-swizzled,Fail -spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_ALPHA32F_ARB- swizzled- border color only,Fail -spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_INTENSITY32F_ARB- swizzled- border color only,Fail -spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_LUMINANCE32F_ARB- swizzled- border color only,Fail -spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_LUMINANCE_ALPHA32F_ARB- swizzled- border color only,Fail -spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_RGB32F- swizzled- border color only,Fail -spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_RGBA32F- swizzled- border color 
only,Fail -spec@arb_texture_rectangle@1-1-linear-texture,Fail -spec@arb_texture_rg@fbo-blending-formats-float,Fail -spec@arb_texture_rg@fbo-blending-formats-float@GL_R32F,Fail -spec@arb_texture_rg@fbo-blending-formats-float@GL_RG32F,Fail -spec@arb_texture_rg@texwrap formats bordercolor,Fail -spec@arb_texture_rg@texwrap formats bordercolor@GL_R16- border color only,Fail -spec@arb_texture_rg@texwrap formats bordercolor@GL_RG16- border color only,Fail -spec@arb_texture_rg@texwrap formats bordercolor-swizzled,Fail -spec@arb_texture_rg@texwrap formats bordercolor-swizzled@GL_R16- swizzled- border color only,Fail -spec@arb_texture_rg@texwrap formats bordercolor-swizzled@GL_RG16- swizzled- border color only,Fail -spec@arb_texture_rg@texwrap formats-float bordercolor,Fail -spec@arb_texture_rg@texwrap formats-float bordercolor@GL_R32F- border color only,Fail -spec@arb_texture_rg@texwrap formats-float bordercolor@GL_RG32F- border color only,Fail -spec@arb_texture_rg@texwrap formats-float bordercolor-swizzled,Fail -spec@arb_texture_rg@texwrap formats-float bordercolor-swizzled@GL_R32F- swizzled- border color only,Fail -spec@arb_texture_rg@texwrap formats-float bordercolor-swizzled@GL_RG32F- swizzled- border color only,Fail -spec@arb_texture_rg@texwrap formats-float,Fail -spec@arb_texture_rg@texwrap formats-float@GL_R32F- NPOT,Fail -spec@arb_texture_rg@texwrap formats-float@GL_RG32F- NPOT,Fail -spec@arb_transform_feedback2@change objects while paused (gles3),Fail -spec@egl 1.4@egl-copy-buffers,Crash -spec@egl 1.4@eglterminate then unbind context,Fail -spec@egl_ext_protected_content@conformance,Fail -spec@egl_khr_gl_image@egl_khr_gl_renderbuffer_image-clear-shared-image gl_depth_component24,Fail -spec@egl_khr_gl_image@egl_khr_gl_renderbuffer_image-clear-shared-image gl_rgba,Fail -spec@egl_khr_surfaceless_context@viewport,Fail -spec@egl_mesa_configless_context@basic,Fail -spec@ext_framebuffer_blit@fbo-blit-check-limits,Fail -spec@ext_framebuffer_multisample@blit-mismatched-formats,Fail -spec@ext_framebuffer_multisample@interpolation 2 centroid-edges,Fail -spec@ext_framebuffer_multisample@interpolation 4 centroid-edges,Fail -spec@ext_framebuffer_object@fbo-blending-format-quirks,Fail -spec@ext_framebuffer_object@fbo-blending-formats,Fail -spec@ext_framebuffer_object@fbo-blending-formats@GL_RGB10,Fail -spec@ext_framebuffer_object@getteximage-formats init-by-clear-and-render,Fail -spec@ext_framebuffer_object@getteximage-formats init-by-rendering,Fail -spec@ext_gpu_shader4@execution@texelfetch@fs-texelfetch-isampler1darray,Fail -spec@ext_gpu_shader4@execution@texelfetch@fs-texelfetch-sampler1darray,Fail -spec@ext_gpu_shader4@execution@texelfetch@fs-texelfetch-usampler1darray,Fail -spec@ext_gpu_shader4@execution@texelfetchoffset@fs-texelfetch-isampler1darray,Fail -spec@ext_gpu_shader4@execution@texelfetchoffset@fs-texelfetch-sampler1darray,Fail -spec@ext_gpu_shader4@execution@texelfetchoffset@fs-texelfetch-usampler1darray,Fail -spec@ext_gpu_shader4@execution@texelfetchoffset@vs-texelfetch-isampler1darray,Fail -spec@ext_gpu_shader4@execution@texelfetchoffset@vs-texelfetch-sampler1darray,Fail -spec@ext_gpu_shader4@execution@texelfetchoffset@vs-texelfetch-usampler1darray,Fail -spec@ext_gpu_shader4@execution@texelfetch@vs-texelfetch-isampler1darray,Fail -spec@ext_gpu_shader4@execution@texelfetch@vs-texelfetch-sampler1darray,Fail -spec@ext_gpu_shader4@execution@texelfetch@vs-texelfetch-usampler1darray,Fail -spec@ext_gpu_shader4@tex-miplevel-selection gpu4texture() 1darray,Fail 
-spec@ext_gpu_shader4@tex-miplevel-selection gpu4texture() 1darrayshadow,Fail -spec@ext_gpu_shader4@tex-miplevel-selection gpu4texture(bias) 1darray,Fail -spec@ext_gpu_shader4@tex-miplevel-selection gpu4texture(bias) 1darrayshadow,Fail -spec@ext_gpu_shader4@tex-miplevel-selection gpu4texture() cubeshadow,Fail -spec@ext_gpu_shader4@tex-miplevel-selection gpu4texturegrad 1darray,Fail -spec@ext_gpu_shader4@tex-miplevel-selection gpu4texturegrad 1darrayshadow,Fail -spec@ext_gpu_shader4@tex-miplevel-selection gpu4texturegradoffset 1darray,Fail -spec@ext_gpu_shader4@tex-miplevel-selection gpu4texturegradoffset 1darrayshadow,Fail -spec@ext_gpu_shader4@tex-miplevel-selection gpu4texturelod 1darray,Fail -spec@ext_gpu_shader4@tex-miplevel-selection gpu4texturelod 1darrayshadow,Fail -spec@ext_gpu_shader4@tex-miplevel-selection gpu4texturelodoffset 1darray,Fail -spec@ext_gpu_shader4@tex-miplevel-selection gpu4texturelodoffset 1darrayshadow,Fail -spec@ext_gpu_shader4@tex-miplevel-selection gpu4textureoffset 1darray,Fail -spec@ext_gpu_shader4@tex-miplevel-selection gpu4textureoffset 1darrayshadow,Fail -spec@ext_packed_depth_stencil@texwrap formats bordercolor,Fail -spec@ext_packed_depth_stencil@texwrap formats bordercolor@GL_DEPTH24_STENCIL8- border color only,Fail -spec@ext_packed_depth_stencil@texwrap formats bordercolor-swizzled,Fail -spec@ext_packed_depth_stencil@texwrap formats bordercolor-swizzled@GL_DEPTH24_STENCIL8- swizzled- border color only,Fail -spec@ext_packed_depth_stencil@texwrap formats,Fail -spec@ext_packed_depth_stencil@texwrap formats@GL_DEPTH24_STENCIL8- NPOT,Fail -spec@ext_packed_float@query-rgba-signed-components,Fail -spec@ext_texture_array@array-texture,Fail -spec@ext_texture_array@fbo-generatemipmap-array rgb9_e5,Fail -spec@ext_texture_array@fbo-generatemipmap-array,Fail -spec@ext_texture_array@texsubimage array,Fail -spec@ext_texture_integer@getteximage-clamping gl_arb_texture_rg,Fail -spec@ext_texture_integer@getteximage-clamping,Fail -spec@ext_texture_lod_bias@lodbias,Fail -spec@ext_texture_snorm@texwrap formats bordercolor,Fail -spec@ext_texture_snorm@texwrap formats bordercolor@GL_ALPHA16_SNORM- border color only,Fail -spec@ext_texture_snorm@texwrap formats bordercolor@GL_INTENSITY16_SNORM- border color only,Fail -spec@ext_texture_snorm@texwrap formats bordercolor@GL_LUMINANCE16_ALPHA16_SNORM- border color only,Fail -spec@ext_texture_snorm@texwrap formats bordercolor@GL_LUMINANCE16_SNORM- border color only,Fail -spec@ext_texture_snorm@texwrap formats bordercolor@GL_R16_SNORM- border color only,Fail -spec@ext_texture_snorm@texwrap formats bordercolor@GL_RG16_SNORM- border color only,Fail -spec@ext_texture_snorm@texwrap formats bordercolor@GL_RGB16_SNORM- border color only,Fail -spec@ext_texture_snorm@texwrap formats bordercolor@GL_RGBA16_SNORM- border color only,Fail -spec@ext_texture_snorm@texwrap formats bordercolor-swizzled,Fail -spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_ALPHA16_SNORM- swizzled- border color only,Fail -spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_INTENSITY16_SNORM- swizzled- border color only,Fail -spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_LUMINANCE16_ALPHA16_SNORM- swizzled- border color only,Fail -spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_LUMINANCE16_SNORM- swizzled- border color only,Fail -spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_R16_SNORM- swizzled- border color only,Fail -spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_RG16_SNORM- 
swizzled- border color only,Fail -spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_RGB16_SNORM- swizzled- border color only,Fail -spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_RGBA16_SNORM- swizzled- border color only,Fail -spec@arb_texture_storage@texture-storage@cube array texture,Fail -spec@glsl-1.10@execution@glsl-fs-inline-explosion,Crash -spec@glsl-1.10@execution@glsl-vs-inline-explosion,Crash -spec@glsl-1.20@compiler@invalid-vec4-array-to-vec3-array-conversion.vert,Fail -spec@glsl-1.20@execution@clipping@vs-clip-vertex-primitives,Fail -spec@glsl-1.20@execution@fs-underflow-mul-compare-zero,Fail -spec@intel_performance_query@intel_performance_query-issue_2235,Fail -spec@khr_texture_compression_astc@miptree-gles srgb-fp,Fail -spec@khr_texture_compression_astc@miptree-gles srgb-fp@sRGB decode full precision,Fail -spec@khr_texture_compression_astc@sliced-3d-miptree-gles srgb-fp,Fail -spec@khr_texture_compression_astc@sliced-3d-miptree-gles srgb-fp@sRGB decode full precision,Fail -spec@nv_copy_depth_to_color@nv_copy_depth_to_color 0 0x223344ff,Crash -spec@nv_copy_depth_to_color@nv_copy_depth_to_color 0 0x76356278,Crash -spec@nv_copy_depth_to_color@nv_copy_depth_to_color 1 0x223344ff,Crash -spec@nv_copy_depth_to_color@nv_copy_depth_to_color 1 0x76356278,Crash -spec@nv_copy_depth_to_color@nv_copy_depth_to_color,Crash -spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-2d-array.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-2d-array.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-2d.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-2d.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-3d.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-3d.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-cube.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-cube.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-2d-array.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-2d-array.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-2d.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-2d.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-3d.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-3d.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-cube.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-cube.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-2d-array.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-2d-array.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-2d.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-2d.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-3d.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-3d.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-cube.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-cube.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-2d-array.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-2d-array.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-2d.frag,Fail 
-spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-2d.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-3d.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-3d.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-cube.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-cube.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-2d-array.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-2d-array.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-2d.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-2d.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-3d.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-3d.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-cube.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-cube.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-2d-array.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-2d-array.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-2d.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-2d.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-3d.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-3d.vert,Fail -spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-cube.frag,Fail -spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-cube.vert,Fail -spec@nv_read_depth@read_depth_gles3,Fail -spec@oes_egl_image_external_essl3@oes_egl_image_external_essl3,Crash -spec@oes_shader_io_blocks@compiler@layout-location-aliasing.vert,Fail diff --git a/src/broadcom/ci/piglit-v3d-rpi4-flakes.txt b/src/broadcom/ci/piglit-v3d-rpi4-flakes.txt deleted file mode 100644 index 14d2b9b4fd8..00000000000 --- a/src/broadcom/ci/piglit-v3d-rpi4-flakes.txt +++ /dev/null @@ -1,7 +0,0 @@ -glx@glx_arb_sync_control@swapbuffersmsc-divisor-zero -glx@glx_arb_sync_control@waitformsc -spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=4 -spec@!opengl 1.1@depthstencil-default_fb-drawpixels-32f_24_8_rev samples=2 -spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=4 -spec@arb_occlusion_query@occlusion_query_order -spec@egl_chromium_sync_control@conformance diff --git a/src/broadcom/ci/piglit-v3d-rpi4-skips.txt b/src/broadcom/ci/piglit-v3d-rpi4-skips.txt deleted file mode 100644 index 2c70ff30c3f..00000000000 --- a/src/broadcom/ci/piglit-v3d-rpi4-skips.txt +++ /dev/null @@ -1,20 +0,0 @@ -# Slow tests (> 1 minute to run) -spec@!opengl 1.1@streaming-texture-leak -spec@!opengl 1.2@tex3d-maxsize -spec@ext_texture_env_combine@texture-env-combine -spec@glsl-1.10@execution@loops@glsl-fs-unroll-explosion -spec@glsl-1.10@execution@loops@glsl-vs-unroll-explosion -spec@!opengl 1.0@gl-1.0-blend-func - -# Extensions not supported -spec@arb_gpu_shader_fp64.* -spec@arb_gpu_shader_gpu5.* -spec@arb_gpu_shader_int64.* -spec@arb_tessellation_shader.* -spec@arb_texture_cube_map.* -spec@glsl-1.30.* -spec@glsl-1.40.* -spec@glsl-1.50.* -spec@glsl-3.* -spec@glsl-4.* -spec@glsl-es-3.20.* diff --git a/src/broadcom/ci/piglit-vc4-rpi3-flakes.txt b/src/broadcom/ci/piglit-vc4-rpi3-flakes.txt deleted file mode 100644 index afb7a908c87..00000000000 --- a/src/broadcom/ci/piglit-vc4-rpi3-flakes.txt +++ /dev/null 
@@ -1,8 +0,0 @@ -glx@glx-multi-window-single-context -shaders@glsl-vs-loop -shaders@glsl-vs-loop-nested -spec@arb_framebuffer_srgb@blit renderbuffer srgb single_sampled enabled clear -spec@egl_chromium_sync_control@conformance -spec@ext_packed_depth_stencil@fbo-stencil-gl_depth24_stencil8-readpixels -spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=2 -spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=4 diff --git a/src/broadcom/ci/piglit-vc4-rpi3-skips.txt b/src/broadcom/ci/piglit-vc4-rpi3-skips.txt deleted file mode 100644 index ae25a28bb9a..00000000000 --- a/src/broadcom/ci/piglit-vc4-rpi3-skips.txt +++ /dev/null @@ -1,19 +0,0 @@ -# Slow tests (> 1 minute to run) -spec@ext_framebuffer_multisample@accuracy -glx@glx-multithread-texture -spec@arb_internalformat_query2@all internalformat_<x>_type pname checks -spec@!opengl 1.1@streaming-texture-leak -spec@!opengl 1.0@gl-1.0-blend-func - -# Extensions not supported -spec@arb_gpu_shader_fp64.* -spec@arb_gpu_shader_gpu5.* -spec@arb_gpu_shader_int64.* -spec@arb_tessellation_shader.* -spec@arb_texture_cube_map.* -spec@glsl-1.30.* -spec@glsl-1.40.* -spec@glsl-1.50.* -spec@glsl-3.* -spec@glsl-4.* -spec@glsl-es-3.* diff --git a/src/broadcom/ci/traces-broadcom.yml b/src/broadcom/ci/traces-broadcom.yml new file mode 100644 index 00000000000..d330ad0dcc8 --- /dev/null +++ b/src/broadcom/ci/traces-broadcom.yml @@ -0,0 +1,205 @@ +%YAML 1.2 +--- +traces-db: + download-url: "http://192.168.40.131:8888/cache/?uri=https://s3.freedesktop.org/mesa-tracie-public/" + +traces: + 0ad/0ad-v2.trace: + broadcom-rpi4: + checksum: 8bdca9e63f483ee71970075842f003db + + behdad-glyphy/glyphy-v2.trace: + broadcom-rpi4: + checksum: ea49462ff1545f21506dbd7b5028df45 + + blender/blender-demo-cube_diorama.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 4.5 + + blender/blender-demo-ellie_pose.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 4.5 + + filament/filament-default.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 4.1 + + glxgears/glxgears-2-v2.trace: + broadcom-rpi4: + label: [skip, flakes] + text: "Often fails when running on xwayland, with what looks like an incorrect resolution" + checksum: 2a9c5e35fa5693fd7d3a76f7b9746edb + + godot/godot-thrive.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 3.3 + + godot/godot-tps-gles3-high.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 3.3 + + gputest/furmark-v2.trace: + broadcom-rpi4: + checksum: 800b2be5981d7e1a6570643f7dfd9a33 + + gputest/gimark-v2.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 3.3 + + gputest/pixmark-julia-fp32-v2.trace: + broadcom-rpi4: + label: [skip, flakes] + checksum: be70fc9e3829fff5ad1b6ecfb6fa551c + + gputest/pixmark-julia-fp64-v2.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 4.0 + + gputest/pixmark-volplosion-v2.trace: + broadcom-rpi4: + checksum: 03f6b1c064af4e7eb117b800893cdba6 + + gputest/plot3d-v2.trace: + broadcom-rpi4: + checksum: 1ef33ad22679107a256501c79bfd9e7c + + gputest/tessmark-v2.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 4.0 + + gputest/triangle-v2.trace: + broadcom-rpi4: + checksum: df6df2af5fecfa42b5c2c332b726e93c + + humus/AmbientAperture-v2.trace: + broadcom-rpi4: + checksum: a2d2a0141384a23e91ed30a27ed46bfe + + humus/CelShading-v2.trace: + broadcom-rpi4: + checksum: 1135888a0e8723bbcded5ef9f0925964 + + humus/DynamicBranching3-v2.trace: + broadcom-rpi4: + checksum: 68011c66cfd83aa8a6b568de7c726d49 
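The traces-broadcom.yml entries above and below drive trace replay on the rpi4: a trace is replayed, the final frame is hashed, and the hash is compared against the per-device checksum, while label: entries mark traces to skip, flag as flaky, or record as unsupported. A minimal sketch of that comparison, assuming the 32-hex-digit checksums are MD5 digests of the dumped frame (the commit itself does not say which hash is used):

import hashlib

def trace_passes(frame_path: str, expected_checksum: str) -> bool:
    # Hash the frame dumped by the replay and compare it against the
    # expectation recorded for this device (e.g. broadcom-rpi4).
    with open(frame_path, "rb") as f:
        return hashlib.md5(f.read()).hexdigest() == expected_checksum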
+ + humus/HDR-v2.trace: + broadcom-rpi4: + checksum: de024f342418b578841f98ce697de8b5 + + humus/Portals-v2.trace: + broadcom-rpi4: + checksum: 269b9572113d6991cf58c96a833502bf + + humus/RaytracedShadows-v2.trace: + broadcom-rpi4: + checksum: 6b572f241f4f9ee001ef849d10d03cc5 + + humus/VolumetricFogging2-v2.trace: + broadcom-rpi4: + checksum: d3b89dfaff0277be4b4b2ad2cf055d54 + + jvgs/jvgs-d27fb67-v2.trace: + broadcom-rpi4: + checksum: 831138a408cc9557528ef68381b080f2 + + neverball/neverball-v2.trace: + broadcom-rpi4: + checksum: c8e8ee352bdb303e4ed144b69272575e + + nheko/nheko-colors.trace: + broadcom-rpi4: + checksum: 922597b0203ff18d6e430002bcf32ef4 + + supertuxkart/supertuxkart-mansion-egl-gles-v2.trace: + broadcom-rpi4: + checksum: 93fe17a18ab10d862b5a42b4ea05a658 + + valve/counterstrike-source-v2.trace: + broadcom-rpi4: + label: [skip, timeout] + + valve/counterstrike-v2.trace: + broadcom-rpi4: + checksum: 547f6435bf21458e518bbcb2161962ab + + valve/half-life-2-v2.trace: + broadcom-rpi4: + label: [crash] + text: v3d42_create_texture_shader_state_bo assertion about serial_id + + valve/portal-2-v2.trace: + broadcom-rpi4: + label: [skip, timeout] + + paraview/pv-manyspheres-v2.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 3.3 + + paraview/pv-waveletcontour-v2.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 3.3 + + paraview/pv-waveletvolume-v2.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 3.3 + + pathfinder/canvas_moire-v2.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 3.3 + + pathfinder/canvas_text_v2-v2.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 3.3 + + pathfinder/demo-v2.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 3.3 + + pioneer/pioneer.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 3.2 + + ror/ror-default.trace: + broadcom-rpi4: + label: [skip, flakes] + checksum: 533edca21409981b4983db846de4355e + + thedarkmod/thedarkmod.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 3.3 + + unvanquished/unvanquished-lowest.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 3.2 + + unvanquished/unvanquished-ultra.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GL 3.2 + + warzone2100/warzone2100-default.trace: + broadcom-rpi4: + label: [unsupported] + text: needs GLSL 1.50 + + xonotic/xonotic-keybench-high-v2.trace: + broadcom-rpi4: + checksum: 3bc4ca2efa5a7b35701a8daad378e565 diff --git a/src/broadcom/cle/gen_pack_header.py b/src/broadcom/cle/gen_pack_header.py index 0090b616d50..1cc2446d0bd 100644 --- a/src/broadcom/cle/gen_pack_header.py +++ b/src/broadcom/cle/gen_pack_header.py @@ -25,9 +25,8 @@ import xml.parsers.expat import re import sys -import copy -license = """/* Generated code, see v3d_packet_v21.xml, v3d_packet_v33.xml and gen_pack_header.py */ +license = """/* Generated code, see vc4_packet.xml, v3d_packet.xml and gen_pack_header.py */ """ pack_header = """%(license)s @@ -113,7 +112,7 @@ class Field(object): self.type = attrs["type"] if self.type == 'bool' and self.start != self.end: - print("#error Field {} has bool type but more than one bit of size".format(self.name)); + print("#error Field {} has bool type but more than one bit of size".format(self.name)) if "prefix" in attrs: self.prefix = safe_name(attrs["prefix"]).upper() @@ -215,7 +214,7 @@ class Group(object): last_byte = field.end // 8 for b in range(first_byte, last_byte + 1): - if not b in bytes: + if b not in bytes: bytes[b] = self.Byte()
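The gen_pack_header.py hunks that follow replace the generator's private __gen_* emit helpers with the shared util_bitpack_* helpers from src/util. A rough Python sketch of the unsigned case, assuming the usual bitpack contract (assert the value fits the field, then shift it into position so the caller can OR the fields of one byte together):

def util_bitpack_uint(value: int, start: int, end: int) -> int:
    # Pack an unsigned value into bits [start, end] of an output word;
    # the generated pack functions OR several of these together per byte.
    assert 0 <= value < (1 << (end - start + 1)), "value does not fit field"
    return value << start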
bytes[b].fields.append(field) @@ -240,7 +239,7 @@ class Group(object): for index in range(self.length): # Handle MBZ bytes - if not index in bytes: + if index not in bytes: print(" cl[%2d] = 0;" % index) continue byte = bytes[index] @@ -276,7 +275,6 @@ class Group(object): byte_start = index * 8 - v = None prefix = " cl[%2d] =" % index field_index = 0 @@ -296,46 +294,46 @@ class Group(object): value = "%s - 1" % value if field.type == "mbo": - s = "__gen_mbo(%d, %d)" % \ + s = "util_bitpack_ones(%d, %d)" % \ (start, end) elif field.type == "address": extra_shift = (31 - (end - start)) // 8 * 8 s = "__gen_address_offset(&values->%s)" % byte.address.name elif field.type == "uint": - s = "__gen_uint(%s, %d, %d)" % \ + s = "util_bitpack_uint(%s, %d, %d)" % \ (value, start, end) elif field.type in self.parser.enums: - s = "__gen_uint(%s, %d, %d)" % \ + s = "util_bitpack_uint(%s, %d, %d)" % \ (value, start, end) elif field.type == "int": - s = "__gen_sint(%s, %d, %d)" % \ + s = "util_bitpack_sint(%s, %d, %d)" % \ (value, start, end) elif field.type == "bool": - s = "__gen_uint(%s, %d, %d)" % \ + s = "util_bitpack_uint(%s, %d, %d)" % \ (value, start, end) elif field.type == "float": s = "#error %s float value mixed in with other fields" % name elif field.type == "f187": - s = "__gen_uint(fui(%s) >> 16, %d, %d)" % \ + s = "util_bitpack_uint(fui(%s) >> 16, %d, %d)" % \ (value, start, end) elif field.type == "offset": s = "__gen_offset(%s, %d, %d)" % \ (value, start, end) elif field.type == 'ufixed': - s = "__gen_ufixed(%s, %d, %d, %d)" % \ + s = "util_bitpack_ufixed(%s, %d, %d, %d)" % \ (value, start, end, field.fractional_size) elif field.type == 'sfixed': - s = "__gen_sfixed(%s, %d, %d, %d)" % \ + s = "util_bitpack_sfixed(%s, %d, %d, %d)" % \ (value, start, end, field.fractional_size) elif field.type in self.parser.structs: - s = "__gen_uint(v%d_%d, %d, %d)" % \ + s = "util_bitpack_uint(v%d_%d, %d, %d)" % \ (index, field_index, start, end) field_index = field_index + 1 else: print("/* unhandled field %s, type %s */\n" % (name, field.type)) s = None - if not s == None: + if s is not None: shift = byte_start - field_byte_start + extra_shift if shift: s = "%s >> %d" % (s, shift) @@ -383,7 +381,6 @@ class Group(object): convert = "__gen_unpack_sfixed" else: print("/* unhandled field %s, type %s */\n" % (field.name, field.type)) - s = None plusone = "" if field.minus_one: @@ -545,9 +542,9 @@ class Parser(object): def emit_header(self, name): default_fields = [] for field in self.group.fields: - if not type(field) is Field: + if type(field) is not Field: continue - if field.default == None: + if field.default is None: continue default_fields.append(" .%-35s = %6d" % (field.name, field.default)) @@ -577,7 +574,7 @@ class Parser(object): return name = self.register - if not self.reg_num == None: + if self.reg_num is not None: print('#define %-33s 0x%04x' % (self.gen_prefix(name + "_num"), self.reg_num)) diff --git a/src/broadcom/cle/meson.build b/src/broadcom/cle/meson.build index 4cab2b38dda..da88cd220a5 100644 --- a/src/broadcom/cle/meson.build +++ b/src/broadcom/cle/meson.build @@ -18,27 +18,25 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
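The meson.build hunk below shrinks the version table from four entries to three and makes each entry name the cle XML file it is generated from: vc4 (v21) keeps its own XML, while v42 and the new v71 share v3d_packet.xml and rely on min_ver/max_ver attributes to gate per-version fields. Roughly what the custom_target loop does for each entry, as a hypothetical standalone Python driver (file names taken from the hunk, invocation from its command line):

import subprocess

# (version, cle XML file) pairs mirroring v3d_versions below
v3d_versions = [(21, "vc4_packet.xml"), (42, "v3d_packet.xml"), (71, "v3d_packet.xml")]

for ver, xml in v3d_versions:
    header = "v3d_packet_v{}_pack.h".format(ver)
    # gen_pack_header.py writes the header to stdout; meson captures it
    # (capture : true), which the redirection here stands in for.
    with open(header, "w") as out:
        subprocess.run(["python3", "gen_pack_header.py", xml, str(ver)],
                       stdout=out, check=True)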
-# [version, cle XML version] +# [version, cle XML file] v3d_versions = [ - [21, 21], - [33, 33], - [41, 33], - [42, 33] + [21, 'vc4_packet.xml'], + [42, 'v3d_packet.xml'], + [71, 'v3d_packet.xml'] ] v3d_xml_files = [] v3d_xml_pack = [] foreach _v : v3d_versions v = _v[0] - xmlver = _v[1] - f = 'v3d_packet_v@0@.xml'.format(xmlver) + xmlfile = _v[1] _name = 'v3d_packet_v@0@_pack.h'.format(v) - if not v3d_xml_files.contains(f) - v3d_xml_files += f + if not v3d_xml_files.contains(xmlfile) + v3d_xml_files += xmlfile endif v3d_xml_pack += custom_target( _name, - input : ['gen_pack_header.py', f], + input : ['gen_pack_header.py', xmlfile], output : _name, command : [prog_python, '@INPUT@', '@0@'.format(v)], capture : true, @@ -47,7 +45,7 @@ endforeach v3d_xml_h = custom_target( 'v3d_xml.h', - input : ['../../intel/genxml/gen_zipped_file.py', v3d_xml_files], + input : ['../../util/gen_zipped_xml_file.py', v3d_xml_files], output : 'v3d_xml.h', command : [prog_python, '@INPUT@'], capture : true, @@ -59,9 +57,9 @@ if dep_expat.found() endif libbroadcom_cle = static_library( - ['broadcom_cle', v3d_xml_h], - 'v3d_decoder.c', - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom], + 'broadcom_cle', + ['v3d_decoder.c', v3d_xml_h], + include_directories : [inc_include, inc_src, inc_broadcom], c_args : [no_override_init_args, expat_args], gnu_symbol_visibility : 'hidden', dependencies : [dep_libdrm, dep_valgrind, dep_expat, dep_zlib], diff --git a/src/broadcom/cle/v3d_decoder.c b/src/broadcom/cle/v3d_decoder.c index 97dd8ce8423..46cd152e599 100644 --- a/src/broadcom/cle/v3d_decoder.c +++ b/src/broadcom/cle/v3d_decoder.c @@ -267,51 +267,6 @@ get_register_offset(const char **atts, uint32_t *offset) return; } -static void -get_start_end_pos(int *start, int *end) -{ - /* start value has to be mod with 32 as we need the relative - * start position in the first DWord. For the end position, add - * the length of the field to the start position to get the - * relative postion in the 64 bit address. 
- */ - if (*end - *start > 32) { - int len = *end - *start; - *start = *start % 32; - *end = *start + len; - } else { - *start = *start % 32; - *end = *end % 32; - } - - return; -} - -static inline uint64_t -mask(int start, int end) -{ - uint64_t v; - - v = ~0ULL >> (63 - end + start); - - return v << start; -} - -static inline uint64_t -field(uint64_t value, int start, int end) -{ - get_start_end_pos(&start, &end); - return (value & mask(start, end)) >> (start); -} - -static inline uint64_t -field_address(uint64_t value, int start, int end) -{ - /* no need to right shift for address/offset */ - get_start_end_pos(&start, &end); - return (value & mask(start, end)); -} - static struct v3d_type string_to_type(struct parser_context *ctx, const char *s) { diff --git a/src/broadcom/cle/v3d_packet_v33.xml b/src/broadcom/cle/v3d_packet.xml index de80a6b64a1..09dde392fac 100644 --- a/src/broadcom/cle/v3d_packet_v33.xml +++ b/src/broadcom/cle/v3d_packet.xml @@ -1,4 +1,4 @@ -<vcxml gen="3.3" min_ver="33" max_ver="42"> +<vcxml gen="3.3" min_ver="42" max_ver="71"> <enum name="Compare Function" prefix="V3D_COMPARE_FUNC"> <value name="NEVER" value="0"/> @@ -69,30 +69,7 @@ <value name="TRIANGLE_FAN_TF" value="22"/> </enum> - <enum name="TMU Filter" prefix="V3D_TMU_FILTER" max_ver="33"> - <!-- Names are mip filter, min filter, mag filter --> - <value name="MIN_LIN_MIP_NONE_MAG_LIN" value="0"/> - <value name="MIN_LIN_MIP_NONE_MAG_NEAR" value="1"/> - <value name="MIN_NEAR_MIP_NONE_MAG_LIN" value="2"/> - <value name="MIN_NEAR_MIP_NONE_MAG_NEAR" value="3"/> - - <value name="MIN_NEAR_MIP_NEAR_MAG_LIN" value="4"/> - <value name="MIN_NEAR_MIP_NEAR_MAG_NEAR" value="5"/> - <value name="MIN_NEAR_MIP_LIN_MAG_LIN" value="6"/> - <value name="MIN_NEAR_MIP_LIN_MAG_NEAR" value="7"/> - - <value name="MIN_LIN_MIP_NEAR_MAG_LIN" value="8"/> - <value name="MIN_LIN_MIP_NEAR_MAG_NEAR" value="9"/> - <value name="MIN_LIN_MIP_LIN_MAG_LIN" value="10"/> - <value name="MIN_LIN_MIP_LIN_MAG_NEAR" value="11"/> - - <value name="ANISOTROPIC_2_1" value="12"/> - <value name="ANISOTROPIC_4_1" value="13"/> - <value name="ANISOTROPIC_8_1" value="14"/> - <value name="ANISOTROPIC_16_1" value="15"/> - </enum> - - <enum name="Border Color Mode" prefix="V3D_BORDER_COLOR" min_ver="41"> + <enum name="Border Color Mode" prefix="V3D_BORDER_COLOR"> <value name="0000" value="0"/> <value name="0001" value="1"/> <value name="1111" value="2"/> @@ -107,7 +84,7 @@ <value name="MIRROR_ONCE" value="4"/> </enum> - <enum name="TMU Op" prefix="V3D_TMU_OP" min_ver="41"> + <enum name="TMU Op" prefix="V3D_TMU_OP"> <value name="Write ADD, Read Prefetch" value="0"/> <value name="Write SUB, Read Clear" value="1"/> <value name="Write XCHG, Read Flush" value="2"/> @@ -167,11 +144,34 @@ <value name="depth_16" value="2"/> </enum> - <enum name="Render Target Clamp" prefix="V3D_RENDER_TARGET_CLAMP" min_ver="41"> + <enum name="Render Target Clamp" prefix="V3D_RENDER_TARGET_CLAMP" max_ver="42"> <value name="none" value="0"/> <!-- no clamping --> <value name="norm" value="1"/> <!-- [0,1] for f16 --> <value name="pos" value="2"/> <!-- [0, for f16 --> - <value name="int" value="3" min_ver="42"/> <!-- clamp to integer RT's range --> + <value name="int" value="3"/> <!-- clamp to integer RT's range --> + </enum> + + <enum name="Render Target Type Clamp" prefix="V3D_RENDER_TARGET_TYPE_CLAMP" min_ver="71"> + <value name="8i" value="0"/> <!-- no clamping --> + <value name="16i" value="1"/> <!-- no clamping --> + <value name="32i" value="2"/> <!-- no clamping --> + <value name="8ui" 
value="4"/> <!-- no clamping --> + <value name="16ui" value="5"/> <!-- no clamping --> + <value name="32ui" value="6"/> <!-- no clamping --> + <value name="8" value="8"/> <!-- no clamping --> + <value name="16f" value="9"/> <!-- no clamping --> + <value name="32f" value="10"/> <!-- no clamping --> + <value name="8i_clamped" value="16"/> <!-- clamp to integer RT's range --> + <value name="16i_clamped" value="17"/> <!-- clamp to integer RT's range --> + <value name="32i_clamped" value="18"/> <!-- clamp to integer RT's range --> + <value name="8ui_clamped" value="20"/> <!-- clamp to integer RT's range --> + <value name="16ui_clamped" value="21"/> <!-- clamp to integer RT's range --> + <value name="32ui_clamped" value="22"/> <!-- clamp to integer RT's range --> + <value name="16f_clamp_norm" value="24"/> <!-- [0,1] for f16 --> + <value name="16f_clamp_pos" value="25"/> <!-- [0, for f16 --> + <value name="16f_clamp_pq" value="26"/> <!-- PQ lin range, colour to [0, 125], alpha to [0, 1] for f16 --> + <value name="16f_clamp_hlg" value="27"/> <!-- HLG lin range, colour to [0, 12], alpha to [0, 1] for f16 --> + <value name="invalid" value="32"/> </enum> <!--- @@ -261,22 +261,27 @@ <value name="rgba8ui" value="34"/> <value name="rg8ui" value="35"/> <value name="r8ui" value="36"/> - <value name="srgbx8" value="37" max_ver="33"/> - <value name="rgbx8" value="38" max_ver="33"/> - <value name="bstc" value="39" min_ver="41"/> - <value name="d32f" value="40" min_ver="41"/> - <value name="d24" value="41" min_ver="41"/> - <value name="d16" value="42" min_ver="41"/> - <value name="d24s8" value="43" min_ver="41"/> - <value name="s8" value="44" min_ver="41"/> - <value name="rgba5551" value="45" min_ver="41"/> - </enum> - - <enum name="Z/S Output Image Format" prefix="V3D_OUTPUT_IMAGE_FORMAT_ZS" max_ver="33"> - <value name="depth_component32f" value="0"/> - <value name="depth_component24" value="1"/> <!-- depth low, pad high --> - <value name="depth_component16" value="2"/> - <value name="depth24_stencil8" value="3"/> <!-- stencil low, depth high --> + <value name="bstc8" value="39"/> + <value name="d32f" value="40"/> + <value name="d24" value="41"/> + <value name="d16" value="42"/> + <value name="d24s8" value="43"/> + <value name="s8" value="44"/> + <value name="rgba5551" value="45"/> + <value name="bstc8_srgb" value="46" min_ver="71"/> + <value name="bstc10" value="47" min_ver="71"/> + <value name="bstc10_srgb" value="48" min_ver="71"/> + <value name="bstc10_pq" value="49" min_ver="71"/> + <value name="rgba10x6" value="50" min_ver="71"/> + <value name="bstc10_hlg" value="55" min_ver="71"/> + <value name="rgba10x6_hlg" value="56" min_ver="71"/> + <value name="rgb10_a2_hlg" value="57" min_ver="71"/> + <value name="bstc10_pq_bt1886" value="58" min_ver="71"/> + <value name="rgba10x6_pq_bt1886" value="59" min_ver="71"/> + <value name="rgb10_a2_pq_bt1886" value="60" min_ver="71"/> + <value name="bstc10_hlg_bt1886" value="61" min_ver="71"/> + <value name="rgba10x6_hlg_bt1886" value="62" min_ver="71"/> + <value name="rgb10_a2_hlg_bt1886" value="63" min_ver="71"/> </enum> <enum name="Dither Mode" prefix="V3D_DITHER_MODE"> @@ -299,7 +304,7 @@ <value name="packed complete patches" value="2"/> </enum> - <enum name="Primitve counters" prefix="V3D_PRIM_COUNTS"> + <enum name="Primitive counters" prefix="V3D_PRIM_COUNTS"> <value name="tf_words_buffer0" value="0"/> <value name="tf_words_buffer1" value="1"/> <value name="tf_words_buffer2" value="2"/> @@ -309,6 +314,17 @@ <value name="tf_overflow" value="6"/> </enum> + <enum 
name="Line Rasterization" prefix="V3D_LINE_RASTERIZATION"> + <value name="diamond exit" value="0"/> + <value name="perp end caps" value="1"/> + </enum> + + <enum name="Z Clip Mode" prefix="V3D_Z_CLIP_MODE"> + <value name="NONE" value="0"/> + <value name="MIN_ONE_TO_ONE" value="1"/> + <value name="ZERO_TO_ONE" value="2"/> + </enum> + <packet code="0" name="Halt"/> <packet code="1" name="NOP"/> <packet code="4" name="Flush"/> @@ -362,57 +378,18 @@ <field name="column number in supertiles" size="8" start="0" type="uint"/> </packet> - <packet code="24" shortname="store_subsample" name="Store Multi-Sample Resolved Tile Color Buffer" cl="R" max_ver="33"/> - - <packet code="25" shortname="store_subsample_ex" name="Store Multi-Sample Resolved Tile Color Buffer (extended)" cl="R" max_ver="33"> - <field name="Disable Color Buffer write" size="8" start="8" type="uint"/> - <field name="Enable Z write" size="1" start="7" type="bool"/> - <field name="Enable Stencil write" size="1" start="6" type="bool"/> - <!-- bit 5 unused --> - <field name="Disable Color buffer(s) clear on write" size="1" start="4" type="bool"/> - <field name="Disable Stencil buffer clear on write" size="1" start="3" type="bool"/> - <field name="Disable Z buffer clear on write" size="1" start="2" type="bool"/> - <field name="Disable fast opportunistic write out in multisample mode" size="1" start="1" type="bool"/> - <field name="Last Tile of Frame" size="1" start="0" type="bool"/> - </packet> - - <packet code="25" shortname="clear" name="Clear Tile Buffers" cl="R" min_ver="41"> + <packet code="25" shortname="clear" name="Clear Tile Buffers" cl="R" max_ver="42"> <field name="Clear Z/Stencil Buffer" size="1" start="1" type="bool"/> <field name="Clear all Render Targets" size="1" start="0" type="bool"/> </packet> - <packet code="26" shortname="load" name="Reload Tile Color Buffer" cl="R" max_ver="33"> - <field name="Disable Color Buffer load" size="8" start="8" type="uint"/> - <field name="Enable Z load" size="1" start="7" type="bool"/> - <field name="Enable Stencil load" size="1" start="6" type="bool"/> - </packet> + <packet code="25" shortname="clear_rt" name="Clear Render Targets" cl="R" min_ver="71"/> - <packet code="26" shortname="end_loads" name="End of Loads" cl="R" min_ver="41"/> + <packet code="26" shortname="end_loads" name="End of Loads" cl="R"/> <packet code="27" shortname="end_tile" name="End of Tile Marker" cl="R"/> - <packet code="29" shortname="store_general" name="Store Tile Buffer General" cl="R" max_ver="33"> - <field name="Address" size="24" start="24" type="address"/> - <field name="Padded height of output image in UIF blocks" size="13" start="11" type="uint"/> - <field name="XOR UIF" size="1" start="10" type="bool"/> - <field name="Last Tile of Frame" size="1" start="8" type="bool"/> - <field name="Disable Color buffer(s) clear on write" size="1" start="7" type="bool"/> - <field name="Disable Stencil buffer clear on write" size="1" start="6" type="bool"/> - <field name="Disable Z buffer clear on write" size="1" start="5" type="bool"/> - <field name="Raw Mode" size="1" start="4" type="bool"/> - <field name="Buffer to Store" size="4" start="0" type="uint"> - <value name="Render target 0" value="0"/> - <value name="Render target 1" value="1"/> - <value name="Render target 2" value="2"/> - <value name="Render target 3" value="3"/> - <value name="None" value="8"/> - <value name="Z" value="9"/> - <value name="Stencil" value="10"/> - <value name="Z+Stencil" value="11"/> - </field> - </packet> - - <packet code="29" 
shortname="store" name="Store Tile Buffer General" cl="R" min_ver="41"> + <packet code="29" shortname="store" name="Store Tile Buffer General" cl="R"> <field name="Address" size="32" start="64" type="address"/> <!-- used for y flip --> @@ -438,6 +415,10 @@ <value name="Render target 1" value="1"/> <value name="Render target 2" value="2"/> <value name="Render target 3" value="3"/> + <value name="Render target 4" value="4" min_ver="71"/> + <value name="Render target 5" value="5" min_ver="71"/> + <value name="Render target 6" value="6" min_ver="71"/> + <value name="Render target 7" value="7" min_ver="71"/> <value name="None" value="8"/> <value name="Z" value="9"/> <value name="Stencil" value="10"/> @@ -445,24 +426,7 @@ </field> </packet> - <packet code="30" shortname="load_general" name="Load Tile Buffer General" cl="R" max_ver="33"> - <field name="Address" size="24" start="24" type="address"/> - <field name="Padded height of output image in UIF blocks" size="13" start="11" type="uint"/> - <field name="XOR UIF" size="1" start="10" type="bool"/> - <field name="Raw Mode" size="1" start="4" type="bool"/> - <field name="Buffer to Load" size="4" start="0" type="uint"> - <value name="Render target 0" value="0"/> - <value name="Render target 1" value="1"/> - <value name="Render target 2" value="2"/> - <value name="Render target 3" value="3"/> - <value name="None" value="8"/> - <value name="Z" value="9"/> - <value name="Stencil" value="10"/> - <value name="Z+Stencil" value="11"/> - </field> - </packet> - - <packet code="30" shortname="load" name="Load Tile Buffer General" cl="R" min_ver="41"> + <packet code="30" shortname="load" name="Load Tile Buffer General" cl="R"> <field name="Address" size="32" start="64" type="address"/> <!-- used for y flip --> @@ -496,23 +460,7 @@ <packet code="31" shortname="tf_draw_flush_and_count" name="Transform Feedback Flush and Count"/> - <packet code="32" name="Indexed Prim List" cl="B" max_ver="33"> - <field name="Minimum index" size="32" start="104" type="uint"/> - <field name="Enable Primitive Restarts" size="1" start="103" type="bool"/> - <field name="Maximum index" size="31" start="72" type="uint"/> - <field name="Address of Indices List" size="32" start="40" type="address"/> - <field name="Length" size="32" start="8" type="uint"/> - - <field name="Index type" size="2" start="6" type="uint"> - <value name="Index type 8-bit" value="0"/> - <value name="Index type 16-bit" value="1"/> - <value name="Index type 32-bit" value="2"/> - </field> - - <field name="mode" size="5" start="0" type="Primitive"/> - </packet> - - <packet code="32" name="Indexed Prim List" cl="B" min_ver="41"> + <packet code="32" name="Indexed Prim List" cl="B"> <field name="Index Offset" size="32" start="40" type="uint"/> <field name="Enable Primitive Restarts" size="1" start="39" type="bool"/> @@ -527,23 +475,7 @@ <field name="mode" size="6" start="0" type="Primitive"/> </packet> - <packet code="33" name="Indirect Indexed Instanced Prim List" cl="B" max_ver="33"> - <field name="Stride in Multiples of 4 Bytes" size="8" start="104" type="uint"/> - <field name="Address of Indices List" size="32" start="72" type="address"/> - <field name="Address" size="32" start="40" type="address"/> - <field name="Enable Primitive Restarts" size="1" start="39" type="bool"/> - <field name="Number of Draw Indirect Indexed Records" size="31" start="8" type="uint"/> - - <field name="Index type" size="2" start="6" type="uint"> - <value name="Index type 8-bit" value="0"/> - <value name="Index type 16-bit" value="1"/> - 
<value name="Index type 32-bit" value="2"/> - </field> - - <field name="mode" size="6" start="0" type="Primitive"/> - </packet> - - <packet code="33" name="Indirect Indexed Instanced Prim List" cl="B" min_ver="41"> + <packet code="33" name="Indirect Indexed Instanced Prim List" cl="B"> <field name="Stride in Multiples of 4 Bytes" size="8" start="72" type="uint"/> <field name="Address" size="32" start="40" type="address"/> <field name="Enable Primitive Restarts" size="1" start="39" type="bool"/> @@ -558,23 +490,7 @@ <field name="mode" size="6" start="0" type="Primitive"/> </packet> - <packet code="34" name="Indexed Instanced Prim List" cl="B" max_ver="33"> - <field name="Enable Primitive Restarts" size="1" start="135" type="bool"/> - <field name="Maximum index" size="31" start="104" type="uint"/> - <field name="Address of Indices List" size="32" start="72" type="address"/> - <field name="Number of Instances" size="32" start="40" type="uint"/> - <field name="Instance Length" size="32" start="8" type="uint"/> - - <field name="Index type" size="2" start="6" type="uint"> - <value name="Index type 8-bit" value="0"/> - <value name="Index type 16-bit" value="1"/> - <value name="Index type 32-bit" value="2"/> - </field> - - <field name="mode" size="5" start="0" type="Primitive"/> - </packet> - - <packet code="34" name="Indexed Instanced Prim List" cl="B" min_ver="41"> + <packet code="34" name="Indexed Instanced Prim List" cl="B"> <field name="Index Offset" size="32" start="72" type="uint"/> <field name="Number of Instances" size="32" start="40" type="uint"/> <field name="Enable Primitive Restarts" size="1" start="39" type="bool"/> @@ -626,16 +542,16 @@ <field name="Base Vertex" size="32" start="0" type="uint"/> </packet> - <packet code="44" name="Index Buffer Setup" cl="B" min_ver="41"> + <packet code="44" name="Index Buffer Setup" cl="B"> <field name="Address" size="32" start="0" type="address"/> <field name="Size" size="32" start="32" type="uint"/> </packet> - <packet code="54" name="Set InstanceID" cl="B" min_ver="41"> + <packet code="54" name="Set InstanceID" cl="B"> <field name="Instance ID" size="32" start="0" type="uint"/> </packet> - <packet code="55" name="Set PrimitiveID" cl="B" min_ver="41"> + <packet code="55" name="Set PrimitiveID" cl="B"> <field name="Primitive ID" size="32" start="0" type="uint"/> </packet> @@ -662,22 +578,22 @@ <field name="number of attribute arrays" size="5" start="0" type="uint"/> </packet> - <packet code="65" shortname="gl_t_shader" name="GL Shader State including TS" min_ver="41"> + <packet code="65" shortname="gl_t_shader" name="GL Shader State including TS"> <field name="address" size="27" start="5" type="address"/> <field name="number of attribute arrays" size="5" start="0" type="uint"/> </packet> - <packet code="66" shortname="gl_g_shader" name="GL Shader State including GS" min_ver="41"> + <packet code="66" shortname="gl_g_shader" name="GL Shader State including GS"> <field name="address" size="27" start="5" type="address"/> <field name="number of attribute arrays" size="5" start="0" type="uint"/> </packet> - <packet code="67" shortname="gl_tg_shader" name="GL Shader State including TS/GS" min_ver="41"> + <packet code="67" shortname="gl_tg_shader" name="GL Shader State including TS/GS"> <field name="address" size="27" start="5" type="address"/> <field name="number of attribute arrays" size="5" start="0" type="uint"/> </packet> - <packet code="71" name="VCM Cache Size" min_ver="41"> + <packet code="71" name="VCM Cache Size"> <field name="Number of 16-vertex 
batches for rendering" size="4" start="4" type="uint"/> <field name="Number of 16-vertex batches for binning" size="4" start="0" type="uint"/> </packet> @@ -706,23 +622,13 @@ </field> </packet> - <packet code="73" name="VCM Cache Size" max_ver="33"> - <field name="Number of 16-vertex batches for rendering" size="4" start="4" type="uint"/> - <field name="Number of 16-vertex batches for binning" size="4" start="0" type="uint"/> - </packet> - - <packet code="73" name="Transform Feedback Buffer" min_ver="41"> + <packet code="73" name="Transform Feedback Buffer"> <field name="Buffer Address" size="32" start="32" type="address"/> <field name="Buffer Size in 32-bit words" size="30" start="2" type="uint"/> <field name="Buffer Number" size="2" start="0" type="uint"/> </packet> - <packet code="74" name="Transform Feedback Enable" max_ver="33"> - <field name="number of 32-bit Output Buffer Address following" size="3" start="8" type="uint"/> - <field name="number of 16-bit Output Data Specs following" size="5" start="11" type="uint"/> - </packet> - - <packet code="74" name="Transform Feedback Specs" min_ver="41"> + <packet code="74" name="Transform Feedback Specs"> <field name="Enable" size="1" start="7" type="bool"/> <field name="Number of 16-bit Output Data Specs following" size="5" start="0" type="uint"/> </packet> @@ -742,13 +648,7 @@ <field name="L2T Flush Start" size="32" start="0" type="address"/> </packet> - <struct name="Transform Feedback Output Data Spec" max_ver="33"> - <field name="First Shaded Vertex Value to output" size="8" start="0" type="uint"/> - <field name="Number of consecutive Vertex Values to output as 32-bit values" size="4" start="8" type="uint" minus_one="true"/> - <field name="Output Buffer to write to" size="2" start="12" type="uint"/> - </struct> - - <struct name="Transform Feedback Output Data Spec" min_ver="41"> + <struct name="Transform Feedback Output Data Spec"> <field name="First Shaded Vertex Value to output" size="8" start="0" type="uint"/> <field name="Number of consecutive Vertex Values to output as 32-bit values" size="4" start="8" type="uint" minus_one="true"/> <field name="Output Buffer to write to" size="2" start="12" type="uint"/> @@ -771,11 +671,12 @@ <field name="Stencil Ref Value" size="8" start="0" type="uint"/> </packet> - <packet code="83" name="Blend Enables" min_ver="41"> + <packet code="83" name="Blend Enables"> <field name="Mask" size="8" start="0" type="uint"/> </packet> - <packet code="84" name="Blend Cfg" max_ver="33"> + <packet code="84" name="Blend Cfg" max_ver="42"> + <field name="Render Target Mask" size="4" start="24" type="uint"/> <field name="Color blend dst factor" size="4" start="20" type="Blend Factor"/> <field name="Color blend src factor" size="4" start="16" type="Blend Factor"/> <field name="Color blend mode" size="4" start="12" type="Blend Mode"/> @@ -784,8 +685,8 @@ <field name="Alpha blend mode" size="4" start="0" type="Blend Mode"/> </packet> - <packet code="84" name="Blend Cfg" min_ver="41"> - <field name="Render Target Mask" size="4" start="24" type="uint"/> + <packet code="84" name="Blend Cfg" min_ver="71"> + <field name="Render Target Mask" size="8" start="24" type="uint"/> <field name="Color blend dst factor" size="4" start="20" type="Blend Factor"/> <field name="Color blend src factor" size="4" start="16" type="Blend Factor"/> <field name="Color blend mode" size="4" start="12" type="Blend Mode"/> @@ -805,16 +706,16 @@ <field name="Mask" size="32" start="0" type="uint"/> </packet> - <packet code="88" name="Zero All 
Centroid Flags" min_ver="41"/> + <packet code="88" name="Zero All Centroid Flags" /> - <packet code="89" name="Centroid Flags" min_ver="41"> + <packet code="89" name="Centroid Flags"> <field name="Centroid Flags for varyings V0*24" size="24" start="8" type="uint"/> <field name="Action for Centroid Flags of higher numbered varyings" size="2" start="6" type="Varying Flags Action"/> <field name="Action for Centroid Flags of lower numbered varyings" size="2" start="4" type="Varying Flags Action"/> <field name="Varying offset V0" size="4" start="0" type="uint"/> </packet> - <packet code="91" name="Sample State" min_ver="41"> + <packet code="91" name="Sample State"> <field name="Coverage" size="16" start="16" type="f187"/> <field name="Mask" size="4" start="0" type="uint"/> </packet> @@ -823,7 +724,12 @@ <field name="address" size="32" start="0" type="address"/> </packet> - <packet code="96" name="Cfg Bits"> + <packet code="93" name="Depth Bounds Test Limits" min_ver="71"> + <field name="Lower Test Limit" size="32" start="0" type="float"/> + <field name="Upper Test Limit" size="32" start="32" type="float"/> + </packet> + + <packet code="96" name="Cfg Bits" max_ver="42"> <field name="Direct3D Provoking Vertex" size="1" start="21" type="bool"/> <field name="Direct3D 'Point-fill' mode" size="1" start="20" type="bool"/> <field name="Blend enable" size="1" start="19" type="bool"/> @@ -834,7 +740,26 @@ <field name="Depth-Test Function" size="3" start="12" type="Compare Function"/> <field name="Direct3D Wireframe triangles mode" size="1" start="11" type="bool"/> <field name="Rasterizer Oversample Mode" size="2" start="6" type="uint"/> - <field name="Line Rasterization" size="2" start="4" type="uint"/> + <field name="Line Rasterization" size="2" start="4" type="Line Rasterization"/> + <field name="Enable Depth Offset" size="1" start="3" type="bool"/> + <field name="Clockwise Primitives" size="1" start="2" type="bool"/> + <field name="Enable Reverse Facing Primitive" size="1" start="1" type="bool"/> + <field name="Enable Forward Facing Primitive" size="1" start="0" type="bool"/> + </packet> + + <packet code="96" name="Cfg Bits" min_ver="71"> + <field name="Z Clipping mode" size="2" start="22" type="Z Clip Mode"/> + <field name="Direct3D Provoking Vertex" size="1" start="21" type="bool"/> + <field name="Direct3D 'Point-fill' mode" size="1" start="20" type="bool"/> + <field name="Blend enable" size="1" start="19" type="bool"/> + <field name="Stencil enable" size="1" start="18" type="bool"/> + <field name="Z updates enable" size="1" start="15" type="bool"/> + <field name="Depth-Test Function" size="3" start="12" type="Compare Function"/> + <field name="Direct3D Wireframe triangles mode" size="1" start="11" type="bool"/> + <field name="Z Clamp Mode" size="1" start="10" type="bool"/> + <field name="Rasterizer Oversample Mode" size="2" start="6" type="uint"/> + <field name="Depth Bounds Test Enable" size="1" start="5" type="bool"/> + <field name="Line Rasterization" size="1" start="4" type="uint"/> <field name="Enable Depth Offset" size="1" start="3" type="bool"/> <field name="Clockwise Primitives" size="1" start="2" type="bool"/> <field name="Enable Reverse Facing Primitive" size="1" start="1" type="bool"/> @@ -850,9 +775,9 @@ <field name="Varying offset V0" size="4" start="0" type="uint"/> </packet> - <packet code="99" shortname="zero_all_noperspective_flags" name="Zero All Non-perspective Flags" min_ver="41"/> + <packet code="99" shortname="zero_all_noperspective_flags" name="Zero All Non-perspective Flags" 
/> - <packet code="100" shortname="noperspective_flags" name="Non-perspective Flags" min_ver="41"> + <packet code="100" shortname="noperspective_flags" name="Non-perspective Flags"> <field name="Non-perspective Flags for varyings V0*24" size="24" start="8" type="uint"/> <field name="Action for Non-perspective Flags of higher numbered varyings" size="2" start="6" type="Varying Flags Action"/> <field name="Action for Non-perspective Flags of lower numbered varyings" size="2" start="4" type="Varying Flags Action"/> @@ -867,12 +792,7 @@ <field name="Line width" size="32" start="0" type="float"/> </packet> - <packet name="Depth Offset" code="106" max_ver="33"> - <field name="Depth Offset Units" size="16" start="16" type="f187"/> - <field name="Depth Offset Factor" size="16" start="0" type="f187"/> - </packet> - - <packet name="Depth Offset" code="106" min_ver="41"> + <packet name="Depth Offset" code="106"> <field name="Limit" size="32" start="32" type="float"/> <field name="Depth Offset Units" size="16" start="16" type="f187"/> <field name="Depth Offset Factor" size="16" start="0" type="f187"/> @@ -885,16 +805,11 @@ <field name="Clip Window Left Pixel Coordinate" size="16" start="0" type="uint"/> </packet> - <packet name="Viewport Offset" code="108" max_ver="33"> - <field name="Viewport Centre Y-coordinate" size="32" start="32" type="s24.8"/> - <field name="Viewport Centre X-coordinate" size="32" start="0" type="s24.8"/> - </packet> - - <packet name="Viewport Offset" code="108" min_ver="41"> - <field name="Coarse Y" size="10" start="54" type="uint"/> - <field name="Viewport Centre Y-coordinate" size="22" start="32" type="s14.8"/> - <field name="Coarse X" size="10" start="22" type="uint"/> - <field name="Viewport Centre X-coordinate" size="22" start="0" type="s14.8"/> + <packet name="Viewport Offset" code="108"> + <field name="Coarse Y" size="10" start="54" type="int"/> + <field name="Fine Y" size="22" start="32" type="u14.8"/> + <field name="Coarse X" size="10" start="22" type="int"/> + <field name="Fine X" size="22" start="0" type="u14.8"/> </packet> <packet shortname="clipz" name="Clipper Z min/max clipping planes" code="109"> @@ -902,31 +817,41 @@ <field name="Minimum Zw" size="32" start="0" type="float"/> </packet> - <packet shortname="clipper_xy" name="Clipper XY Scaling" code="110" cl="B"> + <packet shortname="clipper_xy" name="Clipper XY Scaling" code="110" cl="B" max_ver="42"> <field name="Viewport Half-Height in 1/256th of pixel" size="32" start="32" type="float"/> <field name="Viewport Half-Width in 1/256th of pixel" size="32" start="0" type="float"/> </packet> + <packet shortname="clipper_xy" name="Clipper XY Scaling" code="110" cl="B" min_ver="71"> + <field name="Viewport Half-Height in 1/64th of pixel" size="32" start="32" type="float"/> + <field name="Viewport Half-Width in 1/64th of pixel" size="32" start="0" type="float"/> + </packet> + <packet shortname="clipper_z" name="Clipper Z Scale and Offset" code="111" cl="B"> <field name="Viewport Z Offset (Zc to Zs)" size="32" start="32" type="float"/> <field name="Viewport Z Scale (Zc to Zs)" size="32" start="0" type="float"/> </packet> - <packet name="Number of Layers" code="119" min_ver="41"> + <packet shortname="clipper_z_no_guardband" name="Clipper Z Scale and Offset no guardband" code="112" cl="B" min_ver="71"> + <field name="Viewport Z Offset (Zc to Zs)" size="32" start="32" type="float"/> + <field name="Viewport Z Scale (Zc to Zs)" size="32" start="0" type="float"/> + </packet> + + <packet name="Number of Layers" code="119"> 
<field name="Number of Layers" size="8" start="0" type="uint" minus_one="true"/> </packet> - <packet code="120" name="Tile Binning Mode Cfg (Part1)" max_ver="33"> - <field name="Double-buffer in non-ms mode" size="1" start="63" type="bool"/> - <field name="Multisample Mode (4x)" size="1" start="62" type="bool"/> + <packet code="120" name="Tile Binning Mode Cfg" max_ver="42"> - <field name="Maximum BPP of all render targets" size="2" start="60" type="Internal BPP"/> + <field name="Height (in pixels)" size="16" start="48" type="uint" minus_one="true"/> + <field name="Width (in pixels)" size="16" start="32" type="uint" minus_one="true"/> - <field name="Number of Render Targets" size="4" start="56" type="uint"/> - <field name="Height (in tiles)" size="12" start="44" type="uint"/> - <field name="Width (in tiles)" size="12" start="32" type="uint"/> + <field name="Double-buffer in non-ms mode" size="1" start="15" type="bool"/> + <field name="Multisample Mode (4x)" size="1" start="14" type="bool"/> - <field name="Tile State Data Array Base Address" size="26" start="6" type="address"/> + <field name="Maximum BPP of all render targets" size="2" start="12" type="Internal BPP"/> + + <field name="Number of Render Targets" size="4" start="8" type="uint" minus_one="true"/> <field name="tile allocation block size" size="2" start="4" type="uint"> <value name="tile allocation block size 64b" value="0"/> @@ -938,21 +863,24 @@ <value name="tile allocation initial block size 128b" value="1"/> <value name="tile allocation initial block size 256b" value="2"/> </field> - <field name="auto-initialize tile state data array" size="1" start="1" type="bool" default="1"/> - <field name="sub-id" size="1" start="0" type="uint" default="0"/> </packet> - <packet code="120" name="Tile Binning Mode Cfg" min_ver="41"> - + <packet code="120" name="Tile Binning Mode Cfg" min_ver="71"> <field name="Height (in pixels)" size="16" start="48" type="uint" minus_one="true"/> <field name="Width (in pixels)" size="16" start="32" type="uint" minus_one="true"/> - <field name="Double-buffer in non-ms mode" size="1" start="15" type="bool"/> - <field name="Multisample Mode (4x)" size="1" start="14" type="bool"/> - - <field name="Maximum BPP of all render targets" size="2" start="12" type="Internal BPP"/> - - <field name="Number of Render Targets" size="4" start="8" type="uint" minus_one="true"/> + <field name="Log2 Tile Height" size="3" start="11" type="uint"> + <value name="tile height 8 pixels" value="0"/> + <value name="tile height 16 pixels" value="1"/> + <value name="tile height 32 pixels" value="2"/> + <value name="tile height 64 pixels" value="3"/> + </field> + <field name="Log2 Tile Width" size="3" start="8" type="uint"> + <value name="tile width 8 pixels" value="0"/> + <value name="tile width 16 pixels" value="1"/> + <value name="tile width 32 pixels" value="2"/> + <value name="tile width 64 pixels" value="3"/> + </field> <field name="tile allocation block size" size="2" start="4" type="uint"> <value name="tile allocation block size 64b" value="0"/> @@ -966,17 +894,11 @@ </field> </packet> - <packet code="120" name="Tile Binning Mode Cfg (Part2)" cl="B" max_ver="33"> - <field name="Tile Allocation Memory Address" size="32" start="32" type="address"/> - <field name="Tile Allocation Memory Size" size="32" start="0" type="uint"/> - - <field name="sub-id" size="1" start="0" type="uint" default="1"/> - </packet> + <packet code="121" name="Tile Rendering Mode Cfg (Common)" cl="R" max_ver="42"> + <field name="Pad" size="12" start="52" 
type="uint"/> - <packet code="121" name="Tile Rendering Mode Cfg (Common)" cl="R" max_ver="33"> - <field name="Disable Render Target Stores" size="8" start="56" type="uint"/> - <field name="Enable Z Store" size="1" start="55" type="bool"/> - <field name="Enable Stencil Store" size="1" start="54" type="bool"/> + <field name="Early Depth/Stencil Clear" size="1" start="51" type="bool"/> + <field name="Internal Depth Type" size="4" start="47" type="Internal Depth Type"/> <field name="Early-Z disable" size="1" start="46" type="bool"/> @@ -988,7 +910,11 @@ <field name="Double-buffer in non-ms mode" size="1" start="43" type="bool"/> <field name="Multisample Mode (4x)" size="1" start="42" type="bool"/> - <field name="Maximum BPP of all render targets" size="2" start="40" type="Internal BPP"/> + <field name="Maximum BPP of all render targets" size="2" start="40" type="Internal BPP"> + <value name="Render target maximum 32bpp" value="0"/> + <value name="Render target maximum 64bpp" value="1"/> + <value name="Render target maximum 128bpp" value="2"/> + </field> <field name="Image Height (pixels)" size="16" start="24" type="uint"/> <field name="Image Width (pixels)" size="16" start="8" type="uint"/> @@ -997,8 +923,21 @@ <field name="sub-id" size="4" start="0" type="uint" default="0"/> </packet> - <packet code="121" name="Tile Rendering Mode Cfg (Common)" cl="R" min_ver="41"> - <field name="Pad" size="12" start="52" type="uint"/> + <packet code="121" name="Tile Rendering Mode Cfg (Common)" cl="R" min_ver="71"> + <field name="Pad" size="6" start="58" type="uint"/> + + <field name="Log2 Tile Height" size="3" start="55" type="uint"> + <value name="tile height 8 pixels" value="0"/> + <value name="tile height 16 pixels" value="1"/> + <value name="tile height 32 pixels" value="2"/> + <value name="tile height 64 pixels" value="3"/> + </field> + <field name="Log2 Tile Width" size="3" start="52" type="uint"> + <value name="tile width 8 pixels" value="0"/> + <value name="tile width 16 pixels" value="1"/> + <value name="tile width 32 pixels" value="2"/> + <value name="tile width 64 pixels" value="3"/> + </field> <field name="Early Depth/Stencil Clear" size="1" start="51" type="bool"/> <field name="Internal Depth Type" size="4" start="47" type="Internal Depth Type"/> @@ -1010,40 +949,18 @@ <value name="Early-Z direction GT/GE" value="1"/> </field> + <field name="Depth-buffer disable" size="1" start="44" type="bool"/> <field name="Double-buffer in non-ms mode" size="1" start="43" type="bool"/> <field name="Multisample Mode (4x)" size="1" start="42" type="bool"/> - <field name="Maximum BPP of all render targets" size="2" start="40" type="Internal BPP"/> - <field name="Image Height (pixels)" size="16" start="24" type="uint"/> <field name="Image Width (pixels)" size="16" start="8" type="uint"/> <field name="Number of Render Targets" size="4" start="4" type="uint" minus_one="true"/> - <field name="sub-id" size="4" start="0" type="uint" default="0"/> + <field name="sub-id" size="3" start="0" type="uint" default="0"/> </packet> - <packet code="121" name="Tile Rendering Mode Cfg (Color)" cl="R" max_ver="33"> - <field name="Address" size="32" start="32" type="address"/> - - <field name="Pad" size="4" start="28" type="uint"/> - - <field name="Flip Y" size="1" start="27" type="bool"/> - - <field name="Memory Format" size="3" start="24" type="Memory Format"/> - - <field name="Dither Mode" size="2" start="22" type="Dither Mode"/> - - <field name="Output image format" size="6" start="16" type="Output Image Format"/> - - <field 
name="Decimate mode" size="2" start="14" type="Decimate Mode"/> - - <field name="Internal Type" size="4" start="10" type="Internal Type"/> - <field name="Internal BPP" size="2" start="8" type="Internal BPP"/> - <field name="Render Target Number" size="4" start="4" type="uint"/> - <field name="sub-id" size="4" start="0" type="uint" default="2"/> - </packet> - - <packet code="121" name="Tile Rendering Mode Cfg (Color)" cl="R" min_ver="41"> + <packet code="121" name="Tile Rendering Mode Cfg (Color)" cl="R" max_ver="42"> <field name="Pad" size="28" start="36" type="uint"/> @@ -1066,53 +983,25 @@ <field name="sub-id" size="4" start="0" type="uint" default="1"/> </packet> - <packet code="121" name="Tile Rendering Mode Cfg (Z/Stencil)" cl="R" max_ver="33"> - <field name="Address" size="26" start="38" type="address"/> - - <field name="Padded height of output image in UIF blocks" size="13" start="25" type="uint"/> - - <field name="Memory Format" size="3" start="22" type="Memory Format"/> - - <field name="Output image format" size="6" start="16" type="Z/S Output Image Format"/> - - <field name="Decimate mode" size="2" start="14" type="uint"/> - - <field name="Internal Type" size="4" start="10" type="Internal Depth Type"/> - - <field name="Internal BPP (ignored)" size="2" start="8" type="uint"/> - <!-- selects between Z/Stencil config packet and Separate Stencil packet. --> - <field name="Z/Stencil ID" size="4" start="4" type="uint" default="0"/> - <field name="sub-id" size="4" start="0" type="uint" default="1"/> - </packet> - - <packet code="121" name="Tile Rendering Mode Cfg (ZS Clear Values)" cl="R" max_ver="33"> + <packet code="121" name="Tile Rendering Mode Cfg (ZS Clear Values)" cl="R" max_ver="42"> <field name="unused" size="16" start="48" type="uint"/> <field name="Z Clear Value" size="32" start="16" type="float"/> <field name="Stencil Clear Value" size="8" start="8" type="uint"/> - <field name="sub-id" size="4" start="0" type="uint" default="3"/> + <field name="sub-id" size="4" start="0" type="uint" default="2"/> </packet> - <packet code="121" name="Tile Rendering Mode Cfg (ZS Clear Values)" cl="R" min_ver="41"> + <packet code="121" name="Tile Rendering Mode Cfg (ZS Clear Values)" cl="R" min_ver="71"> <field name="unused" size="16" start="48" type="uint"/> <field name="Z Clear Value" size="32" start="16" type="float"/> <field name="Stencil Clear Value" size="8" start="8" type="uint"/> - <field name="sub-id" size="4" start="0" type="uint" default="2"/> - </packet> - - <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part1)" cl="R" max_ver="33"> - <!-- Express this as a 56-bit field? --> - <field name="Clear Color next 24 bits" size="24" start="40" type="uint"/> - <field name="Clear Color low 32 bits" size="32" start="8" type="uint"/> - - <field name="Render Target number" size="4" start="4" type="uint"/> - <field name="sub-id" size="4" start="0" type="uint" default="4"/> + <field name="sub-id" size="4" start="0" type="uint" default="1"/> </packet> - <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part1)" cl="R" min_ver="41"> + <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part1)" cl="R" max_ver="42"> <!-- Express this as a 56-bit field? 
--> <field name="Clear Color next 24 bits" size="24" start="40" type="uint"/> <field name="Clear Color low 32 bits" size="32" start="8" type="uint"/> @@ -1121,16 +1010,20 @@ <field name="sub-id" size="4" start="0" type="uint" default="3"/> </packet> - <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part2)" cl="R" max_ver="33"> - <!-- Express this as a 56-bit field? --> - <field name="Clear Color mid-high 24 bits" size="24" start="40" type="uint"/> - <field name="Clear Color mid-low 32 bits" size="32" start="8" type="uint"/> + <packet code="121" name="Tile Rendering Mode Cfg (Render Target Part1)" cl="R" min_ver="71"> - <field name="Render Target number" size="4" start="4" type="uint"/> - <field name="sub-id" size="4" start="0" type="uint" default="5"/> + <field name="Clear Color low bits" size="32" start="32" type="uint"/> + <field name="Internal Type and Clamping" size="5" start="27" type="Render Target Type Clamp"/> + <field name="Internal BPP" size="2" start="25" type="Internal BPP"/> + + <field name="Stride" size="7" start="18" type="uint" minus_one="true"/> + <!-- In multiples of 512 bits --> + <field name="Base Address" size="11" start="7" type="uint"/> + <field name="Render Target number" size="3" start="3" type="uint"/> + <field name="sub-id" size="3" start="0" type="uint" default="2"/> </packet> - <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part2)" cl="R" min_ver="41"> + <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part2)" cl="R" max_ver="42"> <!-- Express this as a 56-bit field? --> <field name="Clear Color mid-high 24 bits" size="24" start="40" type="uint"/> <field name="Clear Color mid-low 32 bits" size="32" start="8" type="uint"/> @@ -1139,18 +1032,14 @@ <field name="sub-id" size="4" start="0" type="uint" default="4"/> </packet> - <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part3)" cl="R" max_ver="33"> - <field name="pad" size="11" start="53" type="uint"/> - <field name="UIF padded height in UIF blocks" size="13" start="40" type="uint"/> - <!-- image height is for Y flipping --> - <field name="Raster Row Stride or Image Height in Pixels" size="16" start="24" type="uint"/> - <field name="Clear Color high 16 bits" size="16" start="8" type="uint"/> + <packet code="121" name="Tile Rendering Mode Cfg (Render Target Part2)" cl="R" min_ver="71"> + <field name="Clear Color mid bits" size="40" start="24" type="uint"/> - <field name="Render Target number" size="4" start="4" type="uint"/> - <field name="sub-id" size="4" start="0" type="uint" default="6"/> + <field name="Render Target number" size="3" start="3" type="uint"/> + <field name="sub-id" size="3" start="0" type="uint" default="3"/> </packet> - <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part3)" cl="R" min_ver="41"> + <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part3)" cl="R" max_ver="42"> <field name="pad" size="11" start="53" type="uint"/> <field name="UIF padded height in UIF blocks" size="13" start="40" type="uint"/> <!-- image height is for Y flipping --> @@ -1161,6 +1050,13 @@ <field name="sub-id" size="4" start="0" type="uint" default="5"/> </packet> + <packet code="121" name="Tile Rendering Mode Cfg (Render Target Part3)" cl="R" min_ver="71"> + <field name="Clear Color top bits" size="56" start="8" type="uint"/> + + <field name="Render Target number" size="3" start="3" type="uint"/> + <field name="sub-id" size="3" start="0" type="uint" default="4"/> + </packet> + <packet code="124" shortname="tile_coords" 
name="Tile Coordinates"> <field name="tile row number" size="12" start="12" type="uint"/> <field name="tile column number" size="12" start="0" type="uint"/> @@ -1199,43 +1095,7 @@ </field> </packet> - <struct name="GL Shader State Record" max_ver="33"> - <field name="Point size in shaded vertex data" size="1" start="0" type="bool"/> - <field name="Enable clipping" size="1" start="1" type="bool"/> - <field name="Vertex ID read by coordinate shader" size="1" start="2" type="bool"/> - <field name="Instance ID read by coordinate shader" size="1" start="3" type="bool"/> - <field name="Vertex ID read by vertex shader" size="1" start="4" type="bool"/> - <field name="Instance ID read by vertex shader" size="1" start="5" type="bool"/> - <field name="Fragment shader does Z writes" size="1" start="6" type="bool"/> - <field name="Turn off early-z test" size="1" start="7" type="bool"/> - <field name="Coordinate shader has separate input and output VPM blocks" size="1" start="8" type="bool"/> - <field name="Vertex shader has separate input and output VPM blocks" size="1" start="9" type="bool"/> - <field name="Fragment shader uses real pixel centre W in addition to centroid W2" size="1" start="10" type="bool"/> - - <field name="Number of varyings in Fragment Shader" size="8" start="2b" type="uint"/> - <field name="Coordinate Shader output VPM segment size" size="8" start="4b" type="uint"/> - <field name="Coordinate Shader input VPM segment size" size="8" start="5b" type="uint"/> - <field name="Vertex Shader output VPM segment size" size="8" start="6b" type="uint"/> - <field name="Vertex Shader input VPM segment size" size="8" start="7b" type="uint"/> - <field name="Address of default attribute values" size="32" start="8b" type="address"/> - <field name="Fragment Shader Code Address" size="29" start="99" type="address"/> - <field name="Fragment Shader 2-way threadable" size="1" start="96" type="bool"/> - <field name="Fragment Shader 4-way threadable" size="1" start="97" type="bool"/> - <field name="Fragment Shader Propagate NaNs" size="1" start="98" type="bool"/> - <field name="Fragment Shader Uniforms Address" size="32" start="16b" type="address"/> - <field name="Vertex Shader Code Address" size="32" start="20b" type="address"/> - <field name="Vertex Shader 2-way threadable" size="1" start="160" type="bool"/> - <field name="Vertex Shader 4-way threadable" size="1" start="161" type="bool"/> - <field name="Vertex Shader Propagate NaNs" size="1" start="162" type="bool"/> - <field name="Vertex Shader Uniforms Address" size="32" start="24b" type="address"/> - <field name="Coordinate Shader Code Address" size="32" start="28b" type="address"/> - <field name="Coordinate Shader 2-way threadable" size="1" start="224" type="bool"/> - <field name="Coordinate Shader 4-way threadable" size="1" start="225" type="bool"/> - <field name="Coordinate Shader Propagate NaNs" size="1" start="226" type="bool"/> - <field name="Coordinate Shader Uniforms Address" size="32" start="32b" type="address"/> - </struct> - - <struct name="GL Shader State Record" min_ver="41"> + <struct name="GL Shader State Record" max_ver="42"> <field name="Point size in shaded vertex data" size="1" start="0" type="bool"/> <field name="Enable clipping" size="1" start="1" type="bool"/> @@ -1294,7 +1154,64 @@ <field name="Coordinate Shader Uniforms Address" size="32" start="32b" type="address"/> </struct> - <struct name="Geometry Shader State Record" min_ver="41"> + <struct name="GL Shader State Record" min_ver="71"> + <field name="Point size in shaded 
vertex data" size="1" start="0" type="bool"/> + <field name="Enable clipping" size="1" start="1" type="bool"/> + + <field name="Vertex ID read by coordinate shader" size="1" start="2" type="bool"/> + <field name="Instance ID read by coordinate shader" size="1" start="3" type="bool"/> + <field name="Base Instance ID read by coordinate shader" size="1" start="4" type="bool"/> + <field name="Vertex ID read by vertex shader" size="1" start="5" type="bool"/> + <field name="Instance ID read by vertex shader" size="1" start="6" type="bool"/> + <field name="Base Instance ID read by vertex shader" size="1" start="7" type="bool"/> + + <field name="Fragment shader does Z writes" size="1" start="8" type="bool"/> + <field name="Turn off early-z test" size="1" start="9" type="bool"/> + + <field name="Fragment shader uses real pixel centre W in addition to centroid W2" size="1" start="12" type="bool"/> + <field name="Enable Sample Rate Shading" size="1" start="13" type="bool"/> + <field name="Any shader reads hardware-written Primitive ID" size="1" start="14" type="bool"/> + <field name="Insert Primitive ID as first varying to fragment shader" size="1" start="15" type="bool"/> + <field name="Turn off scoreboard" size="1" start="16" type="bool"/> + <field name="Do scoreboard wait on first thread switch" size="1" start="17" type="bool"/> + <field name="Disable implicit point/line varyings" size="1" start="18" type="bool"/> + <field name="No prim pack" size="1" start="19" type="bool"/> + <field name="Never defer FEP depth writes" size="1" start="20" type="bool"/> + + <field name="Number of varyings in Fragment Shader" size="8" start="3b" type="uint"/> + + <field name="Coordinate Shader output VPM segment size" size="4" start="4b" type="uint"/> + <field name="Min Coord Shader output segments required in play in addition to VCM cache size" size="4" start="36" type="uint"/> + + <field name="Coordinate Shader input VPM segment size" size="4" start="5b" type="uint"/> + <field name="Min Coord Shader input segments required in play" size="4" start="44" type="uint" minus_one="true"/> + + <field name="Vertex Shader output VPM segment size" size="4" start="6b" type="uint"/> + <field name="Min Vertex Shader output segments required in play in addition to VCM cache size" size="4" start="52" type="uint"/> + + <field name="Vertex Shader input VPM segment size" size="4" start="7b" type="uint"/> + <field name="Min Vertex Shader input segments required in play" size="4" start="60" type="uint" minus_one="true"/> + + <field name="Fragment Shader Code Address" size="29" start="67" type="address"/> + <field name="Fragment Shader 4-way threadable" size="1" start="64" type="bool"/> + <field name="Fragment Shader start in final thread section" size="1" start="65" type="bool"/> + <field name="Fragment Shader Propagate NaNs" size="1" start="66" type="bool"/> + <field name="Fragment Shader Uniforms Address" size="32" start="12b" type="address"/> + + <field name="Vertex Shader Code Address" size="29" start="131" type="address"/> + <field name="Vertex Shader 4-way threadable" size="1" start="128" type="bool"/> + <field name="Vertex Shader start in final thread section" size="1" start="129" type="bool"/> + <field name="Vertex Shader Propagate NaNs" size="1" start="130" type="bool"/> + <field name="Vertex Shader Uniforms Address" size="32" start="20b" type="address"/> + + <field name="Coordinate Shader Code Address" size="29" start="195" type="address"/> + <field name="Coordinate Shader 4-way threadable" size="1" start="192" 
type="bool"/> + <field name="Coordinate Shader start in final thread section" size="1" start="193" type="bool"/> + <field name="Coordinate Shader Propagate NaNs" size="1" start="194" type="bool"/> + <field name="Coordinate Shader Uniforms Address" size="32" start="28b" type="address"/> + </struct> + + <struct name="Geometry Shader State Record"> <field name="Geometry Bin Mode Shader Code Address" size="29" start="3" type="address"/> <field name="Geometry Bin Mode Shader 4-way threadable" size="1" start="0" type="bool"/> <field name="Geometry Bin Mode Shader Start in final thread section" size="1" start="1" type="bool"/> @@ -1307,7 +1224,7 @@ <field name="Geometry Render Mode Shader Uniforms Address" size="32" start="12b" type="address"/> </struct> - <struct name="Tessellation Shader State Record" min_ver="41"> + <struct name="Tessellation Shader State Record"> <field name="Tessellation Bin Mode Control Shader Code Address" size="29" start="3" type="address"/> <field name="Tessellation Bin Mode Control Shader 4-way threadable" size="1" start="0" type="bool"/> <field name="Tessellation Bin Mode Control Shader Start in final thread section" size="1" start="1" type="bool"/> @@ -1331,7 +1248,7 @@ <field name="Tessellation Render Mode Evaluation Shader Uniforms Address" size="32" start="28b" type="address"/> </struct> - <struct name="Tessellation/Geometry Common Params" min_ver="41"> + <struct name="Tessellation/Geometry Common Params"> <field name="Tessellation Type" size="2" start="1" type="uint"> <value name="Tessellation Type Triangle" value="0"/> <value name="Tessellation Type Quads" value="1"/> @@ -1391,31 +1308,7 @@ <field name="GBG min GS output segments required in play" size="3" start="59" type="uint" minus_one="true"/> </struct> - <struct name="GL Shader State Attribute Record" max_ver="33"> - <field name="Address" size="32" start="0" type="address"/> - - <field name="Vec size" size="2" start="32" type="uint"/> - <field name="Type" size="3" start="34" type="uint"> - <value name="Attribute half-float" value="1"/> - <value name="Attribute float" value="2"/> - <value name="Attribute fixed" value="3"/> - <value name="Attribute byte" value="4"/> - <value name="Attribute short" value="5"/> - <value name="Attribute int" value="6"/> - <value name="Attribute int2_10_10_10" value="7"/> - </field> - <field name="Signed int type" size="1" start="37" type="bool"/> - <field name="Normalized int type" size="1" start="38" type="bool"/> - <field name="Read as int/uint" size="1" start="39" type="bool"/> - - <field name="Number of values read by Coordinate shader" size="4" start="40" type="uint"/> - <field name="Number of values read by Vertex shader" size="4" start="44" type="uint"/> - - <field name="Instance Divisor" size="16" start="6b" type="uint"/> - <field name="Stride" size="32" start="8b" type="uint"/> - </struct> - - <struct name="GL Shader State Attribute Record" min_ver="41"> + <struct name="GL Shader State Attribute Record"> <field name="Address" size="32" start="0" type="address"/> <field name="Vec size" size="2" start="32" type="uint"/> @@ -1476,55 +1369,19 @@ <field name="addr" size="13" start="0" type="uint"/> </struct> - <struct name="Texture Uniform Parameter 0 CFG_MODE=1" max_ver="33"> - <field name="Per-pixel mask enable" size="1" start="31" type="bool"/> - - <field name="Texel offset for r coordinate" size="4" start="27" type="int"/> - <field name="Texel offset for t coordinate" size="4" start="23" type="int"/> - <field name="Texel offset for s coordinate" size="4" start="19" 
type="int"/> - - <field name="R Wrap Mode" size="3" start="16" type="Wrap Mode"/> - <field name="T Wrap Mode" size="3" start="13" type="Wrap Mode"/> - <field name="S Wrap Mode" size="3" start="10" type="Wrap Mode"/> - - <field name="New configuration mode" size="1" start="9" type="bool" default="1"/> - - <field name="Shadow" size="1" start="8" type="bool"/> - <field name="Coefficient lookup mode" size="1" start="7" type="bool"/> - <field name="Disable AutoLOD, use bias only" size="1" start="6" type="bool"/> - <field name="Bias supplied" size="1" start="5" type="bool"/> - <field name="Gather sample mode" size="1" start="4" type="bool"/> - <field name="Fetch sample mode" size="1" start="3" type="bool"/> - - <field name="Lookup Type" size="3" start="0" type="uint"> - <value name="Texture 2D" value="0"/> - <value name="Texture 2D array" value="1"/> - <value name="Texture 3D" value="2"/> - <value name="Texture Cube Map" value="3"/> - <value name="Texture 1D" value="4"/> - <value name="Texture 1D Array" value="5"/> - <value name="Texture Child Image" value="6"/> - </field> - </struct> - - <struct name="Texture Uniform Parameter 1 CFG_MODE=1" max_ver="33"> - <field name="Texture state record base address" size="28" start="4" type="address"/> - <field name="Return words of texture data" size="4" start="0" type="uint"/> - </struct> - - <struct name="TMU Config Parameter 0" min_ver="41"> + <struct name="TMU Config Parameter 0"> <field name="Texture state address" size="32" start="0" type="address"/> <field name="Return words of texture data" size="4" start="0" type="uint"/> </struct> - <struct name="TMU Config Parameter 1" min_ver="41"> + <struct name="TMU Config Parameter 1"> <field name="Sampler state address" size="32" start="0" type="address"/> <field name="Per-pixel mask enable" size="1" start="2" type="bool"/> <field name="Unnormalized coordinates" size="1" start="1" type="bool"/> <field name="Output Type 32-bit" size="1" start="0" type="bool"/> </struct> - <struct name="TMU Config Parameter 2" min_ver="41" max_ver="41"> + <struct name="TMU Config Parameter 2" max_ver="41"> <field name="Pad" size="8" start="24" type="uint"/> <field name="Op" size="4" start="20" type="TMU Op"/> <field name="Offset R" size="4" start="16" type="int"/> @@ -1538,7 +1395,7 @@ <field name="Offset Format 8" size="1" start="0" type="bool"/> </struct> - <struct name="TMU Config Parameter 2" min_ver="42"> + <struct name="TMU Config Parameter 2" min_ver="42" max_ver="42"> <field name="Pad" size="7" start="25" type="uint"/> <field name="LOD Query" size="1" start="24" type="bool"/> <field name="Op" size="4" start="20" type="TMU Op"/> @@ -1553,30 +1410,34 @@ <field name="Offset Format 8" size="1" start="0" type="bool"/> </struct> - <struct name="Texture Shader State" max_ver="33"> - <field name="UIF XOR disable" size="1" start="255" type="bool"/> - <field name="Level 0 is strictly UIF" size="1" start="254" type="bool"/> - <field name="Level 0 XOR enable" size="1" start="252" type="bool"/> - <field name="Level 0 UB_PAD" size="4" start="248" type="uint"/> - <field name="Output 32-bit" size="1" start="246" type="bool"/> - <field name="Sample Number" size="2" start="244" type="uint"/> - - <field name="Base Level" size="4" start="240" type="uint"/> - <field name="Fixed Bias" size="16" start="224" type="s8.8"/> - <field name="Max Level-of-Detail" size="16" start="208" type="s8.8"/> - <field name="Min Level-of-Detail" size="16" start="192" type="s8.8"/> - - <field name="Border Color alpha" size="16" start="176" type="uint"/> - 
<field name="Border Color blue" size="16" start="160" type="uint"/> - <field name="Border Color green" size="16" start="144" type="uint"/> - <field name="Border Color red" size="16" start="128" type="uint"/> - - <field name="Flip S and T on incoming request" size="1" start="127" type="bool"/> - <field name="Flip ETC Y" size="1" start="126" type="bool" default="1"/> - <field name="Flip texture Y Axis" size="1" start="125" type="bool"/> - <field name="Flip texture X Axis" size="1" start="124" type="bool"/> - - <field name="Swizzle A" size="3" start="121" type="uint"> + <struct name="TMU Config Parameter 2" min_ver="71"> + <field name="Pad" size="5" start="27" type="uint"/> + <field name="Write conversion" size="1" start="26" type="bool"/> + <field name="DIM query" size="1" start="25" type="bool"/> + <field name="LOD Query" size="1" start="24" type="bool"/> + <field name="Op" size="4" start="20" type="TMU Op"/> + <field name="Offset R" size="4" start="16" type="int"/> + <field name="Offset T" size="4" start="12" type="int"/> + <field name="Offset S" size="4" start="8" type="int"/> + <field name="Gather Mode" size="1" start="7" type="bool"/> + <field name="Gather Component" size="2" start="5" type="uint"/> + <field name="Coefficient Mode" size="1" start="4" type="bool"/> + <field name="Sample Number" size="2" start="2" type="uint"/> + <field name="Disable AutoLOD" size="1" start="1" type="bool"/> + <field name="Offset Format 8" size="1" start="0" type="bool"/> + </struct> + + <struct name="Texture Shader State" max_ver="42"> + <field name="Pad" size="56" start="136" type="uint"/> + <field name="UIF XOR disable" size="1" start="135" type="bool"/> + <field name="Level 0 is strictly UIF" size="1" start="134" type="bool"/> + <field name="Level 0 XOR enable" size="1" start="132" type="bool"/> + <field name="Level 0 UB_PAD" size="4" start="128" type="uint"/> + + <field name="Base Level" size="4" start="124" type="uint"/> + <field name="Max Level" size="4" start="120" type="uint"/> + + <field name="Swizzle A" size="3" start="117" type="uint"> <value name="Swizzle Zero" value="0"/> <value name="Swizzle One" value="1"/> <value name="Swizzle Red" value="2"/> @@ -1585,29 +1446,54 @@ <value name="Swizzle Alpha" value="5"/> </field> - <field name="Swizzle B" size="3" start="118" type="uint"/> - <field name="Swizzle G" size="3" start="115" type="uint"/> - <field name="Swizzle R" size="3" start="112" type="uint"/> - - <field name="Depth Compare Function" size="3" start="109" type="Compare Function"/> - - <field name="sRGB" size="1" start="107" type="bool"/> + <field name="Swizzle B" size="3" start="114" type="uint"/> + <field name="Swizzle G" size="3" start="111" type="uint"/> + <field name="Swizzle R" size="3" start="108" type="uint"/> + <field name="Extended" size="1" start="107" type="bool"/> <field name="Texture type" size="7" start="100" type="uint"/> - <field name="Image Depth" size="14" start="86" type="uint"/> <field name="Image Height" size="14" start="72" type="uint"/> <field name="Image Width" size="14" start="58" type="uint"/> <field name="Array Stride (64-byte aligned)" size="26" start="32" type="uint"/> - <field name="Texture base pointer" size="30" start="2" type="address"/> + <field name="Texture base pointer" size="32" start="0" type="address"/> - <field name="Filter" size="4" start="0" type="TMU Filter"/> + <field name="Reverse Standard Border Color" size="1" start="5" type="bool"/> + <field name="AHDR" size="1" start="4" type="bool"/> + <field name="sRGB" size="1" start="3" type="bool"/> + 
<field name="Flip S and T on incoming request" size="1" start="2" type="bool"/> + <field name="Flip texture Y Axis" size="1" start="1" type="bool"/> + <field name="Flip texture X Axis" size="1" start="0" type="bool"/> </struct> - <struct name="Texture Shader State" min_ver="41"> - <field name="Pad" size="56" start="136" type="uint"/> + <struct name="Texture Shader State" min_ver="71"> + <field name="Pad" size="2" start="190" type="uint"/> + <!-- When we use an address type, there is an implicit requirement + that the address is a 32-bit that is encoded starting at a 32-bit + aligned bit offset into the packet. If the address field has less than + 32 bits, it is assumed that the address is aligned. For example, a + 26-bit address field is expected to be 64-byte aligned (6 lsb bits + are 0) and that this will be encoded into a packet starting at bit + offset 6 into a 32-bit dword (since bits 0..5 of the address are + implicitly 0 and don't need to be explicitly encoded). + + Unfortunately, the CB address below doesn't match this requirement: + it starts at bit 138, which is 10 bits into a 32-bit dword, but it + represents a 64-bit aligned address (6 lsb bits are 0), so we cannot + encode it as an address type. To fix this we encode these addresses + as uint types which has two implications: + 1. the driver is responsible for manually addinng the buffer objects + for these addresses to the job BO list. + 2. the driver needs to pass an actual 26-bit address value by manually + shifting the 6 lsb bits (that are implicitly 0). + --> + <field name="texture_base pointer_Cr" size="26" start="164" type="uint"/> + <field name="texture base pointer Cb" size="26" start="138" type="uint"/> + <field name="Chroma offset y" size="1" start="137" type="uint"/> + <field name="Chroma offset x" size="1" start="136" type="uint"/> + <field name="UIF XOR disable" size="1" start="135" type="bool"/> <field name="Level 0 is strictly UIF" size="1" start="134" type="bool"/> <field name="Level 0 XOR enable" size="1" start="132" type="bool"/> @@ -1635,19 +1521,30 @@ <field name="Image Height" size="14" start="72" type="uint"/> <field name="Image Width" size="14" start="58" type="uint"/> - <field name="Array Stride (64-byte aligned)" size="26" start="32" type="uint"/> + <!-- V3D 7.1.2 doesn't have the RB swap bit and has Array Stride starting + at bit 32. However, 7.1.5 included the RB swap bit at bit 32 and has + Array Stride starting at 33, which is backwards incompatible, + We use the definition from 7.1.5. 
+ --> + <field name="Array Stride (64-byte aligned)" size="24" start="33" type="uint"/> + <field name="R/B swap" size="1" start="32" type="bool"/> <field name="Texture base pointer" size="32" start="0" type="address"/> - <field name="Reverse Standard Border Color" size="1" start="5" type="bool"/> - <field name="AHDR" size="1" start="4" type="bool"/> - <field name="sRGB" size="1" start="3" type="bool"/> - <field name="Flip S and T on incoming request" size="1" start="2" type="bool"/> + <field name="Reverse" size="1" start="5" type="bool"/> + <field name="Transfer func" size="3" start="2" type="uint"> + <value name="Transfer Func None" value="0"/> + <value name="Transfer Func sRGB" value="1"/> + <value name="Transfer Func PQ" value="2"/> + <value name="Transfer Func HLG" value="3"/> + <value name="Transfer Func PQ BT1886" value="4"/> + <value name="Transfer Func HLG BT1886" value="5"/> + </field> <field name="Flip texture Y Axis" size="1" start="1" type="bool"/> <field name="Flip texture X Axis" size="1" start="0" type="bool"/> </struct> - <struct name="Sampler State" min_ver="41"> + <struct name="Sampler State"> <field name="Border color word 3" size="32" start="160" type="uint"/> <field name="Border color word 2" size="32" start="128" type="uint"/> <field name="Border color word 1" size="32" start="96" type="uint"/> diff --git a/src/broadcom/cle/v3d_packet_helpers.h b/src/broadcom/cle/v3d_packet_helpers.h index 2b5e32ff215..41054618e3a 100644 --- a/src/broadcom/cle/v3d_packet_helpers.h +++ b/src/broadcom/cle/v3d_packet_helpers.h @@ -24,87 +24,20 @@ #ifndef MESA_V3D_PACKET_HELPERS_H #define MESA_V3D_PACKET_HELPERS_H -#include <stdio.h> -#include <stdint.h> -#include <stdbool.h> -#include <assert.h> -#include <math.h> -#include "util/u_math.h" +#include "util/bitpack_helpers.h" #ifdef HAVE_VALGRIND #include <valgrind.h> #include <memcheck.h> #define VG(x) x -#ifndef NDEBUG -#define __gen_validate_value(x) VALGRIND_CHECK_MEM_IS_DEFINED(&(x), sizeof(x)) -#endif #else #define VG(x) ((void)0) #endif -#ifndef __gen_validate_value -#define __gen_validate_value(x) -#endif -/* -#ifndef __gen_address_type -#error #define __gen_address_type before including this file -#endif - -#ifndef __gen_user_data -#error #define __gen_combine_address before including this file -#endif -*/ -union __gen_value { - float f; - uint32_t dw; -}; - -static inline uint64_t -__gen_mbo(uint32_t start, uint32_t end) -{ - return (~0ull >> (64 - (end - start + 1))) << start; -} - -static inline uint64_t -__gen_uint(uint64_t v, uint32_t start, uint32_t end) -{ - __gen_validate_value(v); - -#ifndef NDEBUG - const int width = end - start + 1; - if (width < 64) { - const uint64_t max = (1ull << width) - 1; - assert(v <= max); - } -#endif - - return v << start; -} - -static inline uint64_t -__gen_sint(int64_t v, uint32_t start, uint32_t end) -{ - const int width = end - start + 1; - - __gen_validate_value(v); - -#ifndef NDEBUG - if (width < 64) { - const int64_t max = (1ll << (width - 1)) - 1; - const int64_t min = -(1ll << (width - 1)); - assert(min <= v && v <= max); - } -#endif - - const uint64_t mask = ~0ull >> (64 - width); - - return (v & mask) << start; -} - static inline uint64_t __gen_offset(uint64_t v, uint32_t start, uint32_t end) { - __gen_validate_value(v); + util_bitpack_validate_value(v); #ifndef NDEBUG uint64_t mask = (~0ull >> (64 - (end - start + 1))) << start; @@ -114,50 +47,6 @@ __gen_offset(uint64_t v, uint32_t start, uint32_t end) return v; } -static inline uint32_t -__gen_float(float v) -{ - 
__gen_validate_value(v); - return ((union __gen_value) { .f = (v) }).dw; -} - -static inline uint64_t -__gen_sfixed(float v, uint32_t start, uint32_t end, uint32_t fract_bits) -{ - __gen_validate_value(v); - - const float factor = (1 << fract_bits); - -#ifndef NDEBUG - const float max = ((1 << (end - start)) - 1) / factor; - const float min = -(1 << (end - start)) / factor; - assert(min <= v && v <= max); -#endif - - const int64_t int_val = llroundf(v * factor); - const uint64_t mask = ~0ull >> (64 - (end - start + 1)); - - return (int_val & mask) << start; -} - -static inline uint64_t -__gen_ufixed(float v, uint32_t start, uint32_t end, uint32_t fract_bits) -{ - __gen_validate_value(v); - - const float factor = (1 << fract_bits); - -#ifndef NDEBUG - const float max = ((1 << (end - start + 1)) - 1) / factor; - const float min = 0.0f; - assert(min <= v && v <= max); -#endif - - const uint64_t uint_val = llroundf(v * factor); - - return uint_val << start; -} - static inline uint64_t __gen_unpack_uint(const uint8_t *restrict cl, uint32_t start, uint32_t end) { diff --git a/src/broadcom/cle/v3dx_pack.h b/src/broadcom/cle/v3dx_pack.h index 5762e5aaa70..0062ddbd516 100644 --- a/src/broadcom/cle/v3dx_pack.h +++ b/src/broadcom/cle/v3dx_pack.h @@ -31,12 +31,10 @@ #if (V3D_VERSION == 21) # include "cle/v3d_packet_v21_pack.h" -#elif (V3D_VERSION == 33) -# include "cle/v3d_packet_v33_pack.h" -#elif (V3D_VERSION == 41) -# include "cle/v3d_packet_v41_pack.h" #elif (V3D_VERSION == 42) # include "cle/v3d_packet_v42_pack.h" +#elif (V3D_VERSION == 71) +# include "cle/v3d_packet_v71_pack.h" #else # error "Need to add a pack header include for this v3d version" #endif diff --git a/src/broadcom/cle/v3d_packet_v21.xml b/src/broadcom/cle/vc4_packet.xml index df838a70845..df838a70845 100644 --- a/src/broadcom/cle/v3d_packet_v21.xml +++ b/src/broadcom/cle/vc4_packet.xml diff --git a/src/broadcom/clif/clif_dump.c b/src/broadcom/clif/clif_dump.c index 0aaa6b6ad8b..db94edba113 100644 --- a/src/broadcom/clif/clif_dump.c +++ b/src/broadcom/clif/clif_dump.c @@ -106,12 +106,16 @@ static bool clif_dump_packet(struct clif_dump *clif, uint32_t offset, const uint8_t *cl, uint32_t *size, bool reloc_mode) { - if (clif->devinfo->ver >= 42) + + switch (clif->devinfo->ver) { + case 42: return v3d42_clif_dump_packet(clif, offset, cl, size, reloc_mode); - else if (clif->devinfo->ver >= 41) - return v3d41_clif_dump_packet(clif, offset, cl, size, reloc_mode); - else - return v3d33_clif_dump_packet(clif, offset, cl, size, reloc_mode); + case 71: + return v3d71_clif_dump_packet(clif, offset, cl, size, reloc_mode); + default: + break; + }; + unreachable("Unknown HW version"); } static uint32_t @@ -160,7 +164,8 @@ clif_dump_cl(struct clif_dump *clif, uint32_t start, uint32_t end, static uint32_t clif_dump_gl_shader_state_record(struct clif_dump *clif, struct reloc_worklist_entry *reloc, - void *vaddr) + void *vaddr, + bool including_gs) { struct v3d_group *state = v3d_spec_find_struct(clif->spec, "GL Shader State Record"); @@ -170,6 +175,16 @@ clif_dump_gl_shader_state_record(struct clif_dump *clif, assert(attr); uint32_t offset = 0; + if (including_gs) { + struct v3d_group *gs_state = v3d_spec_find_struct(clif->spec, + "Geometry Shader State Record"); + assert(gs_state); + out(clif, "@format shadrec_gl_geom\n"); + v3d_print_group(clif, gs_state, 0, vaddr + offset); + offset += v3d_group_get_length(gs_state); + /* Extra pad when geometry/tessellation shader is present */ + offset += 20; + } out(clif, "@format shadrec_gl_main\n"); 
v3d_print_group(clif, state, 0, vaddr + offset); offset += v3d_group_get_length(state); @@ -201,6 +216,7 @@ clif_process_worklist(struct clif_dump *clif) break; case reloc_gl_shader_state: + case reloc_gl_including_gs_shader_state: break; case reloc_generic_tile_list: clif_dump_cl(clif, reloc->addr, @@ -336,10 +352,12 @@ clif_dump_buffers(struct clif_dump *clif) break; case reloc_gl_shader_state: + case reloc_gl_including_gs_shader_state: offset += clif_dump_gl_shader_state_record(clif, reloc, bo->vaddr + - offset); + offset, + reloc->type == reloc_gl_including_gs_shader_state); break; case reloc_generic_tile_list: offset = clif_dump_cl(clif, reloc->addr, diff --git a/src/broadcom/clif/clif_private.h b/src/broadcom/clif/clif_private.h index d96bfd12de9..d4e55e03730 100644 --- a/src/broadcom/clif/clif_private.h +++ b/src/broadcom/clif/clif_private.h @@ -64,6 +64,7 @@ struct clif_dump { enum reloc_worklist_type { reloc_cl, reloc_gl_shader_state, + reloc_gl_including_gs_shader_state, reloc_generic_tile_list, }; @@ -94,12 +95,10 @@ clif_dump_add_address_to_worklist(struct clif_dump *clif, enum reloc_worklist_type type, uint32_t addr); -bool v3d33_clif_dump_packet(struct clif_dump *clif, uint32_t offset, - const uint8_t *cl, uint32_t *size, bool reloc_mode); -bool v3d41_clif_dump_packet(struct clif_dump *clif, uint32_t offset, - const uint8_t *cl, uint32_t *size, bool reloc_mode); bool v3d42_clif_dump_packet(struct clif_dump *clif, uint32_t offset, const uint8_t *cl, uint32_t *size, bool reloc_mode); +bool v3d71_clif_dump_packet(struct clif_dump *clif, uint32_t offset, + const uint8_t *cl, uint32_t *size, bool reloc_mode); static inline void out(struct clif_dump *clif, const char *fmt, ...) diff --git a/src/broadcom/clif/v3dx_dump.c b/src/broadcom/clif/v3dx_dump.c index 9cf59f88920..454478531ff 100644 --- a/src/broadcom/clif/v3dx_dump.c +++ b/src/broadcom/clif/v3dx_dump.c @@ -94,6 +94,25 @@ v3dX(clif_dump_packet)(struct clif_dump *clif, uint32_t offset, return true; } +#if V3D_VERSION >= 41 + case V3DX(GL_SHADER_STATE_INCLUDING_GS_opcode): { + struct V3DX(GL_SHADER_STATE_INCLUDING_GS) values; + V3DX(GL_SHADER_STATE_INCLUDING_GS_unpack)(cl, &values); + + if (reloc_mode) { + struct reloc_worklist_entry *reloc = + clif_dump_add_address_to_worklist(clif, + reloc_gl_including_gs_shader_state, + values.address); + if (reloc) { + reloc->shader_state.num_attrs = + values.number_of_attribute_arrays; + } + } + return true; + } +#endif /* V3D_VERSION >= 41 */ + #if V3D_VERSION < 40 case V3DX(STORE_MULTI_SAMPLE_RESOLVED_TILE_COLOR_BUFFER_EXTENDED_opcode): { struct V3DX(STORE_MULTI_SAMPLE_RESOLVED_TILE_COLOR_BUFFER_EXTENDED) values; diff --git a/src/broadcom/common/v3d_cpu_tiling.h b/src/broadcom/common/v3d_cpu_tiling.h index cb1ee7c96f4..4cfd98f961b 100644 --- a/src/broadcom/common/v3d_cpu_tiling.h +++ b/src/broadcom/common/v3d_cpu_tiling.h @@ -31,7 +31,7 @@ static inline void v3d_load_utile(void *cpu, uint32_t cpu_stride, void *gpu, uint32_t gpu_stride) { -#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM) +#if defined(V3D_BUILD_NEON) && DETECT_ARCH_ARM if (gpu_stride == 8) { __asm__ volatile ( /* Load from the GPU in one shot, no interleave, to @@ -80,7 +80,7 @@ v3d_load_utile(void *cpu, uint32_t cpu_stride, : "q0", "q1", "q2", "q3"); return; } -#elif defined (PIPE_ARCH_AARCH64) +#elif DETECT_ARCH_AARCH64 if (gpu_stride == 8) { __asm__ volatile ( /* Load from the GPU in one shot, no interleave, to @@ -141,7 +141,7 @@ static inline void v3d_store_utile(void *gpu, uint32_t gpu_stride, void *cpu, 
uint32_t cpu_stride) { -#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM) +#if defined(V3D_BUILD_NEON) && DETECT_ARCH_ARM if (gpu_stride == 8) { __asm__ volatile ( /* Load each 8-byte line from cpu-side source, @@ -188,7 +188,7 @@ v3d_store_utile(void *gpu, uint32_t gpu_stride, : "q0", "q1", "q2", "q3"); return; } -#elif defined (PIPE_ARCH_AARCH64) +#elif DETECT_ARCH_AARCH64 if (gpu_stride == 8) { __asm__ volatile ( /* Load each 8-byte line from cpu-side source, diff --git a/src/broadcom/vulkan/v3dv_util.c b/src/broadcom/common/v3d_csd.h index d26369f9f56..dc1bd11efc5 100644 --- a/src/broadcom/vulkan/v3dv_util.c +++ b/src/broadcom/common/v3d_csd.h @@ -1,12 +1,5 @@ /* - * Copyright © 2019 Raspberry Pi - * - * based in part on anv driver which is: - * Copyright © 2015 Intel Corporation - * - * based in part on radv driver which is: - * Copyright © 2016 Red Hat. - * Copyright © 2016 Bas Nieuwenhuizen + * Copyright © 2023 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -28,38 +21,23 @@ * IN THE SOFTWARE. */ -#include <stdarg.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <errno.h> -#include <assert.h> - -#include "vk_enum_to_str.h" -#include "v3dv_private.h" - -VkResult -__vk_errorf(struct v3dv_instance *instance, VkResult error, const char *file, - int line, const char *format, ...) -{ - va_list ap; - char buffer[256]; +#ifndef V3D_CSD_H +#define V3D_CSD_H + +#define V3D_CSD_CFG012_WG_COUNT_SHIFT 16 +#define V3D_CSD_CFG012_WG_OFFSET_SHIFT 0 +/* Allow this dispatch to start while the last one is still running. */ +#define V3D_CSD_CFG3_OVERLAP_WITH_PREV (1 << 26) +/* Maximum supergroup ID. 6 bits. */ +#define V3D_CSD_CFG3_MAX_SG_ID_SHIFT 20 +/* Batches per supergroup minus 1. 8 bits. 
*/ +#define V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT 12 +/* Workgroups per supergroup, 0 means 16 */ +#define V3D_CSD_CFG3_WGS_PER_SG_SHIFT 8 +#define V3D_CSD_CFG3_WG_SIZE_SHIFT 0 + +#define V3D_CSD_CFG5_PROPAGATE_NANS (1 << 2) +#define V3D_CSD_CFG5_SINGLE_SEG (1 << 1) +#define V3D_CSD_CFG5_THREADING (1 << 0) -#ifndef DEBUG - return error; #endif - - const char *error_str = vk_Result_to_str(error); - - if (format) { - va_start(ap, format); - vsnprintf(buffer, sizeof(buffer), format, ap); - va_end(ap); - - fprintf(stderr, "%s:%d: %s (%s)\n", file, line, buffer, error_str); - } else { - fprintf(stderr, "%s:%d: %s\n", file, line, error_str); - } - - return error; -} diff --git a/src/broadcom/common/v3d_debug.c b/src/broadcom/common/v3d_debug.c index 508a2b7c74c..b6b32bc72ad 100644 --- a/src/broadcom/common/v3d_debug.c +++ b/src/broadcom/common/v3d_debug.c @@ -37,13 +37,13 @@ #include "util/u_debug.h" #include "c11/threads.h" -uint32_t V3D_DEBUG = 0; +uint32_t v3d_mesa_debug = 0; static const struct debug_named_value debug_control[] = { { "cl", V3D_DEBUG_CL, "Dump command list during creation" }, { "cl_nobin", V3D_DEBUG_CL_NO_BIN, - "Dump command listduring creation, excluding binary resources" }, + "Dump command list during creation, excluding binary resources" }, { "clif", V3D_DEBUG_CLIF, "Dump command list (CLIF format) during creation", }, { "qpu", V3D_DEBUG_QPU, @@ -53,15 +53,21 @@ static const struct debug_named_value debug_control[] = { { "nir", V3D_DEBUG_NIR, "Dump NIR during program compile" }, { "tgsi", V3D_DEBUG_TGSI, - "Dump TGSI during program compile" }, + "Dump TGSI during program compile (v3d only)" }, + /* `shaderdb` is *not* used by shader-db, but is here so that any other + * game/app can dump its stats in the shader-db format, allowing them + * to be compared using shader-db's report.py tool. 
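+ * A typical workflow (illustrative, not the only option): capture two
+ * runs with something like `V3D_DEBUG=shaderdb app 2> before.txt` (the
+ * stats typically land on stderr), then compare the captures with
+ * shader-db's `report.py before.txt after.txt`.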
+ */ { "shaderdb", V3D_DEBUG_SHADERDB, "Dump program compile information for shader-db analysis" }, { "surface", V3D_DEBUG_SURFACE, - "Print resource layout information" }, + /* FIXME: evaluate to implement it on v3dv */ + "Print resource layout information (v3d only)" }, { "perf", V3D_DEBUG_PERF, - "Print during runtime performance-related events" }, + "Print performance-related events during runtime" }, { "norast", V3D_DEBUG_NORAST, - "Skip actual hardware execution of commands" }, + /* FIXME: evaluate to implement on v3dv*/ + "Skip actual hardware execution of commands (v3d only)" }, { "fs", V3D_DEBUG_FS, "Dump fragment shaders" }, { "gs", V3D_DEBUG_GS, @@ -73,11 +79,11 @@ static const struct debug_named_value debug_control[] = { { "always_flush", V3D_DEBUG_ALWAYS_FLUSH, "Flush after each draw call" }, { "precompile", V3D_DEBUG_PRECOMPILE, - "Precompiles shader variant at shader state creation time" }, + "Precompiles shader variant at shader state creation time (v3d only)" }, { "ra", V3D_DEBUG_RA, "Dump register allocation failures" }, { "dump_spirv", V3D_DEBUG_DUMP_SPIRV, - "Dump SPIR-V code" }, + "Dump SPIR-V code (v3dv only)" }, { "tmu32", V3D_DEBUG_TMU_32BIT, "Force 32-bit precision on all TMU operations" }, /* This can lead to incorrect behavior for applications that do @@ -88,12 +94,25 @@ static const struct debug_named_value debug_control[] = { "Force 16-bit precision on all TMU operations" }, { "noloopunroll", V3D_DEBUG_NO_LOOP_UNROLL, "Disable loop unrolling" }, - { NULL } + { "db", V3D_DEBUG_DOUBLE_BUFFER, + "Enable double buffer for Tile Buffer when MSAA is disabled" }, +#ifdef ENABLE_SHADER_CACHE + { "cache", V3D_DEBUG_CACHE, + "Print on-disk cache events (only with cache enabled)" }, +#endif + { "no_merge_jobs", V3D_DEBUG_NO_MERGE_JOBS, + "Don't try to merge subpasses in the same job even if they share framebuffer configuration (v3dv only)" }, + { "opt_compile_time", V3D_DEBUG_OPT_COMPILE_TIME, + "Don't try to reduce shader spilling, might improve compile times with expensive shaders." }, + /* disable_tfu is v3dv only because v3d has some uses of the TFU without alternative codepaths */ + { "disable_tfu", V3D_DEBUG_DISABLE_TFU, + "Disable TFU (v3dv only)" }, + DEBUG_NAMED_VALUE_END }; DEBUG_GET_ONCE_FLAGS_OPTION(v3d_debug, "V3D_DEBUG", debug_control, 0) -uint32_t +bool v3d_debug_flag_for_shader_stage(gl_shader_stage stage) { uint32_t flags[] = { @@ -105,14 +124,11 @@ v3d_debug_flag_for_shader_stage(gl_shader_stage stage) [MESA_SHADER_COMPUTE] = V3D_DEBUG_CS, }; STATIC_ASSERT(MESA_SHADER_STAGES == 6); - return flags[stage]; + return v3d_mesa_debug & flags[stage]; } void v3d_process_debug_variable(void) { - V3D_DEBUG = debug_get_option_v3d_debug(); - - if (V3D_DEBUG & V3D_DEBUG_SHADERDB) - V3D_DEBUG |= V3D_DEBUG_NORAST; + v3d_mesa_debug = debug_get_option_v3d_debug(); } diff --git a/src/broadcom/common/v3d_debug.h b/src/broadcom/common/v3d_debug.h index b5278c4c759..67112ebf361 100644 --- a/src/broadcom/common/v3d_debug.h +++ b/src/broadcom/common/v3d_debug.h @@ -39,7 +39,9 @@ extern "C" { * list of debugging flags, as well as some macros for handling them. 
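*
* For example, once v3d_process_debug_variable() has parsed V3D_DEBUG,
* code can guard its debug output with the V3D_DBG() macro added below:
* `if (V3D_DBG(CL)) { ... }` expands to a test of v3d_mesa_debug
* against V3D_DEBUG_CL.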
*/ -extern uint32_t V3D_DEBUG; +extern uint32_t v3d_mesa_debug; + +#define V3D_DBG(flag) unlikely(v3d_mesa_debug & V3D_DEBUG_ ## flag) #define V3D_DEBUG_SHADERDB (1 << 0) #define V3D_DEBUG_TGSI (1 << 1) @@ -63,6 +65,11 @@ extern uint32_t V3D_DEBUG; #define V3D_DEBUG_TMU_16BIT (1 << 19) #define V3D_DEBUG_NO_LOOP_UNROLL (1 << 20) #define V3D_DEBUG_CL_NO_BIN (1 << 21) +#define V3D_DEBUG_DOUBLE_BUFFER (1 << 22) +#define V3D_DEBUG_CACHE (1 << 23) +#define V3D_DEBUG_NO_MERGE_JOBS (1 << 24) +#define V3D_DEBUG_OPT_COMPILE_TIME (1 << 25) +#define V3D_DEBUG_DISABLE_TFU (1 << 26) #define V3D_DEBUG_SHADERS (V3D_DEBUG_TGSI | V3D_DEBUG_NIR | \ V3D_DEBUG_VIR | V3D_DEBUG_QPU | \ @@ -85,12 +92,7 @@ extern uint32_t V3D_DEBUG; #define dbg_printf(...) fprintf(stderr, __VA_ARGS__) #endif /* HAVE_ANDROID_PLATFORM */ -#define DBG(flag, ...) do { \ - if (unlikely(V3D_DEBUG & (flag))) \ - dbg_printf(__VA_ARGS__); \ -} while(0) - -extern uint32_t v3d_debug_flag_for_shader_stage(gl_shader_stage stage); +extern bool v3d_debug_flag_for_shader_stage(gl_shader_stage stage); extern void v3d_process_debug_variable(void); diff --git a/src/broadcom/common/v3d_device_info.c b/src/broadcom/common/v3d_device_info.c index 272190eb2e5..fa85a7d5077 100644 --- a/src/broadcom/common/v3d_device_info.c +++ b/src/broadcom/common/v3d_device_info.c @@ -36,6 +36,9 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i struct drm_v3d_get_param ident1 = { .param = DRM_V3D_PARAM_V3D_CORE0_IDENT1, }; + struct drm_v3d_get_param hub_ident3 = { + .param = DRM_V3D_PARAM_V3D_HUB_IDENT3, + }; int ret; ret = drm_ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &ident0); @@ -62,10 +65,11 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i int qups = (ident1.value >> 8) & 0xf; devinfo->qpu_count = nslc * qups; + devinfo->has_accumulators = devinfo->ver < 71; + switch (devinfo->ver) { - case 33: - case 41: case 42: + case 71: break; default: fprintf(stderr, @@ -75,5 +79,14 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i return false; } - return true; + ret = drm_ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &hub_ident3); + if (ret != 0) { + fprintf(stderr, "Couldn't get V3D core HUB IDENT3: %s\n", + strerror(errno)); + return false; + } + + devinfo->rev = (hub_ident3.value >> 8) & 0xff; + + return true; } diff --git a/src/broadcom/common/v3d_device_info.h b/src/broadcom/common/v3d_device_info.h index 97abd9b8d9f..8dfc7858727 100644 --- a/src/broadcom/common/v3d_device_info.h +++ b/src/broadcom/common/v3d_device_info.h @@ -34,11 +34,17 @@ struct v3d_device_info { /** Simple V3D version: major * 10 + minor */ uint8_t ver; + /** V3D revision number */ + uint8_t rev; + /** Size of the VPM, in bytes. */ int vpm_size; /* NSLC * QUPS from the core's IDENT registers. */ int qpu_count; + + /* If the hw has accumulator registers */ + bool has_accumulators; }; typedef int (*v3d_ioctl_fun)(int fd, unsigned long request, void *arg); diff --git a/src/broadcom/common/v3d_limits.h b/src/broadcom/common/v3d_limits.h index 129e53e29a4..354c8784914 100644 --- a/src/broadcom/common/v3d_limits.h +++ b/src/broadcom/common/v3d_limits.h @@ -24,6 +24,8 @@ #ifndef V3D_LIMITS_H #define V3D_LIMITS_H +#define V3D_CL_MAX_INSTR_SIZE 25 + /* Number of channels a QPU thread executes in parallel. Also known as * gl_SubGroupSizeARB. 
*/ @@ -36,32 +38,35 @@ V3D_MAX_GS_INPUTS, \ V3D_MAX_FS_INPUTS) -/* For now we need to maintain a different limits for OpenGL and Vulkan due - * some OpenGL CTS tests hitting register allocation when trying to use all - * the texture available. - * - * FIXME: nir_schedule should be able to handle that. When fixed it would be - * simpler to keep just one limit - */ -#define V3D_VULKAN_MAX_TEXTURE_SAMPLERS 24 -#define V3D_OPENGL_MAX_TEXTURE_SAMPLERS 16 - -/* Not specifically a hardware limit, just coordination between compiler and - * driver. - */ -#define V3D_MAX_TEXTURE_SAMPLERS MAX2(V3D_VULKAN_MAX_TEXTURE_SAMPLERS, \ - V3D_OPENGL_MAX_TEXTURE_SAMPLERS) - -/* The HW can do 16384 (15), but we run into hangs when we expose that. */ -#define V3D_MAX_MIP_LEVELS 13 +#define V3D_MAX_TEXTURE_SAMPLERS 24 #define V3D_MAX_SAMPLES 4 -#define V3D_MAX_DRAW_BUFFERS 4 +#define V3D_MAX_DRAW_BUFFERS 8 +#define V3D_MAX_RENDER_TARGETS(ver) (ver < 71 ? 4 : 8) #define V3D_MAX_POINT_SIZE 512.0f #define V3D_MAX_LINE_WIDTH 32 -#define V3D_MAX_BUFFER_RANGE (1 << 27) +#define V3D_MAX_BUFFER_RANGE (1 << 30) + +/* Sub-pixel precision bits in the rasterizer */ +#define V3D_COORD_SHIFT 6 + +/* Size of a cache line */ +#define V3D_NON_COHERENT_ATOM_SIZE 256 + +/* Minimum alignment for texel buffers */ +#define V3D_TMU_TEXEL_ALIGN 64 + +#define V3D_MAX_IMAGE_DIMENSION 4096 + +/* The HW can do 16384 (15), but we run into hangs when we expose that. Also, + * since we are only exposing images up to 4096 pixels per dimension 13 is + * all we need. + */ +#define V3D_MAX_MIP_LEVELS 13 + +#define V3D_MAX_ARRAY_LAYERS 2048 #endif /* V3D_LIMITS_H */ diff --git a/src/broadcom/common/v3d_macros.h b/src/broadcom/common/v3d_macros.h index fe89398208a..4ab66f647ab 100644 --- a/src/broadcom/common/v3d_macros.h +++ b/src/broadcom/common/v3d_macros.h @@ -32,15 +32,12 @@ #if (V3D_VERSION == 21) # define V3DX(x) V3D21_##x # define v3dX(x) v3d21_##x -#elif (V3D_VERSION == 33) -# define V3DX(x) V3D33_##x -# define v3dX(x) v3d33_##x -#elif (V3D_VERSION == 41) -# define V3DX(x) V3D41_##x -# define v3dX(x) v3d41_##x #elif (V3D_VERSION == 42) # define V3DX(x) V3D42_##x # define v3dX(x) v3d42_##x +#elif (V3D_VERSION == 71) +# define V3DX(x) V3D71_##x +# define v3dX(x) v3d71_##x #else # error "Need to add prefixing macros for this v3d version" #endif diff --git a/src/broadcom/common/v3d_performance_counters.h b/src/broadcom/common/v3d_performance_counters.h new file mode 100644 index 00000000000..33e3e0e78db --- /dev/null +++ b/src/broadcom/common/v3d_performance_counters.h @@ -0,0 +1,229 @@ +/* + * Copyright © 2023 Raspberry Pi Ltd + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef V3D_PERFORMANCE_COUNTERS_H +#define V3D_PERFORMANCE_COUNTERS_H + +#define V3D_PERFCNT_CATEGORY 0 +#define V3D_PERFCNT_NAME 1 +#define V3D_PERFCNT_DESCRIPTION 2 + +#ifndef V3D_VERSION +# error "The V3D_VERSION macro must be defined" +#endif + +#if (V3D_VERSION >= 71) + +static const char *v3d_performance_counters[][3] = { + {"CORE", "cycle-count", "[CORE] Cycle counter"}, + {"CORE", "core-active", "[CORE] Bin/Render/Compute active cycles"}, + {"CLE", "CLE-bin-thread-active-cycles", "[CLE] Bin thread active cycles"}, + {"CLE", "CLE-render-thread-active-cycles", "[CLE] Render thread active cycles"}, + {"CORE", "compute-active-cycles", "[CORE] Compute active cycles"}, + {"FEP", "FEP-valid-primitives-no-rendered-pixels", "[FEP] Valid primitives that result in no rendered pixels, for all rendered tiles"}, + {"FEP", "FEP-valid-primitives-rendered-pixels", "[FEP] Valid primitives for all rendered tiles (primitives may be counted in more than one tile)"}, + {"FEP", "FEP-clipped-quads", "[FEP] Early-Z/Near/Far clipped quads"}, + {"FEP", "FEP-valid-quads", "[FEP] Valid quads"}, + {"TLB", "TLB-quads-not-passing-stencil-test", "[TLB] Quads with no pixels passing the stencil test"}, + {"TLB", "TLB-quads-not-passing-z-and-stencil-test", "[TLB] Quads with no pixels passing the Z and stencil tests"}, + {"TLB", "TLB-quads-passing-z-and-stencil-test", "[TLB] Quads with any pixels passing the Z and stencil tests"}, + {"TLB", "TLB-quads-written-to-color-buffer", "[TLB] Quads with valid pixels written to colour buffer"}, + {"TLB", "TLB-partial-quads-written-to-color-buffer", "[TLB] Partial quads written to the colour buffer"}, + {"PTB", "PTB-primitives-need-clipping", "[PTB] Primitives that need clipping"}, + {"PTB", "PTB-primitives-discarded-outside-viewport", "[PTB] Primitives discarded by being outside the viewport"}, + {"PTB", "PTB-primitives-binned", "[PTB] Total primitives binned"}, + {"PTB", "PTB-primitives-discarded-reversed", "[PTB] Primitives that are discarded because they are reversed"}, + {"QPU", "QPU-total-instr-cache-hit", "[QPU] Total instruction cache hits for all slices"}, + {"QPU", "QPU-total-instr-cache-miss", "[QPU] Total instruction cache misses for all slices"}, + {"QPU", "QPU-total-uniform-cache-hit", "[QPU] Total uniforms cache hits for all slices"}, + {"QPU", "QPU-total-uniform-cache-miss", "[QPU] Total uniforms cache misses for all slices"}, + {"TMU", "TMU-active-cycles", "[TMU] Active cycles"}, + {"TMU", "TMU-stalled-cycles", "[TMU] Stalled cycles"}, + {"TMU", "TMU-total-text-quads-access", "[TMU] Total texture cache accesses"}, + {"TMU", "TMU-cache-x4-active-cycles", "[TMU] Cache active cycles for x4 access"}, + {"TMU", "TMU-cache-x4-stalled-cycles", "[TMU] Cache stalled cycles for x4 access"}, + {"TMU", "TMU-total-text-quads-x4-access", "[TMU] Total texture cache x4 access"}, + {"L2T", "L2T-total-cache-hit", "[L2T] Total Level 2 cache hits"}, + {"L2T", "L2T-total-cache-miss", "[L2T] Total Level 2 cache misses"}, + {"L2T", "L2T-local", "[L2T] Local mode access"}, + {"L2T", "L2T-writeback", "[L2T] Writeback"}, + {"L2T", "L2T-zero", "[L2T] Zero"}, + {"L2T", "L2T-merge", "[L2T] Merge"}, + {"L2T", "L2T-fill", "[L2T] Fill"}, + {"L2T", "L2T-stalls-no-wid", "[L2T] Stalls because no WID available"}, + 
{"L2T", "L2T-stalls-no-rid", "[L2T] Stalls because no RID available"}, + {"L2T", "L2T-stalls-queue-full", "[L2T] Stalls because internal queue full"}, + {"L2T", "L2T-stalls-wrightback", "[L2T] Stalls because writeback in flight"}, + {"L2T", "L2T-stalls-mem", "[L2T] Stalls because AXI blocks read"}, + {"L2T", "L2T-stalls-fill", "[L2T] Stalls because fill pending for victim cache-line"}, + {"L2T", "L2T-hitq", "[L2T] Sent request via hit queue"}, + {"L2T", "L2T-hitq-full", "[L2T] Sent request via main queue because hit queue is full"}, + {"L2T", "L2T-stalls-read-data", "[L2T] Stalls because waiting for data from SDRAM"}, + {"L2T", "L2T-TMU-read-hits", "[L2T] TMU read hits"}, + {"L2T", "L2T-TMU-read-miss", "[L2T] TMU read misses"}, + {"L2T", "L2T-VCD-read-hits", "[L2T] VCD read hits"}, + {"L2T", "L2T-VCD-read-miss", "[L2T] VCD read misses"}, + {"L2T", "L2T-SLC-read-hits", "[L2T] SLC read hits (all slices)"}, + {"L2T", "L2T-SLC-read-miss", "[L2T] SLC read misses (all slices)"}, + {"AXI", "AXI-writes-seen-watch-0", "[AXI] Writes seen by watch 0"}, + {"AXI", "AXI-reads-seen-watch-0", "[AXI] Reads seen by watch 0"}, + {"AXI", "AXI-writes-stalled-seen-watch-0", "[AXI] Write stalls seen by watch 0"}, + {"AXI", "AXI-reads-stalled-seen-watch-0", "[AXI] Read stalls seen by watch 0"}, + {"AXI", "AXI-write-bytes-seen-watch-0", "[AXI] Total bytes written seen by watch 0"}, + {"AXI", "AXI-read-bytes-seen-watch-0", "[AXI] Total bytes read seen by watch 0"}, + {"AXI", "AXI-writes-seen-watch-1", "[AXI] Writes seen by watch 1"}, + {"AXI", "AXI-reads-seen-watch-1", "[AXI] Reads seen by watch 1"}, + {"AXI", "AXI-writes-stalled-seen-watch-1", "[AXI] Write stalls seen by watch 1"}, + {"AXI", "AXI-reads-stalled-seen-watch-1", "[AXI] Read stalls seen by watch 1"}, + {"AXI", "AXI-write-bytes-seen-watch-1", "[AXI] Total bytes written seen by watch 1"}, + {"AXI", "AXI-read-bytes-seen-watch-1", "[AXI] Total bytes read seen by watch 1"}, + {"CORE", "core-memory-writes", "[CORE] Total memory writes"}, + {"L2T", "L2T-memory-writes", "[L2T] Total memory writes"}, + {"PTB", "PTB-memory-writes", "[PTB] Total memory writes"}, + {"TLB", "TLB-memory-writes", "[TLB] Total memory writes"}, + {"CORE", "core-memory-reads", "[CORE] Total memory reads"}, + {"L2T", "L2T-memory-reads", "[L2T] Total memory reads"}, + {"PTB", "PTB-memory-reads", "[PTB] Total memory reads"}, + {"PSE", "PSE-memory-reads", "[PSE] Total memory reads"}, + {"TLB", "TLB-memory-reads", "[TLB] Total memory reads"}, + {"PTB", "PTB-memory-words-writes", "[PTB] Total memory words written"}, + {"TLB", "TLB-memory-words-writes", "[TLB] Total memory words written"}, + {"PSE", "PSE-memory-words-reads", "[PSE] Total memory words read"}, + {"TLB", "TLB-memory-words-reads", "[TLB] Total memory words read"}, + {"AXI", "AXI-read-trans", "[AXI] Read transaction count"}, + {"AXI", "AXI-write-trans", "[AXI] Write transaction count"}, + {"AXI", "AXI-read-wait-cycles", "[AXI] Read total wait cycles"}, + {"AXI", "AXI-write-wait-cycles", "[AXI] Write total wait cycles"}, + {"AXI", "AXI-max-outstanding-reads", "[AXI] Maximium outstanding read transactions"}, + {"AXI", "AXI-max-outstanding-writes", "[AXI] Maximum outstanding write transactions"}, + {"QPU", "QPU-wait-bubble", "[QPU] Pipeline bubble in qcycles due all threads waiting"}, + {"QPU", "QPU-ic-miss-bubble", "[QPU] Pipeline bubble in qcycles due instruction-cache miss"}, + {"QPU", "QPU-active", "[QPU] Executed shader instruction"}, + {"QPU", "QPU-total-active-clk-cycles-fragment-shading", "[QPU] Total active clock cycles for 
all QPUs doing fragment shading (counts only when QPU is not stalled)"}, + {"QPU", "QPU-stalls", "[QPU] Stalled qcycles executing shader instruction"}, + {"QPU", "QPU-total-clk-cycles-waiting-fragment-shading", "[QPU] Total stalled clock cycles for all QPUs doing fragment shading"}, + {"QPU", "QPU-stalls-TMU", "[QPU] Stalled qcycles waiting for TMU"}, + {"QPU", "QPU-stalls-TLB", "[QPU] Stalled qcycles waiting for TLB"}, + {"QPU", "QPU-stalls-VPM", "[QPU] Stalled qcycles waiting for VPM"}, + {"QPU", "QPU-stalls-uniforms", "[QPU] Stalled qcycles waiting for uniforms"}, + {"QPU", "QPU-stalls-SFU", "[QPU] Stalled qcycles waiting for SFU"}, + {"QPU", "QPU-stalls-other", "[QPU] Stalled qcycles waiting for any other reason (vary/W/Z)"}, +}; + +#elif (V3D_VERSION >= 42) + +static const char *v3d_performance_counters[][3] = { + {"FEP", "FEP-valid-primitives-no-rendered-pixels", "[FEP] Valid primitives that result in no rendered pixels, for all rendered tiles"}, + {"FEP", "FEP-valid-primitives-rendered-pixels", "[FEP] Valid primitives for all rendered tiles (primitives may be counted in more than one tile)"}, + {"FEP", "FEP-clipped-quads", "[FEP] Early-Z/Near/Far clipped quads"}, + {"FEP", "FEP-valid-quads", "[FEP] Valid quads"}, + {"TLB", "TLB-quads-not-passing-stencil-test", "[TLB] Quads with no pixels passing the stencil test"}, + {"TLB", "TLB-quads-not-passing-z-and-stencil-test", "[TLB] Quads with no pixels passing the Z and stencil tests"}, + {"TLB", "TLB-quads-passing-z-and-stencil-test", "[TLB] Quads with any pixels passing the Z and stencil tests"}, + {"TLB", "TLB-quads-with-zero-coverage", "[TLB] Quads with all pixels having zero coverage"}, + {"TLB", "TLB-quads-with-non-zero-coverage", "[TLB] Quads with any pixels having non-zero coverage"}, + {"TLB", "TLB-quads-written-to-color-buffer", "[TLB] Quads with valid pixels written to colour buffer"}, + {"PTB", "PTB-primitives-discarded-outside-viewport", "[PTB] Primitives discarded by being outside the viewport"}, + {"PTB", "PTB-primitives-need-clipping", "[PTB] Primitives that need clipping"}, + {"PTB", "PTB-primitives-discarded-reversed", "[PTB] Primitives that are discarded because they are reversed"}, + {"QPU", "QPU-total-idle-clk-cycles", "[QPU] Total idle clock cycles for all QPUs"}, + {"QPU", "QPU-total-active-clk-cycles-vertex-coord-shading", "[QPU] Total active clock cycles for all QPUs doing vertex/coordinate/user shading (counts only when QPU is not stalled)"}, + {"QPU", "QPU-total-active-clk-cycles-fragment-shading", "[QPU] Total active clock cycles for all QPUs doing fragment shading (counts only when QPU is not stalled)"}, + {"QPU", "QPU-total-clk-cycles-executing-valid-instr", "[QPU] Total clock cycles for all QPUs executing valid instructions"}, + {"QPU", "QPU-total-clk-cycles-waiting-TMU", "[QPU] Total clock cycles for all QPUs stalled waiting for TMUs only (counter won't increment if QPU also stalling for another reason)"}, + {"QPU", "QPU-total-clk-cycles-waiting-scoreboard", "[QPU] Total clock cycles for all QPUs stalled waiting for Scoreboard only (counter won't increment if QPU also stalling for another reason)"}, + {"QPU", "QPU-total-clk-cycles-waiting-varyings", "[QPU] Total clock cycles for all QPUs stalled waiting for Varyings only (counter won't increment if QPU also stalling for another reason)"}, + {"QPU", "QPU-total-instr-cache-hit", "[QPU] Total instruction cache hits for all slices"}, + {"QPU", "QPU-total-instr-cache-miss", "[QPU] Total instruction cache misses for all slices"}, + {"QPU", 
"QPU-total-uniform-cache-hit", "[QPU] Total uniforms cache hits for all slices"}, + {"QPU", "QPU-total-uniform-cache-miss", "[QPU] Total uniforms cache misses for all slices"}, + {"TMU", "TMU-total-text-quads-access", "[TMU] Total texture cache accesses"}, + {"TMU", "TMU-total-text-cache-miss", "[TMU] Total texture cache misses (number of fetches from memory/L2cache)"}, + {"VPM", "VPM-total-clk-cycles-VDW-stalled", "[VPM] Total clock cycles VDW is stalled waiting for VPM access"}, + {"VPM", "VPM-total-clk-cycles-VCD-stalled", "[VPM] Total clock cycles VCD is stalled waiting for VPM access"}, + {"CLE", "CLE-bin-thread-active-cycles", "[CLE] Bin thread active cycles"}, + {"CLE", "CLE-render-thread-active-cycles", "[CLE] Render thread active cycles"}, + {"L2T", "L2T-total-cache-hit", "[L2T] Total Level 2 cache hits"}, + {"L2T", "L2T-total-cache-miss", "[L2T] Total Level 2 cache misses"}, + {"CORE", "cycle-count", "[CORE] Cycle counter"}, + {"QPU", "QPU-total-clk-cycles-waiting-vertex-coord-shading", "[QPU] Total stalled clock cycles for all QPUs doing vertex/coordinate/user shading"}, + {"QPU", "QPU-total-clk-cycles-waiting-fragment-shading", "[QPU] Total stalled clock cycles for all QPUs doing fragment shading"}, + {"PTB", "PTB-primitives-binned", "[PTB] Total primitives binned"}, + {"AXI", "AXI-writes-seen-watch-0", "[AXI] Writes seen by watch 0"}, + {"AXI", "AXI-reads-seen-watch-0", "[AXI] Reads seen by watch 0"}, + {"AXI", "AXI-writes-stalled-seen-watch-0", "[AXI] Write stalls seen by watch 0"}, + {"AXI", "AXI-reads-stalled-seen-watch-0", "[AXI] Read stalls seen by watch 0"}, + {"AXI", "AXI-write-bytes-seen-watch-0", "[AXI] Total bytes written seen by watch 0"}, + {"AXI", "AXI-read-bytes-seen-watch-0", "[AXI] Total bytes read seen by watch 0"}, + {"AXI", "AXI-writes-seen-watch-1", "[AXI] Writes seen by watch 1"}, + {"AXI", "AXI-reads-seen-watch-1", "[AXI] Reads seen by watch 1"}, + {"AXI", "AXI-writes-stalled-seen-watch-1", "[AXI] Write stalls seen by watch 1"}, + {"AXI", "AXI-reads-stalled-seen-watch-1", "[AXI] Read stalls seen by watch 1"}, + {"AXI", "AXI-write-bytes-seen-watch-1", "[AXI] Total bytes written seen by watch 1"}, + {"AXI", "AXI-read-bytes-seen-watch-1", "[AXI] Total bytes read seen by watch 1"}, + {"TLB", "TLB-partial-quads-written-to-color-buffer", "[TLB] Partial quads written to the colour buffer"}, + {"TMU", "TMU-total-config-access", "[TMU] Total config accesses"}, + {"L2T", "L2T-no-id-stalled", "[L2T] No ID stall"}, + {"L2T", "L2T-command-queue-stalled", "[L2T] Command queue full stall"}, + {"L2T", "L2T-TMU-writes", "[L2T] TMU write accesses"}, + {"TMU", "TMU-active-cycles", "[TMU] Active cycles"}, + {"TMU", "TMU-stalled-cycles", "[TMU] Stalled cycles"}, + {"CLE", "CLE-thread-active-cycles", "[CLE] Bin or render thread active cycles"}, + {"L2T", "L2T-TMU-reads", "[L2T] TMU read accesses"}, + {"L2T", "L2T-CLE-reads", "[L2T] CLE read accesses"}, + {"L2T", "L2T-VCD-reads", "[L2T] VCD read accesses"}, + {"L2T", "L2T-TMU-config-reads", "[L2T] TMU CFG read accesses"}, + {"L2T", "L2T-SLC0-reads", "[L2T] SLC0 read accesses"}, + {"L2T", "L2T-SLC1-reads", "[L2T] SLC1 read accesses"}, + {"L2T", "L2T-SLC2-reads", "[L2T] SLC2 read accesses"}, + {"L2T", "L2T-TMU-write-miss", "[L2T] TMU write misses"}, + {"L2T", "L2T-TMU-read-miss", "[L2T] TMU read misses"}, + {"L2T", "L2T-CLE-read-miss", "[L2T] CLE read misses"}, + {"L2T", "L2T-VCD-read-miss", "[L2T] VCD read misses"}, + {"L2T", "L2T-TMU-config-read-miss", "[L2T] TMU CFG read misses"}, + {"L2T", "L2T-SLC0-read-miss", "[L2T] SLC0 
read misses"}, + {"L2T", "L2T-SLC1-read-miss", "[L2T] SLC1 read misses"}, + {"L2T", "L2T-SLC2-read-miss", "[L2T] SLC2 read misses"}, + {"CORE", "core-memory-writes", "[CORE] Total memory writes"}, + {"L2T", "L2T-memory-writes", "[L2T] Total memory writes"}, + {"PTB", "PTB-memory-writes", "[PTB] Total memory writes"}, + {"TLB", "TLB-memory-writes", "[TLB] Total memory writes"}, + {"CORE", "core-memory-reads", "[CORE] Total memory reads"}, + {"L2T", "L2T-memory-reads", "[L2T] Total memory reads"}, + {"PTB", "PTB-memory-reads", "[PTB] Total memory reads"}, + {"PSE", "PSE-memory-reads", "[PSE] Total memory reads"}, + {"TLB", "TLB-memory-reads", "[TLB] Total memory reads"}, + {"GMP", "GMP-memory-reads", "[GMP] Total memory reads"}, + {"PTB", "PTB-memory-words-writes", "[PTB] Total memory words written"}, + {"TLB", "TLB-memory-words-writes", "[TLB] Total memory words written"}, + {"PSE", "PSE-memory-words-reads", "[PSE] Total memory words read"}, + {"TLB", "TLB-memory-words-reads", "[TLB] Total memory words read"}, + {"TMU", "TMU-MRU-hits", "[TMU] Total MRU hits"}, + {"CORE", "compute-active-cycles", "[CORE] Compute active cycles"}, +}; + +#else +static const char *v3d_performance_counters[][3] = { }; +#endif + +#endif diff --git a/src/broadcom/common/v3d_tfu.h b/src/broadcom/common/v3d_tfu.h new file mode 100644 index 00000000000..572d0074794 --- /dev/null +++ b/src/broadcom/common/v3d_tfu.h @@ -0,0 +1,74 @@ +/* + * Copyright © 2021 Raspberry Pi Ltd + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef V3D_TFU_H +#define V3D_TFU_H + +/* Disable level 0 write, just write following mipmaps */ +#define V3D33_TFU_IOA_DIMTW (1 << 0) +#define V3D33_TFU_IOA_FORMAT_SHIFT 3 +#define V3D33_TFU_IOA_FORMAT_LINEARTILE 3 +#define V3D33_TFU_IOA_FORMAT_UBLINEAR_1_COLUMN 4 +#define V3D33_TFU_IOA_FORMAT_UBLINEAR_2_COLUMN 5 +#define V3D33_TFU_IOA_FORMAT_UIF_NO_XOR 6 +#define V3D33_TFU_IOA_FORMAT_UIF_XOR 7 + +#define V3D33_TFU_ICFG_NUMMM_SHIFT 5 +#define V3D33_TFU_ICFG_TTYPE_SHIFT 9 + +#define V3D33_TFU_ICFG_OPAD_SHIFT 22 + +#define V3D33_TFU_ICFG_FORMAT_SHIFT 18 +#define V3D33_TFU_ICFG_FORMAT_RASTER 0 +#define V3D33_TFU_ICFG_FORMAT_SAND_128 1 +#define V3D33_TFU_ICFG_FORMAT_SAND_256 2 +#define V3D33_TFU_ICFG_FORMAT_LINEARTILE 11 +#define V3D33_TFU_ICFG_FORMAT_UBLINEAR_1_COLUMN 12 +#define V3D33_TFU_ICFG_FORMAT_UBLINEAR_2_COLUMN 13 +#define V3D33_TFU_ICFG_FORMAT_UIF_NO_XOR 14 +#define V3D33_TFU_ICFG_FORMAT_UIF_XOR 15 + +/* Disable level 0 write, just write following mipmaps */ +#define V3D71_TFU_IOC_DIMTW (1 << 0) +#define V3D71_TFU_IOC_FORMAT_SHIFT 12 +#define V3D71_TFU_IOC_FORMAT_LINEARTILE 3 +#define V3D71_TFU_IOA_FORMAT_UBLINEAR_1_COLUMN 4 +#define V3D71_TFU_IOA_FORMAT_UBLINEAR_2_COLUMN 5 +#define V3D71_TFU_IOA_FORMAT_UIF_NO_XOR 6 +#define V3D71_TFU_IOA_FORMAT_UIF_XOR 7 + +#define V3D71_TFU_IOC_STRIDE_SHIFT 16 +#define V3D71_TFU_IOC_NUMMM_SHIFT 4 + +#define V3D71_TFU_ICFG_OTYPE_SHIFT 16 +#define V3D71_TFU_ICFG_IFORMAT_SHIFT 23 +#define V3D71_TFU_ICFG_FORMAT_RASTER 0 +#define V3D71_TFU_ICFG_FORMAT_SAND_128 1 +#define V3D71_TFU_ICFG_FORMAT_SAND_256 2 +#define V3D71_TFU_ICFG_FORMAT_LINEARTILE 11 +#define V3D71_TFU_ICFG_FORMAT_UBLINEAR_1_COLUMN 12 +#define V3D71_TFU_ICFG_FORMAT_UBLINEAR_2_COLUMN 13 +#define V3D71_TFU_ICFG_FORMAT_UIF_NO_XOR 14 +#define V3D71_TFU_ICFG_FORMAT_UIF_XOR 15 + +#endif diff --git a/src/broadcom/common/v3d_tiling.c b/src/broadcom/common/v3d_tiling.c index 22f84811e19..6e785916578 100644 --- a/src/broadcom/common/v3d_tiling.c +++ b/src/broadcom/common/v3d_tiling.c @@ -28,6 +28,7 @@ */ #include <stdint.h> +#include "util/box.h" #include "v3d_tiling.h" #include "broadcom/common/v3d_cpu_tiling.h" diff --git a/src/broadcom/common/v3d_tiling.h b/src/broadcom/common/v3d_tiling.h index 08ae7cce805..2573c8a5f02 100644 --- a/src/broadcom/common/v3d_tiling.h +++ b/src/broadcom/common/v3d_tiling.h @@ -24,7 +24,7 @@ #ifndef V3D_TILING_H #define V3D_TILING_H -#include "util/u_box.h" +#include "util/format/u_format.h" /* A UIFblock is a 256-byte region of memory that's 256-byte aligned. 
These
* will be grouped in 4x4 blocks (left-to-right, then top-to-bottom) in a 4KB
@@ -63,6 +63,8 @@ enum v3d_tiling_mode {
V3D_TILING_UIF_XOR,
};
+struct pipe_box;
+
uint32_t v3d_utile_width(int cpp) ATTRIBUTE_CONST;
uint32_t v3d_utile_height(int cpp) ATTRIBUTE_CONST;
bool v3d_size_is_lt(uint32_t width, uint32_t height, int cpp) ATTRIBUTE_CONST;
diff --git a/src/broadcom/common/v3d_util.c b/src/broadcom/common/v3d_util.c
index 424656fd8b1..8a50d279985 100644
--- a/src/broadcom/common/v3d_util.c
+++ b/src/broadcom/common/v3d_util.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2021 Raspberry Pi
+ * Copyright © 2021 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -86,3 +86,187 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo,
return best_wgs_per_sg;
}
+
+#define V3D71_TLB_COLOR_SIZE (16 * 1024)
+#define V3D71_TLB_DEPTH_SIZE (16 * 1024)
+#define V3D71_TLB_AUX_DEPTH_SIZE (8 * 1024)
+
+static bool
+tile_size_valid(uint32_t pixel_count, uint32_t color_bpp, uint32_t depth_bpp)
+{
+ /* First, we check if we can fit this tile size by allocating the depth
+ * TLB memory to color.
+ */
+ if (pixel_count * depth_bpp <= V3D71_TLB_AUX_DEPTH_SIZE &&
+ pixel_count * color_bpp <= V3D71_TLB_COLOR_SIZE + V3D71_TLB_DEPTH_SIZE) {
+ return true;
+ }
+
+ /* Otherwise the tile must fit in the main TLB buffers */
+ return pixel_count * depth_bpp <= V3D71_TLB_DEPTH_SIZE &&
+ pixel_count * color_bpp <= V3D71_TLB_COLOR_SIZE;
+}
+
+void
+v3d_choose_tile_size(const struct v3d_device_info *devinfo,
+ uint32_t color_attachment_count,
+ /* V3D 4.x max internal bpp of all RTs */
+ uint32_t max_internal_bpp,
+ /* V3D 7.x accumulated bpp for all RTs (in bytes) */
+ uint32_t total_color_bpp,
+ bool msaa,
+ bool double_buffer,
+ uint32_t *width,
+ uint32_t *height)
+{
+ static const uint8_t tile_sizes[] = {
+ 64, 64,
+ 64, 32,
+ 32, 32,
+ 32, 16,
+ 16, 16,
+ 16, 8,
+ 8, 8
+ };
+
+ uint32_t idx = 0;
+ if (devinfo->ver >= 71) {
+ /* In V3D 7.x, we use the actual bpp used by color attachments to compute
+ * the tile size instead of the maximum bpp. This may allow us to choose a
+ * larger tile size than we would in 4.x in scenarios with multiple RTs
+ * with different bpps.
+ *
+ * Also, the TLB has an auxiliary buffer of 8KB that will be automatically
+ * used for depth instead of the main 16KB depth TLB buffer when the depth
+ * tile fits in the auxiliary buffer, allowing the hardware to allocate
+ * the 16KB from the main depth TLB to the color TLB. If we can do that,
+ * then we are effectively doubling the memory we have for color and we
+ * can also select a larger tile size. This is necessary to support
+ * the most expensive configuration: 8x128bpp RTs + MSAA.
+ *
+ * FIXME: the docs state that, if depth testing is not used, depth TLB
+ * memory can be used for color by setting the 'depth disable' bit in the
+ * rendering configuration. However, this comes with a requirement that
+ * occlusion queries must not be active. We need to clarify if this means
+ * active at the point at which we emit a tile rendering configuration
+ * item, meaning that we have a query spanning a full render pass
+ * (this is something we can tell before we emit the rendering
+ * configuration item) or active in the subpass for which we are enabling
+ * the bit (which we can't tell until later, when we record commands for
+ * the subpass). If it is the latter, then we cannot use this feature.
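+ *
+ * As a worked example of the worst case mentioned above: 8 RTs at
+ * 128bpp internal format give total_color_bpp = 8 * 16 = 128 bytes,
+ * and 4x MSAA makes color_bpp = 512 and depth_bpp = 16. Only the 8x8
+ * tile (64 pixels) fits: 64 * 16 = 1KB of depth fits in the 8KB
+ * auxiliary buffer, which frees 16KB + 16KB = 32KB for exactly the
+ * 64 * 512 = 32KB of color, so tile_size_valid() accepts it.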
+ *
+ * FIXME: handling of double_buffer is still pending.
+ */
+ const uint32_t color_bpp = total_color_bpp * (msaa ? 4 : 1);
+ const uint32_t depth_bpp = 4 * (msaa ? 4 : 1);
+ do {
+ const uint32_t tile_w = tile_sizes[idx * 2];
+ const uint32_t tile_h = tile_sizes[idx * 2 + 1];
+ if (tile_size_valid(tile_w * tile_h, color_bpp, depth_bpp))
+ break;
+ idx++;
+ } while (idx < ARRAY_SIZE(tile_sizes) / 2);
+
+ /* FIXME: handling of double_buffer is still pending */
+ assert(!double_buffer);
+ } else {
+ /* On V3D 4.x the tile size is selected based on the number of RTs, the
+ * maximum bpp across all of them and whether 4x MSAA is used.
+ */
+ if (color_attachment_count > 4)
+ idx += 3;
+ else if (color_attachment_count > 2)
+ idx += 2;
+ else if (color_attachment_count > 1)
+ idx += 1;
+
+ /* MSAA and double-buffer are mutually exclusive */
+ assert(!msaa || !double_buffer);
+ if (msaa)
+ idx += 2;
+ else if (double_buffer)
+ idx += 1;
+
+ idx += max_internal_bpp;
+ }
+
+ assert(idx < ARRAY_SIZE(tile_sizes) / 2);
+
+ *width = tile_sizes[idx * 2];
+ *height = tile_sizes[idx * 2 + 1];
+}
+
+/* Translates a pipe swizzle to the swizzle values used in the
+ * TEXTURE_SHADER_STATE packet.
+ */
+uint32_t
+v3d_translate_pipe_swizzle(enum pipe_swizzle swizzle)
+{
+ switch (swizzle) {
+ case PIPE_SWIZZLE_0:
+ return 0;
+ case PIPE_SWIZZLE_1:
+ return 1;
+ case PIPE_SWIZZLE_X:
+ case PIPE_SWIZZLE_Y:
+ case PIPE_SWIZZLE_Z:
+ case PIPE_SWIZZLE_W:
+ return 2 + swizzle;
+ default:
+ unreachable("unknown swizzle");
+ }
+}
+
+/* Translates a pipe primitive type to a hw value we can use in the various
+ * draw packets.
+ */
+uint32_t
+v3d_hw_prim_type(enum mesa_prim prim_type)
+{
+ switch (prim_type) {
+ case MESA_PRIM_POINTS:
+ case MESA_PRIM_LINES:
+ case MESA_PRIM_LINE_LOOP:
+ case MESA_PRIM_LINE_STRIP:
+ case MESA_PRIM_TRIANGLES:
+ case MESA_PRIM_TRIANGLE_STRIP:
+ case MESA_PRIM_TRIANGLE_FAN:
+ return prim_type;
+
+ case MESA_PRIM_LINES_ADJACENCY:
+ case MESA_PRIM_LINE_STRIP_ADJACENCY:
+ case MESA_PRIM_TRIANGLES_ADJACENCY:
+ case MESA_PRIM_TRIANGLE_STRIP_ADJACENCY:
+ return 8 + (prim_type - MESA_PRIM_LINES_ADJACENCY);
+
+ default:
+ unreachable("Unsupported primitive type");
+ }
+}
+
+uint32_t
+v3d_internal_bpp_words(uint32_t internal_bpp)
+{
+ switch (internal_bpp) {
+ case 0 /* V3D_INTERNAL_BPP_32 */:
+ return 1;
+ case 1 /* V3D_INTERNAL_BPP_64 */:
+ return 2;
+ case 2 /* V3D_INTERNAL_BPP_128 */:
+ return 4;
+ default:
+ unreachable("Unsupported internal BPP");
+ }
+}
+
+uint32_t
+v3d_compute_rt_row_row_stride_128_bits(uint32_t tile_width,
+ uint32_t bpp)
+{
+ /* The stride is in multiples of 128 bits and covers 2 rows. This is
+ * why we divide by 2 instead of 4: we divide the number of 32-bit
+ * words per row by 2.
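+ * For example, a 64-pixel-wide tile at 128 internal bpp (4 words per
+ * pixel, per v3d_internal_bpp_words above) gives (64 * 4) / 2 = 128
+ * units of 128 bits, i.e. 2048 bytes, exactly two 1024-byte rows.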
+ */ + + return (tile_width * bpp) / 2; +} diff --git a/src/broadcom/common/v3d_util.h b/src/broadcom/common/v3d_util.h index b9804f235ae..cc6b57b27b2 100644 --- a/src/broadcom/common/v3d_util.h +++ b/src/broadcom/common/v3d_util.h @@ -1,5 +1,5 @@ /* - * Copyright © 2021 Raspberry Pi + * Copyright © 2021 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -24,7 +24,10 @@ #ifndef V3D_UTIL_H #define V3D_UTIL_H +#include "util/macros.h" #include "common/v3d_device_info.h" +#include "compiler/shader_enums.h" +#include "util/format/u_formats.h" uint32_t v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo, @@ -34,4 +37,46 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo, uint32_t num_wgs, uint32_t wg_size); +void +v3d_choose_tile_size(const struct v3d_device_info *devinfo, + uint32_t color_attachment_count, + uint32_t max_internal_bpp, + uint32_t total_color_bpp, + bool msaa, + bool double_buffer, + uint32_t *width, + uint32_t *height); + +uint32_t +v3d_translate_pipe_swizzle(enum pipe_swizzle swizzle); + +uint32_t +v3d_hw_prim_type(enum mesa_prim prim_type); + +uint32_t +v3d_internal_bpp_words(uint32_t internal_bpp); + +/* Some configuration packets want the size on log2, but starting at 0 for + * size 8. + */ +static inline uint8_t +log2_tile_size(uint32_t size) +{ + switch(size) { + case 8: + return 0; + case 16: + return 1; + case 32: + return 2; + case 64: + return 3; + default: + unreachable("Unsupported tile width/height"); + } +} + +uint32_t +v3d_compute_rt_row_row_stride_128_bits(uint32_t tile_width, + uint32_t bpp); #endif diff --git a/src/broadcom/compiler/meson.build b/src/broadcom/compiler/meson.build index 95156140ad9..d5aafb3879e 100644 --- a/src/broadcom/compiler/meson.build +++ b/src/broadcom/compiler/meson.build @@ -32,23 +32,22 @@ libbroadcom_compiler_files = files( 'vir_to_qpu.c', 'qpu_schedule.c', 'qpu_validate.c', - 'v3d33_tex.c', - 'v3d40_tex.c', - 'v3d33_vpm_setup.c', + 'v3d_tex.c', 'v3d_compiler.h', 'v3d_nir_lower_io.c', 'v3d_nir_lower_image_load_store.c', 'v3d_nir_lower_line_smooth.c', + 'v3d_nir_lower_load_store_bitsize.c', 'v3d_nir_lower_logic_ops.c', - 'v3d_nir_lower_robust_buffer_access.c', 'v3d_nir_lower_scratch.c', 'v3d_nir_lower_txf_ms.c', + 'v3d_packing.c', ) libbroadcom_compiler = static_library( - ['broadcom_compiler', v3d_xml_pack], - libbroadcom_compiler_files, - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom], + 'broadcom_compiler', + [libbroadcom_compiler_files, v3d_xml_pack], + include_directories : [inc_include, inc_src, inc_gallium, inc_gallium_aux, inc_broadcom], c_args : [no_override_init_args], gnu_symbol_visibility : 'hidden', dependencies : [dep_libdrm, dep_valgrind, idep_nir_headers], diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c index d0a89f1a7d4..acc62a092f2 100644 --- a/src/broadcom/compiler/nir_to_vir.c +++ b/src/broadcom/compiler/nir_to_vir.c @@ -38,7 +38,7 @@ #define __gen_address_type uint32_t #define __gen_address_offset(reloc) (*reloc) #define __gen_emit_reloc(cl, reloc) -#include "cle/v3d_packet_v41_pack.h" +#include "cle/v3d_packet_v42_pack.h" #define GENERAL_TMU_LOOKUP_PER_QUAD (0 << 7) #define GENERAL_TMU_LOOKUP_PER_PIXEL (1 << 7) @@ -164,7 +164,7 @@ vir_emit_thrsw(struct v3d_compile *c) c->last_thrsw->qpu.sig.thrsw = true; c->last_thrsw_at_top_level = 
!c->in_control_flow; - /* We need to lock the scoreboard before any tlb acess happens. If this + /* We need to lock the scoreboard before any tlb access happens. If this * thread switch comes after we have emitted a tlb load, then it means * that we can't lock on the last thread switch any more. */ @@ -187,6 +187,28 @@ v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src) } static uint32_t +v3d_general_tmu_op_for_atomic(nir_intrinsic_instr *instr) +{ + nir_atomic_op atomic_op = nir_intrinsic_atomic_op(instr); + switch (atomic_op) { + case nir_atomic_op_iadd: + return instr->intrinsic == nir_intrinsic_ssbo_atomic ? + v3d_get_op_for_atomic_add(instr, 2) : + v3d_get_op_for_atomic_add(instr, 1); + case nir_atomic_op_imin: return V3D_TMU_OP_WRITE_SMIN; + case nir_atomic_op_umin: return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR; + case nir_atomic_op_imax: return V3D_TMU_OP_WRITE_SMAX; + case nir_atomic_op_umax: return V3D_TMU_OP_WRITE_UMAX; + case nir_atomic_op_iand: return V3D_TMU_OP_WRITE_AND_READ_INC; + case nir_atomic_op_ior: return V3D_TMU_OP_WRITE_OR_READ_DEC; + case nir_atomic_op_ixor: return V3D_TMU_OP_WRITE_XOR_READ_NOT; + case nir_atomic_op_xchg: return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH; + case nir_atomic_op_cmpxchg: return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH; + default: unreachable("unknown atomic op"); + } +} + +static uint32_t v3d_general_tmu_op(nir_intrinsic_instr *instr) { switch (instr->intrinsic) { @@ -195,41 +217,21 @@ v3d_general_tmu_op(nir_intrinsic_instr *instr) case nir_intrinsic_load_uniform: case nir_intrinsic_load_shared: case nir_intrinsic_load_scratch: + case nir_intrinsic_load_global_2x32: case nir_intrinsic_store_ssbo: case nir_intrinsic_store_shared: case nir_intrinsic_store_scratch: + case nir_intrinsic_store_global_2x32: return V3D_TMU_OP_REGULAR; - case nir_intrinsic_ssbo_atomic_add: - return v3d_get_op_for_atomic_add(instr, 2); - case nir_intrinsic_shared_atomic_add: - return v3d_get_op_for_atomic_add(instr, 1); - case nir_intrinsic_ssbo_atomic_imin: - case nir_intrinsic_shared_atomic_imin: - return V3D_TMU_OP_WRITE_SMIN; - case nir_intrinsic_ssbo_atomic_umin: - case nir_intrinsic_shared_atomic_umin: - return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR; - case nir_intrinsic_ssbo_atomic_imax: - case nir_intrinsic_shared_atomic_imax: - return V3D_TMU_OP_WRITE_SMAX; - case nir_intrinsic_ssbo_atomic_umax: - case nir_intrinsic_shared_atomic_umax: - return V3D_TMU_OP_WRITE_UMAX; - case nir_intrinsic_ssbo_atomic_and: - case nir_intrinsic_shared_atomic_and: - return V3D_TMU_OP_WRITE_AND_READ_INC; - case nir_intrinsic_ssbo_atomic_or: - case nir_intrinsic_shared_atomic_or: - return V3D_TMU_OP_WRITE_OR_READ_DEC; - case nir_intrinsic_ssbo_atomic_xor: - case nir_intrinsic_shared_atomic_xor: - return V3D_TMU_OP_WRITE_XOR_READ_NOT; - case nir_intrinsic_ssbo_atomic_exchange: - case nir_intrinsic_shared_atomic_exchange: - return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH; - case nir_intrinsic_ssbo_atomic_comp_swap: - case nir_intrinsic_shared_atomic_comp_swap: - return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH; + + case nir_intrinsic_ssbo_atomic: + case nir_intrinsic_ssbo_atomic_swap: + case nir_intrinsic_shared_atomic: + case nir_intrinsic_shared_atomic_swap: + case nir_intrinsic_global_atomic_2x32: + case nir_intrinsic_global_atomic_swap_2x32: + return v3d_general_tmu_op_for_atomic(instr); + default: unreachable("unknown intrinsic op"); } @@ -270,13 +272,13 @@ ntq_flush_tmu(struct v3d_compile *c) bool emitted_tmuwt = false; for (int i = 0; i < c->tmu.flush_count; i++) { if 
(c->tmu.flush[i].component_mask > 0) { - nir_dest *dest = c->tmu.flush[i].dest; - assert(dest); + nir_def *def = c->tmu.flush[i].def; + assert(def); for (int j = 0; j < 4; j++) { if (c->tmu.flush[i].component_mask & (1 << j)) { - ntq_store_dest(c, dest, j, - vir_MOV(c, vir_LDTMU(c))); + ntq_store_def(c, def, j, + vir_MOV(c, vir_LDTMU(c))); } } } else if (!emitted_tmuwt) { @@ -292,12 +294,12 @@ ntq_flush_tmu(struct v3d_compile *c) /** * Queues a pending thread switch + LDTMU/TMUWT for a TMU operation. The caller - * is reponsible for ensuring that doing this doesn't overflow the TMU fifos, + * is responsible for ensuring that doing this doesn't overflow the TMU fifos, * and more specifically, the output fifo, since that can't stall. */ void ntq_add_pending_tmu_flush(struct v3d_compile *c, - nir_dest *dest, + nir_def *def, uint32_t component_mask) { const uint32_t num_components = util_bitcount(component_mask); @@ -305,13 +307,18 @@ ntq_add_pending_tmu_flush(struct v3d_compile *c, if (num_components > 0) { c->tmu.output_fifo_size += num_components; - if (!dest->is_ssa) - _mesa_set_add(c->tmu.outstanding_regs, dest->reg.reg); + + nir_intrinsic_instr *store = nir_store_reg_for_def(def); + if (store != NULL) { + nir_def *reg = store->src[1].ssa; + _mesa_set_add(c->tmu.outstanding_regs, reg); + } } - c->tmu.flush[c->tmu.flush_count].dest = dest; + c->tmu.flush[c->tmu.flush_count].def = def; c->tmu.flush[c->tmu.flush_count].component_mask = component_mask; c->tmu.flush_count++; + c->tmu.total_count++; if (c->disable_tmu_pipelining) ntq_flush_tmu(c); @@ -342,6 +349,7 @@ emit_tmu_general_store_writes(struct v3d_compile *c, uint32_t base_const_offset, uint32_t *writemask, uint32_t *const_offset, + uint32_t *type_size, uint32_t *tmu_writes) { struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD); @@ -371,7 +379,9 @@ emit_tmu_general_store_writes(struct v3d_compile *c, /* Update the offset for the TMU write based on the * the first component we are writing. */ - *const_offset = base_const_offset + first_component * 4; + *type_size = nir_src_bit_size(instr->src[0]) / 8; + *const_offset = + base_const_offset + first_component * (*type_size); /* Clear these components from the writemask */ uint32_t written_mask = @@ -433,6 +443,7 @@ emit_tmu_general_address_write(struct v3d_compile *c, int offset_src, struct qreg base_offset, uint32_t const_offset, + uint32_t dest_components, uint32_t *tmu_writes) { if (mode == MODE_COUNT) { @@ -478,6 +489,8 @@ emit_tmu_general_address_write(struct v3d_compile *c, if (vir_in_nonuniform_control_flow(c)) vir_set_cond(tmu, V3D_QPU_COND_IFA); + + tmu->ldtmu_count = dest_components; } /** @@ -486,7 +499,7 @@ emit_tmu_general_address_write(struct v3d_compile *c, */ static void ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, - bool is_shared_or_scratch) + bool is_shared_or_scratch, bool is_global) { uint32_t tmu_op = v3d_general_tmu_op(instr); @@ -495,25 +508,32 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, * amount to add/sub, as that is implicit. 
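* For example, an SSBO atomic iadd of constant +1 is replaced with
* V3D_TMU_OP_WRITE_AND_READ_INC (and -1 with
* V3D_TMU_OP_WRITE_OR_READ_DEC), so the source carrying the addend is
* not emitted.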
*/
bool atomic_add_replaced =
- ((instr->intrinsic == nir_intrinsic_ssbo_atomic_add ||
- instr->intrinsic == nir_intrinsic_shared_atomic_add) &&
+ (instr->intrinsic == nir_intrinsic_ssbo_atomic ||
+ instr->intrinsic == nir_intrinsic_shared_atomic ||
+ instr->intrinsic == nir_intrinsic_global_atomic_2x32) &&
+ nir_intrinsic_atomic_op(instr) == nir_atomic_op_iadd &&
(tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC ||
- tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC));
+ tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC);
bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
instr->intrinsic == nir_intrinsic_store_scratch ||
- instr->intrinsic == nir_intrinsic_store_shared);
+ instr->intrinsic == nir_intrinsic_store_shared ||
+ instr->intrinsic == nir_intrinsic_store_global_2x32);
bool is_load = (instr->intrinsic == nir_intrinsic_load_uniform ||
instr->intrinsic == nir_intrinsic_load_ubo ||
instr->intrinsic == nir_intrinsic_load_ssbo ||
instr->intrinsic == nir_intrinsic_load_scratch ||
- instr->intrinsic == nir_intrinsic_load_shared);
+ instr->intrinsic == nir_intrinsic_load_shared ||
+ instr->intrinsic == nir_intrinsic_load_global_2x32);
if (!is_load)
c->tmu_dirty_rcl = true;
- bool has_index = !is_shared_or_scratch;
+ if (is_global)
+ c->has_global_address = true;
+
+ bool has_index = !is_shared_or_scratch && !is_global;
int offset_src;
if (instr->intrinsic == nir_intrinsic_load_uniform) {
@@ -522,6 +542,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
instr->intrinsic == nir_intrinsic_load_ubo ||
instr->intrinsic == nir_intrinsic_load_scratch ||
instr->intrinsic == nir_intrinsic_load_shared ||
+ instr->intrinsic == nir_intrinsic_load_global_2x32 ||
atomic_add_replaced) {
offset_src = 0 + has_index;
} else if (is_store) {
@@ -542,13 +563,11 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
v3d_unit_data_create(0, const_offset));
const_offset = 0;
} else if (instr->intrinsic == nir_intrinsic_load_ubo) {
- uint32_t index = nir_src_as_uint(instr->src[0]);
- /* On OpenGL QUNIFORM_UBO_ADDR takes a UBO index
- * shifted up by 1 (0 is gallium's constant buffer 0).
+ /* QUNIFORM_UBO_ADDR takes a UBO index shifted up by 1 (0
+ * is gallium's constant buffer 0 in GL and push constants
+ * in Vulkan).
*/
- if (c->key->environment == V3D_ENVIRONMENT_OPENGL)
- index++;
-
+ uint32_t index = nir_src_as_uint(instr->src[0]) + 1;
base_offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
v3d_unit_data_create(index, const_offset));
@@ -565,10 +584,16 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
base_offset = c->cs_shared_offset;
const_offset += nir_intrinsic_base(instr);
}
+ } else if (is_global) {
+ /* Global load/store intrinsics use global addresses, so the
+ * offset is the target address and we don't need to add it
+ * to a base offset.
+ */
+ base_offset = vir_uniform_ui(c, 0);
} else {
+ uint32_t idx = is_store ? 1 : 0;
base_offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
- nir_src_as_uint(instr->src[is_store ?
- 1 : 0])); + nir_src_comp_as_uint(instr->src[idx], 0)); } /* We are ready to emit TMU register writes now, but before we actually @@ -588,16 +613,21 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, for (enum emit_mode mode = MODE_COUNT; mode != MODE_LAST; mode++) { assert(mode == MODE_COUNT || tmu_writes > 0); + uint32_t type_size = 4; + if (is_store) { emit_tmu_general_store_writes(c, mode, instr, base_const_offset, &writemask, &const_offset, + &type_size, &tmu_writes); } else if (!is_load && !atomic_add_replaced) { - emit_tmu_general_atomic_writes(c, mode, instr, - tmu_op, has_index, - &tmu_writes); + emit_tmu_general_atomic_writes(c, mode, instr, + tmu_op, has_index, + &tmu_writes); + } else if (is_load) { + type_size = instr->def.bit_size / 8; } /* For atomics we use 32bit except for CMPXCHG, that we need @@ -618,17 +648,40 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, v3d_tmu_get_type_from_op(tmu_op, !is_load) == V3D_TMU_OP_TYPE_ATOMIC; + /* Only load per-quad if we can be certain that all + * lanes in the quad are active. Notice that demoted + * invocations, unlike terminated ones, are still + * active: we want to skip memory writes for them but + * loads should still work. + */ uint32_t perquad = - is_load && !vir_in_nonuniform_control_flow(c) - ? GENERAL_TMU_LOOKUP_PER_QUAD - : GENERAL_TMU_LOOKUP_PER_PIXEL; + is_load && !vir_in_nonuniform_control_flow(c) && + ((c->s->info.stage == MESA_SHADER_FRAGMENT && + c->s->info.fs.needs_quad_helper_invocations && + !c->emitted_discard) || + c->s->info.uses_wide_subgroup_intrinsics) ? + GENERAL_TMU_LOOKUP_PER_QUAD : + GENERAL_TMU_LOOKUP_PER_PIXEL; config = 0xffffff00 | tmu_op << 3 | perquad; if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) { config |= GENERAL_TMU_LOOKUP_TYPE_VEC2; } else if (is_atomic || num_components == 1) { - config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI; + switch (type_size) { + case 4: + config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI; + break; + case 2: + config |= GENERAL_TMU_LOOKUP_TYPE_16BIT_UI; + break; + case 1: + config |= GENERAL_TMU_LOOKUP_TYPE_8BIT_UI; + break; + default: + unreachable("Unsupported bitsize"); + } } else { + assert(type_size == 4); config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 + num_components - 2; } @@ -637,7 +690,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, emit_tmu_general_address_write(c, mode, instr, config, dynamic_src, offset_src, base_offset, const_offset, - &tmu_writes); + dest_components, &tmu_writes); assert(tmu_writes > 0); if (mode == MODE_COUNT) { @@ -660,7 +713,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, */ const uint32_t component_mask = (1 << dest_components) - 1; - ntq_add_pending_tmu_flush(c, &instr->dest, + ntq_add_pending_tmu_flush(c, &instr->def, component_mask); } } @@ -673,7 +726,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, } static struct qreg * -ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def) +ntq_init_ssa_def(struct v3d_compile *c, nir_def *def) { struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, def->num_components); @@ -717,8 +770,8 @@ is_ldunif_signal(const struct v3d_qpu_sig *sig) * its destination to be the NIR reg's destination */ void -ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan, - struct qreg result) +ntq_store_def(struct v3d_compile *c, nir_def *def, int chan, + struct qreg result) { struct qinst *last_inst = NULL; if (!list_is_empty(&c->cur_block->instructions)) @@ -731,23 +784,25 @@ 
ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan, assert(result.file == QFILE_TEMP && last_inst && (last_inst == c->defs[result.index] || is_reused_uniform)); - if (dest->is_ssa) { - assert(chan < dest->ssa.num_components); + nir_intrinsic_instr *store = nir_store_reg_for_def(def); + if (store == NULL) { + assert(chan < def->num_components); struct qreg *qregs; struct hash_entry *entry = - _mesa_hash_table_search(c->def_ht, &dest->ssa); + _mesa_hash_table_search(c->def_ht, def); if (entry) qregs = entry->data; else - qregs = ntq_init_ssa_def(c, &dest->ssa); + qregs = ntq_init_ssa_def(c, def); qregs[chan] = result; } else { - nir_register *reg = dest->reg.reg; - assert(dest->reg.base_offset == 0); - assert(reg->num_array_elems == 0); + nir_def *reg = store->src[1].ssa; + ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg); + assert(nir_intrinsic_base(store) == 0); + assert(nir_intrinsic_num_array_elems(decl) == 0); struct hash_entry *entry = _mesa_hash_table_search(c->def_ht, reg); struct qreg *qregs = entry->data; @@ -802,7 +857,9 @@ struct qreg ntq_get_src(struct v3d_compile *c, nir_src src, int i) { struct hash_entry *entry; - if (src.is_ssa) { + + nir_intrinsic_instr *load = nir_load_reg_for_def(src.ssa); + if (load == NULL) { assert(i < src.ssa->num_components); entry = _mesa_hash_table_search(c->def_ht, src.ssa); @@ -811,10 +868,11 @@ ntq_get_src(struct v3d_compile *c, nir_src src, int i) entry = _mesa_hash_table_search(c->def_ht, src.ssa); } } else { - nir_register *reg = src.reg.reg; - assert(reg->num_array_elems == 0); - assert(src.reg.base_offset == 0); - assert(i < reg->num_components); + nir_def *reg = load->src[0].ssa; + ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg); + assert(nir_intrinsic_base(load) == 0); + assert(nir_intrinsic_num_array_elems(decl) == 0); + assert(i < nir_intrinsic_num_components(decl)); if (_mesa_set_search(c->tmu.outstanding_regs, reg)) ntq_flush_tmu(c); @@ -830,13 +888,8 @@ static struct qreg ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr, unsigned src) { - assert(util_is_power_of_two_or_zero(instr->dest.write_mask)); - unsigned chan = ffs(instr->dest.write_mask) - 1; struct qreg r = ntq_get_src(c, instr->src[src].src, - instr->src[src].swizzle[chan]); - - assert(!instr->src[src].abs); - assert(!instr->src[src].negate); + instr->src[src].swizzle[0]); return r; }; @@ -876,6 +929,7 @@ ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr) case GLSL_SAMPLER_DIM_3D: case GLSL_SAMPLER_DIM_CUBE: case GLSL_SAMPLER_DIM_BUF: + case GLSL_SAMPLER_DIM_EXTERNAL: /* Don't minify the array size. 
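ntq_minify(), used for the non-array dimensions just below, is the usual GL mip-chain halving. Its body is not shown in this hunk, so this is a scalar model under that assumption (MAX2 as in Mesa's util/macros.h):

    static uint32_t
    minify(uint32_t size, uint32_t lod)
    {
            /* each successive LOD halves the dimension, clamped to 1 */
            return MAX2(size >> lod, 1);
    }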
*/ if (!(instr->is_array && i == dest_size - 1)) { size = ntq_minify(c, size, lod); @@ -890,7 +944,7 @@ ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr) unreachable("Bad sampler type"); } - ntq_store_dest(c, &instr->dest, i, size); + ntq_store_def(c, &instr->def, i, size); } } @@ -905,12 +959,12 @@ ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) */ switch (instr->op) { case nir_texop_query_levels: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit)); return; case nir_texop_texture_samples: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_TEXTURE_SAMPLES, unit)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_TEXTURE_SAMPLES, unit)); return; case nir_texop_txs: ntq_emit_txs(c, instr); @@ -919,10 +973,7 @@ ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) break; } - if (c->devinfo->ver >= 40) - v3d40_vir_emit_tex(c, instr); - else - v3d33_vir_emit_tex(c, instr); + v3d_vir_emit_tex(c, instr); } static struct qreg @@ -963,44 +1014,43 @@ emit_fragcoord_input(struct v3d_compile *c, int attr) static struct qreg emit_smooth_varying(struct v3d_compile *c, - struct qreg vary, struct qreg w, struct qreg r5) + struct qreg vary, struct qreg w, struct qreg c_reg) { - return vir_FADD(c, vir_FMUL(c, vary, w), r5); + return vir_FADD(c, vir_FMUL(c, vary, w), c_reg); } static struct qreg emit_noperspective_varying(struct v3d_compile *c, - struct qreg vary, struct qreg r5) + struct qreg vary, struct qreg c_reg) { - return vir_FADD(c, vir_MOV(c, vary), r5); + return vir_FADD(c, vir_MOV(c, vary), c_reg); } static struct qreg emit_flat_varying(struct v3d_compile *c, - struct qreg vary, struct qreg r5) + struct qreg vary, struct qreg c_reg) { vir_MOV_dest(c, c->undef, vary); - return vir_MOV(c, r5); + return vir_MOV(c, c_reg); } static struct qreg emit_fragment_varying(struct v3d_compile *c, nir_variable *var, int8_t input_idx, uint8_t swizzle, int array_index) { - struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3); - struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5); + struct qreg c_reg; /* C coefficient */ + + if (c->devinfo->has_accumulators) + c_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5); + else + c_reg = vir_reg(QFILE_REG, 0); struct qinst *ldvary = NULL; struct qreg vary; - if (c->devinfo->ver >= 41) { - ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef, - c->undef, c->undef); - ldvary->qpu.sig.ldvary = true; - vary = vir_emit_def(c, ldvary); - } else { - vir_NOP(c)->qpu.sig.ldvary = true; - vary = r3; - } + ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef, + c->undef, c->undef); + ldvary->qpu.sig.ldvary = true; + vary = vir_emit_def(c, ldvary); /* Store the input value before interpolation so we can implement * GLSL's interpolateAt functions if the shader uses them. 
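The three helpers above encode the varying equations the ldvary pipeline expects: the hardware hands the QPU an A coefficient ("vary") and leaves the C coefficient in r5 (or a register-file slot on parts without accumulators). A scalar model of the three modes (illustrative; the real INTERP_MODE_* enum has more members):

    static float
    interpolate_varying(enum glsl_interp_mode mode, float vary, float w, float c)
    {
            switch (mode) {
            case INTERP_MODE_SMOOTH:        return vary * w + c; /* perspective-correct */
            case INTERP_MODE_NOPERSPECTIVE: return vary + c;
            case INTERP_MODE_FLAT:          return c;            /* constant per primitive */
            default: unreachable("bad interpolation mode");
            }
    }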
@@ -1008,7 +1058,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, if (input_idx >= 0) { assert(var); c->interp[input_idx].vp = vary; - c->interp[input_idx].C = vir_MOV(c, r5); + c->interp[input_idx].C = vir_MOV(c, c_reg); c->interp[input_idx].mode = var->data.interpolation; } @@ -1018,7 +1068,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, */ if (!var) { assert(input_idx < 0); - return emit_smooth_varying(c, vary, c->payload_w, r5); + return emit_smooth_varying(c, vary, c->payload_w, c_reg); } int i = c->num_inputs++; @@ -1033,20 +1083,20 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, if (var->data.centroid) { BITSET_SET(c->centroid_flags, i); result = emit_smooth_varying(c, vary, - c->payload_w_centroid, r5); + c->payload_w_centroid, c_reg); } else { - result = emit_smooth_varying(c, vary, c->payload_w, r5); + result = emit_smooth_varying(c, vary, c->payload_w, c_reg); } break; case INTERP_MODE_NOPERSPECTIVE: BITSET_SET(c->noperspective_flags, i); - result = emit_noperspective_varying(c, vary, r5); + result = emit_noperspective_varying(c, vary, c_reg); break; case INTERP_MODE_FLAT: BITSET_SET(c->flat_shade_flags, i); - result = emit_flat_varying(c, vary, r5); + result = emit_flat_varying(c, vary, c_reg); break; default: @@ -1163,16 +1213,6 @@ ntq_emit_comparison(struct v3d_compile *c, vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC); break; - case nir_op_i2b32: - vir_set_pf(c, vir_MOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ); - cond_invert = true; - break; - - case nir_op_f2b32: - vir_set_pf(c, vir_FMOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ); - cond_invert = true; - break; - default: return false; } @@ -1188,7 +1228,7 @@ ntq_emit_comparison(struct v3d_compile *c, static struct nir_alu_instr * ntq_get_alu_parent(nir_src src) { - if (!src.is_ssa || src.ssa->parent_instr->type != nir_instr_type_alu) + if (src.ssa->parent_instr->type != nir_instr_type_alu) return NULL; nir_alu_instr *instr = nir_instr_as_alu(src.ssa->parent_instr); if (!instr) @@ -1199,7 +1239,7 @@ ntq_get_alu_parent(nir_src src) * src. */ for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { - if (!instr->src[i].src.is_ssa) + if (nir_load_reg_for_def(instr->src[i].src.ssa)) return NULL; } @@ -1242,12 +1282,78 @@ ntq_emit_cond_to_bool(struct v3d_compile *c, enum v3d_qpu_cond cond) return result; } +static struct qreg +ntq_emit_cond_to_int(struct v3d_compile *c, enum v3d_qpu_cond cond) +{ + struct qreg result = + vir_MOV(c, vir_SEL(c, cond, + vir_uniform_ui(c, 1), + vir_uniform_ui(c, 0))); + c->flags_temp = result.index; + c->flags_cond = cond; + return result; +} + +static struct qreg +f2f16_rtz(struct v3d_compile *c, struct qreg f32) +{ + /* The GPU doesn't provide a mechanism to modify the f32->f16 rounding + * method and seems to be using RTE by default, so we need to implement + * RTZ rounding in software. 
+ */ + struct qreg rf16 = vir_FMOV(c, f32); + vir_set_pack(c->defs[rf16.index], V3D_QPU_PACK_L); + + struct qreg rf32 = vir_FMOV(c, rf16); + vir_set_unpack(c->defs[rf32.index], 0, V3D_QPU_UNPACK_L); + + struct qreg f32_abs = vir_FMOV(c, f32); + vir_set_unpack(c->defs[f32_abs.index], 0, V3D_QPU_UNPACK_ABS); + + struct qreg rf32_abs = vir_FMOV(c, rf32); + vir_set_unpack(c->defs[rf32_abs.index], 0, V3D_QPU_UNPACK_ABS); + + vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), f32_abs, rf32_abs), + V3D_QPU_PF_PUSHN); + return vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA, + vir_SUB(c, rf16, vir_uniform_ui(c, 1)), rf16)); +} + +/** + * Takes the result value of a signed integer width conversion from a smaller + * type to a larger type and if needed, it applies sign extension to it. + */ +static struct qreg +sign_extend(struct v3d_compile *c, + struct qreg value, + uint32_t src_bit_size, + uint32_t dst_bit_size) +{ + assert(src_bit_size < dst_bit_size); + + struct qreg tmp = vir_MOV(c, value); + + /* Do we need to sign-extend? */ + uint32_t sign_mask = 1 << (src_bit_size - 1); + struct qinst *sign_check = + vir_AND_dest(c, vir_nop_reg(), + tmp, vir_uniform_ui(c, sign_mask)); + vir_set_pf(c, sign_check, V3D_QPU_PF_PUSHZ); + + /* If so, fill in leading sign bits */ + uint32_t extend_bits = ~(((1 << src_bit_size) - 1)) & + ((1ull << dst_bit_size) - 1); + struct qinst *extend_inst = + vir_OR_dest(c, tmp, tmp, + vir_uniform_ui(c, extend_bits)); + vir_set_cond(extend_inst, V3D_QPU_COND_IFNA); + + return tmp; +} + static void ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) { - /* This should always be lowered to ALU operations for V3D. */ - assert(!instr->dest.saturate); - /* Vectors are special in that they have non-scalarized writemasks, * and just take the first swizzle channel for each argument in order * into each writemask channel. @@ -1260,8 +1366,8 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) srcs[i] = ntq_get_src(c, instr->src[i].src, instr->src[i].swizzle[0]); for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) - ntq_store_dest(c, &instr->dest.dest, i, - vir_MOV(c, srcs[i])); + ntq_store_def(c, &instr->def, i, + vir_MOV(c, srcs[i])); return; } @@ -1327,6 +1433,94 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) result = vir_AND(c, src[0], vir_uniform_ui(c, 1)); break; + case nir_op_f2f16: + case nir_op_f2f16_rtne: + assert(nir_src_bit_size(instr->src[0].src) == 32); + result = vir_FMOV(c, src[0]); + vir_set_pack(c->defs[result.index], V3D_QPU_PACK_L); + break; + + case nir_op_f2f16_rtz: + assert(nir_src_bit_size(instr->src[0].src) == 32); + result = f2f16_rtz(c, src[0]); + break; + + case nir_op_f2f32: + assert(nir_src_bit_size(instr->src[0].src) == 16); + result = vir_FMOV(c, src[0]); + vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L); + break; + + case nir_op_i2i16: { + uint32_t bit_size = nir_src_bit_size(instr->src[0].src); + assert(bit_size == 32 || bit_size == 8); + if (bit_size == 32) { + /* We don't have integer pack/unpack methods for + * converting between 16-bit and 32-bit, so we implement + * the conversion manually by truncating the src. 
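Returning to f2f16_rtz() above: the same fixup, modeled in scalar C. This sketch assumes the compiler's _Float16 conversion rounds to nearest-even exactly like the QPU's f32->f16 pack does (GCC/Clang extension type):

    #include <math.h>
    #include <stdint.h>

    static uint16_t
    f2f16_rtz_model(float f)
    {
            union { _Float16 h; uint16_t bits; } r = { .h = (_Float16)f };

            /* If RTE rounded away from zero, the round-tripped magnitude
             * exceeds the source: step one ULP back towards zero. The sign
             * bit is safe because a zero magnitude is never rounded up. */
            if (fabsf((float)r.h) > fabsf(f))
                    r.bits--;

            return r.bits;
    }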
+ */ + result = vir_AND(c, src[0], vir_uniform_ui(c, 0xffff)); + } else { + struct qreg tmp = vir_AND(c, src[0], + vir_uniform_ui(c, 0xff)); + result = vir_MOV(c, sign_extend(c, tmp, bit_size, 16)); + } + break; + } + + case nir_op_u2u16: { + uint32_t bit_size = nir_src_bit_size(instr->src[0].src); + assert(bit_size == 32 || bit_size == 8); + + /* We don't have integer pack/unpack methods for converting + * between 16-bit and 32-bit, so we implement the conversion + * manually by truncating the src. For the 8-bit case, we + * want to make sure we don't copy garbage from any of the + * 24 MSB bits. + */ + if (bit_size == 32) + result = vir_AND(c, src[0], vir_uniform_ui(c, 0xffff)); + else + result = vir_AND(c, src[0], vir_uniform_ui(c, 0xff)); + break; + } + + case nir_op_i2i8: + case nir_op_u2u8: + assert(nir_src_bit_size(instr->src[0].src) == 32 || + nir_src_bit_size(instr->src[0].src) == 16); + /* We don't have integer pack/unpack methods for converting + * between 8-bit and 32-bit, so we implement the conversion + * manually by truncating the src. + */ + result = vir_AND(c, src[0], vir_uniform_ui(c, 0xff)); + break; + + case nir_op_u2u32: { + uint32_t bit_size = nir_src_bit_size(instr->src[0].src); + assert(bit_size == 16 || bit_size == 8); + + /* We don't have a native 8-bit/16-bit MOV, so we copy all 32 bits + * from the src but make sure to clear any garbage bits that + * may be present in the invalid src bits. + */ + uint32_t mask = (1 << bit_size) - 1; + result = vir_AND(c, src[0], vir_uniform_ui(c, mask)); + break; + } + + case nir_op_i2i32: { + uint32_t bit_size = nir_src_bit_size(instr->src[0].src); + assert(bit_size == 16 || bit_size == 8); + + uint32_t mask = (1 << bit_size) - 1; + struct qreg tmp = vir_AND(c, src[0], + vir_uniform_ui(c, mask)); + + result = vir_MOV(c, sign_extend(c, tmp, bit_size, 32)); + break; + } + case nir_op_iadd: result = vir_ADD(c, src[0], src[1]); break; @@ -1390,8 +1584,6 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) break; } - case nir_op_i2b32: - case nir_op_f2b32: case nir_op_feq32: case nir_op_fneu32: case nir_op_fge32: @@ -1485,13 +1677,35 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) case nir_op_uadd_carry: vir_set_pf(c, vir_ADD_dest(c, vir_nop_reg(), src[0], src[1]), V3D_QPU_PF_PUSHC); - result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA); + result = ntq_emit_cond_to_int(c, V3D_QPU_COND_IFA); + break; + + case nir_op_usub_borrow: + vir_set_pf(c, vir_SUB_dest(c, vir_nop_reg(), src[0], src[1]), + V3D_QPU_PF_PUSHC); + result = ntq_emit_cond_to_int(c, V3D_QPU_COND_IFA); break; case nir_op_pack_half_2x16_split: result = vir_VFPACK(c, src[0], src[1]); break; + case nir_op_pack_2x32_to_2x16_v3d: + result = vir_VPACK(c, src[0], src[1]); + break; + + case nir_op_pack_32_to_r11g11b10_v3d: + result = vir_V11FPACK(c, src[0], src[1]); + break; + + case nir_op_pack_uint_32_to_r10g10b10a2_v3d: + result = vir_V10PACK(c, src[0], src[1]); + break; + + case nir_op_pack_4x16_to_4x8_v3d: + result = vir_V8PACK(c, src[0], src[1]); + break; + case nir_op_unpack_half_2x16_split_x: result = vir_FMOV(c, src[0]); vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L); @@ -1502,26 +1716,29 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_H); break; - case nir_op_fquantize2f16: { - /* F32 -> F16 -> F32 conversion */ - struct qreg tmp = vir_FMOV(c, src[0]); - vir_set_pack(c->defs[tmp.index], V3D_QPU_PACK_L); - tmp = vir_FMOV(c, tmp); - vir_set_unpack(c->defs[tmp.index], 0, 
V3D_QPU_UNPACK_L); + case nir_op_pack_2x16_to_unorm_2x8_v3d: + result = vir_VFTOUNORM8(c, src[0]); + break; - /* Check for denorm */ - struct qreg abs_src = vir_FMOV(c, src[0]); - vir_set_unpack(c->defs[abs_src.index], 0, V3D_QPU_UNPACK_ABS); - struct qreg threshold = vir_uniform_f(c, ldexpf(1.0, -14)); - vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), abs_src, threshold), - V3D_QPU_PF_PUSHC); + case nir_op_pack_2x16_to_snorm_2x8_v3d: + result = vir_VFTOSNORM8(c, src[0]); + break; - /* Return +/-0 for denorms */ - struct qreg zero = - vir_AND(c, src[0], vir_uniform_ui(c, 0x80000000)); - result = vir_FMOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, tmp, zero)); + case nir_op_pack_2x16_to_unorm_2x10_v3d: + result = vir_VFTOUNORM10LO(c, src[0]); + break; + + case nir_op_pack_2x16_to_unorm_10_2_v3d: + result = vir_VFTOUNORM10HI(c, src[0]); + break; + + case nir_op_f2unorm_16_v3d: + result = vir_FTOUNORM16(c, src[0]); + break; + + case nir_op_f2snorm_16_v3d: + result = vir_FTOSNORM16(c, src[0]); break; - } default: fprintf(stderr, "unknown NIR ALU inst: "); @@ -1530,17 +1747,12 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) abort(); } - /* We have a scalar result, so the instruction should only have a - * single channel written to. - */ - assert(util_is_power_of_two_or_zero(instr->dest.write_mask)); - ntq_store_dest(c, &instr->dest.dest, - ffs(instr->dest.write_mask) - 1, result); + ntq_store_def(c, &instr->def, 0, result); } /* Each TLB read/write setup (a render target or depth buffer) takes an 8-bit * specifier. They come from a register that's preloaded with 0xffffffff - * (0xff gets you normal vec4 f16 RT0 writes), and when one is neaded the low + * (0xff gets you normal vec4 f16 RT0 writes), and when one is needed the low * 8 bits are shifted off the bottom and 0xff shifted in from the top. */ #define TLB_TYPE_F16_COLOR (3 << 6) @@ -1670,15 +1882,6 @@ vir_emit_tlb_color_write(struct v3d_compile *c, unsigned rt) static void emit_frag_end(struct v3d_compile *c) { - /* If the shader has no non-TLB side effects and doesn't write Z - * we can promote it to enabling early_fragment_tests even - * if the user didn't. - */ - if (c->output_position_index == -1 && - !(c->s->info.num_images || c->s->info.num_ssbos)) { - c->s->info.fs.early_fragment_tests = true; - } - if (c->output_sample_mask_index != -1) { vir_SETMSF_dest(c, vir_nop_reg(), vir_AND(c, @@ -1703,55 +1906,75 @@ emit_frag_end(struct v3d_compile *c) } struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU); - if (c->output_position_index != -1 && - !c->s->info.fs.early_fragment_tests) { - struct qinst *inst = vir_MOV_dest(c, tlbu_reg, - c->outputs[c->output_position_index]); - uint8_t tlb_specifier = TLB_TYPE_DEPTH; - if (c->devinfo->ver >= 42) { - tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL | - TLB_SAMPLE_MODE_PER_PIXEL); - } else - tlb_specifier |= TLB_DEPTH_TYPE_PER_PIXEL; + /* If the shader has no non-TLB side effects and doesn't write Z + * we can promote it to enabling early_fragment_tests even + * if the user didn't. 
+ */ + if (c->output_position_index == -1 && + !(c->s->info.num_images || c->s->info.num_ssbos) && + !c->s->info.fs.uses_discard && + !c->s->info.fs.uses_demote && + !c->fs_key->sample_alpha_to_coverage && + c->output_sample_mask_index == -1 && + has_any_tlb_color_write) { + c->s->info.fs.early_fragment_tests = true; + } - inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, - tlb_specifier | - 0xffffff00); + /* By default, Z buffer writes are implicit using the Z values produced + * from FEP (Z value produced from rasterization). When this is not + * desirable (shader writes Z explicitly, has discards, etc.) we need + * to let the hardware know by setting c->writes_z to true, in which + * case we always need to write a Z value from the QPU, even if it is + * just the passthrough Z value produced from FEP. + * + * Also, from the V3D 4.2 spec: + * + * "If a shader performs a Z read the “Fragment shader does Z writes” + * bit in the shader record must be enabled to ensure deterministic + * results" + * + * So if c->reads_z is set we always need to write Z, even if it is + * a passthrough from the Z value produced from FEP. + */ + if (!c->s->info.fs.early_fragment_tests || c->reads_z) { c->writes_z = true; - } else if (c->s->info.fs.uses_discard || - !c->s->info.fs.early_fragment_tests || - c->fs_key->sample_alpha_to_coverage || - !has_any_tlb_color_write) { - /* Emit passthrough Z if it needed to be delayed until shader - * end due to potential discards. - * - * Since (single-threaded) fragment shaders always need a TLB - * write, emit passthrouh Z if we didn't have any color - * buffers and flag us as potentially discarding, so that we - * can use Z as the TLB write. - */ - c->s->info.fs.uses_discard = true; - - struct qinst *inst = vir_MOV_dest(c, tlbu_reg, - vir_nop_reg()); uint8_t tlb_specifier = TLB_TYPE_DEPTH; + struct qinst *inst; + + if (c->output_position_index != -1) { + /* Shader writes to gl_FragDepth, use that */ + inst = vir_MOV_dest(c, tlbu_reg, + c->outputs[c->output_position_index]); + + tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL | + TLB_SAMPLE_MODE_PER_PIXEL); + } else { + /* Shader doesn't write to gl_FragDepth, take Z from + * FEP. + */ + c->writes_z_from_fep = true; + inst = vir_MOV_dest(c, tlbu_reg, vir_nop_reg()); - if (c->devinfo->ver >= 42) { /* The spec says the PER_PIXEL flag is ignored for * invariant writes, but the simulator demands it. */ tlb_specifier |= (TLB_V42_DEPTH_TYPE_INVARIANT | TLB_SAMPLE_MODE_PER_PIXEL); - } else { - tlb_specifier |= TLB_DEPTH_TYPE_INVARIANT; + + /* Since (single-threaded) fragment shaders always need + * a TLB write, if we don't have any we emit a + * passthrough Z and flag us as potentially discarding, + * so that we can use Z as the required TLB write. 
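Condensed, the Z-write decision implemented in this hunk looks like the following sketch (names here are illustrative, not the driver's API): the QPU writes Z whenever early-Z is off or the shader reads Z, and the value comes from FEP unless the shader writes gl_FragDepth itself.

    struct z_write_policy {
            bool writes_z;    /* QPU emits a Z TLB write at all */
            bool z_from_fep;  /* ...and it is the passthrough FEP Z */
    };

    static struct z_write_policy
    pick_z_write(bool early_fragment_tests, bool reads_z, bool writes_frag_depth)
    {
            struct z_write_policy p = { false, false };
            if (!early_fragment_tests || reads_z) {
                    p.writes_z = true;
                    p.z_from_fep = !writes_frag_depth;
            }
            return p;
    }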
+ */ + if (!has_any_tlb_color_write) + c->s->info.fs.uses_discard = true; } - inst->uniform = vir_get_uniform_index(c, - QUNIFORM_CONSTANT, + inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, tlb_specifier | 0xffffff00); - c->writes_z = true; + inst->is_tlb_z_write = true; } /* XXX: Performance improvement: Merge Z write and color writes TLB @@ -1767,7 +1990,6 @@ vir_VPM_WRITE_indirect(struct v3d_compile *c, struct qreg vpm_index, bool uniform_vpm_index) { - assert(c->devinfo->ver >= 40); if (uniform_vpm_index) vir_STVPMV(c, vpm_index, val); else @@ -1777,13 +1999,8 @@ vir_VPM_WRITE_indirect(struct v3d_compile *c, static void vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index) { - if (c->devinfo->ver >= 40) { - vir_VPM_WRITE_indirect(c, val, - vir_uniform_ui(c, vpm_index), true); - } else { - /* XXX: v3d33_vir_vpm_write_setup(c); */ - vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val); - } + vir_VPM_WRITE_indirect(c, val, + vir_uniform_ui(c, vpm_index), true); } static void @@ -1791,7 +2008,7 @@ emit_vert_end(struct v3d_compile *c) { /* GFXH-1684: VPM writes need to be complete by the end of the shader. */ - if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42) + if (c->devinfo->ver == 42) vir_VPMWT(c); } @@ -1800,7 +2017,7 @@ emit_geom_end(struct v3d_compile *c) { /* GFXH-1684: VPM writes need to be complete by the end of the shader. */ - if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42) + if (c->devinfo->ver == 42) vir_VPMWT(c); } @@ -1812,8 +2029,11 @@ mem_vectorize_callback(unsigned align_mul, unsigned align_offset, nir_intrinsic_instr *high, void *data) { - /* Our backend is 32-bit only at present */ - if (bit_size != 32) + /* TMU general access only supports 32-bit vectors */ + if (bit_size > 32) + return false; + + if ((bit_size == 8 || bit_size == 16) && num_components > 1) return false; if (align_mul % 4 != 0 || align_offset % 4 != 0) @@ -1843,7 +2063,29 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s) do { progress = false; - NIR_PASS_V(s, nir_lower_vars_to_ssa); + NIR_PASS(progress, s, nir_split_array_vars, nir_var_function_temp); + NIR_PASS(progress, s, nir_shrink_vec_array_vars, nir_var_function_temp); + NIR_PASS(progress, s, nir_opt_deref); + + NIR_PASS(progress, s, nir_lower_vars_to_ssa); + if (!s->info.var_copies_lowered) { + /* Only run this pass if nir_lower_var_copies was not called + * yet. That would lower away any copy_deref instructions and we + * don't want to introduce any more. 
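For reference, the constraints mem_vectorize_callback() (earlier in this hunk) imposes, restated as a standalone predicate; the real callback applies further checks that fall outside the visible diff context:

    static bool
    vectorize_ok(unsigned bit_size, unsigned num_components,
                 unsigned align_mul, unsigned align_offset)
    {
            if (bit_size > 32)
                    return false; /* TMU general vectors are 32-bit */
            if ((bit_size == 8 || bit_size == 16) && num_components > 1)
                    return false; /* sub-dword types only as scalars */
            if (align_mul % 4 != 0 || align_offset % 4 != 0)
                    return false; /* combined access must stay 4-byte aligned */
            return true;
    }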
+ */ + NIR_PASS(progress, s, nir_opt_find_array_copies); + } + + NIR_PASS(progress, s, nir_opt_copy_prop_vars); + NIR_PASS(progress, s, nir_opt_dead_write_vars); + NIR_PASS(progress, s, nir_opt_combine_stores, nir_var_all); + + NIR_PASS(progress, s, nir_remove_dead_variables, + (nir_variable_mode)(nir_var_function_temp | + nir_var_shader_temp | + nir_var_mem_shared), + NULL); + NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL); NIR_PASS(progress, s, nir_lower_phis_to_scalar, false); NIR_PASS(progress, s, nir_copy_prop); @@ -1851,10 +2093,39 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s) NIR_PASS(progress, s, nir_opt_dce); NIR_PASS(progress, s, nir_opt_dead_cf); NIR_PASS(progress, s, nir_opt_cse); - NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true); + NIR_PASS(progress, s, nir_opt_peephole_select, 0, false, false); + NIR_PASS(progress, s, nir_opt_peephole_select, 24, true, true); NIR_PASS(progress, s, nir_opt_algebraic); NIR_PASS(progress, s, nir_opt_constant_folding); + NIR_PASS(progress, s, nir_opt_intrinsics); + NIR_PASS(progress, s, nir_opt_idiv_const, 32); + NIR_PASS(progress, s, nir_lower_alu); + + if (nir_opt_loop(s)) { + progress = true; + NIR_PASS(progress, s, nir_copy_prop); + NIR_PASS(progress, s, nir_opt_dce); + } + + NIR_PASS(progress, s, nir_opt_conditional_discard); + + NIR_PASS(progress, s, nir_opt_remove_phis); + NIR_PASS(progress, s, nir_opt_if, false); + if (c && !c->disable_gcm) { + bool local_progress = false; + NIR_PASS(local_progress, s, nir_opt_gcm, false); + c->gcm_progress |= local_progress; + progress |= local_progress; + } + + /* Note that vectorization may undo the load/store scalarization + * pass we run for non 32-bit TMU general load/store by + * converting, for example, 2 consecutive 16-bit loads into a + * single 32-bit load. This is fine (and desirable) as long as + * the resulting 32-bit load meets 32-bit alignment requirements, + * which mem_vectorize_callback() should be enforcing. + */ nir_load_store_vectorize_options vectorize_opts = { .modes = nir_var_mem_ssbo | nir_var_mem_ubo | nir_var_mem_push_const | nir_var_mem_shared | @@ -1862,7 +2133,24 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s) .callback = mem_vectorize_callback, .robust_modes = 0, }; - NIR_PASS(progress, s, nir_opt_load_store_vectorize, &vectorize_opts); + bool vectorize_progress = false; + + + /* This requires that we have called + * nir_lower_vars_to_explicit_types / nir_lower_explicit_io + * first, which we may not have done yet if we call here too + * early during NIR pre-processing. 
We can detect this because + * in that case we won't have a compile object + */ + if (c) { + NIR_PASS(vectorize_progress, s, nir_opt_load_store_vectorize, + &vectorize_opts); + if (vectorize_progress) { + NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL); + NIR_PASS(progress, s, nir_lower_pack); + progress = true; + } + } if (lower_flrp != 0) { bool lower_flrp_progress = false; @@ -1895,10 +2183,8 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s) nir_move_options sink_opts = nir_move_const_undef | nir_move_comparisons | nir_move_copies | - nir_move_load_ubo; + nir_move_load_ubo | nir_move_load_ssbo | nir_move_load_uniform; NIR_PASS(progress, s, nir_opt_sink, sink_opts); - - NIR_PASS(progress, s, nir_opt_move, nir_move_load_ubo); } static int @@ -1915,27 +2201,9 @@ ntq_emit_vpm_read(struct v3d_compile *c, uint32_t *remaining, uint32_t vpm_index) { - struct qreg vpm = vir_reg(QFILE_VPM, vpm_index); - - if (c->devinfo->ver >= 40 ) { - return vir_LDVPMV_IN(c, - vir_uniform_ui(c, - (*num_components_queued)++)); - } - - if (*num_components_queued != 0) { - (*num_components_queued)--; - return vir_MOV(c, vpm); - } - - uint32_t num_components = MIN2(*remaining, 32); - - v3d33_vir_vpm_read_setup(c, num_components); - - *num_components_queued = num_components - 1; - *remaining -= num_components; - - return vir_MOV(c, vpm); + return vir_LDVPMV_IN(c, + vir_uniform_ui(c, + (*num_components_queued)++)); } static void @@ -2005,31 +2273,8 @@ ntq_setup_vs_inputs(struct v3d_compile *c) } /* The actual loads will happen directly in nir_intrinsic_load_input - * on newer versions. */ - if (c->devinfo->ver >= 40) - return; - - for (int loc = 0; loc < ARRAY_SIZE(c->vattr_sizes); loc++) { - resize_qreg_array(c, &c->inputs, &c->inputs_array_size, - (loc + 1) * 4); - - for (int i = 0; i < c->vattr_sizes[loc]; i++) { - c->inputs[loc * 4 + i] = - ntq_emit_vpm_read(c, - &vpm_components_queued, - &num_components, - loc * 4 + i); - - } - } - - if (c->devinfo->ver >= 40) { - assert(vpm_components_queued == num_components); - } else { - assert(vpm_components_queued == 0); - assert(num_components == 0); - } + return; } static bool @@ -2058,14 +2303,14 @@ ntq_setup_gs_inputs(struct v3d_compile *c) */ assert(glsl_type_is_array(var->type)); const struct glsl_type *type = glsl_get_array_element(var->type); - unsigned array_len = MAX2(glsl_get_length(type), 1); + unsigned var_len = glsl_count_vec4_slots(type, false, false); unsigned loc = var->data.driver_location; resize_qreg_array(c, &c->inputs, &c->inputs_array_size, - (loc + array_len) * 4); + (loc + var_len) * 4); if (var->data.compact) { - for (unsigned j = 0; j < array_len; j++) { + for (unsigned j = 0; j < var_len; j++) { unsigned input_idx = c->num_inputs++; unsigned loc_frac = var->data.location_frac + j; unsigned loc = var->data.location + loc_frac / 4; @@ -2076,8 +2321,10 @@ ntq_setup_gs_inputs(struct v3d_compile *c) continue; } - for (unsigned j = 0; j < array_len; j++) { - unsigned num_elements = glsl_get_vector_elements(type); + for (unsigned j = 0; j < var_len; j++) { + unsigned num_elements = + glsl_type_is_struct(glsl_without_array(type)) ? 
+ 4 : glsl_get_vector_elements(type); for (unsigned k = 0; k < num_elements; k++) { unsigned chan = var->data.location_frac + k; unsigned input_idx = c->num_inputs++; @@ -2124,7 +2371,7 @@ ntq_setup_fs_inputs(struct v3d_compile *c) } else if (var->data.compact) { for (int j = 0; j < var_len; j++) emit_compact_fragment_input(c, loc, var, j); - } else if (glsl_type_is_struct(var->type)) { + } else if (glsl_type_is_struct(glsl_without_array(var->type))) { for (int j = 0; j < var_len; j++) { emit_fragment_input(c, loc, var, j, 4); } @@ -2143,12 +2390,9 @@ ntq_setup_outputs(struct v3d_compile *c) return; nir_foreach_shader_out_variable(var, c->s) { - unsigned array_len = MAX2(glsl_get_length(var->type), 1); + assert(glsl_type_is_vector_or_scalar(var->type)); unsigned loc = var->data.driver_location * 4; - assert(array_len == 1); - (void)array_len; - for (int i = 0; i < 4 - var->data.location_frac; i++) { add_output(c, loc + var->data.location_frac + i, var->data.location, @@ -2157,15 +2401,17 @@ ntq_setup_outputs(struct v3d_compile *c) switch (var->data.location) { case FRAG_RESULT_COLOR: - c->output_color_var[0] = var; - c->output_color_var[1] = var; - c->output_color_var[2] = var; - c->output_color_var[3] = var; + for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) + c->output_color_var[i] = var; break; case FRAG_RESULT_DATA0: case FRAG_RESULT_DATA1: case FRAG_RESULT_DATA2: case FRAG_RESULT_DATA3: + case FRAG_RESULT_DATA4: + case FRAG_RESULT_DATA5: + case FRAG_RESULT_DATA6: + case FRAG_RESULT_DATA7: c->output_color_var[var->data.location - FRAG_RESULT_DATA0] = var; break; @@ -2185,17 +2431,19 @@ ntq_setup_outputs(struct v3d_compile *c) * Each nir_register gets a struct qreg per 32-bit component being stored. */ static void -ntq_setup_registers(struct v3d_compile *c, struct exec_list *list) +ntq_setup_registers(struct v3d_compile *c, nir_function_impl *impl) { - foreach_list_typed(nir_register, nir_reg, node, list) { - unsigned array_len = MAX2(nir_reg->num_array_elems, 1); + nir_foreach_reg_decl(decl, impl) { + unsigned num_components = nir_intrinsic_num_components(decl); + unsigned array_len = nir_intrinsic_num_array_elems(decl); + array_len = MAX2(array_len, 1); struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, - array_len * - nir_reg->num_components); + array_len * num_components); + nir_def *nir_reg = &decl->def; _mesa_hash_table_insert(c->def_ht, nir_reg, qregs); - for (int i = 0; i < array_len * nir_reg->num_components; i++) + for (int i = 0; i < array_len * num_components; i++) qregs[i] = vir_get_temp(c); } } @@ -2222,23 +2470,23 @@ ntq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr) assert(nir_src_as_uint(instr->src[1]) == 0); - ntq_store_dest(c, &instr->dest, 0, + ntq_store_def(c, &instr->def, 0, vir_uniform(c, QUNIFORM_IMAGE_WIDTH, image_index)); if (instr->num_components > 1) { - ntq_store_dest(c, &instr->dest, 1, - vir_uniform(c, - instr->num_components == 2 && is_array ? - QUNIFORM_IMAGE_ARRAY_SIZE : - QUNIFORM_IMAGE_HEIGHT, - image_index)); + ntq_store_def(c, &instr->def, 1, + vir_uniform(c, + instr->num_components == 2 && is_array ? + QUNIFORM_IMAGE_ARRAY_SIZE : + QUNIFORM_IMAGE_HEIGHT, + image_index)); } if (instr->num_components > 2) { - ntq_store_dest(c, &instr->dest, 2, - vir_uniform(c, - is_array ? - QUNIFORM_IMAGE_ARRAY_SIZE : - QUNIFORM_IMAGE_DEPTH, - image_index)); + ntq_store_def(c, &instr->def, 2, + vir_uniform(c, + is_array ? 
+ QUNIFORM_IMAGE_ARRAY_SIZE : + QUNIFORM_IMAGE_DEPTH, + image_index)); } } @@ -2263,16 +2511,14 @@ vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr) * * To fix that, we make sure we always emit a thread switch before the * first tlb color read. If that happens to be the last thread switch - * we emit, then everything is fine, but otherwsie, if any code after + * we emit, then everything is fine, but otherwise, if any code after * this point needs to emit additional thread switches, then we will * switch the strategy to locking the scoreboard on the first thread * switch instead -- see vir_emit_thrsw(). */ if (!c->emitted_tlb_load) { - if (!c->last_thrsw_at_top_level) { - assert(c->devinfo->ver >= 41); + if (!c->last_thrsw_at_top_level) vir_emit_thrsw(c); - } c->emitted_tlb_load = true; } @@ -2371,27 +2617,96 @@ vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr) } assert(color_reads_for_sample[component].file != QFILE_NULL); - ntq_store_dest(c, &instr->dest, 0, - vir_MOV(c, color_reads_for_sample[component])); + ntq_store_def(c, &instr->def, 0, + vir_MOV(c, color_reads_for_sample[component])); +} + +static bool +ntq_emit_load_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr); + +static bool +try_emit_uniform(struct v3d_compile *c, + int offset, + int num_components, + nir_def *def, + enum quniform_contents contents) +{ + /* Even though ldunif is strictly 32-bit we can still use it + * to load scalar 8-bit/16-bit uniforms so long as their offset + * is 32-bit aligned. In this case, ldunif would still load + * 32-bit into the destination with the 8-bit/16-bit uniform + * data in the LSB and garbage in the MSB, but that is fine + * because we should only be accessing the valid bits of the + * destination. + * + * FIXME: if in the future we improve our register allocator to + * pack 2 16-bit variables in the MSB and LSB of the same + * register then this optimization would not be valid as is, + * since the load clobbers the MSB. + */ + if (offset % 4 != 0) + return false; + + /* We need dwords */ + offset = offset / 4; + + for (int i = 0; i < num_components; i++) { + ntq_store_def(c, def, i, vir_uniform(c, contents, offset + i)); + } + + return true; } static void ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr) { + /* We scalarize general TMU access for anything that is not 32-bit. 
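A worked example of the rule in try_emit_uniform() above: a 16-bit scalar uniform at byte offset 8 sits in the low bits of dword 2 and can be served by ldunif; at byte offset 6 it is not 32-bit aligned and must take the general TMU path instead. As a predicate (illustrative helper, not driver API):

    static bool
    ldunif_can_load(int byte_offset, int *dword_index)
    {
            if (byte_offset % 4 != 0)
                    return false; /* unaligned: fall back to the TMU */
            *dword_index = byte_offset / 4; /* valid data lands in the LSBs */
            return true;
    }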
+ */ + assert(instr->def.bit_size == 32 || + instr->num_components == 1); + + /* Try to emit ldunif if possible, otherwise fall back to general TMU */ if (nir_src_is_const(instr->src[0])) { int offset = (nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0])); - assert(offset % 4 == 0); - /* We need dwords */ - offset = offset / 4; - for (int i = 0; i < instr->num_components; i++) { - ntq_store_dest(c, &instr->dest, i, - vir_uniform(c, QUNIFORM_UNIFORM, - offset + i)); + + if (try_emit_uniform(c, offset, instr->num_components, + &instr->def, QUNIFORM_UNIFORM)) { + return; + } + } + + if (!ntq_emit_load_unifa(c, instr)) { + ntq_emit_tmu_general(c, instr, false, false); + c->has_general_tmu_load = true; + } +} + +static bool +ntq_emit_inline_ubo_load(struct v3d_compile *c, nir_intrinsic_instr *instr) +{ + if (c->compiler->max_inline_uniform_buffers <= 0) + return false; + + /* Regular UBOs start after inline UBOs */ + uint32_t index = nir_src_as_uint(instr->src[0]); + if (index >= c->compiler->max_inline_uniform_buffers) + return false; + + /* We scalarize general TMU access for anything that is not 32-bit */ + assert(instr->def.bit_size == 32 || + instr->num_components == 1); + + if (nir_src_is_const(instr->src[1])) { + int offset = nir_src_as_uint(instr->src[1]); + if (try_emit_uniform(c, offset, instr->num_components, + &instr->def, + QUNIFORM_INLINE_UBO_0 + index)) { + return true; } - } else { - ntq_emit_tmu_general(c, instr, false); } + + /* Fall back to regular UBO load */ + return false; } static void @@ -2411,7 +2726,7 @@ ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr) unsigned offset = nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0]); - if (c->s->info.stage != MESA_SHADER_FRAGMENT && c->devinfo->ver >= 40) { + if (c->s->info.stage != MESA_SHADER_FRAGMENT) { /* Emit the LDVPM directly now, rather than at the top * of the shader like we did for V3D 3.x (which needs * vpmsetup when not just taking the next offset). @@ -2433,19 +2748,38 @@ ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr) SYSTEM_VALUE_VERTEX_ID)) { index++; } - for (int i = 0; i < offset; i++) - index += c->vattr_sizes[i]; + + for (int i = 0; i < offset; i++) { + /* GFXH-1602: if any builtins (vid, iid, etc) are read then + * attribute 0 must be active (size > 0). When we hit this, + * the driver is expected to program attribute 0 to have a + * size of 1, so here we need to add that. 
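A sketch of the GFXH-1602 index walk implemented right below, with hypothetical numbers: with vattr_sizes = {0, 3} in a coord shader that reads a builtin, attribute 0 is inactive in the key but the driver programs it with size 1, so location 1 starts one VPM slot later than the sizes alone suggest.

    static uint32_t
    vpm_index_for_attr(const uint8_t *vattr_sizes, bool is_coord,
                       uint32_t index /* slots used by builtins */, uint32_t attr)
    {
            for (uint32_t i = 0; i < attr; i++) {
                    if (i == 0 && is_coord && vattr_sizes[0] == 0 && index > 0)
                            index++; /* count the dummy size-1 attribute 0 */
                    else
                            index += vattr_sizes[i];
            }
            return index;
    }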
+ */ + if (i == 0 && c->vs_key->is_coord && + c->vattr_sizes[i] == 0 && index > 0) { + index++; + } else { + index += c->vattr_sizes[i]; + } + } + index += nir_intrinsic_component(instr); for (int i = 0; i < instr->num_components; i++) { struct qreg vpm_offset = vir_uniform_ui(c, index++); - ntq_store_dest(c, &instr->dest, i, - vir_LDVPMV_IN(c, vpm_offset)); + ntq_store_def(c, &instr->def, i, + vir_LDVPMV_IN(c, vpm_offset)); } } else { for (int i = 0; i < instr->num_components; i++) { int comp = nir_intrinsic_component(instr) + i; - ntq_store_dest(c, &instr->dest, i, - vir_MOV(c, c->inputs[offset * 4 + comp])); + struct qreg input = c->inputs[offset * 4 + comp]; + ntq_store_def(c, &instr->def, i, vir_MOV(c, input)); + + if (c->s->info.stage == MESA_SHADER_FRAGMENT && + input.file == c->payload_z.file && + input.index == c->payload_z.index) { + c->reads_z = true; + } } } } @@ -2610,18 +2944,18 @@ ntq_get_barycentric_centroid(struct v3d_compile *c, /* sN = TRUE if sample N enabled in sample mask, FALSE otherwise */ struct qreg F = vir_uniform_ui(c, 0); struct qreg T = vir_uniform_ui(c, ~0); - struct qreg s0 = vir_XOR(c, vir_AND(c, sample_mask, i1), i1); + struct qreg s0 = vir_AND(c, sample_mask, i1); vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ); - s0 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); - struct qreg s1 = vir_XOR(c, vir_AND(c, sample_mask, i2), i2); + s0 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F); + struct qreg s1 = vir_AND(c, sample_mask, i2); vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ); - s1 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); - struct qreg s2 = vir_XOR(c, vir_AND(c, sample_mask, i4), i4); + s1 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F); + struct qreg s2 = vir_AND(c, sample_mask, i4); vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ); - s2 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); - struct qreg s3 = vir_XOR(c, vir_AND(c, sample_mask, i8), i8); + s2 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F); + struct qreg s3 = vir_AND(c, sample_mask, i8); vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s3), V3D_QPU_PF_PUSHZ); - s3 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); + s3 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F); /* sample_idx = s0 ? 0 : s2 ? 2 : s1 ? 1 : 3 */ struct qreg sample_idx = i3; @@ -2708,28 +3042,142 @@ emit_ldunifa(struct v3d_compile *c, struct qreg *result) c->current_unifa_offset += 4; } -static void -ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr) +/* Checks if the value of a nir src is derived from a nir register */ +static bool +nir_src_derived_from_reg(nir_src src) +{ + nir_def *def = src.ssa; + if (nir_load_reg_for_def(def)) + return true; + + nir_instr *parent = def->parent_instr; + switch (parent->type) { + case nir_instr_type_alu: { + nir_alu_instr *alu = nir_instr_as_alu(parent); + int num_srcs = nir_op_infos[alu->op].num_inputs; + for (int i = 0; i < num_srcs; i++) { + if (nir_src_derived_from_reg(alu->src[i].src)) + return true; + } + return false; + } + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent); + int num_srcs = nir_intrinsic_infos[intr->intrinsic].num_srcs; + for (int i = 0; i < num_srcs; i++) { + if (nir_src_derived_from_reg(intr->src[i])) + return true; + } + return false; + } + case nir_instr_type_load_const: + case nir_instr_type_undef: + return false; + default: + /* By default we assume it may come from a register; the above + * cases should be able to handle the majority of situations, + * though. 
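Back in ntq_get_barycentric_centroid above: after the switch from XOR/IFA to AND/IFNA, the sN temporaries are TRUE exactly when sample N is enabled, and the nested SELs implement the documented priority. A scalar model of that pick:

    static int
    pick_centroid_sample(uint8_t sample_mask)
    {
            /* sample_idx = s0 ? 0 : s2 ? 2 : s1 ? 1 : 3 */
            if (sample_mask & 1) return 0;
            if (sample_mask & 4) return 2;
            if (sample_mask & 2) return 1;
            return 3;
    }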
+ */ + return true; + }; +} + +static bool +ntq_emit_load_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr) { + assert(instr->intrinsic == nir_intrinsic_load_ubo || + instr->intrinsic == nir_intrinsic_load_ssbo || + instr->intrinsic == nir_intrinsic_load_uniform); + + bool is_uniform = instr->intrinsic == nir_intrinsic_load_uniform; + bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo; + bool is_ssbo = instr->intrinsic == nir_intrinsic_load_ssbo; + /* Every ldunifa auto-increments the unifa address by 4 bytes, so our * current unifa offset is 4 bytes ahead of the offset of the last load. */ static const int32_t max_unifa_skip_dist = MAX_UNIFA_SKIP_DISTANCE - 4; - bool dynamic_src = !nir_src_is_const(instr->src[1]); - uint32_t const_offset = - dynamic_src ? 0 : nir_src_as_uint(instr->src[1]); + /* We can only use unifa if the offset is uniform */ + nir_src offset = is_uniform ? instr->src[0] : instr->src[1]; + if (nir_src_is_divergent(offset)) + return false; - /* On OpenGL QUNIFORM_UBO_ADDR takes a UBO index - * shifted up by 1 (0 is gallium's constant buffer 0). + /* Emitting loads from unifa may not be safe under non-uniform control + * flow. It seems the address that is used to write to the unifa + * register is taken from the first lane and if that lane is disabled + * by control flow then the value we read may be bogus and lead to + * invalid memory accesses on follow-up ldunifa instructions. However, + * ntq_store_def only emits conditional writes for nir registers, so as + * long as we can be certain that the offset isn't derived from a + * load_reg we should be fine. + * + * The following CTS test can be used to trigger the problem, which + * causes GMP violations in the sim without this check: + * dEQP-VK.subgroups.ballot_broadcast.graphics.subgroupbroadcastfirst_int */ - uint32_t index = nir_src_as_uint(instr->src[0]); - if (c->key->environment == V3D_ENVIRONMENT_OPENGL) + if (vir_in_nonuniform_control_flow(c) && + nir_src_derived_from_reg(offset)) { + return false; + } + + /* We can only use unifa with SSBOs if they are read-only. Otherwise + * ldunifa won't see the shader writes to that address (possibly + * because ldunifa doesn't read from the L2T cache). + */ + if (is_ssbo && !(nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE)) + return false; + + /* Just as with SSBOs, we can't use ldunifa to read indirect uniforms + * that may have been written to scratch using the TMU. + */ + bool dynamic_src = !nir_src_is_const(offset); + if (is_uniform && dynamic_src && c->s->scratch_size > 0) + return false; + + uint32_t const_offset = dynamic_src ? 0 : nir_src_as_uint(offset); + if (is_uniform) + const_offset += nir_intrinsic_base(instr); + + /* ldunifa is a 32-bit load instruction so we can only use it with + * 32-bit aligned addresses. We always produce 32-bit aligned addresses + * except for types smaller than 32-bit, so in these cases we can only + * use ldunifa if we can verify alignment, which we can only do for + * loads with a constant offset. + */ + uint32_t bit_size = instr->def.bit_size; + uint32_t value_skips = 0; + if (bit_size < 32) { + if (dynamic_src) { + return false; + } else if (const_offset % 4 != 0) { + /* If we are loading from an unaligned offset, fix + * alignment and skip over unused elements in result. 
+ */ + value_skips = (const_offset % 4) / (bit_size / 8); + const_offset &= ~0x3; + } + } + + assert((bit_size == 32 && value_skips == 0) || + (bit_size == 16 && value_skips <= 1) || + (bit_size == 8 && value_skips <= 3)); + + /* Both Vulkan and OpenGL reserve index 0 for uniforms / push + * constants. + */ + uint32_t index = is_uniform ? 0 : nir_src_as_uint(instr->src[0]); + + /* QUNIFORM_UBO_ADDR takes a UBO index shifted up by 1 since we use + * index 0 for Gallium's constant buffer (GL) or push constants + * (Vulkan). + */ + if (is_ubo) index++; /* We can only keep track of the last unifa address we used with - * constant offset loads. If the new load targets the same UBO and + * constant offset loads. If the new load targets the same buffer and * is close enough to the previous load, we can skip the unifa register * write by emitting dummy ldunifa instructions to update the unifa * address. @@ -2739,6 +3187,7 @@ ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr) if (dynamic_src) { c->current_unifa_block = NULL; } else if (c->cur_block == c->current_unifa_block && + c->current_unifa_is_ubo == !is_ssbo && c->current_unifa_index == index && c->current_unifa_offset <= const_offset && c->current_unifa_offset + max_unifa_skip_dist >= const_offset) { @@ -2746,32 +3195,98 @@ ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr) ldunifa_skips = (const_offset - c->current_unifa_offset) / 4; } else { c->current_unifa_block = c->cur_block; + c->current_unifa_is_ubo = !is_ssbo; c->current_unifa_index = index; c->current_unifa_offset = const_offset; } if (!skip_unifa) { - struct qreg base_offset = + struct qreg base_offset = !is_ssbo ? vir_uniform(c, QUNIFORM_UBO_ADDR, - v3d_unit_data_create(index, const_offset)); + v3d_unit_data_create(index, const_offset)) : + vir_uniform(c, QUNIFORM_SSBO_OFFSET, index); struct qreg unifa = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA); if (!dynamic_src) { - vir_MOV_dest(c, unifa, base_offset); + if (!is_ssbo) { + /* Avoid the extra MOV to UNIFA by making + * ldunif load directly into it. We can't + * do this if we have not actually emitted + * ldunif and are instead reusing a previous + * one. + */ + struct qinst *inst = + (struct qinst *)c->cur_block->instructions.prev; + if (inst == c->defs[base_offset.index]) { + inst->dst = unifa; + c->defs[base_offset.index] = NULL; + } else { + vir_MOV_dest(c, unifa, base_offset); + } + } else { + vir_ADD_dest(c, unifa, base_offset, + vir_uniform_ui(c, const_offset)); + } } else { vir_ADD_dest(c, unifa, base_offset, - ntq_get_src(c, instr->src[1], 0)); + ntq_get_src(c, offset, 0)); } } else { for (int i = 0; i < ldunifa_skips; i++) emit_ldunifa(c, NULL); } - for (uint32_t i = 0; i < nir_intrinsic_dest_components(instr); i++) { + uint32_t num_components = nir_intrinsic_dest_components(instr); + for (uint32_t i = 0; i < num_components; ) { struct qreg data; emit_ldunifa(c, &data); - ntq_store_dest(c, &instr->dest, i, vir_MOV(c, data)); + + if (bit_size == 32) { + assert(value_skips == 0); + ntq_store_def(c, &instr->def, i, vir_MOV(c, data)); + i++; + } else { + assert((bit_size == 16 && value_skips <= 1) || + (bit_size == 8 && value_skips <= 3)); + + /* If we have any values to skip, shift to the first + * valid value in the ldunifa result. + */ + if (value_skips > 0) { + data = vir_SHR(c, data, + vir_uniform_ui(c, bit_size * + value_skips)); + } + + /* Check how many valid components we have discounting + * read components to skip. 
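A scalar model of the component loop that follows, for the sub-32-bit case: components are pulled out of consecutive 32-bit ldunifa results, with value_skips components of the first word discarded to compensate for the aligned-down constant offset (bit_size is 8 or 16 here, as the asserts below enforce).

    static void
    unpack_ldunifa_words(const uint32_t *words, unsigned bit_size,
                         unsigned value_skips, uint32_t *out, unsigned count)
    {
            const uint32_t mask = (1u << bit_size) - 1;
            const unsigned per_word = 32 / bit_size;
            unsigned word = 0, slot = value_skips;

            for (unsigned i = 0; i < count; i++) {
                    out[i] = (words[word] >> (slot * bit_size)) & mask;
                    if (++slot == per_word) {
                            slot = 0;
                            word++;
                    }
            }
    }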
+ */ + uint32_t valid_count = (32 / bit_size) - value_skips; + assert((bit_size == 16 && valid_count <= 2) || + (bit_size == 8 && valid_count <= 4)); + assert(valid_count > 0); + + /* Process the valid components */ + do { + struct qreg tmp; + uint32_t mask = (1 << bit_size) - 1; + tmp = vir_AND(c, vir_MOV(c, data), + vir_uniform_ui(c, mask)); + ntq_store_def(c, &instr->def, i, + vir_MOV(c, tmp)); + i++; + valid_count--; + + /* Shift to next component */ + if (i < num_components && valid_count > 0) { + data = vir_SHR(c, data, + vir_uniform_ui(c, bit_size)); + } + } while (i < num_components && valid_count > 0); + } } + + return true; } static inline struct qreg @@ -2781,187 +3296,273 @@ emit_load_local_invocation_index(struct v3d_compile *c) vir_uniform_ui(c, 32 - c->local_invocation_index_bits)); } -/* Various subgroup operations rely on the A flags, so this helper ensures that - * A flags represents currently active lanes in the subgroup. +/* For the purposes of reduction operations (ballot, alleq, allfeq, bcastf) in + * fragment shaders a lane is considered active if any sample flags are set + * for *any* lane in the same quad; however, we still need to ensure that + * terminated lanes (OpTerminate) are not included. Further, we also need to + * exclude lanes that may be disabled because of non-uniform control + * flow. */ -static void -set_a_flags_for_subgroup(struct v3d_compile *c) +static enum v3d_qpu_cond +setup_subgroup_control_flow_condition(struct v3d_compile *c) { - /* MSF returns 0 for disabled lanes in compute shaders so - * PUSHZ will set A=1 for disabled lanes. We want the inverse - * of this but we don't have any means to negate the A flags - * directly, but we can do it by repeating the same operation - * with NORZ (A = ~A & ~Z). + assert(c->s->info.stage == MESA_SHADER_FRAGMENT || + c->s->info.stage == MESA_SHADER_COMPUTE); + + enum v3d_qpu_cond cond = V3D_QPU_COND_NONE; + + /* We need to make sure that terminated lanes in fragment shaders are + * not included. We can identify these lanes by comparing the initial + * sample mask with the current. This fixes: + * dEQP-VK.spirv_assembly.instruction.terminate_invocation.terminate.subgroup_* */ - assert(c->s->info.stage == MESA_SHADER_COMPUTE); - vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ); - vir_set_uf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_UF_NORZ); + if (c->s->info.stage == MESA_SHADER_FRAGMENT && c->emitted_discard) { + vir_set_pf(c, vir_AND_dest(c, vir_nop_reg(), c->start_msf, + vir_NOT(c, vir_XOR(c, c->start_msf, + vir_MSF(c)))), + V3D_QPU_PF_PUSHZ); + cond = V3D_QPU_COND_IFNA; + } - /* If we are under non-uniform control flow we also need to - * AND the A flags with the current execute mask. + /* If we are in non-uniform control flow, update the condition to + * also limit lanes to those in the current execution mask. */ if (vir_in_nonuniform_control_flow(c)) { - const uint32_t bidx = c->cur_block->index; - vir_set_uf(c, vir_XOR_dest(c, vir_nop_reg(), - c->execute, - vir_uniform_ui(c, bidx)), - V3D_QPU_UF_ANDZ); + if (cond == V3D_QPU_COND_IFNA) { + vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_UF_NORNZ); + } else { + assert(cond == V3D_QPU_COND_NONE); + vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_PF_PUSHZ); + } + cond = V3D_QPU_COND_IFA; } + + return cond; +} + +static void +emit_compute_barrier(struct v3d_compile *c) +{ + /* Ensure we flag the use of the control barrier. 
NIR's + * gather info pass usually takes care of this, but that + * requires that we call that pass after any other pass + * that may emit a control barrier, so this is safer. + */ + c->s->info.uses_control_barrier = true; + + /* Emit a TSY op to get all invocations in the workgroup + * (actually supergroup) to block until the last + * invocation reaches the TSY op. + */ + vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_SYNCB)); +} + +static void +emit_barrier(struct v3d_compile *c) +{ + struct qreg eidx = vir_EIDX(c); + + /* The config for the TSY op should be set up like this: + * - Lane 0: Quorum + * - Lane 2: TSO id + * - Lane 3: TSY opcode + */ + + /* Lane 0: we want to synchronize across one subgroup. Here we write to + * all lanes unconditionally and will overwrite other lanes below. + */ + struct qreg tsy_conf = vir_uniform_ui(c, 1); + + /* Lane 2: TSO id. We choose a general purpose TSO (id=0..64) using the + * current QPU index and thread index to ensure we get a unique one for + * this group of invocations in this core. + */ + struct qreg tso_id = + vir_AND(c, vir_TIDX(c), vir_uniform_ui(c, 0x0000003f)); + vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), eidx, vir_uniform_ui(c, 2)), + V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFA, tsy_conf, tso_id); + + /* Lane 3: TSY opcode (set_quorum_wait_inc_check) */ + struct qreg tsy_op = vir_uniform_ui(c, 16); + vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), eidx, vir_uniform_ui(c, 3)), + V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFA, tsy_conf, tsy_op); + + /* Emit TSY sync */ + vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_SYNCB), tsy_conf); } static void ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) { switch (instr->intrinsic) { + case nir_intrinsic_decl_reg: + case nir_intrinsic_load_reg: + case nir_intrinsic_store_reg: + break; /* Ignore these */ + case nir_intrinsic_load_uniform: ntq_emit_load_uniform(c, instr); break; + case nir_intrinsic_load_global_2x32: + ntq_emit_tmu_general(c, instr, false, true); + c->has_general_tmu_load = true; + break; + case nir_intrinsic_load_ubo: - if (!nir_src_is_divergent(instr->src[1])) - ntq_emit_load_ubo_unifa(c, instr); - else - ntq_emit_tmu_general(c, instr, false); - break; - - case nir_intrinsic_ssbo_atomic_add: - case nir_intrinsic_ssbo_atomic_imin: - case nir_intrinsic_ssbo_atomic_umin: - case nir_intrinsic_ssbo_atomic_imax: - case nir_intrinsic_ssbo_atomic_umax: - case nir_intrinsic_ssbo_atomic_and: - case nir_intrinsic_ssbo_atomic_or: - case nir_intrinsic_ssbo_atomic_xor: - case nir_intrinsic_ssbo_atomic_exchange: - case nir_intrinsic_ssbo_atomic_comp_swap: + if (ntq_emit_inline_ubo_load(c, instr)) + break; + FALLTHROUGH; case nir_intrinsic_load_ssbo: + if (!ntq_emit_load_unifa(c, instr)) { + ntq_emit_tmu_general(c, instr, false, false); + c->has_general_tmu_load = true; + } + break; + case nir_intrinsic_store_ssbo: - ntq_emit_tmu_general(c, instr, false); - break; - - case nir_intrinsic_shared_atomic_add: - case nir_intrinsic_shared_atomic_imin: - case nir_intrinsic_shared_atomic_umin: - case nir_intrinsic_shared_atomic_imax: - case nir_intrinsic_shared_atomic_umax: - case nir_intrinsic_shared_atomic_and: - case nir_intrinsic_shared_atomic_or: - case nir_intrinsic_shared_atomic_xor: - case nir_intrinsic_shared_atomic_exchange: - case nir_intrinsic_shared_atomic_comp_swap: - case nir_intrinsic_load_shared: + case nir_intrinsic_ssbo_atomic: + case nir_intrinsic_ssbo_atomic_swap: + ntq_emit_tmu_general(c, instr, false, false); + break; + + case 
nir_intrinsic_store_global_2x32: + case nir_intrinsic_global_atomic_2x32: + case nir_intrinsic_global_atomic_swap_2x32: + ntq_emit_tmu_general(c, instr, false, true); + break; + + case nir_intrinsic_shared_atomic: + case nir_intrinsic_shared_atomic_swap: case nir_intrinsic_store_shared: - case nir_intrinsic_load_scratch: case nir_intrinsic_store_scratch: - ntq_emit_tmu_general(c, instr, true); + ntq_emit_tmu_general(c, instr, true, false); + break; + + case nir_intrinsic_load_scratch: + case nir_intrinsic_load_shared: + ntq_emit_tmu_general(c, instr, true, false); + c->has_general_tmu_load = true; break; - case nir_intrinsic_image_load: case nir_intrinsic_image_store: - case nir_intrinsic_image_atomic_add: - case nir_intrinsic_image_atomic_imin: - case nir_intrinsic_image_atomic_umin: - case nir_intrinsic_image_atomic_imax: - case nir_intrinsic_image_atomic_umax: - case nir_intrinsic_image_atomic_and: - case nir_intrinsic_image_atomic_or: - case nir_intrinsic_image_atomic_xor: - case nir_intrinsic_image_atomic_exchange: - case nir_intrinsic_image_atomic_comp_swap: - v3d40_vir_emit_image_load_store(c, instr); + case nir_intrinsic_image_atomic: + case nir_intrinsic_image_atomic_swap: + v3d_vir_emit_image_load_store(c, instr); + break; + + case nir_intrinsic_image_load: + v3d_vir_emit_image_load_store(c, instr); + /* Not really a general TMU load, but we only use this flag + * for NIR scheduling and we do schedule these under the same + * policy as general TMU. + */ + c->has_general_tmu_load = true; break; case nir_intrinsic_get_ssbo_size: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_GET_SSBO_SIZE, - nir_src_comp_as_uint(instr->src[0], 0))); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_GET_SSBO_SIZE, + nir_src_comp_as_uint(instr->src[0], 0))); break; case nir_intrinsic_get_ubo_size: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_GET_UBO_SIZE, - nir_src_comp_as_uint(instr->src[0], 0))); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_GET_UBO_SIZE, + nir_src_comp_as_uint(instr->src[0], 0))); break; case nir_intrinsic_load_user_clip_plane: for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) { - ntq_store_dest(c, &instr->dest, i, - vir_uniform(c, QUNIFORM_USER_CLIP_PLANE, - nir_intrinsic_ucp_id(instr) * - 4 + i)); + ntq_store_def(c, &instr->def, i, + vir_uniform(c, QUNIFORM_USER_CLIP_PLANE, + nir_intrinsic_ucp_id(instr) * + 4 + i)); } break; case nir_intrinsic_load_viewport_x_scale: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0)); break; case nir_intrinsic_load_viewport_y_scale: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0)); break; case nir_intrinsic_load_viewport_z_scale: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0)); break; case nir_intrinsic_load_viewport_z_offset: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0)); break; case nir_intrinsic_load_line_coord: - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->line_x)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->line_x)); break; case nir_intrinsic_load_line_width: - ntq_store_dest(c, 
&instr->dest, 0, - vir_uniform(c, QUNIFORM_LINE_WIDTH, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_LINE_WIDTH, 0)); break; case nir_intrinsic_load_aa_line_width: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0)); break; case nir_intrinsic_load_sample_mask_in: - ntq_store_dest(c, &instr->dest, 0, vir_MSF(c)); + ntq_store_def(c, &instr->def, 0, vir_MSF(c)); break; case nir_intrinsic_load_helper_invocation: vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ); struct qreg qdest = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA); - ntq_store_dest(c, &instr->dest, 0, qdest); + ntq_store_def(c, &instr->def, 0, qdest); break; case nir_intrinsic_load_front_face: /* The register contains 0 (front) or 1 (back), and we need to * turn it into a NIR bool where true means front. */ - ntq_store_dest(c, &instr->dest, 0, - vir_ADD(c, - vir_uniform_ui(c, -1), - vir_REVF(c))); + ntq_store_def(c, &instr->def, 0, + vir_ADD(c, + vir_uniform_ui(c, -1), + vir_REVF(c))); break; case nir_intrinsic_load_base_instance: - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->biid)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->biid)); break; case nir_intrinsic_load_instance_id: - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->iid)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->iid)); break; case nir_intrinsic_load_vertex_id: - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->vid)); + break; + + case nir_intrinsic_load_draw_id: + ntq_store_def(c, &instr->def, 0, vir_uniform(c, QUNIFORM_DRAW_ID, 0)); break; case nir_intrinsic_load_tlb_color_v3d: vir_emit_tlb_color_read(c, instr); break; + case nir_intrinsic_load_fep_w_v3d: + ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->payload_w)); + break; + case nir_intrinsic_load_input: ntq_emit_load_input(c, instr); break; @@ -2978,7 +3579,19 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) ntq_emit_image_size(c, instr); break; + /* FIXME: the Vulkan and SPIR-V specs specify that OpTerminate (which + * is intended to match the semantics of GLSL's discard) should + * terminate the invocation immediately. Our implementation doesn't + * do that. What we do is actually a demote by removing the invocations + * from the sample mask. Maybe we could be more strict and force an + * early termination by emitting a (maybe conditional) jump to the + * end section of the fragment shader for affected invocations. 
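+         * In practice the difference is only observable by code that can
+         * still see the demoted lanes, such as the subgroup reductions
+         * handled by setup_subgroup_control_flow_condition above, which is
+         * why terminated lanes are masked out there explicitly.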
+ */ case nir_intrinsic_discard: + case nir_intrinsic_terminate: + c->emitted_discard = true; + FALLTHROUGH; + case nir_intrinsic_demote: ntq_flush_tmu(c); if (vir_in_nonuniform_control_flow(c)) { @@ -2993,7 +3606,11 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) } break; - case nir_intrinsic_discard_if: { + case nir_intrinsic_discard_if: + case nir_intrinsic_terminate_if: + c->emitted_discard = true; + FALLTHROUGH; + case nir_intrinsic_demote_if: { ntq_flush_tmu(c); enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, instr->src[0]); @@ -3011,102 +3628,79 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(), vir_uniform_ui(c, 0)), cond); - break; } - case nir_intrinsic_memory_barrier: - case nir_intrinsic_memory_barrier_buffer: - case nir_intrinsic_memory_barrier_image: - case nir_intrinsic_memory_barrier_shared: - case nir_intrinsic_memory_barrier_tcs_patch: - case nir_intrinsic_group_memory_barrier: - /* We don't do any instruction scheduling of these NIR - * instructions between each other, so we just need to make - * sure that the TMU operations before the barrier are flushed + case nir_intrinsic_barrier: + /* Ensure that the TMU operations before the barrier are flushed * before the ones after the barrier. */ ntq_flush_tmu(c); - break; - - case nir_intrinsic_control_barrier: - /* Emit a TSY op to get all invocations in the workgroup - * (actually supergroup) to block until the last invocation - * reaches the TSY op. - */ - ntq_flush_tmu(c); - if (c->devinfo->ver >= 42) { - vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC, - V3D_QPU_WADDR_SYNCB)); - } else { - struct qinst *sync = - vir_BARRIERID_dest(c, - vir_reg(QFILE_MAGIC, - V3D_QPU_WADDR_SYNCU)); - sync->uniform = - vir_get_uniform_index(c, QUNIFORM_CONSTANT, - 0xffffff00 | - V3D_TSY_WAIT_INC_CHECK); + if (nir_intrinsic_execution_scope(instr) != SCOPE_NONE) { + if (c->s->info.stage == MESA_SHADER_COMPUTE) + emit_compute_barrier(c); + else + emit_barrier(c); + /* The blocking of a TSY op only happens at the next + * thread switch. No texturing may be outstanding at the + * time of a TSY blocking operation. + */ + vir_emit_thrsw(c); } - - /* The blocking of a TSY op only happens at the next thread - * switch. No texturing may be outstanding at the time of a - * TSY blocking operation. 
- */ - vir_emit_thrsw(c); break; case nir_intrinsic_load_num_workgroups: for (int i = 0; i < 3; i++) { - ntq_store_dest(c, &instr->dest, i, - vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS, - i)); + ntq_store_def(c, &instr->def, i, + vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS, + i)); } break; case nir_intrinsic_load_workgroup_id: { struct qreg x = vir_AND(c, c->cs_payload[0], vir_uniform_ui(c, 0xffff)); + ntq_store_def(c, &instr->def, 0, x); struct qreg y = vir_SHR(c, c->cs_payload[0], vir_uniform_ui(c, 16)); + ntq_store_def(c, &instr->def, 1, y); struct qreg z = vir_AND(c, c->cs_payload[1], vir_uniform_ui(c, 0xffff)); + ntq_store_def(c, &instr->def, 2, z); + break; + } - /* We only support dispatch base in Vulkan */ - if (c->key->environment == V3D_ENVIRONMENT_VULKAN) { - x = vir_ADD(c, x, - vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 0)); - y = vir_ADD(c, y, - vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 1)); - z = vir_ADD(c, z, - vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 2)); - } + case nir_intrinsic_load_base_workgroup_id: { + struct qreg x = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 0); + ntq_store_def(c, &instr->def, 0, x); - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, x)); - ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, y)); - ntq_store_dest(c, &instr->dest, 2, vir_MOV(c, z)); + struct qreg y = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 1); + ntq_store_def(c, &instr->def, 1, y); + + struct qreg z = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 2); + ntq_store_def(c, &instr->def, 2, z); break; } case nir_intrinsic_load_local_invocation_index: - ntq_store_dest(c, &instr->dest, 0, - emit_load_local_invocation_index(c)); + ntq_store_def(c, &instr->def, 0, + emit_load_local_invocation_index(c)); break; case nir_intrinsic_load_subgroup_id: { /* This is basically the batch index, which is the Local * Invocation Index divided by the SIMD width). */ - STATIC_ASSERT(util_is_power_of_two_nonzero(V3D_CHANNELS)); + STATIC_ASSERT(IS_POT(V3D_CHANNELS) && V3D_CHANNELS > 0); const uint32_t divide_shift = ffs(V3D_CHANNELS) - 1; struct qreg lii = emit_load_local_invocation_index(c); - ntq_store_dest(c, &instr->dest, 0, - vir_SHR(c, lii, - vir_uniform_ui(c, divide_shift))); + ntq_store_def(c, &instr->def, 0, + vir_SHR(c, lii, + vir_uniform_ui(c, divide_shift))); break; } @@ -3143,8 +3737,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) struct qreg col = ntq_get_src(c, instr->src[0], 0); for (int i = 0; i < instr->num_components; i++) { struct qreg row = vir_uniform_ui(c, row_idx++); - ntq_store_dest(c, &instr->dest, i, - vir_LDVPMG_IN(c, row, col)); + ntq_store_def(c, &instr->def, i, + vir_LDVPMG_IN(c, row, col)); } break; } @@ -3160,47 +3754,47 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) * using ldvpm(v,d)_in (See Table 71). 
*/ assert(c->s->info.stage == MESA_SHADER_GEOMETRY); - ntq_store_dest(c, &instr->dest, 0, - vir_LDVPMV_IN(c, vir_uniform_ui(c, 0))); + ntq_store_def(c, &instr->def, 0, + vir_LDVPMV_IN(c, vir_uniform_ui(c, 0))); break; } case nir_intrinsic_load_invocation_id: - ntq_store_dest(c, &instr->dest, 0, vir_IID(c)); + ntq_store_def(c, &instr->def, 0, vir_IID(c)); break; case nir_intrinsic_load_fb_layers_v3d: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform(c, QUNIFORM_FB_LAYERS, 0)); + ntq_store_def(c, &instr->def, 0, + vir_uniform(c, QUNIFORM_FB_LAYERS, 0)); break; case nir_intrinsic_load_sample_id: - ntq_store_dest(c, &instr->dest, 0, vir_SAMPID(c)); + ntq_store_def(c, &instr->def, 0, vir_SAMPID(c)); break; case nir_intrinsic_load_sample_pos: - ntq_store_dest(c, &instr->dest, 0, - vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)))); - ntq_store_dest(c, &instr->dest, 1, - vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)))); + ntq_store_def(c, &instr->def, 0, + vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)))); + ntq_store_def(c, &instr->def, 1, + vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)))); break; case nir_intrinsic_load_barycentric_at_offset: - ntq_store_dest(c, &instr->dest, 0, - vir_MOV(c, ntq_get_src(c, instr->src[0], 0))); - ntq_store_dest(c, &instr->dest, 1, - vir_MOV(c, ntq_get_src(c, instr->src[0], 1))); + ntq_store_def(c, &instr->def, 0, + vir_MOV(c, ntq_get_src(c, instr->src[0], 0))); + ntq_store_def(c, &instr->def, 1, + vir_MOV(c, ntq_get_src(c, instr->src[0], 1))); break; case nir_intrinsic_load_barycentric_pixel: - ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f)); - ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f)); + ntq_store_def(c, &instr->def, 0, vir_uniform_f(c, 0.0f)); + ntq_store_def(c, &instr->def, 1, vir_uniform_f(c, 0.0f)); break; case nir_intrinsic_load_barycentric_at_sample: { if (!c->fs_key->msaa) { - ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f)); - ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f)); + ntq_store_def(c, &instr->def, 0, vir_uniform_f(c, 0.0f)); + ntq_store_def(c, &instr->def, 1, vir_uniform_f(c, 0.0f)); return; } @@ -3208,8 +3802,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) struct qreg sample_idx = ntq_get_src(c, instr->src[0], 0); ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y); - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x)); - ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, offset_x)); + ntq_store_def(c, &instr->def, 1, vir_MOV(c, offset_y)); break; } @@ -3219,18 +3813,18 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) struct qreg offset_y = vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c))); - ntq_store_dest(c, &instr->dest, 0, - vir_FSUB(c, offset_x, vir_uniform_f(c, 0.5f))); - ntq_store_dest(c, &instr->dest, 1, - vir_FSUB(c, offset_y, vir_uniform_f(c, 0.5f))); + ntq_store_def(c, &instr->def, 0, + vir_FSUB(c, offset_x, vir_uniform_f(c, 0.5f))); + ntq_store_def(c, &instr->def, 1, + vir_FSUB(c, offset_y, vir_uniform_f(c, 0.5f))); break; } case nir_intrinsic_load_barycentric_centroid: { struct qreg offset_x, offset_y; ntq_get_barycentric_centroid(c, &offset_x, &offset_y); - ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x)); - ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y)); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, offset_x)); + ntq_store_def(c, &instr->def, 1, vir_MOV(c, offset_y)); break; } @@ -3249,8 +3843,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, 
nir_intrinsic_instr *instr) */ if (!c->fs_key->msaa || c->interp[input_idx].vp.file == QFILE_NULL) { - ntq_store_dest(c, &instr->dest, i, - vir_MOV(c, c->inputs[input_idx])); + ntq_store_def(c, &instr->def, i, + vir_MOV(c, c->inputs[input_idx])); continue; } @@ -3268,30 +3862,150 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) ntq_emit_load_interpolated_input(c, p, C, offset_x, offset_y, interp_mode); - ntq_store_dest(c, &instr->dest, i, result); + ntq_store_def(c, &instr->def, i, result); } break; } case nir_intrinsic_load_subgroup_size: - ntq_store_dest(c, &instr->dest, 0, - vir_uniform_ui(c, V3D_CHANNELS)); + ntq_store_def(c, &instr->def, 0, + vir_uniform_ui(c, V3D_CHANNELS)); break; case nir_intrinsic_load_subgroup_invocation: - ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c)); + ntq_store_def(c, &instr->def, 0, vir_EIDX(c)); break; case nir_intrinsic_elect: { - set_a_flags_for_subgroup(c); - struct qreg first = vir_FLAFIRST(c); + struct qreg first; + if (vir_in_nonuniform_control_flow(c)) { + /* Sets A=1 for lanes enabled in the execution mask */ + vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_PF_PUSHZ); + /* Updates A ANDing with lanes enabled in MSF */ + vir_set_uf(c, vir_MSF_dest(c, vir_nop_reg()), + V3D_QPU_UF_ANDNZ); + first = vir_FLAFIRST(c); + } else { + /* Sets A=1 for inactive lanes */ + vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), + V3D_QPU_PF_PUSHZ); + first = vir_FLNAFIRST(c); + } - /* Produce a boolean result from Flafirst */ + /* Produce a boolean result */ vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), first, vir_uniform_ui(c, 1)), V3D_QPU_PF_PUSHZ); struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA); - ntq_store_dest(c, &instr->dest, 0, result); + ntq_store_def(c, &instr->def, 0, result); + break; + } + + case nir_intrinsic_ballot: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c); + struct qreg res = vir_get_temp(c); + vir_set_cond(vir_BALLOT_dest(c, res, value), cond); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, res)); + break; + } + + case nir_intrinsic_read_invocation: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + struct qreg index = ntq_get_src(c, instr->src[1], 0); + struct qreg res = vir_SHUFFLE(c, value, index); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, res)); + break; + } + + case nir_intrinsic_read_first_invocation: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c); + struct qreg res = vir_get_temp(c); + vir_set_cond(vir_BCASTF_dest(c, res, value), cond); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, res)); + break; + } + + case nir_intrinsic_shuffle: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + struct qreg indices = ntq_get_src(c, instr->src[1], 0); + struct qreg res = vir_SHUFFLE(c, value, indices); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, res)); + break; + } + + case nir_intrinsic_vote_feq: + case nir_intrinsic_vote_ieq: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c); + struct qreg res = vir_get_temp(c); + vir_set_cond(instr->intrinsic == nir_intrinsic_vote_ieq ? 
+                             vir_ALLEQ_dest(c, res, value) :
+                             vir_ALLFEQ_dest(c, res, value),
+                             cond);
+
+                /* Produce boolean result */
+                vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
+                           V3D_QPU_PF_PUSHZ);
+                struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFNA);
+                ntq_store_def(c, &instr->def, 0, result);
+                break;
+        }
+
+        case nir_intrinsic_vote_all: {
+                assert(c->devinfo->ver >= 71);
+                struct qreg value = ntq_get_src(c, instr->src[0], 0);
+                enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
+                struct qreg res = vir_get_temp(c);
+                vir_set_cond(vir_ALLEQ_dest(c, res, value), cond);
+
+                /* We want to check if 'all lanes are equal (alleq != 0) and
+                 * their value is True (value != 0)'.
+                 *
+                 * The first MOV.pushz generates predicate for 'alleq == 0'.
+                 * The second MOV.NORZ generates predicate for:
+                 * '!(alleq == 0) & !(value == 0)'.
+                 */
+                vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
+                           V3D_QPU_PF_PUSHZ);
+                vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), value),
+                           V3D_QPU_UF_NORZ);
+                struct qreg result =
+                        ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
+                ntq_store_def(c, &instr->def, 0, result);
+                break;
+        }
+
+        case nir_intrinsic_vote_any: {
+                assert(c->devinfo->ver >= 71);
+                struct qreg value = ntq_get_src(c, instr->src[0], 0);
+                enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
+                struct qreg res = vir_get_temp(c);
+                vir_set_cond(vir_ALLEQ_dest(c, res, value), cond);
+
+                /* We want to check 'not (all lanes are equal (alleq != 0)
+                 * and their value is False (value == 0))'.
+                 *
+                 * The first MOV.pushz generates predicate for 'alleq == 0'.
+                 * The second MOV.NORNZ generates predicate for:
+                 * '!(alleq == 0) & (value == 0)'.
+                 * The IFNA condition negates the predicate when evaluated:
+                 * '!(!(alleq == 0) & (value == 0))'.
+                 */
+                vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
+                           V3D_QPU_PF_PUSHZ);
+                vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), value),
+                           V3D_QPU_UF_NORNZ);
+                struct qreg result =
+                        ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFNA);
+                ntq_store_def(c, &instr->def, 0, result);
                break;
        }
 
@@ -3300,8 +4014,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                break;
 
        case nir_intrinsic_load_view_index:
-                ntq_store_dest(c, &instr->dest, 0,
-                               vir_uniform(c, QUNIFORM_VIEW_INDEX, 0));
+                ntq_store_def(c, &instr->def, 0,
+                              vir_uniform(c, QUNIFORM_VIEW_INDEX, 0));
                break;
 
        default:
@@ -3329,6 +4043,36 @@ ntq_activate_execute_for_block(struct v3d_compile *c)
                     vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
 }
 
+static bool
+is_cheap_block(nir_block *block)
+{
+        int32_t cost = 3;
+        nir_foreach_instr(instr, block) {
+                switch (instr->type) {
+                case nir_instr_type_alu:
+                case nir_instr_type_undef:
+                case nir_instr_type_load_const:
+                        if (--cost <= 0)
+                                return false;
+                        break;
+                case nir_instr_type_intrinsic: {
+                        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+                        switch (intr->intrinsic) {
+                        case nir_intrinsic_decl_reg:
+                        case nir_intrinsic_load_reg:
+                        case nir_intrinsic_store_reg:
+                                continue;
+                        default:
+                                return false;
+                        }
+                }
+                default:
+                        return false;
+                }
+        }
+        return true;
+}
+
 static void
 ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt)
 {
@@ -3473,15 +4217,27 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
                              c->execute,
                              vir_uniform_ui(c, else_block->index));
 
-        /* Jump to ELSE if nothing is active for THEN, otherwise fall
-         * through.
+ /* Set the flags for taking the THEN block */ + vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_PF_PUSHZ); + + /* Jump to ELSE if nothing is active for THEN (unless THEN block is + * so small it won't pay off), otherwise fall through. */ - vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ); - vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA); - vir_link_blocks(c->cur_block, else_block); + bool is_cheap = exec_list_is_singular(&if_stmt->then_list) && + is_cheap_block(nir_if_first_then_block(if_stmt)); + if (!is_cheap) { + vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA); + vir_link_blocks(c->cur_block, else_block); + } vir_link_blocks(c->cur_block, then_block); - /* Process the THEN block. */ + /* Process the THEN block. + * + * Notice we don't call ntq_activate_execute_for_block here on purpose: + * c->execute is already set up to be 0 for lanes that must take the + * THEN block. + */ vir_set_emit_block(c, then_block); ntq_emit_cf_list(c, &if_stmt->then_list); @@ -3495,13 +4251,19 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt) vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, after_block->index)); - /* If everything points at ENDIF, then jump there immediately. */ - vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), - c->execute, - vir_uniform_ui(c, after_block->index)), - V3D_QPU_PF_PUSHZ); - vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA); - vir_link_blocks(c->cur_block, after_block); + /* If everything points at ENDIF, then jump there immediately + * (unless ELSE block is so small it won't pay off). + */ + bool is_cheap = exec_list_is_singular(&if_stmt->else_list) && + is_cheap_block(nir_else_block); + if (!is_cheap) { + vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), + c->execute, + vir_uniform_ui(c, after_block->index)), + V3D_QPU_PF_PUSHZ); + vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA); + vir_link_blocks(c->cur_block, after_block); + } vir_link_blocks(c->cur_block, else_block); vir_set_emit_block(c, else_block); @@ -3605,7 +4367,7 @@ ntq_emit_instr(struct v3d_compile *c, nir_instr *instr) ntq_emit_load_const(c, nir_instr_as_load_const(instr)); break; - case nir_instr_type_ssa_undef: + case nir_instr_type_undef: unreachable("Should've been lowered by nir_lower_undef_to_zero"); break; @@ -3699,7 +4461,6 @@ ntq_emit_nonuniform_loop(struct v3d_compile *c, nir_loop *loop) static void ntq_emit_uniform_loop(struct v3d_compile *c, nir_loop *loop) { - c->loop_cont_block = vir_new_block(c); c->loop_break_block = vir_new_block(c); @@ -3719,6 +4480,25 @@ ntq_emit_uniform_loop(struct v3d_compile *c, nir_loop *loop) static void ntq_emit_loop(struct v3d_compile *c, nir_loop *loop) { + assert(!nir_loop_has_continue_construct(loop)); + + /* Disable flags optimization for loop conditions. The problem here is + * that we can have code like this: + * + * // block_0 + * vec1 32 con ssa_9 = ine32 ssa_8, ssa_2 + * loop { + * // block_1 + * if ssa_9 { + * + * In this example we emit flags to compute ssa_9 and the optimization + * will skip regenerating them again for the loop condition in the + * loop continue block (block_1). However, this is not safe after the + * first iteration because the loop body can stomp the flags if it has + * any conditionals. 
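+         * Resetting c->flags_temp below simply forces the flags for the
+         * condition to be re-emitted at their next use, which is always
+         * correct at the cost of regenerating them once per iteration.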
+ */ + c->flags_temp = -1; + bool was_in_control_flow = c->in_control_flow; c->in_control_flow = true; @@ -3777,7 +4557,7 @@ ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list) static void ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl) { - ntq_setup_registers(c, &impl->registers); + ntq_setup_registers(c, impl); ntq_emit_cf_list(c, &impl->body); } @@ -3786,7 +4566,12 @@ nir_to_vir(struct v3d_compile *c) { switch (c->s->info.stage) { case MESA_SHADER_FRAGMENT: - c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0)); + c->start_msf = vir_MSF(c); + if (c->devinfo->ver < 71) + c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0)); + else + c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 3)); + c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1)); c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2)); @@ -3799,25 +4584,16 @@ nir_to_vir(struct v3d_compile *c) emit_fragment_varying(c, NULL, -1, 0, 0); } - if (c->fs_key->is_points && - (c->devinfo->ver < 40 || program_reads_point_coord(c))) { + if (c->fs_key->is_points && program_reads_point_coord(c)) { c->point_x = emit_fragment_varying(c, NULL, -1, 0, 0); c->point_y = emit_fragment_varying(c, NULL, -1, 0, 0); c->uses_implicit_point_line_varyings = true; } else if (c->fs_key->is_lines && - (c->devinfo->ver < 40 || - BITSET_TEST(c->s->info.system_values_read, + (BITSET_TEST(c->s->info.system_values_read, SYSTEM_VALUE_LINE_COORD))) { c->line_x = emit_fragment_varying(c, NULL, -1, 0, 0); c->uses_implicit_point_line_varyings = true; } - - c->force_per_sample_msaa = - c->s->info.fs.uses_sample_qualifier || - BITSET_TEST(c->s->info.system_values_read, - SYSTEM_VALUE_SAMPLE_ID) || - BITSET_TEST(c->s->info.system_values_read, - SYSTEM_VALUE_SAMPLE_POS); break; case MESA_SHADER_COMPUTE: /* Set up the TSO for barriers, assuming we do some. */ @@ -3826,8 +4602,13 @@ nir_to_vir(struct v3d_compile *c) V3D_QPU_WADDR_SYNC)); } - c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0)); - c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); + if (c->devinfo->ver == 42) { + c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0)); + c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); + } else if (c->devinfo->ver >= 71) { + c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 3)); + c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); + } /* Set up the division between gl_LocalInvocationIndex and * wg_in_mem in the payload reg. @@ -3889,7 +4670,7 @@ nir_to_vir(struct v3d_compile *c) /* Find the main function and emit the body. */ nir_foreach_function(function, c->s) { - assert(strcmp(function->name, "main") == 0); + assert(function->is_entrypoint); assert(function->impl); ntq_emit_impl(c, function->impl); } @@ -3932,25 +4713,12 @@ vir_emit_last_thrsw(struct v3d_compile *c, { *restore_last_thrsw = c->last_thrsw; - /* On V3D before 4.1, we need a TMU op to be outstanding when thread - * switching, so disable threads if we didn't do any TMU ops (each of - * which would have emitted a THRSW). - */ - if (!c->last_thrsw_at_top_level && c->devinfo->ver < 41) { - c->threads = 1; - if (c->last_thrsw) - vir_remove_thrsw(c); - *restore_last_thrsw = NULL; - } - /* If we're threaded and the last THRSW was in conditional code, then * we need to emit another one so that we can flag it as the last * thrsw. 
*/ - if (c->last_thrsw && !c->last_thrsw_at_top_level) { - assert(c->devinfo->ver >= 41); + if (c->last_thrsw && !c->last_thrsw_at_top_level) vir_emit_thrsw(c); - } /* If we're threaded, then we need to mark the last THRSW instruction * so we can emit a pair of them at QPU emit time. @@ -3958,10 +4726,8 @@ vir_emit_last_thrsw(struct v3d_compile *c, * For V3D 4.x, we can spawn the non-fragment shaders already in the * post-last-THRSW state, so we can skip this. */ - if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) { - assert(c->devinfo->ver >= 41); + if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) vir_emit_thrsw(c); - } /* If we have not inserted a last thread switch yet, do it now to ensure * any potential spilling we do happens before this. If we don't spill @@ -4006,8 +4772,8 @@ vir_check_payload_w(struct v3d_compile *c) vir_for_each_inst_inorder(inst, c) { for (int i = 0; i < vir_get_nsrc(inst); i++) { - if (inst->src[i].file == QFILE_REG && - inst->src[i].index == 0) { + if (inst->src[i].file == c->payload_w.file && + inst->src[i].index == c->payload_w.index) { c->uses_center_w = true; return; } @@ -4018,8 +4784,8 @@ vir_check_payload_w(struct v3d_compile *c) void v3d_nir_to_vir(struct v3d_compile *c) { - if (V3D_DEBUG & (V3D_DEBUG_NIR | - v3d_debug_flag_for_shader_stage(c->s->info.stage))) { + if (V3D_DBG(NIR) || + v3d_debug_flag_for_shader_stage(c->s->info.stage)) { fprintf(stderr, "%s prog %d/%d NIR:\n", vir_get_stage_name(c), c->program_id, c->variant_id); @@ -4053,8 +4819,8 @@ v3d_nir_to_vir(struct v3d_compile *c) unreachable("bad stage"); } - if (V3D_DEBUG & (V3D_DEBUG_VIR | - v3d_debug_flag_for_shader_stage(c->s->info.stage))) { + if (V3D_DBG(VIR) || + v3d_debug_flag_for_shader_stage(c->s->info.stage)) { fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n", vir_get_stage_name(c), c->program_id, c->variant_id); @@ -4075,8 +4841,8 @@ v3d_nir_to_vir(struct v3d_compile *c) * instructions until the results are needed. */ - if (V3D_DEBUG & (V3D_DEBUG_VIR | - v3d_debug_flag_for_shader_stage(c->s->info.stage))) { + if (V3D_DBG(VIR) || + v3d_debug_flag_for_shader_stage(c->s->info.stage)) { fprintf(stderr, "%s prog %d/%d VIR:\n", vir_get_stage_name(c), c->program_id, c->variant_id); @@ -4087,19 +4853,17 @@ v3d_nir_to_vir(struct v3d_compile *c) /* Attempt to allocate registers for the temporaries. If we fail, * reduce thread count and try again. */ - int min_threads = (c->devinfo->ver >= 41) ? 2 : 1; + int min_threads = 2; struct qpu_reg *temp_registers; while (true) { - bool spilled; - temp_registers = v3d_register_allocate(c, &spilled); - if (spilled) - continue; - - if (temp_registers) + temp_registers = v3d_register_allocate(c); + if (temp_registers) { + assert(c->spills + c->fills <= c->max_tmu_spills); break; + } if (c->threads == min_threads && - (V3D_DEBUG & V3D_DEBUG_RA)) { + V3D_DBG(RA)) { fprintf(stderr, "Failed to register allocate using %s\n", c->fallback_scheduler ? 
"the fallback scheduler:" : @@ -4116,18 +4880,20 @@ v3d_nir_to_vir(struct v3d_compile *c) } if (c->threads <= MAX2(c->min_threads_for_reg_alloc, min_threads)) { - if (V3D_DEBUG & V3D_DEBUG_PERF) { + if (V3D_DBG(PERF)) { fprintf(stderr, - "Failed to register allocate %s at " - "%d threads.\n", vir_get_stage_name(c), - c->threads); + "Failed to register allocate %s " + "prog %d/%d at %d threads.\n", + vir_get_stage_name(c), + c->program_id, c->variant_id, c->threads); } c->compilation_result = V3D_COMPILATION_FAILED_REGISTER_ALLOCATION; return; } - c->spill_count = 0; + c->spills = 0; + c->fills = 0; c->threads /= 2; if (c->threads == 1) @@ -4141,8 +4907,8 @@ v3d_nir_to_vir(struct v3d_compile *c) vir_restore_last_thrsw(c, restore_last_thrsw, restore_scoreboard_lock); if (c->spills && - (V3D_DEBUG & (V3D_DEBUG_VIR | - v3d_debug_flag_for_shader_stage(c->s->info.stage)))) { + (V3D_DBG(VIR) || + v3d_debug_flag_for_shader_stage(c->s->info.stage))) { fprintf(stderr, "%s prog %d/%d spilled VIR:\n", vir_get_stage_name(c), c->program_id, c->variant_id); diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c index c559814b9ea..ba76ac87e1e 100644 --- a/src/broadcom/compiler/qpu_schedule.c +++ b/src/broadcom/compiler/qpu_schedule.c @@ -85,6 +85,7 @@ struct schedule_state { struct schedule_node *last_unif; struct schedule_node *last_rtop; struct schedule_node *last_unifa; + struct schedule_node *last_setmsf; enum direction dir; /* Estimated cycle when the current instruction would start. */ uint32_t time; @@ -97,7 +98,7 @@ add_dep(struct schedule_state *state, bool write) { bool write_after_read = !write && state->dir == R; - void *edge_data = (void *)(uintptr_t)write_after_read; + uintptr_t edge_data = write_after_read; if (!before || !after) return; @@ -136,12 +137,14 @@ qpu_inst_is_tlb(const struct v3d_qpu_instr *inst) if (inst->type != V3D_QPU_INSTR_TYPE_ALU) return false; - if (inst->alu.add.magic_write && + if (inst->alu.add.op != V3D_QPU_A_NOP && + inst->alu.add.magic_write && (inst->alu.add.waddr == V3D_QPU_WADDR_TLB || inst->alu.add.waddr == V3D_QPU_WADDR_TLBU)) return true; - if (inst->alu.mul.magic_write && + if (inst->alu.mul.op != V3D_QPU_M_NOP && + inst->alu.mul.magic_write && (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB || inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU)) return true; @@ -153,12 +156,13 @@ static void process_mux_deps(struct schedule_state *state, struct schedule_node *n, enum v3d_qpu_mux mux) { + assert(state->devinfo->ver < 71); switch (mux) { case V3D_QPU_MUX_A: add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n); break; case V3D_QPU_MUX_B: - if (!n->inst->qpu.sig.small_imm) { + if (!n->inst->qpu.sig.small_imm_b) { add_read_dep(state, state->last_rf[n->inst->qpu.raddr_b], n); } @@ -169,6 +173,17 @@ process_mux_deps(struct schedule_state *state, struct schedule_node *n, } } + +static void +process_raddr_deps(struct schedule_state *state, struct schedule_node *n, + uint8_t raddr, bool is_small_imm) +{ + assert(state->devinfo->ver >= 71); + + if (!is_small_imm) + add_read_dep(state, state->last_rf[raddr], n); +} + static bool tmu_write_is_sequence_terminator(uint32_t waddr) { @@ -188,9 +203,6 @@ tmu_write_is_sequence_terminator(uint32_t waddr) static bool can_reorder_tmu_write(const struct v3d_device_info *devinfo, uint32_t waddr) { - if (devinfo->ver < 40) - return false; - if (tmu_write_is_sequence_terminator(waddr)) return false; @@ -253,8 +265,7 @@ process_waddr_deps(struct schedule_state *state, struct schedule_node *n, break; case 
V3D_QPU_WADDR_UNIFA: - if (state->devinfo->ver >= 40) - add_write_dep(state, &state->last_unifa, n); + add_write_dep(state, &state->last_unifa, n); break; case V3D_QPU_WADDR_NOP: @@ -283,6 +294,10 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) /* If the input and output segments are shared, then all VPM reads to * a location need to happen before all writes. We handle this by * serializing all VPM operations for now. + * + * FIXME: we are assuming that the segments are shared. That is + * correct right now as we are only using shared, but technically you + * can choose. */ bool separate_vpm_segment = false; @@ -303,15 +318,39 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) /* XXX: LOAD_IMM */ - if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) - process_mux_deps(state, n, inst->alu.add.a); - if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) - process_mux_deps(state, n, inst->alu.add.b); + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) { + if (devinfo->ver < 71) { + process_mux_deps(state, n, inst->alu.add.a.mux); + } else { + process_raddr_deps(state, n, inst->alu.add.a.raddr, + inst->sig.small_imm_a); + } + } + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) { + if (devinfo->ver < 71) { + process_mux_deps(state, n, inst->alu.add.b.mux); + } else { + process_raddr_deps(state, n, inst->alu.add.b.raddr, + inst->sig.small_imm_b); + } + } - if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) - process_mux_deps(state, n, inst->alu.mul.a); - if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) - process_mux_deps(state, n, inst->alu.mul.b); + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) { + if (devinfo->ver < 71) { + process_mux_deps(state, n, inst->alu.mul.a.mux); + } else { + process_raddr_deps(state, n, inst->alu.mul.a.raddr, + inst->sig.small_imm_c); + } + } + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) { + if (devinfo->ver < 71) { + process_mux_deps(state, n, inst->alu.mul.b.mux); + } else { + process_raddr_deps(state, n, inst->alu.mul.b.raddr, + inst->sig.small_imm_d); + } + } switch (inst->alu.add.op) { case V3D_QPU_A_VPMSETUP: @@ -340,13 +379,24 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) case V3D_QPU_A_MSF: add_read_dep(state, state->last_tlb, n); + add_read_dep(state, state->last_setmsf, n); break; case V3D_QPU_A_SETMSF: + add_write_dep(state, &state->last_setmsf, n); + add_write_dep(state, &state->last_tmu_write, n); + FALLTHROUGH; case V3D_QPU_A_SETREVF: add_write_dep(state, &state->last_tlb, n); break; + case V3D_QPU_A_BALLOT: + case V3D_QPU_A_BCASTF: + case V3D_QPU_A_ALLEQ: + case V3D_QPU_A_ALLFEQ: + add_read_dep(state, state->last_setmsf, n); + break; + default: break; } @@ -384,6 +434,8 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) add_write_dep(state, &state->last_r[4], n); if (v3d_qpu_writes_r5(devinfo, inst)) add_write_dep(state, &state->last_r[5], n); + if (v3d_qpu_writes_rf0_implicitly(devinfo, inst)) + add_write_dep(state, &state->last_rf[0], n); /* If we add any more dependencies here we should consider whether we * also need to update qpu_inst_after_thrsw_valid_in_delay_slot. 
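The dependency edges added in calculate_deps above all follow the same
"last writer" pattern. A condensed sketch of that pattern, with illustrative
names rather than the driver's exact helpers:

    /* Readers depend on the last writer of a resource; a new writer
     * depends on it too and then takes its place. */
    static void
    track_read(struct schedule_state *state,
               struct schedule_node **slot, struct schedule_node *n)
    {
            add_dep(state, *slot, n, false);   /* read after write */
    }

    static void
    track_write(struct schedule_state *state,
                struct schedule_node **slot, struct schedule_node *n)
    {
            add_dep(state, *slot, n, true);    /* write after write */
            *slot = n;
    }

This is why MSF reads above take a read dependency on last_setmsf while
SETMSF takes a write dependency and replaces it: MSF-consuming ops (including
the new BALLOT/BCASTF/ALLEQ/ALLFEQ) must not be reordered across a SETMSF.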
@@ -492,9 +544,16 @@ struct choose_scoreboard { int last_thrsw_tick; int last_branch_tick; int last_setmsf_tick; - bool tlb_locked; + bool first_thrsw_emitted; + bool last_thrsw_emitted; bool fixup_ldvary; int ldvary_count; + int pending_ldtmu_count; + bool first_ldtmu_after_thrsw; + + /* V3D 7.x */ + int last_implicit_rf0_write_tick; + bool has_rf0_flops_conflict; }; static bool @@ -519,7 +578,24 @@ mux_reads_too_soon(struct choose_scoreboard *scoreboard, } static bool -reads_too_soon_after_write(struct choose_scoreboard *scoreboard, +reads_too_soon(struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst, uint8_t raddr) +{ + switch (raddr) { + case 0: /* ldvary delayed write of C coefficient to rf0 */ + if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1) + return true; + break; + default: + break; + } + + return false; +} + +static bool +reads_too_soon_after_write(const struct v3d_device_info *devinfo, + struct choose_scoreboard *scoreboard, struct qinst *qinst) { const struct v3d_qpu_instr *inst = &qinst->qpu; @@ -531,24 +607,44 @@ reads_too_soon_after_write(struct choose_scoreboard *scoreboard, assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); if (inst->alu.add.op != V3D_QPU_A_NOP) { - if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 && - mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) { - return true; + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) { + if (devinfo->ver < 71) { + if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.a.mux)) + return true; + } else { + if (reads_too_soon(scoreboard, inst, inst->alu.add.a.raddr)) + return true; + } } - if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 && - mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) { - return true; + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) { + if (devinfo->ver < 71) { + if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.b.mux)) + return true; + } else { + if (reads_too_soon(scoreboard, inst, inst->alu.add.b.raddr)) + return true; + } } } if (inst->alu.mul.op != V3D_QPU_M_NOP) { - if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 && - mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) { - return true; + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) { + if (devinfo->ver < 71) { + if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a.mux)) + return true; + } else { + if (reads_too_soon(scoreboard, inst, inst->alu.mul.a.raddr)) + return true; + } } - if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 && - mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) { - return true; + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) { + if (devinfo->ver < 71) { + if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b.mux)) + return true; + } else { + if (reads_too_soon(scoreboard, inst, inst->alu.mul.b.raddr)) + return true; + } } } @@ -572,45 +668,83 @@ writes_too_soon_after_write(const struct v3d_device_info *devinfo, v3d_qpu_writes_r4(devinfo, inst)) return true; + if (devinfo->ver == 42) + return false; + + /* Don't schedule anything that writes rf0 right after ldvary, since + * that would clash with the ldvary's delayed rf0 write (the exception + * is another ldvary, since its implicit rf0 write would also have + * one cycle of delay and would not clash). 
+ */ + if (scoreboard->last_ldvary_tick + 1 == scoreboard->tick && + (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) || + (v3d_qpu_writes_rf0_implicitly(devinfo, inst) && + !inst->sig.ldvary))) { + return true; + } + return false; } static bool -pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard, +scoreboard_is_locked(struct choose_scoreboard *scoreboard, + bool lock_scoreboard_on_first_thrsw) +{ + if (lock_scoreboard_on_first_thrsw) { + return scoreboard->first_thrsw_emitted && + scoreboard->tick - scoreboard->last_thrsw_tick >= 3; + } + + return scoreboard->last_thrsw_emitted && + scoreboard->tick - scoreboard->last_thrsw_tick >= 3; +} + +static bool +pixel_scoreboard_too_soon(struct v3d_compile *c, + struct choose_scoreboard *scoreboard, const struct v3d_qpu_instr *inst) { - return (scoreboard->tick == 0 && qpu_inst_is_tlb(inst)); + return qpu_inst_is_tlb(inst) && + !scoreboard_is_locked(scoreboard, + c->lock_scoreboard_on_first_thrsw); } static bool -qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst, +qpu_instruction_uses_rf(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst, uint32_t waddr) { if (inst->type != V3D_QPU_INSTR_TYPE_ALU) return false; - if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) && - inst->raddr_a == waddr) - return true; + if (devinfo->ver < 71) { + if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) && + inst->raddr_a == waddr) + return true; - if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) && - !inst->sig.small_imm && (inst->raddr_b == waddr)) - return true; + if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) && + !inst->sig.small_imm_b && (inst->raddr_b == waddr)) + return true; + } else { + if (v3d71_qpu_reads_raddr(inst, waddr)) + return true; + } return false; } static bool -mux_read_stalls(struct choose_scoreboard *scoreboard, - const struct v3d_qpu_instr *inst) +read_stalls(const struct v3d_device_info *devinfo, + struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst) { return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 && - qpu_instruction_uses_rf(inst, + qpu_instruction_uses_rf(devinfo, inst, scoreboard->last_stallable_sfu_reg); } /* We define a max schedule priority to allow negative priorities as result of - * substracting this max when an instruction stalls. So instructions that + * subtracting this max when an instruction stalls. So instructions that * stall have lower priority than regular instructions. */ #define MAX_SCHEDULE_PRIORITY 16 @@ -628,19 +762,32 @@ get_instruction_priority(const struct v3d_device_info *devinfo, return next_score; next_score++; + /* Empirical testing shows that using priorities to hide latency of + * TMU operations when scheduling QPU leads to slightly worse + * performance, even at 2 threads. We think this is because the thread + * switching is already quite effective at hiding latency and NIR + * scheduling (and possibly TMU pipelining too) are sufficient to hide + * TMU latency, so piling up on that here doesn't provide any benefits + * and instead may cause us to postpone critical paths that depend on + * the TMU results. + */ +#if 0 /* Schedule texture read results collection late to hide latency. */ if (v3d_qpu_waits_on_tmu(inst)) return next_score; next_score++; +#endif /* Default score for things that aren't otherwise special. */ baseline_score = next_score; next_score++; +#if 0 /* Schedule texture read setup early to hide their latency better. 
*/ if (v3d_qpu_writes_tmu(devinfo, inst)) return next_score; next_score++; +#endif /* We should increase the maximum if we assert here */ assert(next_score < MAX_SCHEDULE_PRIORITY); @@ -648,48 +795,59 @@ get_instruction_priority(const struct v3d_device_info *devinfo, return baseline_score; } -static bool -qpu_magic_waddr_is_periph(const struct v3d_device_info *devinfo, - enum v3d_qpu_waddr waddr) -{ - return (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) || - v3d_qpu_magic_waddr_is_sfu(waddr) || - v3d_qpu_magic_waddr_is_tlb(waddr) || - v3d_qpu_magic_waddr_is_vpm(waddr) || - v3d_qpu_magic_waddr_is_tsy(waddr)); -} +enum { + V3D_PERIPHERAL_VPM_READ = (1 << 0), + V3D_PERIPHERAL_VPM_WRITE = (1 << 1), + V3D_PERIPHERAL_VPM_WAIT = (1 << 2), + V3D_PERIPHERAL_SFU = (1 << 3), + V3D_PERIPHERAL_TMU_WRITE = (1 << 4), + V3D_PERIPHERAL_TMU_READ = (1 << 5), + V3D_PERIPHERAL_TMU_WAIT = (1 << 6), + V3D_PERIPHERAL_TMU_WRTMUC_SIG = (1 << 7), + V3D_PERIPHERAL_TSY = (1 << 8), + V3D_PERIPHERAL_TLB_READ = (1 << 9), + V3D_PERIPHERAL_TLB_WRITE = (1 << 10), +}; -static bool -qpu_accesses_peripheral(const struct v3d_device_info *devinfo, - const struct v3d_qpu_instr *inst) +static uint32_t +qpu_peripherals(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst) { - if (v3d_qpu_uses_vpm(inst)) - return true; + uint32_t result = 0; + if (v3d_qpu_reads_vpm(inst)) + result |= V3D_PERIPHERAL_VPM_READ; + if (v3d_qpu_writes_vpm(inst)) + result |= V3D_PERIPHERAL_VPM_WRITE; + if (v3d_qpu_waits_vpm(inst)) + result |= V3D_PERIPHERAL_VPM_WAIT; + + if (v3d_qpu_writes_tmu(devinfo, inst)) + result |= V3D_PERIPHERAL_TMU_WRITE; + if (inst->sig.ldtmu) + result |= V3D_PERIPHERAL_TMU_READ; + if (inst->sig.wrtmuc) + result |= V3D_PERIPHERAL_TMU_WRTMUC_SIG; + if (v3d_qpu_uses_sfu(inst)) - return true; + result |= V3D_PERIPHERAL_SFU; + + if (v3d_qpu_reads_tlb(inst)) + result |= V3D_PERIPHERAL_TLB_READ; + if (v3d_qpu_writes_tlb(inst)) + result |= V3D_PERIPHERAL_TLB_WRITE; if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { if (inst->alu.add.op != V3D_QPU_A_NOP && inst->alu.add.magic_write && - qpu_magic_waddr_is_periph(devinfo, inst->alu.add.waddr)) { - return true; + v3d_qpu_magic_waddr_is_tsy(inst->alu.add.waddr)) { + result |= V3D_PERIPHERAL_TSY; } if (inst->alu.add.op == V3D_QPU_A_TMUWT) - return true; - - if (inst->alu.mul.op != V3D_QPU_M_NOP && - inst->alu.mul.magic_write && - qpu_magic_waddr_is_periph(devinfo, inst->alu.mul.waddr)) { - return true; - } + result |= V3D_PERIPHERAL_TMU_WAIT; } - return (inst->sig.ldvpm || - inst->sig.ldtmu || - inst->sig.ldtlb || - inst->sig.ldtlbu || - inst->sig.wrtmuc); + return result; } static bool @@ -697,30 +855,82 @@ qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *a, const struct v3d_qpu_instr *b) { - const bool a_uses_peripheral = qpu_accesses_peripheral(devinfo, a); - const bool b_uses_peripheral = qpu_accesses_peripheral(devinfo, b); + const uint32_t a_peripherals = qpu_peripherals(devinfo, a); + const uint32_t b_peripherals = qpu_peripherals(devinfo, b); /* We can always do one peripheral access per instruction. */ - if (!a_uses_peripheral || !b_uses_peripheral) + if (util_bitcount(a_peripherals) + util_bitcount(b_peripherals) <= 1) return true; - if (devinfo->ver < 41) + /* V3D 4.x can't do more than one peripheral access except in a + * few cases: + */ + if (devinfo->ver == 42) { + /* WRTMUC signal with TMU register write (other than tmuc). 
*/ + if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG && + b_peripherals == V3D_PERIPHERAL_TMU_WRITE) { + return v3d_qpu_writes_tmu_not_tmuc(devinfo, b); + } + if (b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG && + a_peripherals == V3D_PERIPHERAL_TMU_WRITE) { + return v3d_qpu_writes_tmu_not_tmuc(devinfo, a); + } + + /* TMU read with VPM read/write. */ + if (a_peripherals == V3D_PERIPHERAL_TMU_READ && + (b_peripherals == V3D_PERIPHERAL_VPM_READ || + b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) { + return true; + } + if (b_peripherals == V3D_PERIPHERAL_TMU_READ && + (a_peripherals == V3D_PERIPHERAL_VPM_READ || + a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) { + return true; + } + return false; + } - /* V3D 4.1 and later allow TMU read along with a VPM read or write, and - * WRTMUC with a TMU magic register write (other than tmuc). - */ - if ((a->sig.ldtmu && v3d_qpu_reads_or_writes_vpm(b)) || - (b->sig.ldtmu && v3d_qpu_reads_or_writes_vpm(a))) { - return true; + /* V3D 7.x can't have more than one of these restricted peripherals */ + const uint32_t restricted = V3D_PERIPHERAL_TMU_WRITE | + V3D_PERIPHERAL_TMU_WRTMUC_SIG | + V3D_PERIPHERAL_TSY | + V3D_PERIPHERAL_TLB_READ | + V3D_PERIPHERAL_SFU | + V3D_PERIPHERAL_VPM_READ | + V3D_PERIPHERAL_VPM_WRITE; + + const uint32_t a_restricted = a_peripherals & restricted; + const uint32_t b_restricted = b_peripherals & restricted; + if (a_restricted && b_restricted) { + /* WRTMUC signal with TMU register write (other than tmuc) is + * allowed though. + */ + if (!((a_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG && + b_restricted == V3D_PERIPHERAL_TMU_WRITE && + v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) || + (b_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG && + a_restricted == V3D_PERIPHERAL_TMU_WRITE && + v3d_qpu_writes_tmu_not_tmuc(devinfo, a)))) { + return false; + } } - if ((a->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) || - (b->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(devinfo, a))) { - return true; + /* Only one TMU read per instruction */ + if ((a_peripherals & V3D_PERIPHERAL_TMU_READ) && + (b_peripherals & V3D_PERIPHERAL_TMU_READ)) { + return false; } - return false; + /* Only one TLB access per instruction */ + if ((a_peripherals & (V3D_PERIPHERAL_TLB_WRITE | + V3D_PERIPHERAL_TLB_READ)) && + (b_peripherals & (V3D_PERIPHERAL_TLB_WRITE | + V3D_PERIPHERAL_TLB_READ))) { + return false; + } + + return true; } /* Compute a bitmask of which rf registers are used between @@ -736,42 +946,67 @@ qpu_raddrs_used(const struct v3d_qpu_instr *a, uint64_t raddrs_used = 0; if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A)) raddrs_used |= (1ll << a->raddr_a); - if (!a->sig.small_imm && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B)) + if (!a->sig.small_imm_b && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B)) raddrs_used |= (1ll << a->raddr_b); if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A)) raddrs_used |= (1ll << b->raddr_a); - if (!b->sig.small_imm && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) + if (!b->sig.small_imm_b && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) raddrs_used |= (1ll << b->raddr_b); return raddrs_used; } -/* Take two instructions and attempt to merge their raddr fields - * into one merged instruction. Returns false if the two instructions - * access more than two different rf registers between them, or more - * than one rf register and one small immediate. +/* Takes two instructions and attempts to merge their raddr fields (including + * small immediates) into one merged instruction. 
For V3D 4.x, returns false + * if the two instructions access more than two different rf registers between + * them, or more than one rf register and one small immediate. For 7.x returns + * false if both instructions use small immediates. */ static bool qpu_merge_raddrs(struct v3d_qpu_instr *result, const struct v3d_qpu_instr *add_instr, - const struct v3d_qpu_instr *mul_instr) + const struct v3d_qpu_instr *mul_instr, + const struct v3d_device_info *devinfo) { + if (devinfo->ver >= 71) { + assert(add_instr->sig.small_imm_a + + add_instr->sig.small_imm_b <= 1); + assert(add_instr->sig.small_imm_c + + add_instr->sig.small_imm_d == 0); + assert(mul_instr->sig.small_imm_a + + mul_instr->sig.small_imm_b == 0); + assert(mul_instr->sig.small_imm_c + + mul_instr->sig.small_imm_d <= 1); + + result->sig.small_imm_a = add_instr->sig.small_imm_a; + result->sig.small_imm_b = add_instr->sig.small_imm_b; + result->sig.small_imm_c = mul_instr->sig.small_imm_c; + result->sig.small_imm_d = mul_instr->sig.small_imm_d; + + return (result->sig.small_imm_a + + result->sig.small_imm_b + + result->sig.small_imm_c + + result->sig.small_imm_d) <= 1; + } + + assert(devinfo->ver == 42); + uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr); int naddrs = util_bitcount64(raddrs_used); if (naddrs > 2) return false; - if ((add_instr->sig.small_imm || mul_instr->sig.small_imm)) { + if ((add_instr->sig.small_imm_b || mul_instr->sig.small_imm_b)) { if (naddrs > 1) return false; - if (add_instr->sig.small_imm && mul_instr->sig.small_imm) + if (add_instr->sig.small_imm_b && mul_instr->sig.small_imm_b) if (add_instr->raddr_b != mul_instr->raddr_b) return false; - result->sig.small_imm = true; - result->raddr_b = add_instr->sig.small_imm ? + result->sig.small_imm_b = true; + result->raddr_b = add_instr->sig.small_imm_b ? 
add_instr->raddr_b : mul_instr->raddr_b; } @@ -782,23 +1017,23 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result, raddrs_used &= ~(1ll << raddr_a); result->raddr_a = raddr_a; - if (!result->sig.small_imm) { + if (!result->sig.small_imm_b) { if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) && raddr_a == add_instr->raddr_b) { - if (add_instr->alu.add.a == V3D_QPU_MUX_B) - result->alu.add.a = V3D_QPU_MUX_A; - if (add_instr->alu.add.b == V3D_QPU_MUX_B && + if (add_instr->alu.add.a.mux == V3D_QPU_MUX_B) + result->alu.add.a.mux = V3D_QPU_MUX_A; + if (add_instr->alu.add.b.mux == V3D_QPU_MUX_B && v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) { - result->alu.add.b = V3D_QPU_MUX_A; + result->alu.add.b.mux = V3D_QPU_MUX_A; } } if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) && raddr_a == mul_instr->raddr_b) { - if (mul_instr->alu.mul.a == V3D_QPU_MUX_B) - result->alu.mul.a = V3D_QPU_MUX_A; - if (mul_instr->alu.mul.b == V3D_QPU_MUX_B && + if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_B) + result->alu.mul.a.mux = V3D_QPU_MUX_A; + if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_B && v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) { - result->alu.mul.b = V3D_QPU_MUX_A; + result->alu.mul.b.mux = V3D_QPU_MUX_A; } } } @@ -809,20 +1044,20 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result, result->raddr_b = raddr_b; if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) && raddr_b == add_instr->raddr_a) { - if (add_instr->alu.add.a == V3D_QPU_MUX_A) - result->alu.add.a = V3D_QPU_MUX_B; - if (add_instr->alu.add.b == V3D_QPU_MUX_A && + if (add_instr->alu.add.a.mux == V3D_QPU_MUX_A) + result->alu.add.a.mux = V3D_QPU_MUX_B; + if (add_instr->alu.add.b.mux == V3D_QPU_MUX_A && v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) { - result->alu.add.b = V3D_QPU_MUX_B; + result->alu.add.b.mux = V3D_QPU_MUX_B; } } if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) && raddr_b == mul_instr->raddr_a) { - if (mul_instr->alu.mul.a == V3D_QPU_MUX_A) - result->alu.mul.a = V3D_QPU_MUX_B; - if (mul_instr->alu.mul.b == V3D_QPU_MUX_A && + if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_A) + result->alu.mul.a.mux = V3D_QPU_MUX_B; + if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_A && v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) { - result->alu.mul.b = V3D_QPU_MUX_B; + result->alu.mul.b.mux = V3D_QPU_MUX_B; } } @@ -855,7 +1090,8 @@ add_op_as_mul_op(enum v3d_qpu_add_op op) } static void -qpu_convert_add_to_mul(struct v3d_qpu_instr *inst) +qpu_convert_add_to_mul(const struct v3d_device_info *devinfo, + struct v3d_qpu_instr *inst) { STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add)); assert(inst->alu.add.op != V3D_QPU_A_NOP); @@ -871,6 +1107,87 @@ qpu_convert_add_to_mul(struct v3d_qpu_instr *inst) inst->flags.ac = V3D_QPU_COND_NONE; inst->flags.apf = V3D_QPU_PF_NONE; inst->flags.auf = V3D_QPU_UF_NONE; + + inst->alu.mul.output_pack = inst->alu.add.output_pack; + + inst->alu.mul.a.unpack = inst->alu.add.a.unpack; + inst->alu.mul.b.unpack = inst->alu.add.b.unpack; + inst->alu.add.output_pack = V3D_QPU_PACK_NONE; + inst->alu.add.a.unpack = V3D_QPU_UNPACK_NONE; + inst->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; + + if (devinfo->ver >= 71) { + assert(!inst->sig.small_imm_c && !inst->sig.small_imm_d); + assert(inst->sig.small_imm_a + inst->sig.small_imm_b <= 1); + if (inst->sig.small_imm_a) { + inst->sig.small_imm_c = true; + inst->sig.small_imm_a = false; + } else if (inst->sig.small_imm_b) { + inst->sig.small_imm_d = true; + inst->sig.small_imm_b = false; + } + } +} + +static bool +can_do_mul_as_add(const struct v3d_device_info *devinfo, enum 
v3d_qpu_mul_op op) +{ + switch (op) { + case V3D_QPU_M_MOV: + case V3D_QPU_M_FMOV: + return devinfo->ver >= 71; + default: + return false; + } +} + +static enum v3d_qpu_mul_op +mul_op_as_add_op(enum v3d_qpu_mul_op op) +{ + switch (op) { + case V3D_QPU_M_MOV: + return V3D_QPU_A_MOV; + case V3D_QPU_M_FMOV: + return V3D_QPU_A_FMOV; + default: + unreachable("unexpected mov opcode"); + } +} + +static void +qpu_convert_mul_to_add(struct v3d_qpu_instr *inst) +{ + STATIC_ASSERT(sizeof(inst->alu.add) == sizeof(inst->alu.mul)); + assert(inst->alu.mul.op != V3D_QPU_M_NOP); + assert(inst->alu.add.op == V3D_QPU_A_NOP); + + memcpy(&inst->alu.add, &inst->alu.mul, sizeof(inst->alu.add)); + inst->alu.add.op = mul_op_as_add_op(inst->alu.mul.op); + inst->alu.mul.op = V3D_QPU_M_NOP; + + inst->flags.ac = inst->flags.mc; + inst->flags.apf = inst->flags.mpf; + inst->flags.auf = inst->flags.muf; + inst->flags.mc = V3D_QPU_COND_NONE; + inst->flags.mpf = V3D_QPU_PF_NONE; + inst->flags.muf = V3D_QPU_UF_NONE; + + inst->alu.add.output_pack = inst->alu.mul.output_pack; + inst->alu.add.a.unpack = inst->alu.mul.a.unpack; + inst->alu.add.b.unpack = inst->alu.mul.b.unpack; + inst->alu.mul.output_pack = V3D_QPU_PACK_NONE; + inst->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE; + inst->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; + + assert(!inst->sig.small_imm_a && !inst->sig.small_imm_b); + assert(inst->sig.small_imm_c + inst->sig.small_imm_d <= 1); + if (inst->sig.small_imm_c) { + inst->sig.small_imm_a = true; + inst->sig.small_imm_c = false; + } else if (inst->sig.small_imm_d) { + inst->sig.small_imm_b = true; + inst->sig.small_imm_d = false; + } } static bool @@ -909,20 +1226,20 @@ qpu_merge_inst(const struct v3d_device_info *devinfo, else if (a->alu.mul.op == V3D_QPU_M_NOP && can_do_add_as_mul(b->alu.add.op)) { mul_inst = *b; - qpu_convert_add_to_mul(&mul_inst); + qpu_convert_add_to_mul(devinfo, &mul_inst); merge.alu.mul = mul_inst.alu.mul; - merge.flags.mc = b->flags.ac; - merge.flags.mpf = b->flags.apf; - merge.flags.muf = b->flags.auf; + merge.flags.mc = mul_inst.flags.mc; + merge.flags.mpf = mul_inst.flags.mpf; + merge.flags.muf = mul_inst.flags.muf; add_instr = a; mul_instr = &mul_inst; } else if (a->alu.mul.op == V3D_QPU_M_NOP && can_do_add_as_mul(a->alu.add.op)) { mul_inst = *a; - qpu_convert_add_to_mul(&mul_inst); + qpu_convert_add_to_mul(devinfo, &mul_inst); merge = mul_inst; merge.alu.add = b->alu.add; @@ -938,22 +1255,62 @@ qpu_merge_inst(const struct v3d_device_info *devinfo, } } + struct v3d_qpu_instr add_inst; if (b->alu.mul.op != V3D_QPU_M_NOP) { - if (a->alu.mul.op != V3D_QPU_M_NOP) - return false; - merge.alu.mul = b->alu.mul; + if (a->alu.mul.op == V3D_QPU_M_NOP) { + merge.alu.mul = b->alu.mul; - merge.flags.mc = b->flags.mc; - merge.flags.mpf = b->flags.mpf; - merge.flags.muf = b->flags.muf; + merge.flags.mc = b->flags.mc; + merge.flags.mpf = b->flags.mpf; + merge.flags.muf = b->flags.muf; - mul_instr = b; - add_instr = a; + mul_instr = b; + add_instr = a; + } + /* If a's mul op is used but its add op is not, then see if we + * can convert either a's mul op or b's mul op to an add op + * so we can merge. 
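+         * (Only MOV and FMOV can migrate from the mul ALU to the add ALU
+         * like this, and only on V3D 7.1+; see can_do_mul_as_add and
+         * mul_op_as_add_op above.)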
+                 */
+                else if (a->alu.add.op == V3D_QPU_A_NOP &&
+                         can_do_mul_as_add(devinfo, b->alu.mul.op)) {
+                        add_inst = *b;
+                        qpu_convert_mul_to_add(&add_inst);
+
+                        merge.alu.add = add_inst.alu.add;
+
+                        merge.flags.ac = add_inst.flags.ac;
+                        merge.flags.apf = add_inst.flags.apf;
+                        merge.flags.auf = add_inst.flags.auf;
+
+                        mul_instr = a;
+                        add_instr = &add_inst;
+                } else if (a->alu.add.op == V3D_QPU_A_NOP &&
+                           can_do_mul_as_add(devinfo, a->alu.mul.op)) {
+                        add_inst = *a;
+                        qpu_convert_mul_to_add(&add_inst);
+
+                        merge = add_inst;
+                        merge.alu.mul = b->alu.mul;
+
+                        merge.flags.mc = b->flags.mc;
+                        merge.flags.mpf = b->flags.mpf;
+                        merge.flags.muf = b->flags.muf;
+
+                        mul_instr = b;
+                        add_instr = &add_inst;
+                } else {
+                        return false;
+                }
         }
 
+        /* V3D 4.x and earlier use muxes to select the inputs for the ALUs and
+         * they have restrictions on the number of raddrs that can be addressed
+         * in a single instruction. In V3D 7.x, we don't have that restriction,
+         * but we are still limited to a single small immediate per instruction.
+         */
         if (add_instr && mul_instr &&
-            !qpu_merge_raddrs(&merge, add_instr, mul_instr)) {
-                return false;
+            !qpu_merge_raddrs(&merge, add_instr, mul_instr, devinfo)) {
+                return false;
         }
 
         merge.sig.thrsw |= b->sig.thrsw;
@@ -964,7 +1321,6 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
         merge.sig.ldtmu |= b->sig.ldtmu;
         merge.sig.ldvary |= b->sig.ldvary;
         merge.sig.ldvpm |= b->sig.ldvpm;
-        merge.sig.small_imm |= b->sig.small_imm;
         merge.sig.ldtlb |= b->sig.ldtlb;
         merge.sig.ldtlbu |= b->sig.ldtlbu;
         merge.sig.ucb |= b->sig.ucb;
@@ -1047,24 +1403,25 @@ retry:
                  * regfile A or B that was written to by the previous
                  * instruction."
                  */
-                if (reads_too_soon_after_write(scoreboard, n->inst))
+                if (reads_too_soon_after_write(c->devinfo, scoreboard, n->inst))
                         continue;
 
                 if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst))
                         continue;
 
-                /* "A scoreboard wait must not occur in the first two
-                 * instructions of a fragment shader. This is either the
-                 * explicit Wait for Scoreboard signal or an implicit wait
-                 * with the first tile-buffer read or write instruction."
+                /* "Before doing a TLB access a scoreboard wait must have been
+                 * done. This happens either on the first or last thread
+                 * switch, depending on a setting (scb_wait_on_first_thrsw) in
+                 * the shader state."
                  */
-                if (pixel_scoreboard_too_soon(scoreboard, inst))
+                if (pixel_scoreboard_too_soon(c, scoreboard, inst))
                         continue;
 
-                /* ldunif and ldvary both write r5, but ldunif does so a tick
-                 * sooner. If the ldvary's r5 wasn't used, then ldunif might
+                /* ldunif and ldvary both write the same register (r5 for v42
+                 * and below, rf0 for v71), but ldunif does so a tick sooner.
+                 * If the ldvary's register wasn't used, then ldunif might
                  * otherwise get scheduled so ldunif and ldvary try to update
-                 * r5 in the same tick.
+                 * the register in the same tick.
                  */
                 if ((inst->sig.ldunif || inst->sig.ldunifa) &&
                     scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
@@ -1131,24 +1488,54 @@ retry:
                         continue;
                 }
 
-                        /* Don't merge in something that will lock the TLB.
-                         * Hopwefully what we have in inst will release some
-                         * other instructions, allowing us to delay the
-                         * TLB-locking instruction until later.
+                        /* Don't merge TLB instructions before we have acquired
+                         * the scoreboard lock.
                         */
-                        if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst))
+                        if (pixel_scoreboard_too_soon(c, scoreboard, inst))
                                 continue;
 
-                        /* When we succesfully pair up an ldvary we then try
+                        /* When we successfully pair up an ldvary we then try
                          * to merge it into the previous instruction if
                          * possible to improve pipelining. Don't pick up the
                          * ldvary now if the follow-up fixup would place
                          * it in the delay slots of a thrsw, which is not
                          * allowed and would prevent the fixup from being
-                         * successul.
+                         * successful. In V3D 7.x we can allow this to happen
+                         * as long as it is not the last delay slot.
                          */
-                        if (inst->sig.ldvary &&
-                            scoreboard->last_thrsw_tick + 2 >= scoreboard->tick - 1) {
+                        if (inst->sig.ldvary) {
+                                if (c->devinfo->ver == 42 &&
+                                    scoreboard->last_thrsw_tick + 2 >=
+                                    scoreboard->tick - 1) {
+                                        continue;
+                                }
+                                if (c->devinfo->ver >= 71 &&
+                                    scoreboard->last_thrsw_tick + 2 ==
+                                    scoreboard->tick - 1) {
+                                        continue;
+                                }
+                        }
+
+                        /* We can emit a new tmu lookup with a previous ldtmu
+                         * if doing this would free just enough space in the
+                         * TMU output fifo so we don't overflow, however, this
+                         * is only safe if the ldtmu cannot stall.
+                         *
+                         * A ldtmu can stall if it is not the first following a
+                         * thread switch and corresponds to the first word of a
+                         * read request.
+                         *
+                         * FIXME: For now we forbid pairing up a new lookup
+                         * with a previous ldtmu that is not the first after a
+                         * thrsw if that could overflow the TMU output fifo
+                         * regardless of whether the ldtmu is reading the first
+                         * word of a TMU result or not, since we don't track
+                         * this aspect in the compiler yet.
+                         */
+                        if (prev_inst->inst->qpu.sig.ldtmu &&
+                            !scoreboard->first_ldtmu_after_thrsw &&
+                            (scoreboard->pending_ldtmu_count +
+                             n->inst->ldtmu_count > 16 / c->threads)) {
                                 continue;
                         }
 
@@ -1161,7 +1548,7 @@ retry:
 
                 int prio = get_instruction_priority(c->devinfo, inst);
 
-                if (mux_read_stalls(scoreboard, inst)) {
+                if (read_stalls(c->devinfo, scoreboard, inst)) {
                         /* Don't merge an instruction that stalls */
                         if (prev_inst)
                                 continue;
@@ -1225,7 +1612,7 @@ update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
 {
         if (v3d_qpu_magic_waddr_is_sfu(waddr))
                 scoreboard->last_magic_sfu_write_tick = scoreboard->tick;
-        else if (devinfo->ver >= 40 && waddr == V3D_QPU_WADDR_UNIFA)
+        else if (waddr == V3D_QPU_WADDR_UNIFA)
                 scoreboard->last_unifa_write_tick = scoreboard->tick;
 }
 
@@ -1240,10 +1627,87 @@ update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard,
 }
 
 static void
+update_scoreboard_tmu_tracking(struct choose_scoreboard *scoreboard,
+                               const struct qinst *inst)
+{
+        /* Track if we have seen any ldtmu after the last thread switch */
+        if (scoreboard->tick == scoreboard->last_thrsw_tick + 2)
+                scoreboard->first_ldtmu_after_thrsw = true;
+
+        /* Track the number of pending ldtmu instructions for outstanding
+         * TMU lookups.
+ */ + scoreboard->pending_ldtmu_count += inst->ldtmu_count; + if (inst->qpu.sig.ldtmu) { + assert(scoreboard->pending_ldtmu_count > 0); + scoreboard->pending_ldtmu_count--; + scoreboard->first_ldtmu_after_thrsw = false; + } +} + +static void +set_has_rf0_flops_conflict(struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst, + const struct v3d_device_info *devinfo) +{ + if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick && + v3d_qpu_sig_writes_address(devinfo, &inst->sig) && + !inst->sig_magic) { + scoreboard->has_rf0_flops_conflict = true; + } +} + +static void +update_scoreboard_for_rf0_flops(struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst, + const struct v3d_device_info *devinfo) +{ + if (devinfo->ver < 71) + return; + + /* Thread switch restrictions: + * + * At the point of a thread switch or thread end (when the actual + * thread switch or thread end happens, not when the signalling + * instruction is processed): + * + * - If the most recent write to rf0 was from a ldunif, ldunifa, or + * ldvary instruction in which another signal also wrote to the + * register file, and the final instruction of the thread section + * contained a signal which wrote to the register file, then the + * value of rf0 is undefined at the start of the new section + * + * Here we use the scoreboard to track if our last rf0 implicit write + * happens at the same time that another signal writes the register + * file (has_rf0_flops_conflict). We will use that information when + * scheduling thrsw instructions to avoid putting anything in their + * last delay slot which has a signal that writes to the register file. + */ + + /* Reset tracking if we have an explicit rf0 write or we are starting + * a new thread section. + */ + if (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) || + scoreboard->tick - scoreboard->last_thrsw_tick == 3) { + scoreboard->last_implicit_rf0_write_tick = -10; + scoreboard->has_rf0_flops_conflict = false; + } + + if (v3d_qpu_writes_rf0_implicitly(devinfo, inst)) { + scoreboard->last_implicit_rf0_write_tick = inst->sig.ldvary ? 
+ scoreboard->tick + 1 : scoreboard->tick; + } + + set_has_rf0_flops_conflict(scoreboard, inst, devinfo); +} + +static void update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, - const struct v3d_qpu_instr *inst, + const struct qinst *qinst, const struct v3d_device_info *devinfo) { + const struct v3d_qpu_instr *inst = &qinst->qpu; + if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) return; @@ -1271,11 +1735,18 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, } } + if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && inst->sig_magic) { + update_scoreboard_for_magic_waddr(scoreboard, + inst->sig_addr, + devinfo); + } + if (inst->sig.ldvary) scoreboard->last_ldvary_tick = scoreboard->tick; - if (qpu_inst_is_tlb(inst)) - scoreboard->tlb_locked = true; + update_scoreboard_for_rf0_flops(scoreboard, inst, devinfo); + + update_scoreboard_tmu_tracking(scoreboard, qinst); } static void @@ -1352,23 +1823,25 @@ instruction_latency(const struct v3d_device_info *devinfo, after_inst->type != V3D_QPU_INSTR_TYPE_ALU) return latency; - if (before_inst->alu.add.magic_write) { + if (v3d_qpu_instr_is_sfu(before_inst)) + return 2; + + if (before_inst->alu.add.op != V3D_QPU_A_NOP && + before_inst->alu.add.magic_write) { latency = MAX2(latency, magic_waddr_latency(devinfo, before_inst->alu.add.waddr, after_inst)); } - if (before_inst->alu.mul.magic_write) { + if (before_inst->alu.mul.op != V3D_QPU_M_NOP && + before_inst->alu.mul.magic_write) { latency = MAX2(latency, magic_waddr_latency(devinfo, before_inst->alu.mul.waddr, after_inst)); } - if (v3d_qpu_instr_is_sfu(before_inst)) - return 2; - return latency; } @@ -1437,7 +1910,7 @@ insert_scheduled_instruction(struct v3d_compile *c, { list_addtail(&inst->link, &block->instructions); - update_scoreboard_for_chosen(scoreboard, &inst->qpu, c->devinfo); + update_scoreboard_for_chosen(scoreboard, inst, c->devinfo); c->qpu_inst_count++; scoreboard->tick++; } @@ -1464,16 +1937,13 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, { const struct v3d_qpu_instr *inst = &qinst->qpu; - /* Only TLB Z writes are prohibited in the last slot, but we don't - * have those flagged so prohibit all TLB ops for now. - */ - if (slot == 2 && qpu_inst_is_tlb(inst)) + if (slot == 2 && qinst->is_tlb_z_write) return false; if (slot > 0 && qinst->uniform != ~0) return false; - if (v3d_qpu_uses_vpm(inst)) + if (c->devinfo->ver == 42 && v3d_qpu_waits_vpm(inst)) return false; if (inst->sig.ldvary) @@ -1481,36 +1951,64 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { /* GFXH-1625: TMUWT not allowed in the final instruction. */ - if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT) + if (c->devinfo->ver == 42 && slot == 2 && + inst->alu.add.op == V3D_QPU_A_TMUWT) { return false; + } - /* No writing physical registers at the end. */ - if (!inst->alu.add.magic_write || - !inst->alu.mul.magic_write) { - return false; + if (c->devinfo->ver == 42) { + /* No writing physical registers at the end. 
*/ + bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP; + bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP; + if ((!add_is_nop && !inst->alu.add.magic_write) || + (!mul_is_nop && !inst->alu.mul.magic_write)) { + return false; + } + + if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) && + !inst->sig_magic) { + return false; + } } - if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF) - return false; + if (c->devinfo->ver >= 71) { + /* The thread end instruction must not write to the + * register file via the add/mul ALUs. + */ + if (slot == 0 && + (!inst->alu.add.magic_write || + !inst->alu.mul.magic_write)) { + return false; + } + } - /* RF0-2 might be overwritten during the delay slots by - * fragment shader setup. - */ - if (inst->raddr_a < 3 && - (inst->alu.add.a == V3D_QPU_MUX_A || - inst->alu.add.b == V3D_QPU_MUX_A || - inst->alu.mul.a == V3D_QPU_MUX_A || - inst->alu.mul.b == V3D_QPU_MUX_A)) { - return false; + if (c->devinfo->ver == 42) { + /* RF0-2 might be overwritten during the delay slots by + * fragment shader setup. + */ + if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A)) + return false; + + if (inst->raddr_b < 3 && + !inst->sig.small_imm_b && + v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) { + return false; + } } - if (inst->raddr_b < 3 && - !inst->sig.small_imm && - (inst->alu.add.a == V3D_QPU_MUX_B || - inst->alu.add.b == V3D_QPU_MUX_B || - inst->alu.mul.a == V3D_QPU_MUX_B || - inst->alu.mul.b == V3D_QPU_MUX_B)) { - return false; + if (c->devinfo->ver >= 71) { + /* RF2-3 might be overwritten during the delay slots by + * fragment shader setup. + */ + if (v3d71_qpu_reads_raddr(inst, 2) || + v3d71_qpu_reads_raddr(inst, 3)) { + return false; + } + + if (v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 2) || + v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 3)) { + return false; + } } } @@ -1526,6 +2024,7 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, */ static bool qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c, + struct choose_scoreboard *scoreboard, const struct qinst *qinst, uint32_t slot) { @@ -1533,15 +2032,19 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c, * thread. The simulator complains for safety, though it * would only occur for dead code in our case. */ - if (slot > 0 && - qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && - (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) || - v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) { - return false; + if (slot > 0) { + if (c->devinfo->ver == 42 && v3d_qpu_instr_is_legacy_sfu(&qinst->qpu)) + return false; + if (c->devinfo->ver >= 71 && v3d_qpu_instr_is_sfu(&qinst->qpu)) + return false; } - if (slot > 0 && qinst->qpu.sig.ldvary) - return false; + if (qinst->qpu.sig.ldvary) { + if (c->devinfo->ver == 42 && slot > 0) + return false; + if (c->devinfo->ver >= 71 && slot == 2) + return false; + } /* unifa and the following 3 instructions can't overlap a * thread switch/end. 
The docs further clarify that this means @@ -1560,6 +2063,17 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c, if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu)) return false; + /* See comment when we set has_rf0_flops_conflict for details */ + if (c->devinfo->ver >= 71 && + slot == 2 && + v3d_qpu_sig_writes_address(c->devinfo, &qinst->qpu.sig) && + !qinst->qpu.sig_magic) { + if (scoreboard->has_rf0_flops_conflict) + return false; + if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick) + return false; + } + return true; } @@ -1579,7 +2093,7 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c, assert(slot <= 2); /* We merge thrsw instructions back into the instruction stream - * manually, so any instructions scheduled after a thrsw shold be + * manually, so any instructions scheduled after a thrsw should be * in the actual delay slots and not in the same slot as the thrsw. */ assert(slot >= 1); @@ -1592,7 +2106,7 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c, * also apply to instructions scheduled after the thrsw that we want * to place in its delay slots. */ - if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot)) + if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard, qinst, slot)) return false; /* TLB access is disallowed until scoreboard wait is executed, which @@ -1648,6 +2162,14 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c, if (v3d_qpu_writes_flags(&qinst->qpu)) return false; + /* TSY sync ops materialize at the point of the next thread switch, + * therefore, if we have a TSY sync right after a thread switch, we + * cannot place it in its delay slots, or we would be moving the sync + * to the thrsw before it instead. + */ + if (qinst->qpu.alu.add.op == V3D_QPU_A_BARRIERID) + return false; + return true; } @@ -1656,15 +2178,11 @@ valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard struct qinst *qinst, int instructions_in_sequence, bool is_thrend) { - /* No emitting our thrsw while the previous thrsw hasn't happened yet. */ - if (scoreboard->last_thrsw_tick + 3 > - scoreboard->tick - instructions_in_sequence) { - return false; - } - for (int slot = 0; slot < instructions_in_sequence; slot++) { - if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot)) + if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard, + qinst, slot)) { return false; + } if (is_thrend && !qpu_inst_valid_in_thrend_slot(c, qinst, slot)) { @@ -1714,26 +2232,77 @@ emit_thrsw(struct v3d_compile *c, /* Find how far back into previous instructions we can put the THRSW. */ int slots_filled = 0; + int invalid_sig_count = 0; + int invalid_seq_count = 0; + bool last_thrsw_after_invalid_ok = false; struct qinst *merge_inst = NULL; vir_for_each_inst_rev(prev_inst, block) { - struct v3d_qpu_sig sig = prev_inst->qpu.sig; - sig.thrsw = true; - uint32_t packed_sig; - - if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) + /* No emitting our thrsw while the previous thrsw hasn't + * happened yet. + */ + if (scoreboard->last_thrsw_tick + 3 > + scoreboard->tick - (slots_filled + 1)) { break; + } + if (!valid_thrsw_sequence(c, scoreboard, prev_inst, slots_filled + 1, is_thrend)) { - break; + /* Even if the current sequence isn't valid, we may + * be able to get a valid sequence by trying to move the + * thrsw earlier, so keep going. 
+ */ + invalid_seq_count++; + goto cont_block; + } + + struct v3d_qpu_sig sig = prev_inst->qpu.sig; + sig.thrsw = true; + uint32_t packed_sig; + if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) { + /* If we can't merge the thrsw here because of signal + * incompatibility, keep going, we might be able to + * merge it in an earlier instruction. + */ + invalid_sig_count++; + goto cont_block; } + /* For last thrsw we need 2 consecutive slots that are + * thrsw compatible, so if we have previously jumped over + * an incompatible signal, flag that we have found the first + * valid slot here and keep going. + */ + if (inst->is_last_thrsw && invalid_sig_count > 0 && + !last_thrsw_after_invalid_ok) { + last_thrsw_after_invalid_ok = true; + invalid_sig_count++; + goto cont_block; + } + + /* We can merge the thrsw in this instruction */ + last_thrsw_after_invalid_ok = false; + invalid_sig_count = 0; + invalid_seq_count = 0; merge_inst = prev_inst; + +cont_block: if (++slots_filled == 3) break; } + /* If we jumped over a signal incompatibility and did not manage to + * merge the thrsw in the end, we need to adjust slots filled to match + * the last valid merge point. + */ + assert((invalid_sig_count == 0 && invalid_seq_count == 0) || + slots_filled >= invalid_sig_count + invalid_seq_count); + if (invalid_sig_count > 0) + slots_filled -= invalid_sig_count; + if (invalid_seq_count > 0) + slots_filled -= invalid_seq_count; + bool needs_free = false; if (merge_inst) { merge_inst->qpu.sig.thrsw = true; @@ -1747,6 +2316,8 @@ emit_thrsw(struct v3d_compile *c, merge_inst = inst; } + scoreboard->first_thrsw_emitted = true; + /* If we're emitting the last THRSW (other than program end), then * signal that to the HW by emitting two THRSWs in a row. */ @@ -1758,6 +2329,7 @@ emit_thrsw(struct v3d_compile *c, struct qinst *second_inst = (struct qinst *)merge_inst->link.next; second_inst->qpu.sig.thrsw = true; + scoreboard->last_thrsw_emitted = true; } /* Make sure the thread end executes within the program lifespan */ @@ -1811,10 +2383,11 @@ emit_branch(struct v3d_compile *c, assert(scoreboard->last_branch_tick + 3 < branch_tick); assert(scoreboard->last_unifa_write_tick + 3 < branch_tick); - /* Can't place a branch with msfign != 0 and cond != 0,2,3 after + /* V3D 4.x can't place a branch with msfign != 0 and cond != 0,2,3 after * setmsf. */ bool is_safe_msf_branch = + c->devinfo->ver >= 71 || inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE || inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS || inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 || @@ -1851,6 +2424,14 @@ emit_branch(struct v3d_compile *c, break; } + /* Do not move up a branch if it can disrupt an ldvary sequence + * as that can cause stomping of the r5 register. + */ + if (scoreboard->last_ldvary_tick + 2 >= + branch_tick - slots_filled) { + break; + } + /* Can't move a conditional branch before the instruction * that writes the flags for its condition. 
*/ @@ -1890,46 +2471,72 @@ emit_branch(struct v3d_compile *c, } static bool -alu_reads_register(struct v3d_qpu_instr *inst, +alu_reads_register(const struct v3d_device_info *devinfo, + struct v3d_qpu_instr *inst, bool add, bool magic, uint32_t index) { uint32_t num_src; - enum v3d_qpu_mux mux_a, mux_b; - - if (add) { + if (add) num_src = v3d_qpu_add_op_num_src(inst->alu.add.op); - mux_a = inst->alu.add.a; - mux_b = inst->alu.add.b; - } else { + else num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op); - mux_a = inst->alu.mul.a; - mux_b = inst->alu.mul.b; - } - for (int i = 0; i < num_src; i++) { - if (magic) { - if (i == 0 && mux_a == index) - return true; - if (i == 1 && mux_b == index) - return true; + if (devinfo->ver == 42) { + enum v3d_qpu_mux mux_a, mux_b; + if (add) { + mux_a = inst->alu.add.a.mux; + mux_b = inst->alu.add.b.mux; } else { - if (i == 0 && mux_a == V3D_QPU_MUX_A && - inst->raddr_a == index) { - return true; - } - if (i == 0 && mux_a == V3D_QPU_MUX_B && - inst->raddr_b == index) { - return true; - } - if (i == 1 && mux_b == V3D_QPU_MUX_A && - inst->raddr_a == index) { - return true; - } - if (i == 1 && mux_b == V3D_QPU_MUX_B && - inst->raddr_b == index) { - return true; + mux_a = inst->alu.mul.a.mux; + mux_b = inst->alu.mul.b.mux; + } + + for (int i = 0; i < num_src; i++) { + if (magic) { + if (i == 0 && mux_a == index) + return true; + if (i == 1 && mux_b == index) + return true; + } else { + if (i == 0 && mux_a == V3D_QPU_MUX_A && + inst->raddr_a == index) { + return true; + } + if (i == 0 && mux_a == V3D_QPU_MUX_B && + inst->raddr_b == index) { + return true; + } + if (i == 1 && mux_b == V3D_QPU_MUX_A && + inst->raddr_a == index) { + return true; + } + if (i == 1 && mux_b == V3D_QPU_MUX_B && + inst->raddr_b == index) { + return true; + } } } + + return false; + } + + assert(devinfo->ver >= 71); + assert(!magic); + + uint32_t raddr_a, raddr_b; + if (add) { + raddr_a = inst->alu.add.a.raddr; + raddr_b = inst->alu.add.b.raddr; + } else { + raddr_a = inst->alu.mul.a.raddr; + raddr_b = inst->alu.mul.b.raddr; + } + + for (int i = 0; i < num_src; i++) { + if (i == 0 && raddr_a == index) + return true; + if (i == 1 && raddr_b == index) + return true; } return false; @@ -1964,7 +2571,9 @@ fixup_pipelined_ldvary(struct v3d_compile *c, struct qblock *block, struct v3d_qpu_instr *inst) { - /* We only call this if we have successfuly merged an ldvary into a + const struct v3d_device_info *devinfo = c->devinfo; + + /* We only call this if we have successfully merged an ldvary into a * previous instruction. */ assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); @@ -1976,9 +2585,20 @@ fixup_pipelined_ldvary(struct v3d_compile *c, * the ldvary destination, if it does, then moving the ldvary before * it would overwrite it. */ - if (alu_reads_register(inst, true, ldvary_magic, ldvary_index)) + if (alu_reads_register(devinfo, inst, true, ldvary_magic, ldvary_index)) return false; - if (alu_reads_register(inst, false, ldvary_magic, ldvary_index)) + if (alu_reads_register(devinfo, inst, false, ldvary_magic, ldvary_index)) + return false; + + /* The implicit ldvary destination may not be written to by a signal + * in the instruction following ldvary. Since we are planning to move + * ldvary to the previous instruction, this means we need to check if + * the current instruction has any other signal that could create this + * conflict. The only other signal that can write to the implicit + * ldvary destination that is compatible with ldvary in the same + * instruction is ldunif. 
+         */
+        if (inst->sig.ldunif)
                 return false;
 
         /* The previous instruction can't write to the same destination as the
@@ -2003,7 +2623,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
         }
 
         /* The previous instruction cannot have a conflicting signal */
-        if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig))
+        if (v3d_qpu_sig_writes_address(devinfo, &prev->qpu.sig))
+                return false;
+
+        uint32_t sig;
+        struct v3d_qpu_sig new_sig = prev->qpu.sig;
+        new_sig.ldvary = true;
+        if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig))
                 return false;
 
         /* The previous instruction cannot use flags since ldvary uses the
@@ -2016,9 +2642,13 @@
 
         /* We can't put an ldvary in the delay slots of a thrsw. We should've
          * prevented this when pairing up the ldvary with another instruction
-         * and flagging it for a fixup.
+         * and flagging it for a fixup. In V3D 7.x this is limited only to the
+         * second delay slot.
          */
-        assert(scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1);
+        assert((devinfo->ver == 42 &&
+                scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1) ||
+               (devinfo->ver >= 71 &&
+                scoreboard->last_thrsw_tick + 2 != scoreboard->tick - 1));
 
         /* Move the ldvary to the previous instruction and remove it from the
          * current one.
@@ -2032,14 +2662,25 @@
         inst->sig_magic = false;
         inst->sig_addr = 0;
 
-        /* By moving ldvary to the previous instruction we make it update
-         * r5 in the current one, so nothing else in it should write r5.
+        /* Update rf0 flops tracking for new ldvary delayed rf0 write tick */
+        if (devinfo->ver >= 71) {
+                scoreboard->last_implicit_rf0_write_tick = scoreboard->tick;
+                set_has_rf0_flops_conflict(scoreboard, inst, devinfo);
+        }
+
+        /* By moving ldvary to the previous instruction we make it update r5
+         * (rf0 for ver >= 71) in the current one, so nothing else in it
+         * should write this register.
+         *
          * This should've been prevented by our dependency tracking, which
         * would not allow ldvary to be paired up with an instruction that
-         * writes r5 (since our dependency tracking doesn't know that the
-         * ldvary write r5 happens in the next instruction).
+         * writes r5/rf0 (since our dependency tracking doesn't know that the
+         * ldvary write to r5/rf0 happens in the next instruction).
*/ - assert(!v3d_qpu_writes_r5(c->devinfo, inst)); + assert(!v3d_qpu_writes_r5(devinfo, inst)); + assert(devinfo->ver == 42 || + (!v3d_qpu_writes_rf0_implicitly(devinfo, inst) && + !v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0))); return true; } @@ -2102,6 +2743,9 @@ schedule_instructions(struct v3d_compile *c, merge->inst->uniform; } + chosen->inst->ldtmu_count += + merge->inst->ldtmu_count; + if (debug) { fprintf(stderr, "t=%4d: merging: ", time); @@ -2127,7 +2771,7 @@ schedule_instructions(struct v3d_compile *c, } } } - if (mux_read_stalls(scoreboard, inst)) + if (read_stalls(c->devinfo, scoreboard, inst)) c->qpu_inst_stalled_count++; } @@ -2351,6 +2995,8 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c) scoreboard.last_branch_tick = -10; scoreboard.last_setmsf_tick = -10; scoreboard.last_stallable_sfu_tick = -10; + scoreboard.first_ldtmu_after_thrsw = true; + scoreboard.last_implicit_rf0_write_tick = - 10; if (debug) { fprintf(stderr, "Pre-schedule instructions\n"); diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c index ec9ed66650c..538b247e3e0 100644 --- a/src/broadcom/compiler/qpu_validate.c +++ b/src/broadcom/compiler/qpu_validate.c @@ -41,6 +41,7 @@ struct v3d_qpu_validate_state { int last_sfu_write; int last_branch_ip; int last_thrsw_ip; + int first_tlb_z_write; /* Set when we've found the last-THRSW signal, or if we were started * in single-segment mode. @@ -110,11 +111,58 @@ static void qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) { const struct v3d_device_info *devinfo = state->c->devinfo; + + if (qinst->is_tlb_z_write && state->ip < state->first_tlb_z_write) + state->first_tlb_z_write = state->ip; + const struct v3d_qpu_instr *inst = &qinst->qpu; + if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH && + state->first_tlb_z_write >= 0 && + state->ip > state->first_tlb_z_write && + inst->branch.msfign != V3D_QPU_MSFIGN_NONE && + inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS && + inst->branch.cond != V3D_QPU_BRANCH_COND_A0 && + inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) { + fail_instr(state, "Implicit branch MSF read after TLB Z write"); + } + if (inst->type != V3D_QPU_INSTR_TYPE_ALU) return; + if (inst->alu.add.op == V3D_QPU_A_SETMSF && + state->first_tlb_z_write >= 0 && + state->ip > state->first_tlb_z_write) { + fail_instr(state, "SETMSF after TLB Z write"); + } + + if (state->first_tlb_z_write >= 0 && + state->ip > state->first_tlb_z_write && + inst->alu.add.op == V3D_QPU_A_MSF) { + fail_instr(state, "MSF read after TLB Z write"); + } + + if (devinfo->ver < 71) { + if (inst->sig.small_imm_a || inst->sig.small_imm_c || + inst->sig.small_imm_d) { + fail_instr(state, "small imm a/c/d added after V3D 7.1"); + } + } else { + if ((inst->sig.small_imm_a || inst->sig.small_imm_b) && + !vir_is_add(qinst)) { + fail_instr(state, "small imm a/b used but no ADD inst"); + } + if ((inst->sig.small_imm_c || inst->sig.small_imm_d) && + !vir_is_mul(qinst)) { + fail_instr(state, "small imm c/d used but no MUL inst"); + } + if (inst->sig.small_imm_a + inst->sig.small_imm_b + + inst->sig.small_imm_c + inst->sig.small_imm_d > 1) { + fail_instr(state, "only one small immediate can be " + "enabled per instruction"); + } + } + /* LDVARY writes r5 two instructions later and LDUNIF writes * r5 one instruction later, which is illegal to have * together. 
@@ -128,7 +176,7 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) * * FIXME: This would not check correctly for V3D 4.2 versions lower * than V3D 4.2.14, but that is not a real issue because the simulator - * will still catch this, and we are not really targetting any such + * will still catch this, and we are not really targeting any such * versions anyway. */ if (state->c->devinfo->ver < 42) { @@ -194,8 +242,14 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) "SFU write started during THRSW delay slots "); } - if (inst->sig.ldvary) - fail_instr(state, "LDVARY during THRSW delay slots"); + if (inst->sig.ldvary) { + if (devinfo->ver == 42) + fail_instr(state, "LDVARY during THRSW delay slots"); + if (devinfo->ver >= 71 && + state->ip - state->last_thrsw_ip == 2) { + fail_instr(state, "LDVARY in 2nd THRSW delay slot"); + } + } } (void)qpu_magic_waddr_matches; /* XXX */ @@ -222,7 +276,7 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) vpm_writes + tlb_writes + tsy_writes + - inst->sig.ldtmu + + (devinfo->ver == 42 ? inst->sig.ldtmu : 0) + inst->sig.ldtlb + inst->sig.ldvpm + inst->sig.ldtlbu > 1) { @@ -262,17 +316,48 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) inst->type == V3D_QPU_INSTR_TYPE_ALU) { if ((inst->alu.add.op != V3D_QPU_A_NOP && !inst->alu.add.magic_write)) { - fail_instr(state, "RF write after THREND"); + if (devinfo->ver == 42) { + fail_instr(state, "RF write after THREND"); + } else if (devinfo->ver >= 71) { + if (state->last_thrsw_ip - state->ip == 0) { + fail_instr(state, + "ADD RF write at THREND"); + } + if (inst->alu.add.waddr == 2 || + inst->alu.add.waddr == 3) { + fail_instr(state, + "RF2-3 write after THREND"); + } + } } if ((inst->alu.mul.op != V3D_QPU_M_NOP && !inst->alu.mul.magic_write)) { - fail_instr(state, "RF write after THREND"); + if (devinfo->ver == 42) { + fail_instr(state, "RF write after THREND"); + } else if (devinfo->ver >= 71) { + if (state->last_thrsw_ip - state->ip == 0) { + fail_instr(state, + "MUL RF write at THREND"); + } + + if (inst->alu.mul.waddr == 2 || + inst->alu.mul.waddr == 3) { + fail_instr(state, + "RF2-3 write after THREND"); + } + } } if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && !inst->sig_magic) { - fail_instr(state, "RF write after THREND"); + if (devinfo->ver == 42) { + fail_instr(state, "RF write after THREND"); + } else if (devinfo->ver >= 71 && + (inst->sig_addr == 2 || + inst->sig_addr == 3)) { + fail_instr(state, "RF2-3 write after THREND"); + } } /* GFXH-1625: No TMUWT in the last instruction */ @@ -312,7 +397,7 @@ qpu_validate(struct v3d_compile *c) * keep compiling the validation code to make sure it doesn't get * broken. 
*/ -#ifndef DEBUG +#if !MESA_DEBUG return; #endif @@ -321,6 +406,7 @@ qpu_validate(struct v3d_compile *c) .last_sfu_write = -10, .last_thrsw_ip = -10, .last_branch_ip = -10, + .first_tlb_z_write = INT_MAX, .ip = 0, .last_thrsw_found = !c->last_thrsw, diff --git a/src/broadcom/compiler/v3d33_tex.c b/src/broadcom/compiler/v3d33_tex.c deleted file mode 100644 index b933635f6fe..00000000000 --- a/src/broadcom/compiler/v3d33_tex.c +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright © 2016-2018 Broadcom - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#include "v3d_compiler.h" - -/* We don't do any address packing. */ -#define __gen_user_data void -#define __gen_address_type uint32_t -#define __gen_address_offset(reloc) (*reloc) -#define __gen_emit_reloc(cl, reloc) -#include "cle/v3d_packet_v33_pack.h" - -void -v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) -{ - /* FIXME: We don't bother implementing pipelining for texture reads - * for any pre 4.x hardware. It should be straight forward to do but - * we are not really testing or even targetting this hardware at - * present. 
- */ - ntq_flush_tmu(c); - - unsigned unit = instr->texture_index; - - struct V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1 p0_unpacked = { - V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1_header, - - .fetch_sample_mode = instr->op == nir_texop_txf, - }; - - struct V3D33_TEXTURE_UNIFORM_PARAMETER_1_CFG_MODE1 p1_unpacked = { - }; - - switch (instr->sampler_dim) { - case GLSL_SAMPLER_DIM_1D: - if (instr->is_array) - p0_unpacked.lookup_type = TEXTURE_1D_ARRAY; - else - p0_unpacked.lookup_type = TEXTURE_1D; - break; - case GLSL_SAMPLER_DIM_2D: - case GLSL_SAMPLER_DIM_RECT: - if (instr->is_array) - p0_unpacked.lookup_type = TEXTURE_2D_ARRAY; - else - p0_unpacked.lookup_type = TEXTURE_2D; - break; - case GLSL_SAMPLER_DIM_3D: - p0_unpacked.lookup_type = TEXTURE_3D; - break; - case GLSL_SAMPLER_DIM_CUBE: - p0_unpacked.lookup_type = TEXTURE_CUBE_MAP; - break; - default: - unreachable("Bad sampler type"); - } - - struct qreg coords[5]; - int next_coord = 0; - for (unsigned i = 0; i < instr->num_srcs; i++) { - switch (instr->src[i].src_type) { - case nir_tex_src_coord: - for (int j = 0; j < instr->coord_components; j++) { - coords[next_coord++] = - ntq_get_src(c, instr->src[i].src, j); - } - if (instr->coord_components < 2) - coords[next_coord++] = vir_uniform_f(c, 0.5); - break; - case nir_tex_src_bias: - coords[next_coord++] = - ntq_get_src(c, instr->src[i].src, 0); - - p0_unpacked.bias_supplied = true; - break; - case nir_tex_src_lod: - coords[next_coord++] = - vir_FADD(c, - ntq_get_src(c, instr->src[i].src, 0), - vir_uniform(c, QUNIFORM_TEXTURE_FIRST_LEVEL, - unit)); - - if (instr->op != nir_texop_txf && - instr->op != nir_texop_tg4) { - p0_unpacked.disable_autolod_use_bias_only = true; - } - break; - case nir_tex_src_comparator: - coords[next_coord++] = - ntq_get_src(c, instr->src[i].src, 0); - - p0_unpacked.shadow = true; - break; - - case nir_tex_src_offset: { - p0_unpacked.texel_offset_for_s_coordinate = - nir_src_comp_as_int(instr->src[i].src, 0); - - if (instr->coord_components >= 2) - p0_unpacked.texel_offset_for_t_coordinate = - nir_src_comp_as_int(instr->src[i].src, 1); - - if (instr->coord_components >= 3) - p0_unpacked.texel_offset_for_r_coordinate = - nir_src_comp_as_int(instr->src[i].src, 2); - break; - } - - default: - unreachable("unknown texture source"); - } - } - - /* Limit the number of channels returned to both how many the NIR - * instruction writes and how many the instruction could produce. - */ - p1_unpacked.return_words_of_texture_data = - instr->dest.is_ssa ? - nir_ssa_def_components_read(&instr->dest.ssa) : - (1 << instr->dest.reg.reg->num_components) - 1; - - uint32_t p0_packed; - V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1_pack(NULL, - (uint8_t *)&p0_packed, - &p0_unpacked); - - uint32_t p1_packed; - V3D33_TEXTURE_UNIFORM_PARAMETER_1_CFG_MODE1_pack(NULL, - (uint8_t *)&p1_packed, - &p1_unpacked); - /* Load unit number into the address field, which will be be used by - * the driver to decide which texture to put in the actual address - * field. - */ - p1_packed |= unit << 5; - - /* There is no native support for GL texture rectangle coordinates, so - * we have to rescale from ([0, width], [0, height]) to ([0, 1], [0, - * 1]). 
- */ - if (instr->sampler_dim == GLSL_SAMPLER_DIM_RECT) { - coords[0] = vir_FMUL(c, coords[0], - vir_uniform(c, QUNIFORM_TEXRECT_SCALE_X, - unit)); - coords[1] = vir_FMUL(c, coords[1], - vir_uniform(c, QUNIFORM_TEXRECT_SCALE_Y, - unit)); - } - - int texture_u[] = { - vir_get_uniform_index(c, QUNIFORM_TEXTURE_CONFIG_P0_0 + unit, p0_packed), - vir_get_uniform_index(c, QUNIFORM_TEXTURE_CONFIG_P1, p1_packed), - }; - - for (int i = 0; i < next_coord; i++) { - struct qreg dst; - - if (i == next_coord - 1) - dst = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUL); - else - dst = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMU); - - struct qinst *tmu = vir_MOV_dest(c, dst, coords[i]); - - if (i < 2) - tmu->uniform = texture_u[i]; - } - - vir_emit_thrsw(c); - - for (int i = 0; i < 4; i++) { - if (p1_unpacked.return_words_of_texture_data & (1 << i)) - ntq_store_dest(c, &instr->dest, i, vir_LDTMU(c)); - } -} diff --git a/src/broadcom/compiler/v3d33_vpm_setup.c b/src/broadcom/compiler/v3d33_vpm_setup.c deleted file mode 100644 index 8bce67dfae9..00000000000 --- a/src/broadcom/compiler/v3d33_vpm_setup.c +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright © 2016-2018 Broadcom - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#include "v3d_compiler.h" - -/* We don't do any address packing. */ -#define __gen_user_data void -#define __gen_address_type uint32_t -#define __gen_address_offset(reloc) (*reloc) -#define __gen_emit_reloc(cl, reloc) -#include "broadcom/cle/v3d_packet_v33_pack.h" - -void -v3d33_vir_vpm_read_setup(struct v3d_compile *c, int num_components) -{ - struct V3D33_VPM_GENERIC_BLOCK_READ_SETUP unpacked = { - V3D33_VPM_GENERIC_BLOCK_READ_SETUP_header, - - .horiz = true, - .laned = false, - /* If the field is 0, that means a read count of 32. 
 */
-                .num = num_components & 31,
-                .segs = true,
-                .stride = 1,
-                .size = VPM_SETUP_SIZE_32_BIT,
-                .addr = c->num_inputs,
-        };
-
-        uint32_t packed;
-        V3D33_VPM_GENERIC_BLOCK_READ_SETUP_pack(NULL,
-                                                (uint8_t *)&packed,
-                                                &unpacked);
-        vir_VPMSETUP(c, vir_uniform_ui(c, packed));
-}
-
-void
-v3d33_vir_vpm_write_setup(struct v3d_compile *c)
-{
-        uint32_t packed;
-        struct V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP unpacked = {
-                V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP_header,
-
-                .horiz = true,
-                .laned = false,
-                .segs = true,
-                .stride = 1,
-                .size = VPM_SETUP_SIZE_32_BIT,
-                .addr = 0,
-        };
-
-        V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP_pack(NULL,
-                                                 (uint8_t *)&packed,
-                                                 &unpacked);
-        vir_VPMSETUP(c, vir_uniform_ui(c, packed));
-}
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index 0c1419661d3..12aaacdc14a 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -31,6 +31,7 @@
 #include <stdint.h>
 #include <string.h>
 
+#include "util/blend.h"
 #include "util/macros.h"
 #include "common/v3d_debug.h"
 #include "common/v3d_device_info.h"
@@ -40,7 +41,6 @@
 #include "util/u_math.h"
 
 #include "qpu/qpu_instr.h"
-#include "pipe/p_state.h"
 
 /**
  * Maximum number of outstanding TMU operations we can queue for execution.
@@ -87,7 +87,7 @@ enum qfile {
         /** A physical register, such as the W coordinate payload. */
         QFILE_REG,
 
-        /** One of the regsiters for fixed function interactions. */
+        /** One of the registers for fixed function interactions. */
         QFILE_MAGIC,
 
         /**
@@ -97,12 +97,6 @@ enum qfile {
         QFILE_TEMP,
 
         /**
-         * VPM reads use this with an index value to say what part of the VPM
-         * is being read.
-         */
-        QFILE_VPM,
-
-        /**
          * Stores an immediate value in the index field that will be used
          * directly by qpu_load_imm().
          */
@@ -169,6 +163,19 @@ struct qinst {
          * otherwise.
          */
         int uniform;
+
+        /* If this is a TLB Z write */
+        bool is_tlb_z_write;
+
+        /* If this is a retiring TMU instruction (the last in a lookup sequence),
+         * how many ldtmu instructions are required to read the results.
+         */
+        uint32_t ldtmu_count;
+
+        /* Position of this instruction in the program. Filled in during
+         * register allocation.
+         */
+        int32_t ip;
 };
 
 enum quniform_contents {
@@ -330,6 +337,19 @@ enum quniform_contents {
          * Current value of gl_ViewIndex for Multiview rendering.
          */
         QUNIFORM_VIEW_INDEX,
+
+        /**
+         * Inline uniform buffers
+         */
+        QUNIFORM_INLINE_UBO_0,
+        QUNIFORM_INLINE_UBO_1,
+        QUNIFORM_INLINE_UBO_2,
+        QUNIFORM_INLINE_UBO_3,
+
+        /**
+         * Current value of DrawIndex for Multidraw
+         */
+        QUNIFORM_DRAW_ID,
 };
 
 static inline uint32_t v3d_unit_data_create(uint32_t unit, uint32_t value)
@@ -369,13 +389,7 @@ static inline uint8_t v3d_slot_get_component(struct v3d_varying_slot slot)
         return slot.slot_and_component & 3;
 }
 
-enum v3d_execution_environment {
-        V3D_ENVIRONMENT_OPENGL = 0,
-        V3D_ENVIRONMENT_VULKAN,
-};
-
 struct v3d_key {
-        void *shader_state;
         struct {
                 uint8_t swizzle[4];
         } tex[V3D_MAX_TEXTURE_SAMPLERS];
@@ -388,9 +402,9 @@ struct v3d_key {
         uint8_t num_samplers_used;
         uint8_t ucp_enables;
         bool is_last_geometry_stage;
-        bool robust_buffer_access;
-
-        enum v3d_execution_environment environment;
+        bool robust_uniform_access;
+        bool robust_storage_access;
+        bool robust_image_access;
 };
 
 struct v3d_fs_key {
@@ -400,7 +414,6 @@ struct v3d_fs_key {
         bool line_smoothing;
         bool point_coord_upper_left;
         bool msaa;
-        bool sample_coverage;
         bool sample_alpha_to_coverage;
         bool sample_alpha_to_one;
         /* Mask of which color render targets are present.
*/ @@ -419,14 +432,12 @@ struct v3d_fs_key { */ struct { enum pipe_format format; - const uint8_t *swizzle; + uint8_t swizzle[4]; } color_fmt[V3D_MAX_DRAW_BUFFERS]; - uint8_t logicop_func; + enum pipe_logicop logicop_func; uint32_t point_sprite_mask; - struct pipe_rt_blend_state blend; - /* If the fragment shader reads gl_PrimitiveID then we have 2 scenarios: * * - If there is a geometry shader, then gl_PrimitiveID must be written @@ -468,7 +479,7 @@ struct v3d_vs_key { bool clamp_color; }; -/** A basic block of VIR intructions. */ +/** A basic block of VIR instructions. */ struct qblock { struct list_head link; @@ -566,6 +577,7 @@ enum v3d_compilation_result { */ struct v3d_compiler { const struct v3d_device_info *devinfo; + uint32_t max_inline_uniform_buffers; struct ra_regs *regs; struct ra_class *reg_class_any[3]; struct ra_class *reg_class_r5[3]; @@ -584,6 +596,19 @@ struct v3d_interp_input { unsigned mode; /* interpolation mode */ }; +struct v3d_ra_node_info { + struct { + uint32_t priority; + uint8_t class_bits; + bool is_program_end; + bool unused; + + /* V3D 7.x */ + bool is_ldunif_dst; + } *info; + uint32_t alloc_count; +}; + struct v3d_compile { const struct v3d_device_info *devinfo; nir_shader *s; @@ -596,7 +621,7 @@ struct v3d_compile { void *debug_output_data; /** - * Mapping from nir_register * or nir_ssa_def * to array of struct + * Mapping from nir_register * or nir_def * to array of struct * qreg for the values. */ struct hash_table *def_ht; @@ -615,11 +640,12 @@ struct v3d_compile { uint32_t output_fifo_size; struct { - nir_dest *dest; + nir_def *def; uint8_t num_components; uint8_t component_mask; } flush[MAX_TMU_QUEUE_SIZE]; uint32_t flush_count; + uint32_t total_count; } tmu; /** @@ -652,16 +678,13 @@ struct v3d_compile { bool uses_center_w; bool writes_z; + bool writes_z_from_fep; + bool reads_z; bool uses_implicit_point_line_varyings; /* True if a fragment shader reads gl_PrimitiveID */ bool fs_uses_primitive_id; - /* If the fragment shader does anything that requires to force - * per-sample MSAA, such as reading gl_SampleID. - */ - bool force_per_sample_msaa; - /* Whether we are using the fallback scheduler. This will be set after * register allocation has failed once. */ @@ -681,6 +704,11 @@ struct v3d_compile { bool disable_constant_ubo_load_sorting; bool sorted_any_ubo_loads; + /* Moves UBO/SSBO loads right before their first user (nir_opt_move). + * This can reduce register pressure. + */ + bool move_buffer_loads; + /* Emits ldunif for each new uniform, even if the uniform was already * emitted in the same block. Useful to compile shaders with high * register pressure or to disable the optimization during uniform @@ -692,6 +720,19 @@ struct v3d_compile { bool disable_loop_unrolling; bool unrolled_any_loops; + /* Disables nir_opt_gcm to reduce register pressure. */ + bool disable_gcm; + + /* If calling nir_opt_gcm made any progress. Used to skip new rebuilds + * if possible + */ + bool gcm_progress; + + /* Disables scheduling of general TMU loads (and unfiltered image load). + */ + bool disable_general_tmu_sched; + bool has_general_tmu_load; + /* Minimum number of threads we are willing to use to register allocate * a shader with the current compilation strategy. This only prevents * us from lowering the thread count to register allocate successfully, @@ -705,7 +746,9 @@ struct v3d_compile { * strategies that can reduce register pressure and hopefully reduce or * eliminate TMU spills in the shader. 
*/ - bool tmu_spilling_allowed; + uint32_t max_tmu_spills; + + uint32_t compile_strategy_idx; /* The UBO index and block used with the last unifa load, as well as the * current unifa offset *after* emitting that load. This is used to skip @@ -715,6 +758,7 @@ struct v3d_compile { struct qblock *current_unifa_block; int32_t current_unifa_index; uint32_t current_unifa_offset; + bool current_unifa_is_ubo; /* State for whether we're executing on each channel currently. 0 if * yes, otherwise a block number + 1 that the channel jumped to. @@ -749,6 +793,11 @@ struct v3d_compile { struct qreg cs_shared_offset; int local_invocation_index_bits; + /* Starting value of the sample mask in a fragment shader. We use + * this to identify lanes that have been terminated/discarded. + */ + struct qreg start_msf; + /* If the shader uses subgroup functionality */ bool has_subgroups; @@ -761,14 +810,27 @@ struct v3d_compile { uint32_t spill_size; /* Shader-db stats */ uint32_t spills, fills, loops; + + /* Whether we are in the process of spilling registers for + * register allocation + */ + bool spilling; + /** * Register spilling's per-thread base address, shared between each - * spill/fill's addressing calculations. + * spill/fill's addressing calculations (also used for scratch + * access). */ struct qreg spill_base; + /* Bit vector of which temps may be spilled */ BITSET_WORD *spillable; + /* Used during register allocation */ + int thread_index; + struct v3d_ra_node_info nodes; + struct ra_graph *g; + /** * Array of the VARYING_SLOT_* of all FS QFILE_VARY reads. * @@ -799,11 +861,16 @@ struct v3d_compile { uint32_t uniform_array_size; uint32_t num_uniforms; uint32_t output_position_index; - nir_variable *output_color_var[4]; + nir_variable *output_color_var[V3D_MAX_DRAW_BUFFERS]; uint32_t output_sample_mask_index; struct qreg undef; uint32_t num_temps; + /* Number of temps in the program right before we spill a new temp. We + * use this to know which temps existed before a spill and which were + * added with the spill itself. + */ + uint32_t spill_start_num_temps; struct vir_cursor cursor; struct list_head blocks; @@ -848,12 +915,16 @@ struct v3d_compile { bool emitted_tlb_load; bool lock_scoreboard_on_first_thrsw; - /* Total number of spilled registers in the program */ - uint32_t spill_count; - enum v3d_compilation_result compilation_result; bool tmu_dirty_rcl; + bool has_global_address; + + /* If we have processed a discard/terminate instruction. This may + * cause some lanes to be inactive even during uniform control + * flow. + */ + bool emitted_discard; }; struct v3d_uniform_list { @@ -866,6 +937,13 @@ struct v3d_prog_data { struct v3d_uniform_list uniforms; uint32_t spill_size; + uint32_t tmu_spills; + uint32_t tmu_fills; + uint32_t tmu_count; + + uint32_t qpu_read_stalls; + + uint8_t compile_strategy_idx; uint8_t threads; @@ -877,6 +955,8 @@ struct v3d_prog_data { bool tmu_dirty_rcl; bool has_control_barrier; + + bool has_global_address; }; struct v3d_vs_prog_data { @@ -964,10 +1044,15 @@ struct v3d_fs_prog_data { uint8_t num_inputs; bool writes_z; + bool writes_z_from_fep; bool disable_ez; bool uses_center_w; bool uses_implicit_point_line_varyings; bool lock_scoreboard_on_first_thrsw; + + /* If the fragment shader does anything that requires to force + * per-sample MSAA, such as reading gl_SampleID. 
+ */ bool force_per_sample_msaa; }; @@ -998,6 +1083,10 @@ v3d_compute_vpm_config(struct v3d_device_info *devinfo, struct v3d_gs_prog_data *gs, struct vpm_config *vpm_cfg_bin, struct vpm_config *vpm_cfg); +void +v3d_pack_unnormalized_coordinates(struct v3d_device_info *devinfo, + uint32_t *p1_packed, + bool unnormalized_coordinates); static inline bool vir_has_uniform(struct qinst *inst) @@ -1005,7 +1094,8 @@ vir_has_uniform(struct qinst *inst) return inst->uniform != ~0; } -const struct v3d_compiler *v3d_compiler_init(const struct v3d_device_info *devinfo); +const struct v3d_compiler *v3d_compiler_init(const struct v3d_device_info *devinfo, + uint32_t max_inline_uniform_buffers); void v3d_compiler_free(const struct v3d_compiler *compiler); void v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s); @@ -1066,15 +1156,14 @@ bool vir_is_raw_mov(struct qinst *inst); bool vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst); bool vir_is_add(struct qinst *inst); bool vir_is_mul(struct qinst *inst); -bool vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst); -bool vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst); +bool vir_writes_r4_implicitly(const struct v3d_device_info *devinfo, struct qinst *inst); struct qreg vir_follow_movs(struct v3d_compile *c, struct qreg reg); uint8_t vir_channels_written(struct qinst *inst); struct qreg ntq_get_src(struct v3d_compile *c, nir_src src, int i); -void ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan, - struct qreg result); +void ntq_store_def(struct v3d_compile *c, nir_def *def, int chan, + struct qreg result); bool ntq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components); -void ntq_add_pending_tmu_flush(struct v3d_compile *c, nir_dest *dest, +void ntq_add_pending_tmu_flush(struct v3d_compile *c, nir_def *def, uint32_t component_mask); void ntq_flush_tmu(struct v3d_compile *c); void vir_emit_thrsw(struct v3d_compile *c); @@ -1095,32 +1184,27 @@ bool vir_opt_redundant_flags(struct v3d_compile *c); bool vir_opt_small_immediates(struct v3d_compile *c); bool vir_opt_vpm(struct v3d_compile *c); bool vir_opt_constant_alu(struct v3d_compile *c); -void v3d_nir_lower_blend(nir_shader *s, struct v3d_compile *c); -void v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c); -void v3d_nir_lower_line_smooth(nir_shader *shader); -void v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c); -void v3d_nir_lower_robust_buffer_access(nir_shader *shader, struct v3d_compile *c); -void v3d_nir_lower_scratch(nir_shader *s); -void v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c); -void v3d_nir_lower_image_load_store(nir_shader *s); -void vir_lower_uniforms(struct v3d_compile *c); - -void v3d33_vir_vpm_read_setup(struct v3d_compile *c, int num_components); -void v3d33_vir_vpm_write_setup(struct v3d_compile *c); -void v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr); -void v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr); -void v3d40_vir_emit_image_load_store(struct v3d_compile *c, - nir_intrinsic_instr *instr); +bool v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c); +bool v3d_nir_lower_line_smooth(nir_shader *shader); +bool v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c); +bool v3d_nir_lower_scratch(nir_shader *s); +bool v3d_nir_lower_txf_ms(nir_shader *s); +bool v3d_nir_lower_image_load_store(nir_shader *s, struct v3d_compile *c); +bool v3d_nir_lower_load_store_bitsize(nir_shader *s); + +void v3d_vir_emit_tex(struct 
v3d_compile *c, nir_tex_instr *instr); +void v3d_vir_emit_image_load_store(struct v3d_compile *c, + nir_intrinsic_instr *instr); void v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers); uint32_t v3d_qpu_schedule_instructions(struct v3d_compile *c); void qpu_validate(struct v3d_compile *c); -struct qpu_reg *v3d_register_allocate(struct v3d_compile *c, bool *spilled); +struct qpu_reg *v3d_register_allocate(struct v3d_compile *c); bool vir_init_reg_sets(struct v3d_compiler *compiler); int v3d_shaderdb_dump(struct v3d_compile *c, char **shaderdb_str); -bool v3d_gl_format_is_return_32(GLenum format); +bool v3d_gl_format_is_return_32(enum pipe_format format); uint32_t v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src); @@ -1220,28 +1304,35 @@ vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b) \ #define VIR_SFU(name) \ static inline struct qreg \ vir_##name(struct v3d_compile *c, struct qreg a) \ -{ \ - if (c->devinfo->ver >= 41) { \ - return vir_emit_def(c, vir_add_inst(V3D_QPU_A_##name, \ - c->undef, \ - a, c->undef)); \ - } else { \ - vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_##name), a); \ - return vir_FMOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); \ - } \ +{ \ + return vir_emit_def(c, vir_add_inst(V3D_QPU_A_##name, \ + c->undef, \ + a, c->undef)); \ } \ static inline struct qinst * \ vir_##name##_dest(struct v3d_compile *c, struct qreg dest, \ struct qreg a) \ { \ - if (c->devinfo->ver >= 41) { \ - return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_##name, \ - dest, \ - a, c->undef)); \ - } else { \ - vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_##name), a); \ - return vir_FMOV_dest(c, dest, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); \ - } \ + return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_##name, \ + dest, \ + a, c->undef)); \ +} + +#define VIR_SFU2(name) \ +static inline struct qreg \ +vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b) \ +{ \ + return vir_emit_def(c, vir_add_inst(V3D_QPU_A_##name, \ + c->undef, \ + a, b)); \ +} \ +static inline struct qinst * \ +vir_##name##_dest(struct v3d_compile *c, struct qreg dest, \ + struct qreg a, struct qreg b) \ +{ \ + return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_##name, \ + dest, \ + a, b)); \ } #define VIR_A_ALU2(name) VIR_ALU2(name, vir_add_inst, V3D_QPU_A_##name) @@ -1343,6 +1434,28 @@ VIR_SFU(LOG) VIR_SFU(SIN) VIR_SFU(RSQRT2) +VIR_SFU(BALLOT) +VIR_SFU(BCASTF) +VIR_SFU(ALLEQ) +VIR_SFU(ALLFEQ) +VIR_SFU2(ROTQ) +VIR_SFU2(ROT) +VIR_SFU2(SHUFFLE) + +VIR_A_ALU2(VPACK) +VIR_A_ALU2(V8PACK) +VIR_A_ALU2(V10PACK) +VIR_A_ALU2(V11FPACK) + +VIR_M_ALU1(FTOUNORM16) +VIR_M_ALU1(FTOSNORM16) + +VIR_M_ALU1(VFTOUNORM8) +VIR_M_ALU1(VFTOSNORM8) + +VIR_M_ALU1(VFTOUNORM10LO) +VIR_M_ALU1(VFTOUNORM10HI) + static inline struct qinst * vir_MOV_cond(struct v3d_compile *c, enum v3d_qpu_cond cond, struct qreg dest, struct qreg src) @@ -1372,16 +1485,11 @@ vir_NOP(struct v3d_compile *c) static inline struct qreg vir_LDTMU(struct v3d_compile *c) { - if (c->devinfo->ver >= 41) { - struct qinst *ldtmu = vir_add_inst(V3D_QPU_A_NOP, c->undef, - c->undef, c->undef); - ldtmu->qpu.sig.ldtmu = true; - - return vir_emit_def(c, ldtmu); - } else { - vir_NOP(c)->qpu.sig.ldtmu = true; - return vir_MOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); - } + struct qinst *ldtmu = vir_add_inst(V3D_QPU_A_NOP, c->undef, + c->undef, c->undef); + ldtmu->qpu.sig.ldtmu = true; + + return vir_emit_def(c, ldtmu); } static inline struct qreg @@ -1394,7 +1502,6 @@ vir_UMUL(struct v3d_compile *c, struct qreg src0, struct qreg 
 static inline struct qreg
@@ -1394,7 +1502,6 @@ vir_UMUL(struct v3d_compile *c, struct qreg src0, struct qreg src1)
 static inline struct qreg
 vir_TLBU_COLOR_READ(struct v3d_compile *c, uint32_t config)
 {
-        assert(c->devinfo->ver >= 41); /* XXX */
         assert((config & 0xffffff00) == 0xffffff00);
 
         struct qinst *ldtlb = vir_add_inst(V3D_QPU_A_NOP, c->undef,
@@ -1407,38 +1514,12 @@ vir_TLBU_COLOR_READ(struct v3d_compile *c, uint32_t config)
 static inline struct qreg
 vir_TLB_COLOR_READ(struct v3d_compile *c)
 {
-        assert(c->devinfo->ver >= 41); /* XXX */
-
         struct qinst *ldtlb = vir_add_inst(V3D_QPU_A_NOP, c->undef,
                                            c->undef, c->undef);
         ldtlb->qpu.sig.ldtlb = true;
         return vir_emit_def(c, ldtlb);
 }
 
-/*
-static inline struct qreg
-vir_LOAD_IMM(struct v3d_compile *c, uint32_t val)
-{
-        return vir_emit_def(c, vir_inst(QOP_LOAD_IMM, c->undef,
-                                        vir_reg(QFILE_LOAD_IMM, val), c->undef));
-}
-
-static inline struct qreg
-vir_LOAD_IMM_U2(struct v3d_compile *c, uint32_t val)
-{
-        return vir_emit_def(c, vir_inst(QOP_LOAD_IMM_U2, c->undef,
-                                        vir_reg(QFILE_LOAD_IMM, val),
-                                        c->undef));
-}
-
-static inline struct qreg
-vir_LOAD_IMM_I2(struct v3d_compile *c, uint32_t val)
-{
-        return vir_emit_def(c, vir_inst(QOP_LOAD_IMM_I2, c->undef,
-                                        vir_reg(QFILE_LOAD_IMM, val),
-                                        c->undef));
-}
-*/
-
 static inline struct qinst *
 vir_BRANCH(struct v3d_compile *c, enum v3d_qpu_branch_cond cond)
 {
diff --git a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
index 2706432d5ef..9a651bfc6a7 100644
--- a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
+++ b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
@@ -40,9 +40,20 @@
  * calculations and load/store using the TMU general memory access path.
  */
 
+static const unsigned bits_8[4] = {8, 8, 8, 8};
+static const unsigned bits_16[4] = {16, 16, 16, 16};
+static const unsigned bits_1010102[4] = {10, 10, 10, 2};
+
 bool
 v3d_gl_format_is_return_32(enum pipe_format format)
 {
+        /* We can get a NONE format in Vulkan because we support the
+         * shaderStorageImageReadWithoutFormat feature. We consider these to
+         * always use 32-bit precision.
+         */
+        if (format == PIPE_FORMAT_NONE)
+                return true;
+
         const struct util_format_description *desc =
                 util_format_description(format);
         const struct util_format_channel_description *chan = &desc->channel[0];
@@ -52,15 +63,17 @@ v3d_gl_format_is_return_32(enum pipe_format format)
 
 /* Packs a 32-bit vector of colors in the range [0, (1 << bits[i]) - 1] to a
  * 32-bit SSA value, with as many channels as necessary to store all the bits
+ *
+ * This is the generic helper, using only common nir operations.
  */
-static nir_ssa_def *
-pack_bits(nir_builder *b, nir_ssa_def *color, const unsigned *bits,
+static nir_def *
+pack_bits(nir_builder *b, nir_def *color, const unsigned *bits,
           int num_components, bool mask)
 {
-        nir_ssa_def *results[4];
+        nir_def *results[4];
         int offset = 0;
         for (int i = 0; i < num_components; i++) {
-                nir_ssa_def *chan = nir_channel(b, color, i);
+                nir_def *chan = nir_channel(b, color, i);
 
                 /* Channels being stored shouldn't cross a 32-bit boundary.
                  */
                 assert((offset & ~31) == ((offset + bits[i] - 1) & ~31));
@@ -84,10 +97,187 @@ pack_bits(nir_builder *b, nir_ssa_def *color, const unsigned *bits,
         return nir_vec(b, results, DIV_ROUND_UP(offset, 32));
 }
 
-static void
-v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
+/* Utility wrapper as half_2x16_split is mapped to vfpack, and sometimes it
+ * is just easier to read vfpack in the code, especially when using the PRM
+ * as a reference.
+ */
+static inline nir_def *
+nir_vfpack(nir_builder *b, nir_def *p1, nir_def *p2)
+{
+        return nir_pack_half_2x16_split(b, p1, p2);
+}
+
+static inline nir_def *
+pack_11f11f10f(nir_builder *b, nir_def *color)
+{
+        nir_def *p1 = nir_vfpack(b, nir_channel(b, color, 0),
+                                 nir_channel(b, color, 1));
+        nir_def *undef = nir_undef(b, 1, color->bit_size);
+        nir_def *p2 = nir_vfpack(b, nir_channel(b, color, 2), undef);
+
+        return nir_pack_32_to_r11g11b10_v3d(b, p1, p2);
+}
+
+static inline nir_def *
+pack_r10g10b10a2_uint(nir_builder *b, nir_def *color)
+{
+        nir_def *p1 = nir_pack_2x32_to_2x16_v3d(b, nir_channel(b, color, 0),
+                                                nir_channel(b, color, 1));
+        nir_def *p2 = nir_pack_2x32_to_2x16_v3d(b, nir_channel(b, color, 2),
+                                                nir_channel(b, color, 3));
+
+        return nir_pack_uint_32_to_r10g10b10a2_v3d(b, p1, p2);
+}
+
+static inline nir_def *
+pack_r10g10b10a2_unorm(nir_builder *b, nir_def *color)
+{
+        nir_def *p1 = nir_vfpack(b, nir_channel(b, color, 0),
+                                 nir_channel(b, color, 1));
+        p1 = nir_pack_2x16_to_unorm_2x10_v3d(b, p1);
+
+        nir_def *p2 = nir_vfpack(b, nir_channel(b, color, 2),
+                                 nir_channel(b, color, 3));
+        p2 = nir_pack_2x16_to_unorm_10_2_v3d(b, p2);
+
+        return nir_pack_uint_32_to_r10g10b10a2_v3d(b, p1, p2);
+}
+
+enum hw_conversion {
+        NONE,
+        TO_SNORM,
+        TO_UNORM
+};
+
+static inline nir_def *
+pack_8bit(nir_builder *b, nir_def *color,
+          unsigned num_components,
+          enum hw_conversion conversion)
+{
+        /* Note that usually you should not use this method (which relies on
+         * custom packing) for 1 component if we are not doing any
+         * conversion. But we also support that case, and let the caller
+         * decide which method to use.
+         */
+        nir_def *p1;
+        nir_def *p2;
+
+        if (conversion == NONE) {
+                p1 = nir_pack_2x32_to_2x16_v3d(b, nir_channel(b, color, 0),
+                                               nir_channel(b, color, num_components == 1 ? 0 : 1));
+        } else {
+                p1 = nir_vfpack(b, nir_channel(b, color, 0),
+                                nir_channel(b, color, num_components == 1 ? 0 : 1));
+                p1 = (conversion == TO_UNORM) ?
+                        nir_pack_2x16_to_unorm_2x8_v3d(b, p1) :
+                        nir_pack_2x16_to_snorm_2x8_v3d(b, p1);
+        }
+        if (num_components == 4) {
+                if (conversion == NONE) {
+                        p2 = nir_pack_2x32_to_2x16_v3d(b, nir_channel(b, color, 2),
+                                                       nir_channel(b, color, 3));
+                } else {
+                        p2 = nir_vfpack(b, nir_channel(b, color, 2),
+                                        nir_channel(b, color, 3));
+                        p2 = (conversion == TO_UNORM) ?
+                                nir_pack_2x16_to_unorm_2x8_v3d(b, p2) :
+                                nir_pack_2x16_to_snorm_2x8_v3d(b, p2);
+                }
+        } else {
+                /* Using an undef here would be more correct. But for this
+                 * case we are getting worse shader-db values with some CTS
+                 * tests, so we just reuse the first packing.
+                 */
+                p2 = p1;
+        }
+
+        return nir_pack_4x16_to_4x8_v3d(b, p1, p2);
+}
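As a plain-C sketch of the word layout these helpers produce for the common RGBA8 case (bits_8 = {8, 8, 8, 8}): the generic pack_bits() path and the custom pack_8bit() path both end up with each 8-bit channel at bit offset 8 * i of a single 32-bit word. The function below is illustrative only and is not part of the patch:

#include <stdint.h>

static uint32_t
pack_rgba8_example(uint8_t r, uint8_t g, uint8_t b, uint8_t a)
{
        /* Channel i lands at bit offset 8 * i, exactly the layout that
         * pack_bits(b, color, bits_8, 4, false) computes with shifts
         * and ORs. */
        return (uint32_t)r |
               ((uint32_t)g << 8) |
               ((uint32_t)b << 16) |
               ((uint32_t)a << 24);
}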
+
+static inline nir_def *
+pack_16bit(nir_builder *b, nir_def *color,
+           unsigned num_components,
+           enum hw_conversion conversion)
+{
+        nir_def *results[2] = {0};
+        nir_def *channels[4] = {0};
+
+        for (unsigned i = 0; i < num_components; i++) {
+                channels[i] = nir_channel(b, color, i);
+                switch (conversion) {
+                case TO_SNORM:
+                        channels[i] = nir_f2snorm_16_v3d(b, channels[i]);
+                        break;
+                case TO_UNORM:
+                        channels[i] = nir_f2unorm_16_v3d(b, channels[i]);
+                        break;
+                default:
+                        /* Note that usually you should not use this method
+                         * (which relies on custom packing) if we are not
+                         * doing any conversion. But we also support that
+                         * case, and let the caller decide which method to
+                         * use.
+                         */
+                        break;
+                }
+        }
+
+        switch (num_components) {
+        case 1:
+                results[0] = channels[0];
+                break;
+        case 4:
+                results[1] = nir_pack_2x32_to_2x16_v3d(b, channels[2], channels[3]);
+                FALLTHROUGH;
+        case 2:
+                results[0] = nir_pack_2x32_to_2x16_v3d(b, channels[0], channels[1]);
+                break;
+        default:
+                unreachable("Invalid number of components");
+        }
+
+        return nir_vec(b, results, DIV_ROUND_UP(num_components, 2));
+}
+
+static inline nir_def *
+pack_xbit(nir_builder *b, nir_def *color,
+          unsigned num_components,
+          const struct util_format_channel_description *r_chan)
+{
+        bool pack_mask = (r_chan->type == UTIL_FORMAT_TYPE_SIGNED);
+        enum hw_conversion conversion = NONE;
+        if (r_chan->normalized) {
+                conversion =
+                        (r_chan->type == UTIL_FORMAT_TYPE_UNSIGNED) ? TO_UNORM : TO_SNORM;
+        }
+
+        switch (r_chan->size) {
+        case 8:
+                if (conversion == NONE && num_components < 2)
+                        return pack_bits(b, color, bits_8, num_components, pack_mask);
+                else
+                        return pack_8bit(b, color, num_components, conversion);
+                break;
+        case 16:
+                /* pack_mask implies that the generic packing method would
+                 * need to include extra operations to handle negative values,
+                 * so in that case, even without a conversion, it is better to
+                 * use the packing based on custom hw operations.
+ */ + if (conversion == NONE && !pack_mask) + return pack_bits(b, color, bits_16, num_components, pack_mask); + else + return pack_16bit(b, color, num_components, conversion); + break; + default: + unreachable("unrecognized bits"); + } +} + +static bool +v3d_nir_lower_image_store_v42(nir_builder *b, nir_intrinsic_instr *instr) { enum pipe_format format = nir_intrinsic_format(instr); + assert(format != PIPE_FORMAT_NONE); const struct util_format_description *desc = util_format_description(format); const struct util_format_channel_description *r_chan = &desc->channel[0]; @@ -95,10 +285,10 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr) b->cursor = nir_before_instr(&instr->instr); - nir_ssa_def *color = nir_channels(b, - nir_ssa_for_src(b, instr->src[3], 4), - (1 << num_components) - 1); - nir_ssa_def *formatted = NULL; + nir_def *color = nir_trim_vector(b, + instr->src[3].ssa, + num_components); + nir_def *formatted = NULL; if (format == PIPE_FORMAT_R11G11B10_FLOAT) { formatted = nir_format_pack_11f11f10f(b, color); @@ -110,9 +300,6 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr) */ formatted = color; } else { - static const unsigned bits_8[4] = {8, 8, 8, 8}; - static const unsigned bits_16[4] = {16, 16, 16, 16}; - static const unsigned bits_1010102[4] = {10, 10, 10, 2}; const unsigned *bits; switch (r_chan->size) { @@ -132,11 +319,13 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr) bool pack_mask = false; if (r_chan->pure_integer && r_chan->type == UTIL_FORMAT_TYPE_SIGNED) { - formatted = nir_format_clamp_sint(b, color, bits); + /* We don't need to do any conversion or clamping in this case */ + formatted = color; pack_mask = true; } else if (r_chan->pure_integer && r_chan->type == UTIL_FORMAT_TYPE_UNSIGNED) { - formatted = nir_format_clamp_uint(b, color, bits); + /* We don't need to do any conversion or clamping in this case */ + formatted = color; } else if (r_chan->normalized && r_chan->type == UTIL_FORMAT_TYPE_SIGNED) { formatted = nir_format_float_to_snorm(b, color, bits); @@ -154,75 +343,116 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr) pack_mask); } - nir_instr_rewrite_src(&instr->instr, &instr->src[3], - nir_src_for_ssa(formatted)); + nir_src_rewrite(&instr->src[3], formatted); instr->num_components = formatted->num_components; + + return true; } -static void + +static bool +v3d_nir_lower_image_store_v71(nir_builder *b, nir_intrinsic_instr *instr) +{ + enum pipe_format format = nir_intrinsic_format(instr); + assert(format != PIPE_FORMAT_NONE); + const struct util_format_description *desc = + util_format_description(format); + const struct util_format_channel_description *r_chan = &desc->channel[0]; + unsigned num_components = util_format_get_nr_components(format); + b->cursor = nir_before_instr(&instr->instr); + + nir_def *color = + nir_trim_vector(b, instr->src[3].ssa, num_components); + nir_def *formatted = NULL; + if (format == PIPE_FORMAT_R9G9B9E5_FLOAT) { + formatted = nir_format_pack_r9g9b9e5(b, color); + } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) { + formatted = pack_11f11f10f(b, color); + } else if (format == PIPE_FORMAT_R10G10B10A2_UINT) { + formatted = pack_r10g10b10a2_uint(b, color); + } else if (format == PIPE_FORMAT_R10G10B10A2_UNORM) { + formatted = pack_r10g10b10a2_unorm(b, color); + } else if (r_chan->size == 32) { + /* For 32-bit formats, we just have to move the vector + * across (possibly reducing the number of channels). 
+ */ + formatted = color; + } else if (r_chan->type == UTIL_FORMAT_TYPE_FLOAT) { + assert(r_chan->size == 16); + formatted = nir_format_float_to_half(b, color); + formatted = pack_bits(b, formatted, bits_16, num_components, + false); + } else { + assert(r_chan->size == 8 || r_chan->size == 16); + formatted = pack_xbit(b, color, num_components, r_chan); + } + + nir_src_rewrite(&instr->src[3], formatted); + instr->num_components = formatted->num_components; + + return true; +} + +static bool v3d_nir_lower_image_load(nir_builder *b, nir_intrinsic_instr *instr) { static const unsigned bits16[] = {16, 16, 16, 16}; enum pipe_format format = nir_intrinsic_format(instr); if (v3d_gl_format_is_return_32(format)) - return; + return false; b->cursor = nir_after_instr(&instr->instr); - assert(instr->dest.is_ssa); - nir_ssa_def *result = &instr->dest.ssa; + nir_def *result = &instr->def; if (util_format_is_pure_uint(format)) { result = nir_format_unpack_uint(b, result, bits16, 4); } else if (util_format_is_pure_sint(format)) { result = nir_format_unpack_sint(b, result, bits16, 4); } else { - nir_ssa_def *rg = nir_channel(b, result, 0); - nir_ssa_def *ba = nir_channel(b, result, 1); - result = nir_vec4(b, - nir_unpack_half_2x16_split_x(b, rg), - nir_unpack_half_2x16_split_y(b, rg), - nir_unpack_half_2x16_split_x(b, ba), - nir_unpack_half_2x16_split_y(b, ba)); + nir_def *rg = nir_channel(b, result, 0); + nir_def *ba = nir_channel(b, result, 1); + result = nir_vec4(b, + nir_unpack_half_2x16_split_x(b, rg), + nir_unpack_half_2x16_split_y(b, rg), + nir_unpack_half_2x16_split_x(b, ba), + nir_unpack_half_2x16_split_y(b, ba)); } - nir_ssa_def_rewrite_uses_after(&instr->dest.ssa, result, + nir_def_rewrite_uses_after(&instr->def, result, result->parent_instr); + + return true; } -void -v3d_nir_lower_image_load_store(nir_shader *s) +static bool +v3d_nir_lower_image_load_store_cb(nir_builder *b, + nir_intrinsic_instr *intr, + void *_state) { - nir_foreach_function(function, s) { - if (!function->impl) - continue; - - nir_builder b; - nir_builder_init(&b, function->impl); - - nir_foreach_block(block, function->impl) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intr = - nir_instr_as_intrinsic(instr); - - switch (intr->intrinsic) { - case nir_intrinsic_image_load: - v3d_nir_lower_image_load(&b, intr); - break; - case nir_intrinsic_image_store: - v3d_nir_lower_image_store(&b, intr); - break; - default: - break; - } - } - } + struct v3d_compile *c = (struct v3d_compile *) _state; - nir_metadata_preserve(function->impl, - nir_metadata_block_index | - nir_metadata_dominance); + switch (intr->intrinsic) { + case nir_intrinsic_image_load: + return v3d_nir_lower_image_load(b, intr); + case nir_intrinsic_image_store: + if (c->devinfo->ver >= 71) + return v3d_nir_lower_image_store_v71(b, intr); + else + return v3d_nir_lower_image_store_v42(b, intr); + break; + default: + return false; } + + return false; +} + +bool +v3d_nir_lower_image_load_store(nir_shader *s, struct v3d_compile *c) +{ + return nir_shader_intrinsics_pass(s, + v3d_nir_lower_image_load_store_cb, + nir_metadata_block_index | + nir_metadata_dominance, c); } diff --git a/src/broadcom/compiler/v3d_nir_lower_io.c b/src/broadcom/compiler/v3d_nir_lower_io.c index 895b1a39163..55e2e4f2e11 100644 --- a/src/broadcom/compiler/v3d_nir_lower_io.c +++ b/src/broadcom/compiler/v3d_nir_lower_io.c @@ -24,8 +24,6 @@ #include "compiler/v3d_compiler.h" #include "compiler/nir/nir_builder.h" -#include 
"util/u_helpers.h" - /** * Walks the NIR generated by TGSI-to-NIR or GLSL-to-NIR to lower its io * intrinsics into something amenable to the V3D architecture. @@ -64,7 +62,7 @@ struct v3d_nir_lower_io_state { BITSET_WORD varyings_stored[BITSET_WORDS(V3D_MAX_ANY_STAGE_INPUTS)]; - nir_ssa_def *pos[4]; + nir_def *pos[4]; }; static void @@ -72,8 +70,8 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b, struct v3d_nir_lower_io_state *state); static void -v3d_nir_store_output(nir_builder *b, int base, nir_ssa_def *offset, - nir_ssa_def *chan) +v3d_nir_store_output(nir_builder *b, int base, nir_def *offset, + nir_def *chan) { if (offset) { /* When generating the VIR instruction, the base and the offset @@ -90,29 +88,6 @@ v3d_nir_store_output(nir_builder *b, int base, nir_ssa_def *offset, nir_store_output(b, chan, offset, .base = base, .write_mask = 0x1, .component = 0); } -/* Convert the uniform offset to bytes. If it happens to be a constant, - * constant-folding will clean up the shift for us. - */ -static void -v3d_nir_lower_uniform(struct v3d_compile *c, nir_builder *b, - nir_intrinsic_instr *intr) -{ - /* On SPIR-V/Vulkan we are already getting our offsets in - * bytes. - */ - if (c->key->environment == V3D_ENVIRONMENT_VULKAN) - return; - - b->cursor = nir_before_instr(&intr->instr); - - nir_intrinsic_set_base(intr, nir_intrinsic_base(intr) * 16); - - nir_instr_rewrite_src(&intr->instr, - &intr->src[0], - nir_src_for_ssa(nir_ishl(b, intr->src[0].ssa, - nir_imm_int(b, 4)))); -} - static int v3d_varying_slot_vpm_offset(struct v3d_compile *c, unsigned location, unsigned component) { @@ -159,14 +134,13 @@ v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b, /* If this is a geometry shader we need to emit our outputs * to the current vertex offset in the VPM. */ - nir_ssa_def *offset_reg = + nir_def *offset_reg = c->s->info.stage == MESA_SHADER_GEOMETRY ? nir_load_var(b, state->gs.output_offset_var) : NULL; int start_comp = nir_intrinsic_component(intr); unsigned location = nir_intrinsic_io_semantics(intr).location; - nir_ssa_def *src = nir_ssa_for_src(b, intr->src[0], - intr->num_components); + nir_def *src = intr->src[0].ssa; /* Save off the components of the position for the setup of VPM inputs * read by fixed function HW. */ @@ -184,8 +158,8 @@ v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b, if (location == VARYING_SLOT_LAYER) { assert(c->s->info.stage == MESA_SHADER_GEOMETRY); - nir_ssa_def *header = nir_load_var(b, state->gs.header_var); - header = nir_iand(b, header, nir_imm_int(b, 0xff00ffff)); + nir_def *header = nir_load_var(b, state->gs.header_var); + header = nir_iand_imm(b, header, 0xff00ffff); /* From the GLES 3.2 spec: * @@ -205,24 +179,26 @@ v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b, * to 0 in that case (we always allocate tile state for at * least one layer). */ - nir_ssa_def *fb_layers = nir_load_fb_layers_v3d(b, 32); - nir_ssa_def *cond = nir_ige(b, src, fb_layers); - nir_ssa_def *layer_id = + nir_def *fb_layers = nir_load_fb_layers_v3d(b, 32); + nir_def *cond = nir_ige(b, src, fb_layers); + nir_def *layer_id = nir_bcsel(b, cond, nir_imm_int(b, 0), - nir_ishl(b, src, nir_imm_int(b, 16))); + nir_ishl_imm(b, src, 16)); header = nir_ior(b, header, layer_id); nir_store_var(b, state->gs.header_var, header, 0x1); } /* Scalarize outputs if it hasn't happened already, since we want to - * schedule each VPM write individually. We can skip any outut + * schedule each VPM write individually. 
We can skip any output * components not read by the FS. */ for (int i = 0; i < intr->num_components; i++) { int vpm_offset = v3d_varying_slot_vpm_offset(c, location, start_comp + i); + if (!(nir_intrinsic_write_mask(intr) & (1 << i))) + continue; if (vpm_offset == -1) continue; @@ -261,9 +237,9 @@ v3d_nir_lower_emit_vertex(struct v3d_compile *c, nir_builder *b, { b->cursor = nir_before_instr(&instr->instr); - nir_ssa_def *header = nir_load_var(b, state->gs.header_var); - nir_ssa_def *header_offset = nir_load_var(b, state->gs.header_offset_var); - nir_ssa_def *output_offset = nir_load_var(b, state->gs.output_offset_var); + nir_def *header = nir_load_var(b, state->gs.header_var); + nir_def *header_offset = nir_load_var(b, state->gs.header_offset_var); + nir_def *output_offset = nir_load_var(b, state->gs.output_offset_var); /* Emit fixed function outputs */ v3d_nir_emit_ff_vpm_outputs(c, b, state); @@ -273,13 +249,13 @@ v3d_nir_lower_emit_vertex(struct v3d_compile *c, nir_builder *b, /* Update VPM offset for next vertex output data and header */ output_offset = - nir_iadd(b, output_offset, - nir_imm_int(b, state->gs.output_vertex_data_size)); + nir_iadd_imm(b, output_offset, + state->gs.output_vertex_data_size); - header_offset = nir_iadd(b, header_offset, nir_imm_int(b, 1)); + header_offset = nir_iadd_imm(b, header_offset, 1); /* Reset the New Primitive bit */ - header = nir_iand(b, header, nir_imm_int(b, 0xfffffffe)); + header = nir_iand_imm(b, header, 0xfffffffe); nir_store_var(b, state->gs.output_offset_var, output_offset, 0x1); nir_store_var(b, state->gs.header_offset_var, header_offset, 0x1); @@ -304,7 +280,7 @@ v3d_nir_lower_end_primitive(struct v3d_compile *c, nir_builder *b, * doesn't provide means to do that, so we need to apply the swizzle in the * vertex shader. * - * This is required at least in Vulkan to support madatory vertex attribute + * This is required at least in Vulkan to support mandatory vertex attribute * format VK_FORMAT_B8G8R8A8_UNORM. */ static void @@ -327,59 +303,6 @@ v3d_nir_lower_vertex_input(struct v3d_compile *c, nir_builder *b, nir_intrinsic_set_component(instr, (comp + 2) % 4); } -/* Sometimes the origin of gl_PointCoord is in the upper left rather than the - * lower left so we need to flip it. - * - * This is needed for Vulkan, Gallium uses lower_wpos_pntc. 
- */ -static void -v3d_nir_lower_fragment_input(struct v3d_compile *c, nir_builder *b, - nir_intrinsic_instr *intr) -{ - assert(c->s->info.stage == MESA_SHADER_FRAGMENT); - - /* Gallium uses lower_wpos_pntc */ - if (c->key->environment == V3D_ENVIRONMENT_OPENGL) - return; - - b->cursor = nir_after_instr(&intr->instr); - - int comp = nir_intrinsic_component(intr); - - nir_variable *input_var = - nir_find_variable_with_driver_location(c->s, - nir_var_shader_in, - nir_intrinsic_base(intr)); - - if (input_var && util_varying_is_point_coord(input_var->data.location, - c->fs_key->point_sprite_mask)) { - assert(intr->num_components == 1); - - nir_ssa_def *result = &intr->dest.ssa; - - switch (comp) { - case 0: - case 1: - if (!c->fs_key->is_points) - result = nir_imm_float(b, 0.0); - break; - case 2: - result = nir_imm_float(b, 0.0); - break; - case 3: - result = nir_imm_float(b, 1.0); - break; - } - if (c->fs_key->point_coord_upper_left && comp == 1) - result = nir_fsub(b, nir_imm_float(b, 1.0), result); - if (result != &intr->dest.ssa) { - nir_ssa_def_rewrite_uses_after(&intr->dest.ssa, - result, - result->parent_instr); - } - } -} - static void v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b, struct nir_instr *instr, @@ -393,12 +316,6 @@ v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b, case nir_intrinsic_load_input: if (c->s->info.stage == MESA_SHADER_VERTEX) v3d_nir_lower_vertex_input(c, b, intr); - else if (c->s->info.stage == MESA_SHADER_FRAGMENT) - v3d_nir_lower_fragment_input(c, b, intr); - break; - - case nir_intrinsic_load_uniform: - v3d_nir_lower_uniform(c, b, intr); break; case nir_intrinsic_store_output: @@ -558,16 +475,16 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b, /* If this is a geometry shader we need to emit our fixed function * outputs to the current vertex offset in the VPM. */ - nir_ssa_def *offset_reg = + nir_def *offset_reg = c->s->info.stage == MESA_SHADER_GEOMETRY ? nir_load_var(b, state->gs.output_offset_var) : NULL; for (int i = 0; i < 4; i++) { if (!state->pos[i]) - state->pos[i] = nir_ssa_undef(b, 1, 32); + state->pos[i] = nir_undef(b, 1, 32); } - nir_ssa_def *rcp_wc = nir_frcp(b, state->pos[3]); + nir_def *rcp_wc = nir_frcp(b, state->pos[3]); if (state->pos_vpm_offset != -1) { for (int i = 0; i < 4; i++) { @@ -578,8 +495,8 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b, if (state->vp_vpm_offset != -1) { for (int i = 0; i < 2; i++) { - nir_ssa_def *pos; - nir_ssa_def *scale; + nir_def *pos; + nir_def *scale; pos = state->pos[i]; if (i == 0) scale = nir_load_viewport_x_scale(b); @@ -598,14 +515,18 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b, * The correct fix for this as recommended by Broadcom * is to convert to .8 fixed-point with ffloor(). 
                          */
-                        pos = nir_f2i32(b, nir_ffloor(b, pos));
-                        v3d_nir_store_output(b, state->vp_vpm_offset + i,
-                                             offset_reg, pos);
+                        if (c->devinfo->ver == 42)
+                                pos = nir_f2i32(b, nir_ffloor(b, pos));
+                        else
+                                pos = nir_f2i32(b, nir_fround_even(b, pos));
+
+                        v3d_nir_store_output(b, state->vp_vpm_offset + i,
+                                             offset_reg, pos);
                 }
         }
 
         if (state->zs_vpm_offset != -1) {
-                nir_ssa_def *z = state->pos[2];
+                nir_def *z = state->pos[2];
                 z = nir_fmul(b, z, nir_load_viewport_z_scale(b));
                 z = nir_fmul(b, z, rcp_wc);
                 z = nir_fadd(b, z, nir_load_viewport_z_offset(b));
@@ -679,21 +600,22 @@ emit_gs_vpm_output_header_prolog(struct v3d_compile *c, nir_builder *b,
          * have a variable just to keep track of the number of vertices we
          * emitted and instead we can just compute it here from the header
          * offset variable by removing the one generic header slot that always
-         * goes at the begining of out header.
+         * goes at the beginning of our header.
          */
-        nir_ssa_def *header_offset =
+        nir_def *header_offset =
                 nir_load_var(b, state->gs.header_offset_var);
-        nir_ssa_def *vertex_count =
-                nir_isub(b, header_offset, nir_imm_int(b, 1));
-        nir_ssa_def *header =
-                nir_ior(b, nir_imm_int(b, state->gs.output_header_size),
-                        nir_ishl(b, vertex_count,
-                                 nir_imm_int(b, VERTEX_COUNT_OFFSET)));
+        nir_def *vertex_count =
+                nir_iadd_imm(b, header_offset, -1);
+        nir_def *header =
+                nir_ior_imm(b,
+                            nir_ishl_imm(b, vertex_count,
+                                         VERTEX_COUNT_OFFSET),
+                            state->gs.output_header_size);
 
         v3d_nir_store_output(b, 0, NULL, header);
 }
 
-void
+bool
 v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c)
 {
         struct v3d_nir_lower_io_state state = { 0 };
@@ -713,36 +635,39 @@ v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c)
                 unreachable("Unsupported shader stage");
         }
 
-        nir_foreach_function(function, s) {
-                if (function->impl) {
-                        nir_builder b;
-                        nir_builder_init(&b, function->impl);
-
-                        if (c->s->info.stage == MESA_SHADER_GEOMETRY)
-                                emit_gs_prolog(c, &b, function->impl, &state);
-
-                        nir_foreach_block(block, function->impl) {
-                                nir_foreach_instr_safe(instr, block)
-                                        v3d_nir_lower_io_instr(c, &b, instr,
-                                                               &state);
-                        }
-
-                        nir_block *last = nir_impl_last_block(function->impl);
-                        b.cursor = nir_after_block(last);
-                        if (s->info.stage == MESA_SHADER_VERTEX) {
-                                v3d_nir_emit_ff_vpm_outputs(c, &b, &state);
-                        } else if (s->info.stage == MESA_SHADER_GEOMETRY) {
-                                emit_gs_vpm_output_header_prolog(c, &b, &state);
-                        }
-
-                        nir_metadata_preserve(function->impl,
-                                              nir_metadata_block_index |
-                                              nir_metadata_dominance);
+        nir_foreach_function_impl(impl, s) {
+                nir_builder b = nir_builder_create(impl);
+
+                if (c->s->info.stage == MESA_SHADER_GEOMETRY)
+                        emit_gs_prolog(c, &b, impl, &state);
+
+                nir_foreach_block(block, impl) {
+                        nir_foreach_instr_safe(instr, block)
+                                v3d_nir_lower_io_instr(c, &b, instr,
+                                                       &state);
                 }
+
+                nir_block *last = nir_impl_last_block(impl);
+                b.cursor = nir_after_block(last);
+                if (s->info.stage == MESA_SHADER_VERTEX) {
+                        v3d_nir_emit_ff_vpm_outputs(c, &b, &state);
+                } else if (s->info.stage == MESA_SHADER_GEOMETRY) {
+                        emit_gs_vpm_output_header_prolog(c, &b, &state);
+                }
+
+                nir_metadata_preserve(impl,
+                                      nir_metadata_block_index |
+                                      nir_metadata_dominance);
         }
 
         if (s->info.stage == MESA_SHADER_VERTEX ||
             s->info.stage == MESA_SHADER_GEOMETRY) {
                 v3d_nir_lower_io_update_output_var_base(c, &state);
         }
+
+        /* It is really unlikely that we don't get progress here, and fully
+         * filtering when not would make code more complex, but we are still
+         * interested in getting this lowering going through NIR_PASS.
+         */
+        return true;
 }
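To make the GS header layout built right above concrete, here is a hypothetical scalar equivalent of what emit_gs_vpm_output_header_prolog() now emits with nir_iadd_imm/nir_ishl_imm/nir_ior_imm (function and parameter names invented for the illustration):

static uint32_t
gs_prolog_header_example(uint32_t header_offset,
                         uint32_t output_header_size,
                         unsigned vertex_count_offset)
{
        /* One generic slot always precedes the per-vertex headers, so the
         * number of emitted vertices is header_offset - 1. */
        uint32_t vertex_count = header_offset - 1;

        /* The vertex count is packed above the header-size field. */
        return (vertex_count << vertex_count_offset) | output_header_size;
}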
diff --git a/src/broadcom/compiler/v3d_nir_lower_line_smooth.c b/src/broadcom/compiler/v3d_nir_lower_line_smooth.c
index 8f6e7d4e648..05b5224bc52 100644
--- a/src/broadcom/compiler/v3d_nir_lower_line_smooth.c
+++ b/src/broadcom/compiler/v3d_nir_lower_line_smooth.c
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2020 Raspberry Pi
+ * Copyright © 2020 Raspberry Pi Ltd
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -42,25 +42,23 @@ lower_line_smooth_intrinsic(struct lower_line_smooth_state *state,
 {
         b->cursor = nir_before_instr(&intr->instr);
 
-        nir_ssa_def *one = nir_imm_float(b, 1.0f);
+        nir_def *one = nir_imm_float(b, 1.0f);
 
-        nir_ssa_def *coverage = nir_load_var(b, state->coverage);
+        nir_def *coverage = nir_load_var(b, state->coverage);
 
-        nir_ssa_def *new_val = nir_fmul(b, nir_vec4(b, one, one, one, coverage),
+        nir_def *new_val = nir_fmul(b, nir_vec4(b, one, one, one, coverage),
                                         intr->src[0].ssa);
 
-        nir_instr_rewrite_src(&intr->instr,
-                              &intr->src[0],
-                              nir_src_for_ssa(new_val));
+        nir_src_rewrite(&intr->src[0], new_val);
 }
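In scalar terms, the rewrite performed by lower_line_smooth_intrinsic() above multiplies the stored color by (1, 1, 1, coverage), i.e. only alpha is attenuated. A hypothetical C model of that, not driver code:

struct vec4_example { float r, g, b, a; };

static struct vec4_example
apply_line_coverage_example(struct vec4_example color, float coverage)
{
        /* Matches nir_fmul(b, nir_vec4(b, one, one, one, coverage), src). */
        color.a *= coverage;
        return color;
}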
 
-static void
+static bool
 lower_line_smooth_func(struct lower_line_smooth_state *state,
                        nir_function_impl *impl)
 {
-        nir_builder b;
+        bool progress = false;
 
-        nir_builder_init(&b, impl);
+        nir_builder b = nir_builder_create(impl);
 
         nir_foreach_block(block, impl) {
                 nir_foreach_instr_safe(instr, block) {
@@ -72,58 +70,66 @@ lower_line_smooth_func(struct lower_line_smooth_state *state,
 
                         if (intr->intrinsic != nir_intrinsic_store_output ||
                             nir_intrinsic_base(intr) != 0 ||
-                            intr->num_components != 4 ||
-                            !intr->src[0].is_ssa)
+                            intr->num_components != 4)
                                 continue;
 
                         lower_line_smooth_intrinsic(state, &b, intr);
+                        progress = true;
                 }
         }
+
+        return progress;
 }
 
 static void
 initialise_coverage_var(struct lower_line_smooth_state *state,
                         nir_function_impl *impl)
 {
-        nir_builder b;
-
-        nir_builder_init(&b, impl);
+        nir_builder b = nir_builder_at(nir_before_impl(impl));
 
-        b.cursor = nir_before_block(nir_start_block(impl));
+        nir_def *line_width = nir_load_line_width(&b);
 
-        nir_ssa_def *line_width = nir_load_line_width(&b);
+        nir_def *real_line_width = nir_load_aa_line_width(&b);
 
-        nir_ssa_def *real_line_width = nir_load_aa_line_width(&b);
-
-        /* The line coord varies from 0.0 to 1.0 across the width of the line */
-        nir_ssa_def *line_coord = nir_load_line_coord(&b);
+        /* According to the PRM, the line coord varies from 0.0 to 1.0 across
+         * the width of the line. But actually, when a perspective projection
+         * is used, it is also applied to the line coords, so the values end
+         * up being between [min_coord, 1], based on the Wc coordinate. We
+         * need to re-map the values to be between [0.0, 1.0].
+         */
+        nir_def *line_coord = nir_load_line_coord(&b);
+        nir_def *wc = nir_load_fep_w_v3d(&b, 32);
+        nir_def *min_coord_val = nir_fsub(&b, nir_imm_float(&b, 1.0f), wc);
+        nir_def *normalized_line_coord = nir_fdiv(&b,
+                nir_fsub(&b, line_coord, min_coord_val),
+                nir_fsub_imm(&b, 1.0, min_coord_val));
 
         /* fabs(line_coord - 0.5) * real_line_width */
-        nir_ssa_def *pixels_from_center =
+        nir_def *pixels_from_center =
                 nir_fmul(&b, real_line_width,
-                         nir_fabs(&b, nir_fsub(&b, line_coord,
+                         nir_fabs(&b, nir_fsub(&b, normalized_line_coord,
                                                nir_imm_float(&b, 0.5f))));
 
         /* 0.5 - 1/√2 * (pixels_from_center - line_width * 0.5) */
-        nir_ssa_def *coverage =
+        nir_def *coverage =
                 nir_fsub(&b,
                          nir_imm_float(&b, 0.5f),
                          nir_fmul(&b,
                                   nir_imm_float(&b, 1.0f / M_SQRT2),
                                   nir_fsub(&b, pixels_from_center,
-                                           nir_fmul(&b,
-                                                    line_width,
-                                                    nir_imm_float(&b, 0.5f)))));
+                                           nir_fmul_imm(&b,
+                                                        line_width,
+                                                        0.5f))));
 
         /* Discard fragments that aren’t covered at all by the line */
-        nir_ssa_def *outside = nir_fge(&b, nir_imm_float(&b, 0.0f), coverage);
+        nir_def *outside = nir_fle_imm(&b, coverage, 0.0f);
         nir_discard_if(&b, outside);
 
         /* Clamp to at most 1.0. If it was less than 0.0 then the fragment will
          * be discarded so we don’t need to handle that.
         */
-        nir_ssa_def *clamped = nir_fmin(&b, coverage, nir_imm_float(&b, 1.0f));
+        nir_def *clamped = nir_fmin(&b, coverage, nir_imm_float(&b, 1.0f));
 
         nir_store_var(&b, state->coverage, clamped, 0x1 /* writemask */);
 }
@@ -140,9 +146,11 @@ make_coverage_var(nir_shader *s)
         return var;
 }
 
-void
+bool
 v3d_nir_lower_line_smooth(nir_shader *s)
 {
+        bool progress = false;
+
         assert(s->info.stage == MESA_SHADER_FRAGMENT);
 
         struct lower_line_smooth_state state = {
@@ -150,10 +158,20 @@ v3d_nir_lower_line_smooth(nir_shader *s)
                 .coverage = make_coverage_var(s),
         };
 
-        nir_foreach_function(function, s) {
+        nir_foreach_function_with_impl(function, impl, s) {
                 if (function->is_entrypoint)
-                        initialise_coverage_var(&state, function->impl);
+                        initialise_coverage_var(&state, impl);
+
+                progress |= lower_line_smooth_func(&state, impl);
 
-                lower_line_smooth_func(&state, function->impl);
+                if (progress) {
+                        nir_metadata_preserve(impl,
+                                              nir_metadata_block_index |
+                                              nir_metadata_dominance);
+                } else {
+                        nir_metadata_preserve(impl, nir_metadata_all);
+                }
         }
+
+        return progress;
+}
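A hypothetical scalar model of the coverage computation assembled in initialise_coverage_var() above; the NIR version additionally discards the fragment when the result is not positive:

#include <math.h>

static float
line_coverage_example(float line_coord, float wc,
                      float line_width, float real_line_width)
{
        /* Re-map the perspective-projected line coord from
         * [1 - wc, 1] back to [0, 1]. */
        float min_coord = 1.0f - wc;
        float t = (line_coord - min_coord) / (1.0f - min_coord);

        float pixels_from_center = real_line_width * fabsf(t - 0.5f);

        /* 0.5 - 1/sqrt(2) * (pixels_from_center - line_width * 0.5) */
        float coverage = 0.5f - (float)M_SQRT1_2 *
                (pixels_from_center - line_width * 0.5f);

        /* coverage <= 0 would be discarded; clamp the rest to 1.0. */
        return coverage > 1.0f ? 1.0f : coverage;
}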
diff --git a/src/broadcom/compiler/v3d_nir_lower_load_store_bitsize.c b/src/broadcom/compiler/v3d_nir_lower_load_store_bitsize.c
new file mode 100644
index 00000000000..0caf5dbc92c
--- /dev/null
+++ b/src/broadcom/compiler/v3d_nir_lower_load_store_bitsize.c
@@ -0,0 +1,260 @@
+/*
+ * Copyright © 2021 Raspberry Pi Ltd
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "compiler/v3d_compiler.h"
+#include "compiler/nir/nir_builder.h"
+
+/**
+ * The V3D TMU unit can only do 32-bit general vector access, so for anything
+ * else we need to split vector load/store instructions into scalar ones.
+ *
+ * Note that a vectorization pass after this lowering may be able to
+ * re-vectorize some of these using 32-bit load/store instructions instead,
+ * which we do support.
+ */
+
+static int
+value_src(nir_intrinsic_op intrinsic)
+{
+        switch (intrinsic) {
+        case nir_intrinsic_store_ssbo:
+        case nir_intrinsic_store_scratch:
+        case nir_intrinsic_store_global_2x32:
+                return 0;
+        default:
+                unreachable("Unsupported intrinsic");
+        }
+}
+
+static int
+offset_src(nir_intrinsic_op intrinsic)
+{
+        switch (intrinsic) {
+        case nir_intrinsic_load_uniform:
+        case nir_intrinsic_load_shared:
+        case nir_intrinsic_load_scratch:
+        case nir_intrinsic_load_global_2x32:
+                return 0;
+        case nir_intrinsic_load_ubo:
+        case nir_intrinsic_load_ssbo:
+        case nir_intrinsic_store_scratch:
+        case nir_intrinsic_store_global_2x32:
+                return 1;
+        case nir_intrinsic_store_ssbo:
+                return 2;
+        default:
+                unreachable("Unsupported intrinsic");
+        }
+}
+
+static nir_intrinsic_instr *
+init_scalar_intrinsic(nir_builder *b,
+                      nir_intrinsic_instr *intr,
+                      uint32_t component,
+                      nir_def *offset,
+                      uint32_t bit_size,
+                      nir_def **scalar_offset)
+{
+        nir_intrinsic_instr *new_intr =
+                nir_intrinsic_instr_create(b->shader, intr->intrinsic);
+
+        nir_intrinsic_copy_const_indices(new_intr, intr);
+
+        const int offset_units = bit_size / 8;
+        assert(offset_units >= 1);
+
+        if (nir_intrinsic_has_align_mul(intr)) {
+                assert(nir_intrinsic_has_align_offset(intr));
+                unsigned align_mul = nir_intrinsic_align_mul(intr);
+                unsigned align_off = nir_intrinsic_align_offset(intr);
+
+                align_off += offset_units * component;
+                align_off = align_off % align_mul;
+
+                nir_intrinsic_set_align(new_intr, align_mul, align_off);
+        }
+
+        *scalar_offset = offset;
+        unsigned offset_adj = offset_units * component;
+        if (nir_intrinsic_has_base(intr)) {
+                nir_intrinsic_set_base(
+                        new_intr, nir_intrinsic_base(intr) + offset_adj);
+        } else {
+                *scalar_offset =
+                        nir_iadd(b, offset,
+                                 nir_imm_intN_t(b, offset_adj,
+                                                offset->bit_size));
+        }
+
+        new_intr->num_components = 1;
+
+        return new_intr;
+}
+
+static bool
+lower_load_bitsize(nir_builder *b,
+                   nir_intrinsic_instr *intr)
+{
+        uint32_t bit_size = intr->def.bit_size;
+        if (bit_size == 32)
+                return false;
+
+        /* No need to split if it is already scalar */
+        int num_comp = nir_intrinsic_dest_components(intr);
+        if (num_comp <= 1)
+                return false;
+
+        b->cursor = nir_before_instr(&intr->instr);
+
+        /* For global 2x32 we ignore the Y component because it must be zero */
+        unsigned offset_idx = offset_src(intr->intrinsic);
+        nir_def *offset = nir_trim_vector(b, intr->src[offset_idx].ssa, 1);
+
+        /* Split the vector load into multiple scalar loads */
+        nir_def *dest_components[4] = { NULL };
+        const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
+        for (int component = 0; component < num_comp; component++) {
+                nir_def *scalar_offset;
+                nir_intrinsic_instr *new_intr =
+                        init_scalar_intrinsic(b, intr, component, offset,
+                                              bit_size, &scalar_offset);
+
+                for (unsigned i = 0; i < info->num_srcs; i++) {
+                        if (i ==
offset_idx) { + nir_def *final_offset; + final_offset = intr->intrinsic != nir_intrinsic_load_global_2x32 ? + scalar_offset : + nir_vec2(b, scalar_offset, + nir_imm_int(b, 0)); + new_intr->src[i] = nir_src_for_ssa(final_offset); + } else { + new_intr->src[i] = intr->src[i]; + } + } + + nir_def_init(&new_intr->instr, &new_intr->def, 1, + bit_size); + dest_components[component] = &new_intr->def; + + nir_builder_instr_insert(b, &new_intr->instr); + } + + nir_def *new_dst = nir_vec(b, dest_components, num_comp); + nir_def_rewrite_uses(&intr->def, new_dst); + + nir_instr_remove(&intr->instr); + return true; +} + +static bool +lower_store_bitsize(nir_builder *b, + nir_intrinsic_instr *intr) +{ + /* No need to split if it is already scalar */ + int value_idx = value_src(intr->intrinsic); + int num_comp = nir_intrinsic_src_components(intr, value_idx); + if (num_comp <= 1) + return false; + + /* No need to split if it is 32-bit */ + if (nir_src_bit_size(intr->src[value_idx]) == 32) + return false; + + nir_def *value = intr->src[value_idx].ssa; + + b->cursor = nir_before_instr(&intr->instr); + + /* For global 2x32 we ignore Y component because it must be zero */ + unsigned offset_idx = offset_src(intr->intrinsic); + nir_def *offset = nir_trim_vector(b, intr->src[offset_idx].ssa, 1); + + /* Split vector store to multiple scalar stores */ + const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic]; + unsigned wrmask = nir_intrinsic_write_mask(intr); + while (wrmask) { + unsigned component = ffs(wrmask) - 1; + + nir_def *scalar_offset; + nir_intrinsic_instr *new_intr = + init_scalar_intrinsic(b, intr, component, offset, + value->bit_size, &scalar_offset); + + nir_intrinsic_set_write_mask(new_intr, 0x1); + + for (unsigned i = 0; i < info->num_srcs; i++) { + if (i == value_idx) { + nir_def *scalar_value = + nir_channels(b, value, 1 << component); + new_intr->src[i] = nir_src_for_ssa(scalar_value); + } else if (i == offset_idx) { + nir_def *final_offset; + final_offset = intr->intrinsic != nir_intrinsic_store_global_2x32 ? 
+ scalar_offset : + nir_vec2(b, scalar_offset, + nir_imm_int(b, 0)); + new_intr->src[i] = nir_src_for_ssa(final_offset); + } else { + new_intr->src[i] = intr->src[i]; + } + } + + nir_builder_instr_insert(b, &new_intr->instr); + + wrmask &= ~(1 << component); + } + + nir_instr_remove(&intr->instr); + return true; +} + +static bool +lower_load_store_bitsize(nir_builder *b, nir_intrinsic_instr *intr, + void *data) +{ + switch (intr->intrinsic) { + case nir_intrinsic_load_ssbo: + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_uniform: + case nir_intrinsic_load_scratch: + case nir_intrinsic_load_global_2x32: + return lower_load_bitsize(b, intr); + + case nir_intrinsic_store_ssbo: + case nir_intrinsic_store_scratch: + case nir_intrinsic_store_global_2x32: + return lower_store_bitsize(b, intr); + + default: + return false; + } +} + +bool +v3d_nir_lower_load_store_bitsize(nir_shader *s) +{ + return nir_shader_intrinsics_pass(s, lower_load_store_bitsize, + nir_metadata_block_index | + nir_metadata_dominance, + NULL); +} diff --git a/src/broadcom/compiler/v3d_nir_lower_logic_ops.c b/src/broadcom/compiler/v3d_nir_lower_logic_ops.c index 11782c7348f..4affb79a7e2 100644 --- a/src/broadcom/compiler/v3d_nir_lower_logic_ops.c +++ b/src/broadcom/compiler/v3d_nir_lower_logic_ops.c @@ -36,8 +36,8 @@ #include "v3d_compiler.h" -typedef nir_ssa_def *(*nir_pack_func)(nir_builder *b, nir_ssa_def *c); -typedef nir_ssa_def *(*nir_unpack_func)(nir_builder *b, nir_ssa_def *c); +typedef nir_def *(*nir_pack_func)(nir_builder *b, nir_def *c); +typedef nir_def *(*nir_unpack_func)(nir_builder *b, nir_def *c); static bool logicop_depends_on_dst_color(int logicop_func) @@ -53,9 +53,9 @@ logicop_depends_on_dst_color(int logicop_func) } } -static nir_ssa_def * +static nir_def * v3d_logicop(nir_builder *b, int logicop_func, - nir_ssa_def *src, nir_ssa_def *dst) + nir_def *src, nir_def *dst) { switch (logicop_func) { case PIPE_LOGICOP_CLEAR: @@ -96,8 +96,8 @@ v3d_logicop(nir_builder *b, int logicop_func, } } -static nir_ssa_def * -v3d_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz) +static nir_def * +v3d_nir_get_swizzled_channel(nir_builder *b, nir_def **srcs, int swiz) { switch (swiz) { default: @@ -116,57 +116,57 @@ v3d_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz) } } -static nir_ssa_def * -v3d_nir_swizzle_and_pack(nir_builder *b, nir_ssa_def **chans, +static nir_def * +v3d_nir_swizzle_and_pack(nir_builder *b, nir_def **chans, const uint8_t *swiz, nir_pack_func pack_func) { - nir_ssa_def *c[4]; + nir_def *c[4]; for (int i = 0; i < 4; i++) c[i] = v3d_nir_get_swizzled_channel(b, chans, swiz[i]); return pack_func(b, nir_vec4(b, c[0], c[1], c[2], c[3])); } -static nir_ssa_def * -v3d_nir_unpack_and_swizzle(nir_builder *b, nir_ssa_def *packed, +static nir_def * +v3d_nir_unpack_and_swizzle(nir_builder *b, nir_def *packed, const uint8_t *swiz, nir_unpack_func unpack_func) { - nir_ssa_def *unpacked = unpack_func(b, packed); + nir_def *unpacked = unpack_func(b, packed); - nir_ssa_def *unpacked_chans[4]; + nir_def *unpacked_chans[4]; for (int i = 0; i < 4; i++) unpacked_chans[i] = nir_channel(b, unpacked, i); - nir_ssa_def *c[4]; + nir_def *c[4]; for (int i = 0; i < 4; i++) c[i] = v3d_nir_get_swizzled_channel(b, unpacked_chans, swiz[i]); return nir_vec4(b, c[0], c[1], c[2], c[3]); } -static nir_ssa_def * -pack_unorm_rgb10a2(nir_builder *b, nir_ssa_def *c) +static nir_def * +pack_unorm_rgb10a2(nir_builder *b, nir_def *c) { static const unsigned bits[4] = { 10, 10, 10, 2 }; - 
nir_ssa_def *unorm = nir_format_float_to_unorm(b, c, bits); + nir_def *unorm = nir_format_float_to_unorm(b, c, bits); - nir_ssa_def *chans[4]; + nir_def *chans[4]; for (int i = 0; i < 4; i++) chans[i] = nir_channel(b, unorm, i); - nir_ssa_def *result = nir_mov(b, chans[0]); + nir_def *result = nir_mov(b, chans[0]); int offset = bits[0]; for (int i = 1; i < 4; i++) { - nir_ssa_def *shifted_chan = - nir_ishl(b, chans[i], nir_imm_int(b, offset)); + nir_def *shifted_chan = + nir_ishl_imm(b, chans[i], offset); result = nir_ior(b, result, shifted_chan); offset += bits[i]; } return result; } -static nir_ssa_def * -unpack_unorm_rgb10a2(nir_builder *b, nir_ssa_def *c) +static nir_def * +unpack_unorm_rgb10a2(nir_builder *b, nir_def *c) { static const unsigned bits[4] = { 10, 10, 10, 2 }; const unsigned masks[4] = { BITFIELD_MASK(bits[0]), @@ -174,11 +174,11 @@ unpack_unorm_rgb10a2(nir_builder *b, nir_ssa_def *c) BITFIELD_MASK(bits[2]), BITFIELD_MASK(bits[3]) }; - nir_ssa_def *chans[4]; + nir_def *chans[4]; for (int i = 0; i < 4; i++) { - nir_ssa_def *unorm = nir_iand(b, c, nir_imm_int(b, masks[i])); + nir_def *unorm = nir_iand_imm(b, c, masks[i]); chans[i] = nir_format_unorm_to_float(b, unorm, &bits[i]); - c = nir_ushr(b, c, nir_imm_int(b, bits[i])); + c = nir_ushr_imm(b, c, bits[i]); } return nir_vec4(b, chans[0], chans[1], chans[2], chans[3]); @@ -201,13 +201,13 @@ v3d_get_format_swizzle_for_rt(struct v3d_compile *c, int rt) } } -static nir_ssa_def * +static nir_def * v3d_nir_get_tlb_color(nir_builder *b, struct v3d_compile *c, int rt, int sample) { uint32_t num_components = util_format_get_nr_components(c->fs_key->color_fmt[rt].format); - nir_ssa_def *color[4]; + nir_def *color[4]; for (int i = 0; i < 4; i++) { if (i < num_components) { color[i] = @@ -222,71 +222,68 @@ v3d_nir_get_tlb_color(nir_builder *b, struct v3d_compile *c, int rt, int sample) return nir_vec4(b, color[0], color[1], color[2], color[3]); } -static nir_ssa_def * +static nir_def * v3d_emit_logic_op_raw(struct v3d_compile *c, nir_builder *b, - nir_ssa_def **src_chans, nir_ssa_def **dst_chans, + nir_def **src_chans, nir_def **dst_chans, int rt, int sample) { const uint8_t *fmt_swz = v3d_get_format_swizzle_for_rt(c, rt); - nir_ssa_def *op_res[4]; + nir_def *op_res[4]; for (int i = 0; i < 4; i++) { - nir_ssa_def *src = src_chans[i]; - nir_ssa_def *dst = + nir_def *src = src_chans[i]; + nir_def *dst = v3d_nir_get_swizzled_channel(b, dst_chans, fmt_swz[i]); op_res[i] = v3d_logicop(b, c->fs_key->logicop_func, src, dst); - /* In Vulkan we configure our integer RTs to clamp, so we need - * to ignore result bits that don't fit in the destination RT - * component size. + /* We configure our integer RTs to clamp, so we need to ignore + * result bits that don't fit in the destination RT component + * size. 
*/ - if (c->key->environment == V3D_ENVIRONMENT_VULKAN) { - uint32_t bits = - util_format_get_component_bits( - c->fs_key->color_fmt[rt].format, - UTIL_FORMAT_COLORSPACE_RGB, i); - if (bits > 0 && bits < 32) { - nir_ssa_def *mask = - nir_imm_int(b, (1u << bits) - 1); - op_res[i] = nir_iand(b, op_res[i], mask); - } + uint32_t bits = + util_format_get_component_bits( + c->fs_key->color_fmt[rt].format, + UTIL_FORMAT_COLORSPACE_RGB, i); + if (bits > 0 && bits < 32) { + op_res[i] = + nir_iand_imm(b, op_res[i], (1u << bits) - 1); } } - nir_ssa_def *r[4]; + nir_def *r[4]; for (int i = 0; i < 4; i++) r[i] = v3d_nir_get_swizzled_channel(b, op_res, fmt_swz[i]); return nir_vec4(b, r[0], r[1], r[2], r[3]); } -static nir_ssa_def * +static nir_def * v3d_emit_logic_op_unorm(struct v3d_compile *c, nir_builder *b, - nir_ssa_def **src_chans, nir_ssa_def **dst_chans, + nir_def **src_chans, nir_def **dst_chans, int rt, int sample, nir_pack_func pack_func, nir_unpack_func unpack_func) { static const uint8_t src_swz[4] = { 0, 1, 2, 3 }; - nir_ssa_def *packed_src = + nir_def *packed_src = v3d_nir_swizzle_and_pack(b, src_chans, src_swz, pack_func); const uint8_t *fmt_swz = v3d_get_format_swizzle_for_rt(c, rt); - nir_ssa_def *packed_dst = + nir_def *packed_dst = v3d_nir_swizzle_and_pack(b, dst_chans, fmt_swz, pack_func); - nir_ssa_def *packed_result = + nir_def *packed_result = v3d_logicop(b, c->fs_key->logicop_func, packed_src, packed_dst); return v3d_nir_unpack_and_swizzle(b, packed_result, fmt_swz, unpack_func); } -static nir_ssa_def * +static nir_def * v3d_nir_emit_logic_op(struct v3d_compile *c, nir_builder *b, - nir_ssa_def *src, int rt, int sample) + nir_def *src, int rt, int sample) { - nir_ssa_def *dst = v3d_nir_get_tlb_color(b, c, rt, sample); + nir_def *dst = v3d_nir_get_tlb_color(b, c, rt, sample); - nir_ssa_def *src_chans[4], *dst_chans[4]; + nir_def *src_chans[4], *dst_chans[4]; for (unsigned i = 0; i < 4; i++) { src_chans[i] = nir_channel(b, src, i); dst_chans[i] = nir_channel(b, dst, i); @@ -309,7 +306,7 @@ v3d_nir_emit_logic_op(struct v3d_compile *c, nir_builder *b, static void v3d_emit_ms_output(nir_builder *b, - nir_ssa_def *color, nir_src *offset, + nir_def *color, nir_src *offset, nir_alu_type type, int rt, int sample) { nir_store_tlb_sample_color_v3d(b, color, nir_imm_int(b, rt), .base = sample, .component = 0, .src_type = type); @@ -321,7 +318,7 @@ v3d_nir_lower_logic_op_instr(struct v3d_compile *c, nir_intrinsic_instr *intr, int rt) { - nir_ssa_def *frag_color = intr->src[0].ssa; + nir_def *frag_color = intr->src[0].ssa; const int logic_op = c->fs_key->logicop_func; @@ -331,7 +328,7 @@ v3d_nir_lower_logic_op_instr(struct v3d_compile *c, nir_src *offset = &intr->src[1]; nir_alu_type type = nir_intrinsic_src_type(intr); for (int i = 0; i < V3D_MAX_SAMPLES; i++) { - nir_ssa_def *sample = + nir_def *sample = v3d_nir_emit_logic_op(c, b, frag_color, rt, i); v3d_emit_ms_output(b, sample, offset, type, rt, i); @@ -339,11 +336,10 @@ v3d_nir_lower_logic_op_instr(struct v3d_compile *c, nir_instr_remove(&intr->instr); } else { - nir_ssa_def *result = + nir_def *result = v3d_nir_emit_logic_op(c, b, frag_color, rt, 0); - nir_instr_rewrite_src(&intr->instr, &intr->src[0], - nir_src_for_ssa(result)); + nir_src_rewrite(&intr->src[0], result); intr->num_components = result->num_components; } } @@ -351,6 +347,8 @@ v3d_nir_lower_logic_op_instr(struct v3d_compile *c, static bool v3d_nir_lower_logic_ops_block(nir_block *block, struct v3d_compile *c) { + bool progress = false; + nir_foreach_instr_safe(instr, 
block) { if (instr->type != nir_instr_type_intrinsic) continue; @@ -384,35 +382,40 @@ v3d_nir_lower_logic_ops_block(nir_block *block, struct v3d_compile *c) continue; } - nir_function_impl *impl = - nir_cf_node_get_function(&block->cf_node); - nir_builder b; - nir_builder_init(&b, impl); - b.cursor = nir_before_instr(&intr->instr); + nir_builder b = nir_builder_at(nir_before_instr(&intr->instr)); v3d_nir_lower_logic_op_instr(c, &b, intr, rt); + + progress = true; } } - return true; + return progress; } -void +bool v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c) { + bool progress = false; + /* Nothing to do if logic op is 'copy src to dst' or if logic ops are * disabled (we set the logic op to copy in that case). */ if (c->fs_key->logicop_func == PIPE_LOGICOP_COPY) - return; + return false; - nir_foreach_function(function, s) { - if (function->impl) { - nir_foreach_block(block, function->impl) - v3d_nir_lower_logic_ops_block(block, c); + nir_foreach_function_impl(impl, s) { + nir_foreach_block(block, impl) + progress |= v3d_nir_lower_logic_ops_block(block, c); - nir_metadata_preserve(function->impl, + if (progress) { + nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance); + } else { + nir_metadata_preserve(impl, + nir_metadata_all); } } + + return progress; } diff --git a/src/broadcom/compiler/v3d_nir_lower_robust_buffer_access.c b/src/broadcom/compiler/v3d_nir_lower_robust_buffer_access.c deleted file mode 100644 index 40f1cc23b1a..00000000000 --- a/src/broadcom/compiler/v3d_nir_lower_robust_buffer_access.c +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Copyright © 2020 Raspberry Pi - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- */ - -#include "compiler/v3d_compiler.h" -#include "compiler/nir/nir_builder.h" - -static void -rewrite_offset(nir_builder *b, - nir_intrinsic_instr *instr, - uint32_t buffer_idx, - uint32_t offset_src, - nir_intrinsic_op buffer_size_op) -{ - b->cursor = nir_before_instr(&instr->instr); - - /* Get size of the buffer */ - nir_intrinsic_instr *size = - nir_intrinsic_instr_create(b->shader, buffer_size_op); - size->src[0] = nir_src_for_ssa(nir_imm_int(b, buffer_idx)); - nir_ssa_dest_init(&size->instr, &size->dest, 1, 32, NULL); - nir_builder_instr_insert(b, &size->instr); - - /* All out TMU accesses are 32-bit aligned */ - nir_ssa_def *aligned_buffer_size = - nir_iand(b, &size->dest.ssa, nir_imm_int(b, 0xfffffffc)); - - /* Rewrite offset */ - nir_ssa_def *offset = - nir_umin(b, instr->src[offset_src].ssa, aligned_buffer_size); - nir_instr_rewrite_src(&instr->instr, &instr->src[offset_src], - nir_src_for_ssa(offset)); -} - -static void -lower_load(struct v3d_compile *c, - nir_builder *b, - nir_intrinsic_instr *instr) -{ - uint32_t index = nir_src_comp_as_uint(instr->src[0], 0); - - nir_intrinsic_op op; - if (instr->intrinsic == nir_intrinsic_load_ubo) { - op = nir_intrinsic_get_ubo_size; - if (c->key->environment == V3D_ENVIRONMENT_VULKAN) - index--; - } else { - op = nir_intrinsic_get_ssbo_size; - } - - rewrite_offset(b, instr, index, 1, op); -} - -static void -lower_store(struct v3d_compile *c, - nir_builder *b, - nir_intrinsic_instr *instr) -{ - uint32_t index = nir_src_comp_as_uint(instr->src[1], 0); - rewrite_offset(b, instr, index, 2, nir_intrinsic_get_ssbo_size); -} - -static void -lower_atomic(struct v3d_compile *c, - nir_builder *b, - nir_intrinsic_instr *instr) -{ - uint32_t index = nir_src_comp_as_uint(instr->src[0], 0); - rewrite_offset(b, instr, index, 1, nir_intrinsic_get_ssbo_size); -} - -static void -lower_shared(struct v3d_compile *c, - nir_builder *b, - nir_intrinsic_instr *instr) -{ - b->cursor = nir_before_instr(&instr->instr); - nir_ssa_def *aligned_size = - nir_imm_int(b, c->s->info.shared_size & 0xfffffffc); - nir_ssa_def *offset = nir_umin(b, instr->src[0].ssa, aligned_size); - nir_instr_rewrite_src(&instr->instr, &instr->src[0], - nir_src_for_ssa(offset)); -} - -static void -lower_instr(struct v3d_compile *c, nir_builder *b, struct nir_instr *instr) -{ - if (instr->type != nir_instr_type_intrinsic) - return; - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - - switch (intr->intrinsic) { - case nir_intrinsic_load_ubo: - case nir_intrinsic_load_ssbo: - lower_load(c, b, intr); - break; - case nir_intrinsic_store_ssbo: - lower_store(c, b, intr); - break; - case nir_intrinsic_ssbo_atomic_add: - case nir_intrinsic_ssbo_atomic_imin: - case nir_intrinsic_ssbo_atomic_umin: - case nir_intrinsic_ssbo_atomic_imax: - case nir_intrinsic_ssbo_atomic_umax: - case nir_intrinsic_ssbo_atomic_and: - case nir_intrinsic_ssbo_atomic_or: - case nir_intrinsic_ssbo_atomic_xor: - case nir_intrinsic_ssbo_atomic_exchange: - case nir_intrinsic_ssbo_atomic_comp_swap: - lower_atomic(c, b, intr); - break; - case nir_intrinsic_load_shared: - case nir_intrinsic_shared_atomic_add: - case nir_intrinsic_shared_atomic_imin: - case nir_intrinsic_shared_atomic_umin: - case nir_intrinsic_shared_atomic_imax: - case nir_intrinsic_shared_atomic_umax: - case nir_intrinsic_shared_atomic_and: - case nir_intrinsic_shared_atomic_or: - case nir_intrinsic_shared_atomic_xor: - case nir_intrinsic_shared_atomic_exchange: - case nir_intrinsic_shared_atomic_comp_swap: - lower_shared(c, b, intr); - break; - 
default: - break; - } -} - -void -v3d_nir_lower_robust_buffer_access(nir_shader *s, struct v3d_compile *c) -{ - nir_foreach_function(function, s) { - if (function->impl) { - nir_builder b; - nir_builder_init(&b, function->impl); - - nir_foreach_block(block, function->impl) { - nir_foreach_instr_safe(instr, block) - lower_instr(c, &b, instr); - } - - nir_metadata_preserve(function->impl, - nir_metadata_block_index | - nir_metadata_dominance); - } - } -} diff --git a/src/broadcom/compiler/v3d_nir_lower_scratch.c b/src/broadcom/compiler/v3d_nir_lower_scratch.c index 893b6f6ae28..93ed1bb6e26 100644 --- a/src/broadcom/compiler/v3d_nir_lower_scratch.c +++ b/src/broadcom/compiler/v3d_nir_lower_scratch.c @@ -34,11 +34,11 @@ * writemasks in the process. */ -static nir_ssa_def * +static nir_def * v3d_nir_scratch_offset(nir_builder *b, nir_intrinsic_instr *instr) { bool is_store = instr->intrinsic == nir_intrinsic_store_scratch; - nir_ssa_def *offset = nir_ssa_for_src(b, instr->src[is_store ? 1 : 0], 1); + nir_def *offset = instr->src[is_store ? 1 : 0].ssa; assert(nir_intrinsic_align_mul(instr) >= 4); assert(nir_intrinsic_align_offset(instr) == 0); @@ -55,18 +55,18 @@ v3d_nir_lower_load_scratch(nir_builder *b, nir_intrinsic_instr *instr) { b->cursor = nir_before_instr(&instr->instr); - nir_ssa_def *offset = v3d_nir_scratch_offset(b,instr); + nir_def *offset = v3d_nir_scratch_offset(b,instr); - nir_ssa_def *chans[NIR_MAX_VEC_COMPONENTS]; + nir_def *chans[NIR_MAX_VEC_COMPONENTS]; for (int i = 0; i < instr->num_components; i++) { - nir_ssa_def *chan_offset = + nir_def *chan_offset = nir_iadd_imm(b, offset, V3D_CHANNELS * i * 4); nir_intrinsic_instr *chan_instr = nir_intrinsic_instr_create(b->shader, instr->intrinsic); chan_instr->num_components = 1; - nir_ssa_dest_init(&chan_instr->instr, &chan_instr->dest, 1, - instr->dest.ssa.bit_size, NULL); + nir_def_init(&chan_instr->instr, &chan_instr->def, 1, + instr->def.bit_size); chan_instr->src[0] = nir_src_for_ssa(chan_offset); @@ -74,11 +74,11 @@ v3d_nir_lower_load_scratch(nir_builder *b, nir_intrinsic_instr *instr) nir_builder_instr_insert(b, &chan_instr->instr); - chans[i] = &chan_instr->dest.ssa; + chans[i] = &chan_instr->def; } - nir_ssa_def *result = nir_vec(b, chans, instr->num_components); - nir_ssa_def_rewrite_uses(&instr->dest.ssa, result); + nir_def *result = nir_vec(b, chans, instr->num_components); + nir_def_rewrite_uses(&instr->def, result); nir_instr_remove(&instr->instr); } @@ -87,15 +87,14 @@ v3d_nir_lower_store_scratch(nir_builder *b, nir_intrinsic_instr *instr) { b->cursor = nir_before_instr(&instr->instr); - nir_ssa_def *offset = v3d_nir_scratch_offset(b, instr); - nir_ssa_def *value = nir_ssa_for_src(b, instr->src[0], - instr->num_components); + nir_def *offset = v3d_nir_scratch_offset(b, instr); + nir_def *value = instr->src[0].ssa; for (int i = 0; i < instr->num_components; i++) { if (!(nir_intrinsic_write_mask(instr) & (1 << i))) continue; - nir_ssa_def *chan_offset = + nir_def *chan_offset = nir_iadd_imm(b, offset, V3D_CHANNELS * i * 4); nir_intrinsic_instr *chan_instr = @@ -115,39 +114,29 @@ v3d_nir_lower_store_scratch(nir_builder *b, nir_intrinsic_instr *instr) nir_instr_remove(&instr->instr); } -void -v3d_nir_lower_scratch(nir_shader *s) +static bool +v3d_nir_lower_scratch_cb(nir_builder *b, + nir_intrinsic_instr *intr, + void *_state) { - nir_foreach_function(function, s) { - if (!function->impl) - continue; - - nir_builder b; - nir_builder_init(&b, function->impl); - - nir_foreach_block(block, function->impl) { - 
nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intr = - nir_instr_as_intrinsic(instr); - - switch (intr->intrinsic) { - case nir_intrinsic_load_scratch: - v3d_nir_lower_load_scratch(&b, intr); - break; - case nir_intrinsic_store_scratch: - v3d_nir_lower_store_scratch(&b, intr); - break; - default: - break; - } - } - } - - nir_metadata_preserve(function->impl, - nir_metadata_block_index | - nir_metadata_dominance); + switch (intr->intrinsic) { + case nir_intrinsic_load_scratch: + v3d_nir_lower_load_scratch(b, intr); + return true; + case nir_intrinsic_store_scratch: + v3d_nir_lower_store_scratch(b, intr); + return true; + default: + return false; } + + return false; +} + +bool +v3d_nir_lower_scratch(nir_shader *s) +{ + return nir_shader_intrinsics_pass(s, v3d_nir_lower_scratch_cb, + nir_metadata_block_index | + nir_metadata_dominance, NULL); } diff --git a/src/broadcom/compiler/v3d_nir_lower_txf_ms.c b/src/broadcom/compiler/v3d_nir_lower_txf_ms.c index d79969374d5..e78c3cb9e3e 100644 --- a/src/broadcom/compiler/v3d_nir_lower_txf_ms.c +++ b/src/broadcom/compiler/v3d_nir_lower_txf_ms.c @@ -32,25 +32,21 @@ * 2x2 quad. */ -#define V3D_MAX_SAMPLES 4 - -static nir_ssa_def * +static nir_def * v3d_nir_lower_txf_ms_instr(nir_builder *b, nir_instr *in_instr, void *data) { nir_tex_instr *instr = nir_instr_as_tex(in_instr); b->cursor = nir_before_instr(&instr->instr); - int coord_index = nir_tex_instr_src_index(instr, nir_tex_src_coord); - int sample_index = nir_tex_instr_src_index(instr, nir_tex_src_ms_index); - nir_ssa_def *coord = instr->src[coord_index].src.ssa; - nir_ssa_def *sample = instr->src[sample_index].src.ssa; + nir_def *coord = nir_steal_tex_src(instr, nir_tex_src_coord); + nir_def *sample = nir_steal_tex_src(instr, nir_tex_src_ms_index); - nir_ssa_def *one = nir_imm_int(b, 1); - nir_ssa_def *x = nir_iadd(b, + nir_def *one = nir_imm_int(b, 1); + nir_def *x = nir_iadd(b, nir_ishl(b, nir_channel(b, coord, 0), one), nir_iand(b, sample, one)); - nir_ssa_def *y = nir_iadd(b, + nir_def *y = nir_iadd(b, nir_ishl(b, nir_channel(b, coord, 1), one), nir_iand(b, nir_ushr(b, sample, one), one)); if (instr->is_array) @@ -58,10 +54,7 @@ v3d_nir_lower_txf_ms_instr(nir_builder *b, nir_instr *in_instr, void *data) else coord = nir_vec2(b, x, y); - nir_instr_rewrite_src(&instr->instr, - &instr->src[nir_tex_src_coord].src, - nir_src_for_ssa(coord)); - nir_tex_instr_remove_src(instr, sample_index); + nir_tex_instr_add_src(instr, nir_tex_src_coord, coord); instr->op = nir_texop_txf; instr->sampler_dim = GLSL_SAMPLER_DIM_2D; @@ -75,11 +68,11 @@ v3d_nir_lower_txf_ms_filter(const nir_instr *instr, const void *data) nir_instr_as_tex(instr)->op == nir_texop_txf_ms); } -void -v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c) +bool +v3d_nir_lower_txf_ms(nir_shader *s) { - nir_shader_lower_instructions(s, - v3d_nir_lower_txf_ms_filter, - v3d_nir_lower_txf_ms_instr, - NULL); + return nir_shader_lower_instructions(s, + v3d_nir_lower_txf_ms_filter, + v3d_nir_lower_txf_ms_instr, + NULL); } diff --git a/src/broadcom/compiler/v3d_packing.c b/src/broadcom/compiler/v3d_packing.c new file mode 100644 index 00000000000..46643edd5e6 --- /dev/null +++ b/src/broadcom/compiler/v3d_packing.c @@ -0,0 +1,50 @@ +/* + * Copyright © 2023 Raspberry Pi Ltd + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without 
restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "v3d_compiler.h" + +#define __gen_user_data void +#define __gen_address_type uint32_t +#define __gen_address_offset(reloc) (*reloc) +#define __gen_emit_reloc(cl, reloc) +#define __gen_unpack_address(cl, s, e) (__gen_unpack_uint(cl, s, e) << (31 - (e - s))) +#include "cle/v3d_packet_v42_pack.h" + + +/* Typically, this method would wrap calling a version-specific variant of this + * method, but as TMU_CONFIG_PARAMETER_1 doesn't change between v42 and v71, + * we can assume that p1_packed is the same struct, and use the same method. + */ +void +v3d_pack_unnormalized_coordinates(struct v3d_device_info *devinfo, + uint32_t *p1_packed, + bool unnormalized_coordinates) +{ + assert(devinfo->ver == 71 || devinfo->ver == 42); + + struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked; + V3D42_TMU_CONFIG_PARAMETER_1_unpack((uint8_t *)p1_packed, &p1_unpacked); + p1_unpacked.unnormalized_coordinates = unnormalized_coordinates; + V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL, (uint8_t *)p1_packed, + &p1_unpacked); +} diff --git a/src/broadcom/compiler/v3d40_tex.c b/src/broadcom/compiler/v3d_tex.c index 7bebfe95552..643c73c4e58 100644 --- a/src/broadcom/compiler/v3d40_tex.c +++ b/src/broadcom/compiler/v3d_tex.c @@ -28,27 +28,29 @@ #define __gen_address_type uint32_t #define __gen_address_offset(reloc) (*reloc) #define __gen_emit_reloc(cl, reloc) -#include "cle/v3d_packet_v41_pack.h" +#include "cle/v3d_packet_v42_pack.h" -static inline void +static inline struct qinst * vir_TMU_WRITE(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val) { /* XXX perf: We should figure out how to merge ALU operations * producing the val with this MOV, when possible. 
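 *
 * An illustrative sketch of the idea (simplified VIR, not the compiler's
 * actual output today): instead of emitting
 *
 *    fadd t3, t1, t2   ; compute the value into a temp
 *    mov tmud, t3      ; extra MOV added by vir_TMU_WRITE
 *
 * the ALU instruction could target the magic TMU register directly:
 *
 *    fadd tmud, t1, t2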
*/ - vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val); + return vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val); } -static inline void +static inline struct qinst * vir_TMU_WRITE_or_count(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val, uint32_t *tmu_writes) { - if (tmu_writes) + if (tmu_writes) { (*tmu_writes)++; - else - vir_TMU_WRITE(c, waddr, val); + return NULL; + } else { + return vir_TMU_WRITE(c, waddr, val); + } } static void @@ -59,11 +61,11 @@ vir_WRTMUC(struct v3d_compile *c, enum quniform_contents contents, uint32_t data inst->uniform = vir_get_uniform_index(c, contents, data); } -static const struct V3D41_TMU_CONFIG_PARAMETER_1 p1_unpacked_default = { +static const struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked_default = { .per_pixel_mask_enable = true, }; -static const struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked_default = { +static const struct V3D42_TMU_CONFIG_PARAMETER_2 p2_unpacked_default = { .op = V3D_TMU_OP_REGULAR, }; @@ -84,7 +86,7 @@ handle_tex_src(struct v3d_compile *c, nir_tex_instr *instr, unsigned src_idx, unsigned non_array_components, - struct V3D41_TMU_CONFIG_PARAMETER_2 *p2_unpacked, + struct V3D42_TMU_CONFIG_PARAMETER_2 *p2_unpacked, struct qreg *s_out, unsigned *tmu_writes) { @@ -199,7 +201,7 @@ handle_tex_src(struct v3d_compile *c, static void vir_tex_handle_srcs(struct v3d_compile *c, nir_tex_instr *instr, - struct V3D41_TMU_CONFIG_PARAMETER_2 *p2_unpacked, + struct V3D42_TMU_CONFIG_PARAMETER_2 *p2_unpacked, struct qreg *s, unsigned *tmu_writes) { @@ -222,31 +224,62 @@ get_required_tex_tmu_writes(struct v3d_compile *c, nir_tex_instr *instr) } void -v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) +v3d_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) { - assert(instr->op != nir_texop_lod || c->devinfo->ver >= 42); - unsigned texture_idx = instr->texture_index; - unsigned sampler_idx = instr->sampler_index; - struct V3D41_TMU_CONFIG_PARAMETER_0 p0_unpacked = { + /* For instructions that don't have a sampler (i.e. txf) we bind + * default sampler state via the backend_flags to handle precision. + */ + unsigned sampler_idx = nir_tex_instr_need_sampler(instr) ? + instr->sampler_index : instr->backend_flags; + + /* Even if the texture operation doesn't need a sampler by + * itself, we still need to add the sampler configuration + * parameter if the output is 32 bit + */ + assert(sampler_idx < c->key->num_samplers_used); + bool output_type_32_bit = + c->key->sampler[sampler_idx].return_size == 32; + + struct V3D42_TMU_CONFIG_PARAMETER_0 p0_unpacked = { }; /* Limit the number of channels returned to both how many the NIR * instruction writes and how many the instruction could produce. */ - p0_unpacked.return_words_of_texture_data = - instr->dest.is_ssa ? - nir_ssa_def_components_read(&instr->dest.ssa) : - (1 << instr->dest.reg.reg->num_components) - 1; + nir_intrinsic_instr *store = nir_store_reg_for_def(&instr->def); + if (store == NULL) { + p0_unpacked.return_words_of_texture_data = + nir_def_components_read(&instr->def); + } else { + nir_def *reg = store->src[1].ssa; + nir_intrinsic_instr *decl = nir_reg_get_decl(reg); + unsigned reg_num_components = + nir_intrinsic_num_components(decl); + + /* For the non-ssa case we don't have a full equivalent to + * nir_def_components_read. This is a problem for the 16 + * bit case. nir_lower_tex will not change the destination as + * nir_tex_instr_dest_size will still return 4. 
The driver is + * just expected to not store on other channels, so we + * manually ensure that here. + */ + uint32_t num_components = output_type_32_bit ? + MIN2(reg_num_components, 4) : + MIN2(reg_num_components, 2); + + p0_unpacked.return_words_of_texture_data = (1 << num_components) - 1; + } assert(p0_unpacked.return_words_of_texture_data != 0); - struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked = { + struct V3D42_TMU_CONFIG_PARAMETER_2 p2_unpacked = { .op = V3D_TMU_OP_REGULAR, .gather_mode = instr->op == nir_texop_tg4, .gather_component = instr->component, .coefficient_mode = instr->op == nir_texop_txd, - .disable_autolod = instr->op == nir_texop_tg4 + .disable_autolod = instr->op == nir_texop_tg4, + .lod_query = instr->op == nir_texop_lod, }; const unsigned tmu_writes = get_required_tex_tmu_writes(c, instr); @@ -270,22 +303,15 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) vir_tex_handle_srcs(c, instr, &p2_unpacked, &s, NULL); uint32_t p0_packed; - V3D41_TMU_CONFIG_PARAMETER_0_pack(NULL, + V3D42_TMU_CONFIG_PARAMETER_0_pack(NULL, (uint8_t *)&p0_packed, &p0_unpacked); uint32_t p2_packed; - V3D41_TMU_CONFIG_PARAMETER_2_pack(NULL, + V3D42_TMU_CONFIG_PARAMETER_2_pack(NULL, (uint8_t *)&p2_packed, &p2_unpacked); - /* We manually set the LOD Query bit (see - * V3D42_TMU_CONFIG_PARAMETER_2) as right now is the only V42 specific - * feature over V41 we are using - */ - if (instr->op == nir_texop_lod) - p2_packed |= 1UL << 24; - /* Load texture_idx number into the high bits of the texture address field, * which will be used by the driver to decide which texture to put * in the actual address field. @@ -294,14 +320,6 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) vir_WRTMUC(c, QUNIFORM_TMU_CONFIG_P0, p0_packed); - /* Even if the texture operation doesn't need a sampler by - * itself, we still need to add the sampler configuration - * parameter if the output is 32 bit - */ - bool output_type_32_bit = - c->key->sampler[sampler_idx].return_size == 32 && - !instr->is_shadow; - /* p1 is optional, but we can skip it only if p2 can be skipped too */ bool needs_p2_config = (instr->op == nir_texop_lod || @@ -313,7 +331,7 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) output_type_32_bit; if (non_default_p1_config) { - struct V3D41_TMU_CONFIG_PARAMETER_1 p1_unpacked = { + struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked = { .output_type_32_bit = output_type_32_bit, .unnormalized_coordinates = (instr->sampler_dim == @@ -330,7 +348,7 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) p0_unpacked.return_words_of_texture_data < (1 << 2)); uint32_t p1_packed; - V3D41_TMU_CONFIG_PARAMETER_1_pack(NULL, + V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL, (uint8_t *)&p1_packed, &p1_unpacked); @@ -358,7 +376,7 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) * address */ uint32_t p1_packed_default; - V3D41_TMU_CONFIG_PARAMETER_1_pack(NULL, + V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL, (uint8_t *)&p1_packed_default, &p1_unpacked_default); vir_WRTMUC(c, QUNIFORM_CONSTANT, p1_packed_default); @@ -368,48 +386,54 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed); /* Emit retiring TMU write */ + struct qinst *retiring; if (instr->op == nir_texop_txf) { assert(instr->sampler_dim != GLSL_SAMPLER_DIM_CUBE); - vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s); + retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s); } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { - vir_TMU_WRITE(c, 
V3D_QPU_WADDR_TMUSCM, s); + retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s); } else if (instr->op == nir_texop_txl) { - vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s); + retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s); } else { - vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s); + retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s); } - ntq_add_pending_tmu_flush(c, &instr->dest, + retiring->ldtmu_count = p0_unpacked.return_words_of_texture_data; + ntq_add_pending_tmu_flush(c, &instr->def, p0_unpacked.return_words_of_texture_data); } static uint32_t -v3d40_image_load_store_tmu_op(nir_intrinsic_instr *instr) +v3d_image_atomic_tmu_op(nir_intrinsic_instr *instr) +{ + nir_atomic_op atomic_op = nir_intrinsic_atomic_op(instr); + switch (atomic_op) { + case nir_atomic_op_iadd: return v3d_get_op_for_atomic_add(instr, 3); + case nir_atomic_op_imin: return V3D_TMU_OP_WRITE_SMIN; + case nir_atomic_op_umin: return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR; + case nir_atomic_op_imax: return V3D_TMU_OP_WRITE_SMAX; + case nir_atomic_op_umax: return V3D_TMU_OP_WRITE_UMAX; + case nir_atomic_op_iand: return V3D_TMU_OP_WRITE_AND_READ_INC; + case nir_atomic_op_ior: return V3D_TMU_OP_WRITE_OR_READ_DEC; + case nir_atomic_op_ixor: return V3D_TMU_OP_WRITE_XOR_READ_NOT; + case nir_atomic_op_xchg: return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH; + case nir_atomic_op_cmpxchg: return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH; + default: unreachable("unknown atomic op"); + } +} + +static uint32_t +v3d_image_load_store_tmu_op(nir_intrinsic_instr *instr) { switch (instr->intrinsic) { case nir_intrinsic_image_load: case nir_intrinsic_image_store: return V3D_TMU_OP_REGULAR; - case nir_intrinsic_image_atomic_add: - return v3d_get_op_for_atomic_add(instr, 3); - case nir_intrinsic_image_atomic_imin: - return V3D_TMU_OP_WRITE_SMIN; - case nir_intrinsic_image_atomic_umin: - return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR; - case nir_intrinsic_image_atomic_imax: - return V3D_TMU_OP_WRITE_SMAX; - case nir_intrinsic_image_atomic_umax: - return V3D_TMU_OP_WRITE_UMAX; - case nir_intrinsic_image_atomic_and: - return V3D_TMU_OP_WRITE_AND_READ_INC; - case nir_intrinsic_image_atomic_or: - return V3D_TMU_OP_WRITE_OR_READ_DEC; - case nir_intrinsic_image_atomic_xor: - return V3D_TMU_OP_WRITE_XOR_READ_NOT; - case nir_intrinsic_image_atomic_exchange: - return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH; - case nir_intrinsic_image_atomic_comp_swap: - return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH; + + case nir_intrinsic_image_atomic: + case nir_intrinsic_image_atomic_swap: + return v3d_image_atomic_tmu_op(instr); + default: unreachable("unknown image intrinsic"); }; @@ -427,7 +451,7 @@ v3d40_image_load_store_tmu_op(nir_intrinsic_instr *instr) * which is why we always call ntq_get_src() even if we are only interested in * register write counts. 
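 *
 * A condensed sketch of the two-pass pattern this enables (same helpers
 * as in this file, control flow simplified for illustration):
 *
 *    unsigned tmu_writes = 0;
 *    vir_image_emit_register_writes(c, instr, atomic_add_replaced,
 *                                   &tmu_writes);    // pass 1: just count
 *    // ...derive the TMU configuration from tmu_writes...
 *    vir_image_emit_register_writes(c, instr, atomic_add_replaced,
 *                                   NULL);           // pass 2: really emit
 *
 * vir_TMU_WRITE_or_count() increments the counter in the first pass and
 * emits the MOV to the TMU register in the second.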
*/ -static void +static struct qinst * vir_image_emit_register_writes(struct v3d_compile *c, nir_intrinsic_instr *instr, bool atomic_add_replaced, @@ -480,7 +504,8 @@ vir_image_emit_register_writes, } /* Second atomic argument */ - if (instr->intrinsic == nir_intrinsic_image_atomic_comp_swap) { + if (instr->intrinsic == nir_intrinsic_image_atomic_swap && + nir_intrinsic_atomic_op(instr) == nir_atomic_op_cmpxchg) { struct qreg src_4_0 = ntq_get_src(c, instr->src[4], 0); vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUD, src_4_0, tmu_writes); @@ -494,7 +519,8 @@ vir_image_emit_register_writes(struct v3d_compile *c, V3D_QPU_PF_PUSHZ); } - vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUSF, src_1_0, tmu_writes); + struct qinst *retiring = + vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUSF, src_1_0, tmu_writes); if (!tmu_writes && vir_in_nonuniform_control_flow(c) && instr->intrinsic != nir_intrinsic_image_load) { @@ -502,6 +528,8 @@ vir_image_emit_register_writes(struct v3d_compile *c, (struct qinst *)c->cur_block->instructions.prev; vir_set_cond(last_inst, V3D_QPU_COND_IFA); } + + return retiring; } static unsigned @@ -516,21 +544,21 @@ get_required_image_tmu_writes(struct v3d_compile *c, } void -v3d40_vir_emit_image_load_store(struct v3d_compile *c, - nir_intrinsic_instr *instr) +v3d_vir_emit_image_load_store(struct v3d_compile *c, + nir_intrinsic_instr *instr) { unsigned format = nir_intrinsic_format(instr); unsigned unit = nir_src_as_uint(instr->src[0]); - struct V3D41_TMU_CONFIG_PARAMETER_0 p0_unpacked = { + struct V3D42_TMU_CONFIG_PARAMETER_0 p0_unpacked = { }; - struct V3D41_TMU_CONFIG_PARAMETER_1 p1_unpacked = { + struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked = { .per_pixel_mask_enable = true, .output_type_32_bit = v3d_gl_format_is_return_32(format), }; - struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked = { 0 }; + struct V3D42_TMU_CONFIG_PARAMETER_2 p2_unpacked = { 0 }; /* Limit the number of channels returned to both how many the NIR * instruction writes and how many the instruction could produce. @@ -542,19 +570,20 @@ v3d40_vir_emit_image_load_store, p0_unpacked.return_words_of_texture_data = (1 << instr_return_channels) - 1; - p2_unpacked.op = v3d40_image_load_store_tmu_op(instr); + p2_unpacked.op = v3d_image_load_store_tmu_op(instr); /* If we were able to replace atomic_add with an inc/dec, then we * need/can do things slightly differently, like not loading the * amount to add/sub, as that is implicit. 
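 *
 * For example (a hedged sketch, NIR spelling approximate): an
 * image_atomic with nir_atomic_op_iadd and constant data +1 maps to
 * V3D_TMU_OP_WRITE_AND_READ_INC, and -1 to V3D_TMU_OP_WRITE_OR_READ_DEC,
 * so no TMUD data write is needed for the operand in those cases.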
*/ bool atomic_add_replaced = - (instr->intrinsic == nir_intrinsic_image_atomic_add && - (p2_unpacked.op == V3D_TMU_OP_WRITE_AND_READ_INC || - p2_unpacked.op == V3D_TMU_OP_WRITE_OR_READ_DEC)); + instr->intrinsic == nir_intrinsic_image_atomic && + nir_intrinsic_atomic_op(instr) == nir_atomic_op_iadd && + (p2_unpacked.op == V3D_TMU_OP_WRITE_AND_READ_INC || + p2_unpacked.op == V3D_TMU_OP_WRITE_OR_READ_DEC); uint32_t p0_packed; - V3D41_TMU_CONFIG_PARAMETER_0_pack(NULL, + V3D42_TMU_CONFIG_PARAMETER_0_pack(NULL, (uint8_t *)&p0_packed, &p0_unpacked); @@ -565,12 +594,12 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c, p0_packed |= unit << 24; uint32_t p1_packed; - V3D41_TMU_CONFIG_PARAMETER_1_pack(NULL, + V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL, (uint8_t *)&p1_packed, &p1_unpacked); uint32_t p2_packed; - V3D41_TMU_CONFIG_PARAMETER_2_pack(NULL, + V3D42_TMU_CONFIG_PARAMETER_2_pack(NULL, (uint8_t *)&p2_packed, &p2_unpacked); @@ -599,8 +628,9 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c, if (memcmp(&p2_unpacked, &p2_unpacked_default, sizeof(p2_unpacked))) vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed); - vir_image_emit_register_writes(c, instr, atomic_add_replaced, NULL); - - ntq_add_pending_tmu_flush(c, &instr->dest, + struct qinst *retiring = + vir_image_emit_register_writes(c, instr, atomic_add_replaced, NULL); + retiring->ldtmu_count = p0_unpacked.return_words_of_texture_data; + ntq_add_pending_tmu_flush(c, &instr->def, p0_unpacked.return_words_of_texture_data); } diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c index 27869a35a3b..c59a8aac434 100644 --- a/src/broadcom/compiler/vir.c +++ b/src/broadcom/compiler/vir.c @@ -23,7 +23,6 @@ #include "broadcom/common/v3d_device_info.h" #include "v3d_compiler.h" -#include "util/u_prim.h" #include "compiler/nir/nir_schedule.h" #include "compiler/nir/nir_builder.h" @@ -89,7 +88,7 @@ vir_has_side_effects(struct v3d_compile *c, struct qinst *inst) * pointer, so each read has a side effect (we don't care for ldunif * because we reconstruct the uniform stream buffer after compiling * with the surviving uniforms), so allowing DCE to remove - * one would break follow-up loads. We could fix this by emiting a + * one would break follow-up loads. We could fix this by emitting a * unifa for each ldunifa, but each unifa requires 3 delay slots * before a ldunifa, so that would be quite expensive. 
*/ @@ -113,10 +112,10 @@ vir_is_raw_mov(struct qinst *inst) return false; } - if (inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE || - inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE || - inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE || - inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) { + if (inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE || + inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE || + inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE || + inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) { return false; } @@ -156,30 +155,12 @@ vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst) } bool -vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst) +vir_writes_r4_implicitly(const struct v3d_device_info *devinfo, + struct qinst *inst) { - for (int i = 0; i < vir_get_nsrc(inst); i++) { - switch (inst->src[i].file) { - case QFILE_VPM: - return true; - default: - break; - } - } - - if (devinfo->ver < 41 && (inst->qpu.sig.ldvary || - inst->qpu.sig.ldtlb || - inst->qpu.sig.ldtlbu || - inst->qpu.sig.ldvpm)) { - return true; - } - - return false; -} + if (!devinfo->has_accumulators) + return false; -bool -vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst) -{ switch (inst->dst.file) { case QFILE_MAGIC: switch (inst->dst.index) { @@ -195,9 +176,6 @@ vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst) break; } - if (devinfo->ver < 41 && inst->qpu.sig.ldtmu) - return true; - return false; } @@ -209,15 +187,15 @@ vir_set_unpack(struct qinst *inst, int src, if (vir_is_add(inst)) { if (src == 0) - inst->qpu.alu.add.a_unpack = unpack; + inst->qpu.alu.add.a.unpack = unpack; else - inst->qpu.alu.add.b_unpack = unpack; + inst->qpu.alu.add.b.unpack = unpack; } else { assert(vir_is_mul(inst)); if (src == 0) - inst->qpu.alu.mul.a_unpack = unpack; + inst->qpu.alu.mul.a.unpack = unpack; else - inst->qpu.alu.mul.b_unpack = unpack; + inst->qpu.alu.mul.b.unpack = unpack; } } @@ -369,6 +347,8 @@ vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst, struct qreg src0, struct q inst->src[1] = src1; inst->uniform = ~0; + inst->ip = -1; + return inst; } @@ -385,6 +365,8 @@ vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct q inst->src[1] = src1; inst->uniform = ~0; + inst->ip = -1; + return inst; } @@ -404,12 +386,16 @@ vir_branch_inst(struct v3d_compile *c, enum v3d_qpu_branch_cond cond) inst->dst = vir_nop_reg(); inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, 0); + inst->ip = -1; + return inst; } static void vir_emit(struct v3d_compile *c, struct qinst *inst) { + inst->ip = -1; + switch (c->cursor.mode) { case vir_cursor_add: list_add(&inst->link, c->cursor.link); @@ -509,13 +495,15 @@ vir_link_blocks(struct qblock *predecessor, struct qblock *successor) } const struct v3d_compiler * -v3d_compiler_init(const struct v3d_device_info *devinfo) +v3d_compiler_init(const struct v3d_device_info *devinfo, + uint32_t max_inline_uniform_buffers) { struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler); if (!compiler) return NULL; compiler->devinfo = devinfo; + compiler->max_inline_uniform_buffers = max_inline_uniform_buffers; if (!vir_init_reg_sets(compiler)) { ralloc_free(compiler); @@ -531,6 +519,19 @@ v3d_compiler_free(const struct v3d_compiler *compiler) ralloc_free((void *)compiler); } +struct v3d_compiler_strategy { + const char *name; + uint32_t max_threads; + uint32_t min_threads; + bool disable_general_tmu_sched; + bool disable_gcm; + bool disable_loop_unrolling; + bool 
disable_ubo_load_sorting; + bool move_buffer_loads; + bool disable_tmu_pipelining; + uint32_t max_tmu_spills; +}; + static struct v3d_compile * vir_compile_init(const struct v3d_compiler *compiler, struct v3d_key *key, @@ -539,12 +540,8 @@ vir_compile_init(const struct v3d_compiler *compiler, void *debug_output_data), void *debug_output_data, int program_id, int variant_id, - uint32_t max_threads, - uint32_t min_threads_for_reg_alloc, - bool tmu_spilling_allowed, - bool disable_loop_unrolling, - bool disable_constant_ubo_load_sorting, - bool disable_tmu_pipelining, + uint32_t compile_strategy_idx, + const struct v3d_compiler_strategy *strategy, bool fallback_scheduler) { struct v3d_compile *c = rzalloc(NULL, struct v3d_compile); @@ -554,17 +551,22 @@ vir_compile_init(const struct v3d_compiler *compiler, c->key = key; c->program_id = program_id; c->variant_id = variant_id; - c->threads = max_threads; + c->compile_strategy_idx = compile_strategy_idx; + c->threads = strategy->max_threads; c->debug_output = debug_output; c->debug_output_data = debug_output_data; c->compilation_result = V3D_COMPILATION_SUCCEEDED; - c->min_threads_for_reg_alloc = min_threads_for_reg_alloc; - c->tmu_spilling_allowed = tmu_spilling_allowed; + c->min_threads_for_reg_alloc = strategy->min_threads; + c->max_tmu_spills = strategy->max_tmu_spills; c->fallback_scheduler = fallback_scheduler; - c->disable_tmu_pipelining = disable_tmu_pipelining; - c->disable_constant_ubo_load_sorting = disable_constant_ubo_load_sorting; - c->disable_loop_unrolling = V3D_DEBUG & V3D_DEBUG_NO_LOOP_UNROLL - ? true : disable_loop_unrolling; + c->disable_general_tmu_sched = strategy->disable_general_tmu_sched; + c->disable_tmu_pipelining = strategy->disable_tmu_pipelining; + c->disable_constant_ubo_load_sorting = strategy->disable_ubo_load_sorting; + c->move_buffer_loads = strategy->move_buffer_loads; + c->disable_gcm = strategy->disable_gcm; + c->disable_loop_unrolling = V3D_DBG(NO_LOOP_UNROLL) + ? true : strategy->disable_loop_unrolling; + s = nir_shader_clone(c, s); c->s = s; @@ -590,17 +592,107 @@ type_size_vec4(const struct glsl_type *type, bool bindless) return glsl_count_attribute_slots(type, false); } +static enum nir_lower_tex_packing +lower_tex_packing_cb(const nir_tex_instr *tex, const void *data) +{ + struct v3d_compile *c = (struct v3d_compile *) data; + + int sampler_index = nir_tex_instr_need_sampler(tex) ? + tex->sampler_index : tex->backend_flags; + + assert(sampler_index < c->key->num_samplers_used); + return c->key->sampler[sampler_index].return_size == 16 ? 
+ nir_lower_tex_packing_16 : nir_lower_tex_packing_none; +} + +static bool +v3d_nir_lower_null_pointers_cb(nir_builder *b, + nir_intrinsic_instr *intr, + void *_state) +{ + uint32_t buffer_src_idx; + + switch (intr->intrinsic) { + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_ssbo: + buffer_src_idx = 0; + break; + case nir_intrinsic_store_ssbo: + buffer_src_idx = 1; + break; + default: + return false; + } + + /* If the index is constant we are good */ + nir_src *src = &intr->src[buffer_src_idx]; + if (nir_src_is_const(*src)) + return false; + + /* Otherwise, see if it comes from a bcsel including a null pointer */ + if (src->ssa->parent_instr->type != nir_instr_type_alu) + return false; + + nir_alu_instr *alu = nir_instr_as_alu(src->ssa->parent_instr); + if (alu->op != nir_op_bcsel) + return false; + + /* A null pointer is specified using block index 0xffffffff */ + int32_t null_src_idx = -1; + for (int i = 1; i < 3; i++) { + /* FIXME: since we are running this before optimization maybe + * we need to also handle the case where we may have a bcsel + * chain that we need to recurse into? + */ + if (!nir_src_is_const(alu->src[i].src)) + continue; + if (nir_src_comp_as_uint(alu->src[i].src, 0) != 0xffffffff) + continue; + + /* One of the bcsel srcs is a null pointer reference */ + null_src_idx = i; + break; + } + + if (null_src_idx < 0) + return false; + + assert(null_src_idx == 1 || null_src_idx == 2); + int32_t copy_src_idx = null_src_idx == 1 ? 2 : 1; + + /* Rewrite the null pointer reference so we use the same buffer index + * as the other bcsel branch. This will allow optimization to remove + * the bcsel and we should then end up with a constant buffer index + * like we need. + */ + b->cursor = nir_before_instr(&alu->instr); + nir_def *copy = nir_mov(b, alu->src[copy_src_idx].src.ssa); + nir_src_rewrite(&alu->src[null_src_idx].src, copy); + + return true; +} + +static bool +v3d_nir_lower_null_pointers(nir_shader *s) +{ + return nir_shader_intrinsics_pass(s, v3d_nir_lower_null_pointers_cb, + nir_metadata_block_index | + nir_metadata_dominance, NULL); +} + static void v3d_lower_nir(struct v3d_compile *c) { struct nir_lower_tex_options tex_options = { .lower_txd = true, + .lower_tg4_offsets = true, .lower_tg4_broadcom_swizzle = true, .lower_rect = false, /* XXX: Use this on V3D 3.x */ .lower_txp = ~0, /* Apply swizzles to all samplers. */ .swizzle_result = ~0, + .lower_invalid_implicit_lod = true, }; /* Lower the format swizzle and (for 32-bit returns) @@ -612,38 +704,35 @@ v3d_lower_nir(struct v3d_compile *c) tex_options.swizzles[i][j] = c->key->tex[i].swizzle[j]; } - assert(c->key->num_samplers_used <= ARRAY_SIZE(c->key->sampler)); - for (int i = 0; i < c->key->num_samplers_used; i++) { - if (c->key->sampler[i].return_size == 16) { - tex_options.lower_tex_packing[i] = - nir_lower_tex_packing_16; - } - } - - /* CS textures may not have return_size reflecting the shadow state. 
*/ - nir_foreach_uniform_variable(var, c->s) { - const struct glsl_type *type = glsl_without_array(var->type); - unsigned array_len = MAX2(glsl_get_length(var->type), 1); + tex_options.lower_tex_packing_cb = lower_tex_packing_cb; + tex_options.lower_tex_packing_data = c; - if (!glsl_type_is_sampler(type) || - !glsl_sampler_type_is_shadow(type)) - continue; + NIR_PASS(_, c->s, nir_lower_tex, &tex_options); + NIR_PASS(_, c->s, nir_lower_system_values); - for (int i = 0; i < array_len; i++) { - tex_options.lower_tex_packing[var->data.binding + i] = - nir_lower_tex_packing_16; - } + if (c->s->info.zero_initialize_shared_memory && + c->s->info.shared_size > 0) { + /* All our BOs allocate full pages, so the underlying allocation + * for shared memory will always be a multiple of 4KB. This + * ensures that we can do an exact number of full chunk_size + * writes to initialize the memory independently of the actual + * shared_size used by the shader, which is a requirement of + * the initialization pass. + */ + const unsigned chunk_size = 16; /* max single store size */ + NIR_PASS(_, c->s, nir_zero_initialize_shared_memory, + align(c->s->info.shared_size, chunk_size), chunk_size); } - NIR_PASS_V(c->s, nir_lower_tex, &tex_options); - NIR_PASS_V(c->s, nir_lower_system_values); - NIR_PASS_V(c->s, nir_lower_compute_system_values, NULL); + NIR_PASS(_, c->s, nir_lower_compute_system_values, NULL); - NIR_PASS_V(c->s, nir_lower_vars_to_scratch, - nir_var_function_temp, - 0, - glsl_get_natural_size_align_bytes); - NIR_PASS_V(c->s, v3d_nir_lower_scratch); + NIR_PASS(_, c->s, nir_lower_vars_to_scratch, + nir_var_function_temp, + 0, + glsl_get_natural_size_align_bytes); + NIR_PASS(_, c->s, nir_lower_is_helper_invocation); + NIR_PASS(_, c->s, v3d_nir_lower_scratch); + NIR_PASS(_, c->s, v3d_nir_lower_null_pointers); } static void @@ -711,6 +800,10 @@ v3d_vs_set_prog_data(struct v3d_compile *c, /* Set us up for shared input/output segments. This is apparently * necessary for our VCM setup to avoid varying corruption. + * + * FIXME: initial testing on V3D 7.1 seems to work fine when using + * separate segments. So we could try to reevaluate in the future, if + * there is any advantage of using separate segments. 
*/ prog_data->separate_segments = false; prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size, @@ -807,13 +900,14 @@ v3d_fs_set_prog_data(struct v3d_compile *c, { v3d_set_fs_prog_data_inputs(c, prog_data); prog_data->writes_z = c->writes_z; + prog_data->writes_z_from_fep = c->writes_z_from_fep; prog_data->disable_ez = !c->s->info.fs.early_fragment_tests; prog_data->uses_center_w = c->uses_center_w; prog_data->uses_implicit_point_line_varyings = c->uses_implicit_point_line_varyings; prog_data->lock_scoreboard_on_first_thrsw = c->lock_scoreboard_on_first_thrsw; - prog_data->force_per_sample_msaa = c->force_per_sample_msaa; + prog_data->force_per_sample_msaa = c->s->info.fs.uses_sample_shading; prog_data->uses_pid = c->fs_uses_primitive_id; } @@ -837,8 +931,14 @@ v3d_set_prog_data(struct v3d_compile *c, prog_data->threads = c->threads; prog_data->single_seg = !c->last_thrsw; prog_data->spill_size = c->spill_size; + prog_data->tmu_spills = c->spills; + prog_data->tmu_fills = c->fills; + prog_data->tmu_count = c->tmu.total_count; + prog_data->qpu_read_stalls = c->qpu_inst_stalled_count; + prog_data->compile_strategy_idx = c->compile_strategy_idx; prog_data->tmu_dirty_rcl = c->tmu_dirty_rcl; prog_data->has_control_barrier = c->s->info.uses_control_barrier; + prog_data->has_global_address = c->has_global_address; v3d_set_prog_data_uniforms(c, prog_data); @@ -882,32 +982,32 @@ v3d_nir_lower_vs_early(struct v3d_compile *c) /* Split our I/O vars and dead code eliminate the unused * components. */ - NIR_PASS_V(c->s, nir_lower_io_to_scalar_early, - nir_var_shader_in | nir_var_shader_out); + NIR_PASS(_, c->s, nir_lower_io_to_scalar_early, + nir_var_shader_in | nir_var_shader_out); uint64_t used_outputs[4] = {0}; for (int i = 0; i < c->vs_key->num_used_outputs; i++) { int slot = v3d_slot_get_slot(c->vs_key->used_outputs[i]); int comp = v3d_slot_get_component(c->vs_key->used_outputs[i]); used_outputs[comp] |= 1ull << slot; } - NIR_PASS_V(c->s, nir_remove_unused_io_vars, - nir_var_shader_out, used_outputs, NULL); /* demotes to globals */ - NIR_PASS_V(c->s, nir_lower_global_vars_to_local); + NIR_PASS(_, c->s, nir_remove_unused_io_vars, + nir_var_shader_out, used_outputs, NULL); /* demotes to globals */ + NIR_PASS(_, c->s, nir_lower_global_vars_to_local); v3d_optimize_nir(c, c->s); - NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in, NULL); + NIR_PASS(_, c->s, nir_remove_dead_variables, nir_var_shader_in, NULL); /* This must go before nir_lower_io */ if (c->vs_key->per_vertex_point_size) - NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f); + NIR_PASS(_, c->s, nir_lower_point_size, 1.0f, 0.0f); - NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out, - type_size_vec4, - (nir_lower_io_options)0); + NIR_PASS(_, c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out, + type_size_vec4, + (nir_lower_io_options)0); /* clean up nir_lower_io's deref_var remains and do a constant folding pass * on the code it generated. */ - NIR_PASS_V(c->s, nir_opt_dce); - NIR_PASS_V(c->s, nir_opt_constant_folding); + NIR_PASS(_, c->s, nir_opt_dce); + NIR_PASS(_, c->s, nir_opt_constant_folding); } static void @@ -916,29 +1016,32 @@ v3d_nir_lower_gs_early(struct v3d_compile *c) /* Split our I/O vars and dead code eliminate the unused * components. 
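 * (As the loop below illustrates: a used output at slot S, component C
 * sets bit S of used_outputs[C], and nir_remove_unused_io_vars then
 * demotes and eliminates every output variable left unmarked.)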
*/ - NIR_PASS_V(c->s, nir_lower_io_to_scalar_early, - nir_var_shader_in | nir_var_shader_out); + NIR_PASS(_, c->s, nir_lower_io_to_scalar_early, + nir_var_shader_in | nir_var_shader_out); uint64_t used_outputs[4] = {0}; for (int i = 0; i < c->gs_key->num_used_outputs; i++) { int slot = v3d_slot_get_slot(c->gs_key->used_outputs[i]); int comp = v3d_slot_get_component(c->gs_key->used_outputs[i]); used_outputs[comp] |= 1ull << slot; } - NIR_PASS_V(c->s, nir_remove_unused_io_vars, - nir_var_shader_out, used_outputs, NULL); /* demotes to globals */ - NIR_PASS_V(c->s, nir_lower_global_vars_to_local); + NIR_PASS(_, c->s, nir_remove_unused_io_vars, + nir_var_shader_out, used_outputs, NULL); /* demotes to globals */ + NIR_PASS(_, c->s, nir_lower_global_vars_to_local); v3d_optimize_nir(c, c->s); - NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in, NULL); + NIR_PASS(_, c->s, nir_remove_dead_variables, nir_var_shader_in, NULL); /* This must go before nir_lower_io */ if (c->gs_key->per_vertex_point_size) - NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f); + NIR_PASS(_, c->s, nir_lower_point_size, 1.0f, 0.0f); - NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out, - type_size_vec4, - (nir_lower_io_options)0); - /* clean up nir_lower_io's deref_var remains */ - NIR_PASS_V(c->s, nir_opt_dce); + NIR_PASS(_, c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out, + type_size_vec4, + (nir_lower_io_options)0); + /* clean up nir_lower_io's deref_var remains and do a constant folding pass + * on the code it generated. + */ + NIR_PASS(_, c->s, nir_opt_dce); + NIR_PASS(_, c->s, nir_opt_constant_folding); } static void @@ -977,11 +1080,11 @@ v3d_nir_lower_fs_early(struct v3d_compile *c) if (c->fs_key->int_color_rb || c->fs_key->uint_color_rb) v3d_fixup_fs_output_types(c); - NIR_PASS_V(c->s, v3d_nir_lower_logic_ops, c); + NIR_PASS(_, c->s, v3d_nir_lower_logic_ops, c); if (c->fs_key->line_smoothing) { - v3d_nir_lower_line_smooth(c->s); - NIR_PASS_V(c->s, nir_lower_global_vars_to_local); + NIR_PASS(_, c->s, v3d_nir_lower_line_smooth); + NIR_PASS(_, c->s, nir_lower_global_vars_to_local); /* The lowering pass can introduce new sysval reads */ nir_shader_gather_info(c->s, nir_shader_get_entrypoint(c->s)); } @@ -991,26 +1094,26 @@ static void v3d_nir_lower_gs_late(struct v3d_compile *c) { if (c->key->ucp_enables) { - NIR_PASS_V(c->s, nir_lower_clip_gs, c->key->ucp_enables, - false, NULL); + NIR_PASS(_, c->s, nir_lower_clip_gs, c->key->ucp_enables, + true, NULL); } /* Note: GS output scalarizing must happen after nir_lower_clip_gs. */ - NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out); + NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL); } static void v3d_nir_lower_vs_late(struct v3d_compile *c) { if (c->key->ucp_enables) { - NIR_PASS_V(c->s, nir_lower_clip_vs, c->key->ucp_enables, - false, false, NULL); + NIR_PASS(_, c->s, nir_lower_clip_vs, c->key->ucp_enables, + false, true, NULL); NIR_PASS_V(c->s, nir_lower_io_to_scalar, - nir_var_shader_out); + nir_var_shader_out, NULL, NULL); } /* Note: VS output scalarizing must happen after nir_lower_clip_vs. */ - NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out); + NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL); } static void @@ -1024,9 +1127,9 @@ v3d_nir_lower_fs_late(struct v3d_compile *c) * are using. 
*/ if (c->key->ucp_enables) - NIR_PASS_V(c->s, nir_lower_clip_fs, c->key->ucp_enables, true); + NIR_PASS(_, c->s, nir_lower_clip_fs, c->key->ucp_enables, true); - NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in); + NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in, NULL, NULL); } static uint32_t @@ -1107,6 +1210,69 @@ v3d_intrinsic_dependency_cb(nir_intrinsic_instr *intr, return false; } +static unsigned +v3d_instr_delay_cb(nir_instr *instr, void *data) +{ + struct v3d_compile *c = (struct v3d_compile *) data; + + switch (instr->type) { + case nir_instr_type_undef: + case nir_instr_type_load_const: + case nir_instr_type_alu: + case nir_instr_type_deref: + case nir_instr_type_jump: + case nir_instr_type_parallel_copy: + case nir_instr_type_call: + case nir_instr_type_phi: + return 1; + + /* We should not use very large delays for TMU instructions. Typically, + * thread switches will be sufficient to hide all or most of the latency, + * so we typically only need a little bit of extra room. If we over-estimate + * the latency here we may end up unnecessarily delaying the critical path in + * the shader, which would have a negative effect in performance, so here + * we are trying to strike a balance based on empirical testing. + */ + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (!c->disable_general_tmu_sched) { + switch (intr->intrinsic) { + case nir_intrinsic_decl_reg: + case nir_intrinsic_load_reg: + case nir_intrinsic_store_reg: + return 0; + case nir_intrinsic_load_ssbo: + case nir_intrinsic_load_scratch: + case nir_intrinsic_load_shared: + case nir_intrinsic_image_load: + return 3; + case nir_intrinsic_load_ubo: + if (nir_src_is_divergent(intr->src[1])) + return 3; + FALLTHROUGH; + default: + return 1; + } + } else { + switch (intr->intrinsic) { + case nir_intrinsic_decl_reg: + case nir_intrinsic_load_reg: + case nir_intrinsic_store_reg: + return 0; + default: + return 1; + } + } + break; + } + + case nir_instr_type_tex: + return 5; + } + + return 0; +} + static bool should_split_wrmask(const nir_instr *instr, const void *data) { @@ -1197,7 +1363,7 @@ v3d_nir_sort_constant_ubo_load(nir_block *block, nir_intrinsic_instr *ref) * reference offset, since otherwise we would not be able to * skip the unifa write for them. See ntq_emit_load_ubo_unifa. 
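 * For instance (offsets hypothetical): once unifa has been pointed at
 * offset 0 of a UBO, a constant load at offset 8 that is within
 * MAX_UNIFA_SKIP_DISTANCE can be serviced by issuing extra ldunifa reads
 * (each advances the stream by 4 bytes) instead of paying for another
 * unifa write and its delay slots.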
*/ - if (abs(ref_offset - offset) > MAX_UNIFA_SKIP_DISTANCE) + if (abs((int)(ref_offset - offset)) > MAX_UNIFA_SKIP_DISTANCE) continue; /* We will move this load if its offset is smaller than ref's @@ -1349,16 +1515,14 @@ v3d_nir_sort_constant_ubo_loads_block(struct v3d_compile *c, static bool v3d_nir_sort_constant_ubo_loads(nir_shader *s, struct v3d_compile *c) { - nir_foreach_function(function, s) { - if (function->impl) { - nir_foreach_block(block, function->impl) { - c->sorted_any_ubo_loads |= - v3d_nir_sort_constant_ubo_loads_block(c, block); - } - nir_metadata_preserve(function->impl, - nir_metadata_block_index | - nir_metadata_dominance); + nir_foreach_function_impl(impl, s) { + nir_foreach_block(block, impl) { + c->sorted_any_ubo_loads |= + v3d_nir_sort_constant_ubo_loads_block(c, block); } + nir_metadata_preserve(impl, + nir_metadata_block_index | + nir_metadata_dominance); } return c->sorted_any_ubo_loads; } @@ -1376,8 +1540,8 @@ lower_load_num_subgroups(struct v3d_compile *c, DIV_ROUND_UP(c->s->info.workgroup_size[0] * c->s->info.workgroup_size[1] * c->s->info.workgroup_size[2], V3D_CHANNELS); - nir_ssa_def *result = nir_imm_int(b, num_subgroups); - nir_ssa_def_rewrite_uses(&intr->dest.ssa, result); + nir_def *result = nir_imm_int(b, num_subgroups); + nir_def_rewrite_uses(&intr->def, result); nir_instr_remove(&intr->instr); } @@ -1404,6 +1568,36 @@ lower_subgroup_intrinsics(struct v3d_compile *c, case nir_intrinsic_load_subgroup_size: case nir_intrinsic_load_subgroup_invocation: case nir_intrinsic_elect: + case nir_intrinsic_ballot: + case nir_intrinsic_inverse_ballot: + case nir_intrinsic_ballot_bitfield_extract: + case nir_intrinsic_ballot_bit_count_reduce: + case nir_intrinsic_ballot_find_lsb: + case nir_intrinsic_ballot_find_msb: + case nir_intrinsic_ballot_bit_count_exclusive: + case nir_intrinsic_ballot_bit_count_inclusive: + case nir_intrinsic_reduce: + case nir_intrinsic_inclusive_scan: + case nir_intrinsic_exclusive_scan: + case nir_intrinsic_read_invocation: + case nir_intrinsic_read_first_invocation: + case nir_intrinsic_load_subgroup_eq_mask: + case nir_intrinsic_load_subgroup_ge_mask: + case nir_intrinsic_load_subgroup_gt_mask: + case nir_intrinsic_load_subgroup_le_mask: + case nir_intrinsic_load_subgroup_lt_mask: + case nir_intrinsic_shuffle: + case nir_intrinsic_shuffle_xor: + case nir_intrinsic_shuffle_up: + case nir_intrinsic_shuffle_down: + case nir_intrinsic_vote_all: + case nir_intrinsic_vote_any: + case nir_intrinsic_vote_feq: + case nir_intrinsic_vote_ieq: + case nir_intrinsic_quad_broadcast: + case nir_intrinsic_quad_swap_horizontal: + case nir_intrinsic_quad_swap_vertical: + case nir_intrinsic_quad_swap_diagonal: c->has_subgroups = true; break; default: @@ -1418,18 +1612,15 @@ static bool v3d_nir_lower_subgroup_intrinsics(nir_shader *s, struct v3d_compile *c) { bool progress = false; - nir_foreach_function(function, s) { - if (function->impl) { - nir_builder b; - nir_builder_init(&b, function->impl); + nir_foreach_function_impl(impl, s) { + nir_builder b = nir_builder_create(impl); - nir_foreach_block(block, function->impl) - progress |= lower_subgroup_intrinsics(c, block, &b); + nir_foreach_block(block, impl) + progress |= lower_subgroup_intrinsics(c, block, &b); - nir_metadata_preserve(function->impl, - nir_metadata_block_index | - nir_metadata_dominance); - } + nir_metadata_preserve(impl, + nir_metadata_block_index | + nir_metadata_dominance); } return progress; } @@ -1483,30 +1674,54 @@ v3d_attempt_compile(struct v3d_compile *c) break; } - 
NIR_PASS_V(c->s, v3d_nir_lower_io, c); - NIR_PASS_V(c->s, v3d_nir_lower_txf_ms, c); - NIR_PASS_V(c->s, v3d_nir_lower_image_load_store); + NIR_PASS(_, c->s, v3d_nir_lower_io, c); + NIR_PASS(_, c->s, v3d_nir_lower_txf_ms); + NIR_PASS(_, c->s, v3d_nir_lower_image_load_store, c); + + NIR_PASS(_, c->s, nir_opt_idiv_const, 8); nir_lower_idiv_options idiv_options = { - .imprecise_32bit_lowering = true, .allow_fp16 = true, }; - NIR_PASS_V(c->s, nir_lower_idiv, &idiv_options); - - if (c->key->robust_buffer_access) { - /* v3d_nir_lower_robust_buffer_access assumes constant buffer - * indices on ubo/ssbo intrinsics so run copy propagation and - * constant folding passes before we run the lowering to warrant - * this. We also want to run the lowering before v3d_optimize to - * clean-up redundant get_buffer_size calls produced in the pass. - */ - NIR_PASS_V(c->s, nir_copy_prop); - NIR_PASS_V(c->s, nir_opt_constant_folding); - NIR_PASS_V(c->s, v3d_nir_lower_robust_buffer_access, c); + NIR_PASS(_, c->s, nir_lower_idiv, &idiv_options); + NIR_PASS(_, c->s, nir_lower_alu); + + if (c->key->robust_uniform_access || c->key->robust_storage_access || + c->key->robust_image_access) { + /* nir_lower_robust_access assumes constant buffer + * indices on ubo/ssbo intrinsics so run copy propagation and + * constant folding passes before we run the lowering to warrant + * this. We also want to run the lowering before v3d_optimize to + * clean-up redundant get_buffer_size calls produced in the pass. + */ + NIR_PASS(_, c->s, nir_copy_prop); + NIR_PASS(_, c->s, nir_opt_constant_folding); + + nir_lower_robust_access_options opts = { + .lower_image = c->key->robust_image_access, + .lower_ssbo = c->key->robust_storage_access, + .lower_ubo = c->key->robust_uniform_access, + }; + + NIR_PASS(_, c->s, nir_lower_robust_access, &opts); } - NIR_PASS_V(c->s, nir_lower_wrmasks, should_split_wrmask, c->s); + NIR_PASS(_, c->s, nir_lower_wrmasks, should_split_wrmask, c->s); - NIR_PASS_V(c->s, v3d_nir_lower_subgroup_intrinsics, c); + NIR_PASS(_, c->s, v3d_nir_lower_load_store_bitsize); + + NIR_PASS(_, c->s, v3d_nir_lower_subgroup_intrinsics, c); + + const nir_lower_subgroups_options subgroup_opts = { + .subgroup_size = V3D_CHANNELS, + .ballot_components = 1, + .ballot_bit_size = 32, + .lower_to_scalar = true, + .lower_inverse_ballot = true, + .lower_subgroup_masks = true, + .lower_relative_shuffle = true, + .lower_quad = true, + }; + NIR_PASS(_, c->s, nir_lower_subgroups, &subgroup_opts); v3d_optimize_nir(c, c->s); @@ -1519,25 +1734,25 @@ v3d_attempt_compile(struct v3d_compile *c) while (more_late_algebraic) { more_late_algebraic = false; NIR_PASS(more_late_algebraic, c->s, nir_opt_algebraic_late); - NIR_PASS_V(c->s, nir_opt_constant_folding); - NIR_PASS_V(c->s, nir_copy_prop); - NIR_PASS_V(c->s, nir_opt_dce); - NIR_PASS_V(c->s, nir_opt_cse); + NIR_PASS(_, c->s, nir_opt_constant_folding); + NIR_PASS(_, c->s, nir_copy_prop); + NIR_PASS(_, c->s, nir_opt_dce); + NIR_PASS(_, c->s, nir_opt_cse); } - NIR_PASS_V(c->s, nir_lower_bool_to_int32); - nir_convert_to_lcssa(c->s, true, true); + NIR_PASS(_, c->s, nir_lower_bool_to_int32); + NIR_PASS(_, c->s, nir_convert_to_lcssa, true, true); NIR_PASS_V(c->s, nir_divergence_analysis); - NIR_PASS_V(c->s, nir_convert_from_ssa, true); + NIR_PASS(_, c->s, nir_convert_from_ssa, true); struct nir_schedule_options schedule_options = { /* Schedule for about half our register space, to enable more * shaders to hit 4 threads. */ - .threshold = 24, + .threshold = c->threads == 4 ? 
24 : 48, /* Vertex shaders share the same memory for inputs and outputs, - * fragement and geometry shaders do not. + * fragment and geometry shaders do not. */ .stages_with_shared_io_memory = (((1 << MESA_ALL_SHADER_STAGES) - 1) & @@ -1548,11 +1763,22 @@ v3d_attempt_compile(struct v3d_compile *c) .intrinsic_cb = v3d_intrinsic_dependency_cb, .intrinsic_cb_data = c, + + .instr_delay_cb = v3d_instr_delay_cb, + .instr_delay_cb_data = c, }; NIR_PASS_V(c->s, nir_schedule, &schedule_options); if (!c->disable_constant_ubo_load_sorting) - NIR_PASS_V(c->s, v3d_nir_sort_constant_ubo_loads, c); + NIR_PASS(_, c->s, v3d_nir_sort_constant_ubo_loads, c); + + const nir_move_options buffer_opts = c->move_buffer_loads ? + (nir_move_load_ubo | nir_move_load_ssbo) : 0; + NIR_PASS(_, c->s, nir_opt_move, nir_move_load_uniform | + nir_move_const_undef | + buffer_opts); + + NIR_PASS_V(c->s, nir_trivialize_registers); v3d_nir_to_vir(c); } @@ -1611,32 +1837,28 @@ int v3d_shaderdb_dump(struct v3d_compile *c, * register allocation to any particular thread count). This is fine * because v3d_nir_to_vir will cap this to the actual minimum. */ -struct v3d_compiler_strategy { - const char *name; - uint32_t max_threads; - uint32_t min_threads; - bool disable_loop_unrolling; - bool disable_ubo_load_sorting; - bool disable_tmu_pipelining; - bool tmu_spilling_allowed; -} static const strategies[] = { - /*0*/ { "default", 4, 4, false, false, false, false }, - /*1*/ { "disable loop unrolling", 4, 4, true, false, false, false }, - /*2*/ { "disable UBO load sorting", 4, 4, true, true, false, false }, - /*3*/ { "disable TMU pipelining", 4, 4, true, true, true, false }, - /*4*/ { "lower thread count", 2, 1, false, false, false, false }, - /*5*/ { "disable loop unrolling (ltc)", 2, 1, true, false, false, false }, - /*6*/ { "disable UBO load sorting (ltc)", 2, 1, true, true, false, false }, - /*7*/ { "disable TMU pipelining (ltc)", 2, 1, true, true, true, true }, - /*8*/ { "fallback scheduler", 2, 1, true, true, true, true } +static const struct v3d_compiler_strategy strategies[] = { + /*0*/ { "default", 4, 4, false, false, false, false, false, false, 0 }, + /*1*/ { "disable general TMU sched", 4, 4, true, false, false, false, false, false, 0 }, + /*2*/ { "disable gcm", 4, 4, true, true, false, false, false, false, 0 }, + /*3*/ { "disable loop unrolling", 4, 4, true, true, true, false, false, false, 0 }, + /*4*/ { "disable UBO load sorting", 4, 4, true, true, true, true, false, false, 0 }, + /*5*/ { "disable TMU pipelining", 4, 4, true, true, true, true, false, true, 0 }, + /*6*/ { "lower thread count", 2, 1, false, false, false, false, false, false, -1 }, + /*7*/ { "disable general TMU sched (2t)", 2, 1, true, false, false, false, false, false, -1 }, + /*8*/ { "disable gcm (2t)", 2, 1, true, true, false, false, false, false, -1 }, + /*9*/ { "disable loop unrolling (2t)", 2, 1, true, true, true, false, false, false, -1 }, + /*10*/ { "Move buffer loads (2t)", 2, 1, true, true, true, true, true, false, -1 }, + /*11*/ { "disable TMU pipelining (2t)", 2, 1, true, true, true, true, true, true, -1 }, + /*12*/ { "fallback scheduler", 2, 1, true, true, true, true, true, true, -1 } }; /** * If a particular optimization didn't make any progress during a compile - * attempt disabling it alone won't allow us to compile the shader successfuly, + * attempt disabling it alone won't allow us to compile the shader successfully, * since we'll end up with the same code. Detect these scenarios so we can * avoid wasting time with useless compiles. 
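 * For example, if the default strategy didn't manage to unroll any loop,
 * a fallback whose only difference is disabling loop unrolling would
 * produce identical code, so it is pointless to try it (see the per-case
 * checks below).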
We should also consider if the * strategy changes other aspects of the compilation process though, like * spilling, and not skip it in that case. */ static bool @@ -1649,31 +1871,55 @@ skip_compile_strategy(struct v3d_compile *c, uint32_t idx) assert(idx > 0); /* Don't skip a strategy that changes spilling behavior */ - if (strategies[idx].tmu_spilling_allowed != - strategies[idx - 1].tmu_spilling_allowed) { + if (strategies[idx].max_tmu_spills != + strategies[idx - 1].max_tmu_spills) { return false; } switch (idx) { - /* Loop unrolling: skip if we didn't unroll any loops */ + /* General TMU sched.: skip if we didn't emit any TMU loads */ case 1: - case 5: + case 7: + return !c->has_general_tmu_load; + /* Global code motion: skip if nir_opt_gcm didn't make any progress */ + case 2: + case 8: + return !c->gcm_progress; + /* Loop unrolling: skip if we didn't unroll any loops */ + case 3: + case 9: return !c->unrolled_any_loops; /* UBO load sorting: skip if we didn't sort any loads */ - case 2: - case 6: + case 4: return !c->sorted_any_ubo_loads; + /* Move buffer loads: we assume any shader with difficult RA + * most likely has UBO / SSBO loads so we never try to skip. + * For now, we only try this for 2-thread compiles since it + * is expected to impact instruction counts and latency. + */ + case 10: + assert(c->threads < 4); + return false; /* TMU pipelining: skip if we didn't pipeline any TMU ops */ - case 3: - case 7: + case 5: + case 11: return !c->pipelined_any_tmu; /* Lower thread count: skip if we already tried less than 4 threads */ - case 4: + case 6: return c->threads < 4; default: return false; }; } + +static inline void +set_best_compile(struct v3d_compile **best, struct v3d_compile *c) +{ + if (*best) + vir_compile_destroy(*best); + *best = c; +} + uint64_t *v3d_compile(const struct v3d_compiler *compiler, struct v3d_key *key, struct v3d_prog_data **out_prog_data, @@ -1685,58 +1931,106 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler, uint32_t *final_assembly_size) { struct v3d_compile *c = NULL; - for (int i = 0; i < ARRAY_SIZE(strategies); i++) { + + uint32_t best_spill_fill_count = UINT32_MAX; + struct v3d_compile *best_c = NULL; + for (int32_t strat = 0; strat < ARRAY_SIZE(strategies); strat++) { /* Fallback strategy */ - if (i > 0) { + if (strat > 0) { assert(c); - if (skip_compile_strategy(c, i)) + if (skip_compile_strategy(c, strat)) continue; char *debug_msg; int ret = asprintf(&debug_msg, - "Falling back to strategy '%s' for %s", - strategies[i].name, - vir_get_stage_name(c)); + "Falling back to strategy '%s' " + "for %s prog %d/%d", + strategies[strat].name, + vir_get_stage_name(c), + c->program_id, c->variant_id); if (ret >= 0) { - if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF)) + if (V3D_DBG(PERF)) fprintf(stderr, "%s\n", debug_msg); c->debug_output(debug_msg, c->debug_output_data); free(debug_msg); } - vir_compile_destroy(c); + if (c != best_c) + vir_compile_destroy(c); } c = vir_compile_init(compiler, key, s, debug_output, debug_output_data, program_id, variant_id, - strategies[i].max_threads, - strategies[i].min_threads, - strategies[i].tmu_spilling_allowed, - strategies[i].disable_loop_unrolling, - strategies[i].disable_ubo_load_sorting, - strategies[i].disable_tmu_pipelining, - i == ARRAY_SIZE(strategies) - 1); + strat, &strategies[strat], + strat == ARRAY_SIZE(strategies) - 1); v3d_attempt_compile(c); - if (i >= ARRAY_SIZE(strategies) - 1 || - c->compilation_result != - 
V3D_COMPILATION_FAILED_REGISTER_ALLOCATION) { + /* Broken shader or driver bug */ + if (c->compilation_result == V3D_COMPILATION_FAILED) break; + + /* If we compiled without spills, choose this. + * Otherwise if this is a 4-thread compile, choose this (these + * have a very low cap on the allowed TMU spills so we assume + * it will be better than a 2-thread compile without spills). + * Otherwise, keep going while tracking the strategy with the + * lowest spill count. + */ + if (c->compilation_result == V3D_COMPILATION_SUCCEEDED) { + if (c->spills == 0 || + strategies[strat].min_threads == 4 || + V3D_DBG(OPT_COMPILE_TIME)) { + set_best_compile(&best_c, c); + break; + } else if (c->spills + c->fills < + best_spill_fill_count) { + set_best_compile(&best_c, c); + best_spill_fill_count = c->spills + c->fills; + } + + if (V3D_DBG(PERF)) { + char *debug_msg; + int ret = asprintf(&debug_msg, + "Compiled %s prog %d/%d with %d " + "spills and %d fills. Will try " + "more strategies.", + vir_get_stage_name(c), + c->program_id, c->variant_id, + c->spills, c->fills); + if (ret >= 0) { + fprintf(stderr, "%s\n", debug_msg); + c->debug_output(debug_msg, c->debug_output_data); + free(debug_msg); + } + } } + + /* Only try next strategy if we failed to register allocate + * or we had to spill. + */ + assert(c->compilation_result == + V3D_COMPILATION_FAILED_REGISTER_ALLOCATION || + c->spills > 0); } - if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF) && + /* If the best strategy was not the last, choose that */ + if (best_c && c != best_c) + set_best_compile(&c, best_c); + + if (V3D_DBG(PERF) && c->compilation_result != V3D_COMPILATION_FAILED_REGISTER_ALLOCATION && c->spills > 0) { char *debug_msg; int ret = asprintf(&debug_msg, - "Compiled %s with %d spills and %d fills", + "Compiled %s prog %d/%d with %d " + "spills and %d fills", vir_get_stage_name(c), + c->program_id, c->variant_id, c->spills, c->fills); fprintf(stderr, "%s\n", debug_msg); @@ -1747,8 +2041,12 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler, } if (c->compilation_result != V3D_COMPILATION_SUCCEEDED) { - fprintf(stderr, "Failed to compile %s with any strategy.\n", - vir_get_stage_name(c)); + fprintf(stderr, "Failed to compile %s prog %d/%d " + "with any strategy.\n", + vir_get_stage_name(c), c->program_id, c->variant_id); + + vir_compile_destroy(c); + return NULL; } struct v3d_prog_data *prog_data; @@ -1762,8 +2060,8 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler, char *shaderdb; int ret = v3d_shaderdb_dump(c, &shaderdb); if (ret >= 0) { - if (V3D_DEBUG & V3D_DEBUG_SHADERDB) - fprintf(stderr, "SHADER-DB: %s\n", shaderdb); + if (V3D_DBG(SHADERDB)) + fprintf(stderr, "SHADER-DB-%s - %s\n", s->info.name, shaderdb); c->debug_output(shaderdb, c->debug_output_data); free(shaderdb); @@ -1872,8 +2170,11 @@ try_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif) struct qinst *prev_inst = NULL; assert(c->cur_block); -#ifdef DEBUG - /* Check if the current instruction is part of the current block */ +#if MESA_DEBUG + /* We can only reuse a uniform if it was emitted in the same block, + * so callers must make sure the current instruction is being emitted + * in the current block. 
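+ *
+ * For illustration (VIR sketch, temp names invented): given
+ *
+ *    ldunif.idx t5
+ *    ...
+ *    ldunif.idx t9   <- about to be emitted
+ *
+ * the second load of the same uniform index can be dropped and t5
+ * reused, provided t5 is a temp written in this block and nothing
+ * overwrites it in between (the checks below).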
+ */ bool found = false; vir_for_each_inst(inst, c->cur_block) { if (&inst->link == c->cursor.link) { @@ -1882,7 +2183,7 @@ try_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif) } } - assert(found || list_is_empty(&c->cur_block->instructions)); + assert(found || &c->cur_block->instructions == c->cursor.link); #endif list_for_each_entry_from_rev(struct qinst, inst, c->cursor.link->prev, @@ -1900,6 +2201,12 @@ try_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif) if (!prev_inst) return false; + /* Only reuse the ldunif result if it was written to a temp register, + * otherwise there may be special restrictions (for example, ldunif + * may write directly to unifa, which is a write-only register). + */ + if (prev_inst->dst.file != QFILE_TEMP) + return false; list_for_each_entry_from(struct qinst, inst, prev_inst->link.next, &c->cur_block->instructions, link) { diff --git a/src/broadcom/compiler/vir_dump.c b/src/broadcom/compiler/vir_dump.c index 5c47bbdc1b0..631eeee52ab 100644 --- a/src/broadcom/compiler/vir_dump.c +++ b/src/broadcom/compiler/vir_dump.c @@ -182,11 +182,6 @@ vir_print_reg(struct v3d_compile *c, const struct qinst *inst, break; } - case QFILE_VPM: - fprintf(stderr, "vpm%d.%d", - reg.index / 4, reg.index % 4); - break; - case QFILE_TEMP: fprintf(stderr, "t%d", reg.index); break; @@ -197,9 +192,6 @@ static void vir_dump_sig_addr(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *instr) { - if (devinfo->ver < 41) - return; - if (!instr->sig_magic) fprintf(stderr, ".rf%d", instr->sig_addr); else { @@ -270,8 +262,8 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst) vir_print_reg(c, inst, inst->dst); fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.add.output_pack)); - unpack[0] = instr->alu.add.a_unpack; - unpack[1] = instr->alu.add.b_unpack; + unpack[0] = instr->alu.add.a.unpack; + unpack[1] = instr->alu.add.b.unpack; } else { fprintf(stderr, "%s", v3d_qpu_mul_op_name(instr->alu.mul.op)); fprintf(stderr, "%s", v3d_qpu_cond_name(instr->flags.mc)); @@ -282,8 +274,8 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst) vir_print_reg(c, inst, inst->dst); fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.mul.output_pack)); - unpack[0] = instr->alu.mul.a_unpack; - unpack[1] = instr->alu.mul.b_unpack; + unpack[0] = instr->alu.mul.a.unpack; + unpack[1] = instr->alu.mul.b.unpack; } for (int i = 0; i < nsrc; i++) { diff --git a/src/broadcom/compiler/vir_live_variables.c b/src/broadcom/compiler/vir_live_variables.c index 2fd6430a0f4..d1f44aa9cf7 100644 --- a/src/broadcom/compiler/vir_live_variables.c +++ b/src/broadcom/compiler/vir_live_variables.c @@ -179,17 +179,22 @@ vir_setup_def_use(struct v3d_compile *c) flags_inst = NULL; } - /* Payload registers: r0/1/2 contain W, centroid W, - * and Z at program start. Register allocation will - * force their nodes to R0/1/2. + /* Payload registers: for fragment shaders, W, + * centroid W, and Z will be initialized in r0/1/2 + * until v42, or r1/r2/r3 since v71. + * + * For compute shaders, payload is in r0/r2 up to v42, + * r2/r3 since v71. + * + * Register allocation will force their nodes to those + * registers. */ if (inst->src[0].file == QFILE_REG) { - switch (inst->src[0].index) { - case 0: - case 1: - case 2: + uint32_t min_payload_r = c->devinfo->ver >= 71 ? 1 : 0; + uint32_t max_payload_r = c->devinfo->ver >= 71 ? 
3 : 2;
+ if (inst->src[0].index >= min_payload_r &&
+ inst->src[0].index <= max_payload_r) {
 c->temp_start[inst->dst.index] = 0;
- break;
 }
 }
@@ -306,6 +311,8 @@ vir_calculate_live_intervals(struct v3d_compile *c)
 vir_for_each_block(block, c) {
 ralloc_free(block->def);
+ ralloc_free(block->defin);
+ ralloc_free(block->defout);
 ralloc_free(block->use);
 ralloc_free(block->live_in);
 ralloc_free(block->live_out);
diff --git a/src/broadcom/compiler/vir_opt_constant_alu.c b/src/broadcom/compiler/vir_opt_constant_alu.c
index 483646f882e..dc4c8a65026 100644
--- a/src/broadcom/compiler/vir_opt_constant_alu.c
+++ b/src/broadcom/compiler/vir_opt_constant_alu.c
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2021 Raspberry Pi
+ * Copyright © 2021 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -155,6 +155,7 @@ vir_opt_constant_alu(struct v3d_compile *c) {
 bool progress = false;
 vir_for_each_block(block, c) {
+ c->cur_block = block;
 vir_for_each_inst_safe(inst, block) {
 progress = try_opt_constant_alu(c, inst) || progress;
 }
diff --git a/src/broadcom/compiler/vir_opt_copy_propagate.c b/src/broadcom/compiler/vir_opt_copy_propagate.c
index c5bb6112173..611c4693ed3 100644
--- a/src/broadcom/compiler/vir_opt_copy_propagate.c
+++ b/src/broadcom/compiler/vir_opt_copy_propagate.c
@@ -35,7 +35,7 @@
 #include "v3d_compiler.h"
 static bool
-is_copy_mov(struct qinst *inst)
+is_copy_mov(const struct v3d_device_info *devinfo, struct qinst *inst)
 {
 if (!inst)
 return false;
@@ -62,36 +62,65 @@ is_copy_mov(struct qinst *inst)
 return false;
 }
- switch (inst->src[0].file) {
- case QFILE_MAGIC:
- /* No copy propagating from R3/R4/R5 -- the MOVs from those
- * are there to register allocate values produced into R3/4/5
- * to other regs (though hopefully r3/4/5).
- */
- switch (inst->src[0].index) {
- case V3D_QPU_WADDR_R3:
- case V3D_QPU_WADDR_R4:
- case V3D_QPU_WADDR_R5:
- return false;
+ if (devinfo->ver == 42) {
+ switch (inst->src[0].file) {
+ case QFILE_MAGIC:
+ /* No copy propagating from R3/R4/R5 -- the MOVs from
+ * those are there to register allocate values produced
+ * into R3/4/5 to other regs (though hopefully r3/4/5).
+ */
+ switch (inst->src[0].index) {
+ case V3D_QPU_WADDR_R3:
+ case V3D_QPU_WADDR_R4:
+ case V3D_QPU_WADDR_R5:
+ return false;
+ default:
+ break;
+ }
+ break;
+
+ case QFILE_REG:
+ switch (inst->src[0].index) {
+ case 0:
+ case 1:
+ case 2:
+ /* MOVs from rf0/1/2 are only to track the live
+ * intervals for W/centroid W/Z.
+ */
+ return false;
+ }
+ break;
+
 default:
 break;
 }
- break;
-
- case QFILE_REG:
- switch (inst->src[0].index) {
- case 0:
- case 1:
- case 2:
- /* MOVs from rf0/1/2 are only to track the live
+ } else {
+ assert(devinfo->ver >= 71);
+ switch (inst->src[0].file) {
+ case QFILE_REG:
+ switch (inst->src[0].index) {
+ /* MOVs from rf1/2/3 are only to track the live
 * intervals for W/centroid W/Z.
+ *
+ * Note: rf0 can be implicitly written by ldvary
+ * (no temp involved), so it is not an SSA value and
+ * could clash with writes to other temps that are
+ * also allocated to rf0. In theory, that would mean
+ * that we can't copy propagate from it, but we handle
+ * this at register allocation time, preventing temps
+ * from being allocated to rf0 while the rf0 value from
+ * ldvary is still live.
*/ - return false; - } - break; + case 1: + case 2: + case 3: + return false; + } + break; - default: - break; + default: + break; + } } return true; @@ -104,14 +133,14 @@ vir_has_unpack(struct qinst *inst, int chan) if (vir_is_add(inst)) { if (chan == 0) - return inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE; + return inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE; else - return inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE; + return inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE; } else { if (chan == 0) - return inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE; + return inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE; else - return inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE; + return inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE; } } @@ -135,7 +164,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs) */ struct qinst *mov = movs[inst->src[i].index]; if (!mov) { - if (!is_copy_mov(c->defs[inst->src[i].index])) + if (!is_copy_mov(c->devinfo, c->defs[inst->src[i].index])) continue; mov = c->defs[inst->src[i].index]; @@ -161,7 +190,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs) continue; /* these ops can't represent abs. */ - if (mov->qpu.alu.mul.a_unpack == V3D_QPU_UNPACK_ABS) { + if (mov->qpu.alu.mul.a.unpack == V3D_QPU_UNPACK_ABS) { switch (inst->qpu.alu.add.op) { case V3D_QPU_A_VFPACK: case V3D_QPU_A_FROUND: @@ -189,7 +218,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs) inst->src[i] = mov->src[0]; if (vir_has_unpack(mov, 0)) { - enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a_unpack; + enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a.unpack; vir_set_unpack(inst, i, unpack); } @@ -238,12 +267,14 @@ vir_opt_copy_propagate(struct v3d_compile *c) */ memset(movs, 0, sizeof(struct qinst *) * c->num_temps); + c->cur_block = block; vir_for_each_inst(inst, block) { + progress = try_copy_prop(c, inst, movs) || progress; apply_kills(c, movs, inst); - if (is_copy_mov(inst)) + if (is_copy_mov(c->devinfo, inst)) movs[inst->dst.index] = inst; } } diff --git a/src/broadcom/compiler/vir_opt_dead_code.c b/src/broadcom/compiler/vir_opt_dead_code.c index 64c762c88db..fd1af944427 100644 --- a/src/broadcom/compiler/vir_opt_dead_code.c +++ b/src/broadcom/compiler/vir_opt_dead_code.c @@ -52,21 +52,10 @@ dce(struct v3d_compile *c, struct qinst *inst) } static bool -has_nonremovable_reads(struct v3d_compile *c, struct qinst *inst) -{ - for (int i = 0; i < vir_get_nsrc(inst); i++) { - if (inst->src[i].file == QFILE_VPM) - return true; - } - - return false; -} - -static bool can_write_to_null(struct v3d_compile *c, struct qinst *inst) { /* The SFU instructions must write to a physical register. 
*/ - if (c->devinfo->ver >= 41 && v3d_qpu_uses_sfu(&inst->qpu)) + if (v3d_qpu_uses_sfu(&inst->qpu)) return false; return true; @@ -149,30 +138,25 @@ check_first_ldunifa(struct v3d_compile *c, } static bool -increment_unifa_address(struct v3d_compile *c, struct qblock *block, struct qinst *unifa) +increment_unifa_address(struct v3d_compile *c, struct qinst *unifa) { - struct qblock *current_block = c->cur_block; if (unifa->qpu.type == V3D_QPU_INSTR_TYPE_ALU && unifa->qpu.alu.mul.op == V3D_QPU_M_MOV) { c->cursor = vir_after_inst(unifa); - c->cur_block = block; struct qreg unifa_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA); vir_ADD_dest(c, unifa_reg, unifa->src[0], vir_uniform_ui(c, 4u)); vir_remove_instruction(c, unifa); - c->cur_block = current_block; return true; } if (unifa->qpu.type == V3D_QPU_INSTR_TYPE_ALU && unifa->qpu.alu.add.op == V3D_QPU_A_ADD) { c->cursor = vir_after_inst(unifa); - c->cur_block = block; struct qreg unifa_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA); struct qreg tmp = vir_ADD(c, unifa->src[1], vir_uniform_ui(c, 4u)); vir_ADD_dest(c, unifa_reg, unifa->src[0], tmp); vir_remove_instruction(c, unifa); - c->cur_block = current_block; return true; } @@ -200,7 +184,7 @@ vir_opt_dead_code(struct v3d_compile *c) vir_for_each_block(block, c) { struct qinst *last_flags_write = NULL; - + c->cur_block = block; vir_for_each_inst_safe(inst, block) { /* If this instruction reads the flags, we can't * remove the flags generation for it. @@ -246,7 +230,6 @@ vir_opt_dead_code(struct v3d_compile *c) } if (v3d_qpu_writes_flags(&inst->qpu) || - has_nonremovable_reads(c, inst) || (is_ldunifa && !is_first_ldunifa && !is_last_ldunifa)) { /* If we can't remove the instruction, but we * don't need its destination value, just @@ -276,7 +259,7 @@ vir_opt_dead_code(struct v3d_compile *c) */ if (is_first_ldunifa) { assert(unifa); - if (!increment_unifa_address(c, block, unifa)) + if (!increment_unifa_address(c, unifa)) continue; } diff --git a/src/broadcom/compiler/vir_opt_redundant_flags.c b/src/broadcom/compiler/vir_opt_redundant_flags.c index 4609ef9c361..6b61ed6a39a 100644 --- a/src/broadcom/compiler/vir_opt_redundant_flags.c +++ b/src/broadcom/compiler/vir_opt_redundant_flags.c @@ -81,11 +81,11 @@ vir_instr_flags_op_equal(struct qinst *a, struct qinst *b) a->qpu.flags.mpf != b->qpu.flags.mpf || a->qpu.alu.add.op != b->qpu.alu.add.op || a->qpu.alu.mul.op != b->qpu.alu.mul.op || - a->qpu.alu.add.a_unpack != b->qpu.alu.add.a_unpack || - a->qpu.alu.add.b_unpack != b->qpu.alu.add.b_unpack || + a->qpu.alu.add.a.unpack != b->qpu.alu.add.a.unpack || + a->qpu.alu.add.b.unpack != b->qpu.alu.add.b.unpack || a->qpu.alu.add.output_pack != b->qpu.alu.add.output_pack || - a->qpu.alu.mul.a_unpack != b->qpu.alu.mul.a_unpack || - a->qpu.alu.mul.b_unpack != b->qpu.alu.mul.b_unpack || + a->qpu.alu.mul.a.unpack != b->qpu.alu.mul.a.unpack || + a->qpu.alu.mul.b.unpack != b->qpu.alu.mul.b.unpack || a->qpu.alu.mul.output_pack != b->qpu.alu.mul.output_pack) { return false; } @@ -99,6 +99,7 @@ vir_opt_redundant_flags_block(struct v3d_compile *c, struct qblock *block) struct qinst *last_flags = NULL; bool progress = false; + c->cur_block = block; vir_for_each_inst(inst, block) { if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU || inst->qpu.flags.auf != V3D_QPU_UF_NONE || diff --git a/src/broadcom/compiler/vir_opt_small_immediates.c b/src/broadcom/compiler/vir_opt_small_immediates.c index 47d7722968d..56f0bf20706 100644 --- a/src/broadcom/compiler/vir_opt_small_immediates.c +++ 
b/src/broadcom/compiler/vir_opt_small_immediates.c @@ -44,7 +44,9 @@ vir_opt_small_immediates(struct v3d_compile *c) /* The small immediate value sits in the raddr B field, so we * can't have 2 small immediates in one instruction (unless * they're the same value, but that should be optimized away - * elsewhere). + * elsewhere). Since 7.x we can encode small immediates in + * any raddr field, but each instruction can still only use + * one. */ bool uses_small_imm = false; for (int i = 0; i < vir_get_nsrc(inst); i++) { @@ -80,7 +82,22 @@ vir_opt_small_immediates(struct v3d_compile *c) */ struct v3d_qpu_sig new_sig = inst->qpu.sig; uint32_t sig_packed; - new_sig.small_imm = true; + if (c->devinfo->ver == 42) { + new_sig.small_imm_b = true; + } else { + if (vir_is_add(inst)) { + if (i == 0) + new_sig.small_imm_a = true; + else + new_sig.small_imm_b = true; + } else { + if (i == 0) + new_sig.small_imm_c = true; + else + new_sig.small_imm_d = true; + } + } + if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig_packed)) continue; @@ -89,7 +106,10 @@ vir_opt_small_immediates(struct v3d_compile *c) vir_dump_inst(c, inst); fprintf(stderr, "\n"); } - inst->qpu.sig.small_imm = true; + inst->qpu.sig.small_imm_a = new_sig.small_imm_a; + inst->qpu.sig.small_imm_b = new_sig.small_imm_b; + inst->qpu.sig.small_imm_c = new_sig.small_imm_c; + inst->qpu.sig.small_imm_d = new_sig.small_imm_d; inst->qpu.raddr_b = packed; inst->src[i].file = QFILE_SMALL_IMM; diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c index 08698b4ece1..53e84840899 100644 --- a/src/broadcom/compiler/vir_register_allocate.c +++ b/src/broadcom/compiler/vir_register_allocate.c @@ -26,12 +26,100 @@ #include "common/v3d_device_info.h" #include "v3d_compiler.h" -#define QPU_R(i) { .magic = false, .index = i } - #define ACC_INDEX 0 #define ACC_COUNT 6 -#define PHYS_INDEX (ACC_INDEX + ACC_COUNT) -#define PHYS_COUNT 64 + +/* RA nodes used to track RF registers with implicit writes */ +#define IMPLICIT_RF_COUNT 1 + +#define PHYS_COUNT 64 + +static uint8_t +get_phys_index(const struct v3d_device_info *devinfo) +{ + if (devinfo->has_accumulators) + return ACC_INDEX + ACC_COUNT; + else + return 0; +} + +/* ACC as accumulator */ +#define CLASS_BITS_PHYS (1 << 0) +#define CLASS_BITS_ACC (1 << 1) +#define CLASS_BITS_R5 (1 << 4) + +static uint8_t +get_class_bit_any(const struct v3d_device_info *devinfo) +{ + if (devinfo->has_accumulators) + return (CLASS_BITS_PHYS | CLASS_BITS_ACC | CLASS_BITS_R5); + else + return CLASS_BITS_PHYS; +} + +static uint8_t +filter_class_bits(const struct v3d_device_info *devinfo, uint8_t class_bits) +{ + if (!devinfo->has_accumulators) { + assert(class_bits & CLASS_BITS_PHYS); + class_bits = CLASS_BITS_PHYS; + } + return class_bits; +} + +static inline uint32_t +temp_to_node(struct v3d_compile *c, uint32_t temp) +{ + return temp + (c->devinfo->has_accumulators ? ACC_COUNT : + IMPLICIT_RF_COUNT); +} + +static inline uint32_t +node_to_temp(struct v3d_compile *c, uint32_t node) +{ + assert((c->devinfo->has_accumulators && node >= ACC_COUNT) || + (!c->devinfo->has_accumulators && node >= IMPLICIT_RF_COUNT)); + return node - (c->devinfo->has_accumulators ? 
ACC_COUNT : + IMPLICIT_RF_COUNT); +} + +static inline uint8_t +get_temp_class_bits(struct v3d_compile *c, + uint32_t temp) +{ + return c->nodes.info[temp_to_node(c, temp)].class_bits; +} + +static inline void +set_temp_class_bits(struct v3d_compile *c, + uint32_t temp, uint8_t class_bits) +{ + c->nodes.info[temp_to_node(c, temp)].class_bits = class_bits; +} + +static struct ra_class * +choose_reg_class(struct v3d_compile *c, uint8_t class_bits) +{ + if (class_bits == CLASS_BITS_PHYS) { + return c->compiler->reg_class_phys[c->thread_index]; + } else if (class_bits == (CLASS_BITS_R5)) { + assert(c->devinfo->has_accumulators); + return c->compiler->reg_class_r5[c->thread_index]; + } else if (class_bits == (CLASS_BITS_PHYS | CLASS_BITS_ACC)) { + assert(c->devinfo->has_accumulators); + return c->compiler->reg_class_phys_or_acc[c->thread_index]; + } else { + assert(class_bits == get_class_bit_any(c->devinfo)); + return c->compiler->reg_class_any[c->thread_index]; + } +} + +static inline struct ra_class * +choose_reg_class_for_temp(struct v3d_compile *c, uint32_t temp) +{ + assert(temp < c->num_temps && temp < c->nodes.alloc_count); + return choose_reg_class(c, get_temp_class_bits(c, temp)); +} static inline bool qinst_writes_tmu(const struct v3d_device_info *devinfo, @@ -46,23 +134,22 @@ static bool is_end_of_tmu_sequence(const struct v3d_device_info *devinfo, struct qinst *inst, struct qblock *block) { - if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && - inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) { - return true; - } - - if (!inst->qpu.sig.ldtmu) + /* Only tmuwt and ldtmu can finish TMU sequences */ + bool is_tmuwt = inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && + inst->qpu.alu.add.op == V3D_QPU_A_TMUWT; + bool is_ldtmu = inst->qpu.sig.ldtmu; + if (!is_tmuwt && !is_ldtmu) return false; + /* Check if this is the last tmuwt or ldtmu in the sequence */ list_for_each_entry_from(struct qinst, scan_inst, inst->link.next, &block->instructions, link) { - if (scan_inst->qpu.sig.ldtmu) - return false; + is_tmuwt = scan_inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && + scan_inst->qpu.alu.add.op == V3D_QPU_A_TMUWT; + is_ldtmu = scan_inst->qpu.sig.ldtmu; - if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && - inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) { - return true; - } + if (is_tmuwt || is_ldtmu) + return false; if (qinst_writes_tmu(devinfo, scan_inst)) return true; @@ -79,11 +166,101 @@ vir_is_mov_uniform(struct v3d_compile *c, int temp) return def && def->qpu.sig.ldunif; } +static bool +can_reconstruct_inst(struct qinst *inst) +{ + assert(inst); + + if (vir_is_add(inst)) { + switch (inst->qpu.alu.add.op) { + case V3D_QPU_A_FXCD: + case V3D_QPU_A_FYCD: + case V3D_QPU_A_XCD: + case V3D_QPU_A_YCD: + case V3D_QPU_A_IID: + case V3D_QPU_A_EIDX: + case V3D_QPU_A_TIDX: + case V3D_QPU_A_SAMPID: + /* No need to check input unpacks because none of these + * opcodes read sources. FXCD,FYCD have pack variants. 
+ */ + return inst->qpu.flags.ac == V3D_QPU_COND_NONE && + inst->qpu.flags.auf == V3D_QPU_UF_NONE && + inst->qpu.flags.apf == V3D_QPU_PF_NONE && + inst->qpu.alu.add.output_pack == V3D_QPU_PACK_NONE; + default: + return false; + } + } + + return false; +} + +static bool +can_reconstruct_temp(struct v3d_compile *c, int temp) +{ + struct qinst *def = c->defs[temp]; + return def && can_reconstruct_inst(def); +} + +static struct qreg +reconstruct_temp(struct v3d_compile *c, enum v3d_qpu_add_op op) +{ + struct qreg dest; + switch (op) { + case V3D_QPU_A_FXCD: + dest = vir_FXCD(c); + break; + case V3D_QPU_A_FYCD: + dest = vir_FYCD(c); + break; + case V3D_QPU_A_XCD: + dest = vir_XCD(c); + break; + case V3D_QPU_A_YCD: + dest = vir_YCD(c); + break; + case V3D_QPU_A_IID: + dest = vir_IID(c); + break; + case V3D_QPU_A_EIDX: + dest = vir_EIDX(c); + break; + case V3D_QPU_A_TIDX: + dest = vir_TIDX(c); + break; + case V3D_QPU_A_SAMPID: + dest = vir_SAMPID(c); + break; + default: + unreachable("Unexpected opcode for reconstruction"); + } + + return dest; +} + +enum temp_spill_type { + SPILL_TYPE_UNIFORM, + SPILL_TYPE_RECONSTRUCT, + SPILL_TYPE_TMU +}; + +static enum temp_spill_type +get_spill_type_for_temp(struct v3d_compile *c, int temp) +{ + if (vir_is_mov_uniform(c, temp)) + return SPILL_TYPE_UNIFORM; + + if (can_reconstruct_temp(c, temp)) + return SPILL_TYPE_RECONSTRUCT; + + return SPILL_TYPE_TMU; +} + static int -v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, - uint32_t *temp_to_node) +v3d_choose_spill_node(struct v3d_compile *c) { - const float tmu_scale = 5; + const float tmu_scale = 10; float block_scale = 1.0; float spill_costs[c->num_temps]; bool in_tmu_operation = false; @@ -99,7 +276,8 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, * starting output writes. */ bool no_spilling = - c->threads > 1 && started_last_seg; + (c->threads > 1 && started_last_seg) || + (c->max_tmu_spills == 0); /* Discourage spilling of TMU operations */ for (int i = 0; i < vir_get_nsrc(inst); i++) { @@ -107,7 +285,10 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, continue; int temp = inst->src[i].index; - if (vir_is_mov_uniform(c, temp)) { + enum temp_spill_type spill_type = + get_spill_type_for_temp(c, temp); + + if (spill_type != SPILL_TYPE_TMU) { spill_costs[temp] += block_scale; } else if (!no_spilling) { float tmu_op_scale = in_tmu_operation ? @@ -122,11 +303,11 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, if (inst->dst.file == QFILE_TEMP) { int temp = inst->dst.index; + enum temp_spill_type spill_type = + get_spill_type_for_temp(c, temp); - if (vir_is_mov_uniform(c, temp)) { - /* We just rematerialize the unform - * later. - */ + if (spill_type != SPILL_TYPE_TMU) { + /* We just rematerialize it later */ } else if (!no_spilling) { spill_costs[temp] += (block_scale * tmu_scale); @@ -147,10 +328,6 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, if (inst->is_last_thrsw) started_last_seg = true; - if (v3d_qpu_writes_vpm(&inst->qpu) || - v3d_qpu_uses_tlb(&inst->qpu)) - started_last_seg = true; - /* Track when we're in between a TMU setup and the * final LDTMU or TMUWT from that TMU setup. We * penalize spills during that time. @@ -163,12 +340,53 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, } } + /* We always emit a "last thrsw" to ensure all our spilling occurs + * before the last thread section. See vir_emit_last_thrsw. 
+ */
+ assert(started_last_seg);
+
 for (unsigned i = 0; i < c->num_temps; i++) {
- if (BITSET_TEST(c->spillable, i))
- ra_set_node_spill_cost(g, temp_to_node[i], spill_costs[i]);
+ if (BITSET_TEST(c->spillable, i)) {
+ ra_set_node_spill_cost(c->g, temp_to_node(c, i),
+ spill_costs[i]);
+ }
 }
- return ra_get_best_spill_node(g);
+ return ra_get_best_spill_node(c->g);
+}
+
+static void
+ensure_nodes(struct v3d_compile *c)
+{
+ if (c->num_temps < c->nodes.alloc_count)
+ return;
+
+ c->nodes.alloc_count *= 2;
+ c->nodes.info = reralloc_array_size(c,
+ c->nodes.info,
+ sizeof(c->nodes.info[0]),
+ c->nodes.alloc_count +
+ MAX2(ACC_COUNT, IMPLICIT_RF_COUNT));
+}
+
+/* Creates the interference node for a new temp. We use this to keep the node
+ * list updated during the spilling process, which generates new temps/nodes.
+ */
+static void
+add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits)
+{
+ ensure_nodes(c);
+
+ int node = ra_add_node(c->g, choose_reg_class(c, class_bits));
+ assert(c->devinfo->has_accumulators ? node == temp + ACC_COUNT :
+ node == temp + IMPLICIT_RF_COUNT);
+
+ /* We fill the node priority after we are done inserting spills */
+ c->nodes.info[node].class_bits = class_bits;
+ c->nodes.info[node].priority = 0;
+ c->nodes.info[node].is_ldunif_dst = false;
+ c->nodes.info[node].is_program_end = false;
+ c->nodes.info[node].unused = false;
}
 /* The spill offset for this thread takes a bit of setup, so do it once at
@@ -206,79 +424,224 @@ v3d_setup_spill_base(struct v3d_compile *c)
 vir_uniform(c, QUNIFORM_SPILL_OFFSET, 0));
 /* Make sure that we don't spill the spilling setup instructions. */
- for (int i = start_num_temps; i < c->num_temps; i++)
+ for (int i = start_num_temps; i < c->num_temps; i++) {
 BITSET_CLEAR(c->spillable, i);
+ /* If we are spilling, update the RA map with the temps added
+ * by the spill setup. Our spill_base register can never be an
+ * accumulator because it is used for TMU spill/fill and thus
+ * needs to persist across thread switches.
+ */
+ if (c->spilling) {
+ int temp_class = CLASS_BITS_PHYS;
+ if (c->devinfo->has_accumulators &&
+ i != c->spill_base.index) {
+ temp_class |= CLASS_BITS_ACC;
+ }
+ add_node(c, i, temp_class);
+ }
+ }
+
 /* Restore the current block. */
 c->cur_block = current_block;
 c->cursor = vir_after_block(c->cur_block);
}
-static struct qinst *
-v3d_emit_spill_tmua(struct v3d_compile *c, uint32_t spill_offset)
+/**
+ * Computes the address for a spill/fill sequence and completes the spill/fill
+ * sequence by emitting the following code:
+ *
+ * ldunif.spill_offset
+ * add tmua spill_base spill_offset
+ * thrsw
+ *
+ * If the sequence is for a spill, then it will emit a tmuwt after the thrsw,
+ * otherwise it will emit an ldtmu to load the fill result into 'fill_dst'.
+ *
+ * The parameter 'ip' represents the ip at which the spill/fill is happening.
+ * This is used to disallow accumulators on temps that cross this ip boundary
+ * due to the new thrsw introduced in the sequence above.
+ */
+static void
+v3d_emit_spill_tmua(struct v3d_compile *c,
+ uint32_t spill_offset,
+ enum v3d_qpu_cond cond,
+ int32_t ip,
+ struct qreg *fill_dst)
 {
- return vir_ADD_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA),
- c->spill_base, vir_uniform_ui(c, spill_offset));
-}
+ assert(ip >= 0);
+
+ /* Load a uniform with the spill offset and add it to the spill base
+ * to obtain the TMUA address. It can be of class ANY because we know
+ * we are consuming it immediately without thrsw in between.
+ */ + assert(c->disable_ldunif_opt); + struct qreg offset = vir_uniform_ui(c, spill_offset); + add_node(c, offset.index, get_class_bit_any(c->devinfo)); + /* We always enable per-quad on spills/fills to ensure we spill + * any channels involved with helper invocations. + */ + struct qreg tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU); + struct qinst *inst = vir_ADD_dest(c, tmua, c->spill_base, offset); + inst->qpu.flags.ac = cond; + inst->ldtmu_count = 1; + inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, + 0xffffff7f); /* per-quad */ + + vir_emit_thrsw(c); + + /* If this is for a spill, emit a TMUWT otherwise a LDTMU to load the + * result of the fill. The TMUWT temp is not really read, the ldtmu + * temp will be used immediately so just like the uniform above we + * can allow accumulators. + */ + int temp_class = + filter_class_bits(c->devinfo, CLASS_BITS_PHYS | CLASS_BITS_ACC); + if (!fill_dst) { + struct qreg dst = vir_TMUWT(c); + assert(dst.file == QFILE_TEMP); + add_node(c, dst.index, temp_class); + } else { + *fill_dst = vir_LDTMU(c); + assert(fill_dst->file == QFILE_TEMP); + add_node(c, fill_dst->index, temp_class); + } + + /* Temps across the thread switch we injected can't be assigned to + * accumulators. + * + * Fills inject code before ip, so anything that starts at ip or later + * is not affected by the thrsw. Something that ends at ip will be + * affected though. + * + * Spills inject code after ip, so anything that starts strictly later + * than ip is not affected (the temp starting at ip is usually the + * spilled temp except for postponed spills). Something that ends at ip + * won't be affected either. + */ + for (int i = 0; i < c->spill_start_num_temps; i++) { + bool thrsw_cross = fill_dst ? + c->temp_start[i] < ip && c->temp_end[i] >= ip : + c->temp_start[i] <= ip && c->temp_end[i] > ip; + if (thrsw_cross) { + ra_set_node_class(c->g, temp_to_node(c, i), + choose_reg_class(c, CLASS_BITS_PHYS)); + } + } +} static void -v3d_emit_tmu_spill(struct v3d_compile *c, struct qinst *inst, - struct qinst *position, uint32_t spill_offset) +v3d_emit_tmu_spill(struct v3d_compile *c, + struct qinst *inst, + struct qreg spill_temp, + struct qinst *position, + uint32_t ip, + uint32_t spill_offset) { assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU); + assert(inst->dst.file == QFILE_TEMP); c->cursor = vir_after_inst(position); - inst->dst = vir_get_temp(c); + enum v3d_qpu_cond cond = vir_get_cond(inst); + + /* If inst and position don't match, this is a postponed spill, + * in which case we have already allocated the temp for the spill + * and we should use that, otherwise create a new temp with the + * same register class bits as the original. + */ + if (inst == position) { + uint8_t class_bits = get_temp_class_bits(c, inst->dst.index); + inst->dst = vir_get_temp(c); + add_node(c, inst->dst.index, class_bits); + } else { + inst->dst = spill_temp; + + /* If this is a postponed spill the register being spilled may + * have been written more than once including conditional + * writes, so ignore predication on the spill instruction and + * always spill the full register. 
+ */ + cond = V3D_QPU_COND_NONE; + } + struct qinst *tmp = vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD), inst->dst); tmp->qpu.flags.mc = cond; - tmp = v3d_emit_spill_tmua(c, spill_offset); - tmp->qpu.flags.ac = cond; - vir_emit_thrsw(c); - vir_TMUWT(c); + + v3d_emit_spill_tmua(c, spill_offset, cond, ip, NULL); + c->spills++; c->tmu_dirty_rcl = true; } +static inline bool +interferes(int32_t t0_start, int32_t t0_end, int32_t t1_start, int32_t t1_end) +{ + return !(t0_start >= t1_end || t1_start >= t0_end); +} + static void -v3d_spill_reg(struct v3d_compile *c, int spill_temp) +v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int *implicit_rf_nodes, + int spill_temp) { - c->spill_count++; + c->spill_start_num_temps = c->num_temps; + c->spilling = true; - bool is_uniform = vir_is_mov_uniform(c, spill_temp); + enum temp_spill_type spill_type = get_spill_type_for_temp(c, spill_temp); uint32_t spill_offset = 0; - - if (!is_uniform) { + if (spill_type == SPILL_TYPE_TMU) { spill_offset = c->spill_size; c->spill_size += V3D_CHANNELS * sizeof(uint32_t); - if (spill_offset == 0) + if (spill_offset == 0) { v3d_setup_spill_base(c); + + /* Don't allocate our spill base to rf0 to avoid + * conflicts with instructions doing implicit writes + * to that register. + */ + if (!c->devinfo->has_accumulators) { + ra_add_node_interference( + c->g, + temp_to_node(c, c->spill_base.index), + implicit_rf_nodes[0]); + } + } } struct qinst *last_thrsw = c->last_thrsw; assert(last_thrsw && last_thrsw->is_last_thrsw); - int start_num_temps = c->num_temps; - int uniform_index = ~0; - if (is_uniform) { + if (spill_type == SPILL_TYPE_UNIFORM) { struct qinst *orig_unif = c->defs[spill_temp]; uniform_index = orig_unif->uniform; } + enum v3d_qpu_add_op reconstruct_op = V3D_QPU_A_NOP; + if (spill_type == SPILL_TYPE_RECONSTRUCT) { + struct qinst *orig_def = c->defs[spill_temp]; + assert(vir_is_add(orig_def)); + reconstruct_op = orig_def->qpu.alu.add.op; + } + + uint32_t spill_node = temp_to_node(c, spill_temp); + /* We must disable the ldunif optimization if we are spilling uniforms */ bool had_disable_ldunif_opt = c->disable_ldunif_opt; c->disable_ldunif_opt = true; struct qinst *start_of_tmu_sequence = NULL; struct qinst *postponed_spill = NULL; + struct qreg postponed_spill_temp = { 0 }; vir_for_each_block(block, c) { vir_for_each_inst_safe(inst, block) { + int32_t ip = inst->ip; + /* Track when we're in between a TMU setup and the final * LDTMU or TMUWT from that TMU setup. We can't spill/fill any * temps during that time, because that involves inserting a @@ -289,7 +652,8 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp) if (is_end_of_tmu_sequence(c->devinfo, inst, block)) { if (postponed_spill) { v3d_emit_tmu_spill(c, postponed_spill, - inst, spill_offset); + postponed_spill_temp, + inst, ip, spill_offset); } start_of_tmu_sequence = NULL; @@ -302,49 +666,103 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp) } /* fills */ + int filled_src = -1; for (int i = 0; i < vir_get_nsrc(inst); i++) { if (inst->src[i].file != QFILE_TEMP || inst->src[i].index != spill_temp) { continue; } + if (filled_src >= 0) { + inst->src[i] = inst->src[filled_src]; + continue; + } + c->cursor = vir_before_inst(inst); - if (is_uniform) { + if (spill_type == SPILL_TYPE_UNIFORM) { struct qreg unif = vir_uniform(c, c->uniform_contents[uniform_index], c->uniform_data[uniform_index]); inst->src[i] = unif; + /* We are using the uniform in the + * instruction immediately after, so + * we can use any register class for it. 
+ */
+ add_node(c, unif.index,
+ get_class_bit_any(c->devinfo));
+ } else if (spill_type == SPILL_TYPE_RECONSTRUCT) {
+ struct qreg temp =
+ reconstruct_temp(c, reconstruct_op);
+ inst->src[i] = temp;
+ /* We are using the temp in the
+ * instruction immediately after so we
+ * can use ACC.
+ */
+ int temp_class =
+ filter_class_bits(c->devinfo, CLASS_BITS_PHYS |
+ CLASS_BITS_ACC);
+ add_node(c, temp.index, temp_class);
 } else {
- /* If we have a postponed spill, we don't need
- * a fill as the temp would not have been
- * spilled yet.
+ /* If we have a postponed spill, we
+ * don't need a fill as the temp would
+ * not have been spilled yet, however,
+ * we need to update the temp index.
 */
- if (postponed_spill)
- continue;
- if (start_of_tmu_sequence)
- c->cursor = vir_before_inst(start_of_tmu_sequence);
-
- v3d_emit_spill_tmua(c, spill_offset);
- vir_emit_thrsw(c);
- inst->src[i] = vir_LDTMU(c);
- c->fills++;
+ if (postponed_spill) {
+ inst->src[i] =
+ postponed_spill_temp;
+ } else {
+ int32_t fill_ip = ip;
+ if (start_of_tmu_sequence) {
+ c->cursor = vir_before_inst(start_of_tmu_sequence);
+ fill_ip = start_of_tmu_sequence->ip;
+ }
+
+ v3d_emit_spill_tmua(c, spill_offset,
+ V3D_QPU_COND_NONE,
+ fill_ip, &inst->src[i]);
+ c->fills++;
+ }
 }
+
+ filled_src = i;
 }
 /* spills */
 if (inst->dst.file == QFILE_TEMP &&
 inst->dst.index == spill_temp) {
- if (is_uniform) {
+ if (spill_type != SPILL_TYPE_TMU) {
 c->cursor.link = NULL;
 vir_remove_instruction(c, inst);
 } else {
- if (start_of_tmu_sequence)
+ /* If we are in the middle of a TMU
+ * sequence, we postpone the actual
+ * spill until we have finished it. We
+ * still need to replace the spill temp
+ * with a new temp though.
+ */
+ if (start_of_tmu_sequence) {
+ if (postponed_spill) {
+ postponed_spill->dst =
+ postponed_spill_temp;
+ }
+ if (!postponed_spill ||
+ vir_get_cond(inst) == V3D_QPU_COND_NONE) {
+ postponed_spill_temp =
+ vir_get_temp(c);
+ add_node(c,
+ postponed_spill_temp.index,
+ c->nodes.info[spill_node].class_bits);
+ }
 postponed_spill = inst;
- else
- v3d_emit_tmu_spill(c, inst, inst,
+ } else {
+ v3d_emit_tmu_spill(c, inst,
+ postponed_spill_temp,
+ inst, ip,
 spill_offset);
+ }
 }
 }
 }
@@ -358,21 +776,64 @@
 /* Don't allow spilling of our spilling instructions. There's no way
 * they can help get things colored.
 */
- for (int i = start_num_temps; i < c->num_temps; i++)
+ for (int i = c->spill_start_num_temps; i < c->num_temps; i++)
 BITSET_CLEAR(c->spillable, i);
+ /* Reset interference for spilled node */
+ ra_set_node_spill_cost(c->g, spill_node, 0);
+ ra_reset_node_interference(c->g, spill_node);
+ BITSET_CLEAR(c->spillable, spill_temp);
+
+ /* Rebuild program ips */
+ int32_t ip = 0;
+ vir_for_each_inst_inorder(inst, c)
+ inst->ip = ip++;
+
+ /* Rebuild liveness */
+ vir_calculate_live_intervals(c);
+
+ /* Add interferences for the new spilled temps and update interferences
+ * for c->spill_base (since we may have modified its liveness). Also,
+ * update node priorities based on new liveness data.
+ */
+ uint32_t sb_temp = c->spill_base.index;
+ uint32_t sb_node = temp_to_node(c, sb_temp);
+ for (uint32_t i = 0; i < c->num_temps; i++) {
+ if (c->temp_end[i] == -1)
+ continue;
+
+ uint32_t node_i = temp_to_node(c, i);
+ c->nodes.info[node_i].priority =
+ c->temp_end[i] - c->temp_start[i];
+
+ for (uint32_t j = MAX2(i + 1, c->spill_start_num_temps);
+ j < c->num_temps; j++) {
+ if (interferes(c->temp_start[i], c->temp_end[i],
+ c->temp_start[j], c->temp_end[j])) {
+ uint32_t node_j = temp_to_node(c, j);
+ ra_add_node_interference(c->g, node_i, node_j);
+ }
+ }
+
+ if (spill_type == SPILL_TYPE_TMU) {
+ if (i != sb_temp &&
+ interferes(c->temp_start[i], c->temp_end[i],
+ c->temp_start[sb_temp], c->temp_end[sb_temp])) {
+ ra_add_node_interference(c->g, node_i, sb_node);
+ }
+ }
+ }
+
 c->disable_ldunif_opt = had_disable_ldunif_opt;
+ c->spilling = false;
}
-struct node_to_temp_map {
- uint32_t temp;
- uint32_t priority;
-};
-
 struct v3d_ra_select_callback_data {
+ uint32_t phys_index;
 uint32_t next_acc;
 uint32_t next_phys;
- struct node_to_temp_map *map;
+ struct v3d_ra_node_info *nodes;
+ const struct v3d_device_info *devinfo;
};
 /* Choosing accumulators improves chances of merging QPU instructions
@@ -384,6 +845,9 @@ v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra,
 BITSET_WORD *regs,
 int priority)
 {
+ if (!v3d_ra->devinfo->has_accumulators)
+ return false;
+
 /* Favor accumulators if we have less than this number of physical
 * registers. Accumulators have more restrictions (like being
 * invalidated through thrsw), so running out of physical registers
 * even if we have accumulators available can be problematic.
 */
 static const int available_rf_threshold = 5;
 int available_rf = 0;
 for (int i = 0; i < PHYS_COUNT; i++) {
- if (BITSET_TEST(regs, PHYS_INDEX + i))
+ if (BITSET_TEST(regs, v3d_ra->phys_index + i))
 available_rf++;
 if (available_rf >= available_rf_threshold)
 break;
@@ -419,6 +883,19 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra,
 BITSET_WORD *regs,
 unsigned int *out)
 {
+ if (!v3d_ra->devinfo->has_accumulators)
+ return false;
+
+ /* Choose r5 for our ldunifs if possible (nobody else can load to that
+ * reg, and it keeps the QPU cond field free from being occupied by
+ * ldunifrf).
+ */
+ int r5 = ACC_INDEX + 5;
+ if (BITSET_TEST(regs, r5)) {
+ *out = r5;
+ return true;
+ }
+
 /* Round-robin through our accumulators to give post-RA instruction
 * selection more options.
 */
@@ -438,12 +915,47 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra,
 static bool
 v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
+ unsigned int node,
 BITSET_WORD *regs,
 unsigned int *out)
 {
+ /* If this node is for an unused temp, ignore. */
+ if (v3d_ra->nodes->info[node].unused) {
+ *out = 0;
+ return true;
+ }
+
+ /* In V3D 7.x, try to assign rf0 to temps used as ldunif's dst
+ * so we can avoid turning them into ldunifrf (which uses the
+ * cond field to encode the dst and would prevent merge with
+ * instructions that use cond flags).
+ */
+ if (v3d_ra->nodes->info[node].is_ldunif_dst &&
+ BITSET_TEST(regs, v3d_ra->phys_index)) {
+ assert(v3d_ra->devinfo->ver >= 71);
+ *out = v3d_ra->phys_index;
+ return true;
+ }
+
+ /* The last 3 instructions in a shader can't use some specific registers
+ * (usually early rf registers, depends on v3d version) so try to
+ * avoid allocating these to registers used by the last instructions
+ * in the shader.
+ */
+ const uint32_t safe_rf_start = v3d_ra->devinfo->ver == 42 ?
3 : 4; + if (v3d_ra->nodes->info[node].is_program_end && + v3d_ra->next_phys < safe_rf_start) { + v3d_ra->next_phys = safe_rf_start; + } + for (int i = 0; i < PHYS_COUNT; i++) { int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT; - int phys = PHYS_INDEX + phys_off; + + /* Try to keep rf0 available for ldunif in 7.x (see above). */ + if (v3d_ra->devinfo->ver >= 71 && phys_off == 0) + continue; + + int phys = v3d_ra->phys_index + phys_off; if (BITSET_TEST(regs, phys)) { v3d_ra->next_phys = phys_off + 1; @@ -452,6 +964,14 @@ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra, } } + /* If we couldn't allocate, do try to assign rf0 if it is available. */ + if (v3d_ra->devinfo->ver >= 71 && + BITSET_TEST(regs, v3d_ra->phys_index)) { + v3d_ra->next_phys = 1; + *out = v3d_ra->phys_index; + return true; + } + return false; } @@ -459,22 +979,14 @@ static unsigned int v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data) { struct v3d_ra_select_callback_data *v3d_ra = data; - int r5 = ACC_INDEX + 5; - - /* Choose r5 for our ldunifs if possible (nobody else can load to that - * reg, and it keeps the QPU cond field free from being occupied by - * ldunifrf). - */ - if (BITSET_TEST(regs, r5)) - return r5; unsigned int reg; - if (v3d_ra_favor_accum(v3d_ra, regs, v3d_ra->map[n].priority) && + if (v3d_ra_favor_accum(v3d_ra, regs, v3d_ra->nodes->info[n].priority) && v3d_ra_select_accum(v3d_ra, regs, ®)) { return reg; } - if (v3d_ra_select_rf(v3d_ra, regs, ®)) + if (v3d_ra_select_rf(v3d_ra, n, regs, ®)) return reg; /* If we ran out of physical registers try to assign an accumulator @@ -492,9 +1004,10 @@ vir_init_reg_sets(struct v3d_compiler *compiler) /* Allocate up to 3 regfile classes, for the ways the physical * register file can be divided up for fragment shader threading. */ - int max_thread_index = (compiler->devinfo->ver >= 40 ? 
2 : 3); + int max_thread_index = 2; + uint8_t phys_index = get_phys_index(compiler->devinfo); - compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT, + compiler->regs = ra_alloc_reg_set(compiler, phys_index + PHYS_COUNT, false); if (!compiler->regs) return false; @@ -502,31 +1015,38 @@ vir_init_reg_sets(struct v3d_compiler *compiler) for (int threads = 0; threads < max_thread_index; threads++) { compiler->reg_class_any[threads] = ra_alloc_contig_reg_class(compiler->regs, 1); - compiler->reg_class_r5[threads] = - ra_alloc_contig_reg_class(compiler->regs, 1); - compiler->reg_class_phys_or_acc[threads] = - ra_alloc_contig_reg_class(compiler->regs, 1); + if (compiler->devinfo->has_accumulators) { + compiler->reg_class_r5[threads] = + ra_alloc_contig_reg_class(compiler->regs, 1); + compiler->reg_class_phys_or_acc[threads] = + ra_alloc_contig_reg_class(compiler->regs, 1); + } compiler->reg_class_phys[threads] = ra_alloc_contig_reg_class(compiler->regs, 1); - for (int i = PHYS_INDEX; - i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) { - ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); + /* Init physical regs */ + for (int i = phys_index; + i < phys_index + (PHYS_COUNT >> threads); i++) { + if (compiler->devinfo->has_accumulators) + ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); ra_class_add_reg(compiler->reg_class_phys[threads], i); ra_class_add_reg(compiler->reg_class_any[threads], i); } - for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) { - ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); - ra_class_add_reg(compiler->reg_class_any[threads], i); + /* Init accumulator regs */ + if (compiler->devinfo->has_accumulators) { + for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) { + ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); + ra_class_add_reg(compiler->reg_class_any[threads], i); + } + /* r5 can only store a single 32-bit value, so not much can + * use it. + */ + ra_class_add_reg(compiler->reg_class_r5[threads], + ACC_INDEX + 5); + ra_class_add_reg(compiler->reg_class_any[threads], + ACC_INDEX + 5); } - /* r5 can only store a single 32-bit value, so not much can - * use it. - */ - ra_class_add_reg(compiler->reg_class_r5[threads], - ACC_INDEX + 5); - ra_class_add_reg(compiler->reg_class_any[threads], - ACC_INDEX + 5); } ra_set_finalize(compiler->regs, NULL); @@ -534,52 +1054,220 @@ vir_init_reg_sets(struct v3d_compiler *compiler) return true; } -static int -node_to_temp_priority(const void *in_a, const void *in_b) +static inline bool +tmu_spilling_allowed(struct v3d_compile *c) { - const struct node_to_temp_map *a = in_a; - const struct node_to_temp_map *b = in_b; - - return a->priority - b->priority; + return c->spills + c->fills < c->max_tmu_spills; } -/** - * Computes the number of registers to spill in a batch after a register - * allocation failure. - */ -static uint32_t -get_spill_batch_size(struct v3d_compile *c) -{ - /* Allow up to 10 spills in batches of 1 in any case to avoid any chance of - * over-spilling if the program requires few spills to compile. - */ - if (c->spill_count < 10) - return 1; - - /* If we have to spill more than that we assume performance is not going to - * be great and we shift focus to batching spills to cut down compile - * time at the expense of over-spilling. - */ - return 20; -} - -/* Don't emit spills using the TMU until we've dropped thread count first. 
We, - * may also disable spilling when certain optimizations that are known to - * increase register pressure are active so we favor recompiling with - * optimizations disabled instead of spilling. - */ -static inline bool -tmu_spilling_allowed(struct v3d_compile *c, int thread_index) +static void +update_graph_and_reg_classes_for_inst(struct v3d_compile *c, + int *acc_nodes, + int *implicit_rf_nodes, + int last_ldvary_ip, + struct qinst *inst) { - return thread_index == 0 && c->tmu_spilling_allowed; + int32_t ip = inst->ip; + assert(ip >= 0); + + /* If the instruction writes r4 (and optionally moves its + * result to a temp), nothing else can be stored in r4 across + * it. + */ + if (vir_writes_r4_implicitly(c->devinfo, inst)) { + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && c->temp_end[i] > ip) { + ra_add_node_interference(c->g, + temp_to_node(c, i), + acc_nodes[4]); + } + } + } + + /* If any instruction writes to a physical register implicitly + * nothing else can write the same register across it. + */ + if (v3d_qpu_writes_rf0_implicitly(c->devinfo, &inst->qpu)) { + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && c->temp_end[i] > ip) { + ra_add_node_interference(c->g, + temp_to_node(c, i), + implicit_rf_nodes[0]); + } + } + } + + if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) { + switch (inst->qpu.alu.add.op) { + case V3D_QPU_A_LDVPMV_IN: + case V3D_QPU_A_LDVPMV_OUT: + case V3D_QPU_A_LDVPMD_IN: + case V3D_QPU_A_LDVPMD_OUT: + case V3D_QPU_A_LDVPMP: + case V3D_QPU_A_LDVPMG_IN: + case V3D_QPU_A_LDVPMG_OUT: { + /* LDVPMs only store to temps (the MA flag + * decides whether the LDVPM is in or out) + */ + assert(inst->dst.file == QFILE_TEMP); + set_temp_class_bits(c, inst->dst.index, + CLASS_BITS_PHYS); + break; + } + + case V3D_QPU_A_RECIP: + case V3D_QPU_A_RSQRT: + case V3D_QPU_A_EXP: + case V3D_QPU_A_LOG: + case V3D_QPU_A_SIN: + case V3D_QPU_A_RSQRT2: { + /* The SFU instructions write directly to the + * phys regfile. + */ + assert(inst->dst.file == QFILE_TEMP); + set_temp_class_bits(c, inst->dst.index, + CLASS_BITS_PHYS); + break; + } + + default: + break; + } + } + + if (inst->src[0].file == QFILE_REG) { + switch (inst->src[0].index) { + case 0: + /* V3D 7.x doesn't use rf0 for thread payload */ + if (c->devinfo->ver >= 71) + break; + else + FALLTHROUGH; + case 1: + case 2: + case 3: { + /* Payload setup instructions: Force allocate + * the dst to the given register (so the MOV + * will disappear). + */ + assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV); + assert(inst->dst.file == QFILE_TEMP); + uint32_t node = temp_to_node(c, inst->dst.index); + ra_set_node_reg(c->g, node, + get_phys_index(c->devinfo) + + inst->src[0].index); + break; + } + } + } + + /* Don't allocate rf0 to temps that cross ranges where we have + * live implicit rf0 writes from ldvary. We can identify these + * by tracking the last ldvary instruction and explicit reads + * of rf0. + */ + if (c->devinfo->ver >= 71 && + ((inst->src[0].file == QFILE_REG && inst->src[0].index == 0) || + (vir_get_nsrc(inst) > 1 && + inst->src[1].file == QFILE_REG && inst->src[1].index == 0))) { + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && + c->temp_end[i] > last_ldvary_ip) { + ra_add_node_interference(c->g, + temp_to_node(c, i), + implicit_rf_nodes[0]); + } + } + } + + if (inst->dst.file == QFILE_TEMP) { + /* Only a ldunif gets to write to R5, which only has a + * single 32-bit channel of storage. 
+ *
+ * NOTE: ldunifa is subject to the same, however, going by
+ * shader-db it is best to keep r5 exclusive to ldunif, probably
+ * because ldunif usually has a shorter lifespan, allowing for
+ * more accumulator reuse and QPU merges.
+ */
+ if (c->devinfo->has_accumulators) {
+ if (!inst->qpu.sig.ldunif) {
+ uint8_t class_bits =
+ get_temp_class_bits(c, inst->dst.index) &
+ ~CLASS_BITS_R5;
+ set_temp_class_bits(c, inst->dst.index,
+ class_bits);
+
+ }
+ } else {
+ /* Make sure we don't allocate the ldvary's
+ * destination to rf0, since it would clash
+ * with its implicit write to that register.
+ */
+ if (inst->qpu.sig.ldvary) {
+ ra_add_node_interference(c->g,
+ temp_to_node(c, inst->dst.index),
+ implicit_rf_nodes[0]);
+ }
+ /* Flag dst temps from ldunif(a) instructions
+ * so we can try to assign rf0 to them and avoid
+ * converting these to ldunif(a)rf.
+ */
+ if (inst->qpu.sig.ldunif || inst->qpu.sig.ldunifa) {
+ const uint32_t dst_n =
+ temp_to_node(c, inst->dst.index);
+ c->nodes.info[dst_n].is_ldunif_dst = true;
+ }
+ }
+ }
+
+ /* All accumulators are invalidated across a thread switch. */
+ if (inst->qpu.sig.thrsw && c->devinfo->has_accumulators) {
+ for (int i = 0; i < c->num_temps; i++) {
+ if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
+ set_temp_class_bits(c, i,
+ CLASS_BITS_PHYS);
+ }
+ }
+ }
}
-#define CLASS_BIT_PHYS (1 << 0)
-#define CLASS_BIT_ACC (1 << 1)
-#define CLASS_BIT_R5 (1 << 4)
-#define CLASS_BITS_ANY (CLASS_BIT_PHYS | \
- CLASS_BIT_ACC | \
- CLASS_BIT_R5)
+static void
+flag_program_end_nodes(struct v3d_compile *c)
+{
+ /* Only look for registers used in this many instructions */
+ uint32_t last_set_count = 6;
+
+ struct qblock *last_block = vir_exit_block(c);
+ list_for_each_entry_rev(struct qinst, inst, &last_block->instructions, link) {
+ if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
+ continue;
+
+ int num_src = v3d_qpu_add_op_num_src(inst->qpu.alu.add.op);
+ for (int i = 0; i < num_src; i++) {
+ if (inst->src[i].file == QFILE_TEMP) {
+ int node = temp_to_node(c, inst->src[i].index);
+ c->nodes.info[node].is_program_end = true;
+ }
+ }
+
+ num_src = v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op);
+ for (int i = 0; i < num_src; i++) {
+ if (inst->src[i].file == QFILE_TEMP) {
+ int node = temp_to_node(c, inst->src[i].index);
+ c->nodes.info[node].is_program_end = true;
+
+ }
+ }
+
+ if (inst->dst.file == QFILE_TEMP) {
+ int node = temp_to_node(c, inst->dst.index);
+ c->nodes.info[node].is_program_end = true;
+ }
+
+ if (--last_set_count == 0)
+ break;
+ }
+}
 /**
 * Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
 *
 * The return value should be freed by the caller.
*/ struct qpu_reg * -v3d_register_allocate(struct v3d_compile *c, bool *spilled) +v3d_register_allocate(struct v3d_compile *c) { - uint32_t UNUSED start_num_temps = c->num_temps; - struct node_to_temp_map map[c->num_temps]; - uint32_t temp_to_node[c->num_temps]; - uint8_t class_bits[c->num_temps]; int acc_nodes[ACC_COUNT]; + int implicit_rf_nodes[IMPLICIT_RF_COUNT]; + + unsigned num_ra_nodes = c->num_temps; + if (c->devinfo->has_accumulators) + num_ra_nodes += ARRAY_SIZE(acc_nodes); + else + num_ra_nodes += ARRAY_SIZE(implicit_rf_nodes); + + c->nodes = (struct v3d_ra_node_info) { + .alloc_count = c->num_temps, + .info = ralloc_array_size(c, sizeof(c->nodes.info[0]), + num_ra_nodes), + }; + + uint32_t phys_index = get_phys_index(c->devinfo); + struct v3d_ra_select_callback_data callback_data = { + .phys_index = phys_index, .next_acc = 0, /* Start at RF3, to try to keep the TLB writes from using - * RF0-2. + * RF0-2. Start at RF4 in 7.x to prevent TLB writes from + * using RF2-3. */ - .next_phys = 3, - .map = map, + .next_phys = c->devinfo->ver == 42 ? 3 : 4, + .nodes = &c->nodes, + .devinfo = c->devinfo, }; - *spilled = false; - vir_calculate_live_intervals(c); /* Convert 1, 2, 4 threads to 0, 1, 2 index. @@ -612,257 +1313,163 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled) * V3D 4.x has double the physical register space, so 64 physical regs * are available at both 1x and 2x threading, and 4x has 32. */ - int thread_index = ffs(c->threads) - 1; - if (c->devinfo->ver >= 40) { - if (thread_index >= 1) - thread_index--; - } + c->thread_index = ffs(c->threads) - 1; + if (c->thread_index >= 1) + c->thread_index--; - struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs, - c->num_temps + - ARRAY_SIZE(acc_nodes)); - ra_set_select_reg_callback(g, v3d_ra_select_callback, &callback_data); + c->g = ra_alloc_interference_graph(c->compiler->regs, num_ra_nodes); + ra_set_select_reg_callback(c->g, v3d_ra_select_callback, &callback_data); /* Make some fixed nodes for the accumulators, which we will need to * interfere with when ops have implied r3/r4 writes or for the thread * switches. We could represent these as classes for the nodes to * live in, but the classes take up a lot of memory to set up, so we - * don't want to make too many. + * don't want to make too many. We use the same mechanism on platforms + * without accumulators that can have implicit writes to phys regs. 
*/ - for (int i = 0; i < ARRAY_SIZE(acc_nodes); i++) { - acc_nodes[i] = c->num_temps + i; - ra_set_node_reg(g, acc_nodes[i], ACC_INDEX + i); - } - - for (uint32_t i = 0; i < c->num_temps; i++) { - map[i].temp = i; - map[i].priority = c->temp_end[i] - c->temp_start[i]; - } - qsort(map, c->num_temps, sizeof(map[0]), node_to_temp_priority); - for (uint32_t i = 0; i < c->num_temps; i++) { - temp_to_node[map[i].temp] = i; + for (uint32_t i = 0; i < num_ra_nodes; i++) { + c->nodes.info[i].is_ldunif_dst = false; + c->nodes.info[i].is_program_end = false; + c->nodes.info[i].unused = false; + c->nodes.info[i].priority = 0; + c->nodes.info[i].class_bits = 0; + if (c->devinfo->has_accumulators && i < ACC_COUNT) { + acc_nodes[i] = i; + ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i); + } else if (!c->devinfo->has_accumulators && + i < ARRAY_SIZE(implicit_rf_nodes)) { + implicit_rf_nodes[i] = i; + ra_set_node_reg(c->g, implicit_rf_nodes[i], phys_index + i); + } else { + uint32_t t = node_to_temp(c, i); + c->nodes.info[i].priority = + c->temp_end[t] - c->temp_start[t]; + c->nodes.info[i].class_bits = + get_class_bit_any(c->devinfo); + } } - /* Figure out our register classes and preallocated registers. We - * start with any temp being able to be in any file, then instructions - * incrementally remove bits that the temp definitely can't be in. + /* Walk the instructions adding register class restrictions and + * interferences. */ - memset(class_bits, CLASS_BITS_ANY, sizeof(class_bits)); - int ip = 0; + int last_ldvary_ip = -1; vir_for_each_inst_inorder(inst, c) { - /* If the instruction writes r3/r4 (and optionally moves its - * result to a temp), nothing else can be stored in r3/r4 across - * it. + inst->ip = ip++; + + /* ldunif(a) always write to a temporary, so we have + * liveness info available to decide if rf0 is + * available for them, however, ldvary is different: + * it always writes to rf0 directly so we don't have + * liveness information for its implicit rf0 write. + * + * That means the allocator may assign rf0 to a temp + * that is defined while an implicit rf0 write from + * ldvary is still live. We fix that by manually + * tracking rf0 live ranges from ldvary instructions. */ - if (vir_writes_r3(c->devinfo, inst)) { - for (int i = 0; i < c->num_temps; i++) { - if (c->temp_start[i] < ip && - c->temp_end[i] > ip) { - ra_add_node_interference(g, - temp_to_node[i], - acc_nodes[3]); - } - } - } - if (vir_writes_r4(c->devinfo, inst)) { - for (int i = 0; i < c->num_temps; i++) { - if (c->temp_start[i] < ip && - c->temp_end[i] > ip) { - ra_add_node_interference(g, - temp_to_node[i], - acc_nodes[4]); - } - } - } - - if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) { - switch (inst->qpu.alu.add.op) { - case V3D_QPU_A_LDVPMV_IN: - case V3D_QPU_A_LDVPMV_OUT: - case V3D_QPU_A_LDVPMD_IN: - case V3D_QPU_A_LDVPMD_OUT: - case V3D_QPU_A_LDVPMP: - case V3D_QPU_A_LDVPMG_IN: - case V3D_QPU_A_LDVPMG_OUT: - /* LDVPMs only store to temps (the MA flag - * decides whether the LDVPM is in or out) - */ - assert(inst->dst.file == QFILE_TEMP); - class_bits[inst->dst.index] &= CLASS_BIT_PHYS; - break; - - case V3D_QPU_A_RECIP: - case V3D_QPU_A_RSQRT: - case V3D_QPU_A_EXP: - case V3D_QPU_A_LOG: - case V3D_QPU_A_SIN: - case V3D_QPU_A_RSQRT2: - /* The SFU instructions write directly to the - * phys regfile. 
- */ - assert(inst->dst.file == QFILE_TEMP); - class_bits[inst->dst.index] &= CLASS_BIT_PHYS; - break; - - default: - break; - } - } + if (inst->qpu.sig.ldvary) + last_ldvary_ip = ip; - if (inst->src[0].file == QFILE_REG) { - switch (inst->src[0].index) { - case 0: - case 1: - case 2: - case 3: - /* Payload setup instructions: Force allocate - * the dst to the given register (so the MOV - * will disappear). - */ - assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV); - assert(inst->dst.file == QFILE_TEMP); - ra_set_node_reg(g, - temp_to_node[inst->dst.index], - PHYS_INDEX + - inst->src[0].index); - break; - } - } - - if (inst->dst.file == QFILE_TEMP) { - /* Only a ldunif gets to write to R5, which only has a - * single 32-bit channel of storage. - */ - if (!inst->qpu.sig.ldunif) { - class_bits[inst->dst.index] &= ~CLASS_BIT_R5; - } else { - /* Until V3D 4.x, we could only load a uniform - * to r5, so we'll need to spill if uniform - * loads interfere with each other. - */ - if (c->devinfo->ver < 40) { - class_bits[inst->dst.index] &= - CLASS_BIT_R5; - } - } - } - - if (inst->qpu.sig.thrsw) { - /* All accumulators are invalidated across a thread - * switch. - */ - for (int i = 0; i < c->num_temps; i++) { - if (c->temp_start[i] < ip && c->temp_end[i] > ip) - class_bits[i] &= CLASS_BIT_PHYS; - } - } - - ip++; + update_graph_and_reg_classes_for_inst(c, acc_nodes, + implicit_rf_nodes, + last_ldvary_ip, inst); } + /* Flag the nodes that are used in the last instructions of the program + * (there are some registers that cannot be used in the last 3 + * instructions). We only do this for fragment shaders, because the idea + * is that by avoiding this conflict we may be able to emit the last + * thread switch earlier in some cases, however, in non-fragment shaders + * this won't happen because the last instructions are always VPM stores + * with a small immediate, which conflicts with other signals, + * preventing us from ever moving the thrsw earlier. + */ + if (c->s->info.stage == MESA_SHADER_FRAGMENT) + flag_program_end_nodes(c); + + /* Set the register classes for all our temporaries in the graph */ for (uint32_t i = 0; i < c->num_temps; i++) { - if (class_bits[i] == CLASS_BIT_PHYS) { - ra_set_node_class(g, temp_to_node[i], - c->compiler->reg_class_phys[thread_index]); - } else if (class_bits[i] == (CLASS_BIT_R5)) { - ra_set_node_class(g, temp_to_node[i], - c->compiler->reg_class_r5[thread_index]); - } else if (class_bits[i] == (CLASS_BIT_PHYS | CLASS_BIT_ACC)) { - ra_set_node_class(g, temp_to_node[i], - c->compiler->reg_class_phys_or_acc[thread_index]); - } else { - assert(class_bits[i] == CLASS_BITS_ANY); - ra_set_node_class(g, temp_to_node[i], - c->compiler->reg_class_any[thread_index]); - } + ra_set_node_class(c->g, temp_to_node(c, i), + choose_reg_class_for_temp(c, i)); } + /* Add register interferences based on liveness data */ for (uint32_t i = 0; i < c->num_temps; i++) { + /* And while we are here, let's also flag nodes for + * unused temps. 
+ */ + if (c->temp_start[i] > c->temp_end[i]) + c->nodes.info[temp_to_node(c, i)].unused = true; + for (uint32_t j = i + 1; j < c->num_temps; j++) { - if (!(c->temp_start[i] >= c->temp_end[j] || - c->temp_start[j] >= c->temp_end[i])) { - ra_add_node_interference(g, - temp_to_node[i], - temp_to_node[j]); + if (interferes(c->temp_start[i], c->temp_end[i], + c->temp_start[j], c->temp_end[j])) { + ra_add_node_interference(c->g, + temp_to_node(c, i), + temp_to_node(c, j)); } } } - /* Debug code to force a bit of register spilling, for running across - * conformance tests to make sure that spilling works. + /* Debug option to force a bit of TMU spilling, for running + * across conformance tests to make sure that spilling works. */ - int force_register_spills = 0; - if (c->spill_size < - V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) { - int node = v3d_choose_spill_node(c, g, temp_to_node); - if (node != -1) { - v3d_spill_reg(c, map[node].temp); - ralloc_free(g); - *spilled = true; - return NULL; + const int force_register_spills = 0; + if (force_register_spills > 0) + c->max_tmu_spills = UINT32_MAX; + + struct qpu_reg *temp_registers = NULL; + while (true) { + if (c->spill_size < + V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) { + int node = v3d_choose_spill_node(c); + uint32_t temp = node_to_temp(c, node); + if (node != -1) { + v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp); + continue; + } } - } - - bool ok = ra_allocate(g); - if (!ok) { - const uint32_t spill_batch_size = get_spill_batch_size(c); - - for (uint32_t i = 0; i < spill_batch_size; i++) { - int node = v3d_choose_spill_node(c, g, temp_to_node); - if (node == -1) - break; - - /* TMU spills inject thrsw signals that invalidate - * accumulators, so we can't batch them. - */ - bool is_uniform = vir_is_mov_uniform(c, map[node].temp); - if (i > 0 && !is_uniform) - break; - if (is_uniform || tmu_spilling_allowed(c, thread_index)) { - v3d_spill_reg(c, map[node].temp); - - /* Ask the outer loop to call back in. */ - *spilled = true; + if (ra_allocate(c->g)) + break; - /* See comment above about batching TMU spills. - */ - if (!is_uniform) { - assert(i == 0); - break; - } - } else { - break; - } + /* Failed allocation, try to spill */ + int node = v3d_choose_spill_node(c); + if (node == -1) + goto spill_fail; + + uint32_t temp = node_to_temp(c, node); + enum temp_spill_type spill_type = + get_spill_type_for_temp(c, temp); + if (spill_type != SPILL_TYPE_TMU || tmu_spilling_allowed(c)) { + v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp); + if (c->spills + c->fills > c->max_tmu_spills) + goto spill_fail; + } else { + goto spill_fail; } - - ralloc_free(g); - return NULL; } - /* Ensure that we are not accessing temp_to_node out of bounds. We - * should never trigger this assertion because `c->num_temps` only - * grows when we spill, in which case we return early and don't get - * here. 
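/*
 * interferes() used in the loop above is the usual live-range overlap
 * test; it is exactly the negation of the disjointness condition in the
 * removed code:
 */
static inline bool
interferes(int a_start, int a_end, int b_start, int b_end)
{
        return !(a_start >= b_end || b_start >= a_end);
}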
- */ - assert(start_num_temps == c->num_temps); - struct qpu_reg *temp_registers = calloc(c->num_temps, - sizeof(*temp_registers)); - + /* Allocation was successful, build the 'temp -> reg' map */ + temp_registers = calloc(c->num_temps, sizeof(*temp_registers)); for (uint32_t i = 0; i < c->num_temps; i++) { - int ra_reg = ra_get_node_reg(g, temp_to_node[i]); - if (ra_reg < PHYS_INDEX) { + int ra_reg = ra_get_node_reg(c->g, temp_to_node(c, i)); + if (ra_reg < phys_index) { temp_registers[i].magic = true; temp_registers[i].index = (V3D_QPU_WADDR_R0 + ra_reg - ACC_INDEX); } else { temp_registers[i].magic = false; - temp_registers[i].index = ra_reg - PHYS_INDEX; + temp_registers[i].index = ra_reg - phys_index; } } - ralloc_free(g); - +spill_fail: + ralloc_free(c->nodes.info); + c->nodes.info = NULL; + c->nodes.alloc_count = 0; + ralloc_free(c->g); + c->g = NULL; return temp_registers; } diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c index aa33545420e..605c3e4c7d5 100644 --- a/src/broadcom/compiler/vir_to_qpu.c +++ b/src/broadcom/compiler/vir_to_qpu.c @@ -45,12 +45,6 @@ qpu_magic(enum v3d_qpu_waddr waddr) return reg; } -static inline struct qpu_reg -qpu_acc(int acc) -{ - return qpu_magic(V3D_QPU_WADDR_R0 + acc); -} - struct v3d_qpu_instr v3d_qpu_nop(void) { @@ -92,15 +86,32 @@ new_qpu_nop_before(struct qinst *inst) return q; } +static void +v3d71_set_src(struct v3d_qpu_instr *instr, uint8_t *raddr, struct qpu_reg src) +{ + /* If we have a small immediate move it from inst->raddr_b to the + * corresponding raddr. + */ + if (src.smimm) { + assert(instr->sig.small_imm_a || instr->sig.small_imm_b || + instr->sig.small_imm_c || instr->sig.small_imm_d); + *raddr = instr->raddr_b; + return; + } + + assert(!src.magic); + *raddr = src.index; +} + /** * Allocates the src register (accumulator or register file) into the RADDR * fields of the instruction. */ static void -set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) +v3d42_set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) { if (src.smimm) { - assert(instr->sig.small_imm); + assert(instr->sig.small_imm_b); *mux = V3D_QPU_MUX_B; return; } @@ -112,20 +123,20 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) return; } - if (instr->alu.add.a != V3D_QPU_MUX_A && - instr->alu.add.b != V3D_QPU_MUX_A && - instr->alu.mul.a != V3D_QPU_MUX_A && - instr->alu.mul.b != V3D_QPU_MUX_A) { + if (instr->alu.add.a.mux != V3D_QPU_MUX_A && + instr->alu.add.b.mux != V3D_QPU_MUX_A && + instr->alu.mul.a.mux != V3D_QPU_MUX_A && + instr->alu.mul.b.mux != V3D_QPU_MUX_A) { instr->raddr_a = src.index; *mux = V3D_QPU_MUX_A; } else { if (instr->raddr_a == src.index) { *mux = V3D_QPU_MUX_A; } else { - assert(!(instr->alu.add.a == V3D_QPU_MUX_B && - instr->alu.add.b == V3D_QPU_MUX_B && - instr->alu.mul.a == V3D_QPU_MUX_B && - instr->alu.mul.b == V3D_QPU_MUX_B) || + assert(!(instr->alu.add.a.mux == V3D_QPU_MUX_B && + instr->alu.add.b.mux == V3D_QPU_MUX_B && + instr->alu.mul.a.mux == V3D_QPU_MUX_B && + instr->alu.mul.b.mux == V3D_QPU_MUX_B) || src.index == instr->raddr_b); instr->raddr_b = src.index; @@ -134,33 +145,40 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) } } -static bool -is_no_op_mov(struct qinst *qinst) +/* + * The main purpose of the following wrapper is to make calling set_src + * cleaner. This is the reason it receives both mux and raddr pointers. Those + * will be filled or not based on the device version. 
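/*
 * Decoding an entry of the 'temp -> reg' map built above: 'magic'
 * entries hold an accumulator waddr (V3D_QPU_WADDR_R0 + n), while plain
 * entries hold a register file index. A hypothetical debug helper,
 * assuming <stdio.h> and qpu_instr.h are available:
 */
static void
sketch_dump_temp_register(uint32_t temp, struct qpu_reg reg)
{
        if (reg.magic)
                fprintf(stderr, "t%u -> r%d\n", temp,
                        reg.index - V3D_QPU_WADDR_R0);
        else
                fprintf(stderr, "t%u -> rf%d\n", temp, reg.index);
}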
+ */ +static void +set_src(struct v3d_qpu_instr *instr, + enum v3d_qpu_mux *mux, + uint8_t *raddr, + struct qpu_reg src, + const struct v3d_device_info *devinfo) { - static const struct v3d_qpu_sig no_sig = {0}; - - /* Make sure it's just a lone MOV. */ - if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU || - qinst->qpu.alu.mul.op != V3D_QPU_M_MOV || - qinst->qpu.alu.add.op != V3D_QPU_A_NOP || - memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) { - return false; - } + if (devinfo->ver < 71) + return v3d42_set_src(instr, mux, src); + else + return v3d71_set_src(instr, raddr, src); +} - /* Check if it's a MOV from a register to itself. */ +static bool +v3d42_mov_src_and_dst_equal(struct qinst *qinst) +{ enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr; if (qinst->qpu.alu.mul.magic_write) { if (waddr < V3D_QPU_WADDR_R0 || waddr > V3D_QPU_WADDR_R4) return false; - if (qinst->qpu.alu.mul.a != + if (qinst->qpu.alu.mul.a.mux != V3D_QPU_MUX_R0 + (waddr - V3D_QPU_WADDR_R0)) { return false; } } else { int raddr; - switch (qinst->qpu.alu.mul.a) { + switch (qinst->qpu.alu.mul.a.mux) { case V3D_QPU_MUX_A: raddr = qinst->qpu.raddr_a; break; @@ -174,10 +192,61 @@ is_no_op_mov(struct qinst *qinst) return false; } + return true; +} + +static bool +v3d71_mov_src_and_dst_equal(struct qinst *qinst) +{ + if (qinst->qpu.alu.mul.magic_write) + return false; + + enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr; + int raddr; + + raddr = qinst->qpu.alu.mul.a.raddr; + if (raddr != waddr) + return false; + + return true; +} + +static bool +mov_src_and_dst_equal(struct qinst *qinst, + const struct v3d_device_info *devinfo) +{ + if (devinfo->ver < 71) + return v3d42_mov_src_and_dst_equal(qinst); + else + return v3d71_mov_src_and_dst_equal(qinst); +} + + +static bool +is_no_op_mov(struct qinst *qinst, + const struct v3d_device_info *devinfo) +{ + static const struct v3d_qpu_sig no_sig = {0}; + + /* Make sure it's just a lone MOV. We only check for M_MOV. Although + * for V3D 7.x there is also A_MOV, we don't need to check for it as + * we always emit MOVs using M_MOV. We could use A_MOV later in the + * scheduler to improve performance. + */ + if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU || + qinst->qpu.alu.mul.op != V3D_QPU_M_MOV || + qinst->qpu.alu.add.op != V3D_QPU_A_NOP || + memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) { + return false; + } + + if (!mov_src_and_dst_equal(qinst, devinfo)) + return false; + /* No packing or flags updates, or we need to execute the * instruction. */ - if (qinst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE || + if (qinst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE || qinst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE || qinst->qpu.flags.mc != V3D_QPU_COND_NONE || qinst->qpu.flags.mpf != V3D_QPU_PF_NONE || @@ -193,8 +262,6 @@ v3d_generate_code_block(struct v3d_compile *c, struct qblock *block, struct qpu_reg *temp_registers) { - int last_vpm_read_index = -1; - vir_for_each_inst_safe(qinst, block) { #if 0 fprintf(stderr, "translating qinst to qpu: "); @@ -202,8 +269,6 @@ v3d_generate_code_block(struct v3d_compile *c, fprintf(stderr, "\n"); #endif - struct qinst *temp; - if (vir_has_uniform(qinst)) c->num_uniforms++; @@ -219,8 +284,14 @@ v3d_generate_code_block(struct v3d_compile *c, src[i] = qpu_magic(qinst->src[i].index); break; case QFILE_NULL: + /* QFILE_NULL is an undef, so we can load + * anything. We use a reg that doesn't have + * scheduling restrictions.
+ */ + src[i] = qpu_reg(5); + break; case QFILE_LOAD_IMM: - src[i] = qpu_acc(0); + assert(!"not reached"); break; case QFILE_TEMP: src[i] = temp_registers[index]; @@ -228,18 +299,6 @@ v3d_generate_code_block(struct v3d_compile *c, case QFILE_SMALL_IMM: src[i].smimm = true; break; - - case QFILE_VPM: - assert((int)qinst->src[i].index >= - last_vpm_read_index); - (void)last_vpm_read_index; - last_vpm_read_index = qinst->src[i].index; - - temp = new_qpu_nop_before(qinst); - temp->qpu.sig.ldvpm = true; - - src[i] = qpu_acc(3); - break; } } @@ -261,10 +320,6 @@ v3d_generate_code_block(struct v3d_compile *c, dst = temp_registers[qinst->dst.index]; break; - case QFILE_VPM: - dst = qpu_magic(V3D_QPU_WADDR_VPM); - break; - case QFILE_SMALL_IMM: case QFILE_LOAD_IMM: assert(!"not reached"); @@ -276,10 +331,15 @@ v3d_generate_code_block(struct v3d_compile *c, assert(qinst->qpu.alu.add.op == V3D_QPU_A_NOP); assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP); - if (!dst.magic || - dst.index != V3D_QPU_WADDR_R5) { - assert(c->devinfo->ver >= 40); + bool use_rf; + if (c->devinfo->has_accumulators) { + use_rf = !dst.magic || + dst.index != V3D_QPU_WADDR_R5; + } else { + use_rf = dst.magic || dst.index != 0; + } + if (use_rf) { if (qinst->qpu.sig.ldunif) { qinst->qpu.sig.ldunif = false; qinst->qpu.sig.ldunifrf = true; @@ -299,13 +359,18 @@ v3d_generate_code_block(struct v3d_compile *c, qinst->qpu.sig_magic = dst.magic; } else if (qinst->qpu.alu.add.op != V3D_QPU_A_NOP) { assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP); + if (nsrc >= 1) { set_src(&qinst->qpu, - &qinst->qpu.alu.add.a, src[0]); + &qinst->qpu.alu.add.a.mux, + &qinst->qpu.alu.add.a.raddr, + src[0], c->devinfo); } if (nsrc >= 2) { set_src(&qinst->qpu, - &qinst->qpu.alu.add.b, src[1]); + &qinst->qpu.alu.add.b.mux, + &qinst->qpu.alu.add.b.raddr, + src[1], c->devinfo); } qinst->qpu.alu.add.waddr = dst.index; @@ -313,17 +378,21 @@ v3d_generate_code_block(struct v3d_compile *c, } else { if (nsrc >= 1) { set_src(&qinst->qpu, - &qinst->qpu.alu.mul.a, src[0]); + &qinst->qpu.alu.mul.a.mux, + &qinst->qpu.alu.mul.a.raddr, + src[0], c->devinfo); } if (nsrc >= 2) { set_src(&qinst->qpu, - &qinst->qpu.alu.mul.b, src[1]); + &qinst->qpu.alu.mul.b.mux, + &qinst->qpu.alu.mul.b.raddr, + src[1], c->devinfo); } qinst->qpu.alu.mul.waddr = dst.index; qinst->qpu.alu.mul.magic_write = dst.magic; - if (is_no_op_mov(qinst)) { + if (is_no_op_mov(qinst, c->devinfo)) { vir_remove_instruction(c, qinst); continue; } @@ -378,11 +447,7 @@ v3d_dump_qpu(struct v3d_compile *c) const char *str = v3d_qpu_disasm(c->devinfo, c->qpu_insts[i]); fprintf(stderr, "0x%016"PRIx64" %s", c->qpu_insts[i], str); - /* We can only do this on 4.x, because we're not tracking TMU - * implicit uniforms here on 3.x. - */ - if (c->devinfo->ver >= 40 && - reads_uniform(c->devinfo, c->qpu_insts[i])) { + if (reads_uniform(c->devinfo, c->qpu_insts[i])) { fprintf(stderr, " ("); vir_dump_uniform(c->uniform_contents[next_uniform], c->uniform_data[next_uniform]); @@ -394,8 +459,7 @@ v3d_dump_qpu(struct v3d_compile *c) } /* Make sure our dumping lined up. 
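/*
 * The use_rf test above can be read as a predicate: the bare
 * ldunif/ldunifa signals implicitly write r5 on parts with accumulators
 * (V3D 4.x) and rf0 on V3D 7.x, so any other destination needs the
 * explicitly-addressed ldunifrf/ldunifarf variants. The helper name is
 * an assumption:
 */
static bool
sketch_ldunif_needs_rf_variant(const struct v3d_device_info *devinfo,
                               struct qpu_reg dst)
{
        if (devinfo->has_accumulators)
                return !dst.magic || dst.index != V3D_QPU_WADDR_R5;
        return dst.magic || dst.index != 0;
}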
*/ - if (c->devinfo->ver >= 40) - assert(next_uniform == c->num_uniforms); + assert(next_uniform == c->num_uniforms); fprintf(stderr, "\n"); } @@ -431,8 +495,8 @@ v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers) } assert(i == c->qpu_inst_count); - if (V3D_DEBUG & (V3D_DEBUG_QPU | - v3d_debug_flag_for_shader_stage(c->s->info.stage))) { + if (V3D_DBG(QPU) || + v3d_debug_flag_for_shader_stage(c->s->info.stage)) { v3d_dump_qpu(c); } diff --git a/src/broadcom/drm-shim/README.md b/src/broadcom/drm-shim/README.md index 16cbff75825..614cc8304bf 100644 --- a/src/broadcom/drm-shim/README.md +++ b/src/broadcom/drm-shim/README.md @@ -1,12 +1,3 @@ -### v3d backend - -This implements some of v3d using the closed source v3dv3 tree's -C/C++-based simulator. All execution is synchronous. - -Export: `MESA_LOADER_DRIVER_OVERRIDE=v3d -LD_PRELOAD=$prefix/lib/libv3d_drm_shim.so`. The v3dv3 version exposed -will depend on the v3dv3 build -- 3.3, 4.1, and 4.2 are supported. - ### v3d_noop backend This implements the minimum of v3d in order to make shader-db work. diff --git a/src/broadcom/drm-shim/meson.build b/src/broadcom/drm-shim/meson.build index b44b6c15d18..212c0287aa8 100644 --- a/src/broadcom/drm-shim/meson.build +++ b/src/broadcom/drm-shim/meson.build @@ -19,55 +19,19 @@ # SOFTWARE. libvc4_noop_drm_shim = shared_library( - ['vc4_noop_drm_shim'], + 'vc4_noop_drm_shim', 'vc4_noop.c', - include_directories: [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux], + include_directories: [inc_include, inc_src], dependencies: dep_drm_shim, gnu_symbol_visibility : 'hidden', install : true, ) libv3d_noop_drm_shim = shared_library( - ['v3d_noop_drm_shim'], + 'v3d_noop_drm_shim', 'v3d_noop.c', - include_directories: [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux], + include_directories: [inc_include, inc_src], dependencies: dep_drm_shim, gnu_symbol_visibility : 'hidden', install : true, ) - -dep_v3dv3 = dependency('v3dv3', required: false) -if dep_v3dv3.found() - v3dv3_c_args = '-DUSE_V3D_SIMULATOR' - - inc_gallium_v3d = include_directories('../../gallium/drivers/v3d') - - per_version_libs = [] - foreach ver : v3d_versions - per_version_libs += static_library( - 'libv3d_drm_shim-v' + ver, - [ - 'v3dx.c', - v3d_xml_pack - ], - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom, inc_gallium_v3d, inc_simulator], - c_args : [no_override_init_args, '-DV3D_VERSION=' + ver, v3dv3_c_args], - gnu_symbol_visibility : 'hidden', - dependencies: [dep_valgrind, dep_thread, dep_v3dv3], - ) - endforeach - - libv3d_drm_shim = shared_library( - ['v3d_drm_shim'], - [ - 'v3d.c', - '../simulator/v3d_simulator_wrapper.cpp', - ], - dependencies: [idep_mesautil, dep_dl, dep_drm_shim, dep_v3dv3], - link_with: per_version_libs, - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom, inc_gallium_v3d, inc_simulator], - c_args : [no_override_init_args, '-std=gnu99', v3dv3_c_args], - gnu_symbol_visibility : 'hidden', - cpp_args : [v3dv3_c_args] - ) -endif diff --git a/src/broadcom/drm-shim/v3d.c b/src/broadcom/drm-shim/v3d.c deleted file mode 100644 index f4d5bd31323..00000000000 --- a/src/broadcom/drm-shim/v3d.c +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright © 2018 Broadcom - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without 
restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - */ - -#include <stdio.h> -#include <sys/ioctl.h> -#include "drm-uapi/v3d_drm.h" -#include "drm-shim/drm_shim.h" -#include "v3d.h" -#include "v3d_simulator_wrapper.h" - -bool drm_shim_driver_prefers_first_render_node = false; - -static struct v3d_device_info devinfo; -struct v3d_shim_device v3d = { - .devinfo = &devinfo -}; - -struct v3d_bo *v3d_bo_lookup(struct shim_fd *shim_fd, int handle) -{ - return v3d_bo(drm_shim_bo_lookup(shim_fd, handle)); -} - -int -v3d_ioctl_wait_bo(int fd, unsigned long request, void *arg) -{ - /* No need to wait on anything yet, given that we submit - * synchronously. - */ - return 0; -} - -int -v3d_ioctl_mmap_bo(int fd, unsigned long request, void *arg) -{ - struct shim_fd *shim_fd = drm_shim_fd_lookup(fd); - struct drm_v3d_mmap_bo *map = arg; - struct shim_bo *bo = drm_shim_bo_lookup(shim_fd, map->handle); - - map->offset = drm_shim_bo_get_mmap_offset(shim_fd, bo); - - drm_shim_bo_put(bo); - - return 0; -} - -int -v3d_ioctl_get_bo_offset(int fd, unsigned long request, void *arg) -{ - struct shim_fd *shim_fd = drm_shim_fd_lookup(fd); - struct drm_v3d_get_bo_offset *get = arg; - struct v3d_bo *bo = v3d_bo_lookup(shim_fd, get->handle); - - get->offset = bo->offset; - - drm_shim_bo_put(&bo->base); - - return 0; -} - -void -drm_shim_driver_init(void) -{ - shim_device.bus_type = DRM_BUS_PLATFORM; - shim_device.driver_name = "v3d"; - - drm_shim_override_file("OF_FULLNAME=/rdb/v3d\n" - "OF_COMPATIBLE_N=1\n" - "OF_COMPATIBLE_0=brcm,7278-v3d\n", - "/sys/dev/char/%d:%d/device/uevent", - DRM_MAJOR, render_node_minor); - - v3d.hw = v3d_hw_auto_new(NULL); - v3d.devinfo->ver = v3d_hw_get_version(v3d.hw); - - if (v3d.devinfo->ver >= 42) - v3d42_drm_shim_driver_init(); - else if (v3d.devinfo->ver >= 41) - v3d41_drm_shim_driver_init(); - else - v3d33_drm_shim_driver_init(); -} diff --git a/src/broadcom/drm-shim/v3d.h b/src/broadcom/drm-shim/v3d.h deleted file mode 100644 index 0712b8b3f24..00000000000 --- a/src/broadcom/drm-shim/v3d.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright © 2018 Broadcom - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be 
included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#ifndef DRM_SHIM_V3D_H -#define DRM_SHIM_V3D_H - -#include "broadcom/common/v3d_device_info.h" -#include "util/vma.h" - -struct drm_shim_fd; - -struct v3d_shim_device { - struct v3d_hw *hw; - struct v3d_device_info *devinfo; - - /* Base virtual address of the heap. */ - void *mem; - /* Base hardware address of the heap. */ - uint32_t mem_base; - /* Size of the heap. */ - size_t mem_size; - - /* Allocator for the GPU virtual addresses. */ - struct util_vma_heap heap; -}; -extern struct v3d_shim_device v3d; - -struct v3d_bo { - struct shim_bo base; - uint64_t offset; - void *sim_vaddr; - void *gem_vaddr; -}; - -static inline struct v3d_bo * -v3d_bo(struct shim_bo *bo) -{ - return (struct v3d_bo *)bo; -} - -struct v3d_bo *v3d_bo_lookup(struct shim_fd *shim_fd, int handle); -int v3d_ioctl_wait_bo(int fd, unsigned long request, void *arg); -int v3d_ioctl_mmap_bo(int fd, unsigned long request, void *arg); -int v3d_ioctl_get_bo_offset(int fd, unsigned long request, void *arg); - -void v3d33_drm_shim_driver_init(void); -void v3d41_drm_shim_driver_init(void); -void v3d42_drm_shim_driver_init(void); - -#endif /* DRM_SHIM_V3D_H */ diff --git a/src/broadcom/drm-shim/v3d_noop.c b/src/broadcom/drm-shim/v3d_noop.c index fd92e8859c5..8a27052441b 100644 --- a/src/broadcom/drm-shim/v3d_noop.c +++ b/src/broadcom/drm-shim/v3d_noop.c @@ -122,6 +122,15 @@ v3d_ioctl_get_param(int fd, unsigned long request, void *arg) case DRM_V3D_PARAM_SUPPORTS_TFU: gp->value = 1; return 0; + case DRM_V3D_PARAM_SUPPORTS_CSD: + gp->value = 1; + return 0; + case DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH: + gp->value = 1; + return 0; + case DRM_V3D_PARAM_SUPPORTS_PERFMON: + gp->value = 1; + return 0; default: break; } diff --git a/src/broadcom/drm-shim/v3dx.c b/src/broadcom/drm-shim/v3dx.c deleted file mode 100644 index a22550a03a5..00000000000 --- a/src/broadcom/drm-shim/v3dx.c +++ /dev/null @@ -1,370 +0,0 @@ -/* - * Copyright © 2014-2017 Broadcom - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -/* @file - * - * v3d driver code interacting v3dv3 simulator/fpga library. - * - * This is compiled per V3D version we support, since the register definitions - * conflict. - */ - -#include <errno.h> -#include <stdbool.h> -#include <stdio.h> -#include <string.h> -#include <sys/mman.h> -#include "util/macros.h" -#include "util/u_mm.h" -#include "broadcom/common/v3d_macros.h" -#include "v3d_simulator_wrapper.h" -#include "drm-shim/drm_shim.h" -#include "drm-uapi/v3d_drm.h" -#include "v3d.h" - -#define HW_REGISTER_RO(x) (x) -#define HW_REGISTER_RW(x) (x) -#if V3D_VERSION >= 41 -#include "libs/core/v3d/registers/4.1.34.0/v3d.h" -#else -#include "libs/core/v3d/registers/3.3.0.0/v3d.h" -#endif - -#define V3D_WRITE(reg, val) v3d_hw_write_reg(v3d.hw, reg, val) -#define V3D_READ(reg) v3d_hw_read_reg(v3d.hw, reg) - -static void -v3d_flush_l3() -{ - if (!v3d_hw_has_gca(v3d.hw)) - return; - -#if V3D_VERSION < 40 - uint32_t gca_ctrl = V3D_READ(V3D_GCA_CACHE_CTRL); - - V3D_WRITE(V3D_GCA_CACHE_CTRL, gca_ctrl | V3D_GCA_CACHE_CTRL_FLUSH_SET); - V3D_WRITE(V3D_GCA_CACHE_CTRL, gca_ctrl & ~V3D_GCA_CACHE_CTRL_FLUSH_SET); -#endif -} - -/* Invalidates the L2 cache. This is a read-only cache. */ -static void -v3d_flush_l2(void) -{ - V3D_WRITE(V3D_CTL_0_L2CACTL, - V3D_CTL_0_L2CACTL_L2CCLR_SET | - V3D_CTL_0_L2CACTL_L2CENA_SET); -} - -/* Invalidates texture L2 cachelines */ -static void -v3d_flush_l2t(void) -{ - V3D_WRITE(V3D_CTL_0_L2TFLSTA, 0); - V3D_WRITE(V3D_CTL_0_L2TFLEND, ~0); - V3D_WRITE(V3D_CTL_0_L2TCACTL, - V3D_CTL_0_L2TCACTL_L2TFLS_SET | - (0 << V3D_CTL_0_L2TCACTL_L2TFLM_LSB)); -} - -/* Invalidates the slice caches. These are read-only caches. */ -static void -v3d_flush_slices(void) -{ - V3D_WRITE(V3D_CTL_0_SLCACTL, ~0); -} - -static void -v3d_flush_caches(void) -{ - v3d_flush_l3(); - v3d_flush_l2(); - v3d_flush_l2t(); - v3d_flush_slices(); -} - -static void -v3d_simulator_copy_in_handle(struct shim_fd *shim_fd, int handle) -{ - if (!handle) - return; - - struct v3d_bo *bo = v3d_bo_lookup(shim_fd, handle); - - memcpy(bo->sim_vaddr, bo->gem_vaddr, bo->base.size); -} - -static void -v3d_simulator_copy_out_handle(struct shim_fd *shim_fd, int handle) -{ - if (!handle) - return; - - struct v3d_bo *bo = v3d_bo_lookup(shim_fd, handle); - - memcpy(bo->gem_vaddr, bo->sim_vaddr, bo->base.size); -} - -static int -v3dX(v3d_ioctl_submit_cl)(int fd, unsigned long request, void *arg) -{ - struct shim_fd *shim_fd = drm_shim_fd_lookup(fd); - struct drm_v3d_submit_cl *submit = arg; - uint32_t *bo_handles = (uint32_t *)(uintptr_t)submit->bo_handles; - - for (int i = 0; i < submit->bo_handle_count; i++) - v3d_simulator_copy_in_handle(shim_fd, bo_handles[i]); - - v3d_flush_caches(); - - if (submit->qma) { - V3D_WRITE(V3D_CLE_0_CT0QMA, submit->qma); - V3D_WRITE(V3D_CLE_0_CT0QMS, submit->qms); - } -#if V3D_VERSION >= 41 - if (submit->qts) { - V3D_WRITE(V3D_CLE_0_CT0QTS, - V3D_CLE_0_CT0QTS_CTQTSEN_SET | - submit->qts); - } -#endif - - fprintf(stderr, "submit %x..%x!\n", submit->bcl_start, submit->bcl_end); - - V3D_WRITE(V3D_CLE_0_CT0QBA, submit->bcl_start); - V3D_WRITE(V3D_CLE_0_CT0QEA, submit->bcl_end); - - /* Wait for bin to complete before firing render, as it seems the - * simulator doesn't implement the semaphores. 
- */ - while (V3D_READ(V3D_CLE_0_CT0CA) != - V3D_READ(V3D_CLE_0_CT0EA)) { - v3d_hw_tick(v3d.hw); - } - - fprintf(stderr, "submit %x..%x!\n", submit->rcl_start, submit->rcl_end); - - v3d_flush_caches(); - - V3D_WRITE(V3D_CLE_0_CT1QBA, submit->rcl_start); - V3D_WRITE(V3D_CLE_0_CT1QEA, submit->rcl_end); - - while (V3D_READ(V3D_CLE_0_CT1CA) != - V3D_READ(V3D_CLE_0_CT1EA)) { - v3d_hw_tick(v3d.hw); - } - - for (int i = 0; i < submit->bo_handle_count; i++) - v3d_simulator_copy_out_handle(shim_fd, bo_handles[i]); - - return 0; -} - -static int -v3dX(v3d_ioctl_submit_tfu)(int fd, unsigned long request, void *arg) -{ - struct shim_fd *shim_fd = drm_shim_fd_lookup(fd); - struct drm_v3d_submit_tfu *submit = arg; - - v3d_simulator_copy_in_handle(shim_fd, submit->bo_handles[0]); - v3d_simulator_copy_in_handle(shim_fd, submit->bo_handles[1]); - v3d_simulator_copy_in_handle(shim_fd, submit->bo_handles[2]); - v3d_simulator_copy_in_handle(shim_fd, submit->bo_handles[3]); - - int last_vtct = V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET; - - V3D_WRITE(V3D_TFU_IIA, submit->iia); - V3D_WRITE(V3D_TFU_IIS, submit->iis); - V3D_WRITE(V3D_TFU_ICA, submit->ica); - V3D_WRITE(V3D_TFU_IUA, submit->iua); - V3D_WRITE(V3D_TFU_IOA, submit->ioa); - V3D_WRITE(V3D_TFU_IOS, submit->ios); - V3D_WRITE(V3D_TFU_COEF0, submit->coef[0]); - V3D_WRITE(V3D_TFU_COEF1, submit->coef[1]); - V3D_WRITE(V3D_TFU_COEF2, submit->coef[2]); - V3D_WRITE(V3D_TFU_COEF3, submit->coef[3]); - - V3D_WRITE(V3D_TFU_ICFG, submit->icfg); - - while ((V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET) == last_vtct) { - v3d_hw_tick(v3d.hw); - } - - v3d_simulator_copy_out_handle(shim_fd, submit->bo_handles[0]); - - return 0; -} - -static int -v3dX(v3d_ioctl_create_bo)(int fd, unsigned long request, void *arg) -{ - struct shim_fd *shim_fd = drm_shim_fd_lookup(fd); - struct drm_v3d_create_bo *create = arg; - struct v3d_bo *bo = calloc(1, sizeof(*bo)); - - drm_shim_bo_init(&bo->base, create->size); - bo->offset = util_vma_heap_alloc(&v3d.heap, create->size, 4096); - if (bo->offset == 0) - return -ENOMEM; - - bo->sim_vaddr = v3d.mem + bo->offset - v3d.mem_base; -#if 0 - /* Place a mapping of the BO inside of the simulator's address space - * for V3D memory. This lets us avoid copy in/out for simpenrose, but - * I'm betting we'll need something else for FPGA. - */ - void *sim_addr = v3d.mem + bo->block->ofs; - void *mmap_ret = mmap(sim_addr, create->size, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, bo->base.fd, 0); - assert(mmap_ret == sim_addr); -#else - /* Make a simulator-private mapping of the shim GEM object. 
*/ - bo->gem_vaddr = mmap(NULL, bo->base.size, - PROT_READ | PROT_WRITE, - MAP_SHARED, - bo->base.fd, 0); - if (bo->gem_vaddr == MAP_FAILED) { - fprintf(stderr, "v3d: mmap of shim bo failed\n"); - abort(); - } -#endif - - create->offset = bo->offset; - create->handle = drm_shim_bo_get_handle(shim_fd, &bo->base); - - drm_shim_bo_put(&bo->base); - - return 0; -} - -static int -v3dX(v3d_ioctl_get_param)(int fd, unsigned long request, void *arg) -{ - struct drm_v3d_get_param *gp = arg; - static const uint32_t reg_map[] = { - [DRM_V3D_PARAM_V3D_UIFCFG] = V3D_HUB_CTL_UIFCFG, - [DRM_V3D_PARAM_V3D_HUB_IDENT1] = V3D_HUB_CTL_IDENT1, - [DRM_V3D_PARAM_V3D_HUB_IDENT2] = V3D_HUB_CTL_IDENT2, - [DRM_V3D_PARAM_V3D_HUB_IDENT3] = V3D_HUB_CTL_IDENT3, - [DRM_V3D_PARAM_V3D_CORE0_IDENT0] = V3D_CTL_0_IDENT0, - [DRM_V3D_PARAM_V3D_CORE0_IDENT1] = V3D_CTL_0_IDENT1, - [DRM_V3D_PARAM_V3D_CORE0_IDENT2] = V3D_CTL_0_IDENT2, - }; - - switch (gp->param) { - case DRM_V3D_PARAM_SUPPORTS_TFU: - gp->value = 1; - return 0; - } - - if (gp->param < ARRAY_SIZE(reg_map) && reg_map[gp->param]) { - gp->value = V3D_READ(reg_map[gp->param]); - return 0; - } - - fprintf(stderr, "Unknown DRM_IOCTL_V3D_GET_PARAM %d\n", gp->param); - return -1; -} - -static ioctl_fn_t driver_ioctls[] = { - [DRM_V3D_SUBMIT_CL] = v3dX(v3d_ioctl_submit_cl), - [DRM_V3D_SUBMIT_TFU] = v3dX(v3d_ioctl_submit_tfu), - [DRM_V3D_WAIT_BO] = v3d_ioctl_wait_bo, - [DRM_V3D_CREATE_BO] = v3dX(v3d_ioctl_create_bo), - [DRM_V3D_GET_PARAM] = v3dX(v3d_ioctl_get_param), - [DRM_V3D_MMAP_BO] = v3d_ioctl_mmap_bo, - [DRM_V3D_GET_BO_OFFSET] = v3d_ioctl_get_bo_offset, -}; - -static void -v3d_isr(uint32_t hub_status) -{ - /* Check the per-core bits */ - if (hub_status & (1 << 0)) { - uint32_t core_status = V3D_READ(V3D_CTL_0_INT_STS); - - if (core_status & V3D_CTL_0_INT_STS_INT_GMPV_SET) { - fprintf(stderr, "GMP violation at 0x%08x\n", - V3D_READ(V3D_GMP_0_VIO_ADDR)); - abort(); - } else { - fprintf(stderr, - "Unexpected ISR with core status 0x%08x\n", - core_status); - } - abort(); - } - - return; -} - -static void -v3dX(simulator_init_regs)(void) -{ -#if V3D_VERSION == 33 - /* Set OVRTMUOUT to match kernel behavior. - * - * This means that the texture sampler uniform configuration's tmu - * output type field is used, instead of using the hardware default - * behavior based on the texture type. If you want the default - * behavior, you can still put "2" in the indirect texture state's - * output_type field. - */ - V3D_WRITE(V3D_CTL_0_MISCCFG, V3D_CTL_1_MISCCFG_OVRTMUOUT_SET); -#endif - - uint32_t core_interrupts = V3D_CTL_0_INT_STS_INT_GMPV_SET; - V3D_WRITE(V3D_CTL_0_INT_MSK_SET, ~core_interrupts); - V3D_WRITE(V3D_CTL_0_INT_MSK_CLR, core_interrupts); - - v3d_hw_set_isr(v3d.hw, v3d_isr); -} - -static void -v3d_bo_free(struct shim_bo *shim_bo) -{ - struct v3d_bo *bo = v3d_bo(shim_bo); - - if (bo->gem_vaddr) - munmap(bo->gem_vaddr, shim_bo->size); - - util_vma_heap_free(&v3d.heap, bo->offset, bo->base.size); -} - -void -v3dX(drm_shim_driver_init)(void) -{ - shim_device.driver_ioctls = driver_ioctls; - shim_device.driver_ioctl_count = ARRAY_SIZE(driver_ioctls); - - shim_device.driver_bo_free = v3d_bo_free; - - /* Allocate a gig of memory to play in. 
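/*
 * For context, userspace reaches the GET_PARAM handlers above through
 * the DRM_IOCTL_V3D_GET_PARAM ioctl from drm-uapi/v3d_drm.h. A sketch of
 * probing the CSD support that the noop shim now advertises:
 */
#include <stdbool.h>
#include <sys/ioctl.h>
#include "drm-uapi/v3d_drm.h"

static bool
sketch_v3d_supports_csd(int fd)
{
        struct drm_v3d_get_param gp = {
                .param = DRM_V3D_PARAM_SUPPORTS_CSD,
        };

        if (ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &gp) != 0)
                return false;

        return gp.value != 0;
}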
*/ - v3d_hw_alloc_mem(v3d.hw, 1024 * 1024 * 1024); - v3d.mem_base = - v3d_hw_get_mem(v3d.hw, &v3d.mem_size, - &v3d.mem); - util_vma_heap_init(&v3d.heap, 4096, v3d.mem_size - 4096); - - v3dX(simulator_init_regs)(); -} diff --git a/src/broadcom/drm-shim/vc4_noop.c b/src/broadcom/drm-shim/vc4_noop.c index 3f85158e6df..b9c83db8313 100644 --- a/src/broadcom/drm-shim/vc4_noop.c +++ b/src/broadcom/drm-shim/vc4_noop.c @@ -51,6 +51,20 @@ vc4_ioctl_create_bo(int fd, unsigned long request, void *arg) } static int +vc4_ioctl_create_shader_bo(int fd, unsigned long request, void *arg) +{ + struct shim_fd *shim_fd = drm_shim_fd_lookup(fd); + struct drm_vc4_create_shader_bo *create = arg; + struct shim_bo *bo = calloc(1, sizeof(*bo)); + + drm_shim_bo_init(bo, create->size); + create->handle = drm_shim_bo_get_handle(shim_fd, bo); + drm_shim_bo_put(bo); + + return 0; +} + +static int vc4_ioctl_mmap_bo(int fd, unsigned long request, void *arg) { struct shim_fd *shim_fd = drm_shim_fd_lookup(fd); @@ -101,6 +115,7 @@ vc4_ioctl_get_param(int fd, unsigned long request, void *arg) static ioctl_fn_t driver_ioctls[] = { [DRM_VC4_CREATE_BO] = vc4_ioctl_create_bo, + [DRM_VC4_CREATE_SHADER_BO] = vc4_ioctl_create_shader_bo, [DRM_VC4_MMAP_BO] = vc4_ioctl_mmap_bo, [DRM_VC4_GET_PARAM] = vc4_ioctl_get_param, [DRM_VC4_GET_TILING] = vc4_ioctl_noop, diff --git a/src/broadcom/meson.build b/src/broadcom/meson.build index 2e1145dd0c0..f8e93526300 100644 --- a/src/broadcom/meson.build +++ b/src/broadcom/meson.build @@ -22,7 +22,7 @@ inc_broadcom = include_directories('.', 'cle') subdir('cle') -v3d_versions = ['33', '41', '42'] +v3d_versions = ['42', '71'] v3d_libs = [] if with_gallium_v3d or with_broadcom_vk @@ -38,12 +38,12 @@ endif per_version_libs = [] foreach ver : v3d_versions per_version_libs += static_library( - 'libbroadcom-v' + ver, + 'broadcom-v' + ver, [ files('clif/v3dx_dump.c'), v3d_xml_pack ], - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom], + include_directories : [inc_include, inc_src, inc_broadcom], c_args : [no_override_init_args, '-DV3D_VERSION=' + ver], gnu_symbol_visibility : 'hidden', dependencies: [dep_valgrind, dep_thread], @@ -61,7 +61,7 @@ libv3d_neon = static_library( 'v3d_neon', 'common/v3d_tiling.c', include_directories : [ - inc_src, inc_include, inc_gallium, inc_gallium_aux, inc_broadcom, + inc_src, inc_include, inc_broadcom, ], c_args : [v3d_args, v3d_neon_c_args], gnu_symbol_visibility : 'hidden', @@ -69,12 +69,12 @@ libv3d_neon = static_library( ) libbroadcom_v3d = static_library( - 'libbroadcom_v3d', + 'broadcom_v3d', [ files('common/v3d_debug.c', 'common/v3d_device_info.c', 'clif/clif_dump.c', 'common/v3d_util.c'), v3d_xml_pack, ], - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom], + include_directories : [inc_include, inc_src, inc_broadcom], c_args : [no_override_init_args], gnu_symbol_visibility : 'hidden', link_whole : v3d_libs + per_version_libs, diff --git a/src/broadcom/qpu/meson.build b/src/broadcom/qpu/meson.build index eea1f9bb058..fefc6a5cc56 100644 --- a/src/broadcom/qpu/meson.build +++ b/src/broadcom/qpu/meson.build @@ -25,9 +25,9 @@ libbroadcom_qpu_files = files( ) libbroadcom_qpu = static_library( - ['broadcom_qpu', v3d_xml_pack], - libbroadcom_qpu_files, - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom], + 'broadcom_qpu', + [libbroadcom_qpu_files, v3d_xml_pack], + include_directories : [inc_include, 
inc_src, inc_broadcom], c_args : [no_override_init_args], gnu_symbol_visibility : 'hidden', dependencies : [dep_libdrm, dep_valgrind], @@ -42,7 +42,7 @@ test( 'qpu_disasm', 'tests/qpu_disasm.c', link_with: libbroadcom_qpu, dependencies : idep_mesautil, - include_directories: [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux] + include_directories: [inc_include, inc_src] ), suite : ['broadcom'], ) diff --git a/src/broadcom/qpu/qpu_disasm.c b/src/broadcom/qpu/qpu_disasm.c index b5648bd76e2..c1590a760de 100644 --- a/src/broadcom/qpu/qpu_disasm.c +++ b/src/broadcom/qpu/qpu_disasm.c @@ -56,13 +56,14 @@ pad_to(struct disasm_state *disasm, int n) static void -v3d_qpu_disasm_raddr(struct disasm_state *disasm, - const struct v3d_qpu_instr *instr, uint8_t mux) +v3d33_qpu_disasm_raddr(struct disasm_state *disasm, + const struct v3d_qpu_instr *instr, + enum v3d_qpu_mux mux) { if (mux == V3D_QPU_MUX_A) { append(disasm, "rf%d", instr->raddr_a); } else if (mux == V3D_QPU_MUX_B) { - if (instr->sig.small_imm) { + if (instr->sig.small_imm_b) { uint32_t val; ASSERTED bool ok = v3d_qpu_small_imm_unpack(disasm->devinfo, @@ -82,6 +83,64 @@ v3d_qpu_disasm_raddr(struct disasm_state *disasm, } } +enum v3d_qpu_input_class { + V3D_QPU_ADD_A, + V3D_QPU_ADD_B, + V3D_QPU_MUL_A, + V3D_QPU_MUL_B +}; + +static void +v3d71_qpu_disasm_raddr(struct disasm_state *disasm, + const struct v3d_qpu_instr *instr, + uint8_t raddr, + enum v3d_qpu_input_class input_class) +{ + bool is_small_imm = false; + switch(input_class) { + case V3D_QPU_ADD_A: + is_small_imm = instr->sig.small_imm_a; + break; + case V3D_QPU_ADD_B: + is_small_imm = instr->sig.small_imm_b; + break; + case V3D_QPU_MUL_A: + is_small_imm = instr->sig.small_imm_c; + break; + case V3D_QPU_MUL_B: + is_small_imm = instr->sig.small_imm_d; + break; + } + + if (is_small_imm) { + uint32_t val; + ASSERTED bool ok = + v3d_qpu_small_imm_unpack(disasm->devinfo, + raddr, + &val); + + if ((int)val >= -16 && (int)val <= 15) + append(disasm, "%d", val); + else + append(disasm, "0x%08x", val); + assert(ok); + } else { + append(disasm, "rf%d", raddr); + } +} + +static void +v3d_qpu_disasm_raddr(struct disasm_state *disasm, + const struct v3d_qpu_instr *instr, + const struct v3d_qpu_input *input, + enum v3d_qpu_input_class input_class) +{ + if (disasm->devinfo->ver < 71) + v3d33_qpu_disasm_raddr(disasm, instr, input->mux); + else + v3d71_qpu_disasm_raddr(disasm, instr, input->raddr, input_class); +} + static void v3d_qpu_disasm_waddr(struct disasm_state *disasm, uint32_t waddr, bool magic) { @@ -110,7 +169,7 @@ v3d_qpu_disasm_add(struct disasm_state *disasm, append(disasm, "%s", v3d_qpu_pf_name(instr->flags.apf)); append(disasm, "%s", v3d_qpu_uf_name(instr->flags.auf)); - append(disasm, " "); + append(disasm, " "); if (has_dst) { v3d_qpu_disasm_waddr(disasm, instr->alu.add.waddr, @@ -121,16 +180,16 @@ v3d_qpu_disasm_add(struct disasm_state *disasm, if (num_src >= 1) { if (has_dst) append(disasm, ", "); - v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.a); + v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.add.a, V3D_QPU_ADD_A); append(disasm, "%s", - v3d_qpu_unpack_name(instr->alu.add.a_unpack)); + v3d_qpu_unpack_name(instr->alu.add.a.unpack)); } if (num_src >= 2) { append(disasm, ", "); - v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.b); + v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.add.b, V3D_QPU_ADD_B); append(disasm, "%s", - v3d_qpu_unpack_name(instr->alu.add.b_unpack)); + v3d_qpu_unpack_name(instr->alu.add.b.unpack)); } } @@ -141,7 +200,7 @@ 
v3d_qpu_disasm_mul(struct disasm_state *disasm, bool has_dst = v3d_qpu_mul_op_has_dst(instr->alu.mul.op); int num_src = v3d_qpu_mul_op_num_src(instr->alu.mul.op); - pad_to(disasm, 21); + pad_to(disasm, 30); append(disasm, "; "); append(disasm, "%s", v3d_qpu_mul_op_name(instr->alu.mul.op)); @@ -153,7 +212,7 @@ v3d_qpu_disasm_mul(struct disasm_state *disasm, if (instr->alu.mul.op == V3D_QPU_M_NOP) return; - append(disasm, " "); + append(disasm, " "); if (has_dst) { v3d_qpu_disasm_waddr(disasm, instr->alu.mul.waddr, @@ -164,16 +223,16 @@ v3d_qpu_disasm_mul(struct disasm_state *disasm, if (num_src >= 1) { if (has_dst) append(disasm, ", "); - v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.a); + v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.mul.a, V3D_QPU_MUL_A); append(disasm, "%s", - v3d_qpu_unpack_name(instr->alu.mul.a_unpack)); + v3d_qpu_unpack_name(instr->alu.mul.a.unpack)); } if (num_src >= 2) { append(disasm, ", "); - v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.b); + v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.mul.b, V3D_QPU_MUL_B); append(disasm, "%s", - v3d_qpu_unpack_name(instr->alu.mul.b_unpack)); + v3d_qpu_unpack_name(instr->alu.mul.b.unpack)); } } @@ -217,7 +276,7 @@ v3d_qpu_disasm_sig(struct disasm_state *disasm, return; } - pad_to(disasm, 41); + pad_to(disasm, 60); if (sig->thrsw) append(disasm, "; thrsw"); diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c index 569c5fc4074..9a6434d94dd 100644 --- a/src/broadcom/qpu/qpu_instr.c +++ b/src/broadcom/qpu/qpu_instr.c @@ -35,6 +35,14 @@ v3d_qpu_magic_waddr_name(const struct v3d_device_info *devinfo, if (devinfo->ver < 40 && waddr == V3D_QPU_WADDR_TMU) return "tmu"; + /* On V3D 7.x, QUAD and REP alias R5 and R5REP in the table below. + */ + if (devinfo->ver >= 71 && waddr == V3D_QPU_WADDR_QUAD) + return "quad"; + + if (devinfo->ver >= 71 && waddr == V3D_QPU_WADDR_REP) + return "rep"; + static const char *waddr_magic[] = { [V3D_QPU_WADDR_R0] = "r0", [V3D_QPU_WADDR_R1] = "r1", @@ -169,6 +177,19 @@ v3d_qpu_add_op_name(enum v3d_qpu_add_op op) [V3D_QPU_A_ITOF] = "itof", [V3D_QPU_A_CLZ] = "clz", [V3D_QPU_A_UTOF] = "utof", + [V3D_QPU_A_MOV] = "mov", + [V3D_QPU_A_FMOV] = "fmov", + [V3D_QPU_A_VPACK] = "vpack", + [V3D_QPU_A_V8PACK] = "v8pack", + [V3D_QPU_A_V10PACK] = "v10pack", + [V3D_QPU_A_V11FPACK] = "v11fpack", + [V3D_QPU_A_BALLOT] = "ballot", + [V3D_QPU_A_BCASTF] = "bcastf", + [V3D_QPU_A_ALLEQ] = "alleq", + [V3D_QPU_A_ALLFEQ] = "allfeq", + [V3D_QPU_A_ROTQ] = "rotq", + [V3D_QPU_A_ROT] = "rot", + [V3D_QPU_A_SHUFFLE] = "shuffle", }; if (op >= ARRAY_SIZE(op_names)) @@ -191,6 +212,12 @@ v3d_qpu_mul_op_name(enum v3d_qpu_mul_op op) [V3D_QPU_M_MOV] = "mov", [V3D_QPU_M_NOP] = "nop", [V3D_QPU_M_FMUL] = "fmul", + [V3D_QPU_M_FTOUNORM16] = "ftounorm16", + [V3D_QPU_M_FTOSNORM16] = "ftosnorm16", + [V3D_QPU_M_VFTOUNORM8] = "vftounorm8", + [V3D_QPU_M_VFTOSNORM8] = "vftosnorm8", + [V3D_QPU_M_VFTOUNORM10LO] = "vftounorm10lo", + [V3D_QPU_M_VFTOUNORM10HI] = "vftounorm10hi", }; if (op >= ARRAY_SIZE(op_names)) @@ -450,6 +477,21 @@ static const uint8_t add_op_args[] = { [V3D_QPU_A_ITOF] = D | A, [V3D_QPU_A_CLZ] = D | A, [V3D_QPU_A_UTOF] = D | A, + + [V3D_QPU_A_MOV] = D | A, + [V3D_QPU_A_FMOV] = D | A, + [V3D_QPU_A_VPACK] = D | A | B, + [V3D_QPU_A_V8PACK] = D | A | B, + [V3D_QPU_A_V10PACK] = D | A | B, + [V3D_QPU_A_V11FPACK] = D | A | B, + + [V3D_QPU_A_BALLOT] = D | A, + [V3D_QPU_A_BCASTF] = D | A, + [V3D_QPU_A_ALLEQ] = D | A, + [V3D_QPU_A_ALLFEQ] = D | A, + [V3D_QPU_A_ROTQ] = D | A | B, + [V3D_QPU_A_ROT] = D | A | B, +
[V3D_QPU_A_SHUFFLE] = D | A | B, }; static const uint8_t mul_op_args[] = { @@ -463,6 +505,12 @@ static const uint8_t mul_op_args[] = { [V3D_QPU_M_NOP] = 0, [V3D_QPU_M_MOV] = D | A, [V3D_QPU_M_FMUL] = D | A | B, + [V3D_QPU_M_FTOUNORM16] = D | A, + [V3D_QPU_M_FTOSNORM16] = D | A, + [V3D_QPU_M_VFTOUNORM8] = D | A, + [V3D_QPU_M_VFTOSNORM8] = D | A, + [V3D_QPU_M_VFTOUNORM10LO] = D | A, + [V3D_QPU_M_VFTOUNORM10HI] = D | A, }; bool @@ -636,19 +684,23 @@ v3d_qpu_add_op_writes_vpm(enum v3d_qpu_add_op op) } bool -v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) +v3d_qpu_reads_tlb(const struct v3d_qpu_instr *inst) { - if (inst->sig.ldtlb || - inst->sig.ldtlbu) - return true; + return inst->sig.ldtlb || inst->sig.ldtlbu; +} +bool +v3d_qpu_writes_tlb(const struct v3d_qpu_instr *inst) +{ if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { - if (inst->alu.add.magic_write && + if (inst->alu.add.op != V3D_QPU_A_NOP && + inst->alu.add.magic_write && v3d_qpu_magic_waddr_is_tlb(inst->alu.add.waddr)) { return true; } - if (inst->alu.mul.magic_write && + if (inst->alu.mul.op != V3D_QPU_M_NOP && + inst->alu.mul.magic_write && v3d_qpu_magic_waddr_is_tlb(inst->alu.mul.waddr)) { return true; } @@ -658,18 +710,32 @@ v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) } bool +v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) +{ + return v3d_qpu_writes_tlb(inst) || v3d_qpu_reads_tlb(inst); +} + +bool v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst) { - if (v3d_qpu_instr_is_sfu(inst)) - return true; + return v3d_qpu_instr_is_sfu(inst) || v3d_qpu_instr_is_legacy_sfu(inst); +} +/* Checks whether the instruction implements an SFU operation by writing + * to specific magic register addresses instead of using SFU ALU opcodes. + */ +bool +v3d_qpu_instr_is_legacy_sfu(const struct v3d_qpu_instr *inst) +{ if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { - if (inst->alu.add.magic_write && + if (inst->alu.add.op != V3D_QPU_A_NOP && + inst->alu.add.magic_write && v3d_qpu_magic_waddr_is_sfu(inst->alu.add.waddr)) { return true; } - if (inst->alu.mul.magic_write && + if (inst->alu.mul.op != V3D_QPU_M_NOP && + inst->alu.mul.magic_write && v3d_qpu_magic_waddr_is_sfu(inst->alu.mul.waddr)) { return true; } @@ -689,6 +755,13 @@ v3d_qpu_instr_is_sfu(const struct v3d_qpu_instr *inst) case V3D_QPU_A_LOG: case V3D_QPU_A_SIN: case V3D_QPU_A_RSQRT2: + case V3D_QPU_A_BALLOT: + case V3D_QPU_A_BCASTF: + case V3D_QPU_A_ALLEQ: + case V3D_QPU_A_ALLFEQ: + case V3D_QPU_A_ROTQ: + case V3D_QPU_A_ROT: + case V3D_QPU_A_SHUFFLE: return true; default: return false; @@ -702,9 +775,11 @@ v3d_qpu_writes_tmu(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *inst) { return (inst->type == V3D_QPU_INSTR_TYPE_ALU && - ((inst->alu.add.magic_write && + ((inst->alu.add.op != V3D_QPU_A_NOP && + inst->alu.add.magic_write && v3d_qpu_magic_waddr_is_tmu(devinfo, inst->alu.add.waddr)) || - (inst->alu.mul.magic_write && + (inst->alu.mul.op != V3D_QPU_M_NOP && + inst->alu.mul.magic_write && v3d_qpu_magic_waddr_is_tmu(devinfo, inst->alu.mul.waddr)))); } @@ -740,12 +815,14 @@ v3d_qpu_writes_vpm(const struct v3d_qpu_instr *inst) if (v3d_qpu_add_op_writes_vpm(inst->alu.add.op)) return true; - if (inst->alu.add.magic_write && + if (inst->alu.add.op != V3D_QPU_A_NOP && + inst->alu.add.magic_write && v3d_qpu_magic_waddr_is_vpm(inst->alu.add.waddr)) { return true; } - if (inst->alu.mul.magic_write && + if (inst->alu.mul.op != V3D_QPU_M_NOP && + inst->alu.mul.magic_write && v3d_qpu_magic_waddr_is_vpm(inst->alu.mul.waddr)) { return true; } @@ -773,12 +850,18 @@
v3d_qpu_writes_unifa(const struct v3d_device_info *devinfo, inst->alu.mul.waddr == V3D_QPU_WADDR_UNIFA) { return true; } + + if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && + inst->sig_magic && + inst->sig_addr == V3D_QPU_WADDR_UNIFA) { + return true; + } } return false; } -static bool +bool v3d_qpu_waits_vpm(const struct v3d_qpu_instr *inst) { return inst->type == V3D_QPU_INSTR_TYPE_ALU && @@ -805,10 +888,12 @@ qpu_writes_magic_waddr_explicitly(const struct v3d_device_info *devinfo, uint32_t waddr) { if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { - if (inst->alu.add.magic_write && inst->alu.add.waddr == waddr) + if (inst->alu.add.op != V3D_QPU_A_NOP && + inst->alu.add.magic_write && inst->alu.add.waddr == waddr) return true; - if (inst->alu.mul.magic_write && inst->alu.mul.waddr == waddr) + if (inst->alu.mul.op != V3D_QPU_M_NOP && + inst->alu.mul.magic_write && inst->alu.mul.waddr == waddr) return true; } @@ -824,6 +909,9 @@ bool v3d_qpu_writes_r3(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *inst) { + if(!devinfo->has_accumulators) + return false; + if (qpu_writes_magic_waddr_explicitly(devinfo, inst, V3D_QPU_WADDR_R3)) return true; @@ -834,14 +922,19 @@ bool v3d_qpu_writes_r4(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *inst) { + if (!devinfo->has_accumulators) + return false; + if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { - if (inst->alu.add.magic_write && + if (inst->alu.add.op != V3D_QPU_A_NOP && + inst->alu.add.magic_write && (inst->alu.add.waddr == V3D_QPU_WADDR_R4 || v3d_qpu_magic_waddr_is_sfu(inst->alu.add.waddr))) { return true; } - if (inst->alu.mul.magic_write && + if (inst->alu.mul.op != V3D_QPU_M_NOP && + inst->alu.mul.magic_write && (inst->alu.mul.waddr == V3D_QPU_WADDR_R4 || v3d_qpu_magic_waddr_is_sfu(inst->alu.mul.waddr))) { return true; @@ -862,6 +955,9 @@ bool v3d_qpu_writes_r5(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *inst) { + if (!devinfo->has_accumulators) + return false; + if (qpu_writes_magic_waddr_explicitly(devinfo, inst, V3D_QPU_WADDR_R5)) return true; @@ -872,6 +968,9 @@ bool v3d_qpu_writes_accum(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *inst) { + if (!devinfo->has_accumulators) + return false; + if (v3d_qpu_writes_r5(devinfo, inst)) return true; if (v3d_qpu_writes_r4(devinfo, inst)) @@ -889,15 +988,67 @@ v3d_qpu_writes_accum(const struct v3d_device_info *devinfo, } bool +v3d_qpu_writes_rf0_implicitly(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst) +{ + if (devinfo->ver >= 71 && + (inst->sig.ldvary || inst->sig.ldunif || inst->sig.ldunifa)) { + return true; + } + + return false; +} + +bool v3d_qpu_uses_mux(const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux) { int add_nsrc = v3d_qpu_add_op_num_src(inst->alu.add.op); int mul_nsrc = v3d_qpu_mul_op_num_src(inst->alu.mul.op); - return ((add_nsrc > 0 && inst->alu.add.a == mux) || - (add_nsrc > 1 && inst->alu.add.b == mux) || - (mul_nsrc > 0 && inst->alu.mul.a == mux) || - (mul_nsrc > 1 && inst->alu.mul.b == mux)); + return ((add_nsrc > 0 && inst->alu.add.a.mux == mux) || + (add_nsrc > 1 && inst->alu.add.b.mux == mux) || + (mul_nsrc > 0 && inst->alu.mul.a.mux == mux) || + (mul_nsrc > 1 && inst->alu.mul.b.mux == mux)); +} + +bool +v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr) +{ + int add_nsrc = v3d_qpu_add_op_num_src(inst->alu.add.op); + int mul_nsrc = v3d_qpu_mul_op_num_src(inst->alu.mul.op); + + return (add_nsrc > 0 && !inst->sig.small_imm_a && inst->alu.add.a.raddr 
== raddr) || + (add_nsrc > 1 && !inst->sig.small_imm_b && inst->alu.add.b.raddr == raddr) || + (mul_nsrc > 0 && !inst->sig.small_imm_c && inst->alu.mul.a.raddr == raddr) || + (mul_nsrc > 1 && !inst->sig.small_imm_d && inst->alu.mul.b.raddr == raddr); +} + +bool +v3d71_qpu_writes_waddr_explicitly(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst, + uint8_t waddr) +{ + if (inst->type != V3D_QPU_INSTR_TYPE_ALU) + return false; + + if (v3d_qpu_add_op_has_dst(inst->alu.add.op) && + !inst->alu.add.magic_write && + inst->alu.add.waddr == waddr) { + return true; + } + + if (v3d_qpu_mul_op_has_dst(inst->alu.mul.op) && + !inst->alu.mul.magic_write && + inst->alu.mul.waddr == waddr) { + return true; + } + + if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && + !inst->sig_magic && inst->sig_addr == waddr) { + return true; + } + + return false; } bool diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h index 4f165e93914..fe9b5d3a00f 100644 --- a/src/broadcom/qpu/qpu_instr.h +++ b/src/broadcom/qpu/qpu_instr.h @@ -50,10 +50,13 @@ struct v3d_qpu_sig { bool ldvpm:1; bool ldtlb:1; bool ldtlbu:1; - bool small_imm:1; bool ucb:1; bool rotate:1; bool wrtmuc:1; + bool small_imm_a:1; /* raddr_a (add a), since V3D 7.x */ + bool small_imm_b:1; /* raddr_b (add b) */ + bool small_imm_c:1; /* raddr_c (mul a), since V3D 7.x */ + bool small_imm_d:1; /* raddr_d (mul b), since V3D 7.x */ }; enum v3d_qpu_cond { @@ -88,12 +91,13 @@ enum v3d_qpu_uf { }; enum v3d_qpu_waddr { - V3D_QPU_WADDR_R0 = 0, - V3D_QPU_WADDR_R1 = 1, - V3D_QPU_WADDR_R2 = 2, - V3D_QPU_WADDR_R3 = 3, - V3D_QPU_WADDR_R4 = 4, - V3D_QPU_WADDR_R5 = 5, + V3D_QPU_WADDR_R0 = 0, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_R1 = 1, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_R2 = 2, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_R3 = 3, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_R4 = 4, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_R5 = 5, /* V3D 4.x */ + V3D_QPU_WADDR_QUAD = 5, /* V3D 7.x */ V3D_QPU_WADDR_NOP = 6, V3D_QPU_WADDR_TLB = 7, V3D_QPU_WADDR_TLBU = 8, @@ -108,12 +112,12 @@ enum v3d_qpu_waddr { V3D_QPU_WADDR_SYNC = 16, V3D_QPU_WADDR_SYNCU = 17, V3D_QPU_WADDR_SYNCB = 18, - V3D_QPU_WADDR_RECIP = 19, - V3D_QPU_WADDR_RSQRT = 20, - V3D_QPU_WADDR_EXP = 21, - V3D_QPU_WADDR_LOG = 22, - V3D_QPU_WADDR_SIN = 23, - V3D_QPU_WADDR_RSQRT2 = 24, + V3D_QPU_WADDR_RECIP = 19, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_RSQRT = 20, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_EXP = 21, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_LOG = 22, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_SIN = 23, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_RSQRT2 = 24, /* Reserved on V3D 7.x */ V3D_QPU_WADDR_TMUC = 32, V3D_QPU_WADDR_TMUS = 33, V3D_QPU_WADDR_TMUT = 34, @@ -129,7 +133,8 @@ enum v3d_qpu_waddr { V3D_QPU_WADDR_TMUHSCM = 44, V3D_QPU_WADDR_TMUHSF = 45, V3D_QPU_WADDR_TMUHSLOD = 46, - V3D_QPU_WADDR_R5REP = 55, + V3D_QPU_WADDR_R5REP = 55, /* V3D 4.x */ + V3D_QPU_WADDR_REP = 55, /* V3D 7.x */ }; struct v3d_qpu_flags { @@ -222,6 +227,21 @@ enum v3d_qpu_add_op { V3D_QPU_A_ITOF, V3D_QPU_A_CLZ, V3D_QPU_A_UTOF, + + /* V3D 7.x */ + V3D_QPU_A_FMOV, + V3D_QPU_A_MOV, + V3D_QPU_A_VPACK, + V3D_QPU_A_V8PACK, + V3D_QPU_A_V10PACK, + V3D_QPU_A_V11FPACK, + V3D_QPU_A_BALLOT, + V3D_QPU_A_BCASTF, + V3D_QPU_A_ALLEQ, + V3D_QPU_A_ALLFEQ, + V3D_QPU_A_ROTQ, + V3D_QPU_A_ROT, + V3D_QPU_A_SHUFFLE, }; enum v3d_qpu_mul_op { @@ -235,6 +255,14 @@ enum v3d_qpu_mul_op { V3D_QPU_M_MOV, V3D_QPU_M_NOP, V3D_QPU_M_FMUL, + + /* V3D 7.x */ + V3D_QPU_M_FTOUNORM16, + V3D_QPU_M_FTOSNORM16, + V3D_QPU_M_VFTOUNORM8, 
+ V3D_QPU_M_VFTOSNORM8, + V3D_QPU_M_VFTOUNORM10LO, + V3D_QPU_M_VFTOUNORM10HI, }; enum v3d_qpu_output_pack { @@ -276,6 +304,15 @@ enum v3d_qpu_input_unpack { /** Swap high and low 16 bits */ V3D_QPU_UNPACK_SWAP_16, + + /** Convert low 16 bits from 16-bit integer to unsigned 32-bit int */ + V3D_QPU_UNPACK_UL, + /** Convert high 16 bits from 16-bit integer to unsigned 32-bit int */ + V3D_QPU_UNPACK_UH, + /** Convert low 16 bits from 16-bit integer to signed 32-bit int */ + V3D_QPU_UNPACK_IL, + /** Convert high 16 bits from 16-bit integer to signed 32-bit int */ + V3D_QPU_UNPACK_IH, }; enum v3d_qpu_mux { @@ -289,25 +326,29 @@ enum v3d_qpu_mux { V3D_QPU_MUX_B, }; +struct v3d_qpu_input { + union { + enum v3d_qpu_mux mux; /* V3D 4.x */ + uint8_t raddr; /* V3D 7.x */ + }; + enum v3d_qpu_input_unpack unpack; +}; + struct v3d_qpu_alu_instr { struct { enum v3d_qpu_add_op op; - enum v3d_qpu_mux a, b; + struct v3d_qpu_input a, b; uint8_t waddr; bool magic_write; enum v3d_qpu_output_pack output_pack; - enum v3d_qpu_input_unpack a_unpack; - enum v3d_qpu_input_unpack b_unpack; } add; struct { enum v3d_qpu_mul_op op; - enum v3d_qpu_mux a, b; + struct v3d_qpu_input a, b; uint8_t waddr; bool magic_write; enum v3d_qpu_output_pack output_pack; - enum v3d_qpu_input_unpack a_unpack; - enum v3d_qpu_input_unpack b_unpack; } mul; }; @@ -379,8 +420,8 @@ struct v3d_qpu_instr { struct v3d_qpu_sig sig; uint8_t sig_addr; bool sig_magic; /* If the signal writes to a magic address */ - uint8_t raddr_a; - uint8_t raddr_b; + uint8_t raddr_a; /* V3D 4.x */ + uint8_t raddr_b; /* V3D 4.x (holds packed small immediate in 7.x too) */ struct v3d_qpu_flags flags; union { @@ -450,8 +491,11 @@ bool v3d_qpu_magic_waddr_is_tlb(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; bool v3d_qpu_magic_waddr_is_vpm(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; bool v3d_qpu_magic_waddr_is_tsy(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; bool v3d_qpu_magic_waddr_loads_unif(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; +bool v3d_qpu_reads_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; +bool v3d_qpu_writes_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_instr_is_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; +bool v3d_qpu_instr_is_legacy_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_writes_tmu(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; @@ -463,11 +507,14 @@ bool v3d_qpu_writes_r4(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST; bool v3d_qpu_writes_r5(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST; +bool v3d_qpu_writes_rf0_implicitly(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST; bool v3d_qpu_writes_accum(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_waits_on_tmu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_uses_mux(const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux); bool v3d_qpu_uses_vpm(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; +bool v3d_qpu_waits_vpm(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_reads_vpm(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_writes_vpm(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_reads_or_writes_vpm(const struct 
v3d_qpu_instr *inst) ATTRIBUTE_CONST; @@ -481,4 +528,9 @@ bool v3d_qpu_unpacks_f32(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_unpacks_f16(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_is_nop(struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; + +bool v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr); +bool v3d71_qpu_writes_waddr_explicitly(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst, + uint8_t waddr); #endif diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c index eee1e9f95a5..c4added7344 100644 --- a/src/broadcom/qpu/qpu_pack.c +++ b/src/broadcom/qpu/qpu_pack.c @@ -84,6 +84,9 @@ #define V3D_QPU_MUL_A_SHIFT 18 #define V3D_QPU_MUL_A_MASK QPU_MASK(20, 18) +#define V3D_QPU_RADDR_C_SHIFT 18 +#define V3D_QPU_RADDR_C_MASK QPU_MASK(23, 18) + #define V3D_QPU_ADD_B_SHIFT 15 #define V3D_QPU_ADD_B_MASK QPU_MASK(17, 15) @@ -98,6 +101,9 @@ #define V3D_QPU_BRANCH_BDI_SHIFT 12 #define V3D_QPU_BRANCH_BDI_MASK QPU_MASK(13, 12) +#define V3D_QPU_RADDR_D_SHIFT 12 +#define V3D_QPU_RADDR_D_MASK QPU_MASK(17, 12) + #define V3D_QPU_RADDR_A_SHIFT 6 #define V3D_QPU_RADDR_A_MASK QPU_MASK(11, 6) @@ -112,12 +118,15 @@ #define LDTMU .ldtmu = true #define LDVARY .ldvary = true #define LDVPM .ldvpm = true -#define SMIMM .small_imm = true #define LDTLB .ldtlb = true #define LDTLBU .ldtlbu = true #define UCB .ucb = true #define ROT .rotate = true #define WRTMUC .wrtmuc = true +#define SMIMM_A .small_imm_a = true +#define SMIMM_B .small_imm_b = true +#define SMIMM_C .small_imm_c = true +#define SMIMM_D .small_imm_d = true static const struct v3d_qpu_sig v33_sig_map[] = { /* MISC R3 R4 R5 */ @@ -135,8 +144,8 @@ static const struct v3d_qpu_sig v33_sig_map[] = { [11] = { THRSW, LDVARY, LDUNIF }, [12] = { LDVARY, LDTMU, }, [13] = { THRSW, LDVARY, LDTMU, }, - [14] = { SMIMM, LDVARY, }, - [15] = { SMIMM, }, + [14] = { SMIMM_B, LDVARY, }, + [15] = { SMIMM_B, }, [16] = { LDTLB, }, [17] = { LDTLBU, }, /* 18-21 reserved */ @@ -148,8 +157,8 @@ static const struct v3d_qpu_sig v33_sig_map[] = { [27] = { THRSW, LDVPM, LDUNIF }, [28] = { LDVPM, LDTMU, }, [29] = { THRSW, LDVPM, LDTMU, }, - [30] = { SMIMM, LDVPM, }, - [31] = { SMIMM, }, + [30] = { SMIMM_B, LDVPM, }, + [31] = { SMIMM_B, }, }; static const struct v3d_qpu_sig v40_sig_map[] = { @@ -167,8 +176,8 @@ static const struct v3d_qpu_sig v40_sig_map[] = { [10] = { LDVARY, LDUNIF }, [11] = { THRSW, LDVARY, LDUNIF }, /* 12-13 reserved */ - [14] = { SMIMM, LDVARY, }, - [15] = { SMIMM, }, + [14] = { SMIMM_B, LDVARY, }, + [15] = { SMIMM_B, }, [16] = { LDTLB, }, [17] = { LDTLBU, }, [18] = { WRTMUC }, @@ -178,7 +187,7 @@ static const struct v3d_qpu_sig v40_sig_map[] = { [22] = { UCB, }, [23] = { ROT, }, /* 24-30 reserved */ - [31] = { SMIMM, LDTMU, }, + [31] = { SMIMM_B, LDTMU, }, }; static const struct v3d_qpu_sig v41_sig_map[] = { @@ -197,8 +206,8 @@ static const struct v3d_qpu_sig v41_sig_map[] = { [11] = { THRSW, LDVARY, LDUNIF }, [12] = { LDUNIFRF }, [13] = { THRSW, LDUNIFRF }, - [14] = { SMIMM, LDVARY, }, - [15] = { SMIMM, }, + [14] = { SMIMM_B, LDVARY }, + [15] = { SMIMM_B, }, [16] = { LDTLB, }, [17] = { LDTLBU, }, [18] = { WRTMUC }, @@ -210,7 +219,41 @@ static const struct v3d_qpu_sig v41_sig_map[] = { [24] = { LDUNIFA}, [25] = { LDUNIFARF }, /* 26-30 reserved */ - [31] = { SMIMM, LDTMU, }, + [31] = { SMIMM_B, LDTMU, }, +}; + + +static const struct v3d_qpu_sig v71_sig_map[] = { + /* MISC phys RF0 */ + [0] = { }, + [1] = { THRSW, }, + [2] = { LDUNIF }, + [3] = { THRSW, LDUNIF }, + [4] = 
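/* With the small-immediate signal split four ways, packing a sig struct is
 * still just a scan of this table. For instance (hypothetical usage; index
 * taken from the v71 entries below, where [30] is SMIMM_C), given a devinfo
 * with ver == 71:
 */
struct v3d_qpu_sig sig = { .small_imm_c = true };
uint32_t packed_sig;
bool ok = v3d_qpu_sig_pack(devinfo, &sig, &packed_sig);
/* Succeeds with packed_sig == 30, per the v71 map below. */
assert(ok && packed_sig == 30);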
{ LDTMU, }, + [5] = { THRSW, LDTMU, }, + [6] = { LDTMU, LDUNIF }, + [7] = { THRSW, LDTMU, LDUNIF }, + [8] = { LDVARY, }, + [9] = { THRSW, LDVARY, }, + [10] = { LDVARY, LDUNIF }, + [11] = { THRSW, LDVARY, LDUNIF }, + [12] = { LDUNIFRF }, + [13] = { THRSW, LDUNIFRF }, + [14] = { SMIMM_A, }, + [15] = { SMIMM_B, }, + [16] = { LDTLB, }, + [17] = { LDTLBU, }, + [18] = { WRTMUC }, + [19] = { THRSW, WRTMUC }, + [20] = { LDVARY, WRTMUC }, + [21] = { THRSW, LDVARY, WRTMUC }, + [22] = { UCB, }, + /* 23 reserved */ + [24] = { LDUNIFA}, + [25] = { LDUNIFARF }, + /* 26-29 reserved */ + [30] = { SMIMM_C, }, + [31] = { SMIMM_D, }, }; bool @@ -221,7 +264,9 @@ v3d_qpu_sig_unpack(const struct v3d_device_info *devinfo, if (packed_sig >= ARRAY_SIZE(v33_sig_map)) return false; - if (devinfo->ver >= 41) + if (devinfo->ver >= 71) + *sig = v71_sig_map[packed_sig]; + else if (devinfo->ver >= 41) *sig = v41_sig_map[packed_sig]; else if (devinfo->ver == 40) *sig = v40_sig_map[packed_sig]; @@ -240,7 +285,9 @@ v3d_qpu_sig_pack(const struct v3d_device_info *devinfo, { static const struct v3d_qpu_sig *map; - if (devinfo->ver >= 41) + if (devinfo->ver >= 71) + map = v71_sig_map; + else if (devinfo->ver >= 41) map = v41_sig_map; else if (devinfo->ver == 40) map = v40_sig_map; @@ -256,13 +303,6 @@ v3d_qpu_sig_pack(const struct v3d_device_info *devinfo, return false; } -static inline unsigned -fui( float f ) -{ - union {float f; unsigned ui;} fi; - fi.f = f; - return fi.ui; -} static const uint32_t small_immediates[] = { 0, 1, 2, 3, @@ -425,8 +465,13 @@ v3d_qpu_flags_pack(const struct v3d_device_info *devinfo, if (flags_present & MUF) *packed_cond |= cond->muf - V3D_QPU_UF_ANDZ + 4; - if (flags_present & AC) - *packed_cond |= (cond->ac - V3D_QPU_COND_IFA) << 2; + if (flags_present & AC) { + if (*packed_cond & (1 << 6)) + *packed_cond |= cond->ac - V3D_QPU_COND_IFA; + else + *packed_cond |= (cond->ac - + V3D_QPU_COND_IFA) << 2; + } if (flags_present & MC) { if (*packed_cond & (1 << 6)) @@ -445,16 +490,26 @@ v3d_qpu_flags_pack(const struct v3d_device_info *devinfo, /* Make a mapping of the table of opcodes in the spec. The opcode is * determined by a combination of the opcode field, and in the case of 0 or - * 1-arg opcodes, the mux_b field as well. + * 1-arg opcodes, the mux (version <= 42) or raddr (version >= 71) field as + * well. */ -#define MUX_MASK(bot, top) (((1 << (top + 1)) - 1) - ((1 << (bot)) - 1)) -#define ANYMUX MUX_MASK(0, 7) +#define OP_MASK(val) BITFIELD64_BIT(val) +#define OP_RANGE(bot, top) BITFIELD64_RANGE(bot, top - bot + 1) +#define ANYMUX OP_RANGE(0, 7) +#define ANYOPMASK OP_RANGE(0, 63) struct opcode_desc { uint8_t opcode_first; uint8_t opcode_last; - uint8_t mux_b_mask; - uint8_t mux_a_mask; + + union { + struct { + uint8_t b_mask; + uint8_t a_mask; + } mux; + uint64_t raddr_mask; + }; + uint8_t op; /* first_ver == 0 if it's the same across all V3D versions. @@ -467,122 +522,329 @@ struct opcode_desc { uint8_t last_ver; }; -static const struct opcode_desc add_ops[] = { +static const struct opcode_desc add_ops_v33[] = { /* FADD is FADDNF depending on the order of the mux_a/mux_b. 
*/ - { 0, 47, ANYMUX, ANYMUX, V3D_QPU_A_FADD }, - { 0, 47, ANYMUX, ANYMUX, V3D_QPU_A_FADDNF }, - { 53, 55, ANYMUX, ANYMUX, V3D_QPU_A_VFPACK }, - { 56, 56, ANYMUX, ANYMUX, V3D_QPU_A_ADD }, - { 57, 59, ANYMUX, ANYMUX, V3D_QPU_A_VFPACK }, - { 60, 60, ANYMUX, ANYMUX, V3D_QPU_A_SUB }, - { 61, 63, ANYMUX, ANYMUX, V3D_QPU_A_VFPACK }, - { 64, 111, ANYMUX, ANYMUX, V3D_QPU_A_FSUB }, - { 120, 120, ANYMUX, ANYMUX, V3D_QPU_A_MIN }, - { 121, 121, ANYMUX, ANYMUX, V3D_QPU_A_MAX }, - { 122, 122, ANYMUX, ANYMUX, V3D_QPU_A_UMIN }, - { 123, 123, ANYMUX, ANYMUX, V3D_QPU_A_UMAX }, - { 124, 124, ANYMUX, ANYMUX, V3D_QPU_A_SHL }, - { 125, 125, ANYMUX, ANYMUX, V3D_QPU_A_SHR }, - { 126, 126, ANYMUX, ANYMUX, V3D_QPU_A_ASR }, - { 127, 127, ANYMUX, ANYMUX, V3D_QPU_A_ROR }, + { 0, 47, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FADD }, + { 0, 47, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FADDNF }, + { 53, 55, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK }, + { 56, 56, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ADD }, + { 57, 59, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK }, + { 60, 60, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SUB }, + { 61, 63, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK }, + { 64, 111, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FSUB }, + { 120, 120, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_MIN }, + { 121, 121, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_MAX }, + { 122, 122, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_UMIN }, + { 123, 123, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_UMAX }, + { 124, 124, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SHL }, + { 125, 125, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SHR }, + { 126, 126, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ASR }, + { 127, 127, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ROR }, /* FMIN is instead FMAX depending on the order of the mux_a/mux_b. 
*/ - { 128, 175, ANYMUX, ANYMUX, V3D_QPU_A_FMIN }, - { 128, 175, ANYMUX, ANYMUX, V3D_QPU_A_FMAX }, - { 176, 180, ANYMUX, ANYMUX, V3D_QPU_A_VFMIN }, - - { 181, 181, ANYMUX, ANYMUX, V3D_QPU_A_AND }, - { 182, 182, ANYMUX, ANYMUX, V3D_QPU_A_OR }, - { 183, 183, ANYMUX, ANYMUX, V3D_QPU_A_XOR }, - - { 184, 184, ANYMUX, ANYMUX, V3D_QPU_A_VADD }, - { 185, 185, ANYMUX, ANYMUX, V3D_QPU_A_VSUB }, - { 186, 186, 1 << 0, ANYMUX, V3D_QPU_A_NOT }, - { 186, 186, 1 << 1, ANYMUX, V3D_QPU_A_NEG }, - { 186, 186, 1 << 2, ANYMUX, V3D_QPU_A_FLAPUSH }, - { 186, 186, 1 << 3, ANYMUX, V3D_QPU_A_FLBPUSH }, - { 186, 186, 1 << 4, ANYMUX, V3D_QPU_A_FLPOP }, - { 186, 186, 1 << 5, ANYMUX, V3D_QPU_A_RECIP }, - { 186, 186, 1 << 6, ANYMUX, V3D_QPU_A_SETMSF }, - { 186, 186, 1 << 7, ANYMUX, V3D_QPU_A_SETREVF }, - { 187, 187, 1 << 0, 1 << 0, V3D_QPU_A_NOP, 0 }, - { 187, 187, 1 << 0, 1 << 1, V3D_QPU_A_TIDX }, - { 187, 187, 1 << 0, 1 << 2, V3D_QPU_A_EIDX }, - { 187, 187, 1 << 0, 1 << 3, V3D_QPU_A_LR }, - { 187, 187, 1 << 0, 1 << 4, V3D_QPU_A_VFLA }, - { 187, 187, 1 << 0, 1 << 5, V3D_QPU_A_VFLNA }, - { 187, 187, 1 << 0, 1 << 6, V3D_QPU_A_VFLB }, - { 187, 187, 1 << 0, 1 << 7, V3D_QPU_A_VFLNB }, - - { 187, 187, 1 << 1, MUX_MASK(0, 2), V3D_QPU_A_FXCD }, - { 187, 187, 1 << 1, 1 << 3, V3D_QPU_A_XCD }, - { 187, 187, 1 << 1, MUX_MASK(4, 6), V3D_QPU_A_FYCD }, - { 187, 187, 1 << 1, 1 << 7, V3D_QPU_A_YCD }, - - { 187, 187, 1 << 2, 1 << 0, V3D_QPU_A_MSF }, - { 187, 187, 1 << 2, 1 << 1, V3D_QPU_A_REVF }, - { 187, 187, 1 << 2, 1 << 2, V3D_QPU_A_VDWWT, 33 }, - { 187, 187, 1 << 2, 1 << 2, V3D_QPU_A_IID, 40 }, - { 187, 187, 1 << 2, 1 << 3, V3D_QPU_A_SAMPID, 40 }, - { 187, 187, 1 << 2, 1 << 4, V3D_QPU_A_BARRIERID, 40 }, - { 187, 187, 1 << 2, 1 << 5, V3D_QPU_A_TMUWT }, - { 187, 187, 1 << 2, 1 << 6, V3D_QPU_A_VPMWT }, - { 187, 187, 1 << 2, 1 << 7, V3D_QPU_A_FLAFIRST, 41 }, - { 187, 187, 1 << 3, 1 << 0, V3D_QPU_A_FLNAFIRST, 41 }, - { 187, 187, 1 << 3, ANYMUX, V3D_QPU_A_VPMSETUP, 33 }, - - { 188, 188, 1 << 0, ANYMUX, V3D_QPU_A_LDVPMV_IN, 40 }, - { 188, 188, 1 << 0, ANYMUX, V3D_QPU_A_LDVPMV_OUT, 40 }, - { 188, 188, 1 << 1, ANYMUX, V3D_QPU_A_LDVPMD_IN, 40 }, - { 188, 188, 1 << 1, ANYMUX, V3D_QPU_A_LDVPMD_OUT, 40 }, - { 188, 188, 1 << 2, ANYMUX, V3D_QPU_A_LDVPMP, 40 }, - { 188, 188, 1 << 3, ANYMUX, V3D_QPU_A_RSQRT, 41 }, - { 188, 188, 1 << 4, ANYMUX, V3D_QPU_A_EXP, 41 }, - { 188, 188, 1 << 5, ANYMUX, V3D_QPU_A_LOG, 41 }, - { 188, 188, 1 << 6, ANYMUX, V3D_QPU_A_SIN, 41 }, - { 188, 188, 1 << 7, ANYMUX, V3D_QPU_A_RSQRT2, 41 }, - { 189, 189, ANYMUX, ANYMUX, V3D_QPU_A_LDVPMG_IN, 40 }, - { 189, 189, ANYMUX, ANYMUX, V3D_QPU_A_LDVPMG_OUT, 40 }, + { 128, 175, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FMIN }, + { 128, 175, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FMAX }, + { 176, 180, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMIN }, + + { 181, 181, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_AND }, + { 182, 182, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_OR }, + { 183, 183, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_XOR }, + + { 184, 184, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VADD }, + { 185, 185, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VSUB }, + { 186, 186, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_NOT }, + { 186, 186, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_NEG }, + { 186, 186, .mux.b_mask = OP_MASK(2), .mux.a_mask = ANYMUX, V3D_QPU_A_FLAPUSH }, + { 186, 186, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FLBPUSH }, + { 
186, 186, .mux.b_mask = OP_MASK(4), .mux.a_mask = ANYMUX, V3D_QPU_A_FLPOP }, + { 186, 186, .mux.b_mask = OP_MASK(5), .mux.a_mask = ANYMUX, V3D_QPU_A_RECIP }, + { 186, 186, .mux.b_mask = OP_MASK(6), .mux.a_mask = ANYMUX, V3D_QPU_A_SETMSF }, + { 186, 186, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_SETREVF }, + { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(0), V3D_QPU_A_NOP, 0 }, + { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(1), V3D_QPU_A_TIDX }, + { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(2), V3D_QPU_A_EIDX }, + { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(3), V3D_QPU_A_LR }, + { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(4), V3D_QPU_A_VFLA }, + { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(5), V3D_QPU_A_VFLNA }, + { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(6), V3D_QPU_A_VFLB }, + { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(7), V3D_QPU_A_VFLNB }, + + { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_RANGE(0, 2), V3D_QPU_A_FXCD }, + { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_MASK(3), V3D_QPU_A_XCD }, + { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_RANGE(4, 6), V3D_QPU_A_FYCD }, + { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_MASK(7), V3D_QPU_A_YCD }, + + { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(0), V3D_QPU_A_MSF }, + { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(1), V3D_QPU_A_REVF }, + { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(2), V3D_QPU_A_VDWWT, 33 }, + { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(2), V3D_QPU_A_IID, 40 }, + { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(3), V3D_QPU_A_SAMPID, 40 }, + { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(4), V3D_QPU_A_BARRIERID, 40 }, + { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(5), V3D_QPU_A_TMUWT }, + { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(6), V3D_QPU_A_VPMWT }, + { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(7), V3D_QPU_A_FLAFIRST, 41 }, + { 187, 187, .mux.b_mask = OP_MASK(3), .mux.a_mask = OP_MASK(0), V3D_QPU_A_FLNAFIRST, 41 }, + { 187, 187, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_VPMSETUP, 33 }, + + { 188, 188, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMV_IN, 40 }, + { 188, 188, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMV_OUT, 40 }, + { 188, 188, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMD_IN, 40 }, + { 188, 188, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMD_OUT, 40 }, + { 188, 188, .mux.b_mask = OP_MASK(2), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMP, 40 }, + { 188, 188, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_RSQRT, 41 }, + { 188, 188, .mux.b_mask = OP_MASK(4), .mux.a_mask = ANYMUX, V3D_QPU_A_EXP, 41 }, + { 188, 188, .mux.b_mask = OP_MASK(5), .mux.a_mask = ANYMUX, V3D_QPU_A_LOG, 41 }, + { 188, 188, .mux.b_mask = OP_MASK(6), .mux.a_mask = ANYMUX, V3D_QPU_A_SIN, 41 }, + { 188, 188, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_RSQRT2, 41 }, + { 189, 189, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMG_IN, 40 }, + { 189, 189, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMG_OUT, 40 }, /* FIXME: MORE COMPLICATED */ - /* { 190, 191, ANYMUX, ANYMUX, V3D_QPU_A_VFMOVABSNEGNAB }, */ + /* { 190, 191, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMOVABSNEGNAB }, */ - { 192, 239, 
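/* A worked example of the mask scheme on 4.x: the one-source ops crowded
 * onto opcode 187 are told apart by their mux values, so a packed add op of
 * 187 with mux_b == 2 and mux_a == 5 matches only the row whose b_mask
 * contains bit 2 and whose a_mask contains bit 5, i.e. V3D_QPU_A_TMUWT.
 * The match test mirrors lookup_opcode_from_packed() further down:
 */
bool matches = (desc->mux.b_mask & (1 << mux_b)) &&
               (desc->mux.a_mask & (1 << mux_a));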
ANYMUX, ANYMUX, V3D_QPU_A_FCMP }, - { 240, 244, ANYMUX, ANYMUX, V3D_QPU_A_VFMAX }, + { 192, 239, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FCMP }, + { 240, 244, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMAX }, - { 245, 245, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FROUND }, - { 245, 245, 1 << 3, ANYMUX, V3D_QPU_A_FTOIN }, - { 245, 245, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FTRUNC }, - { 245, 245, 1 << 7, ANYMUX, V3D_QPU_A_FTOIZ }, - { 246, 246, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FFLOOR }, - { 246, 246, 1 << 3, ANYMUX, V3D_QPU_A_FTOUZ }, - { 246, 246, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FCEIL }, - { 246, 246, 1 << 7, ANYMUX, V3D_QPU_A_FTOC }, + { 245, 245, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FROUND }, + { 245, 245, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOIN }, + { 245, 245, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FTRUNC }, + { 245, 245, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOIZ }, + { 246, 246, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FFLOOR }, + { 246, 246, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOUZ }, + { 246, 246, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FCEIL }, + { 246, 246, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOC }, - { 247, 247, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FDX }, - { 247, 247, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FDY }, + { 247, 247, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FDX }, + { 247, 247, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FDY }, /* The stvpms are distinguished by the waddr field. */ - { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMV }, - { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMD }, - { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMP }, + { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMV }, + { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMD }, + { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMP }, + + { 252, 252, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_ITOF }, + { 252, 252, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_CLZ }, + { 252, 252, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_UTOF }, +}; + +static const struct opcode_desc mul_ops_v33[] = { + { 1, 1, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_ADD }, + { 2, 2, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_SUB }, + { 3, 3, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_UMUL24 }, + { 4, 8, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_VFMUL }, + { 9, 9, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_SMUL24 }, + { 10, 10, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_MULTOP }, + { 14, 14, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_FMOV, 33, 42 }, + { 15, 15, .mux.b_mask = OP_RANGE(0, 3), ANYMUX, V3D_QPU_M_FMOV, 33, 42}, + { 15, 15, .mux.b_mask = OP_MASK(4), .mux.a_mask = OP_MASK(0), V3D_QPU_M_NOP, 33, 42 }, + { 15, 15, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_M_MOV, 33, 42 }, + + { 16, 63, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_FMUL }, +}; - { 252, 252, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_ITOF }, - { 252, 252, 1 << 3, ANYMUX, V3D_QPU_A_CLZ }, - { 252, 252, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_UTOF }, +/* Note that it would have been possible to define all the add/mul opcodes in + * just one table, using the first_ver/last_ver. 
But given how much changed + * for v71, keeping separate tables is tidier. Also, right + * now we do a linear search on those tables, so this keeps the + * tables smaller. + * + * In case we ever merge the tables, we define first_ver as 71 for the + * opcodes that changed on v71. + */ +static const struct opcode_desc add_ops_v71[] = { + /* FADD is FADDNF depending on the order of the raddr_a/raddr_b. */ + { 0, 47, .raddr_mask = ANYOPMASK, V3D_QPU_A_FADD }, + { 0, 47, .raddr_mask = ANYOPMASK, V3D_QPU_A_FADDNF }, + { 53, 55, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK }, + { 56, 56, .raddr_mask = ANYOPMASK, V3D_QPU_A_ADD }, + { 57, 59, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK }, + { 60, 60, .raddr_mask = ANYOPMASK, V3D_QPU_A_SUB }, + { 61, 63, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK }, + { 64, 111, .raddr_mask = ANYOPMASK, V3D_QPU_A_FSUB }, + { 120, 120, .raddr_mask = ANYOPMASK, V3D_QPU_A_MIN }, + { 121, 121, .raddr_mask = ANYOPMASK, V3D_QPU_A_MAX }, + { 122, 122, .raddr_mask = ANYOPMASK, V3D_QPU_A_UMIN }, + { 123, 123, .raddr_mask = ANYOPMASK, V3D_QPU_A_UMAX }, + { 124, 124, .raddr_mask = ANYOPMASK, V3D_QPU_A_SHL }, + { 125, 125, .raddr_mask = ANYOPMASK, V3D_QPU_A_SHR }, + { 126, 126, .raddr_mask = ANYOPMASK, V3D_QPU_A_ASR }, + { 127, 127, .raddr_mask = ANYOPMASK, V3D_QPU_A_ROR }, + /* FMIN is instead FMAX depending on the raddr_a/b order. */ + { 128, 175, .raddr_mask = ANYOPMASK, V3D_QPU_A_FMIN }, + { 128, 175, .raddr_mask = ANYOPMASK, V3D_QPU_A_FMAX }, + { 176, 180, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFMIN }, + + { 181, 181, .raddr_mask = ANYOPMASK, V3D_QPU_A_AND }, + { 182, 182, .raddr_mask = ANYOPMASK, V3D_QPU_A_OR }, + { 183, 183, .raddr_mask = ANYOPMASK, V3D_QPU_A_XOR }, + { 184, 184, .raddr_mask = ANYOPMASK, V3D_QPU_A_VADD }, + { 185, 185, .raddr_mask = ANYOPMASK, V3D_QPU_A_VSUB }, + + { 186, 186, .raddr_mask = OP_MASK(0), V3D_QPU_A_NOT }, + { 186, 186, .raddr_mask = OP_MASK(1), V3D_QPU_A_NEG }, + { 186, 186, .raddr_mask = OP_MASK(2), V3D_QPU_A_FLAPUSH }, + { 186, 186, .raddr_mask = OP_MASK(3), V3D_QPU_A_FLBPUSH }, + { 186, 186, .raddr_mask = OP_MASK(4), V3D_QPU_A_FLPOP }, + { 186, 186, .raddr_mask = OP_MASK(5), V3D_QPU_A_CLZ }, + { 186, 186, .raddr_mask = OP_MASK(6), V3D_QPU_A_SETMSF }, + { 186, 186, .raddr_mask = OP_MASK(7), V3D_QPU_A_SETREVF }, + + { 187, 187, .raddr_mask = OP_MASK(0), V3D_QPU_A_NOP, 0 }, + { 187, 187, .raddr_mask = OP_MASK(1), V3D_QPU_A_TIDX }, + { 187, 187, .raddr_mask = OP_MASK(2), V3D_QPU_A_EIDX }, + { 187, 187, .raddr_mask = OP_MASK(3), V3D_QPU_A_LR }, + { 187, 187, .raddr_mask = OP_MASK(4), V3D_QPU_A_VFLA }, + { 187, 187, .raddr_mask = OP_MASK(5), V3D_QPU_A_VFLNA }, + { 187, 187, .raddr_mask = OP_MASK(6), V3D_QPU_A_VFLB }, + { 187, 187, .raddr_mask = OP_MASK(7), V3D_QPU_A_VFLNB }, + { 187, 187, .raddr_mask = OP_MASK(8), V3D_QPU_A_XCD }, + { 187, 187, .raddr_mask = OP_MASK(9), V3D_QPU_A_YCD }, + { 187, 187, .raddr_mask = OP_MASK(10), V3D_QPU_A_MSF }, + { 187, 187, .raddr_mask = OP_MASK(11), V3D_QPU_A_REVF }, + { 187, 187, .raddr_mask = OP_MASK(12), V3D_QPU_A_IID }, + { 187, 187, .raddr_mask = OP_MASK(13), V3D_QPU_A_SAMPID }, + { 187, 187, .raddr_mask = OP_MASK(14), V3D_QPU_A_BARRIERID }, + { 187, 187, .raddr_mask = OP_MASK(15), V3D_QPU_A_TMUWT }, + { 187, 187, .raddr_mask = OP_MASK(16), V3D_QPU_A_VPMWT }, + { 187, 187, .raddr_mask = OP_MASK(17), V3D_QPU_A_FLAFIRST }, + { 187, 187, .raddr_mask = OP_MASK(18), V3D_QPU_A_FLNAFIRST }, + + { 187, 187, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_FXCD }, + { 187, 187, .raddr_mask = 
OP_RANGE(36, 38), V3D_QPU_A_FYCD }, + + { 188, 188, .raddr_mask = OP_MASK(0), V3D_QPU_A_LDVPMV_IN, 71 }, + { 188, 188, .raddr_mask = OP_MASK(1), V3D_QPU_A_LDVPMD_IN, 71 }, + { 188, 188, .raddr_mask = OP_MASK(2), V3D_QPU_A_LDVPMP, 71 }, + + { 188, 188, .raddr_mask = OP_MASK(32), V3D_QPU_A_RECIP, 71 }, + { 188, 188, .raddr_mask = OP_MASK(33), V3D_QPU_A_RSQRT, 71 }, + { 188, 188, .raddr_mask = OP_MASK(34), V3D_QPU_A_EXP, 71 }, + { 188, 188, .raddr_mask = OP_MASK(35), V3D_QPU_A_LOG, 71 }, + { 188, 188, .raddr_mask = OP_MASK(36), V3D_QPU_A_SIN, 71 }, + { 188, 188, .raddr_mask = OP_MASK(37), V3D_QPU_A_RSQRT2, 71 }, + { 188, 188, .raddr_mask = OP_MASK(38), V3D_QPU_A_BALLOT, 71 }, + { 188, 188, .raddr_mask = OP_MASK(39), V3D_QPU_A_BCASTF, 71 }, + { 188, 188, .raddr_mask = OP_MASK(40), V3D_QPU_A_ALLEQ, 71 }, + { 188, 188, .raddr_mask = OP_MASK(41), V3D_QPU_A_ALLFEQ, 71 }, + + { 189, 189, .raddr_mask = ANYOPMASK, V3D_QPU_A_LDVPMG_IN, 71 }, + + /* The stvpms are distinguished by the waddr field. */ + { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMV, 71}, + { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMD, 71}, + { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMP, 71}, + + { 192, 207, .raddr_mask = ANYOPMASK, V3D_QPU_A_FCMP, 71 }, + + { 245, 245, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FROUND, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FROUND, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FROUND, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FROUND, 71 }, + + { 245, 245, .raddr_mask = OP_MASK(3), V3D_QPU_A_FTOIN, 71 }, + { 245, 245, .raddr_mask = OP_MASK(7), V3D_QPU_A_FTOIN, 71 }, + { 245, 245, .raddr_mask = OP_MASK(11), V3D_QPU_A_FTOIN, 71 }, + { 245, 245, .raddr_mask = OP_MASK(15), V3D_QPU_A_FTOIN, 71 }, + + { 245, 245, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FTRUNC, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FTRUNC, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FTRUNC, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(28, 30), V3D_QPU_A_FTRUNC, 71 }, + + { 245, 245, .raddr_mask = OP_MASK(19), V3D_QPU_A_FTOIZ, 71 }, + { 245, 245, .raddr_mask = OP_MASK(23), V3D_QPU_A_FTOIZ, 71 }, + { 245, 245, .raddr_mask = OP_MASK(27), V3D_QPU_A_FTOIZ, 71 }, + { 245, 245, .raddr_mask = OP_MASK(31), V3D_QPU_A_FTOIZ, 71 }, + + { 245, 245, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_FFLOOR, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_FFLOOR, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(40, 42), V3D_QPU_A_FFLOOR, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(44, 46), V3D_QPU_A_FFLOOR, 71 }, + + { 245, 245, .raddr_mask = OP_MASK(35), V3D_QPU_A_FTOUZ, 71 }, + { 245, 245, .raddr_mask = OP_MASK(39), V3D_QPU_A_FTOUZ, 71 }, + { 245, 245, .raddr_mask = OP_MASK(43), V3D_QPU_A_FTOUZ, 71 }, + { 245, 245, .raddr_mask = OP_MASK(47), V3D_QPU_A_FTOUZ, 71 }, + + { 245, 245, .raddr_mask = OP_RANGE(48, 50), V3D_QPU_A_FCEIL, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(52, 54), V3D_QPU_A_FCEIL, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(56, 58), V3D_QPU_A_FCEIL, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(60, 62), V3D_QPU_A_FCEIL, 71 }, + + { 245, 245, .raddr_mask = OP_MASK(51), V3D_QPU_A_FTOC }, + { 245, 245, .raddr_mask = OP_MASK(55), V3D_QPU_A_FTOC }, + { 245, 245, .raddr_mask = OP_MASK(59), V3D_QPU_A_FTOC }, + { 245, 245, .raddr_mask = OP_MASK(63), V3D_QPU_A_FTOC }, + + { 246, 246, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FDX, 71 }, + { 246, 246, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FDX, 71 }, + { 246, 246, .raddr_mask = OP_RANGE(8, 
10), V3D_QPU_A_FDX, 71 }, + { 246, 246, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FDX, 71 }, + { 246, 246, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FDY, 71 }, + { 246, 246, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FDY, 71 }, + { 246, 246, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FDY, 71 }, + { 246, 246, .raddr_mask = OP_RANGE(28, 30), V3D_QPU_A_FDY, 71 }, + + { 246, 246, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_ITOF, 71 }, + { 246, 246, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_UTOF, 71 }, + + { 247, 247, .raddr_mask = ANYOPMASK, V3D_QPU_A_VPACK, 71 }, + { 248, 248, .raddr_mask = ANYOPMASK, V3D_QPU_A_V8PACK, 71 }, + + { 249, 249, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FMOV, 71 }, + { 249, 249, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FMOV, 71 }, + { 249, 249, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FMOV, 71 }, + { 249, 249, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FMOV, 71 }, + { 249, 249, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FMOV, 71 }, + { 249, 249, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FMOV, 71 }, + { 249, 249, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FMOV, 71 }, + + { 249, 249, .raddr_mask = OP_MASK(3), V3D_QPU_A_MOV, 71 }, + { 249, 249, .raddr_mask = OP_MASK(7), V3D_QPU_A_MOV, 71 }, + { 249, 249, .raddr_mask = OP_MASK(11), V3D_QPU_A_MOV, 71 }, + { 249, 249, .raddr_mask = OP_MASK(15), V3D_QPU_A_MOV, 71 }, + { 249, 249, .raddr_mask = OP_MASK(19), V3D_QPU_A_MOV, 71 }, + + { 250, 250, .raddr_mask = ANYOPMASK, V3D_QPU_A_V10PACK, 71 }, + { 251, 251, .raddr_mask = ANYOPMASK, V3D_QPU_A_V11FPACK, 71 }, + + { 252, 252, .raddr_mask = ANYOPMASK, V3D_QPU_A_ROTQ, 71 }, + { 253, 253, .raddr_mask = ANYOPMASK, V3D_QPU_A_ROT, 71 }, + { 254, 254, .raddr_mask = ANYOPMASK, V3D_QPU_A_SHUFFLE, 71 }, }; -static const struct opcode_desc mul_ops[] = { - { 1, 1, ANYMUX, ANYMUX, V3D_QPU_M_ADD }, - { 2, 2, ANYMUX, ANYMUX, V3D_QPU_M_SUB }, - { 3, 3, ANYMUX, ANYMUX, V3D_QPU_M_UMUL24 }, - { 4, 8, ANYMUX, ANYMUX, V3D_QPU_M_VFMUL }, - { 9, 9, ANYMUX, ANYMUX, V3D_QPU_M_SMUL24 }, - { 10, 10, ANYMUX, ANYMUX, V3D_QPU_M_MULTOP }, - { 14, 14, ANYMUX, ANYMUX, V3D_QPU_M_FMOV }, - { 15, 15, MUX_MASK(0, 3), ANYMUX, V3D_QPU_M_FMOV }, - { 15, 15, 1 << 4, 1 << 0, V3D_QPU_M_NOP, 0 }, - { 15, 15, 1 << 7, ANYMUX, V3D_QPU_M_MOV }, - { 16, 63, ANYMUX, ANYMUX, V3D_QPU_M_FMUL }, +static const struct opcode_desc mul_ops_v71[] = { + /* For V3D 7.1, the second (mux) mask field is ignored */ + { 1, 1, .raddr_mask = ANYOPMASK, V3D_QPU_M_ADD, 71 }, + { 2, 2, .raddr_mask = ANYOPMASK, V3D_QPU_M_SUB, 71 }, + { 3, 3, .raddr_mask = ANYOPMASK, V3D_QPU_M_UMUL24, 71 }, + { 4, 8, .raddr_mask = ANYOPMASK, V3D_QPU_M_VFMUL, 71 }, + { 9, 9, .raddr_mask = ANYOPMASK, V3D_QPU_M_SMUL24, 71 }, + { 10, 10, .raddr_mask = ANYOPMASK, V3D_QPU_M_MULTOP, 71 }, + + { 14, 14, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_M_FMOV, 71 }, + { 14, 14, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_M_FMOV, 71 }, + { 14, 14, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_M_FMOV, 71 }, + { 14, 14, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_M_FMOV, 71 }, + { 14, 14, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_M_FMOV, 71 }, + { 14, 14, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_M_FMOV, 71 }, + + { 14, 14, .raddr_mask = OP_MASK(3), V3D_QPU_M_MOV, 71 }, + { 14, 14, .raddr_mask = OP_MASK(7), V3D_QPU_M_MOV, 71 }, + { 14, 14, .raddr_mask = OP_MASK(11), V3D_QPU_M_MOV, 71 }, + { 14, 14, .raddr_mask = OP_MASK(15), V3D_QPU_M_MOV, 71 }, + { 14, 14, .raddr_mask = OP_MASK(19), V3D_QPU_M_MOV, 71 }, + + { 14, 14, .raddr_mask = OP_MASK(32), 
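/* The FMOV/MOV rows above encode the modifiers of the mul op directly in
 * raddr_d: bits 0-1 hold the fmov output pack (the value 3 there selects
 * mov instead) and the bits above hold the input unpack. A worked example,
 * using the integer-unpack encoding added below (UL packs to 1):
 *
 *     mov with a .ul input unpack  =>  raddr_d = (1 << 2) | 3 = 7
 *
 * which is exactly why the OP_MASK(7) row above decodes to V3D_QPU_M_MOV.
 */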
V3D_QPU_M_FTOUNORM16, 71 }, + { 14, 14, .raddr_mask = OP_MASK(33), V3D_QPU_M_FTOSNORM16, 71 }, + { 14, 14, .raddr_mask = OP_MASK(34), V3D_QPU_M_VFTOUNORM8, 71 }, + { 14, 14, .raddr_mask = OP_MASK(35), V3D_QPU_M_VFTOSNORM8, 71 }, + { 14, 14, .raddr_mask = OP_MASK(48), V3D_QPU_M_VFTOUNORM10LO, 71 }, + { 14, 14, .raddr_mask = OP_MASK(49), V3D_QPU_M_VFTOUNORM10HI, 71 }, + + { 14, 14, .raddr_mask = OP_MASK(63), V3D_QPU_M_NOP, 71 }, + + { 16, 63, .raddr_mask = ANYOPMASK, V3D_QPU_M_FMUL }, }; /* Returns true if op_desc should be filtered out based on devinfo->ver @@ -591,17 +853,23 @@ static const struct opcode_desc mul_ops[] = { */ static bool opcode_invalid_in_version(const struct v3d_device_info *devinfo, - const struct opcode_desc *op_desc) + const uint8_t first_ver, + const uint8_t last_ver) { - return (op_desc->first_ver != 0 && devinfo->ver < op_desc->first_ver) || - (op_desc->last_ver != 0 && devinfo->ver > op_desc->last_ver); + return (first_ver != 0 && devinfo->ver < first_ver) || + (last_ver != 0 && devinfo->ver > last_ver); } +/* Note that we pass mux_a, mux_b and raddr as parameters even though, + * depending on devinfo->ver, some of them are ignored. We do it this way + * to avoid having two nearly identical lookup_opcode methods. + */ static const struct opcode_desc * lookup_opcode_from_packed(const struct v3d_device_info *devinfo, const struct opcode_desc *opcodes, size_t num_opcodes, uint32_t opcode, - uint32_t mux_a, uint32_t mux_b) + uint32_t mux_a, uint32_t mux_b, + uint32_t raddr) { for (int i = 0; i < num_opcodes; i++) { const struct opcode_desc *op_desc = &opcodes[i]; @@ -610,14 +878,19 @@ lookup_opcode_from_packed(const struct v3d_device_info *devinfo, opcode > op_desc->opcode_last) continue; - if (opcode_invalid_in_version(devinfo, op_desc)) + if (opcode_invalid_in_version(devinfo, op_desc->first_ver, op_desc->last_ver)) continue; - if (!(op_desc->mux_b_mask & (1 << mux_b))) - continue; + if (devinfo->ver < 71) { + if (!(op_desc->mux.b_mask & (1 << mux_b))) + continue; - if (!(op_desc->mux_a_mask & (1 << mux_a))) - continue; + if (!(op_desc->mux.a_mask & (1 << mux_a))) + continue; + } else { + if (!(op_desc->raddr_mask & ((uint64_t) 1 << raddr))) + continue; + } return op_desc; } @@ -670,6 +943,56 @@ v3d_qpu_float32_unpack_pack(enum v3d_qpu_input_unpack unpacked, } static bool +v3d_qpu_int32_unpack_unpack(uint32_t packed, + enum v3d_qpu_input_unpack *unpacked) +{ + switch (packed) { + case 0: + *unpacked = V3D_QPU_UNPACK_NONE; + return true; + case 1: + *unpacked = V3D_QPU_UNPACK_UL; + return true; + case 2: + *unpacked = V3D_QPU_UNPACK_UH; + return true; + case 3: + *unpacked = V3D_QPU_UNPACK_IL; + return true; + case 4: + *unpacked = V3D_QPU_UNPACK_IH; + return true; + default: + return false; + } +} + +static bool +v3d_qpu_int32_unpack_pack(enum v3d_qpu_input_unpack unpacked, + uint32_t *packed) +{ + switch (unpacked) { + case V3D_QPU_UNPACK_NONE: + *packed = 0; + return true; + case V3D_QPU_UNPACK_UL: + *packed = 1; + return true; + case V3D_QPU_UNPACK_UH: + *packed = 2; + return true; + case V3D_QPU_UNPACK_IL: + *packed = 3; + return true; + case V3D_QPU_UNPACK_IH: + *packed = 4; + return true; + default: + return false; + } +} + +static bool v3d_qpu_float16_unpack_unpack(uint32_t packed, enum v3d_qpu_input_unpack *unpacked) { @@ -720,10 +1043,10 @@ v3d_qpu_float16_unpack_pack(enum v3d_qpu_input_unpack unpacked, } static bool -v3d_qpu_float32_pack_pack(enum v3d_qpu_input_unpack unpacked, +v3d_qpu_float32_pack_pack(enum v3d_qpu_output_pack pack, uint32_t *packed) { - 
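/* The two integer-unpack helpers above are exact inverses of each other;
 * a quick self-check one could write (hypothetical test code, not part of
 * this patch):
 */
for (uint32_t p = 0; p <= 4; p++) {
        enum v3d_qpu_input_unpack unpacked;
        uint32_t repacked;

        assert(v3d_qpu_int32_unpack_unpack(p, &unpacked));
        assert(v3d_qpu_int32_unpack_pack(unpacked, &repacked));
        assert(repacked == p);
}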
switch (unpacked) { + switch (pack) { case V3D_QPU_PACK_NONE: *packed = 0; return true; @@ -739,8 +1062,8 @@ v3d_qpu_float32_pack_pack(enum v3d_qpu_input_unpack unpacked, } static bool -v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, - struct v3d_qpu_instr *instr) +v3d33_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + struct v3d_qpu_instr *instr) { uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_ADD); uint32_t mux_a = QPU_GET_FIELD(packed_inst, V3D_QPU_ADD_A); @@ -757,8 +1080,9 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, map_op = (map_op - 253 + 245); const struct opcode_desc *desc = - lookup_opcode_from_packed(devinfo, add_ops, ARRAY_SIZE(add_ops), - map_op, mux_a, mux_b); + lookup_opcode_from_packed(devinfo, add_ops_v33, + ARRAY_SIZE(add_ops_v33), + map_op, mux_a, mux_b, 0); if (!desc) return false; @@ -814,12 +1138,12 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, instr->alu.add.output_pack = V3D_QPU_PACK_NONE; if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, - &instr->alu.add.a_unpack)) { + &instr->alu.add.a.unpack)) { return false; } if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3, - &instr->alu.add.b_unpack)) { + &instr->alu.add.b.unpack)) { return false; } break; @@ -833,7 +1157,7 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, instr->alu.add.output_pack = mux_b & 0x3; if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, - &instr->alu.add.a_unpack)) { + &instr->alu.add.a.unpack)) { return false; } break; @@ -845,7 +1169,7 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, instr->alu.add.output_pack = V3D_QPU_PACK_NONE; if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, - &instr->alu.add.a_unpack)) { + &instr->alu.add.a.unpack)) { return false; } break; @@ -853,23 +1177,23 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, case V3D_QPU_A_VFMIN: case V3D_QPU_A_VFMAX: if (!v3d_qpu_float16_unpack_unpack(op & 0x7, - &instr->alu.add.a_unpack)) { + &instr->alu.add.a.unpack)) { return false; } instr->alu.add.output_pack = V3D_QPU_PACK_NONE; - instr->alu.add.b_unpack = V3D_QPU_UNPACK_NONE; + instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; break; default: instr->alu.add.output_pack = V3D_QPU_PACK_NONE; - instr->alu.add.a_unpack = V3D_QPU_UNPACK_NONE; - instr->alu.add.b_unpack = V3D_QPU_UNPACK_NONE; + instr->alu.add.a.unpack = V3D_QPU_UNPACK_NONE; + instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; break; } - instr->alu.add.a = mux_a; - instr->alu.add.b = mux_b; + instr->alu.add.a.mux = mux_a; + instr->alu.add.b.mux = mux_b; instr->alu.add.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A); instr->alu.add.magic_write = false; @@ -894,18 +1218,205 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, } static bool -v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, +v3d71_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + struct v3d_qpu_instr *instr) +{ + uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_ADD); + uint32_t raddr_a = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_A); + uint32_t raddr_b = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_B); + uint32_t waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A); + uint32_t map_op = op; + + const struct opcode_desc *desc = + lookup_opcode_from_packed(devinfo, + add_ops_v71, + ARRAY_SIZE(add_ops_v71), + map_op, 0, 0, + raddr_b); + 
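/* On 7.x the same lookup helper is reused with the mux arguments zeroed
 * and raddr_b acting as the disambiguator. For example (row taken from
 * add_ops_v71 above; illustrative only, assuming devinfo->ver >= 71):
 */
const struct opcode_desc *clz_desc =
        lookup_opcode_from_packed(devinfo, add_ops_v71,
                                  ARRAY_SIZE(add_ops_v71),
                                  186, 0, 0, 5 /* raddr_b */);
/* clz_desc->op == V3D_QPU_A_CLZ, per the OP_MASK(5) row at opcode 186. */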
if (!desc) + return false; + + instr->alu.add.op = desc->op; + + /* FADD/FADDNF and FMIN/FMAX are determined by the order of the + * operands. + */ + if (instr->sig.small_imm_a * 256 + ((op >> 2) & 3) * 64 + raddr_a > + instr->sig.small_imm_b * 256 + (op & 3) * 64 + raddr_b) { + if (instr->alu.add.op == V3D_QPU_A_FMIN) + instr->alu.add.op = V3D_QPU_A_FMAX; + if (instr->alu.add.op == V3D_QPU_A_FADD) + instr->alu.add.op = V3D_QPU_A_FADDNF; + } + + /* Some QPU ops require a bit more than just basic opcode and mux a/b + * comparisons to distinguish them. + */ + switch (instr->alu.add.op) { + case V3D_QPU_A_STVPMV: + case V3D_QPU_A_STVPMD: + case V3D_QPU_A_STVPMP: + switch (waddr) { + case 0: + instr->alu.add.op = V3D_QPU_A_STVPMV; + break; + case 1: + instr->alu.add.op = V3D_QPU_A_STVPMD; + break; + case 2: + instr->alu.add.op = V3D_QPU_A_STVPMP; + break; + default: + return false; + } + break; + default: + break; + } + + switch (instr->alu.add.op) { + case V3D_QPU_A_FADD: + case V3D_QPU_A_FADDNF: + case V3D_QPU_A_FSUB: + case V3D_QPU_A_FMIN: + case V3D_QPU_A_FMAX: + case V3D_QPU_A_FCMP: + case V3D_QPU_A_VFPACK: + if (instr->alu.add.op != V3D_QPU_A_VFPACK && + instr->alu.add.op != V3D_QPU_A_FCMP) { + instr->alu.add.output_pack = (op >> 4) & 0x3; + } else { + instr->alu.add.output_pack = V3D_QPU_PACK_NONE; + } + + if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, + &instr->alu.add.a.unpack)) { + return false; + } + + if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3, + &instr->alu.add.b.unpack)) { + return false; + } + break; + + case V3D_QPU_A_FFLOOR: + case V3D_QPU_A_FROUND: + case V3D_QPU_A_FTRUNC: + case V3D_QPU_A_FCEIL: + case V3D_QPU_A_FDX: + case V3D_QPU_A_FDY: + instr->alu.add.output_pack = raddr_b & 0x3; + + if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, + &instr->alu.add.a.unpack)) { + return false; + } + break; + + case V3D_QPU_A_FTOIN: + case V3D_QPU_A_FTOIZ: + case V3D_QPU_A_FTOUZ: + case V3D_QPU_A_FTOC: + instr->alu.add.output_pack = V3D_QPU_PACK_NONE; + + if (!v3d_qpu_float32_unpack_unpack((raddr_b >> 2) & 0x3, + &instr->alu.add.a.unpack)) { + return false; + } + break; + + case V3D_QPU_A_VFMIN: + case V3D_QPU_A_VFMAX: + unreachable("pending v71 update"); + if (!v3d_qpu_float16_unpack_unpack(op & 0x7, + &instr->alu.add.a.unpack)) { + return false; + } + + instr->alu.add.output_pack = V3D_QPU_PACK_NONE; + instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; + break; + + case V3D_QPU_A_MOV: + instr->alu.add.output_pack = V3D_QPU_PACK_NONE; + + if (!v3d_qpu_int32_unpack_unpack((raddr_b >> 2) & 0x7, + &instr->alu.add.a.unpack)) { + return false; + } + break; + + case V3D_QPU_A_FMOV: + instr->alu.add.output_pack = raddr_b & 0x3; + + /* Mul alu FMOV has one additional variant */ + int32_t unpack = (raddr_b >> 2) & 0x7; + if (unpack == 7) + return false; + + if (!v3d_qpu_float32_unpack_unpack(unpack, + &instr->alu.add.a.unpack)) { + return false; + } + break; + + default: + instr->alu.add.output_pack = V3D_QPU_PACK_NONE; + instr->alu.add.a.unpack = V3D_QPU_UNPACK_NONE; + instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; + break; + } + + instr->alu.add.a.raddr = raddr_a; + instr->alu.add.b.raddr = raddr_b; + instr->alu.add.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A); + + instr->alu.add.magic_write = false; + if (packed_inst & V3D_QPU_MA) { + switch (instr->alu.add.op) { + case V3D_QPU_A_LDVPMV_IN: + instr->alu.add.op = V3D_QPU_A_LDVPMV_OUT; + break; + case V3D_QPU_A_LDVPMD_IN: + instr->alu.add.op = V3D_QPU_A_LDVPMD_OUT; + break; + case V3D_QPU_A_LDVPMG_IN: + instr->alu.add.op = 
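/* To make the ordering rule above concrete: each operand is ranked by the
 * key small_imm * 256 + unpack * 64 + raddr. E.g. with no small immediates
 * and both unpacks NONE, a packed fmin whose raddr_a > raddr_b decodes as
 * fmax, and a packed fadd decodes as faddnf (worked example, not code from
 * the patch):
 *
 *     key_a = 0 * 256 + 0 * 64 + raddr_a
 *     key_b = 0 * 256 + 0 * 64 + raddr_b
 *     key_a > key_b  =>  FMIN -> FMAX, FADD -> FADDNF
 */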
V3D_QPU_A_LDVPMG_OUT; + break; + default: + instr->alu.add.magic_write = true; + break; + } + } + + return true; +} + +static bool +v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, struct v3d_qpu_instr *instr) { + if (devinfo->ver < 71) + return v3d33_qpu_add_unpack(devinfo, packed_inst, instr); + else + return v3d71_qpu_add_unpack(devinfo, packed_inst, instr); +} + +static bool +v3d33_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + struct v3d_qpu_instr *instr) +{ uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_MUL); uint32_t mux_a = QPU_GET_FIELD(packed_inst, V3D_QPU_MUL_A); uint32_t mux_b = QPU_GET_FIELD(packed_inst, V3D_QPU_MUL_B); { const struct opcode_desc *desc = - lookup_opcode_from_packed(devinfo, mul_ops, - ARRAY_SIZE(mul_ops), - op, mux_a, mux_b); + lookup_opcode_from_packed(devinfo, + mul_ops_v33, + ARRAY_SIZE(mul_ops_v33), + op, mux_a, mux_b, 0); if (!desc) return false; @@ -917,12 +1428,12 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, instr->alu.mul.output_pack = ((op >> 4) & 0x3) - 1; if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, - &instr->alu.mul.a_unpack)) { + &instr->alu.mul.a.unpack)) { return false; } if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3, - &instr->alu.mul.b_unpack)) { + &instr->alu.mul.b.unpack)) { return false; } @@ -933,7 +1444,7 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, ((mux_b >> 2) & 1)); if (!v3d_qpu_float32_unpack_unpack(mux_b & 0x3, - &instr->alu.mul.a_unpack)) { + &instr->alu.mul.a.unpack)) { return false; } @@ -943,29 +1454,123 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, instr->alu.mul.output_pack = V3D_QPU_PACK_NONE; if (!v3d_qpu_float16_unpack_unpack(((op & 0x7) - 4) & 7, - &instr->alu.mul.a_unpack)) { + &instr->alu.mul.a.unpack)) { return false; } - instr->alu.mul.b_unpack = V3D_QPU_UNPACK_NONE; + instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; break; default: instr->alu.mul.output_pack = V3D_QPU_PACK_NONE; - instr->alu.mul.a_unpack = V3D_QPU_UNPACK_NONE; - instr->alu.mul.b_unpack = V3D_QPU_UNPACK_NONE; + instr->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE; + instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; break; } - instr->alu.mul.a = mux_a; - instr->alu.mul.b = mux_b; + instr->alu.mul.a.mux = mux_a; + instr->alu.mul.b.mux = mux_b; instr->alu.mul.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_M); instr->alu.mul.magic_write = packed_inst & V3D_QPU_MM; return true; } +static bool +v3d71_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + struct v3d_qpu_instr *instr) +{ + uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_MUL); + uint32_t raddr_c = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_C); + uint32_t raddr_d = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_D); + + { + const struct opcode_desc *desc = + lookup_opcode_from_packed(devinfo, + mul_ops_v71, + ARRAY_SIZE(mul_ops_v71), + op, 0, 0, + raddr_d); + if (!desc) + return false; + + instr->alu.mul.op = desc->op; + } + + switch (instr->alu.mul.op) { + case V3D_QPU_M_FMUL: + instr->alu.mul.output_pack = ((op >> 4) & 0x3) - 1; + + if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, + &instr->alu.mul.a.unpack)) { + return false; + } + + if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3, + &instr->alu.mul.b.unpack)) { + return false; + } + + break; + + case V3D_QPU_M_FMOV: + instr->alu.mul.output_pack = raddr_d & 0x3; + + if (!v3d_qpu_float32_unpack_unpack((raddr_d >> 2) & 0x7, + 
&instr->alu.mul.a.unpack)) { + return false; + } + + break; + + case V3D_QPU_M_VFMUL: + unreachable("pending v71 update"); + instr->alu.mul.output_pack = V3D_QPU_PACK_NONE; + + if (!v3d_qpu_float16_unpack_unpack(((op & 0x7) - 4) & 7, + &instr->alu.mul.a.unpack)) { + return false; + } + + instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; + + break; + + case V3D_QPU_M_MOV: + instr->alu.mul.output_pack = V3D_QPU_PACK_NONE; + + if (!v3d_qpu_int32_unpack_unpack((raddr_d >> 2) & 0x7, + &instr->alu.mul.a.unpack)) { + return false; + } + break; + + default: + instr->alu.mul.output_pack = V3D_QPU_PACK_NONE; + instr->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE; + instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; + break; + } + + instr->alu.mul.a.raddr = raddr_c; + instr->alu.mul.b.raddr = raddr_d; + instr->alu.mul.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_M); + instr->alu.mul.magic_write = packed_inst & V3D_QPU_MM; + + return true; +} + +static bool +v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + struct v3d_qpu_instr *instr) +{ + if (devinfo->ver < 71) + return v3d33_qpu_mul_unpack(devinfo, packed_inst, instr); + else + return v3d71_qpu_mul_unpack(devinfo, packed_inst, instr); +} + static const struct opcode_desc * lookup_opcode_from_instr(const struct v3d_device_info *devinfo, const struct opcode_desc *opcodes, size_t num_opcodes, @@ -977,7 +1582,7 @@ lookup_opcode_from_instr(const struct v3d_device_info *devinfo, if (op_desc->op != op) continue; - if (opcode_invalid_in_version(devinfo, op_desc)) + if (opcode_invalid_in_version(devinfo, op_desc->first_ver, op_desc->last_ver)) continue; return op_desc; @@ -987,15 +1592,16 @@ lookup_opcode_from_instr(const struct v3d_device_info *devinfo, } static bool -v3d_qpu_add_pack(const struct v3d_device_info *devinfo, - const struct v3d_qpu_instr *instr, uint64_t *packed_instr) +v3d33_qpu_add_pack(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *instr, uint64_t *packed_instr) { uint32_t waddr = instr->alu.add.waddr; - uint32_t mux_a = instr->alu.add.a; - uint32_t mux_b = instr->alu.add.b; + uint32_t mux_a = instr->alu.add.a.mux; + uint32_t mux_b = instr->alu.add.b.mux; int nsrc = v3d_qpu_add_op_num_src(instr->alu.add.op); const struct opcode_desc *desc = - lookup_opcode_from_instr(devinfo, add_ops, ARRAY_SIZE(add_ops), + lookup_opcode_from_instr(devinfo, add_ops_v33, + ARRAY_SIZE(add_ops_v33), instr->alu.add.op); if (!desc) @@ -1007,10 +1613,10 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, * identify the operation type. 
*/ if (nsrc < 2) - mux_b = ffs(desc->mux_b_mask) - 1; + mux_b = ffs(desc->mux.b_mask) - 1; if (nsrc < 1) - mux_a = ffs(desc->mux_a_mask) - 1; + mux_a = ffs(desc->mux.a_mask) - 1; bool no_magic_write = false; @@ -1063,12 +1669,12 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, } opcode |= output_pack << 4; - if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack, + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, &a_unpack)) { return false; } - if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b_unpack, + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack, &b_unpack)) { return false; } @@ -1102,23 +1708,23 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, uint32_t a_unpack; uint32_t b_unpack; - if (instr->alu.add.a_unpack == V3D_QPU_UNPACK_ABS || - instr->alu.add.b_unpack == V3D_QPU_UNPACK_ABS) { + if (instr->alu.add.a.unpack == V3D_QPU_UNPACK_ABS || + instr->alu.add.b.unpack == V3D_QPU_UNPACK_ABS) { return false; } - if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack, + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, &a_unpack)) { return false; } - if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b_unpack, + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack, &b_unpack)) { return false; } - opcode = (opcode & ~(1 << 2)) | (a_unpack << 2); - opcode = (opcode & ~(1 << 0)) | (b_unpack << 0); + opcode = (opcode & ~(0x3 << 2)) | (a_unpack << 2); + opcode = (opcode & ~(0x3 << 0)) | (b_unpack << 0); break; } @@ -1137,13 +1743,13 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, } mux_b |= packed; - if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack, + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, &packed)) { return false; } if (packed == 0) return false; - opcode = (opcode & ~(1 << 2)) | packed << 2; + opcode = (opcode & ~(0x3 << 2)) | packed << 2; break; } @@ -1155,7 +1761,7 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, return false; uint32_t packed; - if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack, + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, &packed)) { return false; } @@ -1168,11 +1774,11 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, case V3D_QPU_A_VFMIN: case V3D_QPU_A_VFMAX: if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE || - instr->alu.add.b_unpack != V3D_QPU_UNPACK_NONE) { + instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE) { return false; } - if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a_unpack, + if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a.unpack, &packed)) { return false; } @@ -1182,8 +1788,8 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, default: if (instr->alu.add.op != V3D_QPU_A_NOP && (instr->alu.add.output_pack != V3D_QPU_PACK_NONE || - instr->alu.add.a_unpack != V3D_QPU_UNPACK_NONE || - instr->alu.add.b_unpack != V3D_QPU_UNPACK_NONE)) { + instr->alu.add.a.unpack != V3D_QPU_UNPACK_NONE || + instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE)) { return false; } break; @@ -1200,15 +1806,280 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, } static bool -v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, - const struct v3d_qpu_instr *instr, uint64_t *packed_instr) +v3d71_qpu_add_pack(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *instr, uint64_t *packed_instr) { - uint32_t mux_a = instr->alu.mul.a; - uint32_t mux_b = instr->alu.mul.b; + uint32_t waddr = instr->alu.add.waddr; + uint32_t raddr_a = instr->alu.add.a.raddr; + uint32_t raddr_b = instr->alu.add.b.raddr; + + int nsrc = 
v3d_qpu_add_op_num_src(instr->alu.add.op); + const struct opcode_desc *desc = + lookup_opcode_from_instr(devinfo, add_ops_v71, + ARRAY_SIZE(add_ops_v71), + instr->alu.add.op); + if (!desc) + return false; + + uint32_t opcode = desc->opcode_first; + + /* If an operation doesn't use an arg, its raddr values may be used to + * identify the operation type. + */ + if (nsrc < 2) + raddr_b = ffsll(desc->raddr_mask) - 1; + + bool no_magic_write = false; + + switch (instr->alu.add.op) { + case V3D_QPU_A_STVPMV: + waddr = 0; + no_magic_write = true; + break; + case V3D_QPU_A_STVPMD: + waddr = 1; + no_magic_write = true; + break; + case V3D_QPU_A_STVPMP: + waddr = 2; + no_magic_write = true; + break; + + case V3D_QPU_A_LDVPMV_IN: + case V3D_QPU_A_LDVPMD_IN: + case V3D_QPU_A_LDVPMP: + case V3D_QPU_A_LDVPMG_IN: + assert(!instr->alu.add.magic_write); + break; + + case V3D_QPU_A_LDVPMV_OUT: + case V3D_QPU_A_LDVPMD_OUT: + case V3D_QPU_A_LDVPMG_OUT: + assert(!instr->alu.add.magic_write); + *packed_instr |= V3D_QPU_MA; + break; + + default: + break; + } + + switch (instr->alu.add.op) { + case V3D_QPU_A_FADD: + case V3D_QPU_A_FADDNF: + case V3D_QPU_A_FSUB: + case V3D_QPU_A_FMIN: + case V3D_QPU_A_FMAX: + case V3D_QPU_A_FCMP: { + uint32_t output_pack; + uint32_t a_unpack; + uint32_t b_unpack; + + if (instr->alu.add.op != V3D_QPU_A_FCMP) { + if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack, + &output_pack)) { + return false; + } + opcode |= output_pack << 4; + } + + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, + &a_unpack)) { + return false; + } + + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack, + &b_unpack)) { + return false; + } + + /* These operations with commutative operands are + * distinguished by the order in which the operands come in. + */ + bool ordering = + instr->sig.small_imm_a * 256 + a_unpack * 64 + raddr_a > + instr->sig.small_imm_b * 256 + b_unpack * 64 + raddr_b; + if (((instr->alu.add.op == V3D_QPU_A_FMIN || + instr->alu.add.op == V3D_QPU_A_FADD) && ordering) || + ((instr->alu.add.op == V3D_QPU_A_FMAX || + instr->alu.add.op == V3D_QPU_A_FADDNF) && !ordering)) { + uint32_t temp; + + temp = a_unpack; + a_unpack = b_unpack; + b_unpack = temp; + + temp = raddr_a; + raddr_a = raddr_b; + raddr_b = temp; + + /* If we are swapping raddr_a/b we also need to swap + * small_imm_a/b. 
+ */ + if (instr->sig.small_imm_a || instr->sig.small_imm_b) { + assert(instr->sig.small_imm_a != + instr->sig.small_imm_b); + struct v3d_qpu_sig new_sig = instr->sig; + new_sig.small_imm_a = !instr->sig.small_imm_a; + new_sig.small_imm_b = !instr->sig.small_imm_b; + uint32_t sig; + if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig)) + return false; + *packed_instr &= ~V3D_QPU_SIG_MASK; + *packed_instr |= QPU_SET_FIELD(sig, V3D_QPU_SIG); + } + } + + opcode |= a_unpack << 2; + opcode |= b_unpack << 0; + + break; + } + + case V3D_QPU_A_VFPACK: { + uint32_t a_unpack; + uint32_t b_unpack; + + if (instr->alu.add.a.unpack == V3D_QPU_UNPACK_ABS || + instr->alu.add.b.unpack == V3D_QPU_UNPACK_ABS) { + return false; + } + + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, + &a_unpack)) { + return false; + } + + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack, + &b_unpack)) { + return false; + } + + opcode = (opcode & ~(0x3 << 2)) | (a_unpack << 2); + opcode = (opcode & ~(0x3 << 0)) | (b_unpack << 0); + + break; + } + + case V3D_QPU_A_FFLOOR: + case V3D_QPU_A_FROUND: + case V3D_QPU_A_FTRUNC: + case V3D_QPU_A_FCEIL: + case V3D_QPU_A_FDX: + case V3D_QPU_A_FDY: { + uint32_t packed; + + if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack, + &packed)) { + return false; + } + raddr_b |= packed; + + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, + &packed)) { + return false; + } + if (packed == 0) + return false; + raddr_b = (raddr_b & ~(0x3 << 2)) | packed << 2; + break; + } + + case V3D_QPU_A_FTOIN: + case V3D_QPU_A_FTOIZ: + case V3D_QPU_A_FTOUZ: + case V3D_QPU_A_FTOC: + if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE) + return false; + + uint32_t packed; + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, + &packed)) { + return false; + } + if (packed == 0) + return false; + + raddr_b |= (raddr_b & ~(0x3 << 2)) | packed << 2; + + break; + + case V3D_QPU_A_VFMIN: + case V3D_QPU_A_VFMAX: + if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE || + instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE) { + return false; + } + + if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a.unpack, + &packed)) { + return false; + } + opcode |= packed; + break; + + case V3D_QPU_A_MOV: { + uint32_t packed; + + if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE) + return false; + + if (!v3d_qpu_int32_unpack_pack(instr->alu.add.a.unpack, + &packed)) { + return false; + } + + raddr_b |= packed << 2; + break; + } + + case V3D_QPU_A_FMOV: { + uint32_t packed; + + if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack, + &packed)) { + return false; + } + raddr_b = packed; + + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, + &packed)) { + return false; + } + raddr_b |= packed << 2; + break; + } + + default: + if (instr->alu.add.op != V3D_QPU_A_NOP && + (instr->alu.add.output_pack != V3D_QPU_PACK_NONE || + instr->alu.add.a.unpack != V3D_QPU_UNPACK_NONE || + instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE)) { + return false; + } + break; + } + + *packed_instr |= QPU_SET_FIELD(raddr_a, V3D_QPU_RADDR_A); + *packed_instr |= QPU_SET_FIELD(raddr_b, V3D_QPU_RADDR_B); + *packed_instr |= QPU_SET_FIELD(opcode, V3D_QPU_OP_ADD); + *packed_instr |= QPU_SET_FIELD(waddr, V3D_QPU_WADDR_A); + if (instr->alu.add.magic_write && !no_magic_write) + *packed_instr |= V3D_QPU_MA; + + return true; +} + +static bool +v3d33_qpu_mul_pack(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *instr, uint64_t *packed_instr) +{ + uint32_t mux_a = instr->alu.mul.a.mux; + uint32_t mux_b = 
instr->alu.mul.b.mux; int nsrc = v3d_qpu_mul_op_num_src(instr->alu.mul.op); const struct opcode_desc *desc = - lookup_opcode_from_instr(devinfo, mul_ops, ARRAY_SIZE(mul_ops), + lookup_opcode_from_instr(devinfo, mul_ops_v33, + ARRAY_SIZE(mul_ops_v33), instr->alu.mul.op); if (!desc) @@ -1220,10 +2091,10 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, * that here. If mux a/b determine packing, it will be set below. */ if (nsrc < 2) - mux_b = ffs(desc->mux_b_mask) - 1; + mux_b = ffs(desc->mux.b_mask) - 1; if (nsrc < 1) - mux_a = ffs(desc->mux_a_mask) - 1; + mux_a = ffs(desc->mux.a_mask) - 1; switch (instr->alu.mul.op) { case V3D_QPU_M_FMUL: { @@ -1238,13 +2109,13 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, */ opcode += packed << 4; - if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a_unpack, + if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack, &packed)) { return false; } opcode |= packed << 2; - if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b_unpack, + if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b.unpack, &packed)) { return false; } @@ -1262,7 +2133,7 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, opcode |= (packed >> 1) & 1; mux_b = (packed & 1) << 2; - if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a_unpack, + if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack, &packed)) { return false; } @@ -1276,22 +2147,28 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE) return false; - if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a_unpack, + if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a.unpack, &packed)) { return false; } - if (instr->alu.mul.a_unpack == V3D_QPU_UNPACK_SWAP_16) + if (instr->alu.mul.a.unpack == V3D_QPU_UNPACK_SWAP_16) opcode = 8; else opcode |= (packed + 4) & 7; - if (instr->alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) + if (instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) return false; break; } default: + if (instr->alu.mul.op != V3D_QPU_M_NOP && + (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE || + instr->alu.mul.a.unpack != V3D_QPU_UNPACK_NONE || + instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)) { + return false; + } break; } @@ -1307,6 +2184,150 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, } static bool +v3d71_qpu_mul_pack(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *instr, uint64_t *packed_instr) +{ + uint32_t raddr_c = instr->alu.mul.a.raddr; + uint32_t raddr_d = instr->alu.mul.b.raddr; + int nsrc = v3d_qpu_mul_op_num_src(instr->alu.mul.op); + + const struct opcode_desc *desc = + lookup_opcode_from_instr(devinfo, mul_ops_v71, + ARRAY_SIZE(mul_ops_v71), + instr->alu.mul.op); + if (!desc) + return false; + + uint32_t opcode = desc->opcode_first; + + /* Some opcodes have a single valid value for their raddr_d, so set + * that here. If raddr_d determines packing, it will be set below. + */ + if (nsrc < 2) + raddr_d = ffsll(desc->raddr_mask) - 1; + + switch (instr->alu.mul.op) { + case V3D_QPU_M_FMUL: { + uint32_t packed; + + if (!v3d_qpu_float32_pack_pack(instr->alu.mul.output_pack, + &packed)) { + return false; + } + /* No need for a +1 because desc->opcode_first has a 1 in this + * field. 
+ */ + opcode += packed << 4; + + if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack, + &packed)) { + return false; + } + opcode |= packed << 2; + + if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b.unpack, + &packed)) { + return false; + } + opcode |= packed << 0; + break; + } + + case V3D_QPU_M_FMOV: { + uint32_t packed; + + if (!v3d_qpu_float32_pack_pack(instr->alu.mul.output_pack, + &packed)) { + return false; + } + raddr_d |= packed; + + if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack, + &packed)) { + return false; + } + raddr_d |= packed << 2; + break; + } + + case V3D_QPU_M_VFMUL: { + unreachable("pending v71 update"); + uint32_t packed; + + if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE) + return false; + + if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a.unpack, + &packed)) { + return false; + } + if (instr->alu.mul.a.unpack == V3D_QPU_UNPACK_SWAP_16) + opcode = 8; + else + opcode |= (packed + 4) & 7; + + if (instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) + return false; + + break; + } + + case V3D_QPU_M_MOV: { + uint32_t packed; + + if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE) + return false; + + if (!v3d_qpu_int32_unpack_pack(instr->alu.mul.a.unpack, + &packed)) { + return false; + } + + raddr_d |= packed << 2; + break; + } + + default: + if (instr->alu.mul.op != V3D_QPU_M_NOP && + (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE || + instr->alu.mul.a.unpack != V3D_QPU_UNPACK_NONE || + instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)) { + return false; + } + break; + } + + *packed_instr |= QPU_SET_FIELD(raddr_c, V3D_QPU_RADDR_C); + *packed_instr |= QPU_SET_FIELD(raddr_d, V3D_QPU_RADDR_D); + *packed_instr |= QPU_SET_FIELD(opcode, V3D_QPU_OP_MUL); + *packed_instr |= QPU_SET_FIELD(instr->alu.mul.waddr, V3D_QPU_WADDR_M); + if (instr->alu.mul.magic_write) + *packed_instr |= V3D_QPU_MM; + + return true; +} + +static bool +v3d_qpu_add_pack(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *instr, uint64_t *packed_instr) +{ + if (devinfo->ver < 71) + return v3d33_qpu_add_pack(devinfo, instr, packed_instr); + else + return v3d71_qpu_add_pack(devinfo, instr, packed_instr); +} + +static bool +v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *instr, uint64_t *packed_instr) +{ + if (devinfo->ver < 71) + return v3d33_qpu_mul_pack(devinfo, instr, packed_instr); + else + return v3d71_qpu_mul_pack(devinfo, instr, packed_instr); +} + +static bool v3d_qpu_instr_unpack_alu(const struct v3d_device_info *devinfo, uint64_t packed_instr, struct v3d_qpu_instr *instr) @@ -1334,8 +2355,14 @@ v3d_qpu_instr_unpack_alu(const struct v3d_device_info *devinfo, return false; } - instr->raddr_a = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_A); - instr->raddr_b = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_B); + if (devinfo->ver <= 71) { + /* + * For v71 this will be set on add/mul unpack, as raddr are now + * part of v3d_qpu_input + */ + instr->raddr_a = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_A); + instr->raddr_b = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_B); + } if (!v3d_qpu_add_unpack(devinfo, packed_instr, instr)) return false; @@ -1421,8 +2448,14 @@ v3d_qpu_instr_pack_alu(const struct v3d_device_info *devinfo, *packed_instr |= QPU_SET_FIELD(sig, V3D_QPU_SIG); if (instr->type == V3D_QPU_INSTR_TYPE_ALU) { - *packed_instr |= QPU_SET_FIELD(instr->raddr_a, V3D_QPU_RADDR_A); - *packed_instr |= QPU_SET_FIELD(instr->raddr_b, V3D_QPU_RADDR_B); + if (devinfo->ver < 71) { + /* + * For v71 this will be set on add/mul unpack, as raddr are 
now + * part of v3d_qpu_input + */ + *packed_instr |= QPU_SET_FIELD(instr->raddr_a, V3D_QPU_RADDR_A); + *packed_instr |= QPU_SET_FIELD(instr->raddr_b, V3D_QPU_RADDR_B); + } if (!v3d_qpu_add_pack(devinfo, instr, packed_instr)) return false; diff --git a/src/broadcom/qpu/tests/qpu_disasm.c b/src/broadcom/qpu/tests/qpu_disasm.c index e6b1918b8f0..be7b78d5ef0 100644 --- a/src/broadcom/qpu/tests/qpu_disasm.c +++ b/src/broadcom/qpu/tests/qpu_disasm.c @@ -34,29 +34,29 @@ static const struct { uint64_t inst; const char *expected; } tests[] = { - { 33, 0x3d003186bb800000ull, "nop ; nop ; ldvary" }, - { 33, 0x3c20318105829000ull, "fadd r1, r1, r5 ; nop ; thrsw" }, - { 33, 0x3c403186bb81d000ull, "vpmsetup -, r5 ; nop ; ldunif" }, - { 33, 0x3f003186bb800000ull, "nop ; nop ; ldvpm" }, - { 33, 0x3c002380b6edb000ull, "or rf0, r3, r3 ; mov vpm, r3" }, - { 33, 0x57403006bbb80000ull, "nop ; fmul r0, rf0, r5 ; ldvpm; ldunif" }, - { 33, 0x9c094adef634b000ull, "ffloor.ifb rf30.l, r3; fmul.pushz rf43.l, r5, r1.h" }, - { 33, 0xb0044c56ba326840ull, "flpop rf22, rf33 ; fmul.pushz rf49.l, r4.h, r1.abs" }, + { 33, 0x3d003186bb800000ull, "nop ; nop ; ldvary" }, + { 33, 0x3c20318105829000ull, "fadd r1, r1, r5 ; nop ; thrsw" }, + { 33, 0x3c403186bb81d000ull, "vpmsetup -, r5 ; nop ; ldunif" }, + { 33, 0x3f003186bb800000ull, "nop ; nop ; ldvpm" }, + { 33, 0x3c002380b6edb000ull, "or rf0, r3, r3 ; mov vpm, r3" }, + { 33, 0x57403006bbb80000ull, "nop ; fmul r0, rf0, r5 ; ldvpm; ldunif" }, + { 33, 0x9c094adef634b000ull, "ffloor.ifb rf30.l, r3 ; fmul.pushz rf43.l, r5, r1.h" }, + { 33, 0xb0044c56ba326840ull, "flpop rf22, rf33 ; fmul.pushz rf49.l, r4.h, r1.abs" }, /* vfmul input packing */ - { 33, 0x101e8b6e8aad4000ull, "fmax.nornn rf46, r4.l, r2.l; vfmul.ifnb rf45, r3, r5" }, - { 33, 0x1857d3c219825000ull, "faddnf.norc r2.l, r5.l, r4; vfmul.ifb rf15, r0.ll, r4; ldunif" }, - { 33, 0x1c0a0dfde2294000ull, "fcmp.ifna rf61.h, r4.abs, r2.l; vfmul rf55, r2.hh, r1" }, - { 33, 0x2011c89b402cc000ull, "fsub.norz rf27, r4.abs, r1.abs; vfmul.ifa rf34, r3.swp, r1" }, + { 33, 0x101e8b6e8aad4000ull, "fmax.nornn rf46, r4.l, r2.l ; vfmul.ifnb rf45, r3, r5" }, + { 33, 0x1857d3c219825000ull, "faddnf.norc r2.l, r5.l, r4 ; vfmul.ifb rf15, r0.ll, r4 ; ldunif" }, + { 33, 0x1c0a0dfde2294000ull, "fcmp.ifna rf61.h, r4.abs, r2.l; vfmul rf55, r2.hh, r1" }, + { 33, 0x2011c89b402cc000ull, "fsub.norz rf27, r4.abs, r1.abs; vfmul.ifa rf34, r3.swp, r1" }, - { 33, 0xe01b42ab3bb063c0ull, "vfpack.andnc rf43, rf15.l, r0.h; fmul.ifna rf10.h, r4.l, r5.abs" }, - { 33, 0x600b8b87fb4d1000ull, "fdx.ifnb rf7.h, r1.l; fmul.pushn rf46, r3.l, r2.abs" }, + { 33, 0xe01b42ab3bb063c0ull, "vfpack.andnc rf43, rf15.l, r0.h; fmul.ifna rf10.h, r4.l, r5.abs" }, + { 33, 0x600b8b87fb4d1000ull, "fdx.ifnb rf7.h, r1.l ; fmul.pushn rf46, r3.l, r2.abs" }, /* small immediates */ - { 33, 0x5de24398bbdc6218ull, "vflb.andnn rf24 ; fmul rf14, -8, rf8.h" }, - { 33, 0x25ef83d8b166f00full, "vfmin.pushn rf24, 15.ff, r5; smul24.ifnb rf15, r1, r3" }, - { 33, 0xadedcdf70839f990ull, "faddnf.pushc rf55, -16.l, r3.abs; fmul.ifb rf55.l, rf38.l, r1.h" }, - { 33, 0x7dff89fa6a01f020ull, "fsub.nornc rf58.h, 0x3b800000.l, r3.l; fmul.ifnb rf39, r0.h, r0.h" }, + { 33, 0x5de24398bbdc6218ull, "vflb.andnn rf24 ; fmul rf14, -8, rf8.h" }, + { 33, 0x25ef83d8b166f00full, "vfmin.pushn rf24, 15.ff, r5 ; smul24.ifnb rf15, r1, r3" }, + { 33, 0xadedcdf70839f990ull, "faddnf.pushc rf55, -16.l, r3.abs; fmul.ifb rf55.l, rf38.l, r1.h" }, + { 33, 0x7dff89fa6a01f020ull, "fsub.nornc rf58.h, 0x3b800000.l, r3.l; fmul.ifnb rf39, r0.h, 
r0.h" }, /* branch conditions */ { 33, 0x02000006002034c0ull, "b.anyap rf19" }, @@ -68,36 +68,36 @@ static const struct { { 33, 0x0200000300006000ull, "bu.na0 lri, a:unif" }, /* Special waddr names */ - { 33, 0x3c00318735808000ull, "vfpack tlb, r0, r1 ; nop" }, - { 33, 0xe0571c938e8d5000ull, "fmax.andc recip, r5.h, r2.l; fmul.ifb rf50.h, r3.l, r4.abs; ldunif" }, - { 33, 0xc04098d4382c9000ull, "add.pushn rsqrt, r1, r1; fmul rf35.h, r3.abs, r1.abs; ldunif" }, - { 33, 0x481edcd6b3184500ull, "vfmin.norn log, r4.hh, r0; fmul.ifnb rf51, rf20.abs, r0.l" }, - { 33, 0x041618d57c453000ull, "shl.andn exp, r3, r2; add.ifb rf35, r1, r2" }, - { 33, 0x7048e5da49272800ull, "fsub.ifa rf26, r2.l, rf32; fmul.pushc sin, r1.h, r1.abs; ldunif" }, + { 33, 0x3c00318735808000ull, "vfpack tlb, r0, r1 ; nop" }, + { 33, 0xe0571c938e8d5000ull, "fmax.andc recip, r5.h, r2.l ; fmul.ifb rf50.h, r3.l, r4.abs; ldunif" }, + { 33, 0xc04098d4382c9000ull, "add.pushn rsqrt, r1, r1 ; fmul rf35.h, r3.abs, r1.abs ; ldunif" }, + { 33, 0x481edcd6b3184500ull, "vfmin.norn log, r4.hh, r0 ; fmul.ifnb rf51, rf20.abs, r0.l" }, + { 33, 0x041618d57c453000ull, "shl.andn exp, r3, r2 ; add.ifb rf35, r1, r2" }, + { 33, 0x7048e5da49272800ull, "fsub.ifa rf26, r2.l, rf32 ; fmul.pushc sin, r1.h, r1.abs; ldunif" }, /* v4.1 signals */ - { 41, 0x1f010520cf60a000ull, "fcmp.andz rf32, r2.h, r1.h; vfmul rf20, r0.hh, r3; ldunifa" }, - { 41, 0x932045e6c16ea000ull, "fcmp rf38, r2.abs, r5; fmul rf23.l, r3, r3.abs; ldunifarf.rf1" }, - { 41, 0xd72f0434e43ae5c0ull, "fcmp rf52.h, rf23, r5.abs; fmul rf16.h, rf23, r1; ldunifarf.rf60" }, - { 41, 0xdb3048eb9d533780ull, "fmax rf43.l, r3.h, rf30; fmul rf35.h, r4, r2.l; ldunifarf.r1" }, - { 41, 0x733620471e6ce700ull, "faddnf rf7.l, rf28.h, r1.l; fmul r1, r3.h, r3.abs; ldunifarf.rsqrt2" }, - { 41, 0x9c094adef634b000ull, "ffloor.ifb rf30.l, r3; fmul.pushz rf43.l, r5, r1.h" }, + { 41, 0x1f010520cf60a000ull, "fcmp.andz rf32, r2.h, r1.h ; vfmul rf20, r0.hh, r3 ; ldunifa" }, + { 41, 0x932045e6c16ea000ull, "fcmp rf38, r2.abs, r5 ; fmul rf23.l, r3, r3.abs ; ldunifarf.rf1" }, + { 41, 0xd72f0434e43ae5c0ull, "fcmp rf52.h, rf23, r5.abs ; fmul rf16.h, rf23, r1 ; ldunifarf.rf60" }, + { 41, 0xdb3048eb9d533780ull, "fmax rf43.l, r3.h, rf30 ; fmul rf35.h, r4, r2.l ; ldunifarf.r1" }, + { 41, 0x733620471e6ce700ull, "faddnf rf7.l, rf28.h, r1.l ; fmul r1, r3.h, r3.abs ; ldunifarf.rsqrt2" }, + { 41, 0x9c094adef634b000ull, "ffloor.ifb rf30.l, r3 ; fmul.pushz rf43.l, r5, r1.h" }, /* v4.1 opcodes */ - { 41, 0x3de020c7bdfd200dull, "ldvpmg_in rf7, r2, r2; mov r3, 13" }, - { 41, 0x3de02040f8ff7201ull, "stvpmv 1, rf8 ; mov r1, 1" }, - { 41, 0xd8000e50bb2d3000ull, "sampid rf16 ; fmul rf57.h, r3, r1.l" }, + { 41, 0x3de020c7bdfd200dull, "ldvpmg_in rf7, r2, r2 ; mov r3, 13" }, + { 41, 0x3de02040f8ff7201ull, "stvpmv 1, rf8 ; mov r1, 1" }, + { 41, 0xd8000e50bb2d3000ull, "sampid rf16 ; fmul rf57.h, r3, r1.l" }, /* v4.1 SFU instructions. 
*/ - { 41, 0xe98d60c1ba2aef80ull, "recip rf1, rf62 ; fmul r3.h, r2.l, r1.l; ldunifrf.rf53" }, - { 41, 0x7d87c2debc51c000ull, "rsqrt rf30, r4 ; fmul rf11, r4.h, r2.h; ldunifrf.rf31" }, - { 41, 0xb182475abc2bb000ull, "rsqrt2 rf26, r3 ; fmul rf29.l, r2.h, r1.abs; ldunifrf.rf9" }, - { 41, 0x79880808bc0b6900ull, "sin rf8, rf36 ; fmul rf32, r2.h, r0.l; ldunifrf.rf32" }, - { 41, 0x04092094bc5a28c0ull, "exp.ifb rf20, r2 ; add r2, rf35, r2" }, - { 41, 0xe00648bfbc32a000ull, "log rf63, r2 ; fmul.andnn rf34.h, r4.l, r1.abs" }, + { 41, 0xe98d60c1ba2aef80ull, "recip rf1, rf62 ; fmul r3.h, r2.l, r1.l ; ldunifrf.rf53" }, + { 41, 0x7d87c2debc51c000ull, "rsqrt rf30, r4 ; fmul rf11, r4.h, r2.h ; ldunifrf.rf31" }, + { 41, 0xb182475abc2bb000ull, "rsqrt2 rf26, r3 ; fmul rf29.l, r2.h, r1.abs ; ldunifrf.rf9" }, + { 41, 0x79880808bc0b6900ull, "sin rf8, rf36 ; fmul rf32, r2.h, r0.l ; ldunifrf.rf32" }, + { 41, 0x04092094bc5a28c0ull, "exp.ifb rf20, r2 ; add r2, rf35, r2" }, + { 41, 0xe00648bfbc32a000ull, "log rf63, r2 ; fmul.andnn rf34.h, r4.l, r1.abs" }, /* v4.2 changes */ - { 42, 0x3c203192bb814000ull, "barrierid syncb ; nop ; thrsw" }, + { 42, 0x3c203192bb814000ull, "barrierid syncb ; nop ; thrsw" }, }; static void @@ -133,6 +133,8 @@ main(int argc, char **argv) const char *disasm_output = v3d_qpu_disasm(&devinfo, tests[i].inst); + printf("%s\n", disasm_output); + if (strcmp(disasm_output, tests[i].expected) != 0) { printf("FAIL\n"); printf(" Expected: \"%s\"\n", tests[i].expected); @@ -158,10 +160,10 @@ main(int argc, char **argv) /* Swap the operands to be sure that we test * how the QPUs distinguish between these ops. */ - swap_mux(&instr.alu.add.a, - &instr.alu.add.b); - swap_pack(&instr.alu.add.a_unpack, - &instr.alu.add.b_unpack); + swap_mux(&instr.alu.add.a.mux, + &instr.alu.add.b.mux); + swap_pack(&instr.alu.add.a.unpack, + &instr.alu.add.b.unpack); break; default: break; diff --git a/src/broadcom/simulator/meson.build b/src/broadcom/simulator/meson.build index 51f311bb094..0432fa0e52c 100644 --- a/src/broadcom/simulator/meson.build +++ b/src/broadcom/simulator/meson.build @@ -1,4 +1,4 @@ -# Copyright © 2019 Raspberry Pi +# Copyright © 2019 Raspberry Pi Ltd # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -29,8 +29,8 @@ files_per_version = files( ) v3d_args = [] -dep_v3dv3 = dependency('v3dv3', required: false) -if dep_v3dv3.found() +dep_v3d_hw = dependency('v3d_hw', required: false) +if dep_v3d_hw.found() v3d_args += '-DUSE_V3D_SIMULATOR' endif @@ -40,22 +40,22 @@ foreach ver : v3d_versions 'v3d-simulator-v' + ver, [files_per_version, v3d_xml_pack], include_directories : [ - inc_src, inc_include, inc_gallium_aux, inc_broadcom, + inc_src, inc_include, inc_broadcom, ], c_args : [v3d_args, '-DV3D_VERSION=' + ver], gnu_symbol_visibility: 'hidden', - dependencies : [dep_v3dv3, dep_libdrm, dep_valgrind], + dependencies : [dep_v3d_hw, dep_libdrm, dep_valgrind], ) endforeach libbroadcom_simulator = static_library( 'broadcom_simulator', [libbroadcom_simulator_files], - include_directories : [inc_src, inc_include, inc_gallium, inc_gallium_aux], + include_directories : [inc_src, inc_include], c_args : [v3d_args, no_override_init_args], cpp_args : [v3d_args], gnu_symbol_visibility : 'hidden', - dependencies : [dep_v3dv3, dep_libdrm, dep_valgrind], + dependencies : [dep_v3d_hw, dep_libdrm, dep_valgrind], link_with : [per_version_libs], build_by_default : false, ) diff --git 
a/src/broadcom/simulator/v3d_simulator.c b/src/broadcom/simulator/v3d_simulator.c index 494e5bb4475..1d78d7205f1 100644 --- a/src/broadcom/simulator/v3d_simulator.c +++ b/src/broadcom/simulator/v3d_simulator.c @@ -54,32 +54,32 @@ #include "util/hash_table.h" #include "util/ralloc.h" #include "util/set.h" +#include "util/simple_mtx.h" #include "util/u_dynarray.h" #include "util/u_memory.h" #include "util/u_mm.h" #include "util/u_math.h" #include <xf86drm.h> +#include "drm-uapi/amdgpu_drm.h" #include "drm-uapi/i915_drm.h" #include "drm-uapi/v3d_drm.h" #include "v3d_simulator.h" #include "v3d_simulator_wrapper.h" +#include "broadcom/common/v3d_csd.h" + /** Global (across GEM fds) state for the simulator */ static struct v3d_simulator_state { - mtx_t mutex; + simple_mtx_t mutex; mtx_t submit_lock; struct v3d_hw *v3d; int ver; - /* Base virtual address of the heap. */ - void *mem; - /* Base hardware address of the heap. */ - uint32_t mem_base; /* Size of the heap. */ - uint32_t mem_size; + uint64_t mem_size; struct mem_block *heap; struct mem_block *overflow; @@ -90,10 +90,19 @@ static struct v3d_simulator_state { /** Last performance monitor ID. */ uint32_t last_perfid; + /** Total performance counters */ + uint32_t perfcnt_total; + struct util_dynarray bin_oom; int refcount; } sim_state = { - .mutex = _MTX_INITIALIZER_NP, + .mutex = SIMPLE_MTX_INITIALIZER, +}; + +enum gem_type { + GEM_I915, + GEM_AMDGPU, + GEM_DUMB }; /** Per-GEM-fd state for the simulator. */ @@ -109,10 +118,10 @@ struct v3d_simulator_file { uint32_t active_perfid; struct mem_block *gmp; - void *gmp_vaddr; + uint64_t gmp_addr; - /** Actual GEM fd is i915, so we should use their create ioctl. */ - bool is_i915; + /** For specific gpus, use their create ioctl. Otherwise use dumb bo. */ + enum gem_type gem_type; }; /** Wrapper for drm_v3d_bo tracking the simulator-specific state. */ @@ -123,7 +132,7 @@ struct v3d_simulator_bo { struct mem_block *block; uint32_t size; uint64_t mmap_offset; - void *sim_vaddr; + uint64_t sim_addr; void *gem_vaddr; int handle; @@ -184,7 +193,8 @@ set_gmp_flags(struct v3d_simulator_file *file, assert((offset & ((1 << GMP_ALIGN2) - 1)) == 0); int gmp_offset = offset >> GMP_ALIGN2; int gmp_count = align(size, 1 << GMP_ALIGN2) >> GMP_ALIGN2; - uint32_t *gmp = file->gmp_vaddr; + uint32_t *gmp = malloc((gmp_count + gmp_offset)*sizeof(uint32_t)); + v3d_hw_read_mem(sim_state.v3d, gmp, file->gmp_addr, (gmp_offset + gmp_count)*sizeof(uint32_t)); assert(flag <= 0x3); @@ -193,6 +203,9 @@ set_gmp_flags(struct v3d_simulator_file *file, gmp[i / 16] &= ~(0x3 << bitshift); gmp[i / 16] |= flag << bitshift; } + + v3d_hw_write_mem(sim_state.v3d, file->gmp_addr, gmp, (gmp_offset + gmp_count)*sizeof(uint32_t)); + free(gmp); } /** @@ -203,26 +216,25 @@ static struct v3d_simulator_bo * v3d_create_simulator_bo(int fd, unsigned size) { struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd); + + simple_mtx_lock(&sim_state.mutex); struct v3d_simulator_bo *sim_bo = rzalloc(file, struct v3d_simulator_bo); - size = align(size, 4096); - - sim_bo->file = file; - - mtx_lock(&sim_state.mutex); sim_bo->block = u_mmAllocMem(sim_state.heap, size + 4, GMP_ALIGN2, 0); - mtx_unlock(&sim_state.mutex); + simple_mtx_unlock(&sim_state.mutex); assert(sim_bo->block); - + size = align(size, 4096); + sim_bo->file = file; set_gmp_flags(file, sim_bo->block->ofs, size, 0x3); sim_bo->size = size; /* Allocate space for the buffer in simulator memory. 
*/ - sim_bo->sim_vaddr = sim_state.mem + sim_bo->block->ofs - sim_state.mem_base; - memset(sim_bo->sim_vaddr, 0xd0, size); + sim_bo->sim_addr = sim_bo->block->ofs; + v3d_hw_set_mem(sim_state.v3d, sim_bo->sim_addr, 0xd0, size); - *(uint32_t *)(sim_bo->sim_vaddr + sim_bo->size) = BO_SENTINEL; + uint32_t sentinel = BO_SENTINEL; + v3d_hw_write_mem(sim_state.v3d, sim_bo->sim_addr + sim_bo->size, &sentinel, sizeof(sentinel)); return sim_bo; } @@ -241,7 +253,9 @@ v3d_create_simulator_bo_for_gem(int fd, int handle, unsigned size) * one. */ int ret; - if (file->is_i915) { + switch (file->gem_type) { + case GEM_I915: + { struct drm_i915_gem_mmap_gtt map = { .handle = handle, }; @@ -252,14 +266,26 @@ v3d_create_simulator_bo_for_gem(int fd, int handle, unsigned size) */ ret = drmIoctl(fd, DRM_IOCTL_I915_GEM_MMAP_GTT, &map); sim_bo->mmap_offset = map.offset; - } else { + break; + } + case GEM_AMDGPU: + { + union drm_amdgpu_gem_mmap map = { 0 }; + map.in.handle = handle; + + ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_MMAP, &map); + sim_bo->mmap_offset = map.out.addr_ptr; + break; + } + default: + { struct drm_mode_map_dumb map = { .handle = handle, }; - ret = drmIoctl(fd, DRM_IOCTL_MODE_MAP_DUMB, &map); sim_bo->mmap_offset = map.offset; } + } if (ret) { fprintf(stderr, "Failed to get MMAP offset: %d\n", ret); abort(); @@ -278,10 +304,10 @@ v3d_create_simulator_bo_for_gem(int fd, int handle, unsigned size) * don't need to go in the lookup table. */ if (handle != 0) { - mtx_lock(&sim_state.mutex); + simple_mtx_lock(&sim_state.mutex); _mesa_hash_table_insert(file->bo_map, int_to_key(handle), sim_bo); - mtx_unlock(&sim_state.mutex); + simple_mtx_unlock(&sim_state.mutex); } return sim_bo; @@ -311,14 +337,14 @@ v3d_free_simulator_bo(struct v3d_simulator_bo *sim_bo) if (sim_bo->gem_vaddr) munmap(sim_bo->gem_vaddr, sim_bo->size); - mtx_lock(&sim_state.mutex); + simple_mtx_lock(&sim_state.mutex); u_mmFreeMem(sim_bo->block); if (sim_bo->handle) { _mesa_hash_table_remove_key(sim_file->bo_map, int_to_key(sim_bo->handle)); } - mtx_unlock(&sim_state.mutex); ralloc_free(sim_bo); + simple_mtx_unlock(&sim_state.mutex); } static struct v3d_simulator_bo * @@ -327,10 +353,10 @@ v3d_get_simulator_bo(struct v3d_simulator_file *file, int gem_handle) if (gem_handle == 0) return NULL; - mtx_lock(&sim_state.mutex); + simple_mtx_lock(&sim_state.mutex); struct hash_entry *entry = _mesa_hash_table_search(file->bo_map, int_to_key(gem_handle)); - mtx_unlock(&sim_state.mutex); + simple_mtx_unlock(&sim_state.mutex); return entry ? 
entry->data : NULL;
 }
 
@@ -343,7 +369,7 @@ v3d_simulator_copy_in_handle(struct v3d_simulator_file *file, int handle)
         if (!sim_bo)
                 return;
 
-        memcpy(sim_bo->sim_vaddr, sim_bo->gem_vaddr, sim_bo->size);
+        v3d_hw_write_mem(sim_state.v3d, sim_bo->sim_addr, sim_bo->gem_vaddr, sim_bo->size);
 }
 
 static void
@@ -354,10 +380,11 @@ v3d_simulator_copy_out_handle(struct v3d_simulator_file *file, int handle)
         if (!sim_bo)
                 return;
 
-        memcpy(sim_bo->gem_vaddr, sim_bo->sim_vaddr, sim_bo->size);
+        v3d_hw_read_mem(sim_state.v3d, sim_bo->gem_vaddr, sim_bo->sim_addr, sim_bo->size);
 
-        if (*(uint32_t *)(sim_bo->sim_vaddr +
-                          sim_bo->size) != BO_SENTINEL) {
+        uint32_t sentinel;
+        v3d_hw_read_mem(sim_state.v3d, &sentinel, sim_bo->sim_addr + sim_bo->size, sizeof(sentinel));
+        if (sentinel != BO_SENTINEL) {
                 fprintf(stderr, "Buffer overflow in handle %d\n",
                         handle);
         }
@@ -395,10 +422,10 @@ v3d_get_simulator_perfmon(int fd, uint32_t perfid)
 
         struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd);
 
-        mtx_lock(&sim_state.mutex);
+        simple_mtx_lock(&sim_state.mutex);
         assert(perfid <= file->perfmons_size);
         struct v3d_simulator_perfmon *perfmon = file->perfmons[perfid - 1];
-        mtx_unlock(&sim_state.mutex);
+        simple_mtx_unlock(&sim_state.mutex);
 
         return perfmon;
 }
@@ -414,20 +441,46 @@ v3d_simulator_perfmon_switch(int fd, uint32_t perfid)
 
         perfmon = v3d_get_simulator_perfmon(fd, file->active_perfid);
         if (perfmon)
-                v3d41_simulator_perfmon_stop(sim_state.v3d,
-                                             perfmon->ncounters,
-                                             perfmon->values);
+                v3d_X_simulator(perfmon_stop)(sim_state.v3d,
+                                              perfmon->ncounters,
+                                              perfmon->values);
 
         perfmon = v3d_get_simulator_perfmon(fd, perfid);
         if (perfmon)
-                v3d41_simulator_perfmon_start(sim_state.v3d,
-                                              perfmon->ncounters,
-                                              perfmon->counters);
+                v3d_X_simulator(perfmon_start)(sim_state.v3d,
+                                               perfmon->ncounters,
+                                               perfmon->counters);
 
         file->active_perfid = perfid;
 }
 
 static int
+v3d_simulator_signal_syncobjs(int fd, struct drm_v3d_multi_sync *ms)
+{
+        struct drm_v3d_sem *out_syncs = (void *)(uintptr_t)ms->out_syncs;
+        int n_syncobjs = ms->out_sync_count;
+        uint32_t syncobjs[n_syncobjs];
+
+        for (int i = 0; i < n_syncobjs; i++)
+                syncobjs[i] = out_syncs[i].handle;
+        return drmSyncobjSignal(fd, (uint32_t *) &syncobjs, n_syncobjs);
+}
+
+static int
+v3d_simulator_process_post_deps(int fd, struct drm_v3d_extension *ext)
+{
+        int ret = 0;
+        while (ext && ext->id != DRM_V3D_EXT_ID_MULTI_SYNC)
+                ext = (void *)(uintptr_t) ext->next;
+
+        if (ext) {
+                struct drm_v3d_multi_sync *ms = (struct drm_v3d_multi_sync *) ext;
+                ret = v3d_simulator_signal_syncobjs(fd, ms);
+        }
+        return ret;
+}
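The post-dependency handling just added works by walking the submit's extension chain, which the v3d UAPI threads through user-space pointers stored as u64s. A compact, self-contained sketch of that walk, using stand-in types (the real struct drm_v3d_extension lives in drm-uapi/v3d_drm.h and has more fields):

    #include <stdint.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Stand-in for drm_v3d_extension: extensions form a singly linked
     * list threaded through u64 user pointers; 0 terminates the list. */
    struct demo_ext {
            uint64_t next;
            uint32_t id;
    };

    #define DEMO_EXT_ID_MULTI_SYNC 0x01

    static struct demo_ext *
    demo_find_ext(struct demo_ext *ext, uint32_t id)
    {
            /* Each hop casts the u64 back to a host pointer, the same
             * way the simulator does with (void *)(uintptr_t)next. */
            while (ext && ext->id != id)
                    ext = (struct demo_ext *)(uintptr_t)ext->next;
            return ext;
    }

    int main(void)
    {
            struct demo_ext multisync = { .next = 0,
                                          .id = DEMO_EXT_ID_MULTI_SYNC };
            struct demo_ext first = { .next = (uintptr_t)&multisync,
                                      .id = 0x7f };

            printf("found multisync: %d\n",
                   demo_find_ext(&first, DEMO_EXT_ID_MULTI_SYNC) != NULL);
            return 0;
    }

Since the simulator serializes job execution, only the out-syncs need work here: once the job has run, the multisync extension's syncobjs can simply be signaled, which is what v3d_simulator_signal_syncobjs() does via drmSyncobjSignal().

+
+static int
 v3d_simulator_submit_cl_ioctl(int fd, struct drm_v3d_submit_cl *submit)
 {
         struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd);
@@ -441,11 +494,7 @@ v3d_simulator_submit_cl_ioctl(int fd, struct drm_v3d_submit_cl *submit)
         bin_fd = fd;
 
         v3d_simulator_perfmon_switch(fd, submit->perfmon_id);
-
-        if (sim_state.ver >= 41)
-                v3d41_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs);
-        else
-                v3d33_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs);
+        v3d_X_simulator(submit_cl_ioctl)(sim_state.v3d, submit, file->gmp->ofs);
 
         util_dynarray_foreach(&sim_state.bin_oom,
                               struct v3d_simulator_bo *, sim_bo) {
@@ -459,7 +508,12 @@ v3d_simulator_submit_cl_ioctl(int fd, struct drm_v3d_submit_cl *submit)
         if (ret)
                 return ret;
 
-        return 0;
+        if (submit->flags & DRM_V3D_SUBMIT_EXTENSION) {
+                struct drm_v3d_extension *ext = (void *)(uintptr_t)submit->extensions;
+                ret = v3d_simulator_process_post_deps(fd, ext);
+        }
+
+        return ret;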
} /** @@ -488,14 +542,30 @@ v3d_simulator_create_bo_ioctl(int fd, struct drm_v3d_create_bo *args) * native ioctl in case we're on a render node. */ int ret; - if (file->is_i915) { + switch (file->gem_type) { + case GEM_I915: + { struct drm_i915_gem_create create = { .size = args->size, }; + ret = drmIoctl(fd, DRM_IOCTL_I915_GEM_CREATE, &create); args->handle = create.handle; - } else { + break; + } + case GEM_AMDGPU: + { + union drm_amdgpu_gem_create create = { 0 }; + create.in.bo_size = args->size; + + ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_CREATE, &create); + + args->handle = create.out.handle; + break; + } + default: + { struct drm_mode_create_dumb create = { .width = 128, .bpp = 8, @@ -507,7 +577,7 @@ v3d_simulator_create_bo_ioctl(int fd, struct drm_v3d_create_bo *args) args->handle = create.handle; } - + } if (ret == 0) { struct v3d_simulator_bo *sim_bo = v3d_create_simulator_bo_for_gem(fd, args->handle, @@ -564,15 +634,6 @@ v3d_simulator_gem_close_ioctl(int fd, struct drm_gem_close *args) } static int -v3d_simulator_get_param_ioctl(int fd, struct drm_v3d_get_param *args) -{ - if (sim_state.ver >= 41) - return v3d41_simulator_get_param_ioctl(sim_state.v3d, args); - else - return v3d33_simulator_get_param_ioctl(sim_state.v3d, args); -} - -static int v3d_simulator_submit_tfu_ioctl(int fd, struct drm_v3d_submit_tfu *args) { struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd); @@ -583,13 +644,18 @@ v3d_simulator_submit_tfu_ioctl(int fd, struct drm_v3d_submit_tfu *args) v3d_simulator_copy_in_handle(file, args->bo_handles[2]); v3d_simulator_copy_in_handle(file, args->bo_handles[3]); - if (sim_state.ver >= 41) - ret = v3d41_simulator_submit_tfu_ioctl(sim_state.v3d, args); - else - ret = v3d33_simulator_submit_tfu_ioctl(sim_state.v3d, args); + ret = v3d_X_simulator(submit_tfu_ioctl)(sim_state.v3d, args); v3d_simulator_copy_out_handle(file, args->bo_handles[0]); + if (ret) + return ret; + + if (args->flags & DRM_V3D_SUBMIT_EXTENSION) { + struct drm_v3d_extension *ext = (void *)(uintptr_t)args->extensions; + ret = v3d_simulator_process_post_deps(fd, ext); + } + return ret; } @@ -605,15 +671,311 @@ v3d_simulator_submit_csd_ioctl(int fd, struct drm_v3d_submit_csd *args) v3d_simulator_perfmon_switch(fd, args->perfmon_id); - if (sim_state.ver >= 41) - ret = v3d41_simulator_submit_csd_ioctl(sim_state.v3d, args, - file->gmp->ofs); - else - ret = -1; + ret = v3d_X_simulator(submit_csd_ioctl)(sim_state.v3d, args, + file->gmp->ofs); for (int i = 0; i < args->bo_handle_count; i++) v3d_simulator_copy_out_handle(file, bo_handles[i]); + if (ret < 0) + return ret; + + if (args->flags & DRM_V3D_SUBMIT_EXTENSION) { + struct drm_v3d_extension *ext = (void *)(uintptr_t)args->extensions; + ret = v3d_simulator_process_post_deps(fd, ext); + } + + return ret; +} + +static void +v3d_rewrite_csd_job_wg_counts_from_indirect(int fd, + struct drm_v3d_extension *ext, + struct drm_v3d_submit_cpu *args) +{ + struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd); + struct drm_v3d_indirect_csd *indirect_csd = (struct drm_v3d_indirect_csd *) ext; + uint32_t *bo_handles = (uint32_t *)(uintptr_t)args->bo_handles; + + assert(args->bo_handle_count == 1); + struct v3d_simulator_bo *bo = v3d_get_simulator_bo(file, bo_handles[0]); + struct v3d_simulator_bo *indirect = v3d_get_simulator_bo(file, indirect_csd->indirect); + struct drm_v3d_submit_csd *submit = &indirect_csd->submit; + + uint32_t *wg_counts = (uint32_t *) (bo->gem_vaddr + indirect_csd->offset); + + if (wg_counts[0] == 0 || wg_counts[1] == 0 
|| wg_counts[2] == 0) + return; + + submit->cfg[0] = wg_counts[0] << V3D_CSD_CFG012_WG_COUNT_SHIFT; + submit->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT; + submit->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT; + submit->cfg[4] = DIV_ROUND_UP(indirect_csd->wg_size, 16) * + (wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1; + + for (int i = 0; i < 3; i++) { + /* 0xffffffff indicates that the uniform rewrite is not needed */ + if (indirect_csd->wg_uniform_offsets[i] != 0xffffffff) { + uint32_t uniform_idx = indirect_csd->wg_uniform_offsets[i]; + ((uint32_t *) indirect->gem_vaddr)[uniform_idx] = wg_counts[i]; + } + } + + v3d_simulator_submit_csd_ioctl(fd, submit); +} + +static void +v3d_timestamp_query(int fd, + struct drm_v3d_extension *ext, + struct drm_v3d_submit_cpu *args) +{ + struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd); + struct drm_v3d_timestamp_query *timestamp_query = (struct drm_v3d_timestamp_query *) ext; + uint32_t *bo_handles = (uint32_t *)(uintptr_t)args->bo_handles; + struct v3d_simulator_bo *bo = v3d_get_simulator_bo(file, bo_handles[0]); + uint32_t *offsets = (void *)(uintptr_t) timestamp_query->offsets; + uint32_t *syncs = (void *)(uintptr_t) timestamp_query->syncs; + + struct timespec t; + clock_gettime(CLOCK_MONOTONIC, &t); + + for (uint32_t i = 0; i < timestamp_query->count; i++) { + uint64_t value = (i == 0) ? t.tv_sec * 1000000000ull + t.tv_nsec : 0ull; + v3d_hw_write_mem(sim_state.v3d, bo->sim_addr + offsets[i], &value, sizeof(value)); + } + + drmSyncobjSignal(fd, syncs, timestamp_query->count); +} + +static void +v3d_reset_timestamp_queries(int fd, + struct drm_v3d_extension *ext, + struct drm_v3d_submit_cpu *args) +{ + struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd); + struct drm_v3d_reset_timestamp_query *reset = (struct drm_v3d_reset_timestamp_query *) ext; + uint32_t *bo_handles = (uint32_t *)(uintptr_t)args->bo_handles; + struct v3d_simulator_bo *bo = v3d_get_simulator_bo(file, bo_handles[0]); + uint32_t *syncs = (void *)(uintptr_t) reset->syncs; + + v3d_hw_set_mem(sim_state.v3d, bo->sim_addr + reset->offset, 0, reset->count); + + drmSyncobjReset(fd, syncs, reset->count); +} + +static void +write_to_buffer(void *dst, uint32_t idx, bool do_64bit, uint64_t value) +{ + if (do_64bit) { + uint64_t *dst64 = (uint64_t *) dst; + dst64[idx] = value; + } else { + uint32_t *dst32 = (uint32_t *) dst; + dst32[idx] = (uint32_t) value; + } +} + +static void +v3d_copy_query_results(int fd, + struct drm_v3d_extension *ext, + struct drm_v3d_submit_cpu *args) +{ + struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd); + struct drm_v3d_copy_timestamp_query *copy = (struct drm_v3d_copy_timestamp_query *) ext; + uint32_t *bo_handles = (uint32_t *)(uintptr_t)args->bo_handles; + struct v3d_simulator_bo *bo = v3d_get_simulator_bo(file, bo_handles[0]); + struct v3d_simulator_bo *timestamp = v3d_get_simulator_bo(file, bo_handles[1]); + uint32_t *offsets = (void *)(uintptr_t) copy->offsets; + uint32_t *syncs = (void *)(uintptr_t) copy->syncs; + bool available, write_result; + uint8_t *data = malloc(copy->count * copy->stride); + uint64_t query_val; + + uint8_t *p = data; + for (uint32_t i = 0; i < copy->count; i++) { + available = (drmSyncobjWait(fd, &syncs[i], 1, 0, 0, NULL) == 0); + + write_result = available || copy->do_partial; + if (write_result) { + v3d_hw_read_mem(sim_state.v3d, &query_val, timestamp->sim_addr + offsets[i], sizeof(uint64_t)); + write_to_buffer(p, 0, copy->do_64bit, query_val); + } + + if 
(copy->availability_bit) + write_to_buffer(p, 1, copy->do_64bit, available ? 1u : 0u); + + p += copy->stride; + } + + v3d_hw_write_mem(sim_state.v3d, bo->sim_addr + copy->offset, data, copy->count * copy->stride); + free(data); +} + +static void +v3d_reset_performance_queries(int fd, + struct drm_v3d_extension *ext, + struct drm_v3d_submit_cpu *args) +{ + struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd); + struct drm_v3d_reset_performance_query *reset = (struct drm_v3d_reset_performance_query *) ext; + uint64_t *kperfmon_ids = (void *)(uintptr_t) reset->kperfmon_ids; + uint32_t *syncs = (void *)(uintptr_t) reset->syncs; + struct v3d_simulator_perfmon *perfmon; + + for (uint32_t i = 0; i < reset->count; i++) { + uint32_t *ids = (void *)(uintptr_t) kperfmon_ids[i]; + + for (uint32_t j = 0; j < reset->nperfmons; j++) { + mtx_lock(&sim_state.submit_lock); + + /* Stop the perfmon if it is still active */ + if (ids[j] == file->active_perfid) + v3d_simulator_perfmon_switch(fd, 0); + + mtx_unlock(&sim_state.submit_lock); + + perfmon = v3d_get_simulator_perfmon(fd, ids[j]); + + if (!perfmon) + return; + + memset(perfmon->values, 0, perfmon->ncounters * sizeof(uint64_t)); + } + } + + drmSyncobjReset(fd, syncs, reset->count); +} + +static void +v3d_write_performance_query_result(int fd, + struct drm_v3d_copy_performance_query *copy, + uint32_t *kperfmon_ids, + void *data) +{ + struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd); + struct v3d_simulator_perfmon *perfmon; + uint64_t counter_values[sim_state.perfcnt_total]; + + for (uint32_t i = 0; i < copy->nperfmons; i++) { + mtx_lock(&sim_state.submit_lock); + + /* Stop the perfmon if it is still active */ + if (kperfmon_ids[i] == file->active_perfid) + v3d_simulator_perfmon_switch(fd, 0); + + mtx_unlock(&sim_state.submit_lock); + + perfmon = v3d_get_simulator_perfmon(fd, kperfmon_ids[i]); + + if (!perfmon) + return; + + memcpy(&counter_values[i * DRM_V3D_MAX_PERF_COUNTERS], perfmon->values, + perfmon->ncounters * sizeof(uint64_t)); + } + + for (uint32_t i = 0; i < copy->ncounters; i++) + write_to_buffer(data, i, copy->do_64bit, counter_values[i]); +} + +static void +v3d_copy_performance_query(int fd, + struct drm_v3d_extension *ext, + struct drm_v3d_submit_cpu *args) +{ + struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd); + struct drm_v3d_copy_performance_query *copy = (struct drm_v3d_copy_performance_query *) ext; + uint32_t *bo_handles = (uint32_t *)(uintptr_t)args->bo_handles; + struct v3d_simulator_bo *bo = v3d_get_simulator_bo(file, bo_handles[0]); + uint64_t *kperfmon_ids = (void *)(uintptr_t) copy->kperfmon_ids; + uint32_t *syncs = (void *)(uintptr_t) copy->syncs; + bool available, write_result; + uint8_t *data = malloc(copy->count * copy->stride); + + uint8_t *p = data; + for (uint32_t i = 0; i < copy->count; i++) { + /* Although we don't have in_syncs implemented in the simulator, + * we don't need to wait for the availability of the syncobjs, + * as they are signaled by CL and CSD jobs, which are serialized + * by the simulator. + */ + available = (drmSyncobjWait(fd, &syncs[i], 1, 0, 0, NULL) == 0); + + write_result = available || copy->do_partial; + if (write_result) { + v3d_write_performance_query_result(fd, copy, + (void *)(uintptr_t) kperfmon_ids[i], + p); + } + + if (copy->availability_bit) { + write_to_buffer(p, copy->ncounters, copy->do_64bit, + available ? 
1u : 0u);
+                }
+
+                p += copy->stride;
+        }
+
+        v3d_hw_write_mem(sim_state.v3d, bo->sim_addr + copy->offset, data, copy->count * copy->stride);
+        free(data);
+}
+
+static int
+v3d_simulator_submit_cpu_ioctl(int fd, struct drm_v3d_submit_cpu *args)
+{
+        struct drm_v3d_extension *ext = (void *)(uintptr_t)args->extensions;
+        struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd);
+        uint32_t *bo_handles = (uint32_t *)(uintptr_t)args->bo_handles;
+        int ret = 0;
+
+        for (int i = 0; i < args->bo_handle_count; i++)
+                v3d_simulator_copy_in_handle(file, bo_handles[i]);
+
+        while (ext) {
+                switch (ext->id) {
+                case DRM_V3D_EXT_ID_MULTI_SYNC:
+                        /* As the simulator serializes the jobs, we don't need
+                         * to handle the in_syncs here. The out_syncs are handled
+                         * at the end of the ioctl in v3d_simulator_process_post_deps().
+                         */
+                        break;
+                case DRM_V3D_EXT_ID_CPU_INDIRECT_CSD:
+                        v3d_rewrite_csd_job_wg_counts_from_indirect(fd, ext, args);
+                        break;
+                case DRM_V3D_EXT_ID_CPU_TIMESTAMP_QUERY:
+                        v3d_timestamp_query(fd, ext, args);
+                        break;
+                case DRM_V3D_EXT_ID_CPU_RESET_TIMESTAMP_QUERY:
+                        v3d_reset_timestamp_queries(fd, ext, args);
+                        break;
+                case DRM_V3D_EXT_ID_CPU_COPY_TIMESTAMP_QUERY:
+                        v3d_copy_query_results(fd, ext, args);
+                        break;
+                case DRM_V3D_EXT_ID_CPU_RESET_PERFORMANCE_QUERY:
+                        v3d_reset_performance_queries(fd, ext, args);
+                        break;
+                case DRM_V3D_EXT_ID_CPU_COPY_PERFORMANCE_QUERY:
+                        v3d_copy_performance_query(fd, ext, args);
+                        break;
+                default:
+                        fprintf(stderr, "Unknown CPU job 0x%08x\n", (int)ext->id);
+                        break;
+                }
+
+                ext = (void *)(uintptr_t) ext->next;
+        }
+
+        for (int i = 0; i < args->bo_handle_count; i++)
+                v3d_simulator_copy_out_handle(file, bo_handles[i]);
+
+        if (ret < 0)
+                return ret;
+
+        if (args->flags & DRM_V3D_SUBMIT_EXTENSION) {
+                ext = (void *)(uintptr_t)args->extensions;
+                ret = v3d_simulator_process_post_deps(fd, ext);
+        }
+
         return ret;
 }
 
@@ -631,7 +993,7 @@ v3d_simulator_perfmon_create_ioctl(int fd, struct drm_v3d_perfmon_create *args)
         perfmon->ncounters = args->ncounters;
 
         for (int i = 0; i < args->ncounters; i++) {
-                if (args->counters[i] >= V3D_PERFCNT_NUM) {
+                if (args->counters[i] >= sim_state.perfcnt_total) {
                         ralloc_free(perfmon);
                         return -EINVAL;
                 } else {
@@ -639,10 +1001,10 @@ v3d_simulator_perfmon_create_ioctl(int fd, struct drm_v3d_perfmon_create *args)
                 }
         }
 
-        mtx_lock(&sim_state.mutex);
+        simple_mtx_lock(&sim_state.mutex);
         args->id = perfmons_next_id(file);
         file->perfmons[args->id - 1] = perfmon;
-        mtx_unlock(&sim_state.mutex);
+        simple_mtx_unlock(&sim_state.mutex);
 
         return 0;
 }
@@ -657,9 +1019,9 @@ v3d_simulator_perfmon_destroy_ioctl(int fd, struct drm_v3d_perfmon_destroy *args
 
         if (!perfmon)
                 return -EINVAL;
 
-        mtx_lock(&sim_state.mutex);
+        simple_mtx_lock(&sim_state.mutex);
         file->perfmons[args->id - 1] = NULL;
-        mtx_unlock(&sim_state.mutex);
+        simple_mtx_unlock(&sim_state.mutex);
 
         ralloc_free(perfmon);
 
@@ -712,7 +1074,7 @@ v3d_simulator_ioctl(int fd, unsigned long request, void *args)
                 return 0;
 
         case DRM_IOCTL_V3D_GET_PARAM:
-                return v3d_simulator_get_param_ioctl(fd, args);
+                return v3d_X_simulator(get_param_ioctl)(sim_state.v3d, args);
 
         case DRM_IOCTL_GEM_CLOSE:
                 return v3d_simulator_gem_close_ioctl(fd, args);
@@ -723,6 +1085,9 @@ v3d_simulator_ioctl(int fd, unsigned long request, void *args)
         case DRM_IOCTL_V3D_SUBMIT_CSD:
                 return v3d_simulator_submit_csd_ioctl(fd, args);
 
+        case DRM_IOCTL_V3D_SUBMIT_CPU:
+                return v3d_simulator_submit_cpu_ioctl(fd, args);
+
         case DRM_IOCTL_V3D_PERFMON_CREATE:
                 return v3d_simulator_perfmon_create_ioctl(fd, args);
 
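Both query-copy helpers share one result layout: count slots of stride bytes each, with the 64-bit query value first and an optional availability word behind it, so the staging buffer and the final write back to the BO must both span count * stride bytes. A sketch of that layout with invented helper names (not driver API):

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>
    #include <stdio.h>

    /* Pack `count` query results into a staging buffer of count * stride
     * bytes, mirroring the malloc/stride walk in the copy helpers above. */
    static uint8_t *
    demo_pack_results(const uint64_t *values, const uint8_t *available,
                      uint32_t count, uint32_t stride, int availability_bit)
    {
            uint8_t *data = malloc((size_t)count * stride);
            uint8_t *p = data;

            if (!data)
                    return NULL;

            for (uint32_t i = 0; i < count; i++) {
                    /* Value first, optional availability word after it. */
                    memcpy(p, &values[i], sizeof(values[i]));
                    if (availability_bit) {
                            uint64_t avail = available[i];
                            memcpy(p + sizeof(values[i]), &avail, sizeof(avail));
                    }
                    p += stride;
            }
            return data;
    }

    int main(void)
    {
            uint64_t values[2] = { 123, 456 };
            uint8_t available[2] = { 1, 0 };
            uint8_t *data = demo_pack_results(values, available, 2, 16, 1);
            uint64_t first;

            memcpy(&first, data, sizeof(first));
            printf("first value: %llu\n", (unsigned long long)first);
            free(data);
            return 0;
    }

The stride walk is what lets the same helper serve 32- and 64-bit results and partial copies: only the per-slot writes change, never the slot spacing.

@@ -747,20 +1112,28 @@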
v3d_simulator_get_mem_size(void) return sim_state.mem_size; } +uint32_t +v3d_simulator_get_mem_free(void) +{ + uint32_t total_free = 0; + struct mem_block *p; + for (p = sim_state.heap->next_free; p != sim_state.heap; p = p->next_free) + total_free += p->size; + return total_free; +} + static void v3d_simulator_init_global() { - mtx_lock(&sim_state.mutex); + simple_mtx_lock(&sim_state.mutex); if (sim_state.refcount++) { - mtx_unlock(&sim_state.mutex); + simple_mtx_unlock(&sim_state.mutex); return; } sim_state.v3d = v3d_hw_auto_new(NULL); v3d_hw_alloc_mem(sim_state.v3d, 1024 * 1024 * 1024); - sim_state.mem_base = - v3d_hw_get_mem(sim_state.v3d, &sim_state.mem_size, - &sim_state.mem); + v3d_hw_get_mem(sim_state.v3d, &sim_state.mem_size); /* Allocate from anywhere from 4096 up. We don't allocate at 0, * because for OQs and some other addresses in the HW, 0 means @@ -772,11 +1145,11 @@ v3d_simulator_init_global() * and land there. */ struct mem_block *b = u_mmAllocMem(sim_state.heap, 4096, GMP_ALIGN2, 0); - memset(sim_state.mem + b->ofs - sim_state.mem_base, 0xd0, 4096); + v3d_hw_set_mem(sim_state.v3d, b->ofs, 0xd0, 4096); sim_state.ver = v3d_hw_get_version(sim_state.v3d); - mtx_unlock(&sim_state.mutex); + simple_mtx_unlock(&sim_state.mutex); sim_state.fd_map = _mesa_hash_table_create(NULL, @@ -785,10 +1158,8 @@ v3d_simulator_init_global() util_dynarray_init(&sim_state.bin_oom, NULL); - if (sim_state.ver >= 41) - v3d41_simulator_init_regs(sim_state.v3d); - else - v3d33_simulator_init_regs(sim_state.v3d); + v3d_X_simulator(init_regs)(sim_state.v3d); + v3d_X_simulator(get_perfcnt_total)(&sim_state.perfcnt_total); } struct v3d_simulator_file * @@ -800,7 +1171,11 @@ v3d_simulator_init(int fd) drmVersionPtr version = drmGetVersion(fd); if (version && strncmp(version->name, "i915", version->name_len) == 0) - sim_file->is_i915 = true; + sim_file->gem_type = GEM_I915; + else if (version && strncmp(version->name, "amdgpu", version->name_len) == 0) + sim_file->gem_type = GEM_AMDGPU; + else + sim_file->gem_type = GEM_DUMB; drmFreeVersion(version); sim_file->bo_map = @@ -808,15 +1183,14 @@ v3d_simulator_init(int fd) _mesa_hash_pointer, _mesa_key_pointer_equal); - mtx_lock(&sim_state.mutex); + simple_mtx_lock(&sim_state.mutex); _mesa_hash_table_insert(sim_state.fd_map, int_to_key(fd + 1), sim_file); - mtx_unlock(&sim_state.mutex); + simple_mtx_unlock(&sim_state.mutex); sim_file->gmp = u_mmAllocMem(sim_state.heap, 8096, GMP_ALIGN2, 0); - sim_file->gmp_vaddr = (sim_state.mem + sim_file->gmp->ofs - - sim_state.mem_base); - memset(sim_file->gmp_vaddr, 0, 8096); + sim_file->gmp_addr = sim_file->gmp->ofs; + v3d_hw_set_mem(sim_state.v3d, sim_file->gmp_addr, 0, 8096); return sim_file; } @@ -824,16 +1198,16 @@ v3d_simulator_init(int fd) void v3d_simulator_destroy(struct v3d_simulator_file *sim_file) { - mtx_lock(&sim_state.mutex); + simple_mtx_lock(&sim_state.mutex); if (!--sim_state.refcount) { _mesa_hash_table_destroy(sim_state.fd_map, NULL); util_dynarray_fini(&sim_state.bin_oom); u_mmDestroy(sim_state.heap); - /* No memsetting the struct, because it contains the mutex. */ - sim_state.mem = NULL; + /* No memsetting the sim_state struct, because it contains the + * mutex. 
+         */
         }
-        mtx_unlock(&sim_state.mutex);
         ralloc_free(sim_file);
+        simple_mtx_unlock(&sim_state.mutex);
 }
 
 #endif /* USE_V3D_SIMULATOR */
diff --git a/src/broadcom/simulator/v3d_simulator.h b/src/broadcom/simulator/v3d_simulator.h
index ef6bf44f19f..03575ae8951 100644
--- a/src/broadcom/simulator/v3d_simulator.h
+++ b/src/broadcom/simulator/v3d_simulator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2019 Raspberry Pi
+ * Copyright © 2019 Raspberry Pi Ltd
  * Copyright © 2014-2017 Broadcom
  * Copyright (C) 2012 Rob Clark <robclark@freedesktop.org>
  *
@@ -40,17 +40,35 @@ uint32_t v3d_simulator_get_spill(uint32_t spill_size);
 int v3d_simulator_ioctl(int fd, unsigned long request, void *arg);
 void v3d_simulator_open_from_handle(int fd, int handle, uint32_t size);
 uint32_t v3d_simulator_get_mem_size(void);
+uint32_t v3d_simulator_get_mem_free(void);
 
 #ifdef v3dX
 # include "v3dx_simulator.h"
 #else
-# define v3dX(x) v3d33_##x
+# define v3dX(x) v3d42_##x
 # include "v3dx_simulator.h"
 # undef v3dX
 
-# define v3dX(x) v3d41_##x
+# define v3dX(x) v3d71_##x
 # include "v3dx_simulator.h"
 # undef v3dX
+
 #endif
 
+/* Helper to call simulator ver specific functions */
+#define v3d_X_simulator(thing) ({ \
+        __typeof(&v3d42_simulator_##thing) v3d_X_sim_thing;\
+        switch (sim_state.ver) { \
+        case 42: \
+                v3d_X_sim_thing = &v3d42_simulator_##thing; \
+                break; \
+        case 71: \
+                v3d_X_sim_thing = &v3d71_simulator_##thing; \
+                break; \
+        default: \
+                unreachable("Unsupported hardware generation"); \
+        } \
+        v3d_X_sim_thing; \
+})
+
 #endif
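The v3d_X_simulator() helper above resolves a per-version function at run time using a GNU C statement expression plus __typeof, so every call site stays type-checked against the v3d42_ variant's signature. A self-contained sketch of the same pattern with made-up names (the real macro dispatches on sim_state.ver):

    #include <stdio.h>

    /* Stand-ins for the per-version symbols produced by compiling the
     * same source once per V3D_VERSION. */
    static int demo42_get_ver(void) { return 42; }
    static int demo71_get_ver(void) { return 71; }

    #define demo_X(ver, thing) ({                            \
            __typeof(&demo42_##thing) demo_X_fn;             \
            switch (ver) {                                   \
            case 42: demo_X_fn = &demo42_##thing; break;     \
            default: demo_X_fn = &demo71_##thing; break;     \
            }                                                \
            demo_X_fn;                                       \
    })

    int main(void)
    {
            printf("%d\n", demo_X(42, get_ver)()); /* prints 42 */
            printf("%d\n", demo_X(71, get_ver)()); /* prints 71 */
            return 0;
    }

The statement expression evaluates to the chosen function pointer, so the macro invocation can be called directly, exactly as v3d_X_simulator(get_param_ioctl)(...) is used in v3d_simulator_ioctl().

diff --git a/src/broadcom/simulator/v3d_simulator_wrapper.cpp b/src/broadcom/simulator/v3d_simulator_wrapper.cpp
index 88e439255d3..ef9bec492ee 100644
--- a/src/broadcom/simulator/v3d_simulator_wrapper.cpp
+++ b/src/broadcom/simulator/v3d_simulator_wrapper.cpp
@@ -30,12 +30,6 @@
 #ifdef USE_V3D_SIMULATOR
 
 #include "v3d_simulator_wrapper.h"
-
-#define V3D_TECH_VERSION 3
-#define V3D_REVISION 3
-#define V3D_SUB_REV 0
-#define V3D_HIDDEN_REV 0
-#define V3D_COMPAT_REV 0
 #include "v3d_hw_auto.h"
 
 extern "C" {
@@ -45,13 +39,29 @@ struct v3d_hw *v3d_hw_auto_new(void *in_params)
         return v3d_hw_auto_make_unique().release();
 }
 
+uint64_t v3d_hw_get_mem(const struct v3d_hw *hw, uint64_t *size)
+{
+        uint64_t addr;
+        assert(hw->get_mem(&addr, size));
+        return addr;
+}
+
+void v3d_hw_set_mem(struct v3d_hw *hw, uint64_t addr, uint8_t value, uint64_t size)
+{
+        hw->set_mem(addr, value, size);
+}
+
+void v3d_hw_write_mem(struct v3d_hw *hw, uint64_t addr, const void *p, uint64_t size)
+{
+        hw->write_mem(addr, p, size);
+}
 
-uint32_t v3d_hw_get_mem(const struct v3d_hw *hw, uint32_t *size, void **p)
+void v3d_hw_read_mem(struct v3d_hw *hw, void *p, uint64_t addr, uint64_t size)
 {
-        return hw->get_mem(size, p);
+        hw->read_mem(p, addr, size);
 }
 
-bool v3d_hw_alloc_mem(struct v3d_hw *hw, size_t min_size)
+bool v3d_hw_alloc_mem(struct v3d_hw *hw, uint64_t min_size)
 {
         return hw->alloc_mem(min_size) == V3D_HW_ALLOC_SUCCESS;
 }
 
diff --git a/src/broadcom/simulator/v3d_simulator_wrapper.h b/src/broadcom/simulator/v3d_simulator_wrapper.h
index 05b2a3361ac..7f2be57a3be 100644
--- a/src/broadcom/simulator/v3d_simulator_wrapper.h
+++ b/src/broadcom/simulator/v3d_simulator_wrapper.h
@@ -31,8 +31,11 @@ extern "C" {
 #endif
 
 struct v3d_hw *v3d_hw_auto_new(void *params);
-uint32_t v3d_hw_get_mem(const struct v3d_hw *hw, uint32_t *size, void **p);
-bool v3d_hw_alloc_mem(struct v3d_hw *hw, size_t min_size);
+uint64_t v3d_hw_get_mem(const struct v3d_hw *hw, uint64_t *size);
+void v3d_hw_set_mem(struct v3d_hw *hw, uint64_t addr, uint8_t value, uint64_t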
size); +void v3d_hw_write_mem(struct v3d_hw *hw, uint64_t add, const void *p, uint64_t size); +void v3d_hw_read_mem(struct v3d_hw *hw, void *p, uint64_t addr, uint64_t size); +bool v3d_hw_alloc_mem(struct v3d_hw *hw, uint64_t min_size); uint32_t v3d_hw_read_reg(struct v3d_hw *hw, uint32_t reg); void v3d_hw_write_reg(struct v3d_hw *hw, uint32_t reg, uint32_t val); void v3d_hw_tick(struct v3d_hw *hw); diff --git a/src/broadcom/simulator/v3dx_simulator.c b/src/broadcom/simulator/v3dx_simulator.c index 07bbbe2f8c9..ea682955dca 100644 --- a/src/broadcom/simulator/v3dx_simulator.c +++ b/src/broadcom/simulator/v3dx_simulator.c @@ -40,32 +40,25 @@ #include "v3d_simulator.h" #include "v3d_simulator_wrapper.h" +#include "common/v3d_performance_counters.h" + #include "util/macros.h" #include "util/bitscan.h" #include "drm-uapi/v3d_drm.h" #define HW_REGISTER_RO(x) (x) #define HW_REGISTER_RW(x) (x) -#if V3D_VERSION >= 41 -#include "libs/core/v3d/registers/4.1.35.0/v3d.h" +#if V3D_VERSION == 71 +#include "libs/core/v3d/registers/7.1.7.0/v3d.h" #else -#include "libs/core/v3d/registers/3.3.0.0/v3d.h" +#if V3D_VERSION == 42 +#include "libs/core/v3d/registers/4.2.14.0/v3d.h" +#endif #endif #define V3D_WRITE(reg, val) v3d_hw_write_reg(v3d, reg, val) #define V3D_READ(reg) v3d_hw_read_reg(v3d, reg) -static void -v3d_invalidate_l3(struct v3d_hw *v3d) -{ -#if V3D_VERSION < 40 - uint32_t gca_ctrl = V3D_READ(V3D_GCA_CACHE_CTRL); - - V3D_WRITE(V3D_GCA_CACHE_CTRL, gca_ctrl | V3D_GCA_CACHE_CTRL_FLUSH_SET); - V3D_WRITE(V3D_GCA_CACHE_CTRL, gca_ctrl & ~V3D_GCA_CACHE_CTRL_FLUSH_SET); -#endif -} - /* Invalidates the L2C cache. This is a read-only cache for uniforms and instructions. */ static void v3d_invalidate_l2c(struct v3d_hw *v3d) @@ -150,7 +143,6 @@ v3d_invalidate_slices(struct v3d_hw *v3d) static void v3d_invalidate_caches(struct v3d_hw *v3d) { - v3d_invalidate_l3(v3d); v3d_invalidate_l2c(v3d); v3d_invalidate_l2t(v3d); v3d_invalidate_slices(v3d); @@ -178,38 +170,48 @@ v3d_flush_caches(struct v3d_hw *v3d) v3d_flush_l2t(v3d); } +#if V3D_VERSION < 71 +#define TFU_REG(NAME) V3D_TFU_ ## NAME +#else +#define TFU_REG(NAME) V3D_IFC_ ## NAME +#endif + + int v3dX(simulator_submit_tfu_ioctl)(struct v3d_hw *v3d, struct drm_v3d_submit_tfu *args) { - int last_vtct = V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET; - - V3D_WRITE(V3D_TFU_IIA, args->iia); - V3D_WRITE(V3D_TFU_IIS, args->iis); - V3D_WRITE(V3D_TFU_ICA, args->ica); - V3D_WRITE(V3D_TFU_IUA, args->iua); - V3D_WRITE(V3D_TFU_IOA, args->ioa); - V3D_WRITE(V3D_TFU_IOS, args->ios); - V3D_WRITE(V3D_TFU_COEF0, args->coef[0]); - V3D_WRITE(V3D_TFU_COEF1, args->coef[1]); - V3D_WRITE(V3D_TFU_COEF2, args->coef[2]); - V3D_WRITE(V3D_TFU_COEF3, args->coef[3]); - - V3D_WRITE(V3D_TFU_ICFG, args->icfg); - - while ((V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET) == last_vtct) { + int last_vtct = V3D_READ(TFU_REG(CS)) & TFU_REG(CS_CVTCT_SET); + + V3D_WRITE(TFU_REG(IIA), args->iia); + V3D_WRITE(TFU_REG(IIS), args->iis); + V3D_WRITE(TFU_REG(ICA), args->ica); + V3D_WRITE(TFU_REG(IUA), args->iua); + V3D_WRITE(TFU_REG(IOA), args->ioa); +#if V3D_VERSION >= 71 + V3D_WRITE(TFU_REG(IOC), args->v71.ioc); +#endif + V3D_WRITE(TFU_REG(IOS), args->ios); + V3D_WRITE(TFU_REG(COEF0), args->coef[0]); + V3D_WRITE(TFU_REG(COEF1), args->coef[1]); + V3D_WRITE(TFU_REG(COEF2), args->coef[2]); + V3D_WRITE(TFU_REG(COEF3), args->coef[3]); + + V3D_WRITE(TFU_REG(ICFG), args->icfg); + + while ((V3D_READ(TFU_REG(CS)) & TFU_REG(CS_CVTCT_SET)) == last_vtct) { v3d_hw_tick(v3d); } return 0; } -#if V3D_VERSION >= 41 int 
v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d, struct drm_v3d_submit_csd *args, uint32_t gmp_ofs) { +#if V3D_VERSION >= 42 int last_completed_jobs = (V3D_READ(V3D_CSD_0_STATUS) & V3D_CSD_0_STATUS_NUM_COMPLETED_JOBS_SET); g_gmp_ofs = gmp_ofs; @@ -223,6 +225,9 @@ v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d, V3D_WRITE(V3D_CSD_0_QUEUED_CFG4, args->cfg[4]); V3D_WRITE(V3D_CSD_0_QUEUED_CFG5, args->cfg[5]); V3D_WRITE(V3D_CSD_0_QUEUED_CFG6, args->cfg[6]); +#if V3D_VERSION >= 71 + V3D_WRITE(V3D_CSD_0_QUEUED_CFG7, 0); +#endif /* CFG0 kicks off the job */ V3D_WRITE(V3D_CSD_0_QUEUED_CFG0, args->cfg[0]); @@ -239,15 +244,21 @@ v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d, v3d_flush_caches(v3d); return 0; -} +#else + return -1; #endif +} int v3dX(simulator_get_param_ioctl)(struct v3d_hw *v3d, struct drm_v3d_get_param *args) { static const uint32_t reg_map[] = { +#if V3D_VERSION >= 71 + [DRM_V3D_PARAM_V3D_UIFCFG] = V3D_HUB_CTL_IDENT0, +#else [DRM_V3D_PARAM_V3D_UIFCFG] = V3D_HUB_CTL_UIFCFG, +#endif [DRM_V3D_PARAM_V3D_HUB_IDENT1] = V3D_HUB_CTL_IDENT1, [DRM_V3D_PARAM_V3D_HUB_IDENT2] = V3D_HUB_CTL_IDENT2, [DRM_V3D_PARAM_V3D_HUB_IDENT3] = V3D_HUB_CTL_IDENT3, @@ -261,14 +272,20 @@ v3dX(simulator_get_param_ioctl)(struct v3d_hw *v3d, args->value = 1; return 0; case DRM_V3D_PARAM_SUPPORTS_CSD: - args->value = V3D_VERSION >= 41; + args->value = V3D_VERSION >= 42; return 0; case DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH: args->value = 1; return 0; case DRM_V3D_PARAM_SUPPORTS_PERFMON: - args->value = V3D_VERSION >= 41; + args->value = V3D_VERSION >= 42; return 0; + case DRM_V3D_PARAM_SUPPORTS_MULTISYNC_EXT: + args->value = 1; + return 0; + case DRM_V3D_PARAM_SUPPORTS_CPU_QUEUE: + args->value = 1; + return 0; } if (args->param < ARRAY_SIZE(reg_map) && reg_map[args->param]) { @@ -307,16 +324,17 @@ v3d_isr_core(struct v3d_hw *v3d, return; } +#if V3D_VERSION <= 42 if (core_status & V3D_CTL_0_INT_STS_INT_GMPV_SET) { fprintf(stderr, "GMP violation at 0x%08x\n", V3D_READ(V3D_GMP_VIO_ADDR)); - abort(); } else { fprintf(stderr, "Unexpected ISR with core status 0x%08x\n", core_status); } abort(); +#endif } static void @@ -331,11 +349,10 @@ handle_mmu_interruptions(struct v3d_hw *v3d, return; const char *client = "?"; - uint32_t axi_id = V3D_READ(V3D_MMU_VIO_ID); + uint32_t axi_id = V3D_READ(V3D_MMU0_VIO_ID); uint32_t va_width = 30; -#if V3D_VERSION >= 41 - static const char *const v3d41_axi_ids[] = { + static const char *const v3d42_axi_ids[] = { "L2T", "PTB", "PSE", @@ -347,21 +364,21 @@ handle_mmu_interruptions(struct v3d_hw *v3d, }; axi_id = axi_id >> 5; - if (axi_id < ARRAY_SIZE(v3d41_axi_ids)) - client = v3d41_axi_ids[axi_id]; + if (axi_id < ARRAY_SIZE(v3d42_axi_ids)) + client = v3d42_axi_ids[axi_id]; - uint32_t mmu_debug = V3D_READ(V3D_MMU_DEBUG_INFO); + uint32_t mmu_debug = V3D_READ(V3D_MMU0_DEBUG_INFO); + + va_width += ((mmu_debug & V3D_MMU0_DEBUG_INFO_VA_WIDTH_SET) + >> V3D_MMU0_DEBUG_INFO_VA_WIDTH_LSB); - va_width += ((mmu_debug & V3D_MMU_DEBUG_INFO_VA_WIDTH_SET) - >> V3D_MMU_DEBUG_INFO_VA_WIDTH_LSB); -#endif /* Only the top bits (final number depends on the gen) of the virtual * address are reported in the MMU VIO_ADDR register. 
*/ - uint64_t vio_addr = ((uint64_t)V3D_READ(V3D_MMU_VIO_ADDR) << + uint64_t vio_addr = ((uint64_t)V3D_READ(V3D_MMU0_VIO_ADDR) << (va_width - 32)); - /* Difference with the kernal: here were are going to abort after + /* Difference with the kernel: here were are going to abort after * logging, so we don't bother with some stuff that the kernel does, * like restoring the MMU ctrl bits */ @@ -393,6 +410,18 @@ v3d_isr_hub(struct v3d_hw *v3d) } handle_mmu_interruptions(v3d, hub_status); + +#if V3D_VERSION == 71 + if (hub_status & V3D_HUB_CTL_INT_STS_INT_GMPV_SET) { + fprintf(stderr, "GMP violation at 0x%08x\n", + V3D_READ(V3D_GMP_VIO_ADDR)); + } else { + fprintf(stderr, + "Unexpected ISR with status 0x%08x\n", + hub_status); + } + abort(); +#endif } static void @@ -417,24 +446,15 @@ v3d_isr(uint32_t hub_status) void v3dX(simulator_init_regs)(struct v3d_hw *v3d) { -#if V3D_VERSION == 33 - /* Set OVRTMUOUT to match kernel behavior. - * - * This means that the texture sampler uniform configuration's tmu - * output type field is used, instead of using the hardware default - * behavior based on the texture type. If you want the default - * behavior, you can still put "2" in the indirect texture state's - * output_type field. - */ - V3D_WRITE(V3D_CTL_0_MISCCFG, V3D_CTL_1_MISCCFG_OVRTMUOUT_SET); -#endif - /* FIXME: the kernel captures some additional core interrupts here, * for tracing. Perhaps we should evaluate to do the same here and add * some debug options. */ - uint32_t core_interrupts = (V3D_CTL_0_INT_STS_INT_GMPV_SET | - V3D_CTL_0_INT_STS_INT_OUTOMEM_SET); + uint32_t core_interrupts = V3D_CTL_0_INT_STS_INT_OUTOMEM_SET; +#if V3D_VERSION <= 42 + core_interrupts |= V3D_CTL_0_INT_STS_INT_GMPV_SET; +#endif + V3D_WRITE(V3D_CTL_0_INT_MSK_SET, ~core_interrupts); V3D_WRITE(V3D_CTL_0_INT_MSK_CLR, core_interrupts); @@ -444,6 +464,9 @@ v3dX(simulator_init_regs)(struct v3d_hw *v3d) V3D_HUB_CTL_INT_STS_INT_MMU_CAP_SET | /* CAP exceeded */ V3D_HUB_CTL_INT_STS_INT_TFUC_SET); /* TFU conversion */ +#if V3D_VERSION == 71 + hub_interrupts |= V3D_HUB_CTL_INT_STS_INT_GMPV_SET; +#endif V3D_WRITE(V3D_HUB_CTL_INT_MSK_SET, ~hub_interrupts); V3D_WRITE(V3D_HUB_CTL_INT_MSK_CLR, hub_interrupts); @@ -471,13 +494,11 @@ v3dX(simulator_submit_cl_ioctl)(struct v3d_hw *v3d, V3D_WRITE(V3D_CLE_0_CT0QMA, submit->qma); V3D_WRITE(V3D_CLE_0_CT0QMS, submit->qms); } -#if V3D_VERSION >= 41 if (submit->qts) { V3D_WRITE(V3D_CLE_0_CT0QTS, V3D_CLE_0_CT0QTS_CTQTSEN_SET | submit->qts); } -#endif V3D_WRITE(V3D_CLE_0_CT0QBA, submit->bcl_start); V3D_WRITE(V3D_CLE_0_CT0QEA, submit->bcl_end); @@ -501,20 +522,18 @@ v3dX(simulator_submit_cl_ioctl)(struct v3d_hw *v3d, } } -#if V3D_VERSION >= 41 #define V3D_PCTR_0_PCTR_N(x) (V3D_PCTR_0_PCTR0 + 4 * (x)) #define V3D_PCTR_0_SRC_N(x) (V3D_PCTR_0_SRC_0_3 + 4 * (x)) #define V3D_PCTR_0_SRC_N_SHIFT(x) ((x) * 8) #define V3D_PCTR_0_SRC_N_MASK(x) (BITFIELD_RANGE(V3D_PCTR_0_SRC_N_SHIFT(x), \ - V3D_PCTR_0_SRC_N_SHIFT(x) + 6)) -#endif + V3D_PCTR_0_SRC_N_SHIFT(x) + \ + V3D_PCTR_0_SRC_0_3_PCTRS0_MSB)) void v3dX(simulator_perfmon_start)(struct v3d_hw *v3d, uint32_t ncounters, uint8_t *events) { -#if V3D_VERSION >= 41 int i, j; uint32_t source; uint32_t mask = BITFIELD_RANGE(0, ncounters); @@ -529,21 +548,23 @@ v3dX(simulator_perfmon_start)(struct v3d_hw *v3d, V3D_WRITE(V3D_PCTR_0_CLR, mask); V3D_WRITE(V3D_PCTR_0_OVERFLOW, mask); V3D_WRITE(V3D_PCTR_0_EN, mask); -#endif } void v3dX(simulator_perfmon_stop)(struct v3d_hw *v3d, uint32_t ncounters, uint64_t *values) { -#if V3D_VERSION >= 41 int i; for (i = 0; i < ncounters; 
i++) values[i] += V3D_READ(V3D_PCTR_0_PCTR_N(i)); V3D_WRITE(V3D_PCTR_0_EN, 0); -#endif +} + +void v3dX(simulator_get_perfcnt_total)(uint32_t *count) +{ + *count = ARRAY_SIZE(v3d_performance_counters); } #endif /* USE_V3D_SIMULATOR */ diff --git a/src/broadcom/simulator/v3dx_simulator.h b/src/broadcom/simulator/v3dx_simulator.h index 145ae59c21e..51fc2409d3e 100644 --- a/src/broadcom/simulator/v3dx_simulator.h +++ b/src/broadcom/simulator/v3dx_simulator.h @@ -1,5 +1,5 @@ /* - * Copyright © 2019 Raspberry Pi + * Copyright © 2019 Raspberry Pi Ltd * Copyright © 2014-2017 Broadcom * Copyright (C) 2012 Rob Clark <robclark@freedesktop.org> * @@ -50,3 +50,4 @@ void v3dX(simulator_perfmon_start)(struct v3d_hw *v3d, void v3dX(simulator_perfmon_stop)(struct v3d_hw *v3d, uint32_t ncounters, uint64_t *values); +void v3dX(simulator_get_perfcnt_total)(uint32_t *count); diff --git a/src/broadcom/vulkan/meson.build b/src/broadcom/vulkan/meson.build index 9d2593cf6d2..3f04a4162dc 100644 --- a/src/broadcom/vulkan/meson.build +++ b/src/broadcom/vulkan/meson.build @@ -1,4 +1,4 @@ -# Copyright © 2019 Raspberry Pi +# Copyright © 2019 Raspberry Pi Ltd # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -25,7 +25,9 @@ v3dv_entrypoints = custom_target( command : [ prog_python, '@INPUT0@', '--xml', '@INPUT1@', '--proto', '--weak', '--out-h', '@OUTPUT0@', '--out-c', '@OUTPUT1@', '--prefix', 'v3dv', + '--beta', with_vulkan_beta.to_string(), '--device-prefix', 'ver42', + '--device-prefix', 'ver71', ], depend_files : vk_entrypoints_gen_depend_files, ) @@ -38,6 +40,7 @@ libv3dv_files = files( 'v3dv_debug.h', 'v3dv_descriptor_set.c', 'v3dv_device.c', + 'v3dv_event.c', 'v3dv_formats.c', 'v3dv_image.c', 'v3dv_limits.h', @@ -50,9 +53,8 @@ libv3dv_files = files( 'v3dv_query.c', 'v3dv_queue.c', 'v3dv_uniforms.c', - 'v3dv_util.c', 'v3dv_wsi.c', -) +) + [v3d_xml_pack] files_per_version = files( 'v3dvx_cmd_buffer.c', @@ -63,18 +65,16 @@ files_per_version = files( 'v3dvx_pipeline.c', 'v3dvx_meta_common.c', 'v3dvx_pipeline.c', + 'v3dvx_query.c', 'v3dvx_queue.c', ) -# The vulkan driver only supports version >= 42, which is the version present in -# Rpi4. We need to explicitly set it as we are reusing pieces from the GL v3d -# driver. 
-v3d_versions = ['42'] +v3d_versions = ['42', '71'] v3dv_flags = [] -dep_v3dv3 = dependency('v3dv3', required : false) -if dep_v3dv3.found() +dep_v3d_hw = dependency('v3d_hw', required : false) +if dep_v3d_hw.found() v3dv_flags += '-DUSE_V3D_SIMULATOR' endif @@ -82,31 +82,27 @@ v3dv_deps = [ dep_dl, dep_libdrm, dep_valgrind, - dep_v3dv3, + dep_v3d_hw, idep_nir, idep_nir_headers, idep_vulkan_util, + idep_vulkan_runtime, + idep_vulkan_wsi, ] if with_platform_x11 v3dv_deps += dep_xcb_dri3 - v3dv_flags += [ - '-DVK_USE_PLATFORM_XCB_KHR', - '-DVK_USE_PLATFORM_XLIB_KHR', - ] - libv3dv_files += files('v3dv_wsi_x11.c') endif if with_platform_wayland - v3dv_deps += [dep_wayland_client, dep_wl_protocols] - v3dv_flags += '-DVK_USE_PLATFORM_WAYLAND_KHR' - libv3dv_files += files('v3dv_wsi_wayland.c') + v3dv_deps += dep_wayland_client libv3dv_files += [wayland_drm_client_protocol_h, wayland_drm_protocol_c] endif -if system_has_kms_drm and not with_platform_android - v3dv_flags += '-DVK_USE_PLATFORM_DISPLAY_KHR' - libv3dv_files += files('v3dv_wsi_display.c') +if with_platform_android + v3dv_deps += [dep_android, idep_u_gralloc] + v3dv_flags += '-DVK_USE_PLATFORM_ANDROID_KHR' + libv3dv_files += files('v3dv_android.c') endif per_version_libs = [] @@ -115,8 +111,8 @@ foreach ver : v3d_versions 'v3dv-v' + ver, [files_per_version, v3d_xml_pack, v3dv_entrypoints[0]], include_directories : [ - inc_src, inc_include, inc_gallium, inc_gallium_aux, inc_broadcom, - inc_compiler, inc_util, inc_vulkan_wsi, + inc_src, inc_include, inc_broadcom, + inc_util, ], c_args : [v3dv_flags, '-DV3D_VERSION=' + ver], gnu_symbol_visibility : 'hidden', @@ -128,17 +124,17 @@ libvulkan_broadcom = shared_library( 'vulkan_broadcom', [libv3dv_files, v3dv_entrypoints, sha1_h], include_directories : [ - inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom, inc_compiler, inc_util, inc_vulkan_wsi, + inc_include, inc_src, inc_broadcom, inc_util, ], link_with : [ libbroadcom_cle, libbroadcom_v3d, - libvulkan_wsi, per_version_libs, ], dependencies : v3dv_deps, c_args : v3dv_flags, - link_args : ['-Wl,--build-id=sha1', ld_args_bsymbolic, ld_args_gc_sections], + link_args : [vulkan_icd_link_args, '-Wl,--build-id=sha1', ld_args_bsymbolic, ld_args_gc_sections], + link_depends : vulkan_icd_link_depends, gnu_symbol_visibility : 'hidden', install : true, ) @@ -162,12 +158,31 @@ broadcom_icd = custom_target( output : 'broadcom_icd.@0@.json'.format(host_machine.cpu()), command : [ prog_python, '@INPUT0@', - '--api-version', '1.0', '--xml', '@INPUT1@', + '--api-version', '1.2', '--xml', '@INPUT1@', '--lib-path', join_paths(get_option('prefix'), get_option('libdir'), 'libvulkan_broadcom.so'), '--out', '@OUTPUT@', ], build_by_default : true, install_dir : with_vulkan_icd_dir, + install_tag : 'runtime', install : true, ) + +_dev_icdname = 'broadcom_devenv_icd.@0@.json'.format(host_machine.cpu()) +_dev_icd = custom_target( + 'broadcom_devenv_icd', + input : [vk_icd_gen, vk_api_xml], + output : _dev_icdname, + command : [ + prog_python, '@INPUT0@', + '--api-version', '1.3', '--xml', '@INPUT1@', + '--lib-path', meson.current_build_dir() / 'libvulkan_broadcom.so', + '--out', '@OUTPUT@', + ], + build_by_default : true, +) + +devenv.append('VK_DRIVER_FILES', _dev_icd.full_path()) +# Deprecated: replaced by VK_DRIVER_FILES above +devenv.append('VK_ICD_FILENAMES', _dev_icd.full_path()) diff --git a/src/broadcom/vulkan/v3dv_android.c b/src/broadcom/vulkan/v3dv_android.c new file mode 100644 index 00000000000..afb691e55d0 --- /dev/null 
+++ b/src/broadcom/vulkan/v3dv_android.c @@ -0,0 +1,544 @@ +/* + * Copyright © 2017, Google Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include "v3dv_private.h" +#include <hardware/gralloc.h> + +#if ANDROID_API_LEVEL >= 26 +#include <hardware/gralloc1.h> +#endif + +#include "drm-uapi/drm_fourcc.h" +#include <hardware/hardware.h> +#include <hardware/hwvulkan.h> + +#include <vulkan/vk_android_native_buffer.h> +#include <vulkan/vk_icd.h> + +#include "vk_android.h" +#include "vk_enum_defines.h" + +#include "util/libsync.h" +#include "util/log.h" +#include "util/os_file.h" + +static int +v3dv_hal_open(const struct hw_module_t *mod, + const char *id, + struct hw_device_t **dev); +static int +v3dv_hal_close(struct hw_device_t *dev); + +static_assert(HWVULKAN_DISPATCH_MAGIC == ICD_LOADER_MAGIC, ""); + +PUBLIC struct hwvulkan_module_t HAL_MODULE_INFO_SYM = { + .common = + { + .tag = HARDWARE_MODULE_TAG, + .module_api_version = HWVULKAN_MODULE_API_VERSION_0_1, + .hal_api_version = HARDWARE_MAKE_API_VERSION(1, 0), + .id = HWVULKAN_HARDWARE_MODULE_ID, + .name = "Broadcom Vulkan HAL", + .author = "Mesa3D", + .methods = + &(hw_module_methods_t) { + .open = v3dv_hal_open, + }, + }, +}; + +/* If any bits in test_mask are set, then unset them and return true. */ +static inline bool +unmask32(uint32_t *inout_mask, uint32_t test_mask) +{ + uint32_t orig_mask = *inout_mask; + *inout_mask &= ~test_mask; + return *inout_mask != orig_mask; +} + +static int +v3dv_hal_open(const struct hw_module_t *mod, + const char *id, + struct hw_device_t **dev) +{ + assert(mod == &HAL_MODULE_INFO_SYM.common); + assert(strcmp(id, HWVULKAN_DEVICE_0) == 0); + + hwvulkan_device_t *hal_dev = malloc(sizeof(*hal_dev)); + if (!hal_dev) + return -1; + + *hal_dev = (hwvulkan_device_t){ + .common = + { + .tag = HARDWARE_DEVICE_TAG, + .version = HWVULKAN_DEVICE_API_VERSION_0_1, + .module = &HAL_MODULE_INFO_SYM.common, + .close = v3dv_hal_close, + }, + .EnumerateInstanceExtensionProperties = + v3dv_EnumerateInstanceExtensionProperties, + .CreateInstance = v3dv_CreateInstance, + .GetInstanceProcAddr = v3dv_GetInstanceProcAddr, + }; + + mesa_logi("v3dv: Warning: Android Vulkan implementation is experimental"); + + *dev = &hal_dev->common; + return 0; +} + +static int +v3dv_hal_close(struct hw_device_t *dev) +{ + /* hwvulkan.h claims that hw_device_t::close() is never called. 
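+    * The Android loader is expected to keep the HAL device open for the
+    * lifetime of the process, so in practice this error return should never
+    * be observed.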
*/ + return -1; +} + +VkResult +v3dv_gralloc_to_drm_explicit_layout(struct u_gralloc *gralloc, + struct u_gralloc_buffer_handle *in_hnd, + VkImageDrmFormatModifierExplicitCreateInfoEXT *out, + VkSubresourceLayout *out_layouts, + int max_planes) +{ + struct u_gralloc_buffer_basic_info info; + + if (u_gralloc_get_buffer_basic_info(gralloc, in_hnd, &info) != 0) + return VK_ERROR_INVALID_EXTERNAL_HANDLE; + + if (info.num_planes > max_planes) + return VK_ERROR_INVALID_EXTERNAL_HANDLE; + + bool is_disjoint = false; + for (int i = 1; i < info.num_planes; i++) { + if (info.offsets[i] == 0) { + is_disjoint = true; + break; + } + } + + if (is_disjoint) { + /* We don't support disjoint planes yet */ + return VK_ERROR_INVALID_EXTERNAL_HANDLE; + } + + memset(out_layouts, 0, sizeof(*out_layouts) * info.num_planes); + memset(out, 0, sizeof(*out)); + + out->sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT; + out->pPlaneLayouts = out_layouts; + + out->drmFormatModifier = info.modifier; + out->drmFormatModifierPlaneCount = info.num_planes; + for (int i = 0; i < info.num_planes; i++) { + out_layouts[i].offset = info.offsets[i]; + out_layouts[i].rowPitch = info.strides[i]; + } + + if (info.drm_fourcc == DRM_FORMAT_YVU420) { + /* Swap the U and V planes to match the VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM */ + VkSubresourceLayout tmp = out_layouts[1]; + out_layouts[1] = out_layouts[2]; + out_layouts[2] = tmp; + } + + return VK_SUCCESS; +} + +VkResult +v3dv_import_native_buffer_fd(VkDevice device_h, + int native_buffer_fd, + const VkAllocationCallbacks *alloc, + VkImage image_h) +{ + VkResult result; + + VkDeviceMemory memory_h; + + const VkMemoryDedicatedAllocateInfo ded_alloc = { + .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO, + .pNext = NULL, + .buffer = VK_NULL_HANDLE, + .image = image_h + }; + + const VkImportMemoryFdInfoKHR import_info = { + .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR, + .pNext = &ded_alloc, + .handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT, + .fd = os_dupfd_cloexec(native_buffer_fd), + }; + + result = + v3dv_AllocateMemory(device_h, + &(VkMemoryAllocateInfo) { + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .pNext = &import_info, + .allocationSize = lseek(native_buffer_fd, 0, SEEK_END), + .memoryTypeIndex = 0, + }, + alloc, &memory_h); + + if (result != VK_SUCCESS) + goto fail_create_image; + + VkBindImageMemoryInfo bind_info = { + .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO, + .image = image_h, + .memory = memory_h, + .memoryOffset = 0, + }; + v3dv_BindImageMemory2(device_h, 1, &bind_info); + + return VK_SUCCESS; + +fail_create_image: + close(import_info.fd); + + return result; +} + +static VkResult +format_supported_with_usage(VkDevice device_h, + VkFormat format, + VkImageUsageFlags imageUsage) +{ + V3DV_FROM_HANDLE(v3dv_device, device, device_h); + struct v3dv_physical_device *phys_dev = device->pdevice; + VkPhysicalDevice phys_dev_h = v3dv_physical_device_to_handle(phys_dev); + VkResult result; + + const VkPhysicalDeviceImageFormatInfo2 image_format_info = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2, + .format = format, + .type = VK_IMAGE_TYPE_2D, + .tiling = VK_IMAGE_TILING_OPTIMAL, + .usage = imageUsage, + }; + + VkImageFormatProperties2 image_format_props = { + .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2, + }; + + /* Check that requested format and usage are supported. 
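+    * We check with VK_IMAGE_TYPE_2D and VK_IMAGE_TILING_OPTIMAL because that
+    * is how gralloc swapchain images will be created; if the driver rejects
+    * that combination we can fail the gralloc usage queries early.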
    */
+   result = v3dv_GetPhysicalDeviceImageFormatProperties2(
+      phys_dev_h, &image_format_info, &image_format_props);
+   if (result != VK_SUCCESS) {
+      return vk_errorf(device, result,
+                       "v3dv_GetPhysicalDeviceImageFormatProperties2 failed "
+                       "inside %s",
+                       __func__);
+   }
+
+   return VK_SUCCESS;
+}
+
+static VkResult
+setup_gralloc0_usage(struct v3dv_device *device,
+                     VkFormat format,
+                     VkImageUsageFlags imageUsage,
+                     int *grallocUsage)
+{
+   if (unmask32(&imageUsage, VK_IMAGE_USAGE_TRANSFER_DST_BIT |
+                             VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT))
+      *grallocUsage |= GRALLOC_USAGE_HW_RENDER;
+
+   if (unmask32(&imageUsage, VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
+                             VK_IMAGE_USAGE_SAMPLED_BIT |
+                             VK_IMAGE_USAGE_STORAGE_BIT |
+                             VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT))
+      *grallocUsage |= GRALLOC_USAGE_HW_TEXTURE;
+
+   /* All VkImageUsageFlags not explicitly checked here are unsupported for
+    * gralloc swapchains.
+    */
+   if (imageUsage != 0) {
+      return vk_errorf(device, VK_ERROR_FORMAT_NOT_SUPPORTED,
+                       "unsupported VkImageUsageFlags(0x%x) for gralloc "
+                       "swapchain",
+                       imageUsage);
+   }
+
+   /* Swapchains assume direct display, so enable the COMPOSER flag. If the
+    * format is not supported by the display controller, gralloc will drop
+    * this flag and still allocate the buffer in VRAM.
+    */
+   *grallocUsage |= GRALLOC_USAGE_HW_COMPOSER;
+
+   if (*grallocUsage == 0)
+      return VK_ERROR_FORMAT_NOT_SUPPORTED;
+
+   return VK_SUCCESS;
+}
+
+VKAPI_ATTR VkResult VKAPI_CALL
+v3dv_GetSwapchainGrallocUsageANDROID(VkDevice device_h,
+                                     VkFormat format,
+                                     VkImageUsageFlags imageUsage,
+                                     int *grallocUsage)
+{
+   V3DV_FROM_HANDLE(v3dv_device, device, device_h);
+   VkResult result;
+
+   result = format_supported_with_usage(device_h, format, imageUsage);
+   if (result != VK_SUCCESS)
+      return result;
+
+   *grallocUsage = 0;
+   return setup_gralloc0_usage(device, format, imageUsage, grallocUsage);
+}
+
+#if ANDROID_API_LEVEL >= 26
+VKAPI_ATTR VkResult VKAPI_CALL
+v3dv_GetSwapchainGrallocUsage2ANDROID(
+   VkDevice device_h,
+   VkFormat format,
+   VkImageUsageFlags imageUsage,
+   VkSwapchainImageUsageFlagsANDROID swapchainImageUsage,
+   uint64_t *grallocConsumerUsage,
+   uint64_t *grallocProducerUsage)
+{
+   V3DV_FROM_HANDLE(v3dv_device, device, device_h);
+   VkResult result;
+
+   *grallocConsumerUsage = 0;
+   *grallocProducerUsage = 0;
+   mesa_logd("%s: format=%d, usage=0x%x", __func__, format, imageUsage);
+
+   result = format_supported_with_usage(device_h, format, imageUsage);
+   if (result != VK_SUCCESS)
+      return result;
+
+   int32_t grallocUsage = 0;
+   result = setup_gralloc0_usage(device, format, imageUsage, &grallocUsage);
+   if (result != VK_SUCCESS)
+      return result;
+
+   /* Set up gralloc1 usage flags from the gralloc0 flags.
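+    * Note the mapping below is one-to-many: HW_RENDER maps to a producer
+    * usage while HW_TEXTURE and HW_COMPOSER map to consumer usages, with
+    * HW_COMPOSER covering both GPU and hardware composition.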
*/ + + if (grallocUsage & GRALLOC_USAGE_HW_RENDER) { + *grallocProducerUsage |= GRALLOC1_PRODUCER_USAGE_GPU_RENDER_TARGET; + } + + if (grallocUsage & GRALLOC_USAGE_HW_TEXTURE) { + *grallocConsumerUsage |= GRALLOC1_CONSUMER_USAGE_GPU_TEXTURE; + } + + if (grallocUsage & GRALLOC_USAGE_HW_COMPOSER) { + /* GPU composing case */ + *grallocConsumerUsage |= GRALLOC1_CONSUMER_USAGE_GPU_TEXTURE; + /* Hardware composing case */ + *grallocConsumerUsage |= GRALLOC1_CONSUMER_USAGE_HWCOMPOSER; + } + + if (swapchainImageUsage & VK_SWAPCHAIN_IMAGE_USAGE_SHARED_BIT_ANDROID) { + uint64_t front_rendering_usage = 0; + u_gralloc_get_front_rendering_usage(device->gralloc, &front_rendering_usage); + *grallocProducerUsage |= front_rendering_usage; + } + + return VK_SUCCESS; +} +#endif + +/* ----------------------------- AHardwareBuffer --------------------------- */ + +static VkResult +get_ahb_buffer_format_properties2(VkDevice device_h, const struct AHardwareBuffer *buffer, + VkAndroidHardwareBufferFormatProperties2ANDROID *pProperties) +{ + V3DV_FROM_HANDLE(v3dv_device, device, device_h); + + /* Get a description of buffer contents . */ + AHardwareBuffer_Desc desc; + AHardwareBuffer_describe(buffer, &desc); + + /* Verify description. */ + const uint64_t gpu_usage = AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE | + AHARDWAREBUFFER_USAGE_GPU_COLOR_OUTPUT | + AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER; + + /* "Buffer must be a valid Android hardware buffer object with at least + * one of the AHARDWAREBUFFER_USAGE_GPU_* usage flags." + */ + if (!(desc.usage & (gpu_usage))) + return VK_ERROR_INVALID_EXTERNAL_HANDLE; + + /* Fill properties fields based on description. */ + VkAndroidHardwareBufferFormatProperties2ANDROID *p = pProperties; + + p->samplerYcbcrConversionComponents.r = VK_COMPONENT_SWIZZLE_IDENTITY; + p->samplerYcbcrConversionComponents.g = VK_COMPONENT_SWIZZLE_IDENTITY; + p->samplerYcbcrConversionComponents.b = VK_COMPONENT_SWIZZLE_IDENTITY; + p->samplerYcbcrConversionComponents.a = VK_COMPONENT_SWIZZLE_IDENTITY; + + p->suggestedYcbcrModel = VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_601; + p->suggestedYcbcrRange = VK_SAMPLER_YCBCR_RANGE_ITU_FULL; + + p->suggestedXChromaOffset = VK_CHROMA_LOCATION_MIDPOINT; + p->suggestedYChromaOffset = VK_CHROMA_LOCATION_MIDPOINT; + + VkFormatProperties2 format_properties = {.sType = VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2}; + + p->format = vk_ahb_format_to_image_format(desc.format); + + VkFormat external_format = p->format; + + if (p->format != VK_FORMAT_UNDEFINED) + goto finish; + + /* External format only case + * + * From vkGetAndroidHardwareBufferPropertiesANDROID spec: + * "If the Android hardware buffer has one of the formats listed in the Format + * Equivalence table (see spec.), then format must have the equivalent Vulkan + * format listed in the table. Otherwise, format may be VK_FORMAT_UNDEFINED, + * indicating the Android hardware buffer can only be used with an external format." + * + * From SKIA source code analysis: p->format MUST be VK_FORMAT_UNDEFINED, if the + * format is not in the Equivalence table. 
+    */
+
+   struct u_gralloc_buffer_handle gr_handle = {
+      .handle = AHardwareBuffer_getNativeHandle(buffer),
+      .pixel_stride = desc.stride,
+      .hal_format = desc.format,
+   };
+
+   struct u_gralloc_buffer_basic_info info;
+
+   if (u_gralloc_get_buffer_basic_info(device->gralloc, &gr_handle, &info) != 0)
+      return VK_ERROR_INVALID_EXTERNAL_HANDLE;
+
+   switch (info.drm_fourcc) {
+   case DRM_FORMAT_YVU420:
+      /* Assuming that U and V planes are swapped earlier */
+      external_format = VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM;
+      break;
+   case DRM_FORMAT_NV12:
+      external_format = VK_FORMAT_G8_B8R8_2PLANE_420_UNORM;
+      break;
+   default:
+      mesa_loge("Unsupported external DRM format: %d", info.drm_fourcc);
+      return VK_ERROR_INVALID_EXTERNAL_HANDLE;
+   }
+
+   struct u_gralloc_buffer_color_info color_info;
+   if (u_gralloc_get_buffer_color_info(device->gralloc, &gr_handle, &color_info) == 0) {
+      switch (color_info.yuv_color_space) {
+      case __DRI_YUV_COLOR_SPACE_ITU_REC601:
+         p->suggestedYcbcrModel = VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_601;
+         break;
+      case __DRI_YUV_COLOR_SPACE_ITU_REC709:
+         p->suggestedYcbcrModel = VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_709;
+         break;
+      case __DRI_YUV_COLOR_SPACE_ITU_REC2020:
+         p->suggestedYcbcrModel = VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_2020;
+         break;
+      default:
+         break;
+      }
+
+      p->suggestedYcbcrRange = (color_info.sample_range == __DRI_YUV_NARROW_RANGE) ?
+         VK_SAMPLER_YCBCR_RANGE_ITU_NARROW : VK_SAMPLER_YCBCR_RANGE_ITU_FULL;
+      p->suggestedXChromaOffset = (color_info.horizontal_siting == __DRI_YUV_CHROMA_SITING_0_5) ?
+         VK_CHROMA_LOCATION_MIDPOINT : VK_CHROMA_LOCATION_COSITED_EVEN;
+      p->suggestedYChromaOffset = (color_info.vertical_siting == __DRI_YUV_CHROMA_SITING_0_5) ?
+         VK_CHROMA_LOCATION_MIDPOINT : VK_CHROMA_LOCATION_COSITED_EVEN;
+   }
+
+finish:
+
+   v3dv_GetPhysicalDeviceFormatProperties2(v3dv_physical_device_to_handle(device->pdevice),
+                                           external_format, &format_properties);
+
+   /* v3dv doesn't support direct sampling from linear images but has logic to
+    * copy from linear to tiled images implicitly before sampling. Therefore
+    * expose optimal features for both linear and optimal tiling.
+    */
+   p->formatFeatures = format_properties.formatProperties.optimalTilingFeatures;
+   p->externalFormat = external_format;
+
+   /* From the vkGetAndroidHardwareBufferPropertiesANDROID spec:
+    * "The formatFeatures member *must* include
+    * VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT and at least one of
+    * VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT or
+    * VK_FORMAT_FEATURE_2_COSITED_CHROMA_SAMPLES_BIT"
+    */
+   p->formatFeatures |= VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT;
+
+   return VK_SUCCESS;
+}
+
+VkResult
+v3dv_GetAndroidHardwareBufferPropertiesANDROID(VkDevice device_h,
+                                               const struct AHardwareBuffer *buffer,
+                                               VkAndroidHardwareBufferPropertiesANDROID *pProperties)
+{
+   V3DV_FROM_HANDLE(v3dv_device, dev, device_h);
+   struct v3dv_physical_device *pdevice = dev->pdevice;
+
+   VkResult result;
+
+   VkAndroidHardwareBufferFormatPropertiesANDROID *format_prop =
+      vk_find_struct(pProperties->pNext, ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_ANDROID);
+
+   /* Fill format properties of an Android hardware buffer.
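+    * The v1 struct is filled from the result of the v2 query below so that
+    * both pNext variants share the same logic.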
+    */
+   if (format_prop) {
+      VkAndroidHardwareBufferFormatProperties2ANDROID format_prop2 = {
+         .sType = VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_2_ANDROID,
+      };
+      result = get_ahb_buffer_format_properties2(device_h, buffer, &format_prop2);
+      if (result != VK_SUCCESS)
+         return result;
+
+      format_prop->format = format_prop2.format;
+      format_prop->externalFormat = format_prop2.externalFormat;
+      format_prop->formatFeatures =
+         vk_format_features2_to_features(format_prop2.formatFeatures);
+      format_prop->samplerYcbcrConversionComponents =
+         format_prop2.samplerYcbcrConversionComponents;
+      format_prop->suggestedYcbcrModel = format_prop2.suggestedYcbcrModel;
+      format_prop->suggestedYcbcrRange = format_prop2.suggestedYcbcrRange;
+      format_prop->suggestedXChromaOffset = format_prop2.suggestedXChromaOffset;
+      format_prop->suggestedYChromaOffset = format_prop2.suggestedYChromaOffset;
+   }
+
+   VkAndroidHardwareBufferFormatProperties2ANDROID *format_prop2 =
+      vk_find_struct(pProperties->pNext, ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_2_ANDROID);
+   if (format_prop2) {
+      result = get_ahb_buffer_format_properties2(device_h, buffer, format_prop2);
+      if (result != VK_SUCCESS)
+         return result;
+   }
+
+   const native_handle_t *handle = AHardwareBuffer_getNativeHandle(buffer);
+   assert(handle && handle->numFds > 0);
+   pProperties->allocationSize = lseek(handle->data[0], 0, SEEK_END);
+
+   /* All memory types. */
+   pProperties->memoryTypeBits = (1u << pdevice->memory.memoryTypeCount) - 1;
+
+   return VK_SUCCESS;
+}
diff --git a/src/broadcom/vulkan/v3dv_bo.c b/src/broadcom/vulkan/v3dv_bo.c
index 71679ceec27..1b26abec325 100644
--- a/src/broadcom/vulkan/v3dv_bo.c
+++ b/src/broadcom/vulkan/v3dv_bo.c
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2019 Raspberry Pi
+ * Copyright © 2019 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -31,11 +31,12 @@
 /* Default max size of the bo cache, in MB.
 *
- * FIXME: we got this value when testing some apps using the rpi4 with 4GB,
- * but it should depend on the total amount of RAM. But for that we would need
- * to test on real hw with different amount of RAM. Using this value for now.
+ * This value comes from testing different Vulkan applications. Greater values
+ * didn't get any further performance benefit. This looks somewhat small, but
+ * from testing those applications, the main consumers of the bo cache are
+ * the BOs used for the CLs, which are usually small.
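+ * For reference: cl_alloc_bo in v3dv_cl.c rounds CL allocations up to 4 KB
+ * multiples, so a 64 MB cache has room for roughly 16K page-sized CL BOs.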
*/ -#define DEFAULT_MAX_BO_CACHE_SIZE 512 +#define DEFAULT_MAX_BO_CACHE_SIZE 64 /* Discarded to use a V3D_DEBUG for this, as it would mean adding a run-time * check for most of the calls @@ -67,8 +68,8 @@ bo_dump_stats(struct v3dv_device *device) struct timespec time; clock_gettime(CLOCK_MONOTONIC, &time); - fprintf(stderr, " now: %ld\n", - time.tv_sec); + fprintf(stderr, " now: %lld\n", + (long long)time.tv_sec); } if (cache->size_list_size) { @@ -117,8 +118,8 @@ bo_from_cache(struct v3dv_device *device, uint32_t size, const char *name) } bo_remove_from_cache(cache, bo); - bo->name = name; + p_atomic_set(&bo->refcnt, 1); } mtx_unlock(&cache->lock); return bo; @@ -131,28 +132,39 @@ bo_free(struct v3dv_device *device, if (!bo) return true; - if (bo->map) - v3dv_bo_unmap(device, bo); + assert(p_atomic_read(&bo->refcnt) == 0); + assert(bo->map == NULL); + + if (!bo->is_import) { + device->bo_count--; + device->bo_size -= bo->size; + + if (dump_stats) { + fprintf(stderr, "Freed %s%s%dkb:\n", + bo->name ? bo->name : "", + bo->name ? " " : "", + bo->size / 1024); + bo_dump_stats(device); + } + } + + uint32_t handle = bo->handle; + /* Our BO structs are stored in a sparse array in the physical device, + * so we don't want to free the BO pointer, instead we want to reset it + * to 0, to signal that array entry as being free. + * + * We must do the reset before we actually free the BO in the kernel, since + * otherwise there is a chance the application creates another BO in a + * different thread and gets the same array entry, causing a race. + */ + memset(bo, 0, sizeof(*bo)); struct drm_gem_close c; memset(&c, 0, sizeof(c)); - c.handle = bo->handle; + c.handle = handle; int ret = v3dv_ioctl(device->pdevice->render_fd, DRM_IOCTL_GEM_CLOSE, &c); if (ret != 0) - fprintf(stderr, "close object %d: %s\n", bo->handle, strerror(errno)); - - device->bo_count--; - device->bo_size -= bo->size; - - if (dump_stats) { - fprintf(stderr, "Freed %s%s%dkb:\n", - bo->name ? bo->name : "", - bo->name ? 
" " : "", - bo->size / 1024); - bo_dump_stats(device); - } - - vk_free(&device->vk.alloc, bo); + fprintf(stderr, "close object %d: %s\n", handle, strerror(errno)); return ret == 0; } @@ -183,6 +195,7 @@ v3dv_bo_init(struct v3dv_bo *bo, const char *name, bool private) { + p_atomic_set(&bo->refcnt, 1); bo->handle = handle; bo->handle_bit = 1ull << (handle % 64); bo->size = size; @@ -192,9 +205,22 @@ v3dv_bo_init(struct v3dv_bo *bo, bo->name = name; bo->private = private; bo->dumb_handle = -1; + bo->is_import = false; + bo->cl_branch_offset = 0xffffffff; list_inithead(&bo->list_link); } +void +v3dv_bo_init_import(struct v3dv_bo *bo, + uint32_t handle, + uint32_t size, + uint32_t offset, + bool private) +{ + v3dv_bo_init(bo, handle, size, offset, "import", private); + bo->is_import = true; +} + struct v3dv_bo * v3dv_bo_alloc(struct v3dv_device *device, uint32_t size, @@ -218,14 +244,6 @@ v3dv_bo_alloc(struct v3dv_device *device, } } - bo = vk_alloc(&device->vk.alloc, sizeof(struct v3dv_bo), 8, - VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); - - if (!bo) { - fprintf(stderr, "Failed to allocate host memory for BO\n"); - return NULL; - } - retry: ; @@ -244,7 +262,6 @@ v3dv_bo_alloc(struct v3dv_device *device, goto retry; } - vk_free(&device->vk.alloc, bo); fprintf(stderr, "Failed to allocate device memory for BO\n"); return NULL; } @@ -252,6 +269,9 @@ v3dv_bo_alloc(struct v3dv_device *device, assert(create.offset % page_align == 0); assert((create.offset & 0xffffffff) == create.offset); + bo = v3dv_device_lookup_bo(device->pdevice, create.handle); + assert(bo && bo->handle == 0); + v3dv_bo_init(bo, create.handle, size, create.offset, name, private); device->bo_count++; @@ -320,7 +340,7 @@ v3dv_bo_map(struct v3dv_device *device, struct v3dv_bo *bo, uint32_t size) if (!ok) return false; - ok = v3dv_bo_wait(device, bo, PIPE_TIMEOUT_INFINITE); + ok = v3dv_bo_wait(device, bo, OS_TIMEOUT_INFINITE); if (!ok) { fprintf(stderr, "memory wait for map failed\n"); return false; @@ -340,7 +360,7 @@ v3dv_bo_unmap(struct v3dv_device *device, struct v3dv_bo *bo) bo->map_size = 0; } -static boolean +static bool reallocate_size_list(struct v3dv_bo_cache *cache, struct v3dv_device *device, uint32_t size) @@ -400,9 +420,11 @@ v3dv_bo_cache_init(struct v3dv_device *device) fprintf(stderr, "MAX BO CACHE SIZE: %iMB\n", device->bo_cache.max_cache_size); } + mtx_lock(&device->bo_cache.lock); device->bo_cache.max_cache_size *= 1024 * 1024; device->bo_cache.cache_count = 0; device->bo_cache.cache_size = 0; + mtx_unlock(&device->bo_cache.lock); } void @@ -455,6 +477,12 @@ v3dv_bo_free(struct v3dv_device *device, if (!bo) return true; + if (!p_atomic_dec_zero(&bo->refcnt)) + return true; + + if (bo->map) + v3dv_bo_unmap(device, bo); + struct timespec time; struct v3dv_bo_cache *cache = &device->bo_cache; uint32_t page_index = bo->size / 4096 - 1; diff --git a/src/broadcom/vulkan/v3dv_bo.h b/src/broadcom/vulkan/v3dv_bo.h index ab2b8c7356d..5e382817b37 100644 --- a/src/broadcom/vulkan/v3dv_bo.h +++ b/src/broadcom/vulkan/v3dv_bo.h @@ -1,5 +1,5 @@ /* - * Copyright © 2019 Raspberry Pi + * Copyright © 2019 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -39,6 +39,11 @@ struct v3dv_bo { const char *name; + /* In a CL where a BRANCH has been emitted, the offset of the BRANCH + * instruction in the BO. + */ + uint32_t cl_branch_offset; + /** Entry in the linked list of buffers freed, by age. 
 */
 struct list_head time_list;
 /** Entry in the per-page-count linked list of buffers freed (by age). */
 struct list_head size_list;
@@ -52,14 +57,20 @@ struct v3dv_bo {
 */
 bool private;
+ /** If this BO has been imported */
+ bool is_import;
+
 /**
 * If this BO was allocated for a swapchain on the display device, the
 * handle of the dumb BO on that device.
 */
 int32_t dumb_handle;
+
+ int32_t refcnt;
 };
 void v3dv_bo_init(struct v3dv_bo *bo, uint32_t handle, uint32_t size, uint32_t offset, const char *name, bool private);
+void v3dv_bo_init_import(struct v3dv_bo *bo, uint32_t handle, uint32_t size, uint32_t offset, bool private);
 struct v3dv_bo *v3dv_bo_alloc(struct v3dv_device *device, uint32_t size, const char *name, bool private);
diff --git a/src/broadcom/vulkan/v3dv_cl.c b/src/broadcom/vulkan/v3dv_cl.c
index ed11f53c4bb..7d414999e9b 100644
--- a/src/broadcom/vulkan/v3dv_cl.c
+++ b/src/broadcom/vulkan/v3dv_cl.c
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2019 Raspberry Pi
+ * Copyright © 2019 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -27,7 +27,7 @@
 * versions, so we just explicitly set the V3D_VERSION and include v3dx_pack
 * here
 */
-#define V3D_VERSION 33
+#define V3D_VERSION 42
 #include "broadcom/common/v3d_macros.h"
 #include "broadcom/cle/v3dx_pack.h"
@@ -58,6 +58,14 @@ v3dv_cl_destroy(struct v3dv_cl *cl)
 static bool
 cl_alloc_bo(struct v3dv_cl *cl, uint32_t space, bool use_branch)
 {
+   /* If we are growing, double the BO allocation size to reduce the number
+    * of allocations with large command buffers. This has a very significant
+    * impact on the number of draw calls per second reported by vkoverhead.
+    */
+   space = align(space, 4096);
+   if (cl->bo)
+      space = MAX2(cl->bo->size * 2, space);
+
    struct v3dv_bo *bo = v3dv_bo_alloc(cl->job->device, space, "CL", true);
    if (!bo) {
       fprintf(stderr, "failed to allocate memory for command list\n");
@@ -76,6 +84,7 @@ cl_alloc_bo(struct v3dv_cl *cl, uint32_t space, bool use_branch)
    /* Chain to the new BO from the old one if requested */
    if (use_branch && cl->bo) {
+      cl->bo->cl_branch_offset = v3dv_cl_offset(cl);
       cl_emit(cl, BRANCH, branch) {
          branch.address = v3dv_cl_address(bo, 0);
       }
@@ -114,14 +123,18 @@ v3dv_cl_ensure_space_with_branch(struct v3dv_cl *cl, uint32_t space)
    * end with a 'return from sub list' command.
    */
    bool needs_return_from_sub_list = false;
-   if (cl->job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY) {
-      if (cl->size > 0) {
+   if (cl->job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE && cl->size > 0)
       needs_return_from_sub_list = true;
-         space += cl_packet_length(RETURN_FROM_SUB_LIST);
-      }
-   } else {
-      space += cl_packet_length(BRANCH);
-   }
+
+   /*
+    * The CLE processor in the simulator tries to read V3D_CL_MAX_INSTR_SIZE
+    * bytes from the CL for each new instruction. If the last instruction in our
+    * CL is smaller than that, and there are not at least V3D_CL_MAX_INSTR_SIZE
+    * bytes until the end of the BO, it will read out of bounds and possibly
+    * cause a GMP violation interrupt to trigger. Ensure we always have at
+    * least that many bytes available to read with the last instruction.
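+    * (V3D_CL_MAX_INSTR_SIZE is the worst-case encoded size of a single CL
+    * instruction, so padding by that amount keeps the prefetch window of the
+    * final instruction inside the BO.)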
+ */ + space += V3D_CL_MAX_INSTR_SIZE; if (v3dv_cl_offset(cl) + space <= cl->size) return; diff --git a/src/broadcom/vulkan/v3dv_cl.h b/src/broadcom/vulkan/v3dv_cl.h index 68d5acd455b..7e17ac395c4 100644 --- a/src/broadcom/vulkan/v3dv_cl.h +++ b/src/broadcom/vulkan/v3dv_cl.h @@ -1,5 +1,5 @@ /* - * Copyright © 2019 Raspberry Pi + * Copyright © 2019 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -26,7 +26,8 @@ #include "broadcom/cle/v3d_packet_helpers.h" -#include "list.h" +#include "util/list.h" +#include "util/macros.h" struct v3dv_bo; struct v3dv_job; @@ -118,6 +119,13 @@ cl_advance(struct v3dv_cl_out **cl, uint32_t n) } static inline void +cl_advance_and_end(struct v3dv_cl *cl, uint32_t n) +{ + cl->next = (struct v3dv_cl_out *)((char *)(cl->next) + n); + assert(v3dv_cl_offset(cl) <= cl->size); +} + +static inline void cl_aligned_u32(struct v3dv_cl_out **cl, uint32_t n) { *(uint32_t *)(*cl) = n; @@ -143,15 +151,9 @@ cl_aligned_reloc(struct v3dv_cl *cl, uint32_t v3dv_cl_ensure_space(struct v3dv_cl *cl, uint32_t space, uint32_t alignment); void v3dv_cl_ensure_space_with_branch(struct v3dv_cl *cl, uint32_t space); -/* We redefine ALIGN as a macro as we want to use cl_aligned_packet_length for - * struct fields - */ -#define ALIGN(value, alignment) \ - (((value) + (alignment) - 1) & ~((alignment) - 1)) - #define cl_packet_header(packet) V3DX(packet ## _header) #define cl_packet_length(packet) V3DX(packet ## _length) -#define cl_aligned_packet_length(packet, alignment) ALIGN(cl_packet_length(packet), alignment) +#define cl_aligned_packet_length(packet, alignment) ALIGN_POT(cl_packet_length(packet), alignment) #define cl_packet_pack(packet) V3DX(packet ## _pack) #define cl_packet_struct(packet) V3DX(packet) @@ -178,8 +180,7 @@ void v3dv_cl_ensure_space_with_branch(struct v3dv_cl *cl, uint32_t space); ({ \ struct v3dv_cl_out *cl_out = cl_start(cl); \ cl_packet_pack(packet)(cl, (uint8_t *)cl_out, &name); \ - cl_advance(&cl_out, cl_packet_length(packet)); \ - cl_end(cl, cl_out); \ + cl_advance_and_end(cl, cl_packet_length(packet)); \ _loop_terminate = NULL; \ })) \ @@ -195,8 +196,7 @@ void v3dv_cl_ensure_space_with_branch(struct v3dv_cl *cl, uint32_t space); cl_packet_pack(packet)(cl, packed, &name); \ for (int _i = 0; _i < cl_packet_length(packet); _i++) \ ((uint8_t *)cl_out)[_i] = packed[_i] | (prepacked)[_i]; \ - cl_advance(&cl_out, cl_packet_length(packet)); \ - cl_end(cl, cl_out); \ + cl_advance_and_end(cl, cl_packet_length(packet)); \ _loop_terminate = NULL; \ })) \ diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c index 0d6c393ee6e..96e83c657e6 100644 --- a/src/broadcom/vulkan/v3dv_cmd_buffer.c +++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c @@ -1,5 +1,5 @@ /* - * Copyright © 2019 Raspberry Pi + * Copyright © 2019 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -21,42 +21,26 @@ * IN THE SOFTWARE. 
*/ +#include "broadcom/common/v3d_csd.h" #include "v3dv_private.h" #include "util/u_pack_color.h" -#include "vk_format_info.h" +#include "vk_common_entrypoints.h" #include "vk_util.h" -const struct v3dv_dynamic_state default_dynamic_state = { - .viewport = { - .count = 0, - }, - .scissor = { - .count = 0, - }, - .stencil_compare_mask = - { - .front = ~0u, - .back = ~0u, - }, - .stencil_write_mask = - { - .front = ~0u, - .back = ~0u, - }, - .stencil_reference = - { - .front = 0u, - .back = 0u, - }, - .blend_constants = { 0.0f, 0.0f, 0.0f, 0.0f }, - .depth_bias = { - .constant_factor = 0.0f, - .depth_bias_clamp = 0.0f, - .slope_factor = 0.0f, - }, - .line_width = 1.0f, - .color_write_enable = (1ull << (4 * V3D_MAX_DRAW_BUFFERS)) - 1, -}; +float +v3dv_get_aa_line_width(struct v3dv_pipeline *pipeline, + struct v3dv_cmd_buffer *buffer) +{ + float width = buffer->vk.dynamic_graphics_state.rs.line.width; + + /* If line smoothing is enabled then we want to add some extra pixels to + * the width in order to have some semi-transparent edges. + */ + if (pipeline->line_smooth) + width = floorf(M_SQRT2 * width) + 3; + + return width; +} void v3dv_job_add_bo(struct v3dv_job *job, struct v3dv_bo *bo) @@ -83,59 +67,22 @@ v3dv_job_add_bo_unchecked(struct v3dv_job *job, struct v3dv_bo *bo) job->bo_handle_mask |= bo->handle_bit; } -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_CreateCommandPool(VkDevice _device, - const VkCommandPoolCreateInfo *pCreateInfo, - const VkAllocationCallbacks *pAllocator, - VkCommandPool *pCmdPool) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - struct v3dv_cmd_pool *pool; - - /* We only support one queue */ - assert(pCreateInfo->queueFamilyIndex == 0); - - pool = vk_object_zalloc(&device->vk, pAllocator, sizeof(*pool), - VK_OBJECT_TYPE_COMMAND_POOL); - if (pool == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - - if (pAllocator) - pool->alloc = *pAllocator; - else - pool->alloc = device->vk.alloc; - - list_inithead(&pool->cmd_buffers); - - *pCmdPool = v3dv_cmd_pool_to_handle(pool); - - return VK_SUCCESS; -} - static void cmd_buffer_init(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_device *device, - struct v3dv_cmd_pool *pool, - VkCommandBufferLevel level) + struct v3dv_device *device) { /* Do not reset the base object! 
If we are calling this from a command * buffer reset that would reset the loader's dispatch table for the * command buffer, and any other relevant info from vk_object_base */ - const uint32_t base_size = sizeof(struct vk_object_base); + const uint32_t base_size = sizeof(struct vk_command_buffer); uint8_t *cmd_buffer_driver_start = ((uint8_t *) cmd_buffer) + base_size; memset(cmd_buffer_driver_start, 0, sizeof(*cmd_buffer) - base_size); cmd_buffer->device = device; - cmd_buffer->pool = pool; - cmd_buffer->level = level; list_inithead(&cmd_buffer->private_objs); list_inithead(&cmd_buffer->jobs); - list_inithead(&cmd_buffer->list_link); - - assert(pool); - list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers); cmd_buffer->state.subpass_idx = -1; cmd_buffer->state.meta.subpass_idx = -1; @@ -144,22 +91,35 @@ cmd_buffer_init(struct v3dv_cmd_buffer *cmd_buffer, } static VkResult -cmd_buffer_create(struct v3dv_device *device, - struct v3dv_cmd_pool *pool, - VkCommandBufferLevel level, - VkCommandBuffer *pCommandBuffer) +cmd_buffer_create(struct vk_command_pool *pool, VkCommandBufferLevel level, + struct vk_command_buffer **cmd_buffer_out) { + struct v3dv_device *device = + container_of(pool->base.device, struct v3dv_device, vk); + struct v3dv_cmd_buffer *cmd_buffer; - cmd_buffer = vk_object_zalloc(&device->vk, - &pool->alloc, - sizeof(*cmd_buffer), - VK_OBJECT_TYPE_COMMAND_BUFFER); + cmd_buffer = vk_zalloc(&pool->alloc, + sizeof(*cmd_buffer), + 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (cmd_buffer == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + /* Here we pass 0 as level because this callback hook doesn't have the level + * info, but that's fine, vk_common_AllocateCommandBuffers will fix it up + * after creation. + */ + VkResult result; + result = vk_command_buffer_init(pool, &cmd_buffer->vk, + &v3dv_cmd_buffer_ops, level); + if (result != VK_SUCCESS) { + vk_free(&pool->alloc, cmd_buffer); + return result; + } - cmd_buffer_init(cmd_buffer, device, pool, level); + cmd_buffer_init(cmd_buffer, device); - *pCommandBuffer = v3dv_cmd_buffer_to_handle(cmd_buffer); + *cmd_buffer_out = &cmd_buffer->vk; return VK_SUCCESS; } @@ -168,7 +128,7 @@ static void job_destroy_gpu_cl_resources(struct v3dv_job *job) { assert(job->type == V3DV_JOB_TYPE_GPU_CL || - job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY); + job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE); v3dv_cl_destroy(&job->bcl); v3dv_cl_destroy(&job->rcl); @@ -189,9 +149,21 @@ job_destroy_cloned_gpu_cl_resources(struct v3dv_job *job) { assert(job->type == V3DV_JOB_TYPE_GPU_CL); - list_for_each_entry_safe(struct v3dv_bo, bo, &job->bcl.bo_list, list_link) { - list_del(&bo->list_link); - vk_free(&job->device->vk.alloc, bo); + struct v3dv_cmd_buffer *cmd_buffer = job->cmd_buffer; + if (job->clone_owns_bcl) { + /* For suspending jobs in command buffers with the simultaneous use flag + * we allocate a real copy of the BCL. 
+ */ + assert(job->suspending && + cmd_buffer && + (cmd_buffer->usage_flags & + VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)); + v3dv_cl_destroy(&job->bcl); + } else { + list_for_each_entry_safe(struct v3dv_bo, bo, &job->bcl.bo_list, list_link) { + list_del(&bo->list_link); + vk_free(&job->device->vk.alloc, bo); + } } list_for_each_entry_safe(struct v3dv_bo, bo, &job->rcl.bo_list, list_link) { @@ -219,22 +191,6 @@ job_destroy_gpu_csd_resources(struct v3dv_job *job) v3dv_bo_free(job->device, job->csd.shared_memory); } -static void -job_destroy_cpu_wait_events_resources(struct v3dv_job *job) -{ - assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS); - assert(job->cmd_buffer); - vk_free(&job->cmd_buffer->device->vk.alloc, job->cpu.event_wait.events); -} - -static void -job_destroy_cpu_csd_indirect_resources(struct v3dv_job *job) -{ - assert(job->type == V3DV_JOB_TYPE_CPU_CSD_INDIRECT); - assert(job->cmd_buffer); - v3dv_job_destroy(job->cpu.csd_indirect.csd_job); -} - void v3dv_job_destroy(struct v3dv_job *job) { @@ -249,18 +205,12 @@ v3dv_job_destroy(struct v3dv_job *job) if (!job->is_clone) { switch (job->type) { case V3DV_JOB_TYPE_GPU_CL: - case V3DV_JOB_TYPE_GPU_CL_SECONDARY: + case V3DV_JOB_TYPE_GPU_CL_INCOMPLETE: job_destroy_gpu_cl_resources(job); break; case V3DV_JOB_TYPE_GPU_CSD: job_destroy_gpu_csd_resources(job); break; - case V3DV_JOB_TYPE_CPU_WAIT_EVENTS: - job_destroy_cpu_wait_events_resources(job); - break; - case V3DV_JOB_TYPE_CPU_CSD_INDIRECT: - job_destroy_cpu_csd_indirect_resources(job); - break; default: break; } @@ -316,7 +266,7 @@ cmd_buffer_free_resources(struct v3dv_cmd_buffer *cmd_buffer) v3dv_job_destroy(cmd_buffer->state.job); if (cmd_buffer->state.attachments) - vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments); + vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer->state.attachments); if (cmd_buffer->state.query.end.alloc_count > 0) vk_free(&cmd_buffer->device->vk.alloc, cmd_buffer->state.query.end.states); @@ -333,38 +283,22 @@ cmd_buffer_free_resources(struct v3dv_cmd_buffer *cmd_buffer) assert(cmd_buffer->state.meta.attachment_alloc_count > 0); vk_free(&cmd_buffer->device->vk.alloc, cmd_buffer->state.meta.attachments); } + + v3dv_destroy_dynamic_framebuffer(cmd_buffer); } static void -cmd_buffer_destroy(struct v3dv_cmd_buffer *cmd_buffer) +cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer) { - list_del(&cmd_buffer->pool_link); + struct v3dv_cmd_buffer *cmd_buffer = + container_of(vk_cmd_buffer, struct v3dv_cmd_buffer, vk); + cmd_buffer_free_resources(cmd_buffer); - vk_object_free(&cmd_buffer->device->vk, &cmd_buffer->pool->alloc, cmd_buffer); + vk_command_buffer_finish(&cmd_buffer->vk); + vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer); } static bool -attachment_list_is_subset(struct v3dv_subpass_attachment *l1, uint32_t l1_count, - struct v3dv_subpass_attachment *l2, uint32_t l2_count) -{ - for (uint32_t i = 0; i < l1_count; i++) { - uint32_t attachment_idx = l1[i].attachment; - if (attachment_idx == VK_ATTACHMENT_UNUSED) - continue; - - uint32_t j; - for (j = 0; j < l2_count; j++) { - if (l2[j].attachment == attachment_idx) - break; - } - if (j == l2_count) - return false; - } - - return true; - } - -static bool cmd_buffer_can_merge_subpass(struct v3dv_cmd_buffer *cmd_buffer, uint32_t subpass_idx) { @@ -372,9 +306,9 @@ cmd_buffer_can_merge_subpass(struct v3dv_cmd_buffer *cmd_buffer, assert(state->pass); const struct v3dv_physical_device *physical_device = - &cmd_buffer->device->instance->physicalDevice; + cmd_buffer->device->pdevice; - if 
(cmd_buffer->level != VK_COMMAND_BUFFER_LEVEL_PRIMARY) + if (cmd_buffer->vk.level != VK_COMMAND_BUFFER_LEVEL_PRIMARY) return false; if (!cmd_buffer->state.job) @@ -399,44 +333,37 @@ cmd_buffer_can_merge_subpass(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_subpass *prev_subpass = &state->pass->subpasses[state->subpass_idx]; struct v3dv_subpass *subpass = &state->pass->subpasses[subpass_idx]; - /* Don't merge if the subpasses have different view masks, since in that - * case the framebuffer setup is different and we need to emit different - * RCLs. - */ - if (subpass->view_mask != prev_subpass->view_mask) + if (subpass->ds_attachment.attachment != + prev_subpass->ds_attachment.attachment) return false; - /* Because the list of subpass attachments can include VK_ATTACHMENT_UNUSED, - * we need to check that for each subpass all its used attachments are - * used by the other subpass. - */ - bool compatible = - attachment_list_is_subset(prev_subpass->color_attachments, - prev_subpass->color_count, - subpass->color_attachments, - subpass->color_count); - if (!compatible) + if (subpass->color_count != prev_subpass->color_count) return false; - compatible = - attachment_list_is_subset(subpass->color_attachments, - subpass->color_count, - prev_subpass->color_attachments, - prev_subpass->color_count); - if (!compatible) - return false; + for (uint32_t i = 0; i < subpass->color_count; i++) { + if (subpass->color_attachments[i].attachment != + prev_subpass->color_attachments[i].attachment) { + return false; + } + } - if (subpass->ds_attachment.attachment != - prev_subpass->ds_attachment.attachment) + /* Don't merge if the subpasses have different view masks, since in that + * case the framebuffer setup is different and we need to emit different + * RCLs. + */ + if (subpass->view_mask != prev_subpass->view_mask) return false; /* FIXME: Since some attachment formats can't be resolved using the TLB we * need to emit separate resolve jobs for them and that would not be * compatible with subpass merges. We could fix that by testing if any of - * the attachments to resolve doesn't suppotr TLB resolves. + * the attachments to resolve doesn't support TLB resolves. 
*/ - if (prev_subpass->resolve_attachments || subpass->resolve_attachments) + if (prev_subpass->resolve_attachments || subpass->resolve_attachments || + prev_subpass->resolve_depth || prev_subpass->resolve_stencil || + subpass->resolve_depth || subpass->resolve_stencil) { return false; + } return true; } @@ -452,18 +379,10 @@ job_compute_frame_tiling(struct v3dv_job *job, uint32_t layers, uint32_t render_target_count, uint8_t max_internal_bpp, - bool msaa) -{ - static const uint8_t tile_sizes[] = { - 64, 64, - 64, 32, - 32, 32, - 32, 16, - 16, 16, - 16, 8, - 8, 8 - }; - + uint8_t total_color_bpp, + bool msaa, + bool double_buffer) +{ assert(job); struct v3dv_frame_tiling *tiling = &job->frame_tiling; @@ -472,23 +391,18 @@ job_compute_frame_tiling(struct v3dv_job *job, tiling->layers = layers; tiling->render_target_count = render_target_count; tiling->msaa = msaa; - - uint32_t tile_size_index = 0; - - if (render_target_count > 2) - tile_size_index += 2; - else if (render_target_count > 1) - tile_size_index += 1; - - if (msaa) - tile_size_index += 2; - tiling->internal_bpp = max_internal_bpp; - tile_size_index += tiling->internal_bpp; - assert(tile_size_index < ARRAY_SIZE(tile_sizes) / 2); + tiling->total_color_bpp = total_color_bpp; + tiling->double_buffer = double_buffer; - tiling->tile_width = tile_sizes[tile_size_index * 2]; - tiling->tile_height = tile_sizes[tile_size_index * 2 + 1]; + /* Double-buffer is incompatible with MSAA */ + assert(!tiling->msaa || !tiling->double_buffer); + + v3d_choose_tile_size(&job->device->devinfo, + render_target_count, + max_internal_bpp, total_color_bpp, msaa, + tiling->double_buffer, + &tiling->tile_width, &tiling->tile_height); tiling->draw_tiles_x = DIV_ROUND_UP(width, tiling->tile_width); tiling->draw_tiles_y = DIV_ROUND_UP(height, tiling->tile_height); @@ -516,41 +430,17 @@ job_compute_frame_tiling(struct v3dv_job *job, return tiling; } -void -v3dv_job_start_frame(struct v3dv_job *job, - uint32_t width, - uint32_t height, - uint32_t layers, - bool allocate_tile_state_for_all_layers, - uint32_t render_target_count, - uint8_t max_internal_bpp, - bool msaa) +bool +v3dv_job_allocate_tile_state(struct v3dv_job *job) { - assert(job); - - /* Start by computing frame tiling spec for this job */ - const struct v3dv_frame_tiling *tiling = - job_compute_frame_tiling(job, - width, height, layers, - render_target_count, max_internal_bpp, msaa); - - v3dv_cl_ensure_space_with_branch(&job->bcl, 256); - v3dv_return_if_oom(NULL, job); - - /* We only need to allocate tile state for all layers if the binner - * writes primitives to layers other than the first. This can only be - * done using layered rendering (writing gl_Layer from a geometry shader), - * so for other cases of multilayered framebuffers (typically with - * meta copy/clear operations) that won't use layered rendering, we only - * need one layer worth of of tile state for the binner. - */ - if (!allocate_tile_state_for_all_layers) - layers = 1; + struct v3dv_frame_tiling *tiling = &job->frame_tiling; + const uint32_t layers = + job->allocate_tile_state_for_all_layers ? tiling->layers : 1; /* The PTB will request the tile alloc initial size per tile at start * of tile binning. 
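+    * For example (illustrative numbers): a single-layer 1920x1080 framebuffer
+    * with 64x64 tiles needs 30x17 tiles, so the initial allocation below is
+    * 64 * 1 * 30 * 17 = 32640 bytes, i.e. about 32 KB.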
*/ - uint32_t tile_alloc_size = 64 * tiling->layers * + uint32_t tile_alloc_size = 64 * layers * tiling->draw_tiles_x * tiling->draw_tiles_y; @@ -573,47 +463,127 @@ v3dv_job_start_frame(struct v3dv_job *job, "tile_alloc", true); if (!job->tile_alloc) { v3dv_flag_oom(NULL, job); - return; + return false; } v3dv_job_add_bo_unchecked(job, job->tile_alloc); const uint32_t tsda_per_tile_size = 256; - const uint32_t tile_state_size = tiling->layers * + const uint32_t tile_state_size = layers * tiling->draw_tiles_x * tiling->draw_tiles_y * tsda_per_tile_size; job->tile_state = v3dv_bo_alloc(job->device, tile_state_size, "TSDA", true); if (!job->tile_state) { v3dv_flag_oom(NULL, job); - return; + return false; } v3dv_job_add_bo_unchecked(job, job->tile_state); + return true; +} + +void +v3dv_job_start_frame(struct v3dv_job *job, + uint32_t width, + uint32_t height, + uint32_t layers, + bool allocate_tile_state_for_all_layers, + bool allocate_tile_state_now, + uint32_t render_target_count, + uint8_t max_internal_bpp, + uint8_t total_color_bpp, + bool msaa) +{ + assert(job); + + /* Start by computing frame tiling spec for this job assuming that + * double-buffer mode is disabled. + */ + const struct v3dv_frame_tiling *tiling = + job_compute_frame_tiling(job, width, height, layers, + render_target_count, max_internal_bpp, + total_color_bpp, msaa, false); + + v3dv_cl_ensure_space_with_branch(&job->bcl, 256); + v3dv_return_if_oom(NULL, job); - v3dv_X(job->device, job_emit_binning_prolog)(job, tiling, layers); + job->allocate_tile_state_for_all_layers = allocate_tile_state_for_all_layers; + + /* For subpass jobs we postpone tile state allocation until we are finishing + * the job and have made a decision about double-buffer. + */ + if (allocate_tile_state_now) { + if (!v3dv_job_allocate_tile_state(job)) + return; + } + + v3dv_X(job->device, job_emit_binning_prolog)(job, tiling, + allocate_tile_state_for_all_layers ? tiling->layers : 1); job->ez_state = V3D_EZ_UNDECIDED; job->first_ez_state = V3D_EZ_UNDECIDED; } +static bool +job_should_enable_double_buffer(struct v3dv_job *job) +{ + /* Incompatibility with double-buffer */ + if (!job->can_use_double_buffer) + return false; + + /* Too much geometry processing */ + if (job->double_buffer_score.geom > 2000000) + return false; + + /* Too little rendering to make up for tile store latency */ + if (job->double_buffer_score.render < 100000) + return false; + + return true; +} + static void cmd_buffer_end_render_pass_frame(struct v3dv_cmd_buffer *cmd_buffer) { - assert(cmd_buffer->state.job); + struct v3dv_job *job = cmd_buffer->state.job; + assert(job); + + /* For subpass jobs we always emit the RCL here */ + assert(v3dv_cl_offset(&job->rcl) == 0); + + /* Only emit RCL for the first job in a suspend/resume chain */ + if (!job->resuming) { + /* Decide if we want to enable double-buffer for this job. If we do, then + * we need to rewrite the TILE_BINNING_MODE_CFG packet in the BCL. 
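+    * Double-buffer halves the tile size so the hardware can overlap rendering
+    * of one tile with the store of the previous one; whether that trade-off
+    * is likely to pay off is estimated by job_should_enable_double_buffer
+    * above.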
+ */ + if (job_should_enable_double_buffer(job)) { + assert(!job->frame_tiling.double_buffer); + job_compute_frame_tiling(job, + job->frame_tiling.width, + job->frame_tiling.height, + job->frame_tiling.layers, + job->frame_tiling.render_target_count, + job->frame_tiling.internal_bpp, + job->frame_tiling.total_color_bpp, + job->frame_tiling.msaa, + true); + + v3dv_X(job->device, job_emit_enable_double_buffer)(job); + } + + /* At this point we have decided whether we want to use double-buffer or + * not and the job's frame tiling represents that decision so we can + * allocate the tile state, which we need to do before we emit the RCL. + */ + v3dv_job_allocate_tile_state(job); - /* Typically, we have a single job for each subpass and we emit the job's RCL - * here when we are ending the frame for the subpass. However, some commands - * such as vkCmdClearAttachments need to run in their own separate job and - * they emit their own RCL even if they execute inside a subpass. In this - * scenario, we don't want to emit subpass RCL when we end the frame for - * those jobs, so we only emit the subpass RCL if the job has not recorded - * any RCL commands of its own. - */ - if (v3dv_cl_offset(&cmd_buffer->state.job->rcl) == 0) v3dv_X(cmd_buffer->device, cmd_buffer_emit_render_pass_rcl)(cmd_buffer); + } - v3dv_X(cmd_buffer->device, job_emit_binning_flush)(cmd_buffer->state.job); + /* Only emit the binning flush for the last job in resume/suspend chain */ + if (!job->suspending) + v3dv_X(cmd_buffer->device, job_emit_binning_flush)(job); } struct v3dv_job * @@ -635,24 +605,47 @@ v3dv_cmd_buffer_create_cpu_job(struct v3dv_device *device, } static void -cmd_buffer_add_cpu_jobs_for_pending_state(struct v3dv_cmd_buffer *cmd_buffer) +cmd_buffer_emit_end_query_cpu(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_query_pool *pool, + uint32_t query, uint32_t count) { - struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + assert(pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR); - if (state->query.end.used_count > 0) { - const uint32_t query_count = state->query.end.used_count; - for (uint32_t i = 0; i < query_count; i++) { - assert(i < state->query.end.used_count); - struct v3dv_job *job = - v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, - V3DV_JOB_TYPE_CPU_END_QUERY, - cmd_buffer, -1); - v3dv_return_if_oom(cmd_buffer, NULL); - - job->cpu.query_end = state->query.end.states[i]; - list_addtail(&job->list_link, &cmd_buffer->jobs); + struct v3dv_job *job = + v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, + V3DV_JOB_TYPE_CPU_END_QUERY, + cmd_buffer, -1); + v3dv_return_if_oom(cmd_buffer, NULL); + + job->cpu.query_end.pool = pool; + job->cpu.query_end.query = query; + job->cpu.query_end.count = count; + list_addtail(&job->list_link, &cmd_buffer->jobs); +} + +static inline bool +cmd_buffer_has_pending_jobs(struct v3dv_cmd_buffer *cmd_buffer) +{ + return cmd_buffer->state.query.end.used_count > 0; +} + +static void +cmd_buffer_add_pending_jobs(struct v3dv_cmd_buffer *cmd_buffer) +{ + struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + const uint32_t count = state->query.end.used_count; + for (uint32_t i = 0; i < count; i++) { + assert(i < state->query.end.used_count); + struct v3dv_end_query_info *info = &state->query.end.states[i]; + if (info->pool->query_type == VK_QUERY_TYPE_OCCLUSION) { + v3dv_cmd_buffer_emit_set_query_availability(cmd_buffer, info->pool, + info->query, info->count, 1); + } else { + cmd_buffer_emit_end_query_cpu(cmd_buffer, info->pool, + info->query, info->count); } 
   }
+   state->query.end.used_count = 0;
 }
 
 void
@@ -673,8 +666,17 @@ v3dv_cmd_buffer_finish_job(struct v3dv_cmd_buffer *cmd_buffer)
    * should at least have the start frame commands, otherwise, it should have
    * a transfer command. The only exception are secondary command buffers
    * inside a render pass.
+   *
+   * With dynamic rendering there is also the possibility that we resume a
+   * suspended pass with an empty job. In that case, we need to ensure the
+   * empty job is still a valid command list, which we will ensure when we
+   * add the binning flush right below, which only happens if this is the
+   * last job in the resume/suspend chain. If it is not the last then we know
+   * it must at least have the BRANCH instruction to link with a follow-up
+   * resume job.
    */
-   assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY ||
+   assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY ||
+          (job->resuming && !job->suspending) ||
           v3dv_cl_offset(&job->bcl) > 0);
 
    /* When we merge multiple subpasses into the same job we must only emit one
@@ -684,6 +686,11 @@ v3dv_cmd_buffer_finish_job(struct v3dv_cmd_buffer *cmd_buffer)
    */
    assert(v3dv_cl_offset(&job->rcl) != 0 || cmd_buffer->state.pass);
 
+   if (!(cmd_buffer->state.barrier.dst_mask & V3DV_BARRIER_GRAPHICS_BIT)) {
+      cmd_buffer->state.barrier.bcl_buffer_access = 0;
+      cmd_buffer->state.barrier.bcl_image_access = 0;
+   }
+
    /* If we are finishing a job inside a render pass we have two scenarios:
    *
    * 1. It is a regular CL, in which case we will submit the job to the GPU,
@@ -699,32 +706,36 @@ v3dv_cmd_buffer_finish_job(struct v3dv_cmd_buffer *cmd_buffer)
       if (job->type == V3DV_JOB_TYPE_GPU_CL) {
          cmd_buffer_end_render_pass_frame(cmd_buffer);
       } else {
-         assert(job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY);
+         assert(job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE);
         v3dv_X(cmd_buffer->device, cmd_buffer_end_render_pass_secondary)(cmd_buffer);
       }
    }
 
+   bool suspending = job->suspending;
    list_addtail(&job->list_link, &cmd_buffer->jobs);
    cmd_buffer->state.job = NULL;
 
    /* If we have recorded any state with this last GPU job that requires to
-    * emit CPU jobs after the job is completed, add them now. The only
-    * exception is secondary command buffers inside a render pass, because in
+    * emit jobs after the job is completed, add them now. The only exception
+    * is secondary command buffers inside a render pass, because in
    * that case we want to defer this until we finish recording the primary
    * job into which we execute the secondary.
    */
-   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY ||
-       !cmd_buffer->state.pass) {
-      cmd_buffer_add_cpu_jobs_for_pending_state(cmd_buffer);
+   if (!suspending) {
+      if (cmd_buffer_has_pending_jobs(cmd_buffer) &&
+          (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY ||
+           !cmd_buffer->state.pass)) {
+         cmd_buffer_add_pending_jobs(cmd_buffer);
+      }
+   }
 }
 
-static bool
-job_type_is_gpu(struct v3dv_job *job)
+bool
+v3dv_job_type_is_gpu(struct v3dv_job *job)
 {
    switch (job->type) {
    case V3DV_JOB_TYPE_GPU_CL:
-   case V3DV_JOB_TYPE_GPU_CL_SECONDARY:
+   case V3DV_JOB_TYPE_GPU_CL_INCOMPLETE:
    case V3DV_JOB_TYPE_GPU_TFU:
    case V3DV_JOB_TYPE_GPU_CSD:
       return true;
@@ -739,24 +750,40 @@ cmd_buffer_serialize_job_if_needed(struct v3dv_cmd_buffer *cmd_buffer,
 {
    assert(cmd_buffer && job);
 
-   if (!cmd_buffer->state.has_barrier)
-      return;
-
    /* Serialization only affects GPU jobs, CPU jobs are always automatically
    * serialized.
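+    * For example, a barrier whose destination stages are compute-only sets
+    * V3DV_BARRIER_COMPUTE_BIT in dst_mask, so a later CL job is not
+    * serialized against it; only the next CSD job consumes the compute
+    * src_mask below.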
*/ - if (!job_type_is_gpu(job)) + if (!v3dv_job_type_is_gpu(job)) return; - job->serialize = true; - if (cmd_buffer->state.has_bcl_barrier && - (job->type == V3DV_JOB_TYPE_GPU_CL || - job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY)) { - job->needs_bcl_sync = true; + uint8_t barrier_mask = cmd_buffer->state.barrier.dst_mask; + if (barrier_mask == 0) + return; + + uint8_t bit = 0; + uint8_t *src_mask; + if (job->type == V3DV_JOB_TYPE_GPU_CSD) { + assert(!job->is_transfer); + bit = V3DV_BARRIER_COMPUTE_BIT; + src_mask = &cmd_buffer->state.barrier.src_mask_compute; + } else if (job->is_transfer) { + assert(job->type == V3DV_JOB_TYPE_GPU_CL || + job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE || + job->type == V3DV_JOB_TYPE_GPU_TFU); + bit = V3DV_BARRIER_TRANSFER_BIT; + src_mask = &cmd_buffer->state.barrier.src_mask_transfer; + } else { + assert(job->type == V3DV_JOB_TYPE_GPU_CL || + job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE); + bit = V3DV_BARRIER_GRAPHICS_BIT; + src_mask = &cmd_buffer->state.barrier.src_mask_graphics; } - cmd_buffer->state.has_barrier = false; - cmd_buffer->state.has_bcl_barrier = false; + if (barrier_mask & bit) { + job->serialize = *src_mask; + *src_mask = 0; + cmd_buffer->state.barrier.dst_mask &= ~bit; + } } void @@ -779,7 +806,7 @@ v3dv_job_init(struct v3dv_job *job, list_inithead(&job->list_link); if (type == V3DV_JOB_TYPE_GPU_CL || - type == V3DV_JOB_TYPE_GPU_CL_SECONDARY || + type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE || type == V3DV_JOB_TYPE_GPU_CSD) { job->bos = _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); @@ -787,12 +814,12 @@ v3dv_job_init(struct v3dv_job *job, v3dv_cl_init(job, &job->indirect); - if (V3D_DEBUG & V3D_DEBUG_ALWAYS_FLUSH) + if (V3D_DBG(ALWAYS_FLUSH)) job->always_flush = true; } if (type == V3DV_JOB_TYPE_GPU_CL || - type == V3DV_JOB_TYPE_GPU_CL_SECONDARY) { + type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE) { v3dv_cl_init(job, &job->bcl); v3dv_cl_init(job, &job->rcl); } @@ -806,9 +833,10 @@ v3dv_job_init(struct v3dv_job *job, */ cmd_buffer->state.dirty = ~0; cmd_buffer->state.dirty_descriptor_stages = ~0; + vk_dynamic_graphics_state_dirty_all(&cmd_buffer->vk.dynamic_graphics_state); - /* Honor inheritance of occlussion queries in secondaries if requested */ - if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY && + /* Honor inheritance of occlusion queries in secondaries if requested */ + if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY && cmd_buffer->state.inheritance.occlusion_query_enable) { cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_OCCLUSION_QUERY; } @@ -820,7 +848,11 @@ v3dv_job_init(struct v3dv_job *job, if (cmd_buffer->state.pass) job->first_subpass = subpass_idx; + job->is_transfer = cmd_buffer->state.is_transfer; + cmd_buffer_serialize_job_if_needed(cmd_buffer, job); + + job->perf = cmd_buffer->state.query.active_query.perf; } } @@ -860,19 +892,16 @@ v3dv_cmd_buffer_start_job(struct v3dv_cmd_buffer *cmd_buffer, return job; } -static VkResult -cmd_buffer_reset(struct v3dv_cmd_buffer *cmd_buffer, +static void +cmd_buffer_reset(struct vk_command_buffer *vk_cmd_buffer, VkCommandBufferResetFlags flags) { + struct v3dv_cmd_buffer *cmd_buffer = + container_of(vk_cmd_buffer, struct v3dv_cmd_buffer, vk); + + vk_command_buffer_reset(&cmd_buffer->vk); if (cmd_buffer->status != V3DV_CMD_BUFFER_STATUS_INITIALIZED) { struct v3dv_device *device = cmd_buffer->device; - struct v3dv_cmd_pool *pool = cmd_buffer->pool; - VkCommandBufferLevel level = cmd_buffer->level; - - /* cmd_buffer_init below will re-add the command buffer to 
the pool - * so remove it here so we don't end up adding it again. - */ - list_del(&cmd_buffer->pool_link); /* FIXME: For now we always free all resources as if * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT was set. @@ -880,87 +909,61 @@ cmd_buffer_reset(struct v3dv_cmd_buffer *cmd_buffer, if (cmd_buffer->status != V3DV_CMD_BUFFER_STATUS_NEW) cmd_buffer_free_resources(cmd_buffer); - cmd_buffer_init(cmd_buffer, device, pool, level); + cmd_buffer_init(cmd_buffer, device); } assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_INITIALIZED); - return VK_SUCCESS; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_AllocateCommandBuffers(VkDevice _device, - const VkCommandBufferAllocateInfo *pAllocateInfo, - VkCommandBuffer *pCommandBuffers) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_cmd_pool, pool, pAllocateInfo->commandPool); - - VkResult result = VK_SUCCESS; - uint32_t i; - - for (i = 0; i < pAllocateInfo->commandBufferCount; i++) { - result = cmd_buffer_create(device, pool, pAllocateInfo->level, - &pCommandBuffers[i]); - if (result != VK_SUCCESS) - break; - } - - if (result != VK_SUCCESS) { - v3dv_FreeCommandBuffers(_device, pAllocateInfo->commandPool, - i, pCommandBuffers); - for (i = 0; i < pAllocateInfo->commandBufferCount; i++) - pCommandBuffers[i] = VK_NULL_HANDLE; - } - - return result; } -VKAPI_ATTR void VKAPI_CALL -v3dv_FreeCommandBuffers(VkDevice device, - VkCommandPool commandPool, - uint32_t commandBufferCount, - const VkCommandBuffer *pCommandBuffers) -{ - for (uint32_t i = 0; i < commandBufferCount; i++) { - V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, pCommandBuffers[i]); - - if (!cmd_buffer) - continue; - - cmd_buffer_destroy(cmd_buffer); - } -} -VKAPI_ATTR void VKAPI_CALL -v3dv_DestroyCommandPool(VkDevice _device, - VkCommandPool commandPool, - const VkAllocationCallbacks *pAllocator) +static void +cmd_buffer_emit_resolve(struct v3dv_cmd_buffer *cmd_buffer, + uint32_t dst_attachment_idx, + uint32_t src_attachment_idx, + VkImageAspectFlagBits aspect) { - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_cmd_pool, pool, commandPool); - - if (!pool) - return; - - list_for_each_entry_safe(struct v3dv_cmd_buffer, cmd_buffer, - &pool->cmd_buffers, pool_link) { - cmd_buffer_destroy(cmd_buffer); - } + struct v3dv_image_view *src_iview = + cmd_buffer->state.attachments[src_attachment_idx].image_view; + struct v3dv_image_view *dst_iview = + cmd_buffer->state.attachments[dst_attachment_idx].image_view; + + const VkRect2D *ra = &cmd_buffer->state.render_area; + + VkImageResolve2 region = { + .sType = VK_STRUCTURE_TYPE_IMAGE_RESOLVE_2, + .srcSubresource = { + aspect, + src_iview->vk.base_mip_level, + src_iview->vk.base_array_layer, + src_iview->vk.layer_count, + }, + .srcOffset = { ra->offset.x, ra->offset.y, 0 }, + .dstSubresource = { + aspect, + dst_iview->vk.base_mip_level, + dst_iview->vk.base_array_layer, + dst_iview->vk.layer_count, + }, + .dstOffset = { ra->offset.x, ra->offset.y, 0 }, + .extent = { ra->extent.width, ra->extent.height, 1 }, + }; - vk_object_free(&device->vk, pAllocator, pool); -} + struct v3dv_image *src_image = (struct v3dv_image *) src_iview->vk.image; + struct v3dv_image *dst_image = (struct v3dv_image *) dst_iview->vk.image; + VkResolveImageInfo2 resolve_info = { + .sType = VK_STRUCTURE_TYPE_RESOLVE_IMAGE_INFO_2, + .srcImage = v3dv_image_to_handle(src_image), + .srcImageLayout = VK_IMAGE_LAYOUT_GENERAL, + .dstImage = v3dv_image_to_handle(dst_image), + .dstImageLayout = VK_IMAGE_LAYOUT_GENERAL, + .regionCount = 1, 
+ .pRegions = &region, + }; -VKAPI_ATTR void VKAPI_CALL -v3dv_TrimCommandPool(VkDevice device, - VkCommandPool commandPool, - VkCommandPoolTrimFlags flags) -{ - /* We don't need to do anything here, our command pools never hold on to - * any resources from command buffers that are freed or reset. - */ + VkCommandBuffer cmd_buffer_handle = v3dv_cmd_buffer_to_handle(cmd_buffer); + v3dv_CmdResolveImage2(cmd_buffer_handle, &resolve_info); } - static void cmd_buffer_subpass_handle_pending_resolves(struct v3dv_cmd_buffer *cmd_buffer) { @@ -972,8 +975,6 @@ cmd_buffer_subpass_handle_pending_resolves(struct v3dv_cmd_buffer *cmd_buffer) if (!subpass->resolve_attachments) return; - struct v3dv_framebuffer *fb = cmd_buffer->state.framebuffer; - /* At this point we have already ended the current subpass and now we are * about to emit vkCmdResolveImage calls to get the resolves we can't * handle in the subpass RCL. @@ -993,55 +994,42 @@ cmd_buffer_subpass_handle_pending_resolves(struct v3dv_cmd_buffer *cmd_buffer) cmd_buffer->state.pass = NULL; cmd_buffer->state.subpass_idx = -1; - VkCommandBuffer cmd_buffer_handle = v3dv_cmd_buffer_to_handle(cmd_buffer); for (uint32_t i = 0; i < subpass->color_count; i++) { const uint32_t src_attachment_idx = subpass->color_attachments[i].attachment; if (src_attachment_idx == VK_ATTACHMENT_UNUSED) continue; - if (pass->attachments[src_attachment_idx].use_tlb_resolve) + /* Skip if this attachment doesn't have a resolve or if it was already + * implemented as a TLB resolve. + */ + if (!cmd_buffer->state.attachments[src_attachment_idx].has_resolve || + cmd_buffer->state.attachments[src_attachment_idx].use_tlb_resolve) { continue; + } const uint32_t dst_attachment_idx = subpass->resolve_attachments[i].attachment; - if (dst_attachment_idx == VK_ATTACHMENT_UNUSED) - continue; + assert(dst_attachment_idx != VK_ATTACHMENT_UNUSED); - struct v3dv_image_view *src_iview = fb->attachments[src_attachment_idx]; - struct v3dv_image_view *dst_iview = fb->attachments[dst_attachment_idx]; - - VkImageResolve2KHR region = { - .sType = VK_STRUCTURE_TYPE_IMAGE_RESOLVE_2_KHR, - .srcSubresource = { - VK_IMAGE_ASPECT_COLOR_BIT, - src_iview->vk.base_mip_level, - src_iview->vk.base_array_layer, - src_iview->vk.layer_count, - }, - .srcOffset = { 0, 0, 0 }, - .dstSubresource = { - VK_IMAGE_ASPECT_COLOR_BIT, - dst_iview->vk.base_mip_level, - dst_iview->vk.base_array_layer, - dst_iview->vk.layer_count, - }, - .dstOffset = { 0, 0, 0 }, - .extent = src_iview->vk.image->extent, - }; + cmd_buffer_emit_resolve(cmd_buffer, dst_attachment_idx, src_attachment_idx, + VK_IMAGE_ASPECT_COLOR_BIT); + } - struct v3dv_image *src_image = (struct v3dv_image *) src_iview->vk.image; - struct v3dv_image *dst_image = (struct v3dv_image *) dst_iview->vk.image; - VkResolveImageInfo2KHR resolve_info = { - .sType = VK_STRUCTURE_TYPE_RESOLVE_IMAGE_INFO_2_KHR, - .srcImage = v3dv_image_to_handle(src_image), - .srcImageLayout = VK_IMAGE_LAYOUT_GENERAL, - .dstImage = v3dv_image_to_handle(dst_image), - .dstImageLayout = VK_IMAGE_LAYOUT_GENERAL, - .regionCount = 1, - .pRegions = &region, - }; - v3dv_CmdResolveImage2KHR(cmd_buffer_handle, &resolve_info); + const uint32_t ds_src_attachment_idx = + subpass->ds_attachment.attachment; + if (ds_src_attachment_idx != VK_ATTACHMENT_UNUSED && + cmd_buffer->state.attachments[ds_src_attachment_idx].has_resolve && + !cmd_buffer->state.attachments[ds_src_attachment_idx].use_tlb_resolve) { + assert(subpass->resolve_depth || subpass->resolve_stencil); + const VkImageAspectFlags ds_aspects = 
(subpass->resolve_depth ? VK_IMAGE_ASPECT_DEPTH_BIT : 0) | + (subpass->resolve_stencil ? VK_IMAGE_ASPECT_STENCIL_BIT : 0); + const uint32_t ds_dst_attachment_idx = + subpass->ds_resolve_attachment.attachment; + assert(ds_dst_attachment_idx != VK_ATTACHMENT_UNUSED); + cmd_buffer_emit_resolve(cmd_buffer, ds_dst_attachment_idx, + ds_src_attachment_idx, ds_aspects); } cmd_buffer->state.framebuffer = restore_fb; @@ -1054,19 +1042,30 @@ cmd_buffer_begin_render_pass_secondary( struct v3dv_cmd_buffer *cmd_buffer, const VkCommandBufferInheritanceInfo *inheritance_info) { - assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); + assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); assert(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT); assert(inheritance_info); - cmd_buffer->state.pass = - v3dv_render_pass_from_handle(inheritance_info->renderPass); - assert(cmd_buffer->state.pass); + const VkCommandBufferInheritanceRenderingInfo *rendering_info = NULL; + if (inheritance_info->renderPass == VK_NULL_HANDLE) { + rendering_info = vk_find_struct_const(inheritance_info, + COMMAND_BUFFER_INHERITANCE_RENDERING_INFO); + assert(rendering_info); + v3dv_setup_dynamic_render_pass_inheritance(cmd_buffer, rendering_info); + cmd_buffer->state.pass = &cmd_buffer->state.dynamic_pass; + cmd_buffer->state.subpass_idx = 0; + cmd_buffer->state.framebuffer = NULL; + } else { + cmd_buffer->state.pass = + v3dv_render_pass_from_handle(inheritance_info->renderPass); - cmd_buffer->state.framebuffer = - v3dv_framebuffer_from_handle(inheritance_info->framebuffer); + assert(inheritance_info->subpass < cmd_buffer->state.pass->subpass_count); + cmd_buffer->state.subpass_idx = inheritance_info->subpass; - assert(inheritance_info->subpass < cmd_buffer->state.pass->subpass_count); - cmd_buffer->state.subpass_idx = inheritance_info->subpass; + cmd_buffer->state.framebuffer = + v3dv_framebuffer_from_handle(inheritance_info->framebuffer); + } + assert(cmd_buffer->state.pass); cmd_buffer->state.inheritance.occlusion_query_enable = inheritance_info->occlusionQueryEnable; @@ -1075,8 +1074,8 @@ cmd_buffer_begin_render_pass_secondary( * so we want to create a job for them here. */ struct v3dv_job *job = - v3dv_cmd_buffer_start_job(cmd_buffer, inheritance_info->subpass, - V3DV_JOB_TYPE_GPU_CL_SECONDARY); + v3dv_cmd_buffer_start_job(cmd_buffer, cmd_buffer->state.subpass_idx, + V3DV_JOB_TYPE_GPU_CL_INCOMPLETE); if (!job) { v3dv_flag_oom(cmd_buffer, NULL); return VK_ERROR_OUT_OF_HOST_MEMORY; @@ -1089,21 +1088,31 @@ cmd_buffer_begin_render_pass_secondary( * * "The application must ensure (using scissor if necessary) that all * rendering is contained within the render area." - * - * FIXME: setup constants for the max framebuffer dimensions and use them - * here and when filling in VkPhysicalDeviceLimits. */ const struct v3dv_framebuffer *framebuffer = cmd_buffer->state.framebuffer; cmd_buffer->state.render_area.offset.x = 0; cmd_buffer->state.render_area.offset.y = 0; cmd_buffer->state.render_area.extent.width = - framebuffer ? framebuffer->width : 4096; + framebuffer ? framebuffer->width : V3D_MAX_IMAGE_DIMENSION; cmd_buffer->state.render_area.extent.height = - framebuffer ? framebuffer->height : 4096; + framebuffer ? 
framebuffer->height : V3D_MAX_IMAGE_DIMENSION; + + /* We only really execute double-buffer mode in primary jobs, so allow this + * mode in render pass secondaries only to keep track of the double-buffer + * score while they are recorded; the primary job's score is then updated + * accordingly when the secondaries are executed into it. + */ + job->can_use_double_buffer = true; return VK_SUCCESS; } +const struct vk_command_buffer_ops v3dv_cmd_buffer_ops = { + .create = cmd_buffer_create, + .reset = cmd_buffer_reset, + .destroy = cmd_buffer_destroy, +}; + VKAPI_ATTR VkResult VKAPI_CALL v3dv_BeginCommandBuffer(VkCommandBuffer commandBuffer, const VkCommandBufferBeginInfo *pBeginInfo) @@ -1114,17 +1123,15 @@ v3dv_BeginCommandBuffer(VkCommandBuffer commandBuffer, * command buffer's state. Otherwise, we must reset its state. In both * cases we reset it. */ - VkResult result = cmd_buffer_reset(cmd_buffer, 0); - if (result != VK_SUCCESS) - return result; + cmd_buffer_reset(&cmd_buffer->vk, 0); assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_INITIALIZED); cmd_buffer->usage_flags = pBeginInfo->flags; - if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) { + if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) { if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) { - result = + VkResult result = cmd_buffer_begin_render_pass_secondary(cmd_buffer, pBeginInfo->pInheritanceInfo); if (result != VK_SUCCESS) @@ -1137,32 +1144,6 @@ v3dv_BeginCommandBuffer(VkCommandBuffer commandBuffer, return VK_SUCCESS; } -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_ResetCommandBuffer(VkCommandBuffer commandBuffer, - VkCommandBufferResetFlags flags) -{ - V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - return cmd_buffer_reset(cmd_buffer, flags); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_ResetCommandPool(VkDevice device, - VkCommandPool commandPool, - VkCommandPoolResetFlags flags) -{ - V3DV_FROM_HANDLE(v3dv_cmd_pool, pool, commandPool); - - VkCommandBufferResetFlags reset_flags = 0; - if (flags & VK_COMMAND_POOL_RESET_RELEASE_RESOURCES_BIT) - reset_flags = VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT; - list_for_each_entry_safe(struct v3dv_cmd_buffer, cmd_buffer, - &pool->cmd_buffers, pool_link) { - cmd_buffer_reset(cmd_buffer, reset_flags); - } - - return VK_SUCCESS; -} - static void cmd_buffer_update_tile_alignment(struct v3dv_cmd_buffer *cmd_buffer) { @@ -1191,21 +1172,64 @@ } static void +cmd_buffer_update_attachment_resolve_state(struct v3dv_cmd_buffer *cmd_buffer) +{ + /* NOTE: This should be called after cmd_buffer_update_tile_alignment() + * since it relies on up-to-date information about subpass tile alignment.
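+ *
+ * The expected call order, as done in v3dv_cmd_buffer_subpass_start() below:
+ *
+ *    cmd_buffer_update_tile_alignment(cmd_buffer);
+ *    cmd_buffer_subpass_check_double_buffer_mode(cmd_buffer, job->frame_tiling.msaa);
+ *    cmd_buffer_update_attachment_resolve_state(cmd_buffer);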
+ */ + const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + const struct v3dv_render_pass *pass = state->pass; + const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx]; + + for (uint32_t i = 0; i < subpass->color_count; i++) { + const uint32_t attachment_idx = subpass->color_attachments[i].attachment; + if (attachment_idx == VK_ATTACHMENT_UNUSED) + continue; + + state->attachments[attachment_idx].has_resolve = + subpass->resolve_attachments && + subpass->resolve_attachments[i].attachment != VK_ATTACHMENT_UNUSED; + + state->attachments[attachment_idx].use_tlb_resolve = + state->attachments[attachment_idx].has_resolve && + state->tile_aligned_render_area && + pass->attachments[attachment_idx].try_tlb_resolve; + } + + uint32_t ds_attachment_idx = subpass->ds_attachment.attachment; + if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) { + uint32_t ds_resolve_attachment_idx = + subpass->ds_resolve_attachment.attachment; + state->attachments[ds_attachment_idx].has_resolve = + ds_resolve_attachment_idx != VK_ATTACHMENT_UNUSED; + + assert(!state->attachments[ds_attachment_idx].has_resolve || + (subpass->resolve_depth || subpass->resolve_stencil)); + + state->attachments[ds_attachment_idx].use_tlb_resolve = + state->attachments[ds_attachment_idx].has_resolve && + state->tile_aligned_render_area && + pass->attachments[ds_attachment_idx].try_tlb_resolve; + } +} + +static void cmd_buffer_state_set_attachment_clear_color(struct v3dv_cmd_buffer *cmd_buffer, uint32_t attachment_idx, const VkClearColorValue *color) { assert(attachment_idx < cmd_buffer->state.pass->attachment_count); - const struct v3dv_render_pass_attachment *attachment = &cmd_buffer->state.pass->attachments[attachment_idx]; uint32_t internal_type, internal_bpp; const struct v3dv_format *format = v3dv_X(cmd_buffer->device, get_format)(attachment->desc.format); + /* We don't allow multi-planar formats for render pass attachments */ + assert(format->plane_count == 1); v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_output_format) - (format->rt_type, &internal_type, &internal_bpp); + (format->planes[0].rt_type, &internal_type, &internal_bpp); uint32_t internal_size = 4 << internal_bpp; @@ -1273,12 +1297,39 @@ cmd_buffer_state_set_clear_values(struct v3dv_cmd_buffer *cmd_buffer, } static void +cmd_buffer_state_set_attachments(struct v3dv_cmd_buffer *cmd_buffer, + const VkRenderPassBeginInfo *pRenderPassBegin) +{ + V3DV_FROM_HANDLE(v3dv_render_pass, pass, pRenderPassBegin->renderPass); + V3DV_FROM_HANDLE(v3dv_framebuffer, framebuffer, pRenderPassBegin->framebuffer); + + const VkRenderPassAttachmentBeginInfo *attach_begin = + vk_find_struct_const(pRenderPassBegin, RENDER_PASS_ATTACHMENT_BEGIN_INFO); + + struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + + for (uint32_t i = 0; i < pass->attachment_count; i++) { + if (attach_begin && attach_begin->attachmentCount != 0) { + state->attachments[i].image_view = + v3dv_image_view_from_handle(attach_begin->pAttachments[i]); + } else if (framebuffer) { + state->attachments[i].image_view = framebuffer->attachments[i]; + } else { + assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); + state->attachments[i].image_view = NULL; + } + } +} + +static void cmd_buffer_init_render_pass_attachment_state(struct v3dv_cmd_buffer *cmd_buffer, const VkRenderPassBeginInfo *pRenderPassBegin) { cmd_buffer_state_set_clear_values(cmd_buffer, pRenderPassBegin->clearValueCount, pRenderPassBegin->pClearValues); + + cmd_buffer_state_set_attachments(cmd_buffer, 
pRenderPassBegin); } static void @@ -1307,10 +1358,33 @@ cmd_buffer_ensure_render_pass_attachment_state(struct v3dv_cmd_buffe assert(state->attachment_alloc_count >= pass->attachment_count); } +/* If our render area is smaller than the current clip window we will have + * to emit a new clip window to constrain it to the render area. + */ +static void +constraint_clip_window_to_render_area(struct v3dv_cmd_buffer *cmd_buffer) +{ + struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state; + + uint32_t min_render_x = state->render_area.offset.x; + uint32_t min_render_y = state->render_area.offset.y; + uint32_t max_render_x = min_render_x + state->render_area.extent.width - 1; + uint32_t max_render_y = min_render_y + state->render_area.extent.height - 1; + uint32_t min_clip_x = state->clip_window.offset.x; + uint32_t min_clip_y = state->clip_window.offset.y; + uint32_t max_clip_x = min_clip_x + state->clip_window.extent.width - 1; + uint32_t max_clip_y = min_clip_y + state->clip_window.extent.height - 1; + if (min_render_x > min_clip_x || min_render_y > min_clip_y || + max_render_x < max_clip_x || max_render_y < max_clip_y) { + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS); + } +} + VKAPI_ATTR void VKAPI_CALL -v3dv_CmdBeginRenderPass(VkCommandBuffer commandBuffer, - const VkRenderPassBeginInfo *pRenderPassBegin, - VkSubpassContents contents) +v3dv_CmdBeginRenderPass2(VkCommandBuffer commandBuffer, - const VkRenderPassBeginInfo *pRenderPassBegin, + const VkRenderPassBeginInfo *pRenderPassBegin, + const VkSubpassBeginInfo *pSubpassBeginInfo) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); V3DV_FROM_HANDLE(v3dv_render_pass, pass, pRenderPassBegin->renderPass); @@ -1326,29 +1400,16 @@ v3dv_CmdBeginRenderPass(VkCommandBuffer commandBuffer, cmd_buffer_init_render_pass_attachment_state(cmd_buffer, pRenderPassBegin); state->render_area = pRenderPassBegin->renderArea; - - /* If our render area is smaller than the current clip window we will have - * to emit a new clip window to constraint it to the render area.
- */ - uint32_t min_render_x = state->render_area.offset.x; - uint32_t min_render_y = state->render_area.offset.y; - uint32_t max_render_x = min_render_x + state->render_area.extent.width - 1; - uint32_t max_render_y = min_render_y + state->render_area.extent.height - 1; - uint32_t min_clip_x = state->clip_window.offset.x; - uint32_t min_clip_y = state->clip_window.offset.y; - uint32_t max_clip_x = min_clip_x + state->clip_window.extent.width - 1; - uint32_t max_clip_y = min_clip_y + state->clip_window.extent.height - 1; - if (min_render_x > min_clip_x || min_render_y > min_clip_y || - max_render_x < max_clip_x || max_render_y < max_clip_y) { - state->dirty |= V3DV_CMD_DIRTY_SCISSOR; - } + constraint_clip_window_to_render_area(cmd_buffer); /* Setup for first subpass */ v3dv_cmd_buffer_subpass_start(cmd_buffer, 0); } VKAPI_ATTR void VKAPI_CALL -v3dv_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents) +v3dv_CmdNextSubpass2(VkCommandBuffer commandBuffer, + const VkSubpassBeginInfo *pSubpassBeginInfo, + const VkSubpassEndInfo *pSubpassEndInfo) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); @@ -1366,10 +1427,9 @@ v3dv_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents) static void cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer) { - assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY); - assert(cmd_buffer->state.pass); assert(cmd_buffer->state.subpass_idx < cmd_buffer->state.pass->subpass_count); + assert(!cmd_buffer->state.resuming); const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; const struct v3dv_render_pass *pass = state->pass; const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx]; @@ -1384,7 +1444,7 @@ cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer) } uint32_t att_count = 0; - VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* 4 color + D/S */ + VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* +1 for D/S */ /* We only need to emit subpass clears as draw calls for color attachments * if the render area is not aligned to tile boundaries. @@ -1444,7 +1504,7 @@ cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer) "VK_ATTACHMENT_LOAD_OP_CLEAR.\n"); } else if (subpass->do_depth_clear_with_draw || subpass->do_stencil_clear_with_draw) { - perf_debug("Subpass clears DEPTH but loads STENCIL (or viceversa), " + perf_debug("Subpass clears DEPTH but loads STENCIL (or vice versa), " "falling back to vkCmdClearAttachments for " "VK_ATTACHMENT_LOAD_OP_CLEAR.\n"); } @@ -1458,23 +1518,212 @@ cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer) * So the clear is only constrained by the render area and not by pipeline * state such as scissor or viewport, these are the semantics of * vkCmdClearAttachments as well. + * + * Also: + * + * "If the render pass instance this is recorded in uses multiview, then + * baseArrayLayer must be zero and layerCount must be one." */ + assert(state->framebuffer); + uint32_t layer_count = cmd_buffer->state.pass->multiview_enabled ? 
+ 1 : state->framebuffer->layers; VkCommandBuffer _cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer); VkClearRect rect = { .rect = state->render_area, .baseArrayLayer = 0, - .layerCount = 1, + .layerCount = layer_count, }; v3dv_CmdClearAttachments(_cmd_buffer, att_count, atts, 1, &rect); } +bool +v3dv_cmd_buffer_check_needs_load(const struct v3dv_cmd_buffer_state *state, + VkImageAspectFlags aspect, + uint32_t first_subpass_idx, + VkAttachmentLoadOp load_op, + uint32_t last_subpass_idx, + VkAttachmentStoreOp store_op) +{ + /* We call this with image->vk.aspects & aspect, so 0 means the aspect we are + * testing does not exist in the image. + */ + if (!aspect) + return false; + + /* Attachment (or view) load operations apply on the first subpass that + * uses the attachment (or view); otherwise we always need to load. + */ + if (state->job->first_subpass > first_subpass_idx) + return true; + + /* If the job is continuing a subpass started in another job, we always + * need to load. + */ + if (state->job->is_subpass_continue) + return true; + + /* If the area is not aligned to tile boundaries and we are going to store, + * then we need to load to preserve contents outside the render area. + */ + if (!state->tile_aligned_render_area && + v3dv_cmd_buffer_check_needs_store(state, aspect, last_subpass_idx, + store_op)) { + return true; + } + + /* The attachment load operation must be LOAD */ + return load_op == VK_ATTACHMENT_LOAD_OP_LOAD; +} + +bool +v3dv_cmd_buffer_check_needs_store(const struct v3dv_cmd_buffer_state *state, + VkImageAspectFlags aspect, + uint32_t last_subpass_idx, + VkAttachmentStoreOp store_op) +{ + /* We call this with image->vk.aspects & aspect, so 0 means the aspect we are + * testing does not exist in the image. + */ + if (!aspect) + return false; + + /* Attachment (or view) store operations only apply on the last subpass + * where the attachment (or view) is used; in other subpasses we always + * need to store. + */ + if (state->subpass_idx < last_subpass_idx) + return true; + + /* Attachment store operations only apply on the last job we emit on the + * last subpass where the attachment is used, otherwise we always need to + * store. + */ + if (!state->job->is_subpass_finish) + return true; + + /* The attachment store operation must be STORE */ + return store_op == VK_ATTACHMENT_STORE_OP_STORE; +} + +static void +cmd_buffer_subpass_check_double_buffer_mode(struct v3dv_cmd_buffer *cmd_buffer, + bool msaa) +{ + const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + struct v3dv_job *job = cmd_buffer->state.job; + assert(job); + + job->can_use_double_buffer = false; + + /* Double-buffer can only be used if requested via V3D_DEBUG */ + if (!V3D_DBG(DOUBLE_BUFFER)) + return; + + /* Double-buffer cannot be enabled for MSAA jobs */ + if (msaa) + return; + + const struct v3dv_render_pass *pass = state->pass; + const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx]; + + /* FIXME: For now we discard multiview jobs (which have an implicit geometry + * shader) for this optimization. If we want to enable this with multiview + * we would need to check if any view (layer) in any attachment used by the + * job has loads and/or stores as we do below for regular attachments. Also, + * we would want to have a heuristic that doesn't automatically disable + * double-buffer in the presence of geometry shaders.
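+ *
+ * A hypothetical shape for such a per-view check (view_count and the
+ * per-view helpers named here do not exist in this code):
+ *
+ *    for (uint32_t view = 0; view < view_count; view++) {
+ *       if (view_needs_load(state, attachment, view))
+ *          return;
+ *       has_stores |= view_needs_store(state, attachment, view);
+ *    }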
+ */ + if (state->pass->multiview_enabled) + return; + + /* Tile loads are serialized against stores, in which case we don't get + * any benefits from enabling double-buffer and would just pay the price + * of a smaller tile size instead. Similarly, we only benefit from + * double-buffer if we have tile stores, as the point of this mode is + * to execute rendering of a new tile while we store the previous one to + * hide latency on the tile store operation. + */ + bool has_stores = false; + for (uint32_t i = 0; i < subpass->color_count; i++) { + uint32_t attachment_idx = subpass->color_attachments[i].attachment; + if (attachment_idx == VK_ATTACHMENT_UNUSED) + continue; + + const struct v3dv_render_pass_attachment *attachment = + &state->pass->attachments[attachment_idx]; + + /* FIXME: This will check 'tile_aligned_render_area' but that was + * computed with a tile size without double-buffer. That is okay + * because if the larger tile size is aligned then we know the smaller + * tile size for double-buffer will be as well. However, we might + * still benefit from doing this check with the smaller tile size + * because it can happen that the smaller size is aligned and the + * larger size is not. + */ + if (v3dv_cmd_buffer_check_needs_load(state, + VK_IMAGE_ASPECT_COLOR_BIT, + attachment->first_subpass, + attachment->desc.loadOp, + attachment->last_subpass, + attachment->desc.storeOp)) { + return; + } + + if (v3dv_cmd_buffer_check_needs_store(state, + VK_IMAGE_ASPECT_COLOR_BIT, + attachment->last_subpass, + attachment->desc.storeOp)) { + has_stores = true; + } + } + + if (subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) { + uint32_t ds_attachment_idx = subpass->ds_attachment.attachment; + const struct v3dv_render_pass_attachment *ds_attachment = + &state->pass->attachments[ds_attachment_idx]; + + const VkImageAspectFlags ds_aspects = + vk_format_aspects(ds_attachment->desc.format); + + if (v3dv_cmd_buffer_check_needs_load(state, + ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + ds_attachment->first_subpass, + ds_attachment->desc.loadOp, + ds_attachment->last_subpass, + ds_attachment->desc.storeOp)) { + return; + } + + if (v3dv_cmd_buffer_check_needs_load(state, + ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT, + ds_attachment->first_subpass, + ds_attachment->desc.stencilLoadOp, + ds_attachment->last_subpass, + ds_attachment->desc.stencilStoreOp)) { + return; + } + + has_stores |= v3dv_cmd_buffer_check_needs_store(state, + ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + ds_attachment->last_subpass, + ds_attachment->desc.storeOp); + has_stores |= v3dv_cmd_buffer_check_needs_store(state, + ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT, + ds_attachment->last_subpass, + ds_attachment->desc.stencilStoreOp); + } + + job->can_use_double_buffer = has_stores; +} + static struct v3dv_job * cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer, uint32_t subpass_idx, - enum v3dv_job_type type) + enum v3dv_job_type type, + bool is_subpass_start) { assert(type == V3DV_JOB_TYPE_GPU_CL || - type == V3DV_JOB_TYPE_GPU_CL_SECONDARY); + type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE); struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; assert(subpass_idx < state->pass->subpass_count); @@ -1488,24 +1737,33 @@ cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer, if (!job) return NULL; + if (is_subpass_start && cmd_buffer->state.resuming) { + assert(subpass_idx == 0); + job->resuming = true; + } + state->subpass_idx = subpass_idx; /* If we are starting a new job we need to setup binning. 
We only do this - * for V3DV_JOB_TYPE_GPU_CL jobs because V3DV_JOB_TYPE_GPU_CL_SECONDARY + * for V3DV_JOB_TYPE_GPU_CL jobs because V3DV_JOB_TYPE_GPU_CL_INCOMPLETE * jobs are not submitted to the GPU directly, and are instead meant to be - * branched to from other V3DV_JOB_TYPE_GPU_CL jobs. + * branched to from other V3DV_JOB_TYPE_GPU_CL jobs. With dynamic rendering, + * all resuming jobs work similarly to secondary command buffers, so we + * apply the same approach. */ if (type == V3DV_JOB_TYPE_GPU_CL && - job->first_subpass == state->subpass_idx) { + job->first_subpass == state->subpass_idx && + !job->resuming) { const struct v3dv_subpass *subpass = &state->pass->subpasses[state->subpass_idx]; const struct v3dv_framebuffer *framebuffer = state->framebuffer; - uint8_t internal_bpp; + uint8_t max_internal_bpp, total_color_bpp; bool msaa; v3dv_X(job->device, framebuffer_compute_internal_bpp_msaa) - (framebuffer, subpass, &internal_bpp, &msaa); + (framebuffer, state->attachments, subpass, + &max_internal_bpp, &total_color_bpp, &msaa); /* From the Vulkan spec: * @@ -1527,9 +1785,10 @@ cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer, framebuffer->width, framebuffer->height, layers, - true, + true, false, subpass->color_count, - internal_bpp, + max_internal_bpp, + total_color_bpp, msaa); } @@ -1545,28 +1804,29 @@ v3dv_cmd_buffer_subpass_start(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_job *job = cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx, - V3DV_JOB_TYPE_GPU_CL); + V3DV_JOB_TYPE_GPU_CL, true); if (!job) return NULL; + /* FIXME: do we need all this below for resuming jobs? */ + /* Check if our render area is aligned to tile boundaries. We have to do * this in each subpass because the subset of attachments used can change * and with that the tile size selected by the hardware can change too. */ cmd_buffer_update_tile_alignment(cmd_buffer); + /* Decide if we can use double-buffer for this subpass job */ + cmd_buffer_subpass_check_double_buffer_mode(cmd_buffer, job->frame_tiling.msaa); + + cmd_buffer_update_attachment_resolve_state(cmd_buffer); + /* If we can't use TLB clears then we need to emit draw clears for any * LOAD_OP_CLEAR attachments in this subpass now. We might also need to emit - * Depth/Stencil clears if we hit GFXH-1461. - * - * Secondary command buffers don't start subpasses (and may not even have - * framebuffer state), so we only care about this in primaries. The only - * exception could be a secondary runnning inside a subpass that needs to - * record a meta operation (with its own render pass) that relies on - * attachment load clears, but we don't have any instances of that right - * now. - */ - if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) + * Depth/Stencil clears if we hit GFXH-1461. With dynamic render passes this + * should only be called when starting the render pass, not when resuming.
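+ * A resumed job must not replay the LOAD_OP_CLEAR draws: load operations
+ * apply only once, at the start of the suspend/resume chain.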
+ */ + if (!cmd_buffer->state.resuming) cmd_buffer_emit_subpass_clears(cmd_buffer); return job; @@ -1580,13 +1840,13 @@ v3dv_cmd_buffer_subpass_resume(struct v3dv_cmd_buffer *cmd_buffer, assert(subpass_idx < cmd_buffer->state.pass->subpass_count); struct v3dv_job *job; - if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) { + if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) { job = cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx, - V3DV_JOB_TYPE_GPU_CL); + V3DV_JOB_TYPE_GPU_CL, false); } else { - assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); + assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); job = cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx, - V3DV_JOB_TYPE_GPU_CL_SECONDARY); + V3DV_JOB_TYPE_GPU_CL_INCOMPLETE, false); } if (!job) @@ -1611,7 +1871,8 @@ v3dv_cmd_buffer_subpass_finish(struct v3dv_cmd_buffer *cmd_buffer) } VKAPI_ATTR void VKAPI_CALL -v3dv_CmdEndRenderPass(VkCommandBuffer commandBuffer) +v3dv_CmdEndRenderPass2(VkCommandBuffer commandBuffer, + const VkSubpassEndInfo *pSubpassEndInfo) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); @@ -1645,7 +1906,7 @@ v3dv_EndCommandBuffer(VkCommandBuffer commandBuffer) * inside a render pass. */ if (cmd_buffer->state.job) { - assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY && + assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY && cmd_buffer->state.pass); v3dv_cmd_buffer_finish_job(cmd_buffer); } @@ -1655,26 +1916,73 @@ v3dv_EndCommandBuffer(VkCommandBuffer commandBuffer) return VK_SUCCESS; } -static void -clone_bo_list(struct v3dv_cmd_buffer *cmd_buffer, +static bool +clone_bo_list(struct v3dv_device *device, struct list_head *dst, struct list_head *src) { - assert(cmd_buffer); + assert(device); list_inithead(dst); list_for_each_entry(struct v3dv_bo, bo, src, list_link) { struct v3dv_bo *clone_bo = - vk_alloc(&cmd_buffer->device->vk.alloc, sizeof(struct v3dv_bo), 8, + vk_alloc(&device->vk.alloc, sizeof(struct v3dv_bo), 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (!clone_bo) { - v3dv_flag_oom(cmd_buffer, NULL); - return; - } + if (!clone_bo) + return false; *clone_bo = *bo; list_addtail(&clone_bo->list_link, dst); } + + return true; +} + +struct v3dv_job * +v3dv_job_clone(struct v3dv_job *job, bool skip_bcl) +{ + struct v3dv_job *clone = vk_alloc(&job->device->vk.alloc, + sizeof(struct v3dv_job), 8, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + if (!clone) + return NULL; + + /* Cloned jobs don't duplicate resources, they share their CLs with the + * original job, since they are typically read-only. The exception to this + * is dynamic rendering suspension paired with + * VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT, since in that case we need + * to patch the BCL with the resume address and for that we need to create a + * copy of the job so we avoid rewriting the resume address for another copy + * of the same job that may be running in the GPU. When we create a job for + * this use case skip_bcl is set to true and the caller will be responsible + * for creating the BCL. + */ + *clone = *job; + clone->is_clone = true; + clone->cmd_buffer = NULL; + + /* We need to regen the BO lists so that they point to the BO list in the + * cloned job. Otherwise functions like list_length() will loop forever.
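+ *
+ * Concretely, after the "*clone = *job" struct copy above, the clone's
+ * list heads still point at nodes whose links terminate at
+ * &job->bcl.bo_list (and friends), so iterating the clone's lists would
+ * never reach the clone's own sentinel node.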
+ */ + if (job->type == V3DV_JOB_TYPE_GPU_CL) { + assert(job->cmd_buffer); + struct v3dv_device *device = job->cmd_buffer->device; + + clone->bcl.job = clone; + clone->rcl.job = clone; + clone->indirect.job = clone; + + if (!skip_bcl && + !clone_bo_list(device, &clone->bcl.bo_list, &job->bcl.bo_list)) { + return NULL; + } + if (!clone_bo_list(device, &clone->rcl.bo_list, &job->rcl.bo_list)) + return NULL; + if (!clone_bo_list(device, &clone->indirect.bo_list, &job->indirect.bo_list)) + return NULL; + } + + return clone; } /* Clones a job for inclusion in the given command buffer. Note that this @@ -1687,31 +1995,29 @@ struct v3dv_job * v3dv_job_clone_in_cmd_buffer(struct v3dv_job *job, struct v3dv_cmd_buffer *cmd_buffer) { - struct v3dv_job *clone_job = vk_alloc(&job->device->vk.alloc, - sizeof(struct v3dv_job), 8, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (!clone_job) { + struct v3dv_job *clone = v3dv_job_clone(job, false); + if (!clone) { v3dv_flag_oom(cmd_buffer, NULL); return NULL; } - /* Cloned jobs don't duplicate resources! */ - *clone_job = *job; - clone_job->is_clone = true; - clone_job->cmd_buffer = cmd_buffer; - list_addtail(&clone_job->list_link, &cmd_buffer->jobs); + clone->cmd_buffer = cmd_buffer; + list_addtail(&clone->list_link, &cmd_buffer->jobs); + return clone; +} - /* We need to regen the BO lists so that they point to the BO list in the - * cloned job. Otherwise functions like list_length() will loop forever. - */ - if (job->type == V3DV_JOB_TYPE_GPU_CL) { - clone_bo_list(cmd_buffer, &clone_job->bcl.bo_list, &job->bcl.bo_list); - clone_bo_list(cmd_buffer, &clone_job->rcl.bo_list, &job->rcl.bo_list); - clone_bo_list(cmd_buffer, &clone_job->indirect.bo_list, - &job->indirect.bo_list); - } +void +v3dv_cmd_buffer_merge_barrier_state(struct v3dv_barrier_state *dst, + struct v3dv_barrier_state *src) +{ + dst->dst_mask |= src->dst_mask; - return clone_job; + dst->src_mask_graphics |= src->src_mask_graphics; + dst->src_mask_compute |= src->src_mask_compute; + dst->src_mask_transfer |= src->src_mask_transfer; + + dst->bcl_buffer_access |= src->bcl_buffer_access; + dst->bcl_image_access |= src->bcl_image_access; } static void @@ -1719,8 +2025,7 @@ cmd_buffer_execute_outside_pass(struct v3dv_cmd_buffer *primary, uint32_t cmd_buffer_count, const VkCommandBuffer *cmd_buffers) { - bool pending_barrier = false; - bool pending_bcl_barrier = false; + struct v3dv_barrier_state pending_barrier = { 0 }; for (uint32_t i = 0; i < cmd_buffer_count; i++) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, secondary, cmd_buffers[i]); @@ -1743,17 +2048,23 @@ cmd_buffer_execute_outside_pass(struct v3dv_cmd_buffer *primary, list_for_each_entry(struct v3dv_job, secondary_job, &secondary->jobs, list_link) { /* These can only happen inside a render pass */ - assert(secondary_job->type != V3DV_JOB_TYPE_GPU_CL_SECONDARY); + assert(secondary_job->type != V3DV_JOB_TYPE_GPU_CL_INCOMPLETE); struct v3dv_job *job = v3dv_job_clone_in_cmd_buffer(secondary_job, primary); if (!job) return; - if (pending_barrier) { - job->serialize = true; - if (pending_bcl_barrier) + if (pending_barrier.dst_mask) { + /* FIXME: do the same as we do for primaries and only choose the + * relevant src masks.
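+ *
+ * I.e., roughly what cmd_buffer_serialize_job_if_needed() does:
+ *
+ *    if (job->type == V3DV_JOB_TYPE_GPU_CSD)
+ *       job->serialize = pending_barrier.src_mask_compute;
+ *    else if (job->is_transfer)
+ *       job->serialize = pending_barrier.src_mask_transfer;
+ *    else
+ *       job->serialize = pending_barrier.src_mask_graphics;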
+ */ + job->serialize = pending_barrier.src_mask_graphics | + pending_barrier.src_mask_transfer | + pending_barrier.src_mask_compute; + if (pending_barrier.bcl_buffer_access || + pending_barrier.bcl_image_access) { job->needs_bcl_sync = true; - pending_barrier = false; - pending_bcl_barrier = false; + } + memset(&pending_barrier, 0, sizeof(pending_barrier)); } } @@ -1761,14 +2072,15 @@ cmd_buffer_execute_outside_pass(struct v3dv_cmd_buffer *primary, * barrier state consumed with whatever comes after it (first job in * the next secondary or the primary, if this was the last secondary). */ - assert(secondary->state.has_barrier || !secondary->state.has_bcl_barrier); - pending_barrier = secondary->state.has_barrier; - pending_bcl_barrier = secondary->state.has_bcl_barrier; + assert(secondary->state.barrier.dst_mask || + (!secondary->state.barrier.bcl_buffer_access && + !secondary->state.barrier.bcl_image_access)); + pending_barrier = secondary->state.barrier; } - if (pending_barrier) { - primary->state.has_barrier = true; - primary->state.has_bcl_barrier |= pending_bcl_barrier; + if (pending_barrier.dst_mask) { + v3dv_cmd_buffer_merge_barrier_state(&primary->state.barrier, + &pending_barrier); } } @@ -1788,100 +2100,36 @@ v3dv_CmdExecuteCommands(VkCommandBuffer commandBuffer, } } -/* This goes though the list of possible dynamic states in the pipeline and, - * for those that are not configured as dynamic, copies relevant state into - * the command buffer. - */ static void -cmd_buffer_bind_pipeline_static_state(struct v3dv_cmd_buffer *cmd_buffer, - const struct v3dv_dynamic_state *src) -{ - struct v3dv_dynamic_state *dest = &cmd_buffer->state.dynamic; - uint32_t dynamic_mask = src->mask; - uint32_t dirty = 0; - - if (!(dynamic_mask & V3DV_DYNAMIC_VIEWPORT)) { - dest->viewport.count = src->viewport.count; - if (memcmp(&dest->viewport.viewports, &src->viewport.viewports, - src->viewport.count * sizeof(VkViewport))) { - typed_memcpy(dest->viewport.viewports, - src->viewport.viewports, - src->viewport.count); - typed_memcpy(dest->viewport.scale, src->viewport.scale, - src->viewport.count); - typed_memcpy(dest->viewport.translate, src->viewport.translate, - src->viewport.count); - dirty |= V3DV_CMD_DIRTY_VIEWPORT; - } - } - - if (!(dynamic_mask & V3DV_DYNAMIC_SCISSOR)) { - dest->scissor.count = src->scissor.count; - if (memcmp(&dest->scissor.scissors, &src->scissor.scissors, - src->scissor.count * sizeof(VkRect2D))) { - typed_memcpy(dest->scissor.scissors, - src->scissor.scissors, src->scissor.count); - dirty |= V3DV_CMD_DIRTY_SCISSOR; - } - } - - if (!(dynamic_mask & V3DV_DYNAMIC_STENCIL_COMPARE_MASK)) { - if (memcmp(&dest->stencil_compare_mask, &src->stencil_compare_mask, - sizeof(src->stencil_compare_mask))) { - dest->stencil_compare_mask = src->stencil_compare_mask; - dirty |= V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK; - } - } - - if (!(dynamic_mask & V3DV_DYNAMIC_STENCIL_WRITE_MASK)) { - if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask, - sizeof(src->stencil_write_mask))) { - dest->stencil_write_mask = src->stencil_write_mask; - dirty |= V3DV_CMD_DIRTY_STENCIL_WRITE_MASK; - } - } - - if (!(dynamic_mask & V3DV_DYNAMIC_STENCIL_REFERENCE)) { - if (memcmp(&dest->stencil_reference, &src->stencil_reference, - sizeof(src->stencil_reference))) { - dest->stencil_reference = src->stencil_reference; - dirty |= V3DV_CMD_DIRTY_STENCIL_REFERENCE; - } - } - - if (!(dynamic_mask & V3DV_DYNAMIC_BLEND_CONSTANTS)) { - if (memcmp(dest->blend_constants, src->blend_constants, - sizeof(src->blend_constants))) 
{ - memcpy(dest->blend_constants, src->blend_constants, - sizeof(src->blend_constants)); - dirty |= V3DV_CMD_DIRTY_BLEND_CONSTANTS; - } - } - - if (!(dynamic_mask & V3DV_DYNAMIC_DEPTH_BIAS)) { - if (memcmp(&dest->depth_bias, &src->depth_bias, - sizeof(src->depth_bias))) { - memcpy(&dest->depth_bias, &src->depth_bias, sizeof(src->depth_bias)); - dirty |= V3DV_CMD_DIRTY_DEPTH_BIAS; - } - } - - if (!(dynamic_mask & V3DV_DYNAMIC_LINE_WIDTH)) { - if (dest->line_width != src->line_width) { - dest->line_width = src->line_width; - dirty |= V3DV_CMD_DIRTY_LINE_WIDTH; - } +cmd_buffer_copy_private_dynamic_state(struct v3dv_dynamic_state *dst, + struct v3dv_dynamic_state *src, + struct vk_dynamic_graphics_state *src_dyn) +{ + if (BITSET_TEST(src_dyn->set, MESA_VK_DYNAMIC_VP_VIEWPORTS)) { + typed_memcpy(dst->viewport.scale, src->viewport.scale, + MAX_VIEWPORTS); + typed_memcpy(dst->viewport.translate, src->viewport.translate, + MAX_VIEWPORTS); } + if (BITSET_TEST(src_dyn->set, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES)) + dst->color_write_enable = src->color_write_enable; +} - if (!(dynamic_mask & V3DV_DYNAMIC_COLOR_WRITE_ENABLE)) { - if (dest->color_write_enable != src->color_write_enable) { - dest->color_write_enable = src->color_write_enable; - dirty |= V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE; - } - } +/* This function copies relevant static state from the pipeline to the command + * buffer state. + * + * Notice the Vulkan runtime uses the term 'dynamic' to refer to all state + * that *could* be dynamic, even if it is not dynamic for a particular + * pipeline, so the terminology used in the runtime may be a bit misleading. + */ +static void +cmd_buffer_bind_pipeline_static_state(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_pipeline *pipeline) +{ + vk_cmd_set_dynamic_graphics_state(&cmd_buffer->vk, &pipeline->dynamic_graphics_state); + cmd_buffer_copy_private_dynamic_state(&cmd_buffer->state.dynamic, &pipeline->dynamic, + &pipeline->dynamic_graphics_state); - cmd_buffer->state.dynamic.mask = dynamic_mask; - cmd_buffer->state.dirty |= dirty; } static void @@ -1889,13 +2137,17 @@ bind_graphics_pipeline(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_pipeline *pipeline) { assert(pipeline && !(pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT)); + + /* We need to unconditionally bind the pipeline static state, as the state + * could have changed (through calls to vkCmdSetXXX) between bindings of + * the same pipeline. + */ + cmd_buffer_bind_pipeline_static_state(cmd_buffer, pipeline); + if (cmd_buffer->state.gfx.pipeline == pipeline) return; cmd_buffer->state.gfx.pipeline = pipeline; - - cmd_buffer_bind_pipeline_static_state(cmd_buffer, &pipeline->dynamic_state); - cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PIPELINE; } @@ -1935,39 +2187,66 @@ v3dv_CmdBindPipeline(VkCommandBuffer commandBuffer, } } -/* FIXME: C&P from radv. tu has similar code. Perhaps common place? */ +/* Considers the pipeline's negative_one_to_one state and applies it to the + * current viewport transform if needed to produce the resulting Z translate + * and scale parameters. 
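+ *
+ * With the default zero-to-one convention the cached transform gives
+ * z_fb = n + (f - n) * z_ndc for z_ndc in [0, 1]. For a [-1, 1] clip
+ * volume we instead need z_fb = (n + f) / 2 + ((f - n) / 2) * z_ndc so
+ * that -1 still maps to minDepth and 1 to maxDepth; hence the halving of
+ * the scale and the re-centering of the translate below.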
+ */ void -v3dv_viewport_compute_xform(const VkViewport *viewport, - float scale[3], - float translate[3]) -{ - float x = viewport->x; - float y = viewport->y; - float half_width = 0.5f * viewport->width; - float half_height = 0.5f * viewport->height; - double n = viewport->minDepth; - double f = viewport->maxDepth; - - scale[0] = half_width; - translate[0] = half_width + x; - scale[1] = half_height; - translate[1] = half_height + y; - - scale[2] = (f - n); - translate[2] = n; - - /* It seems that if the scale is small enough the hardware won't clip - * correctly so we work around this my choosing the smallest scale that - * seems to work. - * - * This case is exercised by CTS: - * dEQP-VK.draw.inverted_depth_ranges.nodepthclamp_deltazero +v3dv_cmd_buffer_state_get_viewport_z_xform(struct v3dv_cmd_buffer *cmd_buffer, + uint32_t vp_idx, + float *translate_z, float *scale_z) +{ + const struct v3dv_viewport_state *vp_state = &cmd_buffer->state.dynamic.viewport; + const struct vk_viewport_state *vk_vp_state = &cmd_buffer->vk.dynamic_graphics_state.vp; + + float t = vp_state->translate[vp_idx][2]; + float s = vp_state->scale[vp_idx][2]; + + assert(cmd_buffer->state.gfx.pipeline); + if (cmd_buffer->state.gfx.pipeline->negative_one_to_one) { + t = (t + vk_vp_state->viewports[vp_idx].maxDepth) * 0.5f; + s *= 0.5f; + } + + if (translate_z) + *translate_z = t; + + if (scale_z) + *scale_z = s; +} + +VKAPI_ATTR void VKAPI_CALL +v3dv_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer, + uint32_t attachmentCount, + const VkBool32 *pColorWriteEnables) +{ + V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); + struct v3dv_dynamic_state *v3dv_dyn = &cmd_buffer->state.dynamic; + struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state; + uint32_t color_write_enable = 0; + + /* The Vulkan runtime computes color_write_enable as an 8-bit bitset, setting a + * bit per attachment. But when emitting, it is combined with the + * color_write_mask, which is stored as a 32-bit mask (one bit per channel, + * per attachment). So we store the color_write_enable as a 32-bit mask + * ourselves. */ - const float min_abs_scale = 0.000009f; - if (fabs(scale[2]) < min_abs_scale) - scale[2] = min_abs_scale * (scale[2] < 0 ? -1.0f : 1.0f); + for (uint32_t i = 0; i < attachmentCount; i++) + color_write_enable |= pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0; + + if (v3dv_dyn->color_write_enable == color_write_enable) + return; + + v3dv_dyn->color_write_enable = color_write_enable; + BITSET_SET(dyn->set, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES); + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES); } +/* We keep a custom CmdSetViewport because we want to cache the outcome of + * viewport_compute_xform, and because we need to set the viewport count. This + * is especially relevant to our case because we are pushing/popping the + * dynamic state as part of the meta operations.
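+ *
+ * As an example of what gets cached: with the zero-to-one depth convention,
+ * a viewport of (x=0, y=0, w=800, h=600, minDepth=0, maxDepth=1) yields
+ * scale = (400, 300, 1) and translate = (400, 300, 0), which emit_scissor()
+ * and the Z-transform helper above consume without recomputing them.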
+ */ VKAPI_ATTR void VKAPI_CALL v3dv_CmdSetViewport(VkCommandBuffer commandBuffer, uint32_t firstViewport, @@ -1975,63 +2254,55 @@ v3dv_CmdSetViewport(VkCommandBuffer commandBuffer, const VkViewport *pViewports) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; - const uint32_t total_count = firstViewport + viewportCount; + struct v3dv_dynamic_state *v3dv_dyn = &cmd_buffer->state.dynamic; + struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state; + const uint32_t total_count = firstViewport + viewportCount; assert(firstViewport < MAX_VIEWPORTS); assert(total_count >= 1 && total_count <= MAX_VIEWPORTS); - if (state->dynamic.viewport.count < total_count) - state->dynamic.viewport.count = total_count; - - if (!memcmp(state->dynamic.viewport.viewports + firstViewport, - pViewports, viewportCount * sizeof(*pViewports))) { - return; - } - - memcpy(state->dynamic.viewport.viewports + firstViewport, pViewports, - viewportCount * sizeof(*pViewports)); + vk_common_CmdSetViewportWithCount(commandBuffer, + total_count, + pViewports); for (uint32_t i = firstViewport; i < total_count; i++) { - v3dv_viewport_compute_xform(&state->dynamic.viewport.viewports[i], - state->dynamic.viewport.scale[i], - state->dynamic.viewport.translate[i]); + v3dv_X(cmd_buffer->device, viewport_compute_xform) + (&dyn->vp.viewports[i], v3dv_dyn->viewport.scale[i], + v3dv_dyn->viewport.translate[i]); } +} - cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEWPORT; +VKAPI_ATTR void VKAPI_CALL +v3dv_CmdSetViewportWithCount(VkCommandBuffer commandBuffer, + uint32_t viewportCount, + const VkViewport *pViewports) +{ + v3dv_CmdSetViewport(commandBuffer, 0, viewportCount, pViewports); } +/* We keep a custom CmdSetScissor because we need to set the scissor + * count. This is especially relevant to our case because we are + * pushing/popping the dynamic state as part of the meta operations.
+ */ VKAPI_ATTR void VKAPI_CALL v3dv_CmdSetScissor(VkCommandBuffer commandBuffer, uint32_t firstScissor, uint32_t scissorCount, const VkRect2D *pScissors) { - V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; - assert(firstScissor < MAX_SCISSORS); assert(firstScissor + scissorCount >= 1 && firstScissor + scissorCount <= MAX_SCISSORS); - if (state->dynamic.scissor.count < firstScissor + scissorCount) - state->dynamic.scissor.count = firstScissor + scissorCount; - - if (!memcmp(state->dynamic.scissor.scissors + firstScissor, - pScissors, scissorCount * sizeof(*pScissors))) { - return; - } - - memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors, - scissorCount * sizeof(*pScissors)); - - cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_SCISSOR; + vk_common_CmdSetScissorWithCount(commandBuffer, + firstScissor + scissorCount, + pScissors); } static void emit_scissor(struct v3dv_cmd_buffer *cmd_buffer) { - if (cmd_buffer->state.dynamic.viewport.count == 0) + if (cmd_buffer->vk.dynamic_graphics_state.vp.viewport_count == 0) return; struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic; @@ -2041,11 +2312,14 @@ emit_scissor(struct v3dv_cmd_buffer *cmd_buffer) */ float *vptranslate = dynamic->viewport.translate[0]; float *vpscale = dynamic->viewport.scale[0]; + assert(vpscale[0] >= 0); - float vp_minx = -fabsf(vpscale[0]) + vptranslate[0]; - float vp_maxx = fabsf(vpscale[0]) + vptranslate[0]; - float vp_miny = -fabsf(vpscale[1]) + vptranslate[1]; - float vp_maxy = fabsf(vpscale[1]) + vptranslate[1]; + float vp_minx = vptranslate[0] - vpscale[0]; + float vp_maxx = vptranslate[0] + vpscale[0]; + + /* With KHR_maintenance1 viewport may have negative Y */ + float vp_miny = vptranslate[1] - fabsf(vpscale[1]); + float vp_maxy = vptranslate[1] + fabsf(vpscale[1]); /* Quoting from v3dx_emit: * "Clip to the scissor if it's enabled, but still clip to the @@ -2074,18 +2348,15 @@ emit_scissor(struct v3dv_cmd_buffer *cmd_buffer) maxy = MIN2(vp_maxy, cmd_buffer->state.render_area.offset.y + cmd_buffer->state.render_area.extent.height); - minx = vp_minx; - miny = vp_miny; - maxx = vp_maxx; - maxy = vp_maxy; - /* Clip against user provided scissor if needed. * * FIXME: right now we only allow one scissor. Below would need to be * updated if we support more */ - if (dynamic->scissor.count > 0) { - VkRect2D *scissor = &dynamic->scissor.scissors[0]; + struct vk_dynamic_graphics_state *vk_dyn = + &cmd_buffer->vk.dynamic_graphics_state; + if (vk_dyn->vp.scissor_count > 0) { + VkRect2D *scissor = &vk_dyn->vp.scissors[0]; minx = MAX2(minx, scissor->offset.x); miny = MAX2(miny, scissor->offset.y); maxx = MIN2(maxx, scissor->offset.x + scissor->extent.width); @@ -2108,12 +2379,11 @@ emit_scissor(struct v3dv_cmd_buffer *cmd_buffer) v3dv_X(cmd_buffer->device, job_emit_clip_window) (cmd_buffer->state.job, &cmd_buffer->state.clip_window); - cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_SCISSOR; + BITSET_CLEAR(vk_dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS); } -static void -update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer, - uint32_t dirty_uniform_state) +static bool +update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer) { /* We need to update uniform streams if any piece of state that is passed * to the shader as a uniform may have changed. 
@@ -2121,15 +2391,29 @@ update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer, * If only descriptor sets are dirty then we can safely ignore updates * for shader stages that don't access descriptors. */ - struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; assert(pipeline); + uint32_t dirty = cmd_buffer->state.dirty; + struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + + const bool dirty_uniform_state = + (dirty & (V3DV_CMD_DIRTY_PIPELINE | + V3DV_CMD_DIRTY_PUSH_CONSTANTS | + V3DV_CMD_DIRTY_DESCRIPTOR_SETS | + V3DV_CMD_DIRTY_VIEW_INDEX | + V3DV_CMD_DIRTY_DRAW_ID)) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS); - const bool has_new_pipeline = dirty_uniform_state & V3DV_CMD_DIRTY_PIPELINE; - const bool has_new_viewport = dirty_uniform_state & V3DV_CMD_DIRTY_VIEWPORT; - const bool has_new_push_constants = dirty_uniform_state & V3DV_CMD_DIRTY_PUSH_CONSTANTS; - const bool has_new_descriptors = dirty_uniform_state & V3DV_CMD_DIRTY_DESCRIPTOR_SETS; - const bool has_new_view_index = dirty_uniform_state & V3DV_CMD_DIRTY_VIEW_INDEX; + if (!dirty_uniform_state) + return false; + + const bool has_new_pipeline = dirty & V3DV_CMD_DIRTY_PIPELINE; + const bool has_new_viewport = BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS); + const bool has_new_push_constants = dirty & V3DV_CMD_DIRTY_PUSH_CONSTANTS; + const bool has_new_descriptors = dirty & V3DV_CMD_DIRTY_DESCRIPTOR_SETS; + const bool has_new_view_index = dirty & V3DV_CMD_DIRTY_VIEW_INDEX; + const bool has_new_draw_id = dirty & V3DV_CMD_DIRTY_DRAW_ID; /* VK_SHADER_STAGE_FRAGMENT_BIT */ const bool has_new_descriptors_fs = @@ -2143,8 +2427,7 @@ update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer, const bool needs_fs_update = has_new_pipeline || has_new_view_index || has_new_push_constants_fs || - has_new_descriptors_fs || - has_new_view_index; + has_new_descriptors_fs; if (needs_fs_update) { struct v3dv_shader_variant *fs_variant = @@ -2198,6 +2481,7 @@ update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer, const bool needs_vs_update = has_new_viewport || has_new_view_index || + has_new_draw_id || has_new_pipeline || has_new_push_constants_vs || has_new_descriptors_vs; @@ -2217,6 +2501,9 @@ update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer, } cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_VIEW_INDEX; + cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DRAW_ID; + + return true; } /* This stores command buffer state that we might be about to stomp for @@ -2228,32 +2515,43 @@ v3dv_cmd_buffer_meta_state_push(struct v3dv_cmd_buffer *cmd_buffer, { struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + /* Attachment state. + * + * We store this state even if we are not currently in a subpass + * (subpass_idx != -1) because we may get here to implement subpass + * resolves via vkCmdResolveImage from + * cmd_buffer_subpass_handle_pending_resolves. In that scenario we pretend + * we are no longer in a subpass because Vulkan disallows image resolves + * via vkCmdResolveImage during subpasses, but we still need to preserve + * attachment state because we may have more subpasses to go through + * after processing resolves in the current subpass.
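+ *
+ * The usual shape of a meta operation around this (a sketch):
+ *
+ *    v3dv_cmd_buffer_meta_state_push(cmd_buffer, ...);
+ *    ... record the meta job(s) ...
+ *    v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false /* needs_subpass_resume */);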
+ */ + const uint32_t attachment_state_item_size = + sizeof(struct v3dv_cmd_buffer_attachment_state); + const uint32_t attachment_state_total_size = + attachment_state_item_size * state->attachment_alloc_count; + if (state->meta.attachment_alloc_count < state->attachment_alloc_count) { + if (state->meta.attachment_alloc_count > 0) + vk_free(&cmd_buffer->device->vk.alloc, state->meta.attachments); + + state->meta.attachments = vk_zalloc(&cmd_buffer->device->vk.alloc, + attachment_state_total_size, 8, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + if (!state->meta.attachments) { + v3dv_flag_oom(cmd_buffer, NULL); + return; + } + state->meta.attachment_alloc_count = state->attachment_alloc_count; + } + state->meta.attachment_count = state->attachment_alloc_count; + memcpy(state->meta.attachments, state->attachments, + attachment_state_total_size); + if (state->subpass_idx != -1) { state->meta.subpass_idx = state->subpass_idx; state->meta.framebuffer = v3dv_framebuffer_to_handle(state->framebuffer); state->meta.pass = v3dv_render_pass_to_handle(state->pass); - const uint32_t attachment_state_item_size = - sizeof(struct v3dv_cmd_buffer_attachment_state); - const uint32_t attachment_state_total_size = - attachment_state_item_size * state->attachment_alloc_count; - if (state->meta.attachment_alloc_count < state->attachment_alloc_count) { - if (state->meta.attachment_alloc_count > 0) - vk_free(&cmd_buffer->device->vk.alloc, state->meta.attachments); - - state->meta.attachments = vk_zalloc(&cmd_buffer->device->vk.alloc, - attachment_state_total_size, 8, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (!state->meta.attachments) { - v3dv_flag_oom(cmd_buffer, NULL); - return; - } - state->meta.attachment_alloc_count = state->attachment_alloc_count; - } - state->meta.attachment_count = state->attachment_alloc_count; - memcpy(state->meta.attachments, state->attachments, - attachment_state_total_size); - state->meta.tile_aligned_render_area = state->tile_aligned_render_area; memcpy(&state->meta.render_area, &state->render_area, sizeof(VkRect2D)); } @@ -2262,6 +2560,8 @@ v3dv_cmd_buffer_meta_state_push(struct v3dv_cmd_buffer *cmd_buffer, * account the graphics pipeline, and the graphics state */ state->meta.gfx.pipeline = state->gfx.pipeline; + vk_dynamic_graphics_state_copy(&state->meta.dynamic_graphics_state, + &cmd_buffer->vk.dynamic_graphics_state); memcpy(&state->meta.dynamic, &state->dynamic, sizeof(state->dynamic)); struct v3dv_descriptor_state *gfx_descriptor_state = @@ -2277,35 +2577,35 @@ v3dv_cmd_buffer_meta_state_push(struct v3dv_cmd_buffer *cmd_buffer, state->meta.has_descriptor_state = false; } - /* FIXME: if we keep track of wether we have bound any push constant state - * at all we could restruct this only to cases where it is actually - * necessary. 
- */ - memcpy(state->meta.push_constants, cmd_buffer->push_constants_data, - sizeof(state->meta.push_constants)); + if (cmd_buffer->state.push_constants_size > 0) { + state->meta.push_constants_size = cmd_buffer->state.push_constants_size; + memcpy(state->meta.push_constants, cmd_buffer->state.push_constants_data, + cmd_buffer->state.push_constants_size); + cmd_buffer->state.push_constants_size = 0; + } } /* This restores command buffer state after a meta operation */ void v3dv_cmd_buffer_meta_state_pop(struct v3dv_cmd_buffer *cmd_buffer, - uint32_t dirty_dynamic_state, bool needs_subpass_resume) { struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + /* Attachment state */ + assert(state->meta.attachment_count <= state->attachment_alloc_count); + const uint32_t attachment_state_item_size = + sizeof(struct v3dv_cmd_buffer_attachment_state); + const uint32_t attachment_state_total_size = + attachment_state_item_size * state->meta.attachment_count; + memcpy(state->attachments, state->meta.attachments, + attachment_state_total_size); + if (state->meta.subpass_idx != -1) { state->pass = v3dv_render_pass_from_handle(state->meta.pass); state->framebuffer = v3dv_framebuffer_from_handle(state->meta.framebuffer); - assert(state->meta.attachment_count <= state->attachment_alloc_count); - const uint32_t attachment_state_item_size = - sizeof(struct v3dv_cmd_buffer_attachment_state); - const uint32_t attachment_state_total_size = - attachment_state_item_size * state->meta.attachment_count; - memcpy(state->attachments, state->meta.attachments, - attachment_state_total_size); - state->tile_aligned_render_area = state->meta.tile_aligned_render_area; memcpy(&state->render_area, &state->meta.render_area, sizeof(VkRect2D)); @@ -2331,10 +2631,11 @@ v3dv_cmd_buffer_meta_state_pop(struct v3dv_cmd_buffer *cmd_buffer, state->gfx.pipeline = NULL; } - if (dirty_dynamic_state) { - memcpy(&state->dynamic, &state->meta.dynamic, sizeof(state->dynamic)); - state->dirty |= dirty_dynamic_state; - } + /* Restore dynamic state */ + vk_dynamic_graphics_state_copy(&cmd_buffer->vk.dynamic_graphics_state, + &state->meta.dynamic_graphics_state); + memcpy(&state->dynamic, &state->meta.dynamic, sizeof(state->dynamic)); + state->dirty = ~0; if (state->meta.has_descriptor_state) { if (state->meta.gfx.descriptor_state.valid != 0) { @@ -2345,14 +2646,23 @@ v3dv_cmd_buffer_meta_state_pop(struct v3dv_cmd_buffer *cmd_buffer, } } - memcpy(cmd_buffer->push_constants_data, state->meta.push_constants, - sizeof(state->meta.push_constants)); + /* We only need to restore push constant data if we had any data in the + * original command buffer and the meta operation wrote new push constant + * data. + */ + if (state->meta.push_constants_size > 0 && + cmd_buffer->state.push_constants_size > 0) { + memcpy(cmd_buffer->state.push_constants_data, state->meta.push_constants, + state->meta.push_constants_size); + } + cmd_buffer->state.push_constants_size = state->meta.push_constants_size; state->meta.gfx.pipeline = NULL; state->meta.framebuffer = VK_NULL_HANDLE; state->meta.pass = VK_NULL_HANDLE; state->meta.subpass_idx = -1; state->meta.has_descriptor_state = false; + state->meta.push_constants_size = 0; } static struct v3dv_job * @@ -2399,7 +2709,7 @@ cmd_buffer_pre_draw_split_job(struct v3dv_cmd_buffer *cmd_buffer) * in rasterization." * * We need to enable MSAA in the TILE_BINNING_MODE_CFG packet, which we - * emit when we start a new frame at the begining of a subpass. 
At that point, + * emit when we start a new frame at the beginning of a subpass. At that point, * if the framebuffer doesn't have any attachments we won't enable MSAA and * the job won't be valid in the scenario described by the spec. * @@ -2434,7 +2744,7 @@ cmd_buffer_restart_job_for_msaa_if_needed(struct v3dv_cmd_buffer *cmd_buffer) * draw calls in them, and then using that info to decide if we need to * restart the primary job into which they are being recorded. */ - if (cmd_buffer->level != VK_COMMAND_BUFFER_LEVEL_PRIMARY) + if (cmd_buffer->vk.level != VK_COMMAND_BUFFER_LEVEL_PRIMARY) return; /* Drop the current job and restart it with MSAA enabled */ @@ -2457,16 +2767,185 @@ cmd_buffer_restart_job_for_msaa_if_needed(struct v3dv_cmd_buffer *cmd_buffer) old_job->frame_tiling.width, old_job->frame_tiling.height, old_job->frame_tiling.layers, - true, + true, false, old_job->frame_tiling.render_target_count, old_job->frame_tiling.internal_bpp, + old_job->frame_tiling.total_color_bpp, true /* msaa */); v3dv_job_destroy(old_job); } +static bool +cmd_buffer_binning_sync_required(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_pipeline *pipeline, + bool indexed, bool indirect) +{ + const struct v3dv_descriptor_maps *vs_bin_maps = + pipeline->shared_data->maps[BROADCOM_SHADER_VERTEX_BIN]; + + const struct v3dv_descriptor_maps *gs_bin_maps = + pipeline->shared_data->maps[BROADCOM_SHADER_GEOMETRY_BIN]; + + VkAccessFlags buffer_access = + cmd_buffer->state.barrier.bcl_buffer_access; + if (buffer_access) { + /* Index buffer read */ + if (indexed && (buffer_access & (VK_ACCESS_2_INDEX_READ_BIT | + VK_ACCESS_2_MEMORY_READ_BIT))) { + return true; + } + + /* Indirect buffer read */ + if (indirect && (buffer_access & (VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT | + VK_ACCESS_2_MEMORY_READ_BIT))) { + return true; + } + + /* Attribute read */ + if (buffer_access & (VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT | + VK_ACCESS_2_MEMORY_READ_BIT)) { + const struct v3d_vs_prog_data *prog_data = + pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]->prog_data.vs; + + for (int i = 0; i < ARRAY_SIZE(prog_data->vattr_sizes); i++) { + if (prog_data->vattr_sizes[i] > 0) + return true; + } + } + + /* UBO / SSBO read */ + if (buffer_access & (VK_ACCESS_2_UNIFORM_READ_BIT | + VK_ACCESS_2_SHADER_READ_BIT | + VK_ACCESS_2_MEMORY_READ_BIT | + VK_ACCESS_2_SHADER_STORAGE_READ_BIT)) { + + if (vs_bin_maps->ubo_map.num_desc > 0 || + vs_bin_maps->ssbo_map.num_desc > 0) { + return true; + } + + if (gs_bin_maps && (gs_bin_maps->ubo_map.num_desc > 0 || + gs_bin_maps->ssbo_map.num_desc > 0)) { + return true; + } + } + + /* SSBO write */ + if (buffer_access & (VK_ACCESS_2_SHADER_WRITE_BIT | + VK_ACCESS_2_MEMORY_WRITE_BIT | + VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT)) { + if (vs_bin_maps->ssbo_map.num_desc > 0) + return true; + + if (gs_bin_maps && gs_bin_maps->ssbo_map.num_desc > 0) + return true; + } + + /* Texel Buffer read */ + if (buffer_access & (VK_ACCESS_2_SHADER_SAMPLED_READ_BIT | + VK_ACCESS_2_MEMORY_READ_BIT)) { + if (vs_bin_maps->texture_map.num_desc > 0) + return true; + + if (gs_bin_maps && gs_bin_maps->texture_map.num_desc > 0) + return true; + } + } + + VkAccessFlags image_access = + cmd_buffer->state.barrier.bcl_image_access; + if (image_access) { + /* Image load / store */ + if (image_access & (VK_ACCESS_2_SHADER_READ_BIT | + VK_ACCESS_2_SHADER_WRITE_BIT | + VK_ACCESS_2_SHADER_SAMPLED_READ_BIT | + VK_ACCESS_2_SHADER_STORAGE_READ_BIT | + VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT | + VK_ACCESS_2_MEMORY_READ_BIT | + 
VK_ACCESS_2_MEMORY_WRITE_BIT)) { + if (vs_bin_maps->texture_map.num_desc > 0 || + vs_bin_maps->sampler_map.num_desc > 0) { + return true; + } + + if (gs_bin_maps && (gs_bin_maps->texture_map.num_desc > 0 || + gs_bin_maps->sampler_map.num_desc > 0)) { + return true; + } + } + } + + return false; +} + void -v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer) +v3dv_cmd_buffer_consume_bcl_sync(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_job *job) +{ + job->needs_bcl_sync = true; + cmd_buffer->state.barrier.bcl_buffer_access = 0; + cmd_buffer->state.barrier.bcl_image_access = 0; +} + +static inline uint32_t +compute_prog_score(struct v3dv_shader_variant *vs) +{ + const uint32_t inst_count = vs->qpu_insts_size / sizeof(uint64_t); + const uint32_t tmu_count = vs->prog_data.base->tmu_count + + vs->prog_data.base->tmu_spills + + vs->prog_data.base->tmu_fills; + return inst_count + 4 * tmu_count; +} + +static void +job_update_double_buffer_score(struct v3dv_job *job, + struct v3dv_pipeline *pipeline, + uint32_t vertex_count, + VkExtent2D *render_area) +{ + /* FIXME: assume anything with GS workloads is too expensive */ + struct v3dv_shader_variant *gs_bin = + pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN]; + if (gs_bin) { + job->can_use_double_buffer = false; + return; + } + + /* Keep track of vertex processing: too much geometry processing would not + * be good for double-buffer. + */ + struct v3dv_shader_variant *vs_bin = + pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]; + assert(vs_bin); + uint32_t geom_score = vertex_count * compute_prog_score(vs_bin); + + struct v3dv_shader_variant *vs = + pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]; + assert(vs); + uint32_t vs_score = vertex_count * compute_prog_score(vs); + geom_score += vs_score; + + job->double_buffer_score.geom += geom_score; + + /* Compute pixel rendering cost. + * + * We estimate that on average a draw would render 0.2% of the pixels in + * the render area. That would be a 64x64 region in a 1920x1080 area. + */ + struct v3dv_shader_variant *fs = + pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]; + assert(fs); + uint32_t pixel_count = 0.002f * render_area->width * render_area->height; + uint32_t render_score = vs_score + pixel_count * compute_prog_score(fs); + + job->double_buffer_score.render += render_score; +} + +void +v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer, + bool indexed, bool indirect, + uint32_t vertex_count) { assert(cmd_buffer->state.gfx.pipeline); assert(!(cmd_buffer->state.gfx.pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT)); @@ -2489,6 +2968,23 @@ v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer) struct v3dv_job *job = cmd_buffer_pre_draw_split_job(cmd_buffer); job->draw_count++; + /* Track VK_KHR_buffer_device_address usage in the job */ + struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + job->uses_buffer_device_address |= pipeline->uses_buffer_device_address; + + /* If this job is serialized (has consumed a barrier) then check if we need + * to sync at the binning stage by testing if the binning shaders involved + * with the draw call require access to external resources. 
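
cmd_buffer_binning_sync_required() above reduces to one question: does the pending barrier's access mask cover any resource class that the binning-mode vertex or geometry shaders of this pipeline can actually reach? A simplified sketch of that decision, with hypothetical stand-in flags instead of the real VK_ACCESS_2_* masks and descriptor maps:

   #include <stdbool.h>
   #include <stdint.h>

   /* Hypothetical resource classes a barrier may protect. */
   #define ACCESS_INDEX      (1u << 0)
   #define ACCESS_INDIRECT   (1u << 1)
   #define ACCESS_ATTRIBUTE  (1u << 2)
   #define ACCESS_UBO_SSBO   (1u << 3)

   struct bin_shader_usage {
      bool reads_attributes;
      bool has_ubo_or_ssbo;
   };

   /* A serialized draw only needs a binning-stage (BCL) sync when the
    * barrier covers a class the binning shaders actually consume. */
   static bool
   binning_sync_required(uint32_t barrier_access,
                         const struct bin_shader_usage *u,
                         bool indexed, bool indirect)
   {
      if (indexed && (barrier_access & ACCESS_INDEX))
         return true;
      if (indirect && (barrier_access & ACCESS_INDIRECT))
         return true;
      if (u->reads_attributes && (barrier_access & ACCESS_ATTRIBUTE))
         return true;
      if (u->has_ubo_or_ssbo && (barrier_access & ACCESS_UBO_SSBO))
         return true;
      return false;
   }

When it returns false, the job can stay serialized only at the render stage, which is the cheaper option.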
+ */ + if (job->serialize && (cmd_buffer->state.barrier.bcl_buffer_access || + cmd_buffer->state.barrier.bcl_image_access)) { + assert(!job->needs_bcl_sync); + if (cmd_buffer_binning_sync_required(cmd_buffer, pipeline, + indexed, indirect)) { + v3dv_cmd_buffer_consume_bcl_sync(cmd_buffer, job); + } + } + /* GL shader state binds shaders, uniform and vertex attribute state. The * compiler injects uniforms to handle some descriptor types (such as * textures), so we need to regen that when descriptor state changes. @@ -2497,62 +2993,84 @@ v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer) * that will require that we new uniform state for QUNIFORM_VIEWPORT_*. */ uint32_t *dirty = &cmd_buffer->state.dirty; + struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state; - const uint32_t dirty_uniform_state = - *dirty & (V3DV_CMD_DIRTY_PIPELINE | - V3DV_CMD_DIRTY_PUSH_CONSTANTS | - V3DV_CMD_DIRTY_DESCRIPTOR_SETS | - V3DV_CMD_DIRTY_VIEWPORT | - V3DV_CMD_DIRTY_VIEW_INDEX); - - if (dirty_uniform_state) - update_gfx_uniform_state(cmd_buffer, dirty_uniform_state); + const bool dirty_uniform_state = + update_gfx_uniform_state(cmd_buffer); struct v3dv_device *device = cmd_buffer->device; if (dirty_uniform_state || (*dirty & V3DV_CMD_DIRTY_VERTEX_BUFFER)) v3dv_X(device, cmd_buffer_emit_gl_shader_state)(cmd_buffer); - if (*dirty & (V3DV_CMD_DIRTY_PIPELINE)) { + if (*dirty & (V3DV_CMD_DIRTY_PIPELINE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE)) { v3dv_X(device, cmd_buffer_emit_configuration_bits)(cmd_buffer); + } + + if (*dirty & (V3DV_CMD_DIRTY_PIPELINE)) { v3dv_X(device, cmd_buffer_emit_varyings_state)(cmd_buffer); } - if (*dirty & (V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR)) { + if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS)) { emit_scissor(cmd_buffer); } - if (*dirty & V3DV_CMD_DIRTY_VIEWPORT) { + if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS)) v3dv_X(device, cmd_buffer_emit_viewport)(cmd_buffer); - } if (*dirty & V3DV_CMD_DIRTY_INDEX_BUFFER) v3dv_X(device, cmd_buffer_emit_index_buffer)(cmd_buffer); - const uint32_t dynamic_stencil_dirty_flags = - V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK | - V3DV_CMD_DIRTY_STENCIL_WRITE_MASK | - V3DV_CMD_DIRTY_STENCIL_REFERENCE; - if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | dynamic_stencil_dirty_flags)) + bool any_dynamic_stencil_dirty = + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP); + + if (*dirty & V3DV_CMD_DIRTY_PIPELINE || any_dynamic_stencil_dirty) v3dv_X(device, cmd_buffer_emit_stencil)(cmd_buffer); - if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_DEPTH_BIAS)) + if (*dirty & V3DV_CMD_DIRTY_PIPELINE || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS)) { v3dv_X(device, cmd_buffer_emit_depth_bias)(cmd_buffer); + } - if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_BLEND_CONSTANTS)) + if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS)) + v3dv_X(device, cmd_buffer_emit_depth_bounds)(cmd_buffer); + + if (*dirty & V3DV_CMD_DIRTY_PIPELINE || + BITSET_TEST(dyn->dirty, 
MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS)) { v3dv_X(device, cmd_buffer_emit_blend)(cmd_buffer); + } if (*dirty & V3DV_CMD_DIRTY_OCCLUSION_QUERY) v3dv_X(device, cmd_buffer_emit_occlusion_query)(cmd_buffer); - if (*dirty & V3DV_CMD_DIRTY_LINE_WIDTH) + if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH)) v3dv_X(device, cmd_buffer_emit_line_width)(cmd_buffer); if (*dirty & V3DV_CMD_DIRTY_PIPELINE) v3dv_X(device, cmd_buffer_emit_sample_state)(cmd_buffer); - if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE)) + if (*dirty & V3DV_CMD_DIRTY_PIPELINE || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES)) { v3dv_X(device, cmd_buffer_emit_color_write_mask)(cmd_buffer); + } + + /* We disable double-buffer mode if indirect draws are used because in that + * case we don't know the vertex count. + */ + if (indirect) { + job->can_use_double_buffer = false; + } else if (job->can_use_double_buffer) { + job_update_double_buffer_score(job, pipeline, vertex_count, + &cmd_buffer->state.render_area.extent); + } cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_PIPELINE; } @@ -2561,18 +3079,23 @@ static inline void cmd_buffer_set_view_index(struct v3dv_cmd_buffer *cmd_buffer, uint32_t view_index) { - cmd_buffer->state.view_index = view_index; - cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEW_INDEX; + if (view_index != cmd_buffer->state.view_index) { + cmd_buffer->state.view_index = view_index; + cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEW_INDEX; + } } static void cmd_buffer_draw(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_draw_info *info) { + uint32_t vertex_count = + info->vertex_count * info->instance_count; struct v3dv_render_pass *pass = cmd_buffer->state.pass; if (likely(!pass->multiview_enabled)) { - v3dv_cmd_buffer_emit_pre_draw(cmd_buffer); + cmd_buffer_set_view_index(cmd_buffer, 0); + v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, false, vertex_count); v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw)(cmd_buffer, info); return; } @@ -2580,7 +3103,7 @@ cmd_buffer_draw(struct v3dv_cmd_buffer *cmd_buffer, uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask; while (view_mask) { cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask)); - v3dv_cmd_buffer_emit_pre_draw(cmd_buffer); + v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, false, vertex_count); v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw)(cmd_buffer, info); } } @@ -2606,6 +3129,35 @@ v3dv_CmdDraw(VkCommandBuffer commandBuffer, } VKAPI_ATTR void VKAPI_CALL +v3dv_CmdDrawMultiEXT(VkCommandBuffer commandBuffer, + uint32_t drawCount, + const VkMultiDrawInfoEXT *pVertexInfo, + uint32_t instanceCount, + uint32_t firstInstance, + uint32_t stride) + +{ + if (drawCount == 0 || instanceCount == 0) + return; + + V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); + + uint32_t i = 0; + vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) { + cmd_buffer->state.draw_id = i; + cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DRAW_ID; + + struct v3dv_draw_info info = {}; + info.vertex_count = draw->vertexCount; + info.instance_count = instanceCount; + info.first_instance = firstInstance; + info.first_vertex = draw->firstVertex; + + cmd_buffer_draw(cmd_buffer, &info); + } +} + +VKAPI_ATTR void VKAPI_CALL v3dv_CmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount, uint32_t instanceCount, @@ -2618,9 +3170,12 @@ v3dv_CmdDrawIndexed(VkCommandBuffer commandBuffer, V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); + uint32_t vertex_count = indexCount * 
instanceCount; + struct v3dv_render_pass *pass = cmd_buffer->state.pass; if (likely(!pass->multiview_enabled)) { - v3dv_cmd_buffer_emit_pre_draw(cmd_buffer); + cmd_buffer_set_view_index(cmd_buffer, 0); + v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, false, vertex_count); v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed) (cmd_buffer, indexCount, instanceCount, firstIndex, vertexOffset, firstInstance); @@ -2630,7 +3185,7 @@ v3dv_CmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask; while (view_mask) { cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask)); - v3dv_cmd_buffer_emit_pre_draw(cmd_buffer); + v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, false, vertex_count); v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed) (cmd_buffer, indexCount, instanceCount, firstIndex, vertexOffset, firstInstance); @@ -2638,6 +3193,48 @@ v3dv_CmdDrawIndexed(VkCommandBuffer commandBuffer, } VKAPI_ATTR void VKAPI_CALL +v3dv_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer, + uint32_t drawCount, + const VkMultiDrawIndexedInfoEXT *pIndexInfo, + uint32_t instanceCount, + uint32_t firstInstance, + uint32_t stride, + const int32_t *pVertexOffset) +{ + if (drawCount == 0 || instanceCount == 0) + return; + + V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); + + uint32_t i = 0; + vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) { + uint32_t vertex_count = draw->indexCount * instanceCount; + int32_t vertexOffset = pVertexOffset ? *pVertexOffset : draw->vertexOffset; + + cmd_buffer->state.draw_id = i; + cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DRAW_ID; + + struct v3dv_render_pass *pass = cmd_buffer->state.pass; + if (likely(!pass->multiview_enabled)) { + cmd_buffer_set_view_index(cmd_buffer, 0); + v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, false, vertex_count); + v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed) + (cmd_buffer, draw->indexCount, instanceCount, + draw->firstIndex, vertexOffset, firstInstance); + continue; + } + uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask; + while (view_mask) { + cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask)); + v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, false, vertex_count); + v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed) + (cmd_buffer, draw->indexCount, instanceCount, + draw->firstIndex, vertexOffset, firstInstance); + } + } +} + +VKAPI_ATTR void VKAPI_CALL v3dv_CmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, @@ -2653,7 +3250,8 @@ v3dv_CmdDrawIndirect(VkCommandBuffer commandBuffer, struct v3dv_render_pass *pass = cmd_buffer->state.pass; if (likely(!pass->multiview_enabled)) { - v3dv_cmd_buffer_emit_pre_draw(cmd_buffer); + cmd_buffer_set_view_index(cmd_buffer, 0); + v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, true, 0); v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indirect) (cmd_buffer, buffer, offset, drawCount, stride); return; @@ -2662,7 +3260,7 @@ v3dv_CmdDrawIndirect(VkCommandBuffer commandBuffer, uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask; while (view_mask) { cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask)); - v3dv_cmd_buffer_emit_pre_draw(cmd_buffer); + v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, true, 0); v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indirect) (cmd_buffer, buffer, offset, drawCount, stride); } @@ -2684,7 +3282,8 @@ v3dv_CmdDrawIndexedIndirect(VkCommandBuffer 
commandBuffer, struct v3dv_render_pass *pass = cmd_buffer->state.pass; if (likely(!pass->multiview_enabled)) { - v3dv_cmd_buffer_emit_pre_draw(cmd_buffer); + cmd_buffer_set_view_index(cmd_buffer, 0); + v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, true, 0); v3dv_X(cmd_buffer->device, cmd_buffer_emit_indexed_indirect) (cmd_buffer, buffer, offset, drawCount, stride); return; @@ -2693,64 +3292,173 @@ v3dv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask; while (view_mask) { cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask)); - v3dv_cmd_buffer_emit_pre_draw(cmd_buffer); + v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, true, 0); v3dv_X(cmd_buffer->device, cmd_buffer_emit_indexed_indirect) (cmd_buffer, buffer, offset, drawCount, stride); } } -VKAPI_ATTR void VKAPI_CALL -v3dv_CmdPipelineBarrier(VkCommandBuffer commandBuffer, - VkPipelineStageFlags srcStageMask, - VkPipelineStageFlags dstStageMask, - VkDependencyFlags dependencyFlags, - uint32_t memoryBarrierCount, - const VkMemoryBarrier *pMemoryBarriers, - uint32_t bufferBarrierCount, - const VkBufferMemoryBarrier *pBufferBarriers, - uint32_t imageBarrierCount, - const VkImageMemoryBarrier *pImageBarriers) +static void +handle_barrier(VkPipelineStageFlags2 srcStageMask, VkAccessFlags2 srcAccessMask, + VkPipelineStageFlags2 dstStageMask, VkAccessFlags2 dstAccessMask, + bool is_image_barrier, bool is_buffer_barrier, + struct v3dv_barrier_state *state) { - V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - /* We only care about barriers between GPU jobs */ - if (srcStageMask == VK_PIPELINE_STAGE_HOST_BIT || - dstStageMask == VK_PIPELINE_STAGE_HOST_BIT) { + if (srcStageMask == VK_PIPELINE_STAGE_2_HOST_BIT || + dstStageMask == VK_PIPELINE_STAGE_2_HOST_BIT) { return; } + /* Track source of the barrier */ + uint8_t src_mask = 0; + + const VkPipelineStageFlags2 compute_mask = + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT; + if (srcStageMask & (compute_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) + src_mask |= V3DV_BARRIER_COMPUTE_BIT; + + const VkPipelineStageFlags2 transfer_mask = + VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT | + VK_PIPELINE_STAGE_2_COPY_BIT | + VK_PIPELINE_STAGE_2_BLIT_BIT | + VK_PIPELINE_STAGE_2_CLEAR_BIT; + if (srcStageMask & (transfer_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) + src_mask |= V3DV_BARRIER_TRANSFER_BIT; + + const VkPipelineStageFlags2 graphics_mask = ~(compute_mask | transfer_mask); + if (srcStageMask & (graphics_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) + src_mask |= V3DV_BARRIER_GRAPHICS_BIT; + + /* Track consumer of the barrier */ + if (dstStageMask & (compute_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) { + state->dst_mask |= V3DV_BARRIER_COMPUTE_BIT; + state->src_mask_compute |= src_mask; + } + + if (dstStageMask & (transfer_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) { + state->dst_mask |= V3DV_BARRIER_TRANSFER_BIT; + state->src_mask_transfer |= src_mask; + } + + if (dstStageMask & (graphics_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) { + state->dst_mask |= V3DV_BARRIER_GRAPHICS_BIT; + state->src_mask_graphics |= src_mask; + + if (dstStageMask & (VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT | + VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT | + VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT | + VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT | + VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT | + VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT | + VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT | + 
VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT | + VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | + VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT | + VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT | + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) { + if (is_image_barrier) + state->bcl_image_access |= dstAccessMask; + + if (is_buffer_barrier) + state->bcl_buffer_access |= dstAccessMask; + } + } +} + +void +v3dv_cmd_buffer_emit_pipeline_barrier(struct v3dv_cmd_buffer *cmd_buffer, + const VkDependencyInfo *info) +{ + uint32_t imageBarrierCount = info->imageMemoryBarrierCount; + const VkImageMemoryBarrier2 *pImageBarriers = info->pImageMemoryBarriers; + + uint32_t bufferBarrierCount = info->bufferMemoryBarrierCount; + const VkBufferMemoryBarrier2 *pBufferBarriers = info->pBufferMemoryBarriers; + + uint32_t memoryBarrierCount = info->memoryBarrierCount; + const VkMemoryBarrier2 *pMemoryBarriers = info->pMemoryBarriers; + + struct v3dv_barrier_state state = { 0 }; + for (uint32_t i = 0; i < imageBarrierCount; i++) { + /* We can safely skip barriers for image layout transitions from UNDEFINED + * layout. + * + * Notice that KHR_synchronization2 allows specifying barriers that don't + * involve a layout transition by making oldLayout and newLayout the same, + * including UNDEFINED. + */ + if (pImageBarriers[i].oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && + pImageBarriers[i].oldLayout != pImageBarriers[i].newLayout) { + continue; + } + + handle_barrier(pImageBarriers[i].srcStageMask, + pImageBarriers[i].srcAccessMask, + pImageBarriers[i].dstStageMask, + pImageBarriers[i].dstAccessMask, + true, false, &state); + } + + for (uint32_t i = 0; i < bufferBarrierCount; i++) { + handle_barrier(pBufferBarriers[i].srcStageMask, + pBufferBarriers[i].srcAccessMask, + pBufferBarriers[i].dstStageMask, + pBufferBarriers[i].dstAccessMask, + false, true, &state); + } + + for (uint32_t i = 0; i < memoryBarrierCount; i++) { + handle_barrier(pMemoryBarriers[i].srcStageMask, + pMemoryBarriers[i].srcAccessMask, + pMemoryBarriers[i].dstStageMask, + pMemoryBarriers[i].dstAccessMask, + true, true, &state); + } + + /* Bail if we don't have any relevant barriers */ + if (!state.dst_mask) + return; + /* If we have a recording job, finish it here */ - struct v3dv_job *job = cmd_buffer->state.job; - if (job) + if (cmd_buffer->state.job) v3dv_cmd_buffer_finish_job(cmd_buffer); - cmd_buffer->state.has_barrier = true; - if (dstStageMask & (VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | - VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | - VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT | - VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT | - VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT | - VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT)) { - cmd_buffer->state.has_bcl_barrier = true; - } + /* Update barrier state in the command buffer */ + v3dv_cmd_buffer_merge_barrier_state(&cmd_buffer->state.barrier, &state); } VKAPI_ATTR void VKAPI_CALL -v3dv_CmdBindVertexBuffers(VkCommandBuffer commandBuffer, - uint32_t firstBinding, - uint32_t bindingCount, - const VkBuffer *pBuffers, - const VkDeviceSize *pOffsets) +v3dv_CmdPipelineBarrier2(VkCommandBuffer commandBuffer, + const VkDependencyInfo *pDependencyInfo) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - struct v3dv_vertex_binding *vb = cmd_buffer->state.vertex_bindings; + v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, pDependencyInfo); +} - /* We have to defer setting up vertex buffer since we need the buffer - * stride from the pipeline.
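
handle_barrier() above maps both sides of each barrier onto three coarse job classes (graphics, compute, transfer) and records, for every destination class, which source classes it must wait on; v3dv_cmd_buffer_merge_barrier_state() then folds the result into the command buffer. A minimal sketch of the accumulation step, with hypothetical simplified types:

   #include <stdint.h>

   /* Coarse job classes used to serialize queue submissions. */
   #define BARRIER_GRAPHICS (1u << 0)
   #define BARRIER_COMPUTE  (1u << 1)
   #define BARRIER_TRANSFER (1u << 2)

   struct barrier_state {
      uint8_t dst_mask;    /* job classes that must wait */
      uint8_t src_mask[3]; /* per destination class: classes waited on */
   };

   /* Fold one classified src/dst pair into the accumulated state: for
    * each destination class named by the barrier, remember every source
    * class it has to wait for. */
   static void
   accumulate_barrier(struct barrier_state *s,
                      uint8_t src_classes, uint8_t dst_classes)
   {
      for (int i = 0; i < 3; i++) {
         if (dst_classes & (1u << i)) {
            s->dst_mask |= (1u << i);
            s->src_mask[i] |= src_classes;
         }
      }
   }

Collapsing the fine-grained Vulkan stage masks early keeps the per-job serialization test to a few bit operations.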
- */ +VKAPI_ATTR void VKAPI_CALL +v3dv_CmdBindVertexBuffers2(VkCommandBuffer commandBuffer, + uint32_t firstBinding, + uint32_t bindingCount, + const VkBuffer *pBuffers, + const VkDeviceSize *pOffsets, + const VkDeviceSize *pSizes, + const VkDeviceSize *pStrides) +{ + V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); + struct v3dv_vertex_binding *vb = cmd_buffer->state.vertex_bindings; assert(firstBinding + bindingCount <= MAX_VBS); bool vb_state_changed = false; + if (pStrides) { + vk_cmd_set_vertex_binding_strides(&cmd_buffer->vk, + firstBinding, bindingCount, + pStrides); + struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state; + if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDING_STRIDES)) + vb_state_changed = true; + } + /* FIXME: at this moment we don't do anything with pSizes. */ for (uint32_t i = 0; i < bindingCount; i++) { if (vb[firstBinding + i].buffer != v3dv_buffer_from_handle(pBuffers[i])) { vb[firstBinding + i].buffer = v3dv_buffer_from_handle(pBuffers[i]); @@ -2766,24 +3474,6 @@ v3dv_CmdBindVertexBuffers(VkCommandBuffer commandBuffer, cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VERTEX_BUFFER; } -static uint32_t -get_index_size(VkIndexType index_type) -{ - switch (index_type) { - case VK_INDEX_TYPE_UINT8_EXT: - return 1; - break; - case VK_INDEX_TYPE_UINT16: - return 2; - break; - case VK_INDEX_TYPE_UINT32: - return 4; - break; - default: - unreachable("Unsupported index type"); - } -} - VKAPI_ATTR void VKAPI_CALL v3dv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer, VkBuffer buffer, @@ -2792,7 +3482,7 @@ v3dv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer, { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - const uint32_t index_size = get_index_size(indexType); + const uint32_t index_size = vk_index_type_to_bytes(indexType); if (buffer == cmd_buffer->state.index_buffer.buffer && offset == cmd_buffer->state.index_buffer.offset && index_size == cmd_buffer->state.index_buffer.index_size) { @@ -2806,82 +3496,309 @@ v3dv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer, } VKAPI_ATTR void VKAPI_CALL -v3dv_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer, - VkStencilFaceFlags faceMask, - uint32_t compareMask) +v3dv_CmdSetLineStippleEXT(VkCommandBuffer commandBuffer, + uint32_t lineStippleFactor, + uint16_t lineStipplePattern) { - V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - - if (faceMask & VK_STENCIL_FACE_FRONT_BIT) - cmd_buffer->state.dynamic.stencil_compare_mask.front = compareMask & 0xff; - if (faceMask & VK_STENCIL_FACE_BACK_BIT) - cmd_buffer->state.dynamic.stencil_compare_mask.back = compareMask & 0xff; - - cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK; + /* We do not support stippled line rasterization so we just ignore this. */ } -VKAPI_ATTR void VKAPI_CALL -v3dv_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer, - VkStencilFaceFlags faceMask, - uint32_t writeMask) +/** + * This checks a descriptor set to see if we are binding any descriptors that + * would involve sampling from a linear image (the hardware only supports this + * for 1D images), and if so, attempts to create a tiled copy of the linear + * image and rewrite the descriptor set to use that instead. + * + * This was added to support a scenario with Android where some part of the UI + * wanted to show previews of linear swapchain images.
For more details: + * https://gitlab.freedesktop.org/mesa/mesa/-/issues/9712 + * + * Currently this only supports a linear sampling from a simple 2D image, but + * it could be extended to support more cases if necessary. + */ +static void +handle_sample_from_linear_image(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_descriptor_set *set, + bool is_compute) { - V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); + for (int32_t i = 0; i < set->layout->binding_count; i++) { + const struct v3dv_descriptor_set_binding_layout *blayout = + &set->layout->binding[i]; + if (blayout->type != VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE && + blayout->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) + continue; - if (faceMask & VK_STENCIL_FACE_FRONT_BIT) - cmd_buffer->state.dynamic.stencil_write_mask.front = writeMask & 0xff; - if (faceMask & VK_STENCIL_FACE_BACK_BIT) - cmd_buffer->state.dynamic.stencil_write_mask.back = writeMask & 0xff; + struct v3dv_descriptor *desc = &set->descriptors[blayout->descriptor_index]; + if (!desc->image_view) + continue; - cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_WRITE_MASK; -} + struct v3dv_image *image = (struct v3dv_image *) desc->image_view->vk.image; + struct v3dv_image_view *view = (struct v3dv_image_view *) desc->image_view; + if (image->tiled || view->vk.view_type == VK_IMAGE_VIEW_TYPE_1D || + view->vk.view_type == VK_IMAGE_VIEW_TYPE_1D_ARRAY) { + continue; + } -VKAPI_ATTR void VKAPI_CALL -v3dv_CmdSetStencilReference(VkCommandBuffer commandBuffer, - VkStencilFaceFlags faceMask, - uint32_t reference) -{ - V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); + /* FIXME: we can probably handle most of these restrictions too with + * a bit of extra effort. + */ + if (view->vk.view_type != VK_IMAGE_VIEW_TYPE_2D || + view->vk.level_count != 1 || view->vk.layer_count != 1 || + blayout->array_size != 1) { + fprintf(stderr, "Sampling from linear image is not supported. " + "Expect corruption.\n"); + continue; + } - if (faceMask & VK_STENCIL_FACE_FRONT_BIT) - cmd_buffer->state.dynamic.stencil_reference.front = reference & 0xff; - if (faceMask & VK_STENCIL_FACE_BACK_BIT) - cmd_buffer->state.dynamic.stencil_reference.back = reference & 0xff; + /* We are sampling from a linear image. V3D doesn't support this + * so we create a tiled copy of the image and rewrite the descriptor + * to read from it instead. + */ + perf_debug("Sampling from linear image is not supported natively and " + "requires a copy.\n"); - cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_REFERENCE; -} + struct v3dv_device *device = cmd_buffer->device; + VkDevice vk_device = v3dv_device_to_handle(device); -VKAPI_ATTR void VKAPI_CALL -v3dv_CmdSetDepthBias(VkCommandBuffer commandBuffer, - float depthBiasConstantFactor, - float depthBiasClamp, - float depthBiasSlopeFactor) -{ - V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); + /* Allocate shadow tiled image if needed, we only do this once for + * each image, on the first sampling attempt. We need to take a lock + * since we may be trying to do the same in another command buffer in + * a separate thread. 
+ */ + mtx_lock(&device->meta.mtx); + VkResult result; + VkImage tiled_image; + if (image->shadow) { + tiled_image = v3dv_image_to_handle(image->shadow); + } else { + VkImageCreateInfo image_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + .flags = image->vk.create_flags, + .imageType = image->vk.image_type, + .format = image->vk.format, + .extent = { + image->vk.extent.width, + image->vk.extent.height, + image->vk.extent.depth, + }, + .mipLevels = image->vk.mip_levels, + .arrayLayers = image->vk.array_layers, + .samples = image->vk.samples, + .tiling = VK_IMAGE_TILING_OPTIMAL, + .usage = image->vk.usage, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .initialLayout = VK_IMAGE_LAYOUT_GENERAL, + }; + result = v3dv_CreateImage(vk_device, &image_info, + &device->vk.alloc, &tiled_image); + if (result != VK_SUCCESS) { + fprintf(stderr, "Failed to copy linear 2D image for sampling." + "Expect corruption.\n"); + mtx_unlock(&device->meta.mtx); + continue; + } - cmd_buffer->state.dynamic.depth_bias.constant_factor = depthBiasConstantFactor; - cmd_buffer->state.dynamic.depth_bias.depth_bias_clamp = depthBiasClamp; - cmd_buffer->state.dynamic.depth_bias.slope_factor = depthBiasSlopeFactor; - cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DEPTH_BIAS; -} + bool disjoint = image->vk.create_flags & VK_IMAGE_CREATE_DISJOINT_BIT; + VkImageMemoryRequirementsInfo2 reqs_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2, + .image = tiled_image, + }; + + assert(image->plane_count <= V3DV_MAX_PLANE_COUNT); + for (int p = 0; p < (disjoint ? image->plane_count : 1); p++) { + VkImageAspectFlagBits plane_aspect = VK_IMAGE_ASPECT_PLANE_0_BIT << p; + VkImagePlaneMemoryRequirementsInfo plane_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_PLANE_MEMORY_REQUIREMENTS_INFO, + .planeAspect = plane_aspect, + }; + if (disjoint) + reqs_info.pNext = &plane_info; + + VkMemoryRequirements2 reqs = { + .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2, + }; + v3dv_GetImageMemoryRequirements2(vk_device, &reqs_info, &reqs); + + VkDeviceMemory mem; + VkMemoryAllocateInfo alloc_info = { + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .allocationSize = reqs.memoryRequirements.size, + .memoryTypeIndex = 0, + }; + result = v3dv_AllocateMemory(vk_device, &alloc_info, + &device->vk.alloc, &mem); + if (result != VK_SUCCESS) { + fprintf(stderr, "Failed to copy linear 2D image for sampling." + "Expect corruption.\n"); + v3dv_DestroyImage(vk_device, tiled_image, &device->vk.alloc); + mtx_unlock(&device->meta.mtx); + continue; + } + + VkBindImageMemoryInfo bind_info = { + .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO, + .image = tiled_image, + .memory = mem, + .memoryOffset = 0, + }; + VkBindImagePlaneMemoryInfo plane_bind_info = { + .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO, + .planeAspect = plane_aspect, + }; + if (disjoint) + bind_info.pNext = &plane_bind_info; + result = v3dv_BindImageMemory2(vk_device, 1, &bind_info); + if (result != VK_SUCCESS) { + fprintf(stderr, "Failed to copy linear 2D image for sampling." + "Expect corruption.\n"); + v3dv_DestroyImage(vk_device, tiled_image, &device->vk.alloc); + v3dv_FreeMemory(vk_device, mem, &device->vk.alloc); + mtx_unlock(&device->meta.mtx); + continue; + } + } -VKAPI_ATTR void VKAPI_CALL -v3dv_CmdSetDepthBounds(VkCommandBuffer commandBuffer, - float minDepthBounds, - float maxDepthBounds) -{ - /* We do not support depth bounds testing so we just ingore this. 
We are - * already asserting that pipelines don't enable the feature anyway. - */ -} + image->shadow = v3dv_image_from_handle(tiled_image); + } -VKAPI_ATTR void VKAPI_CALL -v3dv_CmdSetLineWidth(VkCommandBuffer commandBuffer, - float lineWidth) -{ - V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); + /* Create a shadow view that refers to the tiled image if needed */ + VkImageView tiled_view; + if (view->shadow) { + tiled_view = v3dv_image_view_to_handle(view->shadow); + } else { + VkImageViewCreateInfo view_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .flags = view->vk.create_flags, + .image = tiled_image, + .viewType = view->vk.view_type, + .format = view->vk.format, + .components = view->vk.swizzle, + .subresourceRange = { + .aspectMask = view->vk.aspects, + .baseMipLevel = view->vk.base_mip_level, + .levelCount = view->vk.level_count, + .baseArrayLayer = view->vk.base_array_layer, + .layerCount = view->vk.layer_count, + }, + }; + result = v3dv_create_image_view(device, &view_info, &tiled_view); + if (result != VK_SUCCESS) { + fprintf(stderr, "Failed to copy linear 2D image for sampling." + "Expect corruption.\n"); + mtx_unlock(&device->meta.mtx); + continue; + } + } + + view->shadow = v3dv_image_view_from_handle(tiled_view); + + mtx_unlock(&device->meta.mtx); + + /* Rewrite the descriptor to use the shadow view */ + VkDescriptorImageInfo desc_image_info = { + .sampler = v3dv_sampler_to_handle(desc->sampler), + .imageView = tiled_view, + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + }; + VkWriteDescriptorSet write = { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = v3dv_descriptor_set_to_handle(set), + .dstBinding = i, + .dstArrayElement = 0, /* Assumes array_size is 1 */ + .descriptorCount = 1, + .descriptorType = desc->type, + .pImageInfo = &desc_image_info, + }; + v3dv_UpdateDescriptorSets(vk_device, 1, &write, 0, NULL); + + /* Now we need to actually copy the pixel data from the linear image + * into the tiled image storage to ensure it is up-to-date. + * + * FIXME: ideally we would track if the linear image is dirty and skip + * this step otherwise, but that would be a bit of a pain. + * + * Note that we need to place the copy job *before* the current job in + * the command buffer state so we have the tiled image ready to process + * an upcoming draw call in the current job that samples from it. + * + * Also, we need to use the TFU path for this copy, as any other path + * will use the tile buffer and would require a new framebuffer setup, + * thus requiring extra work to stop and resume any in-flight render + * pass. Since we are converting a full 2D texture here the TFU should + * be able to handle this. + */ + for (int p = 0; p < image->plane_count; p++) { + VkImageAspectFlagBits plane_aspect = VK_IMAGE_ASPECT_PLANE_0_BIT << p; + struct VkImageCopy2 copy_region = { + .sType = VK_STRUCTURE_TYPE_IMAGE_COPY_2, + .srcSubresource = { + .aspectMask = image->plane_count == 1 ? + view->vk.aspects : (view->vk.aspects & plane_aspect), + .mipLevel = view->vk.base_mip_level, + .baseArrayLayer = view->vk.base_array_layer, + .layerCount = view->vk.layer_count, + }, + .srcOffset = {0, 0, 0 }, + .dstSubresource = { + .aspectMask = image->plane_count == 1 ? 
+ view->vk.aspects : (view->vk.aspects & plane_aspect), + .mipLevel = view->vk.base_mip_level, + .baseArrayLayer = view->vk.base_array_layer, + .layerCount = view->vk.layer_count, + }, + .dstOffset = { 0, 0, 0}, + .extent = { + image->planes[p].width, + image->planes[p].height, + 1, + }, + }; + struct v3dv_image *copy_src = image; + struct v3dv_image *copy_dst = v3dv_image_from_handle(tiled_image); + bool ok = v3dv_cmd_buffer_copy_image_tfu(cmd_buffer, copy_dst, copy_src, + ©_region); + if (ok) { + /* This will emit the TFU job right before the current in-flight + * job (if any), since in-fight jobs are only added to the list + * when finished. + */ + struct v3dv_job *tfu_job = + list_last_entry(&cmd_buffer->jobs, struct v3dv_job, list_link); + assert(tfu_job->type == V3DV_JOB_TYPE_GPU_TFU); + /* Serialize the copy since we don't know who is producing the linear + * image and we need the image to be ready by the time the copy + * executes. + */ + tfu_job->serialize = V3DV_BARRIER_ALL; - cmd_buffer->state.dynamic.line_width = lineWidth; - cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_LINE_WIDTH; + /* Also, we need to ensure the TFU copy job completes before anyhing + * else coming after that may be using the tiled shadow copy. + */ + if (cmd_buffer->state.job) { + /* If we already had an in-flight job (i.e. we are in a render + * pass) make sure the job waits for the TFU copy. + */ + cmd_buffer->state.job->serialize |= V3DV_BARRIER_TRANSFER_BIT; + } else { + /* Otherwise, make the the follow-up job syncs with the TFU + * job we just added when it is created by adding the + * corresponding barrier state. + */ + if (!is_compute) { + cmd_buffer->state.barrier.dst_mask |= V3DV_BARRIER_GRAPHICS_BIT; + cmd_buffer->state.barrier.src_mask_graphics |= V3DV_BARRIER_TRANSFER_BIT; + } else { + cmd_buffer->state.barrier.dst_mask |= V3DV_BARRIER_COMPUTE_BIT; + cmd_buffer->state.barrier.src_mask_compute |= V3DV_BARRIER_TRANSFER_BIT; + } + } + } else { + fprintf(stderr, "Failed to copy linear 2D image for sampling." + "TFU doesn't support copy. Expect corruption.\n"); + } + } + } } VKAPI_ATTR void VKAPI_CALL @@ -2917,6 +3834,15 @@ v3dv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, descriptor_state->descriptor_sets[index] = set; dirty_stages |= set->layout->shader_stages; descriptor_state_changed = true; + + /* Check if we are sampling from a linear 2D image. This is not + * supported in hardware, but may be required for some applications + * so we will transparently convert to tiled at the expense of + * performance. 
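
The serialization logic in this hunk splits into two cases: an already in-flight job simply gains a transfer-stage dependency, while otherwise the dependency is parked in the command buffer's pending barrier state so the next graphics or compute job inherits it on creation. A minimal sketch with hypothetical stand-in types:

   #include <stdbool.h>
   #include <stdint.h>

   #define BARRIER_GRAPHICS (1u << 0)
   #define BARRIER_COMPUTE  (1u << 1)
   #define BARRIER_TRANSFER (1u << 2)

   struct job { uint8_t serialize; };
   struct barrier_state { uint8_t dst_mask, src_graphics, src_compute; };

   /* After emitting the tiled-copy TFU job, make whoever consumes the
    * shadow image wait for it: an in-flight job gets a direct transfer
    * dependency, otherwise the dependency is queued as pending barrier
    * state for the next job of the consuming class. */
   static void
   serialize_after_tfu(struct job *in_flight, struct barrier_state *pending,
                       bool is_compute)
   {
      if (in_flight) {
         in_flight->serialize |= BARRIER_TRANSFER;
      } else if (!is_compute) {
         pending->dst_mask |= BARRIER_GRAPHICS;
         pending->src_graphics |= BARRIER_TRANSFER;
      } else {
         pending->dst_mask |= BARRIER_COMPUTE;
         pending->src_compute |= BARRIER_TRANSFER;
      }
   }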
+ */ + handle_sample_from_linear_image(cmd_buffer, set, + pipelineBindPoint == + VK_PIPELINE_BIND_POINT_COMPUTE); } for (uint32_t j = 0; j < set->layout->dynamic_offset_count; j++, dyn_index++) { @@ -2951,79 +3877,19 @@ v3dv_CmdPushConstants(VkCommandBuffer commandBuffer, { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - if (!memcmp((uint8_t *) cmd_buffer->push_constants_data + offset, pValues, size)) - return; - - memcpy((uint8_t *) cmd_buffer->push_constants_data + offset, pValues, size); - - cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PUSH_CONSTANTS; - cmd_buffer->state.dirty_push_constants_stages |= stageFlags; -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_CmdSetBlendConstants(VkCommandBuffer commandBuffer, - const float blendConstants[4]) -{ - V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; - - if (!memcmp(state->dynamic.blend_constants, blendConstants, - sizeof(state->dynamic.blend_constants))) { + if (!memcmp((uint8_t *) cmd_buffer->state.push_constants_data + offset, + pValues, size)) { return; } - memcpy(state->dynamic.blend_constants, blendConstants, - sizeof(state->dynamic.blend_constants)); - - cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_BLEND_CONSTANTS; -} + memcpy((uint8_t *) cmd_buffer->state.push_constants_data + offset, + pValues, size); + cmd_buffer->state.push_constants_size = + MAX2(offset + size, cmd_buffer->state.push_constants_size); -VKAPI_ATTR void VKAPI_CALL -v3dv_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer, - uint32_t attachmentCount, - const VkBool32 *pColorWriteEnables) -{ - V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; - uint32_t color_write_enable = 0; - - for (uint32_t i = 0; i < attachmentCount; i++) - color_write_enable |= pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0; - - if (state->dynamic.color_write_enable == color_write_enable) - return; - - state->dynamic.color_write_enable = color_write_enable; - - state->dirty |= V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE; -} - -void -v3dv_cmd_buffer_reset_queries(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_query_pool *pool, - uint32_t first, - uint32_t count) -{ - /* Resets can only happen outside a render pass instance so we should not - * be in the middle of job recording. 
- */ - assert(cmd_buffer->state.pass == NULL); - assert(cmd_buffer->state.job == NULL); - - assert(first < pool->query_count); - assert(first + count <= pool->query_count); - - struct v3dv_job *job = - v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, - V3DV_JOB_TYPE_CPU_RESET_QUERIES, - cmd_buffer, -1); - v3dv_return_if_oom(cmd_buffer, NULL); - - job->cpu.query_reset.pool = pool; - job->cpu.query_reset.first = first; - job->cpu.query_reset.count = count; - - list_addtail(&job->list_link, &cmd_buffer->jobs); + cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PUSH_CONSTANTS | + V3DV_CMD_DIRTY_PUSH_CONSTANTS_UBO; + cmd_buffer->state.dirty_push_constants_stages |= stageFlags; } void @@ -3059,37 +3925,87 @@ v3dv_cmd_buffer_begin_query(struct v3dv_cmd_buffer *cmd_buffer, uint32_t query, VkQueryControlFlags flags) { - /* FIXME: we only support one active query for now */ - assert(cmd_buffer->state.query.active_query.bo == NULL); assert(query < pool->query_count); + switch (pool->query_type) { + case VK_QUERY_TYPE_OCCLUSION: + /* FIXME: we only support one active occlusion query for now */ + assert(cmd_buffer->state.query.active_query.bo == NULL); + + cmd_buffer->state.query.active_query.bo = pool->occlusion.bo; + cmd_buffer->state.query.active_query.offset = + pool->queries[query].occlusion.offset; + cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY; + break; + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { + assert(cmd_buffer->state.query.active_query.perf == NULL); + if (cmd_buffer->state.pass) + v3dv_cmd_buffer_subpass_finish(cmd_buffer); - cmd_buffer->state.query.active_query.bo = pool->queries[query].bo; - cmd_buffer->state.query.active_query.offset = pool->queries[query].offset; - cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY; + cmd_buffer->state.query.active_query.perf = + &pool->queries[query].perf; + + if (cmd_buffer->state.pass) { + v3dv_cmd_buffer_subpass_resume(cmd_buffer, + cmd_buffer->state.subpass_idx); + } + break; + } + default: + unreachable("Unsupported query type"); + } +} + +void +v3dv_cmd_buffer_pause_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer) +{ + struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + struct v3dv_bo *occlusion_query_bo = state->query.active_query.bo; + if (occlusion_query_bo) { + assert(!state->query.active_query.paused_bo); + state->query.active_query.paused_bo = occlusion_query_bo; + state->query.active_query.bo = NULL; + state->dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY; + } } void -v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_query_pool *pool, - uint32_t query) +v3dv_cmd_buffer_resume_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer) +{ + struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + struct v3dv_bo *occlusion_query_bo = state->query.active_query.paused_bo; + if (occlusion_query_bo) { + assert(!state->query.active_query.bo); + state->query.active_query.bo = occlusion_query_bo; + state->query.active_query.paused_bo = NULL; + state->dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY; + } +} + +static void +v3dv_cmd_buffer_schedule_end_query(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_query_pool *pool, + uint32_t query) { assert(query < pool->query_count); - assert(cmd_buffer->state.query.active_query.bo != NULL); + assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION || + pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR); - if (cmd_buffer->state.pass) { - /* Queue the EndQuery in the command buffer state, we will create a CPU - * job to flag all of these queries as possibly available 
right after the - * render pass job in which they have been recorded. - */ + /* For occlusion queries in the middle of a render pass we don't want to + * split the current job at the EndQuery just to emit query availability, + * instead we queue this state in the command buffer and we emit it when + * we finish the current job. + */ + if (cmd_buffer->state.pass && + pool->query_type == VK_QUERY_TYPE_OCCLUSION) { struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; v3dv_cmd_buffer_ensure_array_state(cmd_buffer, - sizeof(struct v3dv_end_query_cpu_job_info), + sizeof(struct v3dv_end_query_info), state->query.end.used_count, &state->query.end.alloc_count, (void **) &state->query.end.states); v3dv_return_if_oom(cmd_buffer, NULL); - struct v3dv_end_query_cpu_job_info *info = + struct v3dv_end_query_info *info = &state->query.end.states[state->query.end.used_count++]; info->pool = pool; @@ -3106,7 +4022,7 @@ v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer, * * In our case, only the first query is used but this means we still need * to flag the other queries as available so we don't emit errors when - * the applications attempt to retrive values from them. + * the applications attempt to retrieve values from them. */ struct v3dv_render_pass *pass = cmd_buffer->state.pass; if (!pass->multiview_enabled) { @@ -3116,60 +4032,65 @@ v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer, info->count = util_bitcount(subpass->view_mask); } } else { - /* Otherwise, schedule the CPU job immediately */ - struct v3dv_job *job = - v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, - V3DV_JOB_TYPE_CPU_END_QUERY, - cmd_buffer, -1); - v3dv_return_if_oom(cmd_buffer, NULL); - - job->cpu.query_end.pool = pool; - job->cpu.query_end.query = query; + /* Otherwise, schedule the end query job immediately. + * + * Multiview queries cannot cross subpass boundaries, so query count is + * always 1. + */ + if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) + v3dv_cmd_buffer_emit_set_query_availability(cmd_buffer, pool, query, 1, 1); + else + cmd_buffer_emit_end_query_cpu(cmd_buffer, pool, query, 1); + } +} - /* Multiview queries cannot cross subpass boundaries */ - job->cpu.query_end.count = 1; +static void +v3dv_cmd_buffer_end_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_query_pool *pool, + uint32_t query) +{ + assert(query < pool->query_count); + assert(cmd_buffer->state.query.active_query.bo != NULL); - list_addtail(&job->list_link, &cmd_buffer->jobs); - } + v3dv_cmd_buffer_schedule_end_query(cmd_buffer, pool, query); cmd_buffer->state.query.active_query.bo = NULL; cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY; } -void -v3dv_cmd_buffer_copy_query_results(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_query_pool *pool, - uint32_t first, - uint32_t count, - struct v3dv_buffer *dst, - uint32_t offset, - uint32_t stride, - VkQueryResultFlags flags) +static void +v3dv_cmd_buffer_end_performance_query(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_query_pool *pool, + uint32_t query) { - /* Copies can only happen outside a render pass instance so we should not - * be in the middle of job recording. 
- */ - assert(cmd_buffer->state.pass == NULL); - assert(cmd_buffer->state.job == NULL); + assert(query < pool->query_count); + assert(cmd_buffer->state.query.active_query.perf != NULL); - assert(first < pool->query_count); - assert(first + count <= pool->query_count); + if (cmd_buffer->state.pass) + v3dv_cmd_buffer_subpass_finish(cmd_buffer); - struct v3dv_job *job = - v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, - V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS, - cmd_buffer, -1); - v3dv_return_if_oom(cmd_buffer, NULL); + v3dv_cmd_buffer_schedule_end_query(cmd_buffer, pool, query); - job->cpu.query_copy_results.pool = pool; - job->cpu.query_copy_results.first = first; - job->cpu.query_copy_results.count = count; - job->cpu.query_copy_results.dst = dst; - job->cpu.query_copy_results.offset = offset; - job->cpu.query_copy_results.stride = stride; - job->cpu.query_copy_results.flags = flags; + cmd_buffer->state.query.active_query.perf = NULL; - list_addtail(&job->list_link, &cmd_buffer->jobs); + if (cmd_buffer->state.pass) + v3dv_cmd_buffer_subpass_resume(cmd_buffer, cmd_buffer->state.subpass_idx); +} + +void v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_query_pool *pool, + uint32_t query) +{ + switch (pool->query_type) { + case VK_QUERY_TYPE_OCCLUSION: + v3dv_cmd_buffer_end_occlusion_query(cmd_buffer, pool, query); + break; + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: + v3dv_cmd_buffer_end_performance_query(cmd_buffer, pool, query); + break; + default: + unreachable("Unsupported query type"); + } } void @@ -3191,115 +4112,10 @@ v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer, } VKAPI_ATTR void VKAPI_CALL -v3dv_CmdSetEvent(VkCommandBuffer commandBuffer, - VkEvent _event, - VkPipelineStageFlags stageMask) -{ - V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - V3DV_FROM_HANDLE(v3dv_event, event, _event); - - /* Event (re)sets can only happen outside a render pass instance so we - * should not be in the middle of job recording. - */ - assert(cmd_buffer->state.pass == NULL); - assert(cmd_buffer->state.job == NULL); - - struct v3dv_job *job = - v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, - V3DV_JOB_TYPE_CPU_SET_EVENT, - cmd_buffer, -1); - v3dv_return_if_oom(cmd_buffer, NULL); - - job->cpu.event_set.event = event; - job->cpu.event_set.state = 1; - - list_addtail(&job->list_link, &cmd_buffer->jobs); -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_CmdResetEvent(VkCommandBuffer commandBuffer, - VkEvent _event, - VkPipelineStageFlags stageMask) -{ - V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - V3DV_FROM_HANDLE(v3dv_event, event, _event); - - /* Event (re)sets can only happen outside a render pass instance so we - * should not be in the middle of job recording. 
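
The occlusion query path above defers availability writes: instead of splitting the current job at vkCmdEndQuery, the driver appends the query to a growable per-command-buffer array and emits all the availability updates once the job finishes. A simplified standalone sketch of that queueing (hypothetical types; the real code uses v3dv_cmd_buffer_ensure_array_state and reports OOM through the command buffer):

   #include <stdint.h>
   #include <stdlib.h>

   /* One deferred vkCmdEndQuery recorded inside a render pass. */
   struct end_query_info { uint32_t query; uint32_t count; };

   struct end_query_list {
      struct end_query_info *states;
      uint32_t used, alloc;
   };

   /* Queue an end-query; availability is emitted when the current job
    * finishes instead of splitting the job here. Returns -1 on OOM. */
   static int
   queue_end_query(struct end_query_list *l, uint32_t query, uint32_t count)
   {
      if (l->used == l->alloc) { /* grow the array on demand */
         uint32_t n = l->alloc ? l->alloc * 2 : 8;
         void *p = realloc(l->states, n * sizeof(*l->states));
         if (!p)
            return -1;
         l->states = p;
         l->alloc = n;
      }
      l->states[l->used++] = (struct end_query_info){ query, count };
      return 0;
   }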
- */ - assert(cmd_buffer->state.pass == NULL); - assert(cmd_buffer->state.job == NULL); - - struct v3dv_job *job = - v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, - V3DV_JOB_TYPE_CPU_SET_EVENT, - cmd_buffer, -1); - v3dv_return_if_oom(cmd_buffer, NULL); - - job->cpu.event_set.event = event; - job->cpu.event_set.state = 0; - - list_addtail(&job->list_link, &cmd_buffer->jobs); -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_CmdWaitEvents(VkCommandBuffer commandBuffer, - uint32_t eventCount, - const VkEvent *pEvents, - VkPipelineStageFlags srcStageMask, - VkPipelineStageFlags dstStageMask, - uint32_t memoryBarrierCount, - const VkMemoryBarrier *pMemoryBarriers, - uint32_t bufferMemoryBarrierCount, - const VkBufferMemoryBarrier *pBufferMemoryBarriers, - uint32_t imageMemoryBarrierCount, - const VkImageMemoryBarrier *pImageMemoryBarriers) -{ - V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - - assert(eventCount > 0); - - struct v3dv_job *job = - v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, - V3DV_JOB_TYPE_CPU_WAIT_EVENTS, - cmd_buffer, -1); - v3dv_return_if_oom(cmd_buffer, NULL); - - const uint32_t event_list_size = sizeof(struct v3dv_event *) * eventCount; - - job->cpu.event_wait.events = - vk_alloc(&cmd_buffer->device->vk.alloc, event_list_size, 8, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (!job->cpu.event_wait.events) { - v3dv_flag_oom(cmd_buffer, NULL); - return; - } - job->cpu.event_wait.event_count = eventCount; - - for (uint32_t i = 0; i < eventCount; i++) - job->cpu.event_wait.events[i] = v3dv_event_from_handle(pEvents[i]); - - /* vkCmdWaitEvents can be recorded inside a render pass, so we might have - * an active job. - * - * If we are inside a render pass, because we vkCmd(Re)SetEvent can't happen - * inside a render pass, it is safe to move the wait job so it happens right - * before the current job we are currently recording for the subpass, if any - * (it would actually be safe to move it all the way back to right before - * the start of the render pass). - * - * If we are outside a render pass then we should not have any on-going job - * and we are free to just add the wait job without restrictions. - */ - assert(cmd_buffer->state.pass || !cmd_buffer->state.job); - list_addtail(&job->list_link, &cmd_buffer->jobs); -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_CmdWriteTimestamp(VkCommandBuffer commandBuffer, - VkPipelineStageFlagBits pipelineStage, - VkQueryPool queryPool, - uint32_t query) +v3dv_CmdWriteTimestamp2(VkCommandBuffer commandBuffer, + VkPipelineStageFlags2 stage, + VkQueryPool queryPool, + uint32_t query) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); V3DV_FROM_HANDLE(v3dv_query_pool, query_pool, queryPool); @@ -3349,24 +4165,9 @@ cmd_buffer_emit_pre_dispatch(struct v3dv_cmd_buffer *cmd_buffer) cmd_buffer->state.dirty_push_constants_stages &= ~VK_SHADER_STAGE_COMPUTE_BIT; } -#define V3D_CSD_CFG012_WG_COUNT_SHIFT 16 -#define V3D_CSD_CFG012_WG_OFFSET_SHIFT 0 -/* Allow this dispatch to start while the last one is still running. */ -#define V3D_CSD_CFG3_OVERLAP_WITH_PREV (1 << 26) -/* Maximum supergroup ID. 6 bits. */ -#define V3D_CSD_CFG3_MAX_SG_ID_SHIFT 20 -/* Batches per supergroup minus 1. 8 bits. 
*/ -#define V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT 12 -/* Workgroups per supergroup, 0 means 16 */ -#define V3D_CSD_CFG3_WGS_PER_SG_SHIFT 8 -#define V3D_CSD_CFG3_WG_SIZE_SHIFT 0 - -#define V3D_CSD_CFG5_PROPAGATE_NANS (1 << 2) -#define V3D_CSD_CFG5_SINGLE_SEG (1 << 1) -#define V3D_CSD_CFG5_THREADING (1 << 0) - void v3dv_cmd_buffer_rewrite_indirect_csd_job( + struct v3dv_device *device, struct v3dv_csd_indirect_cpu_job_info *info, const uint32_t *wg_counts) { @@ -3386,15 +4187,22 @@ v3dv_cmd_buffer_rewrite_indirect_csd_job( submit->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT; submit->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT; - submit->cfg[4] = DIV_ROUND_UP(info->wg_size, 16) * - (wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1; + uint32_t num_batches = DIV_ROUND_UP(info->wg_size, 16) * + (wg_counts[0] * wg_counts[1] * wg_counts[2]); + /* V3D 7.1.6 and later don't subtract 1 from the number of batches */ + if (device->devinfo.ver < 71 || + (device->devinfo.ver == 71 && device->devinfo.rev < 6)) { + submit->cfg[4] = num_batches - 1; + } else { + submit->cfg[4] = num_batches; + } assert(submit->cfg[4] != ~0); if (info->needs_wg_uniform_rewrite) { /* Make sure the GPU is not currently accessing the indirect CL for this * job, since we are about to overwrite some of the uniform data. */ - v3dv_bo_wait(job->device, job->indirect.bo, PIPE_TIMEOUT_INFINITE); + v3dv_bo_wait(job->device, job->indirect.bo, OS_TIMEOUT_INFINITE); for (uint32_t i = 0; i < 3; i++) { if (info->wg_uniform_offsets[i]) { @@ -3420,6 +4228,7 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, uint32_t **wg_uniform_offsets_out, uint32_t *wg_size_out) { + struct v3dv_device *device = cmd_buffer->device; struct v3dv_pipeline *pipeline = cmd_buffer->state.compute.pipeline; assert(pipeline && pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]); struct v3dv_shader_variant *cs_variant = @@ -3478,23 +4287,31 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, if (wg_size_out) *wg_size_out = wg_size; - submit->cfg[4] = num_batches - 1; + /* V3D 7.1.6 and later don't subtract 1 from the number of batches */ + if (device->devinfo.ver < 71 || + (device->devinfo.ver == 71 && device->devinfo.rev < 6)) { + submit->cfg[4] = num_batches - 1; + } else { + submit->cfg[4] = num_batches; + } assert(submit->cfg[4] != ~0); assert(pipeline->shared_data->assembly_bo); struct v3dv_bo *cs_assembly_bo = pipeline->shared_data->assembly_bo; submit->cfg[5] = cs_assembly_bo->offset + cs_variant->assembly_offset; - submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS; if (cs_variant->prog_data.base->single_seg) submit->cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG; if (cs_variant->prog_data.base->threads == 4) submit->cfg[5] |= V3D_CSD_CFG5_THREADING; + /* V3D 7.x has made the PROPAGATE_NANS bit in CFG5 reserved */ + if (device->devinfo.ver < 71) + submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS; if (cs_variant->prog_data.cs->shared_size > 0) { job->csd.shared_memory = v3dv_bo_alloc(cmd_buffer->device, - cs_variant->prog_data.cs->shared_size * wgs_per_sg, + cs_variant->prog_data.cs->shared_size * num_wgs, "shared_vars", true); if (!job->csd.shared_memory) { v3dv_flag_oom(cmd_buffer, NULL); @@ -3509,6 +4326,10 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, wg_uniform_offsets_out); submit->cfg[6] = uniforms.bo->offset + uniforms.offset; + + /* Track VK_KHR_buffer_device_address usage in the job */ + job->uses_buffer_device_address |= pipeline->uses_buffer_device_address; + v3dv_job_add_bo(job, uniforms.bo); 
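(The cfg[4] encoding above now appears twice: once in v3dv_cmd_buffer_rewrite_indirect_csd_job and once in cmd_buffer_create_csd_job. A small helper along the following lines is one way to keep the two paths in sync. This is a sketch, not part of the change itself; the helper name is hypothetical, and it assumes only the devinfo ver/rev fields the diff already uses.)

   static inline uint32_t
   csd_cfg4_batch_count(const struct v3d_device_info *devinfo,
                        uint32_t num_batches)
   {
      /* V3D 7.1.6 and later program the raw batch count in cfg[4];
       * earlier versions program the count minus one.
       */
      if (devinfo->ver < 71 || (devinfo->ver == 71 && devinfo->rev < 6))
         return num_batches - 1;
      return num_batches;
   }

(Each caller would then simply do: submit->cfg[4] = csd_cfg4_batch_count(&device->devinfo, num_batches);)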
 return job;
@@ -3541,19 +4362,6 @@ cmd_buffer_dispatch(struct v3dv_cmd_buffer *cmd_buffer,
 }
 
 VKAPI_ATTR void VKAPI_CALL
-v3dv_CmdDispatch(VkCommandBuffer commandBuffer,
- uint32_t groupCountX,
- uint32_t groupCountY,
- uint32_t groupCountZ)
-{
- V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer_emit_pre_dispatch(cmd_buffer);
- cmd_buffer_dispatch(cmd_buffer, 0, 0, 0,
- groupCountX, groupCountY, groupCountZ);
-}
-
-VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdDispatchBase(VkCommandBuffer commandBuffer,
 uint32_t baseGroupX,
 uint32_t baseGroupY,
@@ -3615,6 +4423,16 @@ cmd_buffer_dispatch_indirect(struct v3dv_cmd_buffer *cmd_buffer,
 job->cpu.csd_indirect.wg_uniform_offsets[2];
 list_addtail(&job->list_link, &cmd_buffer->jobs);
+
+ /* If we have a CPU queue we submit the CPU job directly to that queue
+ * and the CSD job will be dispatched from within the kernel. Otherwise,
+ * we have to dispatch the CSD job manually right after the CPU job, by
+ * adding it to the command buffer's list of jobs.
+ */
+ if (!cmd_buffer->device->pdevice->caps.cpu_queue)
+ list_addtail(&csd_job->list_link, &cmd_buffer->jobs);
+
 cmd_buffer->state.job = NULL;
 }
 
@@ -3633,8 +4451,144 @@ v3dv_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
 }
 
 VKAPI_ATTR void VKAPI_CALL
-v3dv_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask)
+v3dv_CmdBeginRenderingKHR(VkCommandBuffer commandBuffer,
+ const VkRenderingInfoKHR *info)
 {
- /* Nothing to do here since we only support a single device */
- assert(deviceMask == 0x1);
+ V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+
+ cmd_buffer->state.suspending = info->flags & VK_RENDERING_SUSPENDING_BIT;
+ cmd_buffer->state.resuming = info->flags & VK_RENDERING_RESUMING_BIT;
+
+ /* FIXME: for resuming passes we might not need all of this setup, since
+ * we are mostly just recording draw calls, as in secondaries.
+ */
+
+ v3dv_setup_dynamic_render_pass(cmd_buffer, info);
+ v3dv_return_if_oom(cmd_buffer, NULL);
+
+ v3dv_setup_dynamic_framebuffer(cmd_buffer, info);
+ v3dv_return_if_oom(cmd_buffer, NULL);
+
+ struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
+ state->pass = &state->dynamic_pass;
+ state->framebuffer = state->dynamic_framebuffer;
+
+ VkRenderPassBeginInfo begin_info = {
+ .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
+ .pNext = NULL,
+ .renderPass = v3dv_render_pass_to_handle(state->pass),
+ .framebuffer = v3dv_framebuffer_to_handle(state->framebuffer),
+ .renderArea = info->renderArea,
+ };
+
+ VkClearValue *clear_values = NULL;
+ if (state->pass->attachment_count > 0) {
+ clear_values =
+ vk_alloc(&cmd_buffer->device->vk.alloc,
+ state->pass->attachment_count * sizeof(VkClearValue), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
+ if (!clear_values) {
+ v3dv_flag_oom(cmd_buffer, NULL);
+ return;
+ }
+ }
+
+ for (int i = 0; i < info->colorAttachmentCount; i++) {
+ if (!info->pColorAttachments[i].imageView)
+ continue;
+
+ uint32_t a = cmd_buffer->state.dynamic_subpass.color_attachments[i].attachment;
+ assert(a < state->pass->attachment_count);
+ clear_values[a] = info->pColorAttachments[i].clearValue;
+ }
+
+ if (info->pDepthAttachment &&
+ info->pDepthAttachment->imageView != VK_NULL_HANDLE) {
+ uint32_t a = cmd_buffer->state.dynamic_subpass.ds_attachment.attachment;
+ assert(a < state->pass->attachment_count);
+ clear_values[a].depthStencil.depth =
+ info->pDepthAttachment->clearValue.depthStencil.depth;
+ }
+
+ if (info->pStencilAttachment &&
+ info->pStencilAttachment->imageView != VK_NULL_HANDLE) {
+ uint32_t a = cmd_buffer->state.dynamic_subpass.ds_attachment.attachment;
+ assert(a < state->pass->attachment_count);
+ clear_values[a].depthStencil.stencil =
+ info->pStencilAttachment->clearValue.depthStencil.stencil;
+ }
+
+ begin_info.clearValueCount = state->pass->attachment_count;
+ begin_info.pClearValues = clear_values;
+
+ cmd_buffer_ensure_render_pass_attachment_state(cmd_buffer);
+ v3dv_return_if_oom(cmd_buffer, NULL);
+ cmd_buffer_init_render_pass_attachment_state(cmd_buffer, &begin_info);
+
+ if (clear_values)
+ vk_free(&cmd_buffer->vk.pool->alloc, clear_values);
+
+ state->render_area = info->renderArea;
+ constraint_clip_window_to_render_area(cmd_buffer);
+ v3dv_cmd_buffer_subpass_start(cmd_buffer, 0);
+}
+
+VKAPI_ATTR void VKAPI_CALL
+v3dv_CmdEndRenderingKHR(VkCommandBuffer commandBuffer)
+{
+ V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+
+ v3dv_return_if_oom(cmd_buffer, NULL);
+
+ struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
+ assert(state->subpass_idx == state->pass->subpass_count - 1);
+
+ /* If we have any pending jobs that were waiting for the current job
+ * to finish and we are suspending the pass here, we need to finish the
+ * job completely and ensure we emit the pending jobs immediately.
+ *
+ * FIXME: this is not optimal, but since the resuming command buffer won't
+ * have the pending state, we can't do it after the resuming chain completes
+ * without some extra work: we would have to generate the pending jobs
+ * now but not add them to this command buffer's job list; instead, they
+ * should be added to a separate list of "pending jobs" and at submit time
+ * we would accumulate these jobs during the suspend/resume chain and emit
+ * them all after the last job in the chain.
+ */
+ if (state->suspending && cmd_buffer_has_pending_jobs(cmd_buffer))
+ v3dv_cmd_buffer_finish_job(cmd_buffer);
+
+ /* If we don't have a job and we are suspending we will need to create one
+ * so we can link to a follow-up resume job. Because we would be starting a
+ * new job, we should ensure the command buffer state is not flagged as
+ * resuming from a previous suspend. The new job will consume any pending
+ * barrier state if necessary.
+ */
+ struct v3dv_job *job = cmd_buffer->state.job;
+ if (!job && state->suspending) {
+ state->resuming = false;
+ job = v3dv_cmd_buffer_subpass_resume(cmd_buffer, state->subpass_idx);
+ if (!job)
+ return;
+ }
+
+ /* If this job is suspending, it means it will continue execution in another
+ * job (with the same RCL spec). We implement this by branching the BCL and
+ * we will patch the branch address when we know the resuming job.
+ */
+ if (state->suspending)
+ v3dv_X(cmd_buffer->device, cmd_buffer_suspend)(cmd_buffer);
+
+ v3dv_cmd_buffer_subpass_finish(cmd_buffer);
+ v3dv_cmd_buffer_finish_job(cmd_buffer);
+
+ /* This must be done after the resume/suspend chain has completed. */
+ if (!state->suspending)
+ cmd_buffer_subpass_handle_pending_resolves(cmd_buffer);
+
+ state->framebuffer = NULL;
+ state->pass = NULL;
+ state->subpass_idx = -1;
+ state->suspending = false;
+ state->resuming = false;
 }
diff --git a/src/broadcom/vulkan/v3dv_debug.c b/src/broadcom/vulkan/v3dv_debug.c
index 055300d05c9..065e8f66026 100644
--- a/src/broadcom/vulkan/v3dv_debug.c
+++ b/src/broadcom/vulkan/v3dv_debug.c
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2019 Raspberry Pi
+ * Copyright © 2019 Raspberry Pi Ltd
 *
 * based in part on radv_debug.h which is:
 * Copyright © 2017 Google.
diff --git a/src/broadcom/vulkan/v3dv_debug.h b/src/broadcom/vulkan/v3dv_debug.h
index 75f253700ed..bab21eef2b8 100644
--- a/src/broadcom/vulkan/v3dv_debug.h
+++ b/src/broadcom/vulkan/v3dv_debug.h
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2019 Raspberry Pi
+ * Copyright © 2019 Raspberry Pi Ltd
 *
 * based in part on radv_debug.h which is:
 * Copyright © 2017 Google.
diff --git a/src/broadcom/vulkan/v3dv_descriptor_set.c b/src/broadcom/vulkan/v3dv_descriptor_set.c
index fd9ec935611..1d777ba08d4 100644
--- a/src/broadcom/vulkan/v3dv_descriptor_set.c
+++ b/src/broadcom/vulkan/v3dv_descriptor_set.c
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2019 Raspberry Pi
+ * Copyright © 2019 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -28,19 +28,26 @@
 /*
 * For a given descriptor defined by the descriptor_set it belongs, its
- * binding layout, and array_index, it returns the map region assigned to it
- * from the descriptor pool bo.
+ * binding layout, array_index, and plane, it returns the map region assigned
+ * to it from the descriptor pool bo.
 */
-static void*
+static void *
 descriptor_bo_map(struct v3dv_device *device,
 struct v3dv_descriptor_set *set,
 const struct v3dv_descriptor_set_binding_layout *binding_layout,
 uint32_t array_index)
 {
- assert(v3dv_X(device, descriptor_bo_size)(binding_layout->type) > 0);
+ /* Inline uniform blocks use BO memory to store UBO contents, not
+ * descriptor data, so their descriptor BO size is 0 even though they
+ * do use BO memory.
+ */ + uint32_t bo_size = v3dv_X(device, descriptor_bo_size)(binding_layout->type); + assert(bo_size > 0 || + binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK); + return set->pool->bo->map + set->base_offset + binding_layout->descriptor_offset + - array_index * v3dv_X(device, descriptor_bo_size)(binding_layout->type); + array_index * binding_layout->plane_stride * bo_size; } static bool @@ -102,7 +109,7 @@ v3dv_descriptor_map_get_descriptor(struct v3dv_descriptor_state *descriptor_stat * It also returns the descriptor type, so the caller could do extra * validation or adding extra offsets if the bo contains more that one field. */ -static struct v3dv_cl_reloc +struct v3dv_cl_reloc v3dv_descriptor_map_get_descriptor_bo(struct v3dv_device *device, struct v3dv_descriptor_state *descriptor_state, struct v3dv_descriptor_map *map, @@ -125,8 +132,13 @@ v3dv_descriptor_map_get_descriptor_bo(struct v3dv_device *device, const struct v3dv_descriptor_set_binding_layout *binding_layout = &set->layout->binding[binding_number]; - assert(v3dv_X(device, descriptor_bo_size)(binding_layout->type) > 0); - *out_type = binding_layout->type; + + uint32_t bo_size = v3dv_X(device, descriptor_bo_size)(binding_layout->type); + + assert(binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK || + bo_size > 0); + if (out_type) + *out_type = binding_layout->type; uint32_t array_index = map->array_index[index]; assert(array_index < binding_layout->array_size); @@ -134,7 +146,7 @@ v3dv_descriptor_map_get_descriptor_bo(struct v3dv_device *device, struct v3dv_cl_reloc reloc = { .bo = set->pool->bo, .offset = set->base_offset + binding_layout->descriptor_offset + - array_index * v3dv_X(device, descriptor_bo_size)(binding_layout->type), + array_index * binding_layout->plane_stride * bo_size, }; return reloc; @@ -213,40 +225,11 @@ v3dv_descriptor_map_get_sampler_state(struct v3dv_device *device, type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); if (type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) - reloc.offset += v3dv_X(device, combined_image_sampler_sampler_state_offset)(); + reloc.offset += v3dv_X(device, combined_image_sampler_sampler_state_offset)(map->plane[index]); return reloc; } -const struct v3dv_format* -v3dv_descriptor_map_get_texture_format(struct v3dv_descriptor_state *descriptor_state, - struct v3dv_descriptor_map *map, - struct v3dv_pipeline_layout *pipeline_layout, - uint32_t index, - VkFormat *out_vk_format) -{ - struct v3dv_descriptor *descriptor = - v3dv_descriptor_map_get_descriptor(descriptor_state, map, - pipeline_layout, index, NULL); - - switch (descriptor->type) { - case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: - case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: - assert(descriptor->buffer_view); - *out_vk_format = descriptor->buffer_view->vk_format; - return descriptor->buffer_view->format; - case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: - case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: - case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: - case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: - assert(descriptor->image_view); - *out_vk_format = descriptor->image_view->vk.format; - return descriptor->image_view->format; - default: - unreachable("descriptor type doesn't has a texture format"); - } -} - struct v3dv_bo* v3dv_descriptor_map_get_texture_bo(struct v3dv_descriptor_state *descriptor_state, struct v3dv_descriptor_map *map, @@ -270,7 +253,8 @@ v3dv_descriptor_map_get_texture_bo(struct v3dv_descriptor_state *descriptor_stat assert(descriptor->image_view); struct v3dv_image *image = (struct v3dv_image 
*) descriptor->image_view->vk.image; - return image->mem->bo; + assert(map->plane[index] < image->plane_count); + return image->planes[map->plane[index]].mem->bo; } default: unreachable("descriptor type doesn't has a texture bo"); @@ -299,11 +283,66 @@ v3dv_descriptor_map_get_texture_shader_state(struct v3dv_device *device, type == VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER); if (type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) - reloc.offset += v3dv_X(device, combined_image_sampler_texture_state_offset)(); + reloc.offset += v3dv_X(device, combined_image_sampler_texture_state_offset)(map->plane[index]); return reloc; } +#define SHA1_UPDATE_VALUE(ctx, x) _mesa_sha1_update(ctx, &(x), sizeof(x)); + +static void +sha1_update_ycbcr_conversion(struct mesa_sha1 *ctx, + const struct vk_ycbcr_conversion_state *conversion) +{ + SHA1_UPDATE_VALUE(ctx, conversion->format); + SHA1_UPDATE_VALUE(ctx, conversion->ycbcr_model); + SHA1_UPDATE_VALUE(ctx, conversion->ycbcr_range); + SHA1_UPDATE_VALUE(ctx, conversion->mapping); + SHA1_UPDATE_VALUE(ctx, conversion->chroma_offsets); + SHA1_UPDATE_VALUE(ctx, conversion->chroma_reconstruction); +} + +static void +sha1_update_descriptor_set_binding_layout(struct mesa_sha1 *ctx, + const struct v3dv_descriptor_set_binding_layout *layout, + const struct v3dv_descriptor_set_layout *set_layout) +{ + SHA1_UPDATE_VALUE(ctx, layout->type); + SHA1_UPDATE_VALUE(ctx, layout->array_size); + SHA1_UPDATE_VALUE(ctx, layout->descriptor_index); + SHA1_UPDATE_VALUE(ctx, layout->dynamic_offset_count); + SHA1_UPDATE_VALUE(ctx, layout->dynamic_offset_index); + SHA1_UPDATE_VALUE(ctx, layout->descriptor_offset); + SHA1_UPDATE_VALUE(ctx, layout->immutable_samplers_offset); + SHA1_UPDATE_VALUE(ctx, layout->plane_stride); + + if (layout->immutable_samplers_offset) { + const struct v3dv_sampler *immutable_samplers = + v3dv_immutable_samplers(set_layout, layout); + + for (unsigned i = 0; i < layout->array_size; i++) { + const struct v3dv_sampler *sampler = &immutable_samplers[i]; + if (sampler->conversion) + sha1_update_ycbcr_conversion(ctx, &sampler->conversion->state); + } + } +} + +static void +sha1_update_descriptor_set_layout(struct mesa_sha1 *ctx, + const struct v3dv_descriptor_set_layout *layout) +{ + SHA1_UPDATE_VALUE(ctx, layout->flags); + SHA1_UPDATE_VALUE(ctx, layout->binding_count); + SHA1_UPDATE_VALUE(ctx, layout->shader_stages); + SHA1_UPDATE_VALUE(ctx, layout->descriptor_count); + SHA1_UPDATE_VALUE(ctx, layout->dynamic_offset_count); + + for (uint16_t i = 0; i < layout->binding_count; i++) + sha1_update_descriptor_set_binding_layout(ctx, &layout->binding[i], layout); +} + + /* * As anv and tu already points: * @@ -326,16 +365,17 @@ v3dv_CreatePipelineLayout(VkDevice _device, layout = vk_object_zalloc(&device->vk, pAllocator, sizeof(*layout), VK_OBJECT_TYPE_PIPELINE_LAYOUT); if (layout == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); layout->num_sets = pCreateInfo->setLayoutCount; + layout->ref_cnt = 1; uint32_t dynamic_offset_count = 0; for (uint32_t set = 0; set < pCreateInfo->setLayoutCount; set++) { V3DV_FROM_HANDLE(v3dv_descriptor_set_layout, set_layout, pCreateInfo->pSetLayouts[set]); + v3dv_descriptor_set_layout_ref(set_layout); layout->set[set].layout = set_layout; - layout->set[set].dynamic_offset_start = dynamic_offset_count; for (uint32_t b = 0; b < set_layout->binding_count; b++) { dynamic_offset_count += set_layout->binding[b].array_size * @@ -356,11 +396,34 @@ 
v3dv_CreatePipelineLayout(VkDevice _device, layout->dynamic_offset_count = dynamic_offset_count; + struct mesa_sha1 ctx; + _mesa_sha1_init(&ctx); + for (unsigned s = 0; s < layout->num_sets; s++) { + sha1_update_descriptor_set_layout(&ctx, layout->set[s].layout); + _mesa_sha1_update(&ctx, &layout->set[s].dynamic_offset_start, + sizeof(layout->set[s].dynamic_offset_start)); + } + _mesa_sha1_update(&ctx, &layout->num_sets, sizeof(layout->num_sets)); + _mesa_sha1_final(&ctx, layout->sha1); + *pPipelineLayout = v3dv_pipeline_layout_to_handle(layout); return VK_SUCCESS; } +void +v3dv_pipeline_layout_destroy(struct v3dv_device *device, + struct v3dv_pipeline_layout *layout, + const VkAllocationCallbacks *alloc) +{ + assert(layout); + + for (uint32_t i = 0; i < layout->num_sets; i++) + v3dv_descriptor_set_layout_unref(device, layout->set[i].layout); + + vk_object_free(&device->vk, alloc, layout); +} + VKAPI_ATTR void VKAPI_CALL v3dv_DestroyPipelineLayout(VkDevice _device, VkPipelineLayout _pipelineLayout, @@ -371,7 +434,8 @@ v3dv_DestroyPipelineLayout(VkDevice _device, if (!pipeline_layout) return; - vk_object_free(&device->vk, pAllocator, pipeline_layout); + + v3dv_pipeline_layout_unref(device, pipeline_layout, pAllocator); } VKAPI_ATTR VkResult VKAPI_CALL @@ -393,7 +457,10 @@ v3dv_CreateDescriptorPool(VkDevice _device, uint32_t bo_size = 0; uint32_t descriptor_count = 0; - assert(pCreateInfo->poolSizeCount > 0); + const VkDescriptorPoolInlineUniformBlockCreateInfo *inline_info = + vk_find_struct_const(pCreateInfo->pNext, + DESCRIPTOR_POOL_INLINE_UNIFORM_BLOCK_CREATE_INFO); + for (unsigned i = 0; i < pCreateInfo->poolSizeCount; ++i) { /* Verify supported descriptor type */ switch(pCreateInfo->pPoolSizes[i].type) { @@ -408,6 +475,7 @@ v3dv_CreateDescriptorPool(VkDevice _device, case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: break; default: unreachable("Unimplemented descriptor type"); @@ -415,9 +483,28 @@ v3dv_CreateDescriptorPool(VkDevice _device, } assert(pCreateInfo->pPoolSizes[i].descriptorCount > 0); - descriptor_count += pCreateInfo->pPoolSizes[i].descriptorCount; - bo_size += v3dv_X(device, descriptor_bo_size)(pCreateInfo->pPoolSizes[i].type) * - pCreateInfo->pPoolSizes[i].descriptorCount; + if (pCreateInfo->pPoolSizes[i].type == + VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { + /* Inline uniform blocks are specified to use the descriptor array + * size as the size in bytes of the block. + */ + assert(inline_info); + descriptor_count += inline_info->maxInlineUniformBlockBindings; + bo_size += pCreateInfo->pPoolSizes[i].descriptorCount; + } else { + descriptor_count += pCreateInfo->pPoolSizes[i].descriptorCount; + bo_size += v3dv_X(device, descriptor_bo_size)(pCreateInfo->pPoolSizes[i].type) * + pCreateInfo->pPoolSizes[i].descriptorCount; + } + } + + /* We align all our buffers to V3D_NON_COHERENT_ATOM_SIZE, make sure we + * allocate enough memory to honor that requirement for all our inline + * buffers too. 
+ */ + if (inline_info) { + bo_size += V3D_NON_COHERENT_ATOM_SIZE * + inline_info->maxInlineUniformBlockBindings; } if (!(pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT)) { @@ -433,7 +520,7 @@ v3dv_CreateDescriptorPool(VkDevice _device, VK_OBJECT_TYPE_DESCRIPTOR_POOL); if (!pool) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); if (!(pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT)) { pool->host_memory_base = (uint8_t*)pool + sizeof(struct v3dv_descriptor_pool); @@ -457,13 +544,15 @@ v3dv_CreateDescriptorPool(VkDevice _device, pool->bo = NULL; } + list_inithead(&pool->set_list); + *pDescriptorPool = v3dv_descriptor_pool_to_handle(pool); return VK_SUCCESS; out_of_device_memory: vk_object_free(&device->vk, pAllocator, pool); - return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); } static void @@ -498,6 +587,11 @@ v3dv_DestroyDescriptorPool(VkDevice _device, if (!pool) return; + list_for_each_entry_safe(struct v3dv_descriptor_set, set, + &pool->set_list, pool_link) { + v3dv_descriptor_set_layout_unref(device, set->layout); + } + if (!pool->host_memory_base) { for(int i = 0; i < pool->entry_count; ++i) { descriptor_set_destroy(device, pool, pool->entries[i].set, false); @@ -520,6 +614,12 @@ v3dv_ResetDescriptorPool(VkDevice _device, V3DV_FROM_HANDLE(v3dv_device, device, _device); V3DV_FROM_HANDLE(v3dv_descriptor_pool, pool, descriptorPool); + list_for_each_entry_safe(struct v3dv_descriptor_set, set, + &pool->set_list, pool_link) { + v3dv_descriptor_set_layout_unref(device, set->layout); + } + list_inithead(&pool->set_list); + if (!pool->host_memory_base) { for(int i = 0; i < pool->entry_count; ++i) { descriptor_set_destroy(device, pool, pool->entries[i].set, false); @@ -539,6 +639,15 @@ v3dv_ResetDescriptorPool(VkDevice _device, return VK_SUCCESS; } +void +v3dv_descriptor_set_layout_destroy(struct v3dv_device *device, + struct v3dv_descriptor_set_layout *set_layout) +{ + assert(set_layout->ref_cnt == 0); + vk_object_base_finish(&set_layout->base); + vk_free2(&device->vk.alloc, NULL, set_layout); +} + VKAPI_ATTR VkResult VKAPI_CALL v3dv_CreateDescriptorSetLayout(VkDevice _device, const VkDescriptorSetLayoutCreateInfo *pCreateInfo, @@ -552,6 +661,13 @@ v3dv_CreateDescriptorSetLayout(VkDevice _device, uint32_t num_bindings = 0; uint32_t immutable_sampler_count = 0; + + /* for immutable descriptors, the plane stride is the largest plane + * count of all combined image samplers. For mutable descriptors + * this is always 1 since multiplanar images are restricted to + * immutable combined image samplers. 
+ */
+ uint8_t plane_stride = 1;
 for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) {
 num_bindings = MAX2(num_bindings, pCreateInfo->pBindings[j].binding + 1);
@@ -570,22 +686,40 @@ v3dv_CreateDescriptorSetLayout(VkDevice _device,
 if ((desc_type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ||
 desc_type == VK_DESCRIPTOR_TYPE_SAMPLER) &&
 pCreateInfo->pBindings[j].pImmutableSamplers) {
- immutable_sampler_count += pCreateInfo->pBindings[j].descriptorCount;
+ uint32_t descriptor_count = pCreateInfo->pBindings[j].descriptorCount;
+ immutable_sampler_count += descriptor_count;
+
+ for (uint32_t i = 0; i < descriptor_count; i++) {
+ const VkSampler vk_sampler =
+ pCreateInfo->pBindings[j].pImmutableSamplers[i];
+ VK_FROM_HANDLE(v3dv_sampler, sampler, vk_sampler);
+ plane_stride = MAX2(plane_stride, sampler->plane_count);
+ }
 }
 }
 
- uint32_t samplers_offset = sizeof(struct v3dv_descriptor_set_layout) +
- num_bindings * sizeof(set_layout->binding[0]);
+ /* We place immutable samplers after the binding data. We want to use
+ * offsetof instead of any sizeof(struct v3dv_descriptor_set_layout)
+ * because the latter may include padding at the end of the struct.
+ */
+ uint32_t samplers_offset =
+ offsetof(struct v3dv_descriptor_set_layout, binding[num_bindings]);
+
 uint32_t size = samplers_offset +
 immutable_sampler_count * sizeof(struct v3dv_sampler);
 
- set_layout = vk_object_zalloc(&device->vk, pAllocator, size,
- VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT);
-
+ /* Descriptor set layouts are reference counted and therefore can survive
+ * vkDestroyDescriptorSetLayout, so they need to be allocated with a device
+ * scope.
+ */
+ set_layout =
+ vk_zalloc(&device->vk.alloc, size, 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
 if (!set_layout)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ vk_object_base_init(&device->vk, &set_layout->base,
+ VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT);
 
- /* We just allocate all the immutable samplers at the end of the struct */
 struct v3dv_sampler *samplers = (void*) &set_layout->binding[num_bindings];
 
 assert(pCreateInfo->bindingCount == 0 || num_bindings > 0);
@@ -594,17 +728,15 @@ v3dv_CreateDescriptorSetLayout(VkDevice _device,
 VkResult result = vk_create_sorted_bindings(pCreateInfo->pBindings,
 pCreateInfo->bindingCount, &bindings);
 if (result != VK_SUCCESS) {
- vk_object_free(&device->vk, pAllocator, set_layout);
- return vk_error(device->instance, result);
+ v3dv_descriptor_set_layout_destroy(device, set_layout);
+ return vk_error(device, result);
 }
 
- memset(set_layout->binding, 0,
- size - sizeof(struct v3dv_descriptor_set_layout));
-
 set_layout->binding_count = num_bindings;
 set_layout->flags = pCreateInfo->flags;
 set_layout->shader_stages = 0;
 set_layout->bo_size = 0;
+ set_layout->ref_cnt = 1;
 
 uint32_t descriptor_count = 0;
 uint32_t dynamic_offset_count = 0;
@@ -628,6 +760,7 @@ v3dv_CreateDescriptorSetLayout(VkDevice _device,
 case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
 case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
 case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
+ case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
 /* Nothing here, just to keep the descriptor type filtering below */
 break;
 default:
@@ -639,6 +772,7 @@ v3dv_CreateDescriptorSetLayout(VkDevice _device,
 set_layout->binding[binding_number].array_size = binding->descriptorCount;
 set_layout->binding[binding_number].descriptor_index = descriptor_count;
 set_layout->binding[binding_number].dynamic_offset_index = dynamic_offset_count;
+ set_layout->binding[binding_number].plane_stride = plane_stride;
 
 if ((binding->descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ||
 binding->descriptorType == VK_DESCRIPTOR_TYPE_SAMPLER) &&
@@ -651,18 +785,40 @@ v3dv_CreateDescriptorSetLayout(VkDevice _device,
 samplers += binding->descriptorCount;
 samplers_offset += sizeof(struct v3dv_sampler) * binding->descriptorCount;
- }
 
- descriptor_count += binding->descriptorCount;
- dynamic_offset_count += binding->descriptorCount *
- set_layout->binding[binding_number].dynamic_offset_count;
+ set_layout->binding[binding_number].plane_stride = plane_stride;
+ }
 
 set_layout->shader_stages |= binding->stageFlags;
 
- set_layout->binding[binding_number].descriptor_offset = set_layout->bo_size;
- set_layout->bo_size +=
- v3dv_X(device, descriptor_bo_size)(set_layout->binding[binding_number].type) *
- binding->descriptorCount;
+ if (binding->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
+ dynamic_offset_count += binding->descriptorCount *
+ set_layout->binding[binding_number].dynamic_offset_count;
+
+ descriptor_count += binding->descriptorCount;
+
+ set_layout->binding[binding_number].descriptor_offset =
+ set_layout->bo_size;
+ set_layout->bo_size +=
+ v3dv_X(device, descriptor_bo_size)(set_layout->binding[binding_number].type) *
+ binding->descriptorCount * set_layout->binding[binding_number].plane_stride;
+ } else {
+ /* We align all our buffers, inline buffers too. We made sure to take
+ * this into account when calculating total BO size requirements at
+ * pool creation time.
+ */
+ set_layout->bo_size = align(set_layout->bo_size,
+ V3D_NON_COHERENT_ATOM_SIZE);
+
+ set_layout->binding[binding_number].descriptor_offset =
+ set_layout->bo_size;
+
+ /* Inline uniform blocks are not arrayed; instead, descriptorCount
+ * specifies the size of the buffer in bytes.
+ */
+ set_layout->bo_size += binding->descriptorCount;
+ descriptor_count++;
+ }
 }
 
 free(bindings);
@@ -686,7 +842,7 @@ v3dv_DestroyDescriptorSetLayout(VkDevice _device,
 if (!set_layout)
 return;
 
- vk_object_free(&device->vk, pAllocator, set_layout);
+ v3dv_descriptor_set_layout_unref(device, set_layout);
 }
 
 static inline VkResult
@@ -697,7 +853,7 @@ out_of_pool_memory(const struct v3dv_device *device,
 * by allocating a new pool, so they don't point to real issues.
 */
 if (!pool->is_driver_internal)
- return vk_error(device->instance, VK_ERROR_OUT_OF_POOL_MEMORY)
+ return vk_error(device, VK_ERROR_OUT_OF_POOL_MEMORY);
 else
 return VK_ERROR_OUT_OF_POOL_MEMORY;
 }
@@ -705,7 +861,7 @@ out_of_pool_memory(const struct v3dv_device *device,
 static VkResult
 descriptor_set_create(struct v3dv_device *device,
 struct v3dv_descriptor_pool *pool,
- const struct v3dv_descriptor_set_layout *layout,
+ struct v3dv_descriptor_set_layout *layout,
 struct v3dv_descriptor_set **out_set)
 {
 struct v3dv_descriptor_set *set;
@@ -726,7 +882,7 @@ descriptor_set_create(struct v3dv_device *device,
 VK_OBJECT_TYPE_DESCRIPTOR_SET);
 
 if (!set)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
 }
 
 set->pool = pool;
@@ -797,19 +953,24 @@ descriptor_set_create(struct v3dv_device *device,
 layout->binding[b].immutable_samplers_offset);
 
 for (uint32_t i = 0; i < layout->binding[b].array_size; i++) {
- uint32_t combined_offset =
- layout->binding[b].type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ?
- v3dv_X(device, combined_image_sampler_sampler_state_offset)() : 0; - - void *desc_map = descriptor_bo_map(device, set, &layout->binding[b], i); - desc_map += combined_offset; - - memcpy(desc_map, - samplers[i].sampler_state, - sizeof(samplers[i].sampler_state)); + assert(samplers[i].plane_count <= V3DV_MAX_PLANE_COUNT); + for (uint8_t plane = 0; plane < samplers[i].plane_count; plane++) { + uint32_t combined_offset = + layout->binding[b].type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ? + v3dv_X(device, combined_image_sampler_sampler_state_offset)(plane) : 0; + void *desc_map = + descriptor_bo_map(device, set, &layout->binding[b], i); + desc_map += combined_offset; + + memcpy(desc_map, samplers[i].sampler_state, + sizeof(samplers[i].sampler_state)); + } } } + v3dv_descriptor_set_layout_ref(layout); + list_addtail(&set->pool_link, &pool->set_list); + *out_set = set; return VK_SUCCESS; @@ -860,8 +1021,13 @@ v3dv_FreeDescriptorSets(VkDevice _device, for (uint32_t i = 0; i < count; i++) { V3DV_FROM_HANDLE(v3dv_descriptor_set, set, pDescriptorSets[i]); - if (set && !pool->host_memory_base) - descriptor_set_destroy(device, pool, set, true); + + if (set) { + v3dv_descriptor_set_layout_unref(device, set->layout); + list_del(&set->pool_link); + if (!pool->host_memory_base) + descriptor_set_destroy(device, pool, set, true); + } } return VK_SUCCESS; @@ -877,11 +1043,16 @@ descriptor_bo_copy(struct v3dv_device *device, uint32_t src_array_index) { assert(dst_binding_layout->type == src_binding_layout->type); + assert(src_binding_layout->plane_stride == dst_binding_layout->plane_stride); - void *dst_map = descriptor_bo_map(device, dst_set, dst_binding_layout, dst_array_index); - void *src_map = descriptor_bo_map(device, src_set, src_binding_layout, src_array_index); + void *dst_map = descriptor_bo_map(device, dst_set, dst_binding_layout, + dst_array_index); + void *src_map = descriptor_bo_map(device, src_set, src_binding_layout, + src_array_index); - memcpy(dst_map, src_map, v3dv_X(device, descriptor_bo_size)(src_binding_layout->type)); + memcpy(dst_map, src_map, + v3dv_X(device, descriptor_bo_size)(src_binding_layout->type) * + src_binding_layout->plane_stride); } static void @@ -916,26 +1087,39 @@ write_image_descriptor(struct v3dv_device *device, descriptor->sampler = sampler; descriptor->image_view = iview; + assert(iview || sampler); + uint8_t plane_count = iview ? iview->plane_count : sampler->plane_count; + void *desc_map = descriptor_bo_map(device, set, binding_layout, array_index); - if (iview) { - const uint32_t tex_state_index = - iview->vk.view_type != VK_IMAGE_VIEW_TYPE_CUBE_ARRAY || - desc_type != VK_DESCRIPTOR_TYPE_STORAGE_IMAGE ? 0 : 1; - memcpy(desc_map, - iview->texture_shader_state[tex_state_index], - sizeof(iview->texture_shader_state[0])); - desc_map += v3dv_X(device, combined_image_sampler_sampler_state_offset)(); - } + for (uint8_t plane = 0; plane < plane_count; plane++) { + if (iview) { + uint32_t offset = desc_type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ? 
+ v3dv_X(device, combined_image_sampler_texture_state_offset)(plane) : 0;
- if (sampler && !binding_layout->immutable_samplers_offset) {
- /* For immutable samplers this was already done as part of the
- * descriptor set create, as that info can't change later
- */
- memcpy(desc_map,
- sampler->sampler_state,
- sizeof(sampler->sampler_state));
+ void *plane_desc_map = desc_map + offset;
+
+ const uint32_t tex_state_index =
+ iview->vk.view_type != VK_IMAGE_VIEW_TYPE_CUBE_ARRAY ||
+ desc_type != VK_DESCRIPTOR_TYPE_STORAGE_IMAGE ? 0 : 1;
+ memcpy(plane_desc_map,
+ iview->planes[plane].texture_shader_state[tex_state_index],
+ sizeof(iview->planes[plane].texture_shader_state[0]));
+ }
+
+ if (sampler && !binding_layout->immutable_samplers_offset) {
+ uint32_t offset = desc_type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ?
+ v3dv_X(device, combined_image_sampler_sampler_state_offset)(plane) : 0;
+
+ void *plane_desc_map = desc_map + offset;
+ /* For immutable samplers this was already done as part of the
+ * descriptor set create, as that info can't change later
+ */
+ memcpy(plane_desc_map,
+ sampler->sampler_state,
+ sizeof(sampler->sampler_state));
+ }
 }
 }
 
@@ -960,6 +1144,31 @@ write_buffer_view_descriptor(struct v3dv_device *device,
 sizeof(bview->texture_shader_state));
 }
 
+static void
+write_inline_uniform_descriptor(struct v3dv_device *device,
+ struct v3dv_descriptor *descriptor,
+ struct v3dv_descriptor_set *set,
+ const struct v3dv_descriptor_set_binding_layout *binding_layout,
+ const void *data,
+ size_t offset,
+ size_t size)
+{
+ assert(binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK);
+ descriptor->type = VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK;
+ descriptor->buffer = NULL;
+
+ void *desc_map = descriptor_bo_map(device, set, binding_layout, 0);
+ memcpy(desc_map + offset, data, size);
+
+ /* The pool allocates BO space up front for all the inline buffers it may
+ * hold, and this space is assigned to individual descriptors as they are
+ * written, so we define the range of an inline buffer as the largest
+ * range of data that the client has written to it.
+ */
+ descriptor->offset = 0;
+ descriptor->range = MAX2(descriptor->range, offset + size);
+}
+
 VKAPI_ATTR void VKAPI_CALL
 v3dv_UpdateDescriptorSets(VkDevice _device,
 uint32_t descriptorWriteCount,
@@ -978,9 +1187,20 @@ v3dv_UpdateDescriptorSets(VkDevice _device,
 struct v3dv_descriptor *descriptor = set->descriptors;
 
 descriptor += binding_layout->descriptor_index;
- descriptor += writeset->dstArrayElement;
 
- for (uint32_t j = 0; j < writeset->descriptorCount; ++j) {
+ /* Inline uniform blocks are not arrayed; instead, they use dstArrayElement
+ * to specify the byte offset of the uniform update and descriptorCount
+ * to specify the size (in bytes) of the update.
+ */
+ uint32_t descriptor_count;
+ if (writeset->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
+ descriptor += writeset->dstArrayElement;
+ descriptor_count = writeset->descriptorCount;
+ } else {
+ descriptor_count = 1;
+ }
+
+ for (uint32_t j = 0; j < descriptor_count; ++j) {
 switch(writeset->descriptorType) {
 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
@@ -993,12 +1213,11 @@ v3dv_UpdateDescriptorSets(VkDevice _device,
 break;
 }
 case VK_DESCRIPTOR_TYPE_SAMPLER: {
- /* If we are here we shouldn't be modifying a immutable sampler,
- * so we don't ensure that would work or not crash. But let the
- * validation layers check that
- */
+ /* If we are here we shouldn't be modifying an immutable sampler */
+ assert(!binding_layout->immutable_samplers_offset);
 const VkDescriptorImageInfo *image_info = writeset->pImageInfo + j;
 V3DV_FROM_HANDLE(v3dv_sampler, sampler, image_info->sampler);
+
 write_image_descriptor(device, descriptor, writeset->descriptorType,
 set, binding_layout, NULL, sampler,
 writeset->dstArrayElement + j);
@@ -1010,6 +1229,7 @@ v3dv_UpdateDescriptorSets(VkDevice _device,
 case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: {
 const VkDescriptorImageInfo *image_info = writeset->pImageInfo + j;
 V3DV_FROM_HANDLE(v3dv_image_view, iview, image_info->imageView);
+
 write_image_descriptor(device, descriptor, writeset->descriptorType,
 set, binding_layout, iview, NULL,
 writeset->dstArrayElement + j);
@@ -1019,7 +1239,17 @@ v3dv_UpdateDescriptorSets(VkDevice _device,
 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: {
 const VkDescriptorImageInfo *image_info = writeset->pImageInfo + j;
 V3DV_FROM_HANDLE(v3dv_image_view, iview, image_info->imageView);
- V3DV_FROM_HANDLE(v3dv_sampler, sampler, image_info->sampler);
+ struct v3dv_sampler *sampler = NULL;
+ if (!binding_layout->immutable_samplers_offset) {
+ /* In general we ignore the sampler when updating a combined
+ * image sampler, but for YCbCr we know that we must use
+ * immutable combined image samplers
+ */
+ assert(iview->plane_count == 1);
+ V3DV_FROM_HANDLE(v3dv_sampler, _sampler, image_info->sampler);
+ sampler = _sampler;
+ }
+
 write_image_descriptor(device, descriptor, writeset->descriptorType,
 set, binding_layout, iview, sampler,
 writeset->dstArrayElement + j);
@@ -1035,6 +1265,18 @@ v3dv_UpdateDescriptorSets(VkDevice _device,
 writeset->dstArrayElement + j);
 break;
 }
+ case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: {
+ const VkWriteDescriptorSetInlineUniformBlock *inline_write =
+ vk_find_struct_const(writeset->pNext,
+ WRITE_DESCRIPTOR_SET_INLINE_UNIFORM_BLOCK);
+ assert(inline_write->dataSize == writeset->descriptorCount);
+ write_inline_uniform_descriptor(device, descriptor, set,
+ binding_layout,
+ inline_write->pData,
+ writeset->dstArrayElement, /* offset */
+ inline_write->dataSize);
+ break;
+ }
 default:
 unreachable("unimplemented descriptor type");
 break;
@@ -1061,9 +1303,25 @@ v3dv_UpdateDescriptorSets(VkDevice _device,
 struct v3dv_descriptor *dst_descriptor = dst_set->descriptors;
 
 src_descriptor += src_binding_layout->descriptor_index;
- src_descriptor += copyset->srcArrayElement;
- dst_descriptor += dst_binding_layout->descriptor_index;
+ dst_descriptor += dst_binding_layout->descriptor_index;
+
+ if (src_binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
+ /* {src,dst}ArrayElement specifies src/dst start offset and
+ * descriptorCount specifies size (in bytes) to copy.
+ */ + const void *src_data = src_set->pool->bo->map + + src_set->base_offset + + src_binding_layout->descriptor_offset + + copyset->srcArrayElement; + write_inline_uniform_descriptor(device, dst_descriptor, dst_set, + dst_binding_layout, + src_data, + copyset->dstArrayElement, + copyset->descriptorCount); + continue; + } + + src_descriptor += copyset->srcArrayElement; dst_descriptor += copyset->dstArrayElement; for (uint32_t j = 0; j < copyset->descriptorCount; j++) { @@ -1127,66 +1385,6 @@ v3dv_GetDescriptorSetLayoutSupport( pSupport->supported = supported; } -VkResult -v3dv_CreateDescriptorUpdateTemplate( - VkDevice _device, - const VkDescriptorUpdateTemplateCreateInfo *pCreateInfo, - const VkAllocationCallbacks *pAllocator, - VkDescriptorUpdateTemplate *pDescriptorUpdateTemplate) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - struct v3dv_descriptor_update_template *template; - - size_t size = sizeof(*template) + - pCreateInfo->descriptorUpdateEntryCount * sizeof(template->entries[0]); - template = vk_object_alloc(&device->vk, pAllocator, size, - VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE); - if (template == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - - template->bind_point = pCreateInfo->pipelineBindPoint; - - assert(pCreateInfo->templateType == - VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET); - template->set = pCreateInfo->set; - - template->entry_count = pCreateInfo->descriptorUpdateEntryCount; - for (uint32_t i = 0; i < template->entry_count; i++) { - const VkDescriptorUpdateTemplateEntry *pEntry = - &pCreateInfo->pDescriptorUpdateEntries[i]; - - template->entries[i] = (struct v3dv_descriptor_template_entry) { - .type = pEntry->descriptorType, - .binding = pEntry->dstBinding, - .array_element = pEntry->dstArrayElement, - .array_count = pEntry->descriptorCount, - .offset = pEntry->offset, - .stride = pEntry->stride, - }; - } - - *pDescriptorUpdateTemplate = - v3dv_descriptor_update_template_to_handle(template); - - return VK_SUCCESS; -} - -void -v3dv_DestroyDescriptorUpdateTemplate( - VkDevice _device, - VkDescriptorUpdateTemplate descriptorUpdateTemplate, - const VkAllocationCallbacks *pAllocator) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_descriptor_update_template, template, - descriptorUpdateTemplate); - - if (!template) - return; - - vk_object_free(&device->vk, pAllocator, template); -} - void v3dv_UpdateDescriptorSetWithTemplate( VkDevice _device, @@ -1196,11 +1394,11 @@ v3dv_UpdateDescriptorSetWithTemplate( { V3DV_FROM_HANDLE(v3dv_device, device, _device); V3DV_FROM_HANDLE(v3dv_descriptor_set, set, descriptorSet); - V3DV_FROM_HANDLE(v3dv_descriptor_update_template, template, + V3DV_FROM_HANDLE(vk_descriptor_update_template, template, descriptorUpdateTemplate); for (int i = 0; i < template->entry_count; i++) { - const struct v3dv_descriptor_template_entry *entry = + const struct vk_descriptor_template_entry *entry = &template->entries[i]; const struct v3dv_descriptor_set_binding_layout *binding_layout = @@ -1208,8 +1406,7 @@ v3dv_UpdateDescriptorSetWithTemplate( struct v3dv_descriptor *descriptor = set->descriptors + - binding_layout->descriptor_index + - entry->array_element; + binding_layout->descriptor_index; switch (entry->type) { case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: @@ -1219,7 +1416,8 @@ v3dv_UpdateDescriptorSetWithTemplate( for (uint32_t j = 0; j < entry->array_count; j++) { const VkDescriptorBufferInfo *info = pData + entry->offset + j * entry->stride; - write_buffer_descriptor(descriptor + j, 
entry->type, info); + write_buffer_descriptor(descriptor + entry->array_element + j, + entry->type, info); } break; @@ -1233,9 +1431,9 @@ v3dv_UpdateDescriptorSetWithTemplate( pData + entry->offset + j * entry->stride; V3DV_FROM_HANDLE(v3dv_image_view, iview, info->imageView); V3DV_FROM_HANDLE(v3dv_sampler, sampler, info->sampler); - write_image_descriptor(device, descriptor + j, entry->type, - set, binding_layout, iview, sampler, - entry->array_element + j); + write_image_descriptor(device, descriptor + entry->array_element + j, + entry->type, set, binding_layout, iview, + sampler, entry->array_element + j); } break; @@ -1245,34 +1443,24 @@ v3dv_UpdateDescriptorSetWithTemplate( const VkBufferView *_bview = pData + entry->offset + j * entry->stride; V3DV_FROM_HANDLE(v3dv_buffer_view, bview, *_bview); - write_buffer_view_descriptor(device, descriptor + j, entry->type, - set, binding_layout, bview, + write_buffer_view_descriptor(device, + descriptor + entry->array_element + j, + entry->type, set, binding_layout, bview, entry->array_element + j); } break; + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: { + write_inline_uniform_descriptor(device, descriptor, set, + binding_layout, + pData + entry->offset, + entry->array_element, /* offset */ + entry->array_count); /* size */ + break; + } + default: unreachable("Unsupported descriptor type"); } } } - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_CreateSamplerYcbcrConversion( - VkDevice _device, - const VkSamplerYcbcrConversionCreateInfo *pCreateInfo, - const VkAllocationCallbacks *pAllocator, - VkSamplerYcbcrConversion *pYcbcrConversion) -{ - unreachable("Ycbcr sampler conversion is not supported"); - return VK_SUCCESS; -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_DestroySamplerYcbcrConversion( - VkDevice _device, - VkSamplerYcbcrConversion YcbcrConversion, - const VkAllocationCallbacks *pAllocator) -{ - unreachable("Ycbcr sampler conversion is not supported"); -} diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c index fec53ec38c5..7992cab59ff 100644 --- a/src/broadcom/vulkan/v3dv_device.c +++ b/src/broadcom/vulkan/v3dv_device.c @@ -1,5 +1,5 @@ /* - * Copyright © 2019 Raspberry Pi + * Copyright © 2019 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -44,12 +44,18 @@ #include "compiler/v3d_compiler.h" #include "drm-uapi/v3d_drm.h" -#include "format/u_format.h" +#include "vk_drm_syncobj.h" #include "vk_util.h" +#include "git_sha1.h" #include "util/build_id.h" -#include "util/debug.h" -#include "util/u_cpu_detect.h" +#include "util/os_file.h" +#include "util/u_debug.h" +#include "util/format/u_format.h" + +#if DETECT_OS_ANDROID +#include "vk_android.h" +#endif #ifdef VK_USE_PLATFORM_XCB_KHR #include <xcb/xcb.h> @@ -62,11 +68,15 @@ #include "wayland-drm-client-protocol.h" #endif -#ifdef USE_V3D_SIMULATOR -#include "drm-uapi/i915_drm.h" -#endif +#define V3DV_API_VERSION VK_MAKE_VERSION(1, 2, VK_HEADER_VERSION) -#define V3DV_API_VERSION VK_MAKE_VERSION(1, 0, VK_HEADER_VERSION) +#ifdef ANDROID_STRICT +#if ANDROID_API_LEVEL <= 32 +/* Android 12.1 and lower support only Vulkan API v1.1 */ +#undef V3DV_API_VERSION +#define V3DV_API_VERSION VK_MAKE_VERSION(1, 1, VK_HEADER_VERSION) +#endif +#endif VKAPI_ATTR VkResult VKAPI_CALL v3dv_EnumerateInstanceVersion(uint32_t *pApiVersion) @@ -75,25 +85,32 @@ v3dv_EnumerateInstanceVersion(uint32_t *pApiVersion) return VK_SUCCESS; } -#define V3DV_HAS_SURFACE 
(VK_USE_PLATFORM_WIN32_KHR || \ - VK_USE_PLATFORM_WAYLAND_KHR || \ - VK_USE_PLATFORM_XCB_KHR || \ - VK_USE_PLATFORM_XLIB_KHR || \ - VK_USE_PLATFORM_DISPLAY_KHR) +#if defined(VK_USE_PLATFORM_WIN32_KHR) || \ + defined(VK_USE_PLATFORM_WAYLAND_KHR) || \ + defined(VK_USE_PLATFORM_XCB_KHR) || \ + defined(VK_USE_PLATFORM_XLIB_KHR) || \ + defined(VK_USE_PLATFORM_DISPLAY_KHR) +#define V3DV_USE_WSI_PLATFORM +#endif static const struct vk_instance_extension_table instance_extensions = { .KHR_device_group_creation = true, #ifdef VK_USE_PLATFORM_DISPLAY_KHR .KHR_display = true, + .KHR_get_display_properties2 = true, + .EXT_direct_mode_display = true, + .EXT_acquire_drm_display = true, #endif .KHR_external_fence_capabilities = true, .KHR_external_memory_capabilities = true, .KHR_external_semaphore_capabilities = true, - .KHR_get_display_properties2 = true, .KHR_get_physical_device_properties2 = true, -#ifdef V3DV_HAS_SURFACE +#ifdef V3DV_USE_WSI_PLATFORM .KHR_get_surface_capabilities2 = true, .KHR_surface = true, + .KHR_surface_protected_capabilities = true, + .EXT_surface_maintenance1 = true, + .EXT_swapchain_colorspace = true, #endif #ifdef VK_USE_PLATFORM_WAYLAND_KHR .KHR_wayland_surface = true, @@ -104,7 +121,14 @@ static const struct vk_instance_extension_table instance_extensions = { #ifdef VK_USE_PLATFORM_XLIB_KHR .KHR_xlib_surface = true, #endif +#ifdef VK_USE_PLATFORM_XLIB_XRANDR_EXT + .EXT_acquire_xlib_display = true, +#endif +#ifndef VK_USE_PLATFORM_WIN32_KHR + .EXT_headless_surface = true, +#endif .EXT_debug_report = true, + .EXT_debug_utils = true, }; static void @@ -112,43 +136,354 @@ get_device_extensions(const struct v3dv_physical_device *device, struct vk_device_extension_table *ext) { *ext = (struct vk_device_extension_table) { - .KHR_bind_memory2 = true, - .KHR_copy_commands2 = true, - .KHR_dedicated_allocation = true, - .KHR_device_group = true, - .KHR_descriptor_update_template = true, - .KHR_external_fence = true, - .KHR_external_fence_fd = true, - .KHR_external_memory = true, - .KHR_external_memory_fd = true, - .KHR_external_semaphore = true, - .KHR_external_semaphore_fd = true, - .KHR_get_memory_requirements2 = true, - .KHR_image_format_list = true, - .KHR_relaxed_block_layout = true, - .KHR_maintenance1 = true, - .KHR_maintenance2 = true, - .KHR_maintenance3 = true, - .KHR_multiview = true, - .KHR_shader_non_semantic_info = true, - .KHR_sampler_mirror_clamp_to_edge = true, - .KHR_storage_buffer_storage_class = true, - .KHR_uniform_buffer_standard_layout = true, -#ifdef V3DV_HAS_SURFACE - .KHR_swapchain = true, - .KHR_incremental_present = true, + .KHR_8bit_storage = true, + .KHR_16bit_storage = true, + .KHR_bind_memory2 = true, + .KHR_buffer_device_address = true, + .KHR_copy_commands2 = true, + .KHR_create_renderpass2 = true, + .KHR_dedicated_allocation = true, + .KHR_device_group = true, + .KHR_driver_properties = true, + .KHR_descriptor_update_template = true, + .KHR_depth_stencil_resolve = true, + .KHR_dynamic_rendering = true, + .KHR_external_fence = true, + .KHR_external_fence_fd = true, + .KHR_external_memory = true, + .KHR_external_memory_fd = true, + .KHR_external_semaphore = true, + .KHR_external_semaphore_fd = true, + .KHR_format_feature_flags2 = true, + .KHR_get_memory_requirements2 = true, + .KHR_image_format_list = true, + .KHR_imageless_framebuffer = true, + .KHR_index_type_uint8 = true, + .KHR_line_rasterization = true, + .KHR_load_store_op_none = true, + .KHR_performance_query = device->caps.perfmon, + .KHR_relaxed_block_layout = true, + .KHR_maintenance1 = 
true,
+ .KHR_maintenance2 = true,
+ .KHR_maintenance3 = true,
+ .KHR_maintenance4 = true,
+ .KHR_multiview = true,
+ .KHR_pipeline_executable_properties = true,
+ .KHR_separate_depth_stencil_layouts = true,
+ .KHR_shader_expect_assume = true,
+ .KHR_shader_float_controls = true,
+ .KHR_shader_non_semantic_info = true,
+ .KHR_sampler_mirror_clamp_to_edge = true,
+ .KHR_sampler_ycbcr_conversion = true,
+ .KHR_spirv_1_4 = true,
+ .KHR_storage_buffer_storage_class = true,
+ .KHR_timeline_semaphore = true,
+ .KHR_uniform_buffer_standard_layout = true,
+ .KHR_shader_integer_dot_product = true,
+ .KHR_shader_terminate_invocation = true,
+ .KHR_synchronization2 = true,
+ .KHR_workgroup_memory_explicit_layout = true,
+#ifdef V3DV_USE_WSI_PLATFORM
+ .KHR_swapchain = true,
+ .KHR_swapchain_mutable_format = true,
+ .KHR_incremental_present = true,
+#endif
+ .KHR_variable_pointers = true,
+ .KHR_vertex_attribute_divisor = true,
+ .KHR_vulkan_memory_model = true,
+ .KHR_zero_initialize_workgroup_memory = true,
+ .EXT_4444_formats = true,
+ .EXT_attachment_feedback_loop_layout = true,
+ .EXT_border_color_swizzle = true,
+ .EXT_color_write_enable = true,
+ .EXT_custom_border_color = true,
+ .EXT_depth_clip_control = true,
+ .EXT_depth_clip_enable = device->devinfo.ver >= 71,
+ .EXT_load_store_op_none = true,
+ .EXT_inline_uniform_block = true,
+ .EXT_extended_dynamic_state = true,
+ .EXT_external_memory_dma_buf = true,
+ .EXT_host_query_reset = true,
+ .EXT_image_drm_format_modifier = true,
+ .EXT_image_robustness = true,
+ .EXT_index_type_uint8 = true,
+ .EXT_line_rasterization = true,
+ .EXT_memory_budget = true,
+ .EXT_multi_draw = true,
+ .EXT_physical_device_drm = true,
+ .EXT_pipeline_creation_cache_control = true,
+ .EXT_pipeline_creation_feedback = true,
+ .EXT_pipeline_robustness = true,
+ .EXT_primitive_topology_list_restart = true,
+ .EXT_private_data = true,
+ .EXT_provoking_vertex = true,
+ .EXT_separate_stencil_usage = true,
+ .EXT_shader_demote_to_helper_invocation = true,
+ .EXT_shader_module_identifier = true,
+ .EXT_subgroup_size_control = true,
+#ifdef V3DV_USE_WSI_PLATFORM
+ .EXT_swapchain_maintenance1 = true,
+#endif
+ .EXT_texel_buffer_alignment = true,
+ .EXT_tooling_info = true,
+ .EXT_vertex_attribute_divisor = true,
+#if DETECT_OS_ANDROID
+ .ANDROID_external_memory_android_hardware_buffer = true,
+ .ANDROID_native_buffer = true,
+ .EXT_queue_family_foreign = true,
+#endif
+ };
+}
+
+static void
+get_features(const struct v3dv_physical_device *physical_device,
+ struct vk_features *features)
+{
+ *features = (struct vk_features) {
+ /* Vulkan 1.0 */
+ .robustBufferAccess = true, /* This feature is mandatory */
+ .fullDrawIndexUint32 = physical_device->devinfo.ver >= 71,
+ .imageCubeArray = true,
+ .independentBlend = true,
+ .geometryShader = true,
+ .tessellationShader = false,
+ .sampleRateShading = true,
+ .dualSrcBlend = false,
+ .logicOp = true,
+ .multiDrawIndirect = false,
+ .drawIndirectFirstInstance = true,
+ .depthClamp = physical_device->devinfo.ver >= 71,
+ .depthBiasClamp = true,
+ .fillModeNonSolid = true,
+ .depthBounds = physical_device->devinfo.ver >= 71,
+ .wideLines = true,
+ .largePoints = true,
+ .alphaToOne = true,
+ .multiViewport = false,
+ .samplerAnisotropy = true,
+ .textureCompressionETC2 = true,
+ .textureCompressionASTC_LDR = true,
+ /* Note that textureCompressionBC requires that the driver support all
+ * the BC formats. V3D 4.2 only supports BC1-3, so we can't claim
+ * that we support it.
+ */
+ .textureCompressionBC = false,
+ .occlusionQueryPrecise = true,
+ .pipelineStatisticsQuery = false,
+ .vertexPipelineStoresAndAtomics = true,
+ .fragmentStoresAndAtomics = true,
+ .shaderTessellationAndGeometryPointSize = true,
+ .shaderImageGatherExtended = true,
+ .shaderStorageImageExtendedFormats = true,
+ .shaderStorageImageMultisample = false,
+ .shaderStorageImageReadWithoutFormat = true,
+ .shaderStorageImageWriteWithoutFormat = false,
+ .shaderUniformBufferArrayDynamicIndexing = false,
+ .shaderSampledImageArrayDynamicIndexing = false,
+ .shaderStorageBufferArrayDynamicIndexing = false,
+ .shaderStorageImageArrayDynamicIndexing = false,
+ .shaderClipDistance = true,
+ .shaderCullDistance = false,
+ .shaderFloat64 = false,
+ .shaderInt64 = false,
+ .shaderInt16 = false,
+ .shaderResourceResidency = false,
+ .shaderResourceMinLod = false,
+ .sparseBinding = false,
+ .sparseResidencyBuffer = false,
+ .sparseResidencyImage2D = false,
+ .sparseResidencyImage3D = false,
+ .sparseResidency2Samples = false,
+ .sparseResidency4Samples = false,
+ .sparseResidency8Samples = false,
+ .sparseResidency16Samples = false,
+ .sparseResidencyAliased = false,
+ .variableMultisampleRate = false,
+ .inheritedQueries = true,
+
+ /* Vulkan 1.1 */
+ .storageBuffer16BitAccess = true,
+ .uniformAndStorageBuffer16BitAccess = true,
+ .storagePushConstant16 = true,
+ .storageInputOutput16 = false,
+ .multiview = true,
+ .multiviewGeometryShader = false,
+ .multiviewTessellationShader = false,
+ .variablePointersStorageBuffer = true,
+ /* FIXME: this needs support for non-constant index on UBO/SSBO */
+ .variablePointers = false,
+ .protectedMemory = false,
+ .samplerYcbcrConversion = true,
+ .shaderDrawParameters = false,
+
+ /* Vulkan 1.2 */
+ .hostQueryReset = true,
+ .uniformAndStorageBuffer8BitAccess = true,
+ .uniformBufferStandardLayout = true,
+ /* V3D 4.2 wraps TMU vector accesses to 16-byte boundaries, so loads and
+ * stores of vectors that cross these boundaries would not work correctly
+ * with scalarBlockLayout and would need to be split into smaller vectors
+ * (and/or scalars) that don't cross these boundaries. For load/stores
+ * with dynamic offsets where we can't identify if the offset is
+ * problematic, we would always have to scalarize. Overall, this would
+ * not lead to the best performance, so let's just not support it.
+ */
+ .scalarBlockLayout = physical_device->devinfo.ver >= 71,
+ /* This tells applications two things:
+ *
+ * 1. If they can select just one aspect for barriers. For us barriers
+ * decide if we need to split a job and we don't care if it is only
+ * for one of the aspects of the image or both, so we don't really
+ * benefit from seeing barriers that select just one aspect.
+ *
+ * 2. If they can program different layouts for each aspect. We
+ * generally don't care about layouts, so again, we don't get any
+ * benefits from this to limit the scope of image layout transitions.
+ *
+ * Still, Vulkan 1.2 requires this feature to be supported so we
+ * advertise it even though we don't really take advantage of it.
+ */
+ .separateDepthStencilLayouts = true,
+ .storageBuffer8BitAccess = true,
+ .storagePushConstant8 = true,
+ .imagelessFramebuffer = true,
+ .timelineSemaphore = true,
+
+ .samplerMirrorClampToEdge = true,
+
+ /* Extended subgroup types support is mandated by Vulkan 1.2; however, it
+ * only takes effect if the implementation supports non-32-bit types,
+ * which we don't, so in practice setting it to true doesn't have any
+ * implications for us.
+      .shaderSubgroupExtendedTypes = true,
+      .subgroupBroadcastDynamicId = true,
+
+      .vulkanMemoryModel = true,
+      .vulkanMemoryModelDeviceScope = true,
+      .vulkanMemoryModelAvailabilityVisibilityChains = true,
+
+      .bufferDeviceAddress = true,
+      .bufferDeviceAddressCaptureReplay = false,
+      .bufferDeviceAddressMultiDevice = false,
+
+      /* Vulkan 1.3 */
+      .inlineUniformBlock = true,
+      /* Inline buffers work like push constants, so after they are bound,
+       * some of their contents may be copied into the uniform stream as soon
+       * as the next draw/dispatch is recorded in the command buffer. This
+       * means that if the client updates the buffer contents after binding
+       * it to a command buffer, the next queue submit of that command buffer
+       * may not use the latest update to the buffer contents, but the data
+       * that was present in the buffer at the time it was bound to the
+       * command buffer.
+       */
+      .descriptorBindingInlineUniformBlockUpdateAfterBind = false,
+      .pipelineCreationCacheControl = true,
+      .privateData = true,
+      .maintenance4 = true,
+      .shaderZeroInitializeWorkgroupMemory = true,
+      .synchronization2 = true,
+      .robustImageAccess = true,
+      .shaderIntegerDotProduct = true,
+
+      /* VK_EXT_4444_formats */
+      .formatA4R4G4B4 = true,
+      .formatA4B4G4R4 = true,
+
+      /* VK_EXT_custom_border_color */
+      .customBorderColors = true,
+      .customBorderColorWithoutFormat = false,
+
+      /* VK_EXT_index_type_uint8 */
+      .indexTypeUint8 = true,
+
+      /* VK_EXT_line_rasterization */
+      .rectangularLines = true,
+      .bresenhamLines = true,
+      .smoothLines = true,
+      .stippledRectangularLines = false,
+      .stippledBresenhamLines = false,
+      .stippledSmoothLines = false,
+
+      /* VK_EXT_color_write_enable */
+      .colorWriteEnable = true,
+
+      /* VK_EXT_extended_dynamic_state */
+      .extendedDynamicState = true,
+
+      /* VK_KHR_pipeline_executable_properties */
+      .pipelineExecutableInfo = true,
+
+      /* VK_EXT_provoking_vertex */
+      .provokingVertexLast = true,
+      /* FIXME: update when supporting EXT_transform_feedback */
+      .transformFeedbackPreservesProvokingVertex = false,
+
+      /* VK_EXT_vertex_attribute_divisor */
+      .vertexAttributeInstanceRateDivisor = true,
+      .vertexAttributeInstanceRateZeroDivisor = false,
+
+      /* VK_KHR_performance_query */
+      .performanceCounterQueryPools = physical_device->caps.perfmon,
+      .performanceCounterMultipleQueryPools = false,
+
+      /* VK_EXT_texel_buffer_alignment */
+      .texelBufferAlignment = true,
+
+      /* VK_KHR_workgroup_memory_explicit_layout */
+      .workgroupMemoryExplicitLayout = true,
+      .workgroupMemoryExplicitLayoutScalarBlockLayout = false,
+      .workgroupMemoryExplicitLayout8BitAccess = true,
+      .workgroupMemoryExplicitLayout16BitAccess = true,
+
+      /* VK_EXT_border_color_swizzle */
+      .borderColorSwizzle = true,
+      .borderColorSwizzleFromImage = true,
+
+      /* VK_EXT_shader_module_identifier */
+      .shaderModuleIdentifier = true,
+
+      /* VK_EXT_depth_clip_control */
+      .depthClipControl = true,
+
+      /* VK_EXT_depth_clip_enable */
+      .depthClipEnable = physical_device->devinfo.ver >= 71,
+
+      /* VK_EXT_attachment_feedback_loop_layout */
+      .attachmentFeedbackLoopLayout = true,
+
+      /* VK_EXT_primitive_topology_list_restart */
+      .primitiveTopologyListRestart = true,
+      /* FIXME: we don't support tessellation shaders yet */
+      .primitiveTopologyPatchListRestart = false,
+
+      /* VK_EXT_pipeline_robustness */
+      .pipelineRobustness = true,
+
+      /* VK_EXT_multi_draw */
+      .multiDraw = true,
+
+      /* VK_KHR_shader_terminate_invocation */
+      .shaderTerminateInvocation = true,
+
+      /* VK_EXT_shader_demote_to_helper_invocation */
+      .shaderDemoteToHelperInvocation = true,
+
+      /* VK_EXT_subgroup_size_control */
+      .subgroupSizeControl = true,
+      .computeFullSubgroups = true,
+
+      /* VK_KHR_shader_expect_assume */
+      .shaderExpectAssume = true,
+
+      /* VK_KHR_dynamic_rendering */
+      .dynamicRendering = true,
+
+#ifdef V3DV_USE_WSI_PLATFORM
+      /* VK_EXT_swapchain_maintenance1 */
+      .swapchainMaintenance1 = true,
 #endif
-      .KHR_variable_pointers = true,
-      .EXT_color_write_enable = true,
-      .EXT_custom_border_color = true,
-      .EXT_external_memory_dma_buf = true,
-      .EXT_index_type_uint8 = true,
-      .EXT_physical_device_drm = true,
-      .EXT_pipeline_creation_cache_control = true,
-      .EXT_pipeline_creation_feedback = true,
-      .EXT_private_data = true,
-      .EXT_provoking_vertex = true,
-      .EXT_vertex_attribute_divisor = true,
    };
 }
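get_features() above replaces the hand-rolled vkGetPhysicalDeviceFeatures/Features2 entry points that are deleted later in this patch: the driver now fills a single vk_features table once and lets the common runtime answer the per-structure queries. As a short sketch of how an application would observe one of the version-gated bits, say scalarBlockLayout (core Vulkan 1.2 API; only the helper name is ours):

#include <stdbool.h>
#include <vulkan/vulkan.h>

/* Query the Vulkan 1.2 feature struct; per the table above, on v3dv
 * scalarBlockLayout reads back as VK_TRUE only on V3D 7.1+ hardware. */
static bool
supports_scalar_block_layout(VkPhysicalDevice pdev)
{
   VkPhysicalDeviceVulkan12Features vk12 = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES,
   };
   VkPhysicalDeviceFeatures2 features = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
      .pNext = &vk12,
   };
   vkGetPhysicalDeviceFeatures2(pdev, &features);
   return vk12.scalarBlockLayout == VK_TRUE;
}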
@@ -165,6 +500,10 @@ v3dv_EnumerateInstanceExtensionProperties(const char *pLayerName,
                                              &instance_extensions,
                                              pPropertyCount, pProperties);
 }
 
+static VkResult enumerate_devices(struct vk_instance *vk_instance);
+
+static void destroy_physical_device(struct vk_physical_device *device);
+
 VKAPI_ATTR VkResult VKAPI_CALL
 v3dv_CreateInstance(const VkInstanceCreateInfo *pCreateInfo,
                     const VkAllocationCallbacks *pAllocator,
@@ -186,6 +525,8 @@ v3dv_CreateInstance(const VkInstanceCreateInfo *pCreateInfo,
    struct vk_instance_dispatch_table dispatch_table;
    vk_instance_dispatch_table_from_entrypoints(
       &dispatch_table, &v3dv_instance_entrypoints, true);
+   vk_instance_dispatch_table_from_entrypoints(
+      &dispatch_table, &wsi_instance_entrypoints, false);
 
    result = vk_instance_init(&instance->vk,
                              &instance_extensions,
@@ -194,12 +535,13 @@ v3dv_CreateInstance(const VkInstanceCreateInfo *pCreateInfo,
 
    if (result != VK_SUCCESS) {
       vk_free(pAllocator, instance);
-      return vk_error(instance, result);
+      return vk_error(NULL, result);
    }
 
    v3d_process_debug_variable();
 
-   instance->physicalDeviceCount = -1;
+   instance->vk.physical_devices.enumerate = enumerate_devices;
+   instance->vk.physical_devices.destroy = destroy_physical_device;
 
    /* We start with the default values for the pipeline_cache envvars */
    instance->pipeline_cache_enabled = true;
@@ -229,8 +571,6 @@ v3dv_CreateInstance(const VkInstanceCreateInfo *pCreateInfo,
       }
    }
 
-   util_cpu_detect();
-
    VG(VALGRIND_CREATE_MEMPOOL(instance, 0, false));
 
    *pInstance = v3dv_instance_to_handle(instance);
@@ -256,11 +596,11 @@ physical_device_finish(struct v3dv_physical_device *device)
    v3dv_physical_device_free_disk_cache(device);
    v3d_compiler_free(device->compiler);
 
+   util_sparse_array_finish(&device->bo_map);
+
    close(device->render_fd);
    if (device->display_fd >= 0)
       close(device->display_fd);
-   if (device->master_fd >= 0)
-      close(device->master_fd);
 
    free(device->name);
 
@@ -272,6 +612,13 @@ physical_device_finish(struct v3dv_physical_device *device)
    mtx_destroy(&device->mutex);
 }
 
+static void
+destroy_physical_device(struct vk_physical_device *device)
+{
+   physical_device_finish((struct v3dv_physical_device *)device);
+   vk_free(&device->instance->alloc, device);
+}
+
 VKAPI_ATTR void VKAPI_CALL
 v3dv_DestroyInstance(VkInstance _instance,
                      const VkAllocationCallbacks *pAllocator)
@@ -281,12 +628,6 @@ v3dv_DestroyInstance(VkInstance _instance,
    if (!instance)
       return;
 
-   if (instance->physicalDeviceCount > 0) {
-      /* We support at most one physical device.
*/ - assert(instance->physicalDeviceCount == 1); - physical_device_finish(&instance->physicalDevice); - } - VG(VALGRIND_DESTROY_MEMPOOL(instance)); vk_instance_finish(&instance->vk); @@ -306,286 +647,39 @@ compute_heap_size() uint64_t total_ram = (uint64_t) v3d_simulator_get_mem_size(); #endif - /* We don't want to burn too much ram with the GPU. If the user has 4GiB - * or less, we use at most half. If they have more than 4GiB, we use 3/4. + /* We don't want to burn too much ram with the GPU. If the user has 4GB + * or less, we use at most half. If they have more than 4GB we limit it + * to 3/4 with a max. of 4GB since the GPU cannot address more than that. */ - uint64_t available_ram; - if (total_ram <= 4ull * 1024ull * 1024ull * 1024ull) - available_ram = total_ram / 2; + const uint64_t MAX_HEAP_SIZE = 4ull * 1024ull * 1024ull * 1024ull; + uint64_t available; + if (total_ram <= MAX_HEAP_SIZE) + available = total_ram / 2; else - available_ram = total_ram * 3 / 4; - - return available_ram; -} - -#if !using_v3d_simulator -#ifdef VK_USE_PLATFORM_XCB_KHR -static int -create_display_fd_xcb(VkIcdSurfaceBase *surface) -{ - int fd = -1; - - xcb_connection_t *conn; - xcb_dri3_open_reply_t *reply = NULL; - if (surface) { - if (surface->platform == VK_ICD_WSI_PLATFORM_XLIB) - conn = XGetXCBConnection(((VkIcdSurfaceXlib *)surface)->dpy); - else - conn = ((VkIcdSurfaceXcb *)surface)->connection; - } else { - conn = xcb_connect(NULL, NULL); - } - - if (xcb_connection_has_error(conn)) - goto finish; - - const xcb_setup_t *setup = xcb_get_setup(conn); - xcb_screen_iterator_t iter = xcb_setup_roots_iterator(setup); - xcb_screen_t *screen = iter.data; - - xcb_dri3_open_cookie_t cookie; - cookie = xcb_dri3_open(conn, screen->root, None); - reply = xcb_dri3_open_reply(conn, cookie, NULL); - if (!reply) - goto finish; - - if (reply->nfd != 1) - goto finish; - - fd = xcb_dri3_open_reply_fds(conn, reply)[0]; - fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC); - -finish: - if (!surface) - xcb_disconnect(conn); - if (reply) - free(reply); - - return fd; -} -#endif - -#ifdef VK_USE_PLATFORM_WAYLAND_KHR -struct v3dv_wayland_info { - struct wl_drm *wl_drm; - int fd; - bool is_set; - bool authenticated; -}; - -static void -v3dv_drm_handle_device(void *data, struct wl_drm *drm, const char *device) -{ - struct v3dv_wayland_info *info = data; - info->fd = open(device, O_RDWR | O_CLOEXEC); - info->is_set = info->fd != -1; - if (!info->is_set) { - fprintf(stderr, "v3dv_drm_handle_device: could not open %s (%s)\n", - device, strerror(errno)); - return; - } - - drm_magic_t magic; - if (drmGetMagic(info->fd, &magic)) { - fprintf(stderr, "v3dv_drm_handle_device: drmGetMagic failed\n"); - close(info->fd); - info->fd = -1; - info->is_set = false; - return; - } - wl_drm_authenticate(info->wl_drm, magic); -} - -static void -v3dv_drm_handle_format(void *data, struct wl_drm *drm, uint32_t format) -{ -} - -static void -v3dv_drm_handle_authenticated(void *data, struct wl_drm *drm) -{ - struct v3dv_wayland_info *info = data; - info->authenticated = true; -} - -static void -v3dv_drm_handle_capabilities(void *data, struct wl_drm *drm, uint32_t value) -{ -} - -struct wl_drm_listener v3dv_drm_listener = { - .device = v3dv_drm_handle_device, - .format = v3dv_drm_handle_format, - .authenticated = v3dv_drm_handle_authenticated, - .capabilities = v3dv_drm_handle_capabilities -}; - -static void -v3dv_registry_global(void *data, - struct wl_registry *registry, - uint32_t name, - const char *interface, - uint32_t version) -{ - struct 
v3dv_wayland_info *info = data; - if (strcmp(interface, "wl_drm") == 0) { - info->wl_drm = wl_registry_bind(registry, name, &wl_drm_interface, - MIN2(version, 2)); - wl_drm_add_listener(info->wl_drm, &v3dv_drm_listener, data); - }; -} - -static void -v3dv_registry_global_remove_cb(void *data, - struct wl_registry *registry, - uint32_t name) -{ -} - -static int -create_display_fd_wayland(VkIcdSurfaceBase *surface) -{ - struct wl_display *display; - struct wl_registry *registry = NULL; - - struct v3dv_wayland_info info = { - .wl_drm = NULL, - .fd = -1, - .is_set = false, - .authenticated = false - }; - - if (surface) - display = ((VkIcdSurfaceWayland *) surface)->display; - else - display = wl_display_connect(NULL); - - if (!display) - return -1; - - registry = wl_display_get_registry(display); - if (!registry) { - if (!surface) - wl_display_disconnect(display); - return -1; - } - - static const struct wl_registry_listener registry_listener = { - v3dv_registry_global, - v3dv_registry_global_remove_cb - }; - wl_registry_add_listener(registry, ®istry_listener, &info); - - wl_display_roundtrip(display); /* For the registry advertisement */ - wl_display_roundtrip(display); /* For the DRM device event */ - wl_display_roundtrip(display); /* For the authentication event */ - - wl_drm_destroy(info.wl_drm); - wl_registry_destroy(registry); - - if (!surface) - wl_display_disconnect(display); - - if (!info.is_set) - return -1; - - if (!info.authenticated) - return -1; - - return info.fd; -} -#endif - -/* Acquire an authenticated display fd without a surface reference. This is the - * case where the application is making WSI allocations outside the Vulkan - * swapchain context (only Zink, for now). Since we lack information about the - * underlying surface we just try our best to figure out the correct display - * and platform to use. It should work in most cases. - */ -static void -acquire_display_device_no_surface(struct v3dv_instance *instance, - struct v3dv_physical_device *pdevice) -{ -#ifdef VK_USE_PLATFORM_WAYLAND_KHR - pdevice->display_fd = create_display_fd_wayland(NULL); -#endif - -#ifdef VK_USE_PLATFORM_XCB_KHR - if (pdevice->display_fd == -1) - pdevice->display_fd = create_display_fd_xcb(NULL); -#endif - -#ifdef VK_USE_PLATFORM_DISPLAY_KHR - if (pdevice->display_fd == - 1 && pdevice->master_fd >= 0) - pdevice->display_fd = dup(pdevice->master_fd); -#endif -} + available = MIN2(MAX_HEAP_SIZE, total_ram * 3 / 4); -/* Acquire an authenticated display fd from the surface. This is the regular - * case where the application is using swapchains to create WSI allocations. - * In this case we use the surface information to figure out the correct - * display and platform combination. - */ -static void -acquire_display_device_surface(struct v3dv_instance *instance, - struct v3dv_physical_device *pdevice, - VkIcdSurfaceBase *surface) -{ - /* Mesa will set both of VK_USE_PLATFORM_{XCB,XLIB} when building with - * platform X11, so only check for XCB and rely on XCB to get an - * authenticated device also for Xlib. 
- */ -#ifdef VK_USE_PLATFORM_XCB_KHR - if (surface->platform == VK_ICD_WSI_PLATFORM_XCB || - surface->platform == VK_ICD_WSI_PLATFORM_XLIB) { - pdevice->display_fd = create_display_fd_xcb(surface); - } -#endif - -#ifdef VK_USE_PLATFORM_WAYLAND_KHR - if (surface->platform == VK_ICD_WSI_PLATFORM_WAYLAND) - pdevice->display_fd = create_display_fd_wayland(surface); -#endif - -#ifdef VK_USE_PLATFORM_DISPLAY_KHR - if (surface->platform == VK_ICD_WSI_PLATFORM_DISPLAY && - pdevice->master_fd >= 0) { - pdevice->display_fd = dup(pdevice->master_fd); - } -#endif + return available; } -#endif /* !using_v3d_simulator */ -/* Attempts to get an authenticated display fd from the display server that - * we can use to allocate BOs for presentable images. - */ -VkResult -v3dv_physical_device_acquire_display(struct v3dv_instance *instance, - struct v3dv_physical_device *pdevice, - VkIcdSurfaceBase *surface) +static uint64_t +compute_memory_budget(struct v3dv_physical_device *device) { - VkResult result = VK_SUCCESS; - mtx_lock(&pdevice->mutex); - - if (pdevice->display_fd != -1) - goto done; - - /* When running on the simulator we do everything on a single render node so - * we don't need to get an authenticated display fd from the display server. - */ + uint64_t heap_size = device->memory.memoryHeaps[0].size; + uint64_t heap_used = device->heap_used; + uint64_t sys_available; #if !using_v3d_simulator - if (surface) - acquire_display_device_surface(instance, pdevice, surface); - else - acquire_display_device_no_surface(instance, pdevice); - - if (pdevice->display_fd == -1) - result = VK_ERROR_INITIALIZATION_FAILED; + ASSERTED bool has_available_memory = + os_get_available_system_memory(&sys_available); + assert(has_available_memory); +#else + sys_available = (uint64_t) v3d_simulator_get_mem_free(); #endif -done: - mtx_unlock(&pdevice->mutex); - return result; + /* Let's not incite the app to starve the system: report at most 90% of + * available system memory. + */ + uint64_t heap_available = sys_available * 9 / 10; + return MIN2(heap_size, heap_used + heap_available); } static bool @@ -604,7 +698,8 @@ device_has_expected_features(struct v3dv_physical_device *device) { return v3d_has_feature(device, DRM_V3D_PARAM_SUPPORTS_TFU) && v3d_has_feature(device, DRM_V3D_PARAM_SUPPORTS_CSD) && - v3d_has_feature(device, DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH); + v3d_has_feature(device, DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH) && + device->caps.multisync; } @@ -614,14 +709,14 @@ init_uuids(struct v3dv_physical_device *device) const struct build_id_note *note = build_id_find_nhdr_for_addr(init_uuids); if (!note) { - return vk_errorf((struct v3dv_instance*) device->vk.instance, + return vk_errorf(device->vk.instance, VK_ERROR_INITIALIZATION_FAILED, "Failed to find build-id"); } unsigned build_id_len = build_id_length(note); if (build_id_len < 20) { - return vk_errorf((struct v3dv_instance*) device->vk.instance, + return vk_errorf(device->vk.instance, VK_ERROR_INITIALIZATION_FAILED, "build-id too short. 
It needs to be a SHA"); } @@ -672,38 +767,46 @@ v3dv_physical_device_init_disk_cache(struct v3dv_physical_device *device) _mesa_sha1_format(timestamp, device->driver_build_sha1); assert(device->name); - device->disk_cache = disk_cache_create(device->name, timestamp, 0); + device->disk_cache = disk_cache_create(device->name, timestamp, v3d_mesa_debug); #else device->disk_cache = NULL; #endif } static VkResult -physical_device_init(struct v3dv_physical_device *device, - struct v3dv_instance *instance, - drmDevicePtr drm_render_device, - drmDevicePtr drm_primary_device) +create_physical_device(struct v3dv_instance *instance, + drmDevicePtr gpu_device, + drmDevicePtr display_device) { VkResult result = VK_SUCCESS; - int32_t master_fd = -1; + int32_t display_fd = -1; int32_t render_fd = -1; + struct v3dv_physical_device *device = + vk_zalloc(&instance->vk.alloc, sizeof(*device), 8, + VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + + if (!device) + return vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY); + struct vk_physical_device_dispatch_table dispatch_table; vk_physical_device_dispatch_table_from_entrypoints (&dispatch_table, &v3dv_physical_device_entrypoints, true); + vk_physical_device_dispatch_table_from_entrypoints( + &dispatch_table, &wsi_physical_device_entrypoints, false); - result = vk_physical_device_init(&device->vk, &instance->vk, NULL, - &dispatch_table); + result = vk_physical_device_init(&device->vk, &instance->vk, NULL, NULL, + NULL, &dispatch_table); if (result != VK_SUCCESS) goto fail; - assert(drm_render_device); - const char *path = drm_render_device->nodes[DRM_NODE_RENDER]; + assert(gpu_device); + const char *path = gpu_device->nodes[DRM_NODE_RENDER]; render_fd = open(path, O_RDWR | O_CLOEXEC); if (render_fd < 0) { fprintf(stderr, "Opening %s failed: %s\n", path, strerror(errno)); - result = VK_ERROR_INCOMPATIBLE_DRIVER; + result = VK_ERROR_INITIALIZATION_FAILED; goto fail; } @@ -714,12 +817,12 @@ physical_device_init(struct v3dv_physical_device *device, const char *primary_path; #if !using_v3d_simulator - if (drm_primary_device) - primary_path = drm_primary_device->nodes[DRM_NODE_PRIMARY]; + if (display_device) + primary_path = display_device->nodes[DRM_NODE_PRIMARY]; else primary_path = NULL; #else - primary_path = drm_render_device->nodes[DRM_NODE_PRIMARY]; + primary_path = gpu_device->nodes[DRM_NODE_PRIMARY]; #endif struct stat primary_stat = {0}, render_stat = {0}; @@ -727,8 +830,7 @@ physical_device_init(struct v3dv_physical_device *device, device->has_primary = primary_path; if (device->has_primary) { if (stat(primary_path, &primary_stat) != 0) { - result = vk_errorf(instance, - VK_ERROR_INITIALIZATION_FAILED, + result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, "failed to stat DRM primary node %s", primary_path); goto fail; @@ -738,8 +840,7 @@ physical_device_init(struct v3dv_physical_device *device, } if (fstat(render_fd, &render_stat) != 0) { - result = vk_errorf(instance, - VK_ERROR_INITIALIZATION_FAILED, + result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, "failed to stat DRM render node %s", path); goto fail; @@ -747,16 +848,24 @@ physical_device_init(struct v3dv_physical_device *device, device->has_render = true; device->render_devid = render_stat.st_rdev; - if (instance->vk.enabled_extensions.KHR_display) { +#if using_v3d_simulator + device->device_id = gpu_device->deviceinfo.pci->device_id; +#endif + + if (instance->vk.enabled_extensions.KHR_display || + instance->vk.enabled_extensions.KHR_xcb_surface || + 
instance->vk.enabled_extensions.KHR_xlib_surface || + instance->vk.enabled_extensions.KHR_wayland_surface || + instance->vk.enabled_extensions.EXT_acquire_drm_display) { #if !using_v3d_simulator /* Open the primary node on the vc4 display device */ - assert(drm_primary_device); - master_fd = open(primary_path, O_RDWR | O_CLOEXEC); + assert(display_device); + display_fd = open(primary_path, O_RDWR | O_CLOEXEC); #else /* There is only one device with primary and render nodes. * Open its primary node. */ - master_fd = open(primary_path, O_RDWR | O_CLOEXEC); + display_fd = open(primary_path, O_RDWR | O_CLOEXEC); #endif } @@ -765,21 +874,32 @@ physical_device_init(struct v3dv_physical_device *device, #endif device->render_fd = render_fd; /* The v3d render node */ - device->display_fd = -1; /* Authenticated vc4 primary node */ - device->master_fd = master_fd; /* Master vc4 primary node */ + device->display_fd = display_fd; /* Master vc4 primary node */ if (!v3d_get_device_info(device->render_fd, &device->devinfo, &v3dv_ioctl)) { - result = VK_ERROR_INCOMPATIBLE_DRIVER; + result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, + "Failed to get info from device."); goto fail; } if (device->devinfo.ver < 42) { - result = VK_ERROR_INCOMPATIBLE_DRIVER; + result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, + "Device version < 42."); goto fail; } + device->caps.cpu_queue = + v3d_has_feature(device, DRM_V3D_PARAM_SUPPORTS_CPU_QUEUE); + + device->caps.multisync = + v3d_has_feature(device, DRM_V3D_PARAM_SUPPORTS_MULTISYNC_EXT); + + device->caps.perfmon = + v3d_has_feature(device, DRM_V3D_PARAM_SUPPORTS_PERFMON); + if (!device_has_expected_features(device)) { - result = VK_ERROR_INCOMPATIBLE_DRIVER; + result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, + "Kernel driver doesn't have required features."); goto fail; } @@ -787,12 +907,15 @@ physical_device_init(struct v3dv_physical_device *device, if (result != VK_SUCCESS) goto fail; - device->compiler = v3d_compiler_init(&device->devinfo); + device->compiler = v3d_compiler_init(&device->devinfo, + MAX_INLINE_UNIFORM_BUFFERS); device->next_program_id = 0; ASSERTED int len = - asprintf(&device->name, "V3D %d.%d", - device->devinfo.ver / 10, device->devinfo.ver % 10); + asprintf(&device->name, "V3D %d.%d.%d", + device->devinfo.ver / 10, + device->devinfo.ver % 10, + device->devinfo.rev); assert(len != -1); v3dv_physical_device_init_disk_cache(device); @@ -811,7 +934,31 @@ physical_device_init(struct v3dv_physical_device *device, VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; mem->memoryTypes[0].heapIndex = 0; - device->options.merge_jobs = getenv("V3DV_NO_MERGE_JOBS") == NULL; + /* Initialize sparse array for refcounting imported BOs */ + util_sparse_array_init(&device->bo_map, sizeof(struct v3dv_bo), 512); + + device->options.merge_jobs = !V3D_DBG(NO_MERGE_JOBS); + + device->drm_syncobj_type = vk_drm_syncobj_get_type(device->render_fd); + + /* We don't support timelines in the uAPI yet and we don't want it getting + * suddenly turned on by vk_drm_syncobj_get_type() without us adding v3dv + * code for it first. + */ + device->drm_syncobj_type.features &= ~VK_SYNC_FEATURE_TIMELINE; + + /* Multiwait is required for emulated timeline semaphores and is supported + * by the v3d kernel interface. 
+    */
+   device->drm_syncobj_type.features |= VK_SYNC_FEATURE_GPU_MULTI_WAIT;
+
+   device->sync_timeline_type =
+      vk_sync_timeline_get_type(&device->drm_syncobj_type);
+
+   device->sync_types[0] = &device->drm_syncobj_type;
+   device->sync_types[1] = &device->sync_timeline_type.sync;
+   device->sync_types[2] = NULL;
+   device->vk.supported_sync_types = device->sync_types;
 
    result = v3dv_wsi_init(device);
    if (result != VK_SUCCESS) {
@@ -820,35 +967,46 @@ physical_device_init(struct v3dv_physical_device *device,
    }
 
    get_device_extensions(device, &device->vk.supported_extensions);
+   get_features(device, &device->vk.supported_features);
+
+   mtx_init(&device->mutex, mtx_plain);
 
-   pthread_mutex_init(&device->mutex, NULL);
+   list_addtail(&device->vk.link, &instance->vk.physical_devices.list);
 
    return VK_SUCCESS;
 
 fail:
    vk_physical_device_finish(&device->vk);
+   vk_free(&instance->vk.alloc, device);
 
    if (render_fd >= 0)
      close(render_fd);
-   if (master_fd >= 0)
-      close(master_fd);
+   if (display_fd >= 0)
+      close(display_fd);
 
    return result;
 }
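The syncobj setup in create_physical_device() above is what makes the emulated timeline semaphores work: vk_sync_timeline builds timelines out of plain binary syncobjs, which requires being able to wait on several of them at once (hence forcing VK_SYNC_FEATURE_GPU_MULTI_WAIT on). The kernel-side primitive is simply drmSyncobjWait() over multiple handles; a minimal sketch with libdrm follows (the helper and its parameters are illustrative, not driver code):

#include <stdint.h>
#include <xf86drm.h>

/* Illustrative only: block until ALL of the given DRM syncobjs signal.
 * 'fd' is an open render node and the timeout is an absolute
 * CLOCK_MONOTONIC value in nanoseconds. WAIT_FOR_SUBMIT also waits for
 * fences that have not yet been attached to the syncobjs. */
static int
wait_all_syncobjs(int fd, uint32_t *handles, unsigned count,
                  int64_t timeout_abs_ns)
{
   uint32_t first_signaled;
   return drmSyncobjWait(fd, handles, count, timeout_abs_ns,
                         DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL |
                         DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT,
                         &first_signaled);
}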
 
+/* This driver hook is expected to return VK_SUCCESS (unless a memory
+ * allocation error happened) if no compatible device is found. If a
+ * compatible device is found, it may return an error code if device
+ * initialization failed.
+ */
 static VkResult
-enumerate_devices(struct v3dv_instance *instance)
+enumerate_devices(struct vk_instance *vk_instance)
 {
-   /* TODO: Check for more devices? */
+   struct v3dv_instance *instance =
+      container_of(vk_instance, struct v3dv_instance, vk);
+
+   /* FIXME: Check for more devices? */
    drmDevicePtr devices[8];
-   VkResult result = VK_ERROR_INCOMPATIBLE_DRIVER;
    int max_devices;
 
-   instance->physicalDeviceCount = 0;
-
    max_devices = drmGetDevices2(0, devices, ARRAY_SIZE(devices));
    if (max_devices < 1)
-      return VK_ERROR_INCOMPATIBLE_DRIVER;
+      return VK_SUCCESS;
+
+   VkResult result = VK_SUCCESS;
 
 #if !using_v3d_simulator
    int32_t v3d_idx = -1;
@@ -856,25 +1014,24 @@ enumerate_devices(struct v3dv_instance *instance)
 #endif
    for (unsigned i = 0; i < (unsigned)max_devices; i++) {
 #if using_v3d_simulator
-      /* In the simulator, we look for an Intel render node */
+      /* In the simulator, we look for an Intel/AMD render node */
       const int required_nodes = (1 << DRM_NODE_RENDER) | (1 << DRM_NODE_PRIMARY);
       if ((devices[i]->available_nodes & required_nodes) == required_nodes &&
           devices[i]->bustype == DRM_BUS_PCI &&
-          devices[i]->deviceinfo.pci->vendor_id == 0x8086) {
-         result = physical_device_init(&instance->physicalDevice, instance,
-                                       devices[i], NULL);
-         if (result != VK_ERROR_INCOMPATIBLE_DRIVER)
+          (devices[i]->deviceinfo.pci->vendor_id == 0x8086 ||
+           devices[i]->deviceinfo.pci->vendor_id == 0x1002)) {
+         result = create_physical_device(instance, devices[i], NULL);
+         if (result == VK_SUCCESS)
            break;
       }
 #else
-      /* On actual hardware, we should have a render node (v3d)
-       * and a primary node (vc4). We will need to use the primary
-       * to allocate WSI buffers and share them with the render node
-       * via prime, but that is a privileged operation so we need the
-       * primary node to be authenticated, and for that we need the
-       * display server to provide the device fd (with DRI3), so we
-       * here we only check that the device is present but we don't
-       * try to open it.
+      /* On actual hardware, we should have a gpu device (v3d) and a display
+       * device (vc4). We will need to use the display device to allocate WSI
+       * buffers and share them with the render node via prime, but that is a
+       * privileged operation, so we need to have an authenticated display fd,
+       * and for that we need the display server to provide it (with DRI3),
+       * so here we only check that the device is present but we don't try to
+       * open it.
        */
       if (devices[i]->bustype != DRM_BUS_PLATFORM)
         continue;
@@ -882,7 +1039,8 @@ enumerate_devices(struct v3dv_instance *instance)
      if (devices[i]->available_nodes & 1 << DRM_NODE_RENDER) {
         char **compat = devices[i]->deviceinfo.platform->compatible;
         while (*compat) {
-           if (strncmp(*compat, "brcm,2711-v3d", 13) == 0) {
+           if (strncmp(*compat, "brcm,2711-v3d", 13) == 0 ||
+               strncmp(*compat, "brcm,2712-v3d", 13) == 0) {
               v3d_idx = i;
              break;
           }
@@ -891,8 +1049,9 @@ enumerate_devices(struct v3dv_instance *instance)
      } else if (devices[i]->available_nodes & 1 << DRM_NODE_PRIMARY) {
         char **compat = devices[i]->deviceinfo.platform->compatible;
         while (*compat) {
-           if (strncmp(*compat, "brcm,bcm2711-vc5", 16) == 0 ||
-               strncmp(*compat, "brcm,bcm2835-vc4", 16) == 0 ) {
+           if (strncmp(*compat, "brcm,bcm2712-vc6", 16) == 0 ||
+               strncmp(*compat, "brcm,bcm2711-vc5", 16) == 0 ||
+               strncmp(*compat, "brcm,bcm2835-vc4", 16) == 0) {
               vc4_idx = i;
               break;
            }
@@ -903,345 +1062,35 @@ enumerate_devices(struct v3dv_instance *instance)
    }
 
 #if !using_v3d_simulator
-   if (v3d_idx == -1 || vc4_idx == -1)
-      result = VK_ERROR_INCOMPATIBLE_DRIVER;
-   else
-      result = physical_device_init(&instance->physicalDevice, instance,
-                                    devices[v3d_idx], devices[vc4_idx]);
+   if (v3d_idx != -1) {
+      drmDevicePtr v3d_device = devices[v3d_idx];
+      drmDevicePtr vc4_device = vc4_idx != -1 ? devices[vc4_idx] : NULL;
+      result = create_physical_device(instance, v3d_device, vc4_device);
+   }
 #endif
 
    drmFreeDevices(devices, max_devices);
 
-   if (result == VK_SUCCESS)
-      instance->physicalDeviceCount = 1;
-
    return result;
 }
 
-static VkResult
-instance_ensure_physical_device(struct v3dv_instance *instance)
-{
-   if (instance->physicalDeviceCount < 0) {
-      VkResult result = enumerate_devices(instance);
-      if (result != VK_SUCCESS &&
-          result != VK_ERROR_INCOMPATIBLE_DRIVER)
-         return result;
-   }
-
-   return VK_SUCCESS;
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_EnumeratePhysicalDevices(VkInstance _instance,
-                              uint32_t *pPhysicalDeviceCount,
-                              VkPhysicalDevice *pPhysicalDevices)
-{
-   V3DV_FROM_HANDLE(v3dv_instance, instance, _instance);
-   VK_OUTARRAY_MAKE(out, pPhysicalDevices, pPhysicalDeviceCount);
-
-   VkResult result = instance_ensure_physical_device(instance);
-   if (result != VK_SUCCESS)
-      return result;
-
-   if (instance->physicalDeviceCount == 0)
-      return VK_SUCCESS;
-
-   assert(instance->physicalDeviceCount == 1);
-   vk_outarray_append(&out, i) {
-      *i = v3dv_physical_device_to_handle(&instance->physicalDevice);
-   }
-
-   return vk_outarray_status(&out);
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_EnumeratePhysicalDeviceGroups(
-   VkInstance _instance,
-   uint32_t *pPhysicalDeviceGroupCount,
-   VkPhysicalDeviceGroupProperties *pPhysicalDeviceGroupProperties)
-{
-   V3DV_FROM_HANDLE(v3dv_instance, instance, _instance);
-   VK_OUTARRAY_MAKE(out, pPhysicalDeviceGroupProperties,
-                    pPhysicalDeviceGroupCount);
-
-   VkResult result = instance_ensure_physical_device(instance);
-   if (result != VK_SUCCESS)
-      return result;
-
-   assert(instance->physicalDeviceCount == 1);
-
-   vk_outarray_append(&out, p) {
-      p->physicalDeviceCount = 1;
-      memset(p->physicalDevices, 0, sizeof(p->physicalDevices));
-      p->physicalDevices[0] =
-
v3dv_physical_device_to_handle(&instance->physicalDevice); - p->subsetAllocation = false; - - vk_foreach_struct(ext, p->pNext) - v3dv_debug_ignored_stype(ext->sType); - } - - return vk_outarray_status(&out); -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_GetPhysicalDeviceFeatures(VkPhysicalDevice physicalDevice, - VkPhysicalDeviceFeatures *pFeatures) -{ - memset(pFeatures, 0, sizeof(*pFeatures)); - - *pFeatures = (VkPhysicalDeviceFeatures) { - .robustBufferAccess = true, /* This feature is mandatory */ - .fullDrawIndexUint32 = false, /* Only available since V3D 4.4.9.1 */ - .imageCubeArray = true, - .independentBlend = true, - .geometryShader = true, - .tessellationShader = false, - .sampleRateShading = true, - .dualSrcBlend = false, - .logicOp = true, - .multiDrawIndirect = false, - .drawIndirectFirstInstance = true, - .depthClamp = false, - .depthBiasClamp = true, - .fillModeNonSolid = true, - .depthBounds = false, /* Only available since V3D 4.3.16.2 */ - .wideLines = true, - .largePoints = true, - .alphaToOne = true, - .multiViewport = false, - .samplerAnisotropy = true, - .textureCompressionETC2 = true, - .textureCompressionASTC_LDR = true, - /* Note that textureCompressionBC requires that the driver support all - * the BC formats. V3D 4.2 only support the BC1-3, so we can't claim - * that we support it. - */ - .textureCompressionBC = false, - .occlusionQueryPrecise = true, - .pipelineStatisticsQuery = false, - .vertexPipelineStoresAndAtomics = true, - .fragmentStoresAndAtomics = true, - .shaderTessellationAndGeometryPointSize = true, - .shaderImageGatherExtended = false, - .shaderStorageImageExtendedFormats = true, - .shaderStorageImageMultisample = false, - .shaderStorageImageReadWithoutFormat = false, - .shaderStorageImageWriteWithoutFormat = false, - .shaderUniformBufferArrayDynamicIndexing = false, - .shaderSampledImageArrayDynamicIndexing = false, - .shaderStorageBufferArrayDynamicIndexing = false, - .shaderStorageImageArrayDynamicIndexing = false, - .shaderClipDistance = true, - .shaderCullDistance = false, - .shaderFloat64 = false, - .shaderInt64 = false, - .shaderInt16 = false, - .shaderResourceResidency = false, - .shaderResourceMinLod = false, - .sparseBinding = false, - .sparseResidencyBuffer = false, - .sparseResidencyImage2D = false, - .sparseResidencyImage3D = false, - .sparseResidency2Samples = false, - .sparseResidency4Samples = false, - .sparseResidency8Samples = false, - .sparseResidency16Samples = false, - .sparseResidencyAliased = false, - .variableMultisampleRate = false, - .inheritedQueries = true, - }; -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice, - VkPhysicalDeviceFeatures2 *pFeatures) -{ - v3dv_GetPhysicalDeviceFeatures(physicalDevice, &pFeatures->features); - - VkPhysicalDeviceVulkan11Features vk11 = { - .storageBuffer16BitAccess = false, - .uniformAndStorageBuffer16BitAccess = false, - .storagePushConstant16 = false, - .storageInputOutput16 = false, - .multiview = true, - .multiviewGeometryShader = false, - .multiviewTessellationShader = false, - .variablePointersStorageBuffer = true, - /* FIXME: this needs support for non-constant index on UBO/SSBO */ - .variablePointers = false, - .protectedMemory = false, - .samplerYcbcrConversion = false, - .shaderDrawParameters = false, - }; - - vk_foreach_struct(ext, pFeatures->pNext) { - switch (ext->sType) { - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CUSTOM_BORDER_COLOR_FEATURES_EXT: { - VkPhysicalDeviceCustomBorderColorFeaturesEXT *features = - 
(VkPhysicalDeviceCustomBorderColorFeaturesEXT *)ext; - features->customBorderColors = true; - features->customBorderColorWithoutFormat = false; - break; - } - - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_UNIFORM_BUFFER_STANDARD_LAYOUT_FEATURES_KHR: { - VkPhysicalDeviceUniformBufferStandardLayoutFeaturesKHR *features = - (VkPhysicalDeviceUniformBufferStandardLayoutFeaturesKHR *)ext; - features->uniformBufferStandardLayout = true; - break; - } - - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRIVATE_DATA_FEATURES_EXT: { - VkPhysicalDevicePrivateDataFeaturesEXT *features = - (VkPhysicalDevicePrivateDataFeaturesEXT *)ext; - features->privateData = true; - break; - } - - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INDEX_TYPE_UINT8_FEATURES_EXT: { - VkPhysicalDeviceIndexTypeUint8FeaturesEXT *features = - (VkPhysicalDeviceIndexTypeUint8FeaturesEXT *)ext; - features->indexTypeUint8 = true; - break; - } - - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COLOR_WRITE_ENABLE_FEATURES_EXT: { - VkPhysicalDeviceColorWriteEnableFeaturesEXT *features = (void *) ext; - features->colorWriteEnable = true; - break; - } - - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_CREATION_CACHE_CONTROL_FEATURES_EXT: { - VkPhysicalDevicePipelineCreationCacheControlFeaturesEXT *features = (void *) ext; - features->pipelineCreationCacheControl = true; - break; - } - - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROVOKING_VERTEX_FEATURES_EXT: { - VkPhysicalDeviceProvokingVertexFeaturesEXT *features = (void *) ext; - features->provokingVertexLast = true; - /* FIXME: update when supporting EXT_transform_feedback */ - features->transformFeedbackPreservesProvokingVertex = false; - break; - } - - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_FEATURES_EXT: { - VkPhysicalDeviceVertexAttributeDivisorFeaturesEXT *features = - (void *) ext; - features->vertexAttributeInstanceRateDivisor = true; - features->vertexAttributeInstanceRateZeroDivisor = false; - break; - } - - /* Vulkan 1.1 */ - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES: { - VkPhysicalDeviceVulkan11Features *features = - (VkPhysicalDeviceVulkan11Features *)ext; - memcpy(features, &vk11, sizeof(VkPhysicalDeviceVulkan11Features)); - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES: { - VkPhysicalDevice16BitStorageFeatures *features = (void *) ext; - features->storageBuffer16BitAccess = vk11.storageBuffer16BitAccess; - features->uniformAndStorageBuffer16BitAccess = - vk11.uniformAndStorageBuffer16BitAccess; - features->storagePushConstant16 = vk11.storagePushConstant16; - features->storageInputOutput16 = vk11.storageInputOutput16; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_FEATURES: { - VkPhysicalDeviceMultiviewFeatures *features = (void *) ext; - features->multiview = vk11.multiview; - features->multiviewGeometryShader = vk11.multiviewGeometryShader; - features->multiviewTessellationShader = vk11.multiviewTessellationShader; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROTECTED_MEMORY_FEATURES: { - VkPhysicalDeviceProtectedMemoryFeatures *features = (void *) ext; - features->protectedMemory = vk11.protectedMemory; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES: { - VkPhysicalDeviceSamplerYcbcrConversionFeatures *features = (void *) ext; - features->samplerYcbcrConversion = vk11.samplerYcbcrConversion; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DRAW_PARAMETERS_FEATURES: { - VkPhysicalDeviceShaderDrawParametersFeatures *features = (void *) ext; - 
features->shaderDrawParameters = vk11.shaderDrawParameters; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTERS_FEATURES: { - VkPhysicalDeviceVariablePointersFeatures *features = (void *) ext; - features->variablePointersStorageBuffer = - vk11.variablePointersStorageBuffer; - features->variablePointers = vk11.variablePointers; - break; - } - - default: - v3dv_debug_ignored_stype(ext->sType); - break; - } - } -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_GetDeviceGroupPeerMemoryFeatures(VkDevice device, - uint32_t heapIndex, - uint32_t localDeviceIndex, - uint32_t remoteDeviceIndex, - VkPeerMemoryFeatureFlags *pPeerMemoryFeatures) -{ - assert(localDeviceIndex == 0 && remoteDeviceIndex == 0); - *pPeerMemoryFeatures = VK_PEER_MEMORY_FEATURE_COPY_SRC_BIT | - VK_PEER_MEMORY_FEATURE_COPY_DST_BIT | - VK_PEER_MEMORY_FEATURE_GENERIC_SRC_BIT | - VK_PEER_MEMORY_FEATURE_GENERIC_DST_BIT; -} - uint32_t v3dv_physical_device_vendor_id(struct v3dv_physical_device *dev) { return 0x14E4; /* Broadcom */ } - -#if using_v3d_simulator -static bool -get_i915_param(int fd, uint32_t param, int *value) -{ - int tmp; - - struct drm_i915_getparam gp = { - .param = param, - .value = &tmp, - }; - - int ret = drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp); - if (ret != 0) - return false; - - *value = tmp; - return true; -} -#endif - uint32_t v3dv_physical_device_device_id(struct v3dv_physical_device *dev) { #if using_v3d_simulator - int devid = 0; - - if (!get_i915_param(dev->render_fd, I915_PARAM_CHIPSET_ID, &devid)) - fprintf(stderr, "Error getting device_id\n"); - - return devid; + return dev->device_id; #else switch (dev->devinfo.ver) { case 42: return 0xBE485FD3; /* Broadcom deviceID for 2711 */ + case 71: + return 0x55701C33; /* Broadcom deviceID for 2712 */ default: unreachable("Unsupported V3D version"); } @@ -1260,18 +1109,18 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, STATIC_ASSERT(MAX_STORAGE_BUFFERS >= MAX_DYNAMIC_STORAGE_BUFFERS); const uint32_t page_size = 4096; - const uint32_t mem_size = compute_heap_size(); + const uint64_t mem_size = compute_heap_size(); const uint32_t max_varying_components = 16 * 4; - const uint32_t v3d_coord_shift = 6; - - const float v3d_point_line_granularity = 2.0f / (1 << v3d_coord_shift); - const uint32_t max_fb_size = 4096; + const float v3d_point_line_granularity = 2.0f / (1 << V3D_COORD_SHIFT); + const uint32_t max_fb_size = V3D_MAX_IMAGE_DIMENSION; const VkSampleCountFlags supported_sample_counts = VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_4_BIT; + const uint8_t max_rts = V3D_MAX_RENDER_TARGETS(pdevice->devinfo.ver); + struct timespec clock_res; clock_getres(CLOCK_MONOTONIC, &clock_res); const float timestamp_period = @@ -1279,18 +1128,18 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, /* FIXME: this will probably require an in-depth review */ VkPhysicalDeviceLimits limits = { - .maxImageDimension1D = 4096, - .maxImageDimension2D = 4096, - .maxImageDimension3D = 4096, - .maxImageDimensionCube = 4096, - .maxImageArrayLayers = 2048, + .maxImageDimension1D = V3D_MAX_IMAGE_DIMENSION, + .maxImageDimension2D = V3D_MAX_IMAGE_DIMENSION, + .maxImageDimension3D = V3D_MAX_IMAGE_DIMENSION, + .maxImageDimensionCube = V3D_MAX_IMAGE_DIMENSION, + .maxImageArrayLayers = V3D_MAX_ARRAY_LAYERS, .maxTexelBufferElements = (1ul << 28), .maxUniformBufferRange = V3D_MAX_BUFFER_RANGE, .maxStorageBufferRange = V3D_MAX_BUFFER_RANGE, .maxPushConstantsSize = MAX_PUSH_CONSTANTS_SIZE, .maxMemoryAllocationCount = mem_size / page_size, 
.maxSamplerAllocationCount = 64 * 1024, - .bufferImageGranularity = 256, /* A cache line */ + .bufferImageGranularity = V3D_NON_COHERENT_ATOM_SIZE, .sparseAddressSpaceSize = 0, .maxBoundDescriptorSets = MAX_SETS, .maxPerStageDescriptorSamplers = V3D_MAX_TEXTURE_SAMPLERS, @@ -1342,7 +1191,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, .maxFragmentInputComponents = max_varying_components, .maxFragmentOutputAttachments = 4, .maxFragmentDualSrcAttachments = 0, - .maxFragmentCombinedOutputResources = MAX_RENDER_TARGETS + + .maxFragmentCombinedOutputResources = max_rts + MAX_STORAGE_BUFFERS + MAX_STORAGE_IMAGES, @@ -1352,10 +1201,11 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, .maxComputeWorkGroupInvocations = 256, .maxComputeWorkGroupSize = { 256, 256, 256 }, - .subPixelPrecisionBits = v3d_coord_shift, + .subPixelPrecisionBits = V3D_COORD_SHIFT, .subTexelPrecisionBits = 8, .mipmapPrecisionBits = 8, - .maxDrawIndexedIndexValue = 0x00ffffff, + .maxDrawIndexedIndexValue = pdevice->devinfo.ver >= 71 ? + 0xffffffff : 0x00ffffff, .maxDrawIndirectCount = 0x7fffffff, .maxSamplerLodBias = 14.0f, .maxSamplerAnisotropy = 16.0f, @@ -1365,7 +1215,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, 2.0 * max_fb_size - 1 }, .viewportSubPixelBits = 0, .minMemoryMapAlignment = page_size, - .minTexelBufferOffsetAlignment = V3D_UIFBLOCK_SIZE, + .minTexelBufferOffsetAlignment = V3D_TMU_TEXEL_ALIGN, .minUniformBufferOffsetAlignment = 32, .minStorageBufferOffsetAlignment = 32, .minTexelOffset = -8, @@ -1374,7 +1224,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, .maxTexelGatherOffset = 7, .minInterpolationOffset = -0.5, .maxInterpolationOffset = 0.5, - .subPixelInterpolationOffsetBits = v3d_coord_shift, + .subPixelInterpolationOffsetBits = V3D_COORD_SHIFT, .maxFramebufferWidth = max_fb_size, .maxFramebufferHeight = max_fb_size, .maxFramebufferLayers = 256, @@ -1382,7 +1232,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, .framebufferDepthSampleCounts = supported_sample_counts, .framebufferStencilSampleCounts = supported_sample_counts, .framebufferNoAttachmentsSampleCounts = supported_sample_counts, - .maxColorAttachments = MAX_RENDER_TARGETS, + .maxColorAttachments = max_rts, .sampledImageColorSampleCounts = supported_sample_counts, .sampledImageIntegerSampleCounts = supported_sample_counts, .sampledImageDepthSampleCounts = supported_sample_counts, @@ -1404,7 +1254,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, .standardSampleLocations = false, .optimalBufferCopyOffsetAlignment = 32, .optimalBufferCopyRowPitchAlignment = 32, - .nonCoherentAtomSize = 256, + .nonCoherentAtomSize = V3D_NON_COHERENT_ATOM_SIZE, }; *pProperties = (VkPhysicalDeviceProperties) { @@ -1431,7 +1281,166 @@ v3dv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, v3dv_GetPhysicalDeviceProperties(physicalDevice, &pProperties->properties); + /* We don't really have special restrictions for the maximum + * descriptors per set, other than maybe not exceeding the limits + * of addressable memory in a single allocation on either the host + * or the GPU. This will be a much larger limit than any of the + * per-stage limits already available in Vulkan though, so in practice, + * it is not expected to limit anything beyond what is already + * constrained through per-stage limits. 
+ */ + const uint32_t max_host_descriptors = + (UINT32_MAX - sizeof(struct v3dv_descriptor_set)) / + sizeof(struct v3dv_descriptor); + const uint32_t max_gpu_descriptors = + (UINT32_MAX / v3dv_X(pdevice, max_descriptor_bo_size)()); + + VkPhysicalDeviceVulkan13Properties vk13 = { + .maxInlineUniformBlockSize = 4096, + .maxPerStageDescriptorInlineUniformBlocks = MAX_INLINE_UNIFORM_BUFFERS, + .maxDescriptorSetInlineUniformBlocks = MAX_INLINE_UNIFORM_BUFFERS, + .maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks = + MAX_INLINE_UNIFORM_BUFFERS, + .maxDescriptorSetUpdateAfterBindInlineUniformBlocks = + MAX_INLINE_UNIFORM_BUFFERS, + .maxBufferSize = V3D_MAX_BUFFER_RANGE, + .storageTexelBufferOffsetAlignmentBytes = V3D_TMU_TEXEL_ALIGN, + .storageTexelBufferOffsetSingleTexelAlignment = false, + .uniformTexelBufferOffsetAlignmentBytes = V3D_TMU_TEXEL_ALIGN, + .uniformTexelBufferOffsetSingleTexelAlignment = false, + /* No native acceleration for integer dot product. We use NIR lowering. */ + .integerDotProduct8BitUnsignedAccelerated = false, + .integerDotProduct8BitMixedSignednessAccelerated = false, + .integerDotProduct4x8BitPackedUnsignedAccelerated = false, + .integerDotProduct4x8BitPackedSignedAccelerated = false, + .integerDotProduct4x8BitPackedMixedSignednessAccelerated = false, + .integerDotProduct16BitUnsignedAccelerated = false, + .integerDotProduct16BitSignedAccelerated = false, + .integerDotProduct16BitMixedSignednessAccelerated = false, + .integerDotProduct32BitUnsignedAccelerated = false, + .integerDotProduct32BitSignedAccelerated = false, + .integerDotProduct32BitMixedSignednessAccelerated = false, + .integerDotProduct64BitUnsignedAccelerated = false, + .integerDotProduct64BitSignedAccelerated = false, + .integerDotProduct64BitMixedSignednessAccelerated = false, + .integerDotProductAccumulatingSaturating8BitUnsignedAccelerated = false, + .integerDotProductAccumulatingSaturating8BitSignedAccelerated = false, + .integerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated = false, + .integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated = false, + .integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated = false, + .integerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated = false, + .integerDotProductAccumulatingSaturating16BitUnsignedAccelerated = false, + .integerDotProductAccumulatingSaturating16BitSignedAccelerated = false, + .integerDotProductAccumulatingSaturating16BitMixedSignednessAccelerated = false, + .integerDotProductAccumulatingSaturating32BitUnsignedAccelerated = false, + .integerDotProductAccumulatingSaturating32BitSignedAccelerated = false, + .integerDotProductAccumulatingSaturating32BitMixedSignednessAccelerated = false, + .integerDotProductAccumulatingSaturating64BitUnsignedAccelerated = false, + .integerDotProductAccumulatingSaturating64BitSignedAccelerated = false, + .integerDotProductAccumulatingSaturating64BitMixedSignednessAccelerated = false, + /* VK_EXT_subgroup_size_control */ + .minSubgroupSize = V3D_CHANNELS, + .maxSubgroupSize = V3D_CHANNELS, + .maxComputeWorkgroupSubgroups = 16, /* 256 / 16 */ + .requiredSubgroupSizeStages = VK_SHADER_STAGE_COMPUTE_BIT, + }; + + VkPhysicalDeviceVulkan12Properties vk12 = { + .driverID = VK_DRIVER_ID_MESA_V3DV, + .conformanceVersion = { + .major = 1, + .minor = 3, + .subminor = 6, + .patch = 1, + }, + .supportedDepthResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT, + .supportedStencilResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT, + /* FIXME: if we want to support 
independentResolveNone then we would + * need to honor attachment load operations on resolve attachments, + * which we currently ignore because the resolve makes them irrelevant, + * as it unconditionally writes all pixels in the render area. However, + * with independentResolveNone, it is possible to have one aspect of a + * D/S resolve attachment stay unresolved, in which case the attachment + * load operation is relevant. + * + * NOTE: implementing attachment load for resolve attachments isn't + * immediately trivial because these attachments are not part of the + * framebuffer and therefore we can't use the same mechanism we use + * for framebuffer attachments. Instead, we should probably have to + * emit a meta operation for that right at the start of the render + * pass (or subpass). + */ + .independentResolveNone = false, + .independentResolve = false, + .maxTimelineSemaphoreValueDifference = UINT64_MAX, + + .denormBehaviorIndependence = VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL, + .roundingModeIndependence = VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL, + .shaderSignedZeroInfNanPreserveFloat16 = true, + .shaderSignedZeroInfNanPreserveFloat32 = true, + .shaderSignedZeroInfNanPreserveFloat64 = false, + .shaderDenormPreserveFloat16 = true, + .shaderDenormPreserveFloat32 = true, + .shaderDenormPreserveFloat64 = false, + .shaderDenormFlushToZeroFloat16 = false, + .shaderDenormFlushToZeroFloat32 = false, + .shaderDenormFlushToZeroFloat64 = false, + .shaderRoundingModeRTEFloat16 = true, + .shaderRoundingModeRTEFloat32 = true, + .shaderRoundingModeRTEFloat64 = false, + .shaderRoundingModeRTZFloat16 = false, + .shaderRoundingModeRTZFloat32 = false, + .shaderRoundingModeRTZFloat64 = false, + + /* V3D doesn't support min/max filtering */ + .filterMinmaxSingleComponentFormats = false, + .filterMinmaxImageComponentMapping = false, + + .framebufferIntegerColorSampleCounts = + VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_4_BIT, + }; + memset(vk12.driverName, 0, VK_MAX_DRIVER_NAME_SIZE); + snprintf(vk12.driverName, VK_MAX_DRIVER_NAME_SIZE, "V3DV Mesa"); + memset(vk12.driverInfo, 0, VK_MAX_DRIVER_INFO_SIZE); + snprintf(vk12.driverInfo, VK_MAX_DRIVER_INFO_SIZE, + "Mesa " PACKAGE_VERSION MESA_GIT_SHA1); + + VkSubgroupFeatureFlags subgroup_ops = VK_SUBGROUP_FEATURE_BASIC_BIT; + if (pdevice->devinfo.ver >= 71) { + subgroup_ops |= VK_SUBGROUP_FEATURE_BALLOT_BIT | + VK_SUBGROUP_FEATURE_SHUFFLE_BIT | + VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT | + VK_SUBGROUP_FEATURE_VOTE_BIT | + VK_SUBGROUP_FEATURE_QUAD_BIT; + } + + VkPhysicalDeviceVulkan11Properties vk11 = { + .deviceLUIDValid = false, + .subgroupSize = V3D_CHANNELS, + .subgroupSupportedStages = VK_SHADER_STAGE_COMPUTE_BIT | + VK_SHADER_STAGE_FRAGMENT_BIT, + .subgroupSupportedOperations = subgroup_ops, + .subgroupQuadOperationsInAllStages = false, + .pointClippingBehavior = VK_POINT_CLIPPING_BEHAVIOR_ALL_CLIP_PLANES, + .maxMultiviewViewCount = MAX_MULTIVIEW_VIEW_COUNT, + .maxMultiviewInstanceIndex = UINT32_MAX - 1, + .protectedNoFault = false, + .maxPerSetDescriptors = MIN2(max_host_descriptors, max_gpu_descriptors), + /* Minimum required by the spec */ + .maxMemoryAllocationSize = MAX_MEMORY_ALLOCATION_SIZE, + }; + memcpy(vk11.deviceUUID, pdevice->device_uuid, VK_UUID_SIZE); + memcpy(vk11.driverUUID, pdevice->driver_uuid, VK_UUID_SIZE); + + vk_foreach_struct(ext, pProperties->pNext) { + if (vk_get_physical_device_core_1_1_property_ext(ext, &vk11)) + continue; + if (vk_get_physical_device_core_1_2_property_ext(ext, &vk12)) + continue; + if 
(vk_get_physical_device_core_1_3_property_ext(ext, &vk13)) + continue; + switch (ext->sType) { case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CUSTOM_BORDER_COLOR_PROPERTIES_EXT: { VkPhysicalDeviceCustomBorderColorPropertiesEXT *props = @@ -1453,15 +1462,31 @@ v3dv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, props->maxVertexAttribDivisor = 0xffff; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES: { - VkPhysicalDeviceIDProperties *id_props = - (VkPhysicalDeviceIDProperties *)ext; - memcpy(id_props->deviceUUID, pdevice->device_uuid, VK_UUID_SIZE); - memcpy(id_props->driverUUID, pdevice->driver_uuid, VK_UUID_SIZE); - /* The LUID is for Windows. */ - id_props->deviceLUIDValid = false; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_PROPERTIES_KHR : { + VkPhysicalDevicePerformanceQueryPropertiesKHR *props = + (VkPhysicalDevicePerformanceQueryPropertiesKHR *)ext; + + props->allowCommandBufferQueryCopies = true; + break; + } +#if DETECT_OS_ANDROID +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wswitch" + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRESENTATION_PROPERTIES_ANDROID: { + VkPhysicalDevicePresentationPropertiesANDROID *props = + (VkPhysicalDevicePresentationPropertiesANDROID *)ext; + uint64_t front_rendering_usage = 0; + struct u_gralloc *gralloc = u_gralloc_create(U_GRALLOC_TYPE_AUTO); + if (gralloc != NULL) { + u_gralloc_get_front_rendering_usage(gralloc, &front_rendering_usage); + u_gralloc_destroy(&gralloc); + } + props->sharedImage = front_rendering_usage ? VK_TRUE + : VK_FALSE; break; } +#pragma GCC diagnostic pop +#endif case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRM_PROPERTIES_EXT: { VkPhysicalDeviceDrmPropertiesEXT *props = (VkPhysicalDeviceDrmPropertiesEXT *)ext; @@ -1477,34 +1502,10 @@ v3dv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, } break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_3_PROPERTIES: { - VkPhysicalDeviceMaintenance3Properties *props = - (VkPhysicalDeviceMaintenance3Properties *)ext; - /* We don't really have special restrictions for the maximum - * descriptors per set, other than maybe not exceeding the limits - * of addressable memory in a single allocation on either the host - * or the GPU. This will be a much larger limit than any of the - * per-stage limits already available in Vulkan though, so in practice, - * it is not expected to limit anything beyond what is already - * constrained through per-stage limits. 
- */ - uint32_t max_host_descriptors = - (UINT32_MAX - sizeof(struct v3dv_descriptor_set)) / - sizeof(struct v3dv_descriptor); - uint32_t max_gpu_descriptors = - (UINT32_MAX / v3dv_X(pdevice, max_descriptor_bo_size)()); - props->maxPerSetDescriptors = - MIN2(max_host_descriptors, max_gpu_descriptors); - - /* Minimum required by the spec */ - props->maxMemoryAllocationSize = MAX_MEMORY_ALLOCATION_SIZE; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_PROPERTIES: { - VkPhysicalDeviceMultiviewProperties *props = - (VkPhysicalDeviceMultiviewProperties *)ext; - props->maxMultiviewViewCount = MAX_MULTIVIEW_VIEW_COUNT; - props->maxMultiviewInstanceIndex = UINT32_MAX - 1; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_LINE_RASTERIZATION_PROPERTIES_EXT: { + VkPhysicalDeviceLineRasterizationPropertiesEXT *props = + (VkPhysicalDeviceLineRasterizationPropertiesEXT *)ext; + props->lineSubPixelPrecisionBits = V3D_COORD_SHIFT; break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PCI_BUS_INFO_PROPERTIES_EXT: @@ -1512,26 +1513,33 @@ v3dv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, * never provide this extension. */ break; - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_POINT_CLIPPING_PROPERTIES: { - VkPhysicalDevicePointClippingProperties *props = - (VkPhysicalDevicePointClippingProperties *)ext; - props->pointClippingBehavior = - VK_POINT_CLIPPING_BEHAVIOR_ALL_CLIP_PLANES; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_MODULE_IDENTIFIER_PROPERTIES_EXT: { + VkPhysicalDeviceShaderModuleIdentifierPropertiesEXT *props = + (VkPhysicalDeviceShaderModuleIdentifierPropertiesEXT *)ext; + STATIC_ASSERT(sizeof(vk_shaderModuleIdentifierAlgorithmUUID) == + sizeof(props->shaderModuleIdentifierAlgorithmUUID)); + memcpy(props->shaderModuleIdentifierAlgorithmUUID, + vk_shaderModuleIdentifierAlgorithmUUID, + sizeof(props->shaderModuleIdentifierAlgorithmUUID)); break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROTECTED_MEMORY_PROPERTIES: { - VkPhysicalDeviceProtectedMemoryProperties *props = - (VkPhysicalDeviceProtectedMemoryProperties *)ext; - props->protectedNoFault = false; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_ROBUSTNESS_PROPERTIES_EXT: { + VkPhysicalDevicePipelineRobustnessPropertiesEXT *props = + (VkPhysicalDevicePipelineRobustnessPropertiesEXT *)ext; + props->defaultRobustnessStorageBuffers = + VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DEVICE_DEFAULT_EXT; + props->defaultRobustnessUniformBuffers = + VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DEVICE_DEFAULT_EXT; + props->defaultRobustnessVertexInputs = + VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DEVICE_DEFAULT_EXT; + props->defaultRobustnessImages = + VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_DEVICE_DEFAULT_EXT; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES: { - VkPhysicalDeviceSubgroupProperties *props = - (VkPhysicalDeviceSubgroupProperties *)ext; - props->subgroupSize = V3D_CHANNELS; - props->supportedStages = VK_SHADER_STAGE_COMPUTE_BIT; - props->supportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT; - props->quadOperationsInAllStages = false; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTI_DRAW_PROPERTIES_EXT: { + VkPhysicalDeviceMultiDrawPropertiesEXT *properties = + (VkPhysicalDeviceMultiDrawPropertiesEXT *)ext; + properties->maxMultiDrawCount = 2048; break; } default: @@ -1553,25 +1561,14 @@ v3dv_queue_family_properties = { }; VKAPI_ATTR void VKAPI_CALL -v3dv_GetPhysicalDeviceQueueFamilyProperties(VkPhysicalDevice physicalDevice, - uint32_t *pCount, - VkQueueFamilyProperties *pQueueFamilyProperties) -{ - 
VK_OUTARRAY_MAKE(out, pQueueFamilyProperties, pCount); - - vk_outarray_append(&out, p) { - *p = v3dv_queue_family_properties; - } -} - -VKAPI_ATTR void VKAPI_CALL v3dv_GetPhysicalDeviceQueueFamilyProperties2(VkPhysicalDevice physicalDevice, uint32_t *pQueueFamilyPropertyCount, VkQueueFamilyProperties2 *pQueueFamilyProperties) { - VK_OUTARRAY_MAKE(out, pQueueFamilyProperties, pQueueFamilyPropertyCount); + VK_OUTARRAY_MAKE_TYPED(VkQueueFamilyProperties2, out, + pQueueFamilyProperties, pQueueFamilyPropertyCount); - vk_outarray_append(&out, p) { + vk_outarray_append_typed(VkQueueFamilyProperties2, &out, p) { p->queueFamilyProperties = v3dv_queue_family_properties; vk_foreach_struct(s, p->pNext) { @@ -1592,11 +1589,28 @@ VKAPI_ATTR void VKAPI_CALL v3dv_GetPhysicalDeviceMemoryProperties2(VkPhysicalDevice physicalDevice, VkPhysicalDeviceMemoryProperties2 *pMemoryProperties) { + V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice); + v3dv_GetPhysicalDeviceMemoryProperties(physicalDevice, &pMemoryProperties->memoryProperties); vk_foreach_struct(ext, pMemoryProperties->pNext) { switch (ext->sType) { + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT: { + VkPhysicalDeviceMemoryBudgetPropertiesEXT *p = + (VkPhysicalDeviceMemoryBudgetPropertiesEXT *) ext; + p->heapUsage[0] = device->heap_used; + p->heapBudget[0] = compute_memory_budget(device); + + /* The heapBudget and heapUsage values must be zero for array elements + * greater than or equal to VkPhysicalDeviceMemoryProperties::memoryHeapCount + */ + for (unsigned i = 1; i < VK_MAX_MEMORY_HEAPS; i++) { + p->heapBudget[i] = 0u; + p->heapUsage[i] = 0u; + } + break; + } default: v3dv_debug_ignored_stype(ext->sType); break; @@ -1618,11 +1632,6 @@ v3dv_GetInstanceProcAddr(VkInstance _instance, * vk_icdGetInstanceProcAddr to work around certain LD_PRELOAD issues seen in apps. 
*/ PUBLIC -VKAPI_ATTR PFN_vkVoidFunction -VKAPI_CALL vk_icdGetInstanceProcAddr(VkInstance instance, - const char *pName); - -PUBLIC VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vk_icdGetInstanceProcAddr(VkInstance instance, const char* pName) @@ -1630,23 +1639,6 @@ vk_icdGetInstanceProcAddr(VkInstance instance, return v3dv_GetInstanceProcAddr(instance, pName); } -/* With version 4+ of the loader interface the ICD should expose - * vk_icdGetPhysicalDeviceProcAddr() - */ -PUBLIC -VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL -vk_icdGetPhysicalDeviceProcAddr(VkInstance _instance, - const char* pName); - -PFN_vkVoidFunction -vk_icdGetPhysicalDeviceProcAddr(VkInstance _instance, - const char* pName) -{ - V3DV_FROM_HANDLE(v3dv_instance, instance, _instance); - - return vk_instance_get_physical_device_proc_addr(&instance->vk, pName); -} - VKAPI_ATTR VkResult VKAPI_CALL v3dv_EnumerateInstanceLayerProperties(uint32_t *pPropertyCount, VkLayerProperties *pProperties) @@ -1671,30 +1663,66 @@ v3dv_EnumerateDeviceLayerProperties(VkPhysicalDevice physicalDevice, return VK_SUCCESS; } - return vk_error((struct v3dv_instance*) physical_device->vk.instance, - VK_ERROR_LAYER_NOT_PRESENT); + return vk_error(physical_device, VK_ERROR_LAYER_NOT_PRESENT); +} + +static void +destroy_queue_syncs(struct v3dv_queue *queue) +{ + for (int i = 0; i < V3DV_QUEUE_COUNT; i++) { + if (queue->last_job_syncs.syncs[i]) { + drmSyncobjDestroy(queue->device->pdevice->render_fd, + queue->last_job_syncs.syncs[i]); + } + } } static VkResult -queue_init(struct v3dv_device *device, struct v3dv_queue *queue) +queue_init(struct v3dv_device *device, struct v3dv_queue *queue, + const VkDeviceQueueCreateInfo *create_info, + uint32_t index_in_family) { - vk_object_base_init(&device->vk, &queue->base, VK_OBJECT_TYPE_QUEUE); + VkResult result = vk_queue_init(&queue->vk, &device->vk, create_info, + index_in_family); + if (result != VK_SUCCESS) + return result; + + result = vk_queue_enable_submit_thread(&queue->vk); + if (result != VK_SUCCESS) + goto fail_submit_thread; + queue->device = device; - queue->flags = 0; + queue->vk.driver_submit = v3dv_queue_driver_submit; + + for (int i = 0; i < V3DV_QUEUE_COUNT; i++) { + queue->last_job_syncs.first[i] = true; + int ret = drmSyncobjCreate(device->pdevice->render_fd, + DRM_SYNCOBJ_CREATE_SIGNALED, + &queue->last_job_syncs.syncs[i]); + if (ret) { + result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED, + "syncobj create failed: %m"); + goto fail_last_job_syncs; + } + } + queue->noop_job = NULL; - list_inithead(&queue->submit_wait_list); - pthread_mutex_init(&queue->mutex, NULL); return VK_SUCCESS; + +fail_last_job_syncs: + destroy_queue_syncs(queue); +fail_submit_thread: + vk_queue_finish(&queue->vk); + return result; } static void queue_finish(struct v3dv_queue *queue) { - vk_object_base_finish(&queue->base); - assert(list_is_empty(&queue->submit_wait_list)); if (queue->noop_job) v3dv_job_destroy(queue->noop_job); - pthread_mutex_destroy(&queue->mutex); + destroy_queue_syncs(queue); + vk_queue_finish(&queue->vk); } static void @@ -1728,19 +1756,6 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice, assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO); - /* Check enabled features */ - if (pCreateInfo->pEnabledFeatures) { - VkPhysicalDeviceFeatures supported_features; - v3dv_GetPhysicalDeviceFeatures(physicalDevice, &supported_features); - VkBool32 *supported_feature = (VkBool32 *)&supported_features; - VkBool32 *enabled_feature = (VkBool32 *)pCreateInfo->pEnabledFeatures; - unsigned 
num_features = sizeof(VkPhysicalDeviceFeatures) / sizeof(VkBool32); - for (uint32_t i = 0; i < num_features; i++) { - if (enabled_feature[i] && !supported_feature[i]) - return vk_error(instance, VK_ERROR_FEATURE_NOT_PRESENT); - } - } - /* Check requested queues (we only expose one queue ) */ assert(pCreateInfo->queueCreateInfoCount == 1); for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) { @@ -1759,56 +1774,46 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice, struct vk_device_dispatch_table dispatch_table; vk_device_dispatch_table_from_entrypoints(&dispatch_table, &v3dv_device_entrypoints, true); + vk_device_dispatch_table_from_entrypoints(&dispatch_table, + &wsi_device_entrypoints, false); result = vk_device_init(&device->vk, &physical_device->vk, &dispatch_table, pCreateInfo, pAllocator); if (result != VK_SUCCESS) { vk_free(&device->vk.alloc, device); - return vk_error(instance, result); + return vk_error(NULL, result); } +#if DETECT_OS_ANDROID + device->gralloc = u_gralloc_create(U_GRALLOC_TYPE_AUTO); + assert(device->gralloc); +#endif + device->instance = instance; device->pdevice = physical_device; - if (pAllocator) - device->vk.alloc = *pAllocator; - else - device->vk.alloc = physical_device->vk.instance->alloc; + mtx_init(&device->query_mutex, mtx_plain); + cnd_init(&device->query_ended); + + device->vk.command_buffer_ops = &v3dv_cmd_buffer_ops; - pthread_mutex_init(&device->mutex, NULL); + vk_device_set_drm_fd(&device->vk, physical_device->render_fd); + vk_device_enable_threaded_submit(&device->vk); - result = queue_init(device, &device->queue); + result = queue_init(device, &device->queue, + pCreateInfo->pQueueCreateInfos, 0); if (result != VK_SUCCESS) goto fail; device->devinfo = physical_device->devinfo; - /* Vulkan 1.1 and VK_KHR_get_physical_device_properties2 added - * VkPhysicalDeviceFeatures2 which can be used in the pNext chain of - * vkDeviceCreateInfo, in which case it should be used instead of - * pEnabledFeatures. 
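 *
 * The precedence rule, in effect (a sketch):
 *
 *    features = features2 ? features2->features
 *                         : *pCreateInfo->pEnabledFeatures;
 *
 * The common Vulkan runtime applies this during vk_device_init() and
 * exposes the result as device->vk.enabled_features, which is what the
 * driver reads below.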
- */ - const VkPhysicalDeviceFeatures2 *features2 = - vk_find_struct_const(pCreateInfo->pNext, PHYSICAL_DEVICE_FEATURES_2); - if (features2) { - memcpy(&device->features, &features2->features, - sizeof(device->features)); - } else if (pCreateInfo->pEnabledFeatures) { - memcpy(&device->features, pCreateInfo->pEnabledFeatures, - sizeof(device->features)); - } - - if (device->features.robustBufferAccess) + if (device->vk.enabled_features.robustBufferAccess) perf_debug("Device created with Robust Buffer Access enabled.\n"); - int ret = drmSyncobjCreate(physical_device->render_fd, - DRM_SYNCOBJ_CREATE_SIGNALED, - &device->last_job_sync); - if (ret) { - result = VK_ERROR_INITIALIZATION_FAILED; - goto fail; - } + if (device->vk.enabled_features.robustImageAccess) + perf_debug("Device created with Robust Image Access enabled.\n"); -#ifdef DEBUG + +#if MESA_DEBUG v3dv_X(device, device_check_prepacked_sizes)(); #endif init_device_meta(device); @@ -1816,14 +1821,42 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice, v3dv_pipeline_cache_init(&device->default_pipeline_cache, device, 0, device->instance->default_pipeline_cache_enabled); device->default_attribute_float = - v3dv_pipeline_create_default_attribute_values(device, NULL); + v3dv_X(device, create_default_attribute_values)(device, NULL); + + device->device_address_mem_ctx = ralloc_context(NULL); + util_dynarray_init(&device->device_address_bo_list, + device->device_address_mem_ctx); + + mtx_init(&device->events.lock, mtx_plain); + result = v3dv_event_allocate_resources(device); + if (result != VK_SUCCESS) + goto fail; + + if (list_is_empty(&device->events.free_list)) { + result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); + goto fail; + } + + result = v3dv_query_allocate_resources(device); + if (result != VK_SUCCESS) + goto fail; *pDevice = v3dv_device_to_handle(device); return VK_SUCCESS; fail: + cnd_destroy(&device->query_ended); + mtx_destroy(&device->query_mutex); + queue_finish(&device->queue); + destroy_device_meta(device); + v3dv_pipeline_cache_finish(&device->default_pipeline_cache); + v3dv_event_free_resources(device); + v3dv_query_free_resources(device); vk_device_finish(&device->vk); +#if DETECT_OS_ANDROID + u_gralloc_destroy(&device->gralloc); +#endif vk_free(&device->vk.alloc, device); return result; @@ -1835,10 +1868,14 @@ v3dv_DestroyDevice(VkDevice _device, { V3DV_FROM_HANDLE(v3dv_device, device, _device); - v3dv_DeviceWaitIdle(_device); + device->vk.dispatch_table.DeviceWaitIdle(_device); queue_finish(&device->queue); - pthread_mutex_destroy(&device->mutex); - drmSyncobjDestroy(device->pdevice->render_fd, device->last_job_sync); + + v3dv_event_free_resources(device); + mtx_destroy(&device->events.lock); + + v3dv_query_free_resources(device); + destroy_device_meta(device); v3dv_pipeline_cache_finish(&device->default_pipeline_cache); @@ -1847,36 +1884,23 @@ v3dv_DestroyDevice(VkDevice _device, device->default_attribute_float = NULL; } + ralloc_free(device->device_address_mem_ctx); + /* Bo cache should be removed the last, as any other object could be * freeing their private bos */ v3dv_bo_cache_destroy(device); + cnd_destroy(&device->query_ended); + mtx_destroy(&device->query_mutex); + vk_device_finish(&device->vk); +#if DETECT_OS_ANDROID + u_gralloc_destroy(&device->gralloc); +#endif vk_free2(&device->vk.alloc, pAllocator, device); } -VKAPI_ATTR void VKAPI_CALL -v3dv_GetDeviceQueue(VkDevice _device, - uint32_t queueFamilyIndex, - uint32_t queueIndex, - VkQueue *pQueue) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); 
- - assert(queueIndex == 0); - assert(queueFamilyIndex == 0); - - *pQueue = v3dv_queue_to_handle(&device->queue); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_DeviceWaitIdle(VkDevice _device) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - return v3dv_QueueWaitIdle(v3dv_queue_to_handle(&device->queue)); -} - static VkResult device_alloc(struct v3dv_device *device, struct v3dv_device_memory *mem, @@ -1914,15 +1938,12 @@ device_free(struct v3dv_device *device, struct v3dv_device_memory *mem) * display device to free the allocated dumb BO. */ if (mem->is_for_wsi) { - assert(mem->has_bo_ownership); - device_free_wsi_dumb(device->instance->physicalDevice.display_fd, - mem->bo->dumb_handle); + device_free_wsi_dumb(device->pdevice->display_fd, mem->bo->dumb_handle); } - if (mem->has_bo_ownership) - v3dv_bo_free(device, mem->bo); - else if (mem->bo) - vk_free(&device->vk.alloc, mem->bo); + p_atomic_add(&device->pdevice->heap_used, -((int64_t)mem->bo->size)); + + v3dv_bo_free(device, mem->bo); } static void @@ -1967,21 +1988,12 @@ device_import_bo(struct v3dv_device *device, int fd, uint64_t size, struct v3dv_bo **bo) { - VkResult result; - - *bo = vk_alloc2(&device->vk.alloc, pAllocator, sizeof(struct v3dv_bo), 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - if (*bo == NULL) { - result = VK_ERROR_OUT_OF_HOST_MEMORY; - goto fail; - } + *bo = NULL; off_t real_size = lseek(fd, 0, SEEK_END); lseek(fd, 0, SEEK_SET); - if (real_size < 0 || (uint64_t) real_size < size) { - result = VK_ERROR_INVALID_EXTERNAL_HANDLE; - goto fail; - } + if (real_size < 0 || (uint64_t) real_size < size) + return VK_ERROR_INVALID_EXTERNAL_HANDLE; int render_fd = device->pdevice->render_fd; assert(render_fd >= 0); @@ -1989,31 +2001,26 @@ device_import_bo(struct v3dv_device *device, int ret; uint32_t handle; ret = drmPrimeFDToHandle(render_fd, fd, &handle); - if (ret) { - result = VK_ERROR_INVALID_EXTERNAL_HANDLE; - goto fail; - } + if (ret) + return VK_ERROR_INVALID_EXTERNAL_HANDLE; struct drm_v3d_get_bo_offset get_offset = { .handle = handle, }; ret = v3dv_ioctl(render_fd, DRM_IOCTL_V3D_GET_BO_OFFSET, &get_offset); - if (ret) { - result = VK_ERROR_INVALID_EXTERNAL_HANDLE; - goto fail; - } + if (ret) + return VK_ERROR_INVALID_EXTERNAL_HANDLE; assert(get_offset.offset != 0); - v3dv_bo_init(*bo, handle, size, get_offset.offset, "import", false); + *bo = v3dv_device_lookup_bo(device->pdevice, handle); + assert(*bo); - return VK_SUCCESS; + if ((*bo)->refcnt == 0) + v3dv_bo_init_import(*bo, handle, size, get_offset.offset, false); + else + p_atomic_inc(&(*bo)->refcnt); -fail: - if (*bo) { - vk_free2(&device->vk.alloc, pAllocator, *bo); - *bo = NULL; - } - return result; + return VK_SUCCESS; } static VkResult @@ -2030,19 +2037,8 @@ device_alloc_for_wsi(struct v3dv_device *device, #if using_v3d_simulator return device_alloc(device, mem, size); #else - /* If we are allocating for WSI we should have a swapchain and thus, - * we should've initialized the display device. However, Zink doesn't - * use swapchains, so in that case we can get here without acquiring the - * display device and we need to do it now. 
- */ VkResult result; - struct v3dv_instance *instance = device->instance; - struct v3dv_physical_device *pdevice = &device->instance->physicalDevice; - if (unlikely(pdevice->display_fd < 0)) { - result = v3dv_physical_device_acquire_display(instance, pdevice, NULL); - if (result != VK_SUCCESS) - return result; - } + struct v3dv_physical_device *pdevice = device->pdevice; assert(pdevice->display_fd != -1); mem->is_for_wsi = true; @@ -2082,6 +2078,53 @@ fail_create: #endif } +static void +device_add_device_address_bo(struct v3dv_device *device, + struct v3dv_bo *bo) +{ + util_dynarray_append(&device->device_address_bo_list, + struct v3dv_bo *, + bo); +} + +static void +device_remove_device_address_bo(struct v3dv_device *device, + struct v3dv_bo *bo) +{ + util_dynarray_delete_unordered(&device->device_address_bo_list, + struct v3dv_bo *, + bo); +} + +static void +free_memory(struct v3dv_device *device, + struct v3dv_device_memory *mem, + const VkAllocationCallbacks *pAllocator) +{ + if (mem == NULL) + return; + + if (mem->bo->map) + device_unmap(device, mem); + + if (mem->is_for_device_address) + device_remove_device_address_bo(device, mem->bo); + + device_free(device, mem); + + vk_device_memory_destroy(&device->vk, pAllocator, &mem->vk); +} + +VKAPI_ATTR void VKAPI_CALL +v3dv_FreeMemory(VkDevice _device, + VkDeviceMemory _mem, + const VkAllocationCallbacks *pAllocator) +{ + V3DV_FROM_HANDLE(v3dv_device, device, _device); + V3DV_FROM_HANDLE(v3dv_device_memory, mem, _mem); + free_memory(device, mem, pAllocator); +} + VKAPI_ATTR VkResult VKAPI_CALL v3dv_AllocateMemory(VkDevice _device, const VkMemoryAllocateInfo *pAllocateInfo, @@ -2090,25 +2133,34 @@ v3dv_AllocateMemory(VkDevice _device, { V3DV_FROM_HANDLE(v3dv_device, device, _device); struct v3dv_device_memory *mem; - struct v3dv_physical_device *pdevice = &device->instance->physicalDevice; + struct v3dv_physical_device *pdevice = device->pdevice; assert(pAllocateInfo->sType == VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO); - /* The Vulkan 1.0.33 spec says "allocationSize must be greater than 0". */ - assert(pAllocateInfo->allocationSize > 0); + /* We always allocate device memory in multiples of a page, so round up + * requested size to that. 
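 * For the power-of-two page size used here this is the usual
 * round-up-and-mask, e.g. align64(10, 4096) = 4096 and
 * align64(8192, 4096) = 8192; equivalently, (size + 4095) & ~4095.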
+ */ + const VkDeviceSize alloc_size = align64(pAllocateInfo->allocationSize, 4096); + + if (unlikely(alloc_size > MAX_MEMORY_ALLOCATION_SIZE)) + return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); + + uint64_t heap_used = p_atomic_read(&pdevice->heap_used); + if (unlikely(heap_used + alloc_size > pdevice->memory.memoryHeaps[0].size)) + return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); - mem = vk_object_zalloc(&device->vk, pAllocator, sizeof(*mem), - VK_OBJECT_TYPE_DEVICE_MEMORY); + mem = vk_device_memory_create(&device->vk, pAllocateInfo, + pAllocator, sizeof(*mem)); if (mem == NULL) return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY); assert(pAllocateInfo->memoryTypeIndex < pdevice->memory.memoryTypeCount); mem->type = &pdevice->memory.memoryTypes[pAllocateInfo->memoryTypeIndex]; - mem->has_bo_ownership = true; mem->is_for_wsi = false; const struct wsi_memory_allocate_info *wsi_info = NULL; const VkImportMemoryFdInfoKHR *fd_info = NULL; + const VkMemoryAllocateFlagsInfo *flags_info = NULL; vk_foreach_struct_const(ext, pAllocateInfo->pNext) { switch ((unsigned)ext->sType) { case VK_STRUCTURE_TYPE_WSI_MEMORY_ALLOCATE_INFO_MESA: @@ -2118,16 +2170,14 @@ v3dv_AllocateMemory(VkDevice _device, fd_info = (void *)ext; break; case VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO: - /* We don't support VK_KHR_buffer_device_address or multiple - * devices per device group, so we can ignore this. - */ + flags_info = (void *)ext; break; - case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR: + case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO: /* We don't have particular optimizations associated with memory * allocations that won't be suballocated to multiple resources. */ break; - case VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR: + case VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO: /* The mask of handle types specified here must be supported * according to VkExternalImageFormatProperties, so it must be * fd or dmabuf, which don't have special requirements for us. @@ -2139,57 +2189,58 @@ v3dv_AllocateMemory(VkDevice _device, } } - VkResult result = VK_SUCCESS; - - /* We always allocate device memory in multiples of a page, so round up - * requested size to that. 
- */ - VkDeviceSize alloc_size = ALIGN(pAllocateInfo->allocationSize, 4096); + VkResult result; - if (unlikely(alloc_size > MAX_MEMORY_ALLOCATION_SIZE)) { - result = VK_ERROR_OUT_OF_DEVICE_MEMORY; + if (wsi_info) { + result = device_alloc_for_wsi(device, pAllocator, mem, alloc_size); + } else if (fd_info && fd_info->handleType) { + assert(fd_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT || + fd_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT); + result = device_import_bo(device, pAllocator, + fd_info->fd, alloc_size, &mem->bo); + if (result == VK_SUCCESS) + close(fd_info->fd); + } else if (mem->vk.ahardware_buffer) { +#if DETECT_OS_ANDROID + const native_handle_t *handle = AHardwareBuffer_getNativeHandle(mem->vk.ahardware_buffer); + assert(handle->numFds > 0); + size_t size = lseek(handle->data[0], 0, SEEK_END); + result = device_import_bo(device, pAllocator, + handle->data[0], size, &mem->bo); +#else + result = VK_ERROR_FEATURE_NOT_PRESENT; +#endif } else { - if (wsi_info) { - result = device_alloc_for_wsi(device, pAllocator, mem, alloc_size); - } else if (fd_info && fd_info->handleType) { - assert(fd_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT || - fd_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT); - result = device_import_bo(device, pAllocator, - fd_info->fd, alloc_size, &mem->bo); - mem->has_bo_ownership = false; - if (result == VK_SUCCESS) - close(fd_info->fd); - } else { - result = device_alloc(device, mem, alloc_size); - } + result = device_alloc(device, mem, alloc_size); } if (result != VK_SUCCESS) { - vk_object_free(&device->vk, pAllocator, mem); - return vk_error(device->instance, result); + vk_device_memory_destroy(&device->vk, pAllocator, &mem->vk); + return vk_error(device, result); } - *pMem = v3dv_device_memory_to_handle(mem); - return result; -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_FreeMemory(VkDevice _device, - VkDeviceMemory _mem, - const VkAllocationCallbacks *pAllocator) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_device_memory, mem, _mem); - - if (mem == NULL) - return; - - if (mem->bo->map) - v3dv_UnmapMemory(_device, _mem); + heap_used = p_atomic_add_return(&pdevice->heap_used, mem->bo->size); + if (heap_used > pdevice->memory.memoryHeaps[0].size) { + free_memory(device, mem, pAllocator); + return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); + } - device_free(device, mem); + /* If this memory can be used via VK_KHR_buffer_device_address then we + * will need to manually add the BO to any job submit that makes use of + * VK_KHR_buffer_device_address, since such jobs may produce buffer + * load/store operations that may access any buffer memory allocated with + * this flag and we don't have any means to tell which buffers will be + * accessed through this mechanism since they don't even have to be bound + * through descriptor state. 
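 *
 * A minimal sketch of the application-side allocation that takes this
 * path (standard Vulkan API; the size and type index are illustrative):
 *
 *    VkMemoryAllocateFlagsInfo flags_info = {
 *       .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO,
 *       .flags = VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT,
 *    };
 *    VkMemoryAllocateInfo alloc_info = {
 *       .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
 *       .pNext = &flags_info,
 *       .allocationSize = size,
 *       .memoryTypeIndex = type_index,
 *    };
 *    vkAllocateMemory(device, &alloc_info, NULL, &mem);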
+ */ + if (flags_info && + (flags_info->flags & VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT)) { + mem->is_for_device_address = true; + device_add_device_address_bo(device, mem->bo); + } - vk_object_free(&device->vk, pAllocator, mem); + *pMem = v3dv_device_memory_to_handle(mem); + return result; } VKAPI_ATTR VkResult VKAPI_CALL @@ -2217,7 +2268,7 @@ v3dv_MapMemory(VkDevice _device, */ VkResult result = device_map(device, mem); if (result != VK_SUCCESS) - return vk_error(device->instance, result); + return vk_error(device, result); *ppData = ((uint8_t *) mem->bo->map) + offset; return VK_SUCCESS; @@ -2252,19 +2303,30 @@ v3dv_InvalidateMappedMemoryRanges(VkDevice _device, return VK_SUCCESS; } -VKAPI_ATTR void VKAPI_CALL -v3dv_GetImageMemoryRequirements2(VkDevice device, - const VkImageMemoryRequirementsInfo2 *pInfo, - VkMemoryRequirements2 *pMemoryRequirements) +static void +get_image_memory_requirements(struct v3dv_image *image, + VkImageAspectFlagBits planeAspect, + VkMemoryRequirements2 *pMemoryRequirements) { - V3DV_FROM_HANDLE(v3dv_image, image, pInfo->image); - pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) { .memoryTypeBits = 0x1, - .alignment = image->alignment, - .size = image->size + .alignment = image->planes[0].alignment, + .size = image->non_disjoint_size }; + if (planeAspect != VK_IMAGE_ASPECT_NONE) { + assert(image->format->plane_count > 1); + /* Disjoint images should have a 0 non_disjoint_size */ + assert(!pMemoryRequirements->memoryRequirements.size); + + uint8_t plane = v3dv_image_aspect_to_plane(image, planeAspect); + + VkMemoryRequirements *mem_reqs = + &pMemoryRequirements->memoryRequirements; + mem_reqs->alignment = image->planes[plane].alignment; + mem_reqs->size = image->planes[plane].size; + } + vk_foreach_struct(ext, pMemoryRequirements->pNext) { switch (ext->sType) { case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: { @@ -2281,6 +2343,65 @@ v3dv_GetImageMemoryRequirements2(VkDevice device, } } +VKAPI_ATTR void VKAPI_CALL +v3dv_GetImageMemoryRequirements2(VkDevice device, + const VkImageMemoryRequirementsInfo2 *pInfo, + VkMemoryRequirements2 *pMemoryRequirements) +{ + V3DV_FROM_HANDLE(v3dv_image, image, pInfo->image); + + VkImageAspectFlagBits planeAspect = VK_IMAGE_ASPECT_NONE; + vk_foreach_struct_const(ext, pInfo->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_IMAGE_PLANE_MEMORY_REQUIREMENTS_INFO: { + VkImagePlaneMemoryRequirementsInfo *req = + (VkImagePlaneMemoryRequirementsInfo *) ext; + planeAspect = req->planeAspect; + break; + } + default: + v3dv_debug_ignored_stype(ext->sType); + break; + } + } + + get_image_memory_requirements(image, planeAspect, pMemoryRequirements); +} + +VKAPI_ATTR void VKAPI_CALL +v3dv_GetDeviceImageMemoryRequirements( + VkDevice _device, + const VkDeviceImageMemoryRequirements *pInfo, + VkMemoryRequirements2 *pMemoryRequirements) +{ + V3DV_FROM_HANDLE(v3dv_device, device, _device); + + struct v3dv_image image = { 0 }; + vk_image_init(&device->vk, &image.vk, pInfo->pCreateInfo); + + ASSERTED VkResult result = + v3dv_image_init(device, pInfo->pCreateInfo, NULL, &image); + assert(result == VK_SUCCESS); + + /* From VkDeviceImageMemoryRequirements spec: + * + * " planeAspect is a VkImageAspectFlagBits value specifying the aspect + * corresponding to the image plane to query. 
This parameter is ignored + * unless pCreateInfo::tiling is + * VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT, or pCreateInfo::flags has + * VK_IMAGE_CREATE_DISJOINT_BIT set" + * + * We need to explicitly ignore that flag, or following asserts could be + * triggered. + */ + VkImageAspectFlagBits planeAspect = + pInfo->pCreateInfo->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT || + pInfo->pCreateInfo->flags & VK_IMAGE_CREATE_DISJOINT_BIT ? + pInfo->planeAspect : 0; + + get_image_memory_requirements(&image, planeAspect, pMemoryRequirements); +} + static void bind_image_memory(const VkBindImageMemoryInfo *info) { @@ -2293,11 +2414,43 @@ bind_image_memory(const VkBindImageMemoryInfo *info) * the VkMemoryRequirements structure returned from a call to * vkGetImageMemoryRequirements with image" */ - assert(info->memoryOffset % image->alignment == 0); assert(info->memoryOffset < mem->bo->size); - image->mem = mem; - image->mem_offset = info->memoryOffset; + uint64_t offset = info->memoryOffset; + if (image->non_disjoint_size) { + /* We only check for plane 0 as it is the only one that actually starts + * at that offset + */ + assert(offset % image->planes[0].alignment == 0); + for (uint8_t plane = 0; plane < image->plane_count; plane++) { + image->planes[plane].mem = mem; + image->planes[plane].mem_offset = offset; + } + } else { + const VkBindImagePlaneMemoryInfo *plane_mem_info = + vk_find_struct_const(info->pNext, BIND_IMAGE_PLANE_MEMORY_INFO); + assert(plane_mem_info); + + /* + * From VkBindImagePlaneMemoryInfo spec: + * + * "If the image’s tiling is VK_IMAGE_TILING_LINEAR or + * VK_IMAGE_TILING_OPTIMAL, then planeAspect must be a single valid + * format plane for the image" + * + * <skip> + * + * "If the image’s tiling is VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT, + * then planeAspect must be a single valid memory plane for the + * image" + * + * So planeAspect should only refer to one plane. 
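 *
 * A sketch of the application-side bind this handles, for plane 1 of an
 * image created with VK_IMAGE_CREATE_DISJOINT_BIT (handles illustrative):
 *
 *    VkBindImagePlaneMemoryInfo plane_info = {
 *       .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO,
 *       .planeAspect = VK_IMAGE_ASPECT_PLANE_1_BIT,
 *    };
 *    VkBindImageMemoryInfo bind_info = {
 *       .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO,
 *       .pNext = &plane_info,
 *       .image = image,
 *       .memory = plane1_mem,
 *       .memoryOffset = 0,
 *    };
 *    vkBindImageMemory2(device, 1, &bind_info);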
+ */ + uint8_t plane = v3dv_plane_from_aspect(plane_mem_info->planeAspect); + assert(offset % image->planes[plane].alignment == 0); + image->planes[plane].mem = mem; + image->planes[plane].mem_offset = offset; + } } VKAPI_ATTR VkResult VKAPI_CALL @@ -2306,21 +2459,59 @@ v3dv_BindImageMemory2(VkDevice _device, const VkBindImageMemoryInfo *pBindInfos) { for (uint32_t i = 0; i < bindInfoCount; i++) { +#if DETECT_OS_ANDROID + V3DV_FROM_HANDLE(v3dv_device_memory, mem, pBindInfos[i].memory); + V3DV_FROM_HANDLE(v3dv_device, device, _device); + if (mem != NULL && mem->vk.ahardware_buffer) { + AHardwareBuffer_Desc description; + const native_handle_t *handle = AHardwareBuffer_getNativeHandle(mem->vk.ahardware_buffer); + + V3DV_FROM_HANDLE(v3dv_image, image, pBindInfos[i].image); + AHardwareBuffer_describe(mem->vk.ahardware_buffer, &description); + + struct u_gralloc_buffer_handle gr_handle = { + .handle = handle, + .pixel_stride = description.stride, + .hal_format = description.format, + }; + + VkResult result = v3dv_gralloc_to_drm_explicit_layout( + device->gralloc, + &gr_handle, + image->android_explicit_layout, + image->android_plane_layouts, + V3DV_MAX_PLANE_COUNT); + if (result != VK_SUCCESS) + return result; + + result = v3dv_update_image_layout( + device, image, image->android_explicit_layout->drmFormatModifier, + /* disjoint = */ false, image->android_explicit_layout); + if (result != VK_SUCCESS) + return result; + } +#endif + const VkBindImageMemorySwapchainInfoKHR *swapchain_info = vk_find_struct_const(pBindInfos->pNext, BIND_IMAGE_MEMORY_SWAPCHAIN_INFO_KHR); if (swapchain_info && swapchain_info->swapchain) { +#if !DETECT_OS_ANDROID struct v3dv_image *swapchain_image = v3dv_wsi_get_image_from_swapchain(swapchain_info->swapchain, swapchain_info->imageIndex); + /* Making the assumption that swapchain images are a single plane */ + assert(swapchain_image->plane_count == 1); VkBindImageMemoryInfo swapchain_bind = { .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO, .image = pBindInfos[i].image, - .memory = v3dv_device_memory_to_handle(swapchain_image->mem), - .memoryOffset = swapchain_image->mem_offset, + .memory = v3dv_device_memory_to_handle(swapchain_image->planes[0].mem), + .memoryOffset = swapchain_image->planes[0].mem_offset, }; bind_image_memory(&swapchain_bind); - } else { +#endif + } else + { bind_image_memory(&pBindInfos[i]); } } @@ -2328,19 +2519,39 @@ v3dv_BindImageMemory2(VkDevice _device, return VK_SUCCESS; } -VKAPI_ATTR void VKAPI_CALL -v3dv_GetBufferMemoryRequirements2(VkDevice device, - const VkBufferMemoryRequirementsInfo2 *pInfo, - VkMemoryRequirements2 *pMemoryRequirements) +void +v3dv_buffer_init(struct v3dv_device *device, + const VkBufferCreateInfo *pCreateInfo, + struct v3dv_buffer *buffer, + uint32_t alignment) { - V3DV_FROM_HANDLE(v3dv_buffer, buffer, pInfo->buffer); + buffer->size = pCreateInfo->size; + buffer->usage = pCreateInfo->usage; + buffer->alignment = alignment; +} +static void +get_buffer_memory_requirements(struct v3dv_buffer *buffer, + VkMemoryRequirements2 *pMemoryRequirements) +{ pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) { .memoryTypeBits = 0x1, .alignment = buffer->alignment, .size = align64(buffer->size, buffer->alignment), }; + /* UBO and SSBO may be read using ldunifa, which prefetches the next + * 4 bytes after a read. 
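 *    (For scale: with 4096-byte pages, an ldunifa read of the last word
 *    of an exactly page-sized buffer, at offset 4092, prefetches bytes
 *    4096..4099, the first word past the end of the page.)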
If the buffer's size is exactly a multiple + * of a page size and the shader reads the last 4 bytes with ldunifa + * the prefetching would read out of bounds and cause an MMU error, + * so we allocate extra space to avoid kernel error spamming. + */ + bool can_ldunifa = buffer->usage & + (VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT); + if (can_ldunifa && (buffer->size % 4096 == 0)) + pMemoryRequirements->memoryRequirements.size += buffer->alignment; + vk_foreach_struct(ext, pMemoryRequirements->pNext) { switch (ext->sType) { case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: { @@ -2357,8 +2568,30 @@ v3dv_GetBufferMemoryRequirements2(VkDevice device, } } -static void -bind_buffer_memory(const VkBindBufferMemoryInfo *info) +VKAPI_ATTR void VKAPI_CALL +v3dv_GetBufferMemoryRequirements2(VkDevice device, + const VkBufferMemoryRequirementsInfo2 *pInfo, + VkMemoryRequirements2 *pMemoryRequirements) +{ + V3DV_FROM_HANDLE(v3dv_buffer, buffer, pInfo->buffer); + get_buffer_memory_requirements(buffer, pMemoryRequirements); +} + +VKAPI_ATTR void VKAPI_CALL +v3dv_GetDeviceBufferMemoryRequirements( + VkDevice _device, + const VkDeviceBufferMemoryRequirements *pInfo, + VkMemoryRequirements2 *pMemoryRequirements) +{ + V3DV_FROM_HANDLE(v3dv_device, device, _device); + + struct v3dv_buffer buffer = { 0 }; + v3dv_buffer_init(device, pInfo->pCreateInfo, &buffer, V3D_NON_COHERENT_ATOM_SIZE); + get_buffer_memory_requirements(&buffer, pMemoryRequirements); +} + +void +v3dv_buffer_bind_memory(const VkBindBufferMemoryInfo *info) { V3DV_FROM_HANDLE(v3dv_buffer, buffer, info->buffer); V3DV_FROM_HANDLE(v3dv_device_memory, mem, info->memory); @@ -2383,7 +2616,7 @@ v3dv_BindBufferMemory2(VkDevice device, const VkBindBufferMemoryInfo *pBindInfos) { for (uint32_t i = 0; i < bindInfoCount; i++) - bind_buffer_memory(&pBindInfos[i]); + v3dv_buffer_bind_memory(&pBindInfos[i]); return VK_SUCCESS; } @@ -2406,16 +2639,16 @@ v3dv_CreateBuffer(VkDevice _device, buffer = vk_object_zalloc(&device->vk, pAllocator, sizeof(*buffer), VK_OBJECT_TYPE_BUFFER); if (buffer == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - buffer->size = pCreateInfo->size; - buffer->usage = pCreateInfo->usage; - buffer->alignment = 256; /* nonCoherentAtomSize */ + v3dv_buffer_init(device, pCreateInfo, buffer, V3D_NON_COHERENT_ATOM_SIZE); /* Limit allocations to 32-bit */ const VkDeviceSize aligned_size = align64(buffer->size, buffer->alignment); - if (aligned_size > UINT32_MAX || aligned_size < buffer->size) + if (aligned_size > UINT32_MAX || aligned_size < buffer->size) { + vk_free(&device->vk.alloc, buffer); return VK_ERROR_OUT_OF_DEVICE_MEMORY; + } *pBuffer = v3dv_buffer_to_handle(buffer); @@ -2452,20 +2685,32 @@ v3dv_CreateFramebuffer(VkDevice _device, framebuffer = vk_object_zalloc(&device->vk, pAllocator, size, VK_OBJECT_TYPE_FRAMEBUFFER); if (framebuffer == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); framebuffer->width = pCreateInfo->width; framebuffer->height = pCreateInfo->height; framebuffer->layers = pCreateInfo->layers; framebuffer->has_edge_padding = true; + const VkFramebufferAttachmentsCreateInfo *imageless = + vk_find_struct_const(pCreateInfo->pNext, + FRAMEBUFFER_ATTACHMENTS_CREATE_INFO); + framebuffer->attachment_count = pCreateInfo->attachmentCount; framebuffer->color_attachment_count = 0; - for (uint32_t i = 0; i < 
pCreateInfo->attachmentCount; i++) { - framebuffer->attachments[i] = - v3dv_image_view_from_handle(pCreateInfo->pAttachments[i]); - if (framebuffer->attachments[i]->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) - framebuffer->color_attachment_count++; + for (uint32_t i = 0; i < framebuffer->attachment_count; i++) { + if (!imageless) { + framebuffer->attachments[i] = + v3dv_image_view_from_handle(pCreateInfo->pAttachments[i]); + if (framebuffer->attachments[i]->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) + framebuffer->color_attachment_count++; + } else { + assert(i < imageless->attachmentImageInfoCount); + if (imageless->pAttachmentImageInfos[i].usage & + VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) { + framebuffer->color_attachment_count++; + } + } } *pFramebuffer = v3dv_framebuffer_to_handle(framebuffer); @@ -2487,6 +2732,105 @@ v3dv_DestroyFramebuffer(VkDevice _device, vk_object_free(&device->vk, pAllocator, fb); } +void +v3dv_setup_dynamic_framebuffer(struct v3dv_cmd_buffer *cmd_buffer, + const VkRenderingInfoKHR *info) +{ + struct v3dv_device *device = cmd_buffer->device; + + /* Max framebuffer attachments is max_color_RTs + D/S multiplied by two for + * MSAA resolves. + */ + const uint32_t max_attachments = + 2 * (V3D_MAX_RENDER_TARGETS(device->devinfo.ver) + 1); + const uint32_t attachments_alloc_size = + sizeof(struct v3dv_image_view *) * max_attachments; + + /* Only allocate the dynamic framebuffer once and will stay valid + * for the duration of the command buffer. + */ + struct v3dv_framebuffer *fb = cmd_buffer->state.dynamic_framebuffer; + if (!fb) { + uint32_t alloc_size = sizeof(struct v3dv_framebuffer) + + attachments_alloc_size; + fb = vk_object_zalloc(&cmd_buffer->device->vk, NULL, alloc_size, + VK_OBJECT_TYPE_FRAMEBUFFER); + if (fb == NULL) { + v3dv_flag_oom(cmd_buffer, NULL); + return; + } + cmd_buffer->state.dynamic_framebuffer = fb; + } else { + memset(fb->attachments, 0, attachments_alloc_size); + } + + fb->width = info->renderArea.offset.x + info->renderArea.extent.width; + fb->height = info->renderArea.offset.y + info->renderArea.extent.height; + + /* From the Vulkan spec for VkFramebufferCreateInfo: + * + * "If the render pass uses multiview, then layers must be one (...)" + */ + fb->layers = info->viewMask == 0 ? info->layerCount : 1; + + struct v3dv_render_pass *pass = &cmd_buffer->state.dynamic_pass; + assert(pass->subpass_count == 1 && pass->subpasses); + assert(pass->subpasses[0].color_count == info->colorAttachmentCount); + fb->color_attachment_count = info->colorAttachmentCount; + + uint32_t a = 0; + for (int i = 0; i < info->colorAttachmentCount; i++) { + if (info->pColorAttachments[i].imageView == VK_NULL_HANDLE) + continue; + fb->attachments[a++] = + v3dv_image_view_from_handle(info->pColorAttachments[i].imageView); + if (info->pColorAttachments[i].resolveMode != VK_RESOLVE_MODE_NONE) { + fb->attachments[a++] = + v3dv_image_view_from_handle(info->pColorAttachments[i].resolveImageView); + } + } + + if ((info->pDepthAttachment && info->pDepthAttachment->imageView) || + (info->pStencilAttachment && info->pStencilAttachment->imageView)) { + const struct VkRenderingAttachmentInfo *common_ds_info = + (info->pDepthAttachment && + info->pDepthAttachment->imageView != VK_NULL_HANDLE) ? 
+ info->pDepthAttachment : + info->pStencilAttachment; + + fb->attachments[a++] = + v3dv_image_view_from_handle(common_ds_info->imageView); + + if (common_ds_info->resolveMode != VK_RESOLVE_MODE_NONE) { + fb->attachments[a++] = + v3dv_image_view_from_handle(common_ds_info->resolveImageView); + } + } + + assert(a == pass->attachment_count); + fb->attachment_count = a; + + /* Dynamic rendering doesn't provide the size of the underlying framebuffer + * so we estimate its size from the render area. This means it is possible + * the underlying attachments are larger and thus we cannot assume we have + * edge padding. + */ + fb->has_edge_padding = false; +} + +void +v3dv_destroy_dynamic_framebuffer(struct v3dv_cmd_buffer *cmd_buffer) +{ + if (!cmd_buffer->state.dynamic_framebuffer) + return; + + VkDevice vk_device = v3dv_device_to_handle(cmd_buffer->device); + VkFramebuffer vk_dynamic_fb = + v3dv_framebuffer_to_handle(cmd_buffer->state.dynamic_framebuffer); + v3dv_DestroyFramebuffer(vk_device, vk_dynamic_fb, NULL); + cmd_buffer->state.dynamic_framebuffer = NULL; +} + VKAPI_ATTR VkResult VKAPI_CALL v3dv_GetMemoryFdPropertiesKHR(VkDevice _device, VkExternalMemoryHandleTypeFlagBits handleType, @@ -2494,7 +2838,7 @@ v3dv_GetMemoryFdPropertiesKHR(VkDevice _device, VkMemoryFdPropertiesKHR *pMemoryFdProperties) { V3DV_FROM_HANDLE(v3dv_device, device, _device); - struct v3dv_physical_device *pdevice = &device->instance->physicalDevice; + struct v3dv_physical_device *pdevice = device->pdevice; switch (handleType) { case VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT: @@ -2502,7 +2846,7 @@ v3dv_GetMemoryFdPropertiesKHR(VkDevice _device, (1 << pdevice->memory.memoryTypeCount) - 1; return VK_SUCCESS; default: - return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE); + return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); } } @@ -2523,7 +2867,7 @@ v3dv_GetMemoryFdKHR(VkDevice _device, mem->bo->handle, DRM_CLOEXEC, &fd); if (ret) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); *pFd = fd; @@ -2531,63 +2875,6 @@ v3dv_GetMemoryFdKHR(VkDevice _device, } VKAPI_ATTR VkResult VKAPI_CALL -v3dv_CreateEvent(VkDevice _device, - const VkEventCreateInfo *pCreateInfo, - const VkAllocationCallbacks *pAllocator, - VkEvent *pEvent) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - struct v3dv_event *event = - vk_object_zalloc(&device->vk, pAllocator, sizeof(*event), - VK_OBJECT_TYPE_EVENT); - if (!event) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - - /* Events are created in the unsignaled state */ - event->state = false; - *pEvent = v3dv_event_to_handle(event); - - return VK_SUCCESS; -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_DestroyEvent(VkDevice _device, - VkEvent _event, - const VkAllocationCallbacks *pAllocator) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_event, event, _event); - - if (!event) - return; - - vk_object_free(&device->vk, pAllocator, event); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetEventStatus(VkDevice _device, VkEvent _event) -{ - V3DV_FROM_HANDLE(v3dv_event, event, _event); - return p_atomic_read(&event->state) ? 
VK_EVENT_SET : VK_EVENT_RESET; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_SetEvent(VkDevice _device, VkEvent _event) -{ - V3DV_FROM_HANDLE(v3dv_event, event, _event); - p_atomic_set(&event->state, 1); - return VK_SUCCESS; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_ResetEvent(VkDevice _device, VkEvent _event) -{ - V3DV_FROM_HANDLE(v3dv_event, event, _event); - p_atomic_set(&event->state, 0); - return VK_SUCCESS; -} - -VKAPI_ATTR VkResult VKAPI_CALL v3dv_CreateSampler(VkDevice _device, const VkSamplerCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, @@ -2601,7 +2888,9 @@ v3dv_CreateSampler(VkDevice _device, sampler = vk_object_zalloc(&device->vk, pAllocator, sizeof(*sampler), VK_OBJECT_TYPE_SAMPLER); if (!sampler) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + sampler->plane_count = 1; sampler->compare_enable = pCreateInfo->compareEnable; sampler->unnormalized_coordinates = pCreateInfo->unnormalizedCoordinates; @@ -2610,7 +2899,21 @@ v3dv_CreateSampler(VkDevice _device, vk_find_struct_const(pCreateInfo->pNext, SAMPLER_CUSTOM_BORDER_COLOR_CREATE_INFO_EXT); - v3dv_X(device, pack_sampler_state)(sampler, pCreateInfo, bc_info); + const VkSamplerYcbcrConversionInfo *ycbcr_conv_info = + vk_find_struct_const(pCreateInfo->pNext, SAMPLER_YCBCR_CONVERSION_INFO); + + const struct vk_format_ycbcr_info *ycbcr_info = NULL; + + if (ycbcr_conv_info) { + VK_FROM_HANDLE(vk_ycbcr_conversion, conversion, ycbcr_conv_info->conversion); + ycbcr_info = vk_format_get_ycbcr_info(conversion->state.format); + if (ycbcr_info) { + sampler->plane_count = ycbcr_info->n_planes; + sampler->conversion = conversion; + } + } + + v3dv_X(device, pack_sampler_state)(device, sampler, pCreateInfo, bc_info); *pSampler = v3dv_sampler_to_handle(sampler); @@ -2659,49 +2962,65 @@ v3dv_GetImageSparseMemoryRequirements2( *pSparseMemoryRequirementCount = 0; } -/* vk_icd.h does not declare this function, so we declare it here to - * suppress Wmissing-prototypes. - */ -PUBLIC VKAPI_ATTR VkResult VKAPI_CALL -vk_icdNegotiateLoaderICDInterfaceVersion(uint32_t* pSupportedVersion); +VKAPI_ATTR void VKAPI_CALL +v3dv_GetDeviceImageSparseMemoryRequirements( + VkDevice device, + const VkDeviceImageMemoryRequirements *pInfo, + uint32_t *pSparseMemoryRequirementCount, + VkSparseImageMemoryRequirements2 *pSparseMemoryRequirements) +{ + *pSparseMemoryRequirementCount = 0; +} -PUBLIC VKAPI_ATTR VkResult VKAPI_CALL -vk_icdNegotiateLoaderICDInterfaceVersion(uint32_t* pSupportedVersion) +VkDeviceAddress +v3dv_GetBufferDeviceAddress(VkDevice device, + const VkBufferDeviceAddressInfo *pInfo) { - /* For the full details on loader interface versioning, see - * <https://github.com/KhronosGroup/Vulkan-LoaderAndValidationLayers/blob/master/loader/LoaderAndLayerInterface.md>. - * What follows is a condensed summary, to help you navigate the large and - * confusing official doc. - * - * - Loader interface v0 is incompatible with later versions. We don't - * support it. - * - * - In loader interface v1: - * - The first ICD entrypoint called by the loader is - * vk_icdGetInstanceProcAddr(). The ICD must statically expose this - * entrypoint. - * - The ICD must statically expose no other Vulkan symbol unless it is - * linked with -Bsymbolic. - * - Each dispatchable Vulkan handle created by the ICD must be - * a pointer to a struct whose first member is VK_LOADER_DATA. The - * ICD must initialize VK_LOADER_DATA.loadMagic to ICD_LOADER_MAGIC. 
- * - The loader implements vkCreate{PLATFORM}SurfaceKHR() and - * vkDestroySurfaceKHR(). The ICD must be capable of working with - * such loader-managed surfaces. - * - * - Loader interface v2 differs from v1 in: - * - The first ICD entrypoint called by the loader is - * vk_icdNegotiateLoaderICDInterfaceVersion(). The ICD must - * statically expose this entrypoint. - * - * - Loader interface v3 differs from v2 in: - * - The ICD must implement vkCreate{PLATFORM}SurfaceKHR(), - * vkDestroySurfaceKHR(), and other API which uses VKSurfaceKHR, - * because the loader no longer does so. - * - * - Loader interface v4 differs from v3 in: - * - The ICD must implement vk_icdGetPhysicalDeviceProcAddr(). - */ - *pSupportedVersion = MIN2(*pSupportedVersion, 3u); - return VK_SUCCESS; + V3DV_FROM_HANDLE(v3dv_buffer, buffer, pInfo->buffer); + return buffer->mem_offset + buffer->mem->bo->offset; +} + +uint64_t +v3dv_GetBufferOpaqueCaptureAddress(VkDevice device, + const VkBufferDeviceAddressInfo *pInfo) +{ + /* Not implemented */ + return 0; +} + +uint64_t +v3dv_GetDeviceMemoryOpaqueCaptureAddress( + VkDevice device, + const VkDeviceMemoryOpaqueCaptureAddressInfo *pInfo) +{ + /* Not implemented */ + return 0; +} + +VkResult +v3dv_create_compute_pipeline_from_nir(struct v3dv_device *device, + nir_shader *nir, + VkPipelineLayout pipeline_layout, + VkPipeline *pipeline) +{ + struct vk_shader_module cs_m = vk_shader_module_from_nir(nir); + + VkPipelineShaderStageCreateInfo set_event_cs_stage = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .module = vk_shader_module_to_handle(&cs_m), + .pName = "main", + }; + + VkComputePipelineCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .stage = set_event_cs_stage, + .layout = pipeline_layout, + }; + + VkResult result = + v3dv_CreateComputePipelines(v3dv_device_to_handle(device), VK_NULL_HANDLE, + 1, &info, &device->vk.alloc, pipeline); + + return result; } diff --git a/src/broadcom/vulkan/v3dv_event.c b/src/broadcom/vulkan/v3dv_event.c new file mode 100644 index 00000000000..a3aad37d9c7 --- /dev/null +++ b/src/broadcom/vulkan/v3dv_event.c @@ -0,0 +1,712 @@ +/* + * Copyright © 2022 Raspberry Pi Ltd + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "v3dv_private.h" +#include "compiler/nir/nir_builder.h" + +#include "vk_common_entrypoints.h" + +static nir_shader * +get_set_event_cs() +{ + const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options(); + nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, + "set event cs"); + + nir_def *buf = + nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0), + .desc_set = 0, + .binding = 0, + .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); + + nir_def *offset = + nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4); + + nir_def *value = + nir_load_push_constant(&b, 1, 8, nir_imm_int(&b, 0), .base = 4, .range = 4); + + nir_store_ssbo(&b, value, buf, offset, + .access = 0, .write_mask = 0x1, .align_mul = 4); + + return b.shader; +} + +static nir_shader * +get_wait_event_cs() +{ + const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options(); + nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, + "wait event cs"); + + nir_def *buf = + nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0), + .desc_set = 0, + .binding = 0, + .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); + + nir_def *offset = + nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4); + + nir_loop *loop = nir_push_loop(&b); + nir_def *load = + nir_load_ssbo(&b, 1, 8, buf, offset, .access = 0, .align_mul = 4); + nir_def *value = nir_i2i32(&b, load); + + nir_if *if_stmt = nir_push_if(&b, nir_ieq_imm(&b, value, 1)); + nir_jump(&b, nir_jump_break); + nir_pop_if(&b, if_stmt); + nir_pop_loop(&b, loop); + + return b.shader; +} + +static bool +create_event_pipelines(struct v3dv_device *device) +{ + VkResult result; + + if (!device->events.descriptor_set_layout) { + /* Pipeline layout: + * - 1 storage buffer for the BO with the events state. + * - 2 push constants: + * 0B: offset of the event in the buffer (4 bytes). + * 4B: value for the event (1 byte), only used with the set_event_pipeline. 
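 *
 * Functionally, the two shaders built above in get_set_event_cs() and
 * get_wait_event_cs() reduce to (a C-style sketch over the event state
 * buffer, with offset/value taken from the push constants):
 *
 *    set_event:  state[offset] = value;
 *    wait_event: while (state[offset] != 1) { }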
+ */ + VkDescriptorSetLayoutBinding descriptor_set_layout_binding = { + .binding = 0, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + }; + + VkDescriptorSetLayoutCreateInfo descriptor_set_layout_info = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .bindingCount = 1, + .pBindings = &descriptor_set_layout_binding, + }; + + result = + v3dv_CreateDescriptorSetLayout(v3dv_device_to_handle(device), + &descriptor_set_layout_info, + &device->vk.alloc, + &device->events.descriptor_set_layout); + + if (result != VK_SUCCESS) + return false; + } + + if (!device->events.pipeline_layout) { + VkPipelineLayoutCreateInfo pipeline_layout_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .setLayoutCount = 1, + .pSetLayouts = &device->events.descriptor_set_layout, + .pushConstantRangeCount = 1, + .pPushConstantRanges = + &(VkPushConstantRange) { VK_SHADER_STAGE_COMPUTE_BIT, 0, 5 }, + }; + + result = + v3dv_CreatePipelineLayout(v3dv_device_to_handle(device), + &pipeline_layout_info, + &device->vk.alloc, + &device->events.pipeline_layout); + + if (result != VK_SUCCESS) + return false; + } + + VkPipeline pipeline; + + if (!device->events.set_event_pipeline) { + nir_shader *set_event_cs_nir = get_set_event_cs(); + result = v3dv_create_compute_pipeline_from_nir(device, + set_event_cs_nir, + device->events.pipeline_layout, + &pipeline); + ralloc_free(set_event_cs_nir); + if (result != VK_SUCCESS) + return false; + + device->events.set_event_pipeline = pipeline; + } + + if (!device->events.wait_event_pipeline) { + nir_shader *wait_event_cs_nir = get_wait_event_cs(); + result = v3dv_create_compute_pipeline_from_nir(device, + wait_event_cs_nir, + device->events.pipeline_layout, + &pipeline); + ralloc_free(wait_event_cs_nir); + if (result != VK_SUCCESS) + return false; + + device->events.wait_event_pipeline = pipeline; + } + + return true; +} + +static void +destroy_event_pipelines(struct v3dv_device *device) +{ + VkDevice _device = v3dv_device_to_handle(device); + + v3dv_DestroyPipeline(_device, device->events.set_event_pipeline, + &device->vk.alloc); + device->events.set_event_pipeline = VK_NULL_HANDLE; + + v3dv_DestroyPipeline(_device, device->events.wait_event_pipeline, + &device->vk.alloc); + device->events.wait_event_pipeline = VK_NULL_HANDLE; + + v3dv_DestroyPipelineLayout(_device, device->events.pipeline_layout, + &device->vk.alloc); + device->events.pipeline_layout = VK_NULL_HANDLE; + + v3dv_DestroyDescriptorSetLayout(_device, + device->events.descriptor_set_layout, + &device->vk.alloc); + device->events.descriptor_set_layout = VK_NULL_HANDLE; +} + +static void +init_event(struct v3dv_device *device, struct v3dv_event *event, uint32_t index) +{ + vk_object_base_init(&device->vk, &event->base, VK_OBJECT_TYPE_EVENT); + event->index = index; + list_addtail(&event->link, &device->events.free_list); +} + +VkResult +v3dv_event_allocate_resources(struct v3dv_device *device) +{ + VkResult result = VK_SUCCESS; + VkDevice _device = v3dv_device_to_handle(device); + + /* BO with event states. Make sure we always align to a page size (4096) + * to ensure we use all the memory the kernel will allocate for the BO. + * + * CTS has tests that require over 8192 active events (yes, really) so + * let's make sure we allow for that. 
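 *
 * At one byte of state per event, the allocation below works out to
 * 3 * 4096 = 12288 pre-allocated events, comfortably above the 8192 the
 * CTS exercises while keeping the BO a whole number of pages.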
+ */ + const uint32_t bo_size = 3 * 4096; + struct v3dv_bo *bo = v3dv_bo_alloc(device, bo_size, "events", true); + if (!bo) { + result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); + goto fail; + } + + device->events.bo = bo; + + if (!v3dv_bo_map(device, bo, bo_size)) { + result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail; + } + + /* Pre-allocate our events, each event requires 1 byte of BO storage */ + device->events.event_count = bo_size; + device->events.events = + vk_zalloc2(&device->vk.alloc, NULL, + device->events.event_count * sizeof(struct v3dv_event), 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + if (!device->events.events) { + result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail; + } + + list_inithead(&device->events.free_list); + for (int i = 0; i < device->events.event_count; i++) + init_event(device, &device->events.events[i], i); + + /* Vulkan buffer for the event state BO */ + VkBufferCreateInfo buf_info = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .size = bo->size, + .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + }; + result = v3dv_CreateBuffer(_device, &buf_info, NULL, + &device->events.buffer); + if (result != VK_SUCCESS) + goto fail; + + struct v3dv_device_memory *mem = + vk_object_zalloc(&device->vk, NULL, sizeof(*mem), + VK_OBJECT_TYPE_DEVICE_MEMORY); + if (!mem) { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + goto fail; + } + + mem->bo = bo; + mem->type = &device->pdevice->memory.memoryTypes[0]; + + device->events.mem = v3dv_device_memory_to_handle(mem); + VkBindBufferMemoryInfo bind_info = { + .sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO, + .buffer = device->events.buffer, + .memory = device->events.mem, + .memoryOffset = 0, + }; + v3dv_BindBufferMemory2(_device, 1, &bind_info); + + /* Pipelines */ + if (!create_event_pipelines(device)) { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + goto fail; + } + + /* Descriptor pool & set to access the buffer */ + VkDescriptorPoolSize pool_size = { + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + }; + VkDescriptorPoolCreateInfo pool_info = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, + .flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, + .maxSets = 1, + .poolSizeCount = 1, + .pPoolSizes = &pool_size, + }; + result = + v3dv_CreateDescriptorPool(_device, &pool_info, NULL, + &device->events.descriptor_pool); + + if (result != VK_SUCCESS) + goto fail; + + VkDescriptorSetAllocateInfo alloc_info = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, + .descriptorPool = device->events.descriptor_pool, + .descriptorSetCount = 1, + .pSetLayouts = &device->events.descriptor_set_layout, + }; + result = v3dv_AllocateDescriptorSets(_device, &alloc_info, + &device->events.descriptor_set); + if (result != VK_SUCCESS) + goto fail; + + VkDescriptorBufferInfo desc_buf_info = { + .buffer = device->events.buffer, + .offset = 0, + .range = VK_WHOLE_SIZE, + }; + + VkWriteDescriptorSet write = { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = device->events.descriptor_set, + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .pBufferInfo = &desc_buf_info, + }; + v3dv_UpdateDescriptorSets(_device, 1, &write, 0, NULL); + + return VK_SUCCESS; + +fail: + v3dv_event_free_resources(device); + return result; +} + +void +v3dv_event_free_resources(struct v3dv_device *device) +{ + if (device->events.bo) { + v3dv_bo_free(device, device->events.bo); + device->events.bo = NULL; + } + + 
if (device->events.events) { + vk_free2(&device->vk.alloc, NULL, device->events.events); + device->events.events = NULL; + } + + if (device->events.mem) { + vk_object_free(&device->vk, NULL, + v3dv_device_memory_from_handle(device->events.mem)); + device->events.mem = VK_NULL_HANDLE; + } + + v3dv_DestroyBuffer(v3dv_device_to_handle(device), + device->events.buffer, NULL); + device->events.buffer = VK_NULL_HANDLE; + + v3dv_FreeDescriptorSets(v3dv_device_to_handle(device), + device->events.descriptor_pool, + 1, &device->events.descriptor_set); + device->events.descriptor_set = VK_NULL_HANDLE; + + v3dv_DestroyDescriptorPool(v3dv_device_to_handle(device), + device->events.descriptor_pool, + NULL); + device->events.descriptor_pool = VK_NULL_HANDLE; + + destroy_event_pipelines(device); +} + +static struct v3dv_event * +allocate_event(struct v3dv_device *device) +{ + mtx_lock(&device->events.lock); + if (list_is_empty(&device->events.free_list)) { + mtx_unlock(&device->events.lock); + return NULL; + } + + struct v3dv_event *event = + list_first_entry(&device->events.free_list, struct v3dv_event, link); + list_del(&event->link); + mtx_unlock(&device->events.lock); + + return event; +} + +static void +free_event(struct v3dv_device *device, uint32_t index) +{ + assert(index < device->events.event_count); + mtx_lock(&device->events.lock); + list_addtail(&device->events.events[index].link, &device->events.free_list); + mtx_unlock(&device->events.lock); +} + +static void +event_set_value(struct v3dv_device *device, + struct v3dv_event *event, + uint8_t value) +{ + assert(value == 0 || value == 1); + uint8_t *data = (uint8_t *) device->events.bo->map; + data[event->index] = value; +} + +static uint8_t +event_get_value(struct v3dv_device *device, struct v3dv_event *event) +{ + uint8_t *data = (uint8_t *) device->events.bo->map; + return data[event->index]; +} + +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_CreateEvent(VkDevice _device, + const VkEventCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkEvent *pEvent) +{ + V3DV_FROM_HANDLE(v3dv_device, device, _device); + VkResult result = VK_SUCCESS; + + struct v3dv_event *event = allocate_event(device); + if (!event) { + result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); + goto fail; + } + + event_set_value(device, event, 0); + *pEvent = v3dv_event_to_handle(event); + return VK_SUCCESS; + +fail: + return result; +} + +VKAPI_ATTR void VKAPI_CALL +v3dv_DestroyEvent(VkDevice _device, + VkEvent _event, + const VkAllocationCallbacks *pAllocator) +{ + V3DV_FROM_HANDLE(v3dv_device, device, _device); + V3DV_FROM_HANDLE(v3dv_event, event, _event); + + if (!event) + return; + + free_event(device, event->index); +} + +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_GetEventStatus(VkDevice _device, VkEvent _event) +{ + V3DV_FROM_HANDLE(v3dv_device, device, _device); + V3DV_FROM_HANDLE(v3dv_event, event, _event); + return event_get_value(device, event) ? 
VK_EVENT_SET : VK_EVENT_RESET; +} + +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_SetEvent(VkDevice _device, VkEvent _event) +{ + V3DV_FROM_HANDLE(v3dv_device, device, _device); + V3DV_FROM_HANDLE(v3dv_event, event, _event); + event_set_value(device, event, 1); + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_ResetEvent(VkDevice _device, VkEvent _event) +{ + V3DV_FROM_HANDLE(v3dv_device, device, _device); + V3DV_FROM_HANDLE(v3dv_event, event, _event); + event_set_value(device, event, 0); + return VK_SUCCESS; +} + +static void +cmd_buffer_emit_set_event(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_event *event, + uint8_t value) +{ + assert(value == 0 || value == 1); + + struct v3dv_device *device = cmd_buffer->device; + VkCommandBuffer commandBuffer = v3dv_cmd_buffer_to_handle(cmd_buffer); + + v3dv_cmd_buffer_meta_state_push(cmd_buffer, true); + + v3dv_CmdBindPipeline(commandBuffer, + VK_PIPELINE_BIND_POINT_COMPUTE, + device->events.set_event_pipeline); + + v3dv_CmdBindDescriptorSets(commandBuffer, + VK_PIPELINE_BIND_POINT_COMPUTE, + device->events.pipeline_layout, + 0, 1, &device->events.descriptor_set, 0, NULL); + + assert(event->index < device->events.event_count); + uint32_t offset = event->index; + v3dv_CmdPushConstants(commandBuffer, + device->events.pipeline_layout, + VK_SHADER_STAGE_COMPUTE_BIT, + 0, 4, &offset); + + v3dv_CmdPushConstants(commandBuffer, + device->events.pipeline_layout, + VK_SHADER_STAGE_COMPUTE_BIT, + 4, 1, &value); + + vk_common_CmdDispatch(commandBuffer, 1, 1, 1); + + v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false); +} + +static void +cmd_buffer_emit_wait_event(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_event *event) +{ + struct v3dv_device *device = cmd_buffer->device; + VkCommandBuffer commandBuffer = v3dv_cmd_buffer_to_handle(cmd_buffer); + + v3dv_cmd_buffer_meta_state_push(cmd_buffer, true); + + v3dv_CmdBindPipeline(commandBuffer, + VK_PIPELINE_BIND_POINT_COMPUTE, + device->events.wait_event_pipeline); + + v3dv_CmdBindDescriptorSets(commandBuffer, + VK_PIPELINE_BIND_POINT_COMPUTE, + device->events.pipeline_layout, + 0, 1, &device->events.descriptor_set, 0, NULL); + + assert(event->index < device->events.event_count); + uint32_t offset = event->index; + v3dv_CmdPushConstants(commandBuffer, + device->events.pipeline_layout, + VK_SHADER_STAGE_COMPUTE_BIT, + 0, 4, &offset); + + vk_common_CmdDispatch(commandBuffer, 1, 1, 1); + + v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false); +} + +VKAPI_ATTR void VKAPI_CALL +v3dv_CmdSetEvent2(VkCommandBuffer commandBuffer, + VkEvent _event, + const VkDependencyInfo *pDependencyInfo) +{ + V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); + V3DV_FROM_HANDLE(v3dv_event, event, _event); + + /* Event (re)sets can only happen outside a render pass instance so we + * should not be in the middle of job recording. + */ + assert(cmd_buffer->state.pass == NULL); + assert(cmd_buffer->state.job == NULL); + + /* We need to add the compute stage to the dstStageMask of all dependencies, + * so let's go ahead and patch the dependency info we receive. + */ + struct v3dv_device *device = cmd_buffer->device; + + uint32_t memory_barrier_count = pDependencyInfo->memoryBarrierCount; + VkMemoryBarrier2 *memory_barriers = memory_barrier_count ? 
+      vk_alloc2(&device->vk.alloc, NULL,
+                memory_barrier_count * sizeof(memory_barriers[0]), 8,
+                VK_SYSTEM_ALLOCATION_SCOPE_COMMAND) : NULL;
+   for (int i = 0; i < memory_barrier_count; i++) {
+      memory_barriers[i] = pDependencyInfo->pMemoryBarriers[i];
+      memory_barriers[i].dstStageMask |= VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT;
+   }
+
+   uint32_t buffer_barrier_count = pDependencyInfo->bufferMemoryBarrierCount;
+   VkBufferMemoryBarrier2 *buffer_barriers = buffer_barrier_count ?
+      vk_alloc2(&device->vk.alloc, NULL,
+                buffer_barrier_count * sizeof(buffer_barriers[0]), 8,
+                VK_SYSTEM_ALLOCATION_SCOPE_COMMAND) : NULL;
+   for (int i = 0; i < buffer_barrier_count; i++) {
+      buffer_barriers[i] = pDependencyInfo->pBufferMemoryBarriers[i];
+      buffer_barriers[i].dstStageMask |= VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT;
+   }
+
+   uint32_t image_barrier_count = pDependencyInfo->imageMemoryBarrierCount;
+   VkImageMemoryBarrier2 *image_barriers = image_barrier_count ?
+      vk_alloc2(&device->vk.alloc, NULL,
+                image_barrier_count * sizeof(image_barriers[0]), 8,
+                VK_SYSTEM_ALLOCATION_SCOPE_COMMAND) : NULL;
+   for (int i = 0; i < image_barrier_count; i++) {
+      image_barriers[i] = pDependencyInfo->pImageMemoryBarriers[i];
+      image_barriers[i].dstStageMask |= VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT;
+   }
+
+   VkDependencyInfo info = {
+      .sType = pDependencyInfo->sType,
+      .dependencyFlags = pDependencyInfo->dependencyFlags,
+      .memoryBarrierCount = memory_barrier_count,
+      .pMemoryBarriers = memory_barriers,
+      .bufferMemoryBarrierCount = buffer_barrier_count,
+      .pBufferMemoryBarriers = buffer_barriers,
+      .imageMemoryBarrierCount = image_barrier_count,
+      .pImageMemoryBarriers = image_barriers,
+   };
+
+   v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &info);
+
+   cmd_buffer_emit_set_event(cmd_buffer, event, 1);
+
+   if (memory_barriers)
+      vk_free2(&device->vk.alloc, NULL, memory_barriers);
+   if (buffer_barriers)
+      vk_free2(&device->vk.alloc, NULL, buffer_barriers);
+   if (image_barriers)
+      vk_free2(&device->vk.alloc, NULL, image_barriers);
+}
+
+VKAPI_ATTR void VKAPI_CALL
+v3dv_CmdResetEvent2(VkCommandBuffer commandBuffer,
+                    VkEvent _event,
+                    VkPipelineStageFlags2 stageMask)
+{
+   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+   V3DV_FROM_HANDLE(v3dv_event, event, _event);
+
+   /* Event (re)sets can only happen outside a render pass instance so we
+    * should not be in the middle of job recording.
+    */
+   assert(cmd_buffer->state.pass == NULL);
+   assert(cmd_buffer->state.job == NULL);
+
+   VkMemoryBarrier2 barrier = {
+      .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
+      .srcStageMask = stageMask,
+      .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+   };
+   VkDependencyInfo info = {
+      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+      .memoryBarrierCount = 1,
+      .pMemoryBarriers = &barrier,
+   };
+   v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &info);
+
+   cmd_buffer_emit_set_event(cmd_buffer, event, 0);
+}
+
+VKAPI_ATTR void VKAPI_CALL
+v3dv_CmdWaitEvents2(VkCommandBuffer commandBuffer,
+                    uint32_t eventCount,
+                    const VkEvent *pEvents,
+                    const VkDependencyInfo *pDependencyInfo)
+{
+   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+   for (uint32_t i = 0; i < eventCount; i++) {
+      struct v3dv_event *event = v3dv_event_from_handle(pEvents[i]);
+      cmd_buffer_emit_wait_event(cmd_buffer, event);
+   }
+
+   /* We need to add the compute stage to the srcStageMask of all dependencies,
+    * so let's go ahead and patch the dependency info we receive.
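+    *
+    * A minimal sketch of the patching for a single memory barrier (the
+    * loops below do the same for buffer and image barriers):
+    *
+    *    VkMemoryBarrier2 b = info->pMemoryBarriers[i];
+    *    b.srcStageMask |= VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT;
+    *
+    * so that the barriers also order against the compute dispatches we
+    * emitted above to wait on the events.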
+ */ + struct v3dv_device *device = cmd_buffer->device; + for (int e = 0; e < eventCount; e++) { + const VkDependencyInfo *info = &pDependencyInfo[e]; + + uint32_t memory_barrier_count = info->memoryBarrierCount; + VkMemoryBarrier2 *memory_barriers = memory_barrier_count ? + vk_alloc2(&device->vk.alloc, NULL, + memory_barrier_count * sizeof(memory_barriers[0]), 8, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND) : NULL; + for (int i = 0; i < memory_barrier_count; i++) { + memory_barriers[i] = info->pMemoryBarriers[i]; + memory_barriers[i].srcStageMask |= VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT; + } + + uint32_t buffer_barrier_count = info->bufferMemoryBarrierCount; + VkBufferMemoryBarrier2 *buffer_barriers = buffer_barrier_count ? + vk_alloc2(&device->vk.alloc, NULL, + buffer_barrier_count * sizeof(buffer_barriers[0]), 8, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND) : NULL; + for (int i = 0; i < buffer_barrier_count; i++) { + buffer_barriers[i] = info->pBufferMemoryBarriers[i]; + buffer_barriers[i].srcStageMask |= VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT; + } + + uint32_t image_barrier_count = info->imageMemoryBarrierCount; + VkImageMemoryBarrier2 *image_barriers = image_barrier_count ? + vk_alloc2(&device->vk.alloc, NULL, + image_barrier_count * sizeof(image_barriers[0]), 8, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND) : NULL; + for (int i = 0; i < image_barrier_count; i++) { + image_barriers[i] = info->pImageMemoryBarriers[i]; + image_barriers[i].srcStageMask |= VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT; + } + + VkDependencyInfo new_info = { + .sType = info->sType, + .dependencyFlags = info->dependencyFlags, + .memoryBarrierCount = memory_barrier_count, + .pMemoryBarriers = memory_barriers, + .bufferMemoryBarrierCount = buffer_barrier_count, + .pBufferMemoryBarriers = buffer_barriers, + .imageMemoryBarrierCount = image_barrier_count, + .pImageMemoryBarriers = image_barriers, + }; + + v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &new_info); + + if (memory_barriers) + vk_free2(&device->vk.alloc, NULL, memory_barriers); + if (buffer_barriers) + vk_free2(&device->vk.alloc, NULL, buffer_barriers); + if (image_barriers) + vk_free2(&device->vk.alloc, NULL, image_barriers); + } +} diff --git a/src/broadcom/vulkan/v3dv_formats.c b/src/broadcom/vulkan/v3dv_formats.c index 6e32d341a25..4d8f648d26a 100644 --- a/src/broadcom/vulkan/v3dv_formats.c +++ b/src/broadcom/vulkan/v3dv_formats.c @@ -1,5 +1,5 @@ /* - * Copyright © 2019 Raspberry Pi + * Copyright © 2019 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -22,15 +22,20 @@ */ #include "v3dv_private.h" +#if DETECT_OS_ANDROID +#include "vk_android.h" +#endif +#include "vk_enum_defines.h" #include "vk_util.h" -#include "vk_format_info.h" #include "drm-uapi/drm_fourcc.h" #include "util/format/u_format.h" #include "vulkan/wsi/wsi_common.h" +#include <vulkan/vulkan_android.h> + const uint8_t * -v3dv_get_format_swizzle(struct v3dv_device *device, VkFormat f) +v3dv_get_format_swizzle(struct v3dv_device *device, VkFormat f, uint8_t plane) { const struct v3dv_format *vf = v3dv_X(device, get_format)(f); static const uint8_t fallback[] = {0, 1, 2, 3}; @@ -38,23 +43,43 @@ v3dv_get_format_swizzle(struct v3dv_device *device, VkFormat f) if (!vf) return fallback; - return vf->swizzle; + return vf->planes[plane].swizzle; } -uint8_t -v3dv_get_tex_return_size(const struct v3dv_format *vf, - bool compare_enable) +bool +v3dv_format_swizzle_needs_rb_swap(const uint8_t 
*swizzle) { - if (unlikely(V3D_DEBUG & V3D_DEBUG_TMU_16BIT)) - return 16; + /* Normal case */ + if (swizzle[0] == PIPE_SWIZZLE_Z) + return swizzle[2] == PIPE_SWIZZLE_X; + + /* Format uses reverse flag */ + if (swizzle[0] == PIPE_SWIZZLE_Y) + return swizzle[2] == PIPE_SWIZZLE_W; - if (unlikely(V3D_DEBUG & V3D_DEBUG_TMU_32BIT)) - return 32; + return false; +} + +bool +v3dv_format_swizzle_needs_reverse(const uint8_t *swizzle) +{ + /* Normal case */ + if (swizzle[0] == PIPE_SWIZZLE_W && + swizzle[1] == PIPE_SWIZZLE_Z && + swizzle[2] == PIPE_SWIZZLE_Y && + swizzle[3] == PIPE_SWIZZLE_X) { + return true; + } - if (compare_enable) - return 16; + /* Format uses RB swap flag */ + if (swizzle[0] == PIPE_SWIZZLE_Y && + swizzle[1] == PIPE_SWIZZLE_Z && + swizzle[2] == PIPE_SWIZZLE_W && + swizzle[3] == PIPE_SWIZZLE_X) { + return true; + } - return vf->return_size; + return false; } /* Some cases of transfer operations are raw data copies that don't depend @@ -62,6 +87,9 @@ v3dv_get_tex_return_size(const struct v3dv_format *vf, * involved). In these cases, it is safe to choose any format supported by * the TFU so long as it has the same texel size, which allows us to use the * TFU paths with formats that are not TFU supported otherwise. + * + * Even when copying multi-plane images, we are copying per-plane, so the + * compatible TFU format will be single-plane. */ const struct v3dv_format * v3dv_get_compatible_tfu_format(struct v3dv_device *device, @@ -82,20 +110,18 @@ v3dv_get_compatible_tfu_format(struct v3dv_device *device, *out_vk_format = vk_format; const struct v3dv_format *format = v3dv_X(device, get_format)(vk_format); - assert(v3dv_X(device, tfu_supports_tex_format)(format->tex_type)); + assert(format->plane_count == 1); + assert(v3dv_X(device, tfu_supports_tex_format)(format->planes[0].tex_type)); return format; } -static VkFormatFeatureFlags -image_format_features(struct v3dv_physical_device *pdevice, - VkFormat vk_format, - const struct v3dv_format *v3dv_format, - VkImageTiling tiling) +static VkFormatFeatureFlags2 +image_format_plane_features(struct v3dv_physical_device *pdevice, + VkFormat vk_format, + const struct v3dv_format_plane *v3dv_format, + VkImageTiling tiling) { - if (!v3dv_format || !v3dv_format->supported) - return 0; - const VkImageAspectFlags aspects = vk_format_aspects(vk_format); const VkImageAspectFlags zs_aspects = VK_IMAGE_ASPECT_DEPTH_BIT | @@ -114,7 +140,7 @@ image_format_features(struct v3dv_physical_device *pdevice, return 0; } - VkFormatFeatureFlags flags = 0; + VkFormatFeatureFlags2 flags = 0; /* Raster format is only supported for 1D textures, so let's just * always require optimal tiling for anything that requires sampling. 
@@ -123,55 +149,127 @@ image_format_features(struct v3dv_physical_device *pdevice, */ if (v3dv_format->tex_type != TEXTURE_DATA_FORMAT_NO && tiling == VK_IMAGE_TILING_OPTIMAL) { - flags |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT | - VK_FORMAT_FEATURE_BLIT_SRC_BIT; + flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT | + VK_FORMAT_FEATURE_2_BLIT_SRC_BIT; - if (v3dv_format->supports_filtering) - flags |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT; } if (v3dv_format->rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO) { if (aspects & VK_IMAGE_ASPECT_COLOR_BIT) { - flags |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT | - VK_FORMAT_FEATURE_BLIT_DST_BIT; - if (v3dv_X(pdevice, format_supports_blending)(v3dv_format)) - flags |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT; + flags |= VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT | + VK_FORMAT_FEATURE_2_BLIT_DST_BIT; } else if (aspects & zs_aspects) { - flags |= VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT | - VK_FORMAT_FEATURE_BLIT_DST_BIT; + flags |= VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT | + VK_FORMAT_FEATURE_2_BLIT_DST_BIT; } } const struct util_format_description *desc = vk_format_description(vk_format); - assert(desc); - if (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && desc->is_array) { - flags |= VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT; - if (desc->nr_channels == 1 && vk_format_is_int(vk_format)) - flags |= VK_FORMAT_FEATURE_STORAGE_IMAGE_ATOMIC_BIT; - } else if (vk_format == VK_FORMAT_A2B10G10R10_UNORM_PACK32 || - vk_format == VK_FORMAT_A2B10G10R10_UINT_PACK32 || - vk_format == VK_FORMAT_B10G11R11_UFLOAT_PACK32) { - /* To comply with shaderStorageImageExtendedFormats */ - flags |= VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT; + if (tiling != VK_IMAGE_TILING_LINEAR) { + if (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && desc->is_array) { + flags |= VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT | + VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT; + if (desc->nr_channels == 1 && vk_format_is_int(vk_format)) + flags |= VK_FORMAT_FEATURE_2_STORAGE_IMAGE_ATOMIC_BIT; + } else if (vk_format == VK_FORMAT_A2B10G10R10_UNORM_PACK32 || + vk_format == VK_FORMAT_A2R10G10B10_UNORM_PACK32 || + vk_format == VK_FORMAT_A2B10G10R10_UINT_PACK32 || + vk_format == VK_FORMAT_B10G11R11_UFLOAT_PACK32) { + /* To comply with shaderStorageImageExtendedFormats */ + flags |= VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT | + VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT; + } + } + + /* All our depth formats support shadow comparisons. 
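+    * (i.e. they can be sampled through a VkSampler with compareEnable set,
+    * which is what GLSL shadow samplers such as sampler2DShadow use).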
*/ + if (vk_format_has_depth(vk_format) && + (flags & VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT)) { + flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_DEPTH_COMPARISON_BIT; } if (flags) { - flags |= VK_FORMAT_FEATURE_TRANSFER_SRC_BIT | - VK_FORMAT_FEATURE_TRANSFER_DST_BIT; + flags |= VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT | + VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT; } return flags; } -static VkFormatFeatureFlags +static VkFormatFeatureFlags2 +image_format_features(struct v3dv_physical_device *pdevice, + VkFormat vk_format, + const struct v3dv_format *v3dv_format, + VkImageTiling tiling) +{ + if (!v3dv_format || !v3dv_format->plane_count) + return 0; + + VkFormatFeatureFlags2 flags = ~0ull; + for (uint8_t plane = 0; + flags && plane < v3dv_format->plane_count; + plane++) { + VkFormat plane_format = vk_format_get_plane_format(vk_format, plane); + + flags &= image_format_plane_features(pdevice, + plane_format, + &v3dv_format->planes[plane], + tiling); + } + + const struct vk_format_ycbcr_info *ycbcr_info = + vk_format_get_ycbcr_info(vk_format); + + if (ycbcr_info) { + assert(v3dv_format->plane_count == ycbcr_info->n_planes); + + flags |= VK_FORMAT_FEATURE_2_DISJOINT_BIT; + + if (flags & VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT) { + flags |= VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT; + for (unsigned p = 0; p < ycbcr_info->n_planes; p++) { + if (ycbcr_info->planes[p].denominator_scales[0] > 1 || + ycbcr_info->planes[p].denominator_scales[1] > 1) { + flags |= VK_FORMAT_FEATURE_COSITED_CHROMA_SAMPLES_BIT; + break; + } + } + } + + /* FIXME: in the future we should be able to support BLIT_SRC via the + * blit_shader path + */ + const VkFormatFeatureFlags2 disallowed_ycbcr_image_features = + VK_FORMAT_FEATURE_2_BLIT_SRC_BIT | + VK_FORMAT_FEATURE_2_BLIT_DST_BIT | + VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT | + VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BLEND_BIT | + VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT; + + flags &= ~disallowed_ycbcr_image_features; + } + + if (flags & VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT && + v3dv_format->supports_filtering) { + flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT; + } + + if (flags & VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT && + v3dv_X(pdevice, format_supports_blending)(v3dv_format)) { + flags |= VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BLEND_BIT; + } + + return flags; +} + +static VkFormatFeatureFlags2 buffer_format_features(VkFormat vk_format, const struct v3dv_format *v3dv_format) { - if (!v3dv_format || !v3dv_format->supported) + if (!v3dv_format) return 0; - if (!v3dv_format->supported) + if (v3dv_format->plane_count != 1) return 0; /* We probably only want to support buffer formats that have a @@ -182,32 +280,39 @@ buffer_format_features(VkFormat vk_format, const struct v3dv_format *v3dv_format const struct util_format_description *desc = vk_format_description(vk_format); - assert(desc); - VkFormatFeatureFlags flags = 0; + VkFormatFeatureFlags2 flags = 0; if (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB && desc->is_array) { - flags |= VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT; - if (v3dv_format->tex_type != TEXTURE_DATA_FORMAT_NO) { - flags |= VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT | - VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_BIT; + flags |= VK_FORMAT_FEATURE_2_VERTEX_BUFFER_BIT; + if (v3dv_format->planes[0].tex_type != TEXTURE_DATA_FORMAT_NO) { + /* STORAGE_READ_WITHOUT_FORMAT can also be applied for buffers. 
From spec: + * "VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT specifies + * that image views or buffer views created with this format can + * be used as storage images for read operations without + * specifying a format." + */ + flags |= VK_FORMAT_FEATURE_2_UNIFORM_TEXEL_BUFFER_BIT | + VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_BIT | + VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT; } - } else if (vk_format == VK_FORMAT_A2B10G10R10_UNORM_PACK32) { - flags |= VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT | - VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT | - VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_BIT; + } else if (vk_format == VK_FORMAT_A2B10G10R10_UNORM_PACK32 || + vk_format == VK_FORMAT_A2R10G10B10_UNORM_PACK32) { + flags |= VK_FORMAT_FEATURE_2_VERTEX_BUFFER_BIT | + VK_FORMAT_FEATURE_2_UNIFORM_TEXEL_BUFFER_BIT | + VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_BIT; } else if (vk_format == VK_FORMAT_A2B10G10R10_UINT_PACK32 || vk_format == VK_FORMAT_B10G11R11_UFLOAT_PACK32) { - flags |= VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT | - VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_BIT; + flags |= VK_FORMAT_FEATURE_2_UNIFORM_TEXEL_BUFFER_BIT | + VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_BIT; } if (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && desc->is_array && desc->nr_channels == 1 && vk_format_is_int(vk_format)) { - flags |= VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_ATOMIC_BIT; + flags |= VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_ATOMIC_BIT; } return flags; @@ -216,48 +321,44 @@ buffer_format_features(VkFormat vk_format, const struct v3dv_format *v3dv_format bool v3dv_buffer_format_supports_features(struct v3dv_device *device, VkFormat vk_format, - VkFormatFeatureFlags features) + VkFormatFeatureFlags2 features) { const struct v3dv_format *v3dv_format = v3dv_X(device, get_format)(vk_format); - const VkFormatFeatureFlags supported = + const VkFormatFeatureFlags2 supported = buffer_format_features(vk_format, v3dv_format); return (supported & features) == features; } VKAPI_ATTR void VKAPI_CALL -v3dv_GetPhysicalDeviceFormatProperties(VkPhysicalDevice physicalDevice, - VkFormat format, - VkFormatProperties* pFormatProperties) +v3dv_GetPhysicalDeviceFormatProperties2(VkPhysicalDevice physicalDevice, + VkFormat format, + VkFormatProperties2 *pFormatProperties) { V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physicalDevice); const struct v3dv_format *v3dv_format = v3dv_X(pdevice, get_format)(format); - *pFormatProperties = (VkFormatProperties) { - .linearTilingFeatures = - image_format_features(pdevice, format, v3dv_format, VK_IMAGE_TILING_LINEAR), - .optimalTilingFeatures = - image_format_features(pdevice, format, v3dv_format, VK_IMAGE_TILING_OPTIMAL), - .bufferFeatures = - buffer_format_features(format, v3dv_format), + VkFormatFeatureFlags2 linear2, optimal2, buffer2; + linear2 = image_format_features(pdevice, format, v3dv_format, + VK_IMAGE_TILING_LINEAR); + optimal2 = image_format_features(pdevice, format, v3dv_format, + VK_IMAGE_TILING_OPTIMAL); + buffer2 = buffer_format_features(format, v3dv_format); + pFormatProperties->formatProperties = (VkFormatProperties) { + .linearTilingFeatures = vk_format_features2_to_features(linear2), + .optimalTilingFeatures = vk_format_features2_to_features(optimal2), + .bufferFeatures = vk_format_features2_to_features(buffer2), }; -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_GetPhysicalDeviceFormatProperties2(VkPhysicalDevice physicalDevice, - VkFormat format, - VkFormatProperties2 *pFormatProperties) -{ - v3dv_GetPhysicalDeviceFormatProperties(physicalDevice, format, - 
&pFormatProperties->formatProperties); vk_foreach_struct(ext, pFormatProperties->pNext) { switch ((unsigned)ext->sType) { case VK_STRUCTURE_TYPE_DRM_FORMAT_MODIFIER_PROPERTIES_LIST_EXT: { struct VkDrmFormatModifierPropertiesListEXT *list = (void *)ext; - VK_OUTARRAY_MAKE(out, list->pDrmFormatModifierProperties, - &list->drmFormatModifierCount); + VK_OUTARRAY_MAKE_TYPED(VkDrmFormatModifierPropertiesEXT, out, + list->pDrmFormatModifierProperties, + &list->drmFormatModifierCount); if (pFormatProperties->formatProperties.linearTilingFeatures) { - vk_outarray_append(&out, mod_props) { + vk_outarray_append_typed(VkDrmFormatModifierPropertiesEXT, + &out, mod_props) { mod_props->drmFormatModifier = DRM_FORMAT_MOD_LINEAR; mod_props->drmFormatModifierPlaneCount = 1; mod_props->drmFormatModifierTilingFeatures = @@ -265,7 +366,8 @@ v3dv_GetPhysicalDeviceFormatProperties2(VkPhysicalDevice physicalDevice, } } if (pFormatProperties->formatProperties.optimalTilingFeatures) { - vk_outarray_append(&out, mod_props) { + vk_outarray_append_typed(VkDrmFormatModifierPropertiesEXT, + &out, mod_props) { mod_props->drmFormatModifier = DRM_FORMAT_MOD_BROADCOM_UIF; mod_props->drmFormatModifierPlaneCount = 1; mod_props->drmFormatModifierTilingFeatures = @@ -274,6 +376,36 @@ v3dv_GetPhysicalDeviceFormatProperties2(VkPhysicalDevice physicalDevice, } break; } + case VK_STRUCTURE_TYPE_DRM_FORMAT_MODIFIER_PROPERTIES_LIST_2_EXT: { + struct VkDrmFormatModifierPropertiesList2EXT *list = (void *)ext; + VK_OUTARRAY_MAKE_TYPED(VkDrmFormatModifierProperties2EXT, out, + list->pDrmFormatModifierProperties, + &list->drmFormatModifierCount); + if (linear2) { + vk_outarray_append_typed(VkDrmFormatModifierProperties2EXT, + &out, mod_props) { + mod_props->drmFormatModifier = DRM_FORMAT_MOD_LINEAR; + mod_props->drmFormatModifierPlaneCount = 1; + mod_props->drmFormatModifierTilingFeatures = linear2; + } + } + if (optimal2) { + vk_outarray_append_typed(VkDrmFormatModifierProperties2EXT, + &out, mod_props) { + mod_props->drmFormatModifier = DRM_FORMAT_MOD_BROADCOM_UIF; + mod_props->drmFormatModifierPlaneCount = 1; + mod_props->drmFormatModifierTilingFeatures = optimal2; + } + } + break; + } + case VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_3: { + VkFormatProperties3 *props = (VkFormatProperties3 *)ext; + props->linearTilingFeatures = linear2; + props->optimalTilingFeatures = optimal2; + props->bufferFeatures = buffer2; + break; + } default: v3dv_debug_ignored_stype(ext->sType); break; @@ -290,7 +422,7 @@ get_image_format_properties( VkSamplerYcbcrConversionImageFormatProperties *pYcbcrImageFormatProperties) { const struct v3dv_format *v3dv_format = v3dv_X(physical_device, get_format)(info->format); - VkFormatFeatureFlags format_feature_flags = + VkFormatFeatureFlags2 format_feature_flags = image_format_features(physical_device, info->format, v3dv_format, tiling); if (!format_feature_flags) goto unsupported; @@ -307,8 +439,24 @@ get_image_format_properties( if (info->flags & VK_IMAGE_CREATE_BLOCK_TEXEL_VIEW_COMPATIBLE_BIT) goto unsupported; - if (info->usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT) { - if (!(format_feature_flags & VK_FORMAT_FEATURE_TRANSFER_SRC_BIT)) { + const VkImageStencilUsageCreateInfo *stencil_usage_info = + vk_find_struct_const(info->pNext, IMAGE_STENCIL_USAGE_CREATE_INFO); + + VkImageUsageFlags image_usage = + info->usage | (stencil_usage_info ? 
      stencil_usage_info->stencilUsage : 0);
+
+   /* If VK_IMAGE_CREATE_EXTENDED_USAGE_BIT is set it means the usage flags may
+    * not be supported for the image format but are supported for at least
+    * one compatible format from which an image view can be created for the
+    * image. This means we should not report the format as unsupported based
+    * on the usage flags when usage refers to how an image view may be used
+    * (i.e. as a framebuffer attachment, for sampling, etc).
+    */
+   VkImageUsageFlags view_usage =
+      info->flags & VK_IMAGE_CREATE_EXTENDED_USAGE_BIT ? 0 : image_usage;
+
+   if (image_usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT) {
+      if (!(format_feature_flags & VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT)) {
         goto unsupported;
      }
@@ -323,16 +471,16 @@ get_image_format_properties(
      }
   }

-   if (info->usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT) {
-      if (!(format_feature_flags & VK_FORMAT_FEATURE_TRANSFER_DST_BIT)) {
+   if (image_usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT) {
+      if (!(format_feature_flags & VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT)) {
         goto unsupported;
      }
   }

-   if (info->usage & VK_IMAGE_USAGE_SAMPLED_BIT) {
-      if (!(format_feature_flags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT)) {
+   if (view_usage & (VK_IMAGE_USAGE_SAMPLED_BIT |
+                     VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)) {
+      if (!(format_feature_flags & VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT))
         goto unsupported;
-      }

      /* Sampling of raster depth/stencil images is not supported. Since 1D
       * images are always raster, even if the user requested optimal tiling,
@@ -344,50 +492,47 @@ get_image_format_properties(
      }
   }

-   if (info->usage & VK_IMAGE_USAGE_STORAGE_BIT) {
-      if (!(format_feature_flags & VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT)) {
+   if (view_usage & VK_IMAGE_USAGE_STORAGE_BIT) {
+      if (!(format_feature_flags & VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT)) {
         goto unsupported;
      }
   }

-   if (info->usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) {
-      if (!(format_feature_flags & VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT)) {
+   if (view_usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) {
+      if (!(format_feature_flags & VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT)) {
         goto unsupported;
      }
   }

-   if (info->usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) {
+   if (view_usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) {
      if (!(format_feature_flags &
-           VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT)) {
+           VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT)) {
         goto unsupported;
      }
   }

-   /* FIXME: these are taken from VkPhysicalDeviceLimits, we should just put
-    * these limits available in the physical device and read them from there
-    * wherever we need them.
- */
   switch (info->type) {
   case VK_IMAGE_TYPE_1D:
-      pImageFormatProperties->maxExtent.width = 4096;
+      pImageFormatProperties->maxExtent.width = V3D_MAX_IMAGE_DIMENSION;
      pImageFormatProperties->maxExtent.height = 1;
      pImageFormatProperties->maxExtent.depth = 1;
-      pImageFormatProperties->maxArrayLayers = 2048;
-      pImageFormatProperties->maxMipLevels = 13; /* log2(maxWidth) + 1 */
+      pImageFormatProperties->maxArrayLayers = V3D_MAX_ARRAY_LAYERS;
+      pImageFormatProperties->maxMipLevels = V3D_MAX_MIP_LEVELS;
      break;
   case VK_IMAGE_TYPE_2D:
-      pImageFormatProperties->maxExtent.width = 4096;
-      pImageFormatProperties->maxExtent.height = 4096;
+      pImageFormatProperties->maxExtent.width = V3D_MAX_IMAGE_DIMENSION;
+      pImageFormatProperties->maxExtent.height = V3D_MAX_IMAGE_DIMENSION;
      pImageFormatProperties->maxExtent.depth = 1;
-      pImageFormatProperties->maxArrayLayers = 2048;
-      pImageFormatProperties->maxMipLevels = 13; /* log2(maxWidth) + 1 */
+      pImageFormatProperties->maxArrayLayers =
+         v3dv_format->plane_count == 1 ? V3D_MAX_ARRAY_LAYERS : 1;
+      pImageFormatProperties->maxMipLevels = V3D_MAX_MIP_LEVELS;
      break;
   case VK_IMAGE_TYPE_3D:
-      pImageFormatProperties->maxExtent.width = 4096;
-      pImageFormatProperties->maxExtent.height = 4096;
-      pImageFormatProperties->maxExtent.depth = 4096;
+      pImageFormatProperties->maxExtent.width = V3D_MAX_IMAGE_DIMENSION;
+      pImageFormatProperties->maxExtent.height = V3D_MAX_IMAGE_DIMENSION;
+      pImageFormatProperties->maxExtent.depth = V3D_MAX_IMAGE_DIMENSION;
      pImageFormatProperties->maxArrayLayers = 1;
-      pImageFormatProperties->maxMipLevels = 13; /* log2(maxWidth) + 1 */
+      pImageFormatProperties->maxMipLevels = V3D_MAX_MIP_LEVELS;
      break;
   default:
      unreachable("bad VkImageType");
@@ -416,16 +561,50 @@ get_image_format_properties(
   if (tiling != VK_IMAGE_TILING_LINEAR &&
       info->type == VK_IMAGE_TYPE_2D &&
       !(info->flags & VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT) &&
-      (format_feature_flags & VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT ||
-       format_feature_flags & VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT)) {
+      (format_feature_flags & VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT ||
+       format_feature_flags & VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT)) {
      pImageFormatProperties->sampleCounts |= VK_SAMPLE_COUNT_4_BIT;
   }

   if (tiling == VK_IMAGE_TILING_LINEAR)
      pImageFormatProperties->maxMipLevels = 1;

+   /* From the Vulkan 1.2 spec, section 12.3. Images, VkImageCreateInfo structure:
+    *
+    *   "Images created with one of the formats that require a sampler Y′CBCR
+    *    conversion, have further restrictions on their limits and
+    *    capabilities compared to images created with other formats. Creation
+    *    of images with a format requiring Y′CBCR conversion may not be
+    *    supported unless other parameters meet all of the constraints:
+    *
+    *    * imageType is VK_IMAGE_TYPE_2D
+    *    * mipLevels is 1
+    *    * arrayLayers is 1, unless the ycbcrImageArrays feature is enabled, or
+    *      otherwise indicated by VkImageFormatProperties::maxArrayLayers, as
+    *      returned by vkGetPhysicalDeviceImageFormatProperties
+    *    * samples is VK_SAMPLE_COUNT_1_BIT
+    *
+    *    Implementations may support additional limits and capabilities beyond
+    *    those listed above."
+    *
+    * We don't support any additional limits or capabilities beyond those
+    * listed, so we apply the restrictions above, or just return unsupported.
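+    *
+    * In practice this reduces, for any multi-planar format, to:
+    *
+    *    maxMipLevels   = 1;
+    *    maxArrayLayers = 1;
+    *    sampleCounts   = VK_SAMPLE_COUNT_1_BIT;
+    *
+    * with image types other than VK_IMAGE_TYPE_2D rejected outright.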
+ */ + if (vk_format_get_plane_count(info->format) > 1) { + if (info->type != VK_IMAGE_TYPE_2D) + goto unsupported; + pImageFormatProperties->maxMipLevels = 1; + pImageFormatProperties->maxArrayLayers = 1; + pImageFormatProperties->sampleCounts = VK_SAMPLE_COUNT_1_BIT; + } + pImageFormatProperties->maxResourceSize = 0xffffffff; /* 32-bit allocation */ + if (pYcbcrImageFormatProperties) { + pYcbcrImageFormatProperties->combinedImageSamplerDescriptorCount = + vk_format_get_plane_count(info->format); + } + return VK_SUCCESS; unsupported: @@ -486,6 +665,8 @@ v3dv_GetPhysicalDeviceImageFormatProperties2(VkPhysicalDevice physicalDevice, const VkPhysicalDeviceExternalImageFormatInfo *external_info = NULL; const VkPhysicalDeviceImageDrmFormatModifierInfoEXT *drm_format_mod_info = NULL; VkExternalImageFormatProperties *external_props = NULL; + UNUSED VkAndroidHardwareBufferUsageANDROID *android_usage = NULL; + VkSamplerYcbcrConversionImageFormatProperties *ycbcr_props = NULL; VkImageTiling tiling = base_info->tiling; /* Extract input structs */ @@ -494,6 +675,9 @@ v3dv_GetPhysicalDeviceImageFormatProperties2(VkPhysicalDevice physicalDevice, case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO: external_info = (const void *) s; break; + case VK_STRUCTURE_TYPE_IMAGE_STENCIL_USAGE_CREATE_INFO: + /* Do nothing, get_image_format_properties() below will handle it */; + break; case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT: drm_format_mod_info = (const void *) s; switch (drm_format_mod_info->drmFormatModifier) { @@ -522,6 +706,12 @@ v3dv_GetPhysicalDeviceImageFormatProperties2(VkPhysicalDevice physicalDevice, case VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES: external_props = (void *) s; break; + case VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_USAGE_ANDROID: + android_usage = (void *)s; + break; + case VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_IMAGE_FORMAT_PROPERTIES: + ycbcr_props = (void *) s; + break; default: v3dv_debug_ignored_stype(s->sType); break; @@ -530,7 +720,8 @@ v3dv_GetPhysicalDeviceImageFormatProperties2(VkPhysicalDevice physicalDevice, VkResult result = get_image_format_properties(physical_device, base_info, tiling, - &base_props->imageFormatProperties, NULL); + &base_props->imageFormatProperties, + ycbcr_props); if (result != VK_SUCCESS) goto done; @@ -541,12 +732,28 @@ v3dv_GetPhysicalDeviceImageFormatProperties2(VkPhysicalDevice physicalDevice, if (external_props) external_props->externalMemoryProperties = prime_fd_props; break; +#if DETECT_OS_ANDROID + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID: + if (external_props) { + external_props->externalMemoryProperties.exportFromImportedHandleTypes = 0; + external_props->externalMemoryProperties.compatibleHandleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID; + external_props->externalMemoryProperties.externalMemoryFeatures = VK_EXTERNAL_MEMORY_FEATURE_DEDICATED_ONLY_BIT | VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT | VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT; + } + break; +#endif default: result = VK_ERROR_FORMAT_NOT_SUPPORTED; break; } } + if (android_usage) { +#if DETECT_OS_ANDROID + android_usage->androidHardwareBufferUsage = + vk_image_usage_to_ahb_usage(base_info->flags, base_info->usage); +#endif + } + done: return result; } diff --git a/src/broadcom/vulkan/v3dv_image.c b/src/broadcom/vulkan/v3dv_image.c index c7ae05c4c22..358c03c555f 100644 --- a/src/broadcom/vulkan/v3dv_image.c +++ b/src/broadcom/vulkan/v3dv_image.c @@ -1,5 +1,5 @@ /* - 
* Copyright © 2019 Raspberry Pi + * Copyright © 2019 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -26,9 +26,11 @@ #include "drm-uapi/drm_fourcc.h" #include "util/format/u_format.h" #include "util/u_math.h" -#include "vk_format_info.h" #include "vk_util.h" #include "vulkan/wsi/wsi_common.h" +#if DETECT_OS_ANDROID +#include "vk_android.h" +#endif /** * Computes the HW's UIFblock padding for a given height/cpp. @@ -71,32 +73,61 @@ v3d_get_ub_pad(uint32_t cpp, uint32_t height) return 0; } -static void -v3d_setup_slices(struct v3dv_image *image) +/** + * Computes the dimension with required padding for mip levels. + * + * This padding is required for width and height dimensions when the mip + * level is greater than 1, and for the depth dimension when the mip level + * is greater than 0. This function expects to be passed a mip level >= 1. + * + * Note: Hardware documentation seems to suggest that the third argument + * should be the utile dimensions, but through testing it was found that + * the block dimension should be used instead. + */ +static uint32_t +v3d_get_dimension_mpad(uint32_t dimension, uint32_t level, uint32_t block_dimension) { - assert(image->cpp > 0); + assert(level >= 1); + uint32_t pot_dim = u_minify(dimension, 1); + pot_dim = util_next_power_of_two(DIV_ROUND_UP(pot_dim, block_dimension)); + uint32_t padded_dim = block_dimension * pot_dim; + return u_minify(padded_dim, level - 1); +} - uint32_t width = image->vk.extent.width; - uint32_t height = image->vk.extent.height; - uint32_t depth = image->vk.extent.depth; +static bool +v3d_setup_plane_slices(struct v3dv_image *image, uint8_t plane, + uint32_t plane_offset, + const VkSubresourceLayout *plane_layouts) +{ + assert(image->planes[plane].cpp > 0); - /* Note that power-of-two padding is based on level 1. These are not - * equivalent to just util_next_power_of_two(dimension), because at a - * level 0 dimension of 9, the level 1 power-of-two padded value is 4, - * not 8. - */ - uint32_t pot_width = 2 * util_next_power_of_two(u_minify(width, 1)); - uint32_t pot_height = 2 * util_next_power_of_two(u_minify(height, 1)); - uint32_t pot_depth = 2 * util_next_power_of_two(u_minify(depth, 1)); + uint32_t width = image->planes[plane].width; + uint32_t height = image->planes[plane].height; + uint32_t depth = image->vk.extent.depth; - uint32_t utile_w = v3d_utile_width(image->cpp); - uint32_t utile_h = v3d_utile_height(image->cpp); + uint32_t utile_w = v3d_utile_width(image->planes[plane].cpp); + uint32_t utile_h = v3d_utile_height(image->planes[plane].cpp); uint32_t uif_block_w = utile_w * 2; uint32_t uif_block_h = utile_h * 2; uint32_t block_width = vk_format_get_blockwidth(image->vk.format); uint32_t block_height = vk_format_get_blockheight(image->vk.format); + /* Note that power-of-two padding is based on level 1. These are not + * equivalent to just util_next_power_of_two(dimension), because at a + * level 0 dimension of 9, the level 1 power-of-two padded value is 4, + * not 8. Additionally the pot padding is based on the block size. 
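+ *
+ * Worked example, assuming a block dimension of 1: for a level 0 width of
+ * 9, u_minify(9, 1) = 4 and util_next_power_of_two(4) = 4, so the level 1
+ * padded width is 4. Callers then use 2 * 4 = 8 as the level 0 basis,
+ * which minifies back to 4 at level 1, whereas a naive
+ * util_next_power_of_two(9) = 16 basis would have given 8 instead.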
+ */ + uint32_t pot_width = 2 * v3d_get_dimension_mpad(width, + 1, + block_width); + uint32_t pot_height = 2 * v3d_get_dimension_mpad(height, + 1, + block_height); + uint32_t pot_depth = 2 * v3d_get_dimension_mpad(depth, + 1, + 1); + assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT || image->vk.samples == VK_SAMPLE_COUNT_4_BIT); bool msaa = image->vk.samples != VK_SAMPLE_COUNT_1_BIT; @@ -107,14 +138,30 @@ v3d_setup_slices(struct v3dv_image *image) assert(depth > 0); assert(image->vk.mip_levels >= 1); - uint32_t offset = 0; + /* Texture Base Address needs to be 64-byte aligned. If we have an explicit + * plane layout we will return false to fail image creation with appropriate + * error code. + */ + uint32_t offset; + if (plane_layouts) { + offset = plane_layouts[plane].offset; + if (offset % 64 != 0) + return false; + } else { + offset = plane_offset; + } + assert(plane_offset % 64 == 0); + for (int32_t i = image->vk.mip_levels - 1; i >= 0; i--) { - struct v3d_resource_slice *slice = &image->slices[i]; + struct v3d_resource_slice *slice = &image->planes[plane].slices[i]; + + slice->width = u_minify(width, i); + slice->height = u_minify(height, i); uint32_t level_width, level_height, level_depth; if (i < 2) { - level_width = u_minify(width, i); - level_height = u_minify(height, i); + level_width = slice->width; + level_height = slice->height; } else { level_width = u_minify(pot_width, i); level_height = u_minify(pot_height, i); @@ -136,7 +183,7 @@ v3d_setup_slices(struct v3dv_image *image) if (!image->tiled) { slice->tiling = V3D_TILING_RASTER; if (image->vk.image_type == VK_IMAGE_TYPE_1D) - level_width = align(level_width, 64 / image->cpp); + level_width = align(level_width, 64 / image->planes[plane].cpp); } else { if ((i != 0 || !uif_top) && (level_width <= utile_w || level_height <= utile_h)) { @@ -158,7 +205,8 @@ v3d_setup_slices(struct v3dv_image *image) level_width = align(level_width, 4 * uif_block_w); level_height = align(level_height, uif_block_h); - slice->ub_pad = v3d_get_ub_pad(image->cpp, level_height); + slice->ub_pad = v3d_get_ub_pad(image->planes[plane].cpp, + level_height); level_height += slice->ub_pad * uif_block_h; /* If the padding set us to to be aligned to the page cache size, @@ -175,12 +223,25 @@ v3d_setup_slices(struct v3dv_image *image) } slice->offset = offset; - slice->stride = level_width * image->cpp; + slice->stride = level_width * image->planes[plane].cpp; + + /* We assume that rowPitch in the plane layout refers to level 0 */ + if (plane_layouts && i == 0) { + if (plane_layouts[plane].rowPitch < slice->stride) + return false; + if (plane_layouts[plane].rowPitch % image->planes[plane].cpp) + return false; + if (image->tiled && (plane_layouts[plane].rowPitch % (4 * uif_block_w))) + return false; + slice->stride = plane_layouts[plane].rowPitch; + } + slice->padded_height = level_height; if (slice->tiling == V3D_TILING_UIF_NO_XOR || slice->tiling == V3D_TILING_UIF_XOR) { slice->padded_height_of_output_image_in_uif_blocks = - slice->padded_height / (2 * v3d_utile_height(image->cpp)); + slice->padded_height / + (2 * v3d_utile_height(image->planes[plane].cpp)); } slice->size = level_height * slice->stride; @@ -188,7 +249,7 @@ v3d_setup_slices(struct v3dv_image *image) /* The HW aligns level 1's base to a page if any of level 1 or * below could be UIF XOR. The lower levels then inherit the - * alignment for as long as necesary, thanks to being power of + * alignment for as long as necessary, thanks to being power of * two aligned. 
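+    * (once level 1 starts on a page boundary, the smaller levels below it
+    * keep a compatible alignment for free, since their power-of-two padded
+    * sizes only ever shift offsets by aligned amounts).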
*/ if (i == 1 && @@ -200,7 +261,7 @@ v3d_setup_slices(struct v3dv_image *image) offset += slice_total_size; } - image->size = offset; + image->planes[plane].size = offset - plane_offset; /* UIF/UBLINEAR levels need to be aligned to UIF-blocks, and LT only * needs to be aligned to utile boundaries. Since tiles are laid out @@ -209,14 +270,27 @@ v3d_setup_slices(struct v3dv_image *image) * slices. * * We additionally align to 4k, which improves UIF XOR performance. + * + * Finally, because the Texture Base Address field must be 64-byte aligned, + * we also need to align linear images to 64 if the image is going to be + * used for transfer. */ - image->alignment = image->tiled ? 4096 : image->cpp; + if (image->tiled) { + image->planes[plane].alignment = 4096; + } else { + image->planes[plane].alignment = + (image->vk.usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT) ? + 64 : image->planes[plane].cpp; + } + uint32_t align_offset = - align(image->slices[0].offset, image->alignment) - image->slices[0].offset; + align(image->planes[plane].slices[0].offset, + image->planes[plane].alignment) - + image->planes[plane].slices[0].offset; if (align_offset) { - image->size += align_offset; + image->planes[plane].size += align_offset; for (int i = 0; i < image->vk.mip_levels; i++) - image->slices[i].offset += align_offset; + image->planes[plane].slices[i].offset += align_offset; } /* Arrays and cube textures have a stride which is the distance from @@ -224,41 +298,112 @@ v3d_setup_slices(struct v3dv_image *image) * we need to program the stride between slices of miplevel 0. */ if (image->vk.image_type != VK_IMAGE_TYPE_3D) { - image->cube_map_stride = - align(image->slices[0].offset + image->slices[0].size, 64); - image->size += image->cube_map_stride * (image->vk.array_layers - 1); + image->planes[plane].cube_map_stride = + align(image->planes[plane].slices[0].offset + + image->planes[plane].slices[0].size, 64); + + if (plane_layouts && image->vk.array_layers > 1) { + if (plane_layouts[plane].arrayPitch % 64 != 0) + return false; + if (plane_layouts[plane].arrayPitch < + image->planes[plane].cube_map_stride) { + return false; + } + image->planes[plane].cube_map_stride = plane_layouts[plane].arrayPitch; + } + + image->planes[plane].size += image->planes[plane].cube_map_stride * + (image->vk.array_layers - 1); } else { - image->cube_map_stride = image->slices[0].size; + image->planes[plane].cube_map_stride = image->planes[plane].slices[0].size; + if (plane_layouts) { + /* We assume that depthPitch in the plane layout refers to level 0 */ + if (plane_layouts[plane].depthPitch != + image->planes[plane].slices[0].size) { + return false; + } + } + } + + return true; +} + +static VkResult +v3d_setup_slices(struct v3dv_image *image, bool disjoint, + const VkSubresourceLayout *plane_layouts) +{ + if (disjoint && image->plane_count == 1) + disjoint = false; + + uint64_t offset = 0; + for (uint8_t plane = 0; plane < image->plane_count; plane++) { + offset = disjoint ? 0 : offset; + if (!v3d_setup_plane_slices(image, plane, offset, plane_layouts)) { + assert(plane_layouts); + return VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT; + } + offset += align64(image->planes[plane].size, 64); } + + /* From the Vulkan spec: + * + * "If the size of the resultant image would exceed maxResourceSize, then + * vkCreateImage must fail and return VK_ERROR_OUT_OF_DEVICE_MEMORY. This + * failure may occur even when all image creation parameters satisfy their + * valid usage requirements." 
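+    *
+    * Our maxResourceSize is 0xffffffff (32-bit allocations), which is
+    * exactly what the offset check below enforces.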
+ */ + if (offset > 0xffffffff) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + + image->non_disjoint_size = disjoint ? 0 : offset; + return VK_SUCCESS; } uint32_t -v3dv_layer_offset(const struct v3dv_image *image, uint32_t level, uint32_t layer) +v3dv_layer_offset(const struct v3dv_image *image, uint32_t level, uint32_t layer, + uint8_t plane) { - const struct v3d_resource_slice *slice = &image->slices[level]; + const struct v3d_resource_slice *slice = &image->planes[plane].slices[level]; if (image->vk.image_type == VK_IMAGE_TYPE_3D) - return image->mem_offset + slice->offset + layer * slice->size; + return image->planes[plane].mem_offset + slice->offset + layer * slice->size; else - return image->mem_offset + slice->offset + layer * image->cube_map_stride; + return image->planes[plane].mem_offset + slice->offset + + layer * image->planes[plane].cube_map_stride; } -static VkResult -create_image(struct v3dv_device *device, - const VkImageCreateInfo *pCreateInfo, - const VkAllocationCallbacks *pAllocator, - VkImage *pImage) +VkResult +v3dv_update_image_layout(struct v3dv_device *device, + struct v3dv_image *image, + uint64_t modifier, + bool disjoint, + const VkImageDrmFormatModifierExplicitCreateInfoEXT *explicit_mod_info) { - struct v3dv_image *image = NULL; + assert(!explicit_mod_info || + image->plane_count == explicit_mod_info->drmFormatModifierPlaneCount); - image = vk_image_create(&device->vk, pCreateInfo, pAllocator, sizeof(*image)); - if (image == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + assert(!explicit_mod_info || + modifier == explicit_mod_info->drmFormatModifier); + + image->tiled = modifier != DRM_FORMAT_MOD_LINEAR; + + image->vk.drm_format_mod = modifier; + + return v3d_setup_slices(image, disjoint, + explicit_mod_info ? explicit_mod_info->pPlaneLayouts : + NULL); +} +VkResult +v3dv_image_init(struct v3dv_device *device, + const VkImageCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + struct v3dv_image *image) +{ /* When using the simulator the WSI common code will see that our * driver wsi device doesn't match the display device and because of that * it will not attempt to present directly from the swapchain images, - * instead it will use the prime blit path (use_prime_blit flag in + * instead it will use the prime blit path (use_buffer_blit flag in * struct wsi_swapchain), where it copies the contents of the swapchain * images to a linear buffer with appropriate row stride for presentation. 
* As a result, on that path, swapchain images do not have any special @@ -266,11 +411,20 @@ create_image(struct v3dv_device *device, */ VkImageTiling tiling = pCreateInfo->tiling; uint64_t modifier = DRM_FORMAT_MOD_INVALID; + const VkImageDrmFormatModifierListCreateInfoEXT *mod_info = NULL; + const VkImageDrmFormatModifierExplicitCreateInfoEXT *explicit_mod_info = NULL; +#if DETECT_OS_ANDROID + if (image->is_native_buffer_memory) { + assert(image->android_explicit_layout); + explicit_mod_info = image->android_explicit_layout; + modifier = explicit_mod_info->drmFormatModifier; + } +#endif if (tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) { - const VkImageDrmFormatModifierListCreateInfoEXT *mod_info = + mod_info = vk_find_struct_const(pCreateInfo->pNext, IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT); - const VkImageDrmFormatModifierExplicitCreateInfoEXT *explicit_mod_info = + explicit_mod_info = vk_find_struct_const(pCreateInfo->pNext, IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT); assert(mod_info || explicit_mod_info); @@ -297,21 +451,42 @@ create_image(struct v3dv_device *device, tiling = VK_IMAGE_TILING_LINEAR; } + if (modifier == DRM_FORMAT_MOD_INVALID) + modifier = (tiling == VK_IMAGE_TILING_OPTIMAL) ? DRM_FORMAT_MOD_BROADCOM_UIF + : DRM_FORMAT_MOD_LINEAR; + const struct v3dv_format *format = - v3dv_X(device, get_format)(pCreateInfo->format); - v3dv_assert(format != NULL && format->supported); + v3dv_X(device, get_format)(image->vk.format); + v3dv_assert(format != NULL && format->plane_count); assert(pCreateInfo->samples == VK_SAMPLE_COUNT_1_BIT || pCreateInfo->samples == VK_SAMPLE_COUNT_4_BIT); image->format = format; - image->cpp = vk_format_get_blocksize(image->vk.format); - image->tiled = tiling == VK_IMAGE_TILING_OPTIMAL || - (tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT && - modifier != DRM_FORMAT_MOD_LINEAR); - image->vk.tiling = tiling; - image->vk.drm_format_mod = modifier; + image->plane_count = vk_format_get_plane_count(image->vk.format); + + const struct vk_format_ycbcr_info *ycbcr_info = + vk_format_get_ycbcr_info(image->vk.format); + + for (uint8_t plane = 0; plane < image->plane_count; plane++) { + VkFormat plane_format = + vk_format_get_plane_format(image->vk.format, plane); + image->planes[plane].cpp = + vk_format_get_blocksize(plane_format); + image->planes[plane].vk_format = plane_format; + + image->planes[plane].width = image->vk.extent.width; + image->planes[plane].height = image->vk.extent.height; + + if (ycbcr_info) { + image->planes[plane].width /= + ycbcr_info->planes[plane].denominator_scales[0]; + + image->planes[plane].height /= + ycbcr_info->planes[plane].denominator_scales[1]; + } + } /* Our meta paths can create image views with compatible formats for any * image, so always set this flag to keep the common Vulkan image code @@ -319,11 +494,112 @@ create_image(struct v3dv_device *device, */ image->vk.create_flags |= VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT; - v3d_setup_slices(image); +#if DETECT_OS_ANDROID + /* At this time, an AHB handle is not yet provided. 
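+    * (the AHB is only known once the application imports it through
+    * VkImportAndroidHardwareBufferInfoANDROID at memory allocation time).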
+ * Image layout will be filled up during vkBindImageMemory2 + */ + if (image->is_ahb) + return VK_SUCCESS; +#endif + + bool disjoint = image->vk.create_flags & VK_IMAGE_CREATE_DISJOINT_BIT; + + return v3dv_update_image_layout(device, image, modifier, disjoint, + explicit_mod_info); +} + +static VkResult +create_image(struct v3dv_device *device, + const VkImageCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkImage *pImage) +{ + VkResult result; + struct v3dv_image *image = NULL; + + image = vk_image_create(&device->vk, pCreateInfo, pAllocator, sizeof(*image)); + if (image == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + +#if DETECT_OS_ANDROID + const VkExternalMemoryImageCreateInfo *external_info = + vk_find_struct_const(pCreateInfo->pNext, EXTERNAL_MEMORY_IMAGE_CREATE_INFO); + + const VkNativeBufferANDROID *native_buffer = + vk_find_struct_const(pCreateInfo->pNext, NATIVE_BUFFER_ANDROID); + + if (native_buffer != NULL) + image->is_native_buffer_memory = true; + + image->is_ahb = external_info && (external_info->handleTypes & + VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID); + + assert(!(image->is_ahb && image->is_native_buffer_memory)); + + if (image->is_ahb || image->is_native_buffer_memory) { + image->android_explicit_layout = vk_alloc2(&device->vk.alloc, pAllocator, + sizeof(VkImageDrmFormatModifierExplicitCreateInfoEXT), + 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!image->android_explicit_layout) { + result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail; + } + + image->android_plane_layouts = vk_alloc2(&device->vk.alloc, pAllocator, + sizeof(VkSubresourceLayout) * V3DV_MAX_PLANE_COUNT, + 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!image->android_plane_layouts) { + result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail; + } + } + + if (image->is_native_buffer_memory) { + struct u_gralloc_buffer_handle gr_handle = { + .handle = native_buffer->handle, + .hal_format = native_buffer->format, + .pixel_stride = native_buffer->stride, + }; + + result = v3dv_gralloc_to_drm_explicit_layout(device->gralloc, + &gr_handle, + image->android_explicit_layout, + image->android_plane_layouts, + V3DV_MAX_PLANE_COUNT); + if (result != VK_SUCCESS) + goto fail; + } +#endif + + result = v3dv_image_init(device, pCreateInfo, pAllocator, image); + if (result != VK_SUCCESS) + goto fail; + +#if DETECT_OS_ANDROID + if (image->is_native_buffer_memory) { + result = v3dv_import_native_buffer_fd(v3dv_device_to_handle(device), + native_buffer->handle->data[0], pAllocator, + v3dv_image_to_handle(image)); + if (result != VK_SUCCESS) + goto fail; + } +#endif *pImage = v3dv_image_to_handle(image); return VK_SUCCESS; + +fail: +#if DETECT_OS_ANDROID + if (image->android_explicit_layout) + vk_free2(&device->vk.alloc, pAllocator, image->android_explicit_layout); + if (image->android_plane_layouts) + vk_free2(&device->vk.alloc, pAllocator, image->android_plane_layouts); +#endif + + vk_image_destroy(&device->vk, pAllocator, &image->vk); + return result; } static VkResult @@ -381,8 +657,14 @@ v3dv_CreateImage(VkDevice _device, { V3DV_FROM_HANDLE(v3dv_device, device, _device); +#if DETECT_OS_ANDROID + /* VkImageSwapchainCreateInfoKHR is not useful at all */ + const VkImageSwapchainCreateInfoKHR *swapchain_info = NULL; +#else const VkImageSwapchainCreateInfoKHR *swapchain_info = vk_find_struct_const(pCreateInfo->pNext, IMAGE_SWAPCHAIN_CREATE_INFO_KHR); +#endif + if (swapchain_info && swapchain_info->swapchain != VK_NULL_HANDLE) return 
create_image_from_swapchain(device, pCreateInfo, swapchain_info, pAllocator, pImage); @@ -398,13 +680,30 @@ v3dv_GetImageSubresourceLayout(VkDevice device, { V3DV_FROM_HANDLE(v3dv_image, image, _image); + uint8_t plane = v3dv_plane_from_aspect(subresource->aspectMask); const struct v3d_resource_slice *slice = - &image->slices[subresource->mipLevel]; + &image->planes[plane].slices[subresource->mipLevel]; + + /* About why the offset below works for both disjoint and non-disjoint + * cases, from the Vulkan spec: + * + * "If the image is disjoint, then the offset is relative to the base + * address of the plane." + * + * "If the image is non-disjoint, then the offset is relative to the base + * address of the image." + * + * In our case, the per-plane mem_offset for non-disjoint images is the + * same for all planes and matches the base address of the image. + */ layout->offset = - v3dv_layer_offset(image, subresource->mipLevel, subresource->arrayLayer); + v3dv_layer_offset(image, subresource->mipLevel, subresource->arrayLayer, + plane) - image->planes[plane].mem_offset; layout->rowPitch = slice->stride; - layout->depthPitch = image->cube_map_stride; - layout->arrayPitch = image->cube_map_stride; + layout->depthPitch = image->vk.image_type == VK_IMAGE_TYPE_3D ? + image->planes[plane].cube_map_stride : 0; + layout->arrayPitch = image->vk.array_layers > 1 ? + image->planes[plane].cube_map_stride : 0; if (image->vk.image_type != VK_IMAGE_TYPE_3D) { layout->size = slice->size; @@ -419,7 +718,7 @@ v3dv_GetImageSubresourceLayout(VkDevice device, layout->size = slice->size * image->vk.extent.depth; } else { const struct v3d_resource_slice *prev_slice = - &image->slices[subresource->mipLevel - 1]; + &image->planes[plane].slices[subresource->mipLevel - 1]; layout->size = prev_slice->offset - slice->offset; } } @@ -436,6 +735,35 @@ v3dv_DestroyImage(VkDevice _device, if (image == NULL) return; + /* If we have created a shadow tiled image for this image we must also free + * it (along with its memory allocation). + */ + if (image->shadow) { + bool disjoint = image->vk.create_flags & VK_IMAGE_CREATE_DISJOINT_BIT; + for (int i = 0; i < (disjoint ? 
image->plane_count : 1); i++) { + if (image->shadow->planes[i].mem) { + v3dv_FreeMemory(_device, + v3dv_device_memory_to_handle(image->shadow->planes[i].mem), + pAllocator); + } + } + v3dv_DestroyImage(_device, v3dv_image_to_handle(image->shadow), + pAllocator); + image->shadow = NULL; + } + +#if DETECT_OS_ANDROID + if (image->is_native_buffer_memory) + v3dv_FreeMemory(_device, + v3dv_device_memory_to_handle(image->planes[0].mem), + pAllocator); + + if (image->android_explicit_layout) + vk_free2(&device->vk.alloc, pAllocator, image->android_explicit_layout); + if (image->android_plane_layouts) + vk_free2(&device->vk.alloc, pAllocator, image->android_plane_layouts); +#endif + vk_image_destroy(&device->vk, pAllocator, &image->vk); } @@ -451,96 +779,102 @@ v3dv_image_type_to_view_type(VkImageType type) } } -static enum pipe_swizzle -vk_component_mapping_to_pipe_swizzle(VkComponentSwizzle swz) -{ - assert(swz != VK_COMPONENT_SWIZZLE_IDENTITY); - - switch (swz) { - case VK_COMPONENT_SWIZZLE_ZERO: - return PIPE_SWIZZLE_0; - case VK_COMPONENT_SWIZZLE_ONE: - return PIPE_SWIZZLE_1; - case VK_COMPONENT_SWIZZLE_R: - return PIPE_SWIZZLE_X; - case VK_COMPONENT_SWIZZLE_G: - return PIPE_SWIZZLE_Y; - case VK_COMPONENT_SWIZZLE_B: - return PIPE_SWIZZLE_Z; - case VK_COMPONENT_SWIZZLE_A: - return PIPE_SWIZZLE_W; - default: - unreachable("Unknown VkComponentSwizzle"); - }; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_CreateImageView(VkDevice _device, - const VkImageViewCreateInfo *pCreateInfo, - const VkAllocationCallbacks *pAllocator, - VkImageView *pView) +static VkResult +create_image_view(struct v3dv_device *device, + bool driver_internal, + const VkImageViewCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkImageView *pView) { - V3DV_FROM_HANDLE(v3dv_device, device, _device); V3DV_FROM_HANDLE(v3dv_image, image, pCreateInfo->image); struct v3dv_image_view *iview; - iview = vk_image_view_create(&device->vk, pCreateInfo, pAllocator, - sizeof(*iview)); + iview = vk_image_view_create(&device->vk, driver_internal, pCreateInfo, + pAllocator, sizeof(*iview)); if (iview == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + const VkImageAspectFlagBits any_plane_aspect = + VK_IMAGE_ASPECT_PLANE_0_BIT | + VK_IMAGE_ASPECT_PLANE_1_BIT | + VK_IMAGE_ASPECT_PLANE_2_BIT; + + if (image->vk.aspects & any_plane_aspect) { + assert((image->vk.aspects & ~any_plane_aspect) == 0); + iview->plane_count = 0; + static const VkImageAspectFlagBits plane_aspects[]= { + VK_IMAGE_ASPECT_PLANE_0_BIT, + VK_IMAGE_ASPECT_PLANE_1_BIT, + VK_IMAGE_ASPECT_PLANE_2_BIT + }; + for (uint8_t plane = 0; plane < V3DV_MAX_PLANE_COUNT; plane++) { + if (iview->vk.aspects & plane_aspects[plane]) + iview->planes[iview->plane_count++].image_plane = plane; + } + } else { + iview->plane_count = 1; + iview->planes[0].image_plane = 0; + } + /* At this point we should have at least one plane */ + assert(iview->plane_count > 0); const VkImageSubresourceRange *range = &pCreateInfo->subresourceRange; - iview->offset = v3dv_layer_offset(image, iview->vk.base_mip_level, - iview->vk.base_array_layer); - /* If we have D24S8 format but the view only selects the stencil aspect * we want to re-interpret the format as RGBA8_UINT, then map our stencil * data reads to the R component and ignore the GBA channels that contain * the depth aspect data. 
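+    *
+    * For example, with an identity component mapping the composed swizzle
+    * computed below is (X, 0, 0, 1): stencil reads land in R, while G/B
+    * return zero and A returns one.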
+ * + * FIXME: the code below calls vk_component_mapping_to_pipe_swizzle + * only so it can then call util_format_compose_swizzles later. Maybe it + * makes sense to implement swizzle composition using VkSwizzle directly. */ VkFormat format; - uint8_t image_view_swizzle[4]; - if (pCreateInfo->format == VK_FORMAT_D24_UNORM_S8_UINT && + if (image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT && range->aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) { format = VK_FORMAT_R8G8B8A8_UINT; - image_view_swizzle[0] = PIPE_SWIZZLE_X; - image_view_swizzle[1] = PIPE_SWIZZLE_0; - image_view_swizzle[2] = PIPE_SWIZZLE_0; - image_view_swizzle[3] = PIPE_SWIZZLE_1; + uint8_t stencil_aspect_swizzle[4] = { + PIPE_SWIZZLE_X, PIPE_SWIZZLE_0, PIPE_SWIZZLE_0, PIPE_SWIZZLE_1, + }; + uint8_t view_swizzle[4]; + vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle, view_swizzle); + + util_format_compose_swizzles(stencil_aspect_swizzle, view_swizzle, + iview->view_swizzle); } else { - format = pCreateInfo->format; - - /* FIXME: we are doing this vk to pipe swizzle mapping just to call - * util_format_compose_swizzles. Would be good to check if it would be - * better to reimplement the latter using vk component - */ - image_view_swizzle[0] = - vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle.r); - image_view_swizzle[1] = - vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle.g); - image_view_swizzle[2] = - vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle.b); - image_view_swizzle[3] = - vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle.a); + format = iview->vk.format; + vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle, + iview->view_swizzle); } - iview->vk.format = format; + iview->vk.view_format = format; iview->format = v3dv_X(device, get_format)(format); - assert(iview->format && iview->format->supported); + assert(iview->format && iview->format->plane_count); - if (vk_format_is_depth_or_stencil(iview->vk.format)) { - iview->internal_type = - v3dv_X(device, get_internal_depth_type)(iview->vk.format); - } else { - v3dv_X(device, get_internal_type_bpp_for_output_format) - (iview->format->rt_type, &iview->internal_type, &iview->internal_bpp); - } + for (uint8_t plane = 0; plane < iview->plane_count; plane++) { + iview->planes[plane].offset = v3dv_layer_offset(image, + iview->vk.base_mip_level, + iview->vk.base_array_layer, + plane); + + if (vk_format_is_depth_or_stencil(iview->vk.view_format)) { + iview->planes[plane].internal_type = + v3dv_X(device, get_internal_depth_type)(iview->vk.view_format); + } else { + v3dv_X(device, get_internal_type_bpp_for_output_format) + (iview->format->planes[plane].rt_type, + &iview->planes[plane].internal_type, + &iview->planes[plane].internal_bpp); + } - const uint8_t *format_swizzle = v3dv_get_format_swizzle(device, format); - util_format_compose_swizzles(format_swizzle, image_view_swizzle, - iview->swizzle); - iview->swap_rb = iview->swizzle[0] == PIPE_SWIZZLE_Z; + const uint8_t *format_swizzle = + v3dv_get_format_swizzle(device, format, plane); + util_format_compose_swizzles(format_swizzle, iview->view_swizzle, + iview->planes[plane].swizzle); + + iview->planes[plane].swap_rb = v3dv_format_swizzle_needs_rb_swap(format_swizzle); + iview->planes[plane].channel_reverse = v3dv_format_swizzle_needs_reverse(format_swizzle); + } v3dv_X(device, pack_texture_shader_state)(device, iview); @@ -549,6 +883,25 @@ v3dv_CreateImageView(VkDevice _device, return VK_SUCCESS; } +VkResult +v3dv_create_image_view(struct v3dv_device *device, + const VkImageViewCreateInfo *pCreateInfo, + 
VkImageView *pView) +{ + return create_image_view(device, true, pCreateInfo, NULL, pView); +} + +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_CreateImageView(VkDevice _device, + const VkImageViewCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkImageView *pView) +{ + V3DV_FROM_HANDLE(v3dv_device, device, _device); + + return create_image_view(device, false, pCreateInfo, pAllocator, pView); +} + VKAPI_ATTR void VKAPI_CALL v3dv_DestroyImageView(VkDevice _device, VkImageView imageView, @@ -560,6 +913,13 @@ v3dv_DestroyImageView(VkDevice _device, if (image_view == NULL) return; + if (image_view->shadow) { + v3dv_DestroyImageView(_device, + v3dv_image_view_to_handle(image_view->shadow), + pAllocator); + image_view->shadow = NULL; + } + vk_image_view_destroy(&device->vk, pAllocator, &image_view->vk); } @@ -578,7 +938,7 @@ v3dv_CreateBufferView(VkDevice _device, vk_object_zalloc(&device->vk, pAllocator, sizeof(*view), VK_OBJECT_TYPE_BUFFER_VIEW); if (!view) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); uint32_t range; if (pCreateInfo->range == VK_WHOLE_SIZE) @@ -596,8 +956,10 @@ v3dv_CreateBufferView(VkDevice _device, view->vk_format = pCreateInfo->format; view->format = v3dv_X(device, get_format)(view->vk_format); + /* We don't support multi-plane formats for buffer views */ + assert(view->format->plane_count == 1); v3dv_X(device, get_internal_type_bpp_for_output_format) - (view->format->rt_type, &view->internal_type, &view->internal_bpp); + (view->format->planes[0].rt_type, &view->internal_type, &view->internal_bpp); if (buffer->usage & VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT || buffer->usage & VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT) diff --git a/src/broadcom/vulkan/v3dv_limits.h b/src/broadcom/vulkan/v3dv_limits.h index aaab1ce03ac..4df172e6bf3 100644 --- a/src/broadcom/vulkan/v3dv_limits.h +++ b/src/broadcom/vulkan/v3dv_limits.h @@ -1,5 +1,5 @@ /* - * Copyright © 2020 Raspberry Pi + * Copyright © 2020 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -23,8 +23,6 @@ #ifndef V3DV_LIMITS_H #define V3DV_LIMITS_H -#define NSEC_PER_SEC 1000000000ull - /* From vulkan spec "If the multiple viewports feature is not enabled, * scissorCount must be 1", ditto for viewportCount. For now we don't support * that feature. 
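/* Editor's aside on the limits hunk below: the bump of MAX_UNIFORM_BUFFERS
 * from 12 to 16 together with the new MAX_INLINE_UNIFORM_BUFFERS (4)
 * suggests that each inline uniform block consumes a regular UBO slot,
 * i.e. 12 application UBOs plus 4 inline blocks. This reading is inferred
 * from the numbers, not stated by the patch.
 */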
@@ -43,7 +41,8 @@ #define MAX_STORAGE_IMAGES 4 #define MAX_INPUT_ATTACHMENTS 4 -#define MAX_UNIFORM_BUFFERS 12 +#define MAX_UNIFORM_BUFFERS 16 +#define MAX_INLINE_UNIFORM_BUFFERS 4 #define MAX_STORAGE_BUFFERS 8 #define MAX_DYNAMIC_UNIFORM_BUFFERS 8 @@ -51,8 +50,6 @@ #define MAX_DYNAMIC_BUFFERS (MAX_DYNAMIC_UNIFORM_BUFFERS + \ MAX_DYNAMIC_STORAGE_BUFFERS) -#define MAX_RENDER_TARGETS 4 - #define MAX_MULTIVIEW_VIEW_COUNT 16 /* These are tunable parameters in the HW design, but all the V3D diff --git a/src/broadcom/vulkan/v3dv_meta_clear.c b/src/broadcom/vulkan/v3dv_meta_clear.c index 5555c690bb3..d8868142329 100644 --- a/src/broadcom/vulkan/v3dv_meta_clear.c +++ b/src/broadcom/vulkan/v3dv_meta_clear.c @@ -1,5 +1,5 @@ /* - * Copyright © 2020 Raspberry Pi + * Copyright © 2020 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -25,8 +25,8 @@ #include "v3dv_meta_common.h" #include "compiler/nir/nir_builder.h" -#include "vk_format_info.h" #include "util/u_pack_color.h" +#include "vk_common_entrypoints.h" static void get_hw_clear_color(struct v3dv_device *device, @@ -68,7 +68,13 @@ clear_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, { const VkOffset3D origin = { 0, 0, 0 }; VkFormat fb_format; - if (!v3dv_meta_can_use_tlb(image, &origin, &fb_format)) + + /* From vkCmdClearColorImage spec: + * "image must not use any of the formats that require a sampler YCBCR + * conversion" + */ + assert(image->plane_count == 1); + if (!v3dv_meta_can_use_tlb(image, 0, 0, &origin, NULL, &fb_format)) return false; uint32_t internal_type, internal_bpp; @@ -120,8 +126,9 @@ clear_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, if (!job) return true; - v3dv_job_start_frame(job, width, height, max_layer, false, - 1, internal_bpp, + v3dv_job_start_frame(job, width, height, max_layer, + false, true, 1, internal_bpp, + 4 * v3d_internal_bpp_words(internal_bpp), image->vk.samples > VK_SAMPLE_COUNT_1_BIT); struct v3dv_meta_framebuffer framebuffer; @@ -161,11 +168,15 @@ v3dv_CmdClearColorImage(VkCommandBuffer commandBuffer, .color = *pColor, }; + cmd_buffer->state.is_transfer = true; + for (uint32_t i = 0; i < rangeCount; i++) { if (clear_image_tlb(cmd_buffer, image, &clear_value, &pRanges[i])) continue; unreachable("Unsupported color clear."); } + + cmd_buffer->state.is_transfer = false; } VKAPI_ATTR void VKAPI_CALL @@ -183,11 +194,15 @@ v3dv_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer, .depthStencil = *pDepthStencil, }; + cmd_buffer->state.is_transfer = true; + for (uint32_t i = 0; i < rangeCount; i++) { if (clear_image_tlb(cmd_buffer, image, &clear_value, &pRanges[i])) continue; unreachable("Unsupported depth/stencil clear."); } + + cmd_buffer->state.is_transfer = false; } static void @@ -304,39 +319,6 @@ v3dv_meta_clear_finish(struct v3dv_device *device) } } -static nir_ssa_def * -gen_rect_vertices(nir_builder *b) -{ - nir_ssa_def *vertex_id = nir_load_vertex_id(b); - - /* vertex 0: -1.0, -1.0 - * vertex 1: -1.0, 1.0 - * vertex 2: 1.0, -1.0 - * vertex 3: 1.0, 1.0 - * - * so: - * - * channel 0 is vertex_id < 2 ? -1.0 : 1.0 - * channel 1 is vertex id & 1 ? 
1.0 : -1.0 - */ - - nir_ssa_def *one = nir_imm_int(b, 1); - nir_ssa_def *c0cmp = nir_ilt(b, vertex_id, nir_imm_int(b, 2)); - nir_ssa_def *c1cmp = nir_ieq(b, nir_iand(b, vertex_id, one), one); - - nir_ssa_def *comp[4]; - comp[0] = nir_bcsel(b, c0cmp, - nir_imm_float(b, -1.0f), - nir_imm_float(b, 1.0f)); - - comp[1] = nir_bcsel(b, c1cmp, - nir_imm_float(b, 1.0f), - nir_imm_float(b, -1.0f)); - comp[2] = nir_imm_float(b, 0.0f); - comp[3] = nir_imm_float(b, 1.0f); - return nir_vec(b, comp, 4); -} - static nir_shader * get_clear_rect_vs() { @@ -349,7 +331,7 @@ get_clear_rect_vs() nir_variable_create(b.shader, nir_var_shader_out, vec4, "gl_Position"); vs_out_pos->data.location = VARYING_SLOT_POS; - nir_ssa_def *pos = gen_rect_vertices(&b); + nir_def *pos = nir_gen_rect_vertices(&b, NULL, NULL); nir_store_var(&b, vs_out_pos, pos, 0xf); return b.shader; @@ -372,8 +354,8 @@ get_clear_rect_gs(uint32_t push_constant_layer_base) nir->info.inputs_read = 1ull << VARYING_SLOT_POS; nir->info.outputs_written = (1ull << VARYING_SLOT_POS) | (1ull << VARYING_SLOT_LAYER); - nir->info.gs.input_primitive = GL_TRIANGLES; - nir->info.gs.output_primitive = GL_TRIANGLE_STRIP; + nir->info.gs.input_primitive = MESA_PRIM_TRIANGLES; + nir->info.gs.output_primitive = MESA_PRIM_TRIANGLE_STRIP; nir->info.gs.vertices_in = 3; nir->info.gs.vertices_out = 3; nir->info.gs.invocations = 1; @@ -406,7 +388,7 @@ get_clear_rect_gs(uint32_t push_constant_layer_base) nir_copy_deref(&b, nir_build_deref_var(&b, gs_out_pos), in_pos_i); /* gl_Layer from push constants */ - nir_ssa_def *layer = + nir_def *layer = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = push_constant_layer_base, .range = 4); nir_store_var(&b, gs_out_layer, layer, 0x1); @@ -434,7 +416,7 @@ get_color_clear_rect_fs(uint32_t rt_idx, VkFormat format) nir_variable_create(b.shader, nir_var_shader_out, fs_out_type, "out_color"); fs_out_color->data.location = FRAG_RESULT_DATA0 + rt_idx; - nir_ssa_def *color_load = nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 0, .range = 16); + nir_def *color_load = nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 0, .range = 16); nir_store_var(&b, fs_out_color, color_load, 0xf); return b.shader; @@ -452,7 +434,7 @@ get_depth_clear_rect_fs() "out_depth"); fs_out_depth->data.location = FRAG_RESULT_DEPTH; - nir_ssa_def *depth_load = + nir_def *depth_load = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4); nir_store_var(&b, fs_out_depth, depth_load, 0x1); @@ -475,12 +457,11 @@ create_pipeline(struct v3dv_device *device, VkPipeline *pipeline) { VkPipelineShaderStageCreateInfo stages[3] = { 0 }; - struct vk_shader_module vs_m; + struct vk_shader_module vs_m = vk_shader_module_from_nir(vs_nir); struct vk_shader_module gs_m; struct vk_shader_module fs_m; uint32_t stage_count = 0; - v3dv_shader_module_internal_init(device, &vs_m, vs_nir); stages[stage_count].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; stages[stage_count].stage = VK_SHADER_STAGE_VERTEX_BIT; stages[stage_count].module = vk_shader_module_to_handle(&vs_m); @@ -488,7 +469,7 @@ create_pipeline(struct v3dv_device *device, stage_count++; if (gs_nir) { - v3dv_shader_module_internal_init(device, &gs_m, gs_nir); + gs_m = vk_shader_module_from_nir(gs_nir); stages[stage_count].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; stages[stage_count].stage = VK_SHADER_STAGE_GEOMETRY_BIT; stages[stage_count].module = vk_shader_module_to_handle(&gs_m); @@ -497,7 +478,7 @@ create_pipeline(struct v3dv_device 
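/* Editor's aside: vk_shader_module_from_nir() is the common-runtime
 * replacement for the driver-local v3dv_shader_module_internal_init(); it
 * wraps an existing nir_shader in a stack-allocated vk_shader_module
 * without taking ownership of the NIR, which is why create_pipeline()
 * still calls ralloc_free() on the shaders once the pipeline is built.
 */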
*device, } if (fs_nir) { - v3dv_shader_module_internal_init(device, &fs_m, fs_nir); + fs_m = vk_shader_module_from_nir(fs_nir); stages[stage_count].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; stages[stage_count].stage = VK_SHADER_STAGE_FRAGMENT_BIT; stages[stage_count].module = vk_shader_module_to_handle(&fs_m); @@ -581,6 +562,7 @@ create_pipeline(struct v3dv_device *device, pipeline); ralloc_free(vs_nir); + ralloc_free(gs_nir); ralloc_free(fs_nir); return result; @@ -592,7 +574,7 @@ create_color_clear_pipeline(struct v3dv_device *device, uint32_t subpass_idx, uint32_t rt_idx, VkFormat format, - uint32_t samples, + VkSampleCountFlagBits samples, uint32_t components, bool is_layered, VkPipelineLayout pipeline_layout, @@ -709,10 +691,11 @@ static VkResult create_color_clear_render_pass(struct v3dv_device *device, uint32_t rt_idx, VkFormat format, - uint32_t samples, + VkSampleCountFlagBits samples, VkRenderPass *pass) { - VkAttachmentDescription att = { + VkAttachmentDescription2 att = { + .sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2, .format = format, .samples = samples, .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD, @@ -721,12 +704,14 @@ create_color_clear_render_pass(struct v3dv_device *device, .finalLayout = VK_IMAGE_LAYOUT_GENERAL, }; - VkAttachmentReference att_ref = { + VkAttachmentReference2 att_ref = { + .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2, .attachment = rt_idx, .layout = VK_IMAGE_LAYOUT_GENERAL, }; - VkSubpassDescription subpass = { + VkSubpassDescription2 subpass = { + .sType = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2, .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, .inputAttachmentCount = 0, .colorAttachmentCount = 1, @@ -737,8 +722,8 @@ create_color_clear_render_pass(struct v3dv_device *device, .pPreserveAttachments = NULL, }; - VkRenderPassCreateInfo info = { - .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, + VkRenderPassCreateInfo2 info = { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2, .attachmentCount = 1, .pAttachments = &att, .subpassCount = 1, @@ -747,14 +732,14 @@ create_color_clear_render_pass(struct v3dv_device *device, .pDependencies = NULL, }; - return v3dv_CreateRenderPass(v3dv_device_to_handle(device), - &info, &device->vk.alloc, pass); + return v3dv_CreateRenderPass2(v3dv_device_to_handle(device), + &info, &device->vk.alloc, pass); } static inline uint64_t get_color_clear_pipeline_cache_key(uint32_t rt_idx, VkFormat format, - uint32_t samples, + VkSampleCountFlagBits samples, uint32_t components, bool is_layered) { @@ -764,7 +749,7 @@ get_color_clear_pipeline_cache_key(uint32_t rt_idx, uint32_t bit_offset = 0; key |= rt_idx; - bit_offset += 2; + bit_offset += 3; key |= ((uint64_t) format) << bit_offset; bit_offset += 32; @@ -819,7 +804,7 @@ get_color_clear_pipeline(struct v3dv_device *device, uint32_t rt_idx, uint32_t attachment_idx, VkFormat format, - uint32_t samples, + VkSampleCountFlagBits samples, uint32_t components, bool is_layered, struct v3dv_meta_color_clear_pipeline **pipeline) @@ -1012,7 +997,7 @@ emit_subpass_color_clear_rects(struct v3dv_cmd_buffer *cmd_buffer, assert(attachment_idx < cmd_buffer->state.pass->attachment_count); const VkFormat format = cmd_buffer->state.pass->attachments[attachment_idx].desc.format; - const VkFormat samples = + const VkSampleCountFlagBits samples = cmd_buffer->state.pass->attachments[attachment_idx].desc.samples; const uint32_t components = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | @@ -1049,8 +1034,6 @@ emit_subpass_color_clear_rects(struct v3dv_cmd_buffer 
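/* Editor's aside on get_color_clear_pipeline_cache_key() above: rt_idx now
 * needs 3 bits because newer V3D hardware supports more than 4 render
 * targets, so the 64-bit key packs roughly as follows (field offsets past
 * the format are assumed from context, not shown in the hunk):
 *
 *    key = rt_idx                  // bits 0..2
 *        | (uint64_t)format << 3   // bits 3..34
 *        | samples, components and is_layered at higher offsets
 */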
*cmd_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline->pipeline); - uint32_t dynamic_states = V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR; - for (uint32_t i = 0; i < rect_count; i++) { const VkViewport viewport = { .x = rects[i].rect.offset.x, @@ -1087,7 +1070,7 @@ emit_subpass_color_clear_rects(struct v3dv_cmd_buffer *cmd_buffer, cmd_buffer, (uintptr_t)pipeline, (v3dv_cmd_buffer_private_obj_destroy_cb) destroy_color_clear_pipeline); - v3dv_cmd_buffer_meta_state_pop(cmd_buffer, dynamic_states, false); + v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false); } /* Emits a scissored quad, clearing the depth aspect by writing to gl_FragDepth @@ -1139,18 +1122,14 @@ emit_subpass_ds_clear_rects(struct v3dv_cmd_buffer *cmd_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline->pipeline); - uint32_t dynamic_states = V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR; if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { - v3dv_CmdSetStencilReference(cmd_buffer_handle, - VK_STENCIL_FACE_FRONT_AND_BACK, - clear_ds->stencil); - v3dv_CmdSetStencilWriteMask(cmd_buffer_handle, - VK_STENCIL_FACE_FRONT_AND_BACK, 0xff); - v3dv_CmdSetStencilCompareMask(cmd_buffer_handle, - VK_STENCIL_FACE_FRONT_AND_BACK, 0xff); - dynamic_states |= VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK | - VK_DYNAMIC_STATE_STENCIL_WRITE_MASK | - VK_DYNAMIC_STATE_STENCIL_REFERENCE; + vk_common_CmdSetStencilReference(cmd_buffer_handle, + VK_STENCIL_FACE_FRONT_AND_BACK, + clear_ds->stencil); + vk_common_CmdSetStencilWriteMask(cmd_buffer_handle, + VK_STENCIL_FACE_FRONT_AND_BACK, 0xff); + vk_common_CmdSetStencilCompareMask(cmd_buffer_handle, + VK_STENCIL_FACE_FRONT_AND_BACK, 0xff); } for (uint32_t i = 0; i < rect_count; i++) { @@ -1179,7 +1158,7 @@ emit_subpass_ds_clear_rects(struct v3dv_cmd_buffer *cmd_buffer, } } - v3dv_cmd_buffer_meta_state_pop(cmd_buffer, dynamic_states, false); + v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false); } static void @@ -1212,9 +1191,11 @@ v3dv_CmdClearAttachments(VkCommandBuffer commandBuffer, { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - /* We can only clear attachments in the current subpass */ - assert(attachmentCount <= 5); /* 4 color + D/S */ + /* We can have at most max_color_RTs + 1 D/S attachments */ + assert(attachmentCount <= + V3D_MAX_RENDER_TARGETS(cmd_buffer->device->devinfo.ver) + 1); + /* We can only clear attachments in the current subpass */ struct v3dv_render_pass *pass = cmd_buffer->state.pass; assert(cmd_buffer->state.subpass_idx < pass->subpass_count); @@ -1225,6 +1206,9 @@ v3dv_CmdClearAttachments(VkCommandBuffer commandBuffer, * framebuffers, we use a geometry shader to redirect clears to the * appropriate layers. 
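 * (Editor's aside: the geometry shader in question is get_clear_rect_gs()
 * above; it passes the clear rect's triangle through unchanged and writes
 * gl_Layer from a push constant loaded at push_constant_layer_base, so a
 * single pipeline can direct clears at any layer of a layered
 * framebuffer.)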
*/ + + v3dv_cmd_buffer_pause_occlusion_query(cmd_buffer); + bool is_layered, all_rects_same_layers; gather_layering_info(rectCount, pRects, &is_layered, &all_rects_same_layers); for (uint32_t i = 0; i < attachmentCount; i++) { @@ -1242,4 +1226,6 @@ v3dv_CmdClearAttachments(VkCommandBuffer commandBuffer, rectCount, pRects); } } + + v3dv_cmd_buffer_resume_occlusion_query(cmd_buffer); } diff --git a/src/broadcom/vulkan/v3dv_meta_common.h b/src/broadcom/vulkan/v3dv_meta_common.h index 555b55f90b7..3be51b56a1f 100644 --- a/src/broadcom/vulkan/v3dv_meta_common.h +++ b/src/broadcom/vulkan/v3dv_meta_common.h @@ -1,5 +1,5 @@ /* - * Copyright © 2021 Raspberry Pi + * Copyright © 2021 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -23,30 +23,6 @@ #ifndef V3DV_META_COMMON_H #define V3DV_META_COMMON_H -/* Disable level 0 write, just write following mipmaps */ -#define V3D_TFU_IOA_DIMTW (1 << 0) -#define V3D_TFU_IOA_FORMAT_SHIFT 3 -#define V3D_TFU_IOA_FORMAT_LINEARTILE 3 -#define V3D_TFU_IOA_FORMAT_UBLINEAR_1_COLUMN 4 -#define V3D_TFU_IOA_FORMAT_UBLINEAR_2_COLUMN 5 -#define V3D_TFU_IOA_FORMAT_UIF_NO_XOR 6 -#define V3D_TFU_IOA_FORMAT_UIF_XOR 7 - -#define V3D_TFU_ICFG_NUMMM_SHIFT 5 -#define V3D_TFU_ICFG_TTYPE_SHIFT 9 - -#define V3D_TFU_ICFG_OPAD_SHIFT 22 - -#define V3D_TFU_ICFG_FORMAT_SHIFT 18 -#define V3D_TFU_ICFG_FORMAT_RASTER 0 -#define V3D_TFU_ICFG_FORMAT_SAND_128 1 -#define V3D_TFU_ICFG_FORMAT_SAND_256 2 -#define V3D_TFU_ICFG_FORMAT_LINEARTILE 11 -#define V3D_TFU_ICFG_FORMAT_UBLINEAR_1_COLUMN 12 -#define V3D_TFU_ICFG_FORMAT_UBLINEAR_2_COLUMN 13 -#define V3D_TFU_ICFG_FORMAT_UIF_NO_XOR 14 -#define V3D_TFU_ICFG_FORMAT_UIF_XOR 15 - /** * Copy/Clear operations implemented in v3dv_meta_*.c that use the TLB hardware * need to figure out TLB programming from the target image data instead of an diff --git a/src/broadcom/vulkan/v3dv_meta_copy.c b/src/broadcom/vulkan/v3dv_meta_copy.c index 85cd8e06638..0713b1b4084 100644 --- a/src/broadcom/vulkan/v3dv_meta_copy.c +++ b/src/broadcom/vulkan/v3dv_meta_copy.c @@ -1,5 +1,5 @@ /* - * Copyright © 2019 Raspberry Pi + * Copyright © 2019 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -25,9 +25,8 @@ #include "v3dv_meta_common.h" #include "compiler/nir/nir_builder.h" -#include "vk_format_info.h" #include "util/u_pack_color.h" -#include "vulkan/util/vk_common_entrypoints.h" +#include "vk_common_entrypoints.h" static uint32_t meta_blit_key_hash(const void *key) @@ -42,6 +41,19 @@ meta_blit_key_compare(const void *key1, const void *key2) } static bool +texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer, + VkImageAspectFlags aspect, + struct v3dv_image *image, + VkFormat dst_format, + VkFormat src_format, + struct v3dv_buffer *buffer, + uint32_t buffer_bpp, + VkColorComponentFlags cmask, + VkComponentMapping *cswizzle, + uint32_t region_count, + const VkBufferImageCopy2 *regions); + +static bool create_blit_pipeline_layout(struct v3dv_device *device, VkDescriptorSetLayout *descriptor_set_layout, VkPipelineLayout *pipeline_layout) @@ -338,18 +350,41 @@ get_compatible_tlb_format(VkFormat format) /** * Checks if we can implement an image copy or clear operation using the TLB * hardware. 
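 * (Editor's aside: the extent validation described below reduces to
 * requiring that a tile store covers the entire miplevel, i.e.
 *
 *    slice->width == extent->width && slice->height == extent->height
 *
 * since partial regions would need tile-aligned bounds, which is not
 * checked here.)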
+ * + * The extent and miplevel are only used to validate tile stores (to match the + * region to store against the miplevel dimensions to avoid cases where + * the region to store is not aligned to tile boundaries). If extent is + * NULL, no checks are done (which is fine if the image will only be used for a + * TLB load or when we know in advance that the store will be for the entire + * size of the image miplevel). + * + * For TLB copies we are doing a per-plane copy, so for multi-plane formats, + * the compatible format will be single-plane. */ bool v3dv_meta_can_use_tlb(struct v3dv_image *image, + uint8_t plane, + uint8_t miplevel, const VkOffset3D *offset, + const VkExtent3D *extent, VkFormat *compat_format) { if (offset->x != 0 || offset->y != 0) return false; - if (image->format->rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO) { + /* FIXME: this is suboptimal: what we really want to check is that the + * extent of the region to copy is the full slice or a multiple of the + * tile size. + */ + if (extent) { + struct v3d_resource_slice *slice = &image->planes[plane].slices[miplevel]; + if (slice->width != extent->width || slice->height != extent->height) + return false; + } + + if (image->format->planes[plane].rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO) { if (compat_format) - *compat_format = image->vk.format; + *compat_format = image->planes[plane].vk_format; return true; } @@ -357,9 +392,11 @@ v3dv_meta_can_use_tlb(struct v3dv_image *image, * a compatible format instead. */ if (compat_format) { - *compat_format = get_compatible_tlb_format(image->vk.format); - if (*compat_format != VK_FORMAT_UNDEFINED) + *compat_format = get_compatible_tlb_format(image->planes[plane].vk_format); + if (*compat_format != VK_FORMAT_UNDEFINED) { + assert(vk_format_get_plane_count(*compat_format) == 1); return true; + } } return false; @@ -379,11 +416,17 @@ static bool copy_image_to_buffer_tlb(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_buffer *buffer, struct v3dv_image *image, - const VkBufferImageCopy2KHR *region) + const VkBufferImageCopy2 *region) { VkFormat fb_format; - if (!v3dv_meta_can_use_tlb(image, &region->imageOffset, &fb_format)) + uint8_t plane = v3dv_plane_from_aspect(region->imageSubresource.aspectMask); + assert(plane < image->plane_count); + + if (!v3dv_meta_can_use_tlb(image, plane, region->imageSubresource.mipLevel, + &region->imageOffset, &region->imageExtent, + &fb_format)) { return false; + } uint32_t internal_type, internal_bpp; v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects) @@ -403,13 +446,16 @@ copy_image_to_buffer_tlb(struct v3dv_cmd_buffer *cmd_buffer, return true; /* Handle copy from compressed format using a compatible format */ - const uint32_t block_w = vk_format_get_blockwidth(image->vk.format); - const uint32_t block_h = vk_format_get_blockheight(image->vk.format); + const uint32_t block_w = + vk_format_get_blockwidth(image->planes[plane].vk_format); + const uint32_t block_h = + vk_format_get_blockheight(image->planes[plane].vk_format); const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w); const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h); - v3dv_job_start_frame(job, width, height, num_layers, false, - 1, internal_bpp, false); + v3dv_job_start_frame(job, width, height, num_layers, false, true, 1, + internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), + false); struct v3dv_meta_framebuffer framebuffer; v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format, @@ -432,29 +478,110 @@ blit_shader(struct 
v3dv_cmd_buffer *cmd_buffer, VkFormat src_format, VkColorComponentFlags cmask, VkComponentMapping *cswizzle, - const VkImageBlit2KHR *region, + const VkImageBlit2 *region, VkFilter filter, bool dst_is_padded_image); + /** - * Returns true if the implementation supports the requested operation (even if - * it failed to process it, for example, due to an out-of-memory error). + * A structure that contains all the information needed by the various + * helpers that implement image to buffer copies with blit paths. + */ +struct image_to_buffer_info { + /* Source image info */ + VkFormat src_format; + uint8_t plane; + VkColorComponentFlags cmask; + VkComponentMapping cswizzle; + VkImageAspectFlags src_copy_aspect; + uint32_t block_width; + uint32_t block_height; + + /* Destination buffer info */ + VkFormat dst_format; + uint32_t buf_width; + uint32_t buf_height; + uint32_t buf_bpp; + VkImageAspectFlags dst_copy_aspect; +}; + +static VkImageBlit2 +blit_region_for_image_to_buffer(const VkOffset3D *offset, + const VkExtent3D *extent, + uint32_t mip_level, + uint32_t base_layer, + uint32_t layer_offset, + struct image_to_buffer_info *info) +{ + VkImageBlit2 output = { + .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2, + .srcSubresource = { + .aspectMask = info->src_copy_aspect, + .mipLevel = mip_level, + .baseArrayLayer = base_layer + layer_offset, + .layerCount = 1, + }, + .srcOffsets = { + { + DIV_ROUND_UP(offset->x, info->block_width), + DIV_ROUND_UP(offset->y, info->block_height), + offset->z + layer_offset, + }, + { + DIV_ROUND_UP(offset->x + extent->width, info->block_width), + DIV_ROUND_UP(offset->y + extent->height, info->block_height), + offset->z + layer_offset + 1, + }, + }, + .dstSubresource = { + .aspectMask = info->dst_copy_aspect, + .mipLevel = 0, + .baseArrayLayer = 0, + .layerCount = 1, + }, + .dstOffsets = { + { 0, 0, 0 }, + { + DIV_ROUND_UP(extent->width, info->block_width), + DIV_ROUND_UP(extent->height, info->block_height), + 1 + }, + }, + }; + + return output; +} + +/** + * Produces an image_to_buffer_info struct from a VkBufferImageCopy2 that we can + * use to implement image to buffer copies with blit paths. + * + * Returns false if the copy operation can't be implemented with a blit. + */ static bool -copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_buffer *buffer, - struct v3dv_image *image, - const VkBufferImageCopy2KHR *region) +gather_image_to_buffer_info(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_image *image, + const VkBufferImageCopy2 *region, + struct image_to_buffer_info *out_info) { - bool handled = false; + bool supported = false; + + VkImageAspectFlags dst_copy_aspect = region->imageSubresource.aspectMask; + /* For multi-planar images we copy one plane at a time using an image alias + * with a color aspect for each plane. + */ + if (image->plane_count > 1) + dst_copy_aspect = VK_IMAGE_ASPECT_COLOR_BIT; + + VkImageAspectFlags src_copy_aspect = region->imageSubresource.aspectMask; + uint8_t plane = v3dv_plane_from_aspect(src_copy_aspect); + assert(plane < image->plane_count); /* Generally, the bpp of the data in the buffer matches that of the * source image. The exception is the case where we are copying * stencil (8bpp) to a combined d24s8 image (32bpp). 
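 * (Editor's aside: the switch that follows maps bpp to an uncompressed
 * UINT transfer format so raw data can be moved through a renderable
 * color format regardless of the real image format:
 *
 *    16 -> R32G32B32A32_UINT    8 -> R16G16B16A16_UINT
 *     4 -> R8G8B8A8_UINT        2 -> R16_UINT    1 -> R8_UINT
 * )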
*/ - uint32_t buffer_bpp = image->cpp; - - VkImageAspectFlags copy_aspect = region->imageSubresource.aspectMask; + uint32_t buffer_bpp = image->planes[plane].cpp; /* Because we are going to implement the copy as a blit, we need to create * a linear image from the destination buffer and we also want our blit @@ -477,22 +604,23 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, }; switch (buffer_bpp) { case 16: - assert(copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT); + assert(dst_copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT); dst_format = VK_FORMAT_R32G32B32A32_UINT; src_format = dst_format; break; case 8: - assert(copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT); + assert(dst_copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT); dst_format = VK_FORMAT_R16G16B16A16_UINT; src_format = dst_format; break; case 4: - switch (copy_aspect) { + switch (dst_copy_aspect) { case VK_IMAGE_ASPECT_COLOR_BIT: src_format = VK_FORMAT_R8G8B8A8_UINT; dst_format = VK_FORMAT_R8G8B8A8_UINT; break; case VK_IMAGE_ASPECT_DEPTH_BIT: + assert(image->plane_count == 1); assert(image->vk.format == VK_FORMAT_D32_SFLOAT || image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT || image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32); @@ -517,7 +645,8 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, } break; case VK_IMAGE_ASPECT_STENCIL_BIT: - assert(copy_aspect == VK_IMAGE_ASPECT_STENCIL_BIT); + assert(image->plane_count == 1); + assert(dst_copy_aspect == VK_IMAGE_ASPECT_STENCIL_BIT); assert(image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT); /* Copying from S8D24. We want to write 8-bit stencil values only, * so adjust the buffer bpp for that. Since the hardware stores stencil @@ -529,23 +658,23 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, break; default: unreachable("unsupported aspect"); - return handled; + return supported; }; break; case 2: - assert(copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT || - copy_aspect == VK_IMAGE_ASPECT_DEPTH_BIT); + assert(dst_copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT || + dst_copy_aspect == VK_IMAGE_ASPECT_DEPTH_BIT); dst_format = VK_FORMAT_R16_UINT; src_format = dst_format; break; case 1: - assert(copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT); + assert(dst_copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT); dst_format = VK_FORMAT_R8_UINT; src_format = dst_format; break; default: unreachable("unsupported bit-size"); - return handled; + return supported; }; /* The hardware doesn't support linear depth/stencil stores, so we @@ -554,10 +683,10 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, */ assert(vk_format_is_color(src_format)); assert(vk_format_is_color(dst_format)); - copy_aspect = VK_IMAGE_ASPECT_COLOR_BIT; + dst_copy_aspect = VK_IMAGE_ASPECT_COLOR_BIT; /* We should be able to handle the blit if we got this far */ - handled = true; + supported = true; /* Obtain the 2D buffer region spec */ uint32_t buf_width, buf_height; @@ -572,99 +701,250 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, buf_height = region->bufferImageHeight; /* If the image is compressed, the bpp refers to blocks, not pixels */ - uint32_t block_width = vk_format_get_blockwidth(image->vk.format); - uint32_t block_height = vk_format_get_blockheight(image->vk.format); - buf_width = buf_width / block_width; - buf_height = buf_height / block_height; + uint32_t block_width = + vk_format_get_blockwidth(image->planes[plane].vk_format); + uint32_t block_height = + vk_format_get_blockheight(image->planes[plane].vk_format); + buf_width = DIV_ROUND_UP(buf_width, block_width); + buf_height = 
DIV_ROUND_UP(buf_height, block_height); + + out_info->src_format = src_format; + out_info->dst_format = dst_format; + out_info->src_copy_aspect = src_copy_aspect; + out_info->dst_copy_aspect = dst_copy_aspect; + out_info->buf_width = buf_width; + out_info->buf_height = buf_height; + out_info->buf_bpp = buffer_bpp; + out_info->block_width = block_width; + out_info->block_height = block_height; + out_info->cmask = cmask; + out_info->cswizzle = cswizzle; + out_info->plane = plane; + + return supported; +} - /* Compute layers to copy */ - uint32_t num_layers; - if (image->vk.image_type != VK_IMAGE_TYPE_3D) - num_layers = region->imageSubresource.layerCount; - else - num_layers = region->imageExtent.depth; - assert(num_layers > 0); +/* Creates a linear image to alias buffer memory. It also includes that image + * as a private object in the cmd_buffer. + * + * This is used for cases where we want to implement an image to buffer copy, + * but we need to rely on a mechanism that uses an image as destination, like + * blitting. + */ +static VkResult +create_image_from_buffer(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_buffer *buffer, + const VkBufferImageCopy2 *region, + struct image_to_buffer_info *info, + uint32_t layer, + VkImage *out_image) +{ + VkImageCreateInfo image_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + .imageType = VK_IMAGE_TYPE_2D, + .format = info->dst_format, + .extent = { info->buf_width, info->buf_height, 1 }, + .mipLevels = 1, + .arrayLayers = 1, + .samples = VK_SAMPLE_COUNT_1_BIT, + .tiling = VK_IMAGE_TILING_LINEAR, + .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .initialLayout = VK_IMAGE_LAYOUT_GENERAL, + }; - /* Our blit interface can see the real format of the images to detect - * copies between compressed and uncompressed images and adapt the - * blit region accordingly. Here we are just doing a raw copy of - * compressed data, but we are passing an uncompressed view of the - * buffer for the blit destination image (since compressed formats are - * not renderable), so we also want to provide an uncompressed view of - * the source image. 
- */ VkResult result; struct v3dv_device *device = cmd_buffer->device; VkDevice _device = v3dv_device_to_handle(device); - if (vk_format_is_compressed(image->vk.format)) { - VkImage uiview; - VkImageCreateInfo uiview_info = { - .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, - .imageType = VK_IMAGE_TYPE_3D, - .format = dst_format, - .extent = { buf_width, buf_height, image->vk.extent.depth }, - .mipLevels = image->vk.mip_levels, - .arrayLayers = image->vk.array_layers, - .samples = image->vk.samples, - .tiling = image->vk.tiling, - .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT, - .sharingMode = VK_SHARING_MODE_EXCLUSIVE, - .queueFamilyIndexCount = 0, - .initialLayout = VK_IMAGE_LAYOUT_GENERAL, - }; - result = v3dv_CreateImage(_device, &uiview_info, &device->vk.alloc, &uiview); - if (result != VK_SUCCESS) - return handled; - v3dv_cmd_buffer_add_private_obj( - cmd_buffer, (uintptr_t)uiview, - (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage); + VkImage buffer_image; + result = + v3dv_CreateImage(_device, &image_info, &device->vk.alloc, &buffer_image); + if (result != VK_SUCCESS) + return result; - result = - vk_common_BindImageMemory(_device, uiview, - v3dv_device_memory_to_handle(image->mem), - image->mem_offset); - if (result != VK_SUCCESS) - return handled; + *out_image = buffer_image; + + v3dv_cmd_buffer_add_private_obj( + cmd_buffer, (uintptr_t)buffer_image, + (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage); + + /* Bind the buffer memory to the image + */ + VkDeviceSize buffer_offset = buffer->mem_offset + region->bufferOffset + + layer * info->buf_width * info->buf_height * info->buf_bpp; + + result = + vk_common_BindImageMemory(_device, buffer_image, + v3dv_device_memory_to_handle(buffer->mem), + buffer_offset); + return result; +} - image = v3dv_image_from_handle(uiview); +/** + * Creates an image with a single mip level that aliases the memory of a + * mip level in another image, re-interpreting the memory with an uncompressed + * format. The image is added to the command buffer as a private object for + * disposal. + */ +static bool +create_image_mip_level_alias(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_image *image, + VkFormat format, + uint32_t plane, + uint32_t mip_level, + uint32_t layer, + VkImage *alias) +{ + VkResult result; + assert(!vk_format_is_compressed(format)); + + struct v3dv_device *device = cmd_buffer->device; + VkDevice vk_device = v3dv_device_to_handle(device); + uint32_t mip_width = image->planes[plane].slices[mip_level].width; + uint32_t mip_height = image->planes[plane].slices[mip_level].height; + + uint32_t block_width = + vk_format_get_blockwidth(image->planes[plane].vk_format); + uint32_t block_height = + vk_format_get_blockheight(image->planes[plane].vk_format); + + VkImageCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + .imageType = image->vk.image_type, + .format = format, + .extent = { DIV_ROUND_UP(mip_width, block_width), + DIV_ROUND_UP(mip_height, block_height), + 1 }, + .mipLevels = 1, + .arrayLayers = 1, + .samples = image->vk.samples, + .tiling = image->tiled ? VK_IMAGE_TILING_OPTIMAL : VK_IMAGE_TILING_LINEAR, + .usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .initialLayout = VK_IMAGE_LAYOUT_GENERAL, + }; + result = v3dv_CreateImage(vk_device, &info, &device->vk.alloc, alias); + if (result != VK_SUCCESS) + return false; + + /* The alias we have just created has just one mip, but we may be aliasing + * any mip in the original image. 
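 * (Editor's aside: concretely, the rewrite described here keeps the source
 * mip's tiling and stride but re-bases and re-sizes the alias slice:
 *
 *    alias->slices[0]        = src->slices[mip];
 *    alias->slices[0].width  = block-aligned mip width;
 *    alias->slices[0].height = block-aligned mip height;
 *    alias->slices[0].offset = 0;   // memory bound at the mip's start
 *    alias->alignment        = 64;  // minimum Texture Base Address align
 * )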
Because the slice setup changes based on + * the mip (particularly, for mips >= 2 it uses power of 2 sizes internally) + * and this can influence the tiling layout selected for the slice, we want + * to make sure we copy the slice description from the actual mip level in + * the original image, and then rewrite any fields that we need for the + * alias. Particularly, we want to make the offset 0 because we are going to + * bind the underlying image memory exactly at the start of the selected mip. + * We also want to relax the image alignment requirements to the minimum + * (the one imposed by the Texture Base Address field) since we may not be + * aliasing a level 0 (for which we typically want a page alignment for + * optimal performance). + */ + V3DV_FROM_HANDLE(v3dv_image, v3dv_alias, *alias); + v3dv_alias->planes[plane].slices[0] = image->planes[plane].slices[mip_level]; + v3dv_alias->planes[plane].slices[0].width = info.extent.width; + v3dv_alias->planes[plane].slices[0].height = info.extent.height; + v3dv_alias->planes[plane].slices[0].offset = 0; + v3dv_alias->planes[plane].alignment = 64; + + v3dv_cmd_buffer_add_private_obj( + cmd_buffer, (uintptr_t)*alias, + (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage); + + result = + vk_common_BindImageMemory(vk_device, *alias, + v3dv_device_memory_to_handle(image->planes[plane].mem), + v3dv_layer_offset(image, mip_level, layer, plane)); + return result == VK_SUCCESS; +} + +/** + * Returns true if the implementation supports the requested operation (even if + * it failed to process it, for example, due to an out-of-memory error). + */ +static bool +copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_buffer *buffer, + struct v3dv_image *image, + const VkBufferImageCopy2 *region) +{ + bool handled = false; + struct image_to_buffer_info info; + + /* This path uses a shader blit which doesn't support linear images. Return + * early to avoid all the heavy lifting in preparation for the + * blit_shader() call that is bound to fail in that scenario. 
+ */ + if (!image->tiled && image->vk.image_type != VK_IMAGE_TYPE_1D) { + return handled; } + handled = gather_image_to_buffer_info(cmd_buffer, image, region, + &info); + + if (!handled) + return handled; + + /* We should be able to handle the blit if we got this far */ + handled = true; + + /* Compute layers to copy */ + uint32_t num_layers; + if (image->vk.image_type != VK_IMAGE_TYPE_3D) + num_layers = region->imageSubresource.layerCount; + else + num_layers = region->imageExtent.depth; + assert(num_layers > 0); + /* Copy requested layers */ + VkResult result; + VkImageBlit2 blit_region; + uint32_t mip_level = region->imageSubresource.mipLevel; + uint32_t base_layer = region->imageSubresource.baseArrayLayer; for (uint32_t i = 0; i < num_layers; i++) { - /* Create the destination blit image from the destination buffer */ - VkImageCreateInfo image_info = { - .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, - .imageType = VK_IMAGE_TYPE_2D, - .format = dst_format, - .extent = { buf_width, buf_height, 1 }, - .mipLevels = 1, - .arrayLayers = 1, - .samples = VK_SAMPLE_COUNT_1_BIT, - .tiling = VK_IMAGE_TILING_LINEAR, - .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT, - .sharingMode = VK_SHARING_MODE_EXCLUSIVE, - .queueFamilyIndexCount = 0, - .initialLayout = VK_IMAGE_LAYOUT_GENERAL, - }; - - VkImage buffer_image; - result = - v3dv_CreateImage(_device, &image_info, &device->vk.alloc, &buffer_image); - if (result != VK_SUCCESS) - return handled; + uint32_t layer_offset = i; + + if (vk_format_is_compressed(image->vk.format)) { + /* Our blit interface can see the real format of the images to detect + * copies between compressed and uncompressed images and adapt the + * blit region accordingly. Here we are just doing a raw copy of + * compressed data, but we are passing an uncompressed view of the + * buffer for the blit destination image (since compressed formats are + * not renderable), so we also want to provide an uncompressed view of + * the source image. + * + * It is important that we create the alias over the selected mip + * level (instead of aliasing the entire image) because an uncompressed + * view of the image won't have the same number of mip levels as the + * original image and the implicit mip size calculations the hw will + * do to sample from a non-zero mip level may not match exactly between + * compressed and uncompressed views. + */ + VkImage alias; + if (!create_image_mip_level_alias(cmd_buffer, image, info.dst_format, + info.plane, mip_level, + base_layer + layer_offset, + &alias)) { + return handled; + } - v3dv_cmd_buffer_add_private_obj( - cmd_buffer, (uintptr_t)buffer_image, - (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage); + /* We are aliasing the selected mip level and layer with a + * single-mip and single-layer image. 
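 * (Editor's aside: once the alias is in place, mip_level, base_layer and
 * layer_offset are all reset to 0 below, because the alias contains
 * exactly the one mip level and layer selected above.)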
+ */ + image = v3dv_image_from_handle(alias); + mip_level = 0; + base_layer = 0; + layer_offset = 0; + } - /* Bind the buffer memory to the image */ - VkDeviceSize buffer_offset = buffer->mem_offset + region->bufferOffset + - i * buf_width * buf_height * buffer_bpp; + /* Create the destination blit image from the destination buffer */ + VkImage buffer_image; result = - vk_common_BindImageMemory(_device, buffer_image, - v3dv_device_memory_to_handle(buffer->mem), - buffer_offset); + create_image_from_buffer(cmd_buffer, buffer, region, &info, + i, &buffer_image); if (result != VK_SUCCESS) return handled; @@ -676,48 +956,17 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, * image, but that we need to blit to a S8D24 destination (the only * stencil format we support). */ - const VkImageBlit2KHR blit_region = { - .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2_KHR, - .srcSubresource = { - .aspectMask = copy_aspect, - .mipLevel = region->imageSubresource.mipLevel, - .baseArrayLayer = region->imageSubresource.baseArrayLayer + i, - .layerCount = 1, - }, - .srcOffsets = { - { - DIV_ROUND_UP(region->imageOffset.x, block_width), - DIV_ROUND_UP(region->imageOffset.y, block_height), - region->imageOffset.z + i, - }, - { - DIV_ROUND_UP(region->imageOffset.x + region->imageExtent.width, - block_width), - DIV_ROUND_UP(region->imageOffset.y + region->imageExtent.height, - block_height), - region->imageOffset.z + i + 1, - }, - }, - .dstSubresource = { - .aspectMask = copy_aspect, - .mipLevel = 0, - .baseArrayLayer = 0, - .layerCount = 1, - }, - .dstOffsets = { - { 0, 0, 0 }, - { - DIV_ROUND_UP(region->imageExtent.width, block_width), - DIV_ROUND_UP(region->imageExtent.height, block_height), - 1 - }, - }, - }; + blit_region = + blit_region_for_image_to_buffer(®ion->imageOffset, + ®ion->imageExtent, + mip_level, base_layer, layer_offset, + &info); handled = blit_shader(cmd_buffer, - v3dv_image_from_handle(buffer_image), dst_format, - image, src_format, - cmask, &cswizzle, + v3dv_image_from_handle(buffer_image), + info.dst_format, + image, info.src_format, + info.cmask, &info.cswizzle, &blit_region, VK_FILTER_NEAREST, false); if (!handled) { /* This is unexpected, we should have a supported blit spec */ @@ -730,9 +979,110 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, return true; } +static bool +copy_image_linear_texel_buffer(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_image *dst, + struct v3dv_image *src, + const VkImageCopy2 *region); + +static VkImageCopy2 +image_copy_region_for_image_to_buffer(const VkBufferImageCopy2 *region, + struct image_to_buffer_info *info, + uint32_t layer) +{ + VkImageCopy2 output = { + .sType = VK_STRUCTURE_TYPE_IMAGE_COPY_2, + .srcSubresource = { + .aspectMask = info->src_copy_aspect, + .mipLevel = region->imageSubresource.mipLevel, + .baseArrayLayer = region->imageSubresource.baseArrayLayer + layer, + .layerCount = 1, + }, + .srcOffset = { + DIV_ROUND_UP(region->imageOffset.x, info->block_width), + DIV_ROUND_UP(region->imageOffset.y, info->block_height), + region->imageOffset.z, + }, + .dstSubresource = { + .aspectMask = info->dst_copy_aspect, + .mipLevel = 0, + .baseArrayLayer = 0, + .layerCount = 1, + }, + .dstOffset = { 0, 0, 0 }, + .extent = { + DIV_ROUND_UP(region->imageExtent.width, info->block_width), + DIV_ROUND_UP(region->imageExtent.height, info->block_height), + 1 + }, + }; + + return output; +} + +/** + * Returns true if the implementation supports the requested operation (even if + * it failed to process it, for example, due to an 
out-of-memory error). */ +static bool +copy_image_to_buffer_texel_buffer(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_buffer *dst_buffer, + struct v3dv_image *src_image, + const VkBufferImageCopy2 *region) +{ + bool handled = false; + VkImage dst_buffer_image; + struct image_to_buffer_info info; + + /* This is a requirement for copy_image_linear_texel_buffer below. We check + * it in advance in order to do an early return. + */ + if (src_image->tiled) + return false; + + handled = + gather_image_to_buffer_info(cmd_buffer, src_image, region, + &info); + if (!handled) + return handled; + + /* At this point the implementation should support the copy; any possible + * errors below are for other reasons, such as an out-of-memory error. + */ + handled = true; + + uint32_t num_layers; + if (src_image->vk.image_type != VK_IMAGE_TYPE_3D) + num_layers = region->imageSubresource.layerCount; + else + num_layers = region->imageExtent.depth; + assert(num_layers > 0); + + VkResult result; + VkImageCopy2 image_region; + for (uint32_t layer = 0; layer < num_layers; layer++) { + /* Create the destination image from the destination buffer */ + result = + create_image_from_buffer(cmd_buffer, dst_buffer, region, &info, + layer, &dst_buffer_image); + if (result != VK_SUCCESS) + return handled; + + image_region = + image_copy_region_for_image_to_buffer(region, &info, layer); + + handled = + copy_image_linear_texel_buffer(cmd_buffer, + v3dv_image_from_handle(dst_buffer_image), + src_image, &image_region); + } + + return handled; +} + VKAPI_ATTR void VKAPI_CALL -v3dv_CmdCopyImageToBuffer2KHR(VkCommandBuffer commandBuffer, - const VkCopyImageToBufferInfo2KHR *info) +v3dv_CmdCopyImageToBuffer2(VkCommandBuffer commandBuffer, + const VkCopyImageToBufferInfo2 *info) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); @@ -741,13 +1091,23 @@ v3dv_CmdCopyImageToBuffer2KHR(VkCommandBuffer commandBuffer, assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT); + cmd_buffer->state.is_transfer = true; + for (uint32_t i = 0; i < info->regionCount; i++) { - if (copy_image_to_buffer_tlb(cmd_buffer, buffer, image, &info->pRegions[i])) + const VkBufferImageCopy2 *region = &info->pRegions[i]; + + if (copy_image_to_buffer_tlb(cmd_buffer, buffer, image, region)) continue; - if (copy_image_to_buffer_blit(cmd_buffer, buffer, image, &info->pRegions[i])) + + if (copy_image_to_buffer_blit(cmd_buffer, buffer, image, region)) continue; + + if (copy_image_to_buffer_texel_buffer(cmd_buffer, buffer, image, region)) + continue; + unreachable("Unsupported image to buffer copy."); } + cmd_buffer->state.is_transfer = false; } /** @@ -758,10 +1118,15 @@ static bool copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *dst, struct v3dv_image *src, - const VkImageCopy2KHR *region) + const VkImageCopy2 *region) { + if (V3D_DBG(DISABLE_TFU)) { + perf_debug("Copy images: TFU disabled, fallbacks could be slower.\n"); + return false; + } + /* Destination can't be raster format */ - if (dst->vk.tiling == VK_IMAGE_TILING_LINEAR) + if (!dst->tiled) return false; /* We can only do full copies, so if the format is D24S8 both aspects need @@ -772,7 +1137,7 @@ copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, const VkImageAspectFlags ds_aspects = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; if (region->dstSubresource.aspectMask != ds_aspects) - return false; + return false; } /* Don't handle copies between uncompressed and compressed formats for now. 
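/* Editor's aside: taken together, the checks in copy_image_tfu() gate the
 * TFU path to full-miplevel copies into tiled destinations. A condensed
 * sketch of the gating (a summary of the checks above and below, not
 * additional driver logic):
 *
 *    if (V3D_DBG(DISABLE_TFU))               return false;  // debug opt-out
 *    if (!dst->tiled)                        return false;  // no raster dst
 *    if (dstOffset.x || dstOffset.y)         return false;  // full copy only
 *    if (extent != dst miplevel dimensions)  return false;
 */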
@@ -797,9 +1162,14 @@ copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, if (region->dstOffset.x != 0 || region->dstOffset.y != 0) return false; + uint8_t src_plane = + v3dv_plane_from_aspect(region->srcSubresource.aspectMask); + uint8_t dst_plane = + v3dv_plane_from_aspect(region->dstSubresource.aspectMask); + const uint32_t dst_mip_level = region->dstSubresource.mipLevel; - uint32_t dst_width = u_minify(dst->vk.extent.width, dst_mip_level); - uint32_t dst_height = u_minify(dst->vk.extent.height, dst_mip_level); + uint32_t dst_width = u_minify(dst->planes[dst_plane].width, dst_mip_level); + uint32_t dst_height = u_minify(dst->planes[dst_plane].height, dst_mip_level); if (region->extent.width != dst_width || region->extent.height != dst_height) return false; @@ -809,8 +1179,10 @@ copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, * members represent the texel dimensions of the source image and not * the destination." */ - const uint32_t block_w = vk_format_get_blockwidth(src->vk.format); - const uint32_t block_h = vk_format_get_blockheight(src->vk.format); + const uint32_t block_w = + vk_format_get_blockwidth(src->planes[src_plane].vk_format); + const uint32_t block_h = + vk_format_get_blockheight(src->planes[src_plane].vk_format); uint32_t width = DIV_ROUND_UP(region->extent.width, block_w); uint32_t height = DIV_ROUND_UP(region->extent.height, block_h); @@ -834,10 +1206,10 @@ copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, * the underlying pixel data according to its format, we can always choose * to use compatible formats that are supported with the TFU unit. */ - assert(dst->cpp == src->cpp); + assert(dst->planes[dst_plane].cpp == src->planes[src_plane].cpp); const struct v3dv_format *format = v3dv_get_compatible_tfu_format(cmd_buffer->device, - dst->cpp, NULL); + dst->planes[dst_plane].cpp, NULL); /* Emit a TFU job for each layer to blit */ const uint32_t layer_count = dst->vk.image_type != VK_IMAGE_TYPE_3D ? @@ -850,15 +1222,47 @@ copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, const uint32_t base_dst_layer = dst->vk.image_type != VK_IMAGE_TYPE_3D ? region->dstSubresource.baseArrayLayer : region->dstOffset.z; for (uint32_t i = 0; i < layer_count; i++) { - v3dv_X(cmd_buffer->device, meta_emit_tfu_job) - (cmd_buffer, dst, dst_mip_level, base_dst_layer + i, - src, src_mip_level, base_src_layer + i, - width, height, format); + const uint32_t dst_offset = + dst->planes[dst_plane].mem->bo->offset + + v3dv_layer_offset(dst, dst_mip_level, base_dst_layer + i, dst_plane); + const uint32_t src_offset = + src->planes[src_plane].mem->bo->offset + + v3dv_layer_offset(src, src_mip_level, base_src_layer + i, src_plane); + + const struct v3d_resource_slice *dst_slice = + &dst->planes[dst_plane].slices[dst_mip_level]; + const struct v3d_resource_slice *src_slice = + &src->planes[src_plane].slices[src_mip_level]; + + v3dv_X(cmd_buffer->device, meta_emit_tfu_job)( + cmd_buffer, + dst->planes[dst_plane].mem->bo->handle, + dst_offset, + dst_slice->tiling, + dst_slice->padded_height, + dst->planes[dst_plane].cpp, + src->planes[src_plane].mem->bo->handle, + src_offset, + src_slice->tiling, + src_slice->tiling == V3D_TILING_RASTER ? 
+ src_slice->stride : src_slice->padded_height, + src->planes[src_plane].cpp, + /* All compatible TFU formats are single-plane */ + width, height, &format->planes[0]); } return true; } +inline bool +v3dv_cmd_buffer_copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_image *dst, + struct v3dv_image *src, + const VkImageCopy2 *region) +{ + return copy_image_tfu(cmd_buffer, dst, src, region); +} + /** * Returns true if the implementation supports the requested operation (even if * it failed to process it, for example, due to an out-of-memory error). @@ -867,11 +1271,20 @@ static bool copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *dst, struct v3dv_image *src, - const VkImageCopy2KHR *region) + const VkImageCopy2 *region) { + uint8_t src_plane = + v3dv_plane_from_aspect(region->srcSubresource.aspectMask); + assert(src_plane < src->plane_count); + uint8_t dst_plane = + v3dv_plane_from_aspect(region->dstSubresource.aspectMask); + assert(dst_plane < dst->plane_count); + VkFormat fb_format; - if (!v3dv_meta_can_use_tlb(src, ®ion->srcOffset, &fb_format) || - !v3dv_meta_can_use_tlb(dst, ®ion->dstOffset, &fb_format)) { + if (!v3dv_meta_can_use_tlb(src, src_plane, region->srcSubresource.mipLevel, + ®ion->srcOffset, NULL, &fb_format) || + !v3dv_meta_can_use_tlb(dst, dst_plane, region->dstSubresource.mipLevel, + ®ion->dstOffset, ®ion->extent, &fb_format)) { return false; } @@ -881,7 +1294,8 @@ copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, * dstImage has a multi-planar image format then the aspectMask member * of srcSubresource and dstSubresource must match." */ - assert(region->dstSubresource.aspectMask == + assert(src->plane_count != 1 || dst->plane_count != 1 || + region->dstSubresource.aspectMask == region->srcSubresource.aspectMask); uint32_t internal_type, internal_bpp; v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects) @@ -911,12 +1325,15 @@ copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, return true; /* Handle copy to compressed image using compatible format */ - const uint32_t block_w = vk_format_get_blockwidth(dst->vk.format); - const uint32_t block_h = vk_format_get_blockheight(dst->vk.format); + const uint32_t block_w = + vk_format_get_blockwidth(dst->planes[dst_plane].vk_format); + const uint32_t block_h = + vk_format_get_blockheight(dst->planes[dst_plane].vk_format); const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w); const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h); - v3dv_job_start_frame(job, width, height, num_layers, false, 1, internal_bpp, + v3dv_job_start_frame(job, width, height, num_layers, false, true, 1, + internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), src->vk.samples > VK_SAMPLE_COUNT_1_BIT); struct v3dv_meta_framebuffer framebuffer; @@ -951,6 +1368,8 @@ create_image_alias(struct v3dv_cmd_buffer *cmd_buffer, VkFormat format) { assert(!vk_format_is_compressed(format)); + /* We don't support ycbcr compressed formats */ + assert(src->plane_count == 1); VkDevice _device = v3dv_device_to_handle(cmd_buffer->device); @@ -966,7 +1385,7 @@ create_image_alias(struct v3dv_cmd_buffer *cmd_buffer, .mipLevels = src->vk.mip_levels, .arrayLayers = src->vk.array_layers, .samples = src->vk.samples, - .tiling = src->vk.tiling, + .tiling = src->tiled ? 
VK_IMAGE_TILING_OPTIMAL : VK_IMAGE_TILING_LINEAR, .usage = src->vk.usage, }; @@ -979,8 +1398,8 @@ create_image_alias(struct v3dv_cmd_buffer *cmd_buffer, } struct v3dv_image *image = v3dv_image_from_handle(_image); - image->mem = src->mem; - image->mem_offset = src->mem_offset; + image->planes[0].mem = src->planes[0].mem; + image->planes[0].mem_offset = src->planes[0].mem_offset; return image; } @@ -992,12 +1411,26 @@ static bool copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *dst, struct v3dv_image *src, - const VkImageCopy2KHR *region) + const VkImageCopy2 *region) { - const uint32_t src_block_w = vk_format_get_blockwidth(src->vk.format); - const uint32_t src_block_h = vk_format_get_blockheight(src->vk.format); - const uint32_t dst_block_w = vk_format_get_blockwidth(dst->vk.format); - const uint32_t dst_block_h = vk_format_get_blockheight(dst->vk.format); + if (!src->tiled && src->vk.image_type != VK_IMAGE_TYPE_1D) + return false; + + uint8_t src_plane = + v3dv_plane_from_aspect(region->srcSubresource.aspectMask); + assert(src_plane < src->plane_count); + uint8_t dst_plane = + v3dv_plane_from_aspect(region->dstSubresource.aspectMask); + assert(dst_plane < dst->plane_count); + + const uint32_t src_block_w = + vk_format_get_blockwidth(src->planes[src_plane].vk_format); + const uint32_t src_block_h = + vk_format_get_blockheight(src->planes[src_plane].vk_format); + const uint32_t dst_block_w = + vk_format_get_blockwidth(dst->planes[dst_plane].vk_format); + const uint32_t dst_block_h = + vk_format_get_blockheight(dst->planes[dst_plane].vk_format); const float block_scale_w = (float)src_block_w / (float)dst_block_w; const float block_scale_h = (float)src_block_h / (float)dst_block_h; @@ -1033,10 +1466,10 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer, * divisors for the width and height depending on the source image's * bpp. */ - assert(src->cpp == dst->cpp); + assert(src->planes[src_plane].cpp == dst->planes[dst_plane].cpp); format = VK_FORMAT_R32G32_UINT; - switch (src->cpp) { + switch (src->planes[src_plane].cpp) { case 16: format = VK_FORMAT_R32G32B32A32_UINT; break; @@ -1061,13 +1494,15 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer, dst = create_image_alias(cmd_buffer, dst, dst_scale_w, dst_scale_h, format); } else { - format = src->format->rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO ? - src->vk.format : get_compatible_tlb_format(src->vk.format); + format = src->format->planes[src_plane].rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO ? + src->planes[src_plane].vk_format : + get_compatible_tlb_format(src->planes[src_plane].vk_format); if (format == VK_FORMAT_UNDEFINED) return false; const struct v3dv_format *f = v3dv_X(cmd_buffer->device, get_format)(format); - if (!f->supported || f->tex_type == TEXTURE_DATA_FORMAT_NO) + assert(f->plane_count < 2); + if (!f->plane_count || f->planes[0].tex_type == TEXTURE_DATA_FORMAT_NO) return false; } @@ -1090,14 +1525,21 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer, * (since the region dimensions are already specified in terms of the source * image). 
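 * (Editor's aside: the power-of-two rounding introduced below appears to
 * account for the smallest mips of compressed images, e.g. a 2x2 or 1x1
 * mip still occupies a full 4x4 block, where scaling the region by the
 * block size would otherwise undershoot the aliased uncompressed extent.
 * This rationale is inferred, not stated by the patch.)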
*/ + uint32_t region_width = region->extent.width * src_scale_w; + uint32_t region_height = region->extent.height * src_scale_h; + if (src_block_w > 1) + region_width = util_next_power_of_two(region_width); + if (src_block_h > 1) + region_height = util_next_power_of_two(region_height); + const VkOffset3D src_start = { region->srcOffset.x * src_scale_w, region->srcOffset.y * src_scale_h, region->srcOffset.z, }; const VkOffset3D src_end = { - src_start.x + region->extent.width * src_scale_w, - src_start.y + region->extent.height * src_scale_h, + src_start.x + region_width, + src_start.y + region_height, src_start.z + region->extent.depth, }; @@ -1107,13 +1549,13 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer, region->dstOffset.z, }; const VkOffset3D dst_end = { - dst_start.x + region->extent.width * src_scale_w, - dst_start.y + region->extent.height * src_scale_h, + dst_start.x + region_width, + dst_start.y + region_height, dst_start.z + region->extent.depth, }; - const VkImageBlit2KHR blit_region = { - .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2_KHR, + const VkImageBlit2 blit_region = { + .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2, .srcSubresource = region->srcSubresource, .srcOffsets = { src_start, src_end }, .dstSubresource = region->dstSubresource, @@ -1130,9 +1572,113 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer, return handled; } +static bool +copy_image_linear_texel_buffer(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_image *dst, + struct v3dv_image *src, + const VkImageCopy2 *region) +{ + if (src->tiled) + return false; + + /* Implementations are allowed to restrict linear images like this */ + assert(region->srcOffset.z == 0); + assert(region->dstOffset.z == 0); + assert(region->srcSubresource.mipLevel == 0); + assert(region->srcSubresource.baseArrayLayer == 0); + assert(region->srcSubresource.layerCount == 1); + assert(region->dstSubresource.mipLevel == 0); + assert(region->dstSubresource.baseArrayLayer == 0); + assert(region->dstSubresource.layerCount == 1); + + uint8_t src_plane = + v3dv_plane_from_aspect(region->srcSubresource.aspectMask); + uint8_t dst_plane = + v3dv_plane_from_aspect(region->dstSubresource.aspectMask); + + assert(src->planes[src_plane].cpp == dst->planes[dst_plane].cpp); + const uint32_t bpp = src->planes[src_plane].cpp; + + VkFormat format; + switch (bpp) { + case 16: + format = VK_FORMAT_R32G32B32A32_UINT; + break; + case 8: + format = VK_FORMAT_R16G16B16A16_UINT; + break; + case 4: + format = VK_FORMAT_R8G8B8A8_UINT; + break; + case 2: + format = VK_FORMAT_R16_UINT; + break; + case 1: + format = VK_FORMAT_R8_UINT; + break; + default: + unreachable("unsupported bit-size"); + return false; + } + + VkComponentMapping ident_swizzle = { + .r = VK_COMPONENT_SWIZZLE_IDENTITY, + .g = VK_COMPONENT_SWIZZLE_IDENTITY, + .b = VK_COMPONENT_SWIZZLE_IDENTITY, + .a = VK_COMPONENT_SWIZZLE_IDENTITY, + }; + + const uint32_t buf_stride = src->planes[src_plane].slices[0].stride; + const VkDeviceSize buf_offset = + region->srcOffset.y * buf_stride + region->srcOffset.x * bpp; + + struct v3dv_buffer src_buffer; + vk_object_base_init(&cmd_buffer->device->vk, &src_buffer.base, + VK_OBJECT_TYPE_BUFFER); + + const struct VkBufferCreateInfo buf_create_info = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .size = src->planes[src_plane].size, + .usage = VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + }; + v3dv_buffer_init(cmd_buffer->device, &buf_create_info, &src_buffer, + src->planes[src_plane].alignment); + + const 
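The power-of-two rounding added above applies only when the source is block-compressed (src_block_w/h > 1), plausibly so the aliased blit covers whole blocks even for NPOT mip extents; that rationale is inferred, not stated in the hunk. Standalone, with parameter types assumed and util_next_power_of_two() spelled out:

#include <assert.h>
#include <stdint.h>

static uint32_t
next_power_of_two(uint32_t v) /* stand-in for util_next_power_of_two() */
{
   assert(v > 0);
   v--;
   v |= v >> 1;
   v |= v >> 2;
   v |= v >> 4;
   v |= v >> 8;
   v |= v >> 16;
   return v + 1;
}

static void
scaled_blit_extent(uint32_t w, uint32_t h,
                   float src_scale_w, float src_scale_h,
                   uint32_t src_block_w, uint32_t src_block_h,
                   uint32_t *region_w, uint32_t *region_h)
{
   *region_w = (uint32_t)(w * src_scale_w);
   *region_h = (uint32_t)(h * src_scale_h);
   if (src_block_w > 1)
      *region_w = next_power_of_two(*region_w);
   if (src_block_h > 1)
      *region_h = next_power_of_two(*region_h);
}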
VkBindBufferMemoryInfo buf_bind_info = { + .sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO, + .buffer = v3dv_buffer_to_handle(&src_buffer), + .memory = v3dv_device_memory_to_handle(src->planes[src_plane].mem), + .memoryOffset = src->planes[src_plane].mem_offset + + v3dv_layer_offset(src, 0, 0, src_plane), + }; + v3dv_buffer_bind_memory(&buf_bind_info); + + const VkBufferImageCopy2 copy_region = { + .sType = VK_STRUCTURE_TYPE_BUFFER_IMAGE_COPY_2, + .pNext = NULL, + .bufferOffset = buf_offset, + .bufferRowLength = buf_stride / bpp, + .bufferImageHeight = src->vk.extent.height, + .imageSubresource = region->dstSubresource, + .imageOffset = region->dstOffset, + .imageExtent = region->extent, + }; + + return texel_buffer_shader_copy(cmd_buffer, + region->dstSubresource.aspectMask, + dst, + format, + format, + &src_buffer, + src->planes[src_plane].cpp, + 0 /* color mask: full */, &ident_swizzle, + 1, ©_region); +} + VKAPI_ATTR void VKAPI_CALL -v3dv_CmdCopyImage2KHR(VkCommandBuffer commandBuffer, - const VkCopyImageInfo2KHR *info) +v3dv_CmdCopyImage2(VkCommandBuffer commandBuffer, + const VkCopyImageInfo2 *info) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); @@ -1141,25 +1687,34 @@ v3dv_CmdCopyImage2KHR(VkCommandBuffer commandBuffer, assert(src->vk.samples == dst->vk.samples); + cmd_buffer->state.is_transfer = true; + for (uint32_t i = 0; i < info->regionCount; i++) { - if (copy_image_tfu(cmd_buffer, dst, src, &info->pRegions[i])) + const VkImageCopy2 *region = &info->pRegions[i]; + if (copy_image_tfu(cmd_buffer, dst, src, region)) + continue; + if (copy_image_tlb(cmd_buffer, dst, src, region)) continue; - if (copy_image_tlb(cmd_buffer, dst, src, &info->pRegions[i])) + if (copy_image_blit(cmd_buffer, dst, src, region)) continue; - if (copy_image_blit(cmd_buffer, dst, src, &info->pRegions[i])) + if (copy_image_linear_texel_buffer(cmd_buffer, dst, src, region)) continue; unreachable("Image copy not supported"); } + + cmd_buffer->state.is_transfer = false; } VKAPI_ATTR void VKAPI_CALL -v3dv_CmdCopyBuffer2KHR(VkCommandBuffer commandBuffer, - const VkCopyBufferInfo2KHR *pCopyBufferInfo) +v3dv_CmdCopyBuffer2(VkCommandBuffer commandBuffer, + const VkCopyBufferInfo2 *pCopyBufferInfo) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); V3DV_FROM_HANDLE(v3dv_buffer, src_buffer, pCopyBufferInfo->srcBuffer); V3DV_FROM_HANDLE(v3dv_buffer, dst_buffer, pCopyBufferInfo->dstBuffer); + cmd_buffer->state.is_transfer = true; + for (uint32_t i = 0; i < pCopyBufferInfo->regionCount; i++) { v3dv_X(cmd_buffer->device, meta_copy_buffer) (cmd_buffer, @@ -1167,6 +1722,8 @@ v3dv_CmdCopyBuffer2KHR(VkCommandBuffer commandBuffer, src_buffer->mem->bo, src_buffer->mem_offset, &pCopyBufferInfo->pRegions[i]); } + + cmd_buffer->state.is_transfer = false; } static void @@ -1202,12 +1759,14 @@ v3dv_CmdUpdateBuffer(VkCommandBuffer commandBuffer, return; } + cmd_buffer->state.is_transfer = true; + memcpy(src_bo->map, pData, dataSize); v3dv_bo_unmap(cmd_buffer->device, src_bo); - VkBufferCopy2KHR region = { - .sType = VK_STRUCTURE_TYPE_BUFFER_COPY_2_KHR, + VkBufferCopy2 region = { + .sType = VK_STRUCTURE_TYPE_BUFFER_COPY_2, .srcOffset = 0, .dstOffset = dstOffset, .size = dataSize, @@ -1217,11 +1776,12 @@ v3dv_CmdUpdateBuffer(VkCommandBuffer commandBuffer, (cmd_buffer, dst_buffer->mem->bo, dst_buffer->mem_offset, src_bo, 0, ®ion); - if (!copy_job) - return; + if (copy_job) { + v3dv_cmd_buffer_add_private_obj( + cmd_buffer, (uint64_t)(uintptr_t)src_bo, destroy_update_buffer_cb); + } - 
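The addressing behind copy_image_linear_texel_buffer() is worth spelling out: the linear source plane is rebound as a buffer, the copy origin becomes a plain byte offset, and the row pitch is handed to VkBufferImageCopy2 in texels. In isolation:

#include <stdint.h>

static uint64_t
linear_copy_origin_bytes(uint32_t stride_bytes, uint32_t cpp,
                         int32_t x, int32_t y)
{
   /* Matches buf_offset = y * stride + x * bpp above. */
   return (uint64_t)y * stride_bytes + (uint64_t)x * cpp;
}

static uint32_t
linear_row_length_texels(uint32_t stride_bytes, uint32_t cpp)
{
   /* Becomes VkBufferImageCopy2::bufferRowLength. */
   return stride_bytes / cpp;
}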
v3dv_cmd_buffer_add_private_obj( - cmd_buffer, (uint64_t)(uintptr_t)src_bo, destroy_update_buffer_cb); + cmd_buffer->state.is_transfer = false; } VKAPI_ATTR void VKAPI_CALL @@ -1234,6 +1794,8 @@ v3dv_CmdFillBuffer(VkCommandBuffer commandBuffer, V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); V3DV_FROM_HANDLE(v3dv_buffer, dst_buffer, dstBuffer); + cmd_buffer->state.is_transfer = true; + struct v3dv_bo *bo = dst_buffer->mem->bo; /* From the Vulkan spec: @@ -1248,6 +1810,8 @@ v3dv_CmdFillBuffer(VkCommandBuffer commandBuffer, v3dv_X(cmd_buffer->device, meta_fill_buffer) (cmd_buffer, bo, dstOffset, size, data); + + cmd_buffer->state.is_transfer = false; } /** @@ -1258,19 +1822,24 @@ static bool copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *image, struct v3dv_buffer *buffer, - const VkBufferImageCopy2KHR *region) + const VkBufferImageCopy2 *region) { + if (V3D_DBG(DISABLE_TFU)) { + perf_debug("Copy buffer to image: TFU disabled, fallbacks could be slower.\n"); + return false; + } + assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT); /* Destination can't be raster format */ - if (image->vk.tiling == VK_IMAGE_TILING_LINEAR) + if (!image->tiled) return false; /* We can't copy D24S8 because buffer to image copies only copy one aspect * at a time, and the TFU copies full images. Also, V3D depth bits for * both D24S8 and D24X8 stored in the 24-bit MSB of each 32-bit word, but * the Vulkan spec has the buffer data specified the other way around, so it - * is not a straight copy, we would havew to swizzle the channels, which the + * is not a straight copy, we would have to swizzle the channels, which the * TFU can't do. */ if (image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT || @@ -1295,12 +1864,20 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, else height = region->bufferImageHeight; - if (width != image->vk.extent.width || height != image->vk.extent.height) + const uint8_t plane = + v3dv_plane_from_aspect(region->imageSubresource.aspectMask); + + const uint32_t mip_level = region->imageSubresource.mipLevel; + const struct v3d_resource_slice *slice = &image->planes[plane].slices[mip_level]; + + if (width != slice->width || height != slice->height) return false; /* Handle region semantics for compressed images */ - const uint32_t block_w = vk_format_get_blockwidth(image->vk.format); - const uint32_t block_h = vk_format_get_blockheight(image->vk.format); + const uint32_t block_w = + vk_format_get_blockwidth(image->planes[plane].vk_format); + const uint32_t block_h = + vk_format_get_blockheight(image->planes[plane].vk_format); width = DIV_ROUND_UP(width, block_w); height = DIV_ROUND_UP(height, block_h); @@ -1311,10 +1888,10 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, */ const struct v3dv_format *format = v3dv_get_compatible_tfu_format(cmd_buffer->device, - image->cpp, NULL); - - const uint32_t mip_level = region->imageSubresource.mipLevel; - const struct v3d_resource_slice *slice = &image->slices[mip_level]; + image->planes[plane].cpp, NULL); + /* We only use single-plane formats with the TFU */ + assert(format->plane_count == 1); + const struct v3dv_format_plane *format_plane = &format->planes[0]; uint32_t num_layers; if (image->vk.image_type != VK_IMAGE_TYPE_3D) @@ -1323,14 +1900,14 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, num_layers = region->imageExtent.depth; assert(num_layers > 0); - assert(image->mem && image->mem->bo); - const struct v3dv_bo *dst_bo = image->mem->bo; + 
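To make the D24 limitation above concrete: for the depth aspect of D24_UNORM_S8_UINT / X8_D24_UNORM_PACK32, Vulkan buffer words carry depth in the 24 LSBs while V3D stores it in the 24 MSBs, so a raster TFU copy would land the bits in the wrong place. The per-word fix-up that the shader paths express as a swizzle is, byte-wise (a sketch; the top source byte is unused per the buffer copy layout):

#include <stdint.h>

static uint32_t
d24_buffer_word_to_v3d(uint32_t w)
{
   return (w & 0x00ffffffu) << 8;
}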
assert(image->planes[plane].mem && image->planes[plane].mem->bo); + const struct v3dv_bo *dst_bo = image->planes[plane].mem->bo; assert(buffer->mem && buffer->mem->bo); const struct v3dv_bo *src_bo = buffer->mem->bo; /* Emit a TFU job per layer to copy */ - const uint32_t buffer_stride = width * image->cpp; + const uint32_t buffer_stride = width * image->planes[plane].cpp; for (int i = 0; i < num_layers; i++) { uint32_t layer; if (image->vk.image_type != VK_IMAGE_TYPE_3D) @@ -1338,46 +1915,27 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, else layer = region->imageOffset.z + i; - struct drm_v3d_submit_tfu tfu = { - .ios = (height << 16) | width, - .bo_handles = { - dst_bo->handle, - src_bo->handle != dst_bo->handle ? src_bo->handle : 0 - }, - }; - const uint32_t buffer_offset = buffer->mem_offset + region->bufferOffset + height * buffer_stride * i; - const uint32_t src_offset = src_bo->offset + buffer_offset; - tfu.iia |= src_offset; - tfu.icfg |= V3D_TFU_ICFG_FORMAT_RASTER << V3D_TFU_ICFG_FORMAT_SHIFT; - tfu.iis |= width; const uint32_t dst_offset = - dst_bo->offset + v3dv_layer_offset(image, mip_level, layer); - tfu.ioa |= dst_offset; - - tfu.ioa |= (V3D_TFU_IOA_FORMAT_LINEARTILE + - (slice->tiling - V3D_TILING_LINEARTILE)) << - V3D_TFU_IOA_FORMAT_SHIFT; - tfu.icfg |= format->tex_type << V3D_TFU_ICFG_TTYPE_SHIFT; - - /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the - * OPAD field for the destination (how many extra UIF blocks beyond - * those necessary to cover the height). - */ - if (slice->tiling == V3D_TILING_UIF_NO_XOR || - slice->tiling == V3D_TILING_UIF_XOR) { - uint32_t uif_block_h = 2 * v3d_utile_height(image->cpp); - uint32_t implicit_padded_height = align(height, uif_block_h); - uint32_t icfg = - (slice->padded_height - implicit_padded_height) / uif_block_h; - tfu.icfg |= icfg << V3D_TFU_ICFG_OPAD_SHIFT; - } - - v3dv_cmd_buffer_add_tfu_job(cmd_buffer, &tfu); + dst_bo->offset + v3dv_layer_offset(image, mip_level, layer, plane); + + v3dv_X(cmd_buffer->device, meta_emit_tfu_job)( + cmd_buffer, + dst_bo->handle, + dst_offset, + slice->tiling, + slice->padded_height, + image->planes[plane].cpp, + src_bo->handle, + src_offset, + V3D_TILING_RASTER, + width, + 1, + width, height, format_plane); } return true; @@ -1391,11 +1949,17 @@ static bool copy_buffer_to_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *image, struct v3dv_buffer *buffer, - const VkBufferImageCopy2KHR *region) + const VkBufferImageCopy2 *region) { VkFormat fb_format; - if (!v3dv_meta_can_use_tlb(image, ®ion->imageOffset, &fb_format)) + uint8_t plane = v3dv_plane_from_aspect(region->imageSubresource.aspectMask); + assert(plane < image->plane_count); + + if (!v3dv_meta_can_use_tlb(image, plane, region->imageSubresource.mipLevel, + ®ion->imageOffset, ®ion->imageExtent, + &fb_format)) { return false; + } uint32_t internal_type, internal_bpp; v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects) @@ -1415,13 +1979,16 @@ copy_buffer_to_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, return true; /* Handle copy to compressed format using a compatible format */ - const uint32_t block_w = vk_format_get_blockwidth(image->vk.format); - const uint32_t block_h = vk_format_get_blockheight(image->vk.format); + const uint32_t block_w = + vk_format_get_blockwidth(image->planes[plane].vk_format); + const uint32_t block_h = + vk_format_get_blockheight(image->planes[plane].vk_format); const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w); const uint32_t 
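The open-coded TFU packing removed above now lives in meta_emit_tfu_job, but the OPAD computation it performed remains a useful reference: for UIF destinations, OPAD is the number of extra UIF blocks per column beyond what the image height implies. As a standalone sketch with align() spelled out and the utile height taken as a parameter (the driver derives it from cpp):

#include <stdint.h>

static uint32_t
align_u32(uint32_t v, uint32_t a)
{
   return (v + a - 1) / a * a;
}

static uint32_t
uif_opad(uint32_t height, uint32_t padded_height, uint32_t utile_height)
{
   const uint32_t uif_block_h = 2 * utile_height;
   const uint32_t implicit_padded_height = align_u32(height, uif_block_h);
   return (padded_height - implicit_padded_height) / uif_block_h;
}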
height = DIV_ROUND_UP(region->imageExtent.height, block_h); - v3dv_job_start_frame(job, width, height, num_layers, false, - 1, internal_bpp, false); + v3dv_job_start_frame(job, width, height, num_layers, false, true, 1, + internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), + false); struct v3dv_meta_framebuffer framebuffer; v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format, @@ -1440,7 +2007,7 @@ static bool create_tiled_image_from_buffer(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *image, struct v3dv_buffer *buffer, - const VkBufferImageCopy2KHR *region) + const VkBufferImageCopy2 *region) { if (copy_buffer_to_image_tfu(cmd_buffer, image, buffer, region)) return true; @@ -1569,8 +2136,6 @@ create_blit_render_pass(struct v3dv_device *device, VkRenderPass *pass_load, VkRenderPass *pass_no_load); -static nir_ssa_def *gen_rect_vertices(nir_builder *b); - static bool create_pipeline(struct v3dv_device *device, struct v3dv_render_pass *pass, @@ -1595,7 +2160,7 @@ get_texel_buffer_copy_vs() glsl_vec4_type(), "gl_Position"); vs_out_pos->data.location = VARYING_SLOT_POS; - nir_ssa_def *pos = gen_rect_vertices(&b); + nir_def *pos = nir_gen_rect_vertices(&b, NULL, NULL); nir_store_var(&b, vs_out_pos, pos, 0xf); return b.shader; @@ -1618,8 +2183,8 @@ get_texel_buffer_copy_gs() nir->info.inputs_read = 1ull << VARYING_SLOT_POS; nir->info.outputs_written = (1ull << VARYING_SLOT_POS) | (1ull << VARYING_SLOT_LAYER); - nir->info.gs.input_primitive = GL_TRIANGLES; - nir->info.gs.output_primitive = GL_TRIANGLE_STRIP; + nir->info.gs.input_primitive = MESA_PRIM_TRIANGLES; + nir->info.gs.output_primitive = MESA_PRIM_TRIANGLE_STRIP; nir->info.gs.vertices_in = 3; nir->info.gs.vertices_out = 3; nir->info.gs.invocations = 1; @@ -1652,7 +2217,7 @@ get_texel_buffer_copy_gs() nir_copy_deref(&b, nir_build_deref_var(&b, gs_out_pos), in_pos_i); /* gl_Layer from push constants */ - nir_ssa_def *layer = + nir_def *layer = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = TEXEL_BUFFER_COPY_GS_LAYER_PC_OFFSET, .range = 4); @@ -1666,7 +2231,7 @@ get_texel_buffer_copy_gs() return nir; } -static nir_ssa_def * +static nir_def * load_frag_coord(nir_builder *b) { nir_foreach_shader_in_variable(var, b->shader) { @@ -1730,24 +2295,24 @@ get_texel_buffer_copy_fs(struct v3dv_device *device, VkFormat format, /* Load the box describing the pixel region we want to copy from the * texel buffer. */ - nir_ssa_def *box = + nir_def *box = nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = TEXEL_BUFFER_COPY_FS_BOX_PC_OFFSET, .range = 16); /* Load the buffer stride (this comes in texel units) */ - nir_ssa_def *stride = + nir_def *stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = TEXEL_BUFFER_COPY_FS_STRIDE_PC_OFFSET, .range = 4); /* Load the buffer offset (this comes in texel units) */ - nir_ssa_def *offset = + nir_def *offset = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = TEXEL_BUFFER_COPY_FS_OFFSET_PC_OFFSET, .range = 4); - nir_ssa_def *coord = nir_f2i32(&b, load_frag_coord(&b)); + nir_def *coord = nir_f2i32(&b, load_frag_coord(&b)); /* Load pixel data from texel buffer based on the x,y offset of the pixel * within the box. Texel buffers are 1D arrays of texels. @@ -1757,28 +2322,26 @@ get_texel_buffer_copy_fs(struct v3dv_device *device, VkFormat format, * texel buffer should always be within its bounds and we we don't need * to add a check for that here. 
*/ - nir_ssa_def *x_offset = + nir_def *x_offset = nir_isub(&b, nir_channel(&b, coord, 0), nir_channel(&b, box, 0)); - nir_ssa_def *y_offset = + nir_def *y_offset = nir_isub(&b, nir_channel(&b, coord, 1), nir_channel(&b, box, 1)); - nir_ssa_def *texel_offset = + nir_def *texel_offset = nir_iadd(&b, nir_iadd(&b, offset, x_offset), nir_imul(&b, y_offset, stride)); - nir_ssa_def *tex_deref = &nir_build_deref_var(&b, sampler)->dest.ssa; + nir_def *tex_deref = &nir_build_deref_var(&b, sampler)->def; nir_tex_instr *tex = nir_tex_instr_create(b.shader, 2); tex->sampler_dim = GLSL_SAMPLER_DIM_BUF; tex->op = nir_texop_txf; - tex->src[0].src_type = nir_tex_src_coord; - tex->src[0].src = nir_src_for_ssa(texel_offset); - tex->src[1].src_type = nir_tex_src_texture_deref; - tex->src[1].src = nir_src_for_ssa(tex_deref); + tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, texel_offset); + tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_texture_deref, tex_deref); tex->dest_type = nir_type_uint32; tex->is_array = false; tex->coord_components = 1; - nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "texel buffer result"); + nir_def_init(&tex->instr, &tex->def, 4, 32); nir_builder_instr_insert(&b, &tex->instr); uint32_t swiz[4]; @@ -1790,7 +2353,7 @@ get_texel_buffer_copy_fs(struct v3dv_device *device, VkFormat format, component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_B, cswizzle->b); swiz[3] = component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_A, cswizzle->a); - nir_ssa_def *s = nir_swizzle(&b, &tex->dest.ssa, swiz, 4); + nir_def *s = nir_swizzle(&b, &tex->def, swiz, 4); nir_store_var(&b, fs_out_color, s, 0xf); return b.shader; @@ -1876,7 +2439,7 @@ get_copy_texel_buffer_pipeline( mtx_lock(&device->meta.mtx); struct hash_entry *entry = _mesa_hash_table_search(device->meta.texel_buffer_copy.cache[image_type], - &key); + key); if (entry) { mtx_unlock(&device->meta.mtx); *pipeline = entry->data; @@ -1905,8 +2468,10 @@ get_copy_texel_buffer_pipeline( if (!ok) goto fail; + uint8_t *dupkey = malloc(V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE); + memcpy(dupkey, key, V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE); _mesa_hash_table_insert(device->meta.texel_buffer_copy.cache[image_type], - &key, *pipeline); + dupkey, *pipeline); mtx_unlock(&device->meta.mtx); return true; @@ -1938,7 +2503,7 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer, VkColorComponentFlags cmask, VkComponentMapping *cswizzle, uint32_t region_count, - const VkBufferImageCopy2KHR *regions) + const VkBufferImageCopy2 *regions) { VkResult result; bool handled = false; @@ -1957,7 +2522,7 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer, /* We only handle color copies. Callers can copy D/S aspects by using * a compatible color format and maybe a cmask/cswizzle for D24 formats. */ - if (aspect != VK_IMAGE_ASPECT_COLOR_BIT) + if (!vk_format_is_color(dst_format) || !vk_format_is_color(src_format)) return handled; /* FIXME: we only handle uncompressed images for now. 
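The fragment shader above reduces each destination pixel to 1D indexing into the texel buffer. A CPU rendering of the same copy, everything in texel units:

#include <stdint.h>

static void
copy_box_from_texel_buffer(uint32_t *dst, uint32_t dst_pitch,
                           const uint32_t *texels, uint32_t stride,
                           uint32_t offset,
                           uint32_t box_x, uint32_t box_y,
                           uint32_t box_w, uint32_t box_h)
{
   for (uint32_t y = box_y; y < box_y + box_h; y++) {
      for (uint32_t x = box_x; x < box_x + box_w; x++) {
         /* texel_offset = offset + x_offset + y_offset * stride */
         uint32_t src = offset + (x - box_x) + (y - box_y) * stride;
         dst[y * dst_pitch + x] = texels[src];
      }
   }
}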
*/ @@ -1978,7 +2543,7 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer, if (!(buffer->usage & VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT)) { if (v3dv_buffer_format_supports_features( cmd_buffer->device, src_format, - VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT)) { + VK_FORMAT_FEATURE_2_UNIFORM_TEXEL_BUFFER_BIT)) { buffer->usage |= VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT; } else { return handled; @@ -2027,13 +2592,10 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer, if (result != VK_SUCCESS) return handled; - /* FIXME: for some reason passing region->bufferOffset here for the - * offset field doesn't work, making the following CTS tests fail: - * - * dEQP-VK.api.copy_and_blit.core.buffer_to_image.*buffer_offset* - * - * So instead we pass 0 here and we pass the offset in texels as a push - * constant to the shader, which seems to work correctly. + /* We can't pass region->bufferOffset here for the offset field because + * the texture base pointer in the texture shader state must be a 64-byte + * aligned value. Instead, we use 0 here and we pass the offset in texels + * as a push constant to the shader. */ VkDevice _device = v3dv_device_to_handle(cmd_buffer->device); VkBufferViewCreateInfo buffer_view_info = { @@ -2068,7 +2630,6 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer, /* Push command buffer state before starting meta operation */ v3dv_cmd_buffer_meta_state_push(cmd_buffer, true); - uint32_t dirty_dynamic_state = 0; /* Bind common state for all layers and regions */ VkCommandBuffer _cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer); @@ -2087,8 +2648,10 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer, * For 3D images, this creates a layered framebuffer with a number of * layers matching the depth extent of the 3D image. 
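The reworked comment above (buffer view at offset 0, offset passed in texels via push constant) works because any Vulkan-legal bufferOffset for these copies is texel-aligned even when it is not 64-byte aligned, so the byte-to-texel conversion is exact:

#include <assert.h>
#include <stdint.h>

static uint32_t
offset_in_texels(uint64_t buffer_offset_bytes, uint32_t cpp)
{
   assert(buffer_offset_bytes % cpp == 0);
   return (uint32_t)(buffer_offset_bytes / cpp);
}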
*/ - uint32_t fb_width = u_minify(image->vk.extent.width, resource->mipLevel); - uint32_t fb_height = u_minify(image->vk.extent.height, resource->mipLevel); + uint8_t plane = v3dv_plane_from_aspect(aspect); + uint32_t fb_width = u_minify(image->planes[plane].width, resource->mipLevel); + uint32_t fb_height = u_minify(image->planes[plane].height, resource->mipLevel); + VkImageViewCreateInfo image_view_info = { .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, .image = v3dv_image_to_handle(image), @@ -2103,8 +2666,8 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer, }, }; VkImageView image_view; - result = v3dv_CreateImageView(_device, &image_view_info, - &cmd_buffer->device->vk.alloc, &image_view); + result = v3dv_create_image_view(cmd_buffer->device, + &image_view_info, &image_view); if (result != VK_SUCCESS) goto fail; @@ -2173,7 +2736,12 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer, .clearValueCount = 0, }; - v3dv_CmdBeginRenderPass(_cmd_buffer, &rp_info, VK_SUBPASS_CONTENTS_INLINE); + VkSubpassBeginInfo sp_info = { + .sType = VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO, + .contents = VK_SUBPASS_CONTENTS_INLINE, + }; + + v3dv_CmdBeginRenderPass2(_cmd_buffer, &rp_info, &sp_info); struct v3dv_job *job = cmd_buffer->state.job; if (!job) goto fail; @@ -2190,9 +2758,8 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer, } /* For each region */ - dirty_dynamic_state = V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR; for (uint32_t r = 0; r < region_count; r++) { - const VkBufferImageCopy2KHR *region = ®ions[r]; + const VkBufferImageCopy2 *region = ®ions[r]; /* Obtain the 2D buffer region spec */ uint32_t buf_width, buf_height; @@ -2240,11 +2807,15 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer, v3dv_CmdDraw(_cmd_buffer, 4, 1, 0, 0); } /* For each region */ - v3dv_CmdEndRenderPass(_cmd_buffer); + VkSubpassEndInfo sp_end_info = { + .sType = VK_STRUCTURE_TYPE_SUBPASS_END_INFO, + }; + + v3dv_CmdEndRenderPass2(_cmd_buffer, &sp_end_info); } /* For each layer */ fail: - v3dv_cmd_buffer_meta_state_pop(cmd_buffer, dirty_dynamic_state, true); + v3dv_cmd_buffer_meta_state_pop(cmd_buffer, true); return handled; } @@ -2263,7 +2834,7 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer, VkColorComponentFlags cmask, VkComponentMapping *cswizzle, uint32_t region_count, - const VkBufferImageCopy2KHR *regions) + const VkBufferImageCopy2 *regions) { /* Since we can't sample linear images we need to upload the linear * buffer to a tiled image that we can use as a blit source, which @@ -2338,14 +2909,19 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer, */ assert(num_layers == 1 || region_count == 1); - const uint32_t block_width = vk_format_get_blockwidth(image->vk.format); - const uint32_t block_height = vk_format_get_blockheight(image->vk.format); + uint8_t plane = v3dv_plane_from_aspect(aspect); + assert(plane < image->plane_count); + + const uint32_t block_width = + vk_format_get_blockwidth(image->planes[plane].vk_format); + const uint32_t block_height = + vk_format_get_blockheight(image->planes[plane].vk_format); /* Copy regions by uploading each region to a temporary tiled image using * the memory we have just allocated as storage. 
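u_minify(), now applied to the plane dimensions rather than the image extent when sizing the per-level framebuffer, is the usual mip reduction (halve per level, clamp to 1):

#include <stdint.h>

static uint32_t
minify(uint32_t base, uint32_t level) /* mirrors u_minify() */
{
   uint32_t v = base >> level;
   return v > 0 ? v : 1;
}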
*/ for (uint32_t r = 0; r < region_count; r++) { - const VkBufferImageCopy2KHR *region = ®ions[r]; + const VkBufferImageCopy2 *region = ®ions[r]; /* Obtain the 2D buffer region spec */ uint32_t buf_width, buf_height; @@ -2396,16 +2972,23 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer, if (result != VK_SUCCESS) return handled; + /* When copying a multi-plane image the aspect indicates the plane to + * copy. For these, we only copy one plane at a time, which is always + * a color plane. + */ + VkImageAspectFlags copy_aspect = + image->plane_count == 1 ? aspect : VK_IMAGE_ASPECT_COLOR_BIT; + /* Upload buffer contents for the selected layer */ const VkDeviceSize buf_offset_bytes = region->bufferOffset + i * buf_height * buf_width * buffer_bpp; - const VkBufferImageCopy2KHR buffer_image_copy = { - .sType = VK_STRUCTURE_TYPE_BUFFER_IMAGE_COPY_2_KHR, + const VkBufferImageCopy2 buffer_image_copy = { + .sType = VK_STRUCTURE_TYPE_BUFFER_IMAGE_COPY_2, .bufferOffset = buf_offset_bytes, .bufferRowLength = region->bufferRowLength / block_width, .bufferImageHeight = region->bufferImageHeight / block_height, .imageSubresource = { - .aspectMask = aspect, + .aspectMask = copy_aspect, .mipLevel = 0, .baseArrayLayer = 0, .layerCount = 1, @@ -2434,10 +3017,10 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer, * image, but that we need to blit to a S8D24 destination (the only * stencil format we support). */ - const VkImageBlit2KHR blit_region = { - .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2_KHR, + const VkImageBlit2 blit_region = { + .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2, .srcSubresource = { - .aspectMask = aspect, + .aspectMask = copy_aspect, .mipLevel = 0, .baseArrayLayer = 0, .layerCount = 1, @@ -2493,7 +3076,7 @@ copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *image, struct v3dv_buffer *buffer, uint32_t region_count, - const VkBufferImageCopy2KHR *regions, + const VkBufferImageCopy2 *regions, bool use_texel_buffer) { /* We can only call this with region_count > 1 if we can batch the regions @@ -2501,12 +3084,20 @@ copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer, * the same aspect. */ VkImageAspectFlags aspect = regions[0].imageSubresource.aspectMask; + const VkImageAspectFlagBits any_plane_aspect = + VK_IMAGE_ASPECT_PLANE_0_BIT | + VK_IMAGE_ASPECT_PLANE_1_BIT | + VK_IMAGE_ASPECT_PLANE_2_BIT; + + bool is_plane_aspect = aspect & any_plane_aspect; /* Generally, the bpp of the data in the buffer matches that of the * destination image. The exception is the case where we are uploading * stencil (8bpp) to a combined d24s8 image (32bpp). */ - uint32_t buf_bpp = image->cpp; + uint8_t plane = v3dv_plane_from_aspect(aspect); + assert(plane < image->plane_count); + uint32_t buf_bpp = image->planes[plane].cpp; /* We are about to upload the buffer data to an image so we can then * blit that to our destination region. 
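The per-layer source offset in the upload loop above assumes layers are packed back to back as buf_width x buf_height texels of buffer_bpp bytes, matching buf_offset_bytes in the hunk:

#include <stdint.h>

static uint64_t
layer_buffer_offset(uint64_t region_offset, uint32_t layer,
                    uint32_t buf_width, uint32_t buf_height,
                    uint32_t buffer_bpp)
{
   return region_offset +
          (uint64_t)layer * buf_height * buf_width * buffer_bpp;
}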
Because we are going to implement @@ -2539,6 +3130,9 @@ copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer, case 4: switch (aspect) { case VK_IMAGE_ASPECT_COLOR_BIT: + case VK_IMAGE_ASPECT_PLANE_0_BIT: + case VK_IMAGE_ASPECT_PLANE_1_BIT: + case VK_IMAGE_ASPECT_PLANE_2_BIT: src_format = VK_FORMAT_R8G8B8A8_UINT; dst_format = src_format; break; @@ -2548,7 +3142,6 @@ copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer, image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32); src_format = VK_FORMAT_R8G8B8A8_UINT; dst_format = src_format; - aspect = VK_IMAGE_ASPECT_COLOR_BIT; /* For D24 formats, the Vulkan spec states that the depth component * in the buffer is stored in the 24-LSB, but V3D wants it in the @@ -2578,7 +3171,6 @@ copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer, src_format = VK_FORMAT_R8_UINT; dst_format = VK_FORMAT_R8G8B8A8_UINT; cmask = VK_COLOR_COMPONENT_R_BIT; - aspect = VK_IMAGE_ASPECT_COLOR_BIT; break; default: unreachable("unsupported aspect"); @@ -2586,12 +3178,14 @@ copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer, }; break; case 2: - aspect = VK_IMAGE_ASPECT_COLOR_BIT; + assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT || + aspect == VK_IMAGE_ASPECT_DEPTH_BIT || + is_plane_aspect); src_format = VK_FORMAT_R16_UINT; dst_format = src_format; break; case 1: - assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT); + assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT || is_plane_aspect); src_format = VK_FORMAT_R8_UINT; dst_format = src_format; break; @@ -2615,75 +3209,9 @@ copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer, } } -/** - * Returns true if the implementation supports the requested operation (even if - * it failed to process it, for example, due to an out-of-memory error). - */ -static bool -copy_buffer_to_image_cpu(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_image *image, - struct v3dv_buffer *buffer, - const VkBufferImageCopy2KHR *region) -{ - /* FIXME */ - if (vk_format_is_depth_or_stencil(image->vk.format)) - return false; - - if (vk_format_is_compressed(image->vk.format)) - return false; - - if (image->vk.tiling == VK_IMAGE_TILING_LINEAR) - return false; - - uint32_t buffer_width, buffer_height; - if (region->bufferRowLength == 0) - buffer_width = region->imageExtent.width; - else - buffer_width = region->bufferRowLength; - - if (region->bufferImageHeight == 0) - buffer_height = region->imageExtent.height; - else - buffer_height = region->bufferImageHeight; - - uint32_t buffer_stride = buffer_width * image->cpp; - uint32_t buffer_layer_stride = buffer_stride * buffer_height; - - uint32_t num_layers; - if (image->vk.image_type != VK_IMAGE_TYPE_3D) - num_layers = region->imageSubresource.layerCount; - else - num_layers = region->imageExtent.depth; - assert(num_layers > 0); - - struct v3dv_job *job = - v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, - V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE, - cmd_buffer, -1); - if (!job) - return true; - - job->cpu.copy_buffer_to_image.image = image; - job->cpu.copy_buffer_to_image.buffer = buffer; - job->cpu.copy_buffer_to_image.buffer_stride = buffer_stride; - job->cpu.copy_buffer_to_image.buffer_layer_stride = buffer_layer_stride; - job->cpu.copy_buffer_to_image.buffer_offset = region->bufferOffset; - job->cpu.copy_buffer_to_image.image_extent = region->imageExtent; - job->cpu.copy_buffer_to_image.image_offset = region->imageOffset; - job->cpu.copy_buffer_to_image.mip_level = - region->imageSubresource.mipLevel; - job->cpu.copy_buffer_to_image.base_layer = - 
region->imageSubresource.baseArrayLayer; - job->cpu.copy_buffer_to_image.layer_count = num_layers; - - list_addtail(&job->list_link, &cmd_buffer->jobs); - - return true; -} - VKAPI_ATTR void VKAPI_CALL -v3dv_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer, - const VkCopyBufferToImageInfo2KHR *info) +v3dv_CmdCopyBufferToImage2(VkCommandBuffer commandBuffer, + const VkCopyBufferToImageInfo2 *info) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); V3DV_FROM_HANDLE(v3dv_buffer, buffer, info->srcBuffer); @@ -2691,6 +3219,8 @@ v3dv_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer, assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT); + cmd_buffer->state.is_transfer = true; + uint32_t r = 0; while (r < info->regionCount) { /* The TFU and TLB paths can only copy one region at a time and the region @@ -2739,12 +3269,6 @@ v3dv_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer, * slow it might not be worth it and we should instead put more effort * in handling more cases with the other paths. */ - if (copy_buffer_to_image_cpu(cmd_buffer, image, buffer, - &info->pRegions[r])) { - batch_size = 1; - goto handled; - } - if (copy_buffer_to_image_shader(cmd_buffer, image, buffer, batch_size, &info->pRegions[r], false)) { goto handled; @@ -2755,6 +3279,8 @@ v3dv_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer, handled: r += batch_size; } + + cmd_buffer->state.is_transfer = false; } static void @@ -2773,17 +3299,31 @@ static bool blit_tfu(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *dst, struct v3dv_image *src, - const VkImageBlit2KHR *region) + const VkImageBlit2 *region) { + if (V3D_DBG(DISABLE_TFU)) { + perf_debug("Blit: TFU disabled, fallbacks could be slower."); + return false; + } + assert(dst->vk.samples == VK_SAMPLE_COUNT_1_BIT); assert(src->vk.samples == VK_SAMPLE_COUNT_1_BIT); + /* From vkCmdBlitImage: + * "srcImage must not use a format that requires a sampler YCBCR + * conversion" + * "dstImage must not use a format that requires a sampler YCBCR + * conversion" + */ + assert(dst->plane_count == 1); + assert(src->plane_count == 1); + /* Format must match */ if (src->vk.format != dst->vk.format) return false; /* Destination can't be raster format */ - if (dst->vk.tiling == VK_IMAGE_TILING_LINEAR) + if (!dst->tiled) return false; /* Source region must start at (0,0) */ @@ -2825,7 +3365,7 @@ blit_tfu(struct v3dv_cmd_buffer *cmd_buffer, */ const struct v3dv_format *format = v3dv_get_compatible_tfu_format(cmd_buffer->device, - dst->cpp, NULL); + dst->planes[0].cpp, NULL); /* Emit a TFU job for each layer to blit */ assert(region->dstSubresource.layerCount == @@ -2871,10 +3411,31 @@ blit_tfu(struct v3dv_cmd_buffer *cmd_buffer, dst_mirror_z ? max_dst_layer - i - 1: min_dst_layer + i; const uint32_t src_layer = src_mirror_z ? 
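The loop above batches consecutive regions for the shader fallbacks, since the TFU and TLB paths consume one region per job. The exact batching predicate is only partially visible in this hunk; given the texel-buffer path's stated requirement that batched regions share an aspect, a plausible sketch is (an assumption, not the driver's verified rule):

#include <stdint.h>
#include <vulkan/vulkan.h>

static uint32_t
batch_regions_by_aspect(const VkBufferImageCopy2 *regions,
                        uint32_t region_count, uint32_t first)
{
   uint32_t n = 1;
   while (first + n < region_count &&
          regions[first + n].imageSubresource.aspectMask ==
          regions[first].imageSubresource.aspectMask) {
      n++;
   }
   return n;
}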
max_src_layer - i - 1: min_src_layer + i; - v3dv_X(cmd_buffer->device, meta_emit_tfu_job) - (cmd_buffer, dst, dst_mip_level, dst_layer, - src, src_mip_level, src_layer, - dst_width, dst_height, format); + + const uint32_t dst_offset = + dst->planes[0].mem->bo->offset + v3dv_layer_offset(dst, dst_mip_level, + dst_layer, 0); + const uint32_t src_offset = + src->planes[0].mem->bo->offset + v3dv_layer_offset(src, src_mip_level, + src_layer, 0); + + const struct v3d_resource_slice *dst_slice = &dst->planes[0].slices[dst_mip_level]; + const struct v3d_resource_slice *src_slice = &src->planes[0].slices[src_mip_level]; + + v3dv_X(cmd_buffer->device, meta_emit_tfu_job)( + cmd_buffer, + dst->planes[0].mem->bo->handle, + dst_offset, + dst_slice->tiling, + dst_slice->padded_height, + dst->planes[0].cpp, + src->planes[0].mem->bo->handle, + src_offset, + src_slice->tiling, + src_slice->tiling == V3D_TILING_RASTER ? + src_slice->stride : src_slice->padded_height, + src->planes[0].cpp, + dst_width, dst_height, &format->planes[0]); } return true; @@ -2941,7 +3502,8 @@ create_blit_render_pass(struct v3dv_device *device, const bool is_color_blit = vk_format_is_color(dst_format); /* Attachment load operation is specified below */ - VkAttachmentDescription att = { + VkAttachmentDescription2 att = { + .sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2, .format = dst_format, .samples = VK_SAMPLE_COUNT_1_BIT, .storeOp = VK_ATTACHMENT_STORE_OP_STORE, @@ -2949,12 +3511,14 @@ create_blit_render_pass(struct v3dv_device *device, .finalLayout = VK_IMAGE_LAYOUT_GENERAL, }; - VkAttachmentReference att_ref = { + VkAttachmentReference2 att_ref = { + .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2, .attachment = 0, .layout = VK_IMAGE_LAYOUT_GENERAL, }; - VkSubpassDescription subpass = { + VkSubpassDescription2 subpass = { + .sType = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2, .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, .inputAttachmentCount = 0, .colorAttachmentCount = is_color_blit ? 1 : 0, @@ -2965,8 +3529,8 @@ create_blit_render_pass(struct v3dv_device *device, .pPreserveAttachments = NULL, }; - VkRenderPassCreateInfo info = { - .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, + VkRenderPassCreateInfo2 info = { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2, .attachmentCount = 1, .pAttachments = &att, .subpassCount = 1, @@ -2977,60 +3541,27 @@ create_blit_render_pass(struct v3dv_device *device, VkResult result; att.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; - result = v3dv_CreateRenderPass(v3dv_device_to_handle(device), - &info, &device->vk.alloc, pass_load); + result = v3dv_CreateRenderPass2(v3dv_device_to_handle(device), + &info, &device->vk.alloc, pass_load); if (result != VK_SUCCESS) return false; att.loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; - result = v3dv_CreateRenderPass(v3dv_device_to_handle(device), - &info, &device->vk.alloc, pass_no_load); + result = v3dv_CreateRenderPass2(v3dv_device_to_handle(device), + &info, &device->vk.alloc, pass_no_load); return result == VK_SUCCESS; } -static nir_ssa_def * -gen_rect_vertices(nir_builder *b) -{ - nir_ssa_def *vertex_id = nir_load_vertex_id(b); - - /* vertex 0: -1.0, -1.0 - * vertex 1: -1.0, 1.0 - * vertex 2: 1.0, -1.0 - * vertex 3: 1.0, 1.0 - * - * so: - * - * channel 0 is vertex_id < 2 ? -1.0 : 1.0 - * channel 1 is vertex id & 1 ? 
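Layer iteration in blit_tfu(), isolated: walk i over the layer count and mirror in Z when the blit is reversed on that axis:

#include <stdbool.h>
#include <stdint.h>

static uint32_t
blit_layer(uint32_t min_layer, uint32_t max_layer, uint32_t i,
           bool mirror_z)
{
   return mirror_z ? max_layer - i - 1 : min_layer + i;
}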
1.0 : -1.0 - */ - - nir_ssa_def *one = nir_imm_int(b, 1); - nir_ssa_def *c0cmp = nir_ilt(b, vertex_id, nir_imm_int(b, 2)); - nir_ssa_def *c1cmp = nir_ieq(b, nir_iand(b, vertex_id, one), one); - - nir_ssa_def *comp[4]; - comp[0] = nir_bcsel(b, c0cmp, - nir_imm_float(b, -1.0f), - nir_imm_float(b, 1.0f)); - - comp[1] = nir_bcsel(b, c1cmp, - nir_imm_float(b, 1.0f), - nir_imm_float(b, -1.0f)); - comp[2] = nir_imm_float(b, 0.0f); - comp[3] = nir_imm_float(b, 1.0f); - return nir_vec(b, comp, 4); -} - -static nir_ssa_def * +static nir_def * gen_tex_coords(nir_builder *b) { - nir_ssa_def *tex_box = + nir_def *tex_box = nir_load_push_constant(b, 4, 32, nir_imm_int(b, 0), .base = 0, .range = 16); - nir_ssa_def *tex_z = + nir_def *tex_z = nir_load_push_constant(b, 1, 32, nir_imm_int(b, 0), .base = 16, .range = 4); - nir_ssa_def *vertex_id = nir_load_vertex_id(b); + nir_def *vertex_id = nir_load_vertex_id(b); /* vertex 0: src0_x, src0_y * vertex 1: src0_x, src1_y @@ -3043,11 +3574,11 @@ gen_tex_coords(nir_builder *b) * channel 1 is vertex id & 1 ? src1_y : src0_y */ - nir_ssa_def *one = nir_imm_int(b, 1); - nir_ssa_def *c0cmp = nir_ilt(b, vertex_id, nir_imm_int(b, 2)); - nir_ssa_def *c1cmp = nir_ieq(b, nir_iand(b, vertex_id, one), one); + nir_def *one = nir_imm_int(b, 1); + nir_def *c0cmp = nir_ilt_imm(b, vertex_id, 2); + nir_def *c1cmp = nir_ieq(b, nir_iand(b, vertex_id, one), one); - nir_ssa_def *comp[4]; + nir_def *comp[4]; comp[0] = nir_bcsel(b, c0cmp, nir_channel(b, tex_box, 0), nir_channel(b, tex_box, 2)); @@ -3060,9 +3591,9 @@ gen_tex_coords(nir_builder *b) return nir_vec(b, comp, 4); } -static nir_ssa_def * +static nir_def * build_nir_tex_op_read(struct nir_builder *b, - nir_ssa_def *tex_pos, + nir_def *tex_pos, enum glsl_base_type tex_type, enum glsl_sampler_dim dim) { @@ -3075,57 +3606,49 @@ build_nir_tex_op_read(struct nir_builder *b, sampler->data.descriptor_set = 0; sampler->data.binding = 0; - nir_ssa_def *tex_deref = &nir_build_deref_var(b, sampler)->dest.ssa; + nir_def *tex_deref = &nir_build_deref_var(b, sampler)->def; nir_tex_instr *tex = nir_tex_instr_create(b->shader, 3); tex->sampler_dim = dim; tex->op = nir_texop_tex; - tex->src[0].src_type = nir_tex_src_coord; - tex->src[0].src = nir_src_for_ssa(tex_pos); - tex->src[1].src_type = nir_tex_src_texture_deref; - tex->src[1].src = nir_src_for_ssa(tex_deref); - tex->src[2].src_type = nir_tex_src_sampler_deref; - tex->src[2].src = nir_src_for_ssa(tex_deref); + tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, tex_pos); + tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_texture_deref, tex_deref); + tex->src[2] = nir_tex_src_for_ssa(nir_tex_src_sampler_deref, tex_deref); tex->dest_type = nir_get_nir_type_for_glsl_base_type(tex_type); tex->is_array = glsl_sampler_type_is_array(sampler_type); tex->coord_components = tex_pos->num_components; - nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex"); + nir_def_init(&tex->instr, &tex->def, 4, 32); nir_builder_instr_insert(b, &tex->instr); - return &tex->dest.ssa; + return &tex->def; } -static nir_ssa_def * +static nir_def * build_nir_tex_op_ms_fetch_sample(struct nir_builder *b, nir_variable *sampler, - nir_ssa_def *tex_deref, + nir_def *tex_deref, enum glsl_base_type tex_type, - nir_ssa_def *tex_pos, - nir_ssa_def *sample_idx) + nir_def *tex_pos, + nir_def *sample_idx) { - nir_tex_instr *tex = nir_tex_instr_create(b->shader, 4); + nir_tex_instr *tex = nir_tex_instr_create(b->shader, 3); tex->sampler_dim = GLSL_SAMPLER_DIM_MS; tex->op = nir_texop_txf_ms; - tex->src[0].src_type = 
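gen_tex_coords() above (and nir_gen_rect_vertices(), which replaced the removed open-coded position generator) both use the vertex_id selection spelled out in the comments. On the CPU the corner pick reads:

/* box = (x0, y0, x1, y1); vertex_id in [0, 3] of a triangle strip. */
static void
rect_corner(const float box[4], int vertex_id, float out[2])
{
   out[0] = vertex_id < 2 ? box[0] : box[2];   /* ids 0,1 -> x0 */
   out[1] = (vertex_id & 1) ? box[3] : box[1]; /* odd ids -> y1 */
}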
nir_tex_src_coord; - tex->src[0].src = nir_src_for_ssa(tex_pos); - tex->src[1].src_type = nir_tex_src_texture_deref; - tex->src[1].src = nir_src_for_ssa(tex_deref); - tex->src[2].src_type = nir_tex_src_sampler_deref; - tex->src[2].src = nir_src_for_ssa(tex_deref); - tex->src[3].src_type = nir_tex_src_ms_index; - tex->src[3].src = nir_src_for_ssa(sample_idx); + tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, tex_pos); + tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_texture_deref, tex_deref); + tex->src[2] = nir_tex_src_for_ssa(nir_tex_src_ms_index, sample_idx); tex->dest_type = nir_get_nir_type_for_glsl_base_type(tex_type); tex->is_array = false; tex->coord_components = tex_pos->num_components; - nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex"); + nir_def_init(&tex->instr, &tex->def, 4, 32); nir_builder_instr_insert(b, &tex->instr); - return &tex->dest.ssa; + return &tex->def; } /* Fetches all samples at the given position and averages them */ -static nir_ssa_def * +static nir_def * build_nir_tex_op_ms_resolve(struct nir_builder *b, - nir_ssa_def *tex_pos, + nir_def *tex_pos, enum glsl_base_type tex_type, VkSampleCountFlagBits src_samples) { @@ -3139,10 +3662,10 @@ build_nir_tex_op_ms_resolve(struct nir_builder *b, const bool is_int = glsl_base_type_is_integer(tex_type); - nir_ssa_def *tmp = NULL; - nir_ssa_def *tex_deref = &nir_build_deref_var(b, sampler)->dest.ssa; + nir_def *tmp = NULL; + nir_def *tex_deref = &nir_build_deref_var(b, sampler)->def; for (uint32_t i = 0; i < src_samples; i++) { - nir_ssa_def *s = + nir_def *s = build_nir_tex_op_ms_fetch_sample(b, sampler, tex_deref, tex_type, tex_pos, nir_imm_int(b, i)); @@ -3157,13 +3680,13 @@ build_nir_tex_op_ms_resolve(struct nir_builder *b, } assert(!is_int); - return nir_fmul(b, tmp, nir_imm_float(b, 1.0f / src_samples)); + return nir_fmul_imm(b, tmp, 1.0f / src_samples); } /* Fetches the current sample (gl_SampleID) at the given position */ -static nir_ssa_def * +static nir_def * build_nir_tex_op_ms_read(struct nir_builder *b, - nir_ssa_def *tex_pos, + nir_def *tex_pos, enum glsl_base_type tex_type) { const struct glsl_type *sampler_type = @@ -3173,17 +3696,17 @@ build_nir_tex_op_ms_read(struct nir_builder *b, sampler->data.descriptor_set = 0; sampler->data.binding = 0; - nir_ssa_def *tex_deref = &nir_build_deref_var(b, sampler)->dest.ssa; + nir_def *tex_deref = &nir_build_deref_var(b, sampler)->def; return build_nir_tex_op_ms_fetch_sample(b, sampler, tex_deref, tex_type, tex_pos, nir_load_sample_id(b)); } -static nir_ssa_def * +static nir_def * build_nir_tex_op(struct nir_builder *b, struct v3dv_device *device, - nir_ssa_def *tex_pos, + nir_def *tex_pos, enum glsl_base_type tex_type, VkSampleCountFlagBits dst_samples, VkSampleCountFlagBits src_samples, @@ -3227,10 +3750,10 @@ get_blit_vs() vs_out_tex_coord->data.location = VARYING_SLOT_VAR0; vs_out_tex_coord->data.interpolation = INTERP_MODE_SMOOTH; - nir_ssa_def *pos = gen_rect_vertices(&b); + nir_def *pos = nir_gen_rect_vertices(&b, NULL, NULL); nir_store_var(&b, vs_out_pos, pos, 0xf); - nir_ssa_def *tex_coord = gen_tex_coords(&b); + nir_def *tex_coord = gen_tex_coords(&b); nir_store_var(&b, vs_out_tex_coord, tex_coord, 0xf); return b.shader; @@ -3281,11 +3804,11 @@ get_color_blit_fs(struct v3dv_device *device, nir_variable_create(b.shader, nir_var_shader_out, fs_out_type, "out_color"); fs_out_color->data.location = FRAG_RESULT_DATA0; - nir_ssa_def *tex_coord = nir_load_var(&b, fs_in_tex_coord); + nir_def *tex_coord = nir_load_var(&b, fs_in_tex_coord); const uint32_t 
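build_nir_tex_op_ms_resolve() emits one txf_ms per sample and averages the results; integer formats never reach the multiply (note the assert(!is_int)). The scalar equivalent:

#include <stdint.h>

static float
resolve_average(const float *samples, uint32_t sample_count)
{
   float sum = 0.0f;
   for (uint32_t i = 0; i < sample_count; i++)
      sum += samples[i];
   /* Matches nir_fmul_imm(b, tmp, 1.0f / src_samples). */
   return sum * (1.0f / sample_count);
}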
channel_mask = get_channel_mask_for_sampler_dim(sampler_dim); tex_coord = nir_channels(&b, tex_coord, channel_mask); - nir_ssa_def *color = build_nir_tex_op(&b, device, tex_coord, src_base_type, + nir_def *color = build_nir_tex_op(&b, device, tex_coord, src_base_type, dst_samples, src_samples, sampler_dim); /* For integer textures, if the bit-size of the destination is too small to @@ -3300,7 +3823,7 @@ get_color_blit_fs(struct v3dv_device *device, enum pipe_format src_pformat = vk_format_to_pipe_format(src_format); enum pipe_format dst_pformat = vk_format_to_pipe_format(dst_format); - nir_ssa_def *c[4]; + nir_def *c[4]; for (uint32_t i = 0; i < 4; i++) { c[i] = nir_channel(&b, color, i); @@ -3318,11 +3841,11 @@ get_color_blit_fs(struct v3dv_device *device, assert(dst_bit_size > 0); if (util_format_is_pure_uint(dst_pformat)) { - nir_ssa_def *max = nir_imm_int(&b, (1 << dst_bit_size) - 1); + nir_def *max = nir_imm_int(&b, (1 << dst_bit_size) - 1); c[i] = nir_umin(&b, c[i], max); } else { - nir_ssa_def *max = nir_imm_int(&b, (1 << (dst_bit_size - 1)) - 1); - nir_ssa_def *min = nir_imm_int(&b, -(1 << (dst_bit_size - 1))); + nir_def *max = nir_imm_int(&b, (1 << (dst_bit_size - 1)) - 1); + nir_def *min = nir_imm_int(&b, -(1 << (dst_bit_size - 1))); c[i] = nir_imax(&b, nir_imin(&b, c[i], max), min); } } @@ -3348,14 +3871,12 @@ create_pipeline(struct v3dv_device *device, const VkPipelineLayout layout, VkPipeline *pipeline) { - struct vk_shader_module vs_m; + struct vk_shader_module vs_m = vk_shader_module_from_nir(vs_nir); + struct vk_shader_module fs_m = vk_shader_module_from_nir(fs_nir); struct vk_shader_module gs_m; - struct vk_shader_module fs_m; uint32_t num_stages = gs_nir ? 3 : 2; - v3dv_shader_module_internal_init(device, &vs_m, vs_nir); - v3dv_shader_module_internal_init(device, &fs_m, fs_nir); VkPipelineShaderStageCreateInfo stages[3] = { { @@ -3379,7 +3900,7 @@ create_pipeline(struct v3dv_device *device, }; if (gs_nir) { - v3dv_shader_module_internal_init(device, &gs_m, gs_nir); + gs_m = vk_shader_module_from_nir(gs_nir); stages[2].module = vk_shader_module_to_handle(&gs_m); } @@ -3452,6 +3973,7 @@ create_pipeline(struct v3dv_device *device, pipeline); ralloc_free(vs_nir); + ralloc_free(gs_nir); ralloc_free(fs_nir); return result == VK_SUCCESS; @@ -3762,6 +4284,8 @@ allocate_blit_source_descriptor_set(struct v3dv_cmd_buffer *cmd_buffer, * cmask parameter (which can be 0 to default to all channels), as well as a * swizzle to apply to the source via the cswizzle parameter (which can be NULL * to use the default identity swizzle). + * + * Supports multi-plane formats too. */ static bool blit_shader(struct v3dv_cmd_buffer *cmd_buffer, @@ -3771,25 +4295,23 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, VkFormat src_format, VkColorComponentFlags cmask, VkComponentMapping *cswizzle, - const VkImageBlit2KHR *_region, + const VkImageBlit2 *region, VkFilter filter, bool dst_is_padded_image) { bool handled = true; VkResult result; - uint32_t dirty_dynamic_state = 0; /* We don't support rendering to linear depth/stencil, this should have * been rewritten to a compatible color blit by the caller. 
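The clamping in get_color_blit_fs() keeps integer sources representable when the destination channel is narrower; scalar versions of the umin and imax(imin()) pairs (valid for 0 < bits < 32, as in that path):

#include <stdint.h>

static uint32_t
clamp_uint_to_bits(uint32_t v, unsigned bits)
{
   const uint32_t max = (1u << bits) - 1;
   return v < max ? v : max;
}

static int32_t
clamp_int_to_bits(int32_t v, unsigned bits)
{
   const int32_t max = (1 << (bits - 1)) - 1;
   const int32_t min = -(1 << (bits - 1));
   return v > max ? max : (v < min ? min : v);
}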
*/ - assert(dst->vk.tiling != VK_IMAGE_TILING_LINEAR || - !vk_format_is_depth_or_stencil(dst_format)); + assert(dst->tiled || !vk_format_is_depth_or_stencil(dst_format)); /* Can't sample from linear images */ - if (src->vk.tiling == VK_IMAGE_TILING_LINEAR && src->vk.image_type != VK_IMAGE_TYPE_1D) + if (!src->tiled && src->vk.image_type != VK_IMAGE_TYPE_1D) { return false; + } - VkImageBlit2KHR region = *_region; /* Rewrite combined D/S blits to compatible color blits */ if (vk_format_is_depth_or_stencil(dst_format)) { assert(src_format == dst_format); @@ -3803,12 +4325,12 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, break; case VK_FORMAT_X8_D24_UNORM_PACK32: case VK_FORMAT_D24_UNORM_S8_UINT: - if (region.srcSubresource.aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) { + if (region->srcSubresource.aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) { cmask |= VK_COLOR_COMPONENT_G_BIT | VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; } - if (region.srcSubresource.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) { + if (region->srcSubresource.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) { assert(dst_format == VK_FORMAT_D24_UNORM_S8_UINT); cmask |= VK_COLOR_COMPONENT_R_BIT; } @@ -3818,10 +4340,15 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, unreachable("Unsupported depth/stencil format"); }; src_format = dst_format; - region.srcSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; - region.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; } + uint8_t src_plane = + v3dv_plane_from_aspect(region->srcSubresource.aspectMask); + assert(src_plane < src->plane_count); + uint8_t dst_plane = + v3dv_plane_from_aspect(region->dstSubresource.aspectMask); + assert(dst_plane < dst->plane_count); + const VkColorComponentFlags full_cmask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | VK_COLOR_COMPONENT_B_BIT | @@ -3844,34 +4371,40 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, * need to apply those same semantics here when we compute the size of the * destination image level. 
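The rewrite of combined depth/stencil blits to color blits above hinges on the channel aliasing of the RGBA8 view of D24S8/X8D24 data: stencil occupies the byte aliased by R, the 24 depth bits the bytes aliased by G/B/A. As a mask table:

#include <vulkan/vulkan.h>

static VkColorComponentFlags
d24_blit_cmask(VkImageAspectFlags aspects)
{
   VkColorComponentFlags cmask = 0;
   if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
      cmask |= VK_COLOR_COMPONENT_G_BIT | VK_COLOR_COMPONENT_B_BIT |
               VK_COLOR_COMPONENT_A_BIT;
   if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT)
      cmask |= VK_COLOR_COMPONENT_R_BIT;
   return cmask;
}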
*/ - const uint32_t dst_block_w = vk_format_get_blockwidth(dst->vk.format); - const uint32_t dst_block_h = vk_format_get_blockheight(dst->vk.format); - const uint32_t src_block_w = vk_format_get_blockwidth(src->vk.format); - const uint32_t src_block_h = vk_format_get_blockheight(src->vk.format); + const uint32_t dst_block_w = + vk_format_get_blockwidth(dst->planes[dst_plane].vk_format); + const uint32_t dst_block_h = + vk_format_get_blockheight(dst->planes[dst_plane].vk_format); + const uint32_t src_block_w = + vk_format_get_blockwidth(src->planes[src_plane].vk_format); + const uint32_t src_block_h = + vk_format_get_blockheight(src->planes[src_plane].vk_format); const uint32_t dst_level_w = u_minify(DIV_ROUND_UP(dst->vk.extent.width * src_block_w, dst_block_w), - region.dstSubresource.mipLevel); + region->dstSubresource.mipLevel); const uint32_t dst_level_h = u_minify(DIV_ROUND_UP(dst->vk.extent.height * src_block_h, dst_block_h), - region.dstSubresource.mipLevel); + region->dstSubresource.mipLevel); const uint32_t src_level_w = - u_minify(src->vk.extent.width, region.srcSubresource.mipLevel); + u_minify(src->planes[src_plane].width, region->srcSubresource.mipLevel); const uint32_t src_level_h = - u_minify(src->vk.extent.height, region.srcSubresource.mipLevel); + u_minify(src->planes[src_plane].height, region->srcSubresource.mipLevel); + + assert(src->plane_count == 1 || src->vk.image_type != VK_IMAGE_TYPE_3D); const uint32_t src_level_d = - u_minify(src->vk.extent.depth, region.srcSubresource.mipLevel); + u_minify(src->vk.extent.depth, region->srcSubresource.mipLevel); uint32_t dst_x, dst_y, dst_w, dst_h; bool dst_mirror_x, dst_mirror_y; - compute_blit_box(region.dstOffsets, + compute_blit_box(region->dstOffsets, dst_level_w, dst_level_h, &dst_x, &dst_y, &dst_w, &dst_h, &dst_mirror_x, &dst_mirror_y); uint32_t src_x, src_y, src_w, src_h; bool src_mirror_x, src_mirror_y; - compute_blit_box(region.srcOffsets, + compute_blit_box(region->srcOffsets, src_level_w, src_level_h, &src_x, &src_y, &src_w, &src_h, &src_mirror_x, &src_mirror_y); @@ -3880,10 +4413,10 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, uint32_t max_dst_layer; bool dst_mirror_z = false; if (dst->vk.image_type != VK_IMAGE_TYPE_3D) { - min_dst_layer = region.dstSubresource.baseArrayLayer; - max_dst_layer = min_dst_layer + region.dstSubresource.layerCount; + min_dst_layer = region->dstSubresource.baseArrayLayer; + max_dst_layer = min_dst_layer + region->dstSubresource.layerCount; } else { - compute_blit_3d_layers(region.dstOffsets, + compute_blit_3d_layers(region->dstOffsets, &min_dst_layer, &max_dst_layer, &dst_mirror_z); } @@ -3892,10 +4425,10 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, uint32_t max_src_layer; bool src_mirror_z = false; if (src->vk.image_type != VK_IMAGE_TYPE_3D) { - min_src_layer = region.srcSubresource.baseArrayLayer; - max_src_layer = min_src_layer + region.srcSubresource.layerCount; + min_src_layer = region->srcSubresource.baseArrayLayer; + max_src_layer = min_src_layer + region->srcSubresource.layerCount; } else { - compute_blit_3d_layers(region.srcOffsets, + compute_blit_3d_layers(region->srcOffsets, &min_src_layer, &max_src_layer, &src_mirror_z); } @@ -4010,7 +4543,6 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, }; /* Record per-layer commands */ - VkImageAspectFlags aspects = region.dstSubresource.aspectMask; for (uint32_t i = 0; i < layer_count; i++) { /* Setup framebuffer */ VkImageViewCreateInfo dst_image_view_info = { @@ -4019,16 +4551,16 @@ blit_shader(struct v3dv_cmd_buffer 
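compute_blit_box(), called above for both endpoints, normalizes a possibly reversed offset pair into an origin, a size and a mirror flag per axis. Its body is outside this hunk; a plausible single-axis sketch consistent with the callers (an assumption, including the clamping):

#include <stdbool.h>
#include <stdint.h>

static void
blit_box_1d(int32_t o0, int32_t o1, uint32_t level_size,
            uint32_t *x, uint32_t *w, bool *mirror)
{
   *mirror = o1 < o0;
   int32_t lo = *mirror ? o1 : o0;
   int32_t hi = *mirror ? o0 : o1;
   if (lo < 0)
      lo = 0;
   if (hi > (int32_t)level_size)
      hi = level_size;
   *x = (uint32_t)lo;
   *w = (uint32_t)(hi - lo);
}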
*cmd_buffer, .viewType = v3dv_image_type_to_view_type(dst->vk.image_type), .format = dst_format, .subresourceRange = { - .aspectMask = aspects, - .baseMipLevel = region.dstSubresource.mipLevel, + .aspectMask = region->dstSubresource.aspectMask, + .baseMipLevel = region->dstSubresource.mipLevel, .levelCount = 1, .baseArrayLayer = min_dst_layer + i, .layerCount = 1 }, }; VkImageView dst_image_view; - result = v3dv_CreateImageView(_device, &dst_image_view_info, - &device->vk.alloc, &dst_image_view); + result = v3dv_create_image_view(device, &dst_image_view_info, + &dst_image_view); if (result != VK_SUCCESS) goto fail; @@ -4078,8 +4610,8 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, .format = src_format, .components = *cswizzle, .subresourceRange = { - .aspectMask = aspects, - .baseMipLevel = region.srcSubresource.mipLevel, + .aspectMask = region->srcSubresource.aspectMask, + .baseMipLevel = region->srcSubresource.mipLevel, .levelCount = 1, .baseArrayLayer = src->vk.image_type == VK_IMAGE_TYPE_3D ? 0 : min_src_layer + i, @@ -4087,8 +4619,8 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, }, }; VkImageView src_image_view; - result = v3dv_CreateImageView(_device, &src_image_view_info, - &device->vk.alloc, &src_image_view); + result = v3dv_create_image_view(device, &src_image_view_info, + &src_image_view); if (result != VK_SUCCESS) goto fail; @@ -4146,7 +4678,12 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, .clearValueCount = 0, }; - v3dv_CmdBeginRenderPass(_cmd_buffer, &rp_info, VK_SUBPASS_CONTENTS_INLINE); + VkSubpassBeginInfo sp_info = { + .sType = VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO, + .contents = VK_SUBPASS_CONTENTS_INLINE, + }; + + v3dv_CmdBeginRenderPass2(_cmd_buffer, &rp_info, &sp_info); struct v3dv_job *job = cmd_buffer->state.job; if (!job) goto fail; @@ -4170,25 +4707,37 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, v3dv_CmdDraw(_cmd_buffer, 4, 1, 0, 0); - v3dv_CmdEndRenderPass(_cmd_buffer); - dirty_dynamic_state = V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR; + VkSubpassEndInfo sp_end_info = { + .sType = VK_STRUCTURE_TYPE_SUBPASS_END_INFO, + }; + + v3dv_CmdEndRenderPass2(_cmd_buffer, &sp_end_info); } fail: - v3dv_cmd_buffer_meta_state_pop(cmd_buffer, dirty_dynamic_state, true); + v3dv_cmd_buffer_meta_state_pop(cmd_buffer, true); return handled; } VKAPI_ATTR void VKAPI_CALL -v3dv_CmdBlitImage2KHR(VkCommandBuffer commandBuffer, - const VkBlitImageInfo2KHR *pBlitImageInfo) +v3dv_CmdBlitImage2(VkCommandBuffer commandBuffer, + const VkBlitImageInfo2 *pBlitImageInfo) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); V3DV_FROM_HANDLE(v3dv_image, src, pBlitImageInfo->srcImage); V3DV_FROM_HANDLE(v3dv_image, dst, pBlitImageInfo->dstImage); - /* This command can only happen outside a render pass */ + /* From vkCmdBlitImage: + * "srcImage must not use a format that requires a sampler YCBCR + * conversion" + * "dstImage must not use a format that requires a sampler YCBCR + * conversion" + */ + assert(src->plane_count == 1); + assert(dst->plane_count == 1); + + /* This command can only happen outside a render pass */ assert(cmd_buffer->state.pass == NULL); assert(cmd_buffer->state.job == NULL); @@ -4199,29 +4748,41 @@ v3dv_CmdBlitImage2KHR(VkCommandBuffer commandBuffer, /* We don't export VK_FORMAT_FEATURE_BLIT_DST_BIT on compressed formats */ assert(!vk_format_is_compressed(dst->vk.format)); + cmd_buffer->state.is_transfer = true; + for (uint32_t i = 0; i < pBlitImageInfo->regionCount; i++) { - if (blit_tfu(cmd_buffer, dst, src, &pBlitImageInfo->pRegions[i])) + 
const VkImageBlit2 *region = &pBlitImageInfo->pRegions[i]; + + if (blit_tfu(cmd_buffer, dst, src, region)) continue; if (blit_shader(cmd_buffer, dst, dst->vk.format, src, src->vk.format, 0, NULL, - &pBlitImageInfo->pRegions[i], + region, pBlitImageInfo->filter, true)) { continue; } unreachable("Unsupported blit operation"); } + + cmd_buffer->state.is_transfer = false; } static bool resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *dst, struct v3dv_image *src, - const VkImageResolve2KHR *region) + const VkImageResolve2 *region) { - if (!v3dv_meta_can_use_tlb(src, &region->srcOffset, NULL) || - !v3dv_meta_can_use_tlb(dst, &region->dstOffset, NULL)) { + /* No resolve for multi-planar images. Using plane 0 */ + assert(dst->plane_count == 1); + assert(src->plane_count == 1); + + if (!v3dv_meta_can_use_tlb(src, 0, region->srcSubresource.mipLevel, + &region->srcOffset, NULL, NULL) || + !v3dv_meta_can_use_tlb(dst, 0, region->dstSubresource.mipLevel, + &region->dstOffset, &region->extent, NULL)) { return false; } @@ -4242,8 +4803,10 @@ resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, if (!job) return true; - const uint32_t block_w = vk_format_get_blockwidth(dst->vk.format); - const uint32_t block_h = vk_format_get_blockheight(dst->vk.format); + const uint32_t block_w = + vk_format_get_blockwidth(dst->planes[0].vk_format); + const uint32_t block_h = + vk_format_get_blockheight(dst->planes[0].vk_format); const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w); const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h); @@ -4252,8 +4815,9 @@ resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, (fb_format, region->srcSubresource.aspectMask, &internal_type, &internal_bpp); - v3dv_job_start_frame(job, width, height, num_layers, false, - 1, internal_bpp, true); + v3dv_job_start_frame(job, width, height, num_layers, false, true, 1, + internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), + true); struct v3dv_meta_framebuffer framebuffer; v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format, @@ -4271,10 +4835,10 @@ static bool resolve_image_blit(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *dst, struct v3dv_image *src, - const VkImageResolve2KHR *region) + const VkImageResolve2 *region) { - const VkImageBlit2KHR blit_region = { - .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2_KHR, + const VkImageBlit2 blit_region = { + .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2, .srcSubresource = region->srcSubresource, .srcOffsets = { region->srcOffset, @@ -4300,8 +4864,8 @@ resolve_image_blit(struct v3dv_cmd_buffer *cmd_buffer, } VKAPI_ATTR void VKAPI_CALL -v3dv_CmdResolveImage2KHR(VkCommandBuffer commandBuffer, - const VkResolveImageInfo2KHR *info) +v3dv_CmdResolveImage2(VkCommandBuffer commandBuffer, + const VkResolveImageInfo2 *info) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); @@ -4315,6 +4879,12 @@ v3dv_CmdResolveImage2KHR(VkCommandBuffer commandBuffer, assert(src->vk.samples == VK_SAMPLE_COUNT_4_BIT); assert(dst->vk.samples == VK_SAMPLE_COUNT_1_BIT); + /* We don't support multi-sampled multi-plane images */ + assert(src->plane_count == 1); + assert(dst->plane_count == 1); + + cmd_buffer->state.is_transfer = true; + for (uint32_t i = 0; i < info->regionCount; i++) { if (resolve_image_tlb(cmd_buffer, dst, src, &info->pRegions[i])) continue; @@ -4322,4 +4892,6 @@ v3dv_CmdResolveImage2KHR(VkCommandBuffer commandBuffer, continue; unreachable("Unsupported multisample resolve operation"); } + + cmd_buffer->state.is_transfer = false; } diff --git
a/src/broadcom/vulkan/v3dv_pass.c b/src/broadcom/vulkan/v3dv_pass.c index 464703e42a4..ae6e37159d4 100644 --- a/src/broadcom/vulkan/v3dv_pass.c +++ b/src/broadcom/vulkan/v3dv_pass.c @@ -1,5 +1,5 @@ /* - * Copyright © 2019 Raspberry Pi + * Copyright © 2019 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -24,7 +24,7 @@ #include "v3dv_private.h" static uint32_t -num_subpass_attachments(const VkSubpassDescription *desc) +num_subpass_attachments(const VkSubpassDescription2 *desc) { return desc->inputAttachmentCount + desc->colorAttachmentCount + @@ -33,11 +33,11 @@ num_subpass_attachments(const VkSubpassDescription *desc) } static void -set_use_tlb_resolve(struct v3dv_device *device, +set_try_tlb_resolve(struct v3dv_device *device, struct v3dv_render_pass_attachment *att) { const struct v3dv_format *format = v3dv_X(device, get_format)(att->desc.format); - att->use_tlb_resolve = v3dv_X(device, format_supports_tlb_resolve)(format); + att->try_tlb_resolve = v3dv_X(device, format_supports_tlb_resolve)(format); } static void @@ -82,7 +82,7 @@ pass_find_subpass_range_for_attachments(struct v3dv_device *device, if (subpass->resolve_attachments && subpass->resolve_attachments[j].attachment != VK_ATTACHMENT_UNUSED) { - set_use_tlb_resolve(device, att); + set_try_tlb_resolve(device, att); } } @@ -92,6 +92,9 @@ pass_find_subpass_range_for_attachments(struct v3dv_device *device, pass->attachments[ds_attachment_idx].first_subpass = i; if (i > pass->attachments[ds_attachment_idx].last_subpass) pass->attachments[ds_attachment_idx].last_subpass = i; + + if (subpass->ds_resolve_attachment.attachment != VK_ATTACHMENT_UNUSED) + set_try_tlb_resolve(device, &pass->attachments[ds_attachment_idx]); } for (uint32_t j = 0; j < subpass->input_count; j++) { @@ -118,21 +121,57 @@ pass_find_subpass_range_for_attachments(struct v3dv_device *device, } } +/* GFXH-1461: if depth is cleared but stencil is loaded (or vice versa), + * the clear might get lost. If a subpass has this then we can't emit + * the clear using the TLB and we have to do it as a draw call. This + * issue is fixed since V3D 4.3.18. + * + * FIXME: separate stencil. 
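+ * + * For example, a VK_FORMAT_D24_UNORM_S8_UINT attachment with loadOp = + * VK_ATTACHMENT_LOAD_OP_CLEAR and stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD + * takes this path and sets do_depth_clear_with_draw; the swapped + * combination sets do_stencil_clear_with_draw instead.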
+ */ +static void +check_do_depth_stencil_clear_with_draw(struct v3dv_device *device, + struct v3dv_render_pass *pass, + struct v3dv_subpass *subpass) +{ + if (device->devinfo.ver > 42 || + subpass->ds_attachment.attachment == VK_ATTACHMENT_UNUSED) { + return; + } + + struct v3dv_render_pass_attachment *att = + &pass->attachments[subpass->ds_attachment.attachment]; + if (att->desc.format != VK_FORMAT_D24_UNORM_S8_UINT) + return; + + if (att->desc.loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR && + att->desc.stencilLoadOp == VK_ATTACHMENT_LOAD_OP_LOAD) { + subpass->do_depth_clear_with_draw = true; + } else if (att->desc.loadOp == VK_ATTACHMENT_LOAD_OP_LOAD && + att->desc.stencilLoadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) { + subpass->do_stencil_clear_with_draw = true; + } +} VKAPI_ATTR VkResult VKAPI_CALL -v3dv_CreateRenderPass(VkDevice _device, - const VkRenderPassCreateInfo *pCreateInfo, - const VkAllocationCallbacks *pAllocator, - VkRenderPass *pRenderPass) +v3dv_CreateRenderPass2(VkDevice _device, + const VkRenderPassCreateInfo2 *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkRenderPass *pRenderPass) { V3DV_FROM_HANDLE(v3dv_device, device, _device); struct v3dv_render_pass *pass; - assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO); + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2); - const VkRenderPassMultiviewCreateInfo *multiview_info = - vk_find_struct_const(pCreateInfo->pNext, RENDER_PASS_MULTIVIEW_CREATE_INFO); - bool multiview_enabled = multiview_info && multiview_info->subpassCount > 0; + /* From the VK_KHR_multiview spec: + * + * When a subpass uses a non-zero view mask, multiview functionality is + * considered to be enabled. Multiview is all-or-nothing for a render + * pass - that is, either all subpasses must have a non-zero view mask + * (though some subpasses may have only one view) or all must be zero. 
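+ * + * Because of that all-or-nothing rule, checking the view mask of the + * first subpass below is enough to tell whether the whole render pass + * is multiview.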
+ */ + bool multiview_enabled = pCreateInfo->subpassCount && + pCreateInfo->pSubpasses[0].viewMask; size_t size = sizeof(*pass); size_t subpasses_offset = size; @@ -143,7 +182,7 @@ v3dv_CreateRenderPass(VkDevice _device, pass = vk_object_zalloc(&device->vk, pAllocator, size, VK_OBJECT_TYPE_RENDER_PASS); if (pass == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); pass->multiview_enabled = multiview_enabled; pass->attachment_count = pCreateInfo->attachmentCount; @@ -156,7 +195,7 @@ v3dv_CreateRenderPass(VkDevice _device, uint32_t subpass_attachment_count = 0; for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) { - const VkSubpassDescription *desc = &pCreateInfo->pSubpasses[i]; + const VkSubpassDescription2 *desc = &pCreateInfo->pSubpasses[i]; subpass_attachment_count += num_subpass_attachments(desc); } @@ -168,7 +207,7 @@ v3dv_CreateRenderPass(VkDevice _device, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (pass->subpass_attachments == NULL) { vk_object_free(&device->vk, pAllocator, pass); - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } } else { pass->subpass_attachments = NULL; @@ -176,13 +215,12 @@ v3dv_CreateRenderPass(VkDevice _device, struct v3dv_subpass_attachment *p = pass->subpass_attachments; for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) { - const VkSubpassDescription *desc = &pCreateInfo->pSubpasses[i]; + const VkSubpassDescription2 *desc = &pCreateInfo->pSubpasses[i]; struct v3dv_subpass *subpass = &pass->subpasses[i]; subpass->input_count = desc->inputAttachmentCount; subpass->color_count = desc->colorAttachmentCount; - if (multiview_enabled) - subpass->view_mask = multiview_info->pViewMasks[i]; + subpass->view_mask = desc->viewMask; if (desc->inputAttachmentCount > 0) { subpass->input_attachments = p; @@ -226,27 +264,38 @@ v3dv_CreateRenderPass(VkDevice _device, .layout = desc->pDepthStencilAttachment->layout, }; - /* GFXH-1461: if depth is cleared but stencil is loaded (or viceversa), - * the clear might get lost. If a subpass has this then we can't emit - * the clear using the TLB and we have to do it as a draw call. - * - * FIXME: separate stencil. - */ - if (subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) { - struct v3dv_render_pass_attachment *att = - &pass->attachments[subpass->ds_attachment.attachment]; - if (att->desc.format == VK_FORMAT_D24_UNORM_S8_UINT) { - if (att->desc.loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR && - att->desc.stencilLoadOp == VK_ATTACHMENT_LOAD_OP_LOAD) { - subpass->do_depth_clear_with_draw = true; - } else if (att->desc.loadOp == VK_ATTACHMENT_LOAD_OP_LOAD && - att->desc.stencilLoadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) { - subpass->do_stencil_clear_with_draw = true; - } - } + check_do_depth_stencil_clear_with_draw(device, pass, subpass); + + /* VK_KHR_depth_stencil_resolve */ + const VkSubpassDescriptionDepthStencilResolve *resolve_desc = + vk_find_struct_const(desc->pNext, SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE); + const VkAttachmentReference2 *resolve_att = + resolve_desc && resolve_desc->pDepthStencilResolveAttachment && + resolve_desc->pDepthStencilResolveAttachment->attachment != VK_ATTACHMENT_UNUSED ? 
+ resolve_desc->pDepthStencilResolveAttachment : NULL; + if (resolve_att) { + subpass->ds_resolve_attachment = (struct v3dv_subpass_attachment) { + .attachment = resolve_att->attachment, + .layout = resolve_att->layout, + }; + assert(resolve_desc->depthResolveMode == VK_RESOLVE_MODE_SAMPLE_ZERO_BIT || + resolve_desc->stencilResolveMode == VK_RESOLVE_MODE_SAMPLE_ZERO_BIT); + subpass->resolve_depth = + resolve_desc->depthResolveMode != VK_RESOLVE_MODE_NONE && + resolve_att->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT; + subpass->resolve_stencil = + resolve_desc->stencilResolveMode != VK_RESOLVE_MODE_NONE && + resolve_att->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT; + } else { + subpass->ds_resolve_attachment.attachment = VK_ATTACHMENT_UNUSED; + subpass->resolve_depth = false; + subpass->resolve_stencil = false; } } else { subpass->ds_attachment.attachment = VK_ATTACHMENT_UNUSED; + subpass->ds_resolve_attachment.attachment = VK_ATTACHMENT_UNUSED; + subpass->resolve_depth = false; + subpass->resolve_stencil = false; } } @@ -280,50 +329,44 @@ subpass_get_granularity(struct v3dv_device *device, uint32_t subpass_idx, VkExtent2D *granularity) { - static const uint8_t tile_sizes[] = { - 64, 64, - 64, 32, - 32, 32, - 32, 16, - 16, 16, - 16, 8, - 8, 8 - }; - - /* Our tile size depends on the number of color attachments and the maximum - * bpp across them. - */ + /* Granularity is defined by the tile size */ assert(subpass_idx < pass->subpass_count); struct v3dv_subpass *subpass = &pass->subpasses[subpass_idx]; - const uint32_t color_attachment_count = subpass->color_count; + const uint32_t color_count = subpass->color_count; + bool msaa = false; uint32_t max_internal_bpp = 0; - for (uint32_t i = 0; i < color_attachment_count; i++) { + uint32_t total_color_bpp = 0; + for (uint32_t i = 0; i < color_count; i++) { uint32_t attachment_idx = subpass->color_attachments[i].attachment; if (attachment_idx == VK_ATTACHMENT_UNUSED) continue; - const VkAttachmentDescription *desc = + const VkAttachmentDescription2 *desc = &pass->attachments[attachment_idx].desc; const struct v3dv_format *format = v3dv_X(device, get_format)(desc->format); uint32_t internal_type, internal_bpp; + /* We don't support rendering to YCbCr images */ + assert(format->plane_count == 1); v3dv_X(device, get_internal_type_bpp_for_output_format) - (format->rt_type, &internal_type, &internal_bpp); + (format->planes[0].rt_type, &internal_type, &internal_bpp); max_internal_bpp = MAX2(max_internal_bpp, internal_bpp); - } - - uint32_t idx = 0; - if (color_attachment_count > 2) - idx += 2; - else if (color_attachment_count > 1) - idx += 1; + total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp); - idx += max_internal_bpp; + if (desc->samples > VK_SAMPLE_COUNT_1_BIT) + msaa = true; + } - assert(idx < ARRAY_SIZE(tile_sizes)); + /* If requested, double-buffer may or may not be enabled depending on + * heuristics so we choose a conservative granularity here, with it disabled. 
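+ * + * Multiples of the full-size tile are also multiples of the smaller + * tile used when double-buffer does get enabled, so a render area + * aligned to this granularity should stay tile-aligned either way.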
+ */ + uint32_t width, height; + v3d_choose_tile_size(&device->devinfo, color_count, + max_internal_bpp, total_color_bpp, msaa, + false /* double-buffer */, &width, &height); *granularity = (VkExtent2D) { - .width = tile_sizes[idx * 2], - .height = tile_sizes[idx * 2 + 1] + .width = width, + .height = height }; } @@ -390,3 +433,264 @@ v3dv_subpass_area_is_tile_aligned(struct v3dv_device *device, (fb->has_edge_padding && area->offset.y + area->extent.height >= fb->height)); } + +static void +setup_dynamic_attachment(struct v3dv_device *device, + struct v3dv_render_pass_attachment *att, + const VkRenderingAttachmentInfo *info, + bool is_stencil, + bool is_resolve) +{ + struct v3dv_image_view *view = v3dv_image_view_from_handle(info->imageView); + + VkAttachmentLoadOp load_op, stencil_load_op; + VkAttachmentStoreOp store_op, stencil_store_op; + + if (!is_stencil) { + stencil_load_op = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + stencil_store_op = VK_ATTACHMENT_STORE_OP_DONT_CARE; + if (!is_resolve) { + load_op = info->loadOp; + store_op = info->storeOp; + } else { + load_op = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + store_op = VK_ATTACHMENT_STORE_OP_STORE; + } + } else { + load_op = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + store_op = VK_ATTACHMENT_STORE_OP_DONT_CARE; + if (!is_resolve) { + stencil_load_op = info->loadOp; + stencil_store_op = info->storeOp; + } else { + stencil_load_op = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + stencil_store_op = VK_ATTACHMENT_STORE_OP_STORE; + } + } + + att->desc = (VkAttachmentDescription2) { + .sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2, + .flags = 0, + .format = view->vk.format, + .samples = view->vk.image->samples, + .loadOp = load_op, + .storeOp = store_op, + .stencilLoadOp = stencil_load_op, + .stencilStoreOp = stencil_store_op, + .initialLayout = info->imageLayout, + .finalLayout = info->imageLayout, + }; + + if (is_resolve) + set_try_tlb_resolve(device, att); +} + +void +v3dv_setup_dynamic_render_pass(struct v3dv_cmd_buffer *cmd_buffer, + const VkRenderingInfoKHR *info) +{ + struct v3dv_device *device = cmd_buffer->device; + struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + + struct v3dv_render_pass *pass = &state->dynamic_pass; + struct v3dv_subpass *subpass = &state->dynamic_subpass; + struct v3dv_render_pass_attachment *pass_attachments = + &state->dynamic_attachments[0]; + struct v3dv_subpass_attachment *subpass_attachments = + &state->dynamic_subpass_attachments[0]; + + memset(pass, 0, sizeof(*pass)); + memset(subpass, 0, sizeof(*subpass)); + memset(pass_attachments, 0, sizeof(state->dynamic_subpass_attachments)); + memset(subpass_attachments, 0, sizeof(state->dynamic_subpass_attachments)); + + vk_object_base_init(&device->vk, (struct vk_object_base *) pass, + VK_OBJECT_TYPE_RENDER_PASS); + + pass->attachments = pass_attachments; + pass->subpass_attachments = subpass_attachments; + + subpass->view_mask = info->viewMask; + subpass->color_count = info->colorAttachmentCount; + subpass->color_attachments = &subpass_attachments[0]; + subpass->resolve_attachments = &subpass_attachments[subpass->color_count]; + + pass->multiview_enabled = info->viewMask != 0; + pass->subpass_count = 1; + pass->subpasses = subpass; + + int a = 0; + for (int i = 0; i < info->colorAttachmentCount; i++) { + struct v3dv_render_pass_attachment *att = &pass->attachments[a]; + const VkRenderingAttachmentInfo *att_info = &info->pColorAttachments[i]; + + if (att_info->imageView == VK_NULL_HANDLE) { + subpass->color_attachments[i].attachment = VK_ATTACHMENT_UNUSED; + 
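/* A VK_NULL_HANDLE imageView marks the attachment as unused; its + * paired resolve slot must be marked unused as well. */ +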
subpass->resolve_attachments[i].attachment = VK_ATTACHMENT_UNUSED; + continue; + } + + setup_dynamic_attachment(device, att, att_info, false, false); + subpass->color_attachments[i].attachment = a++; + subpass->color_attachments[i].layout = att_info->imageLayout; + + if (att_info->resolveMode != VK_RESOLVE_MODE_NONE) { + struct v3dv_render_pass_attachment *resolve_att = &pass->attachments[a]; + setup_dynamic_attachment(device, resolve_att, att_info, false, true); + subpass->resolve_attachments[i].attachment = a++; + subpass->resolve_attachments[i].layout = att_info->resolveImageLayout; + } else { + subpass->resolve_attachments[i].attachment = VK_ATTACHMENT_UNUSED; + } + } + + bool has_depth = info->pDepthAttachment && + info->pDepthAttachment->imageView != VK_NULL_HANDLE; + bool has_stencil = info->pStencilAttachment && + info->pStencilAttachment->imageView != VK_NULL_HANDLE; + if (has_depth || has_stencil) { + struct v3dv_render_pass_attachment *att = &pass->attachments[a]; + subpass->ds_attachment.attachment = a++; + + bool has_depth_resolve = false; + bool has_stencil_resolve = false; + + if (has_depth) { + setup_dynamic_attachment(device, att, info->pDepthAttachment, + false, false); + subpass->ds_attachment.layout = info->pDepthAttachment->imageLayout; + has_depth_resolve = + info->pDepthAttachment->resolveMode != VK_RESOLVE_MODE_NONE; + } + + if (has_stencil) { + if (has_depth) { + att->desc.stencilLoadOp = info->pStencilAttachment->loadOp; + att->desc.stencilStoreOp = info->pStencilAttachment->storeOp; + } else { + setup_dynamic_attachment(device, att, info->pStencilAttachment, + true, false); + subpass->ds_attachment.layout = + info->pStencilAttachment->imageLayout; + } + has_stencil_resolve = + info->pStencilAttachment->resolveMode != VK_RESOLVE_MODE_NONE; + } + + if (has_depth_resolve || has_stencil_resolve) { + struct v3dv_render_pass_attachment *att = &pass->attachments[a]; + subpass->ds_resolve_attachment.attachment = a++; + if (has_depth_resolve) { + setup_dynamic_attachment(device, att, info->pDepthAttachment, + false, true); + subpass->ds_resolve_attachment.layout = + info->pDepthAttachment->resolveImageLayout; + subpass->resolve_depth = true; + } + if (has_stencil_resolve) { + if (has_depth_resolve) { + att->desc.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + att->desc.stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE; + } else { + setup_dynamic_attachment(device, att, info->pStencilAttachment, + true, true); + subpass->ds_resolve_attachment.layout = + info->pStencilAttachment->resolveImageLayout; + } + subpass->resolve_stencil = true; + } + } else { + subpass->ds_resolve_attachment.attachment = VK_ATTACHMENT_UNUSED; + } + } else { + subpass->ds_attachment.attachment = VK_ATTACHMENT_UNUSED; + } + + check_do_depth_stencil_clear_with_draw(device, pass, subpass); + + pass->attachment_count = a; +} + +void +v3dv_setup_dynamic_render_pass_inheritance(struct v3dv_cmd_buffer *cmd_buffer, + const VkCommandBufferInheritanceRenderingInfo *info) +{ + struct v3dv_device *device = cmd_buffer->device; + struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + + struct v3dv_render_pass *pass = &state->dynamic_pass; + struct v3dv_subpass *subpass = &state->dynamic_subpass; + struct v3dv_render_pass_attachment *pass_attachments = + &state->dynamic_attachments[0]; + struct v3dv_subpass_attachment *subpass_attachments = + &state->dynamic_subpass_attachments[0]; + + memset(pass, 0, sizeof(*pass)); + memset(subpass, 0, sizeof(*subpass)); + memset(pass_attachments, 0, 
sizeof(state->dynamic_subpass_attachments)); + memset(subpass_attachments, 0, sizeof(state->dynamic_subpass_attachments)); + + vk_object_base_init(&device->vk, (struct vk_object_base *) pass, + VK_OBJECT_TYPE_RENDER_PASS); + + pass->attachments = pass_attachments; + pass->subpass_attachments = subpass_attachments; + + subpass->view_mask = info->viewMask; + subpass->color_count = info->colorAttachmentCount; + subpass->color_attachments = &subpass_attachments[0]; + subpass->resolve_attachments = NULL; + + pass->multiview_enabled = info->viewMask != 0; + pass->subpass_count = 1; + pass->subpasses = subpass; + + int a = 0; + for (int i = 0; i < info->colorAttachmentCount; i++) { + struct v3dv_render_pass_attachment *att = &pass->attachments[a]; + const VkFormat format = info->pColorAttachmentFormats[i]; + + if (format == VK_FORMAT_UNDEFINED) { + subpass->color_attachments[i].attachment = VK_ATTACHMENT_UNUSED; + continue; + } + + /* We don't have info about load/store, so we assume we load and we + * store. + */ + att->desc.format = format; + att->desc.samples = info->rasterizationSamples; + att->desc.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; + att->desc.storeOp = VK_ATTACHMENT_STORE_OP_STORE; + att->desc.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + att->desc.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; + subpass->color_attachments[i].attachment = a++; + } + + if (info->depthAttachmentFormat != VK_FORMAT_UNDEFINED || + info->stencilAttachmentFormat != VK_FORMAT_UNDEFINED) { + struct v3dv_render_pass_attachment *att = &pass->attachments[a]; + att->desc.format = info->depthAttachmentFormat != VK_FORMAT_UNDEFINED ? + info->depthAttachmentFormat : info->stencilAttachmentFormat; + att->desc.samples = info->rasterizationSamples; + if (vk_format_has_depth(att->desc.format)) { + att->desc.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; + att->desc.storeOp = VK_ATTACHMENT_STORE_OP_STORE; + } else { + att->desc.loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + att->desc.storeOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; + } + if (vk_format_has_stencil(att->desc.format)) { + att->desc.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD; + att->desc.stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE; + } else { + att->desc.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + att->desc.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; + } + subpass->ds_attachment.attachment = a++; + } else { + subpass->ds_attachment.attachment = VK_ATTACHMENT_UNUSED; + } + + pass->attachment_count = a; +} diff --git a/src/broadcom/vulkan/v3dv_pipeline.c b/src/broadcom/vulkan/v3dv_pipeline.c index 44962c50508..9851a24c2cd 100644 --- a/src/broadcom/vulkan/v3dv_pipeline.c +++ b/src/broadcom/vulkan/v3dv_pipeline.c @@ -1,5 +1,5 @@ /* - * Copyright © 2019 Raspberry Pi + * Copyright © 2019 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -26,18 +26,18 @@ #include "v3dv_debug.h" #include "v3dv_private.h" -#include "vk_format_info.h" - #include "common/v3d_debug.h" +#include "qpu/qpu_disasm.h" #include "compiler/nir/nir_builder.h" #include "nir/nir_serialize.h" #include "util/u_atomic.h" -#include "util/u_prim.h" #include "util/os_time.h" -#include "vulkan/util/vk_format.h" +#include "vk_format.h" +#include "vk_nir_convert_ycbcr.h" +#include "vk_pipeline.h" static VkResult compute_vpm_config(struct v3dv_pipeline *pipeline); @@ -61,31 +61,15 @@ v3dv_print_v3d_key(struct v3d_key *key, } static void -pipeline_compute_sha1_from_nir(nir_shader 
*nir, - unsigned char sha1[20]) -{ - assert(nir); - struct blob blob; - blob_init(&blob); - - nir_serialize(&blob, nir, false); - if (!blob.out_of_memory) - _mesa_sha1_compute(blob.data, blob.size, sha1); - - blob_finish(&blob); -} - -void -v3dv_shader_module_internal_init(struct v3dv_device *device, - struct vk_shader_module *module, - nir_shader *nir) +pipeline_compute_sha1_from_nir(struct v3dv_pipeline_stage *p_stage) { - vk_object_base_init(&device->vk, &module->base, - VK_OBJECT_TYPE_SHADER_MODULE); - module->nir = nir; - module->size = 0; + VkPipelineShaderStageCreateInfo info = { + .module = vk_shader_module_handle_from_nir(p_stage->nir), + .pName = p_stage->entrypoint, + .stage = mesa_to_vk_shader_stage(p_stage->nir->info.stage), + }; - pipeline_compute_sha1_from_nir(nir, module->sha1); + vk_pipeline_hash_shader_stage(&info, NULL, p_stage->shader_sha1); } void @@ -95,6 +79,10 @@ v3dv_shader_variant_destroy(struct v3dv_device *device, /* The assembly BO is shared by all variants in the pipeline, so it can't * be freed here and should be freed with the pipeline */ + if (variant->qpu_insts) { + free(variant->qpu_insts); + variant->qpu_insts = NULL; + } ralloc_free(variant->prog_data.base); vk_free(&device->vk.alloc, variant); } @@ -118,22 +106,10 @@ pipeline_free_stages(struct v3dv_device *device, { assert(pipeline); - /* FIXME: we can't just use a loop over mesa stage due the bin, would be - * good to find an alternative. - */ - destroy_pipeline_stage(device, pipeline->vs, pAllocator); - destroy_pipeline_stage(device, pipeline->vs_bin, pAllocator); - destroy_pipeline_stage(device, pipeline->gs, pAllocator); - destroy_pipeline_stage(device, pipeline->gs_bin, pAllocator); - destroy_pipeline_stage(device, pipeline->fs, pAllocator); - destroy_pipeline_stage(device, pipeline->cs, pAllocator); - - pipeline->vs = NULL; - pipeline->vs_bin = NULL; - pipeline->gs = NULL; - pipeline->gs_bin = NULL; - pipeline->fs = NULL; - pipeline->cs = NULL; + for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) { + destroy_pipeline_stage(device, pipeline->stages[stage], pAllocator); + pipeline->stages[stage] = NULL; + } } static void @@ -161,6 +137,12 @@ v3dv_destroy_pipeline(struct v3dv_pipeline *pipeline, pipeline->default_attribute_values = NULL; } + if (pipeline->executables.mem_ctx) + ralloc_free(pipeline->executables.mem_ctx); + + if (pipeline->layout) + v3dv_pipeline_layout_unref(device, pipeline->layout, pAllocator); + vk_object_free(&device->vk, pAllocator, pipeline); } @@ -181,31 +163,44 @@ v3dv_DestroyPipeline(VkDevice _device, static const struct spirv_to_nir_options default_spirv_options = { .caps = { .device_group = true, + .float_controls = true, .multiview = true, + .storage_8bit = true, + .storage_16bit = true, + .subgroup_ballot = true, .subgroup_basic = true, + .subgroup_quad = true, + .subgroup_shuffle = true, + .subgroup_vote = true, .variable_pointers = true, + .vk_memory_model = true, + .vk_memory_model_device_scope = true, + .physical_storage_buffer_address = true, + .workgroup_memory_explicit_layout = true, + .image_read_without_format = true, + .demote_to_helper_invocation = true, }, .ubo_addr_format = nir_address_format_32bit_index_offset, .ssbo_addr_format = nir_address_format_32bit_index_offset, - .phys_ssbo_addr_format = nir_address_format_64bit_global, + .phys_ssbo_addr_format = nir_address_format_2x32bit_global, .push_const_addr_format = nir_address_format_logical, .shared_addr_format = nir_address_format_32bit_offset, - .frag_coord_is_sysval = false, }; const 
nir_shader_compiler_options v3dv_nir_options = { .lower_uadd_sat = true, + .lower_usub_sat = true, .lower_iadd_sat = true, .lower_all_io_to_temps = true, .lower_extract_byte = true, .lower_extract_word = true, .lower_insert_byte = true, .lower_insert_word = true, - .lower_bitfield_insert_to_shifts = true, - .lower_bitfield_extract_to_shifts = true, + .lower_bitfield_insert = true, + .lower_bitfield_extract = true, .lower_bitfield_reverse = true, .lower_bit_count = true, - .lower_cs_local_id_from_index = true, + .lower_cs_local_id_to_index = true, .lower_ffract = true, .lower_fmod = true, .lower_pack_unorm_2x16 = true, @@ -218,14 +213,9 @@ const nir_shader_compiler_options v3dv_nir_options = { .lower_unpack_snorm_4x8 = true, .lower_pack_half_2x16 = true, .lower_unpack_half_2x16 = true, - /* FIXME: see if we can avoid the uadd_carry and usub_borrow lowering and - * get the tests to pass since it might produce slightly better code. - */ - .lower_uadd_carry = true, - .lower_usub_borrow = true, - /* FIXME: check if we can use multop + umul24 to implement mul2x32_64 - * without lowering. - */ + .lower_pack_32_2x16 = true, + .lower_pack_32_2x16_split = true, + .lower_unpack_32_2x16_split = true, .lower_mul_2x32_64 = true, .lower_fdiv = true, .lower_find_lsb = true, @@ -240,10 +230,10 @@ const nir_shader_compiler_options v3dv_nir_options = { .lower_isign = true, .lower_ldexp = true, .lower_mul_high = true, - .lower_wpos_pntc = true, - .lower_rotate = true, + .lower_wpos_pntc = false, .lower_to_scalar = true, .lower_device_index_to_zero = true, + .lower_fquantize2f16 = true, .has_fsub = true, .has_isub = true, .vertex_id_zero_based = false, /* FIXME: to set this to true, the intrinsic @@ -252,7 +242,7 @@ const nir_shader_compiler_options v3dv_nir_options = { .max_unroll_iterations = 16, .force_indirect_unrolling = (nir_var_shader_in | nir_var_function_temp), .divergence_analysis_options = - nir_divergence_multiple_workgroup_per_compute_subgroup + nir_divergence_multiple_workgroup_per_compute_subgroup, }; const nir_shader_compiler_options * @@ -261,95 +251,39 @@ v3dv_pipeline_get_nir_options(void) return &v3dv_nir_options; } -#define OPT(pass, ...) ({ \ - bool this_progress = false; \ - NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \ - if (this_progress) \ - progress = true; \ - this_progress; \ -}) - -static void -nir_optimize(nir_shader *nir, bool allow_copies) -{ - bool progress; - - do { - progress = false; - OPT(nir_split_array_vars, nir_var_function_temp); - OPT(nir_shrink_vec_array_vars, nir_var_function_temp); - OPT(nir_opt_deref); - OPT(nir_lower_vars_to_ssa); - if (allow_copies) { - /* Only run this pass in the first call to nir_optimize. Later calls - * assume that we've lowered away any copy_deref instructions and we - * don't want to introduce any more. - */ - OPT(nir_opt_find_array_copies); - } - OPT(nir_opt_copy_prop_vars); - OPT(nir_opt_dead_write_vars); - OPT(nir_opt_combine_stores, nir_var_all); - - OPT(nir_lower_alu_to_scalar, NULL, NULL); - - OPT(nir_copy_prop); - OPT(nir_lower_phis_to_scalar, false); - - OPT(nir_copy_prop); - OPT(nir_opt_dce); - OPT(nir_opt_cse); - OPT(nir_opt_combine_stores, nir_var_all); - - /* Passing 0 to the peephole select pass causes it to convert - * if-statements that contain only move instructions in the branches - * regardless of the count. - * - * Passing 1 to the peephole select pass causes it to convert - * if-statements that contain at most a single ALU instruction (total) - * in both branches. 
- */ - OPT(nir_opt_peephole_select, 0, false, false); - OPT(nir_opt_peephole_select, 8, false, true); - - OPT(nir_opt_intrinsics); - OPT(nir_opt_idiv_const, 32); - OPT(nir_opt_algebraic); - OPT(nir_opt_constant_folding); - - OPT(nir_opt_dead_cf); +static const struct vk_ycbcr_conversion_state * +lookup_ycbcr_conversion(const void *_pipeline_layout, uint32_t set, + uint32_t binding, uint32_t array_index) +{ + struct v3dv_pipeline_layout *pipeline_layout = + (struct v3dv_pipeline_layout *) _pipeline_layout; - OPT(nir_opt_if, false); - OPT(nir_opt_conditional_discard); + assert(set < pipeline_layout->num_sets); + struct v3dv_descriptor_set_layout *set_layout = + pipeline_layout->set[set].layout; - OPT(nir_opt_remove_phis); - OPT(nir_opt_undef); - OPT(nir_lower_pack); - } while (progress); + assert(binding < set_layout->binding_count); + struct v3dv_descriptor_set_binding_layout *bind_layout = + &set_layout->binding[binding]; - OPT(nir_remove_dead_variables, nir_var_function_temp, NULL); + if (bind_layout->immutable_samplers_offset) { + const struct v3dv_sampler *immutable_samplers = + v3dv_immutable_samplers(set_layout, bind_layout); + const struct v3dv_sampler *sampler = &immutable_samplers[array_index]; + return sampler->conversion ? &sampler->conversion->state : NULL; + } else { + return NULL; + } } static void preprocess_nir(nir_shader *nir) { - /* We have to lower away local variable initializers right before we - * inline functions. That way they get properly initialized at the top - * of the function and not at the top of its caller. - */ - NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp); - NIR_PASS_V(nir, nir_lower_returns); - NIR_PASS_V(nir, nir_inline_functions); - NIR_PASS_V(nir, nir_opt_deref); - - /* Pick off the single entrypoint that we want */ - foreach_list_typed_safe(nir_function, func, node, &nir->functions) { - if (func->is_entrypoint) - func->name = ralloc_strdup(func, "main"); - else - exec_node_remove(&func->node); - } - assert(exec_list_length(&nir->functions) == 1); + const struct nir_lower_sysvals_to_varyings_options sysvals_to_varyings = { + .frag_coord = true, + .point_coord = true, + }; + NIR_PASS(_, nir, nir_lower_sysvals_to_varyings, &sysvals_to_varyings); /* Vulkan uses the separate-shader linking model */ nir->info.separate_shader = true; @@ -357,76 +291,63 @@ preprocess_nir(nir_shader *nir) /* Make sure we lower variable initializers on output variables so that * nir_remove_dead_variables below sees the corresponding stores */ - NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_shader_out); - - /* Now that we've deleted all but the main function, we can go ahead and - * lower the rest of the variable initializers. - */ - NIR_PASS_V(nir, nir_lower_variable_initializers, ~0); - - /* Split member structs. We do this before lower_io_to_temporaries so that - * it doesn't lower system values to temporaries by accident. 
- */ - NIR_PASS_V(nir, nir_split_var_copies); - NIR_PASS_V(nir, nir_split_per_member_structs); + NIR_PASS(_, nir, nir_lower_variable_initializers, nir_var_shader_out); if (nir->info.stage == MESA_SHADER_FRAGMENT) - NIR_PASS_V(nir, nir_lower_io_to_vector, nir_var_shader_out); + NIR_PASS(_, nir, nir_lower_io_to_vector, nir_var_shader_out); if (nir->info.stage == MESA_SHADER_FRAGMENT) { - NIR_PASS_V(nir, nir_lower_input_attachments, + NIR_PASS(_, nir, nir_lower_input_attachments, &(nir_input_attachment_options) { .use_fragcoord_sysval = false, }); } - NIR_PASS_V(nir, nir_lower_explicit_io, - nir_var_mem_push_const, - nir_address_format_32bit_offset); + NIR_PASS_V(nir, nir_lower_io_to_temporaries, + nir_shader_get_entrypoint(nir), true, false); - NIR_PASS_V(nir, nir_lower_explicit_io, - nir_var_mem_ubo | nir_var_mem_ssbo, - nir_address_format_32bit_index_offset); + NIR_PASS(_, nir, nir_lower_system_values); - NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_shader_in | - nir_var_shader_out | nir_var_system_value | nir_var_mem_shared, - NULL); + NIR_PASS(_, nir, nir_lower_alu_to_scalar, NULL, NULL); - NIR_PASS_V(nir, nir_propagate_invariant, false); - NIR_PASS_V(nir, nir_lower_io_to_temporaries, - nir_shader_get_entrypoint(nir), true, false); + NIR_PASS(_, nir, nir_normalize_cubemap_coords); - NIR_PASS_V(nir, nir_lower_system_values); - NIR_PASS_V(nir, nir_lower_clip_cull_distance_arrays); + NIR_PASS(_, nir, nir_lower_global_vars_to_local); - NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL); + NIR_PASS(_, nir, nir_split_var_copies); + NIR_PASS(_, nir, nir_split_struct_vars, nir_var_function_temp); - NIR_PASS_V(nir, nir_normalize_cubemap_coords); + v3d_optimize_nir(NULL, nir); - NIR_PASS_V(nir, nir_lower_global_vars_to_local); + NIR_PASS(_, nir, nir_lower_explicit_io, + nir_var_mem_push_const, + nir_address_format_32bit_offset); - NIR_PASS_V(nir, nir_split_var_copies); - NIR_PASS_V(nir, nir_split_struct_vars, nir_var_function_temp); + NIR_PASS(_, nir, nir_lower_explicit_io, + nir_var_mem_ubo | nir_var_mem_ssbo, + nir_address_format_32bit_index_offset); - nir_optimize(nir, true); + NIR_PASS(_, nir, nir_lower_explicit_io, + nir_var_mem_global, + nir_address_format_2x32bit_global); - NIR_PASS_V(nir, nir_lower_load_const_to_scalar); + NIR_PASS(_, nir, nir_lower_load_const_to_scalar); /* Lower a bunch of stuff */ - NIR_PASS_V(nir, nir_lower_var_copies); + NIR_PASS(_, nir, nir_lower_var_copies); - NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_shader_in, UINT32_MAX); + NIR_PASS(_, nir, nir_lower_indirect_derefs, nir_var_shader_in, UINT32_MAX); - NIR_PASS_V(nir, nir_lower_indirect_derefs, - nir_var_function_temp, 2); + NIR_PASS(_, nir, nir_lower_indirect_derefs, + nir_var_function_temp, 2); - NIR_PASS_V(nir, nir_lower_array_deref_of_vec, - nir_var_mem_ubo | nir_var_mem_ssbo, - nir_lower_direct_array_deref_of_vec_load); + NIR_PASS(_, nir, nir_lower_array_deref_of_vec, + nir_var_mem_ubo | nir_var_mem_ssbo, + nir_lower_direct_array_deref_of_vec_load); - NIR_PASS_V(nir, nir_lower_frexp); + NIR_PASS(_, nir, nir_lower_frexp); /* Get rid of split copies */ - nir_optimize(nir, false); + v3d_optimize_nir(NULL, nir); } static nir_shader * @@ -435,42 +356,35 @@ shader_module_compile_to_nir(struct v3dv_device *device, { nir_shader *nir; const nir_shader_compiler_options *nir_options = &v3dv_nir_options; + gl_shader_stage gl_stage = broadcom_shader_stage_to_gl(stage->stage); - if (!stage->module->nir) { - uint32_t *spirv = (uint32_t *) stage->module->data; - assert(stage->module->size % 4 == 0); - - if 
(V3D_DEBUG & V3D_DEBUG_DUMP_SPIRV) - v3dv_print_spirv(stage->module->data, stage->module->size, stderr); - - uint32_t num_spec_entries = 0; - struct nir_spirv_specialization *spec_entries = - vk_spec_info_to_nir_spirv(stage->spec_info, &num_spec_entries); - const struct spirv_to_nir_options spirv_options = default_spirv_options; - nir = spirv_to_nir(spirv, stage->module->size / 4, - spec_entries, num_spec_entries, - broadcom_shader_stage_to_gl(stage->stage), - stage->entrypoint, - &spirv_options, nir_options); - assert(nir); - nir_validate_shader(nir, "after spirv_to_nir"); - free(spec_entries); - } else { - /* For NIR modules created by the driver we can't consume the NIR - * directly, we need to clone it first, since ownership of the NIR code - * (as with SPIR-V code for SPIR-V shaders), belongs to the creator - * of the module and modules can be destroyed immediately after been used - * to create pipelines. - */ - nir = nir_shader_clone(NULL, stage->module->nir); - nir_validate_shader(nir, "nir module"); + + if (V3D_DBG(DUMP_SPIRV) && stage->module->nir == NULL) + v3dv_print_spirv(stage->module->data, stage->module->size, stderr); + + /* vk_shader_module_to_nir also handles internal shaders, when module->nir + * != NULL. It also calls nir_validate_shader in both cases, so we don't + * call it again here. + */ + VkResult result = vk_shader_module_to_nir(&device->vk, stage->module, + gl_stage, + stage->entrypoint, + stage->spec_info, + &default_spirv_options, + nir_options, + NULL, &nir); + if (result != VK_SUCCESS) + return NULL; + assert(nir->info.stage == gl_stage); + + if (V3D_DBG(SHADERDB) && stage->module->nir == NULL) { + char sha1buf[41]; + _mesa_sha1_format(sha1buf, stage->pipeline->sha1); + nir->info.name = ralloc_strdup(nir, sha1buf); } - assert(nir->info.stage == broadcom_shader_stage_to_gl(stage->stage)); - if (V3D_DEBUG & (V3D_DEBUG_NIR | - v3d_debug_flag_for_shader_stage( - broadcom_shader_stage_to_gl(stage->stage)))) { - fprintf(stderr, "Initial form: %s prog %d NIR:\n", + if (V3D_DBG(NIR) || v3d_debug_flag_for_shader_stage(gl_stage)) { + fprintf(stderr, "NIR after vk_shader_module_to_nir: %s prog %d NIR:\n", broadcom_shader_stage_name(stage->stage), stage->program_id); nir_print_shader(nir, stderr); @@ -497,17 +411,21 @@ descriptor_map_add(struct v3dv_descriptor_map *map, int binding, int array_index, int array_size, - uint8_t return_size) + int start_index, + uint8_t return_size, + uint8_t plane) { assert(array_index < array_size); assert(return_size == 16 || return_size == 32); - unsigned index = 0; - for (unsigned i = 0; i < map->num_desc; i++) { - if (set == map->set[i] && - binding == map->binding[i] && - array_index == map->array_index[i]) { - assert(array_size == map->array_size[i]); + unsigned index = start_index; + for (; index < map->num_desc; index++) { + if (map->used[index] && + set == map->set[index] && + binding == map->binding[index] && + array_index == map->array_index[index] && + plane == map->plane[index]) { + assert(array_size == map->array_size[index]); if (return_size != map->return_size[index]) { /* If the return_size is different it means that the same sampler * was used for operations with different precision @@ -517,26 +435,36 @@ descriptor_map_add(struct v3dv_descriptor_map *map, map->return_size[index] = 32; } return index; + } else if (!map->used[index]) { + break; } - index++; } - assert(index == map->num_desc); + assert(index < DESCRIPTOR_MAP_SIZE); + assert(!map->used[index]); - map->set[map->num_desc] = set; - map->binding[map->num_desc] =
binding; - map->array_index[map->num_desc] = array_index; - map->array_size[map->num_desc] = array_size; - map->return_size[map->num_desc] = return_size; - map->num_desc++; + map->used[index] = true; + map->set[index] = set; + map->binding[index] = binding; + map->array_index[index] = array_index; + map->array_size[index] = array_size; + map->return_size[index] = return_size; + map->plane[index] = plane; + map->num_desc = MAX2(map->num_desc, index + 1); return index; } +struct lower_pipeline_layout_state { + struct v3dv_pipeline *pipeline; + const struct v3dv_pipeline_layout *layout; + bool needs_default_sampler_state; +}; + static void lower_load_push_constant(nir_builder *b, nir_intrinsic_instr *instr, - struct v3dv_pipeline *pipeline) + struct lower_pipeline_layout_state *state) { assert(instr->intrinsic == nir_intrinsic_load_push_constant); instr->intrinsic = nir_intrinsic_load_uniform; @@ -568,8 +496,11 @@ pipeline_get_descriptor_map(struct v3dv_pipeline *pipeline, &pipeline->shared_data->maps[broadcom_stage]->sampler_map : &pipeline->shared_data->maps[broadcom_stage]->texture_map; case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: return &pipeline->shared_data->maps[broadcom_stage]->ubo_map; case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: return &pipeline->shared_data->maps[broadcom_stage]->ssbo_map; default: unreachable("Descriptor type unknown or not having a descriptor map"); @@ -581,9 +512,7 @@ static void lower_vulkan_resource_index(nir_builder *b, nir_intrinsic_instr *instr, - nir_shader *shader, - struct v3dv_pipeline *pipeline, - const struct v3dv_pipeline_layout *layout) + struct lower_pipeline_layout_state *state) { assert(instr->intrinsic == nir_intrinsic_vulkan_resource_index); @@ -591,35 +520,50 @@ lower_vulkan_resource_index(nir_builder *b, unsigned set = nir_intrinsic_desc_set(instr); unsigned binding = nir_intrinsic_binding(instr); - struct v3dv_descriptor_set_layout *set_layout = layout->set[set].layout; + struct v3dv_descriptor_set_layout *set_layout = state->layout->set[set].layout; struct v3dv_descriptor_set_binding_layout *binding_layout = &set_layout->binding[binding]; unsigned index = 0; - const VkDescriptorType desc_type = nir_intrinsic_desc_type(instr); - switch (desc_type) { + switch (binding_layout->type) { case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: { + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: { struct v3dv_descriptor_map *descriptor_map = - pipeline_get_descriptor_map(pipeline, desc_type, shader->info.stage, false); + pipeline_get_descriptor_map(state->pipeline, binding_layout->type, + b->shader->info.stage, false); if (!const_val) unreachable("non-constant vulkan_resource_index array index"); + /* At compile-time we will need to know if we are processing a UBO load + * for an inline or a regular UBO so we can handle inline loads like + * push constants. At the NIR level, however, the inline + * information is gone, so we rely on the index to make this distinction. + * Particularly, we reserve indices 1..MAX_INLINE_UNIFORM_BUFFERS for + * inline buffers.
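+ * (UBO index 0 is reserved for the push constant buffer, which is why + * the driver-visible indices for inline buffers start at 1.)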
This means that at the descriptor map level + * we store inline buffers at slots 0..MAX_INLINE_UNIFORM_BUFFERS - 1, + * and regular UBOs at indices starting from MAX_INLINE_UNIFORM_BUFFERS. + */ + uint32_t start_index = 0; + if (binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER || + binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) { + start_index += MAX_INLINE_UNIFORM_BUFFERS; + } + index = descriptor_map_add(descriptor_map, set, binding, const_val->u32, binding_layout->array_size, - 32 /* return_size: doesn't really apply for this case */); - - if (desc_type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) { - /* skip index 0 which is used for push constants */ - index++; - } + start_index, + 32 /* return_size: doesn't really apply for this case */, + 0); break; } default: - unreachable("unsupported desc_type for vulkan_resource_index"); + unreachable("unsupported descriptor type for vulkan_resource_index"); break; } @@ -627,30 +571,43 @@ lower_vulkan_resource_index(nir_builder *b, * vulkan_load_descriptor return a vec2 providing an index and * offset. Our backend compiler only cares about the index part. */ - nir_ssa_def_rewrite_uses(&instr->dest.ssa, + nir_def_rewrite_uses(&instr->def, nir_imm_ivec2(b, index, 0)); nir_instr_remove(&instr->instr); } +static uint8_t +tex_instr_get_and_remove_plane_src(nir_tex_instr *tex) +{ + int plane_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_plane); + if (plane_src_idx < 0) + return 0; + + uint8_t plane = nir_src_as_uint(tex->src[plane_src_idx].src); + nir_tex_instr_remove_src(tex, plane_src_idx); + return plane; +} + /* Returns return_size, so it could be used for the case of not having a * sampler object */ static uint8_t -lower_tex_src_to_offset(nir_builder *b, nir_tex_instr *instr, unsigned src_idx, - nir_shader *shader, - struct v3dv_pipeline *pipeline, - const struct v3dv_pipeline_layout *layout) +lower_tex_src(nir_builder *b, + nir_tex_instr *instr, + unsigned src_idx, + struct lower_pipeline_layout_state *state) { - nir_ssa_def *index = NULL; + nir_def *index = NULL; unsigned base_index = 0; unsigned array_elements = 1; nir_tex_src *src = &instr->src[src_idx]; bool is_sampler = src->src_type == nir_tex_src_sampler_deref; + uint8_t plane = tex_instr_get_and_remove_plane_src(instr); + /* We compute first the offsets */ nir_deref_instr *deref = nir_instr_as_deref(src->src.ssa->parent_instr); while (deref->deref_type != nir_deref_type_var) { - assert(deref->parent.is_ssa); nir_deref_instr *parent = nir_instr_as_deref(deref->parent.ssa->parent_instr); @@ -667,8 +624,8 @@ lower_tex_src_to_offset(nir_builder *b, nir_tex_instr *instr, unsigned src_idx, } index = nir_iadd(b, index, - nir_imul(b, nir_imm_int(b, array_elements), - nir_ssa_for_src(b, deref->arr.index, 1))); + nir_imul_imm(b, deref->arr.index.ssa, + array_elements)); } array_elements *= glsl_get_length(parent->type); @@ -683,8 +640,7 @@ lower_tex_src_to_offset(nir_builder *b, nir_tex_instr *instr, unsigned src_idx, * instr if needed */ if (index) { - nir_instr_rewrite_src(&instr->instr, &src->src, - nir_src_for_ssa(index)); + nir_src_rewrite(&src->src, index); src->src_type = is_sampler ? nir_tex_src_sampler_offset : @@ -696,13 +652,13 @@ lower_tex_src_to_offset(nir_builder *b, nir_tex_instr *instr, unsigned src_idx, uint32_t set = deref->var->data.descriptor_set; uint32_t binding = deref->var->data.binding; /* FIXME: this is a really simplified check for the precision to be used - * for the sampling. 
Right now we are ony checking for the variables used + * for the sampling. Right now we are only checking for the variables used * on the operation itself, but there are other cases that we could use to * infer the precision requirement. */ bool relaxed_precision = deref->var->data.precision == GLSL_PRECISION_MEDIUM || deref->var->data.precision == GLSL_PRECISION_LOW; - struct v3dv_descriptor_set_layout *set_layout = layout->set[set].layout; + struct v3dv_descriptor_set_layout *set_layout = state->layout->set[set].layout; struct v3dv_descriptor_set_binding_layout *binding_layout = &set_layout->binding[binding]; @@ -714,23 +670,25 @@ lower_tex_src_to_offset(nir_builder *b, nir_tex_instr *instr, unsigned src_idx, base_index; uint8_t return_size; - if (unlikely(V3D_DEBUG & V3D_DEBUG_TMU_16BIT)) + if (V3D_DBG(TMU_16BIT)) return_size = 16; - else if (unlikely(V3D_DEBUG & V3D_DEBUG_TMU_32BIT)) + else if (V3D_DBG(TMU_32BIT)) return_size = 32; else - return_size = relaxed_precision || instr->is_shadow ? 16 : 32; + return_size = relaxed_precision ? 16 : 32; struct v3dv_descriptor_map *map = - pipeline_get_descriptor_map(pipeline, binding_layout->type, - shader->info.stage, is_sampler); + pipeline_get_descriptor_map(state->pipeline, binding_layout->type, + b->shader->info.stage, is_sampler); int desc_index = descriptor_map_add(map, deref->var->data.descriptor_set, deref->var->data.binding, array_index, binding_layout->array_size, - return_size); + 0, + return_size, + plane); if (is_sampler) instr->sampler_index = desc_index; @@ -741,10 +699,9 @@ lower_tex_src_to_offset(nir_builder *b, nir_tex_instr *instr, unsigned src_idx, } static bool -lower_sampler(nir_builder *b, nir_tex_instr *instr, - nir_shader *shader, - struct v3dv_pipeline *pipeline, - const struct v3dv_pipeline_layout *layout) +lower_sampler(nir_builder *b, + nir_tex_instr *instr, + struct lower_pipeline_layout_state *state) { uint8_t return_size = 0; @@ -752,44 +709,43 @@ lower_sampler(nir_builder *b, nir_tex_instr *instr, nir_tex_instr_src_index(instr, nir_tex_src_texture_deref); if (texture_idx >= 0) - return_size = lower_tex_src_to_offset(b, instr, texture_idx, shader, - pipeline, layout); + return_size = lower_tex_src(b, instr, texture_idx, state); int sampler_idx = nir_tex_instr_src_index(instr, nir_tex_src_sampler_deref); - if (sampler_idx >= 0) - lower_tex_src_to_offset(b, instr, sampler_idx, shader, pipeline, layout); + if (sampler_idx >= 0) { + assert(nir_tex_instr_need_sampler(instr)); + lower_tex_src(b, instr, sampler_idx, state); + } if (texture_idx < 0 && sampler_idx < 0) return false; - /* If we don't have a sampler, we assign it the idx we reserve for this - * case, and we ensure that it is using the correct return size. + /* If the instruction doesn't have a sampler (i.e. txf) we use backend_flags + * to bind a default sampler state to configure precision. */ if (sampler_idx < 0) { - instr->sampler_index = return_size == 16 ? + state->needs_default_sampler_state = true; + instr->backend_flags = return_size == 16 ? V3DV_NO_SAMPLER_16BIT_IDX : V3DV_NO_SAMPLER_32BIT_IDX; } return true; } -/* FIXME: really similar to lower_tex_src_to_offset, perhaps refactor? */ +/* FIXME: really similar to lower_tex_src, perhaps refactor?
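+ * Both walk a deref chain down to a flat array index and then add a + * descriptor map entry, so a shared helper could cover the two.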
*/ static void lower_image_deref(nir_builder *b, nir_intrinsic_instr *instr, - nir_shader *shader, - struct v3dv_pipeline *pipeline, - const struct v3dv_pipeline_layout *layout) + struct lower_pipeline_layout_state *state) { nir_deref_instr *deref = nir_src_as_deref(instr->src[0]); - nir_ssa_def *index = NULL; + nir_def *index = NULL; unsigned array_elements = 1; unsigned base_index = 0; while (deref->deref_type != nir_deref_type_var) { - assert(deref->parent.is_ssa); nir_deref_instr *parent = nir_instr_as_deref(deref->parent.ssa->parent_instr); @@ -806,8 +762,8 @@ lower_image_deref(nir_builder *b, } index = nir_iadd(b, index, - nir_imul(b, nir_imm_int(b, array_elements), - nir_ssa_for_src(b, deref->arr.index, 1))); + nir_imul_imm(b, deref->arr.index.ssa, + array_elements)); } array_elements *= glsl_get_length(parent->type); @@ -820,7 +776,7 @@ lower_image_deref(nir_builder *b, uint32_t set = deref->var->data.descriptor_set; uint32_t binding = deref->var->data.binding; - struct v3dv_descriptor_set_layout *set_layout = layout->set[set].layout; + struct v3dv_descriptor_set_layout *set_layout = state->layout->set[set].layout; struct v3dv_descriptor_set_binding_layout *binding_layout = &set_layout->binding[binding]; @@ -830,8 +786,8 @@ lower_image_deref(nir_builder *b, binding_layout->type == VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER); struct v3dv_descriptor_map *map = - pipeline_get_descriptor_map(pipeline, binding_layout->type, - shader->info.stage, false); + pipeline_get_descriptor_map(state->pipeline, binding_layout->type, + b->shader->info.stage, false); int desc_index = descriptor_map_add(map, @@ -839,7 +795,9 @@ lower_image_deref(nir_builder *b, deref->var->data.binding, array_index, binding_layout->array_size, - 32 /* return_size: doesn't apply for textures */); + 0, + 32 /* return_size: doesn't apply for textures */, + 0); /* Note: we don't need to do anything here in relation to the precision and * the output size because for images we can infer that info from the image @@ -853,53 +811,35 @@ lower_image_deref(nir_builder *b, } static bool -lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr, - nir_shader *shader, - struct v3dv_pipeline *pipeline, - const struct v3dv_pipeline_layout *layout) +lower_intrinsic(nir_builder *b, + nir_intrinsic_instr *instr, + struct lower_pipeline_layout_state *state) { switch (instr->intrinsic) { - case nir_intrinsic_load_layer_id: - /* FIXME: if layered rendering gets supported, this would need a real - * lowering - */ - nir_ssa_def_rewrite_uses(&instr->dest.ssa, - nir_imm_int(b, 0)); - nir_instr_remove(&instr->instr); - return true; - case nir_intrinsic_load_push_constant: - lower_load_push_constant(b, instr, pipeline); + lower_load_push_constant(b, instr, state); return true; case nir_intrinsic_vulkan_resource_index: - lower_vulkan_resource_index(b, instr, shader, pipeline, layout); + lower_vulkan_resource_index(b, instr, state); return true; case nir_intrinsic_load_vulkan_descriptor: { /* Loading the descriptor happens as part of load/store instructions, * so for us this is a no-op. 
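* The source already carries the (index, 0) vec2 that the lowered * vulkan_resource_index produced, so its uses can consume it directly.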
*/ - nir_ssa_def_rewrite_uses(&instr->dest.ssa, instr->src[0].ssa); + nir_def_rewrite_uses(&instr->def, instr->src[0].ssa); nir_instr_remove(&instr->instr); return true; } case nir_intrinsic_image_deref_load: case nir_intrinsic_image_deref_store: - case nir_intrinsic_image_deref_atomic_add: - case nir_intrinsic_image_deref_atomic_imin: - case nir_intrinsic_image_deref_atomic_umin: - case nir_intrinsic_image_deref_atomic_imax: - case nir_intrinsic_image_deref_atomic_umax: - case nir_intrinsic_image_deref_atomic_and: - case nir_intrinsic_image_deref_atomic_or: - case nir_intrinsic_image_deref_atomic_xor: - case nir_intrinsic_image_deref_atomic_exchange: - case nir_intrinsic_image_deref_atomic_comp_swap: + case nir_intrinsic_image_deref_atomic: + case nir_intrinsic_image_deref_atomic_swap: case nir_intrinsic_image_deref_size: case nir_intrinsic_image_deref_samples: - lower_image_deref(b, instr, shader, pipeline, layout); + lower_image_deref(b, instr, state); return true; default: @@ -908,32 +848,23 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr, } static bool -lower_impl(nir_function_impl *impl, - nir_shader *shader, - struct v3dv_pipeline *pipeline, - const struct v3dv_pipeline_layout *layout) +lower_pipeline_layout_cb(nir_builder *b, + nir_instr *instr, + void *_state) { - nir_builder b; - nir_builder_init(&b, impl); bool progress = false; + struct lower_pipeline_layout_state *state = _state; - nir_foreach_block(block, impl) { - nir_foreach_instr_safe(instr, block) { - b.cursor = nir_before_instr(instr); - switch (instr->type) { - case nir_instr_type_tex: - progress |= - lower_sampler(&b, nir_instr_as_tex(instr), shader, pipeline, layout); - break; - case nir_instr_type_intrinsic: - progress |= - lower_intrinsic(&b, nir_instr_as_intrinsic(instr), shader, - pipeline, layout); - break; - default: - break; - } - } + b->cursor = nir_before_instr(instr); + switch (instr->type) { + case nir_instr_type_tex: + progress |= lower_sampler(b, nir_instr_as_tex(instr), state); + break; + case nir_instr_type_intrinsic: + progress |= lower_intrinsic(b, nir_instr_as_intrinsic(instr), state); + break; + default: + break; } return progress; @@ -942,25 +873,62 @@ lower_impl(nir_function_impl *impl, static bool lower_pipeline_layout_info(nir_shader *shader, struct v3dv_pipeline *pipeline, - const struct v3dv_pipeline_layout *layout) + const struct v3dv_pipeline_layout *layout, + bool *needs_default_sampler_state) { bool progress = false; - nir_foreach_function(function, shader) { - if (function->impl) - progress |= lower_impl(function->impl, shader, pipeline, layout); - } + struct lower_pipeline_layout_state state = { + .pipeline = pipeline, + .layout = layout, + .needs_default_sampler_state = false, + }; + + progress = nir_shader_instructions_pass(shader, lower_pipeline_layout_cb, + nir_metadata_block_index | + nir_metadata_dominance, + &state); + + *needs_default_sampler_state = state.needs_default_sampler_state; return progress; } +/* This flips gl_PointCoord.y to match Vulkan requirements */ +static bool +lower_point_coord_cb(nir_builder *b, nir_intrinsic_instr *intr, void *_state) +{ + if (intr->intrinsic != nir_intrinsic_load_input) + return false; + + if (nir_intrinsic_io_semantics(intr).location != VARYING_SLOT_PNTC) + return false; + + b->cursor = nir_after_instr(&intr->instr); + nir_def *result = &intr->def; + result = + nir_vector_insert_imm(b, result, + nir_fsub_imm(b, 1.0, nir_channel(b, result, 1)), 1); + nir_def_rewrite_uses_after(&intr->def, + result, result->parent_instr); + 
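/* Rewriting only the uses after the new vector_insert keeps that + * instruction reading the original, unflipped value. */ +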
return true; +} + +static bool +v3d_nir_lower_point_coord(nir_shader *s) +{ + assert(s->info.stage == MESA_SHADER_FRAGMENT); + return nir_shader_intrinsics_pass(s, lower_point_coord_cb, + nir_metadata_block_index | + nir_metadata_dominance, NULL); +} static void lower_fs_io(nir_shader *nir) { /* Our backend doesn't handle array fragment shader outputs */ NIR_PASS_V(nir, nir_lower_io_arrays_to_elements_no_indirects, false); - NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_shader_out, NULL); + NIR_PASS(_, nir, nir_remove_dead_variables, nir_var_shader_out, NULL); nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, MESA_SHADER_FRAGMENT); @@ -968,8 +936,8 @@ lower_fs_io(nir_shader *nir) nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, MESA_SHADER_FRAGMENT); - NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, - type_size_vec4, 0); + NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, + type_size_vec4, 0); } static void @@ -1014,8 +982,7 @@ shader_debug_output(const char *message, void *data) static void pipeline_populate_v3d_key(struct v3d_key *key, const struct v3dv_pipeline_stage *p_stage, - uint32_t ucp_enables, - bool robust_buffer_access) + uint32_t ucp_enables) { assert(p_stage->pipeline->shared_data && p_stage->pipeline->shared_data->maps[p_stage->stage]); @@ -1051,7 +1018,8 @@ pipeline_populate_v3d_key(struct v3d_key *key, switch (p_stage->stage) { case BROADCOM_SHADER_VERTEX: case BROADCOM_SHADER_VERTEX_BIN: - key->is_last_geometry_stage = p_stage->pipeline->gs == NULL; + key->is_last_geometry_stage = + p_stage->pipeline->stages[BROADCOM_SHADER_GEOMETRY] == NULL; break; case BROADCOM_SHADER_GEOMETRY: case BROADCOM_SHADER_GEOMETRY_BIN: @@ -1078,27 +1046,42 @@ pipeline_populate_v3d_key(struct v3d_key *key, */ key->ucp_enables = ucp_enables; - key->robust_buffer_access = robust_buffer_access; + const VkPipelineRobustnessBufferBehaviorEXT robust_buffer_enabled = + VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_EXT; - key->environment = V3D_ENVIRONMENT_VULKAN; + const VkPipelineRobustnessImageBehaviorEXT robust_image_enabled = + VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_ROBUST_IMAGE_ACCESS_EXT; + + key->robust_uniform_access = + p_stage->robustness.uniform_buffers == robust_buffer_enabled; + key->robust_storage_access = + p_stage->robustness.storage_buffers == robust_buffer_enabled; + key->robust_image_access = + p_stage->robustness.images == robust_image_enabled; } /* FIXME: anv maps to hw primitive type. Perhaps eventually we would do the * same. 
For now we use prim_mode, which is the one already used on v3d. */ -static const enum pipe_prim_type vk_to_pipe_prim_type[] = { - [VK_PRIMITIVE_TOPOLOGY_POINT_LIST] = PIPE_PRIM_POINTS, - [VK_PRIMITIVE_TOPOLOGY_LINE_LIST] = PIPE_PRIM_LINES, - [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP] = PIPE_PRIM_LINE_STRIP, - [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST] = PIPE_PRIM_TRIANGLES, - [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP] = PIPE_PRIM_TRIANGLE_STRIP, - [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN] = PIPE_PRIM_TRIANGLE_FAN, - [VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY] = PIPE_PRIM_LINES_ADJACENCY, - [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY] = PIPE_PRIM_LINE_STRIP_ADJACENCY, - [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY] = PIPE_PRIM_TRIANGLES_ADJACENCY, - [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY, +static const enum mesa_prim vk_to_mesa_prim[] = { + [VK_PRIMITIVE_TOPOLOGY_POINT_LIST] = MESA_PRIM_POINTS, + [VK_PRIMITIVE_TOPOLOGY_LINE_LIST] = MESA_PRIM_LINES, + [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP] = MESA_PRIM_LINE_STRIP, + [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST] = MESA_PRIM_TRIANGLES, + [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP] = MESA_PRIM_TRIANGLE_STRIP, + [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN] = MESA_PRIM_TRIANGLE_FAN, + [VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY] = MESA_PRIM_LINES_ADJACENCY, + [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY] = MESA_PRIM_LINE_STRIP_ADJACENCY, + [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY] = MESA_PRIM_TRIANGLES_ADJACENCY, + [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = MESA_PRIM_TRIANGLE_STRIP_ADJACENCY, }; +uint32_t +v3dv_pipeline_primitive(VkPrimitiveTopology vk_prim) +{ + return v3d_hw_prim_type(vk_to_mesa_prim[vk_prim]); +} + static const enum pipe_logicop vk_to_pipe_logicop[] = { [VK_LOGIC_OP_CLEAR] = PIPE_LOGICOP_CLEAR, [VK_LOGIC_OP_AND] = PIPE_LOGICOP_AND, @@ -1118,9 +1101,74 @@ static const enum pipe_logicop vk_to_pipe_logicop[] = { [VK_LOGIC_OP_SET] = PIPE_LOGICOP_SET, }; +static bool +enable_line_smooth(uint8_t topology, + const VkPipelineRasterizationStateCreateInfo *rs_info) +{ + if (!rs_info || rs_info->rasterizerDiscardEnable) + return false; + + const VkPipelineRasterizationLineStateCreateInfoKHR *ls_info = + vk_find_struct_const(rs_info->pNext, + PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_KHR); + + if (!ls_info) + return false; + + switch(topology) { + case MESA_PRIM_LINES: + case MESA_PRIM_LINE_LOOP: + case MESA_PRIM_LINE_STRIP: + case MESA_PRIM_LINES_ADJACENCY: + case MESA_PRIM_LINE_STRIP_ADJACENCY: + return ls_info->lineRasterizationMode == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_KHR; + default: + return false; + } +} + +static void +v3d_fs_key_set_color_attachment(struct v3d_fs_key *key, + const struct v3dv_pipeline_stage *p_stage, + uint32_t index, + VkFormat fb_format) +{ + key->cbufs |= 1 << index; + + enum pipe_format fb_pipe_format = vk_format_to_pipe_format(fb_format); + + /* If logic operations are enabled then we might emit color reads and we + * need to know the color buffer format and swizzle for that + */ + if (key->logicop_func != PIPE_LOGICOP_COPY) { + /* Framebuffer formats should be single plane */ + assert(vk_format_get_plane_count(fb_format) == 1); + key->color_fmt[index].format = fb_pipe_format; + memcpy(key->color_fmt[index].swizzle, + v3dv_get_format_swizzle(p_stage->pipeline->device, fb_format, 0), + sizeof(key->color_fmt[index].swizzle)); + } + + const struct util_format_description *desc = + vk_format_description(fb_format); + + if (desc->channel[0].type ==
UTIL_FORMAT_TYPE_FLOAT && + desc->channel[0].size == 32) { + key->f32_color_rb |= 1 << index; + } + + if (p_stage->nir->info.fs.untyped_color_outputs) { + if (util_format_is_pure_uint(fb_pipe_format)) + key->uint_color_rb |= 1 << index; + else if (util_format_is_pure_sint(fb_pipe_format)) + key->int_color_rb |= 1 << index; + } +} + static void pipeline_populate_v3d_fs_key(struct v3d_fs_key *key, const VkGraphicsPipelineCreateInfo *pCreateInfo, + const struct vk_render_pass_state *rendering_info, const struct v3dv_pipeline_stage *p_stage, bool has_geometry_shader, uint32_t ucp_enables) @@ -1129,16 +1177,29 @@ pipeline_populate_v3d_fs_key(struct v3d_fs_key *key, memset(key, 0, sizeof(*key)); - const bool rba = p_stage->pipeline->device->features.robustBufferAccess; - pipeline_populate_v3d_key(&key->base, p_stage, ucp_enables, rba); + struct v3dv_device *device = p_stage->pipeline->device; + assert(device); + + pipeline_populate_v3d_key(&key->base, p_stage, ucp_enables); const VkPipelineInputAssemblyStateCreateInfo *ia_info = pCreateInfo->pInputAssemblyState; - uint8_t topology = vk_to_pipe_prim_type[ia_info->topology]; + uint8_t topology = vk_to_mesa_prim[ia_info->topology]; + + key->is_points = (topology == MESA_PRIM_POINTS); + key->is_lines = (topology >= MESA_PRIM_LINES && + topology <= MESA_PRIM_LINE_STRIP); + + if (key->is_points) { + /* This mask represents state for GL_ARB_point_sprite which is not + * relevant to Vulkan. + */ + key->point_sprite_mask = 0; + + /* Vulkan mandates upper left. */ + key->point_coord_upper_left = true; + } - key->is_points = (topology == PIPE_PRIM_POINTS); - key->is_lines = (topology >= PIPE_PRIM_LINES && - topology <= PIPE_PRIM_LINE_STRIP); key->has_gs = has_geometry_shader; const VkPipelineColorBlendStateCreateInfo *cb_info = @@ -1150,6 +1211,7 @@ pipeline_populate_v3d_fs_key(struct v3d_fs_key *key, PIPE_LOGICOP_COPY; const bool raster_enabled = + pCreateInfo->pRasterizationState && !pCreateInfo->pRasterizationState->rasterizerDiscardEnable; /* Multisample rasterization state must be ignored if rasterization @@ -1162,68 +1224,24 @@ pipeline_populate_v3d_fs_key(struct v3d_fs_key *key, ms_info->rasterizationSamples == VK_SAMPLE_COUNT_4_BIT); key->msaa = ms_info->rasterizationSamples > VK_SAMPLE_COUNT_1_BIT; - if (key->msaa) { - key->sample_coverage = - p_stage->pipeline->sample_mask != (1 << V3D_MAX_SAMPLES) - 1; + if (key->msaa) key->sample_alpha_to_coverage = ms_info->alphaToCoverageEnable; - key->sample_alpha_to_one = ms_info->alphaToOneEnable; - } + + key->sample_alpha_to_one = ms_info->alphaToOneEnable; } + key->line_smoothing = enable_line_smooth(topology, pCreateInfo->pRasterizationState); + /* This is intended for V3D versions before 4.1, otherwise we just use the * tile buffer load/store swap R/B bit. 
*/ key->swap_color_rb = 0; - const struct v3dv_render_pass *pass = - v3dv_render_pass_from_handle(pCreateInfo->renderPass); - const struct v3dv_subpass *subpass = p_stage->pipeline->subpass; - for (uint32_t i = 0; i < subpass->color_count; i++) { - const uint32_t att_idx = subpass->color_attachments[i].attachment; - if (att_idx == VK_ATTACHMENT_UNUSED) + for (uint32_t i = 0; i < rendering_info->color_attachment_count; i++) { + if (rendering_info->color_attachment_formats[i] == VK_FORMAT_UNDEFINED) continue; - - key->cbufs |= 1 << i; - - VkFormat fb_format = pass->attachments[att_idx].desc.format; - enum pipe_format fb_pipe_format = vk_format_to_pipe_format(fb_format); - - /* If logic operations are enabled then we might emit color reads and we - * need to know the color buffer format and swizzle for that - */ - if (key->logicop_func != PIPE_LOGICOP_COPY) { - key->color_fmt[i].format = fb_pipe_format; - key->color_fmt[i].swizzle = - v3dv_get_format_swizzle(p_stage->pipeline->device, fb_format); - } - - const struct util_format_description *desc = - vk_format_description(fb_format); - - if (desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT && - desc->channel[0].size == 32) { - key->f32_color_rb |= 1 << i; - } - - if (p_stage->nir->info.fs.untyped_color_outputs) { - if (util_format_is_pure_uint(fb_pipe_format)) - key->uint_color_rb |= 1 << i; - else if (util_format_is_pure_sint(fb_pipe_format)) - key->int_color_rb |= 1 << i; - } - - if (key->is_points) { - /* FIXME: The mask would need to be computed based on the shader - * inputs. On gallium it is done at st_atom_rasterizer - * (sprite_coord_enable). anv seems (need to confirm) to do that on - * genX_pipeline (PointSpriteTextureCoordinateEnable). Would be also - * better to have tests to guide filling the mask. - */ - key->point_sprite_mask = 0; - - /* Vulkan mandates upper left. 
*/ - key->point_coord_upper_left = true; - } + v3d_fs_key_set_color_attachment(key, p_stage, i, + rendering_info->color_attachment_formats[i]); } } @@ -1247,10 +1265,12 @@ pipeline_populate_v3d_gs_key(struct v3d_gs_key *key, assert(p_stage->stage == BROADCOM_SHADER_GEOMETRY || p_stage->stage == BROADCOM_SHADER_GEOMETRY_BIN); + struct v3dv_device *device = p_stage->pipeline->device; + assert(device); + memset(key, 0, sizeof(*key)); - const bool rba = p_stage->pipeline->device->features.robustBufferAccess; - pipeline_populate_v3d_key(&key->base, p_stage, 0, rba); + pipeline_populate_v3d_key(&key->base, p_stage, 0); struct v3dv_pipeline *pipeline = p_stage->pipeline; @@ -1289,10 +1309,11 @@ pipeline_populate_v3d_vs_key(struct v3d_vs_key *key, assert(p_stage->stage == BROADCOM_SHADER_VERTEX || p_stage->stage == BROADCOM_SHADER_VERTEX_BIN); - memset(key, 0, sizeof(*key)); + struct v3dv_device *device = p_stage->pipeline->device; + assert(device); - const bool rba = p_stage->pipeline->device->features.robustBufferAccess; - pipeline_populate_v3d_key(&key->base, p_stage, 0, rba); + memset(key, 0, sizeof(*key)); + pipeline_populate_v3d_key(&key->base, p_stage, 0); struct v3dv_pipeline *pipeline = p_stage->pipeline; @@ -1301,11 +1322,11 @@ pipeline_populate_v3d_vs_key(struct v3d_vs_key *key, */ const VkPipelineInputAssemblyStateCreateInfo *ia_info = pCreateInfo->pInputAssemblyState; - uint8_t topology = vk_to_pipe_prim_type[ia_info->topology]; + uint8_t topology = vk_to_mesa_prim[ia_info->topology]; /* FIXME: PRIM_POINTS is not enough, in gallium the full check is - * PIPE_PRIM_POINTS && v3d->rasterizer->base.point_size_per_vertex */ - key->per_vertex_point_size = (topology == PIPE_PRIM_POINTS); + * MESA_PRIM_POINTS && v3d->rasterizer->base.point_size_per_vertex */ + key->per_vertex_point_size = (topology == MESA_PRIM_POINTS); key->is_coord = broadcom_shader_stage_is_binning(p_stage->stage); @@ -1318,7 +1339,7 @@ pipeline_populate_v3d_vs_key(struct v3d_vs_key *key, key->num_used_outputs = 0; } else { /* Linking against GS binning program */ - assert(pipeline->gs); + assert(pipeline->stages[BROADCOM_SHADER_GEOMETRY]); struct v3dv_shader_variant *gs_bin_variant = pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN]; @@ -1333,7 +1354,7 @@ pipeline_populate_v3d_vs_key(struct v3d_vs_key *key, sizeof(key->used_outputs)); } } else { /* Render VS */ - if (pipeline->gs) { + if (pipeline->stages[BROADCOM_SHADER_GEOMETRY]) { /* Linking against GS render program */ struct v3dv_shader_variant *gs_variant = pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY]; @@ -1370,8 +1391,10 @@ pipeline_populate_v3d_vs_key(struct v3d_vs_key *key, const VkVertexInputAttributeDescription *desc = &vi_info->pVertexAttributeDescriptions[i]; assert(desc->location < MAX_VERTEX_ATTRIBS); - if (desc->format == VK_FORMAT_B8G8R8A8_UNORM) + if (desc->format == VK_FORMAT_B8G8R8A8_UNORM || + desc->format == VK_FORMAT_A2R10G10B10_UNORM_PACK32) { key->va_swap_rb_mask |= 1 << (VERT_ATTRIB_GENERIC0 + desc->location); + } } } @@ -1407,14 +1430,33 @@ pipeline_stage_create_binning(const struct v3dv_pipeline_stage *src, p_stage->stage = bin_stage; p_stage->entrypoint = src->entrypoint; p_stage->module = src->module; - p_stage->nir = src->nir ? nir_shader_clone(NULL, src->nir) : NULL; + /* For binning shaders we will clone the NIR code from the corresponding + * render shader later, when we call pipeline_compile_xxx_shader. 
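The va_swap_rb_mask bits set above (and again in pipeline_populate_graphics_key further down) tell the compiled vertex shader to exchange the red and blue channels of attributes stored in B,G,R,A order. A minimal sketch of that swap for the 8-bit case; the helper is hypothetical, not driver code:

#include <stdint.h>

/* Re-pack a packed B8G8R8A8 value (byte 0 = blue) so the shader sees R,G,B,A. */
static uint32_t
swap_rb_bgra8(uint32_t bgra)
{
   uint32_t b = (bgra >> 0) & 0xff;
   uint32_t g = (bgra >> 8) & 0xff;
   uint32_t r = (bgra >> 16) & 0xff;
   uint32_t a = (bgra >> 24) & 0xff;

   return (r << 0) | (g << 8) | (b << 16) | (a << 24);
}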
This way + we only have to run the relevant NIR lowerings once for render shaders + */ + p_stage->nir = NULL; + p_stage->program_id = src->program_id; p_stage->spec_info = src->spec_info; - p_stage->feedback = (VkPipelineCreationFeedbackEXT) { 0 }; + p_stage->feedback = (VkPipelineCreationFeedback) { 0 }; + p_stage->robustness = src->robustness; memcpy(p_stage->shader_sha1, src->shader_sha1, 20); return p_stage; } +/* + * Based on some creation flags we assume that the QPU code will be needed later + * to gather further info. In that case we just keep the qpu_insts around, + * instead of mapping/unmapping the bo later. + */ +static bool +pipeline_keep_qpu(struct v3dv_pipeline *pipeline) +{ + return pipeline->flags & + (VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR | + VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR); +} + /** * Returns false if it was not able to allocate or map the assembly bo memory. */ @@ -1454,9 +1496,10 @@ upload_assembly(struct v3dv_pipeline *pipeline) memcpy(bo->map + offset, variant->qpu_insts, variant->qpu_insts_size); offset += variant->qpu_insts_size; - /* We dont need qpu_insts anymore. */ - free(variant->qpu_insts); - variant->qpu_insts = NULL; + if (!pipeline_keep_qpu(pipeline)) { + free(variant->qpu_insts); + variant->qpu_insts = NULL; + } } } assert(total_size == offset); @@ -1474,20 +1517,27 @@ pipeline_hash_graphics(const struct v3dv_pipeline *pipeline, struct mesa_sha1 ctx; _mesa_sha1_init(&ctx); - /* We need to include all shader stages in the sha1 key as linking may modify - * the shader code in any stage. An alternative would be to use the + if (pipeline->layout) { + _mesa_sha1_update(&ctx, &pipeline->layout->sha1, + sizeof(pipeline->layout->sha1)); + } + + /* We need to include all shader stages in the sha1 key as linking may + * modify the shader code in any stage. An alternative would be to use the * serialized NIR, but that seems like an overkill. */ - _mesa_sha1_update(&ctx, pipeline->vs->shader_sha1, - sizeof(pipeline->vs->shader_sha1)); + for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) { + if (broadcom_shader_stage_is_binning(stage)) + continue; - if (pipeline->gs) { - _mesa_sha1_update(&ctx, pipeline->gs->shader_sha1, - sizeof(pipeline->gs->shader_sha1)); - } + struct v3dv_pipeline_stage *p_stage = pipeline->stages[stage]; + if (p_stage == NULL) + continue; - _mesa_sha1_update(&ctx, pipeline->fs->shader_sha1, - sizeof(pipeline->fs->shader_sha1)); + assert(stage != BROADCOM_SHADER_COMPUTE); + + _mesa_sha1_update(&ctx, p_stage->shader_sha1, sizeof(p_stage->shader_sha1)); + } _mesa_sha1_update(&ctx, key, sizeof(struct v3dv_pipeline_key)); @@ -1502,8 +1552,15 @@ pipeline_hash_compute(const struct v3dv_pipeline *pipeline, struct mesa_sha1 ctx; _mesa_sha1_init(&ctx); - _mesa_sha1_update(&ctx, pipeline->cs->shader_sha1, - sizeof(pipeline->cs->shader_sha1)); + if (pipeline->layout) { + _mesa_sha1_update(&ctx, &pipeline->layout->sha1, + sizeof(pipeline->layout->sha1)); + } + + struct v3dv_pipeline_stage *p_stage = + pipeline->stages[BROADCOM_SHADER_COMPUTE]; + + _mesa_sha1_update(&ctx, p_stage->shader_sha1, sizeof(p_stage->shader_sha1)); _mesa_sha1_update(&ctx, key, sizeof(struct v3dv_pipeline_key)); @@ -1553,7 +1610,7 @@ pipeline_check_spill_size(struct v3dv_pipeline *pipeline) * so it is assumed that the caller will provide a pointer that the * shader_variant will own.
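upload_assembly() above packs the QPU code of every variant into one shared BO, remembering each variant's byte offset, and now frees the instructions only when pipeline_keep_qpu() says they will not be queried later. A reduced sketch of the packing pattern with stand-in types:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct blob { const uint8_t *data; size_t size; size_t offset; };

/* Lay consecutive blobs into one allocation, recording their offsets. */
static uint8_t *
pack_blobs(struct blob *blobs, int count, size_t *total_out)
{
   size_t total = 0;
   for (int i = 0; i < count; i++) {
      blobs[i].offset = total;
      total += blobs[i].size;
   }

   uint8_t *buf = malloc(total);
   if (!buf)
      return NULL;

   for (int i = 0; i < count; i++)
      memcpy(buf + blobs[i].offset, blobs[i].data, blobs[i].size);

   *total_out = total;
   return buf;
}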
* - * Creation doesn't include allocate a BD to store the content of qpu_insts, + * Creation doesn't include allocating a BO to store the content of qpu_insts, * as we will try to share the same bo for several shader variants. Also note * that qpu_insts being NULL is valid, for example if we are creating the * shader_variants from the cache, so we can just upload the assembly of all @@ -1615,13 +1672,11 @@ pipeline_compile_shader_variant(struct v3dv_pipeline_stage *p_stage, int64_t stage_start = os_time_get_nano(); struct v3dv_pipeline *pipeline = p_stage->pipeline; - struct v3dv_physical_device *physical_device = - &pipeline->device->instance->physicalDevice; + struct v3dv_physical_device *physical_device = pipeline->device->pdevice; const struct v3d_compiler *compiler = physical_device->compiler; + gl_shader_stage gl_stage = broadcom_shader_stage_to_gl(p_stage->stage); - if (V3D_DEBUG & (V3D_DEBUG_NIR | - v3d_debug_flag_for_shader_stage - (broadcom_shader_stage_to_gl(p_stage->stage)))) { + if (V3D_DBG(NIR) || v3d_debug_flag_for_shader_stage(gl_stage)) { fprintf(stderr, "Just before v3d_compile: %s prog %d NIR:\n", broadcom_shader_stage_name(p_stage->stage), p_stage->program_id); @@ -1632,8 +1687,7 @@ pipeline_compile_shader_variant(struct v3dv_pipeline_stage *p_stage, uint64_t *qpu_insts; uint32_t qpu_insts_size; struct v3d_prog_data *prog_data; - uint32_t prog_data_size = - v3d_prog_data_size(broadcom_shader_stage_to_gl(p_stage->stage)); + uint32_t prog_data_size = v3d_prog_data_size(gl_stage); qpu_insts = v3d_compile(compiler, key, &prog_data, @@ -1646,7 +1700,7 @@ pipeline_compile_shader_variant(struct v3dv_pipeline_stage *p_stage, if (!qpu_insts) { fprintf(stderr, "Failed to compile %s prog %d NIR to VIR\n", - gl_shader_stage_name(p_stage->stage), + broadcom_shader_stage_name(p_stage->stage), p_stage->program_id); *out_vk_result = VK_ERROR_UNKNOWN; } else { @@ -1667,59 +1721,6 @@ pipeline_compile_shader_variant(struct v3dv_pipeline_stage *p_stage, return variant; } -/* FIXME: C&P from st, common place? */ -static void -st_nir_opts(nir_shader *nir) -{ - bool progress; - - do { - progress = false; - - NIR_PASS_V(nir, nir_lower_vars_to_ssa); - - /* Linking deals with unused inputs/outputs, but here we can remove - * things local to the shader in the hopes that we can cleanup other - * things. This pass will also remove variables with only stores, so we - * might be able to make progress after it.
- */ - NIR_PASS(progress, nir, nir_remove_dead_variables, - (nir_variable_mode)(nir_var_function_temp | - nir_var_shader_temp | - nir_var_mem_shared), - NULL); - - NIR_PASS(progress, nir, nir_opt_copy_prop_vars); - NIR_PASS(progress, nir, nir_opt_dead_write_vars); - - if (nir->options->lower_to_scalar) { - NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL); - NIR_PASS_V(nir, nir_lower_phis_to_scalar, false); - } - - NIR_PASS_V(nir, nir_lower_alu); - NIR_PASS_V(nir, nir_lower_pack); - NIR_PASS(progress, nir, nir_copy_prop); - NIR_PASS(progress, nir, nir_opt_remove_phis); - NIR_PASS(progress, nir, nir_opt_dce); - if (nir_opt_trivial_continues(nir)) { - progress = true; - NIR_PASS(progress, nir, nir_copy_prop); - NIR_PASS(progress, nir, nir_opt_dce); - } - NIR_PASS(progress, nir, nir_opt_if, false); - NIR_PASS(progress, nir, nir_opt_dead_cf); - NIR_PASS(progress, nir, nir_opt_cse); - NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true); - - NIR_PASS(progress, nir, nir_opt_algebraic); - NIR_PASS(progress, nir, nir_opt_constant_folding); - - NIR_PASS(progress, nir, nir_opt_undef); - NIR_PASS(progress, nir, nir_opt_conditional_discard); - } while (progress); -} - static void link_shaders(nir_shader *producer, nir_shader *consumer) { @@ -1727,34 +1728,34 @@ link_shaders(nir_shader *producer, nir_shader *consumer) assert(consumer); if (producer->options->lower_to_scalar) { - NIR_PASS_V(producer, nir_lower_io_to_scalar_early, nir_var_shader_out); - NIR_PASS_V(consumer, nir_lower_io_to_scalar_early, nir_var_shader_in); + NIR_PASS(_, producer, nir_lower_io_to_scalar_early, nir_var_shader_out); + NIR_PASS(_, consumer, nir_lower_io_to_scalar_early, nir_var_shader_in); } nir_lower_io_arrays_to_elements(producer, consumer); - st_nir_opts(producer); - st_nir_opts(consumer); + v3d_optimize_nir(NULL, producer); + v3d_optimize_nir(NULL, consumer); if (nir_link_opt_varyings(producer, consumer)) - st_nir_opts(consumer); + v3d_optimize_nir(NULL, consumer); - NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_out, NULL); - NIR_PASS_V(consumer, nir_remove_dead_variables, nir_var_shader_in, NULL); + NIR_PASS(_, producer, nir_remove_dead_variables, nir_var_shader_out, NULL); + NIR_PASS(_, consumer, nir_remove_dead_variables, nir_var_shader_in, NULL); if (nir_remove_unused_varyings(producer, consumer)) { - NIR_PASS_V(producer, nir_lower_global_vars_to_local); - NIR_PASS_V(consumer, nir_lower_global_vars_to_local); + NIR_PASS(_, producer, nir_lower_global_vars_to_local); + NIR_PASS(_, consumer, nir_lower_global_vars_to_local); - st_nir_opts(producer); - st_nir_opts(consumer); + v3d_optimize_nir(NULL, producer); + v3d_optimize_nir(NULL, consumer); /* Optimizations can cause varyings to become unused. * nir_compact_varyings() depends on all dead varyings being removed so * we need to call nir_remove_dead_variables() again here. 
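Both the removed st_nir_opts() and the v3d_optimize_nir() that replaces it in link_shaders() share the same shape: re-run the whole pass list until one full iteration reports no progress. A sketch of that fixed-point loop, with a callback standing in for NIR_PASS():

#include <stdbool.h>

typedef bool (*opt_pass)(void *shader); /* returns true if the IR changed */

static void
optimize_until_fixed_point(void *shader, const opt_pass *passes, int count)
{
   bool progress;
   do {
      progress = false;
      for (int i = 0; i < count; i++)
         progress |= passes[i](shader); /* any change forces another round */
   } while (progress);
}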
*/ - NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_out, NULL); - NIR_PASS_V(consumer, nir_remove_dead_variables, nir_var_shader_in, NULL); + NIR_PASS(_, producer, nir_remove_dead_variables, nir_var_shader_out, NULL); + NIR_PASS(_, consumer, nir_remove_dead_variables, nir_var_shader_in, NULL); } } @@ -1768,6 +1769,9 @@ pipeline_lower_nir(struct v3dv_pipeline *pipeline, assert(pipeline->shared_data && pipeline->shared_data->maps[p_stage->stage]); + NIR_PASS_V(p_stage->nir, nir_vk_lower_ycbcr_tex, + lookup_ycbcr_conversion, layout); + nir_shader_gather_info(p_stage->nir, nir_shader_get_entrypoint(p_stage->nir)); /* We add this because we need a valid sampler for nir_lower_tex to do @@ -1777,18 +1781,27 @@ pipeline_lower_nir(struct v3dv_pipeline *pipeline, * We add two of those, one for the case we need a 16bit return_size, and * another for the case we need a 32bit return size. */ - UNUSED unsigned index = - descriptor_map_add(&pipeline->shared_data->maps[p_stage->stage]->sampler_map, - -1, -1, -1, 0, 16); + struct v3dv_descriptor_maps *maps = + pipeline->shared_data->maps[p_stage->stage]; + + UNUSED unsigned index; + index = descriptor_map_add(&maps->sampler_map, -1, -1, -1, 0, 0, 16, 0); assert(index == V3DV_NO_SAMPLER_16BIT_IDX); - index = - descriptor_map_add(&pipeline->shared_data->maps[p_stage->stage]->sampler_map, - -2, -2, -2, 0, 32); + index = descriptor_map_add(&maps->sampler_map, -2, -2, -2, 0, 0, 32, 0); assert(index == V3DV_NO_SAMPLER_32BIT_IDX); /* Apply the actual pipeline layout to UBOs, SSBOs, and textures */ - NIR_PASS_V(p_stage->nir, lower_pipeline_layout_info, pipeline, layout); + bool needs_default_sampler_state = false; + NIR_PASS(_, p_stage->nir, lower_pipeline_layout_info, pipeline, layout, + &needs_default_sampler_state); + + /* If in the end we didn't need to use the default sampler states and the + * shader doesn't need any other samplers, get rid of them so we can + * recognize that this program doesn't use any samplers at all. 
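pipeline_lower_nir() above pre-reserves two default sampler entries (16-bit and 32-bit return size) so nir_lower_tex always has a valid sampler, then discards them again if lowering never used them. A sketch of that reserve-then-drop bookkeeping; the struct is a reduced stand-in for the driver's descriptor map:

#include <stdbool.h>

struct sampler_map { int num_desc; }; /* slots 0/1 hold the two defaults */

static void
drop_unused_default_samplers(struct sampler_map *map, bool needs_defaults)
{
   /* Exactly the two reserved defaults and nothing else: the program uses
    * no samplers at all, so make the map reflect that. */
   if (!needs_defaults && map->num_desc == 2)
      map->num_desc = 0;
}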
+ */ + if (!needs_default_sampler_state && maps->sampler_map.num_desc == 2) + maps->sampler_map.num_desc = 0; p_stage->feedback.duration += os_time_get_nano() - stage_start; } @@ -1830,7 +1843,7 @@ pipeline_stage_get_nir(struct v3dv_pipeline_stage *p_stage, if (nir) { assert(nir->info.stage == broadcom_shader_stage_to_gl(p_stage->stage)); - /* A NIR cach hit doesn't avoid the large majority of pipeline stage + /* A NIR cache hit doesn't avoid the large majority of pipeline stage * creation so the cache hit is not recorded in the pipeline feedback * flags */ @@ -1866,53 +1879,34 @@ pipeline_stage_get_nir(struct v3dv_pipeline_stage *p_stage, return NULL; } -static void -pipeline_hash_shader(const struct vk_shader_module *module, - const char *entrypoint, - gl_shader_stage stage, - const VkSpecializationInfo *spec_info, - unsigned char *sha1_out) -{ - struct mesa_sha1 ctx; - _mesa_sha1_init(&ctx); - - _mesa_sha1_update(&ctx, module->sha1, sizeof(module->sha1)); - _mesa_sha1_update(&ctx, entrypoint, strlen(entrypoint)); - _mesa_sha1_update(&ctx, &stage, sizeof(stage)); - if (spec_info) { - _mesa_sha1_update(&ctx, spec_info->pMapEntries, - spec_info->mapEntryCount * - sizeof(*spec_info->pMapEntries)); - _mesa_sha1_update(&ctx, spec_info->pData, - spec_info->dataSize); - } - - _mesa_sha1_final(&ctx, sha1_out); -} - static VkResult pipeline_compile_vertex_shader(struct v3dv_pipeline *pipeline, const VkAllocationCallbacks *pAllocator, const VkGraphicsPipelineCreateInfo *pCreateInfo) { - assert(pipeline->vs_bin != NULL); - if (pipeline->vs_bin->nir == NULL) { - assert(pipeline->vs->nir); - pipeline->vs_bin->nir = nir_shader_clone(NULL, pipeline->vs->nir); + struct v3dv_pipeline_stage *p_stage_vs = + pipeline->stages[BROADCOM_SHADER_VERTEX]; + struct v3dv_pipeline_stage *p_stage_vs_bin = + pipeline->stages[BROADCOM_SHADER_VERTEX_BIN]; + + assert(p_stage_vs_bin != NULL); + if (p_stage_vs_bin->nir == NULL) { + assert(p_stage_vs->nir); + p_stage_vs_bin->nir = nir_shader_clone(NULL, p_stage_vs->nir); } VkResult vk_result; struct v3d_vs_key key; - pipeline_populate_v3d_vs_key(&key, pCreateInfo, pipeline->vs); + pipeline_populate_v3d_vs_key(&key, pCreateInfo, p_stage_vs); pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX] = - pipeline_compile_shader_variant(pipeline->vs, &key.base, sizeof(key), + pipeline_compile_shader_variant(p_stage_vs, &key.base, sizeof(key), pAllocator, &vk_result); if (vk_result != VK_SUCCESS) return vk_result; - pipeline_populate_v3d_vs_key(&key, pCreateInfo, pipeline->vs_bin); + pipeline_populate_v3d_vs_key(&key, pCreateInfo, p_stage_vs_bin); pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN] = - pipeline_compile_shader_variant(pipeline->vs_bin, &key.base, sizeof(key), + pipeline_compile_shader_variant(p_stage_vs_bin, &key.base, sizeof(key), pAllocator, &vk_result); return vk_result; @@ -1923,26 +1917,30 @@ pipeline_compile_geometry_shader(struct v3dv_pipeline *pipeline, const VkAllocationCallbacks *pAllocator, const VkGraphicsPipelineCreateInfo *pCreateInfo) { - assert(pipeline->gs); + struct v3dv_pipeline_stage *p_stage_gs = + pipeline->stages[BROADCOM_SHADER_GEOMETRY]; + struct v3dv_pipeline_stage *p_stage_gs_bin = + pipeline->stages[BROADCOM_SHADER_GEOMETRY_BIN]; - assert(pipeline->gs_bin != NULL); - if (pipeline->gs_bin->nir == NULL) { - assert(pipeline->gs->nir); - pipeline->gs_bin->nir = nir_shader_clone(NULL, pipeline->gs->nir); + assert(p_stage_gs); + assert(p_stage_gs_bin != NULL); + if (p_stage_gs_bin->nir == NULL) { + assert(p_stage_gs->nir); + 
p_stage_gs_bin->nir = nir_shader_clone(NULL, p_stage_gs->nir); } VkResult vk_result; struct v3d_gs_key key; - pipeline_populate_v3d_gs_key(&key, pCreateInfo, pipeline->gs); + pipeline_populate_v3d_gs_key(&key, pCreateInfo, p_stage_gs); pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY] = - pipeline_compile_shader_variant(pipeline->gs, &key.base, sizeof(key), + pipeline_compile_shader_variant(p_stage_gs, &key.base, sizeof(key), pAllocator, &vk_result); if (vk_result != VK_SUCCESS) return vk_result; - pipeline_populate_v3d_gs_key(&key, pCreateInfo, pipeline->gs_bin); + pipeline_populate_v3d_gs_key(&key, pCreateInfo, p_stage_gs_bin); pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN] = - pipeline_compile_shader_variant(pipeline->gs_bin, &key.base, sizeof(key), + pipeline_compile_shader_variant(p_stage_gs_bin, &key.base, sizeof(key), pAllocator, &vk_result); return vk_result; @@ -1953,19 +1951,26 @@ pipeline_compile_fragment_shader(struct v3dv_pipeline *pipeline, const VkAllocationCallbacks *pAllocator, const VkGraphicsPipelineCreateInfo *pCreateInfo) { - struct v3dv_pipeline_stage *p_stage = pipeline->vs; - - p_stage = pipeline->fs; + struct v3dv_pipeline_stage *p_stage_vs = + pipeline->stages[BROADCOM_SHADER_VERTEX]; + struct v3dv_pipeline_stage *p_stage_fs = + pipeline->stages[BROADCOM_SHADER_FRAGMENT]; + struct v3dv_pipeline_stage *p_stage_gs = + pipeline->stages[BROADCOM_SHADER_GEOMETRY]; struct v3d_fs_key key; + pipeline_populate_v3d_fs_key(&key, pCreateInfo, &pipeline->rendering_info, + p_stage_fs, p_stage_gs != NULL, + get_ucp_enable_mask(p_stage_vs)); - pipeline_populate_v3d_fs_key(&key, pCreateInfo, p_stage, - pipeline->gs != NULL, - get_ucp_enable_mask(pipeline->vs)); + if (key.is_points) { + assert(key.point_coord_upper_left); + NIR_PASS(_, p_stage_fs->nir, v3d_nir_lower_point_coord); + } VkResult vk_result; pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT] = - pipeline_compile_shader_variant(p_stage, &key.base, sizeof(key), + pipeline_compile_shader_variant(p_stage_fs, &key.base, sizeof(key), pAllocator, &vk_result); return vk_result; @@ -1976,16 +1981,20 @@ pipeline_populate_graphics_key(struct v3dv_pipeline *pipeline, struct v3dv_pipeline_key *key, const VkGraphicsPipelineCreateInfo *pCreateInfo) { + struct v3dv_device *device = pipeline->device; + assert(device); + memset(key, 0, sizeof(*key)); - key->robust_buffer_access = - pipeline->device->features.robustBufferAccess; + + key->line_smooth = pipeline->line_smooth; const bool raster_enabled = + pCreateInfo->pRasterizationState && !pCreateInfo->pRasterizationState->rasterizerDiscardEnable; const VkPipelineInputAssemblyStateCreateInfo *ia_info = pCreateInfo->pInputAssemblyState; - key->topology = vk_to_pipe_prim_type[ia_info->topology]; + key->topology = vk_to_mesa_prim[ia_info->topology]; const VkPipelineColorBlendStateCreateInfo *cb_info = raster_enabled ? 
pCreateInfo->pColorBlendState : NULL; @@ -2004,34 +2013,32 @@ pipeline_populate_graphics_key(struct v3dv_pipeline *pipeline, ms_info->rasterizationSamples == VK_SAMPLE_COUNT_4_BIT); key->msaa = ms_info->rasterizationSamples > VK_SAMPLE_COUNT_1_BIT; - if (key->msaa) { - key->sample_coverage = - pipeline->sample_mask != (1 << V3D_MAX_SAMPLES) - 1; + if (key->msaa) key->sample_alpha_to_coverage = ms_info->alphaToCoverageEnable; - key->sample_alpha_to_one = ms_info->alphaToOneEnable; - } + + key->sample_alpha_to_one = ms_info->alphaToOneEnable; } - const struct v3dv_render_pass *pass = - v3dv_render_pass_from_handle(pCreateInfo->renderPass); - const struct v3dv_subpass *subpass = pipeline->subpass; - for (uint32_t i = 0; i < subpass->color_count; i++) { - const uint32_t att_idx = subpass->color_attachments[i].attachment; - if (att_idx == VK_ATTACHMENT_UNUSED) + struct vk_render_pass_state *ri = &pipeline->rendering_info; + for (uint32_t i = 0; i < ri->color_attachment_count; i++) { + if (ri->color_attachment_formats[i] == VK_FORMAT_UNDEFINED) continue; key->cbufs |= 1 << i; - VkFormat fb_format = pass->attachments[att_idx].desc.format; + VkFormat fb_format = ri->color_attachment_formats[i]; enum pipe_format fb_pipe_format = vk_format_to_pipe_format(fb_format); /* If logic operations are enabled then we might emit color reads and we * need to know the color buffer format and swizzle for that */ if (key->logicop_func != PIPE_LOGICOP_COPY) { + /* Framebuffer formats should be single plane */ + assert(vk_format_get_plane_count(fb_format) == 1); key->color_fmt[i].format = fb_pipe_format; - key->color_fmt[i].swizzle = v3dv_get_format_swizzle(pipeline->device, - fb_format); + memcpy(key->color_fmt[i].swizzle, + v3dv_get_format_swizzle(pipeline->device, fb_format, 0), + sizeof(key->color_fmt[i].swizzle)); } const struct util_format_description *desc = @@ -2049,12 +2056,13 @@ pipeline_populate_graphics_key(struct v3dv_pipeline *pipeline, const VkVertexInputAttributeDescription *desc = &vi_info->pVertexAttributeDescriptions[i]; assert(desc->location < MAX_VERTEX_ATTRIBS); - if (desc->format == VK_FORMAT_B8G8R8A8_UNORM) + if (desc->format == VK_FORMAT_B8G8R8A8_UNORM || + desc->format == VK_FORMAT_A2R10G10B10_UNORM_PACK32) { key->va_swap_rb_mask |= 1 << (VERT_ATTRIB_GENERIC0 + desc->location); + } } - assert(pipeline->subpass); - key->has_multiview = pipeline->subpass->view_mask != 0; + key->has_multiview = ri->view_mask != 0; } static void @@ -2062,14 +2070,15 @@ pipeline_populate_compute_key(struct v3dv_pipeline *pipeline, struct v3dv_pipeline_key *key, const VkComputePipelineCreateInfo *pCreateInfo) { + struct v3dv_device *device = pipeline->device; + assert(device); + /* We use the same pipeline key for graphics and compute, but we don't need * to add a field to flag compute keys because this key is not used alone * to search in the cache, we also use the SPIR-V or the serialized NIR for * example, which already flags compute shaders. 
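pipeline_populate_graphics_key() above folds the bound color attachments into bitmasks: one bit per used attachment plus a mask of 32-bit float render buffers. A self-contained sketch of that mask building; is_32bit_float_format() is a stand-in (covering a few representative formats) for the util_format_description() channel check in the real code:

#include <stdbool.h>
#include <stdint.h>
#include <vulkan/vulkan.h>

static bool
is_32bit_float_format(VkFormat f)
{
   return f == VK_FORMAT_R32_SFLOAT ||
          f == VK_FORMAT_R32G32_SFLOAT ||
          f == VK_FORMAT_R32G32B32A32_SFLOAT;
}

static void
build_color_masks(const VkFormat *formats, uint32_t count,
                  uint32_t *cbufs, uint32_t *f32_color_rb)
{
   *cbufs = 0;
   *f32_color_rb = 0;
   for (uint32_t i = 0; i < count; i++) {
      if (formats[i] == VK_FORMAT_UNDEFINED)
         continue; /* unused attachment slot */
      *cbufs |= 1u << i;
      if (is_32bit_float_format(formats[i]))
         *f32_color_rb |= 1u << i;
   }
}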
*/ memset(key, 0, sizeof(*key)); - key->robust_buffer_access = - pipeline->device->features.robustBufferAccess; } static struct v3dv_pipeline_shared_data * @@ -2102,9 +2111,10 @@ v3dv_pipeline_shared_data_new_empty(const unsigned char sha1_key[20], continue; } - if (stage == BROADCOM_SHADER_GEOMETRY && !pipeline->gs) { + if (stage == BROADCOM_SHADER_GEOMETRY && + !pipeline->stages[BROADCOM_SHADER_GEOMETRY]) { /* We always inject a custom GS if we have multiview */ - if (!pipeline->subpass->view_mask) + if (!pipeline->rendering_info.view_mask) continue; } @@ -2146,69 +2156,52 @@ fail: static void write_creation_feedback(struct v3dv_pipeline *pipeline, const void *next, - const VkPipelineCreationFeedbackEXT *pipeline_feedback, + const VkPipelineCreationFeedback *pipeline_feedback, uint32_t stage_count, const VkPipelineShaderStageCreateInfo *stages) { - const VkPipelineCreationFeedbackCreateInfoEXT *create_feedback = - vk_find_struct_const(next, PIPELINE_CREATION_FEEDBACK_CREATE_INFO_EXT); + const VkPipelineCreationFeedbackCreateInfo *create_feedback = + vk_find_struct_const(next, PIPELINE_CREATION_FEEDBACK_CREATE_INFO); if (create_feedback) { typed_memcpy(create_feedback->pPipelineCreationFeedback, pipeline_feedback, 1); - assert(stage_count == create_feedback->pipelineStageCreationFeedbackCount); + const uint32_t feedback_stage_count = + create_feedback->pipelineStageCreationFeedbackCount; + assert(feedback_stage_count <= stage_count); - for (uint32_t i = 0; i < stage_count; i++) { + for (uint32_t i = 0; i < feedback_stage_count; i++) { gl_shader_stage s = vk_to_mesa_shader_stage(stages[i].stage); - switch (s) { - case MESA_SHADER_VERTEX: - create_feedback->pPipelineStageCreationFeedbacks[i] = - pipeline->vs->feedback; - - create_feedback->pPipelineStageCreationFeedbacks[i].duration += - pipeline->vs_bin->feedback.duration; - break; + enum broadcom_shader_stage bs = gl_shader_stage_to_broadcom(s); - case MESA_SHADER_GEOMETRY: - create_feedback->pPipelineStageCreationFeedbacks[i] = - pipeline->gs->feedback; + create_feedback->pPipelineStageCreationFeedbacks[i] = + pipeline->stages[bs]->feedback; + if (broadcom_shader_stage_is_render_with_binning(bs)) { + enum broadcom_shader_stage bs_bin = + broadcom_binning_shader_stage_for_render_stage(bs); create_feedback->pPipelineStageCreationFeedbacks[i].duration += - pipeline->gs_bin->feedback.duration; - break; - - case MESA_SHADER_FRAGMENT: - create_feedback->pPipelineStageCreationFeedbacks[i] = - pipeline->fs->feedback; - break; - - case MESA_SHADER_COMPUTE: - create_feedback->pPipelineStageCreationFeedbacks[i] = - pipeline->cs->feedback; - break; - - default: - unreachable("not supported shader stage"); + pipeline->stages[bs_bin]->feedback.duration; } } } } -static uint32_t +static enum mesa_prim multiview_gs_input_primitive_from_pipeline(struct v3dv_pipeline *pipeline) { switch (pipeline->topology) { - case PIPE_PRIM_POINTS: - return GL_POINTS; - case PIPE_PRIM_LINES: - case PIPE_PRIM_LINE_STRIP: - return GL_LINES; - case PIPE_PRIM_TRIANGLES: - case PIPE_PRIM_TRIANGLE_STRIP: - case PIPE_PRIM_TRIANGLE_FAN: - return GL_TRIANGLES; + case MESA_PRIM_POINTS: + return MESA_PRIM_POINTS; + case MESA_PRIM_LINES: + case MESA_PRIM_LINE_STRIP: + return MESA_PRIM_LINES; + case MESA_PRIM_TRIANGLES: + case MESA_PRIM_TRIANGLE_STRIP: + case MESA_PRIM_TRIANGLE_FAN: + return MESA_PRIM_TRIANGLES; default: /* Since we don't allow GS with multiview, we can only see non-adjacency * primitives. 
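write_creation_feedback() above reports one feedback slot per API stage, so the binning variant's compile time is folded into its render stage's entry. A trivial sketch of that merge with simplified stand-in types:

#include <stdint.h>

struct stage_feedback { uint64_t duration; }; /* nanoseconds */

static void
merge_binning_feedback(struct stage_feedback *render,
                       const struct stage_feedback *binning)
{
   render->duration += binning->duration; /* report combined compile time */
}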
@@ -2217,19 +2210,19 @@ multiview_gs_input_primitive_from_pipeline(struct v3dv_pipeline *pipeline) } } -static uint32_t +static enum mesa_prim multiview_gs_output_primitive_from_pipeline(struct v3dv_pipeline *pipeline) { switch (pipeline->topology) { - case PIPE_PRIM_POINTS: - return GL_POINTS; - case PIPE_PRIM_LINES: - case PIPE_PRIM_LINE_STRIP: - return GL_LINE_STRIP; - case PIPE_PRIM_TRIANGLES: - case PIPE_PRIM_TRIANGLE_STRIP: - case PIPE_PRIM_TRIANGLE_FAN: - return GL_TRIANGLE_STRIP; + case MESA_PRIM_POINTS: + return MESA_PRIM_POINTS; + case MESA_PRIM_LINES: + case MESA_PRIM_LINE_STRIP: + return MESA_PRIM_LINE_STRIP; + case MESA_PRIM_TRIANGLES: + case MESA_PRIM_TRIANGLE_STRIP: + case MESA_PRIM_TRIANGLE_FAN: + return MESA_PRIM_TRIANGLE_STRIP; default: /* Since we don't allow GS with multiview, we can only see non-adjacency * primitives. @@ -2244,8 +2237,9 @@ pipeline_add_multiview_gs(struct v3dv_pipeline *pipeline, const VkAllocationCallbacks *pAllocator) { /* Create the passthrough GS from the VS output interface */ - pipeline->vs->nir = pipeline_stage_get_nir(pipeline->vs, pipeline, cache); - nir_shader *vs_nir = pipeline->vs->nir; + struct v3dv_pipeline_stage *p_stage_vs = pipeline->stages[BROADCOM_SHADER_VERTEX]; + p_stage_vs->nir = pipeline_stage_get_nir(p_stage_vs, pipeline, cache); + nir_shader *vs_nir = p_stage_vs->nir; const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options(); nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_GEOMETRY, options, @@ -2255,7 +2249,7 @@ pipeline_add_multiview_gs(struct v3dv_pipeline *pipeline, nir->info.outputs_written = vs_nir->info.outputs_written | (1ull << VARYING_SLOT_LAYER); - uint32_t vertex_count = u_vertices_per_prim(pipeline->topology); + uint32_t vertex_count = mesa_vertices_per_prim(pipeline->topology); nir->info.gs.input_primitive = multiview_gs_input_primitive_from_pipeline(pipeline); nir->info.gs.output_primitive = @@ -2297,7 +2291,7 @@ pipeline_add_multiview_gs(struct v3dv_pipeline *pipeline, out_layer->data.location = VARYING_SLOT_LAYER; /* Get the view index value that we will write to gl_Layer */ - nir_ssa_def *layer = + nir_def *layer = nir_load_system_value(&b, nir_intrinsic_load_view_index, 0, 1, 32); /* Emit all output vertices */ @@ -2323,8 +2317,7 @@ pipeline_add_multiview_gs(struct v3dv_pipeline *pipeline, /* Attach the geometry shader to the pipeline */ struct v3dv_device *device = pipeline->device; - struct v3dv_physical_device *physical_device = - &device->instance->physicalDevice; + struct v3dv_physical_device *physical_device = device->pdevice; struct v3dv_pipeline_stage *p_stage = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*p_stage), 8, @@ -2340,21 +2333,36 @@ pipeline_add_multiview_gs(struct v3dv_pipeline *pipeline, p_stage->entrypoint = "main"; p_stage->module = 0; p_stage->nir = nir; - pipeline_compute_sha1_from_nir(p_stage->nir, p_stage->shader_sha1); + pipeline_compute_sha1_from_nir(p_stage); p_stage->program_id = p_atomic_inc_return(&physical_device->next_program_id); + p_stage->robustness = pipeline->stages[BROADCOM_SHADER_VERTEX]->robustness; pipeline->has_gs = true; - pipeline->gs = p_stage; + pipeline->stages[BROADCOM_SHADER_GEOMETRY] = p_stage; pipeline->active_stages |= MESA_SHADER_GEOMETRY; - pipeline->gs_bin = - pipeline_stage_create_binning(pipeline->gs, pAllocator); - if (pipeline->gs_bin == NULL) - return false; + pipeline->stages[BROADCOM_SHADER_GEOMETRY_BIN] = + pipeline_stage_create_binning(p_stage, pAllocator); + if (pipeline->stages[BROADCOM_SHADER_GEOMETRY_BIN] 
== NULL) + return false; return true; } +static void +pipeline_check_buffer_device_address(struct v3dv_pipeline *pipeline) +{ + for (int i = BROADCOM_SHADER_VERTEX; i < BROADCOM_SHADER_STAGES; i++) { + struct v3dv_shader_variant *variant = pipeline->shared_data->variants[i]; + if (variant && variant->prog_data.base->has_global_address) { + pipeline->uses_buffer_device_address = true; + return; + } + } + + pipeline->uses_buffer_device_address = false; +} + /* * This compiles a pipeline. Note that it also allocates internal objects, but if * some allocations succeed while others fail, the method does not free the @@ -2371,14 +2379,13 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline, const VkGraphicsPipelineCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator) { - VkPipelineCreationFeedbackEXT pipeline_feedback = { - .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT, + VkPipelineCreationFeedback pipeline_feedback = { + .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT, }; int64_t pipeline_start = os_time_get_nano(); struct v3dv_device *device = pipeline->device; - struct v3dv_physical_device *physical_device = - &device->instance->physicalDevice; + struct v3dv_physical_device *physical_device = device->pdevice; /* First pass to get some common info from the shader, and create the * individual pipeline_stage objects @@ -2394,26 +2401,24 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline, if (p_stage == NULL) return VK_ERROR_OUT_OF_HOST_MEMORY; - /* Note that we are assigning program_id slightly differently that - * v3d. Here we are assigning one per pipeline stage, so vs and vs_bin - * would have a different program_id, while v3d would have the same for - * both. For the case of v3dv, it is more natural to have an id this way, - * as right now we are using it for debugging, not for shader-db. - */ p_stage->program_id = p_atomic_inc_return(&physical_device->next_program_id); + enum broadcom_shader_stage broadcom_stage = + gl_shader_stage_to_broadcom(stage); + p_stage->pipeline = pipeline; - p_stage->stage = gl_shader_stage_to_broadcom(stage); + p_stage->stage = broadcom_stage; p_stage->entrypoint = sinfo->pName; p_stage->module = vk_shader_module_from_handle(sinfo->module); p_stage->spec_info = sinfo->pSpecializationInfo; - pipeline_hash_shader(p_stage->module, - p_stage->entrypoint, - stage, - p_stage->spec_info, - p_stage->shader_sha1); + vk_pipeline_robustness_state_fill(&device->vk, &p_stage->robustness, + pCreateInfo->pNext, sinfo->pNext); + + vk_pipeline_hash_shader_stage(&pCreateInfo->pStages[i], + &p_stage->robustness, + p_stage->shader_sha1); pipeline->active_stages |= sinfo->stage; @@ -2421,36 +2426,24 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline, * worry about getting the nir shader for now.
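pipeline_check_buffer_device_address() above derives a pipeline-wide flag from the compiled variants. The scan pattern, reduced to stand-in types:

#include <stdbool.h>
#include <stddef.h>

struct variant { bool has_global_address; };

static bool
pipeline_uses_bda(struct variant *const *variants, int count)
{
   for (int i = 0; i < count; i++) {
      if (variants[i] && variants[i]->has_global_address)
         return true; /* one variant using global addresses is enough */
   }
   return false;
}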
*/ p_stage->nir = NULL; - - switch(stage) { - case MESA_SHADER_VERTEX: - pipeline->vs = p_stage; - pipeline->vs_bin = - pipeline_stage_create_binning(pipeline->vs, pAllocator); - if (pipeline->vs_bin == NULL) - return VK_ERROR_OUT_OF_HOST_MEMORY; - break; - - case MESA_SHADER_GEOMETRY: + pipeline->stages[broadcom_stage] = p_stage; + if (broadcom_stage == BROADCOM_SHADER_GEOMETRY) pipeline->has_gs = true; - pipeline->gs = p_stage; - pipeline->gs_bin = - pipeline_stage_create_binning(pipeline->gs, pAllocator); - if (pipeline->gs_bin == NULL) - return VK_ERROR_OUT_OF_HOST_MEMORY; - break; - case MESA_SHADER_FRAGMENT: - pipeline->fs = p_stage; - break; + if (broadcom_shader_stage_is_render_with_binning(broadcom_stage)) { + enum broadcom_shader_stage broadcom_stage_bin = + broadcom_binning_shader_stage_for_render_stage(broadcom_stage); - default: - unreachable("not supported shader stage"); + pipeline->stages[broadcom_stage_bin] = + pipeline_stage_create_binning(p_stage, pAllocator); + + if (pipeline->stages[broadcom_stage_bin] == NULL) + return VK_ERROR_OUT_OF_HOST_MEMORY; } } /* Add a no-op fragment shader if needed */ - if (!pipeline->fs) { + if (!pipeline->stages[BROADCOM_SHADER_FRAGMENT]) { nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, &v3dv_nir_options, "noop_fs"); @@ -2467,109 +2460,126 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline, p_stage->entrypoint = "main"; p_stage->module = 0; p_stage->nir = b.shader; - pipeline_compute_sha1_from_nir(p_stage->nir, p_stage->shader_sha1); + vk_pipeline_robustness_state_fill(&device->vk, &p_stage->robustness, + NULL, NULL); + pipeline_compute_sha1_from_nir(p_stage); p_stage->program_id = p_atomic_inc_return(&physical_device->next_program_id); - pipeline->fs = p_stage; + pipeline->stages[BROADCOM_SHADER_FRAGMENT] = p_stage; pipeline->active_stages |= MESA_SHADER_FRAGMENT; } /* If multiview is enabled, we inject a custom passthrough geometry shader * to broadcast draw calls to the appropriate views. 
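The injected passthrough geometry shader exists to broadcast each draw to every view selected by the view mask, writing gl_Layer accordingly. A sketch of the broadcast idea only; emit_for_view() is hypothetical and the real work happens in the generated NIR:

#include <stdint.h>

static void
broadcast_draw(uint32_t view_mask, void (*emit_for_view)(uint32_t view))
{
   while (view_mask) {
      uint32_t view = __builtin_ctz(view_mask); /* lowest set bit */
      emit_for_view(view);
      view_mask &= view_mask - 1; /* clear that bit, continue with the rest */
   }
}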
*/ - assert(!pipeline->subpass->view_mask || (!pipeline->has_gs && !pipeline->gs)); - if (pipeline->subpass->view_mask) { + const uint32_t view_mask = pipeline->rendering_info.view_mask; + assert(!view_mask || + (!pipeline->has_gs && !pipeline->stages[BROADCOM_SHADER_GEOMETRY])); + if (view_mask) { if (!pipeline_add_multiview_gs(pipeline, cache, pAllocator)) return VK_ERROR_OUT_OF_HOST_MEMORY; } - /* First we try to get the variants from the pipeline cache */ - struct v3dv_pipeline_key pipeline_key; - pipeline_populate_graphics_key(pipeline, &pipeline_key, pCreateInfo); - unsigned char pipeline_sha1[20]; - pipeline_hash_graphics(pipeline, &pipeline_key, pipeline_sha1); - - bool cache_hit = false; - - pipeline->shared_data = - v3dv_pipeline_cache_search_for_pipeline(cache, - pipeline_sha1, - &cache_hit); - - if (pipeline->shared_data != NULL) { - /* A correct pipeline must have at least a VS and FS */ - assert(pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]); - assert(pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]); - assert(pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]); - assert(!pipeline->gs || - pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY]); - assert(!pipeline->gs || - pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN]); - - if (cache_hit && cache != &pipeline->device->default_pipeline_cache) - pipeline_feedback.flags |= - VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT; - - goto success; + /* First we try to get the variants from the pipeline cache (unless we are + * required to capture internal representations, since in that case we need + * to compile). + */ + bool needs_executable_info = + pCreateInfo->flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR; + if (!needs_executable_info) { + struct v3dv_pipeline_key pipeline_key; + pipeline_populate_graphics_key(pipeline, &pipeline_key, pCreateInfo); + pipeline_hash_graphics(pipeline, &pipeline_key, pipeline->sha1); + + bool cache_hit = false; + + pipeline->shared_data = + v3dv_pipeline_cache_search_for_pipeline(cache, + pipeline->sha1, + &cache_hit); + + if (pipeline->shared_data != NULL) { + /* A correct pipeline must have at least a VS and FS */ + assert(pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]); + assert(pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]); + assert(pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]); + assert(!pipeline->stages[BROADCOM_SHADER_GEOMETRY] || + pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY]); + assert(!pipeline->stages[BROADCOM_SHADER_GEOMETRY] || + pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN]); + + if (cache_hit && cache != &pipeline->device->default_pipeline_cache) + pipeline_feedback.flags |= + VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT; + + goto success; + } } - if (pCreateInfo->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT) - return VK_PIPELINE_COMPILE_REQUIRED_EXT; + if (pCreateInfo->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT) + return VK_PIPELINE_COMPILE_REQUIRED; /* Otherwise we try to get the NIR shaders (either from the original SPIR-V * shader or the pipeline cache) and compile.
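The control flow above: skip the cache when executable info must be captured, otherwise try the cache, and only then honor FAIL_ON_PIPELINE_COMPILE_REQUIRED before compiling. A condensed sketch with hypothetical callbacks:

#include <stdbool.h>
#include <stddef.h>

enum get_variants_result { HIT, COMPILE_REQUIRED, COMPILED };

static enum get_variants_result
get_variants(bool needs_executable_info, bool fail_on_compile_required,
             bool (*cache_search)(void), void (*compile)(void))
{
   if (!needs_executable_info && cache_search())
      return HIT;                 /* reuse cached shared data */
   if (fail_on_compile_required)
      return COMPILE_REQUIRED;    /* app forbids compiling here */
   compile();
   return COMPILED;
}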
*/ pipeline->shared_data = - v3dv_pipeline_shared_data_new_empty(pipeline_sha1, pipeline, true); - - pipeline->vs->feedback.flags |= - VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT; - if (pipeline->gs) - pipeline->gs->feedback.flags |= - VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT; - pipeline->fs->feedback.flags |= - VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT; - - if (!pipeline->vs->nir) - pipeline->vs->nir = pipeline_stage_get_nir(pipeline->vs, pipeline, cache); - if (pipeline->gs && !pipeline->gs->nir) - pipeline->gs->nir = pipeline_stage_get_nir(pipeline->gs, pipeline, cache); - if (!pipeline->fs->nir) - pipeline->fs->nir = pipeline_stage_get_nir(pipeline->fs, pipeline, cache); + v3dv_pipeline_shared_data_new_empty(pipeline->sha1, pipeline, true); + if (!pipeline->shared_data) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + struct v3dv_pipeline_stage *p_stage_vs = pipeline->stages[BROADCOM_SHADER_VERTEX]; + struct v3dv_pipeline_stage *p_stage_fs = pipeline->stages[BROADCOM_SHADER_FRAGMENT]; + struct v3dv_pipeline_stage *p_stage_gs = pipeline->stages[BROADCOM_SHADER_GEOMETRY]; + + p_stage_vs->feedback.flags |= + VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT; + if (p_stage_gs) + p_stage_gs->feedback.flags |= + VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT; + p_stage_fs->feedback.flags |= + VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT; + + if (!p_stage_vs->nir) + p_stage_vs->nir = pipeline_stage_get_nir(p_stage_vs, pipeline, cache); + if (p_stage_gs && !p_stage_gs->nir) + p_stage_gs->nir = pipeline_stage_get_nir(p_stage_gs, pipeline, cache); + if (!p_stage_fs->nir) + p_stage_fs->nir = pipeline_stage_get_nir(p_stage_fs, pipeline, cache); /* Linking + pipeline lowerings */ - if (pipeline->gs) { - link_shaders(pipeline->gs->nir, pipeline->fs->nir); - link_shaders(pipeline->vs->nir, pipeline->gs->nir); + if (p_stage_gs) { + link_shaders(p_stage_gs->nir, p_stage_fs->nir); + link_shaders(p_stage_vs->nir, p_stage_gs->nir); } else { - link_shaders(pipeline->vs->nir, pipeline->fs->nir); + link_shaders(p_stage_vs->nir, p_stage_fs->nir); } - pipeline_lower_nir(pipeline, pipeline->fs, pipeline->layout); - lower_fs_io(pipeline->fs->nir); + pipeline_lower_nir(pipeline, p_stage_fs, pipeline->layout); + lower_fs_io(p_stage_fs->nir); - if (pipeline->gs) { - pipeline_lower_nir(pipeline, pipeline->gs, pipeline->layout); - lower_gs_io(pipeline->gs->nir); + if (p_stage_gs) { + pipeline_lower_nir(pipeline, p_stage_gs, pipeline->layout); + lower_gs_io(p_stage_gs->nir); } - pipeline_lower_nir(pipeline, pipeline->vs, pipeline->layout); - lower_vs_io(pipeline->vs->nir); + pipeline_lower_nir(pipeline, p_stage_vs, pipeline->layout); + lower_vs_io(p_stage_vs->nir); /* Compiling to vir */ VkResult vk_result; /* We should have got all the variants or no variants from the cache */ assert(!pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]); - vk_result = pipeline_compile_fragment_shader(pipeline, pAllocator, pCreateInfo); + vk_result = pipeline_compile_fragment_shader(pipeline, pAllocator, + pCreateInfo); if (vk_result != VK_SUCCESS) return vk_result; assert(!pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY] && !pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN]); - if (pipeline->gs) { + if (p_stage_gs) { vk_result = pipeline_compile_geometry_shader(pipeline, pAllocator, pCreateInfo); if (vk_result != VK_SUCCESS) @@ -2590,6 +2600,8 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline, success: + pipeline_check_buffer_device_address(pipeline); + pipeline_feedback.duration = os_time_get_nano() - pipeline_start; 
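The feedback durations above come from differencing monotonic timestamps around each stage and around the whole pipeline build. A portable approximation of os_time_get_nano() using POSIX clock_gettime():

#include <stdint.h>
#include <time.h>

static int64_t
now_ns(void)
{
   struct timespec ts;
   clock_gettime(CLOCK_MONOTONIC, &ts);
   return (int64_t)ts.tv_sec * 1000000000 + ts.tv_nsec;
}

/* usage: int64_t start = now_ns(); ... duration += now_ns() - start; */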
write_creation_feedback(pipeline, pCreateInfo->pNext, @@ -2600,7 +2612,8 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline, /* Since we have the variants in the pipeline shared data we can now free * the pipeline stages. */ - pipeline_free_stages(device, pipeline, pAllocator); + if (!needs_executable_info) + pipeline_free_stages(device, pipeline, pAllocator); pipeline_check_spill_size(pipeline); @@ -2638,139 +2651,11 @@ compute_vpm_config(struct v3dv_pipeline *pipeline) return VK_SUCCESS; } -static unsigned -v3dv_dynamic_state_mask(VkDynamicState state) -{ - switch(state) { - case VK_DYNAMIC_STATE_VIEWPORT: - return V3DV_DYNAMIC_VIEWPORT; - case VK_DYNAMIC_STATE_SCISSOR: - return V3DV_DYNAMIC_SCISSOR; - case VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK: - return V3DV_DYNAMIC_STENCIL_COMPARE_MASK; - case VK_DYNAMIC_STATE_STENCIL_WRITE_MASK: - return V3DV_DYNAMIC_STENCIL_WRITE_MASK; - case VK_DYNAMIC_STATE_STENCIL_REFERENCE: - return V3DV_DYNAMIC_STENCIL_REFERENCE; - case VK_DYNAMIC_STATE_BLEND_CONSTANTS: - return V3DV_DYNAMIC_BLEND_CONSTANTS; - case VK_DYNAMIC_STATE_DEPTH_BIAS: - return V3DV_DYNAMIC_DEPTH_BIAS; - case VK_DYNAMIC_STATE_LINE_WIDTH: - return V3DV_DYNAMIC_LINE_WIDTH; - case VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT: - return V3DV_DYNAMIC_COLOR_WRITE_ENABLE; - - /* Depth bounds testing is not available in in V3D 4.2 so here we are just - * ignoring this dynamic state. We are already asserting at pipeline creation - * time that depth bounds testing is not enabled. - */ - case VK_DYNAMIC_STATE_DEPTH_BOUNDS: - return 0; - - default: - unreachable("Unhandled dynamic state"); - } -} - -static void -pipeline_init_dynamic_state( - struct v3dv_pipeline *pipeline, - const VkPipelineDynamicStateCreateInfo *pDynamicState, - const VkPipelineViewportStateCreateInfo *pViewportState, - const VkPipelineDepthStencilStateCreateInfo *pDepthStencilState, - const VkPipelineColorBlendStateCreateInfo *pColorBlendState, - const VkPipelineRasterizationStateCreateInfo *pRasterizationState, - const VkPipelineColorWriteCreateInfoEXT *pColorWriteState) -{ - pipeline->dynamic_state = default_dynamic_state; - struct v3dv_dynamic_state *dynamic = &pipeline->dynamic_state; - - /* Create a mask of enabled dynamic states */ - uint32_t dynamic_states = 0; - if (pDynamicState) { - uint32_t count = pDynamicState->dynamicStateCount; - for (uint32_t s = 0; s < count; s++) { - dynamic_states |= - v3dv_dynamic_state_mask(pDynamicState->pDynamicStates[s]); - } - } - - /* For any pipeline states that are not dynamic, set the dynamic state - * from the static pipeline state. 
- */ - if (pViewportState) { - if (!(dynamic_states & V3DV_DYNAMIC_VIEWPORT)) { - dynamic->viewport.count = pViewportState->viewportCount; - typed_memcpy(dynamic->viewport.viewports, pViewportState->pViewports, - pViewportState->viewportCount); - - for (uint32_t i = 0; i < dynamic->viewport.count; i++) { - v3dv_viewport_compute_xform(&dynamic->viewport.viewports[i], - dynamic->viewport.scale[i], - dynamic->viewport.translate[i]); - } - } - - if (!(dynamic_states & V3DV_DYNAMIC_SCISSOR)) { - dynamic->scissor.count = pViewportState->scissorCount; - typed_memcpy(dynamic->scissor.scissors, pViewportState->pScissors, - pViewportState->scissorCount); - } - } - - if (pDepthStencilState) { - if (!(dynamic_states & V3DV_DYNAMIC_STENCIL_COMPARE_MASK)) { - dynamic->stencil_compare_mask.front = - pDepthStencilState->front.compareMask; - dynamic->stencil_compare_mask.back = - pDepthStencilState->back.compareMask; - } - - if (!(dynamic_states & V3DV_DYNAMIC_STENCIL_WRITE_MASK)) { - dynamic->stencil_write_mask.front = pDepthStencilState->front.writeMask; - dynamic->stencil_write_mask.back = pDepthStencilState->back.writeMask; - } - - if (!(dynamic_states & V3DV_DYNAMIC_STENCIL_REFERENCE)) { - dynamic->stencil_reference.front = pDepthStencilState->front.reference; - dynamic->stencil_reference.back = pDepthStencilState->back.reference; - } - } - - if (pColorBlendState && !(dynamic_states & V3DV_DYNAMIC_BLEND_CONSTANTS)) { - memcpy(dynamic->blend_constants, pColorBlendState->blendConstants, - sizeof(dynamic->blend_constants)); - } - - if (pRasterizationState) { - if (pRasterizationState->depthBiasEnable && - !(dynamic_states & V3DV_DYNAMIC_DEPTH_BIAS)) { - dynamic->depth_bias.constant_factor = - pRasterizationState->depthBiasConstantFactor; - dynamic->depth_bias.depth_bias_clamp = - pRasterizationState->depthBiasClamp; - dynamic->depth_bias.slope_factor = - pRasterizationState->depthBiasSlopeFactor; - } - if (!(dynamic_states & V3DV_DYNAMIC_LINE_WIDTH)) - dynamic->line_width = pRasterizationState->lineWidth; - } - - if (pColorWriteState && !(dynamic_states & V3DV_DYNAMIC_COLOR_WRITE_ENABLE)) { - dynamic->color_write_enable = 0; - for (uint32_t i = 0; i < pColorWriteState->attachmentCount; i++) - dynamic->color_write_enable |= pColorWriteState->pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0; - } - - pipeline->dynamic_state.mask = dynamic_states; -} - static bool -stencil_op_is_no_op(const VkStencilOpState *stencil) +stencil_op_is_no_op(struct vk_stencil_test_face_state *stencil) { - return stencil->depthFailOp == VK_STENCIL_OP_KEEP && - stencil->compareOp == VK_COMPARE_OP_ALWAYS; + return stencil->op.depth_fail == VK_STENCIL_OP_KEEP && + stencil->op.compare == VK_COMPARE_OP_ALWAYS; } static void @@ -2786,113 +2671,63 @@ enable_depth_bias(struct v3dv_pipeline *pipeline, /* Check the depth/stencil attachment description for the subpass used with * this pipeline. 
*/ - assert(pipeline->pass && pipeline->subpass); - struct v3dv_render_pass *pass = pipeline->pass; - struct v3dv_subpass *subpass = pipeline->subpass; - - if (subpass->ds_attachment.attachment == VK_ATTACHMENT_UNUSED) - return; - - assert(subpass->ds_attachment.attachment < pass->attachment_count); - struct v3dv_render_pass_attachment *att = - &pass->attachments[subpass->ds_attachment.attachment]; - - if (att->desc.format == VK_FORMAT_D16_UNORM) + VkFormat ds_format = pipeline->rendering_info.depth_attachment_format; + if (ds_format == VK_FORMAT_D16_UNORM) pipeline->depth_bias.is_z16 = true; pipeline->depth_bias.enabled = true; } -static void -pipeline_set_ez_state(struct v3dv_pipeline *pipeline, - const VkPipelineDepthStencilStateCreateInfo *ds_info) +/* Computes the ez_state based on a given vk_dynamic_graphics_state. Note + * that the parameter dyn doesn't need to be pipeline->dynamic_graphics_state, + * as this method can be used by the cmd_buffer too. + */ +void +v3dv_compute_ez_state(struct vk_dynamic_graphics_state *dyn, + struct v3dv_pipeline *pipeline, + enum v3dv_ez_state *ez_state, + bool *incompatible_ez_test) { - if (!ds_info || !ds_info->depthTestEnable) { - pipeline->ez_state = V3D_EZ_DISABLED; + if (!dyn->ds.depth.test_enable) { + *ez_state = V3D_EZ_DISABLED; return; } - switch (ds_info->depthCompareOp) { + switch (dyn->ds.depth.compare_op) { case VK_COMPARE_OP_LESS: case VK_COMPARE_OP_LESS_OR_EQUAL: - pipeline->ez_state = V3D_EZ_LT_LE; + *ez_state = V3D_EZ_LT_LE; break; case VK_COMPARE_OP_GREATER: case VK_COMPARE_OP_GREATER_OR_EQUAL: - pipeline->ez_state = V3D_EZ_GT_GE; + *ez_state = V3D_EZ_GT_GE; break; case VK_COMPARE_OP_NEVER: case VK_COMPARE_OP_EQUAL: - pipeline->ez_state = V3D_EZ_UNDECIDED; + *ez_state = V3D_EZ_UNDECIDED; break; default: - pipeline->ez_state = V3D_EZ_DISABLED; + *ez_state = V3D_EZ_DISABLED; + *incompatible_ez_test = true; break; } /* If stencil is enabled and is not a no-op, we need to disable EZ */ - if (ds_info->stencilTestEnable && - (!stencil_op_is_no_op(&ds_info->front) || - !stencil_op_is_no_op(&ds_info->back))) { - pipeline->ez_state = V3D_EZ_DISABLED; + if (dyn->ds.stencil.test_enable && + (!stencil_op_is_no_op(&dyn->ds.stencil.front) || + !stencil_op_is_no_op(&dyn->ds.stencil.back))) { + *ez_state = V3D_EZ_DISABLED; } -} -static bool -pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline) -{ - for (uint8_t i = 0; i < pipeline->va_count; i++) { - if (vk_format_is_int(pipeline->va[i].vk_format)) - return true; + /* If the FS writes Z, then it may update against the chosen EZ direction */ + struct v3dv_shader_variant *fs_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]; + if (fs_variant && fs_variant->prog_data.fs->writes_z && + !fs_variant->prog_data.fs->writes_z_from_fep) { + *ez_state = V3D_EZ_DISABLED; } - return false; } -/* @pipeline can be NULL. We assume in that case that all the attributes have - * a float format (we only create an all-float BO once and we reuse it with - * all float pipelines), otherwise we look at the actual type of each - * attribute used with the specific pipeline passed in. 
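Since v3dv_compute_ez_state() above now receives the dynamic state as a parameter, the same logic can be re-evaluated at draw time when depth/stencil state is set dynamically. A hypothetical command-buffer caller (the field location on cmd_buffer is an assumption):

enum v3dv_ez_state ez_state;
bool incompatible_ez_test = false;
v3dv_compute_ez_state(&cmd_buffer->vk.dynamic_graphics_state, /* assumed location */
                      pipeline, &ez_state, &incompatible_ez_test);
/* e.g. VK_COMPARE_OP_LESS yields V3D_EZ_LT_LE, while VK_COMPARE_OP_NOT_EQUAL
 * hits the default case: EZ disabled and the test flagged incompatible. */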
- */ -struct v3dv_bo * -v3dv_pipeline_create_default_attribute_values(struct v3dv_device *device, - struct v3dv_pipeline *pipeline) -{ - uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4; - struct v3dv_bo *bo; - - bo = v3dv_bo_alloc(device, size, "default_vi_attributes", true); - - if (!bo) { - fprintf(stderr, "failed to allocate memory for the default " - "attribute values\n"); - return NULL; - } - - bool ok = v3dv_bo_map(device, bo, size); - if (!ok) { - fprintf(stderr, "failed to map default attribute values buffer\n"); - return false; - } - - uint32_t *attrs = bo->map; - uint8_t va_count = pipeline != NULL ? pipeline->va_count : 0; - for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) { - attrs[i * 4 + 0] = 0; - attrs[i * 4 + 1] = 0; - attrs[i * 4 + 2] = 0; - VkFormat attr_format = - pipeline != NULL ? pipeline->va[i].vk_format : VK_FORMAT_UNDEFINED; - if (i < va_count && vk_format_is_int(attr_format)) { - attrs[i * 4 + 3] = 1; - } else { - attrs[i * 4 + 3] = fui(1.0); - } - } - - v3dv_bo_unmap(device, bo); - - return bo; -} static void pipeline_set_sample_mask(struct v3dv_pipeline *pipeline, @@ -2918,6 +2753,135 @@ pipeline_set_sample_rate_shading(struct v3dv_pipeline *pipeline, ms_info->sampleShadingEnable; } +static void +pipeline_setup_rendering_info(struct v3dv_device *device, + struct v3dv_pipeline *pipeline, + const VkGraphicsPipelineCreateInfo *pCreateInfo, + const VkAllocationCallbacks *alloc) +{ + struct vk_render_pass_state *rp = &pipeline->rendering_info; + + if (pipeline->pass) { + assert(pipeline->subpass); + struct v3dv_render_pass *pass = pipeline->pass; + struct v3dv_subpass *subpass = pipeline->subpass; + const uint32_t attachment_idx = subpass->ds_attachment.attachment; + + rp->view_mask = subpass->view_mask; + + rp->depth_attachment_format = VK_FORMAT_UNDEFINED; + rp->stencil_attachment_format = VK_FORMAT_UNDEFINED; + rp->attachments = MESA_VK_RP_ATTACHMENT_NONE; + if (attachment_idx != VK_ATTACHMENT_UNUSED) { + VkFormat ds_format = pass->attachments[attachment_idx].desc.format; + if (vk_format_has_depth(ds_format)) { + rp->depth_attachment_format = ds_format; + rp->attachments |= MESA_VK_RP_ATTACHMENT_DEPTH_BIT; + } + if (vk_format_has_stencil(ds_format)) { + rp->stencil_attachment_format = ds_format; + rp->attachments |= MESA_VK_RP_ATTACHMENT_STENCIL_BIT; + } + } + + rp->color_attachment_count = subpass->color_count; + for (uint32_t i = 0; i < subpass->color_count; i++) { + const uint32_t attachment_idx = subpass->color_attachments[i].attachment; + if (attachment_idx == VK_ATTACHMENT_UNUSED) { + rp->color_attachment_formats[i] = VK_FORMAT_UNDEFINED; + continue; + } + rp->color_attachment_formats[i] = + pass->attachments[attachment_idx].desc.format; + rp->attachments |= MESA_VK_RP_ATTACHMENT_COLOR_BIT(i); + } + return; + } + + const VkPipelineRenderingCreateInfo *ri = + vk_find_struct_const(pCreateInfo->pNext, + PIPELINE_RENDERING_CREATE_INFO); + if (ri) { + rp->view_mask = ri->viewMask; + + rp->color_attachment_count = ri->colorAttachmentCount; + for (int i = 0; i < ri->colorAttachmentCount; i++) { + rp->color_attachment_formats[i] = ri->pColorAttachmentFormats[i]; + if (rp->color_attachment_formats[i] != VK_FORMAT_UNDEFINED) { + rp->attachments |= MESA_VK_RP_ATTACHMENT_COLOR_BIT(i); + } + } + + rp->depth_attachment_format = ri->depthAttachmentFormat; + if (ri->depthAttachmentFormat != VK_FORMAT_UNDEFINED) + rp->attachments |= MESA_VK_RP_ATTACHMENT_DEPTH_BIT; + + rp->stencil_attachment_format = ri->stencilAttachmentFormat; + if (ri->stencilAttachmentFormat != 
VK_FORMAT_UNDEFINED) + rp->attachments |= MESA_VK_RP_ATTACHMENT_STENCIL_BIT; + + return; + } + + /* From the Vulkan spec for VkPipelineRenderingCreateInfo: + * + * "if this structure is not specified, and the pipeline does not include + * a VkRenderPass, viewMask and colorAttachmentCount are 0, and + * depthAttachmentFormat and stencilAttachmentFormat are + * VK_FORMAT_UNDEFINED." + */ + pipeline->rendering_info = (struct vk_render_pass_state) { + .view_mask = 0, + .attachments = 0, + .color_attachment_count = 0, + .depth_attachment_format = VK_FORMAT_UNDEFINED, + .stencil_attachment_format = VK_FORMAT_UNDEFINED, + }; +} + +static VkResult +pipeline_init_dynamic_state(struct v3dv_device *device, + struct v3dv_pipeline *pipeline, + struct vk_graphics_pipeline_state *pipeline_state, + const VkGraphicsPipelineCreateInfo *pCreateInfo, + const VkPipelineColorWriteCreateInfoEXT *cw_info) +{ + VkResult result = VK_SUCCESS; + struct vk_graphics_pipeline_all_state all; + result = vk_graphics_pipeline_state_fill(&pipeline->device->vk, pipeline_state, + pCreateInfo, &pipeline->rendering_info, 0, + &all, NULL, 0, NULL); + if (result != VK_SUCCESS) + return result; + + vk_dynamic_graphics_state_fill(&pipeline->dynamic_graphics_state, pipeline_state); + + struct v3dv_dynamic_state *v3dv_dyn = &pipeline->dynamic; + struct vk_dynamic_graphics_state *dyn = &pipeline->dynamic_graphics_state; + + if (BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_VP_VIEWPORTS) || + BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_VP_SCISSORS)) { + /* FIXME: right now we don't support multiViewport so viewports[0] would + * work now, but would need to change if we allow multiple viewports. + */ + v3dv_X(device, viewport_compute_xform)(&dyn->vp.viewports[0], + v3dv_dyn->viewport.scale[0], + v3dv_dyn->viewport.translate[0]); + + } + + v3dv_dyn->color_write_enable = + (1ull << (4 * V3D_MAX_RENDER_TARGETS(device->devinfo.ver))) - 1; + if (cw_info && BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES)) { + v3dv_dyn->color_write_enable = 0; + for (uint32_t i = 0; i < cw_info->attachmentCount; i++) + v3dv_dyn->color_write_enable |= + cw_info->pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0; + } + + return result; +} + static VkResult pipeline_init(struct v3dv_pipeline *pipeline, struct v3dv_device *device, @@ -2928,25 +2892,34 @@ pipeline_init(struct v3dv_pipeline *pipeline, VkResult result = VK_SUCCESS; pipeline->device = device; + pipeline->flags = pCreateInfo->flags; V3DV_FROM_HANDLE(v3dv_pipeline_layout, layout, pCreateInfo->layout); pipeline->layout = layout; + v3dv_pipeline_layout_ref(pipeline->layout); V3DV_FROM_HANDLE(v3dv_render_pass, render_pass, pCreateInfo->renderPass); - assert(pCreateInfo->subpass < render_pass->subpass_count); - pipeline->pass = render_pass; - pipeline->subpass = &render_pass->subpasses[pCreateInfo->subpass]; + if (render_pass) { + assert(pCreateInfo->subpass < render_pass->subpass_count); + pipeline->pass = render_pass; + pipeline->subpass = &render_pass->subpasses[pCreateInfo->subpass]; + } + + pipeline_setup_rendering_info(device, pipeline, pCreateInfo, pAllocator); const VkPipelineInputAssemblyStateCreateInfo *ia_info = pCreateInfo->pInputAssemblyState; - pipeline->topology = vk_to_pipe_prim_type[ia_info->topology]; + pipeline->topology = vk_to_mesa_prim[ia_info->topology]; /* If rasterization is not enabled, various CreateInfo structs must be * ignored. 
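For reference, this is the application-side structure that pipeline_setup_rendering_info() above parses when no render pass is provided (standard VK_KHR_dynamic_rendering / Vulkan 1.3 API; the formats are just examples):

VkFormat color_format = VK_FORMAT_R8G8B8A8_UNORM;
VkPipelineRenderingCreateInfo rendering_info = {
   .sType = VK_STRUCTURE_TYPE_PIPELINE_RENDERING_CREATE_INFO,
   .viewMask = 0,
   .colorAttachmentCount = 1,
   .pColorAttachmentFormats = &color_format,
   .depthAttachmentFormat = VK_FORMAT_D32_SFLOAT_S8_UINT,
   .stencilAttachmentFormat = VK_FORMAT_D32_SFLOAT_S8_UINT,
};
/* Chained into VkGraphicsPipelineCreateInfo::pNext with renderPass set to
 * VK_NULL_HANDLE; the driver copies these formats into rendering_info. */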
*/ const bool raster_enabled = + pCreateInfo->pRasterizationState && !pCreateInfo->pRasterizationState->rasterizerDiscardEnable; + pipeline->rasterization_enabled = raster_enabled; + const VkPipelineViewportStateCreateInfo *vp_info = raster_enabled ? pCreateInfo->pViewportState : NULL; @@ -2957,11 +2930,17 @@ pipeline_init(struct v3dv_pipeline *pipeline, raster_enabled ? pCreateInfo->pRasterizationState : NULL; const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *pv_info = - rs_info ? vk_find_struct_const( + raster_enabled ? vk_find_struct_const( rs_info->pNext, PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT) : NULL; + const VkPipelineRasterizationLineStateCreateInfoEXT *ls_info = + raster_enabled ? vk_find_struct_const( + rs_info->pNext, + PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT) : + NULL; + const VkPipelineColorBlendStateCreateInfo *cb_info = raster_enabled ? pCreateInfo->pColorBlendState : NULL; @@ -2973,22 +2952,35 @@ pipeline_init(struct v3dv_pipeline *pipeline, PIPELINE_COLOR_WRITE_CREATE_INFO_EXT) : NULL; - pipeline_init_dynamic_state(pipeline, - pCreateInfo->pDynamicState, - vp_info, ds_info, cb_info, rs_info, cw_info); + struct vk_graphics_pipeline_state pipeline_state = { }; + result = pipeline_init_dynamic_state(device, pipeline, &pipeline_state, + pCreateInfo, cw_info); - /* V3D 4.2 doesn't support depth bounds testing so we don't advertise that - * feature and it shouldn't be used by any pipeline. - */ - assert(!ds_info || !ds_info->depthBoundsTestEnable); + if (result != VK_SUCCESS) { + /* Caller would already destroy the pipeline, and we didn't allocate any + * extra info. We don't need to do anything else. + */ + return result; + } - v3dv_X(device, pipeline_pack_state)(pipeline, cb_info, ds_info, - rs_info, pv_info, ms_info); + const VkPipelineViewportDepthClipControlCreateInfoEXT *depth_clip_control = + vp_info ? 
vk_find_struct_const(vp_info->pNext, + PIPELINE_VIEWPORT_DEPTH_CLIP_CONTROL_CREATE_INFO_EXT) : + NULL; + + if (depth_clip_control) + pipeline->negative_one_to_one = depth_clip_control->negativeOneToOne; - pipeline_set_ez_state(pipeline, ds_info); enable_depth_bias(pipeline, rs_info); + + v3dv_X(device, pipeline_pack_state)(pipeline, cb_info, ds_info, + rs_info, pv_info, ls_info, + ms_info, + &pipeline_state); + pipeline_set_sample_mask(pipeline, ms_info); pipeline_set_sample_rate_shading(pipeline, ms_info); + pipeline->line_smooth = enable_line_smooth(pipeline->topology, rs_info); pipeline->primitive_restart = pCreateInfo->pInputAssemblyState->primitiveRestartEnable; @@ -3011,15 +3003,22 @@ pipeline_init(struct v3dv_pipeline *pipeline, v3dv_X(device, pipeline_pack_compile_state)(pipeline, vi_info, vd_info); - if (pipeline_has_integer_vertex_attrib(pipeline)) { + if (v3dv_X(device, pipeline_needs_default_attribute_values)(pipeline)) { pipeline->default_attribute_values = - v3dv_pipeline_create_default_attribute_values(pipeline->device, pipeline); + v3dv_X(pipeline->device, create_default_attribute_values)(pipeline->device, pipeline); + if (!pipeline->default_attribute_values) return VK_ERROR_OUT_OF_DEVICE_MEMORY; } else { pipeline->default_attribute_values = NULL; } + /* This must be done after the pipeline has been compiled */ + v3dv_compute_ez_state(&pipeline->dynamic_graphics_state, + pipeline, + &pipeline->ez_state, + &pipeline->incompatible_ez_test); + return result; } @@ -3044,15 +3043,13 @@ graphics_pipeline_create(VkDevice _device, VK_OBJECT_TYPE_PIPELINE); if (pipeline == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - result = pipeline_init(pipeline, device, cache, - pCreateInfo, - pAllocator); + result = pipeline_init(pipeline, device, cache, pCreateInfo, pAllocator); if (result != VK_SUCCESS) { v3dv_destroy_pipeline(pipeline, device, pAllocator); - if (result == VK_PIPELINE_COMPILE_REQUIRED_EXT) + if (result == VK_PIPELINE_COMPILE_REQUIRED) *pPipeline = VK_NULL_HANDLE; return result; } @@ -3073,7 +3070,7 @@ v3dv_CreateGraphicsPipelines(VkDevice _device, V3DV_FROM_HANDLE(v3dv_device, device, _device); VkResult result = VK_SUCCESS; - if (unlikely(V3D_DEBUG & V3D_DEBUG_SHADERS)) + if (V3D_DBG(SHADERS)) mtx_lock(&device->pdevice->mutex); uint32_t i = 0; @@ -3091,7 +3088,7 @@ v3dv_CreateGraphicsPipelines(VkDevice _device, pPipelines[i] = VK_NULL_HANDLE; if (pCreateInfos[i].flags & - VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT) + VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT) break; } } @@ -3099,7 +3096,7 @@ v3dv_CreateGraphicsPipelines(VkDevice _device, for (; i < count; i++) pPipelines[i] = VK_NULL_HANDLE; - if (unlikely(V3D_DEBUG & V3D_DEBUG_SHADERS)) + if (V3D_DBG(SHADERS)) mtx_unlock(&device->pdevice->mutex); return result; @@ -3118,12 +3115,20 @@ shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align) } static void -lower_cs_shared(struct nir_shader *nir) +lower_compute(struct nir_shader *nir) { - NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, - nir_var_mem_shared, shared_type_info); - NIR_PASS_V(nir, nir_lower_explicit_io, - nir_var_mem_shared, nir_address_format_32bit_offset); + if (!nir->info.shared_memory_explicit_layout) { + NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, + nir_var_mem_shared, shared_type_info); + } + + NIR_PASS(_, nir, nir_lower_explicit_io, + nir_var_mem_shared, nir_address_format_32bit_offset); + + struct 
nir_lower_compute_system_values_options sysval_options = { + .has_base_workgroup_id = true, + }; + NIR_PASS_V(nir, nir_lower_compute_system_values, &sysval_options); } static VkResult @@ -3132,14 +3137,13 @@ pipeline_compile_compute(struct v3dv_pipeline *pipeline, const VkComputePipelineCreateInfo *info, const VkAllocationCallbacks *alloc) { - VkPipelineCreationFeedbackEXT pipeline_feedback = { - .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT, + VkPipelineCreationFeedback pipeline_feedback = { + .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT, }; int64_t pipeline_start = os_time_get_nano(); struct v3dv_device *device = pipeline->device; - struct v3dv_physical_device *physical_device = - &device->instance->physicalDevice; + struct v3dv_physical_device *physical_device = device->pdevice; const VkPipelineShaderStageCreateInfo *sinfo = &info->stage; gl_shader_stage stage = vk_to_mesa_shader_stage(sinfo->stage); @@ -3156,61 +3160,69 @@ pipeline_compile_compute(struct v3dv_pipeline *pipeline, p_stage->entrypoint = sinfo->pName; p_stage->module = vk_shader_module_from_handle(sinfo->module); p_stage->spec_info = sinfo->pSpecializationInfo; - p_stage->feedback = (VkPipelineCreationFeedbackEXT) { 0 }; + p_stage->feedback = (VkPipelineCreationFeedback) { 0 }; - pipeline_hash_shader(p_stage->module, - p_stage->entrypoint, - stage, - p_stage->spec_info, - p_stage->shader_sha1); + vk_pipeline_robustness_state_fill(&device->vk, &p_stage->robustness, + info->pNext, sinfo->pNext); + + vk_pipeline_hash_shader_stage(&info->stage, + &p_stage->robustness, + p_stage->shader_sha1); - /* We try to get directly the variant first from the cache */ p_stage->nir = NULL; - pipeline->cs = p_stage; + pipeline->stages[BROADCOM_SHADER_COMPUTE] = p_stage; pipeline->active_stages |= sinfo->stage; - struct v3dv_pipeline_key pipeline_key; - pipeline_populate_compute_key(pipeline, &pipeline_key, info); - unsigned char pipeline_sha1[20]; - pipeline_hash_compute(pipeline, &pipeline_key, pipeline_sha1); - - bool cache_hit = false; - pipeline->shared_data = - v3dv_pipeline_cache_search_for_pipeline(cache, pipeline_sha1, &cache_hit); - - if (pipeline->shared_data != NULL) { - assert(pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]); - if (cache_hit && cache != &pipeline->device->default_pipeline_cache) - pipeline_feedback.flags |= - VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT; - - goto success; + /* First we try to get the variants from the pipeline cache (unless we are + * required to capture internal representations, since in that case we need + * to compile). 
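The has_base_workgroup_id option set in lower_compute() above makes the NIR pass compute the workgroup ID as a zero-based ID plus a uniform base, which is what backs vkCmdDispatchBase(). On the application side this is the core Vulkan 1.1 entry point:

/* Dispatch an 8x8x1 grid whose workgroup IDs start at (16, 0, 0). */
vkCmdDispatchBase(cmd_buffer, 16, 0, 0, 8, 8, 1);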
+ */ + bool needs_executable_info = + info->flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR; + if (!needs_executable_info) { + struct v3dv_pipeline_key pipeline_key; + pipeline_populate_compute_key(pipeline, &pipeline_key, info); + pipeline_hash_compute(pipeline, &pipeline_key, pipeline->sha1); + + bool cache_hit = false; + pipeline->shared_data = + v3dv_pipeline_cache_search_for_pipeline(cache, pipeline->sha1, &cache_hit); + + if (pipeline->shared_data != NULL) { + assert(pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]); + if (cache_hit && cache != &pipeline->device->default_pipeline_cache) + pipeline_feedback.flags |= + VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT; + + goto success; + } } - if (info->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT) - return VK_PIPELINE_COMPILE_REQUIRED_EXT; + if (info->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT) + return VK_PIPELINE_COMPILE_REQUIRED; - pipeline->shared_data = v3dv_pipeline_shared_data_new_empty(pipeline_sha1, + pipeline->shared_data = v3dv_pipeline_shared_data_new_empty(pipeline->sha1, pipeline, false); + if (!pipeline->shared_data) + return VK_ERROR_OUT_OF_HOST_MEMORY; - p_stage->feedback.flags |= VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT; + p_stage->feedback.flags |= VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT; /* If not found on cache, compile it */ p_stage->nir = pipeline_stage_get_nir(p_stage, pipeline, cache); assert(p_stage->nir); - st_nir_opts(p_stage->nir); + v3d_optimize_nir(NULL, p_stage->nir); pipeline_lower_nir(pipeline, p_stage, pipeline->layout); - lower_cs_shared(p_stage->nir); + lower_compute(p_stage->nir); VkResult result = VK_SUCCESS; struct v3d_key key; memset(&key, 0, sizeof(key)); - pipeline_populate_v3d_key(&key, p_stage, 0, - pipeline->device->features.robustBufferAccess); + pipeline_populate_v3d_key(&key, p_stage, 0); pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE] = pipeline_compile_shader_variant(p_stage, &key, sizeof(key), alloc, &result); @@ -3225,6 +3237,8 @@ pipeline_compile_compute(struct v3dv_pipeline *pipeline, success: + pipeline_check_buffer_device_address(pipeline); + pipeline_feedback.duration = os_time_get_nano() - pipeline_start; write_creation_feedback(pipeline, info->pNext, @@ -3233,9 +3247,10 @@ success: &info->stage); /* As we got the variants in pipeline->shared_data, after compiling we - * don't need the pipeline_stages + * don't need the pipeline_stages. 
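The feedback flags maintained in this function are reported back through VK_EXT_pipeline_creation_feedback (core in Vulkan 1.3). A sketch of how an application checks for the cache hit recorded above:

VkPipelineCreationFeedback feedback = { 0 };
VkPipelineCreationFeedbackCreateInfo feedback_info = {
   .sType = VK_STRUCTURE_TYPE_PIPELINE_CREATION_FEEDBACK_CREATE_INFO,
   .pPipelineCreationFeedback = &feedback,
};
/* Chain feedback_info into VkComputePipelineCreateInfo::pNext, create the
 * pipeline, then: */
if (feedback.flags &
    VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT)
   printf("compute pipeline was served from the application cache\n");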
*/ - pipeline_free_stages(device, pipeline, alloc); + if (!needs_executable_info) + pipeline_free_stages(device, pipeline, alloc); pipeline_check_spill_size(pipeline); @@ -3253,8 +3268,11 @@ compute_pipeline_init(struct v3dv_pipeline *pipeline, pipeline->device = device; pipeline->layout = layout; + v3dv_pipeline_layout_ref(pipeline->layout); VkResult result = pipeline_compile_compute(pipeline, cache, info, alloc); + if (result != VK_SUCCESS) + return result; return result; } @@ -3279,13 +3297,13 @@ compute_pipeline_create(VkDevice _device, pipeline = vk_object_zalloc(&device->vk, pAllocator, sizeof(*pipeline), VK_OBJECT_TYPE_PIPELINE); if (pipeline == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); result = compute_pipeline_init(pipeline, device, cache, pCreateInfo, pAllocator); if (result != VK_SUCCESS) { v3dv_destroy_pipeline(pipeline, device, pAllocator); - if (result == VK_PIPELINE_COMPILE_REQUIRED_EXT) + if (result == VK_PIPELINE_COMPILE_REQUIRED) *pPipeline = VK_NULL_HANDLE; return result; } @@ -3306,7 +3324,7 @@ v3dv_CreateComputePipelines(VkDevice _device, V3DV_FROM_HANDLE(v3dv_device, device, _device); VkResult result = VK_SUCCESS; - if (unlikely(V3D_DEBUG & V3D_DEBUG_SHADERS)) + if (V3D_DBG(SHADERS)) mtx_lock(&device->pdevice->mutex); uint32_t i = 0; @@ -3323,7 +3341,7 @@ v3dv_CreateComputePipelines(VkDevice _device, pPipelines[i] = VK_NULL_HANDLE; if (pCreateInfos[i].flags & - VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT) + VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT) break; } } @@ -3331,8 +3349,303 @@ v3dv_CreateComputePipelines(VkDevice _device, for (; i < createInfoCount; i++) pPipelines[i] = VK_NULL_HANDLE; - if (unlikely(V3D_DEBUG & V3D_DEBUG_SHADERS)) + if (V3D_DBG(SHADERS)) mtx_unlock(&device->pdevice->mutex); return result; } + +static nir_shader * +pipeline_get_nir(struct v3dv_pipeline *pipeline, + enum broadcom_shader_stage stage) +{ + assert(stage >= 0 && stage < BROADCOM_SHADER_STAGES); + if (pipeline->stages[stage]) + return pipeline->stages[stage]->nir; + + return NULL; +} + +static struct v3d_prog_data * +pipeline_get_prog_data(struct v3dv_pipeline *pipeline, + enum broadcom_shader_stage stage) +{ + if (pipeline->shared_data->variants[stage]) + return pipeline->shared_data->variants[stage]->prog_data.base; + return NULL; +} + +static uint64_t * +pipeline_get_qpu(struct v3dv_pipeline *pipeline, + enum broadcom_shader_stage stage, + uint32_t *qpu_size) +{ + struct v3dv_shader_variant *variant = + pipeline->shared_data->variants[stage]; + if (!variant) { + *qpu_size = 0; + return NULL; + } + + *qpu_size = variant->qpu_insts_size; + return variant->qpu_insts; +} + +/* FIXME: we use the same macro in various drivers, maybe move it to + * the common vk_util.h? + */ +#define WRITE_STR(field, ...) ({ \ + memset(field, 0, sizeof(field)); \ + UNUSED int _i = snprintf(field, sizeof(field), __VA_ARGS__); \ + assert(_i > 0 && _i < sizeof(field)); \ +}) + +static bool +write_ir_text(VkPipelineExecutableInternalRepresentationKHR* ir, + const char *data) +{ + ir->isText = VK_TRUE; + + size_t data_len = strlen(data) + 1; + + if (ir->pData == NULL) { + ir->dataSize = data_len; + return true; + } + + strncpy(ir->pData, data, ir->dataSize); + if (ir->dataSize < data_len) + return false; + + ir->dataSize = data_len; + return true; +} + +static void +append(char **str, size_t *offset, const char *fmt, ...) 
+{ + va_list args; + va_start(args, fmt); + ralloc_vasprintf_rewrite_tail(str, offset, fmt, args); + va_end(args); +} + +static void +pipeline_collect_executable_data(struct v3dv_pipeline *pipeline) +{ + if (pipeline->executables.mem_ctx) + return; + + pipeline->executables.mem_ctx = ralloc_context(NULL); + util_dynarray_init(&pipeline->executables.data, + pipeline->executables.mem_ctx); + + /* Don't crash for failed/bogus pipelines */ + if (!pipeline->shared_data) + return; + + for (int s = BROADCOM_SHADER_VERTEX; s <= BROADCOM_SHADER_COMPUTE; s++) { + VkShaderStageFlags vk_stage = + mesa_to_vk_shader_stage(broadcom_shader_stage_to_gl(s)); + if (!(vk_stage & pipeline->active_stages)) + continue; + + char *nir_str = NULL; + char *qpu_str = NULL; + + if (pipeline_keep_qpu(pipeline)) { + nir_shader *nir = pipeline_get_nir(pipeline, s); + nir_str = nir ? + nir_shader_as_str(nir, pipeline->executables.mem_ctx) : NULL; + + uint32_t qpu_size; + uint64_t *qpu = pipeline_get_qpu(pipeline, s, &qpu_size); + if (qpu) { + uint32_t qpu_inst_count = qpu_size / sizeof(uint64_t); + qpu_str = rzalloc_size(pipeline->executables.mem_ctx, + qpu_inst_count * 96); + size_t offset = 0; + for (int i = 0; i < qpu_inst_count; i++) { + const char *str = v3d_qpu_disasm(&pipeline->device->devinfo, qpu[i]); + append(&qpu_str, &offset, "%s\n", str); + ralloc_free((void *)str); + } + } + } + + struct v3dv_pipeline_executable_data data = { + .stage = s, + .nir_str = nir_str, + .qpu_str = qpu_str, + }; + util_dynarray_append(&pipeline->executables.data, + struct v3dv_pipeline_executable_data, data); + } +} + +static const struct v3dv_pipeline_executable_data * +pipeline_get_executable(struct v3dv_pipeline *pipeline, uint32_t index) +{ + assert(index < util_dynarray_num_elements(&pipeline->executables.data, + struct v3dv_pipeline_executable_data)); + return util_dynarray_element(&pipeline->executables.data, + struct v3dv_pipeline_executable_data, + index); +} + +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_GetPipelineExecutableInternalRepresentationsKHR( + VkDevice device, + const VkPipelineExecutableInfoKHR *pExecutableInfo, + uint32_t *pInternalRepresentationCount, + VkPipelineExecutableInternalRepresentationKHR *pInternalRepresentations) +{ + V3DV_FROM_HANDLE(v3dv_pipeline, pipeline, pExecutableInfo->pipeline); + + pipeline_collect_executable_data(pipeline); + + VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out, + pInternalRepresentations, pInternalRepresentationCount); + + bool incomplete = false; + const struct v3dv_pipeline_executable_data *exe = + pipeline_get_executable(pipeline, pExecutableInfo->executableIndex); + + if (exe->nir_str) { + vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, + &out, ir) { + WRITE_STR(ir->name, "NIR (%s)", broadcom_shader_stage_name(exe->stage)); + WRITE_STR(ir->description, "Final NIR form"); + if (!write_ir_text(ir, exe->nir_str)) + incomplete = true; + } + } + + if (exe->qpu_str) { + vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, + &out, ir) { + WRITE_STR(ir->name, "QPU (%s)", broadcom_shader_stage_name(exe->stage)); + WRITE_STR(ir->description, "Final QPU assembly"); + if (!write_ir_text(ir, exe->qpu_str)) + incomplete = true; + } + } + + return incomplete ? 
VK_INCOMPLETE : vk_outarray_status(&out); +} + +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_GetPipelineExecutablePropertiesKHR( + VkDevice device, + const VkPipelineInfoKHR *pPipelineInfo, + uint32_t *pExecutableCount, + VkPipelineExecutablePropertiesKHR *pProperties) +{ + V3DV_FROM_HANDLE(v3dv_pipeline, pipeline, pPipelineInfo->pipeline); + + pipeline_collect_executable_data(pipeline); + + VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out, + pProperties, pExecutableCount); + + util_dynarray_foreach(&pipeline->executables.data, + struct v3dv_pipeline_executable_data, exe) { + vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) { + gl_shader_stage mesa_stage = broadcom_shader_stage_to_gl(exe->stage); + props->stages = mesa_to_vk_shader_stage(mesa_stage); + + WRITE_STR(props->name, "%s (%s)", + _mesa_shader_stage_to_abbrev(mesa_stage), + broadcom_shader_stage_is_binning(exe->stage) ? + "Binning" : "Render"); + + WRITE_STR(props->description, "%s", + _mesa_shader_stage_to_string(mesa_stage)); + + props->subgroupSize = V3D_CHANNELS; + } + } + + return vk_outarray_status(&out); +} + +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_GetPipelineExecutableStatisticsKHR( + VkDevice device, + const VkPipelineExecutableInfoKHR *pExecutableInfo, + uint32_t *pStatisticCount, + VkPipelineExecutableStatisticKHR *pStatistics) +{ + V3DV_FROM_HANDLE(v3dv_pipeline, pipeline, pExecutableInfo->pipeline); + + pipeline_collect_executable_data(pipeline); + + const struct v3dv_pipeline_executable_data *exe = + pipeline_get_executable(pipeline, pExecutableInfo->executableIndex); + + struct v3d_prog_data *prog_data = + pipeline_get_prog_data(pipeline, exe->stage); + + struct v3dv_shader_variant *variant = + pipeline->shared_data->variants[exe->stage]; + uint32_t qpu_inst_count = variant->qpu_insts_size / sizeof(uint64_t); + + VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out, + pStatistics, pStatisticCount); + + if (qpu_inst_count > 0) { + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { + WRITE_STR(stat->name, "Compile Strategy"); + WRITE_STR(stat->description, "Chosen compile strategy index"); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = prog_data->compile_strategy_idx; + } + + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { + WRITE_STR(stat->name, "Instruction Count"); + WRITE_STR(stat->description, "Number of QPU instructions"); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = qpu_inst_count; + } + + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { + WRITE_STR(stat->name, "Thread Count"); + WRITE_STR(stat->description, "Number of QPU threads dispatched"); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = prog_data->threads; + } + + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { + WRITE_STR(stat->name, "Spill Size"); + WRITE_STR(stat->description, "Size of the spill buffer in bytes"); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = prog_data->spill_size; + } + + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { + WRITE_STR(stat->name, "TMU Spills"); + WRITE_STR(stat->description, "Number of times a register was spilled " + "to memory"); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = prog_data->spill_size; + } + + 
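These statistics are consumed with the usual two-call Vulkan pattern; a minimal application-side sketch (standard VK_KHR_pipeline_executable_properties API, error handling elided):

VkPipelineExecutableInfoKHR exec_info = {
   .sType = VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_INFO_KHR,
   .pipeline = pipeline,   /* an existing VkPipeline handle */
   .executableIndex = 0,
};
uint32_t count = 0;
vkGetPipelineExecutableStatisticsKHR(device, &exec_info, &count, NULL);
VkPipelineExecutableStatisticKHR *stats = calloc(count, sizeof(*stats));
for (uint32_t i = 0; i < count; i++)
   stats[i].sType = VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_STATISTIC_KHR;
vkGetPipelineExecutableStatisticsKHR(device, &exec_info, &count, stats);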
vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { + WRITE_STR(stat->name, "TMU Fills"); + WRITE_STR(stat->description, "Number of times a register was filled " + "from memory"); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = prog_data->spill_size; + } + + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { + WRITE_STR(stat->name, "QPU Read Stalls"); + WRITE_STR(stat->description, "Number of cycles the QPU stalls for a " + "register read dependency"); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = prog_data->qpu_read_stalls; + } + } + + return vk_outarray_status(&out); +} diff --git a/src/broadcom/vulkan/v3dv_pipeline_cache.c b/src/broadcom/vulkan/v3dv_pipeline_cache.c index 02721ec1d79..d2124ee0b08 100644 --- a/src/broadcom/vulkan/v3dv_pipeline_cache.c +++ b/src/broadcom/vulkan/v3dv_pipeline_cache.c @@ -1,5 +1,5 @@ /* - * Copyright © 2019 Raspberry Pi + * Copyright © 2019 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -22,7 +22,7 @@ */ #include "v3dv_private.h" -#include "vulkan/util/vk_util.h" +#include "vk_util.h" #include "util/blob.h" #include "nir/nir_serialize.h" @@ -61,20 +61,22 @@ cache_dump_stats(struct v3dv_pipeline_cache *cache) fprintf(stderr, " cache entries: %d\n", cache->stats.count); fprintf(stderr, " cache miss count: %d\n", cache->stats.miss); fprintf(stderr, " cache hit count: %d\n", cache->stats.hit); + + fprintf(stderr, " on-disk cache hit count: %d\n", cache->stats.on_disk_hit); } static void pipeline_cache_lock(struct v3dv_pipeline_cache *cache) { if (!cache->externally_synchronized) - pthread_mutex_lock(&cache->mutex); + mtx_lock(&cache->mutex); } static void pipeline_cache_unlock(struct v3dv_pipeline_cache *cache) { if (!cache->externally_synchronized) - pthread_mutex_unlock(&cache->mutex); + mtx_unlock(&cache->mutex); } void @@ -178,7 +180,7 @@ v3dv_pipeline_cache_search_for_nir(struct v3dv_pipeline *pipeline, } else { cache->nir_stats.hit++; if (debug_cache) { - fprintf(stderr, "\tnir cache hit: %p\n", nir); + fprintf(stderr, "[v3dv nir cache] hit: %p\n", nir); if (dump_stats) cache_dump_stats(cache); } @@ -188,7 +190,7 @@ v3dv_pipeline_cache_search_for_nir(struct v3dv_pipeline *pipeline, cache->nir_stats.miss++; if (debug_cache) { - fprintf(stderr, "\tnir cache miss\n"); + fprintf(stderr, "[v3dv nir cache] miss\n"); if (dump_stats) cache_dump_stats(cache); } @@ -203,7 +205,7 @@ v3dv_pipeline_cache_init(struct v3dv_pipeline_cache *cache, bool cache_enabled) { cache->device = device; - pthread_mutex_init(&cache->mutex, NULL); + mtx_init(&cache->mutex, mtx_plain); if (cache_enabled) { cache->nir_cache = _mesa_hash_table_create(NULL, sha1_hash_func, @@ -219,7 +221,7 @@ v3dv_pipeline_cache_init(struct v3dv_pipeline_cache *cache, cache->stats.count = 0; cache->externally_synchronized = flags & - VK_PIPELINE_CACHE_CREATE_EXTERNALLY_SYNCHRONIZED_BIT_EXT; + VK_PIPELINE_CACHE_CREATE_EXTERNALLY_SYNCHRONIZED_BIT; } else { cache->nir_cache = NULL; cache->cache = NULL; @@ -241,7 +243,7 @@ v3dv_pipeline_shared_data_write_to_blob(const struct v3dv_pipeline_shared_data * struct blob *blob); /** - * It searchs for pipeline cached data, and returns a v3dv_pipeline_shared_data with + * It searches for pipeline cached data, and returns a v3dv_pipeline_shared_data with * it, or NULL if doesn't have it cached. 
In the former case, it will increase the ref_count, so the caller is responsible for unreffing it. */ @@ -273,7 +275,7 @@ v3dv_pipeline_cache_search_for_pipeline(struct v3dv_pipeline_cache *cache, cache->stats.hit++; *cache_hit = true; if (debug_cache) { - fprintf(stderr, "\tcache hit: %p\n", cache_entry); + fprintf(stderr, "[v3dv cache] hit: %p\n", cache_entry); if (dump_stats) cache_dump_stats(cache); } @@ -288,7 +290,7 @@ cache->stats.miss++; if (debug_cache) { - fprintf(stderr, "\tcache miss\n"); + fprintf(stderr, "[v3dv cache] miss\n"); if (dump_stats) cache_dump_stats(cache); } @@ -300,7 +302,7 @@ struct disk_cache *disk_cache = device->pdevice->disk_cache; /* Note that the on-disk-cache can be independently disabled, while keeping * the pipeline cache working, by using the environment variable - * MESA_GLSL_CACHE_DISABLE. In that case the calls to disk_cache_put/get + * MESA_SHADER_CACHE_DISABLE. In that case the calls to disk_cache_put/get * will not do anything. */ if (disk_cache && device->instance->pipeline_cache_enabled) { @@ -309,25 +311,32 @@ size_t buffer_size; uint8_t *buffer = disk_cache_get(disk_cache, cache_key, &buffer_size); + if (V3D_DBG(CACHE)) { + char sha1buf[41]; + _mesa_sha1_format(sha1buf, cache_key); + fprintf(stderr, "[v3dv on-disk cache] %s %s\n", + buffer ? "hit" : "miss", + sha1buf); + } + if (buffer) { struct blob_reader blob; struct v3dv_pipeline_shared_data *shared_data; - if (debug_cache) - fprintf(stderr, "\ton-disk-cache hit\n"); - blob_reader_init(&blob, buffer, buffer_size); shared_data = v3dv_pipeline_shared_data_create_from_blob(cache, &blob); free(buffer); if (shared_data) { + /* Technically we could increase on_disk_hit as soon as we have a + * buffer, but we are more interested in hits that got a valid + * shared_data. + */ + cache->stats.on_disk_hit++; if (cache) pipeline_cache_upload_shared_data(cache, shared_data, true); return shared_data; } - } else { - if (debug_cache) - fprintf(stderr, "\ton-disk-cache miss\n"); } } #endif @@ -393,15 +402,13 @@ v3dv_pipeline_shared_data_new(struct v3dv_pipeline_cache *cache, "pipeline shader assembly", true); if (!bo) { fprintf(stderr, "failed to allocate memory for shaders assembly\n"); - v3dv_pipeline_shared_data_unref(cache->device, new_entry); - return NULL; + goto fail; } bool ok = v3dv_bo_map(cache->device, bo, total_assembly_size); if (!ok) { fprintf(stderr, "failed to map source shader buffer\n"); - v3dv_pipeline_shared_data_unref(cache->device, new_entry); - return NULL; + goto fail; } memcpy(bo->map, total_assembly, total_assembly_size); @@ -409,6 +416,10 @@ new_entry->assembly_bo = bo; return new_entry; + +fail: + v3dv_pipeline_shared_data_unref(cache->device, new_entry); + return NULL; } static void @@ -425,8 +436,13 @@ pipeline_cache_upload_shared_data(struct v3dv_pipeline_cache *cache, return; pipeline_cache_lock(cache); - struct hash_entry *entry = - _mesa_hash_table_search(cache->cache, shared_data->sha1_key); + struct hash_entry *entry = NULL; + + /* If this is being called from the disk cache, we already know that the + * entry is not in the hash table. + */ + if (!from_disk_cache) + entry = _mesa_hash_table_search(cache->cache, shared_data->sha1_key); if (entry) { pipeline_cache_unlock(cache); @@ -464,14 +480,12 
@@ pipeline_cache_upload_shared_data(struct v3dv_pipeline_cache *cache, cache_key cache_key; disk_cache_compute_key(disk_cache, shared_data->sha1_key, 20, cache_key); - disk_cache_put(disk_cache, cache_key, binary.data, binary.size, NULL); - if (debug_cache) { + if (V3D_DBG(CACHE)) { char sha1buf[41]; _mesa_sha1_format(sha1buf, shared_data->sha1_key); - - fprintf(stderr, "on-disk-cache, new cache entry with sha1 key %s:%p\n\n", - sha1buf, shared_data); + fprintf(stderr, "[v3dv on-disk cache] storing %s\n", sha1buf); } + disk_cache_put(disk_cache, cache_key, binary.data, binary.size, NULL); } blob_finish(&binary); @@ -528,7 +542,7 @@ shader_variant_create_from_blob(struct v3dv_device *device, if (blob->overrun) return NULL; - uint ulist_data_size = sizeof(uint32_t) * ulist_count; + size_t ulist_data_size = sizeof(uint32_t) * ulist_count; const void *ulist_data_data = blob_read_bytes(blob, ulist_data_size); if (blob->overrun) return NULL; @@ -564,6 +578,7 @@ v3dv_pipeline_shared_data_create_from_blob(struct v3dv_pipeline_cache *cache, const unsigned char *sha1_key = blob_read_bytes(blob, 20); struct v3dv_descriptor_maps *maps[BROADCOM_SHADER_STAGES] = { 0 }; + struct v3dv_shader_variant *variants[BROADCOM_SHADER_STAGES] = { 0 }; uint8_t descriptor_maps_count = blob_read_uint8(blob); for (uint8_t count = 0; count < descriptor_maps_count; count++) { @@ -573,14 +588,14 @@ v3dv_pipeline_shared_data_create_from_blob(struct v3dv_pipeline_cache *cache, blob_read_bytes(blob, sizeof(struct v3dv_descriptor_maps)); if (blob->overrun) - return NULL; + goto fail; maps[stage] = vk_zalloc2(&cache->device->vk.alloc, NULL, sizeof(struct v3dv_descriptor_maps), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (maps[stage] == NULL) - return NULL; + goto fail; memcpy(maps[stage], current_maps, sizeof(struct v3dv_descriptor_maps)); if (broadcom_shader_stage_is_render_with_binning(stage)) { @@ -592,8 +607,6 @@ v3dv_pipeline_shared_data_create_from_blob(struct v3dv_pipeline_cache *cache, uint8_t variant_count = blob_read_uint8(blob); - struct v3dv_shader_variant *variants[BROADCOM_SHADER_STAGES] = { 0 }; - for (uint8_t count = 0; count < variant_count; count++) { uint8_t stage = blob_read_uint8(blob); struct v3dv_shader_variant *variant = @@ -606,10 +619,25 @@ v3dv_pipeline_shared_data_create_from_blob(struct v3dv_pipeline_cache *cache, blob_read_bytes(blob, total_assembly_size); if (blob->overrun) - return NULL; + goto fail; + + struct v3dv_pipeline_shared_data *data = + v3dv_pipeline_shared_data_new(cache, sha1_key, maps, variants, + total_assembly, total_assembly_size); + + if (!data) + goto fail; - return v3dv_pipeline_shared_data_new(cache, sha1_key, maps, variants, - total_assembly, total_assembly_size); + return data; + +fail: + for (int i = 0; i < BROADCOM_SHADER_STAGES; i++) { + if (maps[i]) + vk_free2(&cache->device->vk.alloc, NULL, maps[i]); + if (variants[i]) + v3dv_shader_variant_destroy(cache->device, variants[i]); + } + return NULL; } static void @@ -618,7 +646,7 @@ pipeline_cache_load(struct v3dv_pipeline_cache *cache, const void *data) { struct v3dv_device *device = cache->device; - struct v3dv_physical_device *pdevice = &device->instance->physicalDevice; + struct v3dv_physical_device *pdevice = device->pdevice; struct vk_pipeline_cache_header header; if (cache->cache == NULL || cache->nir_cache == NULL) @@ -695,7 +723,7 @@ v3dv_CreatePipelineCache(VkDevice _device, VK_OBJECT_TYPE_PIPELINE_CACHE); if (cache == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return 
vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); v3dv_pipeline_cache_init(cache, device, pCreateInfo->flags, device->instance->pipeline_cache_enabled); @@ -714,7 +742,7 @@ v3dv_CreatePipelineCache(VkDevice _device, void v3dv_pipeline_cache_finish(struct v3dv_pipeline_cache *cache) { - pthread_mutex_destroy(&cache->mutex); + mtx_destroy(&cache->mutex); if (dump_stats_on_destroy) cache_dump_stats(cache); @@ -934,7 +962,7 @@ v3dv_GetPipelineCacheData(VkDevice _device, blob_init_fixed(&blob, NULL, SIZE_MAX); } - struct v3dv_physical_device *pdevice = &device->instance->physicalDevice; + struct v3dv_physical_device *pdevice = device->pdevice; VkResult result = VK_INCOMPLETE; pipeline_cache_lock(cache); diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h index b5ab7ed2c59..892afcf3ab8 100644 --- a/src/broadcom/vulkan/v3dv_private.h +++ b/src/broadcom/vulkan/v3dv_private.h @@ -1,5 +1,5 @@ /* - * Copyright © 2019 Raspberry Pi + * Copyright © 2019 Raspberry Pi Ltd * * based in part on anv driver which is: * Copyright © 2015 Intel Corporation @@ -36,12 +36,24 @@ #include <vulkan/vk_icd.h> #include <vk_enum_to_str.h> +#include "vk_descriptor_update_template.h" #include "vk_device.h" +#include "vk_device_memory.h" +#include "vk_format.h" #include "vk_instance.h" #include "vk_image.h" +#include "vk_log.h" #include "vk_physical_device.h" #include "vk_shader_module.h" +#include "vk_sync.h" +#include "vk_sync_timeline.h" #include "vk_util.h" +#include "vk_ycbcr_conversion.h" + +#include "vk_command_buffer.h" +#include "vk_command_pool.h" +#include "vk_queue.h" +#include "vk_pipeline.h" #include <xf86drm.h> @@ -53,6 +65,13 @@ #define VG(x) ((void)0) #endif +#include "util/detect_os.h" + +#if DETECT_OS_ANDROID +#include <vndk/hardware_buffer.h> +#include "util/u_gralloc/u_gralloc.h" +#endif + #include "v3dv_limits.h" #include "common/v3d_device_info.h" @@ -68,8 +87,9 @@ #include "vk_debug_report.h" #include "util/set.h" #include "util/hash_table.h" +#include "util/sparse_array.h" #include "util/xmlconfig.h" -#include "u_atomic.h" +#include "util/u_atomic.h" #include "v3dv_entrypoints.h" #include "v3dv_bo.h" @@ -84,7 +104,7 @@ #include "wsi_common.h" /* A non-fatal assert. Useful for debugging. */ -#ifdef DEBUG +#if MESA_DEBUG #define v3dv_assert(x) ({ \ if (unlikely(!(x))) \ fprintf(stderr, "%s:%d ASSERT: %s", __FILE__, __LINE__, #x); \ @@ -94,7 +114,7 @@ #endif #define perf_debug(...) 
do { \ - if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF)) \ + if (V3D_DBG(PERF)) \ fprintf(stderr, __VA_ARGS__); \ } while (0) @@ -111,13 +131,15 @@ struct v3d_simulator_file; /* Minimum required by the Vulkan 1.1 spec */ #define MAX_MEMORY_ALLOCATION_SIZE (1ull << 30) +/* Maximum number of performance counters */ +#define V3D_MAX_PERFCNT 93 + struct v3dv_physical_device { struct vk_physical_device vk; char *name; int32_t render_fd; int32_t display_fd; - int32_t master_fd; /* We need these because it is not clear how to detect * valid devids in a portable way @@ -128,11 +150,19 @@ dev_t primary_devid; dev_t render_devid; +#if using_v3d_simulator + uint32_t device_id; +#endif + uint8_t driver_build_sha1[20]; uint8_t pipeline_cache_uuid[VK_UUID_SIZE]; uint8_t device_uuid[VK_UUID_SIZE]; uint8_t driver_uuid[VK_UUID_SIZE]; + struct vk_sync_type drm_syncobj_type; + struct vk_sync_timeline_type sync_timeline_type; + const struct vk_sync_type *sync_types[3]; + struct disk_cache *disk_cache; mtx_t mutex; @@ -148,14 +178,41 @@ const struct v3d_compiler *compiler; uint32_t next_program_id; + alignas(8) uint64_t heap_used; + + /* This array holds all our 'struct v3dv_bo' allocations. We use this + * so we can add a refcount to our BOs and check if a particular BO + * was already allocated in this device using its GEM handle. This is + * necessary to properly manage BO imports, because the kernel doesn't + * refcount the underlying BO memory. + * + * Specifically, when self-importing (i.e. importing a BO into the same + * device that created it), the kernel will give us the same BO handle + * for both BOs and we must only free it once when both references are + * freed. Otherwise, if we are not self-importing, we get two different BO + * handles, and we want to free each one individually. + * + * The BOs in this map are all reference counted via refcnt, and + * only self-imported BOs will ever have a refcnt > 1. + */ + struct util_sparse_array bo_map; + struct { bool merge_jobs; } options; + + struct { + bool cpu_queue; + bool multisync; + bool perfmon; + } caps; }; -VkResult v3dv_physical_device_acquire_display(struct v3dv_instance *instance, - struct v3dv_physical_device *pdevice, - VkIcdSurfaceBase *surface); +static inline struct v3dv_bo * +v3dv_device_lookup_bo(struct v3dv_physical_device *device, uint32_t handle) +{ + return (struct v3dv_bo *) util_sparse_array_get(&device->bo_map, handle); +} VkResult v3dv_wsi_init(struct v3dv_physical_device *physical_device); void v3dv_wsi_finish(struct v3dv_physical_device *physical_device); @@ -172,64 +229,72 @@ void v3dv_meta_texel_buffer_copy_init(struct v3dv_device *device); void v3dv_meta_texel_buffer_copy_finish(struct v3dv_device *device); bool v3dv_meta_can_use_tlb(struct v3dv_image *image, + uint8_t plane, + uint8_t miplevel, const VkOffset3D *offset, + const VkExtent3D *extent, VkFormat *compat_format); struct v3dv_instance { struct vk_instance vk; - int physicalDeviceCount; - struct v3dv_physical_device physicalDevice; - bool pipeline_cache_enabled; bool default_pipeline_cache_enabled; }; -/* Tracks wait threads spawned from a single vkQueueSubmit call */ -struct v3dv_queue_submit_wait_info { - /* struct vk_object_base base; ?*/ - struct list_head list_link; - - struct v3dv_device *device; - - /* List of wait threads spawned for any command buffers in a particular - * call to vkQueueSubmit. 
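The bo_map comment above implies a simple import rule: look the GEM handle up first, and only initialize a fresh entry when it has never been seen. A sketch of that flow (refcnt is the counter the comment mentions; the surrounding logic is illustrative, not the driver's actual import path):

struct v3dv_bo *bo = v3dv_device_lookup_bo(pdevice, gem_handle);
if (bo->refcnt == 0) {
   /* First time we see this handle: initialize the BO entry. */
} else {
   /* Self-import: the kernel returned an existing handle, so just take
    * another reference instead of creating a second BO. */
   p_atomic_inc(&bo->refcnt); /* assumed atomic helper from util/u_atomic.h */
}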
- */ - uint32_t wait_thread_count; - struct { - pthread_t thread; - bool finished; - } wait_threads[16]; - - /* The master wait thread for the entire submit. This will wait for all - * other threads in this submit to complete before processing signal - * semaphores and fences. +/* FIXME: In addition to tracking the last job submitted by GPU queue (cl, csd, + * tfu), we still need a syncobj to track the last overall job submitted + * (V3DV_QUEUE_ANY) for the case we don't support multisync. Someday we can + * start expecting multisync to be present and drop the legacy implementation + * together with this V3DV_QUEUE_ANY tracker. + */ +enum v3dv_queue_type { + V3DV_QUEUE_CL = 0, + V3DV_QUEUE_CSD, + V3DV_QUEUE_TFU, + V3DV_QUEUE_CPU, + V3DV_QUEUE_ANY, + V3DV_QUEUE_COUNT, +}; + +/* For each GPU queue, we use a syncobj to track the last job submitted. We + * set the flag `first` to determine when we are starting a new cmd buffer + * batch and therefore a job submitted to a given queue will be the first in a + * cmd buf batch. + */ +struct v3dv_last_job_sync { + /* If the job is the first submitted to a GPU queue in a cmd buffer batch. + * + * We use V3DV_QUEUE_{CL,CSD,TFU} both with and without multisync. */ - pthread_t master_wait_thread; - - /* List of semaphores (and fence) to signal after all wait threads completed - * and all command buffer jobs in the submission have been sent to the GPU. + bool first[V3DV_QUEUE_COUNT]; + /* Array of syncobj to track the last job submitted to a GPU queue. + * + * With multisync we use V3DV_QUEUE_{CL,CSD,TFU} to track syncobjs for each + * queue, but without multisync we only track the last job submitted to any + * queue in V3DV_QUEUE_ANY. */ - uint32_t signal_semaphore_count; - VkSemaphore *signal_semaphores; - VkFence fence; + uint32_t syncs[V3DV_QUEUE_COUNT]; }; struct v3dv_queue { - struct vk_object_base base; + struct vk_queue vk; struct v3dv_device *device; - VkDeviceQueueCreateFlags flags; - /* A list of active v3dv_queue_submit_wait_info */ - struct list_head submit_wait_list; - - /* A mutex to prevent concurrent access to the list of wait threads */ - mtx_t mutex; + struct v3dv_last_job_sync last_job_syncs; struct v3dv_job *noop_job; + + /* The last active perfmon ID to prevent mixing of counter results when a + * job is submitted with a different perfmon id. + */ + uint32_t last_perfmon_id; }; +VkResult v3dv_queue_driver_submit(struct vk_queue *vk_queue, + struct vk_queue_submit *submit); + #define V3DV_META_BLIT_CACHE_KEY_SIZE (4 * sizeof(uint32_t)) #define V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE (3 * sizeof(uint32_t) + \ sizeof(VkComponentMapping)) @@ -261,27 +326,27 @@ struct v3dv_meta_texel_buffer_copy_pipeline { }; struct v3dv_pipeline_key { - bool robust_buffer_access; uint8_t topology; uint8_t logicop_func; bool msaa; - bool sample_coverage; bool sample_alpha_to_coverage; bool sample_alpha_to_one; uint8_t cbufs; struct { enum pipe_format format; - const uint8_t *swizzle; + uint8_t swizzle[4]; } color_fmt[V3D_MAX_DRAW_BUFFERS]; uint8_t f32_color_rb; uint32_t va_swap_rb_mask; bool has_multiview; + bool line_smooth; }; struct v3dv_pipeline_cache_stats { uint32_t miss; uint32_t hit; uint32_t count; + uint32_t on_disk_hit; }; /* Equivalent to gl_shader_stage, but including the coordinate shaders @@ -411,11 +476,11 @@ struct v3dv_device { struct v3d_device_info devinfo; struct v3dv_queue queue; - /* A sync object to track the last job submitted to the GPU. 
*/ - uint32_t last_job_sync; + /* Guards query->maybe_available and value for timestamps */ + mtx_t query_mutex; - /* A mutex to prevent concurrent access to last_job_sync from the queue */ - mtx_t mutex; + /* Signaled whenever a query is ended */ + cnd_t query_ended; /* Resources used for meta operations */ struct { @@ -457,37 +522,107 @@ uint32_t bo_size; uint32_t bo_count; + /* Event handling resources. + * + * Our implementation of events uses a BO to store event state (signaled vs + * reset) and dispatches compute shaders to handle GPU event functions + * (signal, reset, wait). This struct holds all the resources required + * by the implementation. + */ + struct { + mtx_t lock; + + /* BO for the event states: signaled (1) or reset (0) */ + struct v3dv_bo *bo; + + /* We pre-allocate all the events we can fit for the size of the BO we + * create to track their states, where each event has an index which is + * basically the offset of its state in that BO. We keep a free list with + * the pre-allocated events that are available. + */ + uint32_t event_count; + struct v3dv_event *events; + struct list_head free_list; + + /* Vulkan resources to access the event BO from shaders. We have a + * pipeline that sets the state of an event and another that waits on + * a single event. Both pipelines require access to the event state BO, + * for which we need to allocate a single descriptor set. + */ + VkBuffer buffer; + VkDeviceMemory mem; + VkDescriptorSetLayout descriptor_set_layout; + VkPipelineLayout pipeline_layout; + VkDescriptorPool descriptor_pool; + VkDescriptorSet descriptor_set; + VkPipeline set_event_pipeline; + VkPipeline wait_event_pipeline; + } events; + + /* Query handling resources. + * + * Our implementation of occlusion queries uses a BO per pool to keep track + * of the per-query availability state and dispatches compute shaders to + * handle GPU query functions that read and write that state. This struct + * holds Vulkan resources that can be shared across all query pools to + * implement this. This framework may be extended in the future to handle + * more query types. + */ + struct { + VkDescriptorSetLayout buf_descriptor_set_layout; + + /* Set query availability */ + VkPipelineLayout avail_pipeline_layout; + VkPipeline avail_pipeline; + + /* Reset query availability and clear occlusion counters */ + VkPipelineLayout reset_occlusion_pipeline_layout; + VkPipeline reset_occlusion_pipeline; + + /* Copy query results */ + VkPipelineLayout copy_pipeline_layout; + VkPipeline copy_pipeline[8]; + } queries; + struct v3dv_pipeline_cache default_pipeline_cache; - /* GL_SHADER_STATE_RECORD needs to speficy default attribute values. The + /* GL_SHADER_STATE_RECORD needs to specify default attribute values. The * following covers the most common case, that is all attributes format * being float, allowing us to reuse the same BO for all * pipelines matching this requirement. Pipelines that need integer * attributes will create their own BO. + * + * Note that since v71 the default attribute values are not needed, so this + * can be NULL. 
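The default-attribute BO described above is laid out as four 32-bit channels per attribute, (0, 0, 0, 1), with the w component written as integer 1 or float 1.0 depending on the attribute format, mirroring the per-pipeline path removed earlier in this patch. A sketch of the fill loop under those assumptions (attr_is_int() is an illustrative helper):

uint32_t *attrs = bo->map; /* mapped default-attribute BO */
for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) {
   attrs[i * 4 + 0] = 0;
   attrs[i * 4 + 1] = 0;
   attrs[i * 4 + 2] = 0;
   attrs[i * 4 + 3] = attr_is_int(i) ? 1 : fui(1.0);
}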
*/ struct v3dv_bo *default_attribute_float; - VkPhysicalDeviceFeatures features; + + void *device_address_mem_ctx; + struct util_dynarray device_address_bo_list; /* Array of struct v3dv_bo * */ + +#if DETECT_OS_ANDROID + struct u_gralloc *gralloc; +#endif }; struct v3dv_device_memory { - struct vk_object_base base; + struct vk_device_memory vk; struct v3dv_bo *bo; const VkMemoryType *type; - bool has_bo_ownership; bool is_for_wsi; + bool is_for_device_address; }; #define V3D_OUTPUT_IMAGE_FORMAT_NO 255 #define TEXTURE_DATA_FORMAT_NO 255 -struct v3dv_format { - bool supported; - - /* One of V3D33_OUTPUT_IMAGE_FORMAT_*, or OUTPUT_IMAGE_FORMAT_NO */ +#define V3DV_MAX_PLANE_COUNT 3 +struct v3dv_format_plane { + /* One of V3D42_OUTPUT_IMAGE_FORMAT_*, or OUTPUT_IMAGE_FORMAT_NO */ uint8_t rt_type; - /* One of V3D33_TEXTURE_DATA_FORMAT_*. */ + /* One of V3D42_TEXTURE_DATA_FORMAT_*. */ uint8_t tex_type; /* Swizzle to apply to the RGBA shader output for storing to the tile @@ -499,15 +634,54 @@ uint8_t return_size; +}; + +struct v3dv_format { + /* Non 0 plane count implies supported */ + uint8_t plane_count; + + struct v3dv_format_plane planes[V3DV_MAX_PLANE_COUNT]; /* If the format supports (linear) filtering when texturing. */ bool supports_filtering; }; +/* Note that although VkImageAspectFlags would allow combining more than one + * PLANE bit, for all the use cases we implement that use VkImageAspectFlags, + * only one plane is allowed, like for example vkCmdCopyImage: + * + * "If srcImage has a VkFormat with two planes then for each element of + * pRegions, srcSubresource.aspectMask must be VK_IMAGE_ASPECT_PLANE_0_BIT + * or VK_IMAGE_ASPECT_PLANE_1_BIT" + * + */ +static uint8_t v3dv_plane_from_aspect(VkImageAspectFlags aspect) +{ + switch (aspect) { + case VK_IMAGE_ASPECT_COLOR_BIT: + case VK_IMAGE_ASPECT_DEPTH_BIT: + case VK_IMAGE_ASPECT_STENCIL_BIT: + case VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT: + case VK_IMAGE_ASPECT_PLANE_0_BIT: + case VK_IMAGE_ASPECT_MEMORY_PLANE_0_BIT_EXT: + return 0; + case VK_IMAGE_ASPECT_PLANE_1_BIT: + case VK_IMAGE_ASPECT_MEMORY_PLANE_1_BIT_EXT: + return 1; + case VK_IMAGE_ASPECT_PLANE_2_BIT: + case VK_IMAGE_ASPECT_MEMORY_PLANE_2_BIT_EXT: + return 2; + default: + unreachable("invalid image aspect"); + } +} + struct v3d_resource_slice { uint32_t offset; uint32_t stride; uint32_t padded_height; + uint32_t width; + uint32_t height; /* Size of a single pane of the slice. For 3D textures, there will be * a number of panes equal to the minified, power-of-two-aligned * depth. @@ -518,24 +692,85 @@ uint32_t padded_height_of_output_image_in_uif_blocks; }; +bool v3dv_format_swizzle_needs_rb_swap(const uint8_t *swizzle); +bool v3dv_format_swizzle_needs_reverse(const uint8_t *swizzle); + struct v3dv_image { struct vk_image vk; const struct v3dv_format *format; - uint32_t cpp; bool tiled; - struct v3d_resource_slice slices[V3D_MAX_MIP_LEVELS]; - uint64_t size; /* Total size in bytes */ - uint32_t cube_map_stride; + uint8_t plane_count; - struct v3dv_device_memory *mem; - VkDeviceSize mem_offset; - uint32_t alignment; + /* If 0, this is a multi-plane image that uses disjoint memory, where each + * plane binds a different device memory. Otherwise, all the planes share + * the same device memory and this stores the total size of the image in + * bytes. 
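For the disjoint case described above, each plane is bound to its own allocation on the application side with the standard Vulkan 1.1 structures (the handles below are assumed to exist):

VkBindImagePlaneMemoryInfo plane_info = {
   .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO,
   .planeAspect = VK_IMAGE_ASPECT_PLANE_0_BIT,
};
VkBindImageMemoryInfo bind_info = {
   .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO,
   .pNext = &plane_info,
   .image = image,           /* created with VK_IMAGE_CREATE_DISJOINT_BIT */
   .memory = plane0_memory,  /* a dedicated allocation for plane 0 */
   .memoryOffset = 0,
};
vkBindImageMemory2(device, 1, &bind_info);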
+ */ + uint32_t non_disjoint_size; + + struct { + uint32_t cpp; + + struct v3d_resource_slice slices[V3D_MAX_MIP_LEVELS]; + /* Total size of the plane in bytes. */ + uint64_t size; + uint32_t cube_map_stride; + + /* If not using disjoint memory, mem and mem_offset are the same for all + * planes, in which case mem_offset is the offset of plane 0. + */ + struct v3dv_device_memory *mem; + VkDeviceSize mem_offset; + uint32_t alignment; + + /* Pre-subsampled per plane width and height + */ + uint32_t width; + uint32_t height; + + /* Even if we can get it from the parent image format, we keep the + * format here for convenience + */ + VkFormat vk_format; + } planes[V3DV_MAX_PLANE_COUNT]; + + /* Used only when sampling a linear texture (which V3D doesn't support). + * This holds a tiled copy of the image we can use for that purpose. + */ + struct v3dv_image *shadow; + +#if DETECT_OS_ANDROID + /* Image is backed by VK_ANDROID_native_buffer. */ + bool is_native_buffer_memory; + /* Image is backed by VK_ANDROID_external_memory_android_hardware_buffer */ + bool is_ahb; + VkImageDrmFormatModifierExplicitCreateInfoEXT *android_explicit_layout; + VkSubresourceLayout *android_plane_layouts; +#endif }; +VkResult +v3dv_image_init(struct v3dv_device *device, + const VkImageCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + struct v3dv_image *image); + VkImageViewType v3dv_image_type_to_view_type(VkImageType type); +static uint32_t +v3dv_image_aspect_to_plane(const struct v3dv_image *image, + VkImageAspectFlagBits aspect) +{ + assert(util_bitcount(aspect) == 1 && (aspect & image->vk.aspects)); + + /* Because we always put image and view planes in aspect-bit-order, the + * plane index is the number of bits in the image aspect before aspect. + */ + return util_bitcount(image->vk.aspects & (aspect - 1)); +} + /* Pre-generating packets needs to consider changes in packet sizes across hw * versions. Keep things simple and allocate enough space for any supported * version. We ensure the size is large enough through static asserts. @@ -553,31 +788,50 @@ struct v3dv_image_view { struct vk_image_view vk; const struct v3dv_format *format; - bool swap_rb; - uint32_t internal_bpp; - uint32_t internal_type; - uint32_t offset; - /* Precomputed (composed from createinfo->components and formar swizzle) - * swizzles to pass in to the shader key. - * - * This could be also included on the descriptor bo, but the shader state - * packet doesn't need it on a bo, so we can just avoid a memory copy - */ - uint8_t swizzle[4]; + uint8_t view_swizzle[4]; - /* Prepacked TEXTURE_SHADER_STATE. It will be copied to the descriptor info - * during UpdateDescriptorSets. - * - * Empirical tests show that cube arrays need a different shader state - * depending on whether they are used with a sampler or not, so for these - * we generate two states and select the one to use based on the descriptor - * type. + uint8_t plane_count; + struct { + uint8_t image_plane; + + bool swap_rb; + bool channel_reverse; + uint32_t internal_bpp; + uint32_t internal_type; + uint32_t offset; + + /* Precomputed swizzle (composed from the view swizzle and the format + * swizzle). + * + * This could also be included on the descriptor bo, but the shader state + * packet doesn't need it on a bo, so we can just avoid a memory copy + */ + uint8_t swizzle[4]; + + /* Prepacked TEXTURE_SHADER_STATE. It will be copied to the descriptor info + * during UpdateDescriptorSets.
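+ *
+ * A hedged sketch of the copy that implies at descriptor update time,
+ * one state per plane (the names and the [0] index are illustrative
+ * assumptions, not the exact driver code):
+ *
+ *    for (uint8_t p = 0; p < iview->plane_count; p++)
+ *       memcpy(desc + p * V3DV_TEXTURE_SHADER_STATE_LENGTH,
+ *              iview->planes[p].texture_shader_state[0],
+ *              V3DV_TEXTURE_SHADER_STATE_LENGTH);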
+ * + * Empirical tests show that cube arrays need a different shader state + * depending on whether they are used with a sampler or not, so for these + * we generate two states and select the one to use based on the descriptor + * type. + */ + uint8_t texture_shader_state[2][V3DV_TEXTURE_SHADER_STATE_LENGTH]; + } planes[V3DV_MAX_PLANE_COUNT]; + + /* Used only when sampling a linear texture (which V3D doesn't support). + * This would represent a view over the tiled shadow image. */ - uint8_t texture_shader_state[2][V3DV_TEXTURE_SHADER_STATE_LENGTH]; + struct v3dv_image_view *shadow; }; -uint32_t v3dv_layer_offset(const struct v3dv_image *image, uint32_t level, uint32_t layer); +VkResult v3dv_create_image_view(struct v3dv_device *device, + const VkImageViewCreateInfo *pCreateInfo, + VkImageView *pView); + +uint32_t v3dv_layer_offset(const struct v3dv_image *image, uint32_t level, uint32_t layer, + uint8_t plane); struct v3dv_buffer { struct vk_object_base base; @@ -590,6 +844,15 @@ struct v3dv_buffer { VkDeviceSize mem_offset; }; +void +v3dv_buffer_init(struct v3dv_device *device, + const VkBufferCreateInfo *pCreateInfo, + struct v3dv_buffer *buffer, + uint32_t alignment); + +void +v3dv_buffer_bind_memory(const VkBindBufferMemoryInfo *info); + struct v3dv_buffer_view { struct vk_object_base base; @@ -622,6 +885,8 @@ struct v3dv_subpass { struct v3dv_subpass_attachment *resolve_attachments; struct v3dv_subpass_attachment ds_attachment; + struct v3dv_subpass_attachment ds_resolve_attachment; + bool resolve_depth, resolve_stencil; /* If we need to emit the clear of the depth/stencil attachment using a * a draw call instead of using the TLB (GFXH-1461). @@ -634,7 +899,7 @@ struct v3dv_subpass { }; struct v3dv_render_pass_attachment { - VkAttachmentDescription desc; + VkAttachmentDescription2 desc; uint32_t first_subpass; uint32_t last_subpass; @@ -650,10 +915,11 @@ struct v3dv_render_pass_attachment { uint32_t last_subpass; } views[MAX_MULTIVIEW_VIEW_COUNT]; - /* If this is a multismapled attachment that is going to be resolved, - * whether we can use the TLB resolve on store. + /* If this is a multisampled attachment that is going to be resolved, + * whether we may be able to use the TLB hardware resolve based on the + * attachment format. */ - bool use_tlb_resolve; + bool try_tlb_resolve; }; struct v3dv_render_pass { @@ -678,7 +944,7 @@ struct v3dv_framebuffer { uint32_t layers; /* Typically, edge tiles in the framebuffer have padding depending on the - * underlying tiling layout. One consequnce of this is that when the + * underlying tiling layout. One consequence of this is that when the * framebuffer dimensions are not aligned to tile boundaries, tile stores * would still write full tiles on the edges and write to the padded area. * If the framebuffer is aliasing a smaller region of a larger image, then @@ -690,6 +956,11 @@ struct v3dv_framebuffer { uint32_t attachment_count; uint32_t color_attachment_count; + + /* Notice that elements in 'attachments' will be NULL if the framebuffer + * was created imageless. The driver is expected to access attachment info + * from the command buffer state instead. 
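+ *
+ * i.e. consumers of this array would be expected to do something like
+ * this (an illustrative sketch, not the exact driver code):
+ *
+ *    struct v3dv_image_view *iview = fb->attachments[i] ?
+ *       fb->attachments[i] :
+ *       cmd_buffer->state.attachments[i].image_view;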
+ */ struct v3dv_image_view *attachments[0]; }; @@ -699,7 +970,9 @@ struct v3dv_frame_tiling { uint32_t layers; uint32_t render_target_count; uint32_t internal_bpp; + uint32_t total_color_bpp; bool msaa; + bool double_buffer; uint32_t tile_width; uint32_t tile_height; uint32_t draw_tiles_x; @@ -710,22 +983,26 @@ struct v3dv_frame_tiling { uint32_t frame_height_in_supertiles; }; -void v3dv_framebuffer_compute_internal_bpp_msaa(const struct v3dv_framebuffer *framebuffer, - const struct v3dv_subpass *subpass, - uint8_t *max_bpp, bool *msaa); - bool v3dv_subpass_area_is_tile_aligned(struct v3dv_device *device, const VkRect2D *area, struct v3dv_framebuffer *fb, struct v3dv_render_pass *pass, uint32_t subpass_idx); -struct v3dv_cmd_pool { - struct vk_object_base base; - - VkAllocationCallbacks alloc; - struct list_head cmd_buffers; -}; +/* Checks if we need to emit 2 initial tile clears for double buffer mode. + * This happens when we render at least 2 tiles, because in this mode each + * tile uses a different half of the tile buffer memory so we can have 2 tiles + * in flight (one being stored to memory and the next being rendered). In this + * scenario, if we emit a single initial tile clear we would only clear the + * first half of the tile buffer. + */ +static inline bool +v3dv_do_double_initial_tile_clear(const struct v3dv_frame_tiling *tiling) +{ + return tiling->double_buffer && + (tiling->draw_tiles_x > 1 || tiling->draw_tiles_y > 1 || + tiling->layers > 1); +} enum v3dv_cmd_buffer_status { V3DV_CMD_BUFFER_STATUS_NEW = 0, @@ -748,100 +1025,67 @@ struct v3dv_cmd_buffer_attachment_state { /* The hardware clear value */ union v3dv_clear_value clear_value; + + /* The underlying image view (from the framebuffer or, if imageless + * framebuffer is used, from VkRenderPassAttachmentBeginInfo). + */ + struct v3dv_image_view *image_view; + + /* If this is a multisampled attachment with a resolve operation. */ + bool has_resolve; + + /* If this is a multisampled attachment with a resolve operation, + * whether we can use the TLB for the resolve. + */ + bool use_tlb_resolve; }; +/* Cached values derived from Vulkan viewport/count */ struct v3dv_viewport_state { - uint32_t count; - VkViewport viewports[MAX_VIEWPORTS]; float translate[MAX_VIEWPORTS][3]; float scale[MAX_VIEWPORTS][3]; }; -struct v3dv_scissor_state { - uint32_t count; - VkRect2D scissors[MAX_SCISSORS]; -}; - -/* Mostly a v3dv mapping of VkDynamicState, used to track which data as - * defined as dynamic - */ -enum v3dv_dynamic_state_bits { - V3DV_DYNAMIC_VIEWPORT = 1 << 0, - V3DV_DYNAMIC_SCISSOR = 1 << 1, - V3DV_DYNAMIC_STENCIL_COMPARE_MASK = 1 << 2, - V3DV_DYNAMIC_STENCIL_WRITE_MASK = 1 << 3, - V3DV_DYNAMIC_STENCIL_REFERENCE = 1 << 4, - V3DV_DYNAMIC_BLEND_CONSTANTS = 1 << 5, - V3DV_DYNAMIC_DEPTH_BIAS = 1 << 6, - V3DV_DYNAMIC_LINE_WIDTH = 1 << 7, - V3DV_DYNAMIC_COLOR_WRITE_ENABLE = 1 << 8, - V3DV_DYNAMIC_ALL = (1 << 9) - 1, -}; - -/* Flags for dirty pipeline state. +/* Flags for custom dirty state that could lead to packet emission. + * + * Note *custom*: for all the dynamic state tracking coming from the Vulkan + * API, we use the Mesa runtime framework and its predefined flags + * (MESA_VK_DYNAMIC_XXX). + * + * Here we define additional flags used to track dirty state.
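+ *
+ * The expected usage pattern (an illustrative sketch) is the usual
+ * set / test / clear sequence around packet emission:
+ *
+ *    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PIPELINE;
+ *    ...
+ *    if (cmd_buffer->state.dirty & V3DV_CMD_DIRTY_PIPELINE)
+ *       emit_graphics_pipeline(cmd_buffer);  // hypothetical emitter
+ *    cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_PIPELINE;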
*/ enum v3dv_cmd_dirty_bits { - V3DV_CMD_DIRTY_VIEWPORT = 1 << 0, - V3DV_CMD_DIRTY_SCISSOR = 1 << 1, - V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK = 1 << 2, - V3DV_CMD_DIRTY_STENCIL_WRITE_MASK = 1 << 3, - V3DV_CMD_DIRTY_STENCIL_REFERENCE = 1 << 4, - V3DV_CMD_DIRTY_PIPELINE = 1 << 5, - V3DV_CMD_DIRTY_COMPUTE_PIPELINE = 1 << 6, - V3DV_CMD_DIRTY_VERTEX_BUFFER = 1 << 7, - V3DV_CMD_DIRTY_INDEX_BUFFER = 1 << 8, - V3DV_CMD_DIRTY_DESCRIPTOR_SETS = 1 << 9, - V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS = 1 << 10, - V3DV_CMD_DIRTY_PUSH_CONSTANTS = 1 << 11, - V3DV_CMD_DIRTY_BLEND_CONSTANTS = 1 << 12, - V3DV_CMD_DIRTY_OCCLUSION_QUERY = 1 << 13, - V3DV_CMD_DIRTY_DEPTH_BIAS = 1 << 14, - V3DV_CMD_DIRTY_LINE_WIDTH = 1 << 15, - V3DV_CMD_DIRTY_VIEW_INDEX = 1 << 16, - V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE = 1 << 17, + V3DV_CMD_DIRTY_PIPELINE = 1 << 0, + V3DV_CMD_DIRTY_COMPUTE_PIPELINE = 1 << 1, + V3DV_CMD_DIRTY_VERTEX_BUFFER = 1 << 2, + V3DV_CMD_DIRTY_INDEX_BUFFER = 1 << 3, + V3DV_CMD_DIRTY_DESCRIPTOR_SETS = 1 << 4, + V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS = 1 << 5, + V3DV_CMD_DIRTY_PUSH_CONSTANTS = 1 << 6, + V3DV_CMD_DIRTY_PUSH_CONSTANTS_UBO = 1 << 7, + V3DV_CMD_DIRTY_OCCLUSION_QUERY = 1 << 8, + V3DV_CMD_DIRTY_VIEW_INDEX = 1 << 9, + V3DV_CMD_DIRTY_DRAW_ID = 1 << 10, + V3DV_CMD_DIRTY_ALL = (1 << 11) - 1, }; struct v3dv_dynamic_state { - /** - * Bitmask of (1 << VK_DYNAMIC_STATE_*). - * Defines the set of saved dynamic state. + /* FIXME: we keep some viewport info cached (translate, scale) because we + * use it in more than one place. But note that translate_z and scale_z + * are also used in several places, and we recompute them based on + * scissor/viewport info all the time. So perhaps we could do the same with + * the x and y components. */ - uint32_t mask; - struct v3dv_viewport_state viewport; - struct v3dv_scissor_state scissor; - - struct { - uint32_t front; - uint32_t back; - } stencil_compare_mask; - - struct { - uint32_t front; - uint32_t back; - } stencil_write_mask; - - struct { - uint32_t front; - uint32_t back; - } stencil_reference; - - float blend_constants[4]; - - struct { - float constant_factor; - float depth_bias_clamp; - float slope_factor; - } depth_bias; - - float line_width; - + /* We cache the color_write_enable because the Vulkan runtime keeps an + * 8-bit bitset with a bit per attachment, but in order to combine it with + * the color_write_masks it is easier to cache a 32-bit bitset with 4 bits + * per attachment.
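+ *
+ * A sketch of the 1-bit to 4-bit per attachment expansion this refers
+ * to (illustrative, not the exact driver code):
+ *
+ *    uint32_t expanded = 0;
+ *    for (int i = 0; i < 8; i++) {
+ *       if (vk_bitset & (1 << i))
+ *          expanded |= 0xfu << (4 * i);
+ *    }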
+ */ uint32_t color_write_enable; }; -extern const struct v3dv_dynamic_state default_dynamic_state; - void v3dv_viewport_compute_xform(const VkViewport *viewport, float scale[3], float translate[3]); @@ -855,15 +1099,12 @@ enum v3dv_ez_state { enum v3dv_job_type { V3DV_JOB_TYPE_GPU_CL = 0, - V3DV_JOB_TYPE_GPU_CL_SECONDARY, + V3DV_JOB_TYPE_GPU_CL_INCOMPLETE, V3DV_JOB_TYPE_GPU_TFU, V3DV_JOB_TYPE_GPU_CSD, V3DV_JOB_TYPE_CPU_RESET_QUERIES, V3DV_JOB_TYPE_CPU_END_QUERY, V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS, - V3DV_JOB_TYPE_CPU_SET_EVENT, - V3DV_JOB_TYPE_CPU_WAIT_EVENTS, - V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE, V3DV_JOB_TYPE_CPU_CSD_INDIRECT, V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY, }; @@ -874,7 +1115,7 @@ struct v3dv_reset_query_cpu_job_info { uint32_t count; }; -struct v3dv_end_query_cpu_job_info { +struct v3dv_end_query_info { struct v3dv_query_pool *pool; uint32_t query; @@ -892,31 +1133,14 @@ struct v3dv_copy_query_results_cpu_job_info { VkQueryResultFlags flags; }; -struct v3dv_event_set_cpu_job_info { - struct v3dv_event *event; - int state; -}; - -struct v3dv_event_wait_cpu_job_info { - /* List of events to wait on */ - uint32_t event_count; - struct v3dv_event **events; - - /* Whether any postponed jobs after the wait should wait on semaphores */ - bool sem_wait; -}; +struct v3dv_submit_sync_info { + /* List of syncs to wait before running a job */ + uint32_t wait_count; + struct vk_sync_wait *waits; -struct v3dv_copy_buffer_to_image_cpu_job_info { - struct v3dv_image *image; - struct v3dv_buffer *buffer; - uint32_t buffer_offset; - uint32_t buffer_stride; - uint32_t buffer_layer_stride; - VkOffset3D image_offset; - VkExtent3D image_extent; - uint32_t mip_level; - uint32_t base_layer; - uint32_t layer_count; + /* List of syncs to signal when all jobs complete */ + uint32_t signal_count; + struct vk_sync_signal *signals; }; struct v3dv_csd_indirect_cpu_job_info { @@ -936,6 +1160,19 @@ struct v3dv_timestamp_query_cpu_job_info { uint32_t count; }; +/* Number of perfmons required to handle all supported performance counters */ +#define V3DV_MAX_PERFMONS DIV_ROUND_UP(V3D_MAX_PERFCNT, \ + DRM_V3D_MAX_PERF_COUNTERS) + +struct v3dv_perf_query { + uint32_t kperfmon_ids[V3DV_MAX_PERFMONS]; + + /* A DRM syncobj to wait on the GPU jobs for which we are collecting + * performance data. + */ + struct vk_sync *last_job_sync; +}; + struct v3dv_job { struct list_head list_link; @@ -945,6 +1182,61 @@ struct v3dv_job { */ bool is_clone; + /* If this is a cloned job, whether it has its own BCL resource. This + * happens when we suspend jobs in command buffers with the + * VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT flag. + */ + bool clone_owns_bcl; + + /* VK_KHR_dynamic_rendering */ + bool suspending; + bool resuming; + struct v3dv_cl_out *suspend_branch_inst_ptr; + uint32_t suspended_bcl_end; + + /* If the job executes on the transfer stage of the pipeline */ + bool is_transfer; + + /* VK_KHR_buffer_device_address allows shaders to use pointers that can + * dereference memory in any buffer that has been flagged with + * VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT. These buffers may not + * be bound via descriptor sets, so we need to make sure that a job that + * uses this functionality includes all these buffers in its kernel + * submission. + */ + bool uses_buffer_device_address; + + /* True if we have not identified anything that would be incompatible + * with double-buffer (like MSAA) or that would make double-buffer mode + * not efficient (like tile loads or not having any stores).
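+ *
+ * A purely hypothetical sketch of how this flag could feed the final
+ * decision (the real heuristic also weighs double_buffer_score below):
+ *
+ *    if (job->can_use_double_buffer &&
+ *        double_buffer_score_is_favorable(job))  // hypothetical helper
+ *       enable_double_buffer(job);               // hypothetical helper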
+ */ + bool can_use_double_buffer; + + /* This structure keeps track of various scores to inform a heuristic + * for double-buffer mode. + */ + struct { + /* Cost of geometry shading */ + uint32_t geom; + /* Cost of shader rendering */ + uint32_t render; + } double_buffer_score; + + /* We only need to allocate tile state for all layers if the binner + * writes primitives to layers other than the first. This can only be + * done using layered rendering (writing gl_Layer from a geometry shader), + * so for other cases of multilayered framebuffers (typically with + * meta copy/clear operations) that won't use layered rendering, we only + * need one layer worth of tile state for the binner. + */ + bool allocate_tile_state_for_all_layers; + + /* A pointer to the location of the TILE_BINNING_MODE_CFG packet so we can + * rewrite it to enable double-buffer mode by the time we have enough info + * about the job to make that decision. + */ + struct v3dv_cl_out *bcl_tile_binning_mode_ptr; + enum v3dv_job_type type; struct v3dv_device *device; @@ -988,6 +1280,9 @@ struct v3dv_job { */ bool decided_global_ez_enable; + /* If the job emitted any draw calls with Early Z/S enabled */ + bool has_ez_draws; + /* If this job has been configured to use early Z/S clear */ bool early_zs_clear; @@ -1000,8 +1295,10 @@ struct v3dv_job { */ bool always_flush; - /* Whether we need to serialize this job in our command stream */ - bool serialize; + /* A mask of V3DV_BARRIER_* indicating the source(s) of the barrier. We + * can use this to select the hw queues where we need to serialize the job. + */ + uint8_t serialize; /* If this is a CL job, whether we should sync before binning */ bool needs_bcl_sync; @@ -1009,11 +1306,8 @@ struct v3dv_job { /* Job specs for CPU jobs */ union { struct v3dv_reset_query_cpu_job_info query_reset; - struct v3dv_end_query_cpu_job_info query_end; + struct v3dv_end_query_info query_end; struct v3dv_copy_query_results_cpu_job_info query_copy_results; - struct v3dv_event_set_cpu_job_info event_set; - struct v3dv_event_wait_cpu_job_info event_wait; - struct v3dv_copy_buffer_to_image_cpu_job_info copy_buffer_to_image; struct v3dv_csd_indirect_cpu_job_info csd_indirect; struct v3dv_timestamp_query_cpu_job_info query_timestamp; } cpu; @@ -1028,6 +1322,9 @@ struct v3dv_job { uint32_t wg_base[3]; struct drm_v3d_submit_csd submit; } csd; + + /* Perfmons with last job sync for CSD and CL jobs */ + struct v3dv_perf_query *perf; }; void v3dv_job_init(struct v3dv_job *job, @@ -1045,10 +1342,17 @@ void v3dv_job_start_frame(struct v3dv_job *job, uint32_t height, uint32_t layers, bool allocate_tile_state_for_all_layers, + bool allocate_tile_state_now, uint32_t render_target_count, uint8_t max_internal_bpp, + uint8_t total_color_bpp, bool msaa); +bool v3dv_job_type_is_gpu(struct v3dv_job *job); + +struct v3dv_job * +v3dv_job_clone(struct v3dv_job *job, bool skip_bcl); + struct v3dv_job * v3dv_job_clone_in_cmd_buffer(struct v3dv_job *job, struct v3dv_cmd_buffer *cmd_buffer); @@ -1065,7 +1369,26 @@ v3dv_cmd_buffer_ensure_array_state(struct v3dv_cmd_buffer *cmd_buffer, uint32_t *alloc_count, void **ptr); -void v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer); +void v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer, + bool indexed, bool indirect, + uint32_t vertex_count); + +bool v3dv_job_allocate_tile_state(struct v3dv_job *job); + +void +v3dv_setup_dynamic_framebuffer(struct v3dv_cmd_buffer *cmd_buffer, + const VkRenderingInfoKHR *pRenderingInfo); + +void
+v3dv_destroy_dynamic_framebuffer(struct v3dv_cmd_buffer *cmd_buffer); + +void +v3dv_setup_dynamic_render_pass(struct v3dv_cmd_buffer *cmd_buffer, + const VkRenderingInfoKHR *pRenderingInfo); + +void +v3dv_setup_dynamic_render_pass_inheritance(struct v3dv_cmd_buffer *cmd_buffer, + const VkCommandBufferInheritanceRenderingInfo *info); /* FIXME: only used on v3dv_cmd_buffer and v3dvx_cmd_buffer, perhaps move to a * cmd_buffer specific header? @@ -1094,9 +1417,46 @@ struct v3dv_cmd_pipeline_state { struct v3dv_descriptor_state descriptor_state; }; +enum { + V3DV_BARRIER_GRAPHICS_BIT = (1 << 0), + V3DV_BARRIER_COMPUTE_BIT = (1 << 1), + V3DV_BARRIER_TRANSFER_BIT = (1 << 2), + V3DV_BARRIER_CPU_BIT = (1 << 3), +}; +#define V3DV_BARRIER_ALL (V3DV_BARRIER_GRAPHICS_BIT | \ + V3DV_BARRIER_TRANSFER_BIT | \ + V3DV_BARRIER_COMPUTE_BIT | \ + V3DV_BARRIER_CPU_BIT); + +struct v3dv_barrier_state { + /* Mask of V3DV_BARRIER_* indicating where we consume a barrier. */ + uint8_t dst_mask; + + /* For each possible consumer of a barrier, a mask of V3DV_BARRIER_* + * indicating the sources of the dependency. + */ + uint8_t src_mask_graphics; + uint8_t src_mask_transfer; + uint8_t src_mask_compute; + + /* For graphics barriers, access masks involved. Used to decide if we need + * to execute a binning or render barrier. + */ + VkAccessFlags2 bcl_buffer_access; + VkAccessFlags2 bcl_image_access; +}; + struct v3dv_cmd_buffer_state { struct v3dv_render_pass *pass; struct v3dv_framebuffer *framebuffer; + + /* VK_KHR_dynamic_rendering */ + struct v3dv_render_pass dynamic_pass; + struct v3dv_subpass dynamic_subpass; + struct v3dv_render_pass_attachment dynamic_attachments[18 /* (8 color + D/S) x 2 (for resolves) */]; + struct v3dv_subpass_attachment dynamic_subpass_attachments[18]; + struct v3dv_framebuffer *dynamic_framebuffer; + VkRect2D render_area; /* Current job being recorded */ @@ -1107,8 +1467,16 @@ struct v3dv_cmd_buffer_state { struct v3dv_cmd_pipeline_state gfx; struct v3dv_cmd_pipeline_state compute; + /* For most state tracking we rely on vk_dynamic_graphics_state, but we + * maintain a custom structure for some state-related data that we want to + * cache. + */ struct v3dv_dynamic_state dynamic; + /* This dirty is for v3dv_cmd_dirty_bits (FIXME: perhaps we should be more + * explicit about it). For dirty flags coming from Vulkan dynamic state, + * use the vk_dynamic_graphics_state handled by the vk_cmd_buffer + */ uint32_t dirty; VkShaderStageFlagBits dirty_descriptor_stages; VkShaderStageFlagBits dirty_push_constants_stages; @@ -1128,6 +1496,14 @@ struct v3dv_cmd_buffer_state { */ bool tile_aligned_render_area; + /* FIXME: we have just one client-side BO for the push constants, + * independently of the stageFlags in vkCmdPushConstants, and the + * pipelineBindPoint in vkCmdBindPipeline. We could probably do more stage + * tuning in the future if it makes sense. + */ + uint32_t push_constants_size; + uint32_t push_constants_data[MAX_PUSH_CONSTANTS_SIZE / 4]; + uint32_t attachment_alloc_count; struct v3dv_cmd_buffer_attachment_state *attachments; @@ -1151,14 +1527,21 @@ struct v3dv_cmd_buffer_state { /* Current view index for multiview rendering */ uint32_t view_index; + /* Current draw ID for multidraw */ + uint32_t draw_id; + /* Used to flag OOM conditions during command buffer recording */ bool oom; - /* Whether we have recorded a pipeline barrier that we still need to - * process. 
- */ - bool has_barrier; - bool has_bcl_barrier; + /* If we are currently recording job(s) for a transfer operation */ + bool is_transfer; + + /* VK_KHR_dynamic_rendering */ + bool suspending; + bool resuming; + + /* Barrier state tracking */ + struct v3dv_barrier_state barrier; /* Secondary command buffer state */ struct { @@ -1178,12 +1561,14 @@ struct v3dv_cmd_buffer_state { bool tile_aligned_render_area; VkRect2D render_area; + struct vk_dynamic_graphics_state dynamic_graphics_state; struct v3dv_dynamic_state dynamic; struct v3dv_cmd_pipeline_state gfx; bool has_descriptor_state; uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4]; + uint32_t push_constants_size; } meta; /* Command buffer state for queries */ struct { uint32_t used_count; uint32_t alloc_count; - struct v3dv_end_query_cpu_job_info *states; + struct v3dv_end_query_info *states; } end; - /* This BO is not NULL if we have an active query, that is, we have - * called vkCmdBeginQuery but not vkCmdEndQuery. - */ struct { + /* This BO is not NULL if we have an active occlusion query, that is, + * we have called vkCmdBeginQuery but not vkCmdEndQuery. + */ struct v3dv_bo *bo; uint32_t offset; + /* When the driver emits draw calls to implement other operations in + * the middle of a render pass (such as an attachment clear), we need + * to pause occlusion query recording and resume it later so that + * these draw calls don't register in occlusion counters. We use + * this to store the BO reference in which we should resume occlusion + * query counters after the driver is done emitting its draw calls. + */ + struct v3dv_bo *paused_bo; + + /* This pointer is not NULL if we have an active performance query */ + struct v3dv_perf_query *perf; } active_query; } query; + + /* This is dynamic state since VK_EXT_extended_dynamic_state. */ + bool z_updates_enable; + + /* ez_state can be dynamic since VK_EXT_extended_dynamic_state so we need + * to keep track of it in the cmd_buffer state + */ + enum v3dv_ez_state ez_state; + + /* incompatible_ez_test can be dynamic since VK_EXT_extended_dynamic_state + * so we need to keep track of it in the cmd_buffer state + */ + bool incompatible_ez_test; + }; +void +v3dv_cmd_buffer_state_get_viewport_z_xform(struct v3dv_cmd_buffer *cmd_buffer, + uint32_t vp_idx, + float *translate_z, float *scale_z); + /* The following struct represents the info from a descriptor that we store on * the host memory. They are mostly links to other existing vulkan objects, * like the image_view in order to access to swizzle info, or the buffer used @@ -1228,8 +1643,8 @@ struct v3dv_descriptor { struct { struct v3dv_buffer *buffer; - uint32_t offset; - uint32_t range; + size_t offset; + size_t range; }; struct v3dv_buffer_view *buffer_view; @@ -1237,28 +1652,90 @@ }; struct v3dv_query { + /* Used by queries where we implement result copying in the CPU so we can + * tell if the relevant jobs have been submitted for execution. Currently + * these are all but occlusion queries.
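+ *
+ * A sketch of the CPU-side availability check this enables (illustrative
+ * pseudo-code; the exact wait also depends on the query type):
+ *
+ *    bool available = query->maybe_available &&
+ *                     relevant_jobs_completed(query);  // e.g. waiting on
+ *                                                      // a vk_sync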
+ */ bool maybe_available; + union { - /* Used by GPU queries (occlusion) */ + /* Used by occlusion queries */ struct { - struct v3dv_bo *bo; + /* Offset of this query in the occlusion query counter BO */ uint32_t offset; - }; - /* Used by CPU queries (timestamp) */ - uint64_t value; + } occlusion; + + /* Used by timestamp queries */ + struct { + /* Offset of this query in the timestamp BO for its value */ + uint32_t offset; + + /* Syncobj to signal timestamp query availability */ + struct vk_sync *sync; + } timestamp; + + /* Used by performance queries */ + struct v3dv_perf_query perf; }; }; struct v3dv_query_pool { struct vk_object_base base; - struct v3dv_bo *bo; /* Only used with GPU queries (occlusion) */ + /* Per-pool Vulkan resources required to implement GPU-side query + * functions (only occlusion queries for now). + */ + struct { + /* Buffer to access the BO with the occlusion query results and + * availability info. + */ + VkBuffer buf; + VkDeviceMemory mem; + + /* Descriptor set for accessing the buffer from a pipeline. */ + VkDescriptorPool descriptor_pool; + VkDescriptorSet descriptor_set; + } meta; + + /* Only used with occlusion queries */ + struct { + /* BO with the occlusion counters and query availability */ + struct v3dv_bo *bo; + /* Offset of the availability info in the BO */ + uint32_t avail_offset; + } occlusion; + + /* Only used with timestamp queries */ + struct { + /* BO with the query timestamp values */ + struct v3dv_bo *bo; + } timestamp; + + /* Only used with performance queries */ + struct { + uint32_t ncounters; + uint8_t counters[V3D_MAX_PERFCNT]; + + /* V3D has a limit on the number of counters we can track in a + * single performance monitor, so if too many counters are requested + * we need to create multiple monitors to record all of them. This + * field represents the number of monitors required for the number + * of counters requested. + */ + uint8_t nperfmons; + } perfmon; VkQueryType query_type; uint32_t query_count; struct v3dv_query *queries; }; +VkResult +v3dv_query_allocate_resources(struct v3dv_device *device); + +void +v3dv_query_free_resources(struct v3dv_device *device); + VkResult v3dv_get_query_pool_results_cpu(struct v3dv_device *device, struct v3dv_query_pool *pool, uint32_t first, @@ -1267,6 +1744,16 @@ VkResult v3dv_get_query_pool_results_cpu(struct v3dv_device *device, VkDeviceSize stride, VkQueryResultFlags flags); +void v3dv_reset_query_pool_cpu(struct v3dv_device *device, + struct v3dv_query_pool *query_pool, + uint32_t first, + uint32_t last); + +void v3dv_cmd_buffer_emit_set_query_availability(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_query_pool *pool, + uint32_t query, uint32_t count, + uint8_t availability); + typedef void (*v3dv_cmd_buffer_private_obj_destroy_cb)(VkDevice device, uint64_t pobj, VkAllocationCallbacks *alloc); @@ -1276,33 +1763,20 @@ struct v3dv_cmd_buffer_private_obj { v3dv_cmd_buffer_private_obj_destroy_cb destroy_cb; }; +extern const struct vk_command_buffer_ops v3dv_cmd_buffer_ops; + struct v3dv_cmd_buffer { - struct vk_object_base base; + struct vk_command_buffer vk; struct v3dv_device *device; - struct v3dv_cmd_pool *pool; - struct list_head pool_link; - - /* Used at submit time to link command buffers in the submission that have - * spawned wait threads, so we can then wait on all of them to complete - * before we process any signal sempahores or fences.
- */ - struct list_head list_link; - VkCommandBufferUsageFlags usage_flags; - VkCommandBufferLevel level; enum v3dv_cmd_buffer_status status; struct v3dv_cmd_buffer_state state; - /* FIXME: we have just one client-side and bo for the push constants, - * independently of the stageFlags in vkCmdPushConstants, and the - * pipelineBindPoint in vkCmdBindPipeline. We could probably do more stage - * tunning in the future if it makes sense. - */ - uint32_t push_constants_data[MAX_PUSH_CONSTANTS_SIZE / 4]; + /* Buffer where we upload push constant data to resolve indirect indexing */ struct v3dv_cl_reloc push_constants_resource; /* Collection of Vulkan objects created internally by the driver (typically @@ -1321,6 +1795,10 @@ struct v3dv_cmd_buffer { /* The current descriptor pool for texel buffer copy sources */ VkDescriptorPool dspool; } texel_buffer_copy; + struct { + /* The current descriptor pool for the copy query results output buffer */ + VkDescriptorPool dspool; + } query; } meta; /* List of jobs in the command buffer. For primary command buffers it @@ -1346,19 +1824,16 @@ void v3dv_cmd_buffer_subpass_finish(struct v3dv_cmd_buffer *cmd_buffer); void v3dv_cmd_buffer_meta_state_push(struct v3dv_cmd_buffer *cmd_buffer, bool push_descriptor_state); void v3dv_cmd_buffer_meta_state_pop(struct v3dv_cmd_buffer *cmd_buffer, - uint32_t dirty_dynamic_state, bool needs_subpass_resume); -void v3dv_cmd_buffer_reset_queries(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_query_pool *pool, - uint32_t first, - uint32_t count); - void v3dv_cmd_buffer_begin_query(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_query_pool *pool, uint32_t query, VkQueryControlFlags flags); +void v3dv_cmd_buffer_pause_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer); +void v3dv_cmd_buffer_resume_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer); + void v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_query_pool *pool, uint32_t query); @@ -1375,38 +1850,58 @@ void v3dv_cmd_buffer_copy_query_results(struct v3dv_cmd_buffer *cmd_buffer, void v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer, struct drm_v3d_submit_tfu *tfu); -void v3dv_cmd_buffer_rewrite_indirect_csd_job(struct v3dv_csd_indirect_cpu_job_info *info, +void v3dv_cmd_buffer_rewrite_indirect_csd_job(struct v3dv_device *device, + struct v3dv_csd_indirect_cpu_job_info *info, const uint32_t *wg_counts); void v3dv_cmd_buffer_add_private_obj(struct v3dv_cmd_buffer *cmd_buffer, uint64_t obj, v3dv_cmd_buffer_private_obj_destroy_cb destroy_cb); -struct v3dv_semaphore { - struct vk_object_base base; +void v3dv_cmd_buffer_merge_barrier_state(struct v3dv_barrier_state *dst, + struct v3dv_barrier_state *src); - /* A syncobject handle associated with this semaphore */ - uint32_t sync; +void v3dv_cmd_buffer_consume_bcl_sync(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_job *job); - /* A temporary syncobject handle produced from a vkImportSemaphoreFd. 
*/ - uint32_t temp_sync; -}; +bool v3dv_cmd_buffer_check_needs_load(const struct v3dv_cmd_buffer_state *state, + VkImageAspectFlags aspect, + uint32_t first_subpass_idx, + VkAttachmentLoadOp load_op, + uint32_t last_subpass_idx, + VkAttachmentStoreOp store_op); -struct v3dv_fence { - struct vk_object_base base; +bool v3dv_cmd_buffer_check_needs_store(const struct v3dv_cmd_buffer_state *state, + VkImageAspectFlags aspect, + uint32_t last_subpass_idx, + VkAttachmentStoreOp store_op); - /* A syncobject handle associated with this fence */ - uint32_t sync; +void v3dv_cmd_buffer_emit_pipeline_barrier(struct v3dv_cmd_buffer *cmd_buffer, + const VkDependencyInfo *info); - /* A temporary syncobject handle produced from a vkImportFenceFd. */ - uint32_t temp_sync; -}; +bool v3dv_cmd_buffer_copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_image *dst, + struct v3dv_image *src, + const VkImageCopy2 *region); struct v3dv_event { struct vk_object_base base; - int state; + + /* Link in the device list of pre-allocated free events */ + struct list_head link; + + /* Each event gets a different index, which we use to compute the offset + * in the BO we use to track their state (signaled vs reset). + */ + uint32_t index; }; +VkResult +v3dv_event_allocate_resources(struct v3dv_device *device); + +void +v3dv_event_free_resources(struct v3dv_device *device); + struct v3dv_shader_variant { enum broadcom_shader_stage stage; @@ -1428,9 +1923,11 @@ struct v3dv_shader_variant { */ uint32_t assembly_offset; - /* Note: it is really likely that qpu_insts would be NULL, as it will be - * used only temporarily, to upload it to the shared bo, as we compile the - * different stages individually. + /* Note: don't assume qpu_insts is always NULL or always non-NULL. In + * general we will try to free it as soon as we upload it to the shared bo + * while we compile the different stages. But we can decide to keep it + * around based on some pipeline creation flags, like + * VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT. */ uint64_t *qpu_insts; uint32_t qpu_insts_size; @@ -1462,7 +1959,9 @@ struct v3dv_pipeline_stage { /** A name for this program, so you can track it in shader-db output. */ uint32_t program_id; - VkPipelineCreationFeedbackEXT feedback; + VkPipelineCreationFeedback feedback; + + struct vk_pipeline_robustness_state robustness; }; /* We are using the descriptor pool entry for two things: @@ -1486,6 +1985,9 @@ struct v3dv_descriptor_pool_entry struct v3dv_descriptor_pool { struct vk_object_base base; + /* A list with all descriptor sets allocated from the pool. */ + struct list_head set_list; + /* If this descriptor pool has been allocated for the driver for internal * use, typically to implement meta operations.
*/ @@ -1515,9 +2017,12 @@ struct v3dv_descriptor_pool { struct v3dv_descriptor_set { struct vk_object_base base; + /* List link into the list of all sets allocated from the pool */ + struct list_head pool_link; + struct v3dv_descriptor_pool *pool; - const struct v3dv_descriptor_set_layout *layout; + struct v3dv_descriptor_set_layout *layout; /* Offset relative to the descriptor pool bo for this set */ uint32_t base_offset; @@ -1533,7 +2038,7 @@ struct v3dv_descriptor_set_binding_layout { /* Number of array elements in this binding */ uint32_t array_size; - /* Index into the flattend descriptor set */ + /* Index into the flattened descriptor set */ uint32_t descriptor_index; uint32_t dynamic_offset_count; @@ -1548,6 +2053,11 @@ struct v3dv_descriptor_set_binding_layout { * if there are no immutable samplers. */ uint32_t immutable_samplers_offset; + + /* Descriptors for multiplanar combined image samplers are larger. + * For mutable descriptors, this is always 1. + */ + uint8_t plane_stride; }; struct v3dv_descriptor_set_layout { @@ -1571,10 +2081,35 @@ struct v3dv_descriptor_set_layout { /* Number of dynamic offsets used by this descriptor set */ uint16_t dynamic_offset_count; + /* Descriptor set layouts can be destroyed even if they are still being + * used. + */ + uint32_t ref_cnt; + /* Bindings in this descriptor set */ struct v3dv_descriptor_set_binding_layout binding[0]; }; +void +v3dv_descriptor_set_layout_destroy(struct v3dv_device *device, + struct v3dv_descriptor_set_layout *set_layout); + +static inline void +v3dv_descriptor_set_layout_ref(struct v3dv_descriptor_set_layout *set_layout) +{ + assert(set_layout && set_layout->ref_cnt >= 1); + p_atomic_inc(&set_layout->ref_cnt); +} + +static inline void +v3dv_descriptor_set_layout_unref(struct v3dv_device *device, + struct v3dv_descriptor_set_layout *set_layout) +{ + assert(set_layout && set_layout->ref_cnt >= 1); + if (p_atomic_dec_zero(&set_layout->ref_cnt)) + v3dv_descriptor_set_layout_destroy(device, set_layout); +} + struct v3dv_pipeline_layout { struct vk_object_base base; @@ -1590,8 +2125,37 @@ struct v3dv_pipeline_layout { uint32_t dynamic_offset_count; uint32_t push_constant_size; + + /* Pipeline layouts can be destroyed after creating pipelines since + * maintenance4. + */ + uint32_t ref_cnt; + + unsigned char sha1[20]; }; +void +v3dv_pipeline_layout_destroy(struct v3dv_device *device, + struct v3dv_pipeline_layout *layout, + const VkAllocationCallbacks *alloc); + +static inline void +v3dv_pipeline_layout_ref(struct v3dv_pipeline_layout *layout) +{ + assert(layout && layout->ref_cnt >= 1); + p_atomic_inc(&layout->ref_cnt); +} + +static inline void +v3dv_pipeline_layout_unref(struct v3dv_device *device, + struct v3dv_pipeline_layout *layout, + const VkAllocationCallbacks *alloc) +{ + assert(layout && layout->ref_cnt >= 1); + if (p_atomic_dec_zero(&layout->ref_cnt)) + v3dv_pipeline_layout_destroy(device, layout, alloc); +} + /* * We are using descriptor maps for ubo/ssbo and texture/samplers, so we need * it to be big enough to include the max value for all of them. @@ -1599,18 +2163,20 @@ struct v3dv_pipeline_layout { * FIXME: one alternative would be to allocate the map as big as you need for * each descriptor type. That would means more individual allocations. 
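 *
 * For illustration (with made-up limits, not the real driver values): if
 * V3D_MAX_TEXTURE_SAMPLERS were 24, MAX_UNIFORM_BUFFERS +
 * MAX_INLINE_UNIFORM_BUFFERS were 12 + 4, and MAX_STORAGE_BUFFERS were 8,
 * the MAX3() below would size every map at 24 entries.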
*/ -#define DESCRIPTOR_MAP_SIZE MAX3(V3D_MAX_TEXTURE_SAMPLERS, \ - MAX_UNIFORM_BUFFERS, \ +#define DESCRIPTOR_MAP_SIZE MAX3(V3D_MAX_TEXTURE_SAMPLERS, \ + MAX_UNIFORM_BUFFERS + MAX_INLINE_UNIFORM_BUFFERS, \ MAX_STORAGE_BUFFERS) struct v3dv_descriptor_map { - /* TODO: avoid fixed size array/justify the size */ + /* FIXME: avoid fixed size array/justify the size */ unsigned num_desc; /* Number of descriptors */ int set[DESCRIPTOR_MAP_SIZE]; int binding[DESCRIPTOR_MAP_SIZE]; int array_index[DESCRIPTOR_MAP_SIZE]; int array_size[DESCRIPTOR_MAP_SIZE]; + uint8_t plane[DESCRIPTOR_MAP_SIZE]; + bool used[DESCRIPTOR_MAP_SIZE]; /* NOTE: the following is only for sampler, but this is the easier place to * put it. @@ -1620,57 +2186,19 @@ struct v3dv_descriptor_map { struct v3dv_sampler { struct vk_object_base base; + struct vk_ycbcr_conversion *conversion; bool compare_enable; bool unnormalized_coordinates; - bool clamp_to_transparent_black_border; - /* Prepacked SAMPLER_STATE, that is referenced as part of the tmu + /* Prepacked per plane SAMPLER_STATE, that is referenced as part of the tmu * configuration. If needed it will be copied to the descriptor info during * UpdateDescriptorSets */ + uint8_t plane_count; uint8_t sampler_state[V3DV_SAMPLER_STATE_LENGTH]; }; -struct v3dv_descriptor_template_entry { - /* The type of descriptor in this entry */ - VkDescriptorType type; - - /* Binding in the descriptor set */ - uint32_t binding; - - /* Offset at which to write into the descriptor set binding */ - uint32_t array_element; - - /* Number of elements to write into the descriptor set binding */ - uint32_t array_count; - - /* Offset into the user provided data */ - size_t offset; - - /* Stride between elements into the user provided data */ - size_t stride; -}; - -struct v3dv_descriptor_update_template { - struct vk_object_base base; - - VkPipelineBindPoint bind_point; - - /* The descriptor set this template corresponds to. This value is only - * valid if the template was created with the templateType - * VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET. - */ - uint8_t set; - - /* Number of entries in this template */ - uint32_t entry_count; - - /* Entries of the template */ - struct v3dv_descriptor_template_entry entries[0]; -}; - - /* We keep two special values for the sampler idx that represents exactly when a * sampler is not needed/provided. The main use is that even if we don't have * sampler, we still need to do the output unpacking (through @@ -1685,32 +2213,6 @@ struct v3dv_descriptor_update_template { #define V3DV_NO_SAMPLER_16BIT_IDX 0 #define V3DV_NO_SAMPLER_32BIT_IDX 1 -/* - * Following two methods are using on the combined to/from texture/sampler - * indices maps at v3dv_pipeline. 
- */ -static inline uint32_t -v3dv_pipeline_combined_index_key_create(uint32_t texture_index, - uint32_t sampler_index) -{ - return texture_index << 24 | sampler_index; -} - -static inline void -v3dv_pipeline_combined_index_key_unpack(uint32_t combined_index_key, - uint32_t *texture_index, - uint32_t *sampler_index) -{ - uint32_t texture = combined_index_key >> 24; - uint32_t sampler = combined_index_key & 0xffffff; - - if (texture_index) - *texture_index = texture; - - if (sampler_index) - *sampler_index = sampler; -} - struct v3dv_descriptor_maps { struct v3dv_descriptor_map ubo_map; struct v3dv_descriptor_map ssbo_map; @@ -1733,50 +2235,59 @@ struct v3dv_pipeline_shared_data { struct v3dv_bo *assembly_bo; }; +struct v3dv_pipeline_executable_data { + enum broadcom_shader_stage stage; + char *nir_str; + char *qpu_str; +}; + struct v3dv_pipeline { struct vk_object_base base; struct v3dv_device *device; VkShaderStageFlags active_stages; + VkPipelineCreateFlags flags; struct v3dv_render_pass *pass; struct v3dv_subpass *subpass; - /* Note: We can't use just a MESA_SHADER_STAGES array because we also need - * to track binning shaders. Note these will be freed once the pipeline - * has been compiled. - */ - struct v3dv_pipeline_stage *vs; - struct v3dv_pipeline_stage *vs_bin; - struct v3dv_pipeline_stage *gs; - struct v3dv_pipeline_stage *gs_bin; - struct v3dv_pipeline_stage *fs; - struct v3dv_pipeline_stage *cs; + struct v3dv_pipeline_stage *stages[BROADCOM_SHADER_STAGES]; + + /* For VK_KHR_dynamic_rendering */ + struct vk_render_pass_state rendering_info; /* Flags for whether optional pipeline stages are present, for convenience */ bool has_gs; + /* Whether any stage in this pipeline uses VK_KHR_buffer_device_address */ + bool uses_buffer_device_address; + /* Spilling memory requirements */ struct { struct v3dv_bo *bo; uint32_t size_per_thread; } spill; - struct v3dv_dynamic_state dynamic_state; + struct vk_dynamic_graphics_state dynamic_graphics_state; + struct v3dv_dynamic_state dynamic; struct v3dv_pipeline_layout *layout; - /* Whether this pipeline enables depth writes */ - bool z_updates_enable; - enum v3dv_ez_state ez_state; + /* If ez_state is V3D_EZ_DISABLED, whether the reason for disabling is that + * the pipeline selects an incompatible depth test function. + */ + bool incompatible_ez_test; + + bool rasterization_enabled; bool msaa; bool sample_rate_shading; uint32_t sample_mask; bool primitive_restart; + bool negative_one_to_one; /* Accessed by binding. So vb[binding]->stride is the stride of the vertex * array with such binding @@ -1799,12 +2310,18 @@ struct v3dv_pipeline { } va[MAX_VERTEX_ATTRIBS]; uint32_t va_count; - enum pipe_prim_type topology; + enum mesa_prim topology; + + bool line_smooth; struct v3dv_pipeline_shared_data *shared_data; + /* It is the combined stages sha1, layout sha1, plus the pipeline key sha1. */ + unsigned char sha1[20]; + /* In general we can reuse v3dv_device->default_attribute_float, so note - * that the following can be NULL. + * that the following can be NULL. In 7.x this is not used, so it will + * always be NULL. * * FIXME: the content of this BO will be small, so it could be improved to * be uploaded to a common BO.
But as in most cases it will be NULL, it is @@ -1838,6 +2355,11 @@ struct v3dv_pipeline { bool is_z16; } depth_bias; + struct { + void *mem_ctx; + struct util_dynarray data; /* Array of v3dv_pipeline_executable_data */ + } executables; + /* Packets prepacked during pipeline creation */ uint8_t cfg_bits[V3DV_CFG_BITS_LENGTH]; @@ -1848,6 +2370,13 @@ struct v3dv_pipeline { uint8_t stencil_cfg[2][V3DV_STENCIL_CFG_LENGTH]; }; +static inline bool +v3dv_texture_shader_state_has_rb_swap_reverse_bits(const struct v3dv_device *device) +{ + return device->devinfo.ver > 71 || + (device->devinfo.ver == 71 && device->devinfo.rev >= 5); +} + static inline VkPipelineBindPoint v3dv_pipeline_get_binding_point(struct v3dv_pipeline *pipeline) { @@ -1872,28 +2401,17 @@ const nir_shader_compiler_options *v3dv_pipeline_get_nir_options(void); uint32_t v3dv_physical_device_vendor_id(struct v3dv_physical_device *dev); uint32_t v3dv_physical_device_device_id(struct v3dv_physical_device *dev); -VkResult __vk_errorf(struct v3dv_instance *instance, VkResult error, - const char *file, int line, - const char *format, ...); - -#define vk_error(instance, error) __vk_errorf(instance, error, __FILE__, __LINE__, NULL); -#define vk_errorf(instance, error, format, ...) __vk_errorf(instance, error, __FILE__, __LINE__, format, ## __VA_ARGS__); - -#ifdef DEBUG #define v3dv_debug_ignored_stype(sType) \ - fprintf(stderr, "%s: ignored VkStructureType %u:%s\n\n", __func__, (sType), vk_StructureType_to_str(sType)) -#else -#define v3dv_debug_ignored_stype(sType) -#endif + mesa_logd("%s: ignored VkStructureType %u:%s\n\n", __func__, (sType), vk_StructureType_to_str(sType)) -const uint8_t *v3dv_get_format_swizzle(struct v3dv_device *device, VkFormat f); -uint8_t v3dv_get_tex_return_size(const struct v3dv_format *vf, bool compare_enable); +const uint8_t *v3dv_get_format_swizzle(struct v3dv_device *device, VkFormat f, + uint8_t plane); const struct v3dv_format * v3dv_get_compatible_tfu_format(struct v3dv_device *device, uint32_t bpp, VkFormat *out_vk_format); bool v3dv_buffer_format_supports_features(struct v3dv_device *device, VkFormat vk_format, - VkFormatFeatureFlags features); + VkFormatFeatureFlags2 features); struct v3dv_cl_reloc v3dv_write_uniforms(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_pipeline *pipeline, @@ -1953,6 +2471,14 @@ v3dv_descriptor_map_get_descriptor(struct v3dv_descriptor_state *descriptor_stat uint32_t index, uint32_t *dynamic_offset); +struct v3dv_cl_reloc +v3dv_descriptor_map_get_descriptor_bo(struct v3dv_device *device, + struct v3dv_descriptor_state *descriptor_state, + struct v3dv_descriptor_map *map, + struct v3dv_pipeline_layout *pipeline_layout, + uint32_t index, + VkDescriptorType *out_type); + const struct v3dv_sampler * v3dv_descriptor_map_get_sampler(struct v3dv_descriptor_state *descriptor_state, struct v3dv_descriptor_map *map, @@ -1973,13 +2499,6 @@ v3dv_descriptor_map_get_texture_shader_state(struct v3dv_device *device, struct v3dv_pipeline_layout *pipeline_layout, uint32_t index); -const struct v3dv_format* -v3dv_descriptor_map_get_texture_format(struct v3dv_descriptor_state *descriptor_state, - struct v3dv_descriptor_map *map, - struct v3dv_pipeline_layout *pipeline_layout, - uint32_t index, - VkFormat *out_vk_format); - struct v3dv_bo* v3dv_descriptor_map_get_texture_bo(struct v3dv_descriptor_state *descriptor_state, struct v3dv_descriptor_map *map, @@ -2020,71 +2539,56 @@ void v3dv_pipeline_cache_upload_pipeline(struct v3dv_pipeline *pipeline, struct v3dv_pipeline_cache *cache); -struct 
v3dv_bo * -v3dv_pipeline_create_default_attribute_values(struct v3dv_device *device, - struct v3dv_pipeline *pipeline); - -void v3dv_shader_module_internal_init(struct v3dv_device *device, - struct vk_shader_module *module, - nir_shader *nir); - -#define V3DV_DEFINE_HANDLE_CASTS(__v3dv_type, __VkType) \ - \ - static inline struct __v3dv_type * \ - __v3dv_type ## _from_handle(__VkType _handle) \ - { \ - return (struct __v3dv_type *) _handle; \ - } \ - \ - static inline __VkType \ - __v3dv_type ## _to_handle(struct __v3dv_type *_obj) \ - { \ - return (__VkType) _obj; \ - } - -#define V3DV_DEFINE_NONDISP_HANDLE_CASTS(__v3dv_type, __VkType) \ - \ - static inline struct __v3dv_type * \ - __v3dv_type ## _from_handle(__VkType _handle) \ - { \ - return (struct __v3dv_type *)(uintptr_t) _handle; \ - } \ - \ - static inline __VkType \ - __v3dv_type ## _to_handle(struct __v3dv_type *_obj) \ - { \ - return (__VkType)(uintptr_t) _obj; \ - } +VkResult +v3dv_create_compute_pipeline_from_nir(struct v3dv_device *device, + nir_shader *nir, + VkPipelineLayout pipeline_layout, + VkPipeline *pipeline); #define V3DV_FROM_HANDLE(__v3dv_type, __name, __handle) \ - struct __v3dv_type *__name = __v3dv_type ## _from_handle(__handle) - -V3DV_DEFINE_HANDLE_CASTS(v3dv_cmd_buffer, VkCommandBuffer) -V3DV_DEFINE_HANDLE_CASTS(v3dv_device, VkDevice) -V3DV_DEFINE_HANDLE_CASTS(v3dv_instance, VkInstance) -V3DV_DEFINE_HANDLE_CASTS(v3dv_physical_device, VkPhysicalDevice) -V3DV_DEFINE_HANDLE_CASTS(v3dv_queue, VkQueue) - -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_cmd_pool, VkCommandPool) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_buffer, VkBuffer) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_buffer_view, VkBufferView) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_device_memory, VkDeviceMemory) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_pool, VkDescriptorPool) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_set, VkDescriptorSet) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_set_layout, VkDescriptorSetLayout) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_update_template, VkDescriptorUpdateTemplate) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_event, VkEvent) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_fence, VkFence) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_framebuffer, VkFramebuffer) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_image, VkImage) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_image_view, VkImageView) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_pipeline, VkPipeline) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_pipeline_cache, VkPipelineCache) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_pipeline_layout, VkPipelineLayout) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_query_pool, VkQueryPool) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_render_pass, VkRenderPass) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_sampler, VkSampler) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_semaphore, VkSemaphore) + VK_FROM_HANDLE(__v3dv_type, __name, __handle) + +VK_DEFINE_HANDLE_CASTS(v3dv_cmd_buffer, vk.base, VkCommandBuffer, + VK_OBJECT_TYPE_COMMAND_BUFFER) +VK_DEFINE_HANDLE_CASTS(v3dv_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE) +VK_DEFINE_HANDLE_CASTS(v3dv_instance, vk.base, VkInstance, + VK_OBJECT_TYPE_INSTANCE) +VK_DEFINE_HANDLE_CASTS(v3dv_physical_device, vk.base, VkPhysicalDevice, + VK_OBJECT_TYPE_PHYSICAL_DEVICE) +VK_DEFINE_HANDLE_CASTS(v3dv_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE) + +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_buffer, base, VkBuffer, + VK_OBJECT_TYPE_BUFFER) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_buffer_view, base, VkBufferView, + VK_OBJECT_TYPE_BUFFER_VIEW) 
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_device_memory, vk.base, VkDeviceMemory, + VK_OBJECT_TYPE_DEVICE_MEMORY) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_pool, base, VkDescriptorPool, + VK_OBJECT_TYPE_DESCRIPTOR_POOL) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_set, base, VkDescriptorSet, + VK_OBJECT_TYPE_DESCRIPTOR_SET) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_set_layout, base, + VkDescriptorSetLayout, + VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_event, base, VkEvent, VK_OBJECT_TYPE_EVENT) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_framebuffer, base, VkFramebuffer, + VK_OBJECT_TYPE_FRAMEBUFFER) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_image, vk.base, VkImage, + VK_OBJECT_TYPE_IMAGE) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_image_view, vk.base, VkImageView, + VK_OBJECT_TYPE_IMAGE_VIEW) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_pipeline, base, VkPipeline, + VK_OBJECT_TYPE_PIPELINE) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_pipeline_cache, base, VkPipelineCache, + VK_OBJECT_TYPE_PIPELINE_CACHE) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_pipeline_layout, base, VkPipelineLayout, + VK_OBJECT_TYPE_PIPELINE_LAYOUT) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_query_pool, base, VkQueryPool, + VK_OBJECT_TYPE_QUERY_POOL) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_render_pass, base, VkRenderPass, + VK_OBJECT_TYPE_RENDER_PASS) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_sampler, base, VkSampler, + VK_OBJECT_TYPE_SAMPLER) static inline int v3dv_ioctl(int fd, unsigned long request, void *arg) @@ -2132,19 +2636,39 @@ u64_compare(const void *key1, const void *key2) return memcmp(key1, key2, sizeof(uint64_t)) == 0; } -/* Helper to call hw ver speficic functions */ +/* Helper to call hw ver specific functions */ #define v3dv_X(device, thing) ({ \ __typeof(&v3d42_##thing) v3d_X_thing; \ switch (device->devinfo.ver) { \ case 42: \ v3d_X_thing = &v3d42_##thing; \ break; \ + case 71: \ + v3d_X_thing = &v3d71_##thing; \ + break; \ default: \ unreachable("Unsupported hardware generation"); \ } \ v3d_X_thing; \ }) +/* Helper to get hw-specific macro values */ +#define V3DV_X(device, thing) ({ \ + __typeof(V3D42_##thing) V3D_X_THING; \ + switch (device->devinfo.ver) { \ + case 42: \ + V3D_X_THING = V3D42_##thing; \ + break; \ + case 71: \ + V3D_X_THING = V3D71_##thing; \ + break; \ + default: \ + unreachable("Unsupported hardware generation"); \ + } \ + V3D_X_THING; \ +}) + + /* v3d_macros from common requires v3dX and V3DX definitions. 
Below we need to * define v3dX for each version supported, because when we compile code that @@ -2157,6 +2681,45 @@ u64_compare(const void *key1, const void *key2) # define v3dX(x) v3d42_##x # include "v3dvx_private.h" # undef v3dX + +# define v3dX(x) v3d71_##x +# include "v3dvx_private.h" +# undef v3dX #endif +VkResult +v3dv_update_image_layout(struct v3dv_device *device, + struct v3dv_image *image, + uint64_t modifier, + bool disjoint, + const VkImageDrmFormatModifierExplicitCreateInfoEXT *explicit_mod_info); + +float +v3dv_get_aa_line_width(struct v3dv_pipeline *pipeline, + struct v3dv_cmd_buffer *buffer); + + +void +v3dv_compute_ez_state(struct vk_dynamic_graphics_state *dyn, + struct v3dv_pipeline *pipeline, + enum v3dv_ez_state *ez_state, + bool *incompatible_ez_test); + +uint32_t v3dv_pipeline_primitive(VkPrimitiveTopology vk_prim); + +#if DETECT_OS_ANDROID +VkResult +v3dv_gralloc_to_drm_explicit_layout(struct u_gralloc *gralloc, + struct u_gralloc_buffer_handle *in_hnd, + VkImageDrmFormatModifierExplicitCreateInfoEXT *out, + VkSubresourceLayout *out_layouts, + int max_planes); + +VkResult +v3dv_import_native_buffer_fd(VkDevice device_h, + int dma_buf, + const VkAllocationCallbacks *alloc, + VkImage image_h); +#endif /* DETECT_OS_ANDROID */ + #endif /* V3DV_PRIVATE_H */ diff --git a/src/broadcom/vulkan/v3dv_query.c b/src/broadcom/vulkan/v3dv_query.c index 0deb430fc16..7231c694fff 100644 --- a/src/broadcom/vulkan/v3dv_query.c +++ b/src/broadcom/vulkan/v3dv_query.c @@ -1,5 +1,5 @@ /* - * Copyright © 2020 Raspberry Pi + * Copyright © 2020 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -23,6 +23,224 @@ #include "v3dv_private.h" +#include "util/timespec.h" +#include "compiler/nir/nir_builder.h" + +static void +kperfmon_create(struct v3dv_device *device, + struct v3dv_query_pool *pool, + uint32_t query) +{ + for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) { + assert(i * DRM_V3D_MAX_PERF_COUNTERS < pool->perfmon.ncounters); + + struct drm_v3d_perfmon_create req = { + .ncounters = MIN2(pool->perfmon.ncounters - + i * DRM_V3D_MAX_PERF_COUNTERS, + DRM_V3D_MAX_PERF_COUNTERS), + }; + memcpy(req.counters, + &pool->perfmon.counters[i * DRM_V3D_MAX_PERF_COUNTERS], + req.ncounters); + + int ret = v3dv_ioctl(device->pdevice->render_fd, + DRM_IOCTL_V3D_PERFMON_CREATE, + &req); + if (ret) + fprintf(stderr, "Failed to create perfmon for query %d: %s\n", query, strerror(ret)); + + pool->queries[query].perf.kperfmon_ids[i] = req.id; + } +} + +static void +kperfmon_destroy(struct v3dv_device *device, + struct v3dv_query_pool *pool, + uint32_t query) +{ + /* Skip destroying if never created */ + if (!pool->queries[query].perf.kperfmon_ids[0]) + return; + + for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) { + struct drm_v3d_perfmon_destroy req = { + .id = pool->queries[query].perf.kperfmon_ids[i] + }; + + int ret = v3dv_ioctl(device->pdevice->render_fd, + DRM_IOCTL_V3D_PERFMON_DESTROY, + &req); + + if (ret) { + fprintf(stderr, "Failed to destroy perfmon %u: %s\n", + req.id, strerror(ret)); + } + } +} + +/** + * Creates a VkBuffer (and VkDeviceMemory) to access a BO. 
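+ *
+ * For example, the occlusion query code below uses it to expose a query
+ * pool's counter BO to the meta compute pipelines:
+ *
+ *    create_vk_storage_buffer(device, pool->occlusion.bo,
+ *                             &pool->meta.buf, &pool->meta.mem);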
+ */ +static VkResult +create_vk_storage_buffer(struct v3dv_device *device, + struct v3dv_bo *bo, + VkBuffer *vk_buf, + VkDeviceMemory *vk_mem) +{ + VkDevice vk_device = v3dv_device_to_handle(device); + + VkBufferCreateInfo buf_info = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .size = bo->size, + .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + }; + VkResult result = v3dv_CreateBuffer(vk_device, &buf_info, NULL, vk_buf); + if (result != VK_SUCCESS) + return result; + + struct v3dv_device_memory *mem = + vk_object_zalloc(&device->vk, NULL, sizeof(*mem), + VK_OBJECT_TYPE_DEVICE_MEMORY); + if (!mem) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + mem->bo = bo; + mem->type = &device->pdevice->memory.memoryTypes[0]; + + *vk_mem = v3dv_device_memory_to_handle(mem); + VkBindBufferMemoryInfo bind_info = { + .sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO, + .buffer = *vk_buf, + .memory = *vk_mem, + .memoryOffset = 0, + }; + v3dv_BindBufferMemory2(vk_device, 1, &bind_info); + + return VK_SUCCESS; +} + +static void +destroy_vk_storage_buffer(struct v3dv_device *device, + VkBuffer *vk_buf, + VkDeviceMemory *vk_mem) +{ + if (*vk_mem) { + vk_object_free(&device->vk, NULL, v3dv_device_memory_from_handle(*vk_mem)); + *vk_mem = VK_NULL_HANDLE; + } + + v3dv_DestroyBuffer(v3dv_device_to_handle(device), *vk_buf, NULL); + *vk_buf = VK_NULL_HANDLE; +} + +/** + * Allocates descriptor sets to access query pool BO (availability and + * occlusion query results) from Vulkan pipelines. + */ +static VkResult +create_pool_descriptors(struct v3dv_device *device, + struct v3dv_query_pool *pool) +{ + assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION); + VkDevice vk_device = v3dv_device_to_handle(device); + + VkDescriptorPoolSize pool_size = { + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + }; + VkDescriptorPoolCreateInfo pool_info = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, + .flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, + .maxSets = 1, + .poolSizeCount = 1, + .pPoolSizes = &pool_size, + }; + VkResult result = + v3dv_CreateDescriptorPool(vk_device, &pool_info, NULL, + &pool->meta.descriptor_pool); + + if (result != VK_SUCCESS) + return result; + + VkDescriptorSetAllocateInfo alloc_info = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, + .descriptorPool = pool->meta.descriptor_pool, + .descriptorSetCount = 1, + .pSetLayouts = &device->queries.buf_descriptor_set_layout, + }; + result = v3dv_AllocateDescriptorSets(vk_device, &alloc_info, + &pool->meta.descriptor_set); + if (result != VK_SUCCESS) + return result; + + VkDescriptorBufferInfo desc_buf_info = { + .buffer = pool->meta.buf, + .offset = 0, + .range = VK_WHOLE_SIZE, + }; + + VkWriteDescriptorSet write = { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = pool->meta.descriptor_set, + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .pBufferInfo = &desc_buf_info, + }; + v3dv_UpdateDescriptorSets(vk_device, 1, &write, 0, NULL); + + return VK_SUCCESS; +} + +static void +destroy_pool_descriptors(struct v3dv_device *device, + struct v3dv_query_pool *pool) +{ + assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION); + + v3dv_FreeDescriptorSets(v3dv_device_to_handle(device), + pool->meta.descriptor_pool, + 1, &pool->meta.descriptor_set); + pool->meta.descriptor_set = VK_NULL_HANDLE; + + v3dv_DestroyDescriptorPool(v3dv_device_to_handle(device), + pool->meta.descriptor_pool, NULL); + pool->meta.descriptor_pool = 
VK_NULL_HANDLE; +} + +static VkResult +pool_create_meta_resources(struct v3dv_device *device, + struct v3dv_query_pool *pool) +{ + VkResult result; + + if (pool->query_type != VK_QUERY_TYPE_OCCLUSION) + return VK_SUCCESS; + + result = create_vk_storage_buffer(device, pool->occlusion.bo, + &pool->meta.buf, &pool->meta.mem); + if (result != VK_SUCCESS) + return result; + + result = create_pool_descriptors(device, pool); + if (result != VK_SUCCESS) + return result; + + return VK_SUCCESS; +} + +static void +pool_destroy_meta_resources(struct v3dv_device *device, + struct v3dv_query_pool *pool) +{ + if (pool->query_type != VK_QUERY_TYPE_OCCLUSION) + return; + + destroy_pool_descriptors(device, pool); + destroy_vk_storage_buffer(device, &pool->meta.buf, &pool->meta.mem); +} + VKAPI_ATTR VkResult VKAPI_CALL v3dv_CreateQueryPool(VkDevice _device, const VkQueryPoolCreateInfo *pCreateInfo, @@ -32,74 +250,149 @@ v3dv_CreateQueryPool(VkDevice _device, V3DV_FROM_HANDLE(v3dv_device, device, _device); assert(pCreateInfo->queryType == VK_QUERY_TYPE_OCCLUSION || - pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP); + pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP || + pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR); assert(pCreateInfo->queryCount > 0); struct v3dv_query_pool *pool = vk_object_zalloc(&device->vk, pAllocator, sizeof(*pool), VK_OBJECT_TYPE_QUERY_POOL); if (pool == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); pool->query_type = pCreateInfo->queryType; pool->query_count = pCreateInfo->queryCount; + uint32_t query_idx = 0; VkResult result; const uint32_t pool_bytes = sizeof(struct v3dv_query) * pool->query_count; pool->queries = vk_alloc2(&device->vk.alloc, pAllocator, pool_bytes, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (pool->queries == NULL) { - result = vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); goto fail; } - if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) { + switch (pool->query_type) { + case VK_QUERY_TYPE_OCCLUSION: { /* The hardware allows us to setup groups of 16 queries in consecutive * 4-byte addresses, requiring only that each group of 16 queries is * aligned to a 1024 byte boundary. 
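+       * That is, the counter for query i lives at byte offset
+       * (i / 16) * 1024 + (i % 16) * 4 within the BO; query 17, for
+       * example, is stored at 1 * 1024 + 1 * 4 = 1028.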
       */
      const uint32_t query_groups = DIV_ROUND_UP(pool->query_count, 16);
-      const uint32_t bo_size = query_groups * 1024;
-      pool->bo = v3dv_bo_alloc(device, bo_size, "query", true);
-      if (!pool->bo) {
-         result = vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+      uint32_t bo_size = query_groups * 1024;
+      /* After the counters we store availability data, 1 byte/query */
+      pool->occlusion.avail_offset = bo_size;
+      bo_size += pool->query_count;
+      pool->occlusion.bo = v3dv_bo_alloc(device, bo_size, "query:o", true);
+      if (!pool->occlusion.bo) {
+         result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
          goto fail;
       }
-      if (!v3dv_bo_map(device, pool->bo, bo_size)) {
-         result = vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+      if (!v3dv_bo_map(device, pool->occlusion.bo, bo_size)) {
+         result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
          goto fail;
       }
+      break;
    }
+   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
+      const VkQueryPoolPerformanceCreateInfoKHR *pq_info =
+         vk_find_struct_const(pCreateInfo->pNext,
+                              QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
+
+      assert(pq_info);
+
+      pool->perfmon.ncounters = pq_info->counterIndexCount;
+      for (uint32_t i = 0; i < pq_info->counterIndexCount; i++)
+         pool->perfmon.counters[i] = pq_info->pCounterIndices[i];
-   uint32_t i;
-   for (i = 0; i < pool->query_count; i++) {
-      pool->queries[i].maybe_available = false;
+      pool->perfmon.nperfmons = DIV_ROUND_UP(pool->perfmon.ncounters,
+                                             DRM_V3D_MAX_PERF_COUNTERS);
+
+      assert(pool->perfmon.nperfmons <= V3DV_MAX_PERFMONS);
+      break;
+   }
+   case VK_QUERY_TYPE_TIMESTAMP: {
+      /* 8 bytes per query used for the timestamp value. We have all
+       * timestamps tightly packed first in the buffer.
+       */
+      const uint32_t bo_size = pool->query_count * 8;
+      pool->timestamp.bo = v3dv_bo_alloc(device, bo_size, "query:t", true);
+      if (!pool->timestamp.bo) {
+         result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+         goto fail;
+      }
+      if (!v3dv_bo_map(device, pool->timestamp.bo, bo_size)) {
+         result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+         goto fail;
+      }
+      break;
+   }
+   default:
+      unreachable("Unsupported query type");
+   }
+
+   /* Initialize queries in the pool */
+   for (; query_idx < pool->query_count; query_idx++) {
+      pool->queries[query_idx].maybe_available = false;
       switch (pool->query_type) {
       case VK_QUERY_TYPE_OCCLUSION: {
-         const uint32_t query_group = i / 16;
-         const uint32_t query_offset = query_group * 1024 + (i % 16) * 4;
-         pool->queries[i].bo = pool->bo;
-         pool->queries[i].offset = query_offset;
+         const uint32_t query_group = query_idx / 16;
+         const uint32_t query_offset = query_group * 1024 + (query_idx % 16) * 4;
+         pool->queries[query_idx].occlusion.offset = query_offset;
          break;
       }
       case VK_QUERY_TYPE_TIMESTAMP:
-         pool->queries[i].value = 0;
+         pool->queries[query_idx].timestamp.offset = query_idx * 8;
+         result = vk_sync_create(&device->vk,
+                                 &device->pdevice->drm_syncobj_type, 0, 0,
+                                 &pool->queries[query_idx].timestamp.sync);
+         if (result != VK_SUCCESS)
+            goto fail;
+         break;
+      case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
+         result = vk_sync_create(&device->vk,
+                                 &device->pdevice->drm_syncobj_type, 0, 0,
+                                 &pool->queries[query_idx].perf.last_job_sync);
+         if (result != VK_SUCCESS)
+            goto fail;
+
+         kperfmon_create(device, pool, query_idx);
          break;
+      }
       default:
          unreachable("Unsupported query type");
       }
    }
 
+   /* Create meta resources */
+   result = pool_create_meta_resources(device, pool);
+   if (result != VK_SUCCESS)
+      goto fail;
+
    *pQueryPool = v3dv_query_pool_to_handle(pool);
 
    return VK_SUCCESS;
 
 fail:
-   if (pool->bo)
-
v3dv_bo_free(device, pool->bo); + if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) { + for (uint32_t j = 0; j < query_idx; j++) + vk_sync_destroy(&device->vk, pool->queries[j].timestamp.sync); + } + + if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { + for (uint32_t j = 0; j < query_idx; j++) + vk_sync_destroy(&device->vk, pool->queries[j].perf.last_job_sync); + } + + if (pool->occlusion.bo) + v3dv_bo_free(device, pool->occlusion.bo); + if (pool->timestamp.bo) + v3dv_bo_free(device, pool->timestamp.bo); if (pool->queries) vk_free2(&device->vk.alloc, pAllocator, pool->queries); + pool_destroy_meta_resources(device, pool); vk_object_free(&device->vk, pAllocator, pool); return result; @@ -116,17 +409,34 @@ v3dv_DestroyQueryPool(VkDevice _device, if (!pool) return; - if (pool->bo) - v3dv_bo_free(device, pool->bo); + if (pool->occlusion.bo) + v3dv_bo_free(device, pool->occlusion.bo); + + if (pool->timestamp.bo) + v3dv_bo_free(device, pool->timestamp.bo); + + if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) { + for (uint32_t i = 0; i < pool->query_count; i++) + vk_sync_destroy(&device->vk, pool->queries[i].timestamp.sync); + } + + if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { + for (uint32_t i = 0; i < pool->query_count; i++) { + kperfmon_destroy(device, pool, i); + vk_sync_destroy(&device->vk, pool->queries[i].perf.last_job_sync); + } + } if (pool->queries) vk_free2(&device->vk.alloc, pAllocator, pool->queries); + pool_destroy_meta_resources(device, pool); + vk_object_free(&device->vk, pAllocator, pool); } static void -write_query_result(void *dst, uint32_t idx, bool do_64bit, uint64_t value) +write_to_buffer(void *dst, uint32_t idx, bool do_64bit, uint64_t value) { if (do_64bit) { uint64_t *dst64 = (uint64_t *) dst; @@ -138,89 +448,255 @@ write_query_result(void *dst, uint32_t idx, bool do_64bit, uint64_t value) } static VkResult -get_occlusion_query_result(struct v3dv_device *device, - struct v3dv_query_pool *pool, - uint32_t query, - bool do_wait, - bool *available, - uint64_t *value) +query_wait_available(struct v3dv_device *device, + struct v3dv_query_pool *pool, + struct v3dv_query *q, + uint32_t query_idx) { - assert(pool && pool->query_type == VK_QUERY_TYPE_OCCLUSION); + /* For occlusion queries we prefer to poll the availability BO in a loop + * to waiting on the query results BO, because the latter would + * make us wait for any job running queries from the pool, even if those + * queries do not involve the one we want to wait on. + */ + if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) { + uint8_t *q_addr = ((uint8_t *) pool->occlusion.bo->map) + + pool->occlusion.avail_offset + query_idx; + while (*q_addr == 0) + usleep(250); + return VK_SUCCESS; + } - struct v3dv_query *q = &pool->queries[query]; - assert(q->bo && q->bo->map); + if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) { + if (vk_sync_wait(&device->vk, q->timestamp.sync, + 0, VK_SYNC_WAIT_COMPLETE, UINT64_MAX) != VK_SUCCESS) { + return vk_device_set_lost(&device->vk, "Query job wait failed"); + } + return VK_SUCCESS; + } - if (do_wait) { - /* From the Vulkan 1.0 spec: - * - * "If VK_QUERY_RESULT_WAIT_BIT is set, (...) If the query does not - * become available in a finite amount of time (e.g. due to not - * issuing a query since the last reset), a VK_ERROR_DEVICE_LOST - * error may occur." 
+ assert(pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR); + + /* For performance queries we need to wait for the queue to signal that + * the query has been submitted for execution before anything else. + */ + VkResult result = VK_SUCCESS; + if (!q->maybe_available) { + struct timespec timeout; + timespec_get(&timeout, TIME_UTC); + timespec_add_msec(&timeout, &timeout, 2000); + + mtx_lock(&device->query_mutex); + while (!q->maybe_available) { + if (vk_device_is_lost(&device->vk)) { + result = VK_ERROR_DEVICE_LOST; + break; + } + + int ret = cnd_timedwait(&device->query_ended, + &device->query_mutex, + &timeout); + if (ret != thrd_success) { + mtx_unlock(&device->query_mutex); + result = vk_device_set_lost(&device->vk, "Query wait failed"); + break; + } + } + mtx_unlock(&device->query_mutex); + + if (result != VK_SUCCESS) + return result; + + /* For performance queries, we also need to wait for the relevant syncobj + * to be signaled to ensure completion of the GPU work. */ - if (!q->maybe_available) - return vk_error(device->instance, VK_ERROR_DEVICE_LOST); + if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR && + vk_sync_wait(&device->vk, q->perf.last_job_sync, + 0, VK_SYNC_WAIT_COMPLETE, UINT64_MAX) != VK_SUCCESS) { + return vk_device_set_lost(&device->vk, "Query job wait failed"); + } + } + + return result; +} + +static VkResult +query_check_available(struct v3dv_device *device, + struct v3dv_query_pool *pool, + struct v3dv_query *q, + uint32_t query_idx) +{ + /* For occlusion we check the availability BO */ + if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) { + const uint8_t *q_addr = ((uint8_t *) pool->occlusion.bo->map) + + pool->occlusion.avail_offset + query_idx; + return (*q_addr != 0) ? VK_SUCCESS : VK_NOT_READY; + } - if (!v3dv_bo_wait(device, q->bo, 0xffffffffffffffffull)) - return vk_error(device->instance, VK_ERROR_DEVICE_LOST); + /* For timestamp queries, we need to check if the relevant job + * has completed. + */ + if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) { + if (vk_sync_wait(&device->vk, q->timestamp.sync, + 0, VK_SYNC_WAIT_COMPLETE, 0) != VK_SUCCESS) { + return VK_NOT_READY; + } + return VK_SUCCESS; + } + + /* For other queries we need to check if the queue has submitted the query + * for execution at all. + */ + assert(pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR); + if (!q->maybe_available) + return VK_NOT_READY; + + /* For performance queries, we also need to check if the relevant GPU job + * has completed. 
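+    * Note the vk_sync_wait call below uses a timeout of 0, so this is a
+    * non-blocking check: if the job has not finished yet the wait returns
+    * immediately and we report VK_NOT_READY.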
+ */ + if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR && + vk_sync_wait(&device->vk, q->perf.last_job_sync, + 0, VK_SYNC_WAIT_COMPLETE, 0) != VK_SUCCESS) { + return VK_NOT_READY; + } + + return VK_SUCCESS; +} + +static VkResult +query_is_available(struct v3dv_device *device, + struct v3dv_query_pool *pool, + uint32_t query, + bool do_wait, + bool *available) +{ + struct v3dv_query *q = &pool->queries[query]; + + if (do_wait) { + VkResult result = query_wait_available(device, pool, q, query); + if (result != VK_SUCCESS) { + *available = false; + return result; + } *available = true; } else { - *available = q->maybe_available && v3dv_bo_wait(device, q->bo, 0); + VkResult result = query_check_available(device, pool, q, query); + assert(result == VK_SUCCESS || result == VK_NOT_READY); + *available = (result == VK_SUCCESS); } - const uint8_t *query_addr = ((uint8_t *) q->bo->map) + q->offset; - *value = (uint64_t) *((uint32_t *)query_addr); return VK_SUCCESS; } static VkResult -get_timestamp_query_result(struct v3dv_device *device, - struct v3dv_query_pool *pool, - uint32_t query, - bool do_wait, - bool *available, - uint64_t *value) +write_occlusion_query_result(struct v3dv_device *device, + struct v3dv_query_pool *pool, + uint32_t query, + bool do_64bit, + void *data, + uint32_t slot) +{ + assert(pool && pool->query_type == VK_QUERY_TYPE_OCCLUSION); + + if (vk_device_is_lost(&device->vk)) + return VK_ERROR_DEVICE_LOST; + + struct v3dv_query *q = &pool->queries[query]; + assert(pool->occlusion.bo && pool->occlusion.bo->map); + + const uint8_t *query_addr = + ((uint8_t *) pool->occlusion.bo->map) + q->occlusion.offset; + write_to_buffer(data, slot, do_64bit, (uint64_t) *((uint32_t *)query_addr)); + return VK_SUCCESS; +} + +static VkResult +write_timestamp_query_result(struct v3dv_device *device, + struct v3dv_query_pool *pool, + uint32_t query, + bool do_64bit, + void *data, + uint32_t slot) { assert(pool && pool->query_type == VK_QUERY_TYPE_TIMESTAMP); struct v3dv_query *q = &pool->queries[query]; - if (do_wait) { - /* From the Vulkan 1.0 spec: - * - * "If VK_QUERY_RESULT_WAIT_BIT is set, (...) If the query does not - * become available in a finite amount of time (e.g. due to not - * issuing a query since the last reset), a VK_ERROR_DEVICE_LOST - * error may occur." 
- */ - if (!q->maybe_available) - return vk_error(device->instance, VK_ERROR_DEVICE_LOST); + const uint8_t *query_addr = + ((uint8_t *) pool->timestamp.bo->map) + q->timestamp.offset; - *available = true; - } else { - *available = q->maybe_available; + write_to_buffer(data, slot, do_64bit, *((uint64_t *)query_addr)); + return VK_SUCCESS; +} + +static VkResult +write_performance_query_result(struct v3dv_device *device, + struct v3dv_query_pool *pool, + uint32_t query, + bool do_64bit, + void *data, + uint32_t slot) +{ + assert(pool && pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR); + + struct v3dv_query *q = &pool->queries[query]; + uint64_t counter_values[V3D_MAX_PERFCNT]; + + for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) { + struct drm_v3d_perfmon_get_values req = { + .id = q->perf.kperfmon_ids[i], + .values_ptr = (uintptr_t)(&counter_values[i * + DRM_V3D_MAX_PERF_COUNTERS]) + }; + + int ret = v3dv_ioctl(device->pdevice->render_fd, + DRM_IOCTL_V3D_PERFMON_GET_VALUES, + &req); + + if (ret) { + fprintf(stderr, "failed to get perfmon values: %s\n", strerror(ret)); + return vk_error(device, VK_ERROR_DEVICE_LOST); + } } - *value = q->value; + for (uint32_t i = 0; i < pool->perfmon.ncounters; i++) + write_to_buffer(data, slot + i, do_64bit, counter_values[i]); + return VK_SUCCESS; } static VkResult -get_query_result(struct v3dv_device *device, - struct v3dv_query_pool *pool, - uint32_t query, - bool do_wait, - bool *available, - uint64_t *value) +write_query_result(struct v3dv_device *device, + struct v3dv_query_pool *pool, + uint32_t query, + bool do_64bit, + void *data, + uint32_t slot) +{ + switch (pool->query_type) { + case VK_QUERY_TYPE_OCCLUSION: + return write_occlusion_query_result(device, pool, query, do_64bit, + data, slot); + case VK_QUERY_TYPE_TIMESTAMP: + return write_timestamp_query_result(device, pool, query, do_64bit, + data, slot); + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: + return write_performance_query_result(device, pool, query, do_64bit, + data, slot); + default: + unreachable("Unsupported query type"); + } +} + +static uint32_t +get_query_result_count(struct v3dv_query_pool *pool) { switch (pool->query_type) { case VK_QUERY_TYPE_OCCLUSION: - return get_occlusion_query_result(device, pool, query, do_wait, - available, value); case VK_QUERY_TYPE_TIMESTAMP: - return get_timestamp_query_result(device, pool, query, do_wait, - available, value); + return 1; + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: + return pool->perfmon.ncounters; default: unreachable("Unsupported query type"); } @@ -239,16 +715,18 @@ v3dv_get_query_pool_results_cpu(struct v3dv_device *device, assert(first + count <= pool->query_count); assert(data); - const bool do_64bit = flags & VK_QUERY_RESULT_64_BIT; + const bool do_64bit = flags & VK_QUERY_RESULT_64_BIT || + pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR; const bool do_wait = flags & VK_QUERY_RESULT_WAIT_BIT; const bool do_partial = flags & VK_QUERY_RESULT_PARTIAL_BIT; + uint32_t result_count = get_query_result_count(pool); + VkResult result = VK_SUCCESS; for (uint32_t i = first; i < first + count; i++) { bool available = false; - uint64_t value = 0; VkResult query_result = - get_query_result(device, pool, i, do_wait, &available, &value); + query_is_available(device, pool, i, do_wait, &available); if (query_result == VK_ERROR_DEVICE_LOST) result = VK_ERROR_DEVICE_LOST; @@ -266,11 +744,11 @@ v3dv_get_query_pool_results_cpu(struct v3dv_device *device, const bool write_result = available || do_partial; if (write_result) - 
write_query_result(data, slot, do_64bit, value); - slot++; + write_query_result(device, pool, i, do_64bit, data, slot); + slot += result_count; if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) - write_query_result(data, slot++, do_64bit, available ? 1u : 0u); + write_to_buffer(data, slot++, do_64bit, available ? 1u : 0u); if (!write_result && result != VK_ERROR_DEVICE_LOST) result = VK_NOT_READY; @@ -298,6 +776,170 @@ v3dv_GetQueryPoolResults(VkDevice _device, pData, stride, flags); } +/* Emits a series of vkCmdDispatchBase calls to execute all the workgroups + * required to handle a number of queries considering per-dispatch limits. + */ +static void +cmd_buffer_emit_dispatch_queries(struct v3dv_cmd_buffer *cmd_buffer, + uint32_t query_count) +{ + VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer); + + uint32_t dispatched = 0; + const uint32_t max_batch_size = 65535; + while (dispatched < query_count) { + uint32_t batch_size = MIN2(query_count - dispatched, max_batch_size); + v3dv_CmdDispatchBase(vk_cmd_buffer, dispatched, 0, 0, batch_size, 1, 1); + dispatched += batch_size; + } +} + +void +v3dv_cmd_buffer_emit_set_query_availability(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_query_pool *pool, + uint32_t query, uint32_t count, + uint8_t availability) +{ + assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION || + pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR); + + struct v3dv_device *device = cmd_buffer->device; + VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer); + + /* We are about to emit a compute job to set query availability and we need + * to ensure this executes after the graphics work using the queries has + * completed. + */ + VkMemoryBarrier2 barrier = { + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2, + .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + }; + VkDependencyInfo barrier_info = { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .memoryBarrierCount = 1, + .pMemoryBarriers = &barrier, + }; + v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info); + + /* Dispatch queries */ + v3dv_cmd_buffer_meta_state_push(cmd_buffer, true); + + v3dv_CmdBindPipeline(vk_cmd_buffer, + VK_PIPELINE_BIND_POINT_COMPUTE, + device->queries.avail_pipeline); + + v3dv_CmdBindDescriptorSets(vk_cmd_buffer, + VK_PIPELINE_BIND_POINT_COMPUTE, + device->queries.avail_pipeline_layout, + 0, 1, &pool->meta.descriptor_set, + 0, NULL); + + struct { + uint32_t offset; + uint32_t query; + uint8_t availability; + } push_data = { pool->occlusion.avail_offset, query, availability }; + v3dv_CmdPushConstants(vk_cmd_buffer, + device->queries.avail_pipeline_layout, + VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(push_data), &push_data); + cmd_buffer_emit_dispatch_queries(cmd_buffer, count); + + v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false); +} + +static void +cmd_buffer_emit_reset_occlusion_query_pool(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_query_pool *pool, + uint32_t query, uint32_t count) +{ + struct v3dv_device *device = cmd_buffer->device; + VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer); + + /* Ensure the GPU is done with the queries in the graphics queue before + * we reset in the compute queue. 
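+    * Occlusion counters are produced by fragment processing, which is why
+    * the barrier below uses the color attachment output stage as its source
+    * scope before allowing the compute reset to run.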
+ */ + VkMemoryBarrier2 barrier = { + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2, + .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + }; + VkDependencyInfo barrier_info = { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .memoryBarrierCount = 1, + .pMemoryBarriers = &barrier, + }; + v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info); + + /* Emit compute reset */ + v3dv_cmd_buffer_meta_state_push(cmd_buffer, true); + + v3dv_CmdBindPipeline(vk_cmd_buffer, + VK_PIPELINE_BIND_POINT_COMPUTE, + device->queries.reset_occlusion_pipeline); + + v3dv_CmdBindDescriptorSets(vk_cmd_buffer, + VK_PIPELINE_BIND_POINT_COMPUTE, + device->queries.reset_occlusion_pipeline_layout, + 0, 1, &pool->meta.descriptor_set, + 0, NULL); + struct { + uint32_t offset; + uint32_t query; + } push_data = { pool->occlusion.avail_offset, query }; + v3dv_CmdPushConstants(vk_cmd_buffer, + device->queries.reset_occlusion_pipeline_layout, + VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(push_data), &push_data); + + cmd_buffer_emit_dispatch_queries(cmd_buffer, count); + + v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false); + + /* Ensure future work in the graphics queue using the queries doesn't start + * before the reset completed. + */ + barrier = (VkMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2, + .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .dstStageMask = VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT, + }; + barrier_info = (VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .memoryBarrierCount = 1, + .pMemoryBarriers = &barrier, + }; + v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info); +} + +static void +cmd_buffer_emit_reset_query_pool(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_query_pool *pool, + uint32_t first, uint32_t count) +{ + assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION); + cmd_buffer_emit_reset_occlusion_query_pool(cmd_buffer, pool, first, count); +} + +static void +cmd_buffer_emit_reset_query_pool_cpu(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_query_pool *pool, + uint32_t first, uint32_t count) +{ + assert(pool->query_type != VK_QUERY_TYPE_OCCLUSION); + + struct v3dv_job *job = + v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, + V3DV_JOB_TYPE_CPU_RESET_QUERIES, + cmd_buffer, -1); + v3dv_return_if_oom(cmd_buffer, NULL); + job->cpu.query_reset.pool = pool; + job->cpu.query_reset.first = first; + job->cpu.query_reset.count = count; + list_addtail(&job->list_link, &cmd_buffer->jobs); +} + VKAPI_ATTR void VKAPI_CALL v3dv_CmdResetQueryPool(VkCommandBuffer commandBuffer, VkQueryPool queryPool, @@ -307,7 +949,261 @@ v3dv_CmdResetQueryPool(VkCommandBuffer commandBuffer, V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool); - v3dv_cmd_buffer_reset_queries(cmd_buffer, pool, firstQuery, queryCount); + /* Resets can only happen outside a render pass instance so we should not + * be in the middle of job recording. + */ + assert(cmd_buffer->state.pass == NULL); + assert(cmd_buffer->state.job == NULL); + + assert(firstQuery < pool->query_count); + assert(firstQuery + queryCount <= pool->query_count); + + /* We can reset occlusion queries in the GPU, but for other query types + * we emit a CPU job that will call v3dv_reset_query_pool_cpu when executed + * in the queue. 
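+    * Timestamp and performance queries keep CPU-side state (a vk_sync per
+    * query, plus kernel perfmons for performance queries) that a compute
+    * shader cannot reset, so those query types take the CPU job path.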
+   */
+   if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
+      cmd_buffer_emit_reset_query_pool(cmd_buffer, pool, firstQuery, queryCount);
+   } else {
+      cmd_buffer_emit_reset_query_pool_cpu(cmd_buffer, pool,
+                                           firstQuery, queryCount);
+   }
+}
+
+/**
+ * Creates a descriptor pool so we can create descriptors for the destination
+ * buffers of vkCmdCopyQueryPoolResults for queries where the copy is
+ * implemented in the GPU.
+ */
+static VkResult
+create_storage_buffer_descriptor_pool(struct v3dv_cmd_buffer *cmd_buffer)
+{
+   /* If this is not the first pool we create for this command buffer,
+    * size it based on the size of the currently exhausted pool.
+    */
+   uint32_t descriptor_count = 32;
+   if (cmd_buffer->meta.query.dspool != VK_NULL_HANDLE) {
+      struct v3dv_descriptor_pool *exhausted_pool =
+         v3dv_descriptor_pool_from_handle(cmd_buffer->meta.query.dspool);
+      descriptor_count = MIN2(exhausted_pool->max_entry_count * 2, 1024);
+   }
+
+   /* Create the descriptor pool */
+   cmd_buffer->meta.query.dspool = VK_NULL_HANDLE;
+   VkDescriptorPoolSize pool_size = {
+      .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+      .descriptorCount = descriptor_count,
+   };
+   VkDescriptorPoolCreateInfo info = {
+      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
+      .maxSets = descriptor_count,
+      .poolSizeCount = 1,
+      .pPoolSizes = &pool_size,
+      .flags = 0,
+   };
+   VkResult result =
+      v3dv_CreateDescriptorPool(v3dv_device_to_handle(cmd_buffer->device),
+                                &info,
+                                &cmd_buffer->device->vk.alloc,
+                                &cmd_buffer->meta.query.dspool);
+
+   if (result == VK_SUCCESS) {
+      assert(cmd_buffer->meta.query.dspool != VK_NULL_HANDLE);
+      const VkDescriptorPool vk_pool = cmd_buffer->meta.query.dspool;
+
+      v3dv_cmd_buffer_add_private_obj(
+         cmd_buffer, (uintptr_t) vk_pool,
+         (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyDescriptorPool);
+
+      struct v3dv_descriptor_pool *pool =
+         v3dv_descriptor_pool_from_handle(vk_pool);
+      pool->is_driver_internal = true;
+   }
+
+   return result;
+}
+
+static VkResult
+allocate_storage_buffer_descriptor_set(struct v3dv_cmd_buffer *cmd_buffer,
+                                       VkDescriptorSet *set)
+{
+   /* Make sure we have a descriptor pool */
+   VkResult result;
+   if (cmd_buffer->meta.query.dspool == VK_NULL_HANDLE) {
+      result = create_storage_buffer_descriptor_pool(cmd_buffer);
+      if (result != VK_SUCCESS)
+         return result;
+   }
+   assert(cmd_buffer->meta.query.dspool != VK_NULL_HANDLE);
+
+   /* Allocate descriptor set */
+   struct v3dv_device *device = cmd_buffer->device;
+   VkDevice vk_device = v3dv_device_to_handle(device);
+   VkDescriptorSetAllocateInfo info = {
+      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
+      .descriptorPool = cmd_buffer->meta.query.dspool,
+      .descriptorSetCount = 1,
+      .pSetLayouts = &device->queries.buf_descriptor_set_layout,
+   };
+   result = v3dv_AllocateDescriptorSets(vk_device, &info, set);
+
+   /* If we ran out of pool space, grow the pool and try again */
+   if (result == VK_ERROR_OUT_OF_POOL_MEMORY) {
+      result = create_storage_buffer_descriptor_pool(cmd_buffer);
+      if (result == VK_SUCCESS) {
+         info.descriptorPool = cmd_buffer->meta.query.dspool;
+         result = v3dv_AllocateDescriptorSets(vk_device, &info, set);
+      }
+   }
+
+   return result;
+}
+
+static uint32_t
+copy_pipeline_index_from_flags(VkQueryResultFlags flags)
+{
+   uint32_t index = 0;
+   if (flags & VK_QUERY_RESULT_64_BIT)
+      index |= 1;
+   if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
+      index |= 2;
+   if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
+      index |= 4;
+   assert(index < 8);
+   return index;
+}
+
+static nir_shader *
+get_copy_query_results_cs(VkQueryResultFlags flags); + +static void +cmd_buffer_emit_copy_query_pool_results(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_query_pool *pool, + uint32_t first, uint32_t count, + struct v3dv_buffer *buf, + uint32_t offset, uint32_t stride, + VkQueryResultFlags flags) +{ + struct v3dv_device *device = cmd_buffer->device; + VkDevice vk_device = v3dv_device_to_handle(device); + VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer); + + /* Create the required copy pipeline if not yet created */ + uint32_t pipeline_idx = copy_pipeline_index_from_flags(flags); + if (!device->queries.copy_pipeline[pipeline_idx]) { + nir_shader *copy_query_results_cs_nir = get_copy_query_results_cs(flags); + VkResult result = + v3dv_create_compute_pipeline_from_nir( + device, copy_query_results_cs_nir, + device->queries.copy_pipeline_layout, + &device->queries.copy_pipeline[pipeline_idx]); + ralloc_free(copy_query_results_cs_nir); + if (result != VK_SUCCESS) { + fprintf(stderr, "Failed to create copy query results pipeline\n"); + return; + } + } + + /* FIXME: do we need this barrier? Since vkCmdEndQuery should've been called + * and that already waits maybe we don't (since this is serialized + * in the compute queue with EndQuery anyway). + */ + if (flags & VK_QUERY_RESULT_WAIT_BIT) { + VkMemoryBarrier2 barrier = { + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2, + .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + }; + VkDependencyInfo barrier_info = { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .memoryBarrierCount = 1, + .pMemoryBarriers = &barrier, + }; + v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info); + } + + /* Allocate and setup descriptor set for output buffer */ + VkDescriptorSet out_buf_descriptor_set; + VkResult result = + allocate_storage_buffer_descriptor_set(cmd_buffer, + &out_buf_descriptor_set); + if (result != VK_SUCCESS) { + fprintf(stderr, "vkCmdCopyQueryPoolResults failed: " + "could not allocate descriptor.\n"); + return; + } + + VkDescriptorBufferInfo desc_buf_info = { + .buffer = v3dv_buffer_to_handle(buf), + .offset = 0, + .range = VK_WHOLE_SIZE, + }; + VkWriteDescriptorSet write = { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = out_buf_descriptor_set, + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .pBufferInfo = &desc_buf_info, + }; + v3dv_UpdateDescriptorSets(vk_device, 1, &write, 0, NULL); + + /* Dispatch copy */ + v3dv_cmd_buffer_meta_state_push(cmd_buffer, true); + + assert(device->queries.copy_pipeline[pipeline_idx]); + v3dv_CmdBindPipeline(vk_cmd_buffer, + VK_PIPELINE_BIND_POINT_COMPUTE, + device->queries.copy_pipeline[pipeline_idx]); + + VkDescriptorSet sets[2] = { + pool->meta.descriptor_set, + out_buf_descriptor_set, + }; + v3dv_CmdBindDescriptorSets(vk_cmd_buffer, + VK_PIPELINE_BIND_POINT_COMPUTE, + device->queries.copy_pipeline_layout, + 0, 2, sets, 0, NULL); + + struct { + uint32_t avail_offset, first, offset, stride, flags; + } push_data = { pool->occlusion.avail_offset, first, offset, stride, flags }; + v3dv_CmdPushConstants(vk_cmd_buffer, + device->queries.copy_pipeline_layout, + VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(push_data), &push_data); + + cmd_buffer_emit_dispatch_queries(cmd_buffer, count); + + v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false); +} + +static void +cmd_buffer_emit_copy_query_pool_results_cpu(struct v3dv_cmd_buffer 
*cmd_buffer, + struct v3dv_query_pool *pool, + uint32_t first, + uint32_t count, + struct v3dv_buffer *dst, + uint32_t offset, + uint32_t stride, + VkQueryResultFlags flags) +{ + struct v3dv_job *job = + v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, + V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS, + cmd_buffer, -1); + v3dv_return_if_oom(cmd_buffer, NULL); + + job->cpu.query_copy_results.pool = pool; + job->cpu.query_copy_results.first = first; + job->cpu.query_copy_results.count = count; + job->cpu.query_copy_results.dst = dst; + job->cpu.query_copy_results.offset = offset; + job->cpu.query_copy_results.stride = stride; + job->cpu.query_copy_results.flags = flags; + + list_addtail(&job->list_link, &cmd_buffer->jobs); } VKAPI_ATTR void VKAPI_CALL @@ -324,9 +1220,30 @@ v3dv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool); V3DV_FROM_HANDLE(v3dv_buffer, dst, dstBuffer); - v3dv_cmd_buffer_copy_query_results(cmd_buffer, pool, - firstQuery, queryCount, - dst, dstOffset, stride, flags); + /* Copies can only happen outside a render pass instance so we should not + * be in the middle of job recording. + */ + assert(cmd_buffer->state.pass == NULL); + assert(cmd_buffer->state.job == NULL); + + assert(firstQuery < pool->query_count); + assert(firstQuery + queryCount <= pool->query_count); + + /* For occlusion queries we implement the copy in the GPU but for other + * queries we emit a CPU job that will call v3dv_get_query_pool_results_cpu + * when executed in the queue. + */ + if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) { + cmd_buffer_emit_copy_query_pool_results(cmd_buffer, pool, + firstQuery, queryCount, + dst, (uint32_t) dstOffset, + (uint32_t) stride, flags); + } else { + cmd_buffer_emit_copy_query_pool_results_cpu(cmd_buffer, pool, + firstQuery, queryCount, + dst, (uint32_t)dstOffset, + (uint32_t) stride, flags); + } } VKAPI_ATTR void VKAPI_CALL @@ -351,3 +1268,537 @@ v3dv_CmdEndQuery(VkCommandBuffer commandBuffer, v3dv_cmd_buffer_end_query(cmd_buffer, pool, query); } + +void +v3dv_reset_query_pool_cpu(struct v3dv_device *device, + struct v3dv_query_pool *pool, + uint32_t first, + uint32_t count) +{ + mtx_lock(&device->query_mutex); + + if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) { + assert(first + count <= pool->query_count); + + /* Reset timestamp */ + uint8_t *base_addr; + base_addr = ((uint8_t *) pool->timestamp.bo->map) + + pool->queries[first].timestamp.offset; + memset(base_addr, 0, 8 * count); + + for (uint32_t i = first; i < first + count; i++) { + if (vk_sync_reset(&device->vk, pool->queries[i].timestamp.sync) != VK_SUCCESS) + fprintf(stderr, "Failed to reset sync"); + } + + mtx_unlock(&device->query_mutex); + return; + } + + for (uint32_t i = first; i < first + count; i++) { + assert(i < pool->query_count); + struct v3dv_query *q = &pool->queries[i]; + q->maybe_available = false; + switch (pool->query_type) { + case VK_QUERY_TYPE_OCCLUSION: { + /* Reset availability */ + uint8_t *base_addr = ((uint8_t *) pool->occlusion.bo->map) + + pool->occlusion.avail_offset + first; + memset(base_addr, 0, count); + + /* Reset occlusion counter */ + const uint8_t *q_addr = + ((uint8_t *) pool->occlusion.bo->map) + q->occlusion.offset; + uint32_t *counter = (uint32_t *) q_addr; + *counter = 0; + break; + } + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: + kperfmon_destroy(device, pool, i); + kperfmon_create(device, pool, i); + if (vk_sync_reset(&device->vk, q->perf.last_job_sync) != VK_SUCCESS) + fprintf(stderr, "Failed to reset sync"); + 
break; + default: + unreachable("Unsupported query type"); + } + } + + mtx_unlock(&device->query_mutex); +} + +VKAPI_ATTR void VKAPI_CALL +v3dv_ResetQueryPool(VkDevice _device, + VkQueryPool queryPool, + uint32_t firstQuery, + uint32_t queryCount) +{ + V3DV_FROM_HANDLE(v3dv_device, device, _device); + V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool); + + v3dv_reset_query_pool_cpu(device, pool, firstQuery, queryCount); +} + +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR( + VkPhysicalDevice physicalDevice, + uint32_t queueFamilyIndex, + uint32_t *pCounterCount, + VkPerformanceCounterKHR *pCounters, + VkPerformanceCounterDescriptionKHR *pCounterDescriptions) +{ + V3DV_FROM_HANDLE(v3dv_physical_device, pDevice, physicalDevice); + + return v3dv_X(pDevice, enumerate_performance_query_counters)(pCounterCount, + pCounters, + pCounterDescriptions); +} + +VKAPI_ATTR void VKAPI_CALL +v3dv_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR( + VkPhysicalDevice physicalDevice, + const VkQueryPoolPerformanceCreateInfoKHR *pPerformanceQueryCreateInfo, + uint32_t *pNumPasses) +{ + *pNumPasses = DIV_ROUND_UP(pPerformanceQueryCreateInfo->counterIndexCount, + DRM_V3D_MAX_PERF_COUNTERS); +} + +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_AcquireProfilingLockKHR( + VkDevice _device, + const VkAcquireProfilingLockInfoKHR *pInfo) +{ + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +v3dv_ReleaseProfilingLockKHR(VkDevice device) +{ +} + +static inline void +nir_set_query_availability(nir_builder *b, + nir_def *buf, + nir_def *offset, + nir_def *query_idx, + nir_def *avail) +{ + offset = nir_iadd(b, offset, query_idx); /* we use 1B per query */ + nir_store_ssbo(b, avail, buf, offset, .write_mask = 0x1, .align_mul = 1); +} + +static inline nir_def * +nir_get_query_availability(nir_builder *b, + nir_def *buf, + nir_def *offset, + nir_def *query_idx) +{ + offset = nir_iadd(b, offset, query_idx); /* we use 1B per query */ + nir_def *avail = nir_load_ssbo(b, 1, 8, buf, offset, .align_mul = 1); + return nir_i2i32(b, avail); +} + +static nir_shader * +get_set_query_availability_cs() +{ + const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options(); + nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, + "set query availability cs"); + + nir_def *buf = + nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0), + .desc_set = 0, + .binding = 0, + .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); + + /* This assumes a local size of 1 and a horizontal-only dispatch. If we + * ever change any of these parameters we need to update how we compute the + * query index here. 
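+    * For example, cmd_buffer_emit_dispatch_queries() launches one workgroup
+    * per query, so the workgroup with id X handles query
+    * (base query push constant) + X.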
+ */ + nir_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b), 0); + + nir_def *offset = + nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4); + + nir_def *query_idx = + nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 4, .range = 4); + + nir_def *avail = + nir_load_push_constant(&b, 1, 8, nir_imm_int(&b, 0), .base = 8, .range = 1); + + query_idx = nir_iadd(&b, query_idx, wg_id); + nir_set_query_availability(&b, buf, offset, query_idx, avail); + + return b.shader; +} + +static inline nir_def * +nir_get_occlusion_counter_offset(nir_builder *b, nir_def *query_idx) +{ + nir_def *query_group = nir_udiv_imm(b, query_idx, 16); + nir_def *query_group_offset = nir_umod_imm(b, query_idx, 16); + nir_def *offset = + nir_iadd(b, nir_imul_imm(b, query_group, 1024), + nir_imul_imm(b, query_group_offset, 4)); + return offset; +} + +static inline void +nir_reset_occlusion_counter(nir_builder *b, + nir_def *buf, + nir_def *query_idx) +{ + nir_def *offset = nir_get_occlusion_counter_offset(b, query_idx); + nir_def *zero = nir_imm_int(b, 0); + nir_store_ssbo(b, zero, buf, offset, .write_mask = 0x1, .align_mul = 4); +} + +static inline nir_def * +nir_read_occlusion_counter(nir_builder *b, + nir_def *buf, + nir_def *query_idx) +{ + nir_def *offset = nir_get_occlusion_counter_offset(b, query_idx); + return nir_load_ssbo(b, 1, 32, buf, offset, .access = 0, .align_mul = 4); +} + +static nir_shader * +get_reset_occlusion_query_cs() +{ + const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options(); + nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, + "reset occlusion query cs"); + + nir_def *buf = + nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0), + .desc_set = 0, + .binding = 0, + .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); + + /* This assumes a local size of 1 and a horizontal-only dispatch. If we + * ever change any of these parameters we need to update how we compute the + * query index here. + */ + nir_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b), 0); + + nir_def *avail_offset = + nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4); + + nir_def *base_query_idx = + nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 4, .range = 4); + + nir_def *query_idx = nir_iadd(&b, base_query_idx, wg_id); + + nir_set_query_availability(&b, buf, avail_offset, query_idx, + nir_imm_intN_t(&b, 0, 8)); + nir_reset_occlusion_counter(&b, buf, query_idx); + + return b.shader; +} + +static void +write_query_buffer(nir_builder *b, + nir_def *buf, + nir_def **offset, + nir_def *value, + bool flag_64bit) +{ + if (flag_64bit) { + /* Create a 64-bit value using a vec2 with the .Y component set to 0 + * so we can write a 64-bit value in a single store. 
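+       * The high 32 bits are always zero here: both the occlusion counter
+       * and the availability value fit in 32 bits.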
+       */
+      nir_def *value64 = nir_vec2(b, value, nir_imm_int(b, 0));
+      nir_store_ssbo(b, value64, buf, *offset, .write_mask = 0x3, .align_mul = 8);
+      *offset = nir_iadd_imm(b, *offset, 8);
+   } else {
+      nir_store_ssbo(b, value, buf, *offset, .write_mask = 0x1, .align_mul = 4);
+      *offset = nir_iadd_imm(b, *offset, 4);
+   }
+}
+
+static nir_shader *
+get_copy_query_results_cs(VkQueryResultFlags flags)
+{
+   bool flag_64bit = flags & VK_QUERY_RESULT_64_BIT;
+   bool flag_avail = flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT;
+   bool flag_partial = flags & VK_QUERY_RESULT_PARTIAL_BIT;
+
+   const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
+   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options,
+                                                  "copy query results cs");
+
+   nir_def *buf =
+      nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0),
+                                .desc_set = 0,
+                                .binding = 0,
+                                .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
+
+   nir_def *buf_out =
+      nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0),
+                                .desc_set = 1,
+                                .binding = 0,
+                                .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
+
+   /* Read push constants */
+   nir_def *avail_offset =
+      nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4);
+
+   nir_def *base_query_idx =
+      nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 4, .range = 4);
+
+   nir_def *base_offset_out =
+      nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 8, .range = 4);
+
+   nir_def *stride =
+      nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 12, .range = 4);
+
+   /* This assumes a local size of 1 and a horizontal-only dispatch. If we
+    * ever change any of these parameters we need to update how we compute the
+    * query index here.
+    */
+   nir_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b), 0);
+   nir_def *query_idx = nir_iadd(&b, base_query_idx, wg_id);
+
+   /* Read query availability if needed */
+   nir_def *avail = NULL;
+   if (flag_avail || !flag_partial)
+      avail = nir_get_query_availability(&b, buf, avail_offset, query_idx);
+
+   /* Write occlusion query result... */
+   nir_def *offset =
+      nir_iadd(&b, base_offset_out, nir_imul(&b, wg_id, stride));
+
+   /* ...if partial is requested, we always write */
+   if (flag_partial) {
+      nir_def *query_res = nir_read_occlusion_counter(&b, buf, query_idx);
+      write_query_buffer(&b, buf_out, &offset, query_res, flag_64bit);
+   } else {
+      /* ...otherwise, we only write if the query is available */
+      nir_if *if_stmt = nir_push_if(&b, nir_ine_imm(&b, avail, 0));
+      nir_def *query_res = nir_read_occlusion_counter(&b, buf, query_idx);
+      write_query_buffer(&b, buf_out, &offset, query_res, flag_64bit);
+      nir_pop_if(&b, if_stmt);
+   }
+
+   /* Write query availability */
+   if (flag_avail)
+      write_query_buffer(&b, buf_out, &offset, avail, flag_64bit);
+
+   return b.shader;
+}
+
+static bool
+create_query_pipelines(struct v3dv_device *device)
+{
+   VkResult result;
+   VkPipeline pipeline;
+
+   /* Set layout: single storage buffer */
+   if (!device->queries.buf_descriptor_set_layout) {
+      VkDescriptorSetLayoutBinding descriptor_set_layout_binding = {
+         .binding = 0,
+         .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+         .descriptorCount = 1,
+         .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+      };
+      VkDescriptorSetLayoutCreateInfo descriptor_set_layout_info = {
+         .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
+         .bindingCount = 1,
+         .pBindings = &descriptor_set_layout_binding,
+      };
+      result =
+         v3dv_CreateDescriptorSetLayout(v3dv_device_to_handle(device),
+                                        &descriptor_set_layout_info,
+                                        &device->vk.alloc,
+                                        &device->queries.buf_descriptor_set_layout);
+      if (result != VK_SUCCESS)
+         return false;
+   }
+
+   /* Set availability pipeline.
+    *
+    * Pipeline layout:
+    * - 1 storage buffer for the BO with the query availability.
+    * - Push constants:
+    *      0B: offset of the availability info in the buffer (4 bytes)
+    *      4B: base query index (4 bytes)
+    *      8B: availability (1 byte)
+    */
+   if (!device->queries.avail_pipeline_layout) {
+      VkPipelineLayoutCreateInfo pipeline_layout_info = {
+         .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+         .setLayoutCount = 1,
+         .pSetLayouts = &device->queries.buf_descriptor_set_layout,
+         .pushConstantRangeCount = 1,
+         .pPushConstantRanges =
+            &(VkPushConstantRange) { VK_SHADER_STAGE_COMPUTE_BIT, 0, 9 },
+      };
+
+      result =
+         v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
+                                   &pipeline_layout_info,
+                                   &device->vk.alloc,
+                                   &device->queries.avail_pipeline_layout);
+
+      if (result != VK_SUCCESS)
+         return false;
+   }
+
+   if (!device->queries.avail_pipeline) {
+      nir_shader *set_query_availability_cs_nir = get_set_query_availability_cs();
+      result = v3dv_create_compute_pipeline_from_nir(device,
+                                                     set_query_availability_cs_nir,
+                                                     device->queries.avail_pipeline_layout,
+                                                     &pipeline);
+      ralloc_free(set_query_availability_cs_nir);
+      if (result != VK_SUCCESS)
+         return false;
+
+      device->queries.avail_pipeline = pipeline;
+   }
+
+   /* Reset occlusion query pipeline.
+    *
+    * Pipeline layout:
+    * - 1 storage buffer for the BO with the occlusion and availability data.
+ * - Push constants: + * 0B: offset of the availability info in the buffer (4B) + * 4B: base query index (4B) + */ + if (!device->queries.reset_occlusion_pipeline_layout) { + VkPipelineLayoutCreateInfo pipeline_layout_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .setLayoutCount = 1, + .pSetLayouts = &device->queries.buf_descriptor_set_layout, + .pushConstantRangeCount = 1, + .pPushConstantRanges = + &(VkPushConstantRange) { VK_SHADER_STAGE_COMPUTE_BIT, 0, 8 }, + }; + + result = + v3dv_CreatePipelineLayout(v3dv_device_to_handle(device), + &pipeline_layout_info, + &device->vk.alloc, + &device->queries.reset_occlusion_pipeline_layout); + + if (result != VK_SUCCESS) + return false; + } + + if (!device->queries.reset_occlusion_pipeline) { + nir_shader *reset_occlusion_query_cs_nir = get_reset_occlusion_query_cs(); + result = v3dv_create_compute_pipeline_from_nir( + device, + reset_occlusion_query_cs_nir, + device->queries.reset_occlusion_pipeline_layout, + &pipeline); + ralloc_free(reset_occlusion_query_cs_nir); + if (result != VK_SUCCESS) + return false; + + device->queries.reset_occlusion_pipeline = pipeline; + } + + /* Copy query results pipelines. + * + * Pipeline layout: + * - 1 storage buffer for the BO with the query availability and occlusion. + * - 1 storage buffer for the output. + * - Push constants: + * 0B: offset of the availability info in the buffer (4B) + * 4B: base query index (4B) + * 8B: offset into output buffer (4B) + * 12B: stride (4B) + * + * We create multiple specialized pipelines depending on the copy flags + * to remove conditionals from the copy shader and get more optimized + * pipelines. + */ + if (!device->queries.copy_pipeline_layout) { + VkDescriptorSetLayout set_layouts[2] = { + device->queries.buf_descriptor_set_layout, + device->queries.buf_descriptor_set_layout + }; + VkPipelineLayoutCreateInfo pipeline_layout_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .setLayoutCount = 2, + .pSetLayouts = set_layouts, + .pushConstantRangeCount = 1, + .pPushConstantRanges = + &(VkPushConstantRange) { VK_SHADER_STAGE_COMPUTE_BIT, 0, 16 }, + }; + + result = + v3dv_CreatePipelineLayout(v3dv_device_to_handle(device), + &pipeline_layout_info, + &device->vk.alloc, + &device->queries.copy_pipeline_layout); + + if (result != VK_SUCCESS) + return false; + } + + /* Actual copy pipelines are created lazily on demand since there can be up + * to 8 depending on the flags used, however it is likely that applications + * will use the same flags every time and only one pipeline is required. 
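+    * The index into the copy_pipeline array packs the three flags into
+    * bits 0-2 (see copy_pipeline_index_from_flags): 64_BIT into bit 0,
+    * WITH_AVAILABILITY into bit 1 and PARTIAL into bit 2.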
+ */ + + return true; +} + +static void +destroy_query_pipelines(struct v3dv_device *device) +{ + VkDevice _device = v3dv_device_to_handle(device); + + /* Availability pipeline */ + v3dv_DestroyPipeline(_device, device->queries.avail_pipeline, + &device->vk.alloc); + device->queries.avail_pipeline = VK_NULL_HANDLE; + v3dv_DestroyPipelineLayout(_device, device->queries.avail_pipeline_layout, + &device->vk.alloc); + device->queries.avail_pipeline_layout = VK_NULL_HANDLE; + + /* Reset occlusion pipeline */ + v3dv_DestroyPipeline(_device, device->queries.reset_occlusion_pipeline, + &device->vk.alloc); + device->queries.reset_occlusion_pipeline = VK_NULL_HANDLE; + v3dv_DestroyPipelineLayout(_device, + device->queries.reset_occlusion_pipeline_layout, + &device->vk.alloc); + device->queries.reset_occlusion_pipeline_layout = VK_NULL_HANDLE; + + /* Copy pipelines */ + for (int i = 0; i < 8; i++) { + v3dv_DestroyPipeline(_device, device->queries.copy_pipeline[i], + &device->vk.alloc); + device->queries.copy_pipeline[i] = VK_NULL_HANDLE; + } + v3dv_DestroyPipelineLayout(_device, device->queries.copy_pipeline_layout, + &device->vk.alloc); + device->queries.copy_pipeline_layout = VK_NULL_HANDLE; + + v3dv_DestroyDescriptorSetLayout(_device, + device->queries.buf_descriptor_set_layout, + &device->vk.alloc); + device->queries.buf_descriptor_set_layout = VK_NULL_HANDLE; +} + +/** + * Allocates device resources for implementing certain types of queries. + */ +VkResult +v3dv_query_allocate_resources(struct v3dv_device *device) +{ + if (!create_query_pipelines(device)) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + return VK_SUCCESS; +} + +void +v3dv_query_free_resources(struct v3dv_device *device) +{ + destroy_query_pipelines(device); +} diff --git a/src/broadcom/vulkan/v3dv_queue.c b/src/broadcom/vulkan/v3dv_queue.c index 05343b0a24c..ac981984c4f 100644 --- a/src/broadcom/vulkan/v3dv_queue.c +++ b/src/broadcom/vulkan/v3dv_queue.c @@ -1,5 +1,5 @@ /* - * Copyright © 2019 Raspberry Pi + * Copyright © 2019 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -25,6 +25,9 @@ #include "drm-uapi/v3d_drm.h" #include "broadcom/clif/clif_dump.h" +#include "util/libsync.h" +#include "util/os_time.h" +#include "vk_drm_syncobj.h" #include <errno.h> #include <time.h> @@ -34,16 +37,16 @@ v3dv_clif_dump(struct v3dv_device *device, struct v3dv_job *job, struct drm_v3d_submit_cl *submit) { - if (!(V3D_DEBUG & (V3D_DEBUG_CL | - V3D_DEBUG_CL_NO_BIN | - V3D_DEBUG_CLIF))) + if (!(V3D_DBG(CL) || + V3D_DBG(CL_NO_BIN) || + V3D_DBG(CLIF))) return; struct clif_dump *clif = clif_dump_init(&device->devinfo, stderr, - V3D_DEBUG & (V3D_DEBUG_CL | - V3D_DEBUG_CL_NO_BIN), - V3D_DEBUG & V3D_DEBUG_CL_NO_BIN); + V3D_DBG(CL) || + V3D_DBG(CL_NO_BIN), + V3D_DBG(CL_NO_BIN)); set_foreach(job->bos, entry) { struct v3dv_bo *bo = (void *)entry->key; @@ -67,131 +70,415 @@ v3dv_clif_dump(struct v3dv_device *device, clif_dump_destroy(clif); } -static uint64_t -gettime_ns() +static VkResult +queue_wait_idle(struct v3dv_queue *queue, + struct v3dv_submit_sync_info *sync_info) { - struct timespec current; - clock_gettime(CLOCK_MONOTONIC, ¤t); - return (uint64_t)current.tv_sec * NSEC_PER_SEC + current.tv_nsec; -} + int ret = drmSyncobjWait(queue->device->pdevice->render_fd, + queue->last_job_syncs.syncs, 4, + INT64_MAX, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, + NULL); + if (ret) + return vk_errorf(queue, VK_ERROR_DEVICE_LOST, "syncobj 
wait failed: %m"); -static uint64_t -get_absolute_timeout(uint64_t timeout) -{ - uint64_t current_time = gettime_ns(); - uint64_t max_timeout = (uint64_t) INT64_MAX - current_time; + bool first = true; + for (int i = 0; i < 4; i++) { + if (!queue->last_job_syncs.first[i]) + first = false; + } - timeout = MIN2(max_timeout, timeout); + /* If we're not the first job, that means we're waiting on some + * per-queue-type syncobj which transitively waited on the semaphores + * so we can skip the semaphore wait. + */ + if (first) { + VkResult result = vk_sync_wait_many(&queue->device->vk, + sync_info->wait_count, + sync_info->waits, + VK_SYNC_WAIT_COMPLETE, + UINT64_MAX); + if (result != VK_SUCCESS) + return result; + } - return (current_time + timeout); -} + for (int i = 0; i < 4; i++) + queue->last_job_syncs.first[i] = false; -static VkResult -queue_submit_job(struct v3dv_queue *queue, - struct v3dv_job *job, - bool do_sem_wait, - pthread_t *wait_thread); + return VK_SUCCESS; +} -/* Waits for active CPU wait threads spawned before the current thread to - * complete and submit all their GPU jobs. - */ static void -cpu_queue_wait_idle(struct v3dv_queue *queue) +multisync_free(struct v3dv_device *device, + struct drm_v3d_multi_sync *ms) { - const pthread_t this_thread = pthread_self(); - -retry: - mtx_lock(&queue->mutex); - list_for_each_entry(struct v3dv_queue_submit_wait_info, info, - &queue->submit_wait_list, list_link) { - for (uint32_t i = 0; i < info->wait_thread_count; i++) { - if (info->wait_threads[i].finished) - continue; - - /* Because we are testing this against the list of spawned threads - * it will never match for the main thread, so when we call this from - * the main thread we are effectively waiting for all active threads - * to complete, and otherwise we are only waiting for work submitted - * before the wait thread that called this (a wait thread should never - * be waiting for work submitted after it). - */ - if (info->wait_threads[i].thread == this_thread) - goto done; + vk_free(&device->vk.alloc, (void *)(uintptr_t)ms->out_syncs); + vk_free(&device->vk.alloc, (void *)(uintptr_t)ms->in_syncs); +} - /* Wait and try again */ - mtx_unlock(&queue->mutex); - usleep(500); /* 0.5 ms */ - goto retry; - } +static struct drm_v3d_sem * +set_in_syncs(struct v3dv_queue *queue, + struct v3dv_job *job, + enum v3dv_queue_type queue_sync, + uint32_t *count, + struct vk_sync_wait *waits, + unsigned wait_count, + struct v3dv_submit_sync_info *sync_info) +{ + struct v3dv_device *device = queue->device; + uint32_t n_syncs = 0; + + /* If this is the first job submitted to a given GPU queue in this cmd buf + * batch, it has to wait on wait semaphores (if any) before running. + */ + if (queue->last_job_syncs.first[queue_sync]) + n_syncs = sync_info->wait_count; + + /* If the serialize flag is set the job needs to be serialized in the + * corresponding queues. Notice that we may implement transfer operations + * as both CL or TFU jobs. + * + * FIXME: maybe we could track more precisely if the source of a transfer + * barrier is a CL and/or a TFU job. 
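+    * Until then we are conservative: notice that with the flags below a
+    * transfer barrier makes the job wait on both the last CL and the last
+    * TFU job.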
+ */ + bool sync_csd = job->serialize & V3DV_BARRIER_COMPUTE_BIT; + bool sync_tfu = job->serialize & V3DV_BARRIER_TRANSFER_BIT; + bool sync_cl = job->serialize & (V3DV_BARRIER_GRAPHICS_BIT | + V3DV_BARRIER_TRANSFER_BIT); + bool sync_cpu = job->serialize & V3DV_BARRIER_CPU_BIT; + + *count = n_syncs; + if (sync_cl) + (*count)++; + if (sync_tfu) + (*count)++; + if (sync_csd) + (*count)++; + if (sync_cpu) + (*count)++; + + *count += wait_count; + + if (!*count) + return NULL; + + struct drm_v3d_sem *syncs = + vk_zalloc(&device->vk.alloc, *count * sizeof(struct drm_v3d_sem), + 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + + if (!syncs) + return NULL; + + for (int i = 0; i < n_syncs; i++) { + syncs[i].handle = + vk_sync_as_drm_syncobj(sync_info->waits[i].sync)->syncobj; } -done: - mtx_unlock(&queue->mutex); + for (int i = 0; i < wait_count; i++) { + syncs[n_syncs++].handle = + vk_sync_as_drm_syncobj(waits[i].sync)->syncobj; + } + + if (sync_cl) + syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CL]; + + if (sync_csd) + syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CSD]; + + if (sync_tfu) + syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_TFU]; + + if (sync_cpu) + syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CPU]; + + assert(n_syncs == *count); + return syncs; } -static VkResult -gpu_queue_wait_idle(struct v3dv_queue *queue) +static struct drm_v3d_sem * +set_out_syncs(struct v3dv_queue *queue, + struct v3dv_job *job, + enum v3dv_queue_type queue_sync, + uint32_t *count, + struct v3dv_submit_sync_info *sync_info, + bool signal_syncs) { struct v3dv_device *device = queue->device; - mtx_lock(&device->mutex); - uint32_t last_job_sync = device->last_job_sync; - mtx_unlock(&device->mutex); + uint32_t n_vk_syncs = signal_syncs ? sync_info->signal_count : 0; - int ret = drmSyncobjWait(device->pdevice->render_fd, - &last_job_sync, 1, INT64_MAX, 0, NULL); - if (ret) - return VK_ERROR_DEVICE_LOST; + /* We always signal the syncobj from `device->last_job_syncs` related to + * this v3dv_queue_type to track the last job submitted to this queue. + */ + (*count) = n_vk_syncs + 1; - return VK_SUCCESS; + struct drm_v3d_sem *syncs = + vk_zalloc(&device->vk.alloc, *count * sizeof(struct drm_v3d_sem), + 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + + if (!syncs) + return NULL; + + if (n_vk_syncs) { + for (unsigned i = 0; i < n_vk_syncs; i++) { + syncs[i].handle = + vk_sync_as_drm_syncobj(sync_info->signals[i].sync)->syncobj; + } + } + + syncs[n_vk_syncs].handle = queue->last_job_syncs.syncs[queue_sync]; + + return syncs; } -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_QueueWaitIdle(VkQueue _queue) +static void +set_ext(struct drm_v3d_extension *ext, + struct drm_v3d_extension *next, + uint32_t id, + uintptr_t flags) { - V3DV_FROM_HANDLE(v3dv_queue, queue, _queue); + ext->next = (uintptr_t)(void *)next; + ext->id = id; + ext->flags = flags; +} - /* Check that we don't have any wait threads running in the CPU first, - * as these can spawn new GPU jobs. - */ - cpu_queue_wait_idle(queue); +/* This function sets the extension for multiple in/out syncobjs. When it is + * successful, it sets the extension id to DRM_V3D_EXT_ID_MULTI_SYNC. + * Otherwise, the extension id is 0, which means an out-of-memory error. 
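+ *
+ * Callers are expected to check ms->base.id after this returns and to map
+ * a zero id to VK_ERROR_OUT_OF_HOST_MEMORY before using the submission.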
+ */ +static void +set_multisync(struct drm_v3d_multi_sync *ms, + struct v3dv_submit_sync_info *sync_info, + struct vk_sync_wait *waits, + unsigned wait_count, + struct drm_v3d_extension *next, + struct v3dv_device *device, + struct v3dv_job *job, + enum v3dv_queue_type in_queue_sync, + enum v3dv_queue_type out_queue_sync, + enum v3d_queue wait_stage, + bool signal_syncs) +{ + struct v3dv_queue *queue = &device->queue; + uint32_t out_sync_count = 0, in_sync_count = 0; + struct drm_v3d_sem *out_syncs = NULL, *in_syncs = NULL; - /* Check we don't have any GPU jobs running */ - return gpu_queue_wait_idle(queue); + in_syncs = set_in_syncs(queue, job, in_queue_sync, + &in_sync_count, waits, wait_count, sync_info); + if (!in_syncs && in_sync_count) + goto fail; + + out_syncs = set_out_syncs(queue, job, out_queue_sync, + &out_sync_count, sync_info, signal_syncs); + + assert(out_sync_count > 0); + + if (!out_syncs) + goto fail; + + set_ext(&ms->base, next, DRM_V3D_EXT_ID_MULTI_SYNC, 0); + ms->wait_stage = wait_stage; + ms->out_sync_count = out_sync_count; + ms->out_syncs = (uintptr_t)(void *)out_syncs; + ms->in_sync_count = in_sync_count; + ms->in_syncs = (uintptr_t)(void *)in_syncs; + + return; + +fail: + if (in_syncs) + vk_free(&device->vk.alloc, in_syncs); + assert(!out_syncs); + + return; } static VkResult -handle_reset_query_cpu_job(struct v3dv_job *job) +handle_reset_query_cpu_job(struct v3dv_queue *queue, + struct v3dv_job *job, + struct v3dv_submit_sync_info *sync_info, + bool signal_syncs) { + struct v3dv_device *device = queue->device; struct v3dv_reset_query_cpu_job_info *info = &job->cpu.query_reset; assert(info->pool); - /* We are about to reset query counters so we need to make sure that - * The GPU is not using them. The exception is timestamp queries, since - * we handle those in the CPU. - * - * FIXME: we could avoid blocking the main thread for this if we use - * submission thread. 
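+ /* Occlusion queries are not reset here: those are handled on the GPU
+ * through the dedicated reset_occlusion_pipeline, so this CPU path only
+ * sees timestamp and performance queries.
+ */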
+ assert(info->pool->query_type != VK_QUERY_TYPE_OCCLUSION);
+
+ if (device->pdevice->caps.cpu_queue) {
+ assert(info->first + info->count <= info->pool->query_count);
+
+ struct drm_v3d_submit_cpu submit = {0};
+ struct drm_v3d_multi_sync ms = {0};
+
+ uint32_t *syncs = (uint32_t *) malloc(sizeof(uint32_t) * info->count);
+ uintptr_t *kperfmon_ids = NULL;
+
+ if (info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
+ submit.bo_handle_count = 1;
+ submit.bo_handles = (uintptr_t)(void *)&info->pool->timestamp.bo->handle;
+
+ struct drm_v3d_reset_timestamp_query reset = {0};
+
+ set_ext(&reset.base, NULL, DRM_V3D_EXT_ID_CPU_RESET_TIMESTAMP_QUERY, 0);
+
+ reset.count = info->count;
+ reset.offset = info->pool->queries[info->first].timestamp.offset;
+
+ for (uint32_t i = 0; i < info->count; i++) {
+ struct v3dv_query *query = &info->pool->queries[info->first + i];
+ syncs[i] = vk_sync_as_drm_syncobj(query->timestamp.sync)->syncobj;
+ }
+
+ reset.syncs = (uintptr_t)(void *)syncs;
+
+ set_multisync(&ms, sync_info, NULL, 0, (void *)&reset, device, job,
+ V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
+ if (!ms.base.id)
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ } else {
+ assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
+ struct drm_v3d_reset_performance_query reset = {0};
+
+ set_ext(&reset.base, NULL, DRM_V3D_EXT_ID_CPU_RESET_PERFORMANCE_QUERY, 0);
+
+ struct vk_sync_wait waits[info->count];
+ unsigned wait_count = 0;
+ for (int i = 0; i < info->count; i++) {
+ struct v3dv_query *query = &info->pool->queries[info->first + i];
+ /* Only wait for a query if we've used it; otherwise we will be
+ * waiting forever for the fence to become signaled.
+ */
+ if (query->maybe_available) {
+ waits[wait_count] = (struct vk_sync_wait){
+ .sync = query->perf.last_job_sync
+ };
+ wait_count++;
+ }
+ }
+
+ reset.count = info->count;
+ reset.nperfmons = info->pool->perfmon.nperfmons;
+
+ kperfmon_ids = (uintptr_t *) malloc(sizeof(uintptr_t) * info->count);
+
+ for (uint32_t i = 0; i < info->count; i++) {
+ struct v3dv_query *query = &info->pool->queries[info->first + i];
+
+ syncs[i] = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
+ kperfmon_ids[i] = (uintptr_t)(void *)query->perf.kperfmon_ids;
+ }
+
+ reset.syncs = (uintptr_t)(void *)syncs;
+ reset.kperfmon_ids = (uintptr_t)(void *)kperfmon_ids;
+
+ set_multisync(&ms, sync_info, waits, wait_count, (void *)&reset, device, job,
+ V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
+ if (!ms.base.id)
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ }
+
+ submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
+ submit.extensions = (uintptr_t)(void *)&ms;
+
+ /* From the Vulkan spec for vkCmdResetQueryPool:
+ *
+ * "This command defines an execution dependency between other query commands
+ * that reference the same query.
+ * ...
+ * The second synchronization scope includes all commands which reference the
+ * queries in queryPool indicated by firstQuery and queryCount that occur later
+ * in submission order."
+ *
+ * This means we should ensure that any timestamps after a reset don't execute
+ * before the reset. However, for timestamp queries in particular we don't have
+ * to do anything special because timestamp queries have to wait for all previously
+ * submitted work to complete before executing (which we accomplish by using
+ * V3DV_BARRIER_ALL on them) and that includes reset jobs submitted to the CPU queue.
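+ *
+ * Performance query resets, on the other hand, wait explicitly on each
+ * query's last_job_sync through the wait array built above.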
+ */
+ int ret = v3dv_ioctl(device->pdevice->render_fd,
+ DRM_IOCTL_V3D_SUBMIT_CPU, &submit);
+
+ free(syncs);
+ free(kperfmon_ids);
+ multisync_free(device, &ms);
+
+ queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;
+
+ if (ret)
+ return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");
+
+ return VK_SUCCESS;
+ }
+
+ /* We are about to reset query counters in user-space so we need to make
+ * sure that the GPU is not using them.
 */
- if (info->pool->query_type == VK_QUERY_TYPE_OCCLUSION)
- v3dv_bo_wait(job->device, info->pool->bo, PIPE_TIMEOUT_INFINITE);
-
- for (uint32_t i = info->first; i < info->first + info->count; i++) {
- assert(i < info->pool->query_count);
- struct v3dv_query *q = &info->pool->queries[i];
- q->maybe_available = false;
- switch (info->pool->query_type) {
- case VK_QUERY_TYPE_OCCLUSION: {
- const uint8_t *q_addr = ((uint8_t *) q->bo->map) + q->offset;
- uint32_t *counter = (uint32_t *) q_addr;
- *counter = 0;
- break;
+ if (info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
+ VkResult result = queue_wait_idle(queue, sync_info);
+ if (result != VK_SUCCESS)
+ return result;
+
+ v3dv_bo_wait(job->device, info->pool->timestamp.bo, OS_TIMEOUT_INFINITE);
+ }
+
+ if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+ struct vk_sync_wait waits[info->count];
+ unsigned wait_count = 0;
+ for (int i = 0; i < info->count; i++) {
+ struct v3dv_query *query = &info->pool->queries[info->first + i];
+ /* Only wait for a query if we've used it; otherwise we will be
+ * waiting forever for the fence to become signaled.
+ */
+ if (query->maybe_available) {
+ waits[wait_count] = (struct vk_sync_wait){
+ .sync = query->perf.last_job_sync
+ };
+ wait_count++;
+ }
 }
- case VK_QUERY_TYPE_TIMESTAMP:
- q->value = 0;
- break;
- default:
- unreachable("Unsupported query type");
+
+ VkResult result = vk_sync_wait_many(&job->device->vk, wait_count, waits,
+ VK_SYNC_WAIT_COMPLETE, UINT64_MAX);
+
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
+ v3dv_reset_query_pool_cpu(job->device, info->pool, info->first, info->count);
+
+ return VK_SUCCESS;
+}
+
+static VkResult
+export_perfmon_last_job_sync(struct v3dv_queue *queue, struct v3dv_job *job, int *fd)
+{
+ int err;
+ static const enum v3dv_queue_type queues_to_sync[] = {
+ V3DV_QUEUE_CL,
+ V3DV_QUEUE_CSD,
+ };
+
+ for (uint32_t i = 0; i < ARRAY_SIZE(queues_to_sync); i++) {
+ enum v3dv_queue_type queue_type = queues_to_sync[i];
+ int tmp_fd = -1;
+
+ err = drmSyncobjExportSyncFile(job->device->pdevice->render_fd,
+ queue->last_job_syncs.syncs[queue_type],
+ &tmp_fd);
+
+ if (err) {
+ close(*fd);
+ return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
+ "sync file export failed: %m");
+ }
+
+ err = sync_accumulate("v3dv", fd, tmp_fd);
+
+ if (err) {
+ close(tmp_fd);
+ close(*fd);
+ return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
+ "failed to accumulate sync files: %m");
 }
 }

@@ -199,36 +486,200 @@ handle_reset_query_cpu_job(struct v3dv_job *job)
 }

 static VkResult
-handle_end_query_cpu_job(struct v3dv_job *job)
+handle_end_query_cpu_job(struct v3dv_job *job, uint32_t counter_pass_idx)
 {
- struct v3dv_end_query_cpu_job_info *info = &job->cpu.query_end;
+ VkResult result = VK_SUCCESS;
+
+ mtx_lock(&job->device->query_mutex);
+
+ struct v3dv_end_query_info *info = &job->cpu.query_end;
+ struct v3dv_queue *queue = &job->device->queue;
+
+ int err = 0;
+ int fd = -1;
+
+ assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
+
+ if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
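+ /* Accumulate the last CL and CSD job syncs into a single sync file so
+ * it can be imported into each query's last_job_sync below.
+ */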
+ result = export_perfmon_last_job_sync(queue, job, &fd); + + if (result != VK_SUCCESS) + goto fail; + + assert(fd >= 0); + } + for (uint32_t i = 0; i < info->count; i++) { assert(info->query + i < info->pool->query_count); struct v3dv_query *query = &info->pool->queries[info->query + i]; + + if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { + uint32_t syncobj = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj; + err = drmSyncobjImportSyncFile(job->device->pdevice->render_fd, + syncobj, fd); + + if (err) { + result = vk_errorf(queue, VK_ERROR_UNKNOWN, + "sync file import failed: %m"); + goto fail; + } + } + query->maybe_available = true; } - return VK_SUCCESS; +fail: + if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) + close(fd); + + cnd_broadcast(&job->device->query_ended); + mtx_unlock(&job->device->query_mutex); + + return result; } static VkResult -handle_copy_query_results_cpu_job(struct v3dv_job *job) +handle_copy_query_results_cpu_job(struct v3dv_queue *queue, + struct v3dv_job *job, + struct v3dv_submit_sync_info *sync_info, + bool signal_syncs) { + struct v3dv_device *device = queue->device; struct v3dv_copy_query_results_cpu_job_info *info = &job->cpu.query_copy_results; + assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR || + info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP); + assert(info->dst && info->dst->mem && info->dst->mem->bo); struct v3dv_bo *bo = info->dst->mem->bo; + if (device->pdevice->caps.cpu_queue) { + struct drm_v3d_submit_cpu submit = {0}; + struct drm_v3d_multi_sync ms = {0}; + + uint32_t *offsets = (uint32_t *) malloc(sizeof(uint32_t) * info->count); + uint32_t *syncs = (uint32_t *) malloc(sizeof(uint32_t) * info->count); + uint32_t *bo_handles = NULL; + uintptr_t *kperfmon_ids = NULL; + + if (info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP) { + submit.bo_handle_count = 2; + + bo_handles = (uint32_t *) + malloc(sizeof(uint32_t) * submit.bo_handle_count); + + bo_handles[0] = bo->handle; + bo_handles[1] = info->pool->timestamp.bo->handle; + submit.bo_handles = (uintptr_t)(void *)bo_handles; + + struct drm_v3d_copy_timestamp_query copy = {0}; + + set_ext(©.base, NULL, DRM_V3D_EXT_ID_CPU_COPY_TIMESTAMP_QUERY, 0); + + copy.do_64bit = info->flags & VK_QUERY_RESULT_64_BIT; + copy.do_partial = info->flags & VK_QUERY_RESULT_PARTIAL_BIT; + copy.availability_bit = info->flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT; + copy.offset = info->offset + info->dst->mem_offset; + copy.stride = info->stride; + copy.count = info->count; + + for (uint32_t i = 0; i < info->count; i++) { + assert(info->first < info->pool->query_count); + assert(info->first + info->count <= info->pool->query_count); + struct v3dv_query *query = &info->pool->queries[info->first + i]; + + offsets[i] = query->timestamp.offset; + syncs[i] = vk_sync_as_drm_syncobj(query->timestamp.sync)->syncobj; + } + + copy.offsets = (uintptr_t)(void *)offsets; + copy.syncs = (uintptr_t)(void *)syncs; + + set_multisync(&ms, sync_info, NULL, 0, (void *)©, device, job, + V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs); + if (!ms.base.id) + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + } else { + assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR); + + submit.bo_handle_count = 1; + submit.bo_handles = (uintptr_t)(void *)&bo->handle; + + struct drm_v3d_copy_performance_query copy = {0}; + + set_ext(©.base, NULL, DRM_V3D_EXT_ID_CPU_COPY_PERFORMANCE_QUERY, 0); + + /* If the queryPool was created with 
VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR, + * results for each query are written as an array of the type indicated + * by VkPerformanceCounterKHR::storage for the counter being queried. + * For v3dv, VkPerformanceCounterKHR::storage is + * VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR. + */ + copy.do_64bit = true; + copy.do_partial = info->flags & VK_QUERY_RESULT_PARTIAL_BIT; + copy.availability_bit = info->flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT; + copy.offset = info->offset + info->dst->mem_offset; + copy.stride = info->stride; + copy.count = info->count; + copy.nperfmons = info->pool->perfmon.nperfmons; + copy.ncounters = info->pool->perfmon.ncounters; + + kperfmon_ids = (uintptr_t *) malloc(sizeof(uintptr_t) * info->count); + + struct vk_sync_wait waits[info->count]; + unsigned wait_count = 0; + + for (uint32_t i = 0; i < info->count; i++) { + assert(info->first < info->pool->query_count); + assert(info->first + info->count <= info->pool->query_count); + struct v3dv_query *query = &info->pool->queries[info->first + i]; + + syncs[i] = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj; + kperfmon_ids[i] = (uintptr_t)(void *)query->perf.kperfmon_ids; + + if (info->flags & VK_QUERY_RESULT_WAIT_BIT) { + waits[wait_count] = (struct vk_sync_wait){ + .sync = query->perf.last_job_sync + }; + wait_count++; + } + } + + copy.syncs = (uintptr_t)(void *)syncs; + copy.kperfmon_ids = (uintptr_t)(void *)kperfmon_ids; + + set_multisync(&ms, sync_info, waits, wait_count, (void *)©, device, job, + V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs); + if (!ms.base.id) + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + submit.flags |= DRM_V3D_SUBMIT_EXTENSION; + submit.extensions = (uintptr_t)(void *)&ms; + + int ret = v3dv_ioctl(device->pdevice->render_fd, + DRM_IOCTL_V3D_SUBMIT_CPU, &submit); + + free(kperfmon_ids); + free(bo_handles); + free(offsets); + free(syncs); + multisync_free(device, &ms); + + queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false; + + if (ret) + return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m"); + + return VK_SUCCESS; + } + /* Map the entire dst buffer for the CPU copy if needed */ assert(!bo->map || bo->map_size == bo->size); if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size)) - return vk_error(job->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY); - /* FIXME: if flags includes VK_QUERY_RESULT_WAIT_BIT this could trigger a - * sync wait on the CPU for the corresponding GPU jobs to finish. We might - * want to use a submission thread to avoid blocking on the main thread. - */ uint8_t *offset = ((uint8_t *) bo->map) + info->offset + info->dst->mem_offset; v3dv_get_query_pool_results_cpu(job->device, @@ -243,344 +694,213 @@ handle_copy_query_results_cpu_job(struct v3dv_job *job) } static VkResult -handle_set_event_cpu_job(struct v3dv_job *job, bool is_wait_thread) +handle_timestamp_query_cpu_job(struct v3dv_queue *queue, + struct v3dv_job *job, + struct v3dv_submit_sync_info *sync_info, + bool signal_syncs) { - /* From the Vulkan 1.0 spec: - * - * "When vkCmdSetEvent is submitted to a queue, it defines an execution - * dependency on commands that were submitted before it, and defines an - * event signal operation which sets the event to the signaled state. - * The first synchronization scope includes every command previously - * submitted to the same queue, including those in the same command - * buffer and batch". 
- * - * So we should wait for all prior work to be completed before signaling - * the event, this includes all active CPU wait threads spawned for any - * command buffer submitted *before* this. - * - * FIXME: we could avoid blocking the main thread for this if we use a - * submission thread. - */ + struct v3dv_device *device = queue->device; - /* If we are calling this from a wait thread it will only wait - * wait threads sspawned before it, otherwise it will wait for - * all active threads to complete. - */ - cpu_queue_wait_idle(&job->device->queue); + assert(job->type == V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY); + struct v3dv_timestamp_query_cpu_job_info *info = &job->cpu.query_timestamp; - VkResult result = gpu_queue_wait_idle(&job->device->queue); - if (result != VK_SUCCESS) - return result; + if (!device->pdevice->caps.cpu_queue) { + /* Wait for completion of all work queued before the timestamp query */ + VkResult result = queue_wait_idle(queue, sync_info); + if (result != VK_SUCCESS) + return result; - struct v3dv_event_set_cpu_job_info *info = &job->cpu.event_set; - p_atomic_set(&info->event->state, info->state); + mtx_lock(&job->device->query_mutex); - return VK_SUCCESS; -} + /* Compute timestamp */ + struct timespec t; + clock_gettime(CLOCK_MONOTONIC, &t); -static bool -check_wait_events_complete(struct v3dv_job *job) -{ - assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS); + for (uint32_t i = 0; i < info->count; i++) { + assert(info->query + i < info->pool->query_count); + struct v3dv_query *query = &info->pool->queries[info->query + i]; + query->maybe_available = true; - struct v3dv_event_wait_cpu_job_info *info = &job->cpu.event_wait; - for (uint32_t i = 0; i < info->event_count; i++) { - if (!p_atomic_read(&info->events[i]->state)) - return false; - } - return true; -} + /* Value */ + uint8_t *value_addr = + ((uint8_t *) info->pool->timestamp.bo->map) + query->timestamp.offset; + *((uint64_t*)value_addr) = (i == 0) ? t.tv_sec * 1000000000ull + t.tv_nsec : 0ull; -static void -wait_thread_finish(struct v3dv_queue *queue, pthread_t thread) -{ - mtx_lock(&queue->mutex); - list_for_each_entry(struct v3dv_queue_submit_wait_info, info, - &queue->submit_wait_list, list_link) { - for (uint32_t i = 0; i < info->wait_thread_count; i++) { - if (info->wait_threads[i].thread == thread) { - info->wait_threads[i].finished = true; - goto done; - } + /* Availability */ + result = vk_sync_signal(&job->device->vk, query->timestamp.sync, 0); } - } - unreachable(!"Failed to finish wait thread: not found"); + cnd_broadcast(&job->device->query_ended); + mtx_unlock(&job->device->query_mutex); -done: - mtx_unlock(&queue->mutex); -} - -static void * -event_wait_thread_func(void *_job) -{ - struct v3dv_job *job = (struct v3dv_job *) _job; - assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS); - struct v3dv_event_wait_cpu_job_info *info = &job->cpu.event_wait; - - /* Wait for events to be signaled */ - const useconds_t wait_interval_ms = 1; - while (!check_wait_events_complete(job)) - usleep(wait_interval_ms * 1000); - - /* Now continue submitting pending jobs for the same command buffer after - * the wait job. - */ - struct v3dv_queue *queue = &job->device->queue; - list_for_each_entry_from(struct v3dv_job, pjob, job->list_link.next, - &job->cmd_buffer->jobs, list_link) { - /* We don't want to spawn more than one wait thread per command buffer. - * If this job also requires a wait for events, we will do the wait here. 
- */ - VkResult result = queue_submit_job(queue, pjob, info->sem_wait, NULL); - if (result == VK_NOT_READY) { - while (!check_wait_events_complete(pjob)) { - usleep(wait_interval_ms * 1000); - } - result = VK_SUCCESS; - } - - if (result != VK_SUCCESS) { - fprintf(stderr, "Wait thread job execution failed.\n"); - goto done; - } + return result; } -done: - wait_thread_finish(queue, pthread_self()); - return NULL; -} + struct drm_v3d_submit_cpu submit = {0}; -static VkResult -spawn_event_wait_thread(struct v3dv_job *job, pthread_t *wait_thread) + submit.bo_handle_count = 1; + submit.bo_handles = (uintptr_t)(void *)&info->pool->timestamp.bo->handle; -{ - assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS); - assert(job->cmd_buffer); - assert(wait_thread != NULL); + struct drm_v3d_timestamp_query timestamp = {0}; - if (pthread_create(wait_thread, NULL, event_wait_thread_func, job)) - return vk_error(job->device->instance, VK_ERROR_DEVICE_LOST); + set_ext(×tamp.base, NULL, DRM_V3D_EXT_ID_CPU_TIMESTAMP_QUERY, 0); - return VK_NOT_READY; -} + timestamp.count = info->count; -static VkResult -handle_wait_events_cpu_job(struct v3dv_job *job, - bool sem_wait, - pthread_t *wait_thread) -{ - assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS); - struct v3dv_event_wait_cpu_job_info *info = &job->cpu.event_wait; + uint32_t *offsets = + (uint32_t *) malloc(sizeof(uint32_t) * info->count); + uint32_t *syncs = + (uint32_t *) malloc(sizeof(uint32_t) * info->count); - /* If all events are signaled then we are done and can continue submitting - * the rest of the command buffer normally. - */ - if (check_wait_events_complete(job)) - return VK_SUCCESS; + for (uint32_t i = 0; i < info->count; i++) { + assert(info->query + i < info->pool->query_count); + struct v3dv_query *query = &info->pool->queries[info->query + i]; + query->maybe_available = true; - /* Otherwise, we put the rest of the command buffer on a wait thread until - * all events are signaled. We only spawn a new thread on the first - * wait job we see for a command buffer, any additional wait jobs in the - * same command buffer will run in that same wait thread and will get here - * with a NULL wait_thread pointer. - * - * Also, whether we spawn a wait thread or not, we always return - * VK_NOT_READY (unless an error happened), so we stop trying to submit - * any jobs in the same command buffer after the wait job. The wait thread - * will attempt to submit them after the wait completes. - */ - info->sem_wait = sem_wait; - if (wait_thread) - return spawn_event_wait_thread(job, wait_thread); - else - return VK_NOT_READY; -} + offsets[i] = query->timestamp.offset; + syncs[i] = vk_sync_as_drm_syncobj(query->timestamp.sync)->syncobj; + } -static VkResult -handle_copy_buffer_to_image_cpu_job(struct v3dv_job *job) -{ - assert(job->type == V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE); - struct v3dv_copy_buffer_to_image_cpu_job_info *info = - &job->cpu.copy_buffer_to_image; + timestamp.offsets = (uintptr_t)(void *)offsets; + timestamp.syncs = (uintptr_t)(void *)syncs; - /* Wait for all GPU work to finish first, since we may be accessing - * the BOs involved in the operation. 
+ struct drm_v3d_multi_sync ms = {0}; + + /* The CPU job should be serialized so it only executes after all previously + * submitted work has completed */ - v3dv_QueueWaitIdle(v3dv_queue_to_handle(&job->device->queue)); - - /* Map BOs */ - struct v3dv_bo *dst_bo = info->image->mem->bo; - assert(!dst_bo->map || dst_bo->map_size == dst_bo->size); - if (!dst_bo->map && !v3dv_bo_map(job->device, dst_bo, dst_bo->size)) - return vk_error(job->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - void *dst_ptr = dst_bo->map; - - struct v3dv_bo *src_bo = info->buffer->mem->bo; - assert(!src_bo->map || src_bo->map_size == src_bo->size); - if (!src_bo->map && !v3dv_bo_map(job->device, src_bo, src_bo->size)) - return vk_error(job->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - void *src_ptr = src_bo->map; - - const struct v3d_resource_slice *slice = - &info->image->slices[info->mip_level]; - - const struct pipe_box box = { - info->image_offset.x, info->image_offset.y, info->base_layer, - info->image_extent.width, info->image_extent.height, info->layer_count, - }; + job->serialize = V3DV_BARRIER_ALL; - /* Copy each layer */ - for (uint32_t i = 0; i < info->layer_count; i++) { - const uint32_t dst_offset = - v3dv_layer_offset(info->image, info->mip_level, info->base_layer + i); - const uint32_t src_offset = - info->buffer->mem_offset + info->buffer_offset + - info->buffer_layer_stride * i; - v3d_store_tiled_image( - dst_ptr + dst_offset, slice->stride, - src_ptr + src_offset, info->buffer_stride, - slice->tiling, info->image->cpp, slice->padded_height, &box); - } + set_multisync(&ms, sync_info, NULL, 0, (void *)×tamp, device, job, + V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs); + if (!ms.base.id) + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - return VK_SUCCESS; -} + submit.flags |= DRM_V3D_SUBMIT_EXTENSION; + submit.extensions = (uintptr_t)(void *)&ms; -static VkResult -handle_timestamp_query_cpu_job(struct v3dv_job *job) -{ - assert(job->type == V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY); - struct v3dv_timestamp_query_cpu_job_info *info = &job->cpu.query_timestamp; + int ret = v3dv_ioctl(device->pdevice->render_fd, + DRM_IOCTL_V3D_SUBMIT_CPU, &submit); - /* Wait for completion of all work queued before the timestamp query */ - v3dv_QueueWaitIdle(v3dv_queue_to_handle(&job->device->queue)); + free(offsets); + free(syncs); + multisync_free(device, &ms); - /* Compute timestamp */ - struct timespec t; - clock_gettime(CLOCK_MONOTONIC, &t); + queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false; - for (uint32_t i = 0; i < info->count; i++) { - assert(info->query + i < info->pool->query_count); - struct v3dv_query *query = &info->pool->queries[info->query + i]; - query->maybe_available = true; - if (i == 0) - query->value = t.tv_sec * 1000000000ull + t.tv_nsec; - } + if (ret) + return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m"); return VK_SUCCESS; } static VkResult -handle_csd_job(struct v3dv_queue *queue, - struct v3dv_job *job, - bool do_sem_wait); - -static VkResult handle_csd_indirect_cpu_job(struct v3dv_queue *queue, struct v3dv_job *job, - bool do_sem_wait) + struct v3dv_submit_sync_info *sync_info, + bool signal_syncs) { + struct v3dv_device *device = queue->device; + assert(job->type == V3DV_JOB_TYPE_CPU_CSD_INDIRECT); struct v3dv_csd_indirect_cpu_job_info *info = &job->cpu.csd_indirect; assert(info->csd_job); - /* Make sure the GPU is no longer using the indirect buffer*/ - assert(info->buffer && info->buffer->mem && info->buffer->mem->bo); - 
v3dv_bo_wait(queue->device, info->buffer->mem->bo, PIPE_TIMEOUT_INFINITE);
-
- /* Map the indirect buffer and read the dispatch parameters */
 assert(info->buffer && info->buffer->mem && info->buffer->mem->bo);
 struct v3dv_bo *bo = info->buffer->mem->bo;
- if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size))
- return vk_error(job->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
- assert(bo->map);
- const uint32_t offset = info->buffer->mem_offset + info->offset;
- const uint32_t *group_counts = (uint32_t *) (bo->map + offset);
- if (group_counts[0] == 0 || group_counts[1] == 0|| group_counts[2] == 0)
- return VK_SUCCESS;
+ if (!device->pdevice->caps.cpu_queue) {
+ /* Make sure the GPU is no longer using the indirect buffer */
+ v3dv_bo_wait(queue->device, bo, OS_TIMEOUT_INFINITE);

- if (memcmp(group_counts, info->csd_job->csd.wg_count,
- sizeof(info->csd_job->csd.wg_count)) != 0) {
- v3dv_cmd_buffer_rewrite_indirect_csd_job(info, group_counts);
- }
+ /* Map the indirect buffer and read the dispatch parameters */
+ if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size))
+ return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ assert(bo->map);

- handle_csd_job(queue, info->csd_job, do_sem_wait);
+ const uint32_t offset = info->buffer->mem_offset + info->offset;
+ const uint32_t *group_counts = (uint32_t *) (bo->map + offset);
+ if (group_counts[0] == 0 || group_counts[1] == 0 || group_counts[2] == 0)
+ return VK_SUCCESS;

- return VK_SUCCESS;
-}
+ if (memcmp(group_counts, info->csd_job->csd.wg_count,
+ sizeof(info->csd_job->csd.wg_count)) != 0) {
+ v3dv_cmd_buffer_rewrite_indirect_csd_job(queue->device, info, group_counts);
+ }

-static VkResult
-process_semaphores_to_signal(struct v3dv_device *device,
- uint32_t count, const VkSemaphore *sems)
-{
- if (count == 0) return VK_SUCCESS;
+ return VK_SUCCESS;
+ }

- int render_fd = device->pdevice->render_fd;
+ struct v3dv_job *csd_job = info->csd_job;

- int fd;
- mtx_lock(&device->mutex);
- drmSyncobjExportSyncFile(render_fd, device->last_job_sync, &fd);
- mtx_unlock(&device->mutex);
- if (fd == -1)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ struct drm_v3d_submit_cpu submit = {0};

- VkResult result = VK_SUCCESS;
- for (uint32_t i = 0; i < count; i++) {
- struct v3dv_semaphore *sem = v3dv_semaphore_from_handle(sems[i]);
-
- int ret;
- if (!sem->temp_sync)
- ret = drmSyncobjImportSyncFile(render_fd, sem->sync, fd);
- else
- ret = drmSyncobjImportSyncFile(render_fd, sem->temp_sync, fd);
-
- if (ret) {
- result = VK_ERROR_OUT_OF_HOST_MEMORY;
- break;
- }
+ submit.bo_handle_count = 1;
+ submit.bo_handles = (uintptr_t)(void *)&bo->handle;
+
+ csd_job->csd.submit.bo_handle_count = csd_job->bo_count;
+ uint32_t *bo_handles = (uint32_t *) malloc(sizeof(uint32_t) * csd_job->bo_count);
+ uint32_t bo_idx = 0;
+ set_foreach (csd_job->bos, entry) {
+ struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
+ bo_handles[bo_idx++] = bo->handle;
 }
+ csd_job->csd.submit.bo_handles = (uintptr_t)(void *)bo_handles;

- assert(fd >= 0);
- close(fd);
+ struct drm_v3d_indirect_csd indirect = {0};

- return result;
-}
+ set_ext(&indirect.base, NULL, DRM_V3D_EXT_ID_CPU_INDIRECT_CSD, 0);

-static VkResult
-process_fence_to_signal(struct v3dv_device *device, VkFence _fence)
-{
- if (_fence == VK_NULL_HANDLE)
- return VK_SUCCESS;
+ indirect.submit = csd_job->csd.submit;
+ indirect.offset = info->buffer->mem_offset + info->offset;
+ indirect.wg_size = info->wg_size;
+
+ for (int i = 0; i < 3; i++) {
+ if (info->wg_uniform_offsets[i]) {
+ assert(info->wg_uniform_offsets[i] >= 
(uint32_t *) csd_job->indirect.base);
+ indirect.wg_uniform_offsets[i] = info->wg_uniform_offsets[i] - (uint32_t *) csd_job->indirect.base;
+ } else {
+ indirect.wg_uniform_offsets[i] = 0xffffffff; /* No rewrite */
+ }
+ }

- struct v3dv_fence *fence = v3dv_fence_from_handle(_fence);
+ indirect.indirect = csd_job->indirect.bo->handle;

- int render_fd = device->pdevice->render_fd;
+ struct drm_v3d_multi_sync ms = {0};

- int fd;
- mtx_lock(&device->mutex);
- drmSyncobjExportSyncFile(render_fd, device->last_job_sync, &fd);
- mtx_unlock(&device->mutex);
- if (fd == -1)
+ /* We need to configure the semaphores of this job with the indirect
+ * CSD job, as the CPU job must obey the CSD job's synchronization
+ * demands, such as barriers.
+ */
+ set_multisync(&ms, sync_info, NULL, 0, (void *)&indirect, device, csd_job,
+ V3DV_QUEUE_CPU, V3DV_QUEUE_CSD, V3D_CPU, signal_syncs);
+ if (!ms.base.id)
 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

- int ret;
- if (!fence->temp_sync)
- ret = drmSyncobjImportSyncFile(render_fd, fence->sync, fd);
- else
- ret = drmSyncobjImportSyncFile(render_fd, fence->temp_sync, fd);
+ submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
+ submit.extensions = (uintptr_t)(void *)&ms;
+
+ int ret = v3dv_ioctl(device->pdevice->render_fd,
+ DRM_IOCTL_V3D_SUBMIT_CPU, &submit);
+
+ free(bo_handles);
+ multisync_free(device, &ms);

- assert(fd >= 0);
- close(fd);
+ queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;
+ queue->last_job_syncs.first[V3DV_QUEUE_CSD] = false;

- return ret ? VK_ERROR_OUT_OF_HOST_MEMORY : VK_SUCCESS;
+ if (ret)
+ return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");
+
+ return VK_SUCCESS;
 }

 static VkResult
 handle_cl_job(struct v3dv_queue *queue,
 struct v3dv_job *job,
- bool do_sem_wait)
+ uint32_t counter_pass_idx,
+ struct v3dv_submit_sync_info *sync_info,
+ bool signal_syncs)
 {
 struct v3dv_device *device = queue->device;
@@ -599,7 +919,8 @@ handle_cl_job(struct v3dv_queue *queue,
 struct v3dv_bo *bcl_fist_bo =
 list_first_entry(&job->bcl.bo_list, struct v3dv_bo, list_link);
 submit.bcl_start = bcl_fist_bo->offset;
- submit.bcl_end = job->bcl.bo->offset + v3dv_cl_offset(&job->bcl);
+ submit.bcl_end = job->suspending ? job->suspended_bcl_end :
+ job->bcl.bo->offset + v3dv_cl_offset(&job->bcl);
 submit.rcl_start = job->rcl.bo->offset;
 submit.rcl_end = job->rcl.bo->offset + v3dv_cl_offset(&job->rcl);
@@ -611,6 +932,17 @@ handle_cl_job(struct v3dv_queue *queue,
 if (job->tmu_dirty_rcl)
 submit.flags |= DRM_V3D_SUBMIT_CL_FLUSH_CACHE;
+ /* If the job uses VK_KHR_buffer_device_address we need to ensure all
+ * buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT
+ * are included.
+ */
+ if (job->uses_buffer_device_address) {
+ util_dynarray_foreach(&queue->device->device_address_bo_list,
+ struct v3dv_bo *, bo) {
+ v3dv_job_add_bo(job, *bo);
+ }
+ }
+
 submit.bo_handle_count = job->bo_count;
 uint32_t *bo_handles = (uint32_t *)
 malloc(sizeof(uint32_t) * submit.bo_handle_count);
@@ -622,34 +954,64 @@ handle_cl_job(struct v3dv_queue *queue,
 assert(bo_idx == submit.bo_handle_count);
 submit.bo_handles = (uintptr_t)(void *)bo_handles;
- /* We need a binning sync if we are waiting on a sempahore (do_sem_wait) or
- * if the job comes after a pipeline barrier than involves geometry stages
- * (needs_bcl_sync).
+ submit.perfmon_id = job->perf ? 
+ job->perf->kperfmon_ids[counter_pass_idx] : 0; + const bool needs_perf_sync = queue->last_perfmon_id != submit.perfmon_id; + queue->last_perfmon_id = submit.perfmon_id; + + /* We need a binning sync if we are the first CL job waiting on a semaphore + * with a wait stage that involves the geometry pipeline, or if the job + * comes after a pipeline barrier that involves geometry stages + * (needs_bcl_sync) or when performance queries are in use. * * We need a render sync if the job doesn't need a binning sync but has * still been flagged for serialization. It should be noted that RCL jobs * don't start until the previous RCL job has finished so we don't really * need to add a fence for those, however, we might need to wait on a CSD or * TFU job, which are not automatically serialized with CL jobs. - * - * FIXME: for now, if we are asked to wait on any semaphores, we just wait - * on the last job we submitted. In the future we might want to pass the - * actual syncobj of the wait semaphores so we don't block on the last RCL - * if we only need to wait for a previous CSD or TFU, for example, but - * we would have to extend our kernel interface to support the case where - * we have more than one semaphore to wait on. */ - const bool needs_bcl_sync = do_sem_wait || job->needs_bcl_sync; - const bool needs_rcl_sync = job->serialize && !needs_bcl_sync; + bool needs_bcl_sync = job->needs_bcl_sync || needs_perf_sync; + if (queue->last_job_syncs.first[V3DV_QUEUE_CL]) { + for (int i = 0; !needs_bcl_sync && i < sync_info->wait_count; i++) { + needs_bcl_sync = sync_info->waits[i].stage_mask & + (VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT | + VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT | + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT | + VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | + VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT | + VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT | + VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT | + VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT | + VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT | + VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT | + VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT | + VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT); + } + } + + bool needs_rcl_sync = job->serialize && !needs_bcl_sync; + + /* Replace single semaphore settings whenever our kernel-driver supports + * multiple semaphores extension. + */ + struct drm_v3d_multi_sync ms = { 0 }; + enum v3d_queue wait_stage = needs_rcl_sync ? V3D_RENDER : V3D_BIN; + set_multisync(&ms, sync_info, NULL, 0, NULL, device, job, + V3DV_QUEUE_CL, V3DV_QUEUE_CL, wait_stage, signal_syncs); + if (!ms.base.id) + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + + submit.flags |= DRM_V3D_SUBMIT_EXTENSION; + submit.extensions = (uintptr_t)(void *)&ms; + + /* We are using multisync so disable legacy single-sync interface */ + submit.in_sync_rcl = 0; + submit.in_sync_bcl = 0; + submit.out_sync = 0; - mtx_lock(&queue->device->mutex); - submit.in_sync_bcl = needs_bcl_sync ? device->last_job_sync : 0; - submit.in_sync_rcl = needs_rcl_sync ? 
device->last_job_sync : 0; - submit.out_sync = device->last_job_sync; v3dv_clif_dump(device, job, &submit); int ret = v3dv_ioctl(device->pdevice->render_fd, DRM_IOCTL_V3D_SUBMIT_CL, &submit); - mtx_unlock(&queue->device->mutex); static bool warned = false; if (ret && !warned) { @@ -659,9 +1021,12 @@ handle_cl_job(struct v3dv_queue *queue, } free(bo_handles); + multisync_free(device, &ms); + + queue->last_job_syncs.first[V3DV_QUEUE_CL] = false; if (ret) - return vk_error(device->instance, VK_ERROR_DEVICE_LOST); + return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CL failed: %m"); return VK_SUCCESS; } @@ -669,23 +1034,37 @@ handle_cl_job(struct v3dv_queue *queue, static VkResult handle_tfu_job(struct v3dv_queue *queue, struct v3dv_job *job, - bool do_sem_wait) + struct v3dv_submit_sync_info *sync_info, + bool signal_syncs) { + assert(!V3D_DBG(DISABLE_TFU)); + struct v3dv_device *device = queue->device; - const bool needs_sync = do_sem_wait || job->serialize; + /* Replace single semaphore settings whenever our kernel-driver supports + * multiple semaphore extension. + */ + struct drm_v3d_multi_sync ms = { 0 }; + set_multisync(&ms, sync_info, NULL, 0, NULL, device, job, + V3DV_QUEUE_TFU, V3DV_QUEUE_TFU, V3D_TFU, signal_syncs); + if (!ms.base.id) + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + + job->tfu.flags |= DRM_V3D_SUBMIT_EXTENSION; + job->tfu.extensions = (uintptr_t)(void *)&ms; + + /* We are using multisync so disable legacy single-sync interface */ + job->tfu.in_sync = 0; + job->tfu.out_sync = 0; - mtx_lock(&device->mutex); - job->tfu.in_sync = needs_sync ? device->last_job_sync : 0; - job->tfu.out_sync = device->last_job_sync; int ret = v3dv_ioctl(device->pdevice->render_fd, DRM_IOCTL_V3D_SUBMIT_TFU, &job->tfu); - mtx_unlock(&device->mutex); - if (ret != 0) { - fprintf(stderr, "Failed to submit TFU job: %d\n", ret); - return vk_error(device->instance, VK_ERROR_DEVICE_LOST); - } + multisync_free(device, &ms); + queue->last_job_syncs.first[V3DV_QUEUE_TFU] = false; + + if (ret != 0) + return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_TFU failed: %m"); return VK_SUCCESS; } @@ -693,12 +1072,25 @@ handle_tfu_job(struct v3dv_queue *queue, static VkResult handle_csd_job(struct v3dv_queue *queue, struct v3dv_job *job, - bool do_sem_wait) + uint32_t counter_pass_idx, + struct v3dv_submit_sync_info *sync_info, + bool signal_syncs) { struct v3dv_device *device = queue->device; struct drm_v3d_submit_csd *submit = &job->csd.submit; + /* If the job uses VK_KHR_buffer_device_address we need to ensure all + * buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT + * are included. + */ + if (job->uses_buffer_device_address) { + util_dynarray_foreach(&queue->device->device_address_bo_list, + struct v3dv_bo *, bo) { + v3dv_job_add_bo(job, *bo); + } + } + submit->bo_handle_count = job->bo_count; uint32_t *bo_handles = (uint32_t *) malloc(sizeof(uint32_t) * MAX2(4, submit->bo_handle_count * 2)); @@ -710,14 +1102,28 @@ handle_csd_job(struct v3dv_queue *queue, assert(bo_idx == submit->bo_handle_count); submit->bo_handles = (uintptr_t)(void *)bo_handles; - const bool needs_sync = do_sem_wait || job->serialize; + /* Replace single semaphore settings whenever our kernel-driver supports + * multiple semaphore extension. 
+ */ + struct drm_v3d_multi_sync ms = { 0 }; + set_multisync(&ms, sync_info, NULL, 0, NULL, device, job, + V3DV_QUEUE_CSD, V3DV_QUEUE_CSD, V3D_CSD, signal_syncs); + if (!ms.base.id) + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + + submit->flags |= DRM_V3D_SUBMIT_EXTENSION; + submit->extensions = (uintptr_t)(void *)&ms; + + /* We are using multisync so disable legacy single-sync interface */ + submit->in_sync = 0; + submit->out_sync = 0; + + submit->perfmon_id = job->perf ? + job->perf->kperfmon_ids[counter_pass_idx] : 0; + queue->last_perfmon_id = submit->perfmon_id; - mtx_lock(&queue->device->mutex); - submit->in_sync = needs_sync ? device->last_job_sync : 0; - submit->out_sync = device->last_job_sync; int ret = v3dv_ioctl(device->pdevice->render_fd, DRM_IOCTL_V3D_SUBMIT_CSD, submit); - mtx_unlock(&queue->device->mutex); static bool warned = false; if (ret && !warned) { @@ -728,43 +1134,39 @@ handle_csd_job(struct v3dv_queue *queue, free(bo_handles); + multisync_free(device, &ms); + queue->last_job_syncs.first[V3DV_QUEUE_CSD] = false; + if (ret) - return vk_error(device->instance, VK_ERROR_DEVICE_LOST); + return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CSD failed: %m"); return VK_SUCCESS; } static VkResult -queue_submit_job(struct v3dv_queue *queue, +queue_handle_job(struct v3dv_queue *queue, struct v3dv_job *job, - bool do_sem_wait, - pthread_t *wait_thread) + uint32_t counter_pass_idx, + struct v3dv_submit_sync_info *sync_info, + bool signal_syncs) { - assert(job); - switch (job->type) { case V3DV_JOB_TYPE_GPU_CL: - return handle_cl_job(queue, job, do_sem_wait); + return handle_cl_job(queue, job, counter_pass_idx, sync_info, signal_syncs); case V3DV_JOB_TYPE_GPU_TFU: - return handle_tfu_job(queue, job, do_sem_wait); + return handle_tfu_job(queue, job, sync_info, signal_syncs); case V3DV_JOB_TYPE_GPU_CSD: - return handle_csd_job(queue, job, do_sem_wait); + return handle_csd_job(queue, job, counter_pass_idx, sync_info, signal_syncs); case V3DV_JOB_TYPE_CPU_RESET_QUERIES: - return handle_reset_query_cpu_job(job); + return handle_reset_query_cpu_job(queue, job, sync_info, signal_syncs); case V3DV_JOB_TYPE_CPU_END_QUERY: - return handle_end_query_cpu_job(job); + return handle_end_query_cpu_job(job, counter_pass_idx); case V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS: - return handle_copy_query_results_cpu_job(job); - case V3DV_JOB_TYPE_CPU_SET_EVENT: - return handle_set_event_cpu_job(job, wait_thread != NULL); - case V3DV_JOB_TYPE_CPU_WAIT_EVENTS: - return handle_wait_events_cpu_job(job, do_sem_wait, wait_thread); - case V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE: - return handle_copy_buffer_to_image_cpu_job(job); + return handle_copy_query_results_cpu_job(queue, job, sync_info, signal_syncs); case V3DV_JOB_TYPE_CPU_CSD_INDIRECT: - return handle_csd_indirect_cpu_job(queue, job, do_sem_wait); + return handle_csd_indirect_cpu_job(queue, job, sync_info, signal_syncs); case V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY: - return handle_timestamp_query_cpu_job(job); + return handle_timestamp_query_cpu_job(queue, job, sync_info, signal_syncs); default: unreachable("Unhandled job type"); } @@ -777,772 +1179,128 @@ queue_create_noop_job(struct v3dv_queue *queue) queue->noop_job = vk_zalloc(&device->vk.alloc, sizeof(struct v3dv_job), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!queue->noop_job) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); v3dv_job_init(queue->noop_job, V3DV_JOB_TYPE_GPU_CL, device, NULL, -1); v3dv_X(device, 
job_emit_noop)(queue->noop_job);

+ /* We use no-op jobs to signal semaphores/fences. These jobs need to be
+ * serialized across all hw queues to comply with Vulkan's signal operation
+ * order requirements, which basically require that signal operations occur
+ * in submission order.
+ */
+ queue->noop_job->serialize = V3DV_BARRIER_ALL;
+
 return VK_SUCCESS;
 }

 static VkResult
-queue_submit_noop_job(struct v3dv_queue *queue, const VkSubmitInfo *pSubmit)
+queue_submit_noop_job(struct v3dv_queue *queue,
+ uint32_t counter_pass_idx,
+ struct v3dv_submit_sync_info *sync_info,
+ bool signal_syncs)
 {
- /* VkQueue host access is externally synchronized so we don't need to lock
- * here for the static variable.
- */
 if (!queue->noop_job) {
 VkResult result = queue_create_noop_job(queue);
 if (result != VK_SUCCESS)
 return result;
 }

- return queue_submit_job(queue, queue->noop_job,
- pSubmit->waitSemaphoreCount > 0, NULL);
-}
-
-static VkResult
-queue_submit_cmd_buffer(struct v3dv_queue *queue,
- struct v3dv_cmd_buffer *cmd_buffer,
- const VkSubmitInfo *pSubmit,
- pthread_t *wait_thread)
-{
- assert(cmd_buffer);
- assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_EXECUTABLE);
-
- if (list_is_empty(&cmd_buffer->jobs))
- return queue_submit_noop_job(queue, pSubmit);
-
- list_for_each_entry_safe(struct v3dv_job, job,
- &cmd_buffer->jobs, list_link) {
- VkResult result = queue_submit_job(queue, job,
- pSubmit->waitSemaphoreCount > 0,
- wait_thread);
- if (result != VK_SUCCESS)
- return result;
- }
-
- return VK_SUCCESS;
-}
-
-static void
-add_wait_thread_to_list(struct v3dv_device *device,
- pthread_t thread,
- struct v3dv_queue_submit_wait_info **wait_info)
-{
- /* If this is the first time we spawn a wait thread for this queue
- * submission create a v3dv_queue_submit_wait_info to track this and
- * any other threads in the same submission and add it to the global list
- * in the queue.
- */
- if (*wait_info == NULL) {
- *wait_info =
- vk_zalloc(&device->vk.alloc, sizeof(struct v3dv_queue_submit_wait_info), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
- (*wait_info)->device = device;
- }
-
- /* And add the thread to the list of wait threads for this submission */
- const uint32_t thread_idx = (*wait_info)->wait_thread_count;
- assert(thread_idx < 16);
- (*wait_info)->wait_threads[thread_idx].thread = thread;
- (*wait_info)->wait_threads[thread_idx].finished = false;
- (*wait_info)->wait_thread_count++;
-}
-
-static void
-add_signal_semaphores_to_wait_list(struct v3dv_device *device,
- const VkSubmitInfo *pSubmit,
- struct v3dv_queue_submit_wait_info *wait_info)
-{
- assert(wait_info);
-
- if (pSubmit->signalSemaphoreCount == 0)
- return;
-
- /* FIXME: We put all the semaphores in a list and we signal all of them
- * together from the submit master thread when the last wait thread in the
- * submit completes. We could do better though: group the semaphores per
- * submit and signal them as soon as all wait threads for a particular
- * submit completes. Not sure if the extra work would be worth it though,
- * since we only spawn waith threads for event waits and only when the
- * event if set from the host after the queue submission. 
- */ - - /* Check the size of the current semaphore list */ - const uint32_t prev_count = wait_info->signal_semaphore_count; - const uint32_t prev_alloc_size = prev_count * sizeof(VkSemaphore); - VkSemaphore *prev_list = wait_info->signal_semaphores; - - /* Resize the list to hold the additional semaphores */ - const uint32_t extra_alloc_size = - pSubmit->signalSemaphoreCount * sizeof(VkSemaphore); - wait_info->signal_semaphore_count += pSubmit->signalSemaphoreCount; - wait_info->signal_semaphores = - vk_alloc(&device->vk.alloc, prev_alloc_size + extra_alloc_size, 8, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - - /* Copy the old list to the new allocation and free the old list */ - if (prev_count > 0) { - memcpy(wait_info->signal_semaphores, prev_list, prev_alloc_size); - vk_free(&device->vk.alloc, prev_list); - } - - /* Add the new semaphores to the list */ - memcpy(wait_info->signal_semaphores + prev_count, - pSubmit->pSignalSemaphores, extra_alloc_size); -} - -static VkResult -queue_submit_cmd_buffer_batch(struct v3dv_queue *queue, - const VkSubmitInfo *pSubmit, - struct v3dv_queue_submit_wait_info **wait_info) -{ - VkResult result = VK_SUCCESS; - bool has_wait_threads = false; - - /* Even if we don't have any actual work to submit we still need to wait - * on the wait semaphores and signal the signal semaphores and fence, so - * in this scenario we just submit a trivial no-op job so we don't have - * to do anything special, it should not be a common case anyway. - */ - if (pSubmit->commandBufferCount == 0) { - result = queue_submit_noop_job(queue, pSubmit); - } else { - for (uint32_t i = 0; i < pSubmit->commandBufferCount; i++) { - pthread_t wait_thread; - struct v3dv_cmd_buffer *cmd_buffer = - v3dv_cmd_buffer_from_handle(pSubmit->pCommandBuffers[i]); - result = queue_submit_cmd_buffer(queue, cmd_buffer, pSubmit, - &wait_thread); - - /* We get VK_NOT_READY if we had to spawn a wait thread for the - * command buffer. In that scenario, we want to continue submitting - * any pending command buffers in the batch, but we don't want to - * process any signal semaphores for the batch until we know we have - * submitted every job for every command buffer in the batch. - */ - if (result == VK_NOT_READY) { - result = VK_SUCCESS; - add_wait_thread_to_list(queue->device, wait_thread, wait_info); - has_wait_threads = true; - } - - if (result != VK_SUCCESS) - break; - } - } - - if (result != VK_SUCCESS) - return result; - - /* If had to emit any wait threads in this submit we need to wait for all - * of them to complete before we can signal any semaphores. 
- */ - if (!has_wait_threads) { - return process_semaphores_to_signal(queue->device, - pSubmit->signalSemaphoreCount, - pSubmit->pSignalSemaphores); - } else { - assert(*wait_info); - add_signal_semaphores_to_wait_list(queue->device, pSubmit, *wait_info); - return VK_NOT_READY; - } + assert(queue->noop_job); + return queue_handle_job(queue, queue->noop_job, counter_pass_idx, + sync_info, signal_syncs); } -static void * -master_wait_thread_func(void *_wait_info) +VkResult +v3dv_queue_driver_submit(struct vk_queue *vk_queue, + struct vk_queue_submit *submit) { - struct v3dv_queue_submit_wait_info *wait_info = - (struct v3dv_queue_submit_wait_info *) _wait_info; - - struct v3dv_queue *queue = &wait_info->device->queue; - - /* Wait for all command buffer wait threads to complete */ - for (uint32_t i = 0; i < wait_info->wait_thread_count; i++) { - int res = pthread_join(wait_info->wait_threads[i].thread, NULL); - if (res != 0) - fprintf(stderr, "Wait thread failed to join.\n"); - } - - /* Signal semaphores and fences */ + struct v3dv_queue *queue = container_of(vk_queue, struct v3dv_queue, vk); VkResult result; - result = process_semaphores_to_signal(wait_info->device, - wait_info->signal_semaphore_count, - wait_info->signal_semaphores); - if (result != VK_SUCCESS) - fprintf(stderr, "Wait thread semaphore signaling failed."); - - result = process_fence_to_signal(wait_info->device, wait_info->fence); - if (result != VK_SUCCESS) - fprintf(stderr, "Wait thread fence signaling failed."); - - /* Release wait_info */ - mtx_lock(&queue->mutex); - list_del(&wait_info->list_link); - mtx_unlock(&queue->mutex); - - vk_free(&wait_info->device->vk.alloc, wait_info->signal_semaphores); - vk_free(&wait_info->device->vk.alloc, wait_info); - - return NULL; -} - - -static VkResult -spawn_master_wait_thread(struct v3dv_queue *queue, - struct v3dv_queue_submit_wait_info *wait_info) - -{ - VkResult result = VK_SUCCESS; - - mtx_lock(&queue->mutex); - if (pthread_create(&wait_info->master_wait_thread, NULL, - master_wait_thread_func, wait_info)) { - result = vk_error(queue->device->instance, VK_ERROR_DEVICE_LOST); - goto done; - } - - list_addtail(&wait_info->list_link, &queue->submit_wait_list); - -done: - mtx_unlock(&queue->mutex); - return result; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_QueueSubmit(VkQueue _queue, - uint32_t submitCount, - const VkSubmitInfo* pSubmits, - VkFence fence) -{ - V3DV_FROM_HANDLE(v3dv_queue, queue, _queue); - - struct v3dv_queue_submit_wait_info *wait_info = NULL; - - VkResult result = VK_SUCCESS; - for (uint32_t i = 0; i < submitCount; i++) { - result = queue_submit_cmd_buffer_batch(queue, &pSubmits[i], &wait_info); - if (result != VK_SUCCESS && result != VK_NOT_READY) - goto done; - } - - if (!wait_info) { - assert(result != VK_NOT_READY); - result = process_fence_to_signal(queue->device, fence); - goto done; - } - - /* We emitted wait threads, so we have to spwan a master thread for this - * queue submission that waits for all other threads to complete and then - * will signal any semaphores and fences. 
- */ - assert(wait_info); - wait_info->fence = fence; - result = spawn_master_wait_thread(queue, wait_info); - -done: - return result; -} - -static void -destroy_syncobj(uint32_t device_fd, uint32_t *sync) -{ - assert(sync); - drmSyncobjDestroy(device_fd, *sync); - *sync = 0; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_CreateSemaphore(VkDevice _device, - const VkSemaphoreCreateInfo *pCreateInfo, - const VkAllocationCallbacks *pAllocator, - VkSemaphore *pSemaphore) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - - assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO); - - struct v3dv_semaphore *sem = - vk_object_zalloc(&device->vk, pAllocator, sizeof(struct v3dv_semaphore), - VK_OBJECT_TYPE_SEMAPHORE); - if (sem == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - - int ret = drmSyncobjCreate(device->pdevice->render_fd, 0, &sem->sync); - if (ret) { - vk_object_free(&device->vk, pAllocator, sem); - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - } - - *pSemaphore = v3dv_semaphore_to_handle(sem); - return VK_SUCCESS; -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_GetPhysicalDeviceExternalSemaphoreProperties( - VkPhysicalDevice physicalDevice, - const VkPhysicalDeviceExternalSemaphoreInfo *pExternalSemaphoreInfo, - VkExternalSemaphoreProperties *pExternalSemaphoreProperties) -{ - switch (pExternalSemaphoreInfo->handleType) { - case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT: - case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT: - pExternalSemaphoreProperties->exportFromImportedHandleTypes = - VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT | - VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT; - pExternalSemaphoreProperties->compatibleHandleTypes = - VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT | - VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT; - - /* FIXME: we can't import external semaphores until we improve the kernel - * submit interface to handle multiple in syncobjs, because once we have - * an imported semaphore in our list of semaphores to wait on, we can no - * longer use the workaround of waiting on the last syncobj fence produced - * from the device, since the imported semaphore may not (and in fact, it - * would typically not) have been produced from same device. - * - * This behavior is exercised via dEQP-VK.synchronization.cross_instance.*. - * Particularly, this test: - * dEQP-VK.synchronization.cross_instance.dedicated. - * write_ssbo_compute_read_vertex_input.buffer_16384_binary_semaphore_fd - * fails consistently because of this, so it'll be a good reference to - * verify the implementation when the kernel bits are in place. - */ - pExternalSemaphoreProperties->externalSemaphoreFeatures = 0; - - /* FIXME: See comment in GetPhysicalDeviceExternalFenceProperties - * for details on why we can't export to SYNC_FD. 
- */ - if (pExternalSemaphoreInfo->handleType != - VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT) { - pExternalSemaphoreProperties->externalSemaphoreFeatures |= - VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT; - } - break; - default: - pExternalSemaphoreProperties->exportFromImportedHandleTypes = 0; - pExternalSemaphoreProperties->compatibleHandleTypes = 0; - pExternalSemaphoreProperties->externalSemaphoreFeatures = 0; - break; - } -} + struct v3dv_submit_sync_info sync_info = { + .wait_count = submit->wait_count, + .waits = submit->waits, + .signal_count = submit->signal_count, + .signals = submit->signals, + }; -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_ImportSemaphoreFdKHR( - VkDevice _device, - const VkImportSemaphoreFdInfoKHR *pImportSemaphoreFdInfo) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_semaphore, sem, pImportSemaphoreFdInfo->semaphore); - - assert(pImportSemaphoreFdInfo->sType == - VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_FD_INFO_KHR); - - int fd = pImportSemaphoreFdInfo->fd; - int render_fd = device->pdevice->render_fd; - - bool is_temporary = - pImportSemaphoreFdInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT || - (pImportSemaphoreFdInfo->flags & VK_SEMAPHORE_IMPORT_TEMPORARY_BIT); - - uint32_t new_sync; - switch (pImportSemaphoreFdInfo->handleType) { - case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT: { - /* "If handleType is VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT, the - * special value -1 for fd is treated like a valid sync file descriptor - * referring to an object that has already signaled. The import - * operation will succeed and the VkSemaphore will have a temporarily - * imported payload as if a valid file descriptor had been provided." - */ - unsigned flags = fd == -1 ? DRM_SYNCOBJ_CREATE_SIGNALED : 0; - if (drmSyncobjCreate(render_fd, flags, &new_sync)) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - - if (fd != -1) { - if (drmSyncobjImportSyncFile(render_fd, new_sync, fd)) { - drmSyncobjDestroy(render_fd, new_sync); - return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE); + for (int i = 0; i < V3DV_QUEUE_COUNT; i++) + queue->last_job_syncs.first[i] = true; + + struct v3dv_job *first_suspend_job = NULL; + struct v3dv_job *current_suspend_job = NULL; + for (uint32_t i = 0; i < submit->command_buffer_count; i++) { + struct v3dv_cmd_buffer *cmd_buffer = + container_of(submit->command_buffers[i], struct v3dv_cmd_buffer, vk); + list_for_each_entry_safe(struct v3dv_job, job, + &cmd_buffer->jobs, list_link) { + if (job->suspending) { + job = v3dv_X(job->device, + cmd_buffer_prepare_suspend_job_for_submit)(job); + if (!job) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; } - } - break; - } - case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT: { - if (drmSyncobjFDToHandle(render_fd, fd, &new_sync)) - return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE); - break; - } - default: - return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE); - } - destroy_syncobj(render_fd, &sem->temp_sync); - if (is_temporary) { - sem->temp_sync = new_sync; - } else { - destroy_syncobj(render_fd, &sem->sync); - sem->sync = new_sync; - } - - /* From the Vulkan 1.0.53 spec: - * - * "Importing a semaphore payload from a file descriptor transfers - * ownership of the file descriptor from the application to the - * Vulkan implementation. The application must not perform any - * operations on the file descriptor after a successful import." 
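
The SYNC_FD import path quoted here condenses to one helper. A sketch under the same spec rule (fd == -1 means an already-signaled payload); import_sync_fd is a hypothetical name.

#include <stdint.h>
#include <unistd.h>
#include <xf86drm.h>

static int
import_sync_fd(int render_fd, int fd, uint32_t *out_sync)
{
   /* fd == -1 must behave like a valid, already-signaled sync file. */
   uint32_t flags = (fd == -1) ? DRM_SYNCOBJ_CREATE_SIGNALED : 0;

   if (drmSyncobjCreate(render_fd, flags, out_sync))
      return -1;

   if (fd != -1) {
      if (drmSyncobjImportSyncFile(render_fd, *out_sync, fd)) {
         drmSyncobjDestroy(render_fd, *out_sync);
         return -1; /* import failed: leave the fd open for the caller */
      }
      /* On success, ownership of the fd transfers to the implementation. */
      close(fd);
   }
   return 0;
}
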
- * - * If the import fails, we leave the file descriptor open. - */ - if (fd != -1) - close(fd); - - return VK_SUCCESS; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetSemaphoreFdKHR(VkDevice _device, - const VkSemaphoreGetFdInfoKHR *pGetFdInfo, - int *pFd) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_semaphore, sem, pGetFdInfo->semaphore); - - assert(pGetFdInfo->sType == VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR); - - *pFd = -1; - int render_fd = device->pdevice->render_fd; - switch (pGetFdInfo->handleType) { - case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT: { - drmSyncobjExportSyncFile(render_fd, sem->sync, pFd); - if (*pFd == -1) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - break; - case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT: - drmSyncobjHandleToFD(render_fd, sem->sync, pFd); - if (*pFd == -1) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - break; - } - default: - unreachable("Unsupported external semaphore handle type"); - } - - return VK_SUCCESS; -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_DestroySemaphore(VkDevice _device, - VkSemaphore semaphore, - const VkAllocationCallbacks *pAllocator) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_semaphore, sem, semaphore); - - if (sem == NULL) - return; - - destroy_syncobj(device->pdevice->render_fd, &sem->sync); - destroy_syncobj(device->pdevice->render_fd, &sem->temp_sync); - - vk_object_free(&device->vk, pAllocator, sem); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_CreateFence(VkDevice _device, - const VkFenceCreateInfo *pCreateInfo, - const VkAllocationCallbacks *pAllocator, - VkFence *pFence) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - - assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_FENCE_CREATE_INFO); - - struct v3dv_fence *fence = - vk_object_zalloc(&device->vk, pAllocator, sizeof(struct v3dv_fence), - VK_OBJECT_TYPE_FENCE); - if (fence == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - - unsigned flags = 0; - if (pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT) - flags |= DRM_SYNCOBJ_CREATE_SIGNALED; - int ret = drmSyncobjCreate(device->pdevice->render_fd, flags, &fence->sync); - if (ret) { - vk_object_free(&device->vk, pAllocator, fence); - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - } - - *pFence = v3dv_fence_to_handle(fence); + if (job->suspending && !job->resuming) { + assert(!first_suspend_job); + assert(!current_suspend_job); + first_suspend_job = job; + } - return VK_SUCCESS; -} + if (job->resuming) { + assert(first_suspend_job); + assert(current_suspend_job); + v3dv_X(job->device, job_patch_resume_address)(first_suspend_job, + current_suspend_job, + job); + current_suspend_job = NULL; + } -VKAPI_ATTR void VKAPI_CALL -v3dv_GetPhysicalDeviceExternalFenceProperties( - VkPhysicalDevice physicalDevice, - const VkPhysicalDeviceExternalFenceInfo *pExternalFenceInfo, - VkExternalFenceProperties *pExternalFenceProperties) + if (job->suspending) { + current_suspend_job = job; + } else { + assert(!current_suspend_job); + struct v3dv_job *submit_job = first_suspend_job ? 
+ first_suspend_job : job; + result = + queue_handle_job(queue, submit_job, submit->perf_pass_index, + &sync_info, false); -{ - switch (pExternalFenceInfo->handleType) { - case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT: - case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT: - pExternalFenceProperties->exportFromImportedHandleTypes = - VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT | - VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT; - pExternalFenceProperties->compatibleHandleTypes = - VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT | - VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT; - pExternalFenceProperties->externalFenceFeatures = - VK_EXTERNAL_FENCE_FEATURE_IMPORTABLE_BIT; - - /* FIXME: SYNC_FD exports the actual fence referenced by the syncobj, not - * the syncobj itself, and that fence is only created after we have - * submitted to the kernel and updated the syncobj for the fence to import - * the actual DRM fence created with the submission. Unfortunately, if the - * queue submission has a 'wait for events' we may hold any jobs after the - * wait in a user-space thread until the events are signaled, and in that - * case we don't update the out fence of the submit until the events are - * signaled and we can submit all the jobs involved with the vkQueueSubmit - * call. This means that if the applications submits with an out fence and - * a wait for events, trying to export the out fence to a SYNC_FD rigth - * after the submission and before the events are signaled will fail, - * because the actual DRM fence won't exist yet. This is not a problem - * with OPAQUE_FD because in this case we export the entire syncobj, not - * the underlying DRM fence. To fix this we need to rework our kernel - * interface to be more flexible and accept multiple in/out syncobjs so - * we can implement event waits as regular fence waits on the kernel side, - * until then, we can only reliably export OPAQUE_FD. - */ - if (pExternalFenceInfo->handleType != - VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT) { - pExternalFenceProperties->externalFenceFeatures |= - VK_EXTERNAL_FENCE_FEATURE_EXPORTABLE_BIT; - } - break; - default: - pExternalFenceProperties->exportFromImportedHandleTypes = 0; - pExternalFenceProperties->compatibleHandleTypes = 0; - pExternalFenceProperties->externalFenceFeatures = 0; - break; - } -} + if (result != VK_SUCCESS) + return result; -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_ImportFenceFdKHR(VkDevice _device, - const VkImportFenceFdInfoKHR *pImportFenceFdInfo) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_fence, fence, pImportFenceFdInfo->fence); - - assert(pImportFenceFdInfo->sType == - VK_STRUCTURE_TYPE_IMPORT_FENCE_FD_INFO_KHR); - - int fd = pImportFenceFdInfo->fd; - int render_fd = device->pdevice->render_fd; - - bool is_temporary = - pImportFenceFdInfo->handleType == VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT || - (pImportFenceFdInfo->flags & VK_FENCE_IMPORT_TEMPORARY_BIT); - - uint32_t new_sync; - switch (pImportFenceFdInfo->handleType) { - case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT: { - /* "If handleType is VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT, the - * special value -1 for fd is treated like a valid sync file descriptor - * referring to an object that has already signaled. The import - * operation will succeed and the VkFence will have a temporarily - * imported payload as if a valid file descriptor had been provided." - */ - unsigned flags = fd == -1 ? 
DRM_SYNCOBJ_CREATE_SIGNALED : 0; - if (drmSyncobjCreate(render_fd, flags, &new_sync)) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - - if (fd != -1) { - if (drmSyncobjImportSyncFile(render_fd, new_sync, fd)) { - drmSyncobjDestroy(render_fd, new_sync); - return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE); + first_suspend_job = NULL; } } - break; - } - case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT: { - if (drmSyncobjFDToHandle(render_fd, fd, &new_sync)) - return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE); - break; - } - default: - return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE); - } - - destroy_syncobj(render_fd, &fence->temp_sync); - if (is_temporary) { - fence->temp_sync = new_sync; - } else { - destroy_syncobj(render_fd, &fence->sync); - fence->sync = new_sync; - } - - /* From the Vulkan 1.0.53 spec: - * - * "Importing a fence payload from a file descriptor transfers - * ownership of the file descriptor from the application to the - * Vulkan implementation. The application must not perform any - * operations on the file descriptor after a successful import." - * - * If the import fails, we leave the file descriptor open. - */ - if (fd != -1) - close(fd); - - return VK_SUCCESS; -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_DestroyFence(VkDevice _device, - VkFence _fence, - const VkAllocationCallbacks *pAllocator) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_fence, fence, _fence); - - if (fence == NULL) - return; - - destroy_syncobj(device->pdevice->render_fd, &fence->sync); - destroy_syncobj(device->pdevice->render_fd, &fence->temp_sync); - vk_object_free(&device->vk, pAllocator, fence); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetFenceStatus(VkDevice _device, VkFence _fence) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_fence, fence, _fence); - - int ret = drmSyncobjWait(device->pdevice->render_fd, &fence->sync, 1, - 0, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT, NULL); - if (ret == -ETIME) - return VK_NOT_READY; - else if (ret) - return vk_error(device->instance, VK_ERROR_DEVICE_LOST); - return VK_SUCCESS; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetFenceFdKHR(VkDevice _device, - const VkFenceGetFdInfoKHR *pGetFdInfo, - int *pFd) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_fence, fence, pGetFdInfo->fence); - - assert(pGetFdInfo->sType == VK_STRUCTURE_TYPE_FENCE_GET_FD_INFO_KHR); - - *pFd = -1; - int render_fd = device->pdevice->render_fd; - switch (pGetFdInfo->handleType) { - case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT: { - drmSyncobjExportSyncFile(render_fd, fence->sync, pFd); - if (*pFd == -1) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - break; - case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT: - drmSyncobjHandleToFD(render_fd, fence->sync, pFd); - if (*pFd == -1) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - break; - } - default: - unreachable("Unsupported external fence handle type"); - } - - return VK_SUCCESS; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_ResetFences(VkDevice _device, uint32_t fenceCount, const VkFence *pFences) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - - uint32_t *syncobjs = vk_alloc(&device->vk.alloc, - sizeof(*syncobjs) * fenceCount, 8, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (!syncobjs) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - - int render_fd = device->pdevice->render_fd; - uint32_t reset_count = 0; - for 
(uint32_t i = 0; i < fenceCount; i++) { - struct v3dv_fence *fence = v3dv_fence_from_handle(pFences[i]); - /* From the Vulkan spec, section 'Importing Fence Payloads': + /* If the command buffer ends with a barrier we need to consume it now. * - * "If the import is temporary, the fence will be restored to its - * permanent state the next time that fence is passed to - * vkResetFences. - * - * Note: Restoring a fence to its prior permanent payload is a - * distinct operation from resetting a fence payload." - * - * To restore the previous state, we just need to destroy the temporary. + * FIXME: this will drain all hw queues. Instead, we could use the pending + * barrier state to limit the queues we serialize against. */ - if (fence->temp_sync) - destroy_syncobj(render_fd, &fence->temp_sync); - else - syncobjs[reset_count++] = fence->sync; + if (cmd_buffer->state.barrier.dst_mask) { + result = queue_submit_noop_job(queue, submit->perf_pass_index, + &sync_info, false); + if (result != VK_SUCCESS) + return result; + } } - int ret = 0; - if (reset_count > 0) - ret = drmSyncobjReset(render_fd, syncobjs, reset_count); + assert(!first_suspend_job); + assert(!current_suspend_job); - vk_free(&device->vk.alloc, syncobjs); - - if (ret) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - return VK_SUCCESS; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_WaitForFences(VkDevice _device, - uint32_t fenceCount, - const VkFence *pFences, - VkBool32 waitAll, - uint64_t timeout) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - - const uint64_t abs_timeout = get_absolute_timeout(timeout); - - uint32_t *syncobjs = vk_alloc(&device->vk.alloc, - sizeof(*syncobjs) * fenceCount, 8, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (!syncobjs) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - - for (uint32_t i = 0; i < fenceCount; i++) { - struct v3dv_fence *fence = v3dv_fence_from_handle(pFences[i]); - syncobjs[i] = fence->temp_sync ? fence->temp_sync : fence->sync; + /* Handle signaling now */ + if (submit->signal_count > 0) { + /* Finish by submitting a no-op job that synchronizes across all queues. + * This will ensure that the signal semaphores don't get triggered until + * all work on any queue completes. See Vulkan's signal operation order + * requirements. 
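
For reference, the removed fence-wait loop boils down to retrying drmSyncobjWait until an absolute deadline passes. A self-contained sketch, assuming the deadline is expressed on the kernel's monotonic clock:

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <time.h>
#include <xf86drm.h>

static int64_t
now_ns(void)
{
   struct timespec ts;
   clock_gettime(CLOCK_MONOTONIC, &ts);
   return (int64_t)ts.tv_sec * 1000000000 + ts.tv_nsec;
}

/* Wait on a set of syncobj-backed fences until an absolute deadline,
 * retrying on -ETIME the way the removed vkWaitForFences loop did.
 */
static int
wait_fences(int render_fd, uint32_t *syncobjs, uint32_t count,
            int64_t abs_timeout_ns, bool wait_all)
{
   uint32_t flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT;
   if (wait_all)
      flags |= DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL;

   int ret;
   do {
      ret = drmSyncobjWait(render_fd, syncobjs, count,
                           abs_timeout_ns, flags, NULL);
   } while (ret == -ETIME && now_ns() < abs_timeout_ns);

   return ret; /* 0 on success, -ETIME on timeout */
}
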
+ */ + return queue_submit_noop_job(queue, submit->perf_pass_index, + &sync_info, true); } - unsigned flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT; - if (waitAll) - flags |= DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL; - - int ret; - do { - ret = drmSyncobjWait(device->pdevice->render_fd, syncobjs, fenceCount, - timeout, flags, NULL); - } while (ret == -ETIME && gettime_ns() < abs_timeout); - - vk_free(&device->vk.alloc, syncobjs); - - if (ret == -ETIME) - return VK_TIMEOUT; - else if (ret) - return vk_error(device->instance, VK_ERROR_DEVICE_LOST); return VK_SUCCESS; } @@ -1553,5 +1311,5 @@ v3dv_QueueBindSparse(VkQueue _queue, VkFence fence) { V3DV_FROM_HANDLE(v3dv_queue, queue, _queue); - return vk_error(queue->device->instance, VK_ERROR_FEATURE_NOT_PRESENT); + return vk_error(queue, VK_ERROR_FEATURE_NOT_PRESENT); } diff --git a/src/broadcom/vulkan/v3dv_uniforms.c b/src/broadcom/vulkan/v3dv_uniforms.c index 47bc3a0b17c..eab8c0f0840 100644 --- a/src/broadcom/vulkan/v3dv_uniforms.c +++ b/src/broadcom/vulkan/v3dv_uniforms.c @@ -1,5 +1,5 @@ /* - * Copyright © 2019 Raspberry Pi + * Copyright © 2019 Raspberry Pi Ltd * * Based in part on v3d driver which is: * @@ -26,16 +26,6 @@ */ #include "v3dv_private.h" -#include "vk_format_info.h" - -/* The only version specific structure that we need is - * TMU_CONFIG_PARAMETER_1. This didn't seem to change significantly from - * previous V3D versions and we don't expect that to change, so for now let's - * just hardcode the V3D version here. - */ -#define V3D_VERSION 41 -#include "broadcom/common/v3d_macros.h" -#include "broadcom/cle/v3dx_pack.h" /* Our Vulkan resource indices represent indices in descriptor maps which * include all shader stages, so we need to size the arrays below @@ -57,7 +47,8 @@ struct state_bo_list { struct v3dv_bo *states[MAX_TOTAL_STATES]; }; -#define MAX_TOTAL_UNIFORM_BUFFERS (1 + MAX_UNIFORM_BUFFERS * MAX_STAGES) +#define MAX_TOTAL_UNIFORM_BUFFERS ((MAX_UNIFORM_BUFFERS + \ + MAX_INLINE_UNIFORM_BUFFERS) * MAX_STAGES) #define MAX_TOTAL_STORAGE_BUFFERS (MAX_STORAGE_BUFFERS * MAX_STAGES) struct buffer_bo_list { struct v3dv_bo *ubo[MAX_TOTAL_UNIFORM_BUFFERS]; @@ -74,29 +65,36 @@ state_bo_in_list(struct state_bo_list *list, struct v3dv_bo *bo) return false; } +static void +push_constants_bo_free(VkDevice _device, + uint64_t bo_ptr, + VkAllocationCallbacks *alloc) +{ + V3DV_FROM_HANDLE(v3dv_device, device, _device); + v3dv_bo_free(device, (struct v3dv_bo *)(uintptr_t) bo_ptr); +} + /* * This method checks if the ubo used for push constants is needed to be * updated or not. * - * push contants ubo is only used for push constants accessed by a non-const + * push constants ubo is only used for push constants accessed by a non-const * index. - * - * FIXME: right now for this cases we are uploading the full - * push_constants_data. An improvement would be to upload only the data that - * we need to rely on a UBO. 
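
Why a non-const index forces a UBO: with a constant index the compiler can bake the value straight into the uniform stream at record time (QUNIFORM_UNIFORM), while a dynamic index means the shader must address the data itself (QUNIFORM_UBO_ADDR). A conceptual sketch with hypothetical names, not driver code:

#include <stdint.h>

static void
emit_push_constant_uniform(uint32_t **stream, const uint32_t *push_data,
                           int const_index, uint32_t ubo_gpu_addr)
{
   if (const_index >= 0)
      *(*stream)++ = push_data[const_index];  /* QUNIFORM_UNIFORM */
   else
      *(*stream)++ = ubo_gpu_addr;            /* QUNIFORM_UBO_ADDR */
}
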
*/ static void check_push_constants_ubo(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_pipeline *pipeline) { - if (!(cmd_buffer->state.dirty & V3DV_CMD_DIRTY_PUSH_CONSTANTS) || + if (!(cmd_buffer->state.dirty & V3DV_CMD_DIRTY_PUSH_CONSTANTS_UBO) || pipeline->layout->push_constant_size == 0) return; if (cmd_buffer->push_constants_resource.bo == NULL) { cmd_buffer->push_constants_resource.bo = - v3dv_bo_alloc(cmd_buffer->device, MAX_PUSH_CONSTANTS_SIZE, - "push constants", true); + v3dv_bo_alloc(cmd_buffer->device, 4096, "push constants", true); + + v3dv_job_add_bo(cmd_buffer->state.job, + cmd_buffer->push_constants_resource.bo); if (!cmd_buffer->push_constants_resource.bo) { fprintf(stderr, "Failed to allocate memory for push constants\n"); @@ -105,28 +103,41 @@ check_push_constants_ubo(struct v3dv_cmd_buffer *cmd_buffer, bool ok = v3dv_bo_map(cmd_buffer->device, cmd_buffer->push_constants_resource.bo, - MAX_PUSH_CONSTANTS_SIZE); + cmd_buffer->push_constants_resource.bo->size); if (!ok) { fprintf(stderr, "failed to map push constants buffer\n"); abort(); } } else { - if (cmd_buffer->push_constants_resource.offset + MAX_PUSH_CONSTANTS_SIZE <= + if (cmd_buffer->push_constants_resource.offset + + cmd_buffer->state.push_constants_size <= cmd_buffer->push_constants_resource.bo->size) { - cmd_buffer->push_constants_resource.offset += MAX_PUSH_CONSTANTS_SIZE; + cmd_buffer->push_constants_resource.offset += + cmd_buffer->state.push_constants_size; } else { - /* FIXME: we got out of space for push descriptors. Should we create - * a new bo? This could be easier with a uploader + /* We ran out of space so we'll have to allocate a new buffer but we + * need to ensure the old one is preserved until the end of the command + * buffer life and make sure it is eventually freed. We use the + * private object machinery in the command buffer for this. */ + v3dv_cmd_buffer_add_private_obj( + cmd_buffer, (uintptr_t) cmd_buffer->push_constants_resource.bo, + (v3dv_cmd_buffer_private_obj_destroy_cb) push_constants_bo_free); + + /* Now call back so we create a new BO */ + cmd_buffer->push_constants_resource.bo = NULL; + check_push_constants_ubo(cmd_buffer, pipeline); + return; } } + assert(cmd_buffer->state.push_constants_size <= MAX_PUSH_CONSTANTS_SIZE); memcpy(cmd_buffer->push_constants_resource.bo->map + cmd_buffer->push_constants_resource.offset, - cmd_buffer->push_constants_data, - MAX_PUSH_CONSTANTS_SIZE); + cmd_buffer->state.push_constants_data, + cmd_buffer->state.push_constants_size); - cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_PUSH_CONSTANTS; + cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_PUSH_CONSTANTS_UBO; } /** V3D 4.x TMU configuration parameter 0 (texture) */ @@ -203,11 +214,8 @@ write_tmu_p1(struct v3dv_cmd_buffer *cmd_buffer, /* Set unnormalized coordinates flag from sampler object */ uint32_t p1_packed = v3d_unit_data_get_offset(data); if (sampler->unnormalized_coordinates) { - struct V3DX(TMU_CONFIG_PARAMETER_1) p1_unpacked; - V3DX(TMU_CONFIG_PARAMETER_1_unpack)((uint8_t *)&p1_packed, &p1_unpacked); - p1_unpacked.unnormalized_coordinates = true; - V3DX(TMU_CONFIG_PARAMETER_1_pack)(NULL, (uint8_t *)&p1_packed, - &p1_unpacked); + v3d_pack_unnormalized_coordinates(&cmd_buffer->device->devinfo, &p1_packed, + sampler->unnormalized_coordinates); } cl_aligned_u32(uniforms, sampler_state_reloc.bo->offset + @@ -248,13 +256,14 @@ write_ubo_ssbo_uniforms(struct v3dv_cmd_buffer *cmd_buffer, uint32_t dynamic_offset = 0; - /* For ubos, index is shifted, as 0 is reserved for push constants. 
+   /* For ubos, index is shifted, as 0 is reserved for push constants
+    * and 1..MAX_INLINE_UNIFORM_BUFFERS are reserved for inline uniform
+    * buffers.
     */
-   if (content == QUNIFORM_UBO_ADDR &&
-       v3d_unit_data_get_unit(data) == 0) {
-      /* This calls is to ensure that the push_constant_ubo is
-       * updated. It already take into account it is should do the
-       * update or not
+   uint32_t index = v3d_unit_data_get_unit(data);
+   if (content == QUNIFORM_UBO_ADDR && index == 0) {
+      /* Ensure the push constants UBO is created and updated. This also
+       * adds the BO to the job so we don't need to track it in buffer_bos.
        */
       check_push_constants_ubo(cmd_buffer, pipeline);

@@ -265,42 +274,99 @@ write_ubo_ssbo_uniforms(struct v3dv_cmd_buffer *cmd_buffer,

       cl_aligned_u32(uniforms, resource->bo->offset +
                                resource->offset +
                                offset + dynamic_offset);
-      buffer_bos->ubo[0] = resource->bo;
    } else {
-      uint32_t index =
-         content == QUNIFORM_UBO_ADDR ?
-         v3d_unit_data_get_unit(data) - 1 :
-         data;
+      if (content == QUNIFORM_UBO_ADDR) {
+         /* We reserve UBO index 0 for push constants in Vulkan (and for the
+          * constant buffer in GL), so the compiler always adds one to all UBO
+          * indices; fix it up before we access the descriptor map, since
+          * indices there start from 0.
+          */
+         assert(index > 0);
+         index--;
+      } else {
+         index = data;
+      }

       struct v3dv_descriptor *descriptor =
          v3dv_descriptor_map_get_descriptor(descriptor_state, map,
                                             pipeline->layout,
                                             index, &dynamic_offset);
+
+      /* Inline UBO descriptors store UBO data in descriptor pool memory
+       * instead of an external buffer.
+       */
       assert(descriptor);
-      assert(descriptor->buffer);
-      assert(descriptor->buffer->mem);
-      assert(descriptor->buffer->mem->bo);

       if (content == QUNIFORM_GET_SSBO_SIZE ||
           content == QUNIFORM_GET_UBO_SIZE) {
          cl_aligned_u32(uniforms, descriptor->range);
       } else {
-         cl_aligned_u32(uniforms, descriptor->buffer->mem->bo->offset +
-                                  descriptor->buffer->mem_offset +
-                                  descriptor->offset +
-                                  offset + dynamic_offset);
+         /* Inline uniform buffers store their contents in pool memory instead
+          * of an external buffer.
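
Inline uniform blocks make that concrete: their words live in a CPU-mapped descriptor-pool BO, so feeding one into the uniform stream is a plain memory read, as write_inline_uniform below does via v3dv_cl_reloc. A stand-alone sketch with stand-in parameters:

#include <stdint.h>

/* bo_map/bo_offset stand in for the v3dv_cl_reloc fields; word_offset is
 * in 32-bit units, matching the "4 * offset" in write_inline_uniform.
 */
static uint32_t
read_inline_uniform_word(const uint8_t *bo_map, uint32_t bo_offset,
                         uint32_t word_offset)
{
   const uint32_t *words = (const uint32_t *)(bo_map + bo_offset);
   return words[word_offset];
}
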
+ */ + struct v3dv_bo *bo; + uint32_t addr; + if (descriptor->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { + assert(dynamic_offset == 0); + struct v3dv_cl_reloc reloc = + v3dv_descriptor_map_get_descriptor_bo(cmd_buffer->device, + descriptor_state, map, + pipeline->layout, index, + NULL); + bo = reloc.bo; + addr = reloc.bo->offset + reloc.offset + offset; + } else { + assert(descriptor->buffer); + assert(descriptor->buffer->mem); + assert(descriptor->buffer->mem->bo); + + bo = descriptor->buffer->mem->bo; + addr = bo->offset + + descriptor->buffer->mem_offset + + descriptor->offset + + offset + dynamic_offset; + } + + cl_aligned_u32(uniforms, addr); if (content == QUNIFORM_UBO_ADDR) { - assert(index + 1 < MAX_TOTAL_UNIFORM_BUFFERS); - buffer_bos->ubo[index + 1] = descriptor->buffer->mem->bo; + assert(index < MAX_TOTAL_UNIFORM_BUFFERS); + buffer_bos->ubo[index] = bo; } else { assert(index < MAX_TOTAL_STORAGE_BUFFERS); - buffer_bos->ssbo[index] = descriptor->buffer->mem->bo; + buffer_bos->ssbo[index] = bo; } } } } +static void +write_inline_uniform(struct v3dv_cl_out **uniforms, + uint32_t index, + uint32_t offset, + struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_pipeline *pipeline, + enum broadcom_shader_stage stage) +{ + assert(index < MAX_INLINE_UNIFORM_BUFFERS); + + struct v3dv_descriptor_state *descriptor_state = + v3dv_cmd_buffer_get_descriptor_state(cmd_buffer, pipeline); + + struct v3dv_descriptor_map *map = + &pipeline->shared_data->maps[stage]->ubo_map; + + struct v3dv_cl_reloc reloc = + v3dv_descriptor_map_get_descriptor_bo(cmd_buffer->device, + descriptor_state, map, + pipeline->layout, index, + NULL); + + /* Offset comes in 32-bit units */ + uint32_t *addr = reloc.bo->map + reloc.offset + 4 * offset; + cl_aligned_u32(uniforms, *addr); +} + static uint32_t get_texture_size_from_image_view(struct v3dv_image_view *image_view, enum quniform_contents contents, @@ -420,7 +486,6 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_cl_reloc uniform_stream = v3dv_cl_get_address(&job->indirect); struct v3dv_cl_out *uniforms = cl_start(&job->indirect); - for (int i = 0; i < uinfo->count; i++) { uint32_t data = uinfo->data[i]; @@ -430,24 +495,45 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer, break; case QUNIFORM_UNIFORM: - cl_aligned_u32(&uniforms, cmd_buffer->push_constants_data[data]); + cl_aligned_u32(&uniforms, cmd_buffer->state.push_constants_data[data]); break; - case QUNIFORM_VIEWPORT_X_SCALE: - cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * 256.0f); + case QUNIFORM_INLINE_UBO_0: + case QUNIFORM_INLINE_UBO_1: + case QUNIFORM_INLINE_UBO_2: + case QUNIFORM_INLINE_UBO_3: + write_inline_uniform(&uniforms, + uinfo->contents[i] - QUNIFORM_INLINE_UBO_0, data, + cmd_buffer, pipeline, variant->stage); break; - case QUNIFORM_VIEWPORT_Y_SCALE: - cl_aligned_f(&uniforms, dynamic->viewport.scale[0][1] * 256.0f); + case QUNIFORM_VIEWPORT_X_SCALE: { + float clipper_xy_granularity = V3DV_X(cmd_buffer->device, CLIPPER_XY_GRANULARITY); + cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * clipper_xy_granularity); break; + } - case QUNIFORM_VIEWPORT_Z_OFFSET: - cl_aligned_f(&uniforms, dynamic->viewport.translate[0][2]); + case QUNIFORM_VIEWPORT_Y_SCALE: { + float clipper_xy_granularity = V3DV_X(cmd_buffer->device, CLIPPER_XY_GRANULARITY); + cl_aligned_f(&uniforms, dynamic->viewport.scale[0][1] * clipper_xy_granularity); break; + } - case QUNIFORM_VIEWPORT_Z_SCALE: - cl_aligned_f(&uniforms, dynamic->viewport.scale[0][2]); + case 
QUNIFORM_VIEWPORT_Z_OFFSET: { + float translate_z; + v3dv_cmd_buffer_state_get_viewport_z_xform(cmd_buffer, 0, + &translate_z, NULL); + cl_aligned_f(&uniforms, translate_z); break; + } + + case QUNIFORM_VIEWPORT_Z_SCALE: { + float scale_z; + v3dv_cmd_buffer_state_get_viewport_z_xform(cmd_buffer, 0, + NULL, &scale_z); + cl_aligned_f(&uniforms, scale_z); + break; + } case QUNIFORM_SSBO_OFFSET: case QUNIFORM_UBO_ADDR: @@ -527,9 +613,9 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer, } else if (cmd_buffer->state.framebuffer) { num_layers = cmd_buffer->state.framebuffer->layers; } else { - assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); + assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); num_layers = 2048; -#if DEBUG +#if MESA_DEBUG fprintf(stderr, "Skipping gl_LayerID shader sanity check for " "secondary command buffer\n"); #endif @@ -571,6 +657,20 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer, cl_aligned_u32(&uniforms, pipeline->spill.size_per_thread); break; + case QUNIFORM_DRAW_ID: + cl_aligned_u32(&uniforms, job->cmd_buffer->state.draw_id); + break; + + case QUNIFORM_LINE_WIDTH: + cl_aligned_u32(&uniforms, + job->cmd_buffer->vk.dynamic_graphics_state.rs.line.width); + break; + + case QUNIFORM_AA_LINE_WIDTH: + cl_aligned_u32(&uniforms, + v3dv_get_aa_line_width(pipeline, job->cmd_buffer)); + break; + default: unreachable("unsupported quniform_contents uniform type\n"); } diff --git a/src/broadcom/vulkan/v3dv_wsi.c b/src/broadcom/vulkan/v3dv_wsi.c index 23c542cbc05..78af39448ce 100644 --- a/src/broadcom/vulkan/v3dv_wsi.c +++ b/src/broadcom/vulkan/v3dv_wsi.c @@ -1,5 +1,5 @@ /* - * Copyright © 2020 Raspberry Pi + * Copyright © 2020 Raspberry Pi Ltd * based on intel anv code: * Copyright © 2015 Intel Corporation @@ -24,123 +24,40 @@ */ #include "v3dv_private.h" -#include "drm-uapi/drm_fourcc.h" -#include "vk_format_info.h" #include "vk_util.h" #include "wsi_common.h" +#include "wsi_common_drm.h" +#include "wsi_common_entrypoints.h" static VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL v3dv_wsi_proc_addr(VkPhysicalDevice physicalDevice, const char *pName) { V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physicalDevice); - PFN_vkVoidFunction func; - - func = vk_instance_dispatch_table_get(&pdevice->vk.instance->dispatch_table, pName); - if (func != NULL) - return func; - - func = vk_physical_device_dispatch_table_get(&pdevice->vk.dispatch_table, pName); - if (func != NULL) - return func; - - return vk_device_dispatch_table_get(&vk_device_trampolines, pName); + return vk_instance_get_proc_addr_unchecked(pdevice->vk.instance, pName); } static bool v3dv_wsi_can_present_on_device(VkPhysicalDevice _pdevice, int fd) { V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, _pdevice); - - drmDevicePtr fd_devinfo, display_devinfo; - int ret; - - ret = drmGetDevice2(fd, 0, &fd_devinfo); - if (ret) - return false; - - ret = drmGetDevice2(pdevice->display_fd, 0, &display_devinfo); - if (ret) { - drmFreeDevice(&fd_devinfo); - return false; - } - - bool result = drmDevicesEqual(fd_devinfo, display_devinfo); - - drmFreeDevice(&fd_devinfo); - drmFreeDevice(&display_devinfo); - return result; + assert(pdevice->display_fd != -1); + return wsi_common_drm_devices_equal(fd, pdevice->display_fd); } -VkResult -v3dv_wsi_init(struct v3dv_physical_device *physical_device) -{ - VkResult result; - - result = wsi_device_init(&physical_device->wsi_device, - v3dv_physical_device_to_handle(physical_device), - v3dv_wsi_proc_addr, - 
                            &physical_device->vk.instance->alloc,
-                            physical_device->master_fd, NULL, false);
-
-   if (result != VK_SUCCESS)
-      return result;
-
-   physical_device->wsi_device.supports_modifiers = true;
-   physical_device->wsi_device.can_present_on_device =
-      v3dv_wsi_can_present_on_device;
-
-   return VK_SUCCESS;
-}
-
-void
-v3dv_wsi_finish(struct v3dv_physical_device *physical_device)
-{
-   wsi_device_finish(&physical_device->wsi_device,
-                     &physical_device->vk.instance->alloc);
-}
-
-VKAPI_ATTR void VKAPI_CALL
-v3dv_DestroySurfaceKHR(
-    VkInstance                                   _instance,
-    VkSurfaceKHR                                 _surface,
-    const VkAllocationCallbacks*                 pAllocator)
+static void
+filter_surface_capabilities(VkSurfaceKHR _surface,
+                            VkSurfaceCapabilitiesKHR *caps)
 {
-   V3DV_FROM_HANDLE(v3dv_instance, instance, _instance);
    ICD_FROM_HANDLE(VkIcdSurfaceBase, surface, _surface);
-
-   if (!surface)
-      return;
-
-   vk_free2(&instance->vk.alloc, pAllocator, surface);
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_GetPhysicalDeviceSurfaceSupportKHR(
-    VkPhysicalDevice                            physicalDevice,
-    uint32_t                                    queueFamilyIndex,
-    VkSurfaceKHR                                surface,
-    VkBool32*                                   pSupported)
-{
-   V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice);
-
-   return wsi_common_get_surface_support(&device->wsi_device,
-                                         queueFamilyIndex,
-                                         surface,
-                                         pSupported);
-}
-
-static void
-constraint_surface_capabilities(VkSurfaceCapabilitiesKHR *caps)
-{
-   /* Our display pipeline requires that images are linear, so we cannot
-    * ensure that our swapchain images can be sampled. If we are running under
-    * a compositor in windowed mode, the DRM modifier negotiation should
-    * probably end up selecting an UIF layout for the swapchain images but it
-    * may still choose linear and send images directly for scanout if the
-    * surface is in fullscreen mode for example. If we are not running under
-    * a compositor, then we would always need them to be linear anyway.
+   /* Display images must be linear, so their supported usage is restricted.
+    * This would affect sampling usage too, but we don't restrict that one
+    * since we support on-the-fly conversion to UIF when sampling simple 2D
+    * images, at a performance penalty.
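
The new code therefore delegates to the shared WSI entrypoints and only post-processes their answer. A sketch of that delegate-then-filter pattern; is_display stands in for the VK_ICD_WSI_PLATFORM_DISPLAY check performed by filter_surface_capabilities:

#include <stdbool.h>
#include <vulkan/vulkan.h>
#include "wsi_common_entrypoints.h"

static VkResult
get_filtered_surface_caps(VkPhysicalDevice pdev, VkSurfaceKHR surface,
                          bool is_display, VkSurfaceCapabilitiesKHR *caps)
{
   /* Let the shared WSI code compute the generic capabilities... */
   VkResult result =
      wsi_GetPhysicalDeviceSurfaceCapabilitiesKHR(pdev, surface, caps);
   /* ...then subtract what the scanout path cannot honor. */
   if (result == VK_SUCCESS && is_display)
      caps->supportedUsageFlags &= ~VK_IMAGE_USAGE_STORAGE_BIT;
   return result;
}
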
*/ - caps->supportedUsageFlags &= ~VK_IMAGE_USAGE_SAMPLED_BIT; + if (surface->platform == VK_ICD_WSI_PLATFORM_DISPLAY) + caps->supportedUsageFlags &= ~VK_IMAGE_USAGE_STORAGE_BIT; } VKAPI_ATTR VkResult VKAPI_CALL @@ -149,13 +66,11 @@ v3dv_GetPhysicalDeviceSurfaceCapabilitiesKHR( VkSurfaceKHR surface, VkSurfaceCapabilitiesKHR* pSurfaceCapabilities) { - V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice); - VkResult result; - result = wsi_common_get_surface_capabilities(&device->wsi_device, - surface, - pSurfaceCapabilities); - constraint_surface_capabilities(pSurfaceCapabilities); + result = wsi_GetPhysicalDeviceSurfaceCapabilitiesKHR(physicalDevice, + surface, + pSurfaceCapabilities); + filter_surface_capabilities(surface, pSurfaceCapabilities); return result; } @@ -165,227 +80,50 @@ v3dv_GetPhysicalDeviceSurfaceCapabilities2KHR( const VkPhysicalDeviceSurfaceInfo2KHR* pSurfaceInfo, VkSurfaceCapabilities2KHR* pSurfaceCapabilities) { - V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice); - VkResult result; - result = wsi_common_get_surface_capabilities2(&device->wsi_device, - pSurfaceInfo, - pSurfaceCapabilities); - constraint_surface_capabilities(&pSurfaceCapabilities->surfaceCapabilities); + result = wsi_GetPhysicalDeviceSurfaceCapabilities2KHR(physicalDevice, + pSurfaceInfo, + pSurfaceCapabilities); + filter_surface_capabilities(pSurfaceInfo->surface, + &pSurfaceCapabilities->surfaceCapabilities); return result; } -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetPhysicalDeviceSurfaceFormatsKHR( - VkPhysicalDevice physicalDevice, - VkSurfaceKHR surface, - uint32_t* pSurfaceFormatCount, - VkSurfaceFormatKHR* pSurfaceFormats) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice); - - return wsi_common_get_surface_formats(&device->wsi_device, surface, - pSurfaceFormatCount, pSurfaceFormats); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetPhysicalDeviceSurfaceFormats2KHR( - VkPhysicalDevice physicalDevice, - const VkPhysicalDeviceSurfaceInfo2KHR* pSurfaceInfo, - uint32_t* pSurfaceFormatCount, - VkSurfaceFormat2KHR* pSurfaceFormats) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice); - - return wsi_common_get_surface_formats2(&device->wsi_device, pSurfaceInfo, - pSurfaceFormatCount, pSurfaceFormats); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetPhysicalDeviceSurfacePresentModesKHR( - VkPhysicalDevice physicalDevice, - VkSurfaceKHR surface, - uint32_t* pPresentModeCount, - VkPresentModeKHR* pPresentModes) +VkResult +v3dv_wsi_init(struct v3dv_physical_device *physical_device) { - V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice); - - return wsi_common_get_surface_present_modes(&device->wsi_device, surface, - pPresentModeCount, - pPresentModes); -} + VkResult result; -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_CreateSwapchainKHR( - VkDevice _device, - const VkSwapchainCreateInfoKHR* pCreateInfo, - const VkAllocationCallbacks* pAllocator, - VkSwapchainKHR* pSwapchain) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - struct v3dv_instance *instance = device->instance; - struct v3dv_physical_device *pdevice = &instance->physicalDevice; - struct wsi_device *wsi_device = &pdevice->wsi_device; + result = wsi_device_init(&physical_device->wsi_device, + v3dv_physical_device_to_handle(physical_device), + v3dv_wsi_proc_addr, + &physical_device->vk.instance->alloc, + physical_device->display_fd, NULL, + &(struct wsi_device_options){.sw_device = false}); - ICD_FROM_HANDLE(VkIcdSurfaceBase, surface, pCreateInfo->surface); - VkResult result = - 
v3dv_physical_device_acquire_display(instance, pdevice, surface); if (result != VK_SUCCESS) return result; - const VkAllocationCallbacks *alloc; - if (pAllocator) - alloc = pAllocator; - else - alloc = &device->vk.alloc; - - return wsi_common_create_swapchain(wsi_device, _device, - pCreateInfo, alloc, pSwapchain); -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_DestroySwapchainKHR( - VkDevice _device, - VkSwapchainKHR swapchain, - const VkAllocationCallbacks* pAllocator) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - const VkAllocationCallbacks *alloc; + physical_device->wsi_device.supports_modifiers = true; + physical_device->wsi_device.can_present_on_device = + v3dv_wsi_can_present_on_device; - if (pAllocator) - alloc = pAllocator; - else - alloc = &device->vk.alloc; + physical_device->vk.wsi_device = &physical_device->wsi_device; - wsi_common_destroy_swapchain(_device, swapchain, alloc); + return VK_SUCCESS; } -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetSwapchainImagesKHR( - VkDevice device, - VkSwapchainKHR swapchain, - uint32_t* pSwapchainImageCount, - VkImage* pSwapchainImages) +void +v3dv_wsi_finish(struct v3dv_physical_device *physical_device) { - return wsi_common_get_images(swapchain, - pSwapchainImageCount, - pSwapchainImages); + physical_device->vk.wsi_device = NULL; + wsi_device_finish(&physical_device->wsi_device, + &physical_device->vk.instance->alloc); } struct v3dv_image * v3dv_wsi_get_image_from_swapchain(VkSwapchainKHR swapchain, uint32_t index) { - uint32_t n_images = index + 1; - VkImage *images = malloc(sizeof(*images) * n_images); - VkResult result = wsi_common_get_images(swapchain, &n_images, images); - - if (result != VK_SUCCESS && result != VK_INCOMPLETE) { - free(images); - return NULL; - } - - V3DV_FROM_HANDLE(v3dv_image, image, images[index]); - free(images); - - return image; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_AcquireNextImageKHR( - VkDevice device, - VkSwapchainKHR swapchain, - uint64_t timeout, - VkSemaphore semaphore, - VkFence fence, - uint32_t* pImageIndex) -{ - VkAcquireNextImageInfoKHR acquire_info = { - .sType = VK_STRUCTURE_TYPE_ACQUIRE_NEXT_IMAGE_INFO_KHR, - .swapchain = swapchain, - .timeout = timeout, - .semaphore = semaphore, - .fence = fence, - .deviceMask = 0, - }; - - return v3dv_AcquireNextImage2KHR(device, &acquire_info, pImageIndex); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_AcquireNextImage2KHR( - VkDevice _device, - const VkAcquireNextImageInfoKHR* pAcquireInfo, - uint32_t* pImageIndex) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_fence, fence, pAcquireInfo->fence); - V3DV_FROM_HANDLE(v3dv_semaphore, semaphore, pAcquireInfo->semaphore); - - struct v3dv_physical_device *pdevice = &device->instance->physicalDevice; - - VkResult result; - result = wsi_common_acquire_next_image2(&pdevice->wsi_device, _device, - pAcquireInfo, pImageIndex); - - if (result == VK_SUCCESS || result == VK_SUBOPTIMAL_KHR) { - if (fence) - drmSyncobjSignal(pdevice->render_fd, &fence->sync, 1); - if (semaphore) - drmSyncobjSignal(pdevice->render_fd, &semaphore->sync, 1); - } - - return result; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_QueuePresentKHR( - VkQueue _queue, - const VkPresentInfoKHR* pPresentInfo) -{ - V3DV_FROM_HANDLE(v3dv_queue, queue, _queue); - struct v3dv_physical_device *pdevice = - &queue->device->instance->physicalDevice; - - return wsi_common_queue_present(&pdevice->wsi_device, - v3dv_device_to_handle(queue->device), - _queue, 0, - pPresentInfo); -} - -VKAPI_ATTR VkResult VKAPI_CALL 
-v3dv_GetDeviceGroupPresentCapabilitiesKHR( - VkDevice device, - VkDeviceGroupPresentCapabilitiesKHR* pCapabilities) -{ - memset(pCapabilities->presentMask, 0, - sizeof(pCapabilities->presentMask)); - pCapabilities->presentMask[0] = 0x1; - pCapabilities->modes = VK_DEVICE_GROUP_PRESENT_MODE_LOCAL_BIT_KHR; - - return VK_SUCCESS; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetDeviceGroupSurfacePresentModesKHR( - VkDevice device, - VkSurfaceKHR surface, - VkDeviceGroupPresentModeFlagsKHR* pModes) -{ - *pModes = VK_DEVICE_GROUP_PRESENT_MODE_LOCAL_BIT_KHR; - - return VK_SUCCESS; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetPhysicalDevicePresentRectanglesKHR( - VkPhysicalDevice physicalDevice, - VkSurfaceKHR surface, - uint32_t* pRectCount, - VkRect2D* pRects) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice); - - return wsi_common_get_present_rectangles(&device->wsi_device, - surface, - pRectCount, pRects); + VkImage image = wsi_common_get_image(swapchain, index); + return v3dv_image_from_handle(image); } diff --git a/src/broadcom/vulkan/v3dv_wsi_display.c b/src/broadcom/vulkan/v3dv_wsi_display.c deleted file mode 100644 index 3d1cf91ecbe..00000000000 --- a/src/broadcom/vulkan/v3dv_wsi_display.c +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright © 2020 Raspberry Pi - * based on KHR_display extension code: - * Copyright © 2017 Keith Packard - * - * Permission to use, copy, modify, distribute, and sell this software and its - * documentation for any purpose is hereby granted without fee, provided that - * the above copyright notice appear in all copies and that both that copyright - * notice and this permission notice appear in supporting documentation, and - * that the name of the copyright holders not be used in advertising or - * publicity pertaining to distribution of the software without specific, - * written prior permission. The copyright holders make no representations - * about the suitability of this software for any purpose. It is provided "as - * is" without express or implied warranty. - * - * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, - * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO - * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR - * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, - * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER - * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THIS SOFTWARE. 
- */ -#include "v3dv_private.h" -#include "wsi_common_display.h" - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetPhysicalDeviceDisplayPropertiesKHR(VkPhysicalDevice physical_device, - uint32_t *property_count, - VkDisplayPropertiesKHR *properties) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physical_device); - - return wsi_display_get_physical_device_display_properties( - physical_device, - &pdevice->wsi_device, - property_count, - properties); -} - -VkResult -v3dv_GetPhysicalDeviceDisplayProperties2KHR( - VkPhysicalDevice physical_device, - uint32_t *pPropertyCount, - VkDisplayProperties2KHR *pProperties) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physical_device); - - return wsi_display_get_physical_device_display_properties2( - physical_device, - &pdevice->wsi_device, - pPropertyCount, - pProperties); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetPhysicalDeviceDisplayPlanePropertiesKHR( - VkPhysicalDevice physical_device, - uint32_t *property_count, - VkDisplayPlanePropertiesKHR *properties) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physical_device); - - return wsi_display_get_physical_device_display_plane_properties( - physical_device, - &pdevice->wsi_device, - property_count, - properties); -} - -VkResult -v3dv_GetPhysicalDeviceDisplayPlaneProperties2KHR( - VkPhysicalDevice physical_device, - uint32_t *pPropertyCount, - VkDisplayPlaneProperties2KHR *pProperties) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physical_device); - - return wsi_display_get_physical_device_display_plane_properties2( - physical_device, - &pdevice->wsi_device, - pPropertyCount, - pProperties); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetDisplayPlaneSupportedDisplaysKHR(VkPhysicalDevice physical_device, - uint32_t plane_index, - uint32_t *display_count, - VkDisplayKHR *displays) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physical_device); - - return wsi_display_get_display_plane_supported_displays( - physical_device, - &pdevice->wsi_device, - plane_index, - display_count, - displays); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetDisplayModePropertiesKHR(VkPhysicalDevice physical_device, - VkDisplayKHR display, - uint32_t *property_count, - VkDisplayModePropertiesKHR *properties) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physical_device); - - return wsi_display_get_display_mode_properties(physical_device, - &pdevice->wsi_device, - display, - property_count, - properties); -} - -VkResult -v3dv_GetDisplayModeProperties2KHR(VkPhysicalDevice physical_device, - VkDisplayKHR display, - uint32_t *pPropertyCount, - VkDisplayModeProperties2KHR *pProperties) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physical_device); - - return wsi_display_get_display_mode_properties2(physical_device, - &pdevice->wsi_device, - display, - pPropertyCount, - pProperties); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_CreateDisplayModeKHR(VkPhysicalDevice physical_device, - VkDisplayKHR display, - const VkDisplayModeCreateInfoKHR *create_info, - const VkAllocationCallbacks *allocator, - VkDisplayModeKHR *mode) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physical_device); - - return wsi_display_create_display_mode(physical_device, - &pdevice->wsi_device, - display, - create_info, - allocator, - mode); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetDisplayPlaneCapabilitiesKHR(VkPhysicalDevice physical_device, - VkDisplayModeKHR mode_khr, - uint32_t plane_index, - VkDisplayPlaneCapabilitiesKHR *capabilities) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, 
physical_device); - - return wsi_get_display_plane_capabilities(physical_device, - &pdevice->wsi_device, - mode_khr, - plane_index, - capabilities); -} - -VkResult -v3dv_GetDisplayPlaneCapabilities2KHR( - VkPhysicalDevice physical_device, - const VkDisplayPlaneInfo2KHR *pDisplayPlaneInfo, - VkDisplayPlaneCapabilities2KHR *pCapabilities) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physical_device); - - return wsi_get_display_plane_capabilities2(physical_device, - &pdevice->wsi_device, - pDisplayPlaneInfo, - pCapabilities); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_CreateDisplayPlaneSurfaceKHR( - VkInstance _instance, - const VkDisplaySurfaceCreateInfoKHR *create_info, - const VkAllocationCallbacks *allocator, - VkSurfaceKHR *surface) -{ - V3DV_FROM_HANDLE(v3dv_instance, instance, _instance); - const VkAllocationCallbacks *alloc; - - if (allocator) - alloc = allocator; - else - alloc = &instance->vk.alloc; - - return wsi_create_display_surface(_instance, alloc, - create_info, surface); -} diff --git a/src/broadcom/vulkan/v3dv_wsi_wayland.c b/src/broadcom/vulkan/v3dv_wsi_wayland.c deleted file mode 100644 index e61abf3c724..00000000000 --- a/src/broadcom/vulkan/v3dv_wsi_wayland.c +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright © 2020 Ella Stanforth - * based on intel anv code: - * Copyright © 2015 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- */ - -#include "wsi_common_wayland.h" -#include "v3dv_private.h" - -VKAPI_ATTR VkBool32 VKAPI_CALL -v3dv_GetPhysicalDeviceWaylandPresentationSupportKHR( - VkPhysicalDevice physicalDevice, - uint32_t queueFamilyIndex, - struct wl_display* display) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, physical_device, physicalDevice); - - return wsi_wl_get_presentation_support(&physical_device->wsi_device, display); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_CreateWaylandSurfaceKHR( - VkInstance _instance, - const VkWaylandSurfaceCreateInfoKHR* pCreateInfo, - const VkAllocationCallbacks* pAllocator, - VkSurfaceKHR* pSurface) -{ - V3DV_FROM_HANDLE(v3dv_instance, instance, _instance); - const VkAllocationCallbacks *alloc; - assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_WAYLAND_SURFACE_CREATE_INFO_KHR); - - if (pAllocator) - alloc = pAllocator; - else - alloc = &instance->vk.alloc; - - return wsi_create_wl_surface(alloc, pCreateInfo, pSurface); -} diff --git a/src/broadcom/vulkan/v3dv_wsi_x11.c b/src/broadcom/vulkan/v3dv_wsi_x11.c deleted file mode 100644 index 4fa99ccd5ab..00000000000 --- a/src/broadcom/vulkan/v3dv_wsi_x11.c +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright © 2020 Raspberry Pi - * - * based mostly on anv driver which is: - * Copyright © 2015 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- */ - -#include <X11/Xlib-xcb.h> -#include <X11/xshmfence.h> -#include <xcb/xcb.h> -#include <xcb/dri3.h> -#include <xcb/present.h> - -#include "wsi_common_x11.h" -#include "v3dv_private.h" - -VKAPI_ATTR VkBool32 VKAPI_CALL -v3dv_GetPhysicalDeviceXcbPresentationSupportKHR( - VkPhysicalDevice physicalDevice, - uint32_t queueFamilyIndex, - xcb_connection_t* connection, - xcb_visualid_t visual_id) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice); - - return wsi_get_physical_device_xcb_presentation_support( - &device->wsi_device, - queueFamilyIndex, - connection, visual_id); -} - -VKAPI_ATTR VkBool32 VKAPI_CALL -v3dv_GetPhysicalDeviceXlibPresentationSupportKHR( - VkPhysicalDevice physicalDevice, - uint32_t queueFamilyIndex, - Display* dpy, - VisualID visualID) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice); - - return wsi_get_physical_device_xcb_presentation_support( - &device->wsi_device, - queueFamilyIndex, - XGetXCBConnection(dpy), visualID); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_CreateXcbSurfaceKHR( - VkInstance _instance, - const VkXcbSurfaceCreateInfoKHR* pCreateInfo, - const VkAllocationCallbacks* pAllocator, - VkSurfaceKHR* pSurface) -{ - V3DV_FROM_HANDLE(v3dv_instance, instance, _instance); - const VkAllocationCallbacks *alloc; - assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_XCB_SURFACE_CREATE_INFO_KHR); - - if (pAllocator) - alloc = pAllocator; - else - alloc = &instance->vk.alloc; - - return wsi_create_xcb_surface(alloc, pCreateInfo, pSurface); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_CreateXlibSurfaceKHR( - VkInstance _instance, - const VkXlibSurfaceCreateInfoKHR* pCreateInfo, - const VkAllocationCallbacks* pAllocator, - VkSurfaceKHR* pSurface) -{ - V3DV_FROM_HANDLE(v3dv_instance, instance, _instance); - const VkAllocationCallbacks *alloc; - - assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_XLIB_SURFACE_CREATE_INFO_KHR); - - if (pAllocator) - alloc = pAllocator; - else - alloc = &instance->vk.alloc; - - return wsi_create_xlib_surface(alloc, pCreateInfo, pSurface); -} diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c index c2f2c77864b..d7fb087d9a8 100644 --- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c +++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c @@ -1,5 +1,5 @@ /* - * Copyright © 2021 Raspberry Pi + * Copyright © 2021 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -23,14 +23,13 @@ #include "v3dv_private.h" #include "broadcom/common/v3d_macros.h" +#include "broadcom/common/v3d_util.h" #include "broadcom/cle/v3dx_pack.h" #include "broadcom/compiler/v3d_compiler.h" #include "util/half_float.h" -#include "vulkan/util/vk_format.h" #include "util/u_pack_color.h" - -#include "vk_format_info.h" +#include "vk_format.h" void v3dX(job_emit_binning_flush)(struct v3dv_job *job) @@ -44,6 +43,34 @@ v3dX(job_emit_binning_flush)(struct v3dv_job *job) } void +v3dX(job_emit_enable_double_buffer)(struct v3dv_job *job) +{ + assert(job->can_use_double_buffer); + assert(job->frame_tiling.double_buffer); + assert(!job->frame_tiling.msaa); + assert(job->bcl_tile_binning_mode_ptr); + + const struct v3dv_frame_tiling *tiling = &job->frame_tiling; + struct cl_packet_struct(TILE_BINNING_MODE_CFG) config = { + cl_packet_header(TILE_BINNING_MODE_CFG), + }; + config.width_in_pixels = tiling->width; + config.height_in_pixels = tiling->height; +#if V3D_VERSION == 42 + config.number_of_render_targets = 
MAX2(tiling->render_target_count, 1);
+   config.multisample_mode_4x = tiling->msaa;
+   config.double_buffer_in_non_ms_mode = tiling->double_buffer;
+   config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
+#endif
+#if V3D_VERSION >= 71
+   unreachable("HW generation 71 not supported yet.");
+#endif
+
+   uint8_t *rewrite_addr = (uint8_t *)job->bcl_tile_binning_mode_ptr;
+   cl_packet_pack(TILE_BINNING_MODE_CFG)(NULL, rewrite_addr, &config);
+}
+
+void
 v3dX(job_emit_binning_prolog)(struct v3dv_job *job,
                               const struct v3dv_frame_tiling *tiling,
                               uint32_t layers)
@@ -55,12 +82,27 @@ v3dX(job_emit_binning_prolog)(struct v3dv_job *job,
       config.number_of_layers = layers;
    }

+   assert(!tiling->double_buffer || !tiling->msaa);
+
+   job->bcl_tile_binning_mode_ptr = cl_start(&job->bcl);
    cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
       config.width_in_pixels = tiling->width;
       config.height_in_pixels = tiling->height;
+#if V3D_VERSION == 42
       config.number_of_render_targets = MAX2(tiling->render_target_count, 1);
       config.multisample_mode_4x = tiling->msaa;
+      config.double_buffer_in_non_ms_mode = tiling->double_buffer;
       config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
+#endif
+#if V3D_VERSION >= 71
+      config.log2_tile_width = log2_tile_size(tiling->tile_width);
+      config.log2_tile_height = log2_tile_size(tiling->tile_height);
+      /* FIXME: ideally we would like the next assert to live in the packet
+       * header (it is generic, so it also applies to GL), but we would need
+       * to expand gen_pack_header for that.
+       */
+      assert(config.log2_tile_width == config.log2_tile_height ||
+             config.log2_tile_width == config.log2_tile_height + 1);
+#endif
    }

    /* There's definitely nothing in the VCD cache we want. */
@@ -106,18 +148,45 @@ cmd_buffer_render_pass_emit_load(struct v3dv_cmd_buffer *cmd_buffer,
                                  uint32_t buffer)
 {
    const struct v3dv_image *image = (struct v3dv_image *) iview->vk.image;
+
+   /* We don't support rendering to ycbcr images, so the image view should be
+    * single-plane and use a single-plane format. But note that the underlying
+    * image can be a ycbcr format, as we support rendering to a specific plane
+    * of an image. This is used for example on some meta_copy code paths, in
+    * order to copy from/to a plane of a ycbcr image.
+    */
+   assert(iview->plane_count == 1);
+   assert(iview->format->plane_count == 1);
+
+   uint8_t image_plane = v3dv_plane_from_aspect(iview->vk.aspects);
    const struct v3d_resource_slice *slice =
-      &image->slices[iview->vk.base_mip_level];
+      &image->planes[image_plane].slices[iview->vk.base_mip_level];
+
    uint32_t layer_offset = v3dv_layer_offset(image,
                                              iview->vk.base_mip_level,
-                                             iview->vk.base_array_layer + layer);
+                                             iview->vk.base_array_layer + layer, image_plane);

    cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) {
       load.buffer_to_load = buffer;
-      load.address = v3dv_cl_address(image->mem->bo, layer_offset);
+      load.address = v3dv_cl_address(image->planes[image_plane].mem->bo, layer_offset);
+
+      load.input_image_format = iview->format->planes[0].rt_type;
+
+      /* If we create an image view with only the stencil format, we
+       * re-interpret the format as RGBA8_UINT, as it is what we want in
+       * general (see CreateImageView).
+       *
+       * However, when we are loading/storing tiles from the ZSTENCIL tile
+       * buffer, we need to use the underlying DS format.
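
The format override described here reduces to one rule. A stand-alone sketch where the rt-type codes and the ZSTENCIL buffer id (normally taken from the v3dx packet headers) are passed in as plain parameters:

#include <stdbool.h>
#include <stdint.h>

/* A stencil-only view is exposed as RGBA8_UINT in general, but Z/S
 * tile-buffer traffic must use the image's real depth/stencil rt type
 * (e.g. D24S8).
 */
static uint32_t
tile_buffer_format(bool is_zstencil_buffer, bool view_is_rgba8ui_alias,
                   uint32_t view_rt_type, uint32_t image_rt_type)
{
   if (is_zstencil_buffer && view_is_rgba8ui_alias)
      return image_rt_type; /* underlying depth/stencil format */
   return view_rt_type;     /* normal case: the view's rt type */
}
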
+ */ + if (buffer == ZSTENCIL && + iview->format->planes[0].rt_type == V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI) { + assert(image->format->planes[image_plane].rt_type == V3D_OUTPUT_IMAGE_FORMAT_D24S8); + load.input_image_format = image->format->planes[image_plane].rt_type; + } - load.input_image_format = iview->format->rt_type; - load.r_b_swap = iview->swap_rb; + load.r_b_swap = iview->planes[0].swap_rb; + load.channel_reverse = iview->planes[0].channel_reverse; load.memory_format = slice->tiling; if (slice->tiling == V3D_TILING_UIF_NO_XOR || @@ -135,38 +204,6 @@ cmd_buffer_render_pass_emit_load(struct v3dv_cmd_buffer *cmd_buffer, } } -static bool -check_needs_load(const struct v3dv_cmd_buffer_state *state, - VkImageAspectFlags aspect, - uint32_t first_subpass_idx, - VkAttachmentLoadOp load_op) -{ - /* We call this with image->vk.aspects & aspect, so 0 means the aspect we are - * testing does not exist in the image. - */ - if (!aspect) - return false; - - /* Attachment (or view) load operations apply on the first subpass that - * uses the attachment (or view), otherwise we always need to load. - */ - if (state->job->first_subpass > first_subpass_idx) - return true; - - /* If the job is continuing a subpass started in another job, we always - * need to load. - */ - if (state->job->is_subpass_continue) - return true; - - /* If the area is not aligned to tile boundaries, we always need to load */ - if (!state->tile_aligned_render_area) - return true; - - /* The attachment load operations must be LOAD */ - return load_op == VK_ATTACHMENT_LOAD_OP_LOAD; -} - static inline uint32_t v3dv_zs_buffer(bool depth, bool stencil) { @@ -185,7 +222,6 @@ cmd_buffer_render_pass_emit_loads(struct v3dv_cmd_buffer *cmd_buffer, uint32_t layer) { const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; - const struct v3dv_framebuffer *framebuffer = state->framebuffer; const struct v3dv_render_pass *pass = state->pass; const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx]; @@ -222,12 +258,20 @@ cmd_buffer_render_pass_emit_loads(struct v3dv_cmd_buffer *cmd_buffer, attachment->first_subpass : attachment->views[layer].first_subpass; - bool needs_load = check_needs_load(state, - VK_IMAGE_ASPECT_COLOR_BIT, - first_subpass, - attachment->desc.loadOp); + uint32_t last_subpass = !pass->multiview_enabled ? + attachment->last_subpass : + attachment->views[layer].last_subpass; + + bool needs_load = + v3dv_cmd_buffer_check_needs_load(state, + VK_IMAGE_ASPECT_COLOR_BIT, + first_subpass, + attachment->desc.loadOp, + last_subpass, + attachment->desc.storeOp); if (needs_load) { - struct v3dv_image_view *iview = framebuffer->attachments[attachment_idx]; + struct v3dv_image_view *iview = + state->attachments[attachment_idx].image_view; cmd_buffer_render_pass_emit_load(cmd_buffer, cl, iview, layer, RENDER_TARGET_0 + i); } @@ -245,21 +289,29 @@ cmd_buffer_render_pass_emit_loads(struct v3dv_cmd_buffer *cmd_buffer, ds_attachment->first_subpass : ds_attachment->views[layer].first_subpass; + uint32_t ds_last_subpass = !pass->multiview_enabled ? 
+ ds_attachment->last_subpass : + ds_attachment->views[layer].last_subpass; + const bool needs_depth_load = - check_needs_load(state, - ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, - ds_first_subpass, - ds_attachment->desc.loadOp); + v3dv_cmd_buffer_check_needs_load(state, + ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + ds_first_subpass, + ds_attachment->desc.loadOp, + ds_last_subpass, + ds_attachment->desc.storeOp); const bool needs_stencil_load = - check_needs_load(state, - ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT, - ds_first_subpass, - ds_attachment->desc.stencilLoadOp); + v3dv_cmd_buffer_check_needs_load(state, + ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT, + ds_first_subpass, + ds_attachment->desc.stencilLoadOp, + ds_last_subpass, + ds_attachment->desc.stencilStoreOp); if (needs_depth_load || needs_stencil_load) { struct v3dv_image_view *iview = - framebuffer->attachments[ds_attachment_idx]; + state->attachments[ds_attachment_idx].image_view; /* From the Vulkan spec: * * "When an image view of a depth/stencil image is used as a @@ -290,21 +342,53 @@ cmd_buffer_render_pass_emit_store(struct v3dv_cmd_buffer *cmd_buffer, bool is_multisample_resolve) { const struct v3dv_image_view *iview = - cmd_buffer->state.framebuffer->attachments[attachment_idx]; + cmd_buffer->state.attachments[attachment_idx].image_view; const struct v3dv_image *image = (struct v3dv_image *) iview->vk.image; + + /* We don't support rendering to ycbcr images, so the image view should be + * single-plane, and using a single-plane format. But note that the underlying + * image can be a ycbcr format, as we support rendering to a specific plane + * of an image. This is used for example on some meta_copy code paths, in + * order to copy from/to a plane of a ycbcr image. + */ + assert(iview->plane_count == 1); + assert(iview->format->plane_count == 1); + + uint8_t image_plane = v3dv_plane_from_aspect(iview->vk.aspects); const struct v3d_resource_slice *slice = - &image->slices[iview->vk.base_mip_level]; + &image->planes[image_plane].slices[iview->vk.base_mip_level]; uint32_t layer_offset = v3dv_layer_offset(image, iview->vk.base_mip_level, - iview->vk.base_array_layer + layer); + iview->vk.base_array_layer + layer, + image_plane); + + /* The Clear Buffer bit is not supported for Z/Stencil stores in 7.x and it + * is broken in earlier V3D versions. + */ + assert((buffer != Z && buffer != STENCIL && buffer != ZSTENCIL) || !clear); cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) { store.buffer_to_store = buffer; - store.address = v3dv_cl_address(image->mem->bo, layer_offset); + store.address = v3dv_cl_address(image->planes[image_plane].mem->bo, layer_offset); store.clear_buffer_being_stored = clear; - store.output_image_format = iview->format->rt_type; - store.r_b_swap = iview->swap_rb; + store.output_image_format = iview->format->planes[0].rt_type; + + /* If we create an image view with only the stencil format, we + * re-interpret the format as RGBA8_UINT, as it is what we want in + * general (see CreateImageView). + * + * However, when we are loading/storing tiles from the ZSTENCIL tile + * buffer, we need to use the underlying DS format.
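+ * (Same re-interpretation as in cmd_buffer_render_pass_emit_load + * above.)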
+ */ + if (buffer == ZSTENCIL && + iview->format->planes[0].rt_type == V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI) { + assert(image->format->planes[image_plane].rt_type == V3D_OUTPUT_IMAGE_FORMAT_D24S8); + store.output_image_format = image->format->planes[image_plane].rt_type; + } + + store.r_b_swap = iview->planes[0].swap_rb; + store.channel_reverse = iview->planes[0].channel_reverse; store.memory_format = slice->tiling; if (slice->tiling == V3D_TILING_UIF_NO_XOR || @@ -349,7 +433,7 @@ check_needs_clear(const struct v3dv_cmd_buffer_state *state, if (state->job->is_subpass_continue) return false; - /* If the render area is not aligned to tile boudaries we can't use the + /* If the render area is not aligned to tile boundaries we can't use the * TLB for a clear. */ if (!state->tile_aligned_render_area) @@ -366,36 +450,6 @@ check_needs_clear(const struct v3dv_cmd_buffer_state *state, return load_op == VK_ATTACHMENT_LOAD_OP_CLEAR; } -static bool -check_needs_store(const struct v3dv_cmd_buffer_state *state, - VkImageAspectFlags aspect, - uint32_t last_subpass_idx, - VkAttachmentStoreOp store_op) -{ - /* We call this with image->vk.aspects & aspect, so 0 means the aspect we are - * testing does not exist in the image. - */ - if (!aspect) - return false; - - /* Attachment (or view) store operations only apply on the last subpass - * where the attachment (or view) is used, in other subpasses we always - * need to store. - */ - if (state->subpass_idx < last_subpass_idx) - return true; - - /* Attachment store operations only apply on the last job we emit on the the - * last subpass where the attachment is used, otherwise we always need to - * store. - */ - if (!state->job->is_subpass_finish) - return true; - - /* The attachment store operation must be STORE */ - return store_op == VK_ATTACHMENT_STORE_OP_STORE; -} - static void cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_cl *cl, @@ -435,6 +489,30 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, const VkImageAspectFlags aspects = vk_format_aspects(ds_attachment->desc.format); +#if V3D_VERSION <= 42 + /* GFXH-1689: The per-buffer store command's clear buffer bit is broken + * for depth/stencil. + * + * There used to be some confusion regarding the Clear Tile Buffers + * Z/S bit also being broken, but we confirmed with Broadcom that this + * is not the case, it was just that some other hardware bugs (that we + * need to work around, such as GFXH-1461) could cause this bit to behave + * incorrectly. + * + * There used to be another issue where the RTs bit in the Clear Tile + * Buffers packet also cleared Z/S, but Broadcom confirmed this is + * fixed since V3D 4.1. + * + * So if we have to emit a clear of depth or stencil we don't use + * the per-buffer store clear bit, even if we need to store the buffers, + * instead we always have to use the Clear Tile Buffers Z/S bit. + * If we have configured the job to do early Z/S clearing, then we + * don't want to emit any Clear Tile Buffers command at all here. + * + * Note that GFXH-1689 is not reproduced in the simulator, where + * using the clear buffer bit in depth/stencil stores works fine. + */ + /* Only clear once on the first subpass that uses the attachment */ uint32_t ds_first_subpass = !state->pass->multiview_enabled ? 
ds_attachment->first_subpass : @@ -454,47 +532,59 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, ds_attachment->desc.stencilLoadOp, subpass->do_stencil_clear_with_draw); + use_global_zs_clear = !state->job->early_zs_clear && + (needs_depth_clear || needs_stencil_clear); +#endif +#if V3D_VERSION >= 71 + /* The store command's clear buffer bit cannot be used for Z/S stencil: + * since V3D 4.5.6 Z/S buffers are automatically cleared between tiles, + * so we don't want to emit redundant clears here. + */ + use_global_zs_clear = false; +#endif + /* Skip the last store if it is not required */ uint32_t ds_last_subpass = !pass->multiview_enabled ? ds_attachment->last_subpass : ds_attachment->views[layer].last_subpass; bool needs_depth_store = - check_needs_store(state, - aspects & VK_IMAGE_ASPECT_DEPTH_BIT, - ds_last_subpass, - ds_attachment->desc.storeOp); + v3dv_cmd_buffer_check_needs_store(state, + aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + ds_last_subpass, + ds_attachment->desc.storeOp); bool needs_stencil_store = - check_needs_store(state, - aspects & VK_IMAGE_ASPECT_STENCIL_BIT, - ds_last_subpass, - ds_attachment->desc.stencilStoreOp); + v3dv_cmd_buffer_check_needs_store(state, + aspects & VK_IMAGE_ASPECT_STENCIL_BIT, + ds_last_subpass, + ds_attachment->desc.stencilStoreOp); + + /* If we have a resolve, handle it before storing the tile */ + const struct v3dv_cmd_buffer_attachment_state *ds_att_state = + &state->attachments[ds_attachment_idx]; + if (ds_att_state->use_tlb_resolve) { + assert(ds_att_state->has_resolve); + assert(subpass->resolve_depth || subpass->resolve_stencil); + const uint32_t resolve_attachment_idx = + subpass->ds_resolve_attachment.attachment; + assert(resolve_attachment_idx != VK_ATTACHMENT_UNUSED); + + const uint32_t zs_buffer = + v3dv_zs_buffer(subpass->resolve_depth, subpass->resolve_stencil); + cmd_buffer_render_pass_emit_store(cmd_buffer, cl, + resolve_attachment_idx, layer, + zs_buffer, + false, false); + has_stores = true; + } else if (ds_att_state->has_resolve) { + /* If we can't use the TLB to implement the resolve we will need to + * store the attachment so we can implement it later using a blit. + */ + needs_depth_store = subpass->resolve_depth; + needs_stencil_store = subpass->resolve_stencil; + } - /* GFXH-1689: The per-buffer store command's clear buffer bit is broken - * for depth/stencil. - * - * There used to be some confusion regarding the Clear Tile Buffers - * Z/S bit also being broken, but we confirmed with Broadcom that this - * is not the case, it was just that some other hardware bugs (that we - * need to work around, such as GFXH-1461) could cause this bit to behave - * incorrectly. - * - * There used to be another issue where the RTs bit in the Clear Tile - * Buffers packet also cleared Z/S, but Broadcom confirmed this is - * fixed since V3D 4.1. - * - * So if we have to emit a clear of depth or stencil we don't use - * the per-buffer store clear bit, even if we need to store the buffers, - * instead we always have to use the Clear Tile Buffers Z/S bit. - * If we have configured the job to do early Z/S clearing, then we - * don't want to emit any Clear Tile Buffers command at all here. - * - * Note that GFXH-1689 is not reproduced in the simulator, where - * using the clear buffer bit in depth/stencil stores works fine. 
- */ - use_global_zs_clear = !state->job->early_zs_clear && - (needs_depth_clear || needs_stencil_clear); if (needs_depth_store || needs_stencil_store) { const uint32_t zs_buffer = v3dv_zs_buffer(needs_depth_store, needs_stencil_store); @@ -536,10 +626,10 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, attachment->views[layer].last_subpass; bool needs_store = - check_needs_store(state, - VK_IMAGE_ASPECT_COLOR_BIT, - last_subpass, - attachment->desc.storeOp); + v3dv_cmd_buffer_check_needs_store(state, + VK_IMAGE_ASPECT_COLOR_BIT, + last_subpass, + attachment->desc.storeOp); /* If we need to resolve this attachment emit that store first. Notice * that we must not request a tile buffer clear here in that case, since * color attachment store below, since the clear happens after the * store is completed. - * - * If the attachment doesn't support TLB resolves then we will have to - * fallback to doing the resolve in a shader separately after this - * job, so we will need to store the multisampled sttachment even if that - * wansn't requested by the client. + * If the attachment doesn't support TLB resolves (or the render area + * is not aligned to tile boundaries) then we will have to fallback to + * doing the resolve in a shader separately after this job, so we will + * need to store the multisampled attachment even if that wasn't + * requested by the client. */ - const bool needs_resolve = - subpass->resolve_attachments && - subpass->resolve_attachments[i].attachment != VK_ATTACHMENT_UNUSED; - if (needs_resolve && attachment->use_tlb_resolve) { + const struct v3dv_cmd_buffer_attachment_state *att_state = + &state->attachments[attachment_idx]; + if (att_state->use_tlb_resolve) { + assert(att_state->has_resolve); const uint32_t resolve_attachment_idx = subpass->resolve_attachments[i].attachment; cmd_buffer_render_pass_emit_store(cmd_buffer, cl, @@ -563,7 +654,7 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, RENDER_TARGET_0 + i, false, true); has_stores = true; - } else if (needs_resolve) { + } else if (att_state->has_resolve) { needs_store = true; } @@ -591,10 +682,15 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, * bit and instead we have to emit a single clear of all tile buffers. */ if (use_global_zs_clear || use_global_rt_clear) { +#if V3D_VERSION == 42 cl_emit(cl, CLEAR_TILE_BUFFERS, clear) { clear.clear_z_stencil_buffer = use_global_zs_clear; clear.clear_all_render_targets = use_global_rt_clear; } +#endif +#if V3D_VERSION >= 71 + cl_emit(cl, CLEAR_RENDER_TARGETS, clear); +#endif } } @@ -698,11 +794,8 @@ set_rcl_early_z_config(struct v3dv_job *job, bool *early_z_disable, uint32_t *early_z_test_and_update_direction) { - /* If this is true then we have not emitted any draw calls in this job - * and we don't get any benefits form early Z. - */ - if (!job->decided_global_ez_enable) { - assert(job->draw_count == 0); + /* Disable if none of the draw calls in this job enabled EZ */ + if (!job->has_ez_draws) { *early_z_disable = true; return; } @@ -723,6 +816,103 @@ set_rcl_early_z_config(struct v3dv_job *job, } } +/* Note that for v71, the render target cfg packet has just one field that + * combines the internal type and clamp mode. For simplicity we keep just one + * helper. + * + * Note: rt_type is in fact an "enum V3DX(Internal_Type)".
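+ * As a concrete example: on 4.2 an sRGB target returns just + * V3D_RENDER_TARGET_CLAMP_NORM, while on 7.1 a 16F target returns the + * combined V3D_RENDER_TARGET_TYPE_CLAMP_16F value, folding type and + * clamp together.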
+ * + * FIXME: for v71 we are not returning all the possible combinations for + * render target internal type and clamp. For example for int types we are + * always using clamp int, and for 16f we are using clamp none or pos (that + * seems to be the equivalent for no-clamp on 4.2), but not pq or hlg. In + * summary right now we are just porting what we were doing on 4.2 + */ +uint32_t +v3dX(clamp_for_format_and_type)(uint32_t rt_type, + VkFormat vk_format) +{ +#if V3D_VERSION == 42 + if (vk_format_is_int(vk_format)) + return V3D_RENDER_TARGET_CLAMP_INT; + else if (vk_format_is_srgb(vk_format)) + return V3D_RENDER_TARGET_CLAMP_NORM; + else + return V3D_RENDER_TARGET_CLAMP_NONE; +#endif +#if V3D_VERSION >= 71 + switch (rt_type) { + case V3D_INTERNAL_TYPE_8I: + return V3D_RENDER_TARGET_TYPE_CLAMP_8I_CLAMPED; + case V3D_INTERNAL_TYPE_8UI: + return V3D_RENDER_TARGET_TYPE_CLAMP_8UI_CLAMPED; + case V3D_INTERNAL_TYPE_8: + return V3D_RENDER_TARGET_TYPE_CLAMP_8; + case V3D_INTERNAL_TYPE_16I: + return V3D_RENDER_TARGET_TYPE_CLAMP_16I_CLAMPED; + case V3D_INTERNAL_TYPE_16UI: + return V3D_RENDER_TARGET_TYPE_CLAMP_16UI_CLAMPED; + case V3D_INTERNAL_TYPE_16F: + return vk_format_is_srgb(vk_format) ? + V3D_RENDER_TARGET_TYPE_CLAMP_16F_CLAMP_NORM : + V3D_RENDER_TARGET_TYPE_CLAMP_16F; + case V3D_INTERNAL_TYPE_32I: + return V3D_RENDER_TARGET_TYPE_CLAMP_32I_CLAMPED; + case V3D_INTERNAL_TYPE_32UI: + return V3D_RENDER_TARGET_TYPE_CLAMP_32UI_CLAMPED; + case V3D_INTERNAL_TYPE_32F: + return V3D_RENDER_TARGET_TYPE_CLAMP_32F; + default: + unreachable("Unknown internal render target type"); + } + + return V3D_RENDER_TARGET_TYPE_CLAMP_INVALID; +#endif +} + +static void +cmd_buffer_render_pass_setup_render_target(struct v3dv_cmd_buffer *cmd_buffer, + int rt, + uint32_t *rt_bpp, +#if V3D_VERSION == 42 + uint32_t *rt_type, + uint32_t *rt_clamp) +#else + uint32_t *rt_type_clamp) +#endif +{ + const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + + assert(state->subpass_idx < state->pass->subpass_count); + const struct v3dv_subpass *subpass = + &state->pass->subpasses[state->subpass_idx]; + + if (rt >= subpass->color_count) + return; + + struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt]; + const uint32_t attachment_idx = attachment->attachment; + if (attachment_idx == VK_ATTACHMENT_UNUSED) + return; + + assert(attachment_idx < state->framebuffer->attachment_count && + attachment_idx < state->attachment_alloc_count); + struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view; + assert(vk_format_is_color(iview->vk.format)); + + assert(iview->plane_count == 1); + *rt_bpp = iview->planes[0].internal_bpp; +#if V3D_VERSION == 42 + *rt_type = iview->planes[0].internal_type; + *rt_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type, + iview->vk.format); +#endif +#if V3D_VERSION >= 71 + *rt_type_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type, + iview->vk.format); +#endif +} + void v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) { @@ -738,7 +928,7 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) * buffer. 
*/ if (!framebuffer) { - assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); + assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); return; } @@ -756,23 +946,44 @@ const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx]; struct v3dv_cl *rcl = &job->rcl; - /* Comon config must be the first TILE_RENDERING_MODE_CFG and + /* Common config must be the first TILE_RENDERING_MODE_CFG and * Z_STENCIL_CLEAR_VALUES must be last. The ones in between are optional * updates to the previous HW state. */ bool do_early_zs_clear = false; const uint32_t ds_attachment_idx = subpass->ds_attachment.attachment; + assert(!tiling->msaa || !tiling->double_buffer); cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) { config.image_width_pixels = framebuffer->width; config.image_height_pixels = framebuffer->height; config.number_of_render_targets = MAX2(subpass->color_count, 1); config.multisample_mode_4x = tiling->msaa; + config.double_buffer_in_non_ms_mode = tiling->double_buffer; +#if V3D_VERSION == 42 config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; +#endif +#if V3D_VERSION >= 71 + config.log2_tile_width = log2_tile_size(tiling->tile_width); + config.log2_tile_height = log2_tile_size(tiling->tile_height); + /* FIXME: ideally we would like the next assert on the packet header (as + * it is generic, so it also applies to GL). We would need to expand + * gen_pack_header for that. + */ + assert(config.log2_tile_width == config.log2_tile_height || + config.log2_tile_width == config.log2_tile_height + 1); +#endif if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) { const struct v3dv_image_view *iview = - framebuffer->attachments[ds_attachment_idx]; - config.internal_depth_type = iview->internal_type; + state->attachments[ds_attachment_idx].image_view; + + /* At this point the image view should be single-plane. But note that + * the underlying image can be multi-plane, and the image view refers + * to one specific plane. + */ + assert(iview->plane_count == 1); + assert(iview->format->plane_count == 1); + config.internal_depth_type = iview->planes[0].internal_type; set_rcl_early_z_config(job, &config.early_z_disable, @@ -787,6 +998,10 @@ * Early-Z/S clearing is independent of Early Z/S testing, so it is * possible to enable one but not the other so long as their * respective requirements are met. + * + * From V3D 4.5.6, Z/S buffers are always cleared automatically + * between tiles, but we still want to enable early ZS clears + * when Z/S are not loaded or stored.
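+ * (On 7.x this is computed below as: no depth/stencil load and no + * depth/stencil store required.)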
*/ struct v3dv_render_pass_attachment *ds_attachment = &pass->attachments[ds_attachment_idx]; @@ -794,6 +1009,13 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) const VkImageAspectFlags ds_aspects = vk_format_aspects(ds_attachment->desc.format); + bool needs_depth_store = + v3dv_cmd_buffer_check_needs_store(state, + ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + ds_attachment->last_subpass, + ds_attachment->desc.storeOp) || + subpass->resolve_depth; +#if V3D_VERSION <= 42 bool needs_depth_clear = check_needs_clear(state, ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, @@ -801,26 +1023,35 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) ds_attachment->desc.loadOp, subpass->do_depth_clear_with_draw); - bool needs_depth_store = - check_needs_store(state, - ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, - ds_attachment->last_subpass, - ds_attachment->desc.storeOp); - do_early_zs_clear = needs_depth_clear && !needs_depth_store; +#endif +#if V3D_VERSION >= 71 + bool needs_depth_load = + v3dv_cmd_buffer_check_needs_load(state, + ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + ds_attachment->first_subpass, + ds_attachment->desc.loadOp, + ds_attachment->last_subpass, + ds_attachment->desc.storeOp); + do_early_zs_clear = !needs_depth_load && !needs_depth_store; +#endif + if (do_early_zs_clear && vk_format_has_stencil(ds_attachment->desc.format)) { bool needs_stencil_load = - check_needs_load(state, - ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT, - ds_attachment->first_subpass, - ds_attachment->desc.stencilLoadOp); + v3dv_cmd_buffer_check_needs_load(state, + ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT, + ds_attachment->first_subpass, + ds_attachment->desc.stencilLoadOp, + ds_attachment->last_subpass, + ds_attachment->desc.stencilStoreOp); bool needs_stencil_store = - check_needs_store(state, - ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT, - ds_attachment->last_subpass, - ds_attachment->desc.stencilStoreOp); + v3dv_cmd_buffer_check_needs_store(state, + ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT, + ds_attachment->last_subpass, + ds_attachment->desc.stencilStoreOp) || + subpass->resolve_stencil; do_early_zs_clear = !needs_stencil_load && !needs_stencil_store; } @@ -837,25 +1068,38 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) */ job->early_zs_clear = do_early_zs_clear; +#if V3D_VERSION >= 71 + uint32_t base_addr = 0; +#endif for (uint32_t i = 0; i < subpass->color_count; i++) { uint32_t attachment_idx = subpass->color_attachments[i].attachment; - if (attachment_idx == VK_ATTACHMENT_UNUSED) + if (attachment_idx == VK_ATTACHMENT_UNUSED) { +#if V3D_VERSION >= 71 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { + rt.render_target_number = i; + rt.stride = 1; /* Unused */ + } +#endif continue; + } struct v3dv_image_view *iview = - state->framebuffer->attachments[attachment_idx]; + state->attachments[attachment_idx].image_view; + assert(iview->plane_count == 1); const struct v3dv_image *image = (struct v3dv_image *) iview->vk.image; + + uint8_t plane = v3dv_plane_from_aspect(iview->vk.aspects); const struct v3d_resource_slice *slice = - &image->slices[iview->vk.base_mip_level]; + &image->planes[plane].slices[iview->vk.base_mip_level]; - const uint32_t *clear_color = + UNUSED const uint32_t *clear_color = &state->attachments[attachment_idx].clear_value.color[0]; - uint32_t clear_pad = 0; + UNUSED uint32_t clear_pad = 0; if (slice->tiling == V3D_TILING_UIF_NO_XOR || slice->tiling == V3D_TILING_UIF_XOR) { - int uif_block_height = 
v3d_utile_height(image->cpp) * 2; + int uif_block_height = v3d_utile_height(image->planes[plane].cpp) * 2; uint32_t implicit_padded_height = align(framebuffer->height, uif_block_height) / uif_block_height; @@ -866,13 +1110,14 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) } } +#if V3D_VERSION == 42 cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) { clear.clear_color_low_32_bits = clear_color[0]; clear.clear_color_next_24_bits = clear_color[1] & 0xffffff; clear.render_target_number = i; }; - if (iview->internal_bpp >= V3D_INTERNAL_BPP_64) { + if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) { cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) { clear.clear_color_mid_low_32_bits = ((clear_color[1] >> 24) | (clear_color[2] << 8)); @@ -882,29 +1127,81 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) }; } - if (iview->internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) { + if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) { cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) { clear.uif_padded_height_in_uif_blocks = clear_pad; clear.clear_color_high_16_bits = clear_color[3] >> 16; clear.render_target_number = i; }; } +#endif + +#if V3D_VERSION >= 71 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { + rt.clear_color_low_bits = clear_color[0]; + cmd_buffer_render_pass_setup_render_target(cmd_buffer, i, &rt.internal_bpp, + &rt.internal_type_and_clamping); + rt.stride = + v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width, + v3d_internal_bpp_words(rt.internal_bpp)); + rt.base_address = base_addr; + rt.render_target_number = i; + + /* base_addr in multiples of 512 bits. We divide by 8 because stride + * is in 128-bit units, but it is packing 2 rows worth of data, so we + * need to divide it by 2 so it is only 1 row, and then again by 4 so + * it is in 512-bit units. + */ + base_addr += (tiling->tile_height * rt.stride) / 8; + } + + if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) { + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) { + rt.clear_color_mid_bits = /* 40 bits (32 + 8) */ + ((uint64_t) clear_color[1]) | + (((uint64_t) (clear_color[2] & 0xff)) << 32); + rt.render_target_number = i; + } + } + + if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_128) { + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) { + rt.clear_color_top_bits = /* 56 bits (24 + 32) */ + (((uint64_t) (clear_color[2] & 0xffffff00)) >> 8) | + (((uint64_t) (clear_color[3])) << 24); + rt.render_target_number = i; + } + } +#endif + } + +#if V3D_VERSION >= 71 + /* If we don't have any color RTs, we still need to emit one and flag + * it as not used using stride = 1. 
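+ * (This matches the stride = 1 convention used above for + * VK_ATTACHMENT_UNUSED attachment slots.)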
+ */ + if (subpass->color_count == 0) { + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { + rt.stride = 1; + } } +#endif +#if V3D_VERSION == 42 cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { - v3dX(cmd_buffer_render_pass_setup_render_target) + cmd_buffer_render_pass_setup_render_target (cmd_buffer, 0, &rt.render_target_0_internal_bpp, &rt.render_target_0_internal_type, &rt.render_target_0_clamp); - v3dX(cmd_buffer_render_pass_setup_render_target) + cmd_buffer_render_pass_setup_render_target (cmd_buffer, 1, &rt.render_target_1_internal_bpp, &rt.render_target_1_internal_type, &rt.render_target_1_clamp); - v3dX(cmd_buffer_render_pass_setup_render_target) + cmd_buffer_render_pass_setup_render_target (cmd_buffer, 2, &rt.render_target_2_internal_bpp, &rt.render_target_2_internal_type, &rt.render_target_2_clamp); - v3dX(cmd_buffer_render_pass_setup_render_target) + cmd_buffer_render_pass_setup_render_target (cmd_buffer, 3, &rt.render_target_3_internal_bpp, &rt.render_target_3_internal_type, &rt.render_target_3_clamp); } +#endif /* Ends rendering mode config. */ if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) { @@ -944,12 +1241,6 @@ tiling->frame_height_in_supertiles; } - /* Start by clearing the tile buffer. */ - cl_emit(rcl, TILE_COORDINATES, coords) { - coords.tile_column_number = 0; - coords.tile_row_number = 0; - } - /* Emit an initial clear of the tile buffers. This is necessary * for any buffers that should be cleared (since clearing * normally happens at the *end* of the generic tile list), but * changes on V3D 3.x, and 2 dummy stores on 4.x. */ for (int i = 0; i < 2; i++) { - if (i > 0) - cl_emit(rcl, TILE_COORDINATES, coords); + cl_emit(rcl, TILE_COORDINATES, coords); cl_emit(rcl, END_OF_LOADS, end); cl_emit(rcl, STORE_TILE_BUFFER_GENERAL, store) { store.buffer_to_store = NONE; } - if (i == 0 && cmd_buffer->state.tile_aligned_render_area) { + if (cmd_buffer->state.tile_aligned_render_area && + (i == 0 || v3dv_do_double_initial_tile_clear(tiling))) { +#if V3D_VERSION == 42 cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) { clear.clear_z_stencil_buffer = !job->early_zs_clear; clear.clear_all_render_targets = true; } +#endif +#if V3D_VERSION >= 71 + cl_emit(rcl, CLEAR_RENDER_TARGETS, clear_rt); +#endif } cl_emit(rcl, END_OF_TILE_MARKER, end); } @@ -990,11 +1286,51 @@ } void +v3dX(viewport_compute_xform)(const VkViewport *viewport, + float scale[3], + float translate[3]) +{ + float x = viewport->x; + float y = viewport->y; + float half_width = 0.5f * viewport->width; + float half_height = 0.5f * viewport->height; + double n = viewport->minDepth; + double f = viewport->maxDepth; + + scale[0] = half_width; + translate[0] = half_width + x; + scale[1] = half_height; + translate[1] = half_height + y; + + scale[2] = (f - n); + translate[2] = n; + + /* It seems that if the scale is small enough the hardware won't clip + * correctly so we work around this by choosing the smallest scale that + * seems to work. + * + * This case is exercised by CTS: + * dEQP-VK.draw.renderpass.inverted_depth_ranges.nodepthclamp_deltazero + * + * V3D 7.x fixes this by using the new + * CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND. + */ +#if V3D_VERSION <= 42 + const float min_abs_scale = 0.0005f; + if (fabs(scale[2]) < min_abs_scale) + scale[2] = scale[2] < 0 ?
-min_abs_scale : min_abs_scale; +#endif +} + +void v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer) { struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic; - /* FIXME: right now we only support one viewport. viewporst[0] would work - * now, would need to change if we allow multiple viewports + struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + assert(pipeline); + + /* FIXME: right now we don't support multiViewport so viewports[0] would + * work now, but would need to change if we allow multiple viewports. */ float *vptranslate = dynamic->viewport.translate[0]; float *vpscale = dynamic->viewport.scale[0]; @@ -1010,29 +1346,83 @@ v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer) v3dv_cl_ensure_space_with_branch(&job->bcl, required_cl_size); v3dv_return_if_oom(cmd_buffer, NULL); +#if V3D_VERSION == 42 cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) { clip.viewport_half_width_in_1_256th_of_pixel = vpscale[0] * 256.0f; clip.viewport_half_height_in_1_256th_of_pixel = vpscale[1] * 256.0f; } +#endif +#if V3D_VERSION >= 71 + cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) { + clip.viewport_half_width_in_1_64th_of_pixel = vpscale[0] * 64.0f; + clip.viewport_half_height_in_1_64th_of_pixel = vpscale[1] * 64.0f; + } +#endif + float translate_z, scale_z; + v3dv_cmd_buffer_state_get_viewport_z_xform(cmd_buffer, 0, + &translate_z, &scale_z); + +#if V3D_VERSION == 42 cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) { - clip.viewport_z_offset_zc_to_zs = vptranslate[2]; - clip.viewport_z_scale_zc_to_zs = vpscale[2]; + clip.viewport_z_offset_zc_to_zs = translate_z; + clip.viewport_z_scale_zc_to_zs = scale_z; + } +#endif + +#if V3D_VERSION >= 71 + /* If the Z scale is too small guardband clipping may not clip correctly */ + if (fabsf(scale_z) < 0.01f) { + cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND, clip) { + clip.viewport_z_offset_zc_to_zs = translate_z; + clip.viewport_z_scale_zc_to_zs = scale_z; + } + } else { + cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) { + clip.viewport_z_offset_zc_to_zs = translate_z; + clip.viewport_z_scale_zc_to_zs = scale_z; + } } +#endif + cl_emit(&job->bcl, CLIPPER_Z_MIN_MAX_CLIPPING_PLANES, clip) { - /* Vulkan's Z NDC is [0..1], unlile OpenGL which is [-1, 1] */ - float z1 = vptranslate[2]; - float z2 = vptranslate[2] + vpscale[2]; + /* Vulkan's default Z NDC is [0..1]. If 'negative_one_to_one' is enabled, + * we are using OpenGL's [-1, 1] instead. + */ + float z1 = pipeline->negative_one_to_one ? 
translate_z - scale_z : + translate_z; + float z2 = translate_z + scale_z; clip.minimum_zw = MIN2(z1, z2); clip.maximum_zw = MAX2(z1, z2); } cl_emit(&job->bcl, VIEWPORT_OFFSET, vp) { - vp.viewport_centre_x_coordinate = vptranslate[0]; - vp.viewport_centre_y_coordinate = vptranslate[1]; + float vp_fine_x = vptranslate[0]; + float vp_fine_y = vptranslate[1]; + int32_t vp_coarse_x = 0; + int32_t vp_coarse_y = 0; + + /* The fine coordinates must be unsigned, but coarse can be signed */ + if (unlikely(vp_fine_x < 0)) { + int32_t blocks_64 = DIV_ROUND_UP(fabsf(vp_fine_x), 64); + vp_fine_x += 64.0f * blocks_64; + vp_coarse_x -= blocks_64; + } + + if (unlikely(vp_fine_y < 0)) { + int32_t blocks_64 = DIV_ROUND_UP(fabsf(vp_fine_y), 64); + vp_fine_y += 64.0f * blocks_64; + vp_coarse_y -= blocks_64; + } + + vp.fine_x = vp_fine_x; + vp.fine_y = vp_fine_y; + vp.coarse_x = vp_coarse_x; + vp.coarse_y = vp_coarse_y; } - cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_VIEWPORT; + BITSET_CLEAR(cmd_buffer->vk.dynamic_graphics_state.dirty, + MESA_VK_DYNAMIC_VP_VIEWPORTS); } void @@ -1042,52 +1432,62 @@ v3dX(cmd_buffer_emit_stencil)(struct v3dv_cmd_buffer *cmd_buffer) assert(job); struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; - struct v3dv_dynamic_state *dynamic_state = &cmd_buffer->state.dynamic; - - const uint32_t dynamic_stencil_states = V3DV_DYNAMIC_STENCIL_COMPARE_MASK | - V3DV_DYNAMIC_STENCIL_WRITE_MASK | - V3DV_DYNAMIC_STENCIL_REFERENCE; + struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; v3dv_cl_ensure_space_with_branch(&job->bcl, 2 * cl_packet_length(STENCIL_CFG)); v3dv_return_if_oom(cmd_buffer, NULL); + bool any_dynamic_stencil_state = + BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) || + BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) || + BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) || + BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_OP); + bool emitted_stencil = false; - for (uint32_t i = 0; i < 2; i++) { + const struct vk_stencil_test_face_state *front = &dyn->ds.stencil.front; + const struct vk_stencil_test_face_state *back = &dyn->ds.stencil.back; + + const bool needs_front_and_back = any_dynamic_stencil_state ? + memcmp(front, back, sizeof(*front)) != 0 : + pipeline->emit_stencil_cfg[1] == true; + const unsigned stencil_packets = needs_front_and_back ? 2 : 1; + + for (uint32_t i = 0; i < stencil_packets; i++) { if (pipeline->emit_stencil_cfg[i]) { - if (dynamic_state->mask & dynamic_stencil_states) { - cl_emit_with_prepacked(&job->bcl, STENCIL_CFG, - pipeline->stencil_cfg[i], config) { - if (dynamic_state->mask & V3DV_DYNAMIC_STENCIL_COMPARE_MASK) { - config.stencil_test_mask = - i == 0 ? dynamic_state->stencil_compare_mask.front : - dynamic_state->stencil_compare_mask.back; - } - if (dynamic_state->mask & V3DV_DYNAMIC_STENCIL_WRITE_MASK) { - config.stencil_write_mask = - i == 0 ? dynamic_state->stencil_write_mask.front : - dynamic_state->stencil_write_mask.back; - } - if (dynamic_state->mask & V3DV_DYNAMIC_STENCIL_REFERENCE) { - config.stencil_ref_value = - i == 0 ? dynamic_state->stencil_reference.front : - dynamic_state->stencil_reference.back; - } + if (any_dynamic_stencil_state) { + const struct vk_stencil_test_face_state *stencil_state = + i == 0 ? 
front : back; + + /* If we have any dynamic stencil state we just emit the entire + * packet, for simplicity. + */ + cl_emit(&job->bcl, STENCIL_CFG, config) { + config.front_config = !needs_front_and_back || i == 0; + config.back_config = !needs_front_and_back || i == 1; + config.stencil_test_mask = stencil_state->compare_mask & 0xff; + config.stencil_write_mask = stencil_state->write_mask & 0xff; + config.stencil_ref_value = stencil_state->reference & 0xff; + config.stencil_test_function = stencil_state->op.compare; + config.stencil_pass_op = + v3dX(translate_stencil_op)(stencil_state->op.pass); + config.depth_test_fail_op = + v3dX(translate_stencil_op)(stencil_state->op.depth_fail); + config.stencil_test_fail_op = + v3dX(translate_stencil_op)(stencil_state->op.fail); } } else { cl_emit_prepacked(&job->bcl, &pipeline->stencil_cfg[i]); } - emitted_stencil = true; } } - if (emitted_stencil) { - const uint32_t dynamic_stencil_dirty_flags = - V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK | - V3DV_CMD_DIRTY_STENCIL_WRITE_MASK | - V3DV_CMD_DIRTY_STENCIL_REFERENCE; - cmd_buffer->state.dirty &= ~dynamic_stencil_dirty_flags; + BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK); + BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE); + BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK); + BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP); } } @@ -1103,19 +1503,51 @@ v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer) struct v3dv_job *job = cmd_buffer->state.job; assert(job); + struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state; + v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(DEPTH_OFFSET)); v3dv_return_if_oom(cmd_buffer, NULL); - struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic; cl_emit(&job->bcl, DEPTH_OFFSET, bias) { - bias.depth_offset_factor = dynamic->depth_bias.slope_factor; - bias.depth_offset_units = dynamic->depth_bias.constant_factor; + bias.depth_offset_factor = dyn->rs.depth_bias.slope; + bias.depth_offset_units = dyn->rs.depth_bias.constant; +#if V3D_VERSION <= 42 if (pipeline->depth_bias.is_z16) bias.depth_offset_units *= 256.0f; +#endif + bias.limit = dyn->rs.depth_bias.clamp; } - cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DEPTH_BIAS; + BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS); +} + +void +v3dX(cmd_buffer_emit_depth_bounds)(struct v3dv_cmd_buffer *cmd_buffer) +{ + /* No depthBounds support for v42, so this method is empty in that case. + * + * Note that this method still gets called, because v3dv_job_init flags all + * state as dirty. See FIXME note in v3dv_job_init.
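+ * + * On 7.x the DEPTH_BOUNDS_TEST_LIMITS packet below is only emitted when + * the dynamic depth bounds test is enabled; on v42 everything below is + * compiled out.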
+ */ +#if V3D_VERSION >= 71 + struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + + if (!dyn->ds.depth.bounds_test.enable) + return; + + struct v3dv_job *job = cmd_buffer->state.job; + assert(job); + + v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(DEPTH_BOUNDS_TEST_LIMITS)); + v3dv_return_if_oom(cmd_buffer, NULL); + + cl_emit(&job->bcl, DEPTH_BOUNDS_TEST_LIMITS, bounds) { + bounds.lower_test_limit = dyn->ds.depth.bounds_test.min; + bounds.upper_test_limit = dyn->ds.depth.bounds_test.max; + } + BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS); +#endif } void @@ -1124,14 +1556,17 @@ v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer) struct v3dv_job *job = cmd_buffer->state.job; assert(job); + struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state; + v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(LINE_WIDTH)); v3dv_return_if_oom(cmd_buffer, NULL); cl_emit(&job->bcl, LINE_WIDTH, line) { - line.line_width = cmd_buffer->state.dynamic.line_width; + line.line_width = v3dv_get_aa_line_width(cmd_buffer->state.gfx.pipeline, + cmd_buffer); } - cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_LINE_WIDTH; + BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH); } void @@ -1161,10 +1596,13 @@ v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer) struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; assert(pipeline); + const struct v3d_device_info *devinfo = &cmd_buffer->device->devinfo; + const uint32_t max_color_rts = V3D_MAX_RENDER_TARGETS(devinfo->ver); + const uint32_t blend_packets_size = cl_packet_length(BLEND_ENABLES) + cl_packet_length(BLEND_CONSTANT_COLOR) + - cl_packet_length(BLEND_CFG) * V3D_MAX_DRAW_BUFFERS; + cl_packet_length(BLEND_CFG) * max_color_rts; v3dv_cl_ensure_space_with_branch(&job->bcl, blend_packets_size); v3dv_return_if_oom(cmd_buffer, NULL); @@ -1176,23 +1614,26 @@ v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer) } } - for (uint32_t i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) { + for (uint32_t i = 0; i < max_color_rts; i++) { if (pipeline->blend.enables & (1 << i)) cl_emit_prepacked(&job->bcl, &pipeline->blend.cfg[i]); } } - if (pipeline->blend.needs_color_constants && - cmd_buffer->state.dirty & V3DV_CMD_DIRTY_BLEND_CONSTANTS) { - struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic; + if (pipeline->blend.needs_color_constants) { + const struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + cl_emit(&job->bcl, BLEND_CONSTANT_COLOR, color) { - color.red_f16 = _mesa_float_to_half(dynamic->blend_constants[0]); - color.green_f16 = _mesa_float_to_half(dynamic->blend_constants[1]); - color.blue_f16 = _mesa_float_to_half(dynamic->blend_constants[2]); - color.alpha_f16 = _mesa_float_to_half(dynamic->blend_constants[3]); + color.red_f16 = _mesa_float_to_half(dyn->cb.blend_constants[0]); + color.green_f16 = _mesa_float_to_half(dyn->cb.blend_constants[1]); + color.blue_f16 = _mesa_float_to_half(dyn->cb.blend_constants[2]); + color.alpha_f16 = _mesa_float_to_half(dyn->cb.blend_constants[3]); } - cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_BLEND_CONSTANTS; } + + BITSET_CLEAR(cmd_buffer->vk.dynamic_graphics_state.dirty, + MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS); } void @@ -1202,13 +1643,21 @@ v3dX(cmd_buffer_emit_color_write_mask)(struct v3dv_cmd_buffer *cmd_buffer) v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(COLOR_WRITE_MASKS)); struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; 
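+ /* Presumably each render target takes 4 bits (one per channel) in this + * mask, so the 4 RTs available on 4.2 fit in the 16-bit mask applied + * below, while 7.x can use the wider mask as-is. */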
- struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic; + struct v3dv_dynamic_state *v3dv_dyn = &cmd_buffer->state.dynamic; + uint32_t color_write_mask = ~v3dv_dyn->color_write_enable | + pipeline->blend.color_write_masks; + +#if V3D_VERSION <= 42 + /* Only 4 RTs */ + color_write_mask &= 0xffff; +#endif + cl_emit(&job->bcl, COLOR_WRITE_MASKS, mask) { - mask.mask = (~dynamic->color_write_enable | - pipeline->blend.color_write_masks) & 0xffff; + mask.mask = color_write_mask; } - cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE; + BITSET_CLEAR(cmd_buffer->vk.dynamic_graphics_state.dirty, + MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES); } static void @@ -1346,11 +1795,33 @@ v3dX(cmd_buffer_emit_varyings_state)(struct v3dv_cmd_buffer *cmd_buffer) } } -static void -job_update_ez_state(struct v3dv_job *job, - struct v3dv_pipeline *pipeline, - struct v3dv_cmd_buffer *cmd_buffer) +#if V3D_VERSION == 42 +/* Updates the cmd_buffer's, and its job's, early Z state tracking. Returns + * false if EZ must be disabled for the current draw call. + */ +static bool +cmd_buffer_update_ez_state(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_pipeline *pipeline) { + struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state; + /* First update the cmd_buffer's ez_state tracking. If possible we reuse + * the values from the pipeline. + */ + if (!BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_OP) && + !BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE) && + !BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE) && + !BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP)) { + cmd_buffer->state.ez_state = pipeline->ez_state; + cmd_buffer->state.incompatible_ez_test = + pipeline->incompatible_ez_test; + } else { + v3dv_compute_ez_state(dyn, pipeline, + &cmd_buffer->state.ez_state, + &cmd_buffer->state.incompatible_ez_test); + } + + struct v3dv_job *job = cmd_buffer->state.job; + assert(job); /* If first_ez_state is V3D_EZ_DISABLED it means that we have already * determined that we should disable EZ completely for all draw calls in * this job. This will cause us to disable EZ for the entire job in the * @@ -1360,9 +1831,15 @@ */ if (job->first_ez_state == V3D_EZ_DISABLED) { assert(job->ez_state == V3D_EZ_DISABLED); - return; + return false; } + /* If ez_state is V3D_EZ_DISABLED it means that we have already decided + * that EZ must be disabled for the remainder of the frame. + */ + if (job->ez_state == V3D_EZ_DISABLED) + return false; + /* This is part of the pre draw call handling, so we should be inside a * render pass. */ @@ -1371,7 +1848,7 @@ /* If this is the first time we update EZ state for this job we first check * if there is anything that requires disabling it completely for the entire * job (based on state that is not related to the current draw call and - * pipeline state). + * pipeline/cmd_buffer state). */ if (!job->decided_global_ez_enable) { job->decided_global_ez_enable = true; @@ -1382,13 +1859,14 @@ if (subpass->ds_attachment.attachment == VK_ATTACHMENT_UNUSED) { job->first_ez_state = V3D_EZ_DISABLED; job->ez_state = V3D_EZ_DISABLED; - return; + return false; } - /* GFXH-1918: the early-z buffer may load incorrect depth values - * if the frame has odd width or height. + /* GFXH-1918: the early-z buffer may load incorrect depth values if the + * frame has odd width or height, or if the buffer is 16-bit and + * multisampled.
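+ * (e.g. a 1921x1080 framebuffer, or a multisampled D16_UNORM depth + * buffer, as checked below).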
* - * So we need to disable EZ in this case. + * So we need to disable EZ in these cases. */ const struct v3dv_render_pass_attachment *ds_attachment = &state->pass->attachments[subpass->ds_attachment.attachment]; @@ -1397,21 +1875,32 @@ job_update_ez_state(struct v3dv_job *job, vk_format_aspects(ds_attachment->desc.format); bool needs_depth_load = - check_needs_load(state, - ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, - ds_attachment->first_subpass, - ds_attachment->desc.loadOp); + v3dv_cmd_buffer_check_needs_load(state, + ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + ds_attachment->first_subpass, + ds_attachment->desc.loadOp, + ds_attachment->last_subpass, + ds_attachment->desc.storeOp); if (needs_depth_load) { + if (ds_attachment->desc.format == VK_FORMAT_D16_UNORM && + ds_attachment->desc.samples != VK_SAMPLE_COUNT_1_BIT) { + perf_debug("Loading depth aspect from a multisampled 16-bit " + "depth buffer disables early-Z tests.\n"); + job->first_ez_state = V3D_EZ_DISABLED; + job->ez_state = V3D_EZ_DISABLED; + return false; + } + struct v3dv_framebuffer *fb = state->framebuffer; if (!fb) { - assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); + assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); perf_debug("Loading depth aspect in a secondary command buffer " "without framebuffer info disables early-z tests.\n"); job->first_ez_state = V3D_EZ_DISABLED; job->ez_state = V3D_EZ_DISABLED; - return; + return false; } if (((fb->width % 2) != 0 || (fb->height % 2) != 0)) { @@ -1419,24 +1908,18 @@ job_update_ez_state(struct v3dv_job *job, "or height disables early-Z tests.\n"); job->first_ez_state = V3D_EZ_DISABLED; job->ez_state = V3D_EZ_DISABLED; - return; + return false; } } } /* Otherwise, we can decide to selectively enable or disable EZ for draw - * calls using the CFG_BITS packet based on the bound pipeline state. + * calls using the CFG_BITS packet based on the bound pipeline state, or + * cmd_buffer state if some stencil/depth flags were dynamic. */ - - /* If the FS writes Z, then it may update against the chosen EZ direction */ - struct v3dv_shader_variant *fs_variant = - pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]; - if (fs_variant->prog_data.fs->writes_z) { - job->ez_state = V3D_EZ_DISABLED; - return; - } - - switch (pipeline->ez_state) { + bool disable_ez = false; + bool incompatible_test = false; + switch (cmd_buffer->state.ez_state) { case V3D_EZ_UNDECIDED: /* If the pipeline didn't pick a direction but didn't disable, then go * along with the current EZ state. This allows EZ optimization for Z @@ -1449,25 +1932,40 @@ job_update_ez_state(struct v3dv_job *job, /* If the pipeline picked a direction, then it needs to match the current * direction if we've decided on one. */ - if (job->ez_state == V3D_EZ_UNDECIDED) - job->ez_state = pipeline->ez_state; - else if (job->ez_state != pipeline->ez_state) - job->ez_state = V3D_EZ_DISABLED; + if (job->ez_state == V3D_EZ_UNDECIDED) { + job->ez_state = cmd_buffer->state.ez_state; + } else if (job->ez_state != pipeline->ez_state) { + disable_ez = true; + incompatible_test = true; + } break; case V3D_EZ_DISABLED: - /* If the pipeline disables EZ because of a bad Z func or stencil - * operation, then we can't do any more EZ in this frame. 
- */ - job->ez_state = V3D_EZ_DISABLED; + disable_ez = true; + incompatible_test = cmd_buffer->state.incompatible_ez_test; break; } - if (job->first_ez_state == V3D_EZ_UNDECIDED && - job->ez_state != V3D_EZ_DISABLED) { + if (job->first_ez_state == V3D_EZ_UNDECIDED && !disable_ez) { + assert(job->ez_state != V3D_EZ_DISABLED); job->first_ez_state = job->ez_state; } + + /* If we had to disable EZ because of an incompatible test direction + * and the cmd buffer writes depth then we need to disable EZ for the rest + * of the frame. + */ + if (incompatible_test && cmd_buffer->state.z_updates_enable) { + assert(disable_ez); + job->ez_state = V3D_EZ_DISABLED; + } + + if (!disable_ez) + job->has_ez_draws = true; + + return !disable_ez; } +#endif void v3dX(cmd_buffer_emit_configuration_bits)(struct v3dv_cmd_buffer *cmd_buffer) @@ -1478,16 +1976,60 @@ struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; assert(pipeline); - job_update_ez_state(job, pipeline, cmd_buffer); - v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CFG_BITS)); v3dv_return_if_oom(cmd_buffer, NULL); + struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + + /* Disable depth/stencil if we don't have a D/S attachment */ + bool has_depth = + pipeline->rendering_info.depth_attachment_format != VK_FORMAT_UNDEFINED; + bool has_stencil = + pipeline->rendering_info.stencil_attachment_format != VK_FORMAT_UNDEFINED; + cl_emit_with_prepacked(&job->bcl, CFG_BITS, pipeline->cfg_bits, config) { - config.early_z_enable = job->ez_state != V3D_EZ_DISABLED; + if (dyn->ds.depth.test_enable && has_depth) { + config.z_updates_enable = dyn->ds.depth.write_enable; + config.depth_test_function = dyn->ds.depth.compare_op; + } else { + config.depth_test_function = VK_COMPARE_OP_ALWAYS; + } + + config.stencil_enable = dyn->ds.stencil.test_enable && has_stencil; + + cmd_buffer->state.z_updates_enable = config.z_updates_enable; +#if V3D_VERSION == 42 + bool enable_ez = cmd_buffer_update_ez_state(cmd_buffer, pipeline); + config.early_z_enable = enable_ez; config.early_z_updates_enable = config.early_z_enable && - pipeline->z_updates_enable; - } + cmd_buffer->state.z_updates_enable; +#endif + + if (pipeline->rasterization_enabled) { + assert(BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_RS_CULL_MODE)); + assert(BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_RS_FRONT_FACE)); + config.enable_forward_facing_primitive = !(dyn->rs.cull_mode & VK_CULL_MODE_FRONT_BIT); + config.enable_reverse_facing_primitive = !(dyn->rs.cull_mode & VK_CULL_MODE_BACK_BIT); + /* Seems like the hardware is backwards regarding this setting... */ + config.clockwise_primitives = dyn->rs.front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE; + } + + /* V3D 4.2 doesn't support depth bounds testing so we don't advertise that + * feature and it shouldn't be used by any pipeline.
*/ + assert(cmd_buffer->device->devinfo.ver >= 71 || + !dyn->ds.depth.bounds_test.enable); +#if V3D_VERSION >= 71 + config.depth_bounds_test_enable = + dyn->ds.depth.bounds_test.enable && has_depth; +#endif + } + + BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE); + BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE); + BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE); + BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE); } void @@ -1523,7 +2065,8 @@ cmd_buffer_subpass_split_for_barrier(struct v3dv_cmd_buffer *cmd_buffer, if (!job) return NULL; - job->serialize = true; + /* FIXME: we can do better than all barriers */ + job->serialize = V3DV_BARRIER_ALL; job->needs_bcl_sync = is_bcl_barrier; return job; } @@ -1538,21 +2081,20 @@ cmd_buffer_copy_secondary_end_query_state(struct v3dv_cmd_buffer *primary, const uint32_t total_state_count = p_state->query.end.used_count + s_state->query.end.used_count; v3dv_cmd_buffer_ensure_array_state(primary, - sizeof(struct v3dv_end_query_cpu_job_info), + sizeof(struct v3dv_end_query_info), total_state_count, &p_state->query.end.alloc_count, (void **) &p_state->query.end.states); v3dv_return_if_oom(primary, NULL); for (uint32_t i = 0; i < s_state->query.end.used_count; i++) { - const struct v3dv_end_query_cpu_job_info *s_qstate = + const struct v3dv_end_query_info *s_qstate = &secondary->state.query.end.states[i]; - struct v3dv_end_query_cpu_job_info *p_qstate = + struct v3dv_end_query_info *p_qstate = &p_state->query.end.states[p_state->query.end.used_count++]; - p_qstate->pool = s_qstate->pool; - p_qstate->query = s_qstate->query; + memcpy(p_qstate, s_qstate, sizeof(struct v3dv_end_query_info)); } } @@ -1563,6 +2105,20 @@ v3dX(cmd_buffer_execute_inside_pass)(struct v3dv_cmd_buffer *primary, { assert(primary->state.job); + /* Typically we postpone applying binning syncs until we see a draw call + * that may actually access protected resources in the binning stage. However, + * if the draw calls are recorded in a secondary command buffer and the + * barriers were recorded in a primary command buffer, that won't work + * and we will have to check if we need a binning sync when executing the + * secondary. + */ + struct v3dv_job *primary_job = primary->state.job; + if (primary_job->serialize && + (primary->state.barrier.bcl_buffer_access || + primary->state.barrier.bcl_image_access)) { + v3dv_cmd_buffer_consume_bcl_sync(primary, primary_job); + } + /* Emit occlusion query state if needed so the draw calls inside our * secondaries update the counters. */ @@ -1575,8 +2131,7 @@ * pipelines used by the secondaries do, we need to re-start the primary * job to enable MSAA. See cmd_buffer_restart_job_for_msaa_if_needed. */ - bool pending_barrier = false; - bool pending_bcl_barrier = false; + struct v3dv_barrier_state pending_barrier = { 0 }; for (uint32_t i = 0; i < cmd_buffer_count; i++) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, secondary, cmd_buffers[i]); @@ -1585,7 +2140,7 @@ list_for_each_entry(struct v3dv_job, secondary_job, &secondary->jobs, list_link) { - if (secondary_job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY) { + if (secondary_job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE) { /* If the job is a CL, then we branch to it from the primary BCL.
* In this case the secondary's BCL is finished with a * RETURN_FROM_SUB_LIST command to return back to the primary BCL @@ -1609,10 +2164,14 @@ v3dX(cmd_buffer_execute_inside_pass)(struct v3dv_cmd_buffer *primary, * the RETURN_FROM_SUB_LIST into the primary job to skip the * branch? */ - struct v3dv_job *primary_job = primary->state.job; - if (!primary_job || secondary_job->serialize || pending_barrier) { + primary_job = primary->state.job; + if (!primary_job || secondary_job->serialize || + pending_barrier.dst_mask) { const bool needs_bcl_barrier = - secondary_job->needs_bcl_sync || pending_bcl_barrier; + secondary_job->needs_bcl_sync || + pending_barrier.bcl_buffer_access || + pending_barrier.bcl_image_access; + primary_job = cmd_buffer_subpass_split_for_barrier(primary, needs_bcl_barrier); @@ -1644,6 +2203,14 @@ v3dX(cmd_buffer_execute_inside_pass)(struct v3dv_cmd_buffer *primary, } } + if (!secondary_job->can_use_double_buffer) { + primary_job->can_use_double_buffer = false; + } else { + primary_job->double_buffer_score.geom += + secondary_job->double_buffer_score.geom; + primary_job->double_buffer_score.render += + secondary_job->double_buffer_score.render; + } primary_job->tmu_dirty_rcl |= secondary_job->tmu_dirty_rcl; } else { /* This is a regular job (CPU or GPU), so just finish the current @@ -1652,15 +2219,21 @@ v3dX(cmd_buffer_execute_inside_pass)(struct v3dv_cmd_buffer *primary, */ v3dv_cmd_buffer_finish_job(primary); v3dv_job_clone_in_cmd_buffer(secondary_job, primary); - if (pending_barrier) { - secondary_job->serialize = true; - if (pending_bcl_barrier) + if (pending_barrier.dst_mask) { + /* FIXME: do the same we do for primaries and only choose the + * relevant src masks. + */ + secondary_job->serialize = pending_barrier.src_mask_graphics | + pending_barrier.src_mask_transfer | + pending_barrier.src_mask_compute; + if (pending_barrier.bcl_buffer_access || + pending_barrier.bcl_image_access) { secondary_job->needs_bcl_sync = true; + } } } - pending_barrier = false; - pending_bcl_barrier = false; + memset(&pending_barrier, 0, sizeof(pending_barrier)); } /* If the secondary has recorded any vkCmdEndQuery commands, we need to @@ -1672,14 +2245,16 @@ v3dX(cmd_buffer_execute_inside_pass)(struct v3dv_cmd_buffer *primary, /* If this secondary had any pending barrier state we will need that * barrier state consumed with whatever comes next in the primary. 
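Folding a secondary's double-buffer heuristics into the primary job follows a simple rule, sketched below with simplified stand-in types (the real fields live in struct v3dv_job).

#include <stdbool.h>
#include <stdint.h>

/* Simplified stand-ins for the job fields used above. */
struct db_score { uint32_t geom, render; };

struct job_db_state {
   bool can_use_double_buffer;
   struct db_score score;
};

/* One secondary that cannot double-buffer disqualifies the whole primary
 * job; otherwise its heuristic scores accumulate into the primary's.
 */
static void
merge_double_buffer_state(struct job_db_state *primary,
                          const struct job_db_state *secondary)
{
   if (!secondary->can_use_double_buffer) {
      primary->can_use_double_buffer = false;
   } else {
      primary->score.geom += secondary->score.geom;
      primary->score.render += secondary->score.render;
   }
}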
*/ - assert(secondary->state.has_barrier || !secondary->state.has_bcl_barrier); - pending_barrier = secondary->state.has_barrier; - pending_bcl_barrier = secondary->state.has_bcl_barrier; + assert(secondary->state.barrier.dst_mask || + (!secondary->state.barrier.bcl_buffer_access && + !secondary->state.barrier.bcl_image_access)); + + pending_barrier = secondary->state.barrier; } - if (pending_barrier) { - primary->state.has_barrier = true; - primary->state.has_bcl_barrier |= pending_bcl_barrier; + if (pending_barrier.dst_mask) { + v3dv_cmd_buffer_merge_barrier_state(&primary->state.barrier, + &pending_barrier); } } @@ -1698,7 +2273,9 @@ emit_gs_shader_state_record(struct v3dv_job *job, gs_bin->prog_data.gs->base.threads == 4; shader.geometry_bin_mode_shader_start_in_final_thread_section = gs_bin->prog_data.gs->base.single_seg; +#if V3D_VERSION <= 42 shader.geometry_bin_mode_shader_propagate_nans = true; +#endif shader.geometry_bin_mode_shader_uniforms_address = gs_bin_uniforms; @@ -1708,21 +2285,23 @@ emit_gs_shader_state_record(struct v3dv_job *job, gs->prog_data.gs->base.threads == 4; shader.geometry_render_mode_shader_start_in_final_thread_section = gs->prog_data.gs->base.single_seg; +#if V3D_VERSION <= 42 shader.geometry_render_mode_shader_propagate_nans = true; +#endif shader.geometry_render_mode_shader_uniforms_address = gs_render_uniforms; } } static uint8_t -v3d_gs_output_primitive(uint32_t prim_type) +v3d_gs_output_primitive(enum mesa_prim prim_type) { switch (prim_type) { - case GL_POINTS: + case MESA_PRIM_POINTS: return GEOMETRY_SHADER_POINTS; - case GL_LINE_STRIP: + case MESA_PRIM_LINE_STRIP: return GEOMETRY_SHADER_LINE_STRIP; - case GL_TRIANGLE_STRIP: + case MESA_PRIM_TRIANGLE_STRIP: return GEOMETRY_SHADER_TRI_STRIP; default: unreachable("Unsupported primitive type"); @@ -1884,10 +2463,12 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer) pipeline->vpm_cfg.Gv); } +#if V3D_VERSION == 42 struct v3dv_bo *default_attribute_values = pipeline->default_attribute_values != NULL ? pipeline->default_attribute_values : pipeline->device->default_attribute_float; +#endif cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_RECORD, pipeline->shader_state_record, shader) { @@ -1913,8 +2494,10 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer) shader.vertex_shader_uniforms_address = cmd_buffer->state.uniforms.vs; shader.fragment_shader_uniforms_address = cmd_buffer->state.uniforms.fs; +#if V3D_VERSION == 42 shader.address_of_default_attribute_values = v3dv_cl_address(default_attribute_values, 0); +#endif shader.any_shader_reads_hardware_written_primitive_id = (pipeline->has_gs && prog_data_gs->uses_pid) || prog_data_fs->uses_pid; @@ -1979,6 +2562,8 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer) cs_loaded_any = true; } + attr.stride = + cmd_buffer->vk.dynamic_graphics_state.vi_binding_strides[binding]; attr.maximum_index = 0xffffff; } @@ -2027,6 +2612,11 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer) } } + /* Clearing push constants and descriptor sets for all stages is not quite + * correct (some shader stages may not be used at all or they may not be + * consuming push constants), however this is not relevant because if we + * bind a different pipeline we always have to rebuild the uniform streams. 
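The dirty-bit handling that follows clears vertex-buffer, descriptor-set and push-constant state in one pass. A minimal sketch of the pattern, with illustrative flag values rather than the driver's actual V3DV_CMD_DIRTY_* encoding:

#include <stdint.h>

/* Illustrative flag values; the driver's V3DV_CMD_DIRTY_* bits differ. */
enum {
   DIRTY_VERTEX_BUFFER   = 1u << 0,
   DIRTY_DESCRIPTOR_SETS = 1u << 1,
   DIRTY_PUSH_CONSTANTS  = 1u << 2,
};

/* Once the shader state record is emitted, everything it consumed is up
 * to date, so all three bits can be dropped at once. Over-clearing for
 * unused stages is harmless: binding a different pipeline rebuilds the
 * uniform streams anyway.
 */
static inline uint32_t
clear_consumed_state(uint32_t dirty)
{
   return dirty & ~(DIRTY_VERTEX_BUFFER |
                    DIRTY_DESCRIPTOR_SETS |
                    DIRTY_PUSH_CONSTANTS);
}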
+ */ cmd_buffer->state.dirty &= ~(V3DV_CMD_DIRTY_VERTEX_BUFFER | V3DV_CMD_DIRTY_DESCRIPTOR_SETS | V3DV_CMD_DIRTY_PUSH_CONSTANTS); @@ -2034,44 +2624,15 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer) cmd_buffer->state.dirty_push_constants_stages &= ~VK_SHADER_STAGE_ALL_GRAPHICS; } -/* FIXME: C&P from v3dx_draw. Refactor to common place? */ -static uint32_t -v3d_hw_prim_type(enum pipe_prim_type prim_type) -{ - switch (prim_type) { - case PIPE_PRIM_POINTS: - case PIPE_PRIM_LINES: - case PIPE_PRIM_LINE_LOOP: - case PIPE_PRIM_LINE_STRIP: - case PIPE_PRIM_TRIANGLES: - case PIPE_PRIM_TRIANGLE_STRIP: - case PIPE_PRIM_TRIANGLE_FAN: - return prim_type; - - case PIPE_PRIM_LINES_ADJACENCY: - case PIPE_PRIM_LINE_STRIP_ADJACENCY: - case PIPE_PRIM_TRIANGLES_ADJACENCY: - case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: - return 8 + (prim_type - PIPE_PRIM_LINES_ADJACENCY); - - default: - unreachable("Unsupported primitive type"); - } -} - void v3dX(cmd_buffer_emit_draw)(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_draw_info *info) { struct v3dv_job *job = cmd_buffer->state.job; assert(job); - - struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; - struct v3dv_pipeline *pipeline = state->gfx.pipeline; - - assert(pipeline); - - uint32_t hw_prim_type = v3d_hw_prim_type(pipeline->topology); + const struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + uint32_t hw_prim_type = v3dv_pipeline_primitive(dyn->ia.primitive_topology); if (info->first_instance > 0) { v3dv_cl_ensure_space_with_branch( @@ -2226,7 +2787,9 @@ v3dX(cmd_buffer_emit_indexed_indirect)(struct v3dv_cmd_buffer *cmd_buffer, assert(job); const struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; - uint32_t hw_prim_type = v3d_hw_prim_type(pipeline->topology); + const struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + uint32_t hw_prim_type = v3dv_pipeline_primitive(dyn->ia.primitive_topology); uint8_t index_type = ffs(cmd_buffer->state.index_buffer.index_size) - 1; v3dv_cl_ensure_space_with_branch( @@ -2245,37 +2808,159 @@ v3dX(cmd_buffer_emit_indexed_indirect)(struct v3dv_cmd_buffer *cmd_buffer, } void -v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer, - int rt, - uint32_t *rt_bpp, - uint32_t *rt_type, - uint32_t *rt_clamp) +v3dX(cmd_buffer_suspend)(struct v3dv_cmd_buffer *cmd_buffer) { - const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + struct v3dv_job *job = cmd_buffer->state.job; + assert(job); - assert(state->subpass_idx < state->pass->subpass_count); - const struct v3dv_subpass *subpass = - &state->pass->subpasses[state->subpass_idx]; + job->suspending = true; - if (rt >= subpass->color_count) - return; + v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(BRANCH)); - struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt]; - const uint32_t attachment_idx = attachment->attachment; - if (attachment_idx == VK_ATTACHMENT_UNUSED) - return; + job->suspend_branch_inst_ptr = cl_start(&job->bcl); + cl_emit(&job->bcl, BRANCH, branch) { + branch.address = v3dv_cl_address(NULL, 0); + } - const struct v3dv_framebuffer *framebuffer = state->framebuffer; - assert(attachment_idx < framebuffer->attachment_count); - struct v3dv_image_view *iview = framebuffer->attachments[attachment_idx]; - assert(iview->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT); - - *rt_bpp = iview->internal_bpp; - *rt_type = iview->internal_type; - if (vk_format_is_int(iview->vk.format)) - *rt_clamp = 
V3D_RENDER_TARGET_CLAMP_INT; - else if (vk_format_is_srgb(iview->vk.format)) - *rt_clamp = V3D_RENDER_TARGET_CLAMP_NORM; - else - *rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE; + /* The sim complains if the command list ends with a branch */ + cl_emit(&job->bcl, NOP, nop); +} + +void +v3dX(job_patch_resume_address)(struct v3dv_job *first_suspend, + struct v3dv_job *suspend, + struct v3dv_job *resume) +{ + assert(resume && resume->resuming); + assert(first_suspend && first_suspend->suspending); + assert(suspend && suspend->suspending); + assert(suspend->suspend_branch_inst_ptr != NULL); + + struct v3dv_bo *resume_bo = + list_first_entry(&resume->bcl.bo_list, struct v3dv_bo, list_link); + struct cl_packet_struct(BRANCH) branch = { + cl_packet_header(BRANCH), + }; + branch.address = v3dv_cl_address(NULL, resume_bo->offset); + + uint8_t *rewrite_addr = (uint8_t *) suspend->suspend_branch_inst_ptr; + cl_packet_pack(BRANCH)(NULL, rewrite_addr, &branch); + + if (resume != first_suspend) { + set_foreach(resume->bos, entry) { + struct v3dv_bo *bo = (void *)entry->key; + v3dv_job_add_bo(first_suspend, bo); + } + } + + first_suspend->suspended_bcl_end = resume->bcl.bo->offset + + v3dv_cl_offset(&resume->bcl); +} + +static void +job_destroy_cb(VkDevice device, uint64_t pobj, VkAllocationCallbacks *allocb) +{ + struct v3dv_job *clone = (struct v3dv_job *) (uintptr_t) pobj; + v3dv_job_destroy(clone); +} + +/** + * This checks if the command buffer has been created with + * VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT, in which case we won't be + * able to safely patch the resume address into the job (since we could have + * another instance of this job running in the GPU, potentially resuming in a + * different address). In that case, we clone the job and make the clone have + * its own BCL copied from the original job so we can later patch the resume + * address into it safely. + */ +struct v3dv_job * +v3dX(cmd_buffer_prepare_suspend_job_for_submit)(struct v3dv_job *job) +{ + assert(job->suspending); + assert(job->cmd_buffer); + assert(job->type == V3DV_JOB_TYPE_GPU_CL); + + if (!(job->cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)) + return job; + + /* Create the clone job, but skip the BCL since we are going to create + * our own below. + */ + struct v3dv_job *clone = v3dv_job_clone(job, true); + if (!clone) + return NULL; + + /* Compute total size of BCL we need to copy */ + uint32_t bcl_size = 0; + list_for_each_entry(struct v3dv_bo, bo, &job->bcl.bo_list, list_link) + bcl_size += bo->size; + + /* Prepare the BCL for the cloned job. For this we go over the BOs in the + * BCL of the original job and we copy their contents into the single BO + * in the BCL of the cloned job. + */ + clone->clone_owns_bcl = true; + v3dv_cl_init(clone, &clone->bcl); + v3dv_cl_ensure_space(&clone->bcl, bcl_size, 4); + if (!clone->bcl.bo) + return NULL; + + assert(clone->bcl.base); + assert(clone->bcl.base == clone->bcl.next); + + /* Unlink this job from the command buffer's execution list */ + list_inithead(&clone->list_link); + + /* Copy the contents of each BO in the original job's BCL into the single + * BO we have in the clone's BCL. + * + * If the BO is the last in the BCL (which we can tell because it wouldn't + * have emitted a BRANCH instruction to link to another BO) we need to copy + * up to the current BCL offset, otherwise we need to copy up to the BRANCH + * instruction (excluded, since we are putting everything together into a + * single BO here). 
+ */
+ list_for_each_entry(struct v3dv_bo, bo, &job->bcl.bo_list, list_link) {
+ assert(bo->map);
+ uint32_t copy_size;
+ if (bo->cl_branch_offset == 0xffffffff) { /* Last BO in BCL */
+ assert(bo == list_last_entry(&job->bcl.bo_list, struct v3dv_bo, list_link));
+ copy_size = v3dv_cl_offset(&job->bcl);
+ } else {
+ assert(bo->cl_branch_offset >= cl_packet_length(BRANCH));
+ copy_size = bo->cl_branch_offset - cl_packet_length(BRANCH);
+ }
+
+ assert(v3dv_cl_offset(&job->bcl) + copy_size < bcl_size);
+ memcpy(cl_start(&clone->bcl), bo->map, copy_size);
+ cl_advance_and_end(&clone->bcl, copy_size);
+ }
+
+ /* Now we need to fixup the pointer to the suspend BRANCH instruction at the
+ * end of the BCL so it points to the address in the new BCL. We know that
+ * to suspend a command buffer we always emit a BRANCH+NOP combo, so we just
+ * need to go back that many bytes into the BCL to find the instruction.
+ */
+ uint32_t suspend_terminator_size =
+ cl_packet_length(BRANCH) + cl_packet_length(NOP);
+ clone->suspend_branch_inst_ptr = (struct v3dv_cl_out *)
+ (((uint8_t *)cl_start(&clone->bcl)) - suspend_terminator_size);
+ assert(*(((uint8_t *)clone->suspend_branch_inst_ptr)) == V3DX(BRANCH_opcode));
+
+ /* This job is not in the execution list of the command buffer so it
+ * won't be destroyed with it; add it as a private object to get it freed.
+ *
+ * FIXME: every time this job is submitted we clone the job and we only
+ * destroy it when the command buffer is destroyed. If the user keeps the
+ * command buffer for the entire lifetime of the application, this command
+ * buffer could grow significantly, so maybe we want to do something smarter
+ * like having a syncobj bound to these jobs and every time we submit the
+ * command buffer again we first check these syncobjs to see if we can free
+ * some of these clones so we avoid blowing up memory.
+ */
+ v3dv_cmd_buffer_add_private_obj(
+ job->cmd_buffer, (uintptr_t)clone,
+ (v3dv_cmd_buffer_private_obj_destroy_cb)job_destroy_cb);
+
+ return clone;
 }
 diff --git a/src/broadcom/vulkan/v3dvx_descriptor_set.c b/src/broadcom/vulkan/v3dvx_descriptor_set.c
index 2c28ce46aa5..ced7b7e8c85 100644
--- a/src/broadcom/vulkan/v3dvx_descriptor_set.c
+++ b/src/broadcom/vulkan/v3dvx_descriptor_set.c
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2021 Raspberry Pi
+ * Copyright © 2021 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -51,7 +51,7 @@ v3dX(descriptor_bo_size)(VkDescriptorType type)
 }

 /* To compute the max_bo_size we want to iterate through the descriptor
- * types. Unfourtunately we can't just use the descriptor type enum values, as
+ * types. Unfortunately we can't just use the descriptor type enum values, as
 * the values are not defined consecutively (so extensions could add new
 * descriptor types), and VK_DESCRIPTOR_TYPE_MAX_ENUM is also a really big
 * number.
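Because the enum is sparse, computing the maximum has to walk an explicit list of core descriptor types. The sketch below shows that iteration; descriptor_bo_size() here is a local stub with dummy sizes, not the driver's v3dX(descriptor_bo_size).

#include <stdint.h>
#include <vulkan/vulkan.h>

/* Stand-in with dummy sizes; the driver uses v3dX(descriptor_bo_size). */
static uint32_t
descriptor_bo_size(VkDescriptorType type)
{
   switch (type) {
   case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
      return 128;   /* dummy value */
   default:
      return 32;    /* dummy value */
   }
}

/* Walk an explicit list of the core descriptor types: the enum is sparse,
 * so looping from 0 to VK_DESCRIPTOR_TYPE_MAX_ENUM is not an option.
 */
static uint32_t
max_descriptor_bo_size(void)
{
   static const VkDescriptorType types[] = {
      VK_DESCRIPTOR_TYPE_SAMPLER,
      VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
      VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE,
      VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
      VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
      VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER,
      VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
      VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
      VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT,
   };

   uint32_t max = 0;
   for (uint32_t i = 0; i < sizeof(types) / sizeof(types[0]); i++) {
      uint32_t size = descriptor_bo_size(types[i]);
      if (size > max)
         max = size;
   }
   return max;
}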
@@ -86,13 +86,15 @@ v3dX(max_descriptor_bo_size)(void) uint32_t -v3dX(combined_image_sampler_texture_state_offset)(void) +v3dX(combined_image_sampler_texture_state_offset)(uint8_t plane) { - return 0; + return v3dX(descriptor_bo_size)(VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) * + plane; } uint32_t -v3dX(combined_image_sampler_sampler_state_offset)(void) +v3dX(combined_image_sampler_sampler_state_offset)(uint8_t plane) { - return cl_aligned_packet_length(TEXTURE_SHADER_STATE, 32); + return v3dX(combined_image_sampler_texture_state_offset)(plane) + + cl_aligned_packet_length(TEXTURE_SHADER_STATE, 32); } diff --git a/src/broadcom/vulkan/v3dvx_device.c b/src/broadcom/vulkan/v3dvx_device.c index a48738aec42..a27d65cfd23 100644 --- a/src/broadcom/vulkan/v3dvx_device.c +++ b/src/broadcom/vulkan/v3dvx_device.c @@ -1,5 +1,5 @@ /* - * Copyright © 2021 Raspberry Pi + * Copyright © 2021 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -26,7 +26,6 @@ #include "broadcom/common/v3d_macros.h" #include "broadcom/cle/v3dx_pack.h" #include "broadcom/compiler/v3d_compiler.h" -#include "vk_format_info.h" #include "util/u_pack_color.h" #include "util/half_float.h" @@ -50,8 +49,8 @@ vk_to_v3d_compare_func[] = { [VK_COMPARE_OP_ALWAYS] = V3D_COMPARE_FUNC_ALWAYS, }; - static union pipe_color_union encode_border_color( + const struct v3dv_device *device, const VkSamplerCustomBorderColorCreateInfoEXT *bc_info) { const struct util_format_description *desc = @@ -59,10 +58,55 @@ static union pipe_color_union encode_border_color( const struct v3dv_format *format = v3dX(get_format)(bc_info->format); + /* YCbCr doesn't interact with border color at all. From spec: + * + * "If sampler YCBCR conversion is enabled, addressModeU, addressModeV, + * and addressModeW must be VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE, + * anisotropyEnable must be VK_FALSE, and unnormalizedCoordinates must + * be VK_FALSE" + */ + assert(format->plane_count == 1); + + /* We use the swizzle in our format table to determine swizzle configuration + * for sampling as well as to decide if we need to use the Swap R/B and + * Reverse Channels bits for Tile Load/Store operations. The order of the + * R/B swap and Reverse operations matters and gives different swizzles. + * Our format table assumes that Reverse happens first and R/B Swap second. + * This seems to match semantics for texture sampling and Tile load/store, + * however, it seems that the semantics are reversed for custom border + * colors so we need to fix up the swizzle manually for this case. + */ + uint8_t swizzle[4]; + const bool v3d_has_reverse_swap_rb_bits = + v3dv_texture_shader_state_has_rb_swap_reverse_bits(device); + if (!v3d_has_reverse_swap_rb_bits && + v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle) && + v3dv_format_swizzle_needs_rb_swap(format->planes[0].swizzle)) { + swizzle[0] = PIPE_SWIZZLE_W; + swizzle[1] = PIPE_SWIZZLE_X; + swizzle[2] = PIPE_SWIZZLE_Y; + swizzle[3] = PIPE_SWIZZLE_Z; + } + /* In v3d 7.x we no longer have a reverse flag for the border color. Instead + * we have to use the new reverse and swap_r/b flags in the texture shader + * state which will apply the format swizzle automatically when sampling + * the border color too and we should not apply it manually here. 
+ */ + else if (v3d_has_reverse_swap_rb_bits && + (v3dv_format_swizzle_needs_rb_swap(format->planes[0].swizzle) || + v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle))) { + swizzle[0] = PIPE_SWIZZLE_X; + swizzle[1] = PIPE_SWIZZLE_Y; + swizzle[2] = PIPE_SWIZZLE_Z; + swizzle[3] = PIPE_SWIZZLE_W; + } else { + memcpy(swizzle, format->planes[0].swizzle, sizeof (swizzle)); + } + union pipe_color_union border; for (int i = 0; i < 4; i++) { - if (format->swizzle[i] <= 3) - border.ui[i] = bc_info->customBorderColor.uint32[format->swizzle[i]]; + if (format->planes[0].swizzle[i] <= 3) + border.ui[i] = bc_info->customBorderColor.uint32[swizzle[i]]; else border.ui[i] = 0; } @@ -90,7 +134,11 @@ static union pipe_color_union encode_border_color( (1 << (desc->channel[i].size - 1)) - 1); } - /* convert from float to expected format */ +#if V3D_VERSION <= 42 + /* The TMU in V3D 7.x always takes 32-bit floats and handles conversions + * for us. In V3D 4.x we need to manually convert floating point color + * values to the expected format. + */ if (vk_format_is_srgb(bc_info->format) || vk_format_is_compressed(bc_info->format)) { for (int i = 0; i < 4; i++) @@ -142,12 +190,14 @@ static union pipe_color_union encode_border_color( } } } +#endif return border; } void -v3dX(pack_sampler_state)(struct v3dv_sampler *sampler, +v3dX(pack_sampler_state)(const struct v3dv_device *device, + struct v3dv_sampler *sampler, const VkSamplerCreateInfo *pCreateInfo, const VkSamplerCustomBorderColorCreateInfoEXT *bc_info) { @@ -175,21 +225,6 @@ v3dX(pack_sampler_state)(struct v3dv_sampler *sampler, break; } - /* For some texture formats, when clamping to transparent black border the - * CTS expects alpha to be set to 1 instead of 0, but the border color mode - * will take priority over the texture state swizzle, so the only way to - * fix that is to apply a swizzle in the shader. Here we keep track of - * whether we are activating that mode and we will decide if we need to - * activate the texture swizzle lowering in the shader key at compile time - * depending on the actual texture format. 
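Applying the fixed-up swizzle to the user's border color is a component-wise remap. A standalone sketch follows; the PIPE_SWIZZLE_* constants are re-declared locally (following Gallium's 0..5 encoding) so the snippet is self-contained.

#include <stdint.h>

/* Gallium's PIPE_SWIZZLE_* encoding: X..W = 0..3, then 0 = 4, 1 = 5. */
enum { SWZ_X, SWZ_Y, SWZ_Z, SWZ_W, SWZ_0, SWZ_1 };

/* Remap a border color through a format swizzle: X..W select a source
 * component; constant channels (0/1) are written as 0 here, mirroring
 * the "swizzle[i] <= 3" check in encode_border_color().
 *
 * Example: the B/G/R/A-style remap {SWZ_Z, SWZ_Y, SWZ_X, SWZ_W} swaps
 * the R and B components of the incoming color.
 */
static void
swizzle_border_color(uint32_t dst[4], const uint32_t src[4],
                     const uint8_t swizzle[4])
{
   for (int i = 0; i < 4; i++)
      dst[i] = swizzle[i] <= SWZ_W ? src[swizzle[i]] : 0;
}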
- */ - if ((pCreateInfo->addressModeU == VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER || - pCreateInfo->addressModeV == VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER || - pCreateInfo->addressModeW == VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER) && - border_color_mode == V3D_BORDER_COLOR_0000) { - sampler->clamp_to_transparent_black_border = true; - } - v3dvx_pack(sampler->sampler_state, SAMPLER_STATE, s) { if (pCreateInfo->anisotropyEnable) { s.anisotropy_enable = true; @@ -204,7 +239,7 @@ v3dX(pack_sampler_state)(struct v3dv_sampler *sampler, s.border_color_mode = border_color_mode; if (s.border_color_mode == V3D_BORDER_COLOR_FOLLOWS) { - union pipe_color_union border = encode_border_color(bc_info); + union pipe_color_union border = encode_border_color(device, bc_info); s.border_color_word_0 = border.ui[0]; s.border_color_word_1 = border.ui[1]; @@ -238,12 +273,15 @@ v3dX(pack_sampler_state)(struct v3dv_sampler *sampler, void v3dX(framebuffer_compute_internal_bpp_msaa)( const struct v3dv_framebuffer *framebuffer, + const struct v3dv_cmd_buffer_attachment_state *attachments, const struct v3dv_subpass *subpass, - uint8_t *max_bpp, + uint8_t *max_internal_bpp, + uint8_t *total_color_bpp, bool *msaa) { STATIC_ASSERT(V3D_INTERNAL_BPP_32 == 0); - *max_bpp = V3D_INTERNAL_BPP_32; + *max_internal_bpp = V3D_INTERNAL_BPP_32; + *total_color_bpp = 0; *msaa = false; if (subpass) { @@ -252,11 +290,15 @@ v3dX(framebuffer_compute_internal_bpp_msaa)( if (att_idx == VK_ATTACHMENT_UNUSED) continue; - const struct v3dv_image_view *att = framebuffer->attachments[att_idx]; + const struct v3dv_image_view *att = attachments[att_idx].image_view; assert(att); + assert(att->plane_count == 1); - if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) - *max_bpp = MAX2(*max_bpp, att->internal_bpp); + if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) { + const uint32_t internal_bpp = att->planes[0].internal_bpp; + *max_internal_bpp = MAX2(*max_internal_bpp, internal_bpp); + *total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp); + } if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT) *msaa = true; @@ -264,23 +306,26 @@ v3dX(framebuffer_compute_internal_bpp_msaa)( if (!*msaa && subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) { const struct v3dv_image_view *att = - framebuffer->attachments[subpass->ds_attachment.attachment]; + attachments[subpass->ds_attachment.attachment].image_view; assert(att); if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT) *msaa = true; } - return; } assert(framebuffer->attachment_count <= 4); for (uint32_t i = 0; i < framebuffer->attachment_count; i++) { - const struct v3dv_image_view *att = framebuffer->attachments[i]; + const struct v3dv_image_view *att = attachments[i].image_view; assert(att); + assert(att->plane_count == 1); - if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) - *max_bpp = MAX2(*max_bpp, att->internal_bpp); + if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) { + const uint32_t internal_bpp = att->planes[0].internal_bpp; + *max_internal_bpp = MAX2(*max_internal_bpp, internal_bpp); + *total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp); + } if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT) *msaa = true; @@ -342,7 +387,7 @@ v3dX(get_hw_clear_color)(const VkClearColorValue *color, } } -#ifdef DEBUG +#if MESA_DEBUG void v3dX(device_check_prepacked_sizes)(void) { diff --git a/src/broadcom/vulkan/v3dvx_formats.c b/src/broadcom/vulkan/v3dvx_formats.c index 4f77dd0086a..4fe548faee0 100644 --- a/src/broadcom/vulkan/v3dvx_formats.c +++ b/src/broadcom/vulkan/v3dvx_formats.c @@ -1,5 
+1,5 @@ /* - * Copyright © 2021 Raspberry Pi + * Copyright © 2021 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -26,6 +26,9 @@ #include "broadcom/cle/v3dx_pack.h" #include "util/format/u_format.h" +#include "vk_enum_to_str.h" +#include "vk_enum_defines.h" +#include "vk_util.h" #define SWIZ(x,y,z,w) { \ PIPE_SWIZZLE_##x, \ @@ -35,15 +38,34 @@ } #define FORMAT(vk, rt, tex, swiz, return_size, supports_filtering) \ - [VK_FORMAT_##vk] = { \ - true, \ - V3D_OUTPUT_IMAGE_FORMAT_##rt, \ - TEXTURE_DATA_FORMAT_##tex, \ - swiz, \ - return_size, \ + [VK_ENUM_OFFSET(VK_FORMAT_##vk)] = { \ + 1, \ + {{ \ + V3D_OUTPUT_IMAGE_FORMAT_##rt, \ + TEXTURE_DATA_FORMAT_##tex, \ + swiz, \ + return_size, \ + }}, \ supports_filtering, \ } +#define PLANE(rt, tex, swiz, return_size) \ + { \ + V3D_OUTPUT_IMAGE_FORMAT_##rt, \ + TEXTURE_DATA_FORMAT_##tex, \ + swiz, \ + return_size \ + } + +#define YCBCR_FORMAT(vk, supports_filtering, plane_count, ...) \ + [VK_ENUM_OFFSET(VK_FORMAT_##vk)] = { \ + plane_count, \ + { \ + __VA_ARGS__, \ + }, \ + supports_filtering, \ + } + #define SWIZ_X001 SWIZ(X, 0, 0, 1) #define SWIZ_XY01 SWIZ(X, Y, 0, 1) #define SWIZ_XYZ1 SWIZ(X, Y, Z, 1) @@ -57,6 +79,7 @@ #define SWIZ_XXXX SWIZ(X, X, X, X) #define SWIZ_000X SWIZ(0, 0, 0, X) #define SWIZ_WXYZ SWIZ(W, X, Y, Z) +#define SWIZ_WZYX SWIZ(W, Z, Y, X) /* FIXME: expand format table to describe whether the format is supported * for buffer surfaces (texel buffers, vertex buffers, etc). @@ -132,6 +155,7 @@ static const struct v3dv_format format_table[] = { FORMAT(A8B8G8R8_SRGB_PACK32, SRGB8_ALPHA8, RGBA8, SWIZ_XYZW, 16, true), /* RGBA8 sRGB */ FORMAT(A2B10G10R10_UNORM_PACK32,RGB10_A2, RGB10_A2, SWIZ_XYZW, 16, true), FORMAT(A2B10G10R10_UINT_PACK32, RGB10_A2UI, RGB10_A2UI, SWIZ_XYZW, 16, false), + FORMAT(A2R10G10B10_UNORM_PACK32,RGB10_A2, RGB10_A2, SWIZ_ZYXW, 16, true), FORMAT(E5B9G9R9_UFLOAT_PACK32, NO, RGB9_E5, SWIZ_XYZ1, 16, true), FORMAT(B10G11R11_UFLOAT_PACK32, R11F_G11F_B10F,R11F_G11F_B10F, SWIZ_XYZ1, 16, true), @@ -196,13 +220,61 @@ static const struct v3dv_format format_table[] = { FORMAT(ASTC_12x12_SRGB_BLOCK, NO, ASTC_12X12, SWIZ_XYZW, 16, true), }; +/** + * Vulkan layout for 4444 formats is defined like this: + * + * Vulkan ABGR4: (LSB) R | G | B | A (MSB) + * Vulkan ARGB4: (LSB) B | G | R | A (MSB) + * + * We map this to the V3D RGB4 texture format, which really, is ABGR4 with + * R in the MSB, so: + * + * V3D ABGR4 : (LSB) A | B | G | R (MSB) + * + * Which is reversed from Vulkan's ABGR4 layout. 
So in order to match Vulkan + * semantics we need to apply the following swizzles: + * + * ABGR4: WZYX (reverse) + * ARGB4: YZWX (reverse + swap R/B) + */ +static const struct v3dv_format format_table_4444[] = { + FORMAT(A4B4G4R4_UNORM_PACK16, ABGR4444, RGBA4, SWIZ_WZYX, 16, true), /* Reverse */ + FORMAT(A4R4G4B4_UNORM_PACK16, ABGR4444, RGBA4, SWIZ_YZWX, 16, true), /* Reverse + RB swap */ +}; + +static const struct v3dv_format format_table_ycbcr[] = { + YCBCR_FORMAT(G8_B8R8_2PLANE_420_UNORM, false, 2, + PLANE(R8, R8, SWIZ(X, 0, 0, 1), 16), + PLANE(RG8, RG8, SWIZ(X, Y, 0, 1), 16) + ), + YCBCR_FORMAT(G8_B8_R8_3PLANE_420_UNORM, false, 3, + PLANE(R8, R8, SWIZ(X, 0, 0, 1), 16), + PLANE(R8, R8, SWIZ(X, 0, 0, 1), 16), + PLANE(R8, R8, SWIZ(X, 0, 0, 1), 16) + ), +}; + const struct v3dv_format * v3dX(get_format)(VkFormat format) { - if (format < ARRAY_SIZE(format_table) && format_table[format].supported) + /* Core formats */ + if (format < ARRAY_SIZE(format_table) && format_table[format].plane_count) return &format_table[format]; - else + + uint32_t ext_number = VK_ENUM_EXTENSION(format); + uint32_t enum_offset = VK_ENUM_OFFSET(format); + + switch (ext_number) { + case _VK_EXT_4444_formats_number: + return &format_table_4444[enum_offset]; + case _VK_KHR_sampler_ycbcr_conversion_number: + if (enum_offset < ARRAY_SIZE(format_table_ycbcr)) + return &format_table_ycbcr[enum_offset]; + else + return NULL; + default: return NULL; + } } void @@ -339,18 +411,32 @@ bool v3dX(format_supports_tlb_resolve)(const struct v3dv_format *format) { uint32_t type, bpp; - v3dX(get_internal_type_bpp_for_output_format)(format->rt_type, &type, &bpp); + + /* Multiplanar images cannot be multisampled: + * + * "sampleCounts will be set to VK_SAMPLE_COUNT_1_BIT if at least one of + * the following conditions is true: (...) format is one of the formats + * that require a sampler Y′CBCR conversion (...)" + */ + if (!format->plane_count || format->plane_count > 1) + return false; + + v3dX(get_internal_type_bpp_for_output_format)(format->planes[0].rt_type, &type, &bpp); return type == V3D_INTERNAL_TYPE_8 || type == V3D_INTERNAL_TYPE_16F; } bool v3dX(format_supports_blending)(const struct v3dv_format *format) { + /* ycbcr formats don't support blending */ + if (!format->plane_count || format->plane_count > 1) + return false; + /* Hardware blending is only supported on render targets that are configured * 4x8-bit unorm, 2x16-bit float or 4x16-bit float. */ uint32_t type, bpp; - v3dX(get_internal_type_bpp_for_output_format)(format->rt_type, &type, &bpp); + v3dX(get_internal_type_bpp_for_output_format)(format->planes[0].rt_type, &type, &bpp); switch (type) { case V3D_INTERNAL_TYPE_8: return bpp == V3D_INTERNAL_BPP_32; @@ -426,23 +512,17 @@ v3dX(get_internal_type_bpp_for_image_aspects)(VkFormat vk_format, uint32_t *internal_type, uint32_t *internal_bpp) { - const VkImageAspectFlags ds_aspects = VK_IMAGE_ASPECT_DEPTH_BIT | - VK_IMAGE_ASPECT_STENCIL_BIT; - /* We can't store depth/stencil pixel formats to a raster format, so - * so instead we load our depth/stencil aspects to a compatible color - * format. + * instead we load our depth/stencil aspects to a compatible color format. */ - /* FIXME: pre-compute this at image creation time? 
*/ - if (aspect_mask & ds_aspects) { + if (aspect_mask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { + *internal_bpp = V3D_INTERNAL_BPP_32; switch (vk_format) { case VK_FORMAT_D16_UNORM: *internal_type = V3D_INTERNAL_TYPE_16UI; - *internal_bpp = V3D_INTERNAL_BPP_64; break; case VK_FORMAT_D32_SFLOAT: *internal_type = V3D_INTERNAL_TYPE_32F; - *internal_bpp = V3D_INTERNAL_BPP_128; break; case VK_FORMAT_X8_D24_UNORM_PACK32: case VK_FORMAT_D24_UNORM_S8_UINT: @@ -451,7 +531,6 @@ v3dX(get_internal_type_bpp_for_image_aspects)(VkFormat vk_format, * load command for more details. */ *internal_type = V3D_INTERNAL_TYPE_8UI; - *internal_bpp = V3D_INTERNAL_BPP_32; break; default: assert(!"unsupported format"); @@ -459,7 +538,9 @@ v3dX(get_internal_type_bpp_for_image_aspects)(VkFormat vk_format, } } else { const struct v3dv_format *format = v3dX(get_format)(vk_format); - v3dX(get_internal_type_bpp_for_output_format)(format->rt_type, + /* We only expect this to be called for single-plane formats */ + assert(format->plane_count == 1); + v3dX(get_internal_type_bpp_for_output_format)(format->planes[0].rt_type, internal_type, internal_bpp); } } diff --git a/src/broadcom/vulkan/v3dvx_image.c b/src/broadcom/vulkan/v3dvx_image.c index a9aa0fb9797..de984e81220 100644 --- a/src/broadcom/vulkan/v3dvx_image.c +++ b/src/broadcom/vulkan/v3dvx_image.c @@ -1,5 +1,5 @@ /* - * Copyright © 2021 Raspberry Pi + * Copyright © 2021 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -26,32 +26,6 @@ #include "broadcom/cle/v3dx_pack.h" #include "broadcom/compiler/v3d_compiler.h" -#include "vk_format_info.h" - -/* - * This method translates pipe_swizzle to the swizzle values used at the - * packet TEXTURE_SHADER_STATE - * - * FIXME: C&P from v3d, common place? - */ -static uint32_t -translate_swizzle(unsigned char pipe_swizzle) -{ - switch (pipe_swizzle) { - case PIPE_SWIZZLE_0: - return 0; - case PIPE_SWIZZLE_1: - return 1; - case PIPE_SWIZZLE_X: - case PIPE_SWIZZLE_Y: - case PIPE_SWIZZLE_Z: - case PIPE_SWIZZLE_W: - return 2 + pipe_swizzle; - default: - unreachable("unknown swizzle"); - } -} - /* * Packs and ensure bo for the shader state (the latter can be temporal). */ @@ -71,78 +45,125 @@ pack_texture_shader_state_helper(struct v3dv_device *device, image->vk.samples == VK_SAMPLE_COUNT_4_BIT); const uint32_t msaa_scale = image->vk.samples == VK_SAMPLE_COUNT_1_BIT ? 
1 : 2; - v3dvx_pack(image_view->texture_shader_state[index], TEXTURE_SHADER_STATE, tex) { - - tex.level_0_is_strictly_uif = - (image->slices[0].tiling == V3D_TILING_UIF_XOR || - image->slices[0].tiling == V3D_TILING_UIF_NO_XOR); - - tex.level_0_xor_enable = (image->slices[0].tiling == V3D_TILING_UIF_XOR); - - if (tex.level_0_is_strictly_uif) - tex.level_0_ub_pad = image->slices[0].ub_pad; - - /* FIXME: v3d never sets uif_xor_disable, but uses it on the following - * check so let's set the default value - */ - tex.uif_xor_disable = false; - if (tex.uif_xor_disable || - tex.level_0_is_strictly_uif) { - tex.extended = true; - } - - tex.base_level = image_view->vk.base_mip_level; - tex.max_level = image_view->vk.base_mip_level + - image_view->vk.level_count - 1; - - tex.swizzle_r = translate_swizzle(image_view->swizzle[0]); - tex.swizzle_g = translate_swizzle(image_view->swizzle[1]); - tex.swizzle_b = translate_swizzle(image_view->swizzle[2]); - tex.swizzle_a = translate_swizzle(image_view->swizzle[3]); - - tex.texture_type = image_view->format->tex_type; - - if (image->vk.image_type == VK_IMAGE_TYPE_3D) { - tex.image_depth = image->vk.extent.depth; - } else { - tex.image_depth = image_view->vk.layer_count; + for (uint8_t plane = 0; plane < image_view->plane_count; plane++) { + uint8_t iplane = image_view->planes[plane].image_plane; + v3dvx_pack(image_view->planes[plane].texture_shader_state[index], TEXTURE_SHADER_STATE, tex) { + + tex.level_0_is_strictly_uif = + (image->planes[iplane].slices[0].tiling == V3D_TILING_UIF_XOR || + image->planes[iplane].slices[0].tiling == V3D_TILING_UIF_NO_XOR); + + tex.level_0_xor_enable = (image->planes[iplane].slices[0].tiling == V3D_TILING_UIF_XOR); + + if (tex.level_0_is_strictly_uif) + tex.level_0_ub_pad = image->planes[iplane].slices[0].ub_pad; + + /* FIXME: v3d never sets uif_xor_disable, but uses it on the following + * check so let's set the default value + */ + tex.uif_xor_disable = false; + if (tex.uif_xor_disable || + tex.level_0_is_strictly_uif) { + tex.extended = true; + } + + tex.base_level = image_view->vk.base_mip_level; + tex.max_level = image_view->vk.base_mip_level + + image_view->vk.level_count - 1; + + tex.swizzle_r = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[0]); + tex.swizzle_g = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[1]); + tex.swizzle_b = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[2]); + tex.swizzle_a = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[3]); + + tex.texture_type = image_view->format->planes[plane].tex_type; + + if (image->vk.image_type == VK_IMAGE_TYPE_3D) { + tex.image_depth = image->vk.extent.depth; + } else { + tex.image_depth = image_view->vk.layer_count; + } + + /* Empirical testing with CTS shows that when we are sampling from cube + * arrays we want to set image depth to layers / 6, but not when doing + * image load/store. + */ + if (image_view->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY && + !for_cube_map_array_storage) { + assert(tex.image_depth % 6 == 0); + tex.image_depth /= 6; + } + + tex.image_height = image->planes[iplane].height * msaa_scale; + tex.image_width = image->planes[iplane].width * msaa_scale; + + /* On 4.x, the height of a 1D texture is redefined to be the + * upper 14 bits of the width (which is only usable with txf). 
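The 14-bit width/height split used for 1D textures below is easier to see with a concrete number; here is a small worked sketch (the packing mirrors the code in this function, the helper itself is illustrative).

#include <assert.h>
#include <stdint.h>

/* For 1D textures the height field carries the upper 14 bits of the
 * width. E.g. width = 100000 (0x186A0) packs as height = 6 and
 * width = 0x06A0, since 6 * 16384 + 0x6A0 == 100000.
 */
static void
pack_1d_texture_size(uint32_t width,
                     uint32_t *hw_width, uint32_t *hw_height)
{
   *hw_height = width >> 14;
   *hw_width = width & ((1u << 14) - 1);
   assert(((*hw_height << 14) | *hw_width) == width);
}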
+ */
+ if (image->vk.image_type == VK_IMAGE_TYPE_1D)
+ tex.image_height = tex.image_width >> 14;
+
+ tex.image_width &= (1 << 14) - 1;
+ tex.image_height &= (1 << 14) - 1;
+
+ tex.array_stride_64_byte_aligned = image->planes[iplane].cube_map_stride / 64;
+
+ /* At this point we don't have the job. That's the reason the first
+ * parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to
+ * add the bo to the job. This also means that we need to manually add
+ * the image bo to the job using the texture.
+ */
+ const uint32_t base_offset =
+ image->planes[iplane].mem->bo->offset +
+ v3dv_layer_offset(image, 0, image_view->vk.base_array_layer,
+ iplane);
+ tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset);
+
+ bool is_srgb = vk_format_is_srgb(image_view->vk.format);
+
+ /* V3D 4.x doesn't have the reverse and swap_r/b bits, so we compose
+ * the reverse and/or swap_r/b swizzle from the format table with the
+ * image view swizzle. This, however, doesn't work for border colors,
+ * for that there is the reverse_standard_border_color.
+ *
+ * In v3d 7.x, however, there is no reverse_standard_border_color bit,
+ * since the reverse and swap_r/b bits also affect border colors. It is
+ * because of this that we absolutely need to use these bits with
+ * reversed and swapped formats, since that's the only way to ensure
+ * correct border colors. In that case we don't want to program the
+ * swizzle to the composition of the format swizzle and the view
+ * swizzle like we do in v3d 4.x, since the format swizzle is applied
+ * via the reverse and swap_r/b bits.
+ */
+#if V3D_VERSION == 42
+ tex.srgb = is_srgb;
+ tex.reverse_standard_border_color =
+ image_view->planes[plane].channel_reverse;
+#endif
+#if V3D_VERSION >= 71
+ tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE;
+
+ tex.reverse = image_view->planes[plane].channel_reverse;
+ tex.r_b_swap = image_view->planes[plane].swap_rb;
+
+ if (tex.reverse || tex.r_b_swap) {
+ tex.swizzle_r =
+ v3d_translate_pipe_swizzle(image_view->view_swizzle[0]);
+ tex.swizzle_g =
+ v3d_translate_pipe_swizzle(image_view->view_swizzle[1]);
+ tex.swizzle_b =
+ v3d_translate_pipe_swizzle(image_view->view_swizzle[2]);
+ tex.swizzle_a =
+ v3d_translate_pipe_swizzle(image_view->view_swizzle[3]);
+ }
+
+ tex.chroma_offset_x = 1;
+ tex.chroma_offset_y = 1;
+ /* See comment in XML field definition for rationale of the shifts */
+ tex.texture_base_pointer_cb = base_offset >> 6;
+ tex.texture_base_pointer_cr = base_offset >> 6;
+#endif
 }
-
- /* Empirical testing with CTS shows that when we are sampling from cube
- * arrays we want to set image depth to layers / 6, but not when doing
- * image load/store.
- */
- if (image_view->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY &&
- !for_cube_map_array_storage) {
- assert(tex.image_depth % 6 == 0);
- tex.image_depth /= 6;
- }
-
- tex.image_height = image->vk.extent.height * msaa_scale;
- tex.image_width = image->vk.extent.width * msaa_scale;
-
- /* On 4.x, the height of a 1D texture is redefined to be the
- * upper 14 bits of the width (which is only usable with txf).
- */
- if (image->vk.image_type == VK_IMAGE_TYPE_1D) {
- tex.image_height = tex.image_width >> 14;
- }
- tex.image_width &= (1 << 14) - 1;
- tex.image_height &= (1 << 14) - 1;
-
- tex.array_stride_64_byte_aligned = image->cube_map_stride / 64;
-
- tex.srgb = vk_format_is_srgb(image_view->vk.format);
-
- /* At this point we don't have the job.
That's the reason the first - * parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to - * add the bo to the job. This also means that we need to add manually - * the image bo to the job using the texture. - */ - const uint32_t base_offset = - image->mem->bo->offset + - v3dv_layer_offset(image, 0, image_view->vk.base_array_layer); - tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset); } } @@ -163,10 +184,14 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device, const struct v3dv_buffer *buffer = buffer_view->buffer; v3dvx_pack(buffer_view->texture_shader_state, TEXTURE_SHADER_STATE, tex) { - tex.swizzle_r = translate_swizzle(PIPE_SWIZZLE_X); - tex.swizzle_g = translate_swizzle(PIPE_SWIZZLE_Y); - tex.swizzle_b = translate_swizzle(PIPE_SWIZZLE_Z); - tex.swizzle_a = translate_swizzle(PIPE_SWIZZLE_W); + tex.swizzle_r = + v3d_translate_pipe_swizzle(buffer_view->format->planes[0].swizzle[0]); + tex.swizzle_g = + v3d_translate_pipe_swizzle(buffer_view->format->planes[0].swizzle[1]); + tex.swizzle_b = + v3d_translate_pipe_swizzle(buffer_view->format->planes[0].swizzle[2]); + tex.swizzle_a = + v3d_translate_pipe_swizzle(buffer_view->format->planes[0].swizzle[3]); tex.image_depth = 1; @@ -180,8 +205,16 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device, tex.image_width &= (1 << 14) - 1; tex.image_height &= (1 << 14) - 1; - tex.texture_type = buffer_view->format->tex_type; - tex.srgb = vk_format_is_srgb(buffer_view->vk_format); + assert(buffer_view->format->plane_count == 1); + tex.texture_type = buffer_view->format->planes[0].tex_type; + + bool is_srgb = vk_format_is_srgb(buffer_view->vk_format); +#if V3D_VERSION == 42 + tex.srgb = is_srgb; +#endif +#if V3D_VERSION >= 71 + tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE; +#endif /* At this point we don't have the job. 
That's the reason the first * parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to @@ -194,5 +227,13 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device, buffer_view->offset; tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset); + +#if V3D_VERSION >= 71 + tex.chroma_offset_x = 1; + tex.chroma_offset_y = 1; + /* See comment in XML field definition for rationale of the shifts */ + tex.texture_base_pointer_cb = base_offset >> 6; + tex.texture_base_pointer_cr = base_offset >> 6; +#endif } } diff --git a/src/broadcom/vulkan/v3dvx_meta_common.c b/src/broadcom/vulkan/v3dvx_meta_common.c index 2f79e4e9c32..858096f9e4b 100644 --- a/src/broadcom/vulkan/v3dvx_meta_common.c +++ b/src/broadcom/vulkan/v3dvx_meta_common.c @@ -1,5 +1,5 @@ /* - * Copyright © 2021 Raspberry Pi + * Copyright © 2021 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -25,11 +25,11 @@ #include "v3dv_meta_common.h" #include "broadcom/common/v3d_macros.h" +#include "broadcom/common/v3d_tfu.h" +#include "broadcom/common/v3d_util.h" #include "broadcom/cle/v3dx_pack.h" #include "broadcom/compiler/v3d_compiler.h" -#include "vk_format_info.h" - struct rcl_clear_info { const union v3dv_clear_value *clear_value; struct v3dv_image *image; @@ -51,25 +51,46 @@ emit_rcl_prologue(struct v3dv_job *job, if (job->cmd_buffer->state.oom) return NULL; + assert(!tiling->msaa || !tiling->double_buffer); cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) { config.early_z_disable = true; config.image_width_pixels = tiling->width; config.image_height_pixels = tiling->height; config.number_of_render_targets = 1; config.multisample_mode_4x = tiling->msaa; + config.double_buffer_in_non_ms_mode = tiling->double_buffer; +#if V3D_VERSION == 42 config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; +#endif +#if V3D_VERSION >= 71 + config.log2_tile_width = log2_tile_size(tiling->tile_width); + config.log2_tile_height = log2_tile_size(tiling->tile_height); + /* FIXME: ideallly we would like next assert on the packet header (as is + * general, so also applies to GL). We would need to expand + * gen_pack_header for that. 
+ */ + assert(config.log2_tile_width == config.log2_tile_height || + config.log2_tile_width == config.log2_tile_height + 1); +#endif config.internal_depth_type = fb->internal_depth_type; } + const uint32_t *color = NULL; if (clear_info && (clear_info->aspects & VK_IMAGE_ASPECT_COLOR_BIT)) { - uint32_t clear_pad = 0; + UNUSED uint32_t clear_pad = 0; if (clear_info->image) { const struct v3dv_image *image = clear_info->image; + + /* From vkCmdClearColorImage: + * "image must not use any of the formats that require a sampler + * YCBCR conversion" + */ + assert(image->plane_count == 1); const struct v3d_resource_slice *slice = - &image->slices[clear_info->level]; + &image->planes[0].slices[clear_info->level]; if (slice->tiling == V3D_TILING_UIF_NO_XOR || slice->tiling == V3D_TILING_UIF_XOR) { - int uif_block_height = v3d_utile_height(image->cpp) * 2; + int uif_block_height = v3d_utile_height(image->planes[0].cpp) * 2; uint32_t implicit_padded_height = align(tiling->height, uif_block_height) / uif_block_height; @@ -81,7 +102,9 @@ emit_rcl_prologue(struct v3dv_job *job, } } - const uint32_t *color = &clear_info->clear_value->color[0]; + color = &clear_info->clear_value->color[0]; + +#if V3D_VERSION == 42 cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) { clear.clear_color_low_32_bits = color[0]; clear.clear_color_next_24_bits = color[1] & 0x00ffffff; @@ -105,13 +128,49 @@ emit_rcl_prologue(struct v3dv_job *job, clear.render_target_number = 0; }; } +#endif } +#if V3D_VERSION == 42 cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { rt.render_target_0_internal_bpp = tiling->internal_bpp; rt.render_target_0_internal_type = fb->internal_type; rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE; } +#endif + +#if V3D_VERSION >= 71 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { + if (color) + rt.clear_color_low_bits = color[0]; + rt.internal_bpp = tiling->internal_bpp; + rt.internal_type_and_clamping = v3dX(clamp_for_format_and_type)(fb->internal_type, + fb->vk_format); + rt.stride = + v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width, + v3d_internal_bpp_words(rt.internal_bpp)); + rt.base_address = 0; + rt.render_target_number = 0; + } + + if (color && tiling->internal_bpp >= V3D_INTERNAL_BPP_64) { + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) { + rt.clear_color_mid_bits = /* 40 bits (32 + 8) */ + ((uint64_t) color[1]) | + (((uint64_t) (color[2] & 0xff)) << 32); + rt.render_target_number = 0; + } + } + + if (color && tiling->internal_bpp >= V3D_INTERNAL_BPP_128) { + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) { + rt.clear_color_top_bits = /* 56 bits (24 + 32) */ + (((uint64_t) (color[2] & 0xffffff00)) >> 8) | + (((uint64_t) (color[3])) << 24); + rt.render_target_number = 0; + } + } +#endif cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) { clear.z_clear_value = clear_info ? clear_info->clear_value->z : 1.0f; @@ -167,11 +226,20 @@ emit_frame_setup(struct v3dv_job *job, cl_emit(rcl, STORE_TILE_BUFFER_GENERAL, store) { store.buffer_to_store = NONE; } - if (clear_value && i == 0) { + /* When using double-buffering, we need to clear both buffers (unless + * we only have a single tile to render). 
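With double buffering the initial tile state has to clear both tile buffers. A minimal sketch of the loop shape follows; emit_clear() is a stand-in for the CLEAR packet emission, and the single-tile exception mirrors the i == 0 || v3dv_do_double_initial_tile_clear(tiling) condition in the code below.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for emitting a CLEAR packet for one tile buffer. */
static void
emit_clear(int tile_buffer)
{
   printf("clear tile buffer %d\n", tile_buffer);
}

/* Tile buffer 0 is always cleared; with double buffering and more than
 * one tile, buffer 1 needs its own clear because the hardware alternates
 * buffers between tiles.
 */
static void
emit_initial_clears(bool clear_value, bool double_buffer, bool single_tile)
{
   for (int i = 0; i < 2; i++) {
      if (clear_value && (i == 0 || (double_buffer && !single_tile)))
         emit_clear(i);
   }
}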
+ */ + if (clear_value && + (i == 0 || v3dv_do_double_initial_tile_clear(tiling))) { +#if V3D_VERSION == 42 cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) { clear.clear_z_stencil_buffer = true; clear.clear_all_render_targets = true; } +#endif +#if V3D_VERSION >= 71 + cl_emit(rcl, CLEAR_RENDER_TARGETS, clear); +#endif } cl_emit(rcl, END_OF_TILE_MARKER, end); } @@ -254,6 +322,9 @@ choose_tlb_format(struct v3dv_meta_framebuffer *framebuffer, bool is_copy_to_buffer, bool is_copy_from_buffer) { + /* At this point the framebuffer was already lowered to single-plane */ + assert(framebuffer->format->plane_count == 1); + if (is_copy_to_buffer || is_copy_from_buffer) { switch (framebuffer->vk_format) { case VK_FORMAT_D16_UNORM: @@ -295,11 +366,11 @@ choose_tlb_format(struct v3dv_meta_framebuffer *framebuffer, } } default: /* Color formats */ - return framebuffer->format->rt_type; + return framebuffer->format->planes[0].rt_type; break; } } else { - return framebuffer->format->rt_type; + return framebuffer->format->planes[0].rt_type; } } @@ -307,8 +378,24 @@ static inline bool format_needs_rb_swap(struct v3dv_device *device, VkFormat format) { - const uint8_t *swizzle = v3dv_get_format_swizzle(device, format); - return swizzle[0] == PIPE_SWIZZLE_Z; + /* We are calling these methods for framebuffer formats, that at this point + * should be single-plane + */ + assert(vk_format_get_plane_count(format) == 1); + const uint8_t *swizzle = v3dv_get_format_swizzle(device, format, 0); + return v3dv_format_swizzle_needs_rb_swap(swizzle); +} + +static inline bool +format_needs_reverse(struct v3dv_device *device, + VkFormat format) +{ + /* We are calling these methods for framebuffer formats, that at this point + * should be single-plane + */ + assert(vk_format_get_plane_count(format) == 1); + const uint8_t *swizzle = v3dv_get_format_swizzle(device, format, 0); + return v3dv_format_swizzle_needs_reverse(swizzle); } static void @@ -322,22 +409,29 @@ emit_image_load(struct v3dv_device *device, bool is_copy_to_buffer, bool is_copy_from_buffer) { - uint32_t layer_offset = v3dv_layer_offset(image, mip_level, layer); + uint8_t plane = v3dv_plane_from_aspect(aspect); + uint32_t layer_offset = v3dv_layer_offset(image, mip_level, layer, plane); + /* For multi-plane formats we are copying plane by plane to the color + * tlb. Framebuffer format was already selected to be a tlb single-plane + * compatible format. We still need to use the real plane to get the + * address etc from the source image. + */ + assert(framebuffer->format->plane_count == 1); /* For image to/from buffer copies we always load to and store from RT0, * even for depth/stencil aspects, because the hardware can't do raster * stores or loads from/to the depth/stencil tile buffers. */ bool load_to_color_tlb = is_copy_to_buffer || is_copy_from_buffer || + image->format->plane_count > 1 || aspect == VK_IMAGE_ASPECT_COLOR_BIT; - const struct v3d_resource_slice *slice = &image->slices[mip_level]; + const struct v3d_resource_slice *slice = &image->planes[plane].slices[mip_level]; cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) { load.buffer_to_load = load_to_color_tlb ? 
RENDER_TARGET_0 : v3dX(zs_buffer_from_aspect_bits)(aspect); - load.address = v3dv_cl_address(image->mem->bo, layer_offset); - + load.address = v3dv_cl_address(image->planes[plane].mem->bo, layer_offset); load.input_image_format = choose_tlb_format(framebuffer, aspect, false, is_copy_to_buffer, is_copy_from_buffer); @@ -374,6 +468,7 @@ emit_image_load(struct v3dv_device *device, * so we need to make sure we respect the format swizzle. */ needs_rb_swap = format_needs_rb_swap(device, framebuffer->vk_format); + needs_chan_reverse = format_needs_reverse(device, framebuffer->vk_format); } load.r_b_swap = needs_rb_swap; @@ -406,17 +501,28 @@ emit_image_store(struct v3dv_device *device, bool is_copy_from_buffer, bool is_multisample_resolve) { - uint32_t layer_offset = v3dv_layer_offset(image, mip_level, layer); + uint8_t plane = v3dv_plane_from_aspect(aspect); + uint32_t layer_offset = v3dv_layer_offset(image, mip_level, layer, plane); + + /* + * For multi-plane formats we are copying plane by plane to the color + * tlb. Framebuffer format was already selected to be a tlb single-plane + * compatible format. We still need to use the real plane to get the + * address etc. + */ + assert(framebuffer->format->plane_count == 1); bool store_from_color_tlb = is_copy_to_buffer || is_copy_from_buffer || + image->format->plane_count > 1 || aspect == VK_IMAGE_ASPECT_COLOR_BIT; - const struct v3d_resource_slice *slice = &image->slices[mip_level]; + const struct v3d_resource_slice *slice = &image->planes[plane].slices[mip_level]; cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) { store.buffer_to_store = store_from_color_tlb ? RENDER_TARGET_0 : v3dX(zs_buffer_from_aspect_bits)(aspect); - store.address = v3dv_cl_address(image->mem->bo, layer_offset); + store.address = v3dv_cl_address(image->planes[plane].mem->bo, layer_offset); + store.clear_buffer_being_stored = false; /* See rationale in emit_image_load() */ @@ -431,6 +537,7 @@ emit_image_store(struct v3dv_device *device, } else if (!is_copy_from_buffer && !is_copy_to_buffer && (aspect & VK_IMAGE_ASPECT_COLOR_BIT)) { needs_rb_swap = format_needs_rb_swap(device, framebuffer->vk_format); + needs_chan_reverse = format_needs_reverse(device, framebuffer->vk_format); } store.r_b_swap = needs_rb_swap; @@ -463,7 +570,7 @@ emit_copy_layer_to_buffer_per_tile_list(struct v3dv_job *job, struct v3dv_buffer *buffer, struct v3dv_image *image, uint32_t layer_offset, - const VkBufferImageCopy2KHR *region) + const VkBufferImageCopy2 *region) { struct v3dv_cl *cl = &job->indirect; v3dv_cl_ensure_space(cl, 200, 1); @@ -512,9 +619,10 @@ emit_copy_layer_to_buffer_per_tile_list(struct v3dv_job *job, * Vulkan spec states that the output buffer must have packed stencil * values, where each stencil value is 1 byte. */ + uint8_t plane = v3dv_plane_from_aspect(region->imageSubresource.aspectMask); uint32_t cpp = region->imageSubresource.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT ? 
- 1 : image->cpp; + 1 : image->planes[plane].cpp; uint32_t buffer_stride = width * cpp; uint32_t buffer_offset = buffer->mem_offset + region->bufferOffset + height * buffer_stride * layer_offset; @@ -543,7 +651,7 @@ emit_copy_layer_to_buffer(struct v3dv_job *job, struct v3dv_image *image, struct v3dv_meta_framebuffer *framebuffer, uint32_t layer, - const VkBufferImageCopy2KHR *region) + const VkBufferImageCopy2 *region) { emit_copy_layer_to_buffer_per_tile_list(job, framebuffer, buffer, image, layer, region); @@ -555,7 +663,7 @@ v3dX(meta_emit_copy_image_to_buffer_rcl)(struct v3dv_job *job, struct v3dv_buffer *buffer, struct v3dv_image *image, struct v3dv_meta_framebuffer *framebuffer, - const VkBufferImageCopy2KHR *region) + const VkBufferImageCopy2 *region) { struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL); v3dv_return_if_oom(NULL, job); @@ -572,7 +680,7 @@ emit_resolve_image_layer_per_tile_list(struct v3dv_job *job, struct v3dv_image *dst, struct v3dv_image *src, uint32_t layer_offset, - const VkImageResolve2KHR *region) + const VkImageResolve2 *region) { struct v3dv_cl *cl = &job->indirect; v3dv_cl_ensure_space(cl, 200, 1); @@ -608,11 +716,14 @@ emit_resolve_image_layer_per_tile_list(struct v3dv_job *job, region->dstSubresource.baseArrayLayer + layer_offset : region->dstOffset.z + layer_offset; + bool is_depth_or_stencil = + region->dstSubresource.aspectMask & + (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT); emit_image_store(job->device, cl, framebuffer, dst, region->dstSubresource.aspectMask, dst_layer, region->dstSubresource.mipLevel, - false, false, true); + false, false, !is_depth_or_stencil); cl_emit(cl, END_OF_TILE_MARKER, end); @@ -630,7 +741,7 @@ emit_resolve_image_layer(struct v3dv_job *job, struct v3dv_image *src, struct v3dv_meta_framebuffer *framebuffer, uint32_t layer, - const VkImageResolve2KHR *region) + const VkImageResolve2 *region) { emit_resolve_image_layer_per_tile_list(job, framebuffer, dst, src, layer, region); @@ -642,7 +753,7 @@ v3dX(meta_emit_resolve_image_rcl)(struct v3dv_job *job, struct v3dv_image *dst, struct v3dv_image *src, struct v3dv_meta_framebuffer *framebuffer, - const VkImageResolve2KHR *region) + const VkImageResolve2 *region) { struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL); v3dv_return_if_oom(NULL, job); @@ -733,7 +844,7 @@ emit_copy_image_layer_per_tile_list(struct v3dv_job *job, struct v3dv_image *dst, struct v3dv_image *src, uint32_t layer_offset, - const VkImageCopy2KHR *region) + const VkImageCopy2 *region) { struct v3dv_cl *cl = &job->indirect; v3dv_cl_ensure_space(cl, 200, 1); @@ -791,7 +902,7 @@ emit_copy_image_layer(struct v3dv_job *job, struct v3dv_image *src, struct v3dv_meta_framebuffer *framebuffer, uint32_t layer, - const VkImageCopy2KHR *region) + const VkImageCopy2 *region) { emit_copy_image_layer_per_tile_list(job, framebuffer, dst, src, layer, region); emit_supertile_coordinates(job, framebuffer); @@ -802,7 +913,7 @@ v3dX(meta_emit_copy_image_rcl)(struct v3dv_job *job, struct v3dv_image *dst, struct v3dv_image *src, struct v3dv_meta_framebuffer *framebuffer, - const VkImageCopy2KHR *region) + const VkImageCopy2 *region) { struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL); v3dv_return_if_oom(NULL, job); @@ -815,79 +926,108 @@ v3dX(meta_emit_copy_image_rcl)(struct v3dv_job *job, void v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_image *dst, - uint32_t dst_mip_level, - uint32_t dst_layer, - struct v3dv_image *src, - uint32_t src_mip_level, - 
uint32_t src_layer, + uint32_t dst_bo_handle, + uint32_t dst_offset, + enum v3d_tiling_mode dst_tiling, + uint32_t dst_padded_height_or_stride, + uint32_t dst_cpp, + uint32_t src_bo_handle, + uint32_t src_offset, + enum v3d_tiling_mode src_tiling, + uint32_t src_padded_height_or_stride, + uint32_t src_cpp, uint32_t width, uint32_t height, - const struct v3dv_format *format) + const struct v3dv_format_plane *format_plane) { - const struct v3d_resource_slice *src_slice = &src->slices[src_mip_level]; - const struct v3d_resource_slice *dst_slice = &dst->slices[dst_mip_level]; - - assert(dst->mem && dst->mem->bo); - const struct v3dv_bo *dst_bo = dst->mem->bo; - - assert(src->mem && src->mem->bo); - const struct v3dv_bo *src_bo = src->mem->bo; - struct drm_v3d_submit_tfu tfu = { .ios = (height << 16) | width, .bo_handles = { - dst_bo->handle, - src_bo->handle != dst_bo->handle ? src_bo->handle : 0 + dst_bo_handle, + src_bo_handle != dst_bo_handle ? src_bo_handle : 0 }, }; - const uint32_t src_offset = - src_bo->offset + v3dv_layer_offset(src, src_mip_level, src_layer); tfu.iia |= src_offset; - uint32_t icfg; - if (src_slice->tiling == V3D_TILING_RASTER) { - icfg = V3D_TFU_ICFG_FORMAT_RASTER; +#if V3D_VERSION <= 42 + if (src_tiling == V3D_TILING_RASTER) { + tfu.icfg = V3D33_TFU_ICFG_FORMAT_RASTER << V3D33_TFU_ICFG_FORMAT_SHIFT; + } else { + tfu.icfg = (V3D33_TFU_ICFG_FORMAT_LINEARTILE + + (src_tiling - V3D_TILING_LINEARTILE)) << + V3D33_TFU_ICFG_FORMAT_SHIFT; + } + tfu.icfg |= format_plane->tex_type << V3D33_TFU_ICFG_TTYPE_SHIFT; +#endif +#if V3D_VERSION >= 71 + if (src_tiling == V3D_TILING_RASTER) { + tfu.icfg = V3D71_TFU_ICFG_FORMAT_RASTER << V3D71_TFU_ICFG_IFORMAT_SHIFT; } else { - icfg = V3D_TFU_ICFG_FORMAT_LINEARTILE + - (src_slice->tiling - V3D_TILING_LINEARTILE); + tfu.icfg = (V3D71_TFU_ICFG_FORMAT_LINEARTILE + + (src_tiling - V3D_TILING_LINEARTILE)) << + V3D71_TFU_ICFG_IFORMAT_SHIFT; } - tfu.icfg |= icfg << V3D_TFU_ICFG_FORMAT_SHIFT; + tfu.icfg |= format_plane->tex_type << V3D71_TFU_ICFG_OTYPE_SHIFT; +#endif - const uint32_t dst_offset = - dst_bo->offset + v3dv_layer_offset(dst, dst_mip_level, dst_layer); - tfu.ioa |= dst_offset; + tfu.ioa = dst_offset; - tfu.ioa |= (V3D_TFU_IOA_FORMAT_LINEARTILE + - (dst_slice->tiling - V3D_TILING_LINEARTILE)) << - V3D_TFU_IOA_FORMAT_SHIFT; - tfu.icfg |= format->tex_type << V3D_TFU_ICFG_TTYPE_SHIFT; +#if V3D_VERSION <= 42 + tfu.ioa |= (V3D33_TFU_IOA_FORMAT_LINEARTILE + + (dst_tiling - V3D_TILING_LINEARTILE)) << + V3D33_TFU_IOA_FORMAT_SHIFT; +#endif - switch (src_slice->tiling) { +#if V3D_VERSION >= 71 + tfu.v71.ioc = (V3D71_TFU_IOC_FORMAT_LINEARTILE + + (dst_tiling - V3D_TILING_LINEARTILE)) << + V3D71_TFU_IOC_FORMAT_SHIFT; + + switch (dst_tiling) { case V3D_TILING_UIF_NO_XOR: case V3D_TILING_UIF_XOR: - tfu.iis |= src_slice->padded_height / (2 * v3d_utile_height(src->cpp)); + tfu.v71.ioc |= + (dst_padded_height_or_stride / (2 * v3d_utile_height(dst_cpp))) << + V3D71_TFU_IOC_STRIDE_SHIFT; break; case V3D_TILING_RASTER: - tfu.iis |= src_slice->stride / src->cpp; + tfu.v71.ioc |= (dst_padded_height_or_stride / dst_cpp) << + V3D71_TFU_IOC_STRIDE_SHIFT; break; default: break; } +#endif + switch (src_tiling) { + case V3D_TILING_UIF_NO_XOR: + case V3D_TILING_UIF_XOR: + tfu.iis |= src_padded_height_or_stride / (2 * v3d_utile_height(src_cpp)); + break; + case V3D_TILING_RASTER: + tfu.iis |= src_padded_height_or_stride / src_cpp; + break; + default: + break; + } + + /* The TFU can handle raster sources but always produces UIF results */ + assert(dst_tiling != 
V3D_TILING_RASTER); + +#if V3D_VERSION <= 42 /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the * OPAD field for the destination (how many extra UIF blocks beyond * those necessary to cover the height). */ - if (dst_slice->tiling == V3D_TILING_UIF_NO_XOR || - dst_slice->tiling == V3D_TILING_UIF_XOR) { - uint32_t uif_block_h = 2 * v3d_utile_height(dst->cpp); + if (dst_tiling == V3D_TILING_UIF_NO_XOR || dst_tiling == V3D_TILING_UIF_XOR) { + uint32_t uif_block_h = 2 * v3d_utile_height(dst_cpp); uint32_t implicit_padded_height = align(height, uif_block_h); - uint32_t icfg = - (dst_slice->padded_height - implicit_padded_height) / uif_block_h; - tfu.icfg |= icfg << V3D_TFU_ICFG_OPAD_SHIFT; + uint32_t icfg = (dst_padded_height_or_stride - implicit_padded_height) / + uif_block_h; + tfu.icfg |= icfg << V3D33_TFU_ICFG_OPAD_SHIFT; } +#endif v3dv_cmd_buffer_add_tfu_job(cmd_buffer, &tfu); } @@ -1042,7 +1182,7 @@ emit_copy_buffer_to_layer_per_tile_list(struct v3dv_job *job, struct v3dv_image *image, struct v3dv_buffer *buffer, uint32_t layer, - const VkBufferImageCopy2KHR *region) + const VkBufferImageCopy2 *region) { struct v3dv_cl *cl = &job->indirect; v3dv_cl_ensure_space(cl, 200, 1); @@ -1072,8 +1212,9 @@ emit_copy_buffer_to_layer_per_tile_list(struct v3dv_job *job, width = DIV_ROUND_UP(width, vk_format_get_blockwidth(image->vk.format)); height = DIV_ROUND_UP(height, vk_format_get_blockheight(image->vk.format)); + uint8_t plane = v3dv_plane_from_aspect(imgrsc->aspectMask); uint32_t cpp = imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT ? - 1 : image->cpp; + 1 : image->planes[plane].cpp; uint32_t buffer_stride = width * cpp; uint32_t buffer_offset = buffer->mem_offset + region->bufferOffset + height * buffer_stride * layer; @@ -1081,6 +1222,9 @@ emit_copy_buffer_to_layer_per_tile_list(struct v3dv_job *job, uint32_t format = choose_tlb_format(framebuffer, imgrsc->aspectMask, false, false, true); + uint32_t image_layer = layer + (image->vk.image_type != VK_IMAGE_TYPE_3D ? 
+ imgrsc->baseArrayLayer : region->imageOffset.z); + emit_linear_load(cl, RENDER_TARGET_0, buffer->mem->bo, buffer_offset, buffer_stride, format); @@ -1100,13 +1244,13 @@ emit_copy_buffer_to_layer_per_tile_list(struct v3dv_job *job, if (imgrsc->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) { emit_image_load(job->device, cl, framebuffer, image, VK_IMAGE_ASPECT_STENCIL_BIT, - imgrsc->baseArrayLayer + layer, imgrsc->mipLevel, + image_layer, imgrsc->mipLevel, false, false); } else { assert(imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT); emit_image_load(job->device, cl, framebuffer, image, VK_IMAGE_ASPECT_DEPTH_BIT, - imgrsc->baseArrayLayer + layer, imgrsc->mipLevel, + image_layer, imgrsc->mipLevel, false, false); } } @@ -1117,20 +1261,20 @@ emit_copy_buffer_to_layer_per_tile_list(struct v3dv_job *job, /* Store TLB to image */ emit_image_store(job->device, cl, framebuffer, image, imgrsc->aspectMask, - imgrsc->baseArrayLayer + layer, imgrsc->mipLevel, + image_layer, imgrsc->mipLevel, false, true, false); if (framebuffer->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) { if (imgrsc->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) { emit_image_store(job->device, cl, framebuffer, image, VK_IMAGE_ASPECT_STENCIL_BIT, - imgrsc->baseArrayLayer + layer, imgrsc->mipLevel, + image_layer, imgrsc->mipLevel, false, false, false); } else { assert(imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT); emit_image_store(job->device, cl, framebuffer, image, VK_IMAGE_ASPECT_DEPTH_BIT, - imgrsc->baseArrayLayer + layer, imgrsc->mipLevel, + image_layer, imgrsc->mipLevel, false, false, false); } } @@ -1151,7 +1295,7 @@ emit_copy_buffer_to_layer(struct v3dv_job *job, struct v3dv_buffer *buffer, struct v3dv_meta_framebuffer *framebuffer, uint32_t layer, - const VkBufferImageCopy2KHR *region) + const VkBufferImageCopy2 *region) { emit_copy_buffer_to_layer_per_tile_list(job, framebuffer, image, buffer, layer, region); @@ -1163,7 +1307,7 @@ v3dX(meta_emit_copy_buffer_to_image_rcl)(struct v3dv_job *job, struct v3dv_image *image, struct v3dv_buffer *buffer, struct v3dv_meta_framebuffer *framebuffer, - const VkBufferImageCopy2KHR *region) + const VkBufferImageCopy2 *region) { struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL); v3dv_return_if_oom(NULL, job); @@ -1175,8 +1319,8 @@ v3dX(meta_emit_copy_buffer_to_image_rcl)(struct v3dv_job *job, } /* Figure out a TLB size configuration for a number of pixels to process. - * Beware that we can't "render" more than 4096x4096 pixels in a single job, - * if the pixel count is larger than this, the caller might need to split + * Beware that we can't "render" more than MAX_DIMxMAX_DIM pixels in a single + * job; if the pixel count is larger than this, the caller might need to split the job and call this function multiple times.
*/ static void @@ -1186,7 +1330,7 @@ framebuffer_size_for_pixel_count(uint32_t num_pixels, { assert(num_pixels > 0); - const uint32_t max_dim_pixels = 4096; + const uint32_t max_dim_pixels = V3D_MAX_IMAGE_DIMENSION; const uint32_t max_pixels = max_dim_pixels * max_dim_pixels; uint32_t w, h; @@ -1215,7 +1359,7 @@ v3dX(meta_copy_buffer)(struct v3dv_cmd_buffer *cmd_buffer, uint32_t dst_offset, struct v3dv_bo *src, uint32_t src_offset, - const VkBufferCopy2KHR *region) + const VkBufferCopy2 *region) { const uint32_t internal_bpp = V3D_INTERNAL_BPP_32; const uint32_t internal_type = V3D_INTERNAL_TYPE_8UI; @@ -1264,7 +1408,9 @@ v3dX(meta_copy_buffer)(struct v3dv_cmd_buffer *cmd_buffer, uint32_t width, height; framebuffer_size_for_pixel_count(num_items, &width, &height); - v3dv_job_start_frame(job, width, height, 1, true, 1, internal_bpp, false); + v3dv_job_start_frame(job, width, height, 1, true, true, 1, + internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), + false); struct v3dv_meta_framebuffer framebuffer; v3dX(meta_framebuffer_init)(&framebuffer, vk_format, internal_type, @@ -1310,7 +1456,9 @@ v3dX(meta_fill_buffer)(struct v3dv_cmd_buffer *cmd_buffer, uint32_t width, height; framebuffer_size_for_pixel_count(num_items, &width, &height); - v3dv_job_start_frame(job, width, height, 1, true, 1, internal_bpp, false); + v3dv_job_start_frame(job, width, height, 1, true, true, 1, + internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), + false); struct v3dv_meta_framebuffer framebuffer; v3dX(meta_framebuffer_init)(&framebuffer, VK_FORMAT_R8G8B8A8_UINT, diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c index 8623a453701..616a7730cd4 100644 --- a/src/broadcom/vulkan/v3dvx_pipeline.c +++ b/src/broadcom/vulkan/v3dvx_pipeline.c @@ -1,5 +1,5 @@ /* - * Copyright © 2021 Raspberry Pi + * Copyright © 2021 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -26,8 +26,6 @@ #include "broadcom/cle/v3dx_pack.h" #include "broadcom/compiler/v3d_compiler.h" -#include "vk_format_info.h" - static uint8_t blend_factor(VkBlendFactor factor, bool dst_alpha_one, bool *needs_constants) { @@ -58,15 +56,10 @@ blend_factor(VkBlendFactor factor, bool dst_alpha_one, bool *needs_constants) case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR: case VK_BLEND_FACTOR_SRC1_ALPHA: case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA: - assert(!"Invalid blend factor: dual source blending not supported."); + unreachable("Invalid blend factor: dual source blending not supported."); default: - assert(!"Unknown blend factor."); + unreachable("Unknown blend factor."); } - - /* Should be handled by the switch, added to avoid a "end of non-void - * function" error - */ - unreachable("Unknown blend factor."); } static void @@ -86,21 +79,19 @@ pack_blend(struct v3dv_pipeline *pipeline, if (!cb_info) return; - assert(pipeline->subpass); - if (pipeline->subpass->color_count == 0) + const struct vk_render_pass_state *ri = &pipeline->rendering_info; + if (ri->color_attachment_count == 0) return; - assert(pipeline->subpass->color_count == cb_info->attachmentCount); - + assert(ri->color_attachment_count == cb_info->attachmentCount); pipeline->blend.needs_color_constants = false; uint32_t color_write_masks = 0; - for (uint32_t i = 0; i < pipeline->subpass->color_count; i++) { + for (uint32_t i = 0; i < ri->color_attachment_count; i++) { const VkPipelineColorBlendAttachmentState *b_state = 
&cb_info->pAttachments[i]; - uint32_t attachment_idx = - pipeline->subpass->color_attachments[i].attachment; - if (attachment_idx == VK_ATTACHMENT_UNUSED) + const VkFormat vk_format = ri->color_attachment_formats[i]; + if (vk_format == VK_FORMAT_UNDEFINED) continue; color_write_masks |= (~b_state->colorWriteMask & 0xf) << (4 * i); @@ -108,10 +99,13 @@ pack_blend(struct v3dv_pipeline *pipeline, if (!b_state->blendEnable) continue; - VkAttachmentDescription *desc = - &pipeline->pass->attachments[attachment_idx].desc; - const struct v3dv_format *format = v3dX(get_format)(desc->format); - bool dst_alpha_one = (format->swizzle[3] == PIPE_SWIZZLE_1); + const struct v3dv_format *format = v3dX(get_format)(vk_format); + + /* We only do blending with render pass attachments, so we should not have + * multiplanar images here + */ + assert(format->plane_count == 1); + bool dst_alpha_one = (format->planes[0].swizzle[3] == PIPE_SWIZZLE_1); uint8_t rt_mask = 1 << i; pipeline->blend.enables |= rt_mask; @@ -148,6 +142,7 @@ pack_cfg_bits(struct v3dv_pipeline *pipeline, const VkPipelineDepthStencilStateCreateInfo *ds_info, const VkPipelineRasterizationStateCreateInfo *rs_info, const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *pv_info, + const VkPipelineRasterizationLineStateCreateInfoEXT *ls_info, const VkPipelineMultisampleStateCreateInfo *ms_info) { assert(sizeof(pipeline->cfg_bits) == cl_packet_length(CFG_BITS)); @@ -156,23 +151,21 @@ pack_cfg_bits(struct v3dv_pipeline *pipeline, ms_info && ms_info->rasterizationSamples > VK_SAMPLE_COUNT_1_BIT; v3dvx_pack(pipeline->cfg_bits, CFG_BITS, config) { - config.enable_forward_facing_primitive = - rs_info ? !(rs_info->cullMode & VK_CULL_MODE_FRONT_BIT) : false; - - config.enable_reverse_facing_primitive = - rs_info ? !(rs_info->cullMode & VK_CULL_MODE_BACK_BIT) : false; - - /* Seems like the hardware is backwards regarding this setting... */ - config.clockwise_primitives = - rs_info ? rs_info->frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE : false; - - config.enable_depth_offset = rs_info ? rs_info->depthBiasEnable: false; + /* Even if rs_info->depthBiasEnable is true, we may decide not to + * enable it, e.g. when the pipeline doesn't have a depth/stencil + * attachment. + */ + config.enable_depth_offset = pipeline->depth_bias.enabled; /* This is required to pass line rasterization tests in CTS while * exposing, at least, a minimum of 4 bits of subpixel precision * (the minimum requirement). */ - config.line_rasterization = 1; /* perp end caps */ + if (ls_info && + ls_info->lineRasterizationMode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT) + config.line_rasterization = V3D_LINE_RASTERIZATION_DIAMOND_EXIT; + else + config.line_rasterization = V3D_LINE_RASTERIZATION_PERP_END_CAPS; if (rs_info && rs_info->polygonMode != VK_POLYGON_MODE_FILL) { config.direct3d_wireframe_triangles_mode = true; @@ -180,7 +173,10 @@ pack_cfg_bits(struct v3dv_pipeline *pipeline, rs_info->polygonMode == VK_POLYGON_MODE_POINT; } - config.rasterizer_oversample_mode = pipeline->msaa ? 1 : 0; + /* diamond-exit rasterization does not support oversampling */ + config.rasterizer_oversample_mode = + (config.line_rasterization == V3D_LINE_RASTERIZATION_PERP_END_CAPS && + pipeline->msaa) ?
1 : 0; /* From the Vulkan spec: * @@ -203,30 +199,42 @@ pack_cfg_bits(struct v3dv_pipeline *pipeline, config.blend_enable = pipeline->blend.enables != 0; - /* Disable depth/stencil if we don't have a D/S attachment */ - bool has_ds_attachment = - pipeline->subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED; - - if (ds_info && ds_info->depthTestEnable && has_ds_attachment) { - config.z_updates_enable = ds_info->depthWriteEnable; - config.depth_test_function = ds_info->depthCompareOp; +#if V3D_VERSION >= 71 + /* From the Vulkan spec: + * + * "depthClampEnable controls whether to clamp the fragment’s depth + * values as described in Depth Test. If the pipeline is not created + * with VkPipelineRasterizationDepthClipStateCreateInfoEXT present + * then enabling depth clamp will also disable clipping primitives to + * the z planes of the frustum as described in Primitive Clipping. + * Otherwise depth clipping is controlled by the state set in + * VkPipelineRasterizationDepthClipStateCreateInfoEXT." + */ + bool z_clamp_enable = rs_info && rs_info->depthClampEnable; + bool z_clip_enable = false; + const VkPipelineRasterizationDepthClipStateCreateInfoEXT *clip_info = + rs_info ? vk_find_struct_const(rs_info->pNext, + PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT) : + NULL; + if (clip_info) + z_clip_enable = clip_info->depthClipEnable; + else if (!z_clamp_enable) + z_clip_enable = true; + + if (z_clip_enable) { + config.z_clipping_mode = pipeline->negative_one_to_one ? + V3D_Z_CLIP_MODE_MIN_ONE_TO_ONE : V3D_Z_CLIP_MODE_ZERO_TO_ONE; } else { - config.depth_test_function = VK_COMPARE_OP_ALWAYS; + config.z_clipping_mode = V3D_Z_CLIP_MODE_NONE; } - /* EZ state will be updated at draw time based on bound pipeline state */ - config.early_z_updates_enable = false; - config.early_z_enable = false; - - config.stencil_enable = - ds_info ? ds_info->stencilTestEnable && has_ds_attachment: false; - - pipeline->z_updates_enable = config.z_updates_enable; + config.z_clamp_mode = z_clamp_enable; +#endif }; } -static uint32_t -translate_stencil_op(enum pipe_stencil_op op) +uint32_t +v3dX(translate_stencil_op)(VkStencilOp op) { switch (op) { case VK_STENCIL_OP_KEEP: @@ -255,7 +263,8 @@ pack_single_stencil_cfg(struct v3dv_pipeline *pipeline, uint8_t *stencil_cfg, bool is_front, bool is_back, - const VkStencilOpState *stencil_state) + const VkStencilOpState *stencil_state, + const struct vk_graphics_pipeline_state *state) { /* From the Vulkan spec: * @@ -267,60 +276,54 @@ pack_single_stencil_cfg(struct v3dv_pipeline *pipeline, * * In our case, 's' is always 8, so we clamp to that to prevent our packing * functions from asserting in debug mode if they see larger values. - * - * If we have dynamic state we need to make sure we set the corresponding - * state bits to 0, since cl_emit_with_prepacked ORs the new value with - * the old. */ - const uint8_t write_mask = - pipeline->dynamic_state.mask & V3DV_DYNAMIC_STENCIL_WRITE_MASK ? - 0 : stencil_state->writeMask & 0xff; - - const uint8_t compare_mask = - pipeline->dynamic_state.mask & V3DV_DYNAMIC_STENCIL_COMPARE_MASK ? - 0 : stencil_state->compareMask & 0xff; - - const uint8_t reference = - pipeline->dynamic_state.mask & V3DV_DYNAMIC_STENCIL_COMPARE_MASK ?
- 0 : stencil_state->reference & 0xff; - v3dvx_pack(stencil_cfg, STENCIL_CFG, config) { config.front_config = is_front; config.back_config = is_back; - config.stencil_write_mask = write_mask; - config.stencil_test_mask = compare_mask; + config.stencil_write_mask = stencil_state->writeMask & 0xff; + config.stencil_test_mask = stencil_state->compareMask & 0xff; config.stencil_test_function = stencil_state->compareOp; - config.stencil_pass_op = translate_stencil_op(stencil_state->passOp); - config.depth_test_fail_op = translate_stencil_op(stencil_state->depthFailOp); - config.stencil_test_fail_op = translate_stencil_op(stencil_state->failOp); - config.stencil_ref_value = reference; + config.stencil_pass_op = + v3dX(translate_stencil_op)(stencil_state->passOp); + config.depth_test_fail_op = + v3dX(translate_stencil_op)(stencil_state->depthFailOp); + config.stencil_test_fail_op = + v3dX(translate_stencil_op)(stencil_state->failOp); + config.stencil_ref_value = stencil_state->reference & 0xff; } } static void pack_stencil_cfg(struct v3dv_pipeline *pipeline, - const VkPipelineDepthStencilStateCreateInfo *ds_info) + const VkPipelineDepthStencilStateCreateInfo *ds_info, + const struct vk_graphics_pipeline_state *state) { assert(sizeof(pipeline->stencil_cfg) == 2 * cl_packet_length(STENCIL_CFG)); - if (!ds_info || !ds_info->stencilTestEnable) + if ((!ds_info || !ds_info->stencilTestEnable) && + (!BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE))) { return; + } - if (pipeline->subpass->ds_attachment.attachment == VK_ATTACHMENT_UNUSED) + const struct vk_render_pass_state *ri = &pipeline->rendering_info; + if (ri->stencil_attachment_format == VK_FORMAT_UNDEFINED) return; - const uint32_t dynamic_stencil_states = V3DV_DYNAMIC_STENCIL_COMPARE_MASK | - V3DV_DYNAMIC_STENCIL_WRITE_MASK | - V3DV_DYNAMIC_STENCIL_REFERENCE; - + const bool any_dynamic_stencil_states = + BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) || + BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) || + BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) || + BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE) || + BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_DS_STENCIL_OP); /* If front != back or we have dynamic stencil state we can't emit a single * packet for both faces. */ bool needs_front_and_back = false; - if ((pipeline->dynamic_state.mask & dynamic_stencil_states) || - memcmp(&ds_info->front, &ds_info->back, sizeof(ds_info->front))) + if ((any_dynamic_stencil_states) || + memcmp(&ds_info->front, &ds_info->back, sizeof(ds_info->front))) { needs_front_and_back = true; + } /* If the front and back configurations are the same we can emit both with * a single packet. @@ -328,33 +331,41 @@ pack_stencil_cfg(struct v3dv_pipeline *pipeline, pipeline->emit_stencil_cfg[0] = true; if (!needs_front_and_back) { pack_single_stencil_cfg(pipeline, pipeline->stencil_cfg[0], - true, true, &ds_info->front); + true, true, &ds_info->front, state); } else { pipeline->emit_stencil_cfg[1] = true; pack_single_stencil_cfg(pipeline, pipeline->stencil_cfg[0], - true, false, &ds_info->front); + true, false, &ds_info->front, state); pack_single_stencil_cfg(pipeline, pipeline->stencil_cfg[1], - false, true, &ds_info->back); + false, true, &ds_info->back, state); } } + +/* FIXME: Now that we are passing the vk_graphics_pipeline_state we could + * avoid passing all those parameters. 
But doing that would require changing + * all the code that uses the VkXXX structures to use the equivalent + * vk_xxx ones instead. + */ void v3dX(pipeline_pack_state)(struct v3dv_pipeline *pipeline, const VkPipelineColorBlendStateCreateInfo *cb_info, const VkPipelineDepthStencilStateCreateInfo *ds_info, const VkPipelineRasterizationStateCreateInfo *rs_info, const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *pv_info, - const VkPipelineMultisampleStateCreateInfo *ms_info) + const VkPipelineRasterizationLineStateCreateInfoEXT *ls_info, + const VkPipelineMultisampleStateCreateInfo *ms_info, + const struct vk_graphics_pipeline_state *state) { pack_blend(pipeline, cb_info); - pack_cfg_bits(pipeline, ds_info, rs_info, pv_info, ms_info); - pack_stencil_cfg(pipeline, ds_info); + pack_cfg_bits(pipeline, ds_info, rs_info, pv_info, ls_info, ms_info); + pack_stencil_cfg(pipeline, ds_info, state); } static void pack_shader_state_record(struct v3dv_pipeline *pipeline) { - assert(sizeof(pipeline->shader_state_record) == + assert(sizeof(pipeline->shader_state_record) >= cl_packet_length(GL_SHADER_STATE_RECORD)); struct v3d_fs_prog_data *prog_data_fs = @@ -378,7 +389,7 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline) if (!pipeline->has_gs) { shader.point_size_in_shaded_vertex_data = - pipeline->topology == PIPE_PRIM_POINTS; + pipeline->topology == MESA_PRIM_POINTS; } else { struct v3d_gs_prog_data *prog_data_gs = pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY]->prog_data.gs; @@ -390,6 +401,7 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline) * shader needs to write the Z value (even just discards). */ shader.fragment_shader_does_z_writes = prog_data_fs->writes_z; + /* Set if the EZ test must be disabled (due to shader side * effects and the early_z flag not being present in the * shader). @@ -428,15 +440,16 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline) shader.number_of_varyings_in_fragment_shader = prog_data_fs->num_inputs; - shader.coordinate_shader_propagate_nans = true; - shader.vertex_shader_propagate_nans = true; - shader.fragment_shader_propagate_nans = true; - - /* Note: see previous note about adresses */ + /* Note: see previous note about addresses */ /* shader.coordinate_shader_code_address */ /* shader.vertex_shader_code_address */ /* shader.fragment_shader_code_address */ +#if V3D_VERSION == 42 + shader.coordinate_shader_propagate_nans = true; + shader.vertex_shader_propagate_nans = true; + shader.fragment_shader_propagate_nans = true; + /* FIXME: Use combined input/output size flag in the common case (also * on v3d, see v3dx_draw). */ @@ -444,20 +457,32 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline) prog_data_vs_bin->separate_segments; shader.vertex_shader_has_separate_input_and_output_vpm_blocks = prog_data_vs->separate_segments; - shader.coordinate_shader_input_vpm_segment_size = prog_data_vs_bin->separate_segments ? prog_data_vs_bin->vpm_input_size : 1; shader.vertex_shader_input_vpm_segment_size = prog_data_vs->separate_segments ? prog_data_vs->vpm_input_size : 1; +#endif + + /* On V3D 7.1 there isn't a specific flag to select whether we are using + * shared or separate segments. We just set the value of + * vpm_input_size to 0, and set the output to the max needed.
That should + * already be properly set in prog_data_vs_bin. + */ +#if V3D_VERSION == 71 + shader.coordinate_shader_input_vpm_segment_size = + prog_data_vs_bin->vpm_input_size; + shader.vertex_shader_input_vpm_segment_size = + prog_data_vs->vpm_input_size; +#endif shader.coordinate_shader_output_vpm_segment_size = prog_data_vs_bin->vpm_output_size; shader.vertex_shader_output_vpm_segment_size = prog_data_vs->vpm_output_size; - /* Note: see previous note about adresses */ + /* Note: see previous note about addresses */ /* shader.coordinate_shader_uniforms_address */ /* shader.vertex_shader_uniforms_address */ /* shader.fragment_shader_uniforms_address */ @@ -499,7 +524,7 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline) shader.instance_id_read_by_vertex_shader = prog_data_vs->uses_iid; - /* Note: see previous note about adresses */ + /* Note: see previous note about addresses */ /* shader.address_of_default_attribute_values */ } } @@ -592,7 +617,6 @@ pack_shader_state_attribute_record(struct v3dv_pipeline *pipeline, attr.instance_divisor = MIN2(pipeline->vb[binding].instance_divisor, 0xffff); - attr.stride = pipeline->vb[binding].stride; attr.type = get_attr_type(desc); } } @@ -652,3 +676,76 @@ v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline, } } } + +#if V3D_VERSION == 42 +static bool +pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline) +{ + for (uint8_t i = 0; i < pipeline->va_count; i++) { + if (vk_format_is_int(pipeline->va[i].vk_format)) + return true; + } + return false; +} +#endif + +bool +v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline) +{ +#if V3D_VERSION == 42 + return pipeline_has_integer_vertex_attrib(pipeline); +#endif + + return false; +} + +/* @pipeline can be NULL. In that case we assume the most common case. For + * example, for v42 we assume that all the attributes have a float format + * (we only create an all-float BO once and reuse it with all float + * pipelines); otherwise we look at the actual type of each attribute used + * with the specific pipeline passed in. + */ +struct v3dv_bo * +v3dX(create_default_attribute_values)(struct v3dv_device *device, + struct v3dv_pipeline *pipeline) +{ +#if V3D_VERSION >= 71 + return NULL; +#endif + + uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4; + struct v3dv_bo *bo; + + bo = v3dv_bo_alloc(device, size, "default_vi_attributes", true); + + if (!bo) { + fprintf(stderr, "failed to allocate memory for the default " + "attribute values\n"); + return NULL; + } + + bool ok = v3dv_bo_map(device, bo, size); + if (!ok) { + fprintf(stderr, "failed to map default attribute values buffer\n"); + return NULL; + } + + uint32_t *attrs = bo->map; + uint8_t va_count = pipeline != NULL ? pipeline->va_count : 0; + for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) { + attrs[i * 4 + 0] = 0; + attrs[i * 4 + 1] = 0; + attrs[i * 4 + 2] = 0; + VkFormat attr_format = + pipeline != NULL ?
pipeline->va[i].vk_format : VK_FORMAT_UNDEFINED; + if (i < va_count && vk_format_is_int(attr_format)) { + attrs[i * 4 + 3] = 1; + } else { + attrs[i * 4 + 3] = fui(1.0); + } + } + + v3dv_bo_unmap(device, bo); + + return bo; +} diff --git a/src/broadcom/vulkan/v3dvx_private.h b/src/broadcom/vulkan/v3dvx_private.h index ab134225a3a..68df5db74ad 100644 --- a/src/broadcom/vulkan/v3dvx_private.h +++ b/src/broadcom/vulkan/v3dvx_private.h @@ -1,5 +1,5 @@ /* - * Copyright © 2021 Raspberry Pi + * Copyright © 2021 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -55,6 +55,9 @@ void v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer); void +v3dX(cmd_buffer_emit_depth_bounds)(struct v3dv_cmd_buffer *cmd_buffer); + +void v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer); void @@ -75,6 +78,14 @@ v3dX(job_emit_binning_prolog)(struct v3dv_job *job, uint32_t layers); void +v3dX(job_emit_enable_double_buffer)(struct v3dv_job *job); + +void +v3dX(job_patch_resume_address)(struct v3dv_job *first_suspend, + struct v3dv_job *suspend, + struct v3dv_job *resume); + +void v3dX(cmd_buffer_execute_inside_pass)(struct v3dv_cmd_buffer *primary, uint32_t cmd_buffer_count, const VkCommandBuffer *cmd_buffers); @@ -117,31 +128,34 @@ v3dX(cmd_buffer_emit_indexed_indirect)(struct v3dv_cmd_buffer *cmd_buffer, uint32_t stride); void +v3dX(cmd_buffer_suspend)(struct v3dv_cmd_buffer *cmd_buffer); + +struct v3dv_job * +v3dX(cmd_buffer_prepare_suspend_job_for_submit)(struct v3dv_job *job); + +void v3dX(get_hw_clear_color)(const VkClearColorValue *color, uint32_t internal_type, uint32_t internal_size, uint32_t *hw_color); -void -v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer, - int rt, - uint32_t *rt_bpp, - uint32_t *rt_type, - uint32_t *rt_clamp); - /* Used at v3dv_device */ void -v3dX(pack_sampler_state)(struct v3dv_sampler *sampler, +v3dX(pack_sampler_state)(const struct v3dv_device *device, + struct v3dv_sampler *sampler, const VkSamplerCreateInfo *pCreateInfo, const VkSamplerCustomBorderColorCreateInfoEXT *bc_info); void v3dX(framebuffer_compute_internal_bpp_msaa)(const struct v3dv_framebuffer *framebuffer, + const struct v3dv_cmd_buffer_attachment_state *attachments, const struct v3dv_subpass *subpass, - uint8_t *max_bpp, bool *msaa); + uint8_t *max_internal_bpp, + uint8_t *total_color_bpp, + bool *msaa); -#ifdef DEBUG +#if MESA_DEBUG void v3dX(device_check_prepacked_sizes)(void); #endif @@ -161,6 +175,10 @@ v3dX(format_supports_tlb_resolve)(const struct v3dv_format *format); bool v3dX(format_supports_blending)(const struct v3dv_format *format); +/* FIXME: tex_format should be `enum V3DX(Texture_Data_Formats)`, but using + * that enum type in the header requires including v3dx_pack.h, which triggers + * circular include dependency issues, so we're using a `uint32_t` for now.
+ */ bool v3dX(tfu_supports_tex_format)(uint32_t tex_format); @@ -189,14 +207,14 @@ v3dX(meta_emit_copy_image_to_buffer_rcl)(struct v3dv_job *job, struct v3dv_buffer *buffer, struct v3dv_image *image, struct v3dv_meta_framebuffer *framebuffer, - const VkBufferImageCopy2KHR *region); + const VkBufferImageCopy2 *region); void v3dX(meta_emit_resolve_image_rcl)(struct v3dv_job *job, struct v3dv_image *dst, struct v3dv_image *src, struct v3dv_meta_framebuffer *framebuffer, - const VkImageResolve2KHR *region); + const VkImageResolve2 *region); void v3dX(meta_emit_copy_buffer)(struct v3dv_job *job, @@ -223,19 +241,23 @@ v3dX(meta_emit_copy_image_rcl)(struct v3dv_job *job, struct v3dv_image *dst, struct v3dv_image *src, struct v3dv_meta_framebuffer *framebuffer, - const VkImageCopy2KHR *region); + const VkImageCopy2 *region); void v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_image *dst, - uint32_t dst_mip_level, - uint32_t dst_layer, - struct v3dv_image *src, - uint32_t src_mip_level, - uint32_t src_layer, + uint32_t dst_bo_handle, + uint32_t dst_offset, + enum v3d_tiling_mode dst_tiling, + uint32_t dst_padded_height_or_stride, + uint32_t dst_cpp, + uint32_t src_bo_handle, + uint32_t src_offset, + enum v3d_tiling_mode src_tiling, + uint32_t src_padded_height_or_stride, + uint32_t src_cpp, uint32_t width, uint32_t height, - const struct v3dv_format *format); + const struct v3dv_format_plane *format_plane); void v3dX(meta_emit_clear_image_rcl)(struct v3dv_job *job, @@ -259,7 +281,7 @@ v3dX(meta_emit_copy_buffer_to_image_rcl)(struct v3dv_job *job, struct v3dv_image *image, struct v3dv_buffer *buffer, struct v3dv_meta_framebuffer *framebuffer, - const VkBufferImageCopy2KHR *region); + const VkBufferImageCopy2 *region); void v3dX(get_internal_type_bpp_for_image_aspects)(VkFormat vk_format, @@ -273,7 +295,7 @@ v3dX(meta_copy_buffer)(struct v3dv_cmd_buffer *cmd_buffer, uint32_t dst_offset, struct v3dv_bo *src, uint32_t src_offset, - const VkBufferCopy2KHR *region); + const VkBufferCopy2 *region); void v3dX(meta_fill_buffer)(struct v3dv_cmd_buffer *cmd_buffer, @@ -295,20 +317,57 @@ v3dX(pipeline_pack_state)(struct v3dv_pipeline *pipeline, const VkPipelineDepthStencilStateCreateInfo *ds_info, const VkPipelineRasterizationStateCreateInfo *rs_info, const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *pv_info, - const VkPipelineMultisampleStateCreateInfo *ms_info); + const VkPipelineRasterizationLineStateCreateInfoEXT *ls_info, + const VkPipelineMultisampleStateCreateInfo *ms_info, + const struct vk_graphics_pipeline_state *state); void v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline, const VkPipelineVertexInputStateCreateInfo *vi_info, const VkPipelineVertexInputDivisorStateCreateInfoEXT *vd_info); + +bool +v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline); + +struct v3dv_bo * +v3dX(create_default_attribute_values)(struct v3dv_device *device, + struct v3dv_pipeline *pipeline); + /* Used at v3dv_queue */ void v3dX(job_emit_noop)(struct v3dv_job *job); +/* Used at v3dv_query */ +VkResult +v3dX(enumerate_performance_query_counters)(uint32_t *pCounterCount, + VkPerformanceCounterKHR *pCounters, + VkPerformanceCounterDescriptionKHR *pCounterDescriptions); + /* Used at v3dv_descriptor_set, and other descriptor set utils */ uint32_t v3dX(descriptor_bo_size)(VkDescriptorType type); uint32_t v3dX(max_descriptor_bo_size)(void); -uint32_t v3dX(combined_image_sampler_texture_state_offset)(void); +uint32_t 
v3dX(combined_image_sampler_texture_state_offset)(uint8_t plane); + +uint32_t v3dX(combined_image_sampler_sampler_state_offset)(uint8_t plane); + +/* General utils */ + +uint32_t +v3dX(clamp_for_format_and_type)(uint32_t rt_type, + VkFormat vk_format); + +#define V3D42_CLIPPER_XY_GRANULARITY 256.0f +#define V3D71_CLIPPER_XY_GRANULARITY 64.0f + -uint32_t v3dX(combined_image_sampler_sampler_state_offset)(void); +void +v3dX(viewport_compute_xform)(const VkViewport *viewport, + float scale[3], + float translate[3]); + +uint32_t +v3dX(translate_stencil_op)(VkStencilOp op); diff --git a/src/broadcom/vulkan/v3dvx_query.c b/src/broadcom/vulkan/v3dvx_query.c new file mode 100644 index 00000000000..e59a1e84ff6 --- /dev/null +++ b/src/broadcom/vulkan/v3dvx_query.c @@ -0,0 +1,67 @@ +/* + * Copyright © 2023 Raspberry Pi Ltd + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE.
+ */ + +#include "v3dv_private.h" + +#include "common/v3d_performance_counters.h" + +VkResult +v3dX(enumerate_performance_query_counters)(uint32_t *pCounterCount, + VkPerformanceCounterKHR *pCounters, + VkPerformanceCounterDescriptionKHR *pCounterDescriptions) +{ + uint32_t desc_count = *pCounterCount; + + VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR, + out, pCounters, pCounterCount); + VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR, + out_desc, pCounterDescriptions, &desc_count); + + for (int i = 0; i < ARRAY_SIZE(v3d_performance_counters); i++) { + vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) { + counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR; + counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR; + counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR; + + unsigned char sha1_result[20]; + _mesa_sha1_compute(v3d_performance_counters[i][V3D_PERFCNT_NAME], + strlen(v3d_performance_counters[i][V3D_PERFCNT_NAME]), + sha1_result); + + memcpy(counter->uuid, sha1_result, sizeof(counter->uuid)); + } + + vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR, + &out_desc, desc) { + desc->flags = 0; + snprintf(desc->name, sizeof(desc->name), "%s", + v3d_performance_counters[i][V3D_PERFCNT_NAME]); + snprintf(desc->category, sizeof(desc->category), "%s", + v3d_performance_counters[i][V3D_PERFCNT_CATEGORY]); + snprintf(desc->description, sizeof(desc->description), "%s", + v3d_performance_counters[i][V3D_PERFCNT_DESCRIPTION]); + } + } + + return vk_outarray_status(&out); +} diff --git a/src/broadcom/vulkan/v3dvx_queue.c b/src/broadcom/vulkan/v3dvx_queue.c index 38f9efbfa5d..6eed2de9d54 100644 --- a/src/broadcom/vulkan/v3dvx_queue.c +++ b/src/broadcom/vulkan/v3dvx_queue.c @@ -1,5 +1,5 @@ /* - * Copyright © 2021 Raspberry Pi + * Copyright © 2021 Raspberry Pi Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -29,7 +29,8 @@ void v3dX(job_emit_noop)(struct v3dv_job *job) { - v3dv_job_start_frame(job, 1, 1, 1, true, 1, V3D_INTERNAL_BPP_32, false); + v3dv_job_start_frame(job, 1, 1, 1, true, true, 1, + V3D_INTERNAL_BPP_32, 4, false); v3dX(job_emit_binning_flush)(job); struct v3dv_cl *rcl = &job->rcl; @@ -42,14 +43,29 @@ v3dX(job_emit_noop)(struct v3dv_job *job) config.image_height_pixels = 1; config.number_of_render_targets = 1; config.multisample_mode_4x = false; +#if V3D_VERSION == 42 config.maximum_bpp_of_all_render_targets = V3D_INTERNAL_BPP_32; +#endif +#if V3D_VERSION >= 71 + config.log2_tile_width = 3; /* Tile size 64 */ + config.log2_tile_height = 3; /* Tile size 64 */ +#endif } +#if V3D_VERSION == 42 cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { rt.render_target_0_internal_bpp = V3D_INTERNAL_BPP_32; rt.render_target_0_internal_type = V3D_INTERNAL_TYPE_8; rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE; } +#endif +#if V3D_VERSION >= 71 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { + rt.internal_bpp = V3D_INTERNAL_BPP_32; + rt.internal_type_and_clamping = V3D_RENDER_TARGET_TYPE_CLAMP_8; + rt.stride = 1; /* Unused RT */ + } +#endif cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) { clear.z_clear_value = 1.0f; diff --git a/src/broadcom/vulkan/vk_format_info.h b/src/broadcom/vulkan/vk_format_info.h deleted file mode 100644 index da85cb5b5dd..00000000000 --- a/src/broadcom/vulkan/vk_format_info.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright © 2016 Intel Corporation - * - * Permission is 
hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#ifndef VK_FORMAT_INFO_H -#define VK_FORMAT_INFO_H - -#include <stdbool.h> -#include <vulkan/vulkan.h> - -#include "util/format/u_format.h" -#include "vulkan/util/vk_format.h" - -/* FIXME: from freedreno vk_format.h, common place?*/ -static inline bool -vk_format_is_int(VkFormat format) -{ - return util_format_is_pure_integer(vk_format_to_pipe_format(format)); -} - -static inline bool -vk_format_is_sint(VkFormat format) -{ - return util_format_is_pure_sint(vk_format_to_pipe_format(format)); -} - -static inline bool -vk_format_is_uint(VkFormat format) -{ - return util_format_is_pure_uint(vk_format_to_pipe_format(format)); -} - -static inline bool -vk_format_is_unorm(VkFormat format) -{ - return util_format_is_unorm(vk_format_to_pipe_format(format)); -} - -static inline bool -vk_format_is_snorm(VkFormat format) -{ - return util_format_is_snorm(vk_format_to_pipe_format(format)); -} - -static inline bool -vk_format_is_float(VkFormat format) -{ - return util_format_is_float(vk_format_to_pipe_format(format)); -} - -static inline bool -vk_format_is_srgb(VkFormat format) -{ - return util_format_is_srgb(vk_format_to_pipe_format(format)); -} - -static inline unsigned -vk_format_get_blocksize(VkFormat format) -{ - return util_format_get_blocksize(vk_format_to_pipe_format(format)); -} - -static inline unsigned -vk_format_get_blockwidth(VkFormat format) -{ - return util_format_get_blockwidth(vk_format_to_pipe_format(format)); -} - -static inline unsigned -vk_format_get_blockheight(VkFormat format) -{ - return util_format_get_blockheight(vk_format_to_pipe_format(format)); -} - -static inline bool -vk_format_is_compressed(VkFormat format) -{ - return util_format_is_compressed(vk_format_to_pipe_format(format)); -} - -static inline const struct util_format_description * -vk_format_description(VkFormat format) -{ - return util_format_description(vk_format_to_pipe_format(format)); -} - -#endif /* VK_FORMAT_INFO_H */
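A few illustrative sketches for the changes above. First, the per-plane plumbing in emit_image_load()/emit_image_store() relies on v3dv_plane_from_aspect(), whose body is not part of this diff. This is a minimal sketch of the mapping such a helper has to implement; the function name is hypothetical and the upstream version may cover additional aspect bits:

#include <stdint.h>
#include <vulkan/vulkan.h>

/* Hypothetical sketch, not the upstream implementation: map a Vulkan
 * aspect mask to a plane index. Color, depth and stencil aspects all
 * live in plane 0; only multi-planar aspects select other planes. */
static uint8_t
sketch_plane_from_aspect(VkImageAspectFlags aspect)
{
   switch (aspect) {
   case VK_IMAGE_ASPECT_PLANE_1_BIT:
      return 1;
   case VK_IMAGE_ASPECT_PLANE_2_BIT:
      return 2;
   default:
      return 0;
   }
}

This is consistent with the assert above that the meta framebuffer format is always single-plane: each plane of a multi-planar image is copied through RENDER_TARGET_0 one at a time.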
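The reworked v3dX(meta_emit_tfu_job)() takes one padded_height_or_stride parameter per side and interprets it by tiling mode, as the iis/ioc packing above shows: for the UIF tilings it is a padded height in pixels converted to UIF blocks (a UIF block is two utiles tall), while for raster it is a byte stride converted to pixels. Here is that derivation factored into a single helper, as a sketch; it assumes enum v3d_tiling_mode and v3d_utile_height() from the common Broadcom tiling code, and the helper name is ours:

#include <stdint.h>
#include "broadcom/common/v3d_tiling.h"

/* Sketch: stride field value for the TFU iis/ioc registers. */
static uint32_t
sketch_tfu_stride_field(enum v3d_tiling_mode tiling,
                        uint32_t padded_height_or_stride,
                        uint32_t cpp)
{
   switch (tiling) {
   case V3D_TILING_UIF_NO_XOR:
   case V3D_TILING_UIF_XOR:
      /* Padded height in pixels, expressed in UIF blocks. */
      return padded_height_or_stride / (2 * v3d_utile_height(cpp));
   case V3D_TILING_RASTER:
      /* Byte stride, expressed in pixels. */
      return padded_height_or_stride / cpp;
   default:
      return 0; /* LT tilings carry no explicit stride. */
   }
}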
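framebuffer_size_for_pixel_count(), of which the diff only shows the switch to V3D_MAX_IMAGE_DIMENSION, picks a width/height pair whose product covers the requested pixel count without exceeding the maximum dimension on either axis. One plausible shape of that computation, self-contained for illustration (the 4096 constant stands in for V3D_MAX_IMAGE_DIMENSION and the helper name is ours; the driver's actual heuristic may differ):

#include <assert.h>
#include <stdint.h>

#define SKETCH_MAX_DIM 4096u
#define SKETCH_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Sketch: choose w * h >= num_pixels with w, h <= SKETCH_MAX_DIM. Per
 * the comment in the diff, the caller is responsible for splitting the
 * work when num_pixels exceeds the maximum "renderable" area. */
static void
sketch_fb_size_for_pixel_count(uint32_t num_pixels,
                               uint32_t *width, uint32_t *height)
{
   assert(num_pixels > 0);
   assert(num_pixels <= SKETCH_MAX_DIM * SKETCH_MAX_DIM);

   const uint32_t w = num_pixels < SKETCH_MAX_DIM ? num_pixels
                                                  : SKETCH_MAX_DIM;
   *width = w;
   *height = SKETCH_DIV_ROUND_UP(num_pixels, w);
}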
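Finally, the V3D 7.1 depth clamp/clip handling added to pack_cfg_bits() implements the spec rule quoted in the diff: an explicit VkPipelineRasterizationDepthClipStateCreateInfoEXT wins, and otherwise depth clipping is enabled exactly when depth clamping is not. The decision condensed into a standalone predicate (the function name is ours; the types are core and EXT Vulkan):

#include <stdbool.h>
#include <vulkan/vulkan.h>

/* Sketch of the depth-clip resolution used above. */
static bool
sketch_depth_clip_enabled(bool depth_clamp_enable,
                          const VkPipelineRasterizationDepthClipStateCreateInfoEXT *clip_info)
{
   if (clip_info)
      return clip_info->depthClipEnable; /* extension overrides */
   return !depth_clamp_enable;           /* spec default */
}

The z_clipping_mode then follows from pipeline->negative_one_to_one when clipping is enabled, and is V3D_Z_CLIP_MODE_NONE otherwise, as in the diff.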